diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,299805 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.9994371570175575, + "eval_steps": 500, + "global_step": 19985, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.273681163787842, + "learning_rate": 9.999999349964125e-06, + "logits/chosen": -0.16166402399539948, + "logits/rejected": -0.21703395247459412, + "logps/chosen": -64.20034790039062, + "logps/rejected": -65.11337280273438, + "loss": 1.3334, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.9753754138946533, + "rewards/margins": 0.9966704249382019, + "rewards/rejected": 1.9787051677703857, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 3.1387758255004883, + "learning_rate": 9.999997399856662e-06, + "logits/chosen": -0.06945215165615082, + "logits/rejected": -0.1069088950753212, + "logps/chosen": -57.47087860107422, + "logps/rejected": -70.34770202636719, + "loss": 1.0921, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.97312068939209, + "rewards/margins": 1.2836748361587524, + "rewards/rejected": 1.6894458532333374, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 3.044142961502075, + "learning_rate": 9.99999414967812e-06, + "logits/chosen": 0.0016160234808921814, + "logits/rejected": -0.0816667228937149, + "logps/chosen": -68.4052734375, + "logps/rejected": -74.39248657226562, + "loss": 1.0203, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.2505152225494385, + "rewards/margins": 1.4675949811935425, + "rewards/rejected": 1.7829201221466064, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 2.6102373600006104, + "learning_rate": 9.999989599429347e-06, + "logits/chosen": 0.017875289544463158, + "logits/rejected": -0.1269870549440384, + "logps/chosen": -58.92835998535156, + "logps/rejected": -48.032073974609375, + "loss": 1.0273, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0204176902770996, + "rewards/margins": 1.4392646551132202, + "rewards/rejected": 1.581153154373169, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 2.126955032348633, + "learning_rate": 9.999983749111526e-06, + "logits/chosen": -0.07422295212745667, + "logits/rejected": -0.25610414147377014, + "logps/chosen": -70.56480407714844, + "logps/rejected": -72.65422058105469, + "loss": 0.878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1630191802978516, + "rewards/margins": 2.000363826751709, + "rewards/rejected": 1.1626557111740112, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 3.008718490600586, + "learning_rate": 9.999976598726174e-06, + "logits/chosen": -0.008189070969820023, + "logits/rejected": -0.0948963612318039, + "logps/chosen": -80.30921173095703, + "logps/rejected": -60.57290267944336, + "loss": 1.0402, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.4299185276031494, + "rewards/margins": 1.6280428171157837, + "rewards/rejected": 1.8018759489059448, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 3.0797228813171387, + "learning_rate": 9.999968148275154e-06, + "logits/chosen": 0.009884568862617016, + "logits/rejected": -0.03475647419691086, + "logps/chosen": -47.7955322265625, + "logps/rejected": -58.826297760009766, + "loss": 1.0311, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.9642348289489746, + "rewards/margins": 1.0639293193817139, + "rewards/rejected": 1.9003055095672607, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 2.2473108768463135, + "learning_rate": 9.99995839776066e-06, + "logits/chosen": -0.10040563344955444, + "logits/rejected": -0.2163020819425583, + "logps/chosen": -45.00971221923828, + "logps/rejected": -48.24774169921875, + "loss": 0.8511, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.880312442779541, + "rewards/margins": 1.870854377746582, + "rewards/rejected": 1.0094581842422485, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 3.218623399734497, + "learning_rate": 9.99994734718523e-06, + "logits/chosen": 0.05619804561138153, + "logits/rejected": -0.08700394630432129, + "logps/chosen": -65.03401947021484, + "logps/rejected": -63.973304748535156, + "loss": 1.2246, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.8980600833892822, + "rewards/margins": 0.9749429821968079, + "rewards/rejected": 1.923116683959961, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 3.5940308570861816, + "learning_rate": 9.999934996551737e-06, + "logits/chosen": -0.1211872547864914, + "logits/rejected": -0.07561437785625458, + "logps/chosen": -62.423377990722656, + "logps/rejected": -82.02760314941406, + "loss": 1.2027, + "rewards/accuracies": 0.6875, + "rewards/chosen": 3.15065336227417, + "rewards/margins": 0.9632331728935242, + "rewards/rejected": 2.18742036819458, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 3.1467435359954834, + "learning_rate": 9.99992134586339e-06, + "logits/chosen": -0.0853455513715744, + "logits/rejected": -0.01855245605111122, + "logps/chosen": -54.4730110168457, + "logps/rejected": -82.02916717529297, + "loss": 1.0754, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.965994358062744, + "rewards/margins": 0.6599307060241699, + "rewards/rejected": 2.306063652038574, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 3.905104637145996, + "learning_rate": 9.999906395123742e-06, + "logits/chosen": -0.00011751428246498108, + "logits/rejected": -0.013090938329696655, + "logps/chosen": -54.47154235839844, + "logps/rejected": -92.17998504638672, + "loss": 1.1048, + "rewards/accuracies": 0.65625, + "rewards/chosen": 3.167574405670166, + "rewards/margins": 1.0644688606262207, + "rewards/rejected": 2.1031055450439453, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 2.3545000553131104, + "learning_rate": 9.999890144336677e-06, + "logits/chosen": -0.015023558400571346, + "logits/rejected": -0.150527223944664, + "logps/chosen": -58.7061653137207, + "logps/rejected": -51.65465545654297, + "loss": 0.9349, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.1642041206359863, + "rewards/margins": 1.939386248588562, + "rewards/rejected": 1.2248177528381348, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 2.778261661529541, + "learning_rate": 9.999872593506424e-06, + "logits/chosen": -0.05001533403992653, + "logits/rejected": -0.018217170611023903, + "logps/chosen": -56.267696380615234, + "logps/rejected": -83.89161682128906, + "loss": 1.0403, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0295534133911133, + "rewards/margins": 1.4474811553955078, + "rewards/rejected": 1.5820724964141846, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 2.11838436126709, + "learning_rate": 9.999853742637541e-06, + "logits/chosen": -0.03676134720444679, + "logits/rejected": -0.24181373417377472, + "logps/chosen": -61.83561706542969, + "logps/rejected": -47.70132827758789, + "loss": 0.8885, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7518839836120605, + "rewards/margins": 1.9924577474594116, + "rewards/rejected": 0.7594259977340698, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 2.0100903511047363, + "learning_rate": 9.999833591734936e-06, + "logits/chosen": -0.0704524964094162, + "logits/rejected": -0.11501966416835785, + "logps/chosen": -63.614341735839844, + "logps/rejected": -66.67377471923828, + "loss": 0.9026, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2035415172576904, + "rewards/margins": 2.0127768516540527, + "rewards/rejected": 1.1907644271850586, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.1919100284576416, + "learning_rate": 9.999812140803845e-06, + "logits/chosen": -0.07021776586771011, + "logits/rejected": -0.20563620328903198, + "logps/chosen": -77.16609191894531, + "logps/rejected": -59.309837341308594, + "loss": 1.0672, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2198853492736816, + "rewards/margins": 1.1816216707229614, + "rewards/rejected": 2.0382637977600098, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.102100372314453, + "learning_rate": 9.999789389849845e-06, + "logits/chosen": -0.05990475416183472, + "logits/rejected": -0.20218171179294586, + "logps/chosen": -58.1601676940918, + "logps/rejected": -80.13116455078125, + "loss": 1.1035, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.19290828704834, + "rewards/margins": 1.4586477279663086, + "rewards/rejected": 1.7342603206634521, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 2.8066976070404053, + "learning_rate": 9.999765338878852e-06, + "logits/chosen": -0.027603629976511, + "logits/rejected": -0.12308358401060104, + "logps/chosen": -53.49861145019531, + "logps/rejected": -63.44694137573242, + "loss": 0.924, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.437321186065674, + "rewards/margins": 1.6950843334197998, + "rewards/rejected": 1.7422370910644531, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.8494133949279785, + "learning_rate": 9.999739987897124e-06, + "logits/chosen": -0.0713205486536026, + "logits/rejected": -0.13479867577552795, + "logps/chosen": -52.27033996582031, + "logps/rejected": -56.57366180419922, + "loss": 0.9775, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.98689341545105, + "rewards/margins": 1.3878891468048096, + "rewards/rejected": 1.5990039110183716, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.7892861366271973, + "learning_rate": 9.999713336911245e-06, + "logits/chosen": 0.09345690160989761, + "logits/rejected": -0.031678494065999985, + "logps/chosen": -74.45344543457031, + "logps/rejected": -77.19050598144531, + "loss": 1.078, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.086650848388672, + "rewards/margins": 1.3593053817749023, + "rewards/rejected": 1.7273457050323486, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 3.659702777862549, + "learning_rate": 9.999685385928149e-06, + "logits/chosen": -0.06531790643930435, + "logits/rejected": -0.0323018804192543, + "logps/chosen": -74.09233856201172, + "logps/rejected": -88.57118225097656, + "loss": 1.1779, + "rewards/accuracies": 0.6875, + "rewards/chosen": 3.1275784969329834, + "rewards/margins": 1.3818397521972656, + "rewards/rejected": 1.7457386255264282, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 3.2873880863189697, + "learning_rate": 9.999656134955103e-06, + "logits/chosen": 0.017400119453668594, + "logits/rejected": -0.04944979026913643, + "logps/chosen": -71.90046691894531, + "logps/rejected": -86.36780548095703, + "loss": 0.9766, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.045818328857422, + "rewards/margins": 1.503466010093689, + "rewards/rejected": 1.5423526763916016, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 3.540911912918091, + "learning_rate": 9.999625583999715e-06, + "logits/chosen": -0.07995801419019699, + "logits/rejected": -0.14553876221179962, + "logps/chosen": -72.7781982421875, + "logps/rejected": -78.58328247070312, + "loss": 1.2454, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.809947967529297, + "rewards/margins": 0.8456104397773743, + "rewards/rejected": 1.9643375873565674, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 3.0408458709716797, + "learning_rate": 9.999593733069923e-06, + "logits/chosen": 0.0035694651305675507, + "logits/rejected": -0.04211260750889778, + "logps/chosen": -59.06121063232422, + "logps/rejected": -80.56253814697266, + "loss": 1.0159, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7074859142303467, + "rewards/margins": 1.0912353992462158, + "rewards/rejected": 1.6162505149841309, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 3.514573574066162, + "learning_rate": 9.999560582174016e-06, + "logits/chosen": 0.009016351774334908, + "logits/rejected": -0.0023085158318281174, + "logps/chosen": -61.99059295654297, + "logps/rejected": -88.89909362792969, + "loss": 1.0171, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0426576137542725, + "rewards/margins": 1.0861295461654663, + "rewards/rejected": 1.9565284252166748, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 3.4679737091064453, + "learning_rate": 9.999526131320606e-06, + "logits/chosen": -0.08791570365428925, + "logits/rejected": -0.08542875945568085, + "logps/chosen": -53.204315185546875, + "logps/rejected": -70.977783203125, + "loss": 1.0588, + "rewards/accuracies": 0.71875, + "rewards/chosen": 3.0560054779052734, + "rewards/margins": 1.207879900932312, + "rewards/rejected": 1.8481253385543823, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 3.9277658462524414, + "learning_rate": 9.999490380518658e-06, + "logits/chosen": -0.13154760003089905, + "logits/rejected": -0.25351575016975403, + "logps/chosen": -62.64826965332031, + "logps/rejected": -65.86223602294922, + "loss": 1.1436, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.918555974960327, + "rewards/margins": 1.5298724174499512, + "rewards/rejected": 1.3886833190917969, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 2.9920215606689453, + "learning_rate": 9.999453329777461e-06, + "logits/chosen": 0.014897819608449936, + "logits/rejected": -0.22311806678771973, + "logps/chosen": -64.84994506835938, + "logps/rejected": -49.53276443481445, + "loss": 0.9888, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.918053388595581, + "rewards/margins": 1.6323931217193604, + "rewards/rejected": 1.2856602668762207, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 3.035663604736328, + "learning_rate": 9.999414979106654e-06, + "logits/chosen": -0.0588434599339962, + "logits/rejected": -0.0734860822558403, + "logps/chosen": -51.25040817260742, + "logps/rejected": -56.32437515258789, + "loss": 1.0277, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.296513319015503, + "rewards/margins": 1.586956262588501, + "rewards/rejected": 1.7095569372177124, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.8137049674987793, + "learning_rate": 9.999375328516205e-06, + "logits/chosen": -0.05161597579717636, + "logits/rejected": -0.16244350373744965, + "logps/chosen": -48.867401123046875, + "logps/rejected": -53.51808166503906, + "loss": 0.9697, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7132744789123535, + "rewards/margins": 1.506608009338379, + "rewards/rejected": 1.2066667079925537, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 2.7211458683013916, + "learning_rate": 9.999334378016427e-06, + "logits/chosen": -0.04231391102075577, + "logits/rejected": -0.185062974691391, + "logps/chosen": -59.84729766845703, + "logps/rejected": -51.48360061645508, + "loss": 0.9936, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8169918060302734, + "rewards/margins": 1.7715940475463867, + "rewards/rejected": 1.0453977584838867, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 3.4365739822387695, + "learning_rate": 9.999292127617967e-06, + "logits/chosen": 0.018600571900606155, + "logits/rejected": -0.13904352486133575, + "logps/chosen": -70.50194549560547, + "logps/rejected": -88.88104248046875, + "loss": 1.0774, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.8983802795410156, + "rewards/margins": 1.2773256301879883, + "rewards/rejected": 1.6210548877716064, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 3.502842903137207, + "learning_rate": 9.999248577331808e-06, + "logits/chosen": 0.06844222545623779, + "logits/rejected": 0.022323615849018097, + "logps/chosen": -64.09044647216797, + "logps/rejected": -67.67097473144531, + "loss": 1.1581, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.6608104705810547, + "rewards/margins": 1.0492192506790161, + "rewards/rejected": 1.6115915775299072, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 3.856783390045166, + "learning_rate": 9.999203727169277e-06, + "logits/chosen": -0.00876910425722599, + "logits/rejected": -0.11184675246477127, + "logps/chosen": -66.01416778564453, + "logps/rejected": -86.54437255859375, + "loss": 1.0588, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.054853677749634, + "rewards/margins": 1.176379919052124, + "rewards/rejected": 1.8784737586975098, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 2.599158763885498, + "learning_rate": 9.999157577142033e-06, + "logits/chosen": -0.06559039652347565, + "logits/rejected": -0.14826267957687378, + "logps/chosen": -60.53675079345703, + "logps/rejected": -88.48033905029297, + "loss": 0.9371, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9633586406707764, + "rewards/margins": 1.547027349472046, + "rewards/rejected": 1.4163309335708618, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 2.9695675373077393, + "learning_rate": 9.999110127262077e-06, + "logits/chosen": -0.09949982911348343, + "logits/rejected": -0.12177674472332001, + "logps/chosen": -73.2532730102539, + "logps/rejected": -74.38162994384766, + "loss": 1.0594, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.155057430267334, + "rewards/margins": 1.2490328550338745, + "rewards/rejected": 1.906024694442749, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 3.334345579147339, + "learning_rate": 9.999061377541748e-06, + "logits/chosen": -0.09295891225337982, + "logits/rejected": -0.11261093616485596, + "logps/chosen": -52.58061981201172, + "logps/rejected": -69.38739013671875, + "loss": 0.9943, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5816242694854736, + "rewards/margins": 1.1752108335494995, + "rewards/rejected": 1.4064133167266846, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 3.9989657402038574, + "learning_rate": 9.99901132799372e-06, + "logits/chosen": -0.004113279283046722, + "logits/rejected": -0.014212670736014843, + "logps/chosen": -62.69401168823242, + "logps/rejected": -74.89227294921875, + "loss": 1.2249, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.8183493614196777, + "rewards/margins": 0.8790755271911621, + "rewards/rejected": 1.9392738342285156, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 3.096339464187622, + "learning_rate": 9.998959978631006e-06, + "logits/chosen": 0.04865167289972305, + "logits/rejected": 0.005235133692622185, + "logps/chosen": -58.64119338989258, + "logps/rejected": -82.06582641601562, + "loss": 0.9861, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8730785846710205, + "rewards/margins": 1.384037733078003, + "rewards/rejected": 1.489040732383728, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 3.4598705768585205, + "learning_rate": 9.998907329466957e-06, + "logits/chosen": -0.06384848058223724, + "logits/rejected": -0.11329671740531921, + "logps/chosen": -57.77061462402344, + "logps/rejected": -63.26806640625, + "loss": 1.0456, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8665170669555664, + "rewards/margins": 1.1199690103530884, + "rewards/rejected": 1.7465479373931885, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 2.599841356277466, + "learning_rate": 9.998853380515267e-06, + "logits/chosen": -0.05517507344484329, + "logits/rejected": -0.23606090247631073, + "logps/chosen": -55.50849533081055, + "logps/rejected": -45.08019256591797, + "loss": 0.8954, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6887753009796143, + "rewards/margins": 1.829927682876587, + "rewards/rejected": 0.8588474988937378, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 3.4832816123962402, + "learning_rate": 9.998798131789959e-06, + "logits/chosen": -0.027126457542181015, + "logits/rejected": 0.006350012496113777, + "logps/chosen": -61.91757583618164, + "logps/rejected": -75.81735229492188, + "loss": 1.1212, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.6846566200256348, + "rewards/margins": 1.0821127891540527, + "rewards/rejected": 1.6025437116622925, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 2.9918806552886963, + "learning_rate": 9.9987415833054e-06, + "logits/chosen": 0.06795856356620789, + "logits/rejected": -0.059141356498003006, + "logps/chosen": -72.87397766113281, + "logps/rejected": -84.25154113769531, + "loss": 0.8407, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8242673873901367, + "rewards/margins": 1.7203781604766846, + "rewards/rejected": 1.1038891077041626, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 3.2039031982421875, + "learning_rate": 9.998683735076293e-06, + "logits/chosen": 0.037045009434223175, + "logits/rejected": -0.15925145149230957, + "logps/chosen": -75.11713409423828, + "logps/rejected": -47.80813217163086, + "loss": 0.9904, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.722465753555298, + "rewards/margins": 1.2359952926635742, + "rewards/rejected": 1.486470341682434, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 3.1800696849823, + "learning_rate": 9.99862458711768e-06, + "logits/chosen": 0.010354779660701752, + "logits/rejected": 0.016341859474778175, + "logps/chosen": -51.94514465332031, + "logps/rejected": -71.82947540283203, + "loss": 1.0211, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.862610101699829, + "rewards/margins": 1.2575209140777588, + "rewards/rejected": 1.6050891876220703, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 2.9826321601867676, + "learning_rate": 9.99856413944494e-06, + "logits/chosen": 0.05525381118059158, + "logits/rejected": -0.06859360635280609, + "logps/chosen": -73.58131408691406, + "logps/rejected": -63.68793487548828, + "loss": 0.9646, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7243528366088867, + "rewards/margins": 1.8238377571105957, + "rewards/rejected": 0.900515079498291, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 3.5509722232818604, + "learning_rate": 9.99850239207379e-06, + "logits/chosen": -0.05796542018651962, + "logits/rejected": -0.09665747731924057, + "logps/chosen": -62.328346252441406, + "logps/rejected": -85.74530792236328, + "loss": 1.2277, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.812417984008789, + "rewards/margins": 1.0587029457092285, + "rewards/rejected": 1.753714919090271, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 2.866922616958618, + "learning_rate": 9.998439345020286e-06, + "logits/chosen": -0.04412280023097992, + "logits/rejected": -0.20400750637054443, + "logps/chosen": -57.72895050048828, + "logps/rejected": -46.87387466430664, + "loss": 0.9295, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.951169729232788, + "rewards/margins": 2.014080762863159, + "rewards/rejected": 0.9370888471603394, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 2.66847825050354, + "learning_rate": 9.998374998300819e-06, + "logits/chosen": 0.0283362939953804, + "logits/rejected": 0.028597893193364143, + "logps/chosen": -59.56160354614258, + "logps/rejected": -65.88382720947266, + "loss": 0.8892, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.196286201477051, + "rewards/margins": 1.3232940435409546, + "rewards/rejected": 1.8729923963546753, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 3.2542874813079834, + "learning_rate": 9.998309351932124e-06, + "logits/chosen": -0.002019386738538742, + "logits/rejected": -0.022514499723911285, + "logps/chosen": -60.53147888183594, + "logps/rejected": -91.82856750488281, + "loss": 1.0297, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.0266451835632324, + "rewards/margins": 1.3905009031295776, + "rewards/rejected": 1.6361441612243652, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 2.4175500869750977, + "learning_rate": 9.998242405931267e-06, + "logits/chosen": -0.022709105163812637, + "logits/rejected": -0.11293812841176987, + "logps/chosen": -56.77979278564453, + "logps/rejected": -64.90644836425781, + "loss": 0.7812, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9246151447296143, + "rewards/margins": 1.9191956520080566, + "rewards/rejected": 1.0054194927215576, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 3.2432010173797607, + "learning_rate": 9.998174160315655e-06, + "logits/chosen": -0.07965365052223206, + "logits/rejected": -0.07104234397411346, + "logps/chosen": -56.008087158203125, + "logps/rejected": -73.97193908691406, + "loss": 1.0331, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.791998863220215, + "rewards/margins": 1.529675006866455, + "rewards/rejected": 1.2623239755630493, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 3.7014260292053223, + "learning_rate": 9.998104615103032e-06, + "logits/chosen": -0.05285903066396713, + "logits/rejected": -0.08332974463701248, + "logps/chosen": -60.01043701171875, + "logps/rejected": -70.28582763671875, + "loss": 1.0302, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.839449405670166, + "rewards/margins": 1.6668447256088257, + "rewards/rejected": 1.1726046800613403, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 3.57125186920166, + "learning_rate": 9.998033770311485e-06, + "logits/chosen": -0.07340162992477417, + "logits/rejected": -0.038314707577228546, + "logps/chosen": -51.46995544433594, + "logps/rejected": -68.1280517578125, + "loss": 1.0883, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.5380873680114746, + "rewards/margins": 0.6064646244049072, + "rewards/rejected": 1.931622862815857, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 2.9917192459106445, + "learning_rate": 9.99796162595943e-06, + "logits/chosen": 0.015482936054468155, + "logits/rejected": 0.019359635189175606, + "logps/chosen": -61.00410461425781, + "logps/rejected": -107.36070251464844, + "loss": 0.904, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1259186267852783, + "rewards/margins": 1.54483962059021, + "rewards/rejected": 1.581079125404358, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 4.125944137573242, + "learning_rate": 9.997888182065627e-06, + "logits/chosen": 0.020910421386361122, + "logits/rejected": 0.057605840265750885, + "logps/chosen": -68.62923431396484, + "logps/rejected": -84.44517517089844, + "loss": 1.2528, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.5848801136016846, + "rewards/margins": 1.1454752683639526, + "rewards/rejected": 1.4394049644470215, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 3.588587999343872, + "learning_rate": 9.997813438649175e-06, + "logits/chosen": -0.10463173687458038, + "logits/rejected": -0.06443554908037186, + "logps/chosen": -55.42386245727539, + "logps/rejected": -92.3619155883789, + "loss": 1.0845, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.843074083328247, + "rewards/margins": 0.9032634496688843, + "rewards/rejected": 1.9398106336593628, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 3.4516804218292236, + "learning_rate": 9.997737395729504e-06, + "logits/chosen": 0.06148179620504379, + "logits/rejected": -0.006758004426956177, + "logps/chosen": -62.062931060791016, + "logps/rejected": -86.979736328125, + "loss": 1.0018, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.601923942565918, + "rewards/margins": 1.1556892395019531, + "rewards/rejected": 1.4462347030639648, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 3.73701548576355, + "learning_rate": 9.99766005332639e-06, + "logits/chosen": -0.034028347581624985, + "logits/rejected": -0.14404082298278809, + "logps/chosen": -67.96522521972656, + "logps/rejected": -55.164894104003906, + "loss": 1.1788, + "rewards/accuracies": 0.71875, + "rewards/chosen": 3.0976881980895996, + "rewards/margins": 1.6480218172073364, + "rewards/rejected": 1.4496665000915527, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 3.553609609603882, + "learning_rate": 9.99758141145994e-06, + "logits/chosen": -0.031238410621881485, + "logits/rejected": -0.10179577022790909, + "logps/chosen": -51.723388671875, + "logps/rejected": -69.338134765625, + "loss": 1.121, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.6213219165802, + "rewards/margins": 1.061032772064209, + "rewards/rejected": 1.5602887868881226, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 3.755044937133789, + "learning_rate": 9.997501470150606e-06, + "logits/chosen": 0.03518126159906387, + "logits/rejected": -0.043350741267204285, + "logps/chosen": -73.06661987304688, + "logps/rejected": -70.15767669677734, + "loss": 1.1329, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.941957712173462, + "rewards/margins": 1.133691668510437, + "rewards/rejected": 1.8082659244537354, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 3.497908353805542, + "learning_rate": 9.99742022941917e-06, + "logits/chosen": 0.056131523102521896, + "logits/rejected": 0.12236940860748291, + "logps/chosen": -73.78364562988281, + "logps/rejected": -109.87896728515625, + "loss": 1.1252, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7799201011657715, + "rewards/margins": 1.1792058944702148, + "rewards/rejected": 1.6007142066955566, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 2.71358060836792, + "learning_rate": 9.997337689286759e-06, + "logits/chosen": -0.15245100855827332, + "logits/rejected": -0.19022782146930695, + "logps/chosen": -50.32982635498047, + "logps/rejected": -61.30581283569336, + "loss": 0.945, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7927329540252686, + "rewards/margins": 1.6051889657974243, + "rewards/rejected": 1.1875442266464233, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 3.806593179702759, + "learning_rate": 9.99725384977483e-06, + "logits/chosen": -0.026418499648571014, + "logits/rejected": -0.07101378589868546, + "logps/chosen": -59.44581604003906, + "logps/rejected": -65.62005615234375, + "loss": 1.0806, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.945058822631836, + "rewards/margins": 1.564304232597351, + "rewards/rejected": 1.3807544708251953, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 3.265655755996704, + "learning_rate": 9.997168710905187e-06, + "logits/chosen": -0.0032281111925840378, + "logits/rejected": -0.07298119366168976, + "logps/chosen": -63.39714813232422, + "logps/rejected": -69.31575012207031, + "loss": 0.981, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7977383136749268, + "rewards/margins": 1.6510064601898193, + "rewards/rejected": 1.146732211112976, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 3.4294581413269043, + "learning_rate": 9.997082272699966e-06, + "logits/chosen": -0.02703140303492546, + "logits/rejected": -0.11627551913261414, + "logps/chosen": -70.42706298828125, + "logps/rejected": -87.13349151611328, + "loss": 1.0429, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.987126350402832, + "rewards/margins": 1.588019847869873, + "rewards/rejected": 1.3991062641143799, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 3.1271767616271973, + "learning_rate": 9.996994535181641e-06, + "logits/chosen": -0.0699058324098587, + "logits/rejected": -0.13659030199050903, + "logps/chosen": -65.52377319335938, + "logps/rejected": -67.21597290039062, + "loss": 1.0343, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.063401460647583, + "rewards/margins": 1.6412932872772217, + "rewards/rejected": 1.4221081733703613, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 2.851306915283203, + "learning_rate": 9.996905498373027e-06, + "logits/chosen": 0.11769165098667145, + "logits/rejected": -0.0016734832897782326, + "logps/chosen": -61.811912536621094, + "logps/rejected": -65.6927719116211, + "loss": 0.8776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8726437091827393, + "rewards/margins": 1.923632264137268, + "rewards/rejected": 0.9490118622779846, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.7233465909957886, + "learning_rate": 9.996815162297272e-06, + "logits/chosen": 0.024383800104260445, + "logits/rejected": -0.10074977576732635, + "logps/chosen": -55.79561996459961, + "logps/rejected": -66.37539672851562, + "loss": 0.7592, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.131089210510254, + "rewards/margins": 2.1540284156799316, + "rewards/rejected": 0.9770609140396118, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 3.4741854667663574, + "learning_rate": 9.996723526977869e-06, + "logits/chosen": -0.08490478247404099, + "logits/rejected": -0.037364661693573, + "logps/chosen": -55.76829147338867, + "logps/rejected": -83.05870819091797, + "loss": 0.9942, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.8367953300476074, + "rewards/margins": 1.237148642539978, + "rewards/rejected": 1.5996464490890503, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 2.7470438480377197, + "learning_rate": 9.99663059243864e-06, + "logits/chosen": -0.15032806992530823, + "logits/rejected": -0.2669951915740967, + "logps/chosen": -54.210731506347656, + "logps/rejected": -62.53721237182617, + "loss": 0.9191, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.854806900024414, + "rewards/margins": 1.7622209787368774, + "rewards/rejected": 1.092585802078247, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 3.070969343185425, + "learning_rate": 9.99653635870375e-06, + "logits/chosen": -0.08681677281856537, + "logits/rejected": -0.12761908769607544, + "logps/chosen": -63.3287353515625, + "logps/rejected": -73.29615020751953, + "loss": 0.9725, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9835991859436035, + "rewards/margins": 1.4400986433029175, + "rewards/rejected": 1.5435004234313965, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 4.260465621948242, + "learning_rate": 9.996440825797705e-06, + "logits/chosen": 0.1061960756778717, + "logits/rejected": 0.010251779109239578, + "logps/chosen": -68.97125244140625, + "logps/rejected": -65.25022888183594, + "loss": 1.1076, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.7731215953826904, + "rewards/margins": 0.9410384893417358, + "rewards/rejected": 1.8320833444595337, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 3.353555202484131, + "learning_rate": 9.996343993745341e-06, + "logits/chosen": 0.03311352804303169, + "logits/rejected": -0.05912456661462784, + "logps/chosen": -62.103485107421875, + "logps/rejected": -70.32291412353516, + "loss": 0.9516, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9197065830230713, + "rewards/margins": 1.6448445320129395, + "rewards/rejected": 1.2748620510101318, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 5.072964668273926, + "learning_rate": 9.996245862571839e-06, + "logits/chosen": 0.007987607270479202, + "logits/rejected": 0.029951246455311775, + "logps/chosen": -69.67271423339844, + "logps/rejected": -93.65757751464844, + "loss": 1.2981, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.956468105316162, + "rewards/margins": 1.0623779296875, + "rewards/rejected": 1.8940904140472412, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 3.48819899559021, + "learning_rate": 9.996146432302709e-06, + "logits/chosen": -0.03233785182237625, + "logits/rejected": 0.020512381568551064, + "logps/chosen": -73.53594970703125, + "logps/rejected": -95.73696899414062, + "loss": 1.0749, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.480226755142212, + "rewards/margins": 1.3706369400024414, + "rewards/rejected": 1.10958993434906, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 4.213435173034668, + "learning_rate": 9.99604570296381e-06, + "logits/chosen": -0.13262081146240234, + "logits/rejected": -0.06321091204881668, + "logps/chosen": -61.512977600097656, + "logps/rejected": -78.87958526611328, + "loss": 1.2164, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.4635496139526367, + "rewards/margins": 0.5732765793800354, + "rewards/rejected": 1.890272855758667, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 2.777012348175049, + "learning_rate": 9.995943674581332e-06, + "logits/chosen": -0.18332652747631073, + "logits/rejected": -0.227662593126297, + "logps/chosen": -51.81045150756836, + "logps/rejected": -67.0077133178711, + "loss": 0.9016, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.697154998779297, + "rewards/margins": 1.882561445236206, + "rewards/rejected": 0.8145936727523804, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 3.8460354804992676, + "learning_rate": 9.9958403471818e-06, + "logits/chosen": -0.03394360840320587, + "logits/rejected": -0.04576661065220833, + "logps/chosen": -61.68536376953125, + "logps/rejected": -79.67647552490234, + "loss": 1.1489, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.534785747528076, + "rewards/margins": 1.0176286697387695, + "rewards/rejected": 1.5171573162078857, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 3.1297318935394287, + "learning_rate": 9.995735720792084e-06, + "logits/chosen": -0.08028427511453629, + "logits/rejected": -0.12058421969413757, + "logps/chosen": -60.2733154296875, + "logps/rejected": -85.10725402832031, + "loss": 0.9948, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.702704668045044, + "rewards/margins": 1.4771209955215454, + "rewards/rejected": 1.225583553314209, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 3.2262461185455322, + "learning_rate": 9.995629795439388e-06, + "logits/chosen": 0.008884275332093239, + "logits/rejected": -0.02542608417570591, + "logps/chosen": -59.75002670288086, + "logps/rejected": -69.48421478271484, + "loss": 0.9631, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9201910495758057, + "rewards/margins": 1.552009105682373, + "rewards/rejected": 1.3681817054748535, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 3.6523168087005615, + "learning_rate": 9.995522571151255e-06, + "logits/chosen": -0.04748179391026497, + "logits/rejected": -0.06600961089134216, + "logps/chosen": -62.82748031616211, + "logps/rejected": -88.1055908203125, + "loss": 1.0334, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.8256993293762207, + "rewards/margins": 1.131322979927063, + "rewards/rejected": 1.6943764686584473, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 3.5258851051330566, + "learning_rate": 9.995414047955562e-06, + "logits/chosen": -0.17693519592285156, + "logits/rejected": -0.16302849352359772, + "logps/chosen": -72.92753601074219, + "logps/rejected": -63.422264099121094, + "loss": 1.1206, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5284814834594727, + "rewards/margins": 1.2730610370635986, + "rewards/rejected": 1.255420207977295, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 3.457878828048706, + "learning_rate": 9.995304225880529e-06, + "logits/chosen": 0.0004648342728614807, + "logits/rejected": -0.03840690851211548, + "logps/chosen": -60.80211639404297, + "logps/rejected": -65.23381805419922, + "loss": 0.9996, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.74680233001709, + "rewards/margins": 1.208022117614746, + "rewards/rejected": 1.5387799739837646, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 4.465905666351318, + "learning_rate": 9.995193104954712e-06, + "logits/chosen": 0.14326585829257965, + "logits/rejected": 0.32282426953315735, + "logps/chosen": -64.5427017211914, + "logps/rejected": -106.22634887695312, + "loss": 1.2961, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.8325328826904297, + "rewards/margins": 0.6993204355239868, + "rewards/rejected": 2.1332123279571533, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 3.7174603939056396, + "learning_rate": 9.995080685207e-06, + "logits/chosen": 0.0022806432098150253, + "logits/rejected": -0.04772566631436348, + "logps/chosen": -59.92386245727539, + "logps/rejected": -69.24701690673828, + "loss": 1.066, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.8421921730041504, + "rewards/margins": 1.2175226211547852, + "rewards/rejected": 1.6246697902679443, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 3.50303053855896, + "learning_rate": 9.994966966666627e-06, + "logits/chosen": -0.00366981141269207, + "logits/rejected": -0.03093855455517769, + "logps/chosen": -59.70466232299805, + "logps/rejected": -67.80409240722656, + "loss": 1.0487, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7230305671691895, + "rewards/margins": 1.4397631883621216, + "rewards/rejected": 1.2832671403884888, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 3.324565887451172, + "learning_rate": 9.994851949363163e-06, + "logits/chosen": -0.10123173147439957, + "logits/rejected": -0.12579065561294556, + "logps/chosen": -64.59476470947266, + "logps/rejected": -80.89784240722656, + "loss": 0.9833, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6167521476745605, + "rewards/margins": 1.3844321966171265, + "rewards/rejected": 1.2323198318481445, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 2.756503105163574, + "learning_rate": 9.99473563332651e-06, + "logits/chosen": 0.059805937111377716, + "logits/rejected": -0.030325056985020638, + "logps/chosen": -66.24658966064453, + "logps/rejected": -75.22474670410156, + "loss": 0.887, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.822403907775879, + "rewards/margins": 1.7022185325622559, + "rewards/rejected": 1.1201852560043335, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 3.266810894012451, + "learning_rate": 9.994618018586913e-06, + "logits/chosen": -0.01218704879283905, + "logits/rejected": -0.09648609906435013, + "logps/chosen": -66.19169616699219, + "logps/rejected": -85.58779907226562, + "loss": 1.0198, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7855424880981445, + "rewards/margins": 1.5825233459472656, + "rewards/rejected": 1.203019142150879, + "step": 91 + }, + { + "epoch": 0.02, + "grad_norm": 2.608541965484619, + "learning_rate": 9.994499105174956e-06, + "logits/chosen": -0.10711883008480072, + "logits/rejected": -0.1682967245578766, + "logps/chosen": -69.24922180175781, + "logps/rejected": -77.65547180175781, + "loss": 0.8747, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089376926422119, + "rewards/margins": 1.8324445486068726, + "rewards/rejected": 1.2569324970245361, + "step": 92 + }, + { + "epoch": 0.02, + "grad_norm": 4.267101287841797, + "learning_rate": 9.994378893121555e-06, + "logits/chosen": -0.019218809902668, + "logits/rejected": -0.017939655110239983, + "logps/chosen": -53.862518310546875, + "logps/rejected": -77.2769546508789, + "loss": 1.0283, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.52712345123291, + "rewards/margins": 1.6012585163116455, + "rewards/rejected": 0.9258650541305542, + "step": 93 + }, + { + "epoch": 0.02, + "grad_norm": 4.305262088775635, + "learning_rate": 9.99425738245797e-06, + "logits/chosen": -0.10150554031133652, + "logits/rejected": -0.12889200448989868, + "logps/chosen": -59.4300537109375, + "logps/rejected": -69.40481567382812, + "loss": 1.1257, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.7065134048461914, + "rewards/margins": 1.451493263244629, + "rewards/rejected": 1.2550201416015625, + "step": 94 + }, + { + "epoch": 0.02, + "grad_norm": 4.078916549682617, + "learning_rate": 9.994134573215792e-06, + "logits/chosen": 0.06800353527069092, + "logits/rejected": 0.036012522876262665, + "logps/chosen": -73.67912292480469, + "logps/rejected": -92.09085083007812, + "loss": 1.0085, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.61240816116333, + "rewards/margins": 1.3496320247650146, + "rewards/rejected": 1.2627761363983154, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 3.6155428886413574, + "learning_rate": 9.994010465426958e-06, + "logits/chosen": 0.025626692920923233, + "logits/rejected": -0.09501317143440247, + "logps/chosen": -64.12309265136719, + "logps/rejected": -85.5754623413086, + "loss": 1.0237, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7839789390563965, + "rewards/margins": 1.5549958944320679, + "rewards/rejected": 1.2289831638336182, + "step": 96 + }, + { + "epoch": 0.02, + "grad_norm": 4.29647159576416, + "learning_rate": 9.993885059123731e-06, + "logits/chosen": 0.05784708634018898, + "logits/rejected": 0.04794265329837799, + "logps/chosen": -70.9309310913086, + "logps/rejected": -98.09900665283203, + "loss": 1.0754, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.852473497390747, + "rewards/margins": 1.1998182535171509, + "rewards/rejected": 1.6526553630828857, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 3.308706283569336, + "learning_rate": 9.993758354338725e-06, + "logits/chosen": -0.1295693814754486, + "logits/rejected": -0.10508604347705841, + "logps/chosen": -60.85618591308594, + "logps/rejected": -89.45560455322266, + "loss": 1.0257, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.845616102218628, + "rewards/margins": 1.3221569061279297, + "rewards/rejected": 1.5234591960906982, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 3.672410726547241, + "learning_rate": 9.993630351104881e-06, + "logits/chosen": 0.19309626519680023, + "logits/rejected": 0.039567720144987106, + "logps/chosen": -83.29027557373047, + "logps/rejected": -86.22174835205078, + "loss": 0.9528, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.1612608432769775, + "rewards/margins": 1.622215747833252, + "rewards/rejected": 1.5390450954437256, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 3.4372124671936035, + "learning_rate": 9.993501049455485e-06, + "logits/chosen": -0.10890310257673264, + "logits/rejected": -0.11576356738805771, + "logps/chosen": -64.34217834472656, + "logps/rejected": -94.13896942138672, + "loss": 0.9744, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.963901996612549, + "rewards/margins": 1.6598089933395386, + "rewards/rejected": 1.304093360900879, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 3.189305305480957, + "learning_rate": 9.993000646930633e-06, + "logits/chosen": -0.021481666713953018, + "logits/rejected": -0.1152510941028595, + "logps/chosen": -55.86531066894531, + "logps/rejected": -54.577796936035156, + "loss": 0.9595, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.783210277557373, + "rewards/margins": 1.582822322845459, + "rewards/rejected": 1.2003880739212036, + "step": 101 + }, + { + "epoch": 0.03, + "grad_norm": 3.09198260307312, + "learning_rate": 9.9928613928892e-06, + "logits/chosen": 0.07887043803930283, + "logits/rejected": -0.051103148609399796, + "logps/chosen": -62.31296920776367, + "logps/rejected": -51.836639404296875, + "loss": 1.027, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7692008018493652, + "rewards/margins": 1.570551872253418, + "rewards/rejected": 1.1986486911773682, + "step": 102 + }, + { + "epoch": 0.03, + "grad_norm": 3.3179771900177, + "learning_rate": 9.992720768199575e-06, + "logits/chosen": 0.07788385450839996, + "logits/rejected": -0.03807784616947174, + "logps/chosen": -52.1124267578125, + "logps/rejected": -55.827552795410156, + "loss": 0.9666, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6761105060577393, + "rewards/margins": 1.8395299911499023, + "rewards/rejected": 0.836580753326416, + "step": 103 + }, + { + "epoch": 0.03, + "grad_norm": 3.3780694007873535, + "learning_rate": 9.992578772900366e-06, + "logits/chosen": 0.0219680555164814, + "logits/rejected": -0.043777819722890854, + "logps/chosen": -67.86905670166016, + "logps/rejected": -62.45096969604492, + "loss": 0.9692, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9889369010925293, + "rewards/margins": 1.9195566177368164, + "rewards/rejected": 1.0693801641464233, + "step": 104 + }, + { + "epoch": 0.03, + "grad_norm": 3.0133235454559326, + "learning_rate": 9.992435407030551e-06, + "logits/chosen": 0.011786199174821377, + "logits/rejected": -0.19354335963726044, + "logps/chosen": -44.875404357910156, + "logps/rejected": -43.22931671142578, + "loss": 0.8082, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8075242042541504, + "rewards/margins": 2.146634578704834, + "rewards/rejected": 0.6608898639678955, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 2.9169013500213623, + "learning_rate": 9.992290670629491e-06, + "logits/chosen": 0.039023786783218384, + "logits/rejected": -0.12400276213884354, + "logps/chosen": -49.03104782104492, + "logps/rejected": -56.87084197998047, + "loss": 0.857, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0316452980041504, + "rewards/margins": 1.978754997253418, + "rewards/rejected": 1.052890658378601, + "step": 106 + }, + { + "epoch": 0.03, + "grad_norm": 2.652022361755371, + "learning_rate": 9.992144563736916e-06, + "logits/chosen": -0.06096121668815613, + "logits/rejected": -0.17980492115020752, + "logps/chosen": -65.66019439697266, + "logps/rejected": -50.38265609741211, + "loss": 0.9715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0079050064086914, + "rewards/margins": 1.8630030155181885, + "rewards/rejected": 1.144901990890503, + "step": 107 + }, + { + "epoch": 0.03, + "grad_norm": 3.19099497795105, + "learning_rate": 9.991997086392936e-06, + "logits/chosen": 0.04372570663690567, + "logits/rejected": -0.0073449574410915375, + "logps/chosen": -49.89601135253906, + "logps/rejected": -59.217742919921875, + "loss": 0.8743, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8189992904663086, + "rewards/margins": 2.2536139488220215, + "rewards/rejected": 0.5653852820396423, + "step": 108 + }, + { + "epoch": 0.03, + "grad_norm": 3.546755313873291, + "learning_rate": 9.991848238638037e-06, + "logits/chosen": 0.034367822110652924, + "logits/rejected": -0.20382915437221527, + "logps/chosen": -58.32036590576172, + "logps/rejected": -48.84999084472656, + "loss": 0.9758, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.664569616317749, + "rewards/margins": 1.725881576538086, + "rewards/rejected": 0.9386880397796631, + "step": 109 + }, + { + "epoch": 0.03, + "grad_norm": 3.3458333015441895, + "learning_rate": 9.99169802051308e-06, + "logits/chosen": 0.042452260851860046, + "logits/rejected": -0.09810177981853485, + "logps/chosen": -54.382408142089844, + "logps/rejected": -58.70013427734375, + "loss": 0.9161, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6432502269744873, + "rewards/margins": 1.7500165700912476, + "rewards/rejected": 0.8932337164878845, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 3.5878045558929443, + "learning_rate": 9.991546432059306e-06, + "logits/chosen": 0.0924292504787445, + "logits/rejected": -0.003370290622115135, + "logps/chosen": -68.17169952392578, + "logps/rejected": -52.196468353271484, + "loss": 1.1011, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5111305713653564, + "rewards/margins": 1.2631406784057617, + "rewards/rejected": 1.2479897737503052, + "step": 111 + }, + { + "epoch": 0.03, + "grad_norm": 3.1082093715667725, + "learning_rate": 9.991393473318326e-06, + "logits/chosen": 0.19067718088626862, + "logits/rejected": 0.03686385601758957, + "logps/chosen": -60.524375915527344, + "logps/rejected": -65.86453247070312, + "loss": 0.9035, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.998655319213867, + "rewards/margins": 1.848504900932312, + "rewards/rejected": 1.1501505374908447, + "step": 112 + }, + { + "epoch": 0.03, + "grad_norm": 3.7037384510040283, + "learning_rate": 9.991239144332132e-06, + "logits/chosen": -0.05271312966942787, + "logits/rejected": -0.11996916681528091, + "logps/chosen": -54.48086166381836, + "logps/rejected": -72.12334442138672, + "loss": 1.0423, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.893026351928711, + "rewards/margins": 1.8019968271255493, + "rewards/rejected": 1.0910296440124512, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 2.391525983810425, + "learning_rate": 9.99108344514309e-06, + "logits/chosen": -0.06777942180633545, + "logits/rejected": -0.24276772141456604, + "logps/chosen": -67.926025390625, + "logps/rejected": -44.07005310058594, + "loss": 0.8573, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9906606674194336, + "rewards/margins": 2.4962551593780518, + "rewards/rejected": 0.49440550804138184, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 2.9481112957000732, + "learning_rate": 9.990926375793943e-06, + "logits/chosen": -0.07925830036401749, + "logits/rejected": -0.17415733635425568, + "logps/chosen": -51.56864547729492, + "logps/rejected": -49.45170593261719, + "loss": 0.9722, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.630187749862671, + "rewards/margins": 1.8877034187316895, + "rewards/rejected": 0.7424843907356262, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 2.9049386978149414, + "learning_rate": 9.990767936327812e-06, + "logits/chosen": -0.038400571793317795, + "logits/rejected": -0.09222601354122162, + "logps/chosen": -51.6354866027832, + "logps/rejected": -63.541805267333984, + "loss": 0.8937, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7280113697052, + "rewards/margins": 1.6709275245666504, + "rewards/rejected": 1.0570838451385498, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 3.113764524459839, + "learning_rate": 9.990608126788188e-06, + "logits/chosen": 0.10038081556558609, + "logits/rejected": -0.0872790515422821, + "logps/chosen": -62.55716323852539, + "logps/rejected": -59.28609848022461, + "loss": 0.9445, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6159746646881104, + "rewards/margins": 1.4925097227096558, + "rewards/rejected": 1.1234649419784546, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 3.2167885303497314, + "learning_rate": 9.990446947218946e-06, + "logits/chosen": -0.04900059849023819, + "logits/rejected": -0.25365743041038513, + "logps/chosen": -56.46541976928711, + "logps/rejected": -47.83744812011719, + "loss": 0.9423, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8199403285980225, + "rewards/margins": 2.207815647125244, + "rewards/rejected": 0.6121245622634888, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 2.6971843242645264, + "learning_rate": 9.990284397664331e-06, + "logits/chosen": 0.035556308925151825, + "logits/rejected": -0.16379673779010773, + "logps/chosen": -52.35251998901367, + "logps/rejected": -45.89836883544922, + "loss": 0.8652, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.943171739578247, + "rewards/margins": 1.9933918714523315, + "rewards/rejected": 0.9497796893119812, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 2.8227524757385254, + "learning_rate": 9.990120478168968e-06, + "logits/chosen": -0.023162946105003357, + "logits/rejected": -0.17886964976787567, + "logps/chosen": -58.474945068359375, + "logps/rejected": -52.10554885864258, + "loss": 0.8706, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1510348320007324, + "rewards/margins": 2.208745002746582, + "rewards/rejected": 0.9422898292541504, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 3.1517927646636963, + "learning_rate": 9.989955188777853e-06, + "logits/chosen": 0.05676406994462013, + "logits/rejected": -0.167734757065773, + "logps/chosen": -72.47442626953125, + "logps/rejected": -57.27033233642578, + "loss": 0.9246, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.949442148208618, + "rewards/margins": 1.7839007377624512, + "rewards/rejected": 1.1655410528182983, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 2.9473752975463867, + "learning_rate": 9.989788529536367e-06, + "logits/chosen": -0.10192751884460449, + "logits/rejected": -0.16901209950447083, + "logps/chosen": -59.32941436767578, + "logps/rejected": -61.36375427246094, + "loss": 1.0771, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.718956708908081, + "rewards/margins": 1.7292835712432861, + "rewards/rejected": 0.9896730184555054, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 3.020989179611206, + "learning_rate": 9.989620500490255e-06, + "logits/chosen": 0.007277403026819229, + "logits/rejected": -0.164003387093544, + "logps/chosen": -60.70405578613281, + "logps/rejected": -47.57758331298828, + "loss": 0.9713, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.080604076385498, + "rewards/margins": 2.106873035430908, + "rewards/rejected": 0.9737311601638794, + "step": 123 + }, + { + "epoch": 0.03, + "grad_norm": 3.8983752727508545, + "learning_rate": 9.98945110168565e-06, + "logits/chosen": -0.01942862570285797, + "logits/rejected": -0.1340469866991043, + "logps/chosen": -60.37934875488281, + "logps/rejected": -56.674407958984375, + "loss": 1.0345, + "rewards/accuracies": 0.71875, + "rewards/chosen": 3.039041519165039, + "rewards/margins": 1.756483554840088, + "rewards/rejected": 1.2825583219528198, + "step": 124 + }, + { + "epoch": 0.03, + "grad_norm": 2.151283025741577, + "learning_rate": 9.989280333169054e-06, + "logits/chosen": -0.016409922391176224, + "logits/rejected": -0.11613002419471741, + "logps/chosen": -46.87004089355469, + "logps/rejected": -59.98579406738281, + "loss": 0.7692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.884451389312744, + "rewards/margins": 2.4049925804138184, + "rewards/rejected": 0.47945863008499146, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 3.0232625007629395, + "learning_rate": 9.989108194987346e-06, + "logits/chosen": 0.022830218076705933, + "logits/rejected": -0.057781580835580826, + "logps/chosen": -58.14380645751953, + "logps/rejected": -68.00019836425781, + "loss": 0.9633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.921656847000122, + "rewards/margins": 1.7778064012527466, + "rewards/rejected": 1.143850564956665, + "step": 126 + }, + { + "epoch": 0.03, + "grad_norm": 3.114375352859497, + "learning_rate": 9.988934687187784e-06, + "logits/chosen": 0.0005736891180276871, + "logits/rejected": -0.0924849584698677, + "logps/chosen": -52.568111419677734, + "logps/rejected": -59.12225341796875, + "loss": 0.9013, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.918546199798584, + "rewards/margins": 1.974504828453064, + "rewards/rejected": 0.94404137134552, + "step": 127 + }, + { + "epoch": 0.03, + "grad_norm": 3.0840225219726562, + "learning_rate": 9.988759809817995e-06, + "logits/chosen": 0.033524882048368454, + "logits/rejected": -0.21203148365020752, + "logps/chosen": -55.859405517578125, + "logps/rejected": -44.35686492919922, + "loss": 0.8328, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.910928726196289, + "rewards/margins": 2.2887697219848633, + "rewards/rejected": 0.6221591234207153, + "step": 128 + }, + { + "epoch": 0.03, + "grad_norm": 2.7618799209594727, + "learning_rate": 9.988583562925989e-06, + "logits/chosen": 0.018217379227280617, + "logits/rejected": -0.166998028755188, + "logps/chosen": -70.16156768798828, + "logps/rejected": -57.229766845703125, + "loss": 0.8789, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8261051177978516, + "rewards/margins": 1.8212637901306152, + "rewards/rejected": 1.0048409700393677, + "step": 129 + }, + { + "epoch": 0.03, + "grad_norm": 3.75726318359375, + "learning_rate": 9.988405946560151e-06, + "logits/chosen": 0.07181181013584137, + "logits/rejected": -0.04546160250902176, + "logps/chosen": -55.7495231628418, + "logps/rejected": -57.56371307373047, + "loss": 1.0212, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.771509885787964, + "rewards/margins": 1.7307448387145996, + "rewards/rejected": 1.0407651662826538, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 3.2683424949645996, + "learning_rate": 9.988226960769241e-06, + "logits/chosen": 0.08656000345945358, + "logits/rejected": -0.09638756513595581, + "logps/chosen": -66.08636474609375, + "logps/rejected": -52.41812515258789, + "loss": 1.037, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.877371072769165, + "rewards/margins": 1.8647329807281494, + "rewards/rejected": 1.0126378536224365, + "step": 131 + }, + { + "epoch": 0.03, + "grad_norm": 3.208860397338867, + "learning_rate": 9.98804660560239e-06, + "logits/chosen": -0.01463126577436924, + "logits/rejected": -0.17504476010799408, + "logps/chosen": -59.915550231933594, + "logps/rejected": -50.546775817871094, + "loss": 1.0876, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9181904792785645, + "rewards/margins": 1.8733460903167725, + "rewards/rejected": 1.0448445081710815, + "step": 132 + }, + { + "epoch": 0.03, + "grad_norm": 3.3311939239501953, + "learning_rate": 9.987864881109113e-06, + "logits/chosen": 0.014840370044112206, + "logits/rejected": -0.026256686076521873, + "logps/chosen": -60.466026306152344, + "logps/rejected": -65.27140808105469, + "loss": 1.0249, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9687695503234863, + "rewards/margins": 1.4354562759399414, + "rewards/rejected": 1.5333133935928345, + "step": 133 + }, + { + "epoch": 0.03, + "grad_norm": 3.4625327587127686, + "learning_rate": 9.987681787339297e-06, + "logits/chosen": 0.01115868054330349, + "logits/rejected": -0.0708470419049263, + "logps/chosen": -63.40885543823242, + "logps/rejected": -74.07566833496094, + "loss": 1.0196, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8089189529418945, + "rewards/margins": 1.7201740741729736, + "rewards/rejected": 1.0887449979782104, + "step": 134 + }, + { + "epoch": 0.03, + "grad_norm": 3.633497714996338, + "learning_rate": 9.987497324343206e-06, + "logits/chosen": 0.0909426361322403, + "logits/rejected": -0.14822334051132202, + "logps/chosen": -68.58879089355469, + "logps/rejected": -43.176353454589844, + "loss": 1.0703, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.580142021179199, + "rewards/margins": 1.3490761518478394, + "rewards/rejected": 1.2310662269592285, + "step": 135 + }, + { + "epoch": 0.03, + "grad_norm": 3.327320098876953, + "learning_rate": 9.987311492171474e-06, + "logits/chosen": 0.010527094826102257, + "logits/rejected": -0.12388010323047638, + "logps/chosen": -57.165184020996094, + "logps/rejected": -53.262081146240234, + "loss": 0.9406, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.743960380554199, + "rewards/margins": 1.7760015726089478, + "rewards/rejected": 0.9679589867591858, + "step": 136 + }, + { + "epoch": 0.03, + "grad_norm": 2.7211217880249023, + "learning_rate": 9.987124290875123e-06, + "logits/chosen": 0.09807469695806503, + "logits/rejected": -0.11804808676242828, + "logps/chosen": -50.21159362792969, + "logps/rejected": -49.645511627197266, + "loss": 0.7866, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9705810546875, + "rewards/margins": 2.1949591636657715, + "rewards/rejected": 0.7756219506263733, + "step": 137 + }, + { + "epoch": 0.03, + "grad_norm": 3.703263521194458, + "learning_rate": 9.986935720505539e-06, + "logits/chosen": -0.05306866019964218, + "logits/rejected": -0.12455058842897415, + "logps/chosen": -48.65645217895508, + "logps/rejected": -54.004737854003906, + "loss": 1.1061, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.684549570083618, + "rewards/margins": 1.2483534812927246, + "rewards/rejected": 1.4361960887908936, + "step": 138 + }, + { + "epoch": 0.03, + "grad_norm": 3.6137983798980713, + "learning_rate": 9.98674578111449e-06, + "logits/chosen": 0.12141013145446777, + "logits/rejected": -0.059521183371543884, + "logps/chosen": -60.84081268310547, + "logps/rejected": -54.018253326416016, + "loss": 1.063, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8944082260131836, + "rewards/margins": 1.2839678525924683, + "rewards/rejected": 1.6104404926300049, + "step": 139 + }, + { + "epoch": 0.04, + "grad_norm": 3.276249647140503, + "learning_rate": 9.98655447275412e-06, + "logits/chosen": 0.06999555230140686, + "logits/rejected": 0.010182622820138931, + "logps/chosen": -61.844512939453125, + "logps/rejected": -67.12357330322266, + "loss": 0.9486, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.972879648208618, + "rewards/margins": 1.6975497007369995, + "rewards/rejected": 1.2753299474716187, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 3.8105087280273438, + "learning_rate": 9.986361795476945e-06, + "logits/chosen": 0.01689644530415535, + "logits/rejected": -0.14044618606567383, + "logps/chosen": -63.152225494384766, + "logps/rejected": -52.22819519042969, + "loss": 1.1679, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.7120511531829834, + "rewards/margins": 1.3448126316070557, + "rewards/rejected": 1.3672384023666382, + "step": 141 + }, + { + "epoch": 0.04, + "grad_norm": 2.546983003616333, + "learning_rate": 9.98616774933586e-06, + "logits/chosen": -0.09994504600763321, + "logits/rejected": -0.1816963255405426, + "logps/chosen": -57.56393051147461, + "logps/rejected": -55.03179168701172, + "loss": 0.9095, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0482635498046875, + "rewards/margins": 2.004216194152832, + "rewards/rejected": 1.044047236442566, + "step": 142 + }, + { + "epoch": 0.04, + "grad_norm": 3.1275317668914795, + "learning_rate": 9.985972334384136e-06, + "logits/chosen": -0.04038657993078232, + "logits/rejected": -0.15985967218875885, + "logps/chosen": -58.82177734375, + "logps/rejected": -54.77827072143555, + "loss": 1.0334, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.927877426147461, + "rewards/margins": 1.722095012664795, + "rewards/rejected": 1.205782413482666, + "step": 143 + }, + { + "epoch": 0.04, + "grad_norm": 3.2091329097747803, + "learning_rate": 9.985775550675415e-06, + "logits/chosen": -0.07281239330768585, + "logits/rejected": -0.1461143046617508, + "logps/chosen": -53.54340362548828, + "logps/rejected": -59.022918701171875, + "loss": 0.9206, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9209659099578857, + "rewards/margins": 1.9918612241744995, + "rewards/rejected": 0.929104745388031, + "step": 144 + }, + { + "epoch": 0.04, + "grad_norm": 2.924931287765503, + "learning_rate": 9.985577398263721e-06, + "logits/chosen": -0.13137228786945343, + "logits/rejected": -0.3011148273944855, + "logps/chosen": -52.97252655029297, + "logps/rejected": -45.647621154785156, + "loss": 0.9171, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9794225692749023, + "rewards/margins": 1.9787081480026245, + "rewards/rejected": 1.0007143020629883, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 3.941213369369507, + "learning_rate": 9.985377877203452e-06, + "logits/chosen": -0.02969745174050331, + "logits/rejected": -0.08266055583953857, + "logps/chosen": -57.17963409423828, + "logps/rejected": -63.58135223388672, + "loss": 1.1802, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.813263177871704, + "rewards/margins": 1.077775239944458, + "rewards/rejected": 1.735487937927246, + "step": 146 + }, + { + "epoch": 0.04, + "grad_norm": 3.4987034797668457, + "learning_rate": 9.98517698754938e-06, + "logits/chosen": 0.1022428497672081, + "logits/rejected": -0.0467233806848526, + "logps/chosen": -62.68452453613281, + "logps/rejected": -54.08621597290039, + "loss": 0.9595, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.915971040725708, + "rewards/margins": 1.754659652709961, + "rewards/rejected": 1.1613115072250366, + "step": 147 + }, + { + "epoch": 0.04, + "grad_norm": 3.2076313495635986, + "learning_rate": 9.984974729356653e-06, + "logits/chosen": -0.027156444266438484, + "logits/rejected": -0.16692379117012024, + "logps/chosen": -61.88245391845703, + "logps/rejected": -52.6606559753418, + "loss": 1.0714, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.912220001220703, + "rewards/margins": 1.8126444816589355, + "rewards/rejected": 1.0995756387710571, + "step": 148 + }, + { + "epoch": 0.04, + "grad_norm": 4.388082027435303, + "learning_rate": 9.984771102680795e-06, + "logits/chosen": 0.051971063017845154, + "logits/rejected": -0.09439298510551453, + "logps/chosen": -57.881980895996094, + "logps/rejected": -60.38011169433594, + "loss": 1.0643, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.627277374267578, + "rewards/margins": 1.2737836837768555, + "rewards/rejected": 1.3534936904907227, + "step": 149 + }, + { + "epoch": 0.04, + "grad_norm": 2.8752429485321045, + "learning_rate": 9.984566107577707e-06, + "logits/chosen": -0.10830068588256836, + "logits/rejected": -0.17190146446228027, + "logps/chosen": -50.436954498291016, + "logps/rejected": -49.18791198730469, + "loss": 0.9078, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0283994674682617, + "rewards/margins": 2.1517271995544434, + "rewards/rejected": 0.8766722083091736, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 3.270801544189453, + "learning_rate": 9.984359744103665e-06, + "logits/chosen": 0.07100740075111389, + "logits/rejected": -0.06577593833208084, + "logps/chosen": -63.945960998535156, + "logps/rejected": -52.42609786987305, + "loss": 1.0978, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.644118309020996, + "rewards/margins": 1.391097068786621, + "rewards/rejected": 1.2530213594436646, + "step": 151 + }, + { + "epoch": 0.04, + "grad_norm": 3.6429293155670166, + "learning_rate": 9.984152012315317e-06, + "logits/chosen": 0.04692389816045761, + "logits/rejected": -0.1524173617362976, + "logps/chosen": -72.894775390625, + "logps/rejected": -62.88166046142578, + "loss": 0.9806, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9667770862579346, + "rewards/margins": 1.9463993310928345, + "rewards/rejected": 1.0203777551651, + "step": 152 + }, + { + "epoch": 0.04, + "grad_norm": 3.0746116638183594, + "learning_rate": 9.983942912269693e-06, + "logits/chosen": 0.0059276120737195015, + "logits/rejected": -0.11621472239494324, + "logps/chosen": -59.85718536376953, + "logps/rejected": -53.49744415283203, + "loss": 0.9672, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.879732131958008, + "rewards/margins": 1.7253494262695312, + "rewards/rejected": 1.1543828248977661, + "step": 153 + }, + { + "epoch": 0.04, + "grad_norm": 3.8106608390808105, + "learning_rate": 9.983732444024195e-06, + "logits/chosen": 0.08370208740234375, + "logits/rejected": -0.11228031665086746, + "logps/chosen": -67.95963287353516, + "logps/rejected": -65.44819641113281, + "loss": 1.0139, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.854377508163452, + "rewards/margins": 1.5761537551879883, + "rewards/rejected": 1.2782237529754639, + "step": 154 + }, + { + "epoch": 0.04, + "grad_norm": 3.584045648574829, + "learning_rate": 9.983520607636601e-06, + "logits/chosen": -0.02085835486650467, + "logits/rejected": -0.15557073056697845, + "logps/chosen": -56.752349853515625, + "logps/rejected": -49.30656433105469, + "loss": 1.053, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6321144104003906, + "rewards/margins": 1.2031112909317017, + "rewards/rejected": 1.4290028810501099, + "step": 155 + }, + { + "epoch": 0.04, + "grad_norm": 2.970791816711426, + "learning_rate": 9.983307403165063e-06, + "logits/chosen": 0.04516486078500748, + "logits/rejected": -0.06160905584692955, + "logps/chosen": -54.41744613647461, + "logps/rejected": -68.84862518310547, + "loss": 0.9165, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.0372018814086914, + "rewards/margins": 2.213916063308716, + "rewards/rejected": 0.8232859373092651, + "step": 156 + }, + { + "epoch": 0.04, + "grad_norm": 2.7602553367614746, + "learning_rate": 9.983092830668112e-06, + "logits/chosen": 0.03922605887055397, + "logits/rejected": -0.0700131356716156, + "logps/chosen": -50.033077239990234, + "logps/rejected": -45.523399353027344, + "loss": 0.9956, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7446177005767822, + "rewards/margins": 1.6381644010543823, + "rewards/rejected": 1.1064531803131104, + "step": 157 + }, + { + "epoch": 0.04, + "grad_norm": 2.7060139179229736, + "learning_rate": 9.982876890204653e-06, + "logits/chosen": -0.038064539432525635, + "logits/rejected": -0.2204597294330597, + "logps/chosen": -46.18851852416992, + "logps/rejected": -49.75224304199219, + "loss": 0.8096, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.890225410461426, + "rewards/margins": 2.3526248931884766, + "rewards/rejected": 0.5376003384590149, + "step": 158 + }, + { + "epoch": 0.04, + "grad_norm": 3.595527172088623, + "learning_rate": 9.982659581833967e-06, + "logits/chosen": 0.005708668380975723, + "logits/rejected": -0.1124122142791748, + "logps/chosen": -55.19317626953125, + "logps/rejected": -49.108272552490234, + "loss": 1.066, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.769448757171631, + "rewards/margins": 1.544546127319336, + "rewards/rejected": 1.2249027490615845, + "step": 159 + }, + { + "epoch": 0.04, + "grad_norm": 2.9934608936309814, + "learning_rate": 9.982440905615705e-06, + "logits/chosen": -0.03810581564903259, + "logits/rejected": -0.09667065739631653, + "logps/chosen": -45.19405746459961, + "logps/rejected": -72.2146987915039, + "loss": 0.9414, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7962396144866943, + "rewards/margins": 1.7047940492630005, + "rewards/rejected": 1.0914456844329834, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 2.599822759628296, + "learning_rate": 9.982220861609904e-06, + "logits/chosen": -0.014675870537757874, + "logits/rejected": -0.1508055031299591, + "logps/chosen": -44.66288757324219, + "logps/rejected": -47.62725067138672, + "loss": 0.7816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.885868549346924, + "rewards/margins": 2.101304292678833, + "rewards/rejected": 0.7845645546913147, + "step": 161 + }, + { + "epoch": 0.04, + "grad_norm": 2.8138229846954346, + "learning_rate": 9.981999449876968e-06, + "logits/chosen": 0.039467014372348785, + "logits/rejected": -0.12374014407396317, + "logps/chosen": -68.92998504638672, + "logps/rejected": -47.28063201904297, + "loss": 1.0232, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.915198802947998, + "rewards/margins": 1.5387848615646362, + "rewards/rejected": 1.3764142990112305, + "step": 162 + }, + { + "epoch": 0.04, + "grad_norm": 2.952584743499756, + "learning_rate": 9.98177667047768e-06, + "logits/chosen": 0.08773861080408096, + "logits/rejected": -0.13852469623088837, + "logps/chosen": -63.23244094848633, + "logps/rejected": -43.108421325683594, + "loss": 0.9417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9583704471588135, + "rewards/margins": 1.8677372932434082, + "rewards/rejected": 1.0906332731246948, + "step": 163 + }, + { + "epoch": 0.04, + "grad_norm": 3.0911293029785156, + "learning_rate": 9.981552523473198e-06, + "logits/chosen": -0.041567910462617874, + "logits/rejected": -0.20243148505687714, + "logps/chosen": -45.93622970581055, + "logps/rejected": -46.649478912353516, + "loss": 0.9392, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7901968955993652, + "rewards/margins": 1.8446714878082275, + "rewards/rejected": 0.9455254673957825, + "step": 164 + }, + { + "epoch": 0.04, + "grad_norm": 3.476405382156372, + "learning_rate": 9.981327008925055e-06, + "logits/chosen": -0.042280301451683044, + "logits/rejected": -0.16598229110240936, + "logps/chosen": -59.05263900756836, + "logps/rejected": -53.791568756103516, + "loss": 1.0536, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.9000813961029053, + "rewards/margins": 1.983017086982727, + "rewards/rejected": 0.9170644879341125, + "step": 165 + }, + { + "epoch": 0.04, + "grad_norm": 2.89300537109375, + "learning_rate": 9.98110012689516e-06, + "logits/chosen": 0.056981414556503296, + "logits/rejected": -0.2001151144504547, + "logps/chosen": -63.22153091430664, + "logps/rejected": -45.99965286254883, + "loss": 0.8666, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.927400588989258, + "rewards/margins": 2.288208484649658, + "rewards/rejected": 0.6391921043395996, + "step": 166 + }, + { + "epoch": 0.04, + "grad_norm": 2.638326644897461, + "learning_rate": 9.980871877445794e-06, + "logits/chosen": 0.017682485282421112, + "logits/rejected": -0.1267087310552597, + "logps/chosen": -48.96157455444336, + "logps/rejected": -59.3393440246582, + "loss": 0.8066, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1632752418518066, + "rewards/margins": 2.5302364826202393, + "rewards/rejected": 0.6330382227897644, + "step": 167 + }, + { + "epoch": 0.04, + "grad_norm": 2.6835978031158447, + "learning_rate": 9.980642260639621e-06, + "logits/chosen": 0.02588096633553505, + "logits/rejected": -0.09970692545175552, + "logps/chosen": -52.510250091552734, + "logps/rejected": -61.997596740722656, + "loss": 0.7566, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.009187698364258, + "rewards/margins": 1.853018879890442, + "rewards/rejected": 1.1561691761016846, + "step": 168 + }, + { + "epoch": 0.04, + "grad_norm": 2.229585647583008, + "learning_rate": 9.980411276539674e-06, + "logits/chosen": 0.009469318203628063, + "logits/rejected": -0.13101907074451447, + "logps/chosen": -64.94052124023438, + "logps/rejected": -57.038291931152344, + "loss": 0.7775, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0035171508789062, + "rewards/margins": 2.3486790657043457, + "rewards/rejected": 0.6548379063606262, + "step": 169 + }, + { + "epoch": 0.04, + "grad_norm": 3.5422115325927734, + "learning_rate": 9.980178925209363e-06, + "logits/chosen": 0.05970479175448418, + "logits/rejected": -0.13854804635047913, + "logps/chosen": -66.35319519042969, + "logps/rejected": -47.476036071777344, + "loss": 1.0538, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.779839515686035, + "rewards/margins": 1.6176555156707764, + "rewards/rejected": 1.162184238433838, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 3.6829257011413574, + "learning_rate": 9.97994520671247e-06, + "logits/chosen": 0.01151879783719778, + "logits/rejected": -0.09179152548313141, + "logps/chosen": -53.48814392089844, + "logps/rejected": -59.72886276245117, + "loss": 0.9928, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.707202911376953, + "rewards/margins": 1.5960328578948975, + "rewards/rejected": 1.111169695854187, + "step": 171 + }, + { + "epoch": 0.04, + "grad_norm": 2.786639928817749, + "learning_rate": 9.979710121113163e-06, + "logits/chosen": 0.03138568624854088, + "logits/rejected": -0.20484809577465057, + "logps/chosen": -59.18574523925781, + "logps/rejected": -38.75318145751953, + "loss": 0.9625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.76894474029541, + "rewards/margins": 1.911423921585083, + "rewards/rejected": 0.8575209975242615, + "step": 172 + }, + { + "epoch": 0.04, + "grad_norm": 2.909635543823242, + "learning_rate": 9.979473668475972e-06, + "logits/chosen": 0.04567696154117584, + "logits/rejected": -0.07134868204593658, + "logps/chosen": -67.79279327392578, + "logps/rejected": -57.669864654541016, + "loss": 0.9625, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.050354480743408, + "rewards/margins": 1.651654601097107, + "rewards/rejected": 1.3986999988555908, + "step": 173 + }, + { + "epoch": 0.04, + "grad_norm": 3.588650703430176, + "learning_rate": 9.979235848865811e-06, + "logits/chosen": 0.05245896428823471, + "logits/rejected": -0.052773527801036835, + "logps/chosen": -58.96596145629883, + "logps/rejected": -51.38550567626953, + "loss": 1.1224, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.6017162799835205, + "rewards/margins": 1.4610595703125, + "rewards/rejected": 1.14065682888031, + "step": 174 + }, + { + "epoch": 0.04, + "grad_norm": 3.1725401878356934, + "learning_rate": 9.978996662347967e-06, + "logits/chosen": 0.0618058443069458, + "logits/rejected": -0.16505524516105652, + "logps/chosen": -66.1353530883789, + "logps/rejected": -48.09904479980469, + "loss": 0.8859, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9390316009521484, + "rewards/margins": 2.2912399768829346, + "rewards/rejected": 0.647791862487793, + "step": 175 + }, + { + "epoch": 0.04, + "grad_norm": 3.0814595222473145, + "learning_rate": 9.978756108988098e-06, + "logits/chosen": -0.09909527003765106, + "logits/rejected": -0.16724827885627747, + "logps/chosen": -43.90796661376953, + "logps/rejected": -57.20535659790039, + "loss": 0.9466, + "rewards/accuracies": 0.6875, + "rewards/chosen": 3.055800437927246, + "rewards/margins": 1.674238681793213, + "rewards/rejected": 1.3815619945526123, + "step": 176 + }, + { + "epoch": 0.04, + "grad_norm": 2.1487460136413574, + "learning_rate": 9.978514188852246e-06, + "logits/chosen": -0.06860356032848358, + "logits/rejected": -0.16777688264846802, + "logps/chosen": -45.56680679321289, + "logps/rejected": -52.56700134277344, + "loss": 0.8003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9984991550445557, + "rewards/margins": 2.4036123752593994, + "rewards/rejected": 0.5948867201805115, + "step": 177 + }, + { + "epoch": 0.04, + "grad_norm": 3.591960906982422, + "learning_rate": 9.97827090200682e-06, + "logits/chosen": 0.0655500590801239, + "logits/rejected": -0.14566007256507874, + "logps/chosen": -63.73187255859375, + "logps/rejected": -52.03754425048828, + "loss": 1.1088, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.880528450012207, + "rewards/margins": 1.6650795936584473, + "rewards/rejected": 1.2154489755630493, + "step": 178 + }, + { + "epoch": 0.04, + "grad_norm": 2.766007661819458, + "learning_rate": 9.978026248518612e-06, + "logits/chosen": 0.03852691501379013, + "logits/rejected": -0.09923157095909119, + "logps/chosen": -48.714996337890625, + "logps/rejected": -52.69049072265625, + "loss": 0.8921, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.509559392929077, + "rewards/margins": 2.086308002471924, + "rewards/rejected": 0.4232514500617981, + "step": 179 + }, + { + "epoch": 0.05, + "grad_norm": 2.7951202392578125, + "learning_rate": 9.977780228454779e-06, + "logits/chosen": -0.05599946156144142, + "logits/rejected": -0.23402051627635956, + "logps/chosen": -54.04422378540039, + "logps/rejected": -45.798988342285156, + "loss": 0.8698, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7945144176483154, + "rewards/margins": 2.0723655223846436, + "rewards/rejected": 0.7221492528915405, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 3.6424720287323, + "learning_rate": 9.977532841882861e-06, + "logits/chosen": -0.040046222507953644, + "logits/rejected": -0.09072600305080414, + "logps/chosen": -56.36092758178711, + "logps/rejected": -64.70320892333984, + "loss": 1.1269, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9843921661376953, + "rewards/margins": 1.8711813688278198, + "rewards/rejected": 1.113210916519165, + "step": 181 + }, + { + "epoch": 0.05, + "grad_norm": 2.906684637069702, + "learning_rate": 9.977284088870773e-06, + "logits/chosen": -0.03312622383236885, + "logits/rejected": -0.24608831107616425, + "logps/chosen": -52.813209533691406, + "logps/rejected": -45.59502410888672, + "loss": 0.9843, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8553900718688965, + "rewards/margins": 2.041381597518921, + "rewards/rejected": 0.8140087723731995, + "step": 182 + }, + { + "epoch": 0.05, + "grad_norm": 3.0062203407287598, + "learning_rate": 9.9770339694868e-06, + "logits/chosen": -0.01997799426317215, + "logits/rejected": -0.10384559631347656, + "logps/chosen": -51.352027893066406, + "logps/rejected": -57.80997848510742, + "loss": 0.9713, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.707226037979126, + "rewards/margins": 1.7766603231430054, + "rewards/rejected": 0.9305657148361206, + "step": 183 + }, + { + "epoch": 0.05, + "grad_norm": 4.276036739349365, + "learning_rate": 9.976782483799607e-06, + "logits/chosen": 0.013635195791721344, + "logits/rejected": -0.17738580703735352, + "logps/chosen": -70.90152740478516, + "logps/rejected": -57.45231628417969, + "loss": 1.0954, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.119030475616455, + "rewards/margins": 2.1369738578796387, + "rewards/rejected": 0.9820563793182373, + "step": 184 + }, + { + "epoch": 0.05, + "grad_norm": 2.771907329559326, + "learning_rate": 9.976529631878231e-06, + "logits/chosen": -0.04912268742918968, + "logits/rejected": -0.13536550104618073, + "logps/chosen": -49.542850494384766, + "logps/rejected": -45.81785583496094, + "loss": 0.9969, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8577115535736084, + "rewards/margins": 1.7033166885375977, + "rewards/rejected": 1.1543946266174316, + "step": 185 + }, + { + "epoch": 0.05, + "grad_norm": 2.847702741622925, + "learning_rate": 9.976275413792088e-06, + "logits/chosen": -0.047299448400735855, + "logits/rejected": -0.17698155343532562, + "logps/chosen": -55.69047927856445, + "logps/rejected": -55.91594314575195, + "loss": 0.9612, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6114306449890137, + "rewards/margins": 2.138007879257202, + "rewards/rejected": 0.47342270612716675, + "step": 186 + }, + { + "epoch": 0.05, + "grad_norm": 3.6063647270202637, + "learning_rate": 9.976019829610961e-06, + "logits/chosen": 0.016126932576298714, + "logits/rejected": -0.10510770976543427, + "logps/chosen": -54.510799407958984, + "logps/rejected": -55.86661911010742, + "loss": 1.0712, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.739811897277832, + "rewards/margins": 1.0468683242797852, + "rewards/rejected": 1.692944049835205, + "step": 187 + }, + { + "epoch": 0.05, + "grad_norm": 2.972360849380493, + "learning_rate": 9.975762879405018e-06, + "logits/chosen": -0.02091927081346512, + "logits/rejected": -0.1451880782842636, + "logps/chosen": -56.858482360839844, + "logps/rejected": -52.01811981201172, + "loss": 0.9436, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0207855701446533, + "rewards/margins": 1.9153413772583008, + "rewards/rejected": 1.1054438352584839, + "step": 188 + }, + { + "epoch": 0.05, + "grad_norm": 3.2810964584350586, + "learning_rate": 9.975504563244796e-06, + "logits/chosen": -0.033278606832027435, + "logits/rejected": -0.2021145224571228, + "logps/chosen": -51.621665954589844, + "logps/rejected": -43.63349533081055, + "loss": 0.962, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7034618854522705, + "rewards/margins": 1.8527899980545044, + "rewards/rejected": 0.85067218542099, + "step": 189 + }, + { + "epoch": 0.05, + "grad_norm": 2.691697120666504, + "learning_rate": 9.975244881201209e-06, + "logits/chosen": -0.02376563474535942, + "logits/rejected": -0.027249282225966454, + "logps/chosen": -51.17850112915039, + "logps/rejected": -68.44084167480469, + "loss": 0.9345, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.966290235519409, + "rewards/margins": 1.6731284856796265, + "rewards/rejected": 1.2931619882583618, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 3.066201686859131, + "learning_rate": 9.974983833345546e-06, + "logits/chosen": 0.02583799697458744, + "logits/rejected": -0.15597733855247498, + "logps/chosen": -67.07563781738281, + "logps/rejected": -58.729896545410156, + "loss": 0.978, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.949875831604004, + "rewards/margins": 2.094931125640869, + "rewards/rejected": 0.8549445271492004, + "step": 191 + }, + { + "epoch": 0.05, + "grad_norm": 2.6866025924682617, + "learning_rate": 9.974721419749466e-06, + "logits/chosen": 0.025471650063991547, + "logits/rejected": -0.08623221516609192, + "logps/chosen": -50.68978500366211, + "logps/rejected": -51.172061920166016, + "loss": 0.855, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9930033683776855, + "rewards/margins": 2.010000228881836, + "rewards/rejected": 0.9830030202865601, + "step": 192 + }, + { + "epoch": 0.05, + "grad_norm": 3.5368900299072266, + "learning_rate": 9.974457640485013e-06, + "logits/chosen": 0.03072398155927658, + "logits/rejected": -0.06783923506736755, + "logps/chosen": -60.63576126098633, + "logps/rejected": -60.30327606201172, + "loss": 1.107, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.680837392807007, + "rewards/margins": 1.2811795473098755, + "rewards/rejected": 1.3996578454971313, + "step": 193 + }, + { + "epoch": 0.05, + "grad_norm": 3.0811374187469482, + "learning_rate": 9.974192495624597e-06, + "logits/chosen": -0.021053127944469452, + "logits/rejected": -0.2769005596637726, + "logps/chosen": -67.39049530029297, + "logps/rejected": -47.381874084472656, + "loss": 0.9772, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9807655811309814, + "rewards/margins": 2.2256994247436523, + "rewards/rejected": 0.75506591796875, + "step": 194 + }, + { + "epoch": 0.05, + "grad_norm": 3.267702341079712, + "learning_rate": 9.973925985241007e-06, + "logits/chosen": 0.0305474940687418, + "logits/rejected": -0.0406096875667572, + "logps/chosen": -56.442481994628906, + "logps/rejected": -66.26193237304688, + "loss": 0.9974, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.077770709991455, + "rewards/margins": 1.7560780048370361, + "rewards/rejected": 1.3216928243637085, + "step": 195 + }, + { + "epoch": 0.05, + "grad_norm": 3.0779974460601807, + "learning_rate": 9.973658109407403e-06, + "logits/chosen": 0.0058257803320884705, + "logits/rejected": -0.18197743594646454, + "logps/chosen": -77.12413787841797, + "logps/rejected": -53.92265701293945, + "loss": 0.9548, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0620107650756836, + "rewards/margins": 2.3670101165771484, + "rewards/rejected": 0.6950008273124695, + "step": 196 + }, + { + "epoch": 0.05, + "grad_norm": 3.1485180854797363, + "learning_rate": 9.973388868197326e-06, + "logits/chosen": 0.025963257998228073, + "logits/rejected": -0.06826210767030716, + "logps/chosen": -53.23310089111328, + "logps/rejected": -61.87328338623047, + "loss": 0.8633, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7966108322143555, + "rewards/margins": 2.0548129081726074, + "rewards/rejected": 0.741797924041748, + "step": 197 + }, + { + "epoch": 0.05, + "grad_norm": 3.3786580562591553, + "learning_rate": 9.973118261684687e-06, + "logits/chosen": 0.011808536946773529, + "logits/rejected": -0.0754011943936348, + "logps/chosen": -60.988502502441406, + "logps/rejected": -68.7754898071289, + "loss": 0.9538, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.881883144378662, + "rewards/margins": 1.8313813209533691, + "rewards/rejected": 1.0505021810531616, + "step": 198 + }, + { + "epoch": 0.05, + "grad_norm": 2.8267722129821777, + "learning_rate": 9.972846289943774e-06, + "logits/chosen": 0.1290525197982788, + "logits/rejected": 0.01705290749669075, + "logps/chosen": -64.52796936035156, + "logps/rejected": -67.7811508178711, + "loss": 0.9175, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7134082317352295, + "rewards/margins": 2.0745365619659424, + "rewards/rejected": 0.6388716697692871, + "step": 199 + }, + { + "epoch": 0.05, + "grad_norm": 3.1845831871032715, + "learning_rate": 9.97257295304925e-06, + "logits/chosen": 0.1287875771522522, + "logits/rejected": -0.15933403372764587, + "logps/chosen": -69.66828918457031, + "logps/rejected": -45.986507415771484, + "loss": 0.8637, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0277585983276367, + "rewards/margins": 2.1065306663513184, + "rewards/rejected": 0.9212275743484497, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 3.740184783935547, + "learning_rate": 9.972298251076148e-06, + "logits/chosen": 0.014298086985945702, + "logits/rejected": -0.1832594871520996, + "logps/chosen": -52.61988830566406, + "logps/rejected": -45.421199798583984, + "loss": 0.9587, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.830988883972168, + "rewards/margins": 1.8390626907348633, + "rewards/rejected": 0.9919264316558838, + "step": 201 + }, + { + "epoch": 0.05, + "grad_norm": 3.5209579467773438, + "learning_rate": 9.972022184099886e-06, + "logits/chosen": 0.02717568539083004, + "logits/rejected": -0.11702021956443787, + "logps/chosen": -57.52601623535156, + "logps/rejected": -50.97796630859375, + "loss": 1.073, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8676748275756836, + "rewards/margins": 1.7639473676681519, + "rewards/rejected": 1.10372793674469, + "step": 202 + }, + { + "epoch": 0.05, + "grad_norm": 3.699443817138672, + "learning_rate": 9.971744752196243e-06, + "logits/chosen": 0.028723031282424927, + "logits/rejected": -0.12514081597328186, + "logps/chosen": -61.60541534423828, + "logps/rejected": -56.21152114868164, + "loss": 1.0303, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.893158435821533, + "rewards/margins": 1.649371862411499, + "rewards/rejected": 1.2437866926193237, + "step": 203 + }, + { + "epoch": 0.05, + "grad_norm": 2.3142478466033936, + "learning_rate": 9.971465955441386e-06, + "logits/chosen": -0.10064352303743362, + "logits/rejected": -0.24959981441497803, + "logps/chosen": -50.66267013549805, + "logps/rejected": -56.462135314941406, + "loss": 0.7895, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.143177032470703, + "rewards/margins": 2.777561664581299, + "rewards/rejected": 0.36561548709869385, + "step": 204 + }, + { + "epoch": 0.05, + "grad_norm": 3.0995094776153564, + "learning_rate": 9.971185793911848e-06, + "logits/chosen": 0.02127302810549736, + "logits/rejected": -0.15346035361289978, + "logps/chosen": -58.57322692871094, + "logps/rejected": -49.37858581542969, + "loss": 0.9678, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7880966663360596, + "rewards/margins": 1.9601948261260986, + "rewards/rejected": 0.8279015421867371, + "step": 205 + }, + { + "epoch": 0.05, + "grad_norm": 3.3792266845703125, + "learning_rate": 9.97090426768454e-06, + "logits/chosen": 0.024151602759957314, + "logits/rejected": -0.07025053352117538, + "logps/chosen": -54.35882568359375, + "logps/rejected": -60.76668930053711, + "loss": 0.9601, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9150776863098145, + "rewards/margins": 1.7310781478881836, + "rewards/rejected": 1.1839993000030518, + "step": 206 + }, + { + "epoch": 0.05, + "grad_norm": 3.9570441246032715, + "learning_rate": 9.970621376836747e-06, + "logits/chosen": -0.11326181888580322, + "logits/rejected": -0.17784486711025238, + "logps/chosen": -53.734100341796875, + "logps/rejected": -49.54888916015625, + "loss": 1.1395, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.839848518371582, + "rewards/margins": 1.641231894493103, + "rewards/rejected": 1.1986169815063477, + "step": 207 + }, + { + "epoch": 0.05, + "grad_norm": 2.722302198410034, + "learning_rate": 9.970337121446127e-06, + "logits/chosen": 0.010633758269250393, + "logits/rejected": -0.16631220281124115, + "logps/chosen": -63.863494873046875, + "logps/rejected": -44.920860290527344, + "loss": 0.9144, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.868048667907715, + "rewards/margins": 2.001981019973755, + "rewards/rejected": 0.8660677671432495, + "step": 208 + }, + { + "epoch": 0.05, + "grad_norm": 3.6051931381225586, + "learning_rate": 9.970051501590718e-06, + "logits/chosen": 0.001499144360423088, + "logits/rejected": -0.14939433336257935, + "logps/chosen": -64.56693267822266, + "logps/rejected": -56.4209098815918, + "loss": 0.9831, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.7691614627838135, + "rewards/margins": 1.6343052387237549, + "rewards/rejected": 1.1348562240600586, + "step": 209 + }, + { + "epoch": 0.05, + "grad_norm": 3.845919132232666, + "learning_rate": 9.969764517348924e-06, + "logits/chosen": 0.0850735679268837, + "logits/rejected": -0.21065157651901245, + "logps/chosen": -61.667808532714844, + "logps/rejected": -42.89861297607422, + "loss": 0.9548, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.613771438598633, + "rewards/margins": 1.7319896221160889, + "rewards/rejected": 0.8817819952964783, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 3.2183878421783447, + "learning_rate": 9.969476168799532e-06, + "logits/chosen": -0.022716421633958817, + "logits/rejected": -0.17742270231246948, + "logps/chosen": -59.556617736816406, + "logps/rejected": -46.134334564208984, + "loss": 0.9859, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7110466957092285, + "rewards/margins": 1.6273099184036255, + "rewards/rejected": 1.0837366580963135, + "step": 211 + }, + { + "epoch": 0.05, + "grad_norm": 3.2341361045837402, + "learning_rate": 9.9691864560217e-06, + "logits/chosen": 0.005724898539483547, + "logits/rejected": -0.10200914740562439, + "logps/chosen": -56.494773864746094, + "logps/rejected": -53.206764221191406, + "loss": 1.0543, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5837550163269043, + "rewards/margins": 1.4173957109451294, + "rewards/rejected": 1.1663591861724854, + "step": 212 + }, + { + "epoch": 0.05, + "grad_norm": 2.8783557415008545, + "learning_rate": 9.968895379094959e-06, + "logits/chosen": -0.05198504030704498, + "logits/rejected": -0.1687326282262802, + "logps/chosen": -54.4499397277832, + "logps/rejected": -52.98585891723633, + "loss": 0.9311, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0852012634277344, + "rewards/margins": 1.806414246559143, + "rewards/rejected": 1.2787871360778809, + "step": 213 + }, + { + "epoch": 0.05, + "grad_norm": 2.9050354957580566, + "learning_rate": 9.968602938099215e-06, + "logits/chosen": 0.029482141137123108, + "logits/rejected": -0.020634165033698082, + "logps/chosen": -57.91923522949219, + "logps/rejected": -66.06855773925781, + "loss": 1.0314, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9205029010772705, + "rewards/margins": 1.6332340240478516, + "rewards/rejected": 1.2872689962387085, + "step": 214 + }, + { + "epoch": 0.05, + "grad_norm": 3.372892141342163, + "learning_rate": 9.96830913311475e-06, + "logits/chosen": 0.05480228364467621, + "logits/rejected": -0.02448970079421997, + "logps/chosen": -59.469356536865234, + "logps/rejected": -56.952537536621094, + "loss": 1.0188, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.8314738273620605, + "rewards/margins": 1.6410804986953735, + "rewards/rejected": 1.190393328666687, + "step": 215 + }, + { + "epoch": 0.05, + "grad_norm": 2.723642110824585, + "learning_rate": 9.968013964222223e-06, + "logits/chosen": 0.015287935733795166, + "logits/rejected": -0.21109068393707275, + "logps/chosen": -53.83726119995117, + "logps/rejected": -48.75297164916992, + "loss": 0.8816, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9422335624694824, + "rewards/margins": 2.2652440071105957, + "rewards/rejected": 0.6769895553588867, + "step": 216 + }, + { + "epoch": 0.05, + "grad_norm": 4.501143455505371, + "learning_rate": 9.967717431502659e-06, + "logits/chosen": -0.08353226631879807, + "logits/rejected": -0.18324299156665802, + "logps/chosen": -67.43508911132812, + "logps/rejected": -55.618526458740234, + "loss": 1.3242, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.59193754196167, + "rewards/margins": 1.3075647354125977, + "rewards/rejected": 1.2843728065490723, + "step": 217 + }, + { + "epoch": 0.05, + "grad_norm": 2.466320037841797, + "learning_rate": 9.967419535037466e-06, + "logits/chosen": -4.644226282835007e-05, + "logits/rejected": -0.19572953879833221, + "logps/chosen": -49.28120803833008, + "logps/rejected": -52.0896110534668, + "loss": 0.7334, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.802973985671997, + "rewards/margins": 2.6211957931518555, + "rewards/rejected": 0.18177831172943115, + "step": 218 + }, + { + "epoch": 0.05, + "grad_norm": 3.0461959838867188, + "learning_rate": 9.967120274908422e-06, + "logits/chosen": 0.01777651160955429, + "logits/rejected": -0.023481309413909912, + "logps/chosen": -60.907291412353516, + "logps/rejected": -72.37645721435547, + "loss": 0.9416, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.143709182739258, + "rewards/margins": 1.6497418880462646, + "rewards/rejected": 1.4939674139022827, + "step": 219 + }, + { + "epoch": 0.06, + "grad_norm": 3.9412546157836914, + "learning_rate": 9.96681965119768e-06, + "logits/chosen": -0.09815619885921478, + "logits/rejected": -0.15271010994911194, + "logps/chosen": -48.8974494934082, + "logps/rejected": -51.701393127441406, + "loss": 1.1152, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5381672382354736, + "rewards/margins": 1.372673749923706, + "rewards/rejected": 1.1654936075210571, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 3.235557794570923, + "learning_rate": 9.96651766398777e-06, + "logits/chosen": 0.0032295063138008118, + "logits/rejected": -0.1314755082130432, + "logps/chosen": -58.16272735595703, + "logps/rejected": -56.92979431152344, + "loss": 0.9303, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9238829612731934, + "rewards/margins": 1.8848216533660889, + "rewards/rejected": 1.0390609502792358, + "step": 221 + }, + { + "epoch": 0.06, + "grad_norm": 3.111088752746582, + "learning_rate": 9.966214313361592e-06, + "logits/chosen": 0.05448153242468834, + "logits/rejected": -0.12330888211727142, + "logps/chosen": -65.62443542480469, + "logps/rejected": -52.847686767578125, + "loss": 0.9652, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8634159564971924, + "rewards/margins": 2.073925018310547, + "rewards/rejected": 0.7894909381866455, + "step": 222 + }, + { + "epoch": 0.06, + "grad_norm": 3.101519823074341, + "learning_rate": 9.965909599402421e-06, + "logits/chosen": -0.05859026685357094, + "logits/rejected": -0.1270999312400818, + "logps/chosen": -50.30222702026367, + "logps/rejected": -56.92089080810547, + "loss": 0.9106, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6616463661193848, + "rewards/margins": 1.8009026050567627, + "rewards/rejected": 0.8607436418533325, + "step": 223 + }, + { + "epoch": 0.06, + "grad_norm": 2.6117851734161377, + "learning_rate": 9.96560352219391e-06, + "logits/chosen": 0.04789305478334427, + "logits/rejected": -0.12388836592435837, + "logps/chosen": -54.59901809692383, + "logps/rejected": -54.93995666503906, + "loss": 0.8958, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.673236846923828, + "rewards/margins": 2.1625418663024902, + "rewards/rejected": 0.5106950998306274, + "step": 224 + }, + { + "epoch": 0.06, + "grad_norm": 2.5793516635894775, + "learning_rate": 9.965296081820083e-06, + "logits/chosen": 0.09024739265441895, + "logits/rejected": -0.12169639766216278, + "logps/chosen": -65.2392349243164, + "logps/rejected": -47.33515167236328, + "loss": 0.8682, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9074201583862305, + "rewards/margins": 2.3502936363220215, + "rewards/rejected": 0.5571264624595642, + "step": 225 + }, + { + "epoch": 0.06, + "grad_norm": 3.046361207962036, + "learning_rate": 9.964987278365338e-06, + "logits/chosen": -0.08513672649860382, + "logits/rejected": -0.2928316295146942, + "logps/chosen": -55.85397720336914, + "logps/rejected": -44.77084732055664, + "loss": 0.9401, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.960725784301758, + "rewards/margins": 2.063634157180786, + "rewards/rejected": 0.8970915079116821, + "step": 226 + }, + { + "epoch": 0.06, + "grad_norm": 2.4403703212738037, + "learning_rate": 9.96467711191445e-06, + "logits/chosen": -0.0038212668150663376, + "logits/rejected": -0.22255071997642517, + "logps/chosen": -57.07680130004883, + "logps/rejected": -43.24293899536133, + "loss": 0.8272, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1322667598724365, + "rewards/margins": 2.282233476638794, + "rewards/rejected": 0.850033164024353, + "step": 227 + }, + { + "epoch": 0.06, + "grad_norm": 2.5507304668426514, + "learning_rate": 9.964365582552566e-06, + "logits/chosen": 0.0315084308385849, + "logits/rejected": -0.12183855473995209, + "logps/chosen": -77.18348693847656, + "logps/rejected": -58.68135070800781, + "loss": 0.8873, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.008115530014038, + "rewards/margins": 2.105746269226074, + "rewards/rejected": 0.9023692011833191, + "step": 228 + }, + { + "epoch": 0.06, + "grad_norm": 3.0523154735565186, + "learning_rate": 9.964052690365205e-06, + "logits/chosen": -0.05581003427505493, + "logits/rejected": -0.21314722299575806, + "logps/chosen": -62.2508659362793, + "logps/rejected": -48.931358337402344, + "loss": 1.0835, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.013983726501465, + "rewards/margins": 1.9701482057571411, + "rewards/rejected": 1.0438352823257446, + "step": 229 + }, + { + "epoch": 0.06, + "grad_norm": 3.0141525268554688, + "learning_rate": 9.963738435438267e-06, + "logits/chosen": 0.07558739930391312, + "logits/rejected": -0.11010690033435822, + "logps/chosen": -58.11232376098633, + "logps/rejected": -46.17201614379883, + "loss": 0.9084, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.825521469116211, + "rewards/margins": 2.097318172454834, + "rewards/rejected": 0.7282033562660217, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 4.525566577911377, + "learning_rate": 9.963422817858018e-06, + "logits/chosen": 0.028755566105246544, + "logits/rejected": -0.07305090129375458, + "logps/chosen": -52.1358757019043, + "logps/rejected": -55.536102294921875, + "loss": 1.0432, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.482104778289795, + "rewards/margins": 1.5920205116271973, + "rewards/rejected": 0.890084445476532, + "step": 231 + }, + { + "epoch": 0.06, + "grad_norm": 3.976372718811035, + "learning_rate": 9.963105837711104e-06, + "logits/chosen": 0.041797224432229996, + "logits/rejected": -0.028179166838526726, + "logps/chosen": -64.49800109863281, + "logps/rejected": -63.091346740722656, + "loss": 1.097, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8531579971313477, + "rewards/margins": 1.5077389478683472, + "rewards/rejected": 1.345418930053711, + "step": 232 + }, + { + "epoch": 0.06, + "grad_norm": 2.9209859371185303, + "learning_rate": 9.962787495084542e-06, + "logits/chosen": 0.00449972040951252, + "logits/rejected": -0.17115117609500885, + "logps/chosen": -49.5093879699707, + "logps/rejected": -46.14327621459961, + "loss": 0.9616, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.870877265930176, + "rewards/margins": 1.7662582397460938, + "rewards/rejected": 1.1046189069747925, + "step": 233 + }, + { + "epoch": 0.06, + "grad_norm": 2.5945825576782227, + "learning_rate": 9.962467790065724e-06, + "logits/chosen": 0.03338645398616791, + "logits/rejected": -0.18994392454624176, + "logps/chosen": -66.29540252685547, + "logps/rejected": -44.70756530761719, + "loss": 0.7764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0229249000549316, + "rewards/margins": 2.691056251525879, + "rewards/rejected": 0.3318687081336975, + "step": 234 + }, + { + "epoch": 0.06, + "grad_norm": 2.94520902633667, + "learning_rate": 9.962146722742416e-06, + "logits/chosen": 0.04836362227797508, + "logits/rejected": -0.062090616673231125, + "logps/chosen": -59.14451217651367, + "logps/rejected": -57.389469146728516, + "loss": 1.0242, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.871722459793091, + "rewards/margins": 1.2579822540283203, + "rewards/rejected": 1.613740086555481, + "step": 235 + }, + { + "epoch": 0.06, + "grad_norm": 3.1968331336975098, + "learning_rate": 9.961824293202758e-06, + "logits/chosen": 0.02153550274670124, + "logits/rejected": -0.1926732212305069, + "logps/chosen": -54.303436279296875, + "logps/rejected": -47.10759735107422, + "loss": 0.906, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8375468254089355, + "rewards/margins": 2.4251346588134766, + "rewards/rejected": 0.41241198778152466, + "step": 236 + }, + { + "epoch": 0.06, + "grad_norm": 4.014254570007324, + "learning_rate": 9.961500501535263e-06, + "logits/chosen": -0.08924422413110733, + "logits/rejected": -0.08054222911596298, + "logps/chosen": -52.33683776855469, + "logps/rejected": -60.62409591674805, + "loss": 1.0409, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.9097862243652344, + "rewards/margins": 1.003055214881897, + "rewards/rejected": 1.906731128692627, + "step": 237 + }, + { + "epoch": 0.06, + "grad_norm": 3.778219699859619, + "learning_rate": 9.96117534782882e-06, + "logits/chosen": 0.06097141653299332, + "logits/rejected": -0.07601059973239899, + "logps/chosen": -56.31871032714844, + "logps/rejected": -54.56808853149414, + "loss": 0.9805, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8033852577209473, + "rewards/margins": 1.5455081462860107, + "rewards/rejected": 1.2578771114349365, + "step": 238 + }, + { + "epoch": 0.06, + "grad_norm": 3.4609782695770264, + "learning_rate": 9.960848832172692e-06, + "logits/chosen": 0.09705175459384918, + "logits/rejected": -0.044197797775268555, + "logps/chosen": -71.32498931884766, + "logps/rejected": -59.69906234741211, + "loss": 1.0708, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9852237701416016, + "rewards/margins": 1.8449091911315918, + "rewards/rejected": 1.1403142213821411, + "step": 239 + }, + { + "epoch": 0.06, + "grad_norm": 3.3906443119049072, + "learning_rate": 9.960520954656512e-06, + "logits/chosen": 0.022054191678762436, + "logits/rejected": -0.12632982432842255, + "logps/chosen": -62.38005828857422, + "logps/rejected": -51.30343246459961, + "loss": 0.9707, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.730910301208496, + "rewards/margins": 1.8054800033569336, + "rewards/rejected": 0.9254301190376282, + "step": 240 + }, + { + "epoch": 0.06, + "grad_norm": 3.60123348236084, + "learning_rate": 9.960191715370289e-06, + "logits/chosen": -0.014062147587537766, + "logits/rejected": -0.10791745781898499, + "logps/chosen": -55.756446838378906, + "logps/rejected": -59.924896240234375, + "loss": 0.8771, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.899184465408325, + "rewards/margins": 2.1039254665374756, + "rewards/rejected": 0.7952591776847839, + "step": 241 + }, + { + "epoch": 0.06, + "grad_norm": 2.6209659576416016, + "learning_rate": 9.959861114404408e-06, + "logits/chosen": -0.0317281112074852, + "logits/rejected": -0.1532873809337616, + "logps/chosen": -58.742679595947266, + "logps/rejected": -44.73017883300781, + "loss": 0.9337, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.969923496246338, + "rewards/margins": 2.2158777713775635, + "rewards/rejected": 0.7540459632873535, + "step": 242 + }, + { + "epoch": 0.06, + "grad_norm": 3.904805898666382, + "learning_rate": 9.959529151849627e-06, + "logits/chosen": -0.007394211366772652, + "logits/rejected": -0.06961001455783844, + "logps/chosen": -61.838104248046875, + "logps/rejected": -62.00649642944336, + "loss": 1.0753, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8921597003936768, + "rewards/margins": 1.5194437503814697, + "rewards/rejected": 1.3727158308029175, + "step": 243 + }, + { + "epoch": 0.06, + "grad_norm": 3.1350085735321045, + "learning_rate": 9.959195827797075e-06, + "logits/chosen": -0.010692513547837734, + "logits/rejected": -0.12326680123806, + "logps/chosen": -74.32156372070312, + "logps/rejected": -62.49479675292969, + "loss": 0.9513, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.038151264190674, + "rewards/margins": 1.973156213760376, + "rewards/rejected": 1.0649950504302979, + "step": 244 + }, + { + "epoch": 0.06, + "grad_norm": 4.032330513000488, + "learning_rate": 9.958861142338256e-06, + "logits/chosen": -0.010956194251775742, + "logits/rejected": -0.1312340945005417, + "logps/chosen": -66.21357727050781, + "logps/rejected": -59.63143539428711, + "loss": 1.0243, + "rewards/accuracies": 0.65625, + "rewards/chosen": 3.0633819103240967, + "rewards/margins": 1.6814600229263306, + "rewards/rejected": 1.381921648979187, + "step": 245 + }, + { + "epoch": 0.06, + "grad_norm": 3.2303600311279297, + "learning_rate": 9.958525095565052e-06, + "logits/chosen": -0.01823693886399269, + "logits/rejected": -0.13200950622558594, + "logps/chosen": -47.885887145996094, + "logps/rejected": -53.99846649169922, + "loss": 0.8838, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.915036201477051, + "rewards/margins": 2.218144178390503, + "rewards/rejected": 0.6968920230865479, + "step": 246 + }, + { + "epoch": 0.06, + "grad_norm": 3.3433594703674316, + "learning_rate": 9.958187687569712e-06, + "logits/chosen": 0.04060458391904831, + "logits/rejected": -0.16286146640777588, + "logps/chosen": -57.251731872558594, + "logps/rejected": -46.387733459472656, + "loss": 0.9586, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.928309917449951, + "rewards/margins": 1.6955748796463013, + "rewards/rejected": 1.232735276222229, + "step": 247 + }, + { + "epoch": 0.06, + "grad_norm": 3.846270799636841, + "learning_rate": 9.957848918444861e-06, + "logits/chosen": 0.05691595375537872, + "logits/rejected": -0.006114158779382706, + "logps/chosen": -58.125755310058594, + "logps/rejected": -72.37991333007812, + "loss": 0.9557, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.913275718688965, + "rewards/margins": 1.4843289852142334, + "rewards/rejected": 1.4289464950561523, + "step": 248 + }, + { + "epoch": 0.06, + "grad_norm": 3.167123556137085, + "learning_rate": 9.9575087882835e-06, + "logits/chosen": 0.06995488703250885, + "logits/rejected": -0.07510484009981155, + "logps/chosen": -57.54225158691406, + "logps/rejected": -61.48040008544922, + "loss": 0.8949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7780842781066895, + "rewards/margins": 1.6533939838409424, + "rewards/rejected": 1.124690294265747, + "step": 249 + }, + { + "epoch": 0.06, + "grad_norm": 3.790618896484375, + "learning_rate": 9.957167297179004e-06, + "logits/chosen": 0.025422267615795135, + "logits/rejected": -0.19098742306232452, + "logps/chosen": -53.019309997558594, + "logps/rejected": -44.70158386230469, + "loss": 0.9786, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.6335554122924805, + "rewards/margins": 1.970258116722107, + "rewards/rejected": 0.6632974147796631, + "step": 250 + }, + { + "epoch": 0.06, + "grad_norm": 3.6723849773406982, + "learning_rate": 9.956824445225117e-06, + "logits/chosen": 0.00253424234688282, + "logits/rejected": -0.14169922471046448, + "logps/chosen": -66.6920166015625, + "logps/rejected": -54.69944381713867, + "loss": 1.0088, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.665832042694092, + "rewards/margins": 1.8675881624221802, + "rewards/rejected": 0.7982438206672668, + "step": 251 + }, + { + "epoch": 0.06, + "grad_norm": 3.5620858669281006, + "learning_rate": 9.956480232515959e-06, + "logits/chosen": 0.008842475712299347, + "logits/rejected": -0.04579572379589081, + "logps/chosen": -59.38133239746094, + "logps/rejected": -73.07718658447266, + "loss": 0.9939, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.821563959121704, + "rewards/margins": 1.4647516012191772, + "rewards/rejected": 1.356812596321106, + "step": 252 + }, + { + "epoch": 0.06, + "grad_norm": 3.286179304122925, + "learning_rate": 9.956134659146026e-06, + "logits/chosen": -0.05479082465171814, + "logits/rejected": -0.15994137525558472, + "logps/chosen": -55.55274200439453, + "logps/rejected": -61.354347229003906, + "loss": 0.8934, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.994795560836792, + "rewards/margins": 1.9380853176116943, + "rewards/rejected": 1.0567100048065186, + "step": 253 + }, + { + "epoch": 0.06, + "grad_norm": 3.4747934341430664, + "learning_rate": 9.955787725210183e-06, + "logits/chosen": -0.05631411820650101, + "logits/rejected": -0.08540860563516617, + "logps/chosen": -50.64372634887695, + "logps/rejected": -75.99271392822266, + "loss": 1.0317, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7424936294555664, + "rewards/margins": 1.5168315172195435, + "rewards/rejected": 1.2256619930267334, + "step": 254 + }, + { + "epoch": 0.06, + "grad_norm": 3.1712582111358643, + "learning_rate": 9.955439430803672e-06, + "logits/chosen": -0.06237261742353439, + "logits/rejected": -0.1199665442109108, + "logps/chosen": -50.778690338134766, + "logps/rejected": -61.675537109375, + "loss": 0.9465, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.9618468284606934, + "rewards/margins": 1.613165020942688, + "rewards/rejected": 1.3486820459365845, + "step": 255 + }, + { + "epoch": 0.06, + "grad_norm": 2.8613808155059814, + "learning_rate": 9.955089776022108e-06, + "logits/chosen": -0.018830642104148865, + "logits/rejected": -0.13890637457370758, + "logps/chosen": -62.839378356933594, + "logps/rejected": -48.167991638183594, + "loss": 0.8981, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6923904418945312, + "rewards/margins": 2.0640666484832764, + "rewards/rejected": 0.6283236145973206, + "step": 256 + }, + { + "epoch": 0.06, + "grad_norm": 2.3355071544647217, + "learning_rate": 9.954738760961478e-06, + "logits/chosen": -0.021126065403223038, + "logits/rejected": -0.09574566781520844, + "logps/chosen": -45.24241638183594, + "logps/rejected": -58.65120315551758, + "loss": 0.818, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0362260341644287, + "rewards/margins": 2.2124288082122803, + "rewards/rejected": 0.8237971067428589, + "step": 257 + }, + { + "epoch": 0.06, + "grad_norm": 2.945983409881592, + "learning_rate": 9.954386385718142e-06, + "logits/chosen": 0.038085971027612686, + "logits/rejected": -0.1849203109741211, + "logps/chosen": -64.95195007324219, + "logps/rejected": -61.2130241394043, + "loss": 0.8873, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.878218412399292, + "rewards/margins": 2.485759735107422, + "rewards/rejected": 0.39245903491973877, + "step": 258 + }, + { + "epoch": 0.06, + "grad_norm": 3.114866018295288, + "learning_rate": 9.954032650388838e-06, + "logits/chosen": 0.025161439552903175, + "logits/rejected": -0.15952882170677185, + "logps/chosen": -64.68595886230469, + "logps/rejected": -50.344173431396484, + "loss": 0.9087, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8156204223632812, + "rewards/margins": 2.1888604164123535, + "rewards/rejected": 0.6267600655555725, + "step": 259 + }, + { + "epoch": 0.07, + "grad_norm": 3.173144817352295, + "learning_rate": 9.953677555070671e-06, + "logits/chosen": -0.021675098687410355, + "logits/rejected": -0.2014334499835968, + "logps/chosen": -60.18293762207031, + "logps/rejected": -59.47661590576172, + "loss": 0.9204, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.813319206237793, + "rewards/margins": 1.9523682594299316, + "rewards/rejected": 0.8609510064125061, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 3.47957181930542, + "learning_rate": 9.953321099861125e-06, + "logits/chosen": -0.01664409227669239, + "logits/rejected": -0.13479499518871307, + "logps/chosen": -57.60078430175781, + "logps/rejected": -45.91553497314453, + "loss": 1.024, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8528048992156982, + "rewards/margins": 1.9759442806243896, + "rewards/rejected": 0.8768605589866638, + "step": 261 + }, + { + "epoch": 0.07, + "grad_norm": 2.5242416858673096, + "learning_rate": 9.952963284858049e-06, + "logits/chosen": -0.008567234501242638, + "logits/rejected": -0.29239141941070557, + "logps/chosen": -59.015342712402344, + "logps/rejected": -47.938377380371094, + "loss": 0.8327, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.143948793411255, + "rewards/margins": 2.2713019847869873, + "rewards/rejected": 0.8726466298103333, + "step": 262 + }, + { + "epoch": 0.07, + "grad_norm": 3.4378013610839844, + "learning_rate": 9.952604110159677e-06, + "logits/chosen": 0.050843093544244766, + "logits/rejected": -0.0566747710108757, + "logps/chosen": -67.88060760498047, + "logps/rejected": -63.44704818725586, + "loss": 0.8766, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8345236778259277, + "rewards/margins": 2.0747501850128174, + "rewards/rejected": 0.7597737312316895, + "step": 263 + }, + { + "epoch": 0.07, + "grad_norm": 4.514856338500977, + "learning_rate": 9.952243575864608e-06, + "logits/chosen": -0.06751511991024017, + "logits/rejected": -0.19486403465270996, + "logps/chosen": -55.143524169921875, + "logps/rejected": -63.166709899902344, + "loss": 1.0907, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.0603652000427246, + "rewards/margins": 2.0631425380706787, + "rewards/rejected": 0.9972226619720459, + "step": 264 + }, + { + "epoch": 0.07, + "grad_norm": 3.1671595573425293, + "learning_rate": 9.951881682071815e-06, + "logits/chosen": -0.04271695762872696, + "logits/rejected": -0.1482456624507904, + "logps/chosen": -53.477203369140625, + "logps/rejected": -62.2862663269043, + "loss": 1.0243, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.876809597015381, + "rewards/margins": 1.771837830543518, + "rewards/rejected": 1.1049716472625732, + "step": 265 + }, + { + "epoch": 0.07, + "grad_norm": 3.5144927501678467, + "learning_rate": 9.951518428880649e-06, + "logits/chosen": 0.011799395084381104, + "logits/rejected": -0.18356533348560333, + "logps/chosen": -57.75822448730469, + "logps/rejected": -49.11381912231445, + "loss": 0.9896, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.572988748550415, + "rewards/margins": 2.039951801300049, + "rewards/rejected": 0.5330367088317871, + "step": 266 + }, + { + "epoch": 0.07, + "grad_norm": 3.2502758502960205, + "learning_rate": 9.951153816390828e-06, + "logits/chosen": -0.04240484535694122, + "logits/rejected": -0.16351622343063354, + "logps/chosen": -54.50054931640625, + "logps/rejected": -57.1463737487793, + "loss": 0.9831, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.926792860031128, + "rewards/margins": 1.9327080249786377, + "rewards/rejected": 0.9940848350524902, + "step": 267 + }, + { + "epoch": 0.07, + "grad_norm": 2.7186038494110107, + "learning_rate": 9.950787844702447e-06, + "logits/chosen": 0.07561527192592621, + "logits/rejected": -0.08013591915369034, + "logps/chosen": -59.82940673828125, + "logps/rejected": -60.95476531982422, + "loss": 0.8105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7437000274658203, + "rewards/margins": 1.9469077587127686, + "rewards/rejected": 0.7967923879623413, + "step": 268 + }, + { + "epoch": 0.07, + "grad_norm": 2.9716169834136963, + "learning_rate": 9.950420513915974e-06, + "logits/chosen": -0.044993333518505096, + "logits/rejected": -0.1838812381029129, + "logps/chosen": -53.99478530883789, + "logps/rejected": -52.73881149291992, + "loss": 0.8993, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.107020854949951, + "rewards/margins": 2.3299014568328857, + "rewards/rejected": 0.7771195769309998, + "step": 269 + }, + { + "epoch": 0.07, + "grad_norm": 4.011703014373779, + "learning_rate": 9.950051824132247e-06, + "logits/chosen": 0.039597757160663605, + "logits/rejected": -0.03815944492816925, + "logps/chosen": -50.6192626953125, + "logps/rejected": -66.53437805175781, + "loss": 0.9623, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6349730491638184, + "rewards/margins": 1.4311325550079346, + "rewards/rejected": 1.203840732574463, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 3.5865509510040283, + "learning_rate": 9.94968177545248e-06, + "logits/chosen": -0.015386410057544708, + "logits/rejected": -0.15331092476844788, + "logps/chosen": -59.10280227661133, + "logps/rejected": -52.35159683227539, + "loss": 1.049, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5914080142974854, + "rewards/margins": 1.8785828351974487, + "rewards/rejected": 0.7128249406814575, + "step": 271 + }, + { + "epoch": 0.07, + "grad_norm": 3.2685890197753906, + "learning_rate": 9.949310367978262e-06, + "logits/chosen": -0.015819206833839417, + "logits/rejected": -0.25385817885398865, + "logps/chosen": -69.10813903808594, + "logps/rejected": -47.456695556640625, + "loss": 0.9958, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.660665988922119, + "rewards/margins": 2.054964542388916, + "rewards/rejected": 0.6057014465332031, + "step": 272 + }, + { + "epoch": 0.07, + "grad_norm": 3.3216958045959473, + "learning_rate": 9.94893760181155e-06, + "logits/chosen": -0.025716427713632584, + "logits/rejected": -0.1400923877954483, + "logps/chosen": -52.0333137512207, + "logps/rejected": -55.45915985107422, + "loss": 0.8373, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.69113826751709, + "rewards/margins": 2.2123122215270996, + "rewards/rejected": 0.4788258671760559, + "step": 273 + }, + { + "epoch": 0.07, + "grad_norm": 4.938079833984375, + "learning_rate": 9.948563477054675e-06, + "logits/chosen": 0.02767482027411461, + "logits/rejected": -0.04078149423003197, + "logps/chosen": -59.554134368896484, + "logps/rejected": -66.64747619628906, + "loss": 1.2593, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.585456609725952, + "rewards/margins": 0.9795442223548889, + "rewards/rejected": 1.605912446975708, + "step": 274 + }, + { + "epoch": 0.07, + "grad_norm": 2.1929807662963867, + "learning_rate": 9.948187993810345e-06, + "logits/chosen": -0.049857206642627716, + "logits/rejected": -0.21126523613929749, + "logps/chosen": -46.82365417480469, + "logps/rejected": -47.55079650878906, + "loss": 0.7585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0159831047058105, + "rewards/margins": 2.5892446041107178, + "rewards/rejected": 0.4267384111881256, + "step": 275 + }, + { + "epoch": 0.07, + "grad_norm": 3.231459617614746, + "learning_rate": 9.947811152181637e-06, + "logits/chosen": 0.07066163420677185, + "logits/rejected": -0.052284181118011475, + "logps/chosen": -61.377864837646484, + "logps/rejected": -58.51459884643555, + "loss": 0.996, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.93624210357666, + "rewards/margins": 1.7734848260879517, + "rewards/rejected": 1.162757158279419, + "step": 276 + }, + { + "epoch": 0.07, + "grad_norm": 2.8200905323028564, + "learning_rate": 9.947432952272003e-06, + "logits/chosen": -0.024884840473532677, + "logits/rejected": -0.23527027666568756, + "logps/chosen": -59.40528106689453, + "logps/rejected": -57.115875244140625, + "loss": 0.8405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0489518642425537, + "rewards/margins": 2.4292662143707275, + "rewards/rejected": 0.6196856498718262, + "step": 277 + }, + { + "epoch": 0.07, + "grad_norm": 3.3689382076263428, + "learning_rate": 9.947053394185266e-06, + "logits/chosen": 0.09264566749334335, + "logits/rejected": -0.11382004618644714, + "logps/chosen": -57.027103424072266, + "logps/rejected": -66.88188934326172, + "loss": 0.8773, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.687199354171753, + "rewards/margins": 2.2280797958374023, + "rewards/rejected": 0.45911943912506104, + "step": 278 + }, + { + "epoch": 0.07, + "grad_norm": 3.046231746673584, + "learning_rate": 9.946672478025623e-06, + "logits/chosen": 0.022813180461525917, + "logits/rejected": -0.06579121947288513, + "logps/chosen": -47.12384033203125, + "logps/rejected": -61.57200622558594, + "loss": 0.9745, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6737611293792725, + "rewards/margins": 1.9956467151641846, + "rewards/rejected": 0.6781145334243774, + "step": 279 + }, + { + "epoch": 0.07, + "grad_norm": 3.3960421085357666, + "learning_rate": 9.946290203897643e-06, + "logits/chosen": -0.05969715490937233, + "logits/rejected": -0.18275323510169983, + "logps/chosen": -51.6929931640625, + "logps/rejected": -54.971351623535156, + "loss": 0.9507, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.728266477584839, + "rewards/margins": 2.039769411087036, + "rewards/rejected": 0.6884973049163818, + "step": 280 + }, + { + "epoch": 0.07, + "grad_norm": 2.993216037750244, + "learning_rate": 9.945906571906272e-06, + "logits/chosen": -0.010306183248758316, + "logits/rejected": -0.11061849445104599, + "logps/chosen": -60.551856994628906, + "logps/rejected": -56.37430953979492, + "loss": 0.916, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8527607917785645, + "rewards/margins": 1.8460559844970703, + "rewards/rejected": 1.0067046880722046, + "step": 281 + }, + { + "epoch": 0.07, + "grad_norm": 3.059725284576416, + "learning_rate": 9.945521582156821e-06, + "logits/chosen": 0.04374184086918831, + "logits/rejected": -0.10081253200769424, + "logps/chosen": -61.94560241699219, + "logps/rejected": -47.07429504394531, + "loss": 0.9403, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7281088829040527, + "rewards/margins": 1.863473653793335, + "rewards/rejected": 0.8646350502967834, + "step": 282 + }, + { + "epoch": 0.07, + "grad_norm": 4.654338359832764, + "learning_rate": 9.945135234754981e-06, + "logits/chosen": -0.021941784769296646, + "logits/rejected": -0.18562695384025574, + "logps/chosen": -53.56764221191406, + "logps/rejected": -58.0326042175293, + "loss": 0.9643, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.888859272003174, + "rewards/margins": 2.0773367881774902, + "rewards/rejected": 0.811522364616394, + "step": 283 + }, + { + "epoch": 0.07, + "grad_norm": 4.072115421295166, + "learning_rate": 9.944747529806811e-06, + "logits/chosen": 0.07901390641927719, + "logits/rejected": -0.055033810436725616, + "logps/chosen": -60.49229431152344, + "logps/rejected": -55.31265640258789, + "loss": 1.0567, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.97866153717041, + "rewards/margins": 1.5729084014892578, + "rewards/rejected": 1.405753493309021, + "step": 284 + }, + { + "epoch": 0.07, + "grad_norm": 2.7784106731414795, + "learning_rate": 9.944358467418745e-06, + "logits/chosen": 0.07768256217241287, + "logits/rejected": -0.03685057908296585, + "logps/chosen": -45.297874450683594, + "logps/rejected": -68.57764434814453, + "loss": 0.709, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9126620292663574, + "rewards/margins": 2.6529629230499268, + "rewards/rejected": 0.2596990764141083, + "step": 285 + }, + { + "epoch": 0.07, + "grad_norm": 2.4160115718841553, + "learning_rate": 9.943968047697588e-06, + "logits/chosen": 0.042760416865348816, + "logits/rejected": -0.2139359712600708, + "logps/chosen": -51.60176086425781, + "logps/rejected": -43.60008239746094, + "loss": 0.7277, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0738766193389893, + "rewards/margins": 2.7983815670013428, + "rewards/rejected": 0.2754947543144226, + "step": 286 + }, + { + "epoch": 0.07, + "grad_norm": 2.986557960510254, + "learning_rate": 9.94357627075052e-06, + "logits/chosen": 0.11131319403648376, + "logits/rejected": -0.2052956223487854, + "logps/chosen": -80.07304382324219, + "logps/rejected": -45.05133819580078, + "loss": 0.8501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.793893814086914, + "rewards/margins": 2.0777525901794434, + "rewards/rejected": 0.7161411643028259, + "step": 287 + }, + { + "epoch": 0.07, + "grad_norm": 3.8060524463653564, + "learning_rate": 9.943183136685092e-06, + "logits/chosen": -0.04737216979265213, + "logits/rejected": -0.17065036296844482, + "logps/chosen": -54.26647186279297, + "logps/rejected": -54.19017028808594, + "loss": 1.0911, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.4240236282348633, + "rewards/margins": 1.4296622276306152, + "rewards/rejected": 0.9943613409996033, + "step": 288 + }, + { + "epoch": 0.07, + "grad_norm": 3.9175753593444824, + "learning_rate": 9.942788645609227e-06, + "logits/chosen": 0.12948983907699585, + "logits/rejected": -0.08218345046043396, + "logps/chosen": -76.8341064453125, + "logps/rejected": -52.76068115234375, + "loss": 0.9997, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7736902236938477, + "rewards/margins": 2.0104944705963135, + "rewards/rejected": 0.7631958723068237, + "step": 289 + }, + { + "epoch": 0.07, + "grad_norm": 3.3565192222595215, + "learning_rate": 9.942392797631222e-06, + "logits/chosen": 0.022862661629915237, + "logits/rejected": -0.18920399248600006, + "logps/chosen": -48.23899841308594, + "logps/rejected": -45.552127838134766, + "loss": 0.9108, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.267625331878662, + "rewards/margins": 1.9324634075164795, + "rewards/rejected": 0.33516189455986023, + "step": 290 + }, + { + "epoch": 0.07, + "grad_norm": 3.90679669380188, + "learning_rate": 9.941995592859746e-06, + "logits/chosen": 0.008859829977154732, + "logits/rejected": -0.16182956099510193, + "logps/chosen": -66.50563049316406, + "logps/rejected": -56.94880294799805, + "loss": 1.1833, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6624906063079834, + "rewards/margins": 1.4977892637252808, + "rewards/rejected": 1.1647014617919922, + "step": 291 + }, + { + "epoch": 0.07, + "grad_norm": 4.525501251220703, + "learning_rate": 9.94159703140384e-06, + "logits/chosen": 0.013035346753895283, + "logits/rejected": -0.033685654401779175, + "logps/chosen": -57.809349060058594, + "logps/rejected": -68.5478515625, + "loss": 1.0762, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.981092691421509, + "rewards/margins": 1.452150821685791, + "rewards/rejected": 1.5289419889450073, + "step": 292 + }, + { + "epoch": 0.07, + "grad_norm": 2.7977488040924072, + "learning_rate": 9.941197113372916e-06, + "logits/chosen": -0.04754369705915451, + "logits/rejected": -0.2676200270652771, + "logps/chosen": -61.75425720214844, + "logps/rejected": -46.35275650024414, + "loss": 0.8137, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8550071716308594, + "rewards/margins": 2.8539280891418457, + "rewards/rejected": 0.0010790899395942688, + "step": 293 + }, + { + "epoch": 0.07, + "grad_norm": 2.9795143604278564, + "learning_rate": 9.940795838876763e-06, + "logits/chosen": 0.024588298052549362, + "logits/rejected": -0.22634682059288025, + "logps/chosen": -56.76502227783203, + "logps/rejected": -48.299705505371094, + "loss": 0.9028, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7959415912628174, + "rewards/margins": 1.980587363243103, + "rewards/rejected": 0.8153544068336487, + "step": 294 + }, + { + "epoch": 0.07, + "grad_norm": 2.7203431129455566, + "learning_rate": 9.940393208025539e-06, + "logits/chosen": 0.054008789360523224, + "logits/rejected": -0.039651621133089066, + "logps/chosen": -48.82220458984375, + "logps/rejected": -48.91431427001953, + "loss": 0.8628, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8641247749328613, + "rewards/margins": 1.8991299867630005, + "rewards/rejected": 0.9649950265884399, + "step": 295 + }, + { + "epoch": 0.07, + "grad_norm": 2.8994228839874268, + "learning_rate": 9.939989220929772e-06, + "logits/chosen": -0.01963082328438759, + "logits/rejected": -0.15010780096054077, + "logps/chosen": -63.491661071777344, + "logps/rejected": -54.345787048339844, + "loss": 0.8695, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.987744092941284, + "rewards/margins": 2.0596697330474854, + "rewards/rejected": 0.9280742406845093, + "step": 296 + }, + { + "epoch": 0.07, + "grad_norm": 2.9784090518951416, + "learning_rate": 9.939583877700369e-06, + "logits/chosen": -0.03786662966012955, + "logits/rejected": -0.1764030158519745, + "logps/chosen": -60.6814079284668, + "logps/rejected": -60.68187713623047, + "loss": 0.8928, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.052976608276367, + "rewards/margins": 1.9678651094436646, + "rewards/rejected": 1.0851118564605713, + "step": 297 + }, + { + "epoch": 0.07, + "grad_norm": 4.3852081298828125, + "learning_rate": 9.939177178448604e-06, + "logits/chosen": 0.05781906098127365, + "logits/rejected": -0.09468045085668564, + "logps/chosen": -62.37474822998047, + "logps/rejected": -48.33103942871094, + "loss": 1.1473, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.8015382289886475, + "rewards/margins": 1.3046029806137085, + "rewards/rejected": 1.496935248374939, + "step": 298 + }, + { + "epoch": 0.07, + "grad_norm": 2.6233317852020264, + "learning_rate": 9.938769123286122e-06, + "logits/chosen": 0.008626111783087254, + "logits/rejected": -0.07832784950733185, + "logps/chosen": -48.198509216308594, + "logps/rejected": -60.18992614746094, + "loss": 0.7608, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9256765842437744, + "rewards/margins": 2.1987929344177246, + "rewards/rejected": 0.7268834710121155, + "step": 299 + }, + { + "epoch": 0.08, + "grad_norm": 2.6997079849243164, + "learning_rate": 9.938359712324948e-06, + "logits/chosen": 0.05839908495545387, + "logits/rejected": -0.09411751478910446, + "logps/chosen": -58.020362854003906, + "logps/rejected": -63.40400695800781, + "loss": 0.8689, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.759727716445923, + "rewards/margins": 2.257526397705078, + "rewards/rejected": 0.5022012591362, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 2.512686252593994, + "learning_rate": 9.93794894567747e-06, + "logits/chosen": -0.03542406111955643, + "logits/rejected": -0.15203432738780975, + "logps/chosen": -58.29736328125, + "logps/rejected": -52.975860595703125, + "loss": 0.8137, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8680999279022217, + "rewards/margins": 2.2765307426452637, + "rewards/rejected": 0.591569185256958, + "step": 301 + }, + { + "epoch": 0.08, + "grad_norm": 3.4966952800750732, + "learning_rate": 9.937536823456455e-06, + "logits/chosen": 0.1019967719912529, + "logits/rejected": -0.16887220740318298, + "logps/chosen": -57.35480499267578, + "logps/rejected": -44.50685119628906, + "loss": 0.8515, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8715710639953613, + "rewards/margins": 2.5779595375061035, + "rewards/rejected": 0.29361194372177124, + "step": 302 + }, + { + "epoch": 0.08, + "grad_norm": 3.3807382583618164, + "learning_rate": 9.937123345775039e-06, + "logits/chosen": -0.036843471229076385, + "logits/rejected": -0.21698512136936188, + "logps/chosen": -62.497249603271484, + "logps/rejected": -46.939613342285156, + "loss": 0.8938, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8497118949890137, + "rewards/margins": 2.3247315883636475, + "rewards/rejected": 0.5249801278114319, + "step": 303 + }, + { + "epoch": 0.08, + "grad_norm": 4.097720623016357, + "learning_rate": 9.936708512746729e-06, + "logits/chosen": -0.007653960958123207, + "logits/rejected": -0.0756809413433075, + "logps/chosen": -45.600868225097656, + "logps/rejected": -62.80945587158203, + "loss": 1.0012, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.595895528793335, + "rewards/margins": 1.2639520168304443, + "rewards/rejected": 1.3319432735443115, + "step": 304 + }, + { + "epoch": 0.08, + "grad_norm": 2.8511948585510254, + "learning_rate": 9.936292324485406e-06, + "logits/chosen": 0.06443946063518524, + "logits/rejected": -0.057288192212581635, + "logps/chosen": -58.41487121582031, + "logps/rejected": -55.378562927246094, + "loss": 0.9194, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9265530109405518, + "rewards/margins": 1.9058139324188232, + "rewards/rejected": 1.0207390785217285, + "step": 305 + }, + { + "epoch": 0.08, + "grad_norm": 3.6373565196990967, + "learning_rate": 9.935874781105323e-06, + "logits/chosen": -0.0533718466758728, + "logits/rejected": -0.09892257302999496, + "logps/chosen": -57.20489501953125, + "logps/rejected": -72.27731323242188, + "loss": 1.0397, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8312389850616455, + "rewards/margins": 1.8902604579925537, + "rewards/rejected": 0.9409787058830261, + "step": 306 + }, + { + "epoch": 0.08, + "grad_norm": 2.671745777130127, + "learning_rate": 9.935455882721105e-06, + "logits/chosen": -0.04874657467007637, + "logits/rejected": -0.19493170082569122, + "logps/chosen": -51.39527130126953, + "logps/rejected": -52.45747375488281, + "loss": 0.8011, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7892723083496094, + "rewards/margins": 2.4701361656188965, + "rewards/rejected": 0.3191359043121338, + "step": 307 + }, + { + "epoch": 0.08, + "grad_norm": 3.3442728519439697, + "learning_rate": 9.935035629447749e-06, + "logits/chosen": 0.0680462121963501, + "logits/rejected": -0.059840962290763855, + "logps/chosen": -62.96919250488281, + "logps/rejected": -58.06789779663086, + "loss": 0.9093, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8943710327148438, + "rewards/margins": 2.4940788745880127, + "rewards/rejected": 0.4002920389175415, + "step": 308 + }, + { + "epoch": 0.08, + "grad_norm": 2.8272910118103027, + "learning_rate": 9.934614021400624e-06, + "logits/chosen": 0.05291680991649628, + "logits/rejected": -0.17904140055179596, + "logps/chosen": -64.06205749511719, + "logps/rejected": -53.3700065612793, + "loss": 0.7572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9924728870391846, + "rewards/margins": 2.962538957595825, + "rewards/rejected": 0.02993379533290863, + "step": 309 + }, + { + "epoch": 0.08, + "grad_norm": 2.499281406402588, + "learning_rate": 9.934191058695467e-06, + "logits/chosen": 0.004836723208427429, + "logits/rejected": -0.049534399062395096, + "logps/chosen": -57.86066436767578, + "logps/rejected": -55.245384216308594, + "loss": 0.8539, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.833603858947754, + "rewards/margins": 2.293957233428955, + "rewards/rejected": 0.5396466255187988, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 3.486506938934326, + "learning_rate": 9.933766741448395e-06, + "logits/chosen": 0.015127227641642094, + "logits/rejected": -0.09758293628692627, + "logps/chosen": -71.5963363647461, + "logps/rejected": -59.51177978515625, + "loss": 1.0491, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.774867057800293, + "rewards/margins": 1.536354422569275, + "rewards/rejected": 1.2385122776031494, + "step": 311 + }, + { + "epoch": 0.08, + "grad_norm": 2.645892858505249, + "learning_rate": 9.93334106977589e-06, + "logits/chosen": 0.018511297181248665, + "logits/rejected": -0.05503586307168007, + "logps/chosen": -54.56965255737305, + "logps/rejected": -68.70792388916016, + "loss": 0.8486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8785958290100098, + "rewards/margins": 2.3876657485961914, + "rewards/rejected": 0.4909304976463318, + "step": 312 + }, + { + "epoch": 0.08, + "grad_norm": 3.772033214569092, + "learning_rate": 9.932914043794808e-06, + "logits/chosen": 0.012671476230025291, + "logits/rejected": -0.11654003709554672, + "logps/chosen": -69.17906951904297, + "logps/rejected": -68.44788360595703, + "loss": 1.1549, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8338708877563477, + "rewards/margins": 1.7006100416183472, + "rewards/rejected": 1.13326096534729, + "step": 313 + }, + { + "epoch": 0.08, + "grad_norm": 2.7341525554656982, + "learning_rate": 9.932485663622376e-06, + "logits/chosen": 0.03662727028131485, + "logits/rejected": -0.11035530269145966, + "logps/chosen": -49.224159240722656, + "logps/rejected": -46.23178482055664, + "loss": 0.7783, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.802339553833008, + "rewards/margins": 2.503061294555664, + "rewards/rejected": 0.29927852749824524, + "step": 314 + }, + { + "epoch": 0.08, + "grad_norm": 3.286503314971924, + "learning_rate": 9.932055929376196e-06, + "logits/chosen": -0.06298713386058807, + "logits/rejected": -0.1744554340839386, + "logps/chosen": -46.43614196777344, + "logps/rejected": -60.353782653808594, + "loss": 0.9072, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.908914804458618, + "rewards/margins": 1.9509565830230713, + "rewards/rejected": 0.9579580426216125, + "step": 315 + }, + { + "epoch": 0.08, + "grad_norm": 3.602029323577881, + "learning_rate": 9.931624841174238e-06, + "logits/chosen": 0.14744846522808075, + "logits/rejected": -0.0534619465470314, + "logps/chosen": -63.366947174072266, + "logps/rejected": -48.88682556152344, + "loss": 0.9454, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0921952724456787, + "rewards/margins": 1.754744291305542, + "rewards/rejected": 1.3374511003494263, + "step": 316 + }, + { + "epoch": 0.08, + "grad_norm": 2.9119620323181152, + "learning_rate": 9.931192399134844e-06, + "logits/chosen": 0.08313746750354767, + "logits/rejected": -0.11253957450389862, + "logps/chosen": -57.35070037841797, + "logps/rejected": -56.991233825683594, + "loss": 0.7914, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.727114200592041, + "rewards/margins": 2.448586940765381, + "rewards/rejected": 0.27852699160575867, + "step": 317 + }, + { + "epoch": 0.08, + "grad_norm": 2.0697216987609863, + "learning_rate": 9.93075860337673e-06, + "logits/chosen": -0.0068313367664813995, + "logits/rejected": -0.25468388199806213, + "logps/chosen": -56.59347152709961, + "logps/rejected": -45.57395553588867, + "loss": 0.7843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.147831678390503, + "rewards/margins": 2.796706199645996, + "rewards/rejected": 0.3511252999305725, + "step": 318 + }, + { + "epoch": 0.08, + "grad_norm": 3.95686411857605, + "learning_rate": 9.930323454018982e-06, + "logits/chosen": -0.07530046999454498, + "logits/rejected": -0.16357611119747162, + "logps/chosen": -48.743019104003906, + "logps/rejected": -59.77751922607422, + "loss": 0.9772, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.7192201614379883, + "rewards/margins": 1.858445167541504, + "rewards/rejected": 0.8607749342918396, + "step": 319 + }, + { + "epoch": 0.08, + "grad_norm": 3.155421495437622, + "learning_rate": 9.929886951181059e-06, + "logits/chosen": 0.056040652096271515, + "logits/rejected": -0.06856440007686615, + "logps/chosen": -58.2742805480957, + "logps/rejected": -65.72766876220703, + "loss": 0.934, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.931642532348633, + "rewards/margins": 2.069840431213379, + "rewards/rejected": 0.8618027567863464, + "step": 320 + }, + { + "epoch": 0.08, + "grad_norm": 3.4551303386688232, + "learning_rate": 9.929449094982788e-06, + "logits/chosen": -0.08389665186405182, + "logits/rejected": -0.2046484500169754, + "logps/chosen": -55.898826599121094, + "logps/rejected": -53.6257438659668, + "loss": 0.8966, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8743393421173096, + "rewards/margins": 2.2279574871063232, + "rewards/rejected": 0.6463820934295654, + "step": 321 + }, + { + "epoch": 0.08, + "grad_norm": 2.7708935737609863, + "learning_rate": 9.929009885544371e-06, + "logits/chosen": 0.03172945976257324, + "logits/rejected": -0.1042415052652359, + "logps/chosen": -58.53337097167969, + "logps/rejected": -55.63811492919922, + "loss": 0.8471, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.688507318496704, + "rewards/margins": 2.2936501502990723, + "rewards/rejected": 0.39485710859298706, + "step": 322 + }, + { + "epoch": 0.08, + "grad_norm": 2.784275770187378, + "learning_rate": 9.928569322986384e-06, + "logits/chosen": 0.010031027719378471, + "logits/rejected": -0.13843072950839996, + "logps/chosen": -48.730533599853516, + "logps/rejected": -54.37205123901367, + "loss": 0.811, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.937981367111206, + "rewards/margins": 2.3255374431610107, + "rewards/rejected": 0.612443745136261, + "step": 323 + }, + { + "epoch": 0.08, + "grad_norm": 3.0935683250427246, + "learning_rate": 9.928127407429764e-06, + "logits/chosen": -0.03338048234581947, + "logits/rejected": -0.1378689408302307, + "logps/chosen": -56.138633728027344, + "logps/rejected": -61.54387283325195, + "loss": 0.9413, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.812645196914673, + "rewards/margins": 1.9581043720245361, + "rewards/rejected": 0.8545406460762024, + "step": 324 + }, + { + "epoch": 0.08, + "grad_norm": 3.7901172637939453, + "learning_rate": 9.927684138995833e-06, + "logits/chosen": 0.00662496592849493, + "logits/rejected": -0.06349006295204163, + "logps/chosen": -60.18121337890625, + "logps/rejected": -70.85355377197266, + "loss": 0.9618, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5829999446868896, + "rewards/margins": 1.4493499994277954, + "rewards/rejected": 1.1336500644683838, + "step": 325 + }, + { + "epoch": 0.08, + "grad_norm": 3.742544174194336, + "learning_rate": 9.927239517806271e-06, + "logits/chosen": -0.05551905184984207, + "logits/rejected": -0.10498897731304169, + "logps/chosen": -52.031089782714844, + "logps/rejected": -65.8486099243164, + "loss": 1.0251, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5405704975128174, + "rewards/margins": 1.3952710628509521, + "rewards/rejected": 1.1452991962432861, + "step": 326 + }, + { + "epoch": 0.08, + "grad_norm": 3.2722370624542236, + "learning_rate": 9.926793543983141e-06, + "logits/chosen": -0.009194097481667995, + "logits/rejected": -0.14075572788715363, + "logps/chosen": -57.296417236328125, + "logps/rejected": -53.1533088684082, + "loss": 0.9043, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0440897941589355, + "rewards/margins": 2.357949733734131, + "rewards/rejected": 0.6861402988433838, + "step": 327 + }, + { + "epoch": 0.08, + "grad_norm": 2.5573694705963135, + "learning_rate": 9.926346217648874e-06, + "logits/chosen": -0.03803541511297226, + "logits/rejected": -0.1545065939426422, + "logps/chosen": -58.88774108886719, + "logps/rejected": -62.88594055175781, + "loss": 0.8683, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.551027297973633, + "rewards/margins": 2.5096588134765625, + "rewards/rejected": 0.04136842489242554, + "step": 328 + }, + { + "epoch": 0.08, + "grad_norm": 4.151278495788574, + "learning_rate": 9.925897538926267e-06, + "logits/chosen": -0.10140612721443176, + "logits/rejected": -0.21414496004581451, + "logps/chosen": -49.44990539550781, + "logps/rejected": -54.34981155395508, + "loss": 1.0346, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.526200294494629, + "rewards/margins": 1.7722766399383545, + "rewards/rejected": 0.7539236545562744, + "step": 329 + }, + { + "epoch": 0.08, + "grad_norm": 2.862481117248535, + "learning_rate": 9.925447507938493e-06, + "logits/chosen": 0.005923150572925806, + "logits/rejected": -0.14703449606895447, + "logps/chosen": -49.27286148071289, + "logps/rejected": -45.29887008666992, + "loss": 0.7849, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0067601203918457, + "rewards/margins": 2.4229676723480225, + "rewards/rejected": 0.5837924480438232, + "step": 330 + }, + { + "epoch": 0.08, + "grad_norm": 3.870870351791382, + "learning_rate": 9.924996124809095e-06, + "logits/chosen": 0.11428529024124146, + "logits/rejected": 0.03993130847811699, + "logps/chosen": -63.69690704345703, + "logps/rejected": -68.9210205078125, + "loss": 1.0913, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.4655163288116455, + "rewards/margins": 1.681807518005371, + "rewards/rejected": 0.7837087512016296, + "step": 331 + }, + { + "epoch": 0.08, + "grad_norm": 3.0586647987365723, + "learning_rate": 9.924543389661987e-06, + "logits/chosen": -0.06192668154835701, + "logits/rejected": -0.11975309252738953, + "logps/chosen": -42.73114776611328, + "logps/rejected": -52.146968841552734, + "loss": 0.8623, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6678690910339355, + "rewards/margins": 2.0698325634002686, + "rewards/rejected": 0.5980366468429565, + "step": 332 + }, + { + "epoch": 0.08, + "grad_norm": 3.6554479598999023, + "learning_rate": 9.924089302621455e-06, + "logits/chosen": -0.029238993301987648, + "logits/rejected": -0.15654680132865906, + "logps/chosen": -62.349365234375, + "logps/rejected": -55.799442291259766, + "loss": 0.9632, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.985027551651001, + "rewards/margins": 2.3443732261657715, + "rewards/rejected": 0.6406540870666504, + "step": 333 + }, + { + "epoch": 0.08, + "grad_norm": 3.1802306175231934, + "learning_rate": 9.923633863812158e-06, + "logits/chosen": 0.061075612902641296, + "logits/rejected": -0.08810597658157349, + "logps/chosen": -52.791629791259766, + "logps/rejected": -51.596736907958984, + "loss": 0.8656, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.971538782119751, + "rewards/margins": 2.482081890106201, + "rewards/rejected": 0.48945704102516174, + "step": 334 + }, + { + "epoch": 0.08, + "grad_norm": 3.895820379257202, + "learning_rate": 9.92317707335912e-06, + "logits/chosen": -0.0234904196113348, + "logits/rejected": -0.1920991688966751, + "logps/chosen": -56.92060852050781, + "logps/rejected": -60.46991729736328, + "loss": 0.9935, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7879457473754883, + "rewards/margins": 2.00956130027771, + "rewards/rejected": 0.7783843874931335, + "step": 335 + }, + { + "epoch": 0.08, + "grad_norm": 2.837878465652466, + "learning_rate": 9.922718931387742e-06, + "logits/chosen": -0.004151487722992897, + "logits/rejected": -0.14850057661533356, + "logps/chosen": -58.1928596496582, + "logps/rejected": -62.57109069824219, + "loss": 0.7803, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.879218816757202, + "rewards/margins": 2.5033061504364014, + "rewards/rejected": 0.37591269612312317, + "step": 336 + }, + { + "epoch": 0.08, + "grad_norm": 3.0506112575531006, + "learning_rate": 9.922259438023794e-06, + "logits/chosen": -0.011249048635363579, + "logits/rejected": -0.06845725327730179, + "logps/chosen": -51.65343475341797, + "logps/rejected": -74.39012145996094, + "loss": 0.7531, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9396748542785645, + "rewards/margins": 2.116377830505371, + "rewards/rejected": 0.8232971429824829, + "step": 337 + }, + { + "epoch": 0.08, + "grad_norm": 3.4133710861206055, + "learning_rate": 9.921798593393415e-06, + "logits/chosen": 0.12064415216445923, + "logits/rejected": -0.011028677225112915, + "logps/chosen": -60.694068908691406, + "logps/rejected": -65.75334167480469, + "loss": 1.0508, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.83310604095459, + "rewards/margins": 1.8921624422073364, + "rewards/rejected": 0.9409438967704773, + "step": 338 + }, + { + "epoch": 0.08, + "grad_norm": 3.05314302444458, + "learning_rate": 9.92133639762312e-06, + "logits/chosen": -0.1040511280298233, + "logits/rejected": -0.13201820850372314, + "logps/chosen": -55.355262756347656, + "logps/rejected": -63.624732971191406, + "loss": 0.9396, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.788881778717041, + "rewards/margins": 2.120029926300049, + "rewards/rejected": 0.6688517928123474, + "step": 339 + }, + { + "epoch": 0.09, + "grad_norm": 3.563896894454956, + "learning_rate": 9.92087285083979e-06, + "logits/chosen": -0.08366020023822784, + "logits/rejected": -0.2225383073091507, + "logps/chosen": -60.84995651245117, + "logps/rejected": -57.87238693237305, + "loss": 0.9404, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9855079650878906, + "rewards/margins": 2.6105589866638184, + "rewards/rejected": 0.37494876980781555, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 3.254312038421631, + "learning_rate": 9.920407953170677e-06, + "logits/chosen": -0.0247647762298584, + "logits/rejected": -0.1459149867296219, + "logps/chosen": -52.9565315246582, + "logps/rejected": -66.32279968261719, + "loss": 0.8645, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.842637777328491, + "rewards/margins": 2.2426505088806152, + "rewards/rejected": 0.5999871492385864, + "step": 341 + }, + { + "epoch": 0.09, + "grad_norm": 3.9004499912261963, + "learning_rate": 9.919941704743406e-06, + "logits/chosen": 0.027504848316311836, + "logits/rejected": -0.11086393892765045, + "logps/chosen": -55.6265754699707, + "logps/rejected": -54.48671340942383, + "loss": 0.9867, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.770167827606201, + "rewards/margins": 1.675506830215454, + "rewards/rejected": 1.094660997390747, + "step": 342 + }, + { + "epoch": 0.09, + "grad_norm": 3.8116917610168457, + "learning_rate": 9.919474105685974e-06, + "logits/chosen": 0.08089861273765564, + "logits/rejected": -0.09737519919872284, + "logps/chosen": -78.56693267822266, + "logps/rejected": -70.80987548828125, + "loss": 0.9646, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.549839496612549, + "rewards/margins": 1.9461981058120728, + "rewards/rejected": 0.6036416292190552, + "step": 343 + }, + { + "epoch": 0.09, + "grad_norm": 4.041927337646484, + "learning_rate": 9.919005156126746e-06, + "logits/chosen": -0.10170159488916397, + "logits/rejected": -0.15943683683872223, + "logps/chosen": -47.4798583984375, + "logps/rejected": -64.14811706542969, + "loss": 0.9495, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.649280548095703, + "rewards/margins": 1.7206032276153564, + "rewards/rejected": 0.9286773204803467, + "step": 344 + }, + { + "epoch": 0.09, + "grad_norm": 3.5190420150756836, + "learning_rate": 9.918534856194459e-06, + "logits/chosen": -0.03131860867142677, + "logits/rejected": -0.07864600419998169, + "logps/chosen": -59.79239273071289, + "logps/rejected": -63.01432418823242, + "loss": 0.9642, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6479506492614746, + "rewards/margins": 1.6378039121627808, + "rewards/rejected": 1.0101466178894043, + "step": 345 + }, + { + "epoch": 0.09, + "grad_norm": 3.054232597351074, + "learning_rate": 9.918063206018221e-06, + "logits/chosen": 0.055308807641267776, + "logits/rejected": -0.14223529398441315, + "logps/chosen": -63.87427520751953, + "logps/rejected": -55.54599380493164, + "loss": 0.8614, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6482856273651123, + "rewards/margins": 2.361841917037964, + "rewards/rejected": 0.2864437699317932, + "step": 346 + }, + { + "epoch": 0.09, + "grad_norm": 3.3150627613067627, + "learning_rate": 9.917590205727509e-06, + "logits/chosen": -0.054132163524627686, + "logits/rejected": -0.20287653803825378, + "logps/chosen": -68.64100646972656, + "logps/rejected": -54.829322814941406, + "loss": 0.9218, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9997811317443848, + "rewards/margins": 2.001194715499878, + "rewards/rejected": 0.9985861778259277, + "step": 347 + }, + { + "epoch": 0.09, + "grad_norm": 3.150925874710083, + "learning_rate": 9.917115855452172e-06, + "logits/chosen": 0.012368752621114254, + "logits/rejected": -0.157511904835701, + "logps/chosen": -56.45155334472656, + "logps/rejected": -56.22584533691406, + "loss": 0.7585, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7890625, + "rewards/margins": 2.1419568061828613, + "rewards/rejected": 0.6471056342124939, + "step": 348 + }, + { + "epoch": 0.09, + "grad_norm": 3.266169548034668, + "learning_rate": 9.916640155322431e-06, + "logits/chosen": -0.15090204775333405, + "logits/rejected": -0.3489990234375, + "logps/chosen": -56.296207427978516, + "logps/rejected": -46.97910690307617, + "loss": 0.988, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.9285387992858887, + "rewards/margins": 2.1094517707824707, + "rewards/rejected": 0.8190870881080627, + "step": 349 + }, + { + "epoch": 0.09, + "grad_norm": 5.152328968048096, + "learning_rate": 9.916163105468872e-06, + "logits/chosen": -0.03860211372375488, + "logits/rejected": -0.06827478110790253, + "logps/chosen": -55.84882354736328, + "logps/rejected": -63.97121047973633, + "loss": 1.1449, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.5811657905578613, + "rewards/margins": 1.2182658910751343, + "rewards/rejected": 1.3629001379013062, + "step": 350 + }, + { + "epoch": 0.09, + "grad_norm": 3.583503007888794, + "learning_rate": 9.91568470602246e-06, + "logits/chosen": -0.020055102184414864, + "logits/rejected": -0.05398618429899216, + "logps/chosen": -51.198551177978516, + "logps/rejected": -65.78556060791016, + "loss": 0.9181, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.715801477432251, + "rewards/margins": 2.3482139110565186, + "rewards/rejected": 0.3675873279571533, + "step": 351 + }, + { + "epoch": 0.09, + "grad_norm": 3.3242509365081787, + "learning_rate": 9.915204957114524e-06, + "logits/chosen": 0.05125410482287407, + "logits/rejected": -0.17946727573871613, + "logps/chosen": -63.136146545410156, + "logps/rejected": -52.716522216796875, + "loss": 0.7247, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7273051738739014, + "rewards/margins": 2.769472599029541, + "rewards/rejected": -0.04216724634170532, + "step": 352 + }, + { + "epoch": 0.09, + "grad_norm": 2.7838454246520996, + "learning_rate": 9.914723858876765e-06, + "logits/chosen": 0.009584503248333931, + "logits/rejected": -0.14595334231853485, + "logps/chosen": -61.37392044067383, + "logps/rejected": -60.11167907714844, + "loss": 0.7847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.022326946258545, + "rewards/margins": 2.570662021636963, + "rewards/rejected": 0.4516650438308716, + "step": 353 + }, + { + "epoch": 0.09, + "grad_norm": 3.8072941303253174, + "learning_rate": 9.914241411441256e-06, + "logits/chosen": -4.8452289775013924e-05, + "logits/rejected": -0.1115737333893776, + "logps/chosen": -59.23798370361328, + "logps/rejected": -56.49125671386719, + "loss": 0.9688, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8017032146453857, + "rewards/margins": 2.0515832901000977, + "rewards/rejected": 0.7501198649406433, + "step": 354 + }, + { + "epoch": 0.09, + "grad_norm": 3.4634242057800293, + "learning_rate": 9.913757614940438e-06, + "logits/chosen": 0.003935475833714008, + "logits/rejected": -0.16377153992652893, + "logps/chosen": -68.03225708007812, + "logps/rejected": -56.12758255004883, + "loss": 0.9744, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0561130046844482, + "rewards/margins": 2.4718353748321533, + "rewards/rejected": 0.5842775106430054, + "step": 355 + }, + { + "epoch": 0.09, + "grad_norm": 3.6594722270965576, + "learning_rate": 9.913272469507124e-06, + "logits/chosen": 0.003912944346666336, + "logits/rejected": -0.1494949907064438, + "logps/chosen": -62.72553253173828, + "logps/rejected": -56.884765625, + "loss": 0.949, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4774086475372314, + "rewards/margins": 2.235865831375122, + "rewards/rejected": 0.24154292047023773, + "step": 356 + }, + { + "epoch": 0.09, + "grad_norm": 4.343142032623291, + "learning_rate": 9.912785975274498e-06, + "logits/chosen": 0.023795919492840767, + "logits/rejected": -0.09253933280706406, + "logps/chosen": -63.49873733520508, + "logps/rejected": -66.27703857421875, + "loss": 1.0345, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.802706480026245, + "rewards/margins": 2.0658645629882812, + "rewards/rejected": 0.7368420362472534, + "step": 357 + }, + { + "epoch": 0.09, + "grad_norm": 3.5616674423217773, + "learning_rate": 9.912298132376111e-06, + "logits/chosen": 0.04782719537615776, + "logits/rejected": -0.1838928908109665, + "logps/chosen": -58.1904411315918, + "logps/rejected": -47.46619415283203, + "loss": 0.8449, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.512868881225586, + "rewards/margins": 2.3393805027008057, + "rewards/rejected": 0.17348811030387878, + "step": 358 + }, + { + "epoch": 0.09, + "grad_norm": 4.1848835945129395, + "learning_rate": 9.911808940945888e-06, + "logits/chosen": -0.013807317242026329, + "logits/rejected": -0.12934285402297974, + "logps/chosen": -57.92335510253906, + "logps/rejected": -53.78130340576172, + "loss": 0.9629, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8698747158050537, + "rewards/margins": 2.1591193675994873, + "rewards/rejected": 0.7107552886009216, + "step": 359 + }, + { + "epoch": 0.09, + "grad_norm": 4.36995267868042, + "learning_rate": 9.911318401118124e-06, + "logits/chosen": -0.02655017375946045, + "logits/rejected": -0.12263351678848267, + "logps/chosen": -58.399818420410156, + "logps/rejected": -65.3769760131836, + "loss": 1.0388, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.611506700515747, + "rewards/margins": 1.8508206605911255, + "rewards/rejected": 0.7606862783432007, + "step": 360 + }, + { + "epoch": 0.09, + "grad_norm": 4.402544975280762, + "learning_rate": 9.910826513027478e-06, + "logits/chosen": 0.07432708144187927, + "logits/rejected": -0.07327393442392349, + "logps/chosen": -74.34163665771484, + "logps/rejected": -57.56189727783203, + "loss": 1.0571, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3852100372314453, + "rewards/margins": 1.375607967376709, + "rewards/rejected": 1.0096018314361572, + "step": 361 + }, + { + "epoch": 0.09, + "grad_norm": 3.897686243057251, + "learning_rate": 9.910333276808989e-06, + "logits/chosen": -0.0208343043923378, + "logits/rejected": -0.11056490987539291, + "logps/chosen": -57.80479431152344, + "logps/rejected": -65.00163269042969, + "loss": 0.9242, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.507045269012451, + "rewards/margins": 2.183375835418701, + "rewards/rejected": 0.3236692547798157, + "step": 362 + }, + { + "epoch": 0.09, + "grad_norm": 4.428651332855225, + "learning_rate": 9.90983869259806e-06, + "logits/chosen": 0.013302670791745186, + "logits/rejected": -0.04668232798576355, + "logps/chosen": -55.20273971557617, + "logps/rejected": -60.382537841796875, + "loss": 1.0392, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5709173679351807, + "rewards/margins": 1.7569986581802368, + "rewards/rejected": 0.8139187693595886, + "step": 363 + }, + { + "epoch": 0.09, + "grad_norm": 3.6114375591278076, + "learning_rate": 9.909342760530461e-06, + "logits/chosen": 0.045303307473659515, + "logits/rejected": -0.09771126508712769, + "logps/chosen": -65.34380340576172, + "logps/rejected": -62.065128326416016, + "loss": 0.9501, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8153436183929443, + "rewards/margins": 1.944610595703125, + "rewards/rejected": 0.8707329034805298, + "step": 364 + }, + { + "epoch": 0.09, + "grad_norm": 3.590045928955078, + "learning_rate": 9.90884548074234e-06, + "logits/chosen": -0.011746793054044247, + "logits/rejected": -0.21658070385456085, + "logps/chosen": -57.416893005371094, + "logps/rejected": -51.135467529296875, + "loss": 0.9027, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9327900409698486, + "rewards/margins": 2.4831016063690186, + "rewards/rejected": 0.4496883749961853, + "step": 365 + }, + { + "epoch": 0.09, + "grad_norm": 3.389145612716675, + "learning_rate": 9.908346853370211e-06, + "logits/chosen": -0.07514731585979462, + "logits/rejected": -0.24838045239448547, + "logps/chosen": -54.53099822998047, + "logps/rejected": -38.19215774536133, + "loss": 0.8857, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5703964233398438, + "rewards/margins": 2.2212343215942383, + "rewards/rejected": 0.3491622805595398, + "step": 366 + }, + { + "epoch": 0.09, + "grad_norm": 3.7347664833068848, + "learning_rate": 9.907846878550956e-06, + "logits/chosen": 0.06512295454740524, + "logits/rejected": -0.04333019256591797, + "logps/chosen": -80.22962188720703, + "logps/rejected": -72.50936889648438, + "loss": 0.8337, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.88472843170166, + "rewards/margins": 2.309237241744995, + "rewards/rejected": 0.5754908323287964, + "step": 367 + }, + { + "epoch": 0.09, + "grad_norm": 3.3940138816833496, + "learning_rate": 9.90734555642183e-06, + "logits/chosen": 0.044319480657577515, + "logits/rejected": -0.08976436406373978, + "logps/chosen": -54.98198318481445, + "logps/rejected": -57.77761459350586, + "loss": 0.903, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8255414962768555, + "rewards/margins": 2.0882229804992676, + "rewards/rejected": 0.7373185157775879, + "step": 368 + }, + { + "epoch": 0.09, + "grad_norm": 3.3096370697021484, + "learning_rate": 9.906842887120457e-06, + "logits/chosen": -0.021852776408195496, + "logits/rejected": -0.22239208221435547, + "logps/chosen": -62.92076110839844, + "logps/rejected": -60.44953536987305, + "loss": 0.8119, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.655569314956665, + "rewards/margins": 2.3583388328552246, + "rewards/rejected": 0.2972303330898285, + "step": 369 + }, + { + "epoch": 0.09, + "grad_norm": 3.4550793170928955, + "learning_rate": 9.90633887078483e-06, + "logits/chosen": -0.038854341953992844, + "logits/rejected": -0.18333250284194946, + "logps/chosen": -54.97117233276367, + "logps/rejected": -51.434207916259766, + "loss": 0.9791, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5307538509368896, + "rewards/margins": 1.8606681823730469, + "rewards/rejected": 0.670085608959198, + "step": 370 + }, + { + "epoch": 0.09, + "grad_norm": 3.5624635219573975, + "learning_rate": 9.905833507553312e-06, + "logits/chosen": -0.09246698766946793, + "logits/rejected": -0.2322445809841156, + "logps/chosen": -49.86566162109375, + "logps/rejected": -76.8846206665039, + "loss": 0.8574, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9009313583374023, + "rewards/margins": 2.5470423698425293, + "rewards/rejected": 0.3538891673088074, + "step": 371 + }, + { + "epoch": 0.09, + "grad_norm": 3.2398624420166016, + "learning_rate": 9.905326797564637e-06, + "logits/chosen": 0.020293498411774635, + "logits/rejected": -0.0960804894566536, + "logps/chosen": -54.45600509643555, + "logps/rejected": -60.56000518798828, + "loss": 0.8434, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8035998344421387, + "rewards/margins": 1.8236768245697021, + "rewards/rejected": 0.9799231886863708, + "step": 372 + }, + { + "epoch": 0.09, + "grad_norm": 3.7731010913848877, + "learning_rate": 9.904818740957908e-06, + "logits/chosen": 0.044182464480400085, + "logits/rejected": -0.14266152679920197, + "logps/chosen": -66.83267974853516, + "logps/rejected": -65.95486450195312, + "loss": 0.92, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5549814701080322, + "rewards/margins": 2.0183358192443848, + "rewards/rejected": 0.5366456508636475, + "step": 373 + }, + { + "epoch": 0.09, + "grad_norm": 3.7918663024902344, + "learning_rate": 9.904309337872597e-06, + "logits/chosen": -0.026176417246460915, + "logits/rejected": -0.1660594493150711, + "logps/chosen": -60.89931869506836, + "logps/rejected": -71.32498931884766, + "loss": 0.8811, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.988490343093872, + "rewards/margins": 2.781282663345337, + "rewards/rejected": 0.20720762014389038, + "step": 374 + }, + { + "epoch": 0.09, + "grad_norm": 3.7352805137634277, + "learning_rate": 9.903798588448545e-06, + "logits/chosen": -0.060533300042152405, + "logits/rejected": -0.05976312980055809, + "logps/chosen": -57.80944061279297, + "logps/rejected": -74.00491333007812, + "loss": 0.9376, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5618460178375244, + "rewards/margins": 2.1520612239837646, + "rewards/rejected": 0.40978461503982544, + "step": 375 + }, + { + "epoch": 0.09, + "grad_norm": 5.390510559082031, + "learning_rate": 9.903286492825965e-06, + "logits/chosen": 0.053545866161584854, + "logits/rejected": -0.003934605047106743, + "logps/chosen": -65.1721420288086, + "logps/rejected": -63.68199920654297, + "loss": 1.1106, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.578622817993164, + "rewards/margins": 1.1248366832733154, + "rewards/rejected": 1.4537858963012695, + "step": 376 + }, + { + "epoch": 0.09, + "grad_norm": 2.9068970680236816, + "learning_rate": 9.902773051145439e-06, + "logits/chosen": -0.05583891272544861, + "logits/rejected": -0.16271936893463135, + "logps/chosen": -52.813087463378906, + "logps/rejected": -59.94237518310547, + "loss": 0.9191, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8316845893859863, + "rewards/margins": 2.4049603939056396, + "rewards/rejected": 0.42672404646873474, + "step": 377 + }, + { + "epoch": 0.09, + "grad_norm": 4.7912116050720215, + "learning_rate": 9.902258263547917e-06, + "logits/chosen": 0.04142524302005768, + "logits/rejected": -0.1200421079993248, + "logps/chosen": -59.06663513183594, + "logps/rejected": -55.839881896972656, + "loss": 1.1736, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7399632930755615, + "rewards/margins": 1.6515350341796875, + "rewards/rejected": 1.0884284973144531, + "step": 378 + }, + { + "epoch": 0.09, + "grad_norm": 3.883162021636963, + "learning_rate": 9.901742130174719e-06, + "logits/chosen": 0.02843184769153595, + "logits/rejected": -0.1320231854915619, + "logps/chosen": -62.96097183227539, + "logps/rejected": -57.71134948730469, + "loss": 0.9131, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.598539352416992, + "rewards/margins": 2.3534915447235107, + "rewards/rejected": 0.24504777789115906, + "step": 379 + }, + { + "epoch": 0.1, + "grad_norm": 4.0052409172058105, + "learning_rate": 9.901224651167534e-06, + "logits/chosen": 0.020998727530241013, + "logits/rejected": -0.02434578724205494, + "logps/chosen": -63.76095199584961, + "logps/rejected": -71.14498901367188, + "loss": 0.8802, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4827029705047607, + "rewards/margins": 2.073594093322754, + "rewards/rejected": 0.40910887718200684, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 3.518864870071411, + "learning_rate": 9.900705826668424e-06, + "logits/chosen": 0.07111519575119019, + "logits/rejected": -0.04681570455431938, + "logps/chosen": -62.70977020263672, + "logps/rejected": -58.87867736816406, + "loss": 0.8878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7003931999206543, + "rewards/margins": 1.7825937271118164, + "rewards/rejected": 0.9177997708320618, + "step": 381 + }, + { + "epoch": 0.1, + "grad_norm": 5.2416839599609375, + "learning_rate": 9.900185656819815e-06, + "logits/chosen": 0.005541200749576092, + "logits/rejected": -0.11708047986030579, + "logps/chosen": -72.15692138671875, + "logps/rejected": -58.54405975341797, + "loss": 1.1631, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.73160457611084, + "rewards/margins": 1.600427508354187, + "rewards/rejected": 1.1311770677566528, + "step": 382 + }, + { + "epoch": 0.1, + "grad_norm": 3.717820167541504, + "learning_rate": 9.899664141764505e-06, + "logits/chosen": -0.07814496755599976, + "logits/rejected": -0.15082132816314697, + "logps/chosen": -54.151737213134766, + "logps/rejected": -57.090946197509766, + "loss": 0.8969, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.711398124694824, + "rewards/margins": 1.9140353202819824, + "rewards/rejected": 0.797362744808197, + "step": 383 + }, + { + "epoch": 0.1, + "grad_norm": 2.841907262802124, + "learning_rate": 9.899141281645662e-06, + "logits/chosen": 0.029539119452238083, + "logits/rejected": -0.13224253058433533, + "logps/chosen": -67.70574951171875, + "logps/rejected": -64.38601684570312, + "loss": 0.8251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9578659534454346, + "rewards/margins": 2.445857286453247, + "rewards/rejected": 0.5120083689689636, + "step": 384 + }, + { + "epoch": 0.1, + "grad_norm": 2.915846824645996, + "learning_rate": 9.89861707660682e-06, + "logits/chosen": 0.04671032354235649, + "logits/rejected": -0.04627085477113724, + "logps/chosen": -60.828914642333984, + "logps/rejected": -69.32706451416016, + "loss": 0.7601, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2160158157348633, + "rewards/margins": 2.4124908447265625, + "rewards/rejected": 0.8035250306129456, + "step": 385 + }, + { + "epoch": 0.1, + "grad_norm": 4.444840431213379, + "learning_rate": 9.898091526791889e-06, + "logits/chosen": -0.16024969518184662, + "logits/rejected": -0.14754343032836914, + "logps/chosen": -44.34174346923828, + "logps/rejected": -63.27699661254883, + "loss": 0.9924, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.80279541015625, + "rewards/margins": 1.4180946350097656, + "rewards/rejected": 1.3847010135650635, + "step": 386 + }, + { + "epoch": 0.1, + "grad_norm": 1.9504363536834717, + "learning_rate": 9.897564632345142e-06, + "logits/chosen": -0.08577616512775421, + "logits/rejected": -0.3162016272544861, + "logps/chosen": -62.90580368041992, + "logps/rejected": -47.48371124267578, + "loss": 0.7588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1835362911224365, + "rewards/margins": 3.161766767501831, + "rewards/rejected": 0.021769538521766663, + "step": 387 + }, + { + "epoch": 0.1, + "grad_norm": 3.7078678607940674, + "learning_rate": 9.89703639341122e-06, + "logits/chosen": -0.01707632839679718, + "logits/rejected": -0.09743160009384155, + "logps/chosen": -51.695560455322266, + "logps/rejected": -58.04450988769531, + "loss": 1.0334, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9560036659240723, + "rewards/margins": 1.605919599533081, + "rewards/rejected": 1.3500839471817017, + "step": 388 + }, + { + "epoch": 0.1, + "grad_norm": 3.023120164871216, + "learning_rate": 9.89650681013514e-06, + "logits/chosen": 0.018551167100667953, + "logits/rejected": -0.15479056537151337, + "logps/chosen": -61.10029220581055, + "logps/rejected": -53.477264404296875, + "loss": 0.8295, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6549394130706787, + "rewards/margins": 2.4816834926605225, + "rewards/rejected": 0.17325574159622192, + "step": 389 + }, + { + "epoch": 0.1, + "grad_norm": 2.9540998935699463, + "learning_rate": 9.895975882662283e-06, + "logits/chosen": 0.005771517753601074, + "logits/rejected": -0.07856131345033646, + "logps/chosen": -57.454288482666016, + "logps/rejected": -66.71835327148438, + "loss": 0.8612, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.857321262359619, + "rewards/margins": 2.4672746658325195, + "rewards/rejected": 0.3900465667247772, + "step": 390 + }, + { + "epoch": 0.1, + "grad_norm": 3.3114242553710938, + "learning_rate": 9.895443611138398e-06, + "logits/chosen": -0.034254156053066254, + "logits/rejected": -0.22329767048358917, + "logps/chosen": -61.70151138305664, + "logps/rejected": -58.14949035644531, + "loss": 0.8313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7473716735839844, + "rewards/margins": 2.643923282623291, + "rewards/rejected": 0.10344861447811127, + "step": 391 + }, + { + "epoch": 0.1, + "grad_norm": 3.3845322132110596, + "learning_rate": 9.894909995709607e-06, + "logits/chosen": 0.00548882782459259, + "logits/rejected": -0.07699635624885559, + "logps/chosen": -60.33189392089844, + "logps/rejected": -60.965782165527344, + "loss": 0.9254, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7728254795074463, + "rewards/margins": 2.1535892486572266, + "rewards/rejected": 0.6192362308502197, + "step": 392 + }, + { + "epoch": 0.1, + "grad_norm": 3.2677905559539795, + "learning_rate": 9.894375036522398e-06, + "logits/chosen": -0.03489100933074951, + "logits/rejected": -0.17526237666606903, + "logps/chosen": -58.51012420654297, + "logps/rejected": -52.975067138671875, + "loss": 0.8396, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8259732723236084, + "rewards/margins": 2.517328977584839, + "rewards/rejected": 0.30864399671554565, + "step": 393 + }, + { + "epoch": 0.1, + "grad_norm": 3.611882448196411, + "learning_rate": 9.89383873372363e-06, + "logits/chosen": 0.07587084174156189, + "logits/rejected": -0.07397718727588654, + "logps/chosen": -69.74501037597656, + "logps/rejected": -61.712005615234375, + "loss": 0.8752, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0345048904418945, + "rewards/margins": 1.9802372455596924, + "rewards/rejected": 1.0542676448822021, + "step": 394 + }, + { + "epoch": 0.1, + "grad_norm": 3.56840181350708, + "learning_rate": 9.893301087460528e-06, + "logits/chosen": -0.15427471697330475, + "logits/rejected": -0.23585423827171326, + "logps/chosen": -51.49329376220703, + "logps/rejected": -61.56583786010742, + "loss": 0.9913, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.887289047241211, + "rewards/margins": 2.151665210723877, + "rewards/rejected": 0.7356237769126892, + "step": 395 + }, + { + "epoch": 0.1, + "grad_norm": 3.735400676727295, + "learning_rate": 9.892762097880689e-06, + "logits/chosen": -0.012150549329817295, + "logits/rejected": -0.07212138175964355, + "logps/chosen": -59.69239044189453, + "logps/rejected": -57.00760269165039, + "loss": 0.9381, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6907427310943604, + "rewards/margins": 2.016556739807129, + "rewards/rejected": 0.674186110496521, + "step": 396 + }, + { + "epoch": 0.1, + "grad_norm": 3.9068188667297363, + "learning_rate": 9.892221765132075e-06, + "logits/chosen": 0.028436854481697083, + "logits/rejected": -0.03012879565358162, + "logps/chosen": -57.108455657958984, + "logps/rejected": -67.22319793701172, + "loss": 0.9628, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.829998254776001, + "rewards/margins": 1.3600358963012695, + "rewards/rejected": 1.469962477684021, + "step": 397 + }, + { + "epoch": 0.1, + "grad_norm": 3.4164087772369385, + "learning_rate": 9.891680089363022e-06, + "logits/chosen": -0.08422675728797913, + "logits/rejected": -0.18398287892341614, + "logps/chosen": -53.35639953613281, + "logps/rejected": -53.22724151611328, + "loss": 0.9354, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.803583860397339, + "rewards/margins": 2.196488857269287, + "rewards/rejected": 0.6070951223373413, + "step": 398 + }, + { + "epoch": 0.1, + "grad_norm": 3.8268139362335205, + "learning_rate": 9.89113707072223e-06, + "logits/chosen": 0.016788708046078682, + "logits/rejected": -0.12453624606132507, + "logps/chosen": -58.84000778198242, + "logps/rejected": -57.11900329589844, + "loss": 0.9722, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7771599292755127, + "rewards/margins": 1.7337439060211182, + "rewards/rejected": 1.0434160232543945, + "step": 399 + }, + { + "epoch": 0.1, + "grad_norm": 3.9995851516723633, + "learning_rate": 9.890592709358771e-06, + "logits/chosen": -0.1968020349740982, + "logits/rejected": -0.25247931480407715, + "logps/chosen": -70.05010986328125, + "logps/rejected": -54.54664993286133, + "loss": 0.9339, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9351863861083984, + "rewards/margins": 2.063750982284546, + "rewards/rejected": 0.8714355230331421, + "step": 400 + }, + { + "epoch": 0.1, + "grad_norm": 3.9133689403533936, + "learning_rate": 9.89004700542208e-06, + "logits/chosen": 0.01798737421631813, + "logits/rejected": -0.08282674849033356, + "logps/chosen": -61.3079719543457, + "logps/rejected": -62.67341613769531, + "loss": 0.9433, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.024221420288086, + "rewards/margins": 2.0261669158935547, + "rewards/rejected": 0.9980546236038208, + "step": 401 + }, + { + "epoch": 0.1, + "grad_norm": 3.233286142349243, + "learning_rate": 9.88949995906197e-06, + "logits/chosen": -0.08002828061580658, + "logits/rejected": -0.18364806473255157, + "logps/chosen": -60.39634704589844, + "logps/rejected": -57.53280258178711, + "loss": 1.048, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.814328908920288, + "rewards/margins": 2.287470579147339, + "rewards/rejected": 0.526858389377594, + "step": 402 + }, + { + "epoch": 0.1, + "grad_norm": 3.3099205493927, + "learning_rate": 9.888951570428611e-06, + "logits/chosen": -0.09816713631153107, + "logits/rejected": -0.16421358287334442, + "logps/chosen": -43.167884826660156, + "logps/rejected": -61.734092712402344, + "loss": 0.7972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5840353965759277, + "rewards/margins": 2.4424588680267334, + "rewards/rejected": 0.14157678186893463, + "step": 403 + }, + { + "epoch": 0.1, + "grad_norm": 5.415252685546875, + "learning_rate": 9.888401839672554e-06, + "logits/chosen": -0.015325680375099182, + "logits/rejected": -0.14790059626102448, + "logps/chosen": -59.05347442626953, + "logps/rejected": -50.738197326660156, + "loss": 1.1882, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.6158227920532227, + "rewards/margins": 1.7108789682388306, + "rewards/rejected": 0.9049438238143921, + "step": 404 + }, + { + "epoch": 0.1, + "grad_norm": 3.8412892818450928, + "learning_rate": 9.887850766944707e-06, + "logits/chosen": -0.021224649623036385, + "logits/rejected": -0.031679004430770874, + "logps/chosen": -48.3924446105957, + "logps/rejected": -65.30741882324219, + "loss": 0.934, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8937103748321533, + "rewards/margins": 2.096670150756836, + "rewards/rejected": 0.7970403432846069, + "step": 405 + }, + { + "epoch": 0.1, + "grad_norm": 3.174767017364502, + "learning_rate": 9.887298352396352e-06, + "logits/chosen": -0.009807571768760681, + "logits/rejected": -0.22601553797721863, + "logps/chosen": -61.24534606933594, + "logps/rejected": -48.87983322143555, + "loss": 0.8493, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.829821825027466, + "rewards/margins": 2.3329429626464844, + "rewards/rejected": 0.4968786835670471, + "step": 406 + }, + { + "epoch": 0.1, + "grad_norm": 3.3981480598449707, + "learning_rate": 9.88674459617914e-06, + "logits/chosen": 0.004210735205560923, + "logits/rejected": -0.15724879503250122, + "logps/chosen": -59.33933639526367, + "logps/rejected": -51.28857421875, + "loss": 0.9711, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6594998836517334, + "rewards/margins": 2.0637423992156982, + "rewards/rejected": 0.5957573652267456, + "step": 407 + }, + { + "epoch": 0.1, + "grad_norm": 3.832277774810791, + "learning_rate": 9.886189498445091e-06, + "logits/chosen": -0.013457506895065308, + "logits/rejected": -0.15997719764709473, + "logps/chosen": -62.26905059814453, + "logps/rejected": -63.59620666503906, + "loss": 0.9567, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.425630569458008, + "rewards/margins": 2.078704833984375, + "rewards/rejected": 0.3469254672527313, + "step": 408 + }, + { + "epoch": 0.1, + "grad_norm": 3.1717751026153564, + "learning_rate": 9.885633059346587e-06, + "logits/chosen": 0.028789237141609192, + "logits/rejected": -0.1924532651901245, + "logps/chosen": -56.66592025756836, + "logps/rejected": -62.36488723754883, + "loss": 0.8074, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.534058094024658, + "rewards/margins": 2.2026822566986084, + "rewards/rejected": 0.3313756287097931, + "step": 409 + }, + { + "epoch": 0.1, + "grad_norm": 4.38424825668335, + "learning_rate": 9.885075279036385e-06, + "logits/chosen": -0.03515272215008736, + "logits/rejected": -0.23199418187141418, + "logps/chosen": -51.047367095947266, + "logps/rejected": -46.20159149169922, + "loss": 0.9441, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.645897388458252, + "rewards/margins": 2.4649157524108887, + "rewards/rejected": 0.18098153173923492, + "step": 410 + }, + { + "epoch": 0.1, + "grad_norm": 3.8556063175201416, + "learning_rate": 9.884516157667608e-06, + "logits/chosen": -0.03602190315723419, + "logits/rejected": -0.13331957161426544, + "logps/chosen": -58.638153076171875, + "logps/rejected": -64.61663055419922, + "loss": 1.0579, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7573533058166504, + "rewards/margins": 1.6594045162200928, + "rewards/rejected": 1.0979489088058472, + "step": 411 + }, + { + "epoch": 0.1, + "grad_norm": 3.687427282333374, + "learning_rate": 9.883955695393745e-06, + "logits/chosen": -0.03930787742137909, + "logits/rejected": -0.10823757946491241, + "logps/chosen": -58.62665939331055, + "logps/rejected": -69.77872467041016, + "loss": 0.9429, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.906569719314575, + "rewards/margins": 2.4506092071533203, + "rewards/rejected": 0.45596006512641907, + "step": 412 + }, + { + "epoch": 0.1, + "grad_norm": 2.303863763809204, + "learning_rate": 9.883393892368656e-06, + "logits/chosen": 0.010260241106152534, + "logits/rejected": -0.16292685270309448, + "logps/chosen": -62.01756286621094, + "logps/rejected": -58.72306823730469, + "loss": 0.7695, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.919508695602417, + "rewards/margins": 3.026155710220337, + "rewards/rejected": -0.10664715617895126, + "step": 413 + }, + { + "epoch": 0.1, + "grad_norm": 3.830497980117798, + "learning_rate": 9.88283074874657e-06, + "logits/chosen": -2.6460736989974976e-05, + "logits/rejected": -0.13656830787658691, + "logps/chosen": -57.828765869140625, + "logps/rejected": -61.49073791503906, + "loss": 0.8362, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.94380259513855, + "rewards/margins": 2.4776499271392822, + "rewards/rejected": 0.46615248918533325, + "step": 414 + }, + { + "epoch": 0.1, + "grad_norm": 3.1328353881835938, + "learning_rate": 9.882266264682079e-06, + "logits/chosen": -0.00576329231262207, + "logits/rejected": -0.0868016853928566, + "logps/chosen": -59.53081130981445, + "logps/rejected": -59.016029357910156, + "loss": 0.9405, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.75423264503479, + "rewards/margins": 2.1126961708068848, + "rewards/rejected": 0.6415364146232605, + "step": 415 + }, + { + "epoch": 0.1, + "grad_norm": 2.879289150238037, + "learning_rate": 9.881700440330148e-06, + "logits/chosen": 0.018172360956668854, + "logits/rejected": -0.09913372993469238, + "logps/chosen": -54.46186447143555, + "logps/rejected": -71.83164978027344, + "loss": 0.7431, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.184230327606201, + "rewards/margins": 2.7396886348724365, + "rewards/rejected": 0.444541335105896, + "step": 416 + }, + { + "epoch": 0.1, + "grad_norm": 3.5712952613830566, + "learning_rate": 9.881133275846106e-06, + "logits/chosen": -0.021570900455117226, + "logits/rejected": -0.16940295696258545, + "logps/chosen": -58.596595764160156, + "logps/rejected": -58.32360076904297, + "loss": 1.0442, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5936880111694336, + "rewards/margins": 1.8663204908370972, + "rewards/rejected": 0.727367103099823, + "step": 417 + }, + { + "epoch": 0.1, + "grad_norm": 3.745779514312744, + "learning_rate": 9.880564771385654e-06, + "logits/chosen": -0.07301205396652222, + "logits/rejected": -0.1650761067867279, + "logps/chosen": -52.50029754638672, + "logps/rejected": -59.89052963256836, + "loss": 0.9282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5446107387542725, + "rewards/margins": 2.1081461906433105, + "rewards/rejected": 0.4364643692970276, + "step": 418 + }, + { + "epoch": 0.1, + "grad_norm": 3.5587284564971924, + "learning_rate": 9.87999492710486e-06, + "logits/chosen": -0.009858801029622555, + "logits/rejected": -0.11234398931264877, + "logps/chosen": -62.86222839355469, + "logps/rejected": -60.62103271484375, + "loss": 0.9665, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7173430919647217, + "rewards/margins": 1.9609217643737793, + "rewards/rejected": 0.7564212679862976, + "step": 419 + }, + { + "epoch": 0.11, + "grad_norm": 3.4110045433044434, + "learning_rate": 9.879423743160154e-06, + "logits/chosen": 0.021548323333263397, + "logits/rejected": -0.13743627071380615, + "logps/chosen": -55.17013931274414, + "logps/rejected": -60.812965393066406, + "loss": 0.8026, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7408978939056396, + "rewards/margins": 2.4434428215026855, + "rewards/rejected": 0.2974550724029541, + "step": 420 + }, + { + "epoch": 0.11, + "grad_norm": 3.2386908531188965, + "learning_rate": 9.878851219708341e-06, + "logits/chosen": -0.05299616605043411, + "logits/rejected": -0.09546102583408356, + "logps/chosen": -59.14761734008789, + "logps/rejected": -71.01219940185547, + "loss": 0.8731, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7260570526123047, + "rewards/margins": 2.1197052001953125, + "rewards/rejected": 0.6063520312309265, + "step": 421 + }, + { + "epoch": 0.11, + "grad_norm": 2.977909803390503, + "learning_rate": 9.87827735690659e-06, + "logits/chosen": -0.05686500295996666, + "logits/rejected": -0.1945873200893402, + "logps/chosen": -52.96527862548828, + "logps/rejected": -58.61903381347656, + "loss": 0.8292, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.789886713027954, + "rewards/margins": 2.5371744632720947, + "rewards/rejected": 0.2527121901512146, + "step": 422 + }, + { + "epoch": 0.11, + "grad_norm": 3.021721601486206, + "learning_rate": 9.877702154912442e-06, + "logits/chosen": -0.08212268352508545, + "logits/rejected": -0.1838916540145874, + "logps/chosen": -52.891536712646484, + "logps/rejected": -58.70661926269531, + "loss": 0.8254, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0862975120544434, + "rewards/margins": 2.661261796951294, + "rewards/rejected": 0.42503565549850464, + "step": 423 + }, + { + "epoch": 0.11, + "grad_norm": 3.4415125846862793, + "learning_rate": 9.877125613883799e-06, + "logits/chosen": -0.022158721461892128, + "logits/rejected": -0.21739330887794495, + "logps/chosen": -73.8882064819336, + "logps/rejected": -53.197654724121094, + "loss": 0.9205, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9076032638549805, + "rewards/margins": 2.236083984375, + "rewards/rejected": 0.6715191602706909, + "step": 424 + }, + { + "epoch": 0.11, + "grad_norm": 3.6424782276153564, + "learning_rate": 9.876547733978934e-06, + "logits/chosen": 0.05305997282266617, + "logits/rejected": -0.12808555364608765, + "logps/chosen": -58.02375411987305, + "logps/rejected": -54.810882568359375, + "loss": 0.8967, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7768492698669434, + "rewards/margins": 2.5282704830169678, + "rewards/rejected": 0.24857869744300842, + "step": 425 + }, + { + "epoch": 0.11, + "grad_norm": 4.52596378326416, + "learning_rate": 9.875968515356491e-06, + "logits/chosen": 0.02974993735551834, + "logits/rejected": -0.0699203684926033, + "logps/chosen": -52.27801513671875, + "logps/rejected": -57.88379669189453, + "loss": 1.0144, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5507731437683105, + "rewards/margins": 1.8477492332458496, + "rewards/rejected": 0.7030245065689087, + "step": 426 + }, + { + "epoch": 0.11, + "grad_norm": 2.7132248878479004, + "learning_rate": 9.875387958175472e-06, + "logits/chosen": 0.029405944049358368, + "logits/rejected": -0.10699895024299622, + "logps/chosen": -70.82771301269531, + "logps/rejected": -64.45700073242188, + "loss": 0.8485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7432703971862793, + "rewards/margins": 2.810202121734619, + "rewards/rejected": -0.06693145632743835, + "step": 427 + }, + { + "epoch": 0.11, + "grad_norm": 2.99861741065979, + "learning_rate": 9.87480606259526e-06, + "logits/chosen": -0.08652263879776001, + "logits/rejected": -0.170531764626503, + "logps/chosen": -70.7040023803711, + "logps/rejected": -65.63844299316406, + "loss": 0.9393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0062482357025146, + "rewards/margins": 2.449028491973877, + "rewards/rejected": 0.5572201013565063, + "step": 428 + }, + { + "epoch": 0.11, + "grad_norm": 3.7656664848327637, + "learning_rate": 9.87422282877559e-06, + "logits/chosen": 0.0464591383934021, + "logits/rejected": -0.16404157876968384, + "logps/chosen": -56.185813903808594, + "logps/rejected": -56.15495681762695, + "loss": 0.8637, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5831665992736816, + "rewards/margins": 2.0787296295166016, + "rewards/rejected": 0.504436731338501, + "step": 429 + }, + { + "epoch": 0.11, + "grad_norm": 2.665628671646118, + "learning_rate": 9.873638256876577e-06, + "logits/chosen": -0.10552641749382019, + "logits/rejected": -0.2585712671279907, + "logps/chosen": -56.28156280517578, + "logps/rejected": -59.73946762084961, + "loss": 0.7989, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8587234020233154, + "rewards/margins": 3.2610130310058594, + "rewards/rejected": -0.40228936076164246, + "step": 430 + }, + { + "epoch": 0.11, + "grad_norm": 3.2277770042419434, + "learning_rate": 9.873052347058698e-06, + "logits/chosen": -0.0161898210644722, + "logits/rejected": -0.11078345775604248, + "logps/chosen": -60.22744369506836, + "logps/rejected": -80.65811920166016, + "loss": 0.8361, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0871171951293945, + "rewards/margins": 2.165127992630005, + "rewards/rejected": 0.9219887852668762, + "step": 431 + }, + { + "epoch": 0.11, + "grad_norm": 3.3282132148742676, + "learning_rate": 9.872465099482798e-06, + "logits/chosen": -0.03638280928134918, + "logits/rejected": -0.13003599643707275, + "logps/chosen": -56.04096984863281, + "logps/rejected": -65.55854034423828, + "loss": 0.9321, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.505115270614624, + "rewards/margins": 2.394991636276245, + "rewards/rejected": 0.11012391000986099, + "step": 432 + }, + { + "epoch": 0.11, + "grad_norm": 3.32723069190979, + "learning_rate": 9.871876514310088e-06, + "logits/chosen": -0.04638422280550003, + "logits/rejected": -0.11753226071596146, + "logps/chosen": -56.48130798339844, + "logps/rejected": -68.75173950195312, + "loss": 0.8358, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.74375057220459, + "rewards/margins": 2.6842403411865234, + "rewards/rejected": 0.059510182589292526, + "step": 433 + }, + { + "epoch": 0.11, + "grad_norm": 4.7099432945251465, + "learning_rate": 9.87128659170215e-06, + "logits/chosen": -0.019556112587451935, + "logits/rejected": 0.003271050751209259, + "logps/chosen": -52.52851867675781, + "logps/rejected": -82.36897277832031, + "loss": 1.1272, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.244915723800659, + "rewards/margins": 1.9297704696655273, + "rewards/rejected": 1.3151453733444214, + "step": 434 + }, + { + "epoch": 0.11, + "grad_norm": 3.108020782470703, + "learning_rate": 9.870695331820925e-06, + "logits/chosen": -0.0679192990064621, + "logits/rejected": -0.09531362354755402, + "logps/chosen": -56.0777473449707, + "logps/rejected": -64.51567077636719, + "loss": 0.9184, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1273913383483887, + "rewards/margins": 2.381047010421753, + "rewards/rejected": 0.7463443875312805, + "step": 435 + }, + { + "epoch": 0.11, + "grad_norm": 3.591658592224121, + "learning_rate": 9.870102734828733e-06, + "logits/chosen": 0.010897781699895859, + "logits/rejected": -0.18251222372055054, + "logps/chosen": -57.67533874511719, + "logps/rejected": -48.49125671386719, + "loss": 0.8197, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.830575942993164, + "rewards/margins": 2.7274293899536133, + "rewards/rejected": 0.10314670205116272, + "step": 436 + }, + { + "epoch": 0.11, + "grad_norm": 3.3946311473846436, + "learning_rate": 9.869508800888252e-06, + "logits/chosen": -0.05198538303375244, + "logits/rejected": -0.1731506586074829, + "logps/chosen": -58.72792434692383, + "logps/rejected": -50.14794921875, + "loss": 1.0017, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0503344535827637, + "rewards/margins": 2.1633007526397705, + "rewards/rejected": 0.8870338201522827, + "step": 437 + }, + { + "epoch": 0.11, + "grad_norm": 3.7223758697509766, + "learning_rate": 9.868913530162526e-06, + "logits/chosen": 0.11853660643100739, + "logits/rejected": -0.08906128257513046, + "logps/chosen": -56.722415924072266, + "logps/rejected": -52.2465934753418, + "loss": 0.8496, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4500889778137207, + "rewards/margins": 1.7692062854766846, + "rewards/rejected": 0.6808828115463257, + "step": 438 + }, + { + "epoch": 0.11, + "grad_norm": 3.366049289703369, + "learning_rate": 9.868316922814976e-06, + "logits/chosen": -0.0734679251909256, + "logits/rejected": -0.12940852344036102, + "logps/chosen": -50.01245880126953, + "logps/rejected": -60.19503402709961, + "loss": 0.8764, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.725982189178467, + "rewards/margins": 2.2208380699157715, + "rewards/rejected": 0.505143940448761, + "step": 439 + }, + { + "epoch": 0.11, + "grad_norm": 4.9084367752075195, + "learning_rate": 9.86771897900938e-06, + "logits/chosen": 0.009879160672426224, + "logits/rejected": -0.25482743978500366, + "logps/chosen": -69.05509948730469, + "logps/rejected": -62.79582977294922, + "loss": 1.0188, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.763740301132202, + "rewards/margins": 2.155562400817871, + "rewards/rejected": 0.6081778407096863, + "step": 440 + }, + { + "epoch": 0.11, + "grad_norm": 2.807692050933838, + "learning_rate": 9.867119698909888e-06, + "logits/chosen": -0.10248465836048126, + "logits/rejected": -0.1931336522102356, + "logps/chosen": -60.03171157836914, + "logps/rejected": -62.72966003417969, + "loss": 0.8214, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7821478843688965, + "rewards/margins": 2.641209602355957, + "rewards/rejected": 0.14093837141990662, + "step": 441 + }, + { + "epoch": 0.11, + "grad_norm": 3.7261242866516113, + "learning_rate": 9.866519082681014e-06, + "logits/chosen": -0.07599875330924988, + "logits/rejected": -0.17388486862182617, + "logps/chosen": -46.1452751159668, + "logps/rejected": -58.9340934753418, + "loss": 1.0055, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.650217056274414, + "rewards/margins": 1.7650940418243408, + "rewards/rejected": 0.8851231336593628, + "step": 442 + }, + { + "epoch": 0.11, + "grad_norm": 4.840351104736328, + "learning_rate": 9.86591713048764e-06, + "logits/chosen": -0.029131975024938583, + "logits/rejected": -0.08075995743274689, + "logps/chosen": -65.1787109375, + "logps/rejected": -67.21583557128906, + "loss": 1.1777, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.6149206161499023, + "rewards/margins": 1.523461103439331, + "rewards/rejected": 1.0914595127105713, + "step": 443 + }, + { + "epoch": 0.11, + "grad_norm": 3.434213638305664, + "learning_rate": 9.865313842495016e-06, + "logits/chosen": 0.020358042791485786, + "logits/rejected": -0.04285471886396408, + "logps/chosen": -59.02457046508789, + "logps/rejected": -58.95989227294922, + "loss": 0.9933, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5461559295654297, + "rewards/margins": 1.7085696458816528, + "rewards/rejected": 0.8375861644744873, + "step": 444 + }, + { + "epoch": 0.11, + "grad_norm": 2.9947352409362793, + "learning_rate": 9.864709218868757e-06, + "logits/chosen": -0.013200250454246998, + "logits/rejected": -0.22978521883487701, + "logps/chosen": -61.567626953125, + "logps/rejected": -47.65142059326172, + "loss": 0.8452, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.804619550704956, + "rewards/margins": 2.497238874435425, + "rewards/rejected": 0.3073806166648865, + "step": 445 + }, + { + "epoch": 0.11, + "grad_norm": 4.261551380157471, + "learning_rate": 9.864103259774845e-06, + "logits/chosen": -0.0799989327788353, + "logits/rejected": -0.07801874727010727, + "logps/chosen": -53.94062042236328, + "logps/rejected": -66.81256866455078, + "loss": 1.0736, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.618192434310913, + "rewards/margins": 1.441410779953003, + "rewards/rejected": 1.1767816543579102, + "step": 446 + }, + { + "epoch": 0.11, + "grad_norm": 4.083856582641602, + "learning_rate": 9.863495965379628e-06, + "logits/chosen": -0.045700348913669586, + "logits/rejected": -0.22145400941371918, + "logps/chosen": -64.2158432006836, + "logps/rejected": -54.7587776184082, + "loss": 0.9342, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.008395195007324, + "rewards/margins": 2.2872111797332764, + "rewards/rejected": 0.7211839556694031, + "step": 447 + }, + { + "epoch": 0.11, + "grad_norm": 2.3679628372192383, + "learning_rate": 9.862887335849825e-06, + "logits/chosen": -0.03622525930404663, + "logits/rejected": -0.11638150364160538, + "logps/chosen": -58.07950210571289, + "logps/rejected": -75.90277099609375, + "loss": 0.8423, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8727529048919678, + "rewards/margins": 3.034759283065796, + "rewards/rejected": -0.16200676560401917, + "step": 448 + }, + { + "epoch": 0.11, + "grad_norm": 4.107671737670898, + "learning_rate": 9.862277371352513e-06, + "logits/chosen": -0.05806519091129303, + "logits/rejected": -0.16559606790542603, + "logps/chosen": -49.288448333740234, + "logps/rejected": -63.121795654296875, + "loss": 0.9687, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.850987672805786, + "rewards/margins": 2.025204658508301, + "rewards/rejected": 0.825782835483551, + "step": 449 + }, + { + "epoch": 0.11, + "grad_norm": 4.287243366241455, + "learning_rate": 9.861666072055144e-06, + "logits/chosen": 0.011190870776772499, + "logits/rejected": -0.0617036446928978, + "logps/chosen": -62.98733139038086, + "logps/rejected": -72.34019470214844, + "loss": 1.0283, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9509384632110596, + "rewards/margins": 2.153207540512085, + "rewards/rejected": 0.797730565071106, + "step": 450 + }, + { + "epoch": 0.11, + "grad_norm": 3.6833584308624268, + "learning_rate": 9.861053438125533e-06, + "logits/chosen": 0.042592864483594894, + "logits/rejected": -0.15077394247055054, + "logps/chosen": -63.97175216674805, + "logps/rejected": -58.93878173828125, + "loss": 0.8443, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.788727283477783, + "rewards/margins": 2.603746175765991, + "rewards/rejected": 0.18498125672340393, + "step": 451 + }, + { + "epoch": 0.11, + "grad_norm": 4.268006801605225, + "learning_rate": 9.86043946973186e-06, + "logits/chosen": 0.0722481831908226, + "logits/rejected": -0.09102785587310791, + "logps/chosen": -76.43634796142578, + "logps/rejected": -69.25853729248047, + "loss": 0.9733, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6619467735290527, + "rewards/margins": 2.169867515563965, + "rewards/rejected": 0.49207961559295654, + "step": 452 + }, + { + "epoch": 0.11, + "grad_norm": 3.2851686477661133, + "learning_rate": 9.85982416704267e-06, + "logits/chosen": 0.027661608532071114, + "logits/rejected": -0.1461217701435089, + "logps/chosen": -69.65210723876953, + "logps/rejected": -62.79156494140625, + "loss": 0.8553, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.613499402999878, + "rewards/margins": 2.4376060962677, + "rewards/rejected": 0.175893634557724, + "step": 453 + }, + { + "epoch": 0.11, + "grad_norm": 3.5844571590423584, + "learning_rate": 9.859207530226882e-06, + "logits/chosen": -0.03227634355425835, + "logits/rejected": -0.19034507870674133, + "logps/chosen": -56.74742126464844, + "logps/rejected": -48.833213806152344, + "loss": 0.8917, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5521304607391357, + "rewards/margins": 1.6400330066680908, + "rewards/rejected": 0.9120975136756897, + "step": 454 + }, + { + "epoch": 0.11, + "grad_norm": 3.293713331222534, + "learning_rate": 9.85858955945377e-06, + "logits/chosen": -0.012692756950855255, + "logits/rejected": -0.18457716703414917, + "logps/chosen": -65.03292846679688, + "logps/rejected": -57.627681732177734, + "loss": 0.9028, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9127979278564453, + "rewards/margins": 2.3223226070404053, + "rewards/rejected": 0.59047532081604, + "step": 455 + }, + { + "epoch": 0.11, + "grad_norm": 3.4822208881378174, + "learning_rate": 9.857970254892987e-06, + "logits/chosen": 0.10000722110271454, + "logits/rejected": 0.013834035024046898, + "logps/chosen": -57.97611999511719, + "logps/rejected": -61.905677795410156, + "loss": 0.9809, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8087639808654785, + "rewards/margins": 1.8809846639633179, + "rewards/rejected": 0.9277796745300293, + "step": 456 + }, + { + "epoch": 0.11, + "grad_norm": 3.4643197059631348, + "learning_rate": 9.857349616714542e-06, + "logits/chosen": -0.07968011498451233, + "logits/rejected": -0.15483391284942627, + "logps/chosen": -70.49327850341797, + "logps/rejected": -62.36391067504883, + "loss": 0.929, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0861966609954834, + "rewards/margins": 2.1443440914154053, + "rewards/rejected": 0.9418526887893677, + "step": 457 + }, + { + "epoch": 0.11, + "grad_norm": 4.021754741668701, + "learning_rate": 9.856727645088812e-06, + "logits/chosen": -0.15897925198078156, + "logits/rejected": -0.09149616956710815, + "logps/chosen": -51.94213104248047, + "logps/rejected": -86.05289459228516, + "loss": 1.0423, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.685609817504883, + "rewards/margins": 1.6232128143310547, + "rewards/rejected": 1.0623968839645386, + "step": 458 + }, + { + "epoch": 0.11, + "grad_norm": 3.3858132362365723, + "learning_rate": 9.856104340186546e-06, + "logits/chosen": -0.12203320115804672, + "logits/rejected": -0.20025041699409485, + "logps/chosen": -63.176185607910156, + "logps/rejected": -51.28822708129883, + "loss": 1.022, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.863945484161377, + "rewards/margins": 2.566744804382324, + "rewards/rejected": 0.2972005307674408, + "step": 459 + }, + { + "epoch": 0.12, + "grad_norm": 3.4197356700897217, + "learning_rate": 9.855479702178851e-06, + "logits/chosen": -0.015114065259695053, + "logits/rejected": -0.13968518376350403, + "logps/chosen": -50.770389556884766, + "logps/rejected": -53.54432678222656, + "loss": 0.901, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.559453248977661, + "rewards/margins": 1.90969979763031, + "rewards/rejected": 0.6497534513473511, + "step": 460 + }, + { + "epoch": 0.12, + "grad_norm": 2.7760331630706787, + "learning_rate": 9.854853731237205e-06, + "logits/chosen": 0.030103938654065132, + "logits/rejected": -0.2144111841917038, + "logps/chosen": -55.96912384033203, + "logps/rejected": -54.291133880615234, + "loss": 0.7633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9046239852905273, + "rewards/margins": 2.9267795085906982, + "rewards/rejected": -0.022155851125717163, + "step": 461 + }, + { + "epoch": 0.12, + "grad_norm": 3.770549774169922, + "learning_rate": 9.85422642753345e-06, + "logits/chosen": -0.0661381334066391, + "logits/rejected": -0.11352121829986572, + "logps/chosen": -55.53028869628906, + "logps/rejected": -66.22946166992188, + "loss": 0.9971, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.703213691711426, + "rewards/margins": 2.0050106048583984, + "rewards/rejected": 0.6982032060623169, + "step": 462 + }, + { + "epoch": 0.12, + "grad_norm": 4.630494117736816, + "learning_rate": 9.853597791239795e-06, + "logits/chosen": 0.03215176612138748, + "logits/rejected": -0.02613789215683937, + "logps/chosen": -64.9883804321289, + "logps/rejected": -77.44344329833984, + "loss": 1.0602, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.801762104034424, + "rewards/margins": 1.5098689794540405, + "rewards/rejected": 1.2918930053710938, + "step": 463 + }, + { + "epoch": 0.12, + "grad_norm": 3.481692314147949, + "learning_rate": 9.852967822528814e-06, + "logits/chosen": -0.09060729295015335, + "logits/rejected": -0.10133375227451324, + "logps/chosen": -52.235130310058594, + "logps/rejected": -82.69760131835938, + "loss": 1.0197, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0269665718078613, + "rewards/margins": 2.0785973072052, + "rewards/rejected": 0.9483695030212402, + "step": 464 + }, + { + "epoch": 0.12, + "grad_norm": 3.1038739681243896, + "learning_rate": 9.852336521573447e-06, + "logits/chosen": -0.013447847217321396, + "logits/rejected": -0.14249031245708466, + "logps/chosen": -58.141929626464844, + "logps/rejected": -53.23253631591797, + "loss": 0.9559, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.734299421310425, + "rewards/margins": 1.8599414825439453, + "rewards/rejected": 0.874358057975769, + "step": 465 + }, + { + "epoch": 0.12, + "grad_norm": 3.4218637943267822, + "learning_rate": 9.851703888546998e-06, + "logits/chosen": -0.007916122674942017, + "logits/rejected": -0.15505078434944153, + "logps/chosen": -54.792144775390625, + "logps/rejected": -50.242462158203125, + "loss": 0.9347, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7431931495666504, + "rewards/margins": 1.894773244857788, + "rewards/rejected": 0.8484199047088623, + "step": 466 + }, + { + "epoch": 0.12, + "grad_norm": 3.7231857776641846, + "learning_rate": 9.851069923623142e-06, + "logits/chosen": -0.026970431208610535, + "logits/rejected": -0.19770610332489014, + "logps/chosen": -66.10816955566406, + "logps/rejected": -65.53594207763672, + "loss": 0.9283, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.700960397720337, + "rewards/margins": 1.9498785734176636, + "rewards/rejected": 0.7510817646980286, + "step": 467 + }, + { + "epoch": 0.12, + "grad_norm": 3.188694715499878, + "learning_rate": 9.850434626975913e-06, + "logits/chosen": 0.003175400197505951, + "logits/rejected": -0.11405792087316513, + "logps/chosen": -56.897239685058594, + "logps/rejected": -68.18517303466797, + "loss": 0.8581, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8374903202056885, + "rewards/margins": 2.3133134841918945, + "rewards/rejected": 0.5241771340370178, + "step": 468 + }, + { + "epoch": 0.12, + "grad_norm": 4.698224067687988, + "learning_rate": 9.849797998779715e-06, + "logits/chosen": -0.06917021423578262, + "logits/rejected": -0.13968852162361145, + "logps/chosen": -58.9385986328125, + "logps/rejected": -64.15028381347656, + "loss": 1.2067, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.762925624847412, + "rewards/margins": 1.52622389793396, + "rewards/rejected": 1.2367018461227417, + "step": 469 + }, + { + "epoch": 0.12, + "grad_norm": 3.6097660064697266, + "learning_rate": 9.849160039209317e-06, + "logits/chosen": -0.004413010086864233, + "logits/rejected": -0.12797263264656067, + "logps/chosen": -60.69731521606445, + "logps/rejected": -65.43778991699219, + "loss": 0.9601, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.889098882675171, + "rewards/margins": 2.0791218280792236, + "rewards/rejected": 0.8099772334098816, + "step": 470 + }, + { + "epoch": 0.12, + "grad_norm": 3.9221527576446533, + "learning_rate": 9.84852074843985e-06, + "logits/chosen": 0.05608116090297699, + "logits/rejected": -0.009649816900491714, + "logps/chosen": -56.121551513671875, + "logps/rejected": -66.02884674072266, + "loss": 0.9725, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.862656831741333, + "rewards/margins": 1.722922682762146, + "rewards/rejected": 1.139734148979187, + "step": 471 + }, + { + "epoch": 0.12, + "grad_norm": 3.964691162109375, + "learning_rate": 9.847880126646816e-06, + "logits/chosen": -0.06388813257217407, + "logits/rejected": -0.20222169160842896, + "logps/chosen": -58.198360443115234, + "logps/rejected": -53.716529846191406, + "loss": 0.8853, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8881173133850098, + "rewards/margins": 2.5314252376556396, + "rewards/rejected": 0.3566921353340149, + "step": 472 + }, + { + "epoch": 0.12, + "grad_norm": 2.8080625534057617, + "learning_rate": 9.847238174006078e-06, + "logits/chosen": -0.08530353009700775, + "logits/rejected": -0.21742284297943115, + "logps/chosen": -53.07892608642578, + "logps/rejected": -54.67731475830078, + "loss": 0.8073, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.747095823287964, + "rewards/margins": 2.3746097087860107, + "rewards/rejected": 0.3724862337112427, + "step": 473 + }, + { + "epoch": 0.12, + "grad_norm": 3.7247474193573, + "learning_rate": 9.846594890693865e-06, + "logits/chosen": 0.04224624112248421, + "logits/rejected": -0.015054870396852493, + "logps/chosen": -61.65394592285156, + "logps/rejected": -71.54991149902344, + "loss": 0.9005, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8484420776367188, + "rewards/margins": 2.284961462020874, + "rewards/rejected": 0.5634804368019104, + "step": 474 + }, + { + "epoch": 0.12, + "grad_norm": 3.418570041656494, + "learning_rate": 9.845950276886775e-06, + "logits/chosen": -0.06362097710371017, + "logits/rejected": -0.18925222754478455, + "logps/chosen": -56.581214904785156, + "logps/rejected": -58.23436737060547, + "loss": 0.9885, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0406270027160645, + "rewards/margins": 2.275615692138672, + "rewards/rejected": 0.7650110721588135, + "step": 475 + }, + { + "epoch": 0.12, + "grad_norm": 4.2469611167907715, + "learning_rate": 9.845304332761767e-06, + "logits/chosen": 0.036813218146562576, + "logits/rejected": -0.13768121600151062, + "logps/chosen": -58.58919906616211, + "logps/rejected": -62.97975158691406, + "loss": 0.8207, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8997554779052734, + "rewards/margins": 2.4519858360290527, + "rewards/rejected": 0.44776949286460876, + "step": 476 + }, + { + "epoch": 0.12, + "grad_norm": 2.4797844886779785, + "learning_rate": 9.844657058496165e-06, + "logits/chosen": -0.06912951916456223, + "logits/rejected": -0.2797728180885315, + "logps/chosen": -66.98773956298828, + "logps/rejected": -52.2148323059082, + "loss": 0.7652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.106898307800293, + "rewards/margins": 2.670491933822632, + "rewards/rejected": 0.43640631437301636, + "step": 477 + }, + { + "epoch": 0.12, + "grad_norm": 3.656724691390991, + "learning_rate": 9.84400845426766e-06, + "logits/chosen": 0.028403084725141525, + "logits/rejected": -0.09584075212478638, + "logps/chosen": -54.420379638671875, + "logps/rejected": -60.19508743286133, + "loss": 0.963, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6168949604034424, + "rewards/margins": 2.2809135913848877, + "rewards/rejected": 0.33598172664642334, + "step": 478 + }, + { + "epoch": 0.12, + "grad_norm": 2.9018397331237793, + "learning_rate": 9.84335852025431e-06, + "logits/chosen": 0.00853738933801651, + "logits/rejected": -0.1198916882276535, + "logps/chosen": -64.0625, + "logps/rejected": -68.0236587524414, + "loss": 0.9138, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9419870376586914, + "rewards/margins": 2.620990514755249, + "rewards/rejected": 0.3209964334964752, + "step": 479 + }, + { + "epoch": 0.12, + "grad_norm": 3.4609925746917725, + "learning_rate": 9.842707256634534e-06, + "logits/chosen": -0.062148772180080414, + "logits/rejected": -0.13665491342544556, + "logps/chosen": -59.69514083862305, + "logps/rejected": -69.12947082519531, + "loss": 0.9042, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.636455535888672, + "rewards/margins": 1.732969880104065, + "rewards/rejected": 0.9034857749938965, + "step": 480 + }, + { + "epoch": 0.12, + "grad_norm": 3.191030502319336, + "learning_rate": 9.84205466358712e-06, + "logits/chosen": 0.000547308474779129, + "logits/rejected": -0.08674446493387222, + "logps/chosen": -57.75986862182617, + "logps/rejected": -59.826904296875, + "loss": 0.9019, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0820960998535156, + "rewards/margins": 2.1280465126037598, + "rewards/rejected": 0.9540495276451111, + "step": 481 + }, + { + "epoch": 0.12, + "grad_norm": 3.4758718013763428, + "learning_rate": 9.841400741291218e-06, + "logits/chosen": -0.007180148735642433, + "logits/rejected": -0.11468525230884552, + "logps/chosen": -58.48889923095703, + "logps/rejected": -66.64057922363281, + "loss": 0.9098, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8283615112304688, + "rewards/margins": 2.170295238494873, + "rewards/rejected": 0.6580663919448853, + "step": 482 + }, + { + "epoch": 0.12, + "grad_norm": 3.818141460418701, + "learning_rate": 9.84074548992634e-06, + "logits/chosen": 0.0050216373056173325, + "logits/rejected": -0.1915304958820343, + "logps/chosen": -64.25038146972656, + "logps/rejected": -52.19258117675781, + "loss": 0.9119, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4876646995544434, + "rewards/margins": 1.8279626369476318, + "rewards/rejected": 0.6597020030021667, + "step": 483 + }, + { + "epoch": 0.12, + "grad_norm": 2.6857616901397705, + "learning_rate": 9.840088909672373e-06, + "logits/chosen": -0.053144700825214386, + "logits/rejected": -0.2235724776983261, + "logps/chosen": -55.24123001098633, + "logps/rejected": -52.637237548828125, + "loss": 0.7941, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8921499252319336, + "rewards/margins": 2.868277072906494, + "rewards/rejected": 0.02387280762195587, + "step": 484 + }, + { + "epoch": 0.12, + "grad_norm": 4.036463737487793, + "learning_rate": 9.839431000709559e-06, + "logits/chosen": 0.017525333911180496, + "logits/rejected": -0.06975764036178589, + "logps/chosen": -51.892425537109375, + "logps/rejected": -58.18707275390625, + "loss": 0.8236, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7550339698791504, + "rewards/margins": 2.587698459625244, + "rewards/rejected": 0.16733527183532715, + "step": 485 + }, + { + "epoch": 0.12, + "grad_norm": 3.555576801300049, + "learning_rate": 9.838771763218509e-06, + "logits/chosen": -0.03847677260637283, + "logits/rejected": -0.17012850940227509, + "logps/chosen": -63.56205749511719, + "logps/rejected": -68.4330062866211, + "loss": 0.9757, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7464499473571777, + "rewards/margins": 2.2907795906066895, + "rewards/rejected": 0.45567023754119873, + "step": 486 + }, + { + "epoch": 0.12, + "grad_norm": 3.1166369915008545, + "learning_rate": 9.838111197380196e-06, + "logits/chosen": -0.02516135759651661, + "logits/rejected": -0.242792010307312, + "logps/chosen": -60.818153381347656, + "logps/rejected": -48.232906341552734, + "loss": 0.844, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6734964847564697, + "rewards/margins": 2.4358654022216797, + "rewards/rejected": 0.2376311719417572, + "step": 487 + }, + { + "epoch": 0.12, + "grad_norm": 2.3608438968658447, + "learning_rate": 9.83744930337596e-06, + "logits/chosen": 0.007731162011623383, + "logits/rejected": -0.1979444921016693, + "logps/chosen": -56.99720001220703, + "logps/rejected": -55.401641845703125, + "loss": 0.7401, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.942258834838867, + "rewards/margins": 3.3669893741607666, + "rewards/rejected": -0.42473068833351135, + "step": 488 + }, + { + "epoch": 0.12, + "grad_norm": 3.938082695007324, + "learning_rate": 9.83678608138751e-06, + "logits/chosen": -0.004825220443308353, + "logits/rejected": -0.030119318515062332, + "logps/chosen": -63.027278900146484, + "logps/rejected": -66.25724792480469, + "loss": 1.0805, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.5605368614196777, + "rewards/margins": 1.704225778579712, + "rewards/rejected": 0.856311023235321, + "step": 489 + }, + { + "epoch": 0.12, + "grad_norm": 3.958740711212158, + "learning_rate": 9.83612153159691e-06, + "logits/chosen": -0.08574078977108002, + "logits/rejected": -0.1386854350566864, + "logps/chosen": -52.838775634765625, + "logps/rejected": -68.84082794189453, + "loss": 1.0325, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.1475017070770264, + "rewards/margins": 2.1999716758728027, + "rewards/rejected": 0.9475297927856445, + "step": 490 + }, + { + "epoch": 0.12, + "grad_norm": 3.9479949474334717, + "learning_rate": 9.835455654186592e-06, + "logits/chosen": 0.05250265449285507, + "logits/rejected": -0.14192374050617218, + "logps/chosen": -59.605125427246094, + "logps/rejected": -66.10867309570312, + "loss": 0.9071, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.528876304626465, + "rewards/margins": 2.204399585723877, + "rewards/rejected": 0.3244765102863312, + "step": 491 + }, + { + "epoch": 0.12, + "grad_norm": 3.696012020111084, + "learning_rate": 9.834788449339359e-06, + "logits/chosen": -0.0025520608760416508, + "logits/rejected": -0.182359978556633, + "logps/chosen": -63.05535888671875, + "logps/rejected": -59.743736267089844, + "loss": 0.8308, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.900625467300415, + "rewards/margins": 2.5573368072509766, + "rewards/rejected": 0.34328868985176086, + "step": 492 + }, + { + "epoch": 0.12, + "grad_norm": 2.8583054542541504, + "learning_rate": 9.834119917238367e-06, + "logits/chosen": -0.0003175027668476105, + "logits/rejected": -0.18533232808113098, + "logps/chosen": -58.680824279785156, + "logps/rejected": -54.74722671508789, + "loss": 0.8107, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6925432682037354, + "rewards/margins": 2.687164783477783, + "rewards/rejected": 0.005378469824790955, + "step": 493 + }, + { + "epoch": 0.12, + "grad_norm": 3.960700273513794, + "learning_rate": 9.83345005806715e-06, + "logits/chosen": 0.041607167571783066, + "logits/rejected": -0.10215041041374207, + "logps/chosen": -55.28461456298828, + "logps/rejected": -66.87191009521484, + "loss": 0.8937, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.842144250869751, + "rewards/margins": 2.7039785385131836, + "rewards/rejected": 0.13816598057746887, + "step": 494 + }, + { + "epoch": 0.12, + "grad_norm": 3.8841464519500732, + "learning_rate": 9.832778872009589e-06, + "logits/chosen": -0.08893643319606781, + "logits/rejected": -0.13451716303825378, + "logps/chosen": -49.77823257446289, + "logps/rejected": -60.443084716796875, + "loss": 0.923, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.794346332550049, + "rewards/margins": 1.8893059492111206, + "rewards/rejected": 0.9050402641296387, + "step": 495 + }, + { + "epoch": 0.12, + "grad_norm": 3.9252748489379883, + "learning_rate": 9.83210635924995e-06, + "logits/chosen": 0.043377120047807693, + "logits/rejected": -0.15091022849082947, + "logps/chosen": -66.14641571044922, + "logps/rejected": -50.48063278198242, + "loss": 0.9842, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.622011184692383, + "rewards/margins": 2.3446195125579834, + "rewards/rejected": 0.2773916721343994, + "step": 496 + }, + { + "epoch": 0.12, + "grad_norm": 3.77693247795105, + "learning_rate": 9.831432519972841e-06, + "logits/chosen": 0.06321071833372116, + "logits/rejected": 0.014548787847161293, + "logps/chosen": -63.7274169921875, + "logps/rejected": -73.34379577636719, + "loss": 0.9152, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9543683528900146, + "rewards/margins": 1.9797430038452148, + "rewards/rejected": 0.9746251106262207, + "step": 497 + }, + { + "epoch": 0.12, + "grad_norm": 3.273487091064453, + "learning_rate": 9.830757354363257e-06, + "logits/chosen": -0.13114100694656372, + "logits/rejected": -0.22353339195251465, + "logps/chosen": -53.957611083984375, + "logps/rejected": -54.46642303466797, + "loss": 0.8576, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9723708629608154, + "rewards/margins": 2.8511972427368164, + "rewards/rejected": 0.12117373943328857, + "step": 498 + }, + { + "epoch": 0.12, + "grad_norm": 3.7030835151672363, + "learning_rate": 9.830080862606535e-06, + "logits/chosen": 0.015730539336800575, + "logits/rejected": -0.13226653635501862, + "logps/chosen": -53.01581954956055, + "logps/rejected": -53.25721740722656, + "loss": 0.8642, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8656697273254395, + "rewards/margins": 2.371922731399536, + "rewards/rejected": 0.493746817111969, + "step": 499 + }, + { + "epoch": 0.13, + "grad_norm": 2.9327585697174072, + "learning_rate": 9.829403044888393e-06, + "logits/chosen": -0.03371080383658409, + "logits/rejected": -0.20165881514549255, + "logps/chosen": -62.30732727050781, + "logps/rejected": -67.53357696533203, + "loss": 0.8268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8243117332458496, + "rewards/margins": 2.8377413749694824, + "rewards/rejected": -0.013429783284664154, + "step": 500 + }, + { + "epoch": 0.13, + "grad_norm": 3.652475595474243, + "learning_rate": 9.828723901394906e-06, + "logits/chosen": -0.085425466299057, + "logits/rejected": -0.1234988272190094, + "logps/chosen": -54.14015197753906, + "logps/rejected": -68.406005859375, + "loss": 0.9275, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6105234622955322, + "rewards/margins": 2.023777484893799, + "rewards/rejected": 0.5867457985877991, + "step": 501 + }, + { + "epoch": 0.13, + "grad_norm": 3.042271852493286, + "learning_rate": 9.82804343231251e-06, + "logits/chosen": -0.020767521113157272, + "logits/rejected": -0.17755760252475739, + "logps/chosen": -54.41246795654297, + "logps/rejected": -53.89155197143555, + "loss": 0.8903, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.995321035385132, + "rewards/margins": 2.3284449577331543, + "rewards/rejected": 0.6668760180473328, + "step": 502 + }, + { + "epoch": 0.13, + "grad_norm": 3.810899496078491, + "learning_rate": 9.827361637828013e-06, + "logits/chosen": -0.07988891005516052, + "logits/rejected": -0.10332205146551132, + "logps/chosen": -49.19530487060547, + "logps/rejected": -55.25311279296875, + "loss": 1.0041, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.862119674682617, + "rewards/margins": 1.8835234642028809, + "rewards/rejected": 0.9785960912704468, + "step": 503 + }, + { + "epoch": 0.13, + "grad_norm": 2.9719834327697754, + "learning_rate": 9.82667851812858e-06, + "logits/chosen": 0.11198356747627258, + "logits/rejected": -0.0690038874745369, + "logps/chosen": -54.003623962402344, + "logps/rejected": -54.395172119140625, + "loss": 0.7689, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9615180492401123, + "rewards/margins": 2.888880968093872, + "rewards/rejected": 0.07263703644275665, + "step": 504 + }, + { + "epoch": 0.13, + "grad_norm": 3.6256935596466064, + "learning_rate": 9.825994073401741e-06, + "logits/chosen": 0.01643769070506096, + "logits/rejected": -0.1639700084924698, + "logps/chosen": -62.88530731201172, + "logps/rejected": -57.163177490234375, + "loss": 0.865, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.090742588043213, + "rewards/margins": 2.6983823776245117, + "rewards/rejected": 0.3923600912094116, + "step": 505 + }, + { + "epoch": 0.13, + "grad_norm": 5.312520503997803, + "learning_rate": 9.825308303835393e-06, + "logits/chosen": -0.014896306209266186, + "logits/rejected": -0.1637095957994461, + "logps/chosen": -83.91957092285156, + "logps/rejected": -59.413631439208984, + "loss": 1.1659, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.66959285736084, + "rewards/margins": 1.491684913635254, + "rewards/rejected": 1.1779078245162964, + "step": 506 + }, + { + "epoch": 0.13, + "grad_norm": 3.891472339630127, + "learning_rate": 9.824621209617795e-06, + "logits/chosen": -0.04219694435596466, + "logits/rejected": -0.12983641028404236, + "logps/chosen": -62.74514389038086, + "logps/rejected": -78.75426483154297, + "loss": 1.0699, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.963339328765869, + "rewards/margins": 2.0533900260925293, + "rewards/rejected": 0.9099494218826294, + "step": 507 + }, + { + "epoch": 0.13, + "grad_norm": 4.048131465911865, + "learning_rate": 9.823932790937565e-06, + "logits/chosen": 0.0006662048399448395, + "logits/rejected": -0.15469971299171448, + "logps/chosen": -52.33203887939453, + "logps/rejected": -54.33139419555664, + "loss": 0.8253, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.917130947113037, + "rewards/margins": 2.4925713539123535, + "rewards/rejected": 0.42455947399139404, + "step": 508 + }, + { + "epoch": 0.13, + "grad_norm": 4.257034778594971, + "learning_rate": 9.823243047983693e-06, + "logits/chosen": -0.03696047514677048, + "logits/rejected": -0.1488901823759079, + "logps/chosen": -60.70420455932617, + "logps/rejected": -64.16670227050781, + "loss": 0.8449, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6156206130981445, + "rewards/margins": 2.5404186248779297, + "rewards/rejected": 0.07520219683647156, + "step": 509 + }, + { + "epoch": 0.13, + "grad_norm": 4.6322855949401855, + "learning_rate": 9.822551980945526e-06, + "logits/chosen": -0.0840168446302414, + "logits/rejected": -0.18366177380084991, + "logps/chosen": -50.179107666015625, + "logps/rejected": -53.36334991455078, + "loss": 1.03, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.7218973636627197, + "rewards/margins": 1.7859894037246704, + "rewards/rejected": 0.9359080195426941, + "step": 510 + }, + { + "epoch": 0.13, + "grad_norm": 4.616367816925049, + "learning_rate": 9.821859590012781e-06, + "logits/chosen": 0.024544915184378624, + "logits/rejected": -0.14623841643333435, + "logps/chosen": -63.71245574951172, + "logps/rejected": -52.26854705810547, + "loss": 0.947, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9353818893432617, + "rewards/margins": 2.147740364074707, + "rewards/rejected": 0.7876416444778442, + "step": 511 + }, + { + "epoch": 0.13, + "grad_norm": 3.5393741130828857, + "learning_rate": 9.821165875375528e-06, + "logits/chosen": -0.09086433053016663, + "logits/rejected": -0.17516659200191498, + "logps/chosen": -50.93175506591797, + "logps/rejected": -58.48527526855469, + "loss": 0.9141, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.899972438812256, + "rewards/margins": 2.2061004638671875, + "rewards/rejected": 0.6938722133636475, + "step": 512 + }, + { + "epoch": 0.13, + "grad_norm": 3.7920799255371094, + "learning_rate": 9.82047083722421e-06, + "logits/chosen": 0.02260672301054001, + "logits/rejected": -0.05604167282581329, + "logps/chosen": -60.14942932128906, + "logps/rejected": -68.444580078125, + "loss": 0.9134, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6244444847106934, + "rewards/margins": 2.16825008392334, + "rewards/rejected": 0.4561944603919983, + "step": 513 + }, + { + "epoch": 0.13, + "grad_norm": 3.6998097896575928, + "learning_rate": 9.81977447574963e-06, + "logits/chosen": -0.12578615546226501, + "logits/rejected": -0.27375295758247375, + "logps/chosen": -63.39228820800781, + "logps/rejected": -65.57518005371094, + "loss": 1.0427, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7301228046417236, + "rewards/margins": 2.106004238128662, + "rewards/rejected": 0.6241183876991272, + "step": 514 + }, + { + "epoch": 0.13, + "grad_norm": 4.273169040679932, + "learning_rate": 9.819076791142954e-06, + "logits/chosen": -0.030061617493629456, + "logits/rejected": -0.1029096320271492, + "logps/chosen": -59.41151428222656, + "logps/rejected": -71.95787048339844, + "loss": 0.9041, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.837268829345703, + "rewards/margins": 1.9663000106811523, + "rewards/rejected": 0.8709684014320374, + "step": 515 + }, + { + "epoch": 0.13, + "grad_norm": 4.641862392425537, + "learning_rate": 9.818377783595712e-06, + "logits/chosen": -0.040163252502679825, + "logits/rejected": -0.1453232318162918, + "logps/chosen": -61.94207000732422, + "logps/rejected": -67.53766632080078, + "loss": 1.0915, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6505837440490723, + "rewards/margins": 1.8322275876998901, + "rewards/rejected": 0.8183560967445374, + "step": 516 + }, + { + "epoch": 0.13, + "grad_norm": 2.942790985107422, + "learning_rate": 9.817677453299795e-06, + "logits/chosen": -0.01898978091776371, + "logits/rejected": -0.10045170783996582, + "logps/chosen": -52.49225616455078, + "logps/rejected": -63.98207092285156, + "loss": 0.7796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0320029258728027, + "rewards/margins": 2.672926902770996, + "rewards/rejected": 0.35907602310180664, + "step": 517 + }, + { + "epoch": 0.13, + "grad_norm": 5.975857257843018, + "learning_rate": 9.816975800447461e-06, + "logits/chosen": -0.035093966871500015, + "logits/rejected": -0.14166782796382904, + "logps/chosen": -63.27382278442383, + "logps/rejected": -64.23551940917969, + "loss": 1.1571, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8225483894348145, + "rewards/margins": 1.8990042209625244, + "rewards/rejected": 0.9235442280769348, + "step": 518 + }, + { + "epoch": 0.13, + "grad_norm": 4.144193172454834, + "learning_rate": 9.816272825231325e-06, + "logits/chosen": -0.0399937741458416, + "logits/rejected": -0.16388177871704102, + "logps/chosen": -57.639102935791016, + "logps/rejected": -61.04917526245117, + "loss": 0.972, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.923555850982666, + "rewards/margins": 2.5271599292755127, + "rewards/rejected": 0.39639565348625183, + "step": 519 + }, + { + "epoch": 0.13, + "grad_norm": 3.234992265701294, + "learning_rate": 9.815568527844375e-06, + "logits/chosen": -0.07972769439220428, + "logits/rejected": -0.19795134663581848, + "logps/chosen": -52.29186248779297, + "logps/rejected": -74.02799987792969, + "loss": 0.8049, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.987074851989746, + "rewards/margins": 2.552532911300659, + "rewards/rejected": 0.4345419406890869, + "step": 520 + }, + { + "epoch": 0.13, + "grad_norm": 4.829776287078857, + "learning_rate": 9.81486290847995e-06, + "logits/chosen": -0.0484817773103714, + "logits/rejected": -0.09385447949171066, + "logps/chosen": -54.49750900268555, + "logps/rejected": -50.945716857910156, + "loss": 1.1389, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.600346803665161, + "rewards/margins": 1.6658990383148193, + "rewards/rejected": 0.9344479441642761, + "step": 521 + }, + { + "epoch": 0.13, + "grad_norm": 3.4265191555023193, + "learning_rate": 9.81415596733176e-06, + "logits/chosen": -0.13042020797729492, + "logits/rejected": -0.19311165809631348, + "logps/chosen": -51.20612335205078, + "logps/rejected": -58.13950729370117, + "loss": 0.9183, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.96053409576416, + "rewards/margins": 1.9926297664642334, + "rewards/rejected": 0.9679044485092163, + "step": 522 + }, + { + "epoch": 0.13, + "grad_norm": 5.384894371032715, + "learning_rate": 9.813447704593876e-06, + "logits/chosen": -0.05264032632112503, + "logits/rejected": -0.18974669277668, + "logps/chosen": -61.23318862915039, + "logps/rejected": -65.49850463867188, + "loss": 0.9772, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8727850914001465, + "rewards/margins": 2.440981864929199, + "rewards/rejected": 0.4318028688430786, + "step": 523 + }, + { + "epoch": 0.13, + "grad_norm": 3.1691057682037354, + "learning_rate": 9.812738120460732e-06, + "logits/chosen": 0.027985606342554092, + "logits/rejected": -0.12590044736862183, + "logps/chosen": -51.862178802490234, + "logps/rejected": -59.226375579833984, + "loss": 0.7733, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8264989852905273, + "rewards/margins": 2.423421859741211, + "rewards/rejected": 0.4030773341655731, + "step": 524 + }, + { + "epoch": 0.13, + "grad_norm": 3.735438823699951, + "learning_rate": 9.812027215127123e-06, + "logits/chosen": -0.08470699191093445, + "logits/rejected": -0.1349620223045349, + "logps/chosen": -54.640625, + "logps/rejected": -81.62757873535156, + "loss": 0.8523, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9634785652160645, + "rewards/margins": 2.5684714317321777, + "rewards/rejected": 0.39500686526298523, + "step": 525 + }, + { + "epoch": 0.13, + "grad_norm": 4.121676921844482, + "learning_rate": 9.811314988788207e-06, + "logits/chosen": 0.02384207956492901, + "logits/rejected": -0.12852323055267334, + "logps/chosen": -67.39913940429688, + "logps/rejected": -58.28443145751953, + "loss": 1.0219, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7488481998443604, + "rewards/margins": 2.2010433673858643, + "rewards/rejected": 0.54780513048172, + "step": 526 + }, + { + "epoch": 0.13, + "grad_norm": 3.632303237915039, + "learning_rate": 9.810601441639508e-06, + "logits/chosen": -0.13373807072639465, + "logits/rejected": -0.223899707198143, + "logps/chosen": -54.97880935668945, + "logps/rejected": -82.34892272949219, + "loss": 0.8595, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.62204909324646, + "rewards/margins": 2.3793230056762695, + "rewards/rejected": 0.24272623658180237, + "step": 527 + }, + { + "epoch": 0.13, + "grad_norm": 3.6386191844940186, + "learning_rate": 9.809886573876908e-06, + "logits/chosen": -0.10648635029792786, + "logits/rejected": -0.19134093821048737, + "logps/chosen": -55.78590774536133, + "logps/rejected": -57.64155197143555, + "loss": 0.9206, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7681498527526855, + "rewards/margins": 2.158360242843628, + "rewards/rejected": 0.6097896099090576, + "step": 528 + }, + { + "epoch": 0.13, + "grad_norm": 4.3225483894348145, + "learning_rate": 9.809170385696655e-06, + "logits/chosen": -0.08817293494939804, + "logits/rejected": -0.13560442626476288, + "logps/chosen": -62.483863830566406, + "logps/rejected": -74.60025024414062, + "loss": 0.9123, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9764182567596436, + "rewards/margins": 2.6768059730529785, + "rewards/rejected": 0.29961228370666504, + "step": 529 + }, + { + "epoch": 0.13, + "grad_norm": 3.42037034034729, + "learning_rate": 9.808452877295356e-06, + "logits/chosen": -0.08659806847572327, + "logits/rejected": -0.17735978960990906, + "logps/chosen": -55.78330993652344, + "logps/rejected": -55.744728088378906, + "loss": 0.9145, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.718289375305176, + "rewards/margins": 2.3336212635040283, + "rewards/rejected": 0.3846678137779236, + "step": 530 + }, + { + "epoch": 0.13, + "grad_norm": 3.7814533710479736, + "learning_rate": 9.807734048869985e-06, + "logits/chosen": 0.0036407634615898132, + "logits/rejected": -0.17239350080490112, + "logps/chosen": -51.597293853759766, + "logps/rejected": -54.03070068359375, + "loss": 0.8153, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8568623065948486, + "rewards/margins": 2.737332344055176, + "rewards/rejected": 0.11952987313270569, + "step": 531 + }, + { + "epoch": 0.13, + "grad_norm": 4.5141801834106445, + "learning_rate": 9.807013900617874e-06, + "logits/chosen": -0.037184201180934906, + "logits/rejected": -0.21959452331066132, + "logps/chosen": -63.106689453125, + "logps/rejected": -62.31342697143555, + "loss": 0.994, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.11850905418396, + "rewards/margins": 2.5395727157592773, + "rewards/rejected": 0.5789366960525513, + "step": 532 + }, + { + "epoch": 0.13, + "grad_norm": 5.357904434204102, + "learning_rate": 9.806292432736721e-06, + "logits/chosen": 0.02428213320672512, + "logits/rejected": -0.103800930082798, + "logps/chosen": -60.49140930175781, + "logps/rejected": -59.73680877685547, + "loss": 1.1861, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.5700650215148926, + "rewards/margins": 1.4413983821868896, + "rewards/rejected": 1.128666877746582, + "step": 533 + }, + { + "epoch": 0.13, + "grad_norm": 3.2346713542938232, + "learning_rate": 9.805569645424584e-06, + "logits/chosen": -0.02134331502020359, + "logits/rejected": -0.20940886437892914, + "logps/chosen": -61.0614013671875, + "logps/rejected": -53.77326202392578, + "loss": 0.9306, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0065183639526367, + "rewards/margins": 2.737579584121704, + "rewards/rejected": 0.26893889904022217, + "step": 534 + }, + { + "epoch": 0.13, + "grad_norm": 3.690262794494629, + "learning_rate": 9.804845538879882e-06, + "logits/chosen": -0.07720813900232315, + "logits/rejected": -0.1719299852848053, + "logps/chosen": -59.99851989746094, + "logps/rejected": -67.20048522949219, + "loss": 0.9715, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.716381072998047, + "rewards/margins": 1.9637534618377686, + "rewards/rejected": 0.7526274919509888, + "step": 535 + }, + { + "epoch": 0.13, + "grad_norm": 3.5530636310577393, + "learning_rate": 9.8041201133014e-06, + "logits/chosen": -0.1162959411740303, + "logits/rejected": -0.2057361900806427, + "logps/chosen": -55.16967010498047, + "logps/rejected": -60.239505767822266, + "loss": 0.9723, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7537128925323486, + "rewards/margins": 2.1778364181518555, + "rewards/rejected": 0.5758762955665588, + "step": 536 + }, + { + "epoch": 0.13, + "grad_norm": 3.3110451698303223, + "learning_rate": 9.803393368888282e-06, + "logits/chosen": -0.089426189661026, + "logits/rejected": -0.15252433717250824, + "logps/chosen": -49.94374084472656, + "logps/rejected": -71.58256530761719, + "loss": 0.8365, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1443586349487305, + "rewards/margins": 2.546736478805542, + "rewards/rejected": 0.5976219177246094, + "step": 537 + }, + { + "epoch": 0.13, + "grad_norm": 3.7779197692871094, + "learning_rate": 9.802665305840036e-06, + "logits/chosen": -0.07237163931131363, + "logits/rejected": -0.05309601128101349, + "logps/chosen": -53.36478042602539, + "logps/rejected": -73.13338470458984, + "loss": 0.9838, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8174428939819336, + "rewards/margins": 1.753239631652832, + "rewards/rejected": 1.064203143119812, + "step": 538 + }, + { + "epoch": 0.13, + "grad_norm": 4.221880912780762, + "learning_rate": 9.801935924356528e-06, + "logits/chosen": 0.043746218085289, + "logits/rejected": -0.21265479922294617, + "logps/chosen": -64.7228012084961, + "logps/rejected": -65.87630462646484, + "loss": 0.9111, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6712558269500732, + "rewards/margins": 2.8837218284606934, + "rewards/rejected": -0.2124657779932022, + "step": 539 + }, + { + "epoch": 0.14, + "grad_norm": 4.870194435119629, + "learning_rate": 9.801205224637993e-06, + "logits/chosen": 0.039397746324539185, + "logits/rejected": -0.048466891050338745, + "logps/chosen": -60.352054595947266, + "logps/rejected": -70.1075210571289, + "loss": 1.0237, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.591036796569824, + "rewards/margins": 1.696396827697754, + "rewards/rejected": 0.8946399092674255, + "step": 540 + }, + { + "epoch": 0.14, + "grad_norm": 3.754122495651245, + "learning_rate": 9.800473206885022e-06, + "logits/chosen": -0.03172747418284416, + "logits/rejected": -0.16714216768741608, + "logps/chosen": -67.63609313964844, + "logps/rejected": -58.07219696044922, + "loss": 1.0768, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.449279308319092, + "rewards/margins": 1.9269981384277344, + "rewards/rejected": 0.5222810506820679, + "step": 541 + }, + { + "epoch": 0.14, + "grad_norm": 2.86950945854187, + "learning_rate": 9.799739871298568e-06, + "logits/chosen": 0.002354845404624939, + "logits/rejected": -0.13800857961177826, + "logps/chosen": -62.05704116821289, + "logps/rejected": -62.56850814819336, + "loss": 0.8084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.80098819732666, + "rewards/margins": 2.9201531410217285, + "rewards/rejected": -0.11916498839855194, + "step": 542 + }, + { + "epoch": 0.14, + "grad_norm": 4.210236072540283, + "learning_rate": 9.799005218079951e-06, + "logits/chosen": 0.04429261013865471, + "logits/rejected": 0.022497178986668587, + "logps/chosen": -59.29057312011719, + "logps/rejected": -71.5653076171875, + "loss": 0.9786, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6666202545166016, + "rewards/margins": 1.9367644786834717, + "rewards/rejected": 0.7298556566238403, + "step": 543 + }, + { + "epoch": 0.14, + "grad_norm": 4.082361698150635, + "learning_rate": 9.798269247430847e-06, + "logits/chosen": -0.06402771174907684, + "logits/rejected": -0.17028431594371796, + "logps/chosen": -70.03924560546875, + "logps/rejected": -91.90506744384766, + "loss": 0.8974, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7028002738952637, + "rewards/margins": 2.3807122707366943, + "rewards/rejected": 0.3220878839492798, + "step": 544 + }, + { + "epoch": 0.14, + "grad_norm": 4.880690574645996, + "learning_rate": 9.797531959553294e-06, + "logits/chosen": -0.09058420360088348, + "logits/rejected": -0.04218412563204765, + "logps/chosen": -55.48250961303711, + "logps/rejected": -85.19142150878906, + "loss": 1.0626, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7742409706115723, + "rewards/margins": 1.668440341949463, + "rewards/rejected": 1.1058006286621094, + "step": 545 + }, + { + "epoch": 0.14, + "grad_norm": 3.4292874336242676, + "learning_rate": 9.796793354649698e-06, + "logits/chosen": -0.006789376959204674, + "logits/rejected": -0.007505893707275391, + "logps/chosen": -47.90571975708008, + "logps/rejected": -77.75823211669922, + "loss": 0.8263, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0948643684387207, + "rewards/margins": 2.427980899810791, + "rewards/rejected": 0.6668835282325745, + "step": 546 + }, + { + "epoch": 0.14, + "grad_norm": 4.898957252502441, + "learning_rate": 9.79605343292282e-06, + "logits/chosen": -0.018055152148008347, + "logits/rejected": -0.07226528227329254, + "logps/chosen": -60.50147247314453, + "logps/rejected": -68.35020446777344, + "loss": 1.0855, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.656992197036743, + "rewards/margins": 1.7259349822998047, + "rewards/rejected": 0.9310573935508728, + "step": 547 + }, + { + "epoch": 0.14, + "grad_norm": 4.176148891448975, + "learning_rate": 9.79531219457578e-06, + "logits/chosen": -0.017507418990135193, + "logits/rejected": -0.07778199017047882, + "logps/chosen": -50.61406707763672, + "logps/rejected": -61.97324752807617, + "loss": 0.9109, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.973308563232422, + "rewards/margins": 2.2222957611083984, + "rewards/rejected": 0.7510132193565369, + "step": 548 + }, + { + "epoch": 0.14, + "grad_norm": 3.3521223068237305, + "learning_rate": 9.794569639812072e-06, + "logits/chosen": -0.07766470313072205, + "logits/rejected": -0.21724431216716766, + "logps/chosen": -55.436031341552734, + "logps/rejected": -54.244590759277344, + "loss": 0.8569, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.001025438308716, + "rewards/margins": 2.91864013671875, + "rewards/rejected": 0.08238527178764343, + "step": 549 + }, + { + "epoch": 0.14, + "grad_norm": 4.096730709075928, + "learning_rate": 9.79382576883554e-06, + "logits/chosen": -0.08648725599050522, + "logits/rejected": -0.15783175826072693, + "logps/chosen": -56.542781829833984, + "logps/rejected": -68.93109130859375, + "loss": 0.8623, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8269035816192627, + "rewards/margins": 2.1460251808166504, + "rewards/rejected": 0.6808779835700989, + "step": 550 + }, + { + "epoch": 0.14, + "grad_norm": 3.7686195373535156, + "learning_rate": 9.793080581850387e-06, + "logits/chosen": -0.08632495999336243, + "logits/rejected": -0.2466168999671936, + "logps/chosen": -58.72197723388672, + "logps/rejected": -59.325279235839844, + "loss": 0.866, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.902545213699341, + "rewards/margins": 2.4957661628723145, + "rewards/rejected": 0.4067786931991577, + "step": 551 + }, + { + "epoch": 0.14, + "grad_norm": 3.549330949783325, + "learning_rate": 9.792334079061192e-06, + "logits/chosen": 0.007630545645952225, + "logits/rejected": -0.21514232456684113, + "logps/chosen": -53.471160888671875, + "logps/rejected": -60.44715881347656, + "loss": 0.7353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6263620853424072, + "rewards/margins": 2.908747911453247, + "rewards/rejected": -0.28238582611083984, + "step": 552 + }, + { + "epoch": 0.14, + "grad_norm": 3.772648334503174, + "learning_rate": 9.79158626067288e-06, + "logits/chosen": 0.00355394184589386, + "logits/rejected": -0.12884116172790527, + "logps/chosen": -60.206722259521484, + "logps/rejected": -61.262916564941406, + "loss": 0.8624, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7283427715301514, + "rewards/margins": 2.411391019821167, + "rewards/rejected": 0.3169516324996948, + "step": 553 + }, + { + "epoch": 0.14, + "grad_norm": 3.2820892333984375, + "learning_rate": 9.790837126890744e-06, + "logits/chosen": 0.03989960253238678, + "logits/rejected": -0.07976630330085754, + "logps/chosen": -60.09294128417969, + "logps/rejected": -72.30180358886719, + "loss": 0.7455, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6538448333740234, + "rewards/margins": 2.6853556632995605, + "rewards/rejected": -0.031510643661022186, + "step": 554 + }, + { + "epoch": 0.14, + "grad_norm": 2.799619436264038, + "learning_rate": 9.79008667792044e-06, + "logits/chosen": -0.022855043411254883, + "logits/rejected": -0.16420575976371765, + "logps/chosen": -52.90702438354492, + "logps/rejected": -65.22110748291016, + "loss": 0.7426, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7083163261413574, + "rewards/margins": 2.7657227516174316, + "rewards/rejected": -0.057406358420848846, + "step": 555 + }, + { + "epoch": 0.14, + "grad_norm": 4.228635311126709, + "learning_rate": 9.789334913967982e-06, + "logits/chosen": -0.08930601924657822, + "logits/rejected": -0.1808561533689499, + "logps/chosen": -56.55084228515625, + "logps/rejected": -52.650325775146484, + "loss": 1.0531, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7868404388427734, + "rewards/margins": 1.9321990013122559, + "rewards/rejected": 0.8546415567398071, + "step": 556 + }, + { + "epoch": 0.14, + "grad_norm": 3.614957809448242, + "learning_rate": 9.788581835239743e-06, + "logits/chosen": 0.001227634958922863, + "logits/rejected": -0.15389038622379303, + "logps/chosen": -61.63865280151367, + "logps/rejected": -56.492889404296875, + "loss": 0.8812, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.11008358001709, + "rewards/margins": 2.488194465637207, + "rewards/rejected": 0.6218894124031067, + "step": 557 + }, + { + "epoch": 0.14, + "grad_norm": 4.144347190856934, + "learning_rate": 9.787827441942461e-06, + "logits/chosen": -0.06605175882577896, + "logits/rejected": -0.17010977864265442, + "logps/chosen": -53.28418731689453, + "logps/rejected": -72.64727783203125, + "loss": 0.8528, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9291868209838867, + "rewards/margins": 2.820697546005249, + "rewards/rejected": 0.10848955810070038, + "step": 558 + }, + { + "epoch": 0.14, + "grad_norm": 4.458292484283447, + "learning_rate": 9.787071734283235e-06, + "logits/chosen": -0.006733346730470657, + "logits/rejected": -0.07937590777873993, + "logps/chosen": -59.41773986816406, + "logps/rejected": -64.58670806884766, + "loss": 1.0146, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3729732036590576, + "rewards/margins": 1.7409467697143555, + "rewards/rejected": 0.6320263743400574, + "step": 559 + }, + { + "epoch": 0.14, + "grad_norm": 3.274956226348877, + "learning_rate": 9.786314712469519e-06, + "logits/chosen": -0.12469957023859024, + "logits/rejected": -0.22804009914398193, + "logps/chosen": -63.16524124145508, + "logps/rejected": -70.17799377441406, + "loss": 0.7594, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7039871215820312, + "rewards/margins": 2.9215893745422363, + "rewards/rejected": -0.21760183572769165, + "step": 560 + }, + { + "epoch": 0.14, + "grad_norm": 4.000869274139404, + "learning_rate": 9.785556376709133e-06, + "logits/chosen": -0.020054057240486145, + "logits/rejected": 0.02437000721693039, + "logps/chosen": -65.91282653808594, + "logps/rejected": -79.18841552734375, + "loss": 0.9363, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5928573608398438, + "rewards/margins": 1.9191514253616333, + "rewards/rejected": 0.6737062335014343, + "step": 561 + }, + { + "epoch": 0.14, + "grad_norm": 3.714879035949707, + "learning_rate": 9.78479672721026e-06, + "logits/chosen": -0.15598830580711365, + "logits/rejected": -0.2506888508796692, + "logps/chosen": -52.53428649902344, + "logps/rejected": -73.845458984375, + "loss": 0.8684, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7790019512176514, + "rewards/margins": 2.194359064102173, + "rewards/rejected": 0.5846429467201233, + "step": 562 + }, + { + "epoch": 0.14, + "grad_norm": 3.8514857292175293, + "learning_rate": 9.784035764181437e-06, + "logits/chosen": -0.10554885864257812, + "logits/rejected": -0.17383533716201782, + "logps/chosen": -58.657470703125, + "logps/rejected": -62.32862091064453, + "loss": 0.8972, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.976750135421753, + "rewards/margins": 2.2083513736724854, + "rewards/rejected": 0.7683988809585571, + "step": 563 + }, + { + "epoch": 0.14, + "grad_norm": 5.867668151855469, + "learning_rate": 9.783273487831564e-06, + "logits/chosen": -0.04798200726509094, + "logits/rejected": -0.18473020195960999, + "logps/chosen": -59.64876937866211, + "logps/rejected": -60.69575500488281, + "loss": 0.9611, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6256303787231445, + "rewards/margins": 2.605719804763794, + "rewards/rejected": 0.019910749047994614, + "step": 564 + }, + { + "epoch": 0.14, + "grad_norm": 4.272284030914307, + "learning_rate": 9.782509898369904e-06, + "logits/chosen": -0.009750964120030403, + "logits/rejected": -0.15834778547286987, + "logps/chosen": -67.06279754638672, + "logps/rejected": -56.40859603881836, + "loss": 0.9835, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.901735305786133, + "rewards/margins": 1.9364845752716064, + "rewards/rejected": 0.9652504920959473, + "step": 565 + }, + { + "epoch": 0.14, + "grad_norm": 4.780040740966797, + "learning_rate": 9.78174499600608e-06, + "logits/chosen": 0.02548295632004738, + "logits/rejected": -0.07392958551645279, + "logps/chosen": -67.33174133300781, + "logps/rejected": -62.60832977294922, + "loss": 1.0436, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7272558212280273, + "rewards/margins": 1.6415797472000122, + "rewards/rejected": 1.0856764316558838, + "step": 566 + }, + { + "epoch": 0.14, + "grad_norm": 3.5160093307495117, + "learning_rate": 9.78097878095007e-06, + "logits/chosen": -0.0652206540107727, + "logits/rejected": -0.25069043040275574, + "logps/chosen": -57.76650619506836, + "logps/rejected": -62.542110443115234, + "loss": 0.8297, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7385857105255127, + "rewards/margins": 2.485079288482666, + "rewards/rejected": 0.25350624322891235, + "step": 567 + }, + { + "epoch": 0.14, + "grad_norm": 3.163769006729126, + "learning_rate": 9.780211253412222e-06, + "logits/chosen": -0.05429909750819206, + "logits/rejected": -0.10408273339271545, + "logps/chosen": -55.06785202026367, + "logps/rejected": -78.68091583251953, + "loss": 0.766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1854538917541504, + "rewards/margins": 3.2524030208587646, + "rewards/rejected": -0.06694923341274261, + "step": 568 + }, + { + "epoch": 0.14, + "grad_norm": 3.2383499145507812, + "learning_rate": 9.779442413603233e-06, + "logits/chosen": 0.058036770671606064, + "logits/rejected": -0.10122086107730865, + "logps/chosen": -58.19833755493164, + "logps/rejected": -69.59786987304688, + "loss": 0.731, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9356815814971924, + "rewards/margins": 2.5155715942382812, + "rewards/rejected": 0.4201101064682007, + "step": 569 + }, + { + "epoch": 0.14, + "grad_norm": 3.8791725635528564, + "learning_rate": 9.778672261734172e-06, + "logits/chosen": -0.07040716707706451, + "logits/rejected": -0.24926117062568665, + "logps/chosen": -76.1830062866211, + "logps/rejected": -54.21876525878906, + "loss": 1.0382, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9003212451934814, + "rewards/margins": 2.5279884338378906, + "rewards/rejected": 0.37233278155326843, + "step": 570 + }, + { + "epoch": 0.14, + "grad_norm": 4.798001766204834, + "learning_rate": 9.77790079801646e-06, + "logits/chosen": -0.016169235110282898, + "logits/rejected": -0.10963049530982971, + "logps/chosen": -66.9277572631836, + "logps/rejected": -60.39448165893555, + "loss": 1.1223, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8079214096069336, + "rewards/margins": 1.563190221786499, + "rewards/rejected": 1.2447314262390137, + "step": 571 + }, + { + "epoch": 0.14, + "grad_norm": 3.013618230819702, + "learning_rate": 9.777128022661877e-06, + "logits/chosen": -0.06952086091041565, + "logits/rejected": -0.24690833687782288, + "logps/chosen": -61.58543395996094, + "logps/rejected": -60.22723388671875, + "loss": 0.8412, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8639042377471924, + "rewards/margins": 2.4591429233551025, + "rewards/rejected": 0.40476131439208984, + "step": 572 + }, + { + "epoch": 0.14, + "grad_norm": 4.966729164123535, + "learning_rate": 9.776353935882571e-06, + "logits/chosen": -0.0018759453669190407, + "logits/rejected": 0.019330395385622978, + "logps/chosen": -70.85139465332031, + "logps/rejected": -84.40101623535156, + "loss": 1.1668, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8048834800720215, + "rewards/margins": 1.517290472984314, + "rewards/rejected": 1.287593126296997, + "step": 573 + }, + { + "epoch": 0.14, + "grad_norm": 9.97231674194336, + "learning_rate": 9.775578537891044e-06, + "logits/chosen": -0.02689867839217186, + "logits/rejected": -0.14507201313972473, + "logps/chosen": -49.71651840209961, + "logps/rejected": -60.50465774536133, + "loss": 0.8739, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.516026020050049, + "rewards/margins": 2.6414875984191895, + "rewards/rejected": -0.12546119093894958, + "step": 574 + }, + { + "epoch": 0.14, + "grad_norm": 2.6908366680145264, + "learning_rate": 9.77480182890016e-06, + "logits/chosen": 0.06210434064269066, + "logits/rejected": -0.07563016563653946, + "logps/chosen": -55.726341247558594, + "logps/rejected": -67.70504760742188, + "loss": 0.7146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.897097587585449, + "rewards/margins": 2.7187373638153076, + "rewards/rejected": 0.17836029827594757, + "step": 575 + }, + { + "epoch": 0.14, + "grad_norm": 2.7824647426605225, + "learning_rate": 9.774023809123142e-06, + "logits/chosen": -0.028678521513938904, + "logits/rejected": -0.15707841515541077, + "logps/chosen": -62.67597961425781, + "logps/rejected": -65.64997863769531, + "loss": 0.8634, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7978005409240723, + "rewards/margins": 2.5731875896453857, + "rewards/rejected": 0.22461259365081787, + "step": 576 + }, + { + "epoch": 0.14, + "grad_norm": 2.867623805999756, + "learning_rate": 9.773244478773573e-06, + "logits/chosen": -0.06451337039470673, + "logits/rejected": -0.2166135609149933, + "logps/chosen": -57.57865905761719, + "logps/rejected": -54.8756217956543, + "loss": 0.7722, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.708522081375122, + "rewards/margins": 2.753100633621216, + "rewards/rejected": -0.04457825422286987, + "step": 577 + }, + { + "epoch": 0.14, + "grad_norm": 3.9790611267089844, + "learning_rate": 9.772463838065396e-06, + "logits/chosen": -0.03444361314177513, + "logits/rejected": -0.17672772705554962, + "logps/chosen": -70.38688659667969, + "logps/rejected": -56.88001251220703, + "loss": 0.9428, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.653218984603882, + "rewards/margins": 2.441932201385498, + "rewards/rejected": 0.2112865149974823, + "step": 578 + }, + { + "epoch": 0.14, + "grad_norm": 4.700227737426758, + "learning_rate": 9.771681887212914e-06, + "logits/chosen": -0.050520576536655426, + "logits/rejected": -0.08066150546073914, + "logps/chosen": -54.35589599609375, + "logps/rejected": -67.99919891357422, + "loss": 0.9548, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9309263229370117, + "rewards/margins": 1.764562726020813, + "rewards/rejected": 1.1663635969161987, + "step": 579 + }, + { + "epoch": 0.15, + "grad_norm": 3.233797073364258, + "learning_rate": 9.770898626430786e-06, + "logits/chosen": -0.046892404556274414, + "logits/rejected": -0.1539989858865738, + "logps/chosen": -54.814918518066406, + "logps/rejected": -61.37023162841797, + "loss": 0.7956, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.010110378265381, + "rewards/margins": 2.8588197231292725, + "rewards/rejected": 0.15129098296165466, + "step": 580 + }, + { + "epoch": 0.15, + "grad_norm": 3.6350066661834717, + "learning_rate": 9.77011405593404e-06, + "logits/chosen": -0.13604378700256348, + "logits/rejected": -0.22045789659023285, + "logps/chosen": -54.217315673828125, + "logps/rejected": -66.13836669921875, + "loss": 0.8809, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.986780881881714, + "rewards/margins": 2.1321306228637695, + "rewards/rejected": 0.8546503782272339, + "step": 581 + }, + { + "epoch": 0.15, + "grad_norm": 4.171502113342285, + "learning_rate": 9.769328175938055e-06, + "logits/chosen": -0.09435081481933594, + "logits/rejected": -0.15379062294960022, + "logps/chosen": -51.520164489746094, + "logps/rejected": -73.10713958740234, + "loss": 1.0075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.901003360748291, + "rewards/margins": 2.196108341217041, + "rewards/rejected": 0.7048949599266052, + "step": 582 + }, + { + "epoch": 0.15, + "grad_norm": 3.2961459159851074, + "learning_rate": 9.768540986658572e-06, + "logits/chosen": -0.00547918351367116, + "logits/rejected": -0.07979963719844818, + "logps/chosen": -60.77411651611328, + "logps/rejected": -64.89286041259766, + "loss": 0.8934, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7828636169433594, + "rewards/margins": 2.1307997703552246, + "rewards/rejected": 0.6520638465881348, + "step": 583 + }, + { + "epoch": 0.15, + "grad_norm": 3.508864641189575, + "learning_rate": 9.76775248831169e-06, + "logits/chosen": -0.13240016996860504, + "logits/rejected": -0.20859850943088531, + "logps/chosen": -49.90528106689453, + "logps/rejected": -67.74560546875, + "loss": 0.93, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.67329478263855, + "rewards/margins": 2.4216017723083496, + "rewards/rejected": 0.2516930103302002, + "step": 584 + }, + { + "epoch": 0.15, + "grad_norm": 3.1612513065338135, + "learning_rate": 9.766962681113871e-06, + "logits/chosen": -0.05865827947854996, + "logits/rejected": -0.1910131871700287, + "logps/chosen": -59.90278625488281, + "logps/rejected": -66.81571197509766, + "loss": 0.7981, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8392069339752197, + "rewards/margins": 2.5495240688323975, + "rewards/rejected": 0.28968289494514465, + "step": 585 + }, + { + "epoch": 0.15, + "grad_norm": 2.392113208770752, + "learning_rate": 9.766171565281932e-06, + "logits/chosen": -0.09620235115289688, + "logits/rejected": -0.29313597083091736, + "logps/chosen": -54.134033203125, + "logps/rejected": -48.50114059448242, + "loss": 0.74, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9235565662384033, + "rewards/margins": 3.0625181198120117, + "rewards/rejected": -0.1389612853527069, + "step": 586 + }, + { + "epoch": 0.15, + "grad_norm": 3.9428563117980957, + "learning_rate": 9.765379141033053e-06, + "logits/chosen": -0.0033218562602996826, + "logits/rejected": -0.08302581310272217, + "logps/chosen": -55.692352294921875, + "logps/rejected": -70.8180923461914, + "loss": 0.9068, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.62064528465271, + "rewards/margins": 2.457991600036621, + "rewards/rejected": 0.1626535952091217, + "step": 587 + }, + { + "epoch": 0.15, + "grad_norm": 3.316462278366089, + "learning_rate": 9.764585408584772e-06, + "logits/chosen": -0.072154700756073, + "logits/rejected": -0.21385978162288666, + "logps/chosen": -64.71235656738281, + "logps/rejected": -53.410343170166016, + "loss": 0.9344, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.017834186553955, + "rewards/margins": 3.364480972290039, + "rewards/rejected": -0.3466470241546631, + "step": 588 + }, + { + "epoch": 0.15, + "grad_norm": 5.292638778686523, + "learning_rate": 9.763790368154984e-06, + "logits/chosen": -0.015291927382349968, + "logits/rejected": -0.12210583686828613, + "logps/chosen": -82.31402587890625, + "logps/rejected": -68.34977722167969, + "loss": 1.0526, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6106035709381104, + "rewards/margins": 1.9432412385940552, + "rewards/rejected": 0.6673621535301208, + "step": 589 + }, + { + "epoch": 0.15, + "grad_norm": 3.659291982650757, + "learning_rate": 9.762994019961944e-06, + "logits/chosen": -0.07862928509712219, + "logits/rejected": -0.2006446123123169, + "logps/chosen": -50.506683349609375, + "logps/rejected": -56.37013244628906, + "loss": 0.8633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7764055728912354, + "rewards/margins": 2.41933012008667, + "rewards/rejected": 0.35707542300224304, + "step": 590 + }, + { + "epoch": 0.15, + "grad_norm": 4.075985908508301, + "learning_rate": 9.762196364224271e-06, + "logits/chosen": 0.032732877880334854, + "logits/rejected": -0.05264703929424286, + "logps/chosen": -71.58252716064453, + "logps/rejected": -70.30998229980469, + "loss": 1.0995, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.626638174057007, + "rewards/margins": 1.8594292402267456, + "rewards/rejected": 0.7672086954116821, + "step": 591 + }, + { + "epoch": 0.15, + "grad_norm": 4.150206565856934, + "learning_rate": 9.761397401160934e-06, + "logits/chosen": -0.1441025733947754, + "logits/rejected": -0.15548068284988403, + "logps/chosen": -49.24338912963867, + "logps/rejected": -69.96627807617188, + "loss": 0.9766, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.784424066543579, + "rewards/margins": 2.074667453765869, + "rewards/rejected": 0.7097563743591309, + "step": 592 + }, + { + "epoch": 0.15, + "grad_norm": 4.141622543334961, + "learning_rate": 9.760597130991268e-06, + "logits/chosen": -0.055757567286491394, + "logits/rejected": -0.18382351100444794, + "logps/chosen": -57.91875076293945, + "logps/rejected": -69.65569305419922, + "loss": 0.9894, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8525896072387695, + "rewards/margins": 2.1745104789733887, + "rewards/rejected": 0.6780787706375122, + "step": 593 + }, + { + "epoch": 0.15, + "grad_norm": 4.76568603515625, + "learning_rate": 9.759795553934964e-06, + "logits/chosen": -0.09445703029632568, + "logits/rejected": -0.1488478034734726, + "logps/chosen": -50.56122970581055, + "logps/rejected": -68.20697021484375, + "loss": 0.9382, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.581730842590332, + "rewards/margins": 2.167363405227661, + "rewards/rejected": 0.41436767578125, + "step": 594 + }, + { + "epoch": 0.15, + "grad_norm": 3.3849194049835205, + "learning_rate": 9.75899267021207e-06, + "logits/chosen": 0.028297018259763718, + "logits/rejected": -0.1269189864397049, + "logps/chosen": -59.63992691040039, + "logps/rejected": -66.1673812866211, + "loss": 0.7791, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7511911392211914, + "rewards/margins": 2.7367310523986816, + "rewards/rejected": 0.014460146427154541, + "step": 595 + }, + { + "epoch": 0.15, + "grad_norm": 4.007269382476807, + "learning_rate": 9.758188480043e-06, + "logits/chosen": 0.04349014163017273, + "logits/rejected": -0.10572937875986099, + "logps/chosen": -71.10121154785156, + "logps/rejected": -62.41469192504883, + "loss": 0.86, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.111654043197632, + "rewards/margins": 2.8664767742156982, + "rewards/rejected": 0.24517729878425598, + "step": 596 + }, + { + "epoch": 0.15, + "grad_norm": 4.380400657653809, + "learning_rate": 9.757382983648518e-06, + "logits/chosen": -0.04104669392108917, + "logits/rejected": -0.12074495851993561, + "logps/chosen": -66.4924545288086, + "logps/rejected": -73.42668151855469, + "loss": 0.915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.807899236679077, + "rewards/margins": 2.539686679840088, + "rewards/rejected": 0.26821255683898926, + "step": 597 + }, + { + "epoch": 0.15, + "grad_norm": 3.933013439178467, + "learning_rate": 9.75657618124975e-06, + "logits/chosen": -0.06625314056873322, + "logits/rejected": -0.23158083856105804, + "logps/chosen": -61.04374694824219, + "logps/rejected": -58.40450668334961, + "loss": 0.9823, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5230636596679688, + "rewards/margins": 2.2584352493286133, + "rewards/rejected": 0.2646285891532898, + "step": 598 + }, + { + "epoch": 0.15, + "grad_norm": 3.7824409008026123, + "learning_rate": 9.75576807306818e-06, + "logits/chosen": -0.047850415110588074, + "logits/rejected": -0.11792539060115814, + "logps/chosen": -57.21920394897461, + "logps/rejected": -70.29682159423828, + "loss": 0.8346, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.732259750366211, + "rewards/margins": 2.1536078453063965, + "rewards/rejected": 0.5786520838737488, + "step": 599 + }, + { + "epoch": 0.15, + "grad_norm": 1.9852197170257568, + "learning_rate": 9.754958659325656e-06, + "logits/chosen": -0.0062232185155153275, + "logits/rejected": -0.2172529399394989, + "logps/chosen": -53.641761779785156, + "logps/rejected": -48.59857177734375, + "loss": 0.5704, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.821112632751465, + "rewards/margins": 3.461282730102539, + "rewards/rejected": -0.640170693397522, + "step": 600 + }, + { + "epoch": 0.15, + "grad_norm": 2.732022523880005, + "learning_rate": 9.754147940244375e-06, + "logits/chosen": -0.011660244315862656, + "logits/rejected": -0.12860262393951416, + "logps/chosen": -53.37161636352539, + "logps/rejected": -70.54045104980469, + "loss": 0.7599, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8222267627716064, + "rewards/margins": 2.991462230682373, + "rewards/rejected": -0.16923579573631287, + "step": 601 + }, + { + "epoch": 0.15, + "grad_norm": 2.044414520263672, + "learning_rate": 9.753335916046897e-06, + "logits/chosen": 0.02539709024131298, + "logits/rejected": -0.08573082089424133, + "logps/chosen": -60.90917205810547, + "logps/rejected": -70.39403533935547, + "loss": 0.709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8477566242218018, + "rewards/margins": 3.0705442428588867, + "rewards/rejected": -0.2227877676486969, + "step": 602 + }, + { + "epoch": 0.15, + "grad_norm": 3.419546604156494, + "learning_rate": 9.752522586956142e-06, + "logits/chosen": -0.0990016758441925, + "logits/rejected": -0.12539352476596832, + "logps/chosen": -57.96206283569336, + "logps/rejected": -66.54833984375, + "loss": 0.9338, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6960244178771973, + "rewards/margins": 2.150775909423828, + "rewards/rejected": 0.5452483296394348, + "step": 603 + }, + { + "epoch": 0.15, + "grad_norm": 5.244290351867676, + "learning_rate": 9.751707953195386e-06, + "logits/chosen": -0.11453818529844284, + "logits/rejected": -0.23711030185222626, + "logps/chosen": -61.04693603515625, + "logps/rejected": -63.40776443481445, + "loss": 1.0838, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5850324630737305, + "rewards/margins": 1.9791604280471802, + "rewards/rejected": 0.6058722734451294, + "step": 604 + }, + { + "epoch": 0.15, + "grad_norm": 6.425807476043701, + "learning_rate": 9.750892014988261e-06, + "logits/chosen": -0.08646991848945618, + "logits/rejected": -0.14114882051944733, + "logps/chosen": -53.29065704345703, + "logps/rejected": -62.668392181396484, + "loss": 1.0886, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.758516788482666, + "rewards/margins": 1.9632855653762817, + "rewards/rejected": 0.7952312231063843, + "step": 605 + }, + { + "epoch": 0.15, + "grad_norm": 3.954298973083496, + "learning_rate": 9.750074772558764e-06, + "logits/chosen": -0.06605826318264008, + "logits/rejected": -0.19407948851585388, + "logps/chosen": -56.55421447753906, + "logps/rejected": -72.84636688232422, + "loss": 0.8295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5139777660369873, + "rewards/margins": 2.688673973083496, + "rewards/rejected": -0.17469605803489685, + "step": 606 + }, + { + "epoch": 0.15, + "grad_norm": 4.522244453430176, + "learning_rate": 9.749256226131242e-06, + "logits/chosen": -0.04197634011507034, + "logits/rejected": -0.09043312072753906, + "logps/chosen": -62.21190643310547, + "logps/rejected": -81.57711029052734, + "loss": 0.9629, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.773500442504883, + "rewards/margins": 1.8695582151412964, + "rewards/rejected": 0.903942346572876, + "step": 607 + }, + { + "epoch": 0.15, + "grad_norm": 2.3756601810455322, + "learning_rate": 9.748436375930406e-06, + "logits/chosen": 0.022828057408332825, + "logits/rejected": -0.05447981879115105, + "logps/chosen": -52.28987503051758, + "logps/rejected": -75.03765106201172, + "loss": 0.6782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.073739767074585, + "rewards/margins": 3.127335548400879, + "rewards/rejected": -0.05359587073326111, + "step": 608 + }, + { + "epoch": 0.15, + "grad_norm": 3.1994051933288574, + "learning_rate": 9.74761522218132e-06, + "logits/chosen": 0.010693712159991264, + "logits/rejected": -0.0855504497885704, + "logps/chosen": -51.398067474365234, + "logps/rejected": -58.39914321899414, + "loss": 0.8378, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.800062656402588, + "rewards/margins": 2.84243106842041, + "rewards/rejected": -0.04236862063407898, + "step": 609 + }, + { + "epoch": 0.15, + "grad_norm": 3.1499547958374023, + "learning_rate": 9.746792765109412e-06, + "logits/chosen": -0.11137550324201584, + "logits/rejected": -0.13716928660869598, + "logps/chosen": -53.02164840698242, + "logps/rejected": -73.88398742675781, + "loss": 0.8398, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9990131855010986, + "rewards/margins": 2.6748902797698975, + "rewards/rejected": 0.324122816324234, + "step": 610 + }, + { + "epoch": 0.15, + "grad_norm": 4.687774658203125, + "learning_rate": 9.745969004940462e-06, + "logits/chosen": -0.07979320734739304, + "logits/rejected": -0.2618739604949951, + "logps/chosen": -56.61083221435547, + "logps/rejected": -62.39603805541992, + "loss": 0.8523, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.301598072052002, + "rewards/margins": 2.3176679611206055, + "rewards/rejected": -0.0160701721906662, + "step": 611 + }, + { + "epoch": 0.15, + "grad_norm": 3.909226179122925, + "learning_rate": 9.745143941900607e-06, + "logits/chosen": -0.08001338690519333, + "logits/rejected": -0.21848337352275848, + "logps/chosen": -54.13965606689453, + "logps/rejected": -58.28851318359375, + "loss": 1.0573, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6840033531188965, + "rewards/margins": 2.154421329498291, + "rewards/rejected": 0.5295818448066711, + "step": 612 + }, + { + "epoch": 0.15, + "grad_norm": 3.3231256008148193, + "learning_rate": 9.744317576216352e-06, + "logits/chosen": -0.15358877182006836, + "logits/rejected": -0.24908046424388885, + "logps/chosen": -46.68828582763672, + "logps/rejected": -61.7827033996582, + "loss": 0.7888, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9114081859588623, + "rewards/margins": 2.754495620727539, + "rewards/rejected": 0.1569129079580307, + "step": 613 + }, + { + "epoch": 0.15, + "grad_norm": 3.7312822341918945, + "learning_rate": 9.743489908114547e-06, + "logits/chosen": -0.09218679368495941, + "logits/rejected": -0.15053382515907288, + "logps/chosen": -63.712982177734375, + "logps/rejected": -62.419437408447266, + "loss": 0.9925, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8963634967803955, + "rewards/margins": 2.300642490386963, + "rewards/rejected": 0.5957209467887878, + "step": 614 + }, + { + "epoch": 0.15, + "grad_norm": 2.611361026763916, + "learning_rate": 9.742660937822405e-06, + "logits/chosen": -0.0613497830927372, + "logits/rejected": -0.19703727960586548, + "logps/chosen": -49.87538146972656, + "logps/rejected": -65.31887817382812, + "loss": 0.654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.798121213912964, + "rewards/margins": 3.496103286743164, + "rewards/rejected": -0.6979820132255554, + "step": 615 + }, + { + "epoch": 0.15, + "grad_norm": 3.198711633682251, + "learning_rate": 9.741830665567498e-06, + "logits/chosen": -0.0014467276632785797, + "logits/rejected": -0.18701577186584473, + "logps/chosen": -55.300819396972656, + "logps/rejected": -59.94697570800781, + "loss": 0.7988, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5677871704101562, + "rewards/margins": 2.867931842803955, + "rewards/rejected": -0.30014485120773315, + "step": 616 + }, + { + "epoch": 0.15, + "grad_norm": 4.4111762046813965, + "learning_rate": 9.74099909157775e-06, + "logits/chosen": -0.1048521175980568, + "logits/rejected": -0.17159909009933472, + "logps/chosen": -59.30287170410156, + "logps/rejected": -65.52790832519531, + "loss": 1.0169, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.745737075805664, + "rewards/margins": 2.2237257957458496, + "rewards/rejected": 0.5220112800598145, + "step": 617 + }, + { + "epoch": 0.15, + "grad_norm": 3.917234420776367, + "learning_rate": 9.740166216081451e-06, + "logits/chosen": -0.09821543097496033, + "logits/rejected": -0.20024889707565308, + "logps/chosen": -61.9556999206543, + "logps/rejected": -63.13687515258789, + "loss": 0.9149, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3982436656951904, + "rewards/margins": 2.7811479568481445, + "rewards/rejected": -0.3829042911529541, + "step": 618 + }, + { + "epoch": 0.15, + "grad_norm": 3.7579686641693115, + "learning_rate": 9.73933203930724e-06, + "logits/chosen": -0.13756535947322845, + "logits/rejected": -0.15702711045742035, + "logps/chosen": -54.83635330200195, + "logps/rejected": -70.26164245605469, + "loss": 0.8809, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.761549949645996, + "rewards/margins": 2.7708823680877686, + "rewards/rejected": -0.009332016110420227, + "step": 619 + }, + { + "epoch": 0.16, + "grad_norm": 3.7407264709472656, + "learning_rate": 9.738496561484118e-06, + "logits/chosen": -0.14077070355415344, + "logits/rejected": -0.24564114212989807, + "logps/chosen": -57.58009338378906, + "logps/rejected": -61.98149871826172, + "loss": 0.8762, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.888162612915039, + "rewards/margins": 2.5122883319854736, + "rewards/rejected": 0.3758745789527893, + "step": 620 + }, + { + "epoch": 0.16, + "grad_norm": 3.492469310760498, + "learning_rate": 9.737659782841442e-06, + "logits/chosen": -0.08854440599679947, + "logits/rejected": -0.2107028365135193, + "logps/chosen": -58.33110046386719, + "logps/rejected": -61.20471954345703, + "loss": 0.8652, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0106894969940186, + "rewards/margins": 2.4557957649230957, + "rewards/rejected": 0.5548936724662781, + "step": 621 + }, + { + "epoch": 0.16, + "grad_norm": 3.676929473876953, + "learning_rate": 9.736821703608925e-06, + "logits/chosen": -0.06970326602458954, + "logits/rejected": -0.15057343244552612, + "logps/chosen": -50.26844787597656, + "logps/rejected": -66.79492950439453, + "loss": 0.9681, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6295742988586426, + "rewards/margins": 2.691652536392212, + "rewards/rejected": -0.062078312039375305, + "step": 622 + }, + { + "epoch": 0.16, + "grad_norm": 4.665079593658447, + "learning_rate": 9.735982324016637e-06, + "logits/chosen": -0.03700428456068039, + "logits/rejected": -0.17931987345218658, + "logps/chosen": -57.534461975097656, + "logps/rejected": -58.67657470703125, + "loss": 0.9824, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6092729568481445, + "rewards/margins": 2.5104901790618896, + "rewards/rejected": 0.0987827330827713, + "step": 623 + }, + { + "epoch": 0.16, + "grad_norm": 3.7188804149627686, + "learning_rate": 9.735141644295006e-06, + "logits/chosen": -0.0056097302585840225, + "logits/rejected": -0.09042617678642273, + "logps/chosen": -60.39185333251953, + "logps/rejected": -81.87060546875, + "loss": 0.9307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6797237396240234, + "rewards/margins": 2.42710542678833, + "rewards/rejected": 0.2526184022426605, + "step": 624 + }, + { + "epoch": 0.16, + "grad_norm": 4.609152793884277, + "learning_rate": 9.734299664674817e-06, + "logits/chosen": -0.15469321608543396, + "logits/rejected": -0.10983828455209732, + "logps/chosen": -40.389671325683594, + "logps/rejected": -71.34174346923828, + "loss": 0.9294, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9289326667785645, + "rewards/margins": 2.2874064445495605, + "rewards/rejected": 0.6415261030197144, + "step": 625 + }, + { + "epoch": 0.16, + "grad_norm": 3.8689780235290527, + "learning_rate": 9.733456385387215e-06, + "logits/chosen": -0.031778737902641296, + "logits/rejected": -0.18679983913898468, + "logps/chosen": -56.780540466308594, + "logps/rejected": -59.34837341308594, + "loss": 1.107, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.609313488006592, + "rewards/margins": 2.1827170848846436, + "rewards/rejected": 0.42659634351730347, + "step": 626 + }, + { + "epoch": 0.16, + "grad_norm": 3.4049036502838135, + "learning_rate": 9.732611806663691e-06, + "logits/chosen": 0.1443178802728653, + "logits/rejected": -0.19216519594192505, + "logps/chosen": -68.3654556274414, + "logps/rejected": -48.96138000488281, + "loss": 0.7233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6515309810638428, + "rewards/margins": 3.0248870849609375, + "rewards/rejected": -0.3733561038970947, + "step": 627 + }, + { + "epoch": 0.16, + "grad_norm": 3.737574338912964, + "learning_rate": 9.731765928736107e-06, + "logits/chosen": -0.040161244571208954, + "logits/rejected": -0.1265835165977478, + "logps/chosen": -52.05265426635742, + "logps/rejected": -63.48036193847656, + "loss": 0.8886, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7502663135528564, + "rewards/margins": 2.0393972396850586, + "rewards/rejected": 0.7108691334724426, + "step": 628 + }, + { + "epoch": 0.16, + "grad_norm": 3.3143832683563232, + "learning_rate": 9.730918751836674e-06, + "logits/chosen": -0.0033935140818357468, + "logits/rejected": -0.10246599465608597, + "logps/chosen": -61.71723175048828, + "logps/rejected": -71.48110961914062, + "loss": 0.8977, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6450419425964355, + "rewards/margins": 2.371466875076294, + "rewards/rejected": 0.2735753357410431, + "step": 629 + }, + { + "epoch": 0.16, + "grad_norm": 3.97685170173645, + "learning_rate": 9.730070276197955e-06, + "logits/chosen": 0.02970755100250244, + "logits/rejected": -0.11980383843183517, + "logps/chosen": -57.28415298461914, + "logps/rejected": -66.18751525878906, + "loss": 0.8034, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.832315683364868, + "rewards/margins": 2.3439974784851074, + "rewards/rejected": 0.4883183240890503, + "step": 630 + }, + { + "epoch": 0.16, + "grad_norm": 4.412923336029053, + "learning_rate": 9.729220502052879e-06, + "logits/chosen": -0.13576626777648926, + "logits/rejected": -0.22222335636615753, + "logps/chosen": -61.23014831542969, + "logps/rejected": -68.2659912109375, + "loss": 0.9865, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.634538173675537, + "rewards/margins": 2.490018129348755, + "rewards/rejected": 0.14451977610588074, + "step": 631 + }, + { + "epoch": 0.16, + "grad_norm": 3.6322953701019287, + "learning_rate": 9.728369429634728e-06, + "logits/chosen": 0.020120996981859207, + "logits/rejected": -0.17209576070308685, + "logps/chosen": -70.35232543945312, + "logps/rejected": -60.794883728027344, + "loss": 0.8238, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4566397666931152, + "rewards/margins": 2.509573459625244, + "rewards/rejected": -0.05293399840593338, + "step": 632 + }, + { + "epoch": 0.16, + "grad_norm": 2.64454984664917, + "learning_rate": 9.727517059177136e-06, + "logits/chosen": 0.019982565194368362, + "logits/rejected": -0.13663601875305176, + "logps/chosen": -47.9317626953125, + "logps/rejected": -64.16201782226562, + "loss": 0.6637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8244011402130127, + "rewards/margins": 3.3048653602600098, + "rewards/rejected": -0.48046404123306274, + "step": 633 + }, + { + "epoch": 0.16, + "grad_norm": 4.883293628692627, + "learning_rate": 9.726663390914101e-06, + "logits/chosen": -0.046084336936473846, + "logits/rejected": -0.12258598208427429, + "logps/chosen": -56.47083282470703, + "logps/rejected": -55.956703186035156, + "loss": 1.0205, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.604616165161133, + "rewards/margins": 2.052351474761963, + "rewards/rejected": 0.5522644519805908, + "step": 634 + }, + { + "epoch": 0.16, + "grad_norm": 3.658550977706909, + "learning_rate": 9.725808425079972e-06, + "logits/chosen": -0.09884350001811981, + "logits/rejected": -0.20610006153583527, + "logps/chosen": -51.57001495361328, + "logps/rejected": -56.61008834838867, + "loss": 0.8353, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.718172550201416, + "rewards/margins": 2.6915524005889893, + "rewards/rejected": 0.026619933545589447, + "step": 635 + }, + { + "epoch": 0.16, + "grad_norm": 5.371145248413086, + "learning_rate": 9.724952161909456e-06, + "logits/chosen": -0.08887338638305664, + "logits/rejected": -0.17066188156604767, + "logps/chosen": -50.03331756591797, + "logps/rejected": -63.116607666015625, + "loss": 0.9493, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5461928844451904, + "rewards/margins": 2.5398948192596436, + "rewards/rejected": 0.006298363208770752, + "step": 636 + }, + { + "epoch": 0.16, + "grad_norm": 4.272145748138428, + "learning_rate": 9.724094601637615e-06, + "logits/chosen": -0.04048721119761467, + "logits/rejected": -0.0891539603471756, + "logps/chosen": -51.70484161376953, + "logps/rejected": -77.4329605102539, + "loss": 0.878, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8818230628967285, + "rewards/margins": 2.564427375793457, + "rewards/rejected": 0.3173958957195282, + "step": 637 + }, + { + "epoch": 0.16, + "grad_norm": 4.45269250869751, + "learning_rate": 9.723235744499866e-06, + "logits/chosen": -0.14830449223518372, + "logits/rejected": -0.23623177409172058, + "logps/chosen": -57.77968215942383, + "logps/rejected": -58.75918960571289, + "loss": 0.8963, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.669696569442749, + "rewards/margins": 2.110679864883423, + "rewards/rejected": 0.5590167045593262, + "step": 638 + }, + { + "epoch": 0.16, + "grad_norm": 1.9697102308273315, + "learning_rate": 9.722375590731987e-06, + "logits/chosen": -0.029757507145404816, + "logits/rejected": -0.2445228397846222, + "logps/chosen": -61.925270080566406, + "logps/rejected": -53.166297912597656, + "loss": 0.7643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8337554931640625, + "rewards/margins": 3.690457820892334, + "rewards/rejected": -0.8567020893096924, + "step": 639 + }, + { + "epoch": 0.16, + "grad_norm": 3.278658390045166, + "learning_rate": 9.721514140570108e-06, + "logits/chosen": -0.006965414620935917, + "logits/rejected": -0.11842670291662216, + "logps/chosen": -61.88026428222656, + "logps/rejected": -58.83252716064453, + "loss": 0.8222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7123923301696777, + "rewards/margins": 2.2875192165374756, + "rewards/rejected": 0.4248727858066559, + "step": 640 + }, + { + "epoch": 0.16, + "grad_norm": 3.5856919288635254, + "learning_rate": 9.720651394250715e-06, + "logits/chosen": -0.03830355778336525, + "logits/rejected": -0.22113315761089325, + "logps/chosen": -64.32286071777344, + "logps/rejected": -59.549652099609375, + "loss": 0.8849, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.799598217010498, + "rewards/margins": 2.346384286880493, + "rewards/rejected": 0.4532138705253601, + "step": 641 + }, + { + "epoch": 0.16, + "grad_norm": 3.788241386413574, + "learning_rate": 9.71978735201065e-06, + "logits/chosen": -0.010716944932937622, + "logits/rejected": -0.14289350807666779, + "logps/chosen": -52.43391799926758, + "logps/rejected": -69.61023712158203, + "loss": 0.7289, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.00156307220459, + "rewards/margins": 2.857837200164795, + "rewards/rejected": 0.14372579753398895, + "step": 642 + }, + { + "epoch": 0.16, + "grad_norm": 4.228732585906982, + "learning_rate": 9.718922014087113e-06, + "logits/chosen": -0.027077414095401764, + "logits/rejected": -0.160626620054245, + "logps/chosen": -61.34032440185547, + "logps/rejected": -62.694942474365234, + "loss": 0.8845, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.813171148300171, + "rewards/margins": 2.5234439373016357, + "rewards/rejected": 0.2897275686264038, + "step": 643 + }, + { + "epoch": 0.16, + "grad_norm": 3.9341940879821777, + "learning_rate": 9.718055380717655e-06, + "logits/chosen": -0.06480620801448822, + "logits/rejected": -0.07818266749382019, + "logps/chosen": -44.415321350097656, + "logps/rejected": -64.61768341064453, + "loss": 0.7562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7484264373779297, + "rewards/margins": 2.638524293899536, + "rewards/rejected": 0.10990239679813385, + "step": 644 + }, + { + "epoch": 0.16, + "grad_norm": 3.7411136627197266, + "learning_rate": 9.717187452140189e-06, + "logits/chosen": -0.01640426367521286, + "logits/rejected": -0.09565483778715134, + "logps/chosen": -58.585018157958984, + "logps/rejected": -74.54413604736328, + "loss": 0.9017, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.845548152923584, + "rewards/margins": 1.9297478199005127, + "rewards/rejected": 0.9158004522323608, + "step": 645 + }, + { + "epoch": 0.16, + "grad_norm": 3.9313528537750244, + "learning_rate": 9.716318228592977e-06, + "logits/chosen": -0.0861997976899147, + "logits/rejected": -0.19792811572551727, + "logps/chosen": -59.08517074584961, + "logps/rejected": -59.55896759033203, + "loss": 1.0118, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.749263286590576, + "rewards/margins": 2.0981802940368652, + "rewards/rejected": 0.6510829329490662, + "step": 646 + }, + { + "epoch": 0.16, + "grad_norm": 3.134676694869995, + "learning_rate": 9.715447710314644e-06, + "logits/chosen": -0.012904413044452667, + "logits/rejected": -0.174949049949646, + "logps/chosen": -57.24489212036133, + "logps/rejected": -59.26593780517578, + "loss": 0.783, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7266740798950195, + "rewards/margins": 2.947275400161743, + "rewards/rejected": -0.2206018567085266, + "step": 647 + }, + { + "epoch": 0.16, + "grad_norm": 5.386720180511475, + "learning_rate": 9.714575897544161e-06, + "logits/chosen": -0.06276555359363556, + "logits/rejected": -0.2173597663640976, + "logps/chosen": -57.612762451171875, + "logps/rejected": -53.84769821166992, + "loss": 0.98, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6744484901428223, + "rewards/margins": 2.2303333282470703, + "rewards/rejected": 0.44411513209342957, + "step": 648 + }, + { + "epoch": 0.16, + "grad_norm": 3.9958529472351074, + "learning_rate": 9.713702790520863e-06, + "logits/chosen": -0.12255536019802094, + "logits/rejected": -0.18283528089523315, + "logps/chosen": -55.652793884277344, + "logps/rejected": -66.757568359375, + "loss": 0.901, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8627026081085205, + "rewards/margins": 2.359602689743042, + "rewards/rejected": 0.5030999779701233, + "step": 649 + }, + { + "epoch": 0.16, + "grad_norm": 4.0031657218933105, + "learning_rate": 9.712828389484432e-06, + "logits/chosen": -0.08897458016872406, + "logits/rejected": -0.23449449241161346, + "logps/chosen": -54.746070861816406, + "logps/rejected": -45.009239196777344, + "loss": 0.8916, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.868558883666992, + "rewards/margins": 2.6854352951049805, + "rewards/rejected": 0.18312373757362366, + "step": 650 + }, + { + "epoch": 0.16, + "grad_norm": 4.45773458480835, + "learning_rate": 9.711952694674917e-06, + "logits/chosen": -0.006741403602063656, + "logits/rejected": -0.06264208257198334, + "logps/chosen": -55.882041931152344, + "logps/rejected": -73.17974853515625, + "loss": 0.9442, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6612374782562256, + "rewards/margins": 2.311858654022217, + "rewards/rejected": 0.3493790328502655, + "step": 651 + }, + { + "epoch": 0.16, + "grad_norm": 3.9297220706939697, + "learning_rate": 9.71107570633271e-06, + "logits/chosen": -0.06167980283498764, + "logits/rejected": -0.2026514708995819, + "logps/chosen": -55.994503021240234, + "logps/rejected": -57.311580657958984, + "loss": 0.9409, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6129255294799805, + "rewards/margins": 2.2705559730529785, + "rewards/rejected": 0.34236934781074524, + "step": 652 + }, + { + "epoch": 0.16, + "grad_norm": 5.507659435272217, + "learning_rate": 9.710197424698565e-06, + "logits/chosen": -0.13668102025985718, + "logits/rejected": -0.2311299443244934, + "logps/chosen": -49.123382568359375, + "logps/rejected": -80.60797882080078, + "loss": 0.9099, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8001692295074463, + "rewards/margins": 2.8169074058532715, + "rewards/rejected": -0.016738444566726685, + "step": 653 + }, + { + "epoch": 0.16, + "grad_norm": 3.4277052879333496, + "learning_rate": 9.709317850013587e-06, + "logits/chosen": -0.08315584063529968, + "logits/rejected": -0.2213190793991089, + "logps/chosen": -52.10906219482422, + "logps/rejected": -57.947696685791016, + "loss": 0.8173, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.755577802658081, + "rewards/margins": 3.002607583999634, + "rewards/rejected": -0.24702981114387512, + "step": 654 + }, + { + "epoch": 0.16, + "grad_norm": 3.455937385559082, + "learning_rate": 9.708436982519242e-06, + "logits/chosen": -0.003837066702544689, + "logits/rejected": -0.12043721973896027, + "logps/chosen": -67.05364227294922, + "logps/rejected": -72.24205017089844, + "loss": 0.8791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.703002691268921, + "rewards/margins": 2.398696184158325, + "rewards/rejected": 0.30430641770362854, + "step": 655 + }, + { + "epoch": 0.16, + "grad_norm": 3.9972519874572754, + "learning_rate": 9.707554822457346e-06, + "logits/chosen": -0.16718599200248718, + "logits/rejected": -0.2455645352602005, + "logps/chosen": -49.08679962158203, + "logps/rejected": -56.92885971069336, + "loss": 0.9054, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.596400260925293, + "rewards/margins": 2.453360080718994, + "rewards/rejected": 0.14304038882255554, + "step": 656 + }, + { + "epoch": 0.16, + "grad_norm": 3.672400712966919, + "learning_rate": 9.706671370070069e-06, + "logits/chosen": 0.031000249087810516, + "logits/rejected": -0.011159852147102356, + "logps/chosen": -60.35926818847656, + "logps/rejected": -70.7281494140625, + "loss": 0.8581, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.857008457183838, + "rewards/margins": 2.527554988861084, + "rewards/rejected": 0.32945331931114197, + "step": 657 + }, + { + "epoch": 0.16, + "grad_norm": 3.6657612323760986, + "learning_rate": 9.705786625599939e-06, + "logits/chosen": -0.08647305518388748, + "logits/rejected": -0.2751651406288147, + "logps/chosen": -61.27818298339844, + "logps/rejected": -56.378116607666016, + "loss": 0.9158, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8170089721679688, + "rewards/margins": 2.5123448371887207, + "rewards/rejected": 0.3046647310256958, + "step": 658 + }, + { + "epoch": 0.16, + "grad_norm": 3.149782180786133, + "learning_rate": 9.704900589289836e-06, + "logits/chosen": -0.0912061482667923, + "logits/rejected": -0.20590630173683167, + "logps/chosen": -53.978580474853516, + "logps/rejected": -61.219482421875, + "loss": 0.8243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7691526412963867, + "rewards/margins": 2.646362543106079, + "rewards/rejected": 0.12279009819030762, + "step": 659 + }, + { + "epoch": 0.17, + "grad_norm": 3.2992849349975586, + "learning_rate": 9.704013261382998e-06, + "logits/chosen": -0.12389451265335083, + "logits/rejected": -0.2939593493938446, + "logps/chosen": -50.17494583129883, + "logps/rejected": -68.52461242675781, + "loss": 0.7295, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0316357612609863, + "rewards/margins": 3.639216899871826, + "rewards/rejected": -0.607580840587616, + "step": 660 + }, + { + "epoch": 0.17, + "grad_norm": 4.075921535491943, + "learning_rate": 9.703124642123017e-06, + "logits/chosen": -0.0938420295715332, + "logits/rejected": -0.11557762324810028, + "logps/chosen": -61.55503845214844, + "logps/rejected": -85.62696075439453, + "loss": 0.8739, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.932704448699951, + "rewards/margins": 2.972782611846924, + "rewards/rejected": -0.04007836803793907, + "step": 661 + }, + { + "epoch": 0.17, + "grad_norm": 4.176951885223389, + "learning_rate": 9.702234731753836e-06, + "logits/chosen": -0.024426443502306938, + "logits/rejected": -0.16982951760292053, + "logps/chosen": -63.20216369628906, + "logps/rejected": -58.40345764160156, + "loss": 0.8947, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.905961513519287, + "rewards/margins": 3.050377368927002, + "rewards/rejected": -0.14441552758216858, + "step": 662 + }, + { + "epoch": 0.17, + "grad_norm": 4.699306488037109, + "learning_rate": 9.701343530519753e-06, + "logits/chosen": -0.08882886916399002, + "logits/rejected": -0.17033198475837708, + "logps/chosen": -47.27519607543945, + "logps/rejected": -66.52740478515625, + "loss": 0.9938, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9084227085113525, + "rewards/margins": 2.0680150985717773, + "rewards/rejected": 0.8404079079627991, + "step": 663 + }, + { + "epoch": 0.17, + "grad_norm": 5.208045959472656, + "learning_rate": 9.700451038665427e-06, + "logits/chosen": -0.10419587790966034, + "logits/rejected": -0.2164801061153412, + "logps/chosen": -53.03565979003906, + "logps/rejected": -58.36995315551758, + "loss": 0.8345, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.942931890487671, + "rewards/margins": 3.131230115890503, + "rewards/rejected": -0.18829838931560516, + "step": 664 + }, + { + "epoch": 0.17, + "grad_norm": 4.775291919708252, + "learning_rate": 9.69955725643586e-06, + "logits/chosen": -0.05888042598962784, + "logits/rejected": -0.17505206167697906, + "logps/chosen": -50.18716049194336, + "logps/rejected": -55.8002815246582, + "loss": 1.0351, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.549710988998413, + "rewards/margins": 2.010756492614746, + "rewards/rejected": 0.5389546155929565, + "step": 665 + }, + { + "epoch": 0.17, + "grad_norm": 4.615143299102783, + "learning_rate": 9.69866218407642e-06, + "logits/chosen": -0.05220638960599899, + "logits/rejected": -0.16458797454833984, + "logps/chosen": -45.196537017822266, + "logps/rejected": -58.55729675292969, + "loss": 0.7919, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9871833324432373, + "rewards/margins": 2.7713379859924316, + "rewards/rejected": 0.2158452868461609, + "step": 666 + }, + { + "epoch": 0.17, + "grad_norm": 3.934955358505249, + "learning_rate": 9.697765821832819e-06, + "logits/chosen": -0.11471810936927795, + "logits/rejected": -0.20850490033626556, + "logps/chosen": -75.80398559570312, + "logps/rejected": -63.190223693847656, + "loss": 0.8309, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9266302585601807, + "rewards/margins": 2.578303098678589, + "rewards/rejected": 0.34832748770713806, + "step": 667 + }, + { + "epoch": 0.17, + "grad_norm": 6.330931663513184, + "learning_rate": 9.696868169951133e-06, + "logits/chosen": -0.12428722530603409, + "logits/rejected": -0.19779464602470398, + "logps/chosen": -57.295223236083984, + "logps/rejected": -80.11137390136719, + "loss": 0.8657, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.678645133972168, + "rewards/margins": 2.64953875541687, + "rewards/rejected": 0.029106199741363525, + "step": 668 + }, + { + "epoch": 0.17, + "grad_norm": 2.8444669246673584, + "learning_rate": 9.695969228677781e-06, + "logits/chosen": -0.06610718369483948, + "logits/rejected": -0.2572368383407593, + "logps/chosen": -59.490089416503906, + "logps/rejected": -53.33799362182617, + "loss": 0.8573, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7819066047668457, + "rewards/margins": 2.5949759483337402, + "rewards/rejected": 0.1869305521249771, + "step": 669 + }, + { + "epoch": 0.17, + "grad_norm": 3.230544328689575, + "learning_rate": 9.695068998259547e-06, + "logits/chosen": -0.11743825674057007, + "logits/rejected": -0.21802634000778198, + "logps/chosen": -62.06694793701172, + "logps/rejected": -67.77992248535156, + "loss": 0.8721, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.944288969039917, + "rewards/margins": 2.699702739715576, + "rewards/rejected": 0.24458572268486023, + "step": 670 + }, + { + "epoch": 0.17, + "grad_norm": 4.5450520515441895, + "learning_rate": 9.69416747894356e-06, + "logits/chosen": -0.03451481834053993, + "logits/rejected": -0.2136944681406021, + "logps/chosen": -69.544677734375, + "logps/rejected": -67.41741943359375, + "loss": 0.9176, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.557955741882324, + "rewards/margins": 2.7925801277160645, + "rewards/rejected": -0.23462432622909546, + "step": 671 + }, + { + "epoch": 0.17, + "grad_norm": 3.6176469326019287, + "learning_rate": 9.693264670977307e-06, + "logits/chosen": 0.013042853213846684, + "logits/rejected": -0.07852119207382202, + "logps/chosen": -53.166805267333984, + "logps/rejected": -60.61278533935547, + "loss": 0.7018, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7873315811157227, + "rewards/margins": 2.8128347396850586, + "rewards/rejected": -0.025503143668174744, + "step": 672 + }, + { + "epoch": 0.17, + "grad_norm": 3.1394712924957275, + "learning_rate": 9.692360574608631e-06, + "logits/chosen": -0.07741624861955643, + "logits/rejected": -0.16971167922019958, + "logps/chosen": -50.48284149169922, + "logps/rejected": -62.192527770996094, + "loss": 0.8359, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.809614419937134, + "rewards/margins": 2.776029109954834, + "rewards/rejected": 0.033585697412490845, + "step": 673 + }, + { + "epoch": 0.17, + "grad_norm": 3.8860812187194824, + "learning_rate": 9.691455190085724e-06, + "logits/chosen": -0.1499766856431961, + "logits/rejected": -0.250085711479187, + "logps/chosen": -57.88435745239258, + "logps/rejected": -64.9173583984375, + "loss": 0.8581, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3813390731811523, + "rewards/margins": 2.5723907947540283, + "rewards/rejected": -0.19105151295661926, + "step": 674 + }, + { + "epoch": 0.17, + "grad_norm": 4.269176483154297, + "learning_rate": 9.690548517657133e-06, + "logits/chosen": -0.03703727573156357, + "logits/rejected": -0.1945071965456009, + "logps/chosen": -54.87574005126953, + "logps/rejected": -59.71285629272461, + "loss": 0.8411, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.003774881362915, + "rewards/margins": 3.2748377323150635, + "rewards/rejected": -0.27106285095214844, + "step": 675 + }, + { + "epoch": 0.17, + "grad_norm": 6.517378807067871, + "learning_rate": 9.68964055757176e-06, + "logits/chosen": -0.01693626493215561, + "logits/rejected": -0.24449624121189117, + "logps/chosen": -59.93680953979492, + "logps/rejected": -55.98500061035156, + "loss": 0.8623, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7085673809051514, + "rewards/margins": 2.725513219833374, + "rewards/rejected": -0.016945943236351013, + "step": 676 + }, + { + "epoch": 0.17, + "grad_norm": 4.5227742195129395, + "learning_rate": 9.68873131007886e-06, + "logits/chosen": -0.156079962849617, + "logits/rejected": -0.28458890318870544, + "logps/chosen": -48.72652816772461, + "logps/rejected": -56.419189453125, + "loss": 0.8395, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.92758846282959, + "rewards/margins": 2.470027208328247, + "rewards/rejected": 0.4575616121292114, + "step": 677 + }, + { + "epoch": 0.17, + "grad_norm": 3.7076354026794434, + "learning_rate": 9.68782077542804e-06, + "logits/chosen": -0.06214899569749832, + "logits/rejected": -0.24718452990055084, + "logps/chosen": -53.426395416259766, + "logps/rejected": -48.90040588378906, + "loss": 0.8993, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5907676219940186, + "rewards/margins": 2.2411866188049316, + "rewards/rejected": 0.3495807945728302, + "step": 678 + }, + { + "epoch": 0.17, + "grad_norm": 4.478017330169678, + "learning_rate": 9.686908953869263e-06, + "logits/chosen": -0.09839476644992828, + "logits/rejected": -0.19316589832305908, + "logps/chosen": -56.26924514770508, + "logps/rejected": -55.408660888671875, + "loss": 1.0102, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.817039966583252, + "rewards/margins": 2.3370916843414307, + "rewards/rejected": 0.47994816303253174, + "step": 679 + }, + { + "epoch": 0.17, + "grad_norm": 3.989649772644043, + "learning_rate": 9.68599584565284e-06, + "logits/chosen": -0.02441955730319023, + "logits/rejected": -0.15250328183174133, + "logps/chosen": -62.839805603027344, + "logps/rejected": -71.71188354492188, + "loss": 0.8822, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.825171947479248, + "rewards/margins": 2.430466890335083, + "rewards/rejected": 0.39470526576042175, + "step": 680 + }, + { + "epoch": 0.17, + "grad_norm": 3.864330768585205, + "learning_rate": 9.685081451029445e-06, + "logits/chosen": -0.05719548463821411, + "logits/rejected": -0.22328636050224304, + "logps/chosen": -57.67586135864258, + "logps/rejected": -58.75616455078125, + "loss": 0.8191, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.723842144012451, + "rewards/margins": 2.8132498264312744, + "rewards/rejected": -0.08940790593624115, + "step": 681 + }, + { + "epoch": 0.17, + "grad_norm": 4.488925457000732, + "learning_rate": 9.684165770250094e-06, + "logits/chosen": -0.04478825628757477, + "logits/rejected": -0.15699848532676697, + "logps/chosen": -59.16167449951172, + "logps/rejected": -71.69654083251953, + "loss": 0.8782, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7286136150360107, + "rewards/margins": 2.510084629058838, + "rewards/rejected": 0.2185286283493042, + "step": 682 + }, + { + "epoch": 0.17, + "grad_norm": 4.634200572967529, + "learning_rate": 9.683248803566163e-06, + "logits/chosen": -0.004464304074645042, + "logits/rejected": -0.15456503629684448, + "logps/chosen": -60.026885986328125, + "logps/rejected": -55.801570892333984, + "loss": 0.9679, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.612250328063965, + "rewards/margins": 2.5413756370544434, + "rewards/rejected": 0.07087482511997223, + "step": 683 + }, + { + "epoch": 0.17, + "grad_norm": 2.9538915157318115, + "learning_rate": 9.682330551229378e-06, + "logits/chosen": -0.05000879243016243, + "logits/rejected": -0.20306509733200073, + "logps/chosen": -69.15486907958984, + "logps/rejected": -67.89201354980469, + "loss": 0.7406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9592928886413574, + "rewards/margins": 3.282891273498535, + "rewards/rejected": -0.32359781861305237, + "step": 684 + }, + { + "epoch": 0.17, + "grad_norm": 5.69992208480835, + "learning_rate": 9.681411013491819e-06, + "logits/chosen": -0.11907026916742325, + "logits/rejected": -0.24252840876579285, + "logps/chosen": -53.21369934082031, + "logps/rejected": -53.84257507324219, + "loss": 1.0841, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.546898365020752, + "rewards/margins": 1.8387398719787598, + "rewards/rejected": 0.708158552646637, + "step": 685 + }, + { + "epoch": 0.17, + "grad_norm": 3.4073433876037598, + "learning_rate": 9.680490190605923e-06, + "logits/chosen": -0.091157928109169, + "logits/rejected": -0.18291208148002625, + "logps/chosen": -54.5855712890625, + "logps/rejected": -60.96006393432617, + "loss": 0.8689, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9423696994781494, + "rewards/margins": 2.5578513145446777, + "rewards/rejected": 0.3845182955265045, + "step": 686 + }, + { + "epoch": 0.17, + "grad_norm": 3.682940721511841, + "learning_rate": 9.679568082824471e-06, + "logits/chosen": -0.08256099373102188, + "logits/rejected": -0.19463428854942322, + "logps/chosen": -57.8369255065918, + "logps/rejected": -70.70648193359375, + "loss": 0.8712, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.717383861541748, + "rewards/margins": 3.0184686183929443, + "rewards/rejected": -0.3010849356651306, + "step": 687 + }, + { + "epoch": 0.17, + "grad_norm": 3.52885103225708, + "learning_rate": 9.678644690400602e-06, + "logits/chosen": 0.036377787590026855, + "logits/rejected": -0.16950000822544098, + "logps/chosen": -63.423255920410156, + "logps/rejected": -51.27239990234375, + "loss": 0.802, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.863774299621582, + "rewards/margins": 3.1641273498535156, + "rewards/rejected": -0.30035319924354553, + "step": 688 + }, + { + "epoch": 0.17, + "grad_norm": 3.7182929515838623, + "learning_rate": 9.67772001358781e-06, + "logits/chosen": -0.017837556079030037, + "logits/rejected": -0.11424069851636887, + "logps/chosen": -58.5333251953125, + "logps/rejected": -69.04029846191406, + "loss": 0.7832, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9215786457061768, + "rewards/margins": 2.646846294403076, + "rewards/rejected": 0.27473270893096924, + "step": 689 + }, + { + "epoch": 0.17, + "grad_norm": 3.8314921855926514, + "learning_rate": 9.676794052639937e-06, + "logits/chosen": -0.1358029842376709, + "logits/rejected": -0.2146945744752884, + "logps/chosen": -53.95423126220703, + "logps/rejected": -58.13251876831055, + "loss": 0.9936, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.778870105743408, + "rewards/margins": 2.3287363052368164, + "rewards/rejected": 0.45013368129730225, + "step": 690 + }, + { + "epoch": 0.17, + "grad_norm": 4.2458977699279785, + "learning_rate": 9.675866807811178e-06, + "logits/chosen": -0.08691704273223877, + "logits/rejected": -0.19795142114162445, + "logps/chosen": -63.840579986572266, + "logps/rejected": -57.43915939331055, + "loss": 1.0343, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.69862961769104, + "rewards/margins": 2.475816249847412, + "rewards/rejected": 0.22281354665756226, + "step": 691 + }, + { + "epoch": 0.17, + "grad_norm": 6.912355422973633, + "learning_rate": 9.674938279356086e-06, + "logits/chosen": -0.0722908154129982, + "logits/rejected": -0.17087572813034058, + "logps/chosen": -68.17604064941406, + "logps/rejected": -74.54021453857422, + "loss": 0.8625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.657212495803833, + "rewards/margins": 2.7375595569610596, + "rewards/rejected": -0.08034727722406387, + "step": 692 + }, + { + "epoch": 0.17, + "grad_norm": 4.2245192527771, + "learning_rate": 9.674008467529557e-06, + "logits/chosen": 0.07116776704788208, + "logits/rejected": -0.14389699697494507, + "logps/chosen": -66.73550415039062, + "logps/rejected": -64.68363952636719, + "loss": 0.7483, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9516983032226562, + "rewards/margins": 3.0330393314361572, + "rewards/rejected": -0.08134077489376068, + "step": 693 + }, + { + "epoch": 0.17, + "grad_norm": 4.2597832679748535, + "learning_rate": 9.673077372586845e-06, + "logits/chosen": 0.08223380893468857, + "logits/rejected": -0.14474500715732574, + "logps/chosen": -66.34217071533203, + "logps/rejected": -55.03703689575195, + "loss": 0.9568, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8309731483459473, + "rewards/margins": 2.48539137840271, + "rewards/rejected": 0.345581591129303, + "step": 694 + }, + { + "epoch": 0.17, + "grad_norm": 4.043236255645752, + "learning_rate": 9.67214499478356e-06, + "logits/chosen": -0.08771166950464249, + "logits/rejected": -0.1620027720928192, + "logps/chosen": -57.5792350769043, + "logps/rejected": -61.95730209350586, + "loss": 0.8694, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8725509643554688, + "rewards/margins": 2.201141119003296, + "rewards/rejected": 0.671409547328949, + "step": 695 + }, + { + "epoch": 0.17, + "grad_norm": 4.953446388244629, + "learning_rate": 9.671211334375655e-06, + "logits/chosen": -0.007032884284853935, + "logits/rejected": -0.11989343166351318, + "logps/chosen": -61.102577209472656, + "logps/rejected": -76.74707794189453, + "loss": 0.9157, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.784001588821411, + "rewards/margins": 2.235151529312134, + "rewards/rejected": 0.5488497018814087, + "step": 696 + }, + { + "epoch": 0.17, + "grad_norm": 4.7299628257751465, + "learning_rate": 9.670276391619443e-06, + "logits/chosen": -0.03942697122693062, + "logits/rejected": -0.15386804938316345, + "logps/chosen": -56.82627868652344, + "logps/rejected": -57.286827087402344, + "loss": 0.9514, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.607863426208496, + "rewards/margins": 2.6127922534942627, + "rewards/rejected": -0.0049286819994449615, + "step": 697 + }, + { + "epoch": 0.17, + "grad_norm": 4.202674865722656, + "learning_rate": 9.669340166771584e-06, + "logits/chosen": 0.0004936084151268005, + "logits/rejected": -0.12090190500020981, + "logps/chosen": -59.492088317871094, + "logps/rejected": -63.86345291137695, + "loss": 0.9257, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6538078784942627, + "rewards/margins": 2.1340835094451904, + "rewards/rejected": 0.5197243690490723, + "step": 698 + }, + { + "epoch": 0.17, + "grad_norm": 4.454418182373047, + "learning_rate": 9.668402660089094e-06, + "logits/chosen": -0.07227779924869537, + "logits/rejected": -0.14405342936515808, + "logps/chosen": -60.56220626831055, + "logps/rejected": -89.91521453857422, + "loss": 0.9592, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.1728620529174805, + "rewards/margins": 2.9853336811065674, + "rewards/rejected": 0.18752823770046234, + "step": 699 + }, + { + "epoch": 0.18, + "grad_norm": 4.194287300109863, + "learning_rate": 9.667463871829336e-06, + "logits/chosen": -0.14145703613758087, + "logits/rejected": -0.16436827182769775, + "logps/chosen": -59.069305419921875, + "logps/rejected": -71.66192626953125, + "loss": 0.9246, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.626129150390625, + "rewards/margins": 1.8127564191818237, + "rewards/rejected": 0.813372790813446, + "step": 700 + }, + { + "epoch": 0.18, + "grad_norm": 4.729238033294678, + "learning_rate": 9.66652380225003e-06, + "logits/chosen": 0.003190658986568451, + "logits/rejected": -0.1813044399023056, + "logps/chosen": -67.67857360839844, + "logps/rejected": -57.27639389038086, + "loss": 0.8468, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.891313314437866, + "rewards/margins": 2.7125587463378906, + "rewards/rejected": 0.17875465750694275, + "step": 701 + }, + { + "epoch": 0.18, + "grad_norm": 4.441972732543945, + "learning_rate": 9.665582451609245e-06, + "logits/chosen": -0.08628596365451813, + "logits/rejected": -0.16763833165168762, + "logps/chosen": -57.35643768310547, + "logps/rejected": -67.87931823730469, + "loss": 0.9343, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7336761951446533, + "rewards/margins": 2.493154525756836, + "rewards/rejected": 0.2405216097831726, + "step": 702 + }, + { + "epoch": 0.18, + "grad_norm": 3.5586047172546387, + "learning_rate": 9.664639820165402e-06, + "logits/chosen": -0.0264323428273201, + "logits/rejected": -0.064046710729599, + "logps/chosen": -59.55033874511719, + "logps/rejected": -84.59467315673828, + "loss": 0.8309, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.835254669189453, + "rewards/margins": 2.464064598083496, + "rewards/rejected": 0.3711901009082794, + "step": 703 + }, + { + "epoch": 0.18, + "grad_norm": 3.342880964279175, + "learning_rate": 9.663695908177272e-06, + "logits/chosen": -0.11531642824411392, + "logits/rejected": -0.20987072587013245, + "logps/chosen": -52.973182678222656, + "logps/rejected": -62.461544036865234, + "loss": 0.7903, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7685022354125977, + "rewards/margins": 2.4922609329223633, + "rewards/rejected": 0.2762415111064911, + "step": 704 + }, + { + "epoch": 0.18, + "grad_norm": 4.334110736846924, + "learning_rate": 9.662750715903982e-06, + "logits/chosen": -0.10157027095556259, + "logits/rejected": -0.18841016292572021, + "logps/chosen": -54.18696594238281, + "logps/rejected": -72.36067962646484, + "loss": 0.8764, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5876550674438477, + "rewards/margins": 2.3247132301330566, + "rewards/rejected": 0.26294195652008057, + "step": 705 + }, + { + "epoch": 0.18, + "grad_norm": 5.14777946472168, + "learning_rate": 9.661804243605006e-06, + "logits/chosen": -0.10765324532985687, + "logits/rejected": -0.16668178141117096, + "logps/chosen": -71.10035705566406, + "logps/rejected": -68.29583740234375, + "loss": 1.1786, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.557300567626953, + "rewards/margins": 1.5073233842849731, + "rewards/rejected": 1.0499769449234009, + "step": 706 + }, + { + "epoch": 0.18, + "grad_norm": 4.687394618988037, + "learning_rate": 9.66085649154017e-06, + "logits/chosen": -0.1073114350438118, + "logits/rejected": -0.17246268689632416, + "logps/chosen": -76.17790222167969, + "logps/rejected": -76.90154266357422, + "loss": 0.9155, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0284674167633057, + "rewards/margins": 2.532719612121582, + "rewards/rejected": 0.4957476556301117, + "step": 707 + }, + { + "epoch": 0.18, + "grad_norm": 3.2367594242095947, + "learning_rate": 9.659907459969656e-06, + "logits/chosen": -0.13600866496562958, + "logits/rejected": -0.2557276487350464, + "logps/chosen": -45.902809143066406, + "logps/rejected": -62.98919677734375, + "loss": 0.6776, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.023705244064331, + "rewards/margins": 3.573918104171753, + "rewards/rejected": -0.5502128005027771, + "step": 708 + }, + { + "epoch": 0.18, + "grad_norm": 3.6186556816101074, + "learning_rate": 9.658957149153989e-06, + "logits/chosen": -0.14518262445926666, + "logits/rejected": -0.17169508337974548, + "logps/chosen": -47.8570442199707, + "logps/rejected": -74.04336547851562, + "loss": 0.8435, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.185091972351074, + "rewards/margins": 2.469877004623413, + "rewards/rejected": 0.715215265750885, + "step": 709 + }, + { + "epoch": 0.18, + "grad_norm": 3.829106330871582, + "learning_rate": 9.658005559354053e-06, + "logits/chosen": -0.2580548822879791, + "logits/rejected": -0.3511759340763092, + "logps/chosen": -62.34983825683594, + "logps/rejected": -65.60289764404297, + "loss": 0.8919, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.224268913269043, + "rewards/margins": 3.1789650917053223, + "rewards/rejected": 0.0453035831451416, + "step": 710 + }, + { + "epoch": 0.18, + "grad_norm": 3.3290045261383057, + "learning_rate": 9.65705269083108e-06, + "logits/chosen": -0.06376634538173676, + "logits/rejected": -0.16361168026924133, + "logps/chosen": -53.468780517578125, + "logps/rejected": -70.38541412353516, + "loss": 0.7485, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8125696182250977, + "rewards/margins": 2.947997570037842, + "rewards/rejected": -0.13542816042900085, + "step": 711 + }, + { + "epoch": 0.18, + "grad_norm": 4.615001678466797, + "learning_rate": 9.656098543846652e-06, + "logits/chosen": -0.10196812450885773, + "logits/rejected": -0.17436319589614868, + "logps/chosen": -58.04851150512695, + "logps/rejected": -66.01873016357422, + "loss": 0.9721, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8535091876983643, + "rewards/margins": 2.7280819416046143, + "rewards/rejected": 0.1254270374774933, + "step": 712 + }, + { + "epoch": 0.18, + "grad_norm": 4.991544246673584, + "learning_rate": 9.655143118662702e-06, + "logits/chosen": -0.13286489248275757, + "logits/rejected": -0.23601287603378296, + "logps/chosen": -61.2808723449707, + "logps/rejected": -53.84611129760742, + "loss": 1.0233, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.671149730682373, + "rewards/margins": 2.149230480194092, + "rewards/rejected": 0.5219194293022156, + "step": 713 + }, + { + "epoch": 0.18, + "grad_norm": 4.294860363006592, + "learning_rate": 9.654186415541518e-06, + "logits/chosen": -0.06555411219596863, + "logits/rejected": -0.14728602766990662, + "logps/chosen": -57.0330924987793, + "logps/rejected": -66.63019561767578, + "loss": 0.9401, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7811412811279297, + "rewards/margins": 2.3722569942474365, + "rewards/rejected": 0.4088841378688812, + "step": 714 + }, + { + "epoch": 0.18, + "grad_norm": 5.1266279220581055, + "learning_rate": 9.653228434745731e-06, + "logits/chosen": -0.05632095783948898, + "logits/rejected": -0.13798953592777252, + "logps/chosen": -54.1580810546875, + "logps/rejected": -70.60231018066406, + "loss": 0.926, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.581211566925049, + "rewards/margins": 2.334054946899414, + "rewards/rejected": 0.24715641140937805, + "step": 715 + }, + { + "epoch": 0.18, + "grad_norm": 6.1832990646362305, + "learning_rate": 9.652269176538332e-06, + "logits/chosen": -0.014831394888460636, + "logits/rejected": -0.11011882871389389, + "logps/chosen": -55.31412887573242, + "logps/rejected": -62.255592346191406, + "loss": 0.9077, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6875526905059814, + "rewards/margins": 2.3196065425872803, + "rewards/rejected": 0.3679460883140564, + "step": 716 + }, + { + "epoch": 0.18, + "grad_norm": 4.573421478271484, + "learning_rate": 9.651308641182654e-06, + "logits/chosen": -0.055540166795253754, + "logits/rejected": -0.09996762871742249, + "logps/chosen": -54.828575134277344, + "logps/rejected": -74.71965026855469, + "loss": 0.8579, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6386351585388184, + "rewards/margins": 2.7281389236450195, + "rewards/rejected": -0.08950375020503998, + "step": 717 + }, + { + "epoch": 0.18, + "grad_norm": 3.893223285675049, + "learning_rate": 9.650346828942387e-06, + "logits/chosen": 0.026508579030632973, + "logits/rejected": -0.1924162209033966, + "logps/chosen": -59.86072540283203, + "logps/rejected": -57.043701171875, + "loss": 0.8217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.576446533203125, + "rewards/margins": 3.3160130977630615, + "rewards/rejected": -0.7395665645599365, + "step": 718 + }, + { + "epoch": 0.18, + "grad_norm": 3.381709098815918, + "learning_rate": 9.649383740081571e-06, + "logits/chosen": -0.04257706552743912, + "logits/rejected": -0.16065426170825958, + "logps/chosen": -60.214378356933594, + "logps/rejected": -63.672122955322266, + "loss": 0.9398, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.807257890701294, + "rewards/margins": 2.4498610496520996, + "rewards/rejected": 0.3573968708515167, + "step": 719 + }, + { + "epoch": 0.18, + "grad_norm": 3.9800148010253906, + "learning_rate": 9.648419374864591e-06, + "logits/chosen": -0.06649306416511536, + "logits/rejected": -0.20353963971138, + "logps/chosen": -58.42688751220703, + "logps/rejected": -66.63040161132812, + "loss": 0.9391, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.73638653755188, + "rewards/margins": 2.899332284927368, + "rewards/rejected": -0.16294562816619873, + "step": 720 + }, + { + "epoch": 0.18, + "grad_norm": 4.870150089263916, + "learning_rate": 9.647453733556187e-06, + "logits/chosen": -0.015284309163689613, + "logits/rejected": -0.0706951841711998, + "logps/chosen": -63.93389129638672, + "logps/rejected": -76.85071563720703, + "loss": 0.9426, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8095104694366455, + "rewards/margins": 2.2151272296905518, + "rewards/rejected": 0.594383180141449, + "step": 721 + }, + { + "epoch": 0.18, + "grad_norm": 5.097374439239502, + "learning_rate": 9.64648681642145e-06, + "logits/chosen": -0.1629427969455719, + "logits/rejected": -0.24764156341552734, + "logps/chosen": -85.08113098144531, + "logps/rejected": -61.267330169677734, + "loss": 0.9383, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5266807079315186, + "rewards/margins": 2.7941482067108154, + "rewards/rejected": -0.267467737197876, + "step": 722 + }, + { + "epoch": 0.18, + "grad_norm": 3.3444061279296875, + "learning_rate": 9.645518623725818e-06, + "logits/chosen": -0.11946127563714981, + "logits/rejected": -0.2548782527446747, + "logps/chosen": -80.44859313964844, + "logps/rejected": -63.76898956298828, + "loss": 0.9051, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8331189155578613, + "rewards/margins": 2.7060821056365967, + "rewards/rejected": 0.12703680992126465, + "step": 723 + }, + { + "epoch": 0.18, + "grad_norm": 3.495274305343628, + "learning_rate": 9.644549155735081e-06, + "logits/chosen": -0.0965847596526146, + "logits/rejected": -0.14511854946613312, + "logps/chosen": -66.67926025390625, + "logps/rejected": -73.346435546875, + "loss": 0.9829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.031257390975952, + "rewards/margins": 2.5216176509857178, + "rewards/rejected": 0.5096395611763, + "step": 724 + }, + { + "epoch": 0.18, + "grad_norm": 4.208229064941406, + "learning_rate": 9.643578412715379e-06, + "logits/chosen": -0.052566271275281906, + "logits/rejected": -0.1899225413799286, + "logps/chosen": -66.53837585449219, + "logps/rejected": -57.02159881591797, + "loss": 0.9012, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.762491226196289, + "rewards/margins": 2.636667251586914, + "rewards/rejected": 0.12582381069660187, + "step": 725 + }, + { + "epoch": 0.18, + "grad_norm": 5.234216690063477, + "learning_rate": 9.642606394933206e-06, + "logits/chosen": -0.06487476825714111, + "logits/rejected": -0.1466589719057083, + "logps/chosen": -67.38255310058594, + "logps/rejected": -78.24595642089844, + "loss": 0.9743, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7026329040527344, + "rewards/margins": 2.7517058849334717, + "rewards/rejected": -0.04907315969467163, + "step": 726 + }, + { + "epoch": 0.18, + "grad_norm": 3.2694807052612305, + "learning_rate": 9.641633102655394e-06, + "logits/chosen": -0.018418142572045326, + "logits/rejected": -0.1517636924982071, + "logps/chosen": -53.98406982421875, + "logps/rejected": -61.69041442871094, + "loss": 0.6565, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8418068885803223, + "rewards/margins": 3.2625303268432617, + "rewards/rejected": -0.4207236170768738, + "step": 727 + }, + { + "epoch": 0.18, + "grad_norm": 3.2363147735595703, + "learning_rate": 9.640658536149137e-06, + "logits/chosen": -0.14731423556804657, + "logits/rejected": -0.28142690658569336, + "logps/chosen": -48.13138961791992, + "logps/rejected": -56.48385238647461, + "loss": 0.7514, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8186697959899902, + "rewards/margins": 2.934077024459839, + "rewards/rejected": -0.115407794713974, + "step": 728 + }, + { + "epoch": 0.18, + "grad_norm": 4.41354513168335, + "learning_rate": 9.639682695681976e-06, + "logits/chosen": -0.151449516415596, + "logits/rejected": -0.2002861499786377, + "logps/chosen": -57.0314826965332, + "logps/rejected": -73.68804931640625, + "loss": 0.9327, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3978779315948486, + "rewards/margins": 2.8202333450317383, + "rewards/rejected": -0.4223553538322449, + "step": 729 + }, + { + "epoch": 0.18, + "grad_norm": 4.742510795593262, + "learning_rate": 9.638705581521798e-06, + "logits/chosen": -0.09459925442934036, + "logits/rejected": -0.23746998608112335, + "logps/chosen": -60.32072448730469, + "logps/rejected": -65.44833374023438, + "loss": 1.0423, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.4670512676239014, + "rewards/margins": 2.077727794647217, + "rewards/rejected": 0.3893234133720398, + "step": 730 + }, + { + "epoch": 0.18, + "grad_norm": 4.662100315093994, + "learning_rate": 9.637727193936843e-06, + "logits/chosen": -0.0352531298995018, + "logits/rejected": -0.058329664170742035, + "logps/chosen": -54.707454681396484, + "logps/rejected": -83.2746810913086, + "loss": 0.8251, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.748119592666626, + "rewards/margins": 2.1551647186279297, + "rewards/rejected": 0.5929550528526306, + "step": 731 + }, + { + "epoch": 0.18, + "grad_norm": 3.3597893714904785, + "learning_rate": 9.636747533195698e-06, + "logits/chosen": -0.0822974219918251, + "logits/rejected": -0.21419095993041992, + "logps/chosen": -58.053287506103516, + "logps/rejected": -63.761863708496094, + "loss": 0.9145, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0156493186950684, + "rewards/margins": 2.995638847351074, + "rewards/rejected": 0.02001040428876877, + "step": 732 + }, + { + "epoch": 0.18, + "grad_norm": 4.142714500427246, + "learning_rate": 9.635766599567302e-06, + "logits/chosen": 0.04484608396887779, + "logits/rejected": -0.0052437130361795425, + "logps/chosen": -67.71354675292969, + "logps/rejected": -85.92951202392578, + "loss": 1.0158, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9659082889556885, + "rewards/margins": 2.3313841819763184, + "rewards/rejected": 0.6345239877700806, + "step": 733 + }, + { + "epoch": 0.18, + "grad_norm": 3.419210433959961, + "learning_rate": 9.634784393320943e-06, + "logits/chosen": -0.07590124011039734, + "logits/rejected": -0.25456538796424866, + "logps/chosen": -59.659358978271484, + "logps/rejected": -59.96390151977539, + "loss": 0.8209, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6345837116241455, + "rewards/margins": 3.191192865371704, + "rewards/rejected": -0.556609034538269, + "step": 734 + }, + { + "epoch": 0.18, + "grad_norm": 3.6488466262817383, + "learning_rate": 9.633800914726258e-06, + "logits/chosen": -0.1390860378742218, + "logits/rejected": -0.361880362033844, + "logps/chosen": -50.333885192871094, + "logps/rejected": -54.613563537597656, + "loss": 0.7388, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.729767322540283, + "rewards/margins": 3.007004976272583, + "rewards/rejected": -0.27723774313926697, + "step": 735 + }, + { + "epoch": 0.18, + "grad_norm": 4.112559795379639, + "learning_rate": 9.632816164053232e-06, + "logits/chosen": -0.13595451414585114, + "logits/rejected": -0.26176977157592773, + "logps/chosen": -61.31624984741211, + "logps/rejected": -69.1988296508789, + "loss": 0.8962, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8467471599578857, + "rewards/margins": 2.663910150527954, + "rewards/rejected": 0.18283683061599731, + "step": 736 + }, + { + "epoch": 0.18, + "grad_norm": 4.772015571594238, + "learning_rate": 9.631830141572198e-06, + "logits/chosen": -0.06787599623203278, + "logits/rejected": -0.218626007437706, + "logps/chosen": -59.69327163696289, + "logps/rejected": -65.31015014648438, + "loss": 0.9103, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.718792200088501, + "rewards/margins": 2.662318468093872, + "rewards/rejected": 0.056473925709724426, + "step": 737 + }, + { + "epoch": 0.18, + "grad_norm": 2.85080885887146, + "learning_rate": 9.630842847553846e-06, + "logits/chosen": 0.0880788266658783, + "logits/rejected": -0.12116695940494537, + "logps/chosen": -67.89675903320312, + "logps/rejected": -62.638572692871094, + "loss": 0.786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9374115467071533, + "rewards/margins": 3.9813530445098877, + "rewards/rejected": -1.043941617012024, + "step": 738 + }, + { + "epoch": 0.18, + "grad_norm": 3.6573686599731445, + "learning_rate": 9.629854282269206e-06, + "logits/chosen": -0.14442414045333862, + "logits/rejected": -0.25700217485427856, + "logps/chosen": -56.02899169921875, + "logps/rejected": -67.25455474853516, + "loss": 0.8486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.658036947250366, + "rewards/margins": 2.853228807449341, + "rewards/rejected": -0.19519217312335968, + "step": 739 + }, + { + "epoch": 0.19, + "grad_norm": 3.211665391921997, + "learning_rate": 9.62886444598966e-06, + "logits/chosen": -0.18903300166130066, + "logits/rejected": -0.3357946574687958, + "logps/chosen": -61.2865104675293, + "logps/rejected": -55.9226188659668, + "loss": 0.7558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.690826177597046, + "rewards/margins": 2.76962947845459, + "rewards/rejected": -0.07880344241857529, + "step": 740 + }, + { + "epoch": 0.19, + "grad_norm": 4.6784348487854, + "learning_rate": 9.627873338986944e-06, + "logits/chosen": -0.1564941704273224, + "logits/rejected": -0.20266854763031006, + "logps/chosen": -58.98987579345703, + "logps/rejected": -69.91485595703125, + "loss": 1.0542, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5888960361480713, + "rewards/margins": 1.692244052886963, + "rewards/rejected": 0.8966521620750427, + "step": 741 + }, + { + "epoch": 0.19, + "grad_norm": 3.7458019256591797, + "learning_rate": 9.626880961533131e-06, + "logits/chosen": -0.01938096061348915, + "logits/rejected": -0.18627393245697021, + "logps/chosen": -54.95818328857422, + "logps/rejected": -80.09431457519531, + "loss": 0.9051, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6435041427612305, + "rewards/margins": 3.1499640941619873, + "rewards/rejected": -0.5064598321914673, + "step": 742 + }, + { + "epoch": 0.19, + "grad_norm": 5.342312812805176, + "learning_rate": 9.625887313900656e-06, + "logits/chosen": -0.13893935084342957, + "logits/rejected": -0.1956396996974945, + "logps/chosen": -53.7729606628418, + "logps/rejected": -71.18621826171875, + "loss": 0.904, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8246824741363525, + "rewards/margins": 2.4468753337860107, + "rewards/rejected": 0.3778071999549866, + "step": 743 + }, + { + "epoch": 0.19, + "grad_norm": 4.163437843322754, + "learning_rate": 9.624892396362293e-06, + "logits/chosen": -0.14792191982269287, + "logits/rejected": -0.1519937813282013, + "logps/chosen": -53.125038146972656, + "logps/rejected": -72.6804428100586, + "loss": 0.8836, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.068466901779175, + "rewards/margins": 2.854790687561035, + "rewards/rejected": 0.2136760652065277, + "step": 744 + }, + { + "epoch": 0.19, + "grad_norm": 4.065654754638672, + "learning_rate": 9.623896209191172e-06, + "logits/chosen": -0.09412854164838791, + "logits/rejected": -0.1722213327884674, + "logps/chosen": -54.3457145690918, + "logps/rejected": -72.66976165771484, + "loss": 0.8224, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0315487384796143, + "rewards/margins": 2.736443042755127, + "rewards/rejected": 0.2951053977012634, + "step": 745 + }, + { + "epoch": 0.19, + "grad_norm": 4.2273054122924805, + "learning_rate": 9.622898752660763e-06, + "logits/chosen": -0.07188202440738678, + "logits/rejected": -0.21504709124565125, + "logps/chosen": -57.361080169677734, + "logps/rejected": -67.80815124511719, + "loss": 0.7622, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.765754222869873, + "rewards/margins": 3.541759490966797, + "rewards/rejected": -0.7760051488876343, + "step": 746 + }, + { + "epoch": 0.19, + "grad_norm": 4.663156032562256, + "learning_rate": 9.621900027044895e-06, + "logits/chosen": -0.11824523657560349, + "logits/rejected": -0.20794540643692017, + "logps/chosen": -61.15109634399414, + "logps/rejected": -76.84413146972656, + "loss": 0.97, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6387524604797363, + "rewards/margins": 2.1133909225463867, + "rewards/rejected": 0.5253616571426392, + "step": 747 + }, + { + "epoch": 0.19, + "grad_norm": 3.4176981449127197, + "learning_rate": 9.620900032617734e-06, + "logits/chosen": -0.07382262498140335, + "logits/rejected": -0.12408794462680817, + "logps/chosen": -59.77935028076172, + "logps/rejected": -71.0246353149414, + "loss": 0.8947, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8991525173187256, + "rewards/margins": 2.485057830810547, + "rewards/rejected": 0.4140949845314026, + "step": 748 + }, + { + "epoch": 0.19, + "grad_norm": 3.953826665878296, + "learning_rate": 9.619898769653806e-06, + "logits/chosen": -0.11647442728281021, + "logits/rejected": -0.25178781151771545, + "logps/chosen": -47.66145324707031, + "logps/rejected": -71.00863647460938, + "loss": 0.7915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7481205463409424, + "rewards/margins": 3.048797845840454, + "rewards/rejected": -0.3006770610809326, + "step": 749 + }, + { + "epoch": 0.19, + "grad_norm": 5.2794718742370605, + "learning_rate": 9.618896238427973e-06, + "logits/chosen": -0.1432887315750122, + "logits/rejected": -0.2546376585960388, + "logps/chosen": -50.59043884277344, + "logps/rejected": -61.10732650756836, + "loss": 0.9006, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.553260326385498, + "rewards/margins": 2.427657127380371, + "rewards/rejected": 0.12560352683067322, + "step": 750 + }, + { + "epoch": 0.19, + "grad_norm": 3.3510758876800537, + "learning_rate": 9.617892439215456e-06, + "logits/chosen": -0.09492908418178558, + "logits/rejected": -0.2000466287136078, + "logps/chosen": -54.27619552612305, + "logps/rejected": -66.16604614257812, + "loss": 0.7713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6883978843688965, + "rewards/margins": 2.8601744174957275, + "rewards/rejected": -0.17177650332450867, + "step": 751 + }, + { + "epoch": 0.19, + "grad_norm": 6.527529716491699, + "learning_rate": 9.61688737229182e-06, + "logits/chosen": -0.09741155058145523, + "logits/rejected": -0.2835739254951477, + "logps/chosen": -49.28322219848633, + "logps/rejected": -54.847015380859375, + "loss": 0.7776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6641499996185303, + "rewards/margins": 3.1101644039154053, + "rewards/rejected": -0.446014404296875, + "step": 752 + }, + { + "epoch": 0.19, + "grad_norm": 4.114752292633057, + "learning_rate": 9.615881037932973e-06, + "logits/chosen": -0.10377778112888336, + "logits/rejected": -0.21920308470726013, + "logps/chosen": -53.75618362426758, + "logps/rejected": -63.08868408203125, + "loss": 0.7856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8451414108276367, + "rewards/margins": 3.2419092655181885, + "rewards/rejected": -0.39676785469055176, + "step": 753 + }, + { + "epoch": 0.19, + "grad_norm": 3.8192317485809326, + "learning_rate": 9.614873436415179e-06, + "logits/chosen": -0.035215720534324646, + "logits/rejected": -0.1905314028263092, + "logps/chosen": -64.4052505493164, + "logps/rejected": -56.12071228027344, + "loss": 0.9163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.590219259262085, + "rewards/margins": 2.1412813663482666, + "rewards/rejected": 0.44893795251846313, + "step": 754 + }, + { + "epoch": 0.19, + "grad_norm": 5.574986934661865, + "learning_rate": 9.613864568015047e-06, + "logits/chosen": -0.0760369822382927, + "logits/rejected": -0.22033175826072693, + "logps/chosen": -52.564308166503906, + "logps/rejected": -64.42730712890625, + "loss": 0.9099, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8097684383392334, + "rewards/margins": 2.8723549842834473, + "rewards/rejected": -0.06258651614189148, + "step": 755 + }, + { + "epoch": 0.19, + "grad_norm": 3.9796271324157715, + "learning_rate": 9.612854433009531e-06, + "logits/chosen": -0.07082716375589371, + "logits/rejected": -0.25529730319976807, + "logps/chosen": -62.38629913330078, + "logps/rejected": -57.828369140625, + "loss": 0.8153, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9838309288024902, + "rewards/margins": 3.0087575912475586, + "rewards/rejected": -0.02492670714855194, + "step": 756 + }, + { + "epoch": 0.19, + "grad_norm": 4.3793840408325195, + "learning_rate": 9.611843031675935e-06, + "logits/chosen": -0.10449382662773132, + "logits/rejected": -0.2462349832057953, + "logps/chosen": -71.42796325683594, + "logps/rejected": -63.192588806152344, + "loss": 0.9748, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6719913482666016, + "rewards/margins": 2.9814813137054443, + "rewards/rejected": -0.3094898462295532, + "step": 757 + }, + { + "epoch": 0.19, + "grad_norm": 4.246792316436768, + "learning_rate": 9.610830364291911e-06, + "logits/chosen": -0.05965249612927437, + "logits/rejected": -0.2664982080459595, + "logps/chosen": -62.21173858642578, + "logps/rejected": -58.968666076660156, + "loss": 0.8019, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.744687557220459, + "rewards/margins": 3.2917728424072266, + "rewards/rejected": -0.5470852851867676, + "step": 758 + }, + { + "epoch": 0.19, + "grad_norm": 4.640572547912598, + "learning_rate": 9.60981643113546e-06, + "logits/chosen": -0.10534649342298508, + "logits/rejected": -0.12338384985923767, + "logps/chosen": -60.823699951171875, + "logps/rejected": -74.44051361083984, + "loss": 1.0863, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.709717273712158, + "rewards/margins": 1.8416850566864014, + "rewards/rejected": 0.8680320978164673, + "step": 759 + }, + { + "epoch": 0.19, + "grad_norm": 4.340953826904297, + "learning_rate": 9.608801232484923e-06, + "logits/chosen": -0.0783306285738945, + "logits/rejected": -0.15654876828193665, + "logps/chosen": -53.5539665222168, + "logps/rejected": -69.81501770019531, + "loss": 0.9221, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8640592098236084, + "rewards/margins": 2.5359888076782227, + "rewards/rejected": 0.32807034254074097, + "step": 760 + }, + { + "epoch": 0.19, + "grad_norm": 4.714487552642822, + "learning_rate": 9.607784768619e-06, + "logits/chosen": -0.10183847695589066, + "logits/rejected": -0.22392421960830688, + "logps/chosen": -61.87389373779297, + "logps/rejected": -61.53638458251953, + "loss": 1.0354, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.588670015335083, + "rewards/margins": 2.622166395187378, + "rewards/rejected": -0.03349636495113373, + "step": 761 + }, + { + "epoch": 0.19, + "grad_norm": 4.2577314376831055, + "learning_rate": 9.60676703981673e-06, + "logits/chosen": -0.17277202010154724, + "logits/rejected": -0.18628564476966858, + "logps/chosen": -47.05036544799805, + "logps/rejected": -65.03547668457031, + "loss": 0.9154, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.866365432739258, + "rewards/margins": 2.40372371673584, + "rewards/rejected": 0.4626418650150299, + "step": 762 + }, + { + "epoch": 0.19, + "grad_norm": 4.010975360870361, + "learning_rate": 9.605748046357501e-06, + "logits/chosen": -0.03161796182394028, + "logits/rejected": -0.11978224664926529, + "logps/chosen": -56.41962432861328, + "logps/rejected": -65.0184097290039, + "loss": 0.8264, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6924259662628174, + "rewards/margins": 2.5430519580841064, + "rewards/rejected": 0.14937415719032288, + "step": 763 + }, + { + "epoch": 0.19, + "grad_norm": 3.699723482131958, + "learning_rate": 9.604727788521048e-06, + "logits/chosen": -0.02375817857682705, + "logits/rejected": -0.185707226395607, + "logps/chosen": -55.48785400390625, + "logps/rejected": -61.2232551574707, + "loss": 0.9299, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7014970779418945, + "rewards/margins": 2.753093719482422, + "rewards/rejected": -0.05159653723239899, + "step": 764 + }, + { + "epoch": 0.19, + "grad_norm": 4.262121677398682, + "learning_rate": 9.603706266587458e-06, + "logits/chosen": -0.005161836743354797, + "logits/rejected": -0.07450976222753525, + "logps/chosen": -74.58455657958984, + "logps/rejected": -70.84892272949219, + "loss": 0.9023, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8189423084259033, + "rewards/margins": 2.3864176273345947, + "rewards/rejected": 0.4325249195098877, + "step": 765 + }, + { + "epoch": 0.19, + "grad_norm": 3.6053900718688965, + "learning_rate": 9.602683480837155e-06, + "logits/chosen": -0.08816052228212357, + "logits/rejected": -0.2642270028591156, + "logps/chosen": -58.29388427734375, + "logps/rejected": -61.540260314941406, + "loss": 0.7948, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.625187397003174, + "rewards/margins": 2.8481597900390625, + "rewards/rejected": -0.22297243773937225, + "step": 766 + }, + { + "epoch": 0.19, + "grad_norm": 3.4004642963409424, + "learning_rate": 9.601659431550918e-06, + "logits/chosen": -0.04159587621688843, + "logits/rejected": -0.16087426245212555, + "logps/chosen": -52.379093170166016, + "logps/rejected": -59.12614822387695, + "loss": 0.765, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8533389568328857, + "rewards/margins": 3.045856475830078, + "rewards/rejected": -0.19251719117164612, + "step": 767 + }, + { + "epoch": 0.19, + "grad_norm": 4.121867656707764, + "learning_rate": 9.600634119009873e-06, + "logits/chosen": -0.048103317618370056, + "logits/rejected": -0.05764605849981308, + "logps/chosen": -55.20060729980469, + "logps/rejected": -82.369140625, + "loss": 0.878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.639005184173584, + "rewards/margins": 2.4343111515045166, + "rewards/rejected": 0.20469412207603455, + "step": 768 + }, + { + "epoch": 0.19, + "grad_norm": 3.9740328788757324, + "learning_rate": 9.599607543495488e-06, + "logits/chosen": -0.10852396488189697, + "logits/rejected": -0.2309846580028534, + "logps/chosen": -57.95891571044922, + "logps/rejected": -70.97589111328125, + "loss": 0.874, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7111644744873047, + "rewards/margins": 2.937293529510498, + "rewards/rejected": -0.22612902522087097, + "step": 769 + }, + { + "epoch": 0.19, + "grad_norm": 5.344733238220215, + "learning_rate": 9.598579705289579e-06, + "logits/chosen": -0.06860566884279251, + "logits/rejected": -0.26500558853149414, + "logps/chosen": -71.14510345458984, + "logps/rejected": -54.895851135253906, + "loss": 1.0572, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8141767978668213, + "rewards/margins": 2.2999913692474365, + "rewards/rejected": 0.5141852498054504, + "step": 770 + }, + { + "epoch": 0.19, + "grad_norm": 3.5523931980133057, + "learning_rate": 9.597550604674313e-06, + "logits/chosen": -0.040792886167764664, + "logits/rejected": -0.2042219340801239, + "logps/chosen": -59.030242919921875, + "logps/rejected": -68.98417663574219, + "loss": 0.881, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.681304931640625, + "rewards/margins": 2.754279136657715, + "rewards/rejected": -0.07297450304031372, + "step": 771 + }, + { + "epoch": 0.19, + "grad_norm": 4.7709736824035645, + "learning_rate": 9.596520241932198e-06, + "logits/chosen": -0.08700723946094513, + "logits/rejected": -0.18439461290836334, + "logps/chosen": -53.546478271484375, + "logps/rejected": -70.29312133789062, + "loss": 0.8783, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6787233352661133, + "rewards/margins": 2.6606476306915283, + "rewards/rejected": 0.01807577908039093, + "step": 772 + }, + { + "epoch": 0.19, + "grad_norm": 3.497896909713745, + "learning_rate": 9.595488617346093e-06, + "logits/chosen": -0.10591921955347061, + "logits/rejected": -0.22400043904781342, + "logps/chosen": -51.288169860839844, + "logps/rejected": -64.35250854492188, + "loss": 0.8293, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.714911460876465, + "rewards/margins": 2.6964943408966064, + "rewards/rejected": 0.01841692626476288, + "step": 773 + }, + { + "epoch": 0.19, + "grad_norm": 6.113112449645996, + "learning_rate": 9.594455731199197e-06, + "logits/chosen": -0.010037860833108425, + "logits/rejected": -0.1422976702451706, + "logps/chosen": -69.25265502929688, + "logps/rejected": -73.3146743774414, + "loss": 0.8398, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.655541181564331, + "rewards/margins": 3.1864771842956543, + "rewards/rejected": -0.5309362411499023, + "step": 774 + }, + { + "epoch": 0.19, + "grad_norm": 3.522310733795166, + "learning_rate": 9.593421583775064e-06, + "logits/chosen": -0.025259237736463547, + "logits/rejected": -0.10274762660264969, + "logps/chosen": -58.2341194152832, + "logps/rejected": -71.87527465820312, + "loss": 0.8146, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6737027168273926, + "rewards/margins": 2.3212156295776367, + "rewards/rejected": 0.3524872660636902, + "step": 775 + }, + { + "epoch": 0.19, + "grad_norm": 3.8521392345428467, + "learning_rate": 9.592386175357589e-06, + "logits/chosen": -0.10556880384683609, + "logits/rejected": -0.19066770374774933, + "logps/chosen": -52.35844421386719, + "logps/rejected": -66.27303314208984, + "loss": 0.8716, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.870455503463745, + "rewards/margins": 2.8936805725097656, + "rewards/rejected": -0.02322496473789215, + "step": 776 + }, + { + "epoch": 0.19, + "grad_norm": 5.302354335784912, + "learning_rate": 9.591349506231011e-06, + "logits/chosen": -0.04556143283843994, + "logits/rejected": -0.2179257571697235, + "logps/chosen": -62.45824432373047, + "logps/rejected": -56.852725982666016, + "loss": 0.9275, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.720874071121216, + "rewards/margins": 1.9878292083740234, + "rewards/rejected": 0.7330449819564819, + "step": 777 + }, + { + "epoch": 0.19, + "grad_norm": 2.977670907974243, + "learning_rate": 9.590311576679921e-06, + "logits/chosen": -0.12255628407001495, + "logits/rejected": -0.2532398998737335, + "logps/chosen": -56.03162384033203, + "logps/rejected": -61.370662689208984, + "loss": 0.8071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.941291570663452, + "rewards/margins": 3.110295057296753, + "rewards/rejected": -0.16900351643562317, + "step": 778 + }, + { + "epoch": 0.19, + "grad_norm": 3.8572652339935303, + "learning_rate": 9.589272386989252e-06, + "logits/chosen": -0.06517687439918518, + "logits/rejected": -0.21010029315948486, + "logps/chosen": -70.99332427978516, + "logps/rejected": -69.75968933105469, + "loss": 0.833, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0323119163513184, + "rewards/margins": 3.4193692207336426, + "rewards/rejected": -0.38705769181251526, + "step": 779 + }, + { + "epoch": 0.2, + "grad_norm": 3.672241449356079, + "learning_rate": 9.588231937444284e-06, + "logits/chosen": -0.01791660115122795, + "logits/rejected": -0.1064162403345108, + "logps/chosen": -66.96107482910156, + "logps/rejected": -65.06526184082031, + "loss": 0.9637, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.010056257247925, + "rewards/margins": 2.6354267597198486, + "rewards/rejected": 0.37462976574897766, + "step": 780 + }, + { + "epoch": 0.2, + "grad_norm": 4.023370265960693, + "learning_rate": 9.587190228330643e-06, + "logits/chosen": -0.11866515129804611, + "logits/rejected": -0.22310365736484528, + "logps/chosen": -54.52159881591797, + "logps/rejected": -68.3778305053711, + "loss": 0.9134, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7896065711975098, + "rewards/margins": 2.626767873764038, + "rewards/rejected": 0.16283881664276123, + "step": 781 + }, + { + "epoch": 0.2, + "grad_norm": 5.112190246582031, + "learning_rate": 9.5861472599343e-06, + "logits/chosen": -0.128499373793602, + "logits/rejected": -0.2753973603248596, + "logps/chosen": -56.64295959472656, + "logps/rejected": -69.1180191040039, + "loss": 0.8958, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.562385082244873, + "rewards/margins": 2.9680418968200684, + "rewards/rejected": -0.4056566059589386, + "step": 782 + }, + { + "epoch": 0.2, + "grad_norm": 5.308708667755127, + "learning_rate": 9.585103032541573e-06, + "logits/chosen": -0.06300826370716095, + "logits/rejected": -0.22386518120765686, + "logps/chosen": -64.1207275390625, + "logps/rejected": -65.29883575439453, + "loss": 0.8033, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7431111335754395, + "rewards/margins": 3.0099496841430664, + "rewards/rejected": -0.26683861017227173, + "step": 783 + }, + { + "epoch": 0.2, + "grad_norm": 3.787496328353882, + "learning_rate": 9.584057546439126e-06, + "logits/chosen": -0.15771718323230743, + "logits/rejected": -0.29735687375068665, + "logps/chosen": -57.81602096557617, + "logps/rejected": -65.25666046142578, + "loss": 0.8435, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.845428705215454, + "rewards/margins": 2.69692325592041, + "rewards/rejected": 0.1485055536031723, + "step": 784 + }, + { + "epoch": 0.2, + "grad_norm": 5.03449010848999, + "learning_rate": 9.583010801913966e-06, + "logits/chosen": -0.10408251732587814, + "logits/rejected": -0.12018898129463196, + "logps/chosen": -52.257808685302734, + "logps/rejected": -72.72894287109375, + "loss": 0.9361, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8317298889160156, + "rewards/margins": 2.281362295150757, + "rewards/rejected": 0.5503674149513245, + "step": 785 + }, + { + "epoch": 0.2, + "grad_norm": 4.447982311248779, + "learning_rate": 9.581962799253445e-06, + "logits/chosen": -0.10139525681734085, + "logits/rejected": -0.1702655553817749, + "logps/chosen": -51.50613784790039, + "logps/rejected": -67.80847930908203, + "loss": 0.818, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4537465572357178, + "rewards/margins": 2.530538320541382, + "rewards/rejected": -0.07679171115159988, + "step": 786 + }, + { + "epoch": 0.2, + "grad_norm": 6.184262752532959, + "learning_rate": 9.580913538745268e-06, + "logits/chosen": -0.02054269053041935, + "logits/rejected": -0.1073363721370697, + "logps/chosen": -61.18621826171875, + "logps/rejected": -77.7683334350586, + "loss": 0.8891, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6525986194610596, + "rewards/margins": 2.3317320346832275, + "rewards/rejected": 0.3208667039871216, + "step": 787 + }, + { + "epoch": 0.2, + "grad_norm": 5.000100135803223, + "learning_rate": 9.579863020677475e-06, + "logits/chosen": -0.07224254310131073, + "logits/rejected": -0.14572283625602722, + "logps/chosen": -67.34137725830078, + "logps/rejected": -68.15492248535156, + "loss": 1.0886, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.568610668182373, + "rewards/margins": 1.8616522550582886, + "rewards/rejected": 0.7069587111473083, + "step": 788 + }, + { + "epoch": 0.2, + "grad_norm": 4.818731307983398, + "learning_rate": 9.578811245338457e-06, + "logits/chosen": -0.12150901556015015, + "logits/rejected": -0.2732487916946411, + "logps/chosen": -58.42396926879883, + "logps/rejected": -57.51997375488281, + "loss": 1.0004, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.675992012023926, + "rewards/margins": 2.3871424198150635, + "rewards/rejected": 0.288849413394928, + "step": 789 + }, + { + "epoch": 0.2, + "grad_norm": 3.607081890106201, + "learning_rate": 9.577758213016948e-06, + "logits/chosen": -0.023793909698724747, + "logits/rejected": -0.15155550837516785, + "logps/chosen": -65.84668731689453, + "logps/rejected": -72.95936584472656, + "loss": 0.8669, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.734015464782715, + "rewards/margins": 3.1028003692626953, + "rewards/rejected": -0.36878469586372375, + "step": 790 + }, + { + "epoch": 0.2, + "grad_norm": 6.934836387634277, + "learning_rate": 9.57670392400203e-06, + "logits/chosen": -0.09231200069189072, + "logits/rejected": -0.3119755983352661, + "logps/chosen": -67.38220977783203, + "logps/rejected": -49.812713623046875, + "loss": 1.0913, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.52302885055542, + "rewards/margins": 2.5319414138793945, + "rewards/rejected": -0.00891275703907013, + "step": 791 + }, + { + "epoch": 0.2, + "grad_norm": 5.515101432800293, + "learning_rate": 9.575648378583129e-06, + "logits/chosen": -0.037733547389507294, + "logits/rejected": -0.253485769033432, + "logps/chosen": -63.97251892089844, + "logps/rejected": -73.7198715209961, + "loss": 0.9687, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6430487632751465, + "rewards/margins": 2.6053099632263184, + "rewards/rejected": 0.037738680839538574, + "step": 792 + }, + { + "epoch": 0.2, + "grad_norm": 5.033055782318115, + "learning_rate": 9.574591577050011e-06, + "logits/chosen": -0.09709848463535309, + "logits/rejected": -0.1610121726989746, + "logps/chosen": -47.82663345336914, + "logps/rejected": -61.473838806152344, + "loss": 0.879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8055543899536133, + "rewards/margins": 2.7940261363983154, + "rewards/rejected": 0.011528253555297852, + "step": 793 + }, + { + "epoch": 0.2, + "grad_norm": 3.2056684494018555, + "learning_rate": 9.573533519692795e-06, + "logits/chosen": -0.0672820657491684, + "logits/rejected": -0.178663969039917, + "logps/chosen": -63.37205505371094, + "logps/rejected": -64.04708862304688, + "loss": 0.8408, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1565608978271484, + "rewards/margins": 3.051722526550293, + "rewards/rejected": 0.10483825206756592, + "step": 794 + }, + { + "epoch": 0.2, + "grad_norm": 3.3674933910369873, + "learning_rate": 9.57247420680194e-06, + "logits/chosen": -0.053376857191324234, + "logits/rejected": -0.182339608669281, + "logps/chosen": -60.10490417480469, + "logps/rejected": -69.08638000488281, + "loss": 0.7663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5453286170959473, + "rewards/margins": 3.0683469772338867, + "rewards/rejected": -0.5230182409286499, + "step": 795 + }, + { + "epoch": 0.2, + "grad_norm": 3.336778402328491, + "learning_rate": 9.571413638668246e-06, + "logits/chosen": -0.06261827796697617, + "logits/rejected": -0.15127629041671753, + "logps/chosen": -68.84850311279297, + "logps/rejected": -70.62506103515625, + "loss": 0.8848, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.109623908996582, + "rewards/margins": 3.016122341156006, + "rewards/rejected": 0.09350135922431946, + "step": 796 + }, + { + "epoch": 0.2, + "grad_norm": 4.272322654724121, + "learning_rate": 9.570351815582866e-06, + "logits/chosen": -0.1670914888381958, + "logits/rejected": -0.20569223165512085, + "logps/chosen": -57.823036193847656, + "logps/rejected": -93.85922241210938, + "loss": 0.8363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5018088817596436, + "rewards/margins": 2.743950128555298, + "rewards/rejected": -0.24214115738868713, + "step": 797 + }, + { + "epoch": 0.2, + "grad_norm": 7.587250232696533, + "learning_rate": 9.569288737837292e-06, + "logits/chosen": -0.05670807510614395, + "logits/rejected": -0.20312348008155823, + "logps/chosen": -53.82738494873047, + "logps/rejected": -65.71694946289062, + "loss": 0.996, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.345961570739746, + "rewards/margins": 2.3794734477996826, + "rewards/rejected": -0.033511996269226074, + "step": 798 + }, + { + "epoch": 0.2, + "grad_norm": 7.761821269989014, + "learning_rate": 9.568224405723362e-06, + "logits/chosen": -0.09824362397193909, + "logits/rejected": -0.1906874030828476, + "logps/chosen": -64.61942291259766, + "logps/rejected": -76.50994873046875, + "loss": 0.9083, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.54496431350708, + "rewards/margins": 2.928926467895508, + "rewards/rejected": -0.3839622437953949, + "step": 799 + }, + { + "epoch": 0.2, + "grad_norm": 3.5727713108062744, + "learning_rate": 9.56715881953326e-06, + "logits/chosen": -0.11034896969795227, + "logits/rejected": -0.22351154685020447, + "logps/chosen": -60.27545166015625, + "logps/rejected": -59.31681442260742, + "loss": 0.8809, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7508981227874756, + "rewards/margins": 2.8494391441345215, + "rewards/rejected": -0.0985412448644638, + "step": 800 + }, + { + "epoch": 0.2, + "grad_norm": 5.446577548980713, + "learning_rate": 9.566091979559509e-06, + "logits/chosen": -0.007353663444519043, + "logits/rejected": -0.1429029107093811, + "logps/chosen": -75.21764373779297, + "logps/rejected": -67.28058624267578, + "loss": 0.9478, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7777693271636963, + "rewards/margins": 2.5576794147491455, + "rewards/rejected": 0.22008970379829407, + "step": 801 + }, + { + "epoch": 0.2, + "grad_norm": 3.5178582668304443, + "learning_rate": 9.56502388609498e-06, + "logits/chosen": -0.025736264884471893, + "logits/rejected": -0.13387596607208252, + "logps/chosen": -67.31594848632812, + "logps/rejected": -70.2166748046875, + "loss": 0.8497, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.677435874938965, + "rewards/margins": 2.504904270172119, + "rewards/rejected": 0.17253193259239197, + "step": 802 + }, + { + "epoch": 0.2, + "grad_norm": 3.902522087097168, + "learning_rate": 9.563954539432891e-06, + "logits/chosen": -0.06866078823804855, + "logits/rejected": -0.19422325491905212, + "logps/chosen": -61.89042282104492, + "logps/rejected": -66.84778594970703, + "loss": 0.8467, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9145257472991943, + "rewards/margins": 3.0578625202178955, + "rewards/rejected": -0.14333680272102356, + "step": 803 + }, + { + "epoch": 0.2, + "grad_norm": 3.1642720699310303, + "learning_rate": 9.562883939866797e-06, + "logits/chosen": -0.05281982943415642, + "logits/rejected": -0.23170390725135803, + "logps/chosen": -65.02517700195312, + "logps/rejected": -64.36811828613281, + "loss": 0.8558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8170862197875977, + "rewards/margins": 3.0947678089141846, + "rewards/rejected": -0.2776816785335541, + "step": 804 + }, + { + "epoch": 0.2, + "grad_norm": 4.577205181121826, + "learning_rate": 9.561812087690602e-06, + "logits/chosen": -0.04102461785078049, + "logits/rejected": -0.20200300216674805, + "logps/chosen": -53.9010009765625, + "logps/rejected": -56.03846740722656, + "loss": 0.9873, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.479761838912964, + "rewards/margins": 2.2099826335906982, + "rewards/rejected": 0.2697792649269104, + "step": 805 + }, + { + "epoch": 0.2, + "grad_norm": 4.042501926422119, + "learning_rate": 9.560738983198554e-06, + "logits/chosen": -0.016319304704666138, + "logits/rejected": -0.2415163367986679, + "logps/chosen": -67.7331314086914, + "logps/rejected": -60.41853332519531, + "loss": 0.7732, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.584202289581299, + "rewards/margins": 3.169313669204712, + "rewards/rejected": -0.5851117968559265, + "step": 806 + }, + { + "epoch": 0.2, + "grad_norm": 5.310415267944336, + "learning_rate": 9.559664626685242e-06, + "logits/chosen": -0.013181019574403763, + "logits/rejected": -0.15469415485858917, + "logps/chosen": -61.1738166809082, + "logps/rejected": -63.055660247802734, + "loss": 0.8547, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9368743896484375, + "rewards/margins": 2.817279577255249, + "rewards/rejected": 0.1195942834019661, + "step": 807 + }, + { + "epoch": 0.2, + "grad_norm": 3.7071726322174072, + "learning_rate": 9.5585890184456e-06, + "logits/chosen": -0.14178121089935303, + "logits/rejected": -0.2696577310562134, + "logps/chosen": -53.8408088684082, + "logps/rejected": -50.158485412597656, + "loss": 1.0021, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.820734739303589, + "rewards/margins": 2.517608642578125, + "rewards/rejected": 0.3031262159347534, + "step": 808 + }, + { + "epoch": 0.2, + "grad_norm": 4.269414901733398, + "learning_rate": 9.557512158774905e-06, + "logits/chosen": -0.12692226469516754, + "logits/rejected": -0.22237330675125122, + "logps/chosen": -58.290122985839844, + "logps/rejected": -65.59961700439453, + "loss": 0.8804, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.837230682373047, + "rewards/margins": 3.29716420173645, + "rewards/rejected": -0.45993417501449585, + "step": 809 + }, + { + "epoch": 0.2, + "grad_norm": 4.974958419799805, + "learning_rate": 9.55643404796878e-06, + "logits/chosen": -0.12004400789737701, + "logits/rejected": -0.14362305402755737, + "logps/chosen": -50.35207748413086, + "logps/rejected": -71.43190002441406, + "loss": 0.8095, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6857237815856934, + "rewards/margins": 2.8412539958953857, + "rewards/rejected": -0.1555299311876297, + "step": 810 + }, + { + "epoch": 0.2, + "grad_norm": 3.7707462310791016, + "learning_rate": 9.555354686323188e-06, + "logits/chosen": -0.1288047879934311, + "logits/rejected": -0.2404683232307434, + "logps/chosen": -56.937889099121094, + "logps/rejected": -64.04560089111328, + "loss": 0.7628, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5923521518707275, + "rewards/margins": 3.2082114219665527, + "rewards/rejected": -0.6158590316772461, + "step": 811 + }, + { + "epoch": 0.2, + "grad_norm": 5.6026611328125, + "learning_rate": 9.55427407413444e-06, + "logits/chosen": -0.08642090857028961, + "logits/rejected": -0.16978073120117188, + "logps/chosen": -54.64654541015625, + "logps/rejected": -66.72769165039062, + "loss": 0.8929, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6386141777038574, + "rewards/margins": 2.685283660888672, + "rewards/rejected": -0.04666968435049057, + "step": 812 + }, + { + "epoch": 0.2, + "grad_norm": 4.158841609954834, + "learning_rate": 9.553192211699183e-06, + "logits/chosen": -0.06157476082444191, + "logits/rejected": -0.174442321062088, + "logps/chosen": -49.8619384765625, + "logps/rejected": -54.60163116455078, + "loss": 0.8631, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0135960578918457, + "rewards/margins": 2.601684093475342, + "rewards/rejected": 0.4119119942188263, + "step": 813 + }, + { + "epoch": 0.2, + "grad_norm": 4.199512958526611, + "learning_rate": 9.552109099314412e-06, + "logits/chosen": -0.054561011493206024, + "logits/rejected": -0.16105394065380096, + "logps/chosen": -54.27195358276367, + "logps/rejected": -64.31507873535156, + "loss": 0.8255, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6797852516174316, + "rewards/margins": 2.702439785003662, + "rewards/rejected": -0.02265426516532898, + "step": 814 + }, + { + "epoch": 0.2, + "grad_norm": 4.112599849700928, + "learning_rate": 9.55102473727747e-06, + "logits/chosen": -0.15546497702598572, + "logits/rejected": -0.253682941198349, + "logps/chosen": -45.965389251708984, + "logps/rejected": -62.87873840332031, + "loss": 0.7323, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7958219051361084, + "rewards/margins": 3.327976703643799, + "rewards/rejected": -0.5321545600891113, + "step": 815 + }, + { + "epoch": 0.2, + "grad_norm": 3.6938040256500244, + "learning_rate": 9.549939125886033e-06, + "logits/chosen": -0.12061136960983276, + "logits/rejected": -0.14521503448486328, + "logps/chosen": -54.90924835205078, + "logps/rejected": -60.68656921386719, + "loss": 0.8441, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9594905376434326, + "rewards/margins": 2.63897705078125, + "rewards/rejected": 0.320513516664505, + "step": 816 + }, + { + "epoch": 0.2, + "grad_norm": 4.2964935302734375, + "learning_rate": 9.548852265438126e-06, + "logits/chosen": -0.05368511378765106, + "logits/rejected": -0.08333062380552292, + "logps/chosen": -60.43548583984375, + "logps/rejected": -71.72639465332031, + "loss": 0.953, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7272870540618896, + "rewards/margins": 2.109213352203369, + "rewards/rejected": 0.6180738806724548, + "step": 817 + }, + { + "epoch": 0.2, + "grad_norm": 3.0938360691070557, + "learning_rate": 9.547764156232115e-06, + "logits/chosen": 0.09685001522302628, + "logits/rejected": -0.12049286812543869, + "logps/chosen": -61.639495849609375, + "logps/rejected": -56.5821647644043, + "loss": 0.7151, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8343300819396973, + "rewards/margins": 3.0102479457855225, + "rewards/rejected": -0.17591778934001923, + "step": 818 + }, + { + "epoch": 0.2, + "grad_norm": 4.01059627532959, + "learning_rate": 9.546674798566711e-06, + "logits/chosen": -0.10551638901233673, + "logits/rejected": -0.2439221739768982, + "logps/chosen": -65.29782104492188, + "logps/rejected": -61.017982482910156, + "loss": 0.9221, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6559126377105713, + "rewards/margins": 2.666090488433838, + "rewards/rejected": -0.010177940130233765, + "step": 819 + }, + { + "epoch": 0.21, + "grad_norm": 3.7041285037994385, + "learning_rate": 9.545584192740965e-06, + "logits/chosen": -0.030958661809563637, + "logits/rejected": -0.18665215373039246, + "logps/chosen": -56.21635818481445, + "logps/rejected": -52.65877151489258, + "loss": 0.8132, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9340271949768066, + "rewards/margins": 2.5505666732788086, + "rewards/rejected": 0.38346076011657715, + "step": 820 + }, + { + "epoch": 0.21, + "grad_norm": 3.1767351627349854, + "learning_rate": 9.544492339054273e-06, + "logits/chosen": -0.09701203554868698, + "logits/rejected": -0.2574891149997711, + "logps/chosen": -61.819515228271484, + "logps/rejected": -54.99803161621094, + "loss": 0.7995, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.469508647918701, + "rewards/margins": 3.07598614692688, + "rewards/rejected": -0.6064776182174683, + "step": 821 + }, + { + "epoch": 0.21, + "grad_norm": 5.090101718902588, + "learning_rate": 9.54339923780637e-06, + "logits/chosen": -0.11264898627996445, + "logits/rejected": -0.30670565366744995, + "logps/chosen": -67.24663543701172, + "logps/rejected": -50.12194061279297, + "loss": 0.952, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8475658893585205, + "rewards/margins": 2.3778648376464844, + "rewards/rejected": 0.46970105171203613, + "step": 822 + }, + { + "epoch": 0.21, + "grad_norm": 3.9211089611053467, + "learning_rate": 9.542304889297338e-06, + "logits/chosen": -0.03988506644964218, + "logits/rejected": -0.11345527321100235, + "logps/chosen": -55.43803787231445, + "logps/rejected": -83.07383728027344, + "loss": 0.7871, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8715622425079346, + "rewards/margins": 2.857034683227539, + "rewards/rejected": 0.014527276158332825, + "step": 823 + }, + { + "epoch": 0.21, + "grad_norm": 4.385017395019531, + "learning_rate": 9.541209293827599e-06, + "logits/chosen": -0.13143911957740784, + "logits/rejected": -0.2652537524700165, + "logps/chosen": -64.70833587646484, + "logps/rejected": -59.66130828857422, + "loss": 0.9182, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.578988552093506, + "rewards/margins": 2.603806972503662, + "rewards/rejected": -0.024818211793899536, + "step": 824 + }, + { + "epoch": 0.21, + "grad_norm": 4.316320896148682, + "learning_rate": 9.540112451697915e-06, + "logits/chosen": -0.10766241699457169, + "logits/rejected": -0.2864358127117157, + "logps/chosen": -47.68408203125, + "logps/rejected": -56.74497985839844, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.926048755645752, + "rewards/margins": 3.5176801681518555, + "rewards/rejected": -0.5916313529014587, + "step": 825 + }, + { + "epoch": 0.21, + "grad_norm": 4.271144866943359, + "learning_rate": 9.539014363209398e-06, + "logits/chosen": -0.17982713878154755, + "logits/rejected": -0.27079397439956665, + "logps/chosen": -46.29928970336914, + "logps/rejected": -49.69963455200195, + "loss": 0.9491, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7979257106781006, + "rewards/margins": 2.191899299621582, + "rewards/rejected": 0.6060259938240051, + "step": 826 + }, + { + "epoch": 0.21, + "grad_norm": 4.094520092010498, + "learning_rate": 9.537915028663493e-06, + "logits/chosen": -0.11168837547302246, + "logits/rejected": -0.22874638438224792, + "logps/chosen": -54.280540466308594, + "logps/rejected": -62.759952545166016, + "loss": 0.8726, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.524047613143921, + "rewards/margins": 2.3992385864257812, + "rewards/rejected": 0.12480942904949188, + "step": 827 + }, + { + "epoch": 0.21, + "grad_norm": 5.505963325500488, + "learning_rate": 9.536814448361993e-06, + "logits/chosen": -0.11781193315982819, + "logits/rejected": -0.1555149406194687, + "logps/chosen": -54.17803192138672, + "logps/rejected": -77.02925109863281, + "loss": 0.9748, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6453564167022705, + "rewards/margins": 2.588879346847534, + "rewards/rejected": 0.05647691339254379, + "step": 828 + }, + { + "epoch": 0.21, + "grad_norm": 4.0634307861328125, + "learning_rate": 9.53571262260703e-06, + "logits/chosen": -0.12774355709552765, + "logits/rejected": -0.241655170917511, + "logps/chosen": -49.039615631103516, + "logps/rejected": -52.39826202392578, + "loss": 0.9645, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5337324142456055, + "rewards/margins": 2.416110038757324, + "rewards/rejected": 0.11762222647666931, + "step": 829 + }, + { + "epoch": 0.21, + "grad_norm": 3.1992218494415283, + "learning_rate": 9.534609551701078e-06, + "logits/chosen": -0.019338490441441536, + "logits/rejected": -0.1601540893316269, + "logps/chosen": -71.03568267822266, + "logps/rejected": -66.62875366210938, + "loss": 0.8433, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0505101680755615, + "rewards/margins": 3.2916817665100098, + "rewards/rejected": -0.24117140471935272, + "step": 830 + }, + { + "epoch": 0.21, + "grad_norm": 4.901125907897949, + "learning_rate": 9.533505235946956e-06, + "logits/chosen": -0.12425914406776428, + "logits/rejected": -0.21741031110286713, + "logps/chosen": -53.79277420043945, + "logps/rejected": -74.03764343261719, + "loss": 0.8685, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6556360721588135, + "rewards/margins": 2.6213204860687256, + "rewards/rejected": 0.03431575745344162, + "step": 831 + }, + { + "epoch": 0.21, + "grad_norm": 4.762125015258789, + "learning_rate": 9.53239967564782e-06, + "logits/chosen": -0.11156804114580154, + "logits/rejected": -0.18831749260425568, + "logps/chosen": -57.002655029296875, + "logps/rejected": -72.2470932006836, + "loss": 0.8767, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8059351444244385, + "rewards/margins": 2.984168291091919, + "rewards/rejected": -0.17823311686515808, + "step": 832 + }, + { + "epoch": 0.21, + "grad_norm": 6.0574259757995605, + "learning_rate": 9.531292871107173e-06, + "logits/chosen": -0.047444045543670654, + "logits/rejected": -0.16510048508644104, + "logps/chosen": -78.9583740234375, + "logps/rejected": -68.78851318359375, + "loss": 1.0931, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.554792881011963, + "rewards/margins": 1.856469750404358, + "rewards/rejected": 0.6983230113983154, + "step": 833 + }, + { + "epoch": 0.21, + "grad_norm": 4.042847156524658, + "learning_rate": 9.530184822628854e-06, + "logits/chosen": -0.09346379339694977, + "logits/rejected": -0.20797279477119446, + "logps/chosen": -87.72020721435547, + "logps/rejected": -60.49940872192383, + "loss": 0.8422, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.000243902206421, + "rewards/margins": 3.157219886779785, + "rewards/rejected": -0.15697629749774933, + "step": 834 + }, + { + "epoch": 0.21, + "grad_norm": 5.129181861877441, + "learning_rate": 9.529075530517048e-06, + "logits/chosen": -0.07507264614105225, + "logits/rejected": -0.10938471555709839, + "logps/chosen": -65.5983657836914, + "logps/rejected": -95.78341674804688, + "loss": 0.927, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7737948894500732, + "rewards/margins": 3.082940101623535, + "rewards/rejected": -0.30914515256881714, + "step": 835 + }, + { + "epoch": 0.21, + "grad_norm": 3.20560359954834, + "learning_rate": 9.527964995076279e-06, + "logits/chosen": -0.11601287871599197, + "logits/rejected": -0.3003634214401245, + "logps/chosen": -56.14046096801758, + "logps/rejected": -60.541290283203125, + "loss": 0.807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.964890480041504, + "rewards/margins": 3.653435230255127, + "rewards/rejected": -0.6885444521903992, + "step": 836 + }, + { + "epoch": 0.21, + "grad_norm": 3.128110885620117, + "learning_rate": 9.526853216611415e-06, + "logits/chosen": -0.066445492208004, + "logits/rejected": -0.1414981186389923, + "logps/chosen": -58.16755294799805, + "logps/rejected": -77.25418853759766, + "loss": 0.7121, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.929192304611206, + "rewards/margins": 3.2139110565185547, + "rewards/rejected": -0.2847188413143158, + "step": 837 + }, + { + "epoch": 0.21, + "grad_norm": 5.208739280700684, + "learning_rate": 9.525740195427659e-06, + "logits/chosen": -0.07623268663883209, + "logits/rejected": -0.15542621910572052, + "logps/chosen": -54.72138214111328, + "logps/rejected": -78.74971771240234, + "loss": 0.8943, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6982052326202393, + "rewards/margins": 2.4940099716186523, + "rewards/rejected": 0.20419549942016602, + "step": 838 + }, + { + "epoch": 0.21, + "grad_norm": 4.1045708656311035, + "learning_rate": 9.524625931830563e-06, + "logits/chosen": -0.03973114863038063, + "logits/rejected": -0.20702624320983887, + "logps/chosen": -60.76568603515625, + "logps/rejected": -63.780189514160156, + "loss": 0.8341, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6912808418273926, + "rewards/margins": 2.592806100845337, + "rewards/rejected": 0.09847494214773178, + "step": 839 + }, + { + "epoch": 0.21, + "grad_norm": 4.273802280426025, + "learning_rate": 9.523510426126015e-06, + "logits/chosen": -0.10371606796979904, + "logits/rejected": -0.15078404545783997, + "logps/chosen": -58.46562576293945, + "logps/rejected": -71.86378479003906, + "loss": 1.0511, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5378644466400146, + "rewards/margins": 1.9581347703933716, + "rewards/rejected": 0.5797295570373535, + "step": 840 + }, + { + "epoch": 0.21, + "grad_norm": 2.4170875549316406, + "learning_rate": 9.522393678620244e-06, + "logits/chosen": -0.04078872501850128, + "logits/rejected": -0.20229411125183105, + "logps/chosen": -57.60700607299805, + "logps/rejected": -60.957218170166016, + "loss": 0.7237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7245776653289795, + "rewards/margins": 3.17966365814209, + "rewards/rejected": -0.4550859332084656, + "step": 841 + }, + { + "epoch": 0.21, + "grad_norm": 3.653186559677124, + "learning_rate": 9.521275689619824e-06, + "logits/chosen": 0.025866877287626266, + "logits/rejected": -0.06575752049684525, + "logps/chosen": -63.417633056640625, + "logps/rejected": -83.63468170166016, + "loss": 0.85, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1661863327026367, + "rewards/margins": 2.883453369140625, + "rewards/rejected": 0.28273290395736694, + "step": 842 + }, + { + "epoch": 0.21, + "grad_norm": 5.095499038696289, + "learning_rate": 9.520156459431664e-06, + "logits/chosen": -0.09712602198123932, + "logits/rejected": -0.20895171165466309, + "logps/chosen": -62.6209716796875, + "logps/rejected": -61.37042236328125, + "loss": 1.0052, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.593045949935913, + "rewards/margins": 2.1198997497558594, + "rewards/rejected": 0.47314611077308655, + "step": 843 + }, + { + "epoch": 0.21, + "grad_norm": 4.411657810211182, + "learning_rate": 9.519035988363021e-06, + "logits/chosen": -0.09104596078395844, + "logits/rejected": -0.2505369782447815, + "logps/chosen": -55.999061584472656, + "logps/rejected": -62.15446472167969, + "loss": 0.7954, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7842140197753906, + "rewards/margins": 3.4756503105163574, + "rewards/rejected": -0.6914365291595459, + "step": 844 + }, + { + "epoch": 0.21, + "grad_norm": 3.870941638946533, + "learning_rate": 9.517914276721485e-06, + "logits/chosen": -0.10192448645830154, + "logits/rejected": -0.2364819049835205, + "logps/chosen": -50.369503021240234, + "logps/rejected": -63.996253967285156, + "loss": 0.8319, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5918614864349365, + "rewards/margins": 2.4098782539367676, + "rewards/rejected": 0.1819833368062973, + "step": 845 + }, + { + "epoch": 0.21, + "grad_norm": 5.036740303039551, + "learning_rate": 9.516791324814991e-06, + "logits/chosen": -0.002039228565990925, + "logits/rejected": -0.12952573597431183, + "logps/chosen": -71.23160552978516, + "logps/rejected": -70.19792175292969, + "loss": 0.9897, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.478069305419922, + "rewards/margins": 2.0546579360961914, + "rewards/rejected": 0.42341122031211853, + "step": 846 + }, + { + "epoch": 0.21, + "grad_norm": 3.22011399269104, + "learning_rate": 9.515667132951813e-06, + "logits/chosen": -0.09208875894546509, + "logits/rejected": -0.17954720556735992, + "logps/chosen": -57.488197326660156, + "logps/rejected": -65.81925964355469, + "loss": 0.8148, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.24998140335083, + "rewards/margins": 2.8041529655456543, + "rewards/rejected": 0.4458279609680176, + "step": 847 + }, + { + "epoch": 0.21, + "grad_norm": 5.853875637054443, + "learning_rate": 9.514541701440568e-06, + "logits/chosen": -0.0849994644522667, + "logits/rejected": -0.1740894317626953, + "logps/chosen": -53.238136291503906, + "logps/rejected": -66.07085418701172, + "loss": 1.047, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.455477476119995, + "rewards/margins": 2.1828198432922363, + "rewards/rejected": 0.2726574242115021, + "step": 848 + }, + { + "epoch": 0.21, + "grad_norm": 6.165475845336914, + "learning_rate": 9.513415030590209e-06, + "logits/chosen": -0.10373485833406448, + "logits/rejected": -0.16525594890117645, + "logps/chosen": -58.830875396728516, + "logps/rejected": -75.40181732177734, + "loss": 0.8633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.994927167892456, + "rewards/margins": 3.0978286266326904, + "rewards/rejected": -0.10290142148733139, + "step": 849 + }, + { + "epoch": 0.21, + "grad_norm": 4.755518436431885, + "learning_rate": 9.512287120710032e-06, + "logits/chosen": -0.06086049973964691, + "logits/rejected": -0.15777617692947388, + "logps/chosen": -62.307064056396484, + "logps/rejected": -69.5114974975586, + "loss": 1.0254, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.872095823287964, + "rewards/margins": 2.6493656635284424, + "rewards/rejected": 0.2227303832769394, + "step": 850 + }, + { + "epoch": 0.21, + "grad_norm": 4.1921210289001465, + "learning_rate": 9.511157972109673e-06, + "logits/chosen": -0.0021516885608434677, + "logits/rejected": -0.1469825655221939, + "logps/chosen": -66.17603302001953, + "logps/rejected": -71.68692016601562, + "loss": 0.9965, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9124341011047363, + "rewards/margins": 1.968572735786438, + "rewards/rejected": 0.9438613653182983, + "step": 851 + }, + { + "epoch": 0.21, + "grad_norm": 3.5423269271850586, + "learning_rate": 9.510027585099107e-06, + "logits/chosen": -0.13592374324798584, + "logits/rejected": -0.23954784870147705, + "logps/chosen": -59.24249267578125, + "logps/rejected": -59.72222900390625, + "loss": 0.8459, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9935152530670166, + "rewards/margins": 2.9565343856811523, + "rewards/rejected": 0.03698053956031799, + "step": 852 + }, + { + "epoch": 0.21, + "grad_norm": 2.726154088973999, + "learning_rate": 9.508895959988651e-06, + "logits/chosen": -0.1499311923980713, + "logits/rejected": -0.22247274219989777, + "logps/chosen": -45.881874084472656, + "logps/rejected": -71.25878143310547, + "loss": 0.7199, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.032409429550171, + "rewards/margins": 3.811009168624878, + "rewards/rejected": -0.7785994410514832, + "step": 853 + }, + { + "epoch": 0.21, + "grad_norm": 4.423026084899902, + "learning_rate": 9.50776309708896e-06, + "logits/chosen": -0.0853017196059227, + "logits/rejected": -0.18472081422805786, + "logps/chosen": -69.95516967773438, + "logps/rejected": -72.55436706542969, + "loss": 1.0225, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7734339237213135, + "rewards/margins": 2.2596075534820557, + "rewards/rejected": 0.5138262510299683, + "step": 854 + }, + { + "epoch": 0.21, + "grad_norm": 5.103067874908447, + "learning_rate": 9.50662899671103e-06, + "logits/chosen": -0.09800032526254654, + "logits/rejected": -0.25565147399902344, + "logps/chosen": -53.343772888183594, + "logps/rejected": -61.43370819091797, + "loss": 0.8022, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0886638164520264, + "rewards/margins": 3.1553871631622314, + "rewards/rejected": -0.06672342866659164, + "step": 855 + }, + { + "epoch": 0.21, + "grad_norm": 3.6611576080322266, + "learning_rate": 9.505493659166193e-06, + "logits/chosen": -0.026520265266299248, + "logits/rejected": -0.19485817849636078, + "logps/chosen": -74.0309066772461, + "logps/rejected": -62.28173065185547, + "loss": 0.877, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.87564754486084, + "rewards/margins": 2.8885061740875244, + "rewards/rejected": -0.01285870373249054, + "step": 856 + }, + { + "epoch": 0.21, + "grad_norm": 3.597424030303955, + "learning_rate": 9.504357084766127e-06, + "logits/chosen": -0.029613934457302094, + "logits/rejected": -0.03906534984707832, + "logps/chosen": -52.561180114746094, + "logps/rejected": -77.53363037109375, + "loss": 0.8381, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8685317039489746, + "rewards/margins": 2.699533224105835, + "rewards/rejected": 0.16899865865707397, + "step": 857 + }, + { + "epoch": 0.21, + "grad_norm": 3.7829487323760986, + "learning_rate": 9.503219273822844e-06, + "logits/chosen": -0.13076458871364594, + "logits/rejected": -0.17895564436912537, + "logps/chosen": -48.63140869140625, + "logps/rejected": -64.42809295654297, + "loss": 0.879, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8547630310058594, + "rewards/margins": 2.4246044158935547, + "rewards/rejected": 0.4301586151123047, + "step": 858 + }, + { + "epoch": 0.21, + "grad_norm": 4.177932262420654, + "learning_rate": 9.502080226648699e-06, + "logits/chosen": -0.11246620863676071, + "logits/rejected": -0.19218279421329498, + "logps/chosen": -59.29332733154297, + "logps/rejected": -75.83860778808594, + "loss": 0.814, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.973498821258545, + "rewards/margins": 2.553429365158081, + "rewards/rejected": 0.4200693964958191, + "step": 859 + }, + { + "epoch": 0.22, + "grad_norm": 3.762617826461792, + "learning_rate": 9.500939943556383e-06, + "logits/chosen": -0.08212887495756149, + "logits/rejected": -0.1982020139694214, + "logps/chosen": -53.83660888671875, + "logps/rejected": -63.53532409667969, + "loss": 0.7856, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7988715171813965, + "rewards/margins": 2.9109225273132324, + "rewards/rejected": -0.1120506227016449, + "step": 860 + }, + { + "epoch": 0.22, + "grad_norm": 3.644911766052246, + "learning_rate": 9.49979842485893e-06, + "logits/chosen": -0.047273438423871994, + "logits/rejected": -0.15880250930786133, + "logps/chosen": -52.68492889404297, + "logps/rejected": -78.69353485107422, + "loss": 0.7011, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.728677988052368, + "rewards/margins": 3.6849067211151123, + "rewards/rejected": -0.9562287330627441, + "step": 861 + }, + { + "epoch": 0.22, + "grad_norm": 3.468583345413208, + "learning_rate": 9.498655670869715e-06, + "logits/chosen": -0.06675925850868225, + "logits/rejected": -0.16684377193450928, + "logps/chosen": -54.27362823486328, + "logps/rejected": -70.26439666748047, + "loss": 0.8783, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.835500717163086, + "rewards/margins": 2.4498579502105713, + "rewards/rejected": 0.3856427073478699, + "step": 862 + }, + { + "epoch": 0.22, + "grad_norm": 4.297532081604004, + "learning_rate": 9.497511681902441e-06, + "logits/chosen": -0.14872786402702332, + "logits/rejected": -0.24628286063671112, + "logps/chosen": -65.14231872558594, + "logps/rejected": -67.46005249023438, + "loss": 1.051, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.452328681945801, + "rewards/margins": 2.945298194885254, + "rewards/rejected": -0.4929697513580322, + "step": 863 + }, + { + "epoch": 0.22, + "grad_norm": 3.6577601432800293, + "learning_rate": 9.496366458271165e-06, + "logits/chosen": -0.14620918035507202, + "logits/rejected": -0.26309382915496826, + "logps/chosen": -56.88295364379883, + "logps/rejected": -70.9520263671875, + "loss": 0.8111, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8176069259643555, + "rewards/margins": 2.958987236022949, + "rewards/rejected": -0.14138060808181763, + "step": 864 + }, + { + "epoch": 0.22, + "grad_norm": 4.896832466125488, + "learning_rate": 9.49522000029027e-06, + "logits/chosen": -0.030031681060791016, + "logits/rejected": -0.15926848351955414, + "logps/chosen": -58.89545822143555, + "logps/rejected": -66.4287109375, + "loss": 0.89, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.709251880645752, + "rewards/margins": 2.6153247356414795, + "rewards/rejected": 0.09392683953046799, + "step": 865 + }, + { + "epoch": 0.22, + "grad_norm": 5.939055442810059, + "learning_rate": 9.494072308274486e-06, + "logits/chosen": -0.028473645448684692, + "logits/rejected": -0.1564640998840332, + "logps/chosen": -72.62440490722656, + "logps/rejected": -69.62053680419922, + "loss": 1.0727, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5582053661346436, + "rewards/margins": 2.4257829189300537, + "rewards/rejected": 0.1324222981929779, + "step": 866 + }, + { + "epoch": 0.22, + "grad_norm": 6.34014892578125, + "learning_rate": 9.49292338253888e-06, + "logits/chosen": -0.1937168538570404, + "logits/rejected": -0.21953344345092773, + "logps/chosen": -56.55181884765625, + "logps/rejected": -64.18070220947266, + "loss": 0.8795, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6623342037200928, + "rewards/margins": 2.5568161010742188, + "rewards/rejected": 0.10551835596561432, + "step": 867 + }, + { + "epoch": 0.22, + "grad_norm": 3.921485424041748, + "learning_rate": 9.491773223398855e-06, + "logits/chosen": -0.13661542534828186, + "logits/rejected": -0.26727014780044556, + "logps/chosen": -59.57023620605469, + "logps/rejected": -64.01913452148438, + "loss": 0.8859, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6049325466156006, + "rewards/margins": 3.0114946365356445, + "rewards/rejected": -0.40656203031539917, + "step": 868 + }, + { + "epoch": 0.22, + "grad_norm": 4.936825275421143, + "learning_rate": 9.490621831170154e-06, + "logits/chosen": -0.136418417096138, + "logits/rejected": -0.17854225635528564, + "logps/chosen": -53.70631408691406, + "logps/rejected": -70.28898620605469, + "loss": 1.0156, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.9381065368652344, + "rewards/margins": 2.3635411262512207, + "rewards/rejected": 0.5745653510093689, + "step": 869 + }, + { + "epoch": 0.22, + "grad_norm": 3.995129346847534, + "learning_rate": 9.489469206168863e-06, + "logits/chosen": -0.09150892496109009, + "logits/rejected": -0.2200615108013153, + "logps/chosen": -59.11113357543945, + "logps/rejected": -63.784515380859375, + "loss": 0.8475, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.930673599243164, + "rewards/margins": 3.1049752235412598, + "rewards/rejected": -0.17430144548416138, + "step": 870 + }, + { + "epoch": 0.22, + "grad_norm": 5.3535332679748535, + "learning_rate": 9.488315348711397e-06, + "logits/chosen": -0.04878713935613632, + "logits/rejected": -0.17682957649230957, + "logps/chosen": -64.50135803222656, + "logps/rejected": -56.902679443359375, + "loss": 1.1028, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.728297233581543, + "rewards/margins": 2.127758264541626, + "rewards/rejected": 0.6005388498306274, + "step": 871 + }, + { + "epoch": 0.22, + "grad_norm": 4.938320159912109, + "learning_rate": 9.487160259114521e-06, + "logits/chosen": -0.10015231370925903, + "logits/rejected": -0.1859675496816635, + "logps/chosen": -59.09043502807617, + "logps/rejected": -69.29811096191406, + "loss": 0.8441, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020106315612793, + "rewards/margins": 3.1825990676879883, + "rewards/rejected": -0.16249263286590576, + "step": 872 + }, + { + "epoch": 0.22, + "grad_norm": 2.655236005783081, + "learning_rate": 9.486003937695326e-06, + "logits/chosen": -0.16236616671085358, + "logits/rejected": -0.29886484146118164, + "logps/chosen": -45.00782012939453, + "logps/rejected": -59.342586517333984, + "loss": 0.6993, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.799290657043457, + "rewards/margins": 3.4351751804351807, + "rewards/rejected": -0.6358844041824341, + "step": 873 + }, + { + "epoch": 0.22, + "grad_norm": 3.655707359313965, + "learning_rate": 9.48484638477125e-06, + "logits/chosen": -0.09423147141933441, + "logits/rejected": -0.115958571434021, + "logps/chosen": -46.18817901611328, + "logps/rejected": -71.89186096191406, + "loss": 0.7871, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.905526876449585, + "rewards/margins": 2.587012767791748, + "rewards/rejected": 0.3185141086578369, + "step": 874 + }, + { + "epoch": 0.22, + "grad_norm": 2.6761491298675537, + "learning_rate": 9.483687600660067e-06, + "logits/chosen": -0.04948540776968002, + "logits/rejected": -0.16852539777755737, + "logps/chosen": -57.00883865356445, + "logps/rejected": -73.85551452636719, + "loss": 0.7041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7495181560516357, + "rewards/margins": 3.640089273452759, + "rewards/rejected": -0.890570878982544, + "step": 875 + }, + { + "epoch": 0.22, + "grad_norm": 4.698198318481445, + "learning_rate": 9.482527585679886e-06, + "logits/chosen": -0.020618624985218048, + "logits/rejected": -0.14495329558849335, + "logps/chosen": -76.33893585205078, + "logps/rejected": -68.81216430664062, + "loss": 1.0917, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.662191867828369, + "rewards/margins": 2.0014994144439697, + "rewards/rejected": 0.6606924533843994, + "step": 876 + }, + { + "epoch": 0.22, + "grad_norm": 4.289124011993408, + "learning_rate": 9.481366340149159e-06, + "logits/chosen": 0.0053137075155973434, + "logits/rejected": -0.14768332242965698, + "logps/chosen": -63.3625373840332, + "logps/rejected": -60.58271789550781, + "loss": 0.9076, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.759721279144287, + "rewards/margins": 2.8231680393218994, + "rewards/rejected": -0.06344693899154663, + "step": 877 + }, + { + "epoch": 0.22, + "grad_norm": 4.462804317474365, + "learning_rate": 9.48020386438667e-06, + "logits/chosen": -0.06217677518725395, + "logits/rejected": -0.18134571611881256, + "logps/chosen": -58.32859420776367, + "logps/rejected": -78.1377182006836, + "loss": 0.8804, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8584089279174805, + "rewards/margins": 2.8859148025512695, + "rewards/rejected": -0.027505964040756226, + "step": 878 + }, + { + "epoch": 0.22, + "grad_norm": 5.091037750244141, + "learning_rate": 9.479040158711546e-06, + "logits/chosen": -0.026874393224716187, + "logits/rejected": -0.11861114203929901, + "logps/chosen": -61.264766693115234, + "logps/rejected": -68.67955017089844, + "loss": 0.9657, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5990142822265625, + "rewards/margins": 2.3769454956054688, + "rewards/rejected": 0.22206857800483704, + "step": 879 + }, + { + "epoch": 0.22, + "grad_norm": 4.6762847900390625, + "learning_rate": 9.477875223443249e-06, + "logits/chosen": -0.010445192456245422, + "logits/rejected": -0.22682076692581177, + "logps/chosen": -65.84745025634766, + "logps/rejected": -56.738975524902344, + "loss": 0.8465, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5086002349853516, + "rewards/margins": 2.7175967693328857, + "rewards/rejected": -0.20899632573127747, + "step": 880 + }, + { + "epoch": 0.22, + "grad_norm": 6.96931791305542, + "learning_rate": 9.476709058901577e-06, + "logits/chosen": -0.08809873461723328, + "logits/rejected": -0.20674589276313782, + "logps/chosen": -57.769962310791016, + "logps/rejected": -62.21562194824219, + "loss": 0.9405, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.13470721244812, + "rewards/margins": 2.6716742515563965, + "rewards/rejected": -0.5369669795036316, + "step": 881 + }, + { + "epoch": 0.22, + "grad_norm": 2.237816572189331, + "learning_rate": 9.475541665406669e-06, + "logits/chosen": -0.06147167459130287, + "logits/rejected": -0.18098700046539307, + "logps/chosen": -55.09095001220703, + "logps/rejected": -64.23561096191406, + "loss": 0.692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.864032030105591, + "rewards/margins": 3.6803982257843018, + "rewards/rejected": -0.8163662552833557, + "step": 882 + }, + { + "epoch": 0.22, + "grad_norm": 4.774936676025391, + "learning_rate": 9.474373043278997e-06, + "logits/chosen": -0.18912693858146667, + "logits/rejected": -0.3249594569206238, + "logps/chosen": -52.29981231689453, + "logps/rejected": -63.29134750366211, + "loss": 0.9302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.657196521759033, + "rewards/margins": 3.087738275527954, + "rewards/rejected": -0.4305420517921448, + "step": 883 + }, + { + "epoch": 0.22, + "grad_norm": 2.652989625930786, + "learning_rate": 9.473203192839379e-06, + "logits/chosen": -0.08797810971736908, + "logits/rejected": -0.17364531755447388, + "logps/chosen": -45.59590148925781, + "logps/rejected": -70.23059844970703, + "loss": 0.7498, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7889657020568848, + "rewards/margins": 3.4011425971984863, + "rewards/rejected": -0.6121768951416016, + "step": 884 + }, + { + "epoch": 0.22, + "grad_norm": 7.992712497711182, + "learning_rate": 9.472032114408958e-06, + "logits/chosen": 0.02400135062634945, + "logits/rejected": -0.08575595915317535, + "logps/chosen": -68.09989929199219, + "logps/rejected": -74.20880126953125, + "loss": 0.8966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.750217914581299, + "rewards/margins": 3.0373635292053223, + "rewards/rejected": -0.28714579343795776, + "step": 885 + }, + { + "epoch": 0.22, + "grad_norm": 5.721590042114258, + "learning_rate": 9.470859808309224e-06, + "logits/chosen": -0.07438457012176514, + "logits/rejected": -0.23822522163391113, + "logps/chosen": -57.24750518798828, + "logps/rejected": -61.08980941772461, + "loss": 0.9045, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6971969604492188, + "rewards/margins": 2.6125400066375732, + "rewards/rejected": 0.08465681970119476, + "step": 886 + }, + { + "epoch": 0.22, + "grad_norm": 3.1730756759643555, + "learning_rate": 9.469686274861998e-06, + "logits/chosen": -0.04609205573797226, + "logits/rejected": -0.1348705142736435, + "logps/chosen": -53.124332427978516, + "logps/rejected": -64.16527557373047, + "loss": 0.8155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9084272384643555, + "rewards/margins": 2.733055591583252, + "rewards/rejected": 0.17537149786949158, + "step": 887 + }, + { + "epoch": 0.22, + "grad_norm": 4.437376976013184, + "learning_rate": 9.468511514389442e-06, + "logits/chosen": -0.07298780232667923, + "logits/rejected": -0.12996673583984375, + "logps/chosen": -64.42572784423828, + "logps/rejected": -67.71726989746094, + "loss": 0.9286, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.087984085083008, + "rewards/margins": 2.4432055950164795, + "rewards/rejected": 0.6447786688804626, + "step": 888 + }, + { + "epoch": 0.22, + "grad_norm": 3.067957878112793, + "learning_rate": 9.467335527214054e-06, + "logits/chosen": -0.06295716762542725, + "logits/rejected": -0.14644436538219452, + "logps/chosen": -58.69305419921875, + "logps/rejected": -61.64311599731445, + "loss": 0.8515, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.083449125289917, + "rewards/margins": 2.906956672668457, + "rewards/rejected": 0.17649251222610474, + "step": 889 + }, + { + "epoch": 0.22, + "grad_norm": 4.1403985023498535, + "learning_rate": 9.466158313658665e-06, + "logits/chosen": -0.01304561085999012, + "logits/rejected": -0.12523624300956726, + "logps/chosen": -58.24693298339844, + "logps/rejected": -65.62474822998047, + "loss": 0.867, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.760915994644165, + "rewards/margins": 2.159708261489868, + "rewards/rejected": 0.6012077331542969, + "step": 890 + }, + { + "epoch": 0.22, + "grad_norm": 3.8799355030059814, + "learning_rate": 9.464979874046445e-06, + "logits/chosen": -0.12277419865131378, + "logits/rejected": -0.2081236094236374, + "logps/chosen": -50.02997970581055, + "logps/rejected": -73.12667846679688, + "loss": 0.7555, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7724196910858154, + "rewards/margins": 2.920103073120117, + "rewards/rejected": -0.14768345654010773, + "step": 891 + }, + { + "epoch": 0.22, + "grad_norm": 3.6161484718322754, + "learning_rate": 9.463800208700904e-06, + "logits/chosen": -0.05404636263847351, + "logits/rejected": -0.1872420459985733, + "logps/chosen": -53.05302429199219, + "logps/rejected": -68.21022033691406, + "loss": 0.7275, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6163277626037598, + "rewards/margins": 3.2425434589385986, + "rewards/rejected": -0.626215398311615, + "step": 892 + }, + { + "epoch": 0.22, + "grad_norm": 3.8477087020874023, + "learning_rate": 9.462619317945887e-06, + "logits/chosen": -0.12270346283912659, + "logits/rejected": -0.2706815302371979, + "logps/chosen": -53.15671157836914, + "logps/rejected": -67.21377563476562, + "loss": 0.8728, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.63731050491333, + "rewards/margins": 3.215902805328369, + "rewards/rejected": -0.5785927176475525, + "step": 893 + }, + { + "epoch": 0.22, + "grad_norm": 4.222252368927002, + "learning_rate": 9.46143720210557e-06, + "logits/chosen": -0.13852082192897797, + "logits/rejected": -0.268268883228302, + "logps/chosen": -57.38623809814453, + "logps/rejected": -60.49374771118164, + "loss": 1.0473, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.724224328994751, + "rewards/margins": 2.441575050354004, + "rewards/rejected": 0.2826489806175232, + "step": 894 + }, + { + "epoch": 0.22, + "grad_norm": 4.023367404937744, + "learning_rate": 9.46025386150447e-06, + "logits/chosen": -0.0660674199461937, + "logits/rejected": -0.13661278784275055, + "logps/chosen": -55.949462890625, + "logps/rejected": -69.89897155761719, + "loss": 0.8575, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.791351556777954, + "rewards/margins": 2.734201192855835, + "rewards/rejected": 0.057150453329086304, + "step": 895 + }, + { + "epoch": 0.22, + "grad_norm": 4.874499797821045, + "learning_rate": 9.45906929646744e-06, + "logits/chosen": -0.1303483247756958, + "logits/rejected": -0.21682780981063843, + "logps/chosen": -59.20553970336914, + "logps/rejected": -76.08499908447266, + "loss": 1.066, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3940796852111816, + "rewards/margins": 2.1740126609802246, + "rewards/rejected": 0.22006718814373016, + "step": 896 + }, + { + "epoch": 0.22, + "grad_norm": 3.288654327392578, + "learning_rate": 9.45788350731967e-06, + "logits/chosen": -0.034953683614730835, + "logits/rejected": -0.23521125316619873, + "logps/chosen": -70.19593811035156, + "logps/rejected": -54.91870880126953, + "loss": 0.7791, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9721932411193848, + "rewards/margins": 3.7014408111572266, + "rewards/rejected": -0.7292474508285522, + "step": 897 + }, + { + "epoch": 0.22, + "grad_norm": 7.139138698577881, + "learning_rate": 9.456696494386683e-06, + "logits/chosen": -0.028689006343483925, + "logits/rejected": -0.22012433409690857, + "logps/chosen": -68.41622924804688, + "logps/rejected": -64.09609985351562, + "loss": 0.8769, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.477889060974121, + "rewards/margins": 3.233893871307373, + "rewards/rejected": -0.7560046315193176, + "step": 898 + }, + { + "epoch": 0.22, + "grad_norm": 3.2142691612243652, + "learning_rate": 9.45550825799434e-06, + "logits/chosen": -0.052112385630607605, + "logits/rejected": -0.12893110513687134, + "logps/chosen": -58.482845306396484, + "logps/rejected": -76.43751525878906, + "loss": 0.7795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.169966697692871, + "rewards/margins": 3.425248384475708, + "rewards/rejected": -0.2552812099456787, + "step": 899 + }, + { + "epoch": 0.23, + "grad_norm": 5.7222900390625, + "learning_rate": 9.454318798468838e-06, + "logits/chosen": -0.10743454098701477, + "logits/rejected": -0.15471020340919495, + "logps/chosen": -45.30830001831055, + "logps/rejected": -58.87901306152344, + "loss": 0.9232, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8361005783081055, + "rewards/margins": 2.405270576477051, + "rewards/rejected": 0.430829793214798, + "step": 900 + }, + { + "epoch": 0.23, + "grad_norm": 3.5963032245635986, + "learning_rate": 9.453128116136709e-06, + "logits/chosen": -0.09341587126255035, + "logits/rejected": -0.1475905179977417, + "logps/chosen": -59.23544692993164, + "logps/rejected": -61.5289192199707, + "loss": 0.9045, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8644750118255615, + "rewards/margins": 2.407257556915283, + "rewards/rejected": 0.45721763372421265, + "step": 901 + }, + { + "epoch": 0.23, + "grad_norm": 3.9897091388702393, + "learning_rate": 9.451936211324824e-06, + "logits/chosen": -0.07548868656158447, + "logits/rejected": -0.20869080722332, + "logps/chosen": -50.78364562988281, + "logps/rejected": -58.37653350830078, + "loss": 0.8036, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.879101037979126, + "rewards/margins": 3.456101894378662, + "rewards/rejected": -0.57700115442276, + "step": 902 + }, + { + "epoch": 0.23, + "grad_norm": 3.217609167098999, + "learning_rate": 9.45074308436038e-06, + "logits/chosen": 0.020834948867559433, + "logits/rejected": -0.08786745369434357, + "logps/chosen": -55.55042266845703, + "logps/rejected": -72.68620300292969, + "loss": 0.7664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9129319190979004, + "rewards/margins": 3.173238515853882, + "rewards/rejected": -0.26030656695365906, + "step": 903 + }, + { + "epoch": 0.23, + "grad_norm": 4.4518656730651855, + "learning_rate": 9.449548735570922e-06, + "logits/chosen": -0.10189420729875565, + "logits/rejected": -0.2646571397781372, + "logps/chosen": -57.99440002441406, + "logps/rejected": -60.849815368652344, + "loss": 1.101, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7203962802886963, + "rewards/margins": 2.616880416870117, + "rewards/rejected": 0.10351593047380447, + "step": 904 + }, + { + "epoch": 0.23, + "grad_norm": 3.3456027507781982, + "learning_rate": 9.448353165284323e-06, + "logits/chosen": -0.10021933913230896, + "logits/rejected": -0.20819927752017975, + "logps/chosen": -55.91731262207031, + "logps/rejected": -65.25873565673828, + "loss": 0.8659, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8377115726470947, + "rewards/margins": 2.629340410232544, + "rewards/rejected": 0.20837122201919556, + "step": 905 + }, + { + "epoch": 0.23, + "grad_norm": 2.55474591255188, + "learning_rate": 9.44715637382879e-06, + "logits/chosen": -0.06405581533908844, + "logits/rejected": -0.27097973227500916, + "logps/chosen": -53.683692932128906, + "logps/rejected": -60.26856231689453, + "loss": 0.7545, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.896599292755127, + "rewards/margins": 3.6536335945129395, + "rewards/rejected": -0.7570343017578125, + "step": 906 + }, + { + "epoch": 0.23, + "grad_norm": 3.887030839920044, + "learning_rate": 9.445958361532872e-06, + "logits/chosen": -0.07974585890769958, + "logits/rejected": -0.11314389854669571, + "logps/chosen": -60.4880485534668, + "logps/rejected": -72.1537857055664, + "loss": 0.9169, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7794981002807617, + "rewards/margins": 2.1478822231292725, + "rewards/rejected": 0.6316159963607788, + "step": 907 + }, + { + "epoch": 0.23, + "grad_norm": 5.336711406707764, + "learning_rate": 9.444759128725446e-06, + "logits/chosen": -0.09044206142425537, + "logits/rejected": -0.21028034389019012, + "logps/chosen": -69.6058349609375, + "logps/rejected": -75.25495910644531, + "loss": 1.0621, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.25191068649292, + "rewards/margins": 2.163313388824463, + "rewards/rejected": 0.08859734982252121, + "step": 908 + }, + { + "epoch": 0.23, + "grad_norm": 5.170251846313477, + "learning_rate": 9.443558675735732e-06, + "logits/chosen": -0.09741470217704773, + "logits/rejected": -0.12175515294075012, + "logps/chosen": -56.889522552490234, + "logps/rejected": -81.00606536865234, + "loss": 0.9055, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8470075130462646, + "rewards/margins": 2.067028760910034, + "rewards/rejected": 0.7799788117408752, + "step": 909 + }, + { + "epoch": 0.23, + "grad_norm": 3.1364433765411377, + "learning_rate": 9.442357002893275e-06, + "logits/chosen": -0.06578796356916428, + "logits/rejected": -0.1677219420671463, + "logps/chosen": -53.10167694091797, + "logps/rejected": -66.80744934082031, + "loss": 0.7418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.894503355026245, + "rewards/margins": 2.929401397705078, + "rewards/rejected": -0.03489828109741211, + "step": 910 + }, + { + "epoch": 0.23, + "grad_norm": 4.5987067222595215, + "learning_rate": 9.441154110527962e-06, + "logits/chosen": -0.08253481239080429, + "logits/rejected": -0.26163819432258606, + "logps/chosen": -62.29685592651367, + "logps/rejected": -62.49497985839844, + "loss": 0.9735, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.4215824604034424, + "rewards/margins": 2.881235122680664, + "rewards/rejected": -0.459652841091156, + "step": 911 + }, + { + "epoch": 0.23, + "grad_norm": 3.5920729637145996, + "learning_rate": 9.439949998970012e-06, + "logits/chosen": 0.006405770778656006, + "logits/rejected": -0.12536364793777466, + "logps/chosen": -67.26350402832031, + "logps/rejected": -76.13416290283203, + "loss": 0.8845, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.825441837310791, + "rewards/margins": 2.5098671913146973, + "rewards/rejected": 0.3155747056007385, + "step": 912 + }, + { + "epoch": 0.23, + "grad_norm": 4.024339199066162, + "learning_rate": 9.438744668549983e-06, + "logits/chosen": -0.09471042454242706, + "logits/rejected": -0.1986124962568283, + "logps/chosen": -69.69586181640625, + "logps/rejected": -67.86408996582031, + "loss": 0.981, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6825318336486816, + "rewards/margins": 2.2758445739746094, + "rewards/rejected": 0.4066874384880066, + "step": 913 + }, + { + "epoch": 0.23, + "grad_norm": 2.929065704345703, + "learning_rate": 9.437538119598761e-06, + "logits/chosen": 0.03493065387010574, + "logits/rejected": -0.13496622443199158, + "logps/chosen": -47.03486633300781, + "logps/rejected": -57.61128616333008, + "loss": 0.6778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8916518688201904, + "rewards/margins": 3.4746956825256348, + "rewards/rejected": -0.5830433964729309, + "step": 914 + }, + { + "epoch": 0.23, + "grad_norm": 4.838788032531738, + "learning_rate": 9.436330352447572e-06, + "logits/chosen": -0.12384934723377228, + "logits/rejected": -0.17398644983768463, + "logps/chosen": -51.16332244873047, + "logps/rejected": -69.08496856689453, + "loss": 0.8982, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.673213005065918, + "rewards/margins": 2.4223997592926025, + "rewards/rejected": 0.25081342458724976, + "step": 915 + }, + { + "epoch": 0.23, + "grad_norm": 5.28991174697876, + "learning_rate": 9.435121367427969e-06, + "logits/chosen": -0.06250341236591339, + "logits/rejected": -0.19538331031799316, + "logps/chosen": -57.5873908996582, + "logps/rejected": -69.78211975097656, + "loss": 0.8112, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7051424980163574, + "rewards/margins": 2.9898805618286133, + "rewards/rejected": -0.284737765789032, + "step": 916 + }, + { + "epoch": 0.23, + "grad_norm": 4.130405426025391, + "learning_rate": 9.433911164871853e-06, + "logits/chosen": -0.032042019069194794, + "logits/rejected": -0.1532270908355713, + "logps/chosen": -61.640602111816406, + "logps/rejected": -61.64269256591797, + "loss": 0.7659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.836372137069702, + "rewards/margins": 3.127790927886963, + "rewards/rejected": -0.29141855239868164, + "step": 917 + }, + { + "epoch": 0.23, + "grad_norm": 5.2991251945495605, + "learning_rate": 9.432699745111442e-06, + "logits/chosen": -0.05866178125143051, + "logits/rejected": -0.20600147545337677, + "logps/chosen": -63.96498107910156, + "logps/rejected": -67.42875671386719, + "loss": 0.8438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.575125217437744, + "rewards/margins": 3.3172707557678223, + "rewards/rejected": -0.7421454191207886, + "step": 918 + }, + { + "epoch": 0.23, + "grad_norm": 4.401086330413818, + "learning_rate": 9.431487108479302e-06, + "logits/chosen": -0.06930018961429596, + "logits/rejected": -0.15274971723556519, + "logps/chosen": -51.227874755859375, + "logps/rejected": -58.93720245361328, + "loss": 0.8989, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5838871002197266, + "rewards/margins": 2.39755916595459, + "rewards/rejected": 0.18632766604423523, + "step": 919 + }, + { + "epoch": 0.23, + "grad_norm": 6.106692790985107, + "learning_rate": 9.430273255308329e-06, + "logits/chosen": -0.13427616655826569, + "logits/rejected": -0.17347463965415955, + "logps/chosen": -57.5130729675293, + "logps/rejected": -63.2714729309082, + "loss": 1.0102, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6760294437408447, + "rewards/margins": 2.3960492610931396, + "rewards/rejected": 0.2799804210662842, + "step": 920 + }, + { + "epoch": 0.23, + "grad_norm": 4.85402774810791, + "learning_rate": 9.429058185931748e-06, + "logits/chosen": -0.03039679117500782, + "logits/rejected": -0.094289630651474, + "logps/chosen": -61.99136734008789, + "logps/rejected": -73.90798950195312, + "loss": 0.8033, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.913825273513794, + "rewards/margins": 2.7441513538360596, + "rewards/rejected": 0.16967368125915527, + "step": 921 + }, + { + "epoch": 0.23, + "grad_norm": 10.174006462097168, + "learning_rate": 9.427841900683123e-06, + "logits/chosen": -0.12463356554508209, + "logits/rejected": -0.2260843962430954, + "logps/chosen": -54.95741271972656, + "logps/rejected": -71.35708618164062, + "loss": 0.8816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5296475887298584, + "rewards/margins": 3.364546775817871, + "rewards/rejected": -0.834899365901947, + "step": 922 + }, + { + "epoch": 0.23, + "grad_norm": 3.6724627017974854, + "learning_rate": 9.426624399896351e-06, + "logits/chosen": -0.05967636778950691, + "logits/rejected": -0.10196578502655029, + "logps/chosen": -58.651939392089844, + "logps/rejected": -85.38259887695312, + "loss": 0.8807, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8695621490478516, + "rewards/margins": 2.674004316329956, + "rewards/rejected": 0.19555771350860596, + "step": 923 + }, + { + "epoch": 0.23, + "grad_norm": 4.25089168548584, + "learning_rate": 9.425405683905664e-06, + "logits/chosen": -0.12004423886537552, + "logits/rejected": -0.20082516968250275, + "logps/chosen": -49.030426025390625, + "logps/rejected": -65.6478500366211, + "loss": 0.9232, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7450649738311768, + "rewards/margins": 2.589184522628784, + "rewards/rejected": 0.15588048100471497, + "step": 924 + }, + { + "epoch": 0.23, + "grad_norm": 3.385997772216797, + "learning_rate": 9.42418575304562e-06, + "logits/chosen": -0.13148407638072968, + "logits/rejected": -0.22714151442050934, + "logps/chosen": -63.890098571777344, + "logps/rejected": -69.9189453125, + "loss": 0.8574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9538145065307617, + "rewards/margins": 3.0016186237335205, + "rewards/rejected": -0.04780381917953491, + "step": 925 + }, + { + "epoch": 0.23, + "grad_norm": 3.3910036087036133, + "learning_rate": 9.422964607651124e-06, + "logits/chosen": -0.10199624300003052, + "logits/rejected": -0.15081635117530823, + "logps/chosen": -59.84633255004883, + "logps/rejected": -90.87693786621094, + "loss": 0.8427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6436495780944824, + "rewards/margins": 2.8863070011138916, + "rewards/rejected": -0.2426573485136032, + "step": 926 + }, + { + "epoch": 0.23, + "grad_norm": 4.298567771911621, + "learning_rate": 9.421742248057402e-06, + "logits/chosen": -0.13323378562927246, + "logits/rejected": -0.21391873061656952, + "logps/chosen": -50.43346405029297, + "logps/rejected": -73.08690643310547, + "loss": 0.8115, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.89522647857666, + "rewards/margins": 3.3321447372436523, + "rewards/rejected": -0.43691810965538025, + "step": 927 + }, + { + "epoch": 0.23, + "grad_norm": 5.100367069244385, + "learning_rate": 9.420518674600019e-06, + "logits/chosen": -0.07779699563980103, + "logits/rejected": -0.19063395261764526, + "logps/chosen": -62.97929382324219, + "logps/rejected": -72.85293579101562, + "loss": 0.9564, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4297258853912354, + "rewards/margins": 2.4795591831207275, + "rewards/rejected": -0.049833446741104126, + "step": 928 + }, + { + "epoch": 0.23, + "grad_norm": 2.889634370803833, + "learning_rate": 9.419293887614872e-06, + "logits/chosen": -0.12273672968149185, + "logits/rejected": -0.328498899936676, + "logps/chosen": -60.568115234375, + "logps/rejected": -65.35017395019531, + "loss": 0.6527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8082776069641113, + "rewards/margins": 3.9125313758850098, + "rewards/rejected": -1.1042537689208984, + "step": 929 + }, + { + "epoch": 0.23, + "grad_norm": 5.08120584487915, + "learning_rate": 9.418067887438194e-06, + "logits/chosen": -0.12934857606887817, + "logits/rejected": -0.23665283620357513, + "logps/chosen": -53.333213806152344, + "logps/rejected": -69.79718017578125, + "loss": 0.8119, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5044586658477783, + "rewards/margins": 2.357980489730835, + "rewards/rejected": 0.14647850394248962, + "step": 930 + }, + { + "epoch": 0.23, + "grad_norm": 4.252949237823486, + "learning_rate": 9.416840674406547e-06, + "logits/chosen": -0.10169235616922379, + "logits/rejected": -0.24049413204193115, + "logps/chosen": -66.49388122558594, + "logps/rejected": -61.593109130859375, + "loss": 0.9301, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7995893955230713, + "rewards/margins": 2.9787466526031494, + "rewards/rejected": -0.17915716767311096, + "step": 931 + }, + { + "epoch": 0.23, + "grad_norm": 8.103507041931152, + "learning_rate": 9.415612248856824e-06, + "logits/chosen": -0.0905584841966629, + "logits/rejected": -0.23075008392333984, + "logps/chosen": -64.64899444580078, + "logps/rejected": -65.11227416992188, + "loss": 0.8883, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4530444145202637, + "rewards/margins": 3.56412672996521, + "rewards/rejected": -1.1110820770263672, + "step": 932 + }, + { + "epoch": 0.23, + "grad_norm": 8.034886360168457, + "learning_rate": 9.414382611126258e-06, + "logits/chosen": -0.15069745481014252, + "logits/rejected": -0.23469200730323792, + "logps/chosen": -51.376399993896484, + "logps/rejected": -69.03448486328125, + "loss": 0.8239, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5536184310913086, + "rewards/margins": 3.1120896339416504, + "rewards/rejected": -0.558471143245697, + "step": 933 + }, + { + "epoch": 0.23, + "grad_norm": 4.0900163650512695, + "learning_rate": 9.413151761552413e-06, + "logits/chosen": -0.10470511019229889, + "logits/rejected": -0.23815587162971497, + "logps/chosen": -56.3619384765625, + "logps/rejected": -67.96698760986328, + "loss": 0.7423, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6663520336151123, + "rewards/margins": 3.6260833740234375, + "rewards/rejected": -0.9597312211990356, + "step": 934 + }, + { + "epoch": 0.23, + "grad_norm": 7.845885753631592, + "learning_rate": 9.411919700473178e-06, + "logits/chosen": -0.15701860189437866, + "logits/rejected": -0.28530430793762207, + "logps/chosen": -61.095245361328125, + "logps/rejected": -73.2225341796875, + "loss": 1.0098, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.750492811203003, + "rewards/margins": 3.0422160625457764, + "rewards/rejected": -0.2917232811450958, + "step": 935 + }, + { + "epoch": 0.23, + "grad_norm": 6.052132606506348, + "learning_rate": 9.410686428226784e-06, + "logits/chosen": -0.08712797611951828, + "logits/rejected": -0.21261566877365112, + "logps/chosen": -54.72509765625, + "logps/rejected": -70.97843170166016, + "loss": 0.8567, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3465425968170166, + "rewards/margins": 2.9004135131835938, + "rewards/rejected": -0.5538714528083801, + "step": 936 + }, + { + "epoch": 0.23, + "grad_norm": 3.576935052871704, + "learning_rate": 9.409451945151793e-06, + "logits/chosen": -0.11107131838798523, + "logits/rejected": -0.23166099190711975, + "logps/chosen": -54.09856033325195, + "logps/rejected": -67.5291748046875, + "loss": 0.805, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.678143262863159, + "rewards/margins": 3.355311870574951, + "rewards/rejected": -0.6771686673164368, + "step": 937 + }, + { + "epoch": 0.23, + "grad_norm": 3.1802027225494385, + "learning_rate": 9.408216251587093e-06, + "logits/chosen": -0.10292162746191025, + "logits/rejected": -0.22473812103271484, + "logps/chosen": -55.42625427246094, + "logps/rejected": -52.64210510253906, + "loss": 0.8092, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.724637508392334, + "rewards/margins": 2.650059700012207, + "rewards/rejected": 0.07457782328128815, + "step": 938 + }, + { + "epoch": 0.23, + "grad_norm": 6.2441725730896, + "learning_rate": 9.406979347871909e-06, + "logits/chosen": -0.0652070939540863, + "logits/rejected": -0.2491629719734192, + "logps/chosen": -59.09452438354492, + "logps/rejected": -60.71260452270508, + "loss": 0.7944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.454395055770874, + "rewards/margins": 3.2231626510620117, + "rewards/rejected": -0.7687675356864929, + "step": 939 + }, + { + "epoch": 0.24, + "grad_norm": 3.2765235900878906, + "learning_rate": 9.4057412343458e-06, + "logits/chosen": -0.12573054432868958, + "logits/rejected": -0.2845160663127899, + "logps/chosen": -62.218265533447266, + "logps/rejected": -61.79517364501953, + "loss": 0.7745, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.642481803894043, + "rewards/margins": 3.0205917358398438, + "rewards/rejected": -0.3781099021434784, + "step": 940 + }, + { + "epoch": 0.24, + "grad_norm": 2.855618476867676, + "learning_rate": 9.404501911348654e-06, + "logits/chosen": -0.15198352932929993, + "logits/rejected": -0.28012555837631226, + "logps/chosen": -47.732872009277344, + "logps/rejected": -61.961246490478516, + "loss": 0.6626, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.003957509994507, + "rewards/margins": 2.98276948928833, + "rewards/rejected": 0.021187618374824524, + "step": 941 + }, + { + "epoch": 0.24, + "grad_norm": 5.117547988891602, + "learning_rate": 9.403261379220691e-06, + "logits/chosen": -0.10949007421731949, + "logits/rejected": -0.2022971659898758, + "logps/chosen": -56.08131408691406, + "logps/rejected": -61.57100296020508, + "loss": 0.9012, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.80118465423584, + "rewards/margins": 2.2032470703125, + "rewards/rejected": 0.5979376435279846, + "step": 942 + }, + { + "epoch": 0.24, + "grad_norm": 3.691096305847168, + "learning_rate": 9.402019638302465e-06, + "logits/chosen": -0.15937572717666626, + "logits/rejected": -0.2516041398048401, + "logps/chosen": -55.76575469970703, + "logps/rejected": -67.70680236816406, + "loss": 0.7985, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7312772274017334, + "rewards/margins": 3.1239073276519775, + "rewards/rejected": -0.39262986183166504, + "step": 943 + }, + { + "epoch": 0.24, + "grad_norm": 4.064199447631836, + "learning_rate": 9.40077668893486e-06, + "logits/chosen": -0.09858226031064987, + "logits/rejected": -0.1650056689977646, + "logps/chosen": -54.14871597290039, + "logps/rejected": -74.53607940673828, + "loss": 0.8479, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.05483078956604, + "rewards/margins": 2.9687416553497314, + "rewards/rejected": 0.08608946204185486, + "step": 944 + }, + { + "epoch": 0.24, + "grad_norm": 5.509207725524902, + "learning_rate": 9.399532531459093e-06, + "logits/chosen": -0.10901995748281479, + "logits/rejected": -0.22370733320713043, + "logps/chosen": -62.1844367980957, + "logps/rejected": -76.9078369140625, + "loss": 1.0255, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.349794626235962, + "rewards/margins": 2.9217910766601562, + "rewards/rejected": -0.5719965696334839, + "step": 945 + }, + { + "epoch": 0.24, + "grad_norm": 3.9959545135498047, + "learning_rate": 9.398287166216711e-06, + "logits/chosen": -0.10130982100963593, + "logits/rejected": -0.1706700325012207, + "logps/chosen": -51.482177734375, + "logps/rejected": -74.81478118896484, + "loss": 0.8319, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6308434009552, + "rewards/margins": 2.8902363777160645, + "rewards/rejected": -0.25939279794692993, + "step": 946 + }, + { + "epoch": 0.24, + "grad_norm": 3.387843370437622, + "learning_rate": 9.397040593549594e-06, + "logits/chosen": -0.07699061185121536, + "logits/rejected": -0.1836039274930954, + "logps/chosen": -52.99269485473633, + "logps/rejected": -72.30076599121094, + "loss": 0.8699, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.040225028991699, + "rewards/margins": 3.3942880630493164, + "rewards/rejected": -0.35406333208084106, + "step": 947 + }, + { + "epoch": 0.24, + "grad_norm": 4.900750160217285, + "learning_rate": 9.395792813799954e-06, + "logits/chosen": -0.12901653349399567, + "logits/rejected": -0.23105129599571228, + "logps/chosen": -51.729644775390625, + "logps/rejected": -59.0481071472168, + "loss": 0.9083, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.645167827606201, + "rewards/margins": 2.672774076461792, + "rewards/rejected": -0.02760617434978485, + "step": 948 + }, + { + "epoch": 0.24, + "grad_norm": 4.1413702964782715, + "learning_rate": 9.394543827310333e-06, + "logits/chosen": -0.16325858235359192, + "logits/rejected": -0.266076922416687, + "logps/chosen": -51.84292221069336, + "logps/rejected": -61.22275161743164, + "loss": 0.8362, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.937772750854492, + "rewards/margins": 2.731717824935913, + "rewards/rejected": 0.20605450868606567, + "step": 949 + }, + { + "epoch": 0.24, + "grad_norm": 3.7670986652374268, + "learning_rate": 9.393293634423604e-06, + "logits/chosen": -0.04708382487297058, + "logits/rejected": -0.10898997634649277, + "logps/chosen": -66.24649047851562, + "logps/rejected": -70.16883850097656, + "loss": 0.8415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980496406555176, + "rewards/margins": 2.4040744304656982, + "rewards/rejected": 0.5764217376708984, + "step": 950 + }, + { + "epoch": 0.24, + "grad_norm": 7.209954738616943, + "learning_rate": 9.392042235482973e-06, + "logits/chosen": -0.13259269297122955, + "logits/rejected": -0.3304327130317688, + "logps/chosen": -73.80683898925781, + "logps/rejected": -62.669715881347656, + "loss": 1.034, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.634471893310547, + "rewards/margins": 2.4932174682617188, + "rewards/rejected": 0.14125463366508484, + "step": 951 + }, + { + "epoch": 0.24, + "grad_norm": 3.5076704025268555, + "learning_rate": 9.390789630831975e-06, + "logits/chosen": -0.1951337456703186, + "logits/rejected": -0.21642683446407318, + "logps/chosen": -55.63990020751953, + "logps/rejected": -79.78593444824219, + "loss": 0.8434, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.910539150238037, + "rewards/margins": 3.0393893718719482, + "rewards/rejected": -0.12885022163391113, + "step": 952 + }, + { + "epoch": 0.24, + "grad_norm": 4.096532344818115, + "learning_rate": 9.38953582081448e-06, + "logits/chosen": -0.14237087965011597, + "logits/rejected": -0.2492832988500595, + "logps/chosen": -50.793617248535156, + "logps/rejected": -66.88358306884766, + "loss": 0.837, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.642404556274414, + "rewards/margins": 2.9808058738708496, + "rewards/rejected": -0.33840158581733704, + "step": 953 + }, + { + "epoch": 0.24, + "grad_norm": 4.6456427574157715, + "learning_rate": 9.38828080577468e-06, + "logits/chosen": -0.07439879328012466, + "logits/rejected": -0.2345370352268219, + "logps/chosen": -59.4119758605957, + "logps/rejected": -68.02120208740234, + "loss": 0.9176, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.729623317718506, + "rewards/margins": 2.9108550548553467, + "rewards/rejected": -0.1812315583229065, + "step": 954 + }, + { + "epoch": 0.24, + "grad_norm": 3.8976025581359863, + "learning_rate": 9.38702458605711e-06, + "logits/chosen": -0.07424364984035492, + "logits/rejected": -0.1753612607717514, + "logps/chosen": -51.19685745239258, + "logps/rejected": -68.40400695800781, + "loss": 0.7974, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7607028484344482, + "rewards/margins": 2.534034252166748, + "rewards/rejected": 0.2266683280467987, + "step": 955 + }, + { + "epoch": 0.24, + "grad_norm": 3.6040635108947754, + "learning_rate": 9.385767162006626e-06, + "logits/chosen": -0.09291893243789673, + "logits/rejected": -0.20064140856266022, + "logps/chosen": -67.43834686279297, + "logps/rejected": -82.34516143798828, + "loss": 0.8496, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9228739738464355, + "rewards/margins": 3.0712597370147705, + "rewards/rejected": -0.14838583767414093, + "step": 956 + }, + { + "epoch": 0.24, + "grad_norm": 4.218699932098389, + "learning_rate": 9.384508533968418e-06, + "logits/chosen": -0.04360801726579666, + "logits/rejected": -0.11824602633714676, + "logps/chosen": -63.63580322265625, + "logps/rejected": -78.65343475341797, + "loss": 0.895, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.923771858215332, + "rewards/margins": 3.1091315746307373, + "rewards/rejected": -0.18535958230495453, + "step": 957 + }, + { + "epoch": 0.24, + "grad_norm": 4.954193115234375, + "learning_rate": 9.383248702288007e-06, + "logits/chosen": -0.08715683221817017, + "logits/rejected": -0.13486962020397186, + "logps/chosen": -60.78717803955078, + "logps/rejected": -79.91351318359375, + "loss": 1.0245, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6963672637939453, + "rewards/margins": 2.598684787750244, + "rewards/rejected": 0.09768255054950714, + "step": 958 + }, + { + "epoch": 0.24, + "grad_norm": 3.402188301086426, + "learning_rate": 9.381987667311243e-06, + "logits/chosen": -0.1328953504562378, + "logits/rejected": -0.3092760741710663, + "logps/chosen": -57.92059326171875, + "logps/rejected": -65.69807434082031, + "loss": 0.7935, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7279515266418457, + "rewards/margins": 3.5594642162323, + "rewards/rejected": -0.8315126895904541, + "step": 959 + }, + { + "epoch": 0.24, + "grad_norm": 5.457963466644287, + "learning_rate": 9.380725429384311e-06, + "logits/chosen": -0.09647057205438614, + "logits/rejected": -0.2971137762069702, + "logps/chosen": -64.56117248535156, + "logps/rejected": -77.8245849609375, + "loss": 0.9385, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7700586318969727, + "rewards/margins": 3.0976181030273438, + "rewards/rejected": -0.32755929231643677, + "step": 960 + }, + { + "epoch": 0.24, + "grad_norm": 4.774835109710693, + "learning_rate": 9.379461988853719e-06, + "logits/chosen": -0.18556919693946838, + "logits/rejected": -0.2598456144332886, + "logps/chosen": -53.94353485107422, + "logps/rejected": -66.40406799316406, + "loss": 0.9607, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.62018084526062, + "rewards/margins": 2.2614481449127197, + "rewards/rejected": 0.3587326109409332, + "step": 961 + }, + { + "epoch": 0.24, + "grad_norm": 4.389345645904541, + "learning_rate": 9.37819734606631e-06, + "logits/chosen": -0.20599783957004547, + "logits/rejected": -0.22287814319133759, + "logps/chosen": -59.46611022949219, + "logps/rejected": -72.07936096191406, + "loss": 0.9346, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.780196189880371, + "rewards/margins": 2.704164505004883, + "rewards/rejected": 0.07603172957897186, + "step": 962 + }, + { + "epoch": 0.24, + "grad_norm": 2.729130983352661, + "learning_rate": 9.376931501369255e-06, + "logits/chosen": -0.10261155664920807, + "logits/rejected": -0.18010388314723969, + "logps/chosen": -62.99714279174805, + "logps/rejected": -74.701416015625, + "loss": 0.7889, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7397656440734863, + "rewards/margins": 2.7480411529541016, + "rewards/rejected": -0.008275389671325684, + "step": 963 + }, + { + "epoch": 0.24, + "grad_norm": 4.296041011810303, + "learning_rate": 9.375664455110056e-06, + "logits/chosen": 0.005694549530744553, + "logits/rejected": -0.12148383259773254, + "logps/chosen": -78.57020568847656, + "logps/rejected": -62.28721618652344, + "loss": 0.8712, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9343202114105225, + "rewards/margins": 2.6197733879089355, + "rewards/rejected": 0.31454676389694214, + "step": 964 + }, + { + "epoch": 0.24, + "grad_norm": 6.317487716674805, + "learning_rate": 9.374396207636544e-06, + "logits/chosen": -0.08526287972927094, + "logits/rejected": -0.13569758832454681, + "logps/chosen": -52.74751281738281, + "logps/rejected": -65.99383544921875, + "loss": 0.9313, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.732942581176758, + "rewards/margins": 2.1617136001586914, + "rewards/rejected": 0.5712289810180664, + "step": 965 + }, + { + "epoch": 0.24, + "grad_norm": 4.306808948516846, + "learning_rate": 9.373126759296883e-06, + "logits/chosen": -0.09162215888500214, + "logits/rejected": -0.2250148355960846, + "logps/chosen": -56.17878723144531, + "logps/rejected": -67.59358215332031, + "loss": 0.8414, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7294223308563232, + "rewards/margins": 3.1342098712921143, + "rewards/rejected": -0.40478789806365967, + "step": 966 + }, + { + "epoch": 0.24, + "grad_norm": 5.461937427520752, + "learning_rate": 9.371856110439561e-06, + "logits/chosen": -0.07187582552433014, + "logits/rejected": -0.13293397426605225, + "logps/chosen": -70.58904266357422, + "logps/rejected": -79.11405181884766, + "loss": 1.0557, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6726596355438232, + "rewards/margins": 2.1319425106048584, + "rewards/rejected": 0.5407170057296753, + "step": 967 + }, + { + "epoch": 0.24, + "grad_norm": 5.676063537597656, + "learning_rate": 9.3705842614134e-06, + "logits/chosen": -0.012096725404262543, + "logits/rejected": -0.17251288890838623, + "logps/chosen": -65.10588073730469, + "logps/rejected": -65.31851196289062, + "loss": 0.8452, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.423865556716919, + "rewards/margins": 2.970524787902832, + "rewards/rejected": -0.5466591119766235, + "step": 968 + }, + { + "epoch": 0.24, + "grad_norm": 4.176477909088135, + "learning_rate": 9.369311212567548e-06, + "logits/chosen": -0.06866583228111267, + "logits/rejected": -0.18493306636810303, + "logps/chosen": -62.22799301147461, + "logps/rejected": -71.09873962402344, + "loss": 0.9532, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7088065147399902, + "rewards/margins": 3.354945421218872, + "rewards/rejected": -0.6461390256881714, + "step": 969 + }, + { + "epoch": 0.24, + "grad_norm": 4.980424880981445, + "learning_rate": 9.368036964251488e-06, + "logits/chosen": -0.04289174824953079, + "logits/rejected": -0.13014647364616394, + "logps/chosen": -60.83168029785156, + "logps/rejected": -60.348236083984375, + "loss": 0.9647, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7855019569396973, + "rewards/margins": 2.358006477355957, + "rewards/rejected": 0.42749521136283875, + "step": 970 + }, + { + "epoch": 0.24, + "grad_norm": 3.069282054901123, + "learning_rate": 9.366761516815023e-06, + "logits/chosen": -0.2070053517818451, + "logits/rejected": -0.2813225984573364, + "logps/chosen": -45.32058334350586, + "logps/rejected": -64.48944091796875, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8377108573913574, + "rewards/margins": 3.4573700428009033, + "rewards/rejected": -0.6196589469909668, + "step": 971 + }, + { + "epoch": 0.24, + "grad_norm": 3.657731294631958, + "learning_rate": 9.365484870608298e-06, + "logits/chosen": -0.042715199291706085, + "logits/rejected": -0.2109304517507553, + "logps/chosen": -71.83831787109375, + "logps/rejected": -78.3866195678711, + "loss": 0.7717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5406079292297363, + "rewards/margins": 3.1180765628814697, + "rewards/rejected": -0.5774685144424438, + "step": 972 + }, + { + "epoch": 0.24, + "grad_norm": 4.166426181793213, + "learning_rate": 9.364207025981775e-06, + "logits/chosen": -0.07129424810409546, + "logits/rejected": -0.19740138947963715, + "logps/chosen": -68.81385803222656, + "logps/rejected": -66.2864990234375, + "loss": 0.9666, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.671177864074707, + "rewards/margins": 2.9777896404266357, + "rewards/rejected": -0.30661195516586304, + "step": 973 + }, + { + "epoch": 0.24, + "grad_norm": 5.251474380493164, + "learning_rate": 9.362927983286252e-06, + "logits/chosen": -0.04798928275704384, + "logits/rejected": -0.08280336111783981, + "logps/chosen": -67.94319915771484, + "logps/rejected": -71.67451477050781, + "loss": 0.9899, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.910111904144287, + "rewards/margins": 2.168978214263916, + "rewards/rejected": 0.7411332130432129, + "step": 974 + }, + { + "epoch": 0.24, + "grad_norm": 4.265182018280029, + "learning_rate": 9.36164774287285e-06, + "logits/chosen": -0.03438156098127365, + "logits/rejected": -0.12663455307483673, + "logps/chosen": -57.163963317871094, + "logps/rejected": -69.629150390625, + "loss": 0.8059, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876842498779297, + "rewards/margins": 2.7497711181640625, + "rewards/rejected": 0.12707163393497467, + "step": 975 + }, + { + "epoch": 0.24, + "grad_norm": 4.056659698486328, + "learning_rate": 9.360366305093029e-06, + "logits/chosen": -0.26580384373664856, + "logits/rejected": -0.3000062108039856, + "logps/chosen": -59.33732604980469, + "logps/rejected": -69.70472717285156, + "loss": 0.9789, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8184144496917725, + "rewards/margins": 2.59786057472229, + "rewards/rejected": 0.22055411338806152, + "step": 976 + }, + { + "epoch": 0.24, + "grad_norm": 4.414234638214111, + "learning_rate": 9.359083670298567e-06, + "logits/chosen": -0.14644186198711395, + "logits/rejected": -0.15309110283851624, + "logps/chosen": -60.64697265625, + "logps/rejected": -74.65235900878906, + "loss": 0.9853, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5857856273651123, + "rewards/margins": 2.788905382156372, + "rewards/rejected": -0.20312011241912842, + "step": 977 + }, + { + "epoch": 0.24, + "grad_norm": 3.635984182357788, + "learning_rate": 9.357799838841576e-06, + "logits/chosen": -0.11312008649110794, + "logits/rejected": -0.3116116225719452, + "logps/chosen": -60.90338134765625, + "logps/rejected": -58.91062545776367, + "loss": 0.8141, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8235249519348145, + "rewards/margins": 3.850785255432129, + "rewards/rejected": -1.0272603034973145, + "step": 978 + }, + { + "epoch": 0.24, + "grad_norm": 3.138425827026367, + "learning_rate": 9.356514811074494e-06, + "logits/chosen": -0.10268768668174744, + "logits/rejected": -0.12761536240577698, + "logps/chosen": -50.529998779296875, + "logps/rejected": -74.65298461914062, + "loss": 0.7946, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1170456409454346, + "rewards/margins": 2.878690481185913, + "rewards/rejected": 0.23835504055023193, + "step": 979 + }, + { + "epoch": 0.25, + "grad_norm": 4.2375407218933105, + "learning_rate": 9.355228587350091e-06, + "logits/chosen": -0.07087179273366928, + "logits/rejected": -0.13512232899665833, + "logps/chosen": -58.94377136230469, + "logps/rejected": -89.60743713378906, + "loss": 0.9331, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5862996578216553, + "rewards/margins": 2.5046417713165283, + "rewards/rejected": 0.08165799081325531, + "step": 980 + }, + { + "epoch": 0.25, + "grad_norm": 4.450253486633301, + "learning_rate": 9.353941168021463e-06, + "logits/chosen": -0.09878088533878326, + "logits/rejected": -0.18884572386741638, + "logps/chosen": -56.96117401123047, + "logps/rejected": -72.387451171875, + "loss": 0.8924, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.553194046020508, + "rewards/margins": 2.919929027557373, + "rewards/rejected": -0.36673474311828613, + "step": 981 + }, + { + "epoch": 0.25, + "grad_norm": 5.156918525695801, + "learning_rate": 9.352652553442034e-06, + "logits/chosen": -0.16060426831245422, + "logits/rejected": -0.2779923677444458, + "logps/chosen": -57.67535400390625, + "logps/rejected": -64.11708068847656, + "loss": 0.8025, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2080819606781006, + "rewards/margins": 3.375906467437744, + "rewards/rejected": -0.16782435774803162, + "step": 982 + }, + { + "epoch": 0.25, + "grad_norm": 3.807203531265259, + "learning_rate": 9.351362743965556e-06, + "logits/chosen": -0.13444611430168152, + "logits/rejected": -0.2051028311252594, + "logps/chosen": -46.37706756591797, + "logps/rejected": -84.0359115600586, + "loss": 0.8541, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8252038955688477, + "rewards/margins": 2.636201858520508, + "rewards/rejected": 0.1890018880367279, + "step": 983 + }, + { + "epoch": 0.25, + "grad_norm": 4.4890031814575195, + "learning_rate": 9.350071739946107e-06, + "logits/chosen": -0.18182632327079773, + "logits/rejected": -0.2221919298171997, + "logps/chosen": -56.91063690185547, + "logps/rejected": -72.38111877441406, + "loss": 0.8134, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9550065994262695, + "rewards/margins": 2.6414976119995117, + "rewards/rejected": 0.31350916624069214, + "step": 984 + }, + { + "epoch": 0.25, + "grad_norm": 3.3843259811401367, + "learning_rate": 9.348779541738102e-06, + "logits/chosen": -0.13502255082130432, + "logits/rejected": -0.2026594579219818, + "logps/chosen": -49.38432312011719, + "logps/rejected": -87.30233764648438, + "loss": 0.7324, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.928877353668213, + "rewards/margins": 3.651578903198242, + "rewards/rejected": -0.7227016687393188, + "step": 985 + }, + { + "epoch": 0.25, + "grad_norm": 3.672274351119995, + "learning_rate": 9.347486149696272e-06, + "logits/chosen": -0.0771743506193161, + "logits/rejected": -0.18830998241901398, + "logps/chosen": -64.0401840209961, + "logps/rejected": -72.07339477539062, + "loss": 0.7304, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8321940898895264, + "rewards/margins": 3.3271021842956543, + "rewards/rejected": -0.49490776658058167, + "step": 986 + }, + { + "epoch": 0.25, + "grad_norm": 4.719919204711914, + "learning_rate": 9.346191564175682e-06, + "logits/chosen": -0.1522076576948166, + "logits/rejected": -0.2380426526069641, + "logps/chosen": -51.627899169921875, + "logps/rejected": -73.34516906738281, + "loss": 0.8153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.650049924850464, + "rewards/margins": 3.486163854598999, + "rewards/rejected": -0.836113452911377, + "step": 987 + }, + { + "epoch": 0.25, + "grad_norm": 4.0130438804626465, + "learning_rate": 9.344895785531725e-06, + "logits/chosen": -0.15521609783172607, + "logits/rejected": -0.33232980966567993, + "logps/chosen": -58.54426193237305, + "logps/rejected": -56.029258728027344, + "loss": 0.8847, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9346859455108643, + "rewards/margins": 3.528881788253784, + "rewards/rejected": -0.5941956043243408, + "step": 988 + }, + { + "epoch": 0.25, + "grad_norm": 3.2729837894439697, + "learning_rate": 9.34359881412012e-06, + "logits/chosen": -0.12965214252471924, + "logits/rejected": -0.23689699172973633, + "logps/chosen": -56.72709655761719, + "logps/rejected": -59.527076721191406, + "loss": 0.8579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.945882558822632, + "rewards/margins": 2.5796711444854736, + "rewards/rejected": 0.366211473941803, + "step": 989 + }, + { + "epoch": 0.25, + "grad_norm": 3.0248842239379883, + "learning_rate": 9.342300650296911e-06, + "logits/chosen": -0.07770590484142303, + "logits/rejected": -0.2404259443283081, + "logps/chosen": -69.87026977539062, + "logps/rejected": -57.236427307128906, + "loss": 0.8046, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8107666969299316, + "rewards/margins": 2.8220324516296387, + "rewards/rejected": -0.011265784502029419, + "step": 990 + }, + { + "epoch": 0.25, + "grad_norm": 4.602746486663818, + "learning_rate": 9.341001294418476e-06, + "logits/chosen": -0.16089403629302979, + "logits/rejected": -0.2368265986442566, + "logps/chosen": -56.303436279296875, + "logps/rejected": -63.958351135253906, + "loss": 0.9681, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7317898273468018, + "rewards/margins": 2.8073062896728516, + "rewards/rejected": -0.07551652193069458, + "step": 991 + }, + { + "epoch": 0.25, + "grad_norm": 6.301314830780029, + "learning_rate": 9.339700746841514e-06, + "logits/chosen": -0.12386614829301834, + "logits/rejected": -0.19854125380516052, + "logps/chosen": -59.079017639160156, + "logps/rejected": -73.69719696044922, + "loss": 0.9908, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6303045749664307, + "rewards/margins": 2.9858615398406982, + "rewards/rejected": -0.3555571734905243, + "step": 992 + }, + { + "epoch": 0.25, + "grad_norm": 3.568864107131958, + "learning_rate": 9.338399007923052e-06, + "logits/chosen": -0.14706481993198395, + "logits/rejected": -0.23594488203525543, + "logps/chosen": -49.201419830322266, + "logps/rejected": -63.758140563964844, + "loss": 0.7919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.853321075439453, + "rewards/margins": 2.961965560913086, + "rewards/rejected": -0.10864447057247162, + "step": 993 + }, + { + "epoch": 0.25, + "grad_norm": 5.894299030303955, + "learning_rate": 9.337096078020449e-06, + "logits/chosen": -0.07111842930316925, + "logits/rejected": -0.2232985645532608, + "logps/chosen": -46.92737579345703, + "logps/rejected": -61.77315902709961, + "loss": 0.7032, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8519134521484375, + "rewards/margins": 3.4741110801696777, + "rewards/rejected": -0.622197687625885, + "step": 994 + }, + { + "epoch": 0.25, + "grad_norm": 4.114288806915283, + "learning_rate": 9.335791957491385e-06, + "logits/chosen": -0.15713118016719818, + "logits/rejected": -0.2825014591217041, + "logps/chosen": -49.44172668457031, + "logps/rejected": -66.9581298828125, + "loss": 0.7729, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.629528045654297, + "rewards/margins": 3.6488497257232666, + "rewards/rejected": -1.0193214416503906, + "step": 995 + }, + { + "epoch": 0.25, + "grad_norm": 3.836484909057617, + "learning_rate": 9.334486646693868e-06, + "logits/chosen": -0.14029642939567566, + "logits/rejected": -0.17755332589149475, + "logps/chosen": -66.08098602294922, + "logps/rejected": -76.15519714355469, + "loss": 0.8726, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6245925426483154, + "rewards/margins": 3.477872133255005, + "rewards/rejected": -0.8532795906066895, + "step": 996 + }, + { + "epoch": 0.25, + "grad_norm": 4.038994789123535, + "learning_rate": 9.333180145986238e-06, + "logits/chosen": -0.15755604207515717, + "logits/rejected": -0.2739444971084595, + "logps/chosen": -53.349822998046875, + "logps/rejected": -58.436981201171875, + "loss": 0.8189, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.836596965789795, + "rewards/margins": 2.588566541671753, + "rewards/rejected": 0.24803048372268677, + "step": 997 + }, + { + "epoch": 0.25, + "grad_norm": 4.49752950668335, + "learning_rate": 9.331872455727154e-06, + "logits/chosen": -0.10862173140048981, + "logits/rejected": -0.1479358673095703, + "logps/chosen": -63.07725143432617, + "logps/rejected": -93.8182144165039, + "loss": 0.8418, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.68900728225708, + "rewards/margins": 2.794224977493286, + "rewards/rejected": -0.10521756112575531, + "step": 998 + }, + { + "epoch": 0.25, + "grad_norm": 7.994481563568115, + "learning_rate": 9.330563576275607e-06, + "logits/chosen": -0.1282631754875183, + "logits/rejected": -0.17564305663108826, + "logps/chosen": -59.817535400390625, + "logps/rejected": -75.44378662109375, + "loss": 0.9277, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0372488498687744, + "rewards/margins": 3.4621944427490234, + "rewards/rejected": -0.4249454438686371, + "step": 999 + }, + { + "epoch": 0.25, + "grad_norm": 4.525694370269775, + "learning_rate": 9.329253507990912e-06, + "logits/chosen": -0.15244165062904358, + "logits/rejected": -0.18751299381256104, + "logps/chosen": -56.17506790161133, + "logps/rejected": -80.77906799316406, + "loss": 1.0114, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5837724208831787, + "rewards/margins": 2.457655191421509, + "rewards/rejected": 0.12611743807792664, + "step": 1000 + }, + { + "epoch": 0.25, + "grad_norm": 4.995115280151367, + "learning_rate": 9.32794225123271e-06, + "logits/chosen": -0.09169584512710571, + "logits/rejected": -0.23125451803207397, + "logps/chosen": -61.56256103515625, + "logps/rejected": -48.743682861328125, + "loss": 0.9559, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.781905174255371, + "rewards/margins": 3.065410614013672, + "rewards/rejected": -0.28350552916526794, + "step": 1001 + }, + { + "epoch": 0.25, + "grad_norm": 4.647933483123779, + "learning_rate": 9.326629806360972e-06, + "logits/chosen": -0.0790531113743782, + "logits/rejected": -0.22438886761665344, + "logps/chosen": -66.2908706665039, + "logps/rejected": -67.2725830078125, + "loss": 0.7651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.902499198913574, + "rewards/margins": 3.495582342147827, + "rewards/rejected": -0.5930830240249634, + "step": 1002 + }, + { + "epoch": 0.25, + "grad_norm": 4.120612621307373, + "learning_rate": 9.32531617373599e-06, + "logits/chosen": -0.16059979796409607, + "logits/rejected": -0.3356645703315735, + "logps/chosen": -71.46639251708984, + "logps/rejected": -65.3922119140625, + "loss": 0.8124, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5922513008117676, + "rewards/margins": 4.062073230743408, + "rewards/rejected": -1.4698221683502197, + "step": 1003 + }, + { + "epoch": 0.25, + "grad_norm": 4.644749641418457, + "learning_rate": 9.324001353718386e-06, + "logits/chosen": -0.10934490710496902, + "logits/rejected": -0.30089280009269714, + "logps/chosen": -67.97504425048828, + "logps/rejected": -74.89068603515625, + "loss": 0.8394, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.669635772705078, + "rewards/margins": 3.221095561981201, + "rewards/rejected": -0.5514597296714783, + "step": 1004 + }, + { + "epoch": 0.25, + "grad_norm": 2.714411497116089, + "learning_rate": 9.322685346669107e-06, + "logits/chosen": -0.12577661871910095, + "logits/rejected": -0.21530668437480927, + "logps/chosen": -50.34263229370117, + "logps/rejected": -73.3872299194336, + "loss": 0.6886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6583104133605957, + "rewards/margins": 3.2864646911621094, + "rewards/rejected": -0.6281540989875793, + "step": 1005 + }, + { + "epoch": 0.25, + "grad_norm": 5.181242942810059, + "learning_rate": 9.321368152949421e-06, + "logits/chosen": -0.08858587592840195, + "logits/rejected": -0.20017080008983612, + "logps/chosen": -50.42146682739258, + "logps/rejected": -61.72474670410156, + "loss": 0.8165, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.958568811416626, + "rewards/margins": 2.5211238861083984, + "rewards/rejected": 0.43744510412216187, + "step": 1006 + }, + { + "epoch": 0.25, + "grad_norm": 3.729888439178467, + "learning_rate": 9.320049772920932e-06, + "logits/chosen": -0.11953186243772507, + "logits/rejected": -0.13732720911502838, + "logps/chosen": -62.472633361816406, + "logps/rejected": -86.6129379272461, + "loss": 0.878, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7851357460021973, + "rewards/margins": 2.675999164581299, + "rewards/rejected": 0.1091366708278656, + "step": 1007 + }, + { + "epoch": 0.25, + "grad_norm": 4.584023952484131, + "learning_rate": 9.318730206945558e-06, + "logits/chosen": -0.19137270748615265, + "logits/rejected": -0.2768482267856598, + "logps/chosen": -58.7961311340332, + "logps/rejected": -73.6083984375, + "loss": 0.895, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7299323081970215, + "rewards/margins": 3.1932151317596436, + "rewards/rejected": -0.4632823169231415, + "step": 1008 + }, + { + "epoch": 0.25, + "grad_norm": 5.5512003898620605, + "learning_rate": 9.317409455385556e-06, + "logits/chosen": -0.0941261574625969, + "logits/rejected": -0.24397870898246765, + "logps/chosen": -76.37298583984375, + "logps/rejected": -62.706787109375, + "loss": 1.058, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5619308948516846, + "rewards/margins": 2.3174326419830322, + "rewards/rejected": 0.24449805915355682, + "step": 1009 + }, + { + "epoch": 0.25, + "grad_norm": 4.177647590637207, + "learning_rate": 9.316087518603491e-06, + "logits/chosen": -0.04729720950126648, + "logits/rejected": -0.08831372857093811, + "logps/chosen": -56.58832550048828, + "logps/rejected": -75.34044647216797, + "loss": 0.7639, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9772300720214844, + "rewards/margins": 3.0484349727630615, + "rewards/rejected": -0.07120521366596222, + "step": 1010 + }, + { + "epoch": 0.25, + "grad_norm": 2.9749677181243896, + "learning_rate": 9.314764396962271e-06, + "logits/chosen": -0.088599294424057, + "logits/rejected": -0.25990864634513855, + "logps/chosen": -59.90522766113281, + "logps/rejected": -56.749107360839844, + "loss": 0.7249, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.881340503692627, + "rewards/margins": 3.7396421432495117, + "rewards/rejected": -0.8583012819290161, + "step": 1011 + }, + { + "epoch": 0.25, + "grad_norm": 4.806524753570557, + "learning_rate": 9.31344009082512e-06, + "logits/chosen": -0.07074353098869324, + "logits/rejected": -0.14903880655765533, + "logps/chosen": -59.88872146606445, + "logps/rejected": -80.86099243164062, + "loss": 0.8945, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.914687156677246, + "rewards/margins": 3.2649483680725098, + "rewards/rejected": -0.35026121139526367, + "step": 1012 + }, + { + "epoch": 0.25, + "grad_norm": 5.826336860656738, + "learning_rate": 9.312114600555584e-06, + "logits/chosen": -0.10436908900737762, + "logits/rejected": -0.220055490732193, + "logps/chosen": -63.413856506347656, + "logps/rejected": -64.64585876464844, + "loss": 1.0448, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.535231113433838, + "rewards/margins": 2.5130670070648193, + "rewards/rejected": 0.022163838148117065, + "step": 1013 + }, + { + "epoch": 0.25, + "grad_norm": 3.273115634918213, + "learning_rate": 9.310787926517545e-06, + "logits/chosen": -0.1420743763446808, + "logits/rejected": -0.3179587125778198, + "logps/chosen": -57.010623931884766, + "logps/rejected": -50.09129333496094, + "loss": 0.947, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.97747540473938, + "rewards/margins": 3.43873929977417, + "rewards/rejected": -0.46126431226730347, + "step": 1014 + }, + { + "epoch": 0.25, + "grad_norm": 4.374359607696533, + "learning_rate": 9.3094600690752e-06, + "logits/chosen": -0.16511180996894836, + "logits/rejected": -0.18810118734836578, + "logps/chosen": -62.4509162902832, + "logps/rejected": -77.27921295166016, + "loss": 0.897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.850266933441162, + "rewards/margins": 2.469768524169922, + "rewards/rejected": 0.3804984986782074, + "step": 1015 + }, + { + "epoch": 0.25, + "grad_norm": 4.150924205780029, + "learning_rate": 9.308131028593074e-06, + "logits/chosen": -0.04950626567006111, + "logits/rejected": -0.16528233885765076, + "logps/chosen": -62.48530578613281, + "logps/rejected": -64.12410736083984, + "loss": 0.7912, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9192287921905518, + "rewards/margins": 2.91420841217041, + "rewards/rejected": 0.005020081996917725, + "step": 1016 + }, + { + "epoch": 0.25, + "grad_norm": 4.104448318481445, + "learning_rate": 9.306800805436019e-06, + "logits/chosen": -0.09085361659526825, + "logits/rejected": -0.2330082803964615, + "logps/chosen": -58.54216003417969, + "logps/rejected": -68.53722381591797, + "loss": 0.8625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8765065670013428, + "rewards/margins": 3.1563801765441895, + "rewards/rejected": -0.27987366914749146, + "step": 1017 + }, + { + "epoch": 0.25, + "grad_norm": 5.397110939025879, + "learning_rate": 9.30546939996921e-06, + "logits/chosen": -0.22833265364170074, + "logits/rejected": -0.20797204971313477, + "logps/chosen": -79.69678497314453, + "logps/rejected": -80.2263412475586, + "loss": 0.9826, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5034210681915283, + "rewards/margins": 2.24544095993042, + "rewards/rejected": 0.2579802870750427, + "step": 1018 + }, + { + "epoch": 0.25, + "grad_norm": 3.124394178390503, + "learning_rate": 9.304136812558147e-06, + "logits/chosen": 0.0185842402279377, + "logits/rejected": -0.1523887664079666, + "logps/chosen": -69.19519805908203, + "logps/rejected": -71.22183227539062, + "loss": 0.7398, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6287925243377686, + "rewards/margins": 3.2875349521636963, + "rewards/rejected": -0.6587422490119934, + "step": 1019 + }, + { + "epoch": 0.26, + "grad_norm": 4.670156955718994, + "learning_rate": 9.30280304356865e-06, + "logits/chosen": -0.09645988792181015, + "logits/rejected": -0.21392004191875458, + "logps/chosen": -51.3034782409668, + "logps/rejected": -63.66216278076172, + "loss": 0.9327, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5753986835479736, + "rewards/margins": 2.491450786590576, + "rewards/rejected": 0.08394765853881836, + "step": 1020 + }, + { + "epoch": 0.26, + "grad_norm": 3.5805540084838867, + "learning_rate": 9.301468093366873e-06, + "logits/chosen": -0.11561483889818192, + "logits/rejected": -0.16522666811943054, + "logps/chosen": -55.077571868896484, + "logps/rejected": -66.71158599853516, + "loss": 0.8194, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0001704692840576, + "rewards/margins": 3.21140718460083, + "rewards/rejected": -0.21123644709587097, + "step": 1021 + }, + { + "epoch": 0.26, + "grad_norm": 3.598127603530884, + "learning_rate": 9.300131962319285e-06, + "logits/chosen": -0.1490037739276886, + "logits/rejected": -0.29130280017852783, + "logps/chosen": -59.47603988647461, + "logps/rejected": -61.714439392089844, + "loss": 0.7167, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.768986940383911, + "rewards/margins": 3.911663770675659, + "rewards/rejected": -1.142676830291748, + "step": 1022 + }, + { + "epoch": 0.26, + "grad_norm": 5.561100482940674, + "learning_rate": 9.298794650792684e-06, + "logits/chosen": -0.220164492726326, + "logits/rejected": -0.3152613043785095, + "logps/chosen": -62.26943588256836, + "logps/rejected": -63.43920135498047, + "loss": 0.9322, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.815762519836426, + "rewards/margins": 2.8088111877441406, + "rewards/rejected": 0.0069512128829956055, + "step": 1023 + }, + { + "epoch": 0.26, + "grad_norm": 3.1385505199432373, + "learning_rate": 9.29745615915419e-06, + "logits/chosen": -0.10008390992879868, + "logits/rejected": -0.1601962447166443, + "logps/chosen": -58.008522033691406, + "logps/rejected": -78.48880767822266, + "loss": 0.841, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.200685501098633, + "rewards/margins": 3.868030309677124, + "rewards/rejected": -0.6673449873924255, + "step": 1024 + }, + { + "epoch": 0.26, + "grad_norm": 3.4513559341430664, + "learning_rate": 9.296116487771249e-06, + "logits/chosen": -0.13363701105117798, + "logits/rejected": -0.17535065114498138, + "logps/chosen": -48.67828369140625, + "logps/rejected": -59.15263748168945, + "loss": 0.8067, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.148987054824829, + "rewards/margins": 2.7660748958587646, + "rewards/rejected": 0.38291189074516296, + "step": 1025 + }, + { + "epoch": 0.26, + "grad_norm": 4.774328231811523, + "learning_rate": 9.29477563701163e-06, + "logits/chosen": -0.08840233087539673, + "logits/rejected": -0.19528286159038544, + "logps/chosen": -53.390995025634766, + "logps/rejected": -58.504539489746094, + "loss": 1.0322, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.651729106903076, + "rewards/margins": 2.3269317150115967, + "rewards/rejected": 0.3247976303100586, + "step": 1026 + }, + { + "epoch": 0.26, + "grad_norm": 3.6903836727142334, + "learning_rate": 9.293433607243426e-06, + "logits/chosen": -0.042492687702178955, + "logits/rejected": -0.16682811081409454, + "logps/chosen": -54.50518035888672, + "logps/rejected": -62.081817626953125, + "loss": 0.8361, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6868457794189453, + "rewards/margins": 3.0624747276306152, + "rewards/rejected": -0.3756290078163147, + "step": 1027 + }, + { + "epoch": 0.26, + "grad_norm": 4.27150297164917, + "learning_rate": 9.292090398835048e-06, + "logits/chosen": -0.08947766572237015, + "logits/rejected": -0.22569628059864044, + "logps/chosen": -70.1434555053711, + "logps/rejected": -62.30266571044922, + "loss": 0.9508, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.0019643306732178, + "rewards/margins": 3.0441396236419678, + "rewards/rejected": -0.04217533767223358, + "step": 1028 + }, + { + "epoch": 0.26, + "grad_norm": 5.425658702850342, + "learning_rate": 9.290746012155243e-06, + "logits/chosen": -0.14412713050842285, + "logits/rejected": -0.17387670278549194, + "logps/chosen": -66.35494232177734, + "logps/rejected": -72.01668548583984, + "loss": 0.9957, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.670325756072998, + "rewards/margins": 2.0053372383117676, + "rewards/rejected": 0.6649881601333618, + "step": 1029 + }, + { + "epoch": 0.26, + "grad_norm": 4.042146682739258, + "learning_rate": 9.289400447573068e-06, + "logits/chosen": -0.0671742632985115, + "logits/rejected": -0.17905782163143158, + "logps/chosen": -53.57847213745117, + "logps/rejected": -63.96678161621094, + "loss": 0.8209, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0858592987060547, + "rewards/margins": 3.4218907356262207, + "rewards/rejected": -0.336031436920166, + "step": 1030 + }, + { + "epoch": 0.26, + "grad_norm": 2.933061122894287, + "learning_rate": 9.288053705457914e-06, + "logits/chosen": -0.24596062302589417, + "logits/rejected": -0.3895095884799957, + "logps/chosen": -61.83127212524414, + "logps/rejected": -69.79736328125, + "loss": 0.7337, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8858413696289062, + "rewards/margins": 4.327299118041992, + "rewards/rejected": -1.441457986831665, + "step": 1031 + }, + { + "epoch": 0.26, + "grad_norm": 2.6700243949890137, + "learning_rate": 9.286705786179489e-06, + "logits/chosen": -0.1075604259967804, + "logits/rejected": -0.2903234362602234, + "logps/chosen": -66.03939056396484, + "logps/rejected": -60.86225891113281, + "loss": 0.7889, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.710503339767456, + "rewards/margins": 3.5682153701782227, + "rewards/rejected": -0.8577117919921875, + "step": 1032 + }, + { + "epoch": 0.26, + "grad_norm": 3.918907403945923, + "learning_rate": 9.285356690107824e-06, + "logits/chosen": -0.08697827160358429, + "logits/rejected": -0.2741522192955017, + "logps/chosen": -52.08841323852539, + "logps/rejected": -55.87983703613281, + "loss": 0.7118, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8289549350738525, + "rewards/margins": 3.922886610031128, + "rewards/rejected": -1.0939319133758545, + "step": 1033 + }, + { + "epoch": 0.26, + "grad_norm": 3.459092378616333, + "learning_rate": 9.284006417613278e-06, + "logits/chosen": -0.11485648900270462, + "logits/rejected": -0.21111133694648743, + "logps/chosen": -52.58618927001953, + "logps/rejected": -71.04336547851562, + "loss": 0.8615, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7972989082336426, + "rewards/margins": 2.3442907333374023, + "rewards/rejected": 0.4530077576637268, + "step": 1034 + }, + { + "epoch": 0.26, + "grad_norm": 5.077000141143799, + "learning_rate": 9.28265496906653e-06, + "logits/chosen": -0.16760623455047607, + "logits/rejected": -0.28052380681037903, + "logps/chosen": -57.174041748046875, + "logps/rejected": -73.66836547851562, + "loss": 1.0039, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8711607456207275, + "rewards/margins": 2.7860400676727295, + "rewards/rejected": 0.08512076735496521, + "step": 1035 + }, + { + "epoch": 0.26, + "grad_norm": 4.408838272094727, + "learning_rate": 9.281302344838579e-06, + "logits/chosen": -0.07462131977081299, + "logits/rejected": -0.13888061046600342, + "logps/chosen": -61.544158935546875, + "logps/rejected": -70.0220947265625, + "loss": 0.9057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.894733428955078, + "rewards/margins": 2.555079221725464, + "rewards/rejected": 0.3396546244621277, + "step": 1036 + }, + { + "epoch": 0.26, + "grad_norm": 6.645236015319824, + "learning_rate": 9.27994854530075e-06, + "logits/chosen": -0.15588237345218658, + "logits/rejected": -0.2969304621219635, + "logps/chosen": -57.88871765136719, + "logps/rejected": -61.20320129394531, + "loss": 1.0056, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.062405824661255, + "rewards/margins": 2.978364944458008, + "rewards/rejected": 0.08404079079627991, + "step": 1037 + }, + { + "epoch": 0.26, + "grad_norm": 4.138969421386719, + "learning_rate": 9.278593570824694e-06, + "logits/chosen": -0.0955585166811943, + "logits/rejected": -0.2066224217414856, + "logps/chosen": -50.16203308105469, + "logps/rejected": -69.72760009765625, + "loss": 0.8604, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7397801876068115, + "rewards/margins": 3.4003942012786865, + "rewards/rejected": -0.6606143712997437, + "step": 1038 + }, + { + "epoch": 0.26, + "grad_norm": 5.090577125549316, + "learning_rate": 9.277237421782376e-06, + "logits/chosen": -0.07109740376472473, + "logits/rejected": -0.08663389086723328, + "logps/chosen": -68.89447784423828, + "logps/rejected": -92.29849243164062, + "loss": 1.0283, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8341805934906006, + "rewards/margins": 2.5687997341156006, + "rewards/rejected": 0.2653810679912567, + "step": 1039 + }, + { + "epoch": 0.26, + "grad_norm": 3.6542954444885254, + "learning_rate": 9.275880098546092e-06, + "logits/chosen": -0.10519945621490479, + "logits/rejected": -0.21322765946388245, + "logps/chosen": -50.40234375, + "logps/rejected": -77.3004379272461, + "loss": 0.7607, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.787905693054199, + "rewards/margins": 2.9177165031433105, + "rewards/rejected": -0.129811093211174, + "step": 1040 + }, + { + "epoch": 0.26, + "grad_norm": 3.4726223945617676, + "learning_rate": 9.274521601488453e-06, + "logits/chosen": -0.1040353924036026, + "logits/rejected": -0.198703795671463, + "logps/chosen": -56.43289566040039, + "logps/rejected": -87.28023529052734, + "loss": 0.6897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5671207904815674, + "rewards/margins": 3.610816478729248, + "rewards/rejected": -1.0436954498291016, + "step": 1041 + }, + { + "epoch": 0.26, + "grad_norm": 4.237652778625488, + "learning_rate": 9.2731619309824e-06, + "logits/chosen": -0.04397863894701004, + "logits/rejected": -0.15299445390701294, + "logps/chosen": -59.869956970214844, + "logps/rejected": -67.90794372558594, + "loss": 0.7492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1499409675598145, + "rewards/margins": 3.1751301288604736, + "rewards/rejected": -0.025189489126205444, + "step": 1042 + }, + { + "epoch": 0.26, + "grad_norm": 4.2288031578063965, + "learning_rate": 9.271801087401187e-06, + "logits/chosen": -0.09909120202064514, + "logits/rejected": -0.22870124876499176, + "logps/chosen": -65.360107421875, + "logps/rejected": -61.12604522705078, + "loss": 0.9224, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7561094760894775, + "rewards/margins": 2.7821788787841797, + "rewards/rejected": -0.02606944739818573, + "step": 1043 + }, + { + "epoch": 0.26, + "grad_norm": 3.3207809925079346, + "learning_rate": 9.270439071118398e-06, + "logits/chosen": -0.045466627925634384, + "logits/rejected": -0.14984175562858582, + "logps/chosen": -54.25589370727539, + "logps/rejected": -77.96039581298828, + "loss": 0.7274, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.879338502883911, + "rewards/margins": 3.6689746379852295, + "rewards/rejected": -0.7896358966827393, + "step": 1044 + }, + { + "epoch": 0.26, + "grad_norm": 5.909974098205566, + "learning_rate": 9.269075882507935e-06, + "logits/chosen": 0.01695566065609455, + "logits/rejected": -0.19842591881752014, + "logps/chosen": -59.04058074951172, + "logps/rejected": -56.49765396118164, + "loss": 0.7679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7871458530426025, + "rewards/margins": 2.9806175231933594, + "rewards/rejected": -0.19347184896469116, + "step": 1045 + }, + { + "epoch": 0.26, + "grad_norm": 5.060246467590332, + "learning_rate": 9.267711521944022e-06, + "logits/chosen": -0.2200324535369873, + "logits/rejected": -0.3419118821620941, + "logps/chosen": -49.336605072021484, + "logps/rejected": -61.656761169433594, + "loss": 0.7359, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1142866611480713, + "rewards/margins": 3.8643569946289062, + "rewards/rejected": -0.7500702142715454, + "step": 1046 + }, + { + "epoch": 0.26, + "grad_norm": 5.05048942565918, + "learning_rate": 9.266345989801209e-06, + "logits/chosen": -0.12307722121477127, + "logits/rejected": -0.13094991445541382, + "logps/chosen": -67.47348022460938, + "logps/rejected": -78.81857299804688, + "loss": 0.9628, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.67935848236084, + "rewards/margins": 2.7117350101470947, + "rewards/rejected": -0.03237658366560936, + "step": 1047 + }, + { + "epoch": 0.26, + "grad_norm": 3.978734016418457, + "learning_rate": 9.264979286454358e-06, + "logits/chosen": -0.13571855425834656, + "logits/rejected": -0.2294061779975891, + "logps/chosen": -50.85662078857422, + "logps/rejected": -69.51252746582031, + "loss": 0.8003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8679463863372803, + "rewards/margins": 2.9352779388427734, + "rewards/rejected": -0.06733144074678421, + "step": 1048 + }, + { + "epoch": 0.26, + "grad_norm": 4.437260150909424, + "learning_rate": 9.263611412278665e-06, + "logits/chosen": -0.08841567486524582, + "logits/rejected": -0.1984134465456009, + "logps/chosen": -63.41436767578125, + "logps/rejected": -67.2572021484375, + "loss": 0.8955, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.537278175354004, + "rewards/margins": 2.8254780769348145, + "rewards/rejected": -0.28820013999938965, + "step": 1049 + }, + { + "epoch": 0.26, + "grad_norm": 4.205920219421387, + "learning_rate": 9.262242367649637e-06, + "logits/chosen": -0.0019520032219588757, + "logits/rejected": -0.05939626321196556, + "logps/chosen": -57.13789749145508, + "logps/rejected": -75.64269256591797, + "loss": 0.8206, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9491539001464844, + "rewards/margins": 2.19504451751709, + "rewards/rejected": 0.754109263420105, + "step": 1050 + }, + { + "epoch": 0.26, + "grad_norm": 6.406669616699219, + "learning_rate": 9.260872152943106e-06, + "logits/chosen": -0.060166552662849426, + "logits/rejected": -0.16553758084774017, + "logps/chosen": -64.51937866210938, + "logps/rejected": -75.79224395751953, + "loss": 1.0688, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7015879154205322, + "rewards/margins": 2.162419319152832, + "rewards/rejected": 0.5391688942909241, + "step": 1051 + }, + { + "epoch": 0.26, + "grad_norm": 6.500846862792969, + "learning_rate": 9.259500768535226e-06, + "logits/chosen": -0.15596550703048706, + "logits/rejected": -0.186916321516037, + "logps/chosen": -66.80743408203125, + "logps/rejected": -79.17172241210938, + "loss": 1.1883, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.7128820419311523, + "rewards/margins": 2.3540151119232178, + "rewards/rejected": 0.3588670492172241, + "step": 1052 + }, + { + "epoch": 0.26, + "grad_norm": 5.410404682159424, + "learning_rate": 9.258128214802474e-06, + "logits/chosen": -0.14643815159797668, + "logits/rejected": -0.22261762619018555, + "logps/chosen": -57.71858215332031, + "logps/rejected": -78.89875030517578, + "loss": 0.8163, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.868298053741455, + "rewards/margins": 3.427516460418701, + "rewards/rejected": -0.5592180490493774, + "step": 1053 + }, + { + "epoch": 0.26, + "grad_norm": 5.383137226104736, + "learning_rate": 9.256754492121643e-06, + "logits/chosen": -0.13861778378486633, + "logits/rejected": -0.28582215309143066, + "logps/chosen": -61.52800369262695, + "logps/rejected": -79.82968139648438, + "loss": 0.9984, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4519224166870117, + "rewards/margins": 3.3466134071350098, + "rewards/rejected": -0.8946908712387085, + "step": 1054 + }, + { + "epoch": 0.26, + "grad_norm": 3.601926803588867, + "learning_rate": 9.255379600869852e-06, + "logits/chosen": -0.10140880197286606, + "logits/rejected": -0.24071282148361206, + "logps/chosen": -57.024845123291016, + "logps/rejected": -62.111732482910156, + "loss": 0.7983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.984778642654419, + "rewards/margins": 3.436601161956787, + "rewards/rejected": -0.45182254910469055, + "step": 1055 + }, + { + "epoch": 0.26, + "grad_norm": 5.152816295623779, + "learning_rate": 9.254003541424534e-06, + "logits/chosen": -0.13825298845767975, + "logits/rejected": -0.30888694524765015, + "logps/chosen": -63.184974670410156, + "logps/rejected": -60.843631744384766, + "loss": 0.8607, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8477444648742676, + "rewards/margins": 3.2213027477264404, + "rewards/rejected": -0.37355825304985046, + "step": 1056 + }, + { + "epoch": 0.26, + "grad_norm": 3.5701687335968018, + "learning_rate": 9.252626314163452e-06, + "logits/chosen": 0.006599389016628265, + "logits/rejected": -0.12651002407073975, + "logps/chosen": -52.707611083984375, + "logps/rejected": -69.42970275878906, + "loss": 0.7881, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0173099040985107, + "rewards/margins": 3.5182502269744873, + "rewards/rejected": -0.500940203666687, + "step": 1057 + }, + { + "epoch": 0.26, + "grad_norm": 2.9408178329467773, + "learning_rate": 9.251247919464682e-06, + "logits/chosen": -0.0818103477358818, + "logits/rejected": -0.31976285576820374, + "logps/chosen": -50.53556823730469, + "logps/rejected": -51.75800323486328, + "loss": 0.6818, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.705137014389038, + "rewards/margins": 3.3403611183166504, + "rewards/rejected": -0.6352238059043884, + "step": 1058 + }, + { + "epoch": 0.26, + "grad_norm": 4.444564342498779, + "learning_rate": 9.249868357706622e-06, + "logits/chosen": -0.13503459095954895, + "logits/rejected": -0.2732657790184021, + "logps/chosen": -65.25596618652344, + "logps/rejected": -72.13631439208984, + "loss": 0.8864, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8960604667663574, + "rewards/margins": 3.1965088844299316, + "rewards/rejected": -0.30044859647750854, + "step": 1059 + }, + { + "epoch": 0.27, + "grad_norm": 7.694329738616943, + "learning_rate": 9.248487629267994e-06, + "logits/chosen": -0.09975416958332062, + "logits/rejected": -0.16111250221729279, + "logps/chosen": -62.186851501464844, + "logps/rejected": -71.1956787109375, + "loss": 0.9926, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7257068157196045, + "rewards/margins": 2.552896499633789, + "rewards/rejected": 0.17281030118465424, + "step": 1060 + }, + { + "epoch": 0.27, + "grad_norm": 4.482682228088379, + "learning_rate": 9.247105734527838e-06, + "logits/chosen": -0.08263653516769409, + "logits/rejected": -0.11546052247285843, + "logps/chosen": -58.10259246826172, + "logps/rejected": -76.35404968261719, + "loss": 0.8863, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6127867698669434, + "rewards/margins": 2.964857816696167, + "rewards/rejected": -0.3520709276199341, + "step": 1061 + }, + { + "epoch": 0.27, + "grad_norm": 5.360846042633057, + "learning_rate": 9.24572267386551e-06, + "logits/chosen": -0.1610848605632782, + "logits/rejected": -0.32609042525291443, + "logps/chosen": -68.42381286621094, + "logps/rejected": -50.920005798339844, + "loss": 1.1103, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.799098491668701, + "rewards/margins": 2.4410817623138428, + "rewards/rejected": 0.3580166697502136, + "step": 1062 + }, + { + "epoch": 0.27, + "grad_norm": 4.368180751800537, + "learning_rate": 9.244338447660692e-06, + "logits/chosen": -0.16917653381824493, + "logits/rejected": -0.2792043685913086, + "logps/chosen": -61.171939849853516, + "logps/rejected": -69.90045166015625, + "loss": 0.955, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.047426700592041, + "rewards/margins": 2.914147138595581, + "rewards/rejected": 0.13327980041503906, + "step": 1063 + }, + { + "epoch": 0.27, + "grad_norm": 4.751705646514893, + "learning_rate": 9.242953056293387e-06, + "logits/chosen": -0.1046595424413681, + "logits/rejected": -0.1683615744113922, + "logps/chosen": -67.11834716796875, + "logps/rejected": -70.54529571533203, + "loss": 1.0481, + "rewards/accuracies": 0.65625, + "rewards/chosen": 2.836881160736084, + "rewards/margins": 2.439417600631714, + "rewards/rejected": 0.3974634110927582, + "step": 1064 + }, + { + "epoch": 0.27, + "grad_norm": 3.706683397293091, + "learning_rate": 9.24156650014391e-06, + "logits/chosen": -0.09086276590824127, + "logits/rejected": -0.24066215753555298, + "logps/chosen": -65.9075927734375, + "logps/rejected": -63.76836013793945, + "loss": 0.9035, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8132054805755615, + "rewards/margins": 3.023481845855713, + "rewards/rejected": -0.21027624607086182, + "step": 1065 + }, + { + "epoch": 0.27, + "grad_norm": 3.898366928100586, + "learning_rate": 9.240178779592906e-06, + "logits/chosen": -0.11192606389522552, + "logits/rejected": -0.2931104600429535, + "logps/chosen": -78.14714813232422, + "logps/rejected": -67.17153930664062, + "loss": 0.7714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7304394245147705, + "rewards/margins": 3.4280831813812256, + "rewards/rejected": -0.6976437568664551, + "step": 1066 + }, + { + "epoch": 0.27, + "grad_norm": 5.245119571685791, + "learning_rate": 9.23878989502133e-06, + "logits/chosen": -0.09048233181238174, + "logits/rejected": -0.23374906182289124, + "logps/chosen": -64.49345397949219, + "logps/rejected": -68.72672271728516, + "loss": 0.8588, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.794959783554077, + "rewards/margins": 3.3675060272216797, + "rewards/rejected": -0.572546124458313, + "step": 1067 + }, + { + "epoch": 0.27, + "grad_norm": 4.348679542541504, + "learning_rate": 9.23739984681046e-06, + "logits/chosen": -0.10992153733968735, + "logits/rejected": -0.2537555396556854, + "logps/chosen": -63.973419189453125, + "logps/rejected": -70.13710021972656, + "loss": 0.8471, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.038052558898926, + "rewards/margins": 3.122126817703247, + "rewards/rejected": -0.08407415449619293, + "step": 1068 + }, + { + "epoch": 0.27, + "grad_norm": 3.844515562057495, + "learning_rate": 9.236008635341898e-06, + "logits/chosen": -0.11377041041851044, + "logits/rejected": -0.2033383548259735, + "logps/chosen": -84.05589294433594, + "logps/rejected": -76.42314147949219, + "loss": 0.9701, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.875281810760498, + "rewards/margins": 2.916640281677246, + "rewards/rejected": -0.04135815054178238, + "step": 1069 + }, + { + "epoch": 0.27, + "grad_norm": 3.825748920440674, + "learning_rate": 9.23461626099756e-06, + "logits/chosen": -0.03875890001654625, + "logits/rejected": -0.18180617690086365, + "logps/chosen": -59.10527420043945, + "logps/rejected": -77.4354248046875, + "loss": 0.7774, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8220109939575195, + "rewards/margins": 2.799503803253174, + "rewards/rejected": 0.022507579997181892, + "step": 1070 + }, + { + "epoch": 0.27, + "grad_norm": 3.3920207023620605, + "learning_rate": 9.233222724159682e-06, + "logits/chosen": -0.2618081569671631, + "logits/rejected": -0.31199681758880615, + "logps/chosen": -50.40976333618164, + "logps/rejected": -70.07401275634766, + "loss": 0.7289, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9561305046081543, + "rewards/margins": 3.413357734680176, + "rewards/rejected": -0.4572274684906006, + "step": 1071 + }, + { + "epoch": 0.27, + "grad_norm": 4.457772254943848, + "learning_rate": 9.231828025210821e-06, + "logits/chosen": -0.12531188130378723, + "logits/rejected": -0.2711395025253296, + "logps/chosen": -64.28221893310547, + "logps/rejected": -61.528507232666016, + "loss": 1.0053, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6411385536193848, + "rewards/margins": 2.7524943351745605, + "rewards/rejected": -0.1113557294011116, + "step": 1072 + }, + { + "epoch": 0.27, + "grad_norm": 3.1484298706054688, + "learning_rate": 9.23043216453385e-06, + "logits/chosen": -0.15678031742572784, + "logits/rejected": -0.2644728720188141, + "logps/chosen": -53.87049102783203, + "logps/rejected": -68.06608581542969, + "loss": 0.7936, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7403132915496826, + "rewards/margins": 3.1613423824310303, + "rewards/rejected": -0.42102906107902527, + "step": 1073 + }, + { + "epoch": 0.27, + "grad_norm": 4.075427055358887, + "learning_rate": 9.229035142511964e-06, + "logits/chosen": -0.14084555208683014, + "logits/rejected": -0.2514224648475647, + "logps/chosen": -54.33985137939453, + "logps/rejected": -63.763206481933594, + "loss": 0.9282, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8387351036071777, + "rewards/margins": 2.857208013534546, + "rewards/rejected": -0.018473029136657715, + "step": 1074 + }, + { + "epoch": 0.27, + "grad_norm": 5.767397403717041, + "learning_rate": 9.227636959528678e-06, + "logits/chosen": -0.1972188502550125, + "logits/rejected": -0.2771735191345215, + "logps/chosen": -50.80841064453125, + "logps/rejected": -66.90803527832031, + "loss": 0.9657, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.641779661178589, + "rewards/margins": 2.6681969165802, + "rewards/rejected": -0.02641759067773819, + "step": 1075 + }, + { + "epoch": 0.27, + "grad_norm": 3.163799285888672, + "learning_rate": 9.226237615967822e-06, + "logits/chosen": -0.12358416616916656, + "logits/rejected": -0.26018381118774414, + "logps/chosen": -61.71136474609375, + "logps/rejected": -72.13582611083984, + "loss": 0.8075, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.950286626815796, + "rewards/margins": 3.7992372512817383, + "rewards/rejected": -0.8489508628845215, + "step": 1076 + }, + { + "epoch": 0.27, + "grad_norm": 4.7930684089660645, + "learning_rate": 9.224837112213542e-06, + "logits/chosen": -0.10828876495361328, + "logits/rejected": -0.19499708712100983, + "logps/chosen": -58.47709274291992, + "logps/rejected": -76.70428466796875, + "loss": 0.8255, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.540466547012329, + "rewards/margins": 2.889101028442383, + "rewards/rejected": -0.3486344814300537, + "step": 1077 + }, + { + "epoch": 0.27, + "grad_norm": 5.854869842529297, + "learning_rate": 9.223435448650312e-06, + "logits/chosen": -0.10905355215072632, + "logits/rejected": -0.1861913651227951, + "logps/chosen": -60.23396301269531, + "logps/rejected": -74.80816650390625, + "loss": 0.9416, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5527241230010986, + "rewards/margins": 2.5488481521606445, + "rewards/rejected": 0.0038756802678108215, + "step": 1078 + }, + { + "epoch": 0.27, + "grad_norm": 3.392249584197998, + "learning_rate": 9.222032625662916e-06, + "logits/chosen": -0.22525672614574432, + "logits/rejected": -0.355680912733078, + "logps/chosen": -56.11549758911133, + "logps/rejected": -65.50201416015625, + "loss": 0.7852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1022257804870605, + "rewards/margins": 3.5393123626708984, + "rewards/rejected": -0.43708670139312744, + "step": 1079 + }, + { + "epoch": 0.27, + "grad_norm": 3.0922904014587402, + "learning_rate": 9.220628643636462e-06, + "logits/chosen": -0.02357356995344162, + "logits/rejected": -0.21002264320850372, + "logps/chosen": -66.00736236572266, + "logps/rejected": -59.38707733154297, + "loss": 0.7795, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.110255241394043, + "rewards/margins": 3.9189417362213135, + "rewards/rejected": -0.808686375617981, + "step": 1080 + }, + { + "epoch": 0.27, + "grad_norm": 5.970247745513916, + "learning_rate": 9.21922350295637e-06, + "logits/chosen": -0.08183164894580841, + "logits/rejected": -0.2371576875448227, + "logps/chosen": -74.11322021484375, + "logps/rejected": -81.11566162109375, + "loss": 1.1475, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3020541667938232, + "rewards/margins": 2.6357200145721436, + "rewards/rejected": -0.33366575837135315, + "step": 1081 + }, + { + "epoch": 0.27, + "grad_norm": 3.3441150188446045, + "learning_rate": 9.217817204008382e-06, + "logits/chosen": -0.16256242990493774, + "logits/rejected": -0.30409806966781616, + "logps/chosen": -57.49406433105469, + "logps/rejected": -56.96905517578125, + "loss": 0.7441, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.620327949523926, + "rewards/margins": 3.1580448150634766, + "rewards/rejected": -0.5377168655395508, + "step": 1082 + }, + { + "epoch": 0.27, + "grad_norm": 4.3805036544799805, + "learning_rate": 9.216409747178559e-06, + "logits/chosen": -0.013885801658034325, + "logits/rejected": -0.20448487997055054, + "logps/chosen": -71.69287872314453, + "logps/rejected": -65.26693725585938, + "loss": 0.7963, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5186033248901367, + "rewards/margins": 3.174511671066284, + "rewards/rejected": -0.6559081077575684, + "step": 1083 + }, + { + "epoch": 0.27, + "grad_norm": 5.096024036407471, + "learning_rate": 9.215001132853277e-06, + "logits/chosen": -0.16361384093761444, + "logits/rejected": -0.2173074185848236, + "logps/chosen": -52.869178771972656, + "logps/rejected": -69.9111557006836, + "loss": 1.0267, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5045366287231445, + "rewards/margins": 2.8332297801971436, + "rewards/rejected": -0.3286936283111572, + "step": 1084 + }, + { + "epoch": 0.27, + "grad_norm": 3.5923912525177, + "learning_rate": 9.213591361419231e-06, + "logits/chosen": -0.15056374669075012, + "logits/rejected": -0.26505744457244873, + "logps/chosen": -57.29247283935547, + "logps/rejected": -72.34352111816406, + "loss": 0.8262, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4473683834075928, + "rewards/margins": 3.169801950454712, + "rewards/rejected": -0.7224336266517639, + "step": 1085 + }, + { + "epoch": 0.27, + "grad_norm": 4.055616855621338, + "learning_rate": 9.212180433263436e-06, + "logits/chosen": -0.13875098526477814, + "logits/rejected": -0.2948298156261444, + "logps/chosen": -57.78587341308594, + "logps/rejected": -66.45672607421875, + "loss": 0.7958, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9966979026794434, + "rewards/margins": 2.797215461730957, + "rewards/rejected": 0.19948270916938782, + "step": 1086 + }, + { + "epoch": 0.27, + "grad_norm": 4.165387153625488, + "learning_rate": 9.21076834877322e-06, + "logits/chosen": -0.2008974701166153, + "logits/rejected": -0.27338874340057373, + "logps/chosen": -53.10332107543945, + "logps/rejected": -63.69215774536133, + "loss": 0.9424, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8981380462646484, + "rewards/margins": 3.0128562450408936, + "rewards/rejected": -0.11471825838088989, + "step": 1087 + }, + { + "epoch": 0.27, + "grad_norm": 4.159326076507568, + "learning_rate": 9.20935510833623e-06, + "logits/chosen": -0.1172533854842186, + "logits/rejected": -0.21945087611675262, + "logps/chosen": -53.262943267822266, + "logps/rejected": -54.47647476196289, + "loss": 0.8375, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6356348991394043, + "rewards/margins": 3.00815749168396, + "rewards/rejected": -0.3725225031375885, + "step": 1088 + }, + { + "epoch": 0.27, + "grad_norm": 4.286810874938965, + "learning_rate": 9.207940712340433e-06, + "logits/chosen": -0.12511597573757172, + "logits/rejected": -0.22170224785804749, + "logps/chosen": -61.87845230102539, + "logps/rejected": -83.5710678100586, + "loss": 0.7966, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7571260929107666, + "rewards/margins": 3.22464656829834, + "rewards/rejected": -0.4675205647945404, + "step": 1089 + }, + { + "epoch": 0.27, + "grad_norm": 4.9947896003723145, + "learning_rate": 9.20652516117411e-06, + "logits/chosen": -0.005045775324106216, + "logits/rejected": -0.1476253718137741, + "logps/chosen": -57.94611358642578, + "logps/rejected": -75.34717559814453, + "loss": 0.7449, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.963531017303467, + "rewards/margins": 2.730790615081787, + "rewards/rejected": 0.23274032771587372, + "step": 1090 + }, + { + "epoch": 0.27, + "grad_norm": 6.763038158416748, + "learning_rate": 9.20510845522586e-06, + "logits/chosen": -0.07978639751672745, + "logits/rejected": -0.22838859260082245, + "logps/chosen": -61.14339065551758, + "logps/rejected": -77.59598541259766, + "loss": 0.823, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.772981882095337, + "rewards/margins": 3.2797820568084717, + "rewards/rejected": -0.5068001747131348, + "step": 1091 + }, + { + "epoch": 0.27, + "grad_norm": 3.2032649517059326, + "learning_rate": 9.2036905948846e-06, + "logits/chosen": -0.12394124269485474, + "logits/rejected": -0.3301861882209778, + "logps/chosen": -63.26573944091797, + "logps/rejected": -64.74536895751953, + "loss": 0.7265, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.956761360168457, + "rewards/margins": 4.230443000793457, + "rewards/rejected": -1.273681640625, + "step": 1092 + }, + { + "epoch": 0.27, + "grad_norm": 4.949277877807617, + "learning_rate": 9.202271580539564e-06, + "logits/chosen": -0.11422401666641235, + "logits/rejected": -0.2038496881723404, + "logps/chosen": -66.09920501708984, + "logps/rejected": -78.42740631103516, + "loss": 0.895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4497838020324707, + "rewards/margins": 3.2911267280578613, + "rewards/rejected": -0.8413428664207458, + "step": 1093 + }, + { + "epoch": 0.27, + "grad_norm": 6.389739513397217, + "learning_rate": 9.2008514125803e-06, + "logits/chosen": -0.08472826331853867, + "logits/rejected": -0.20294347405433655, + "logps/chosen": -51.29851150512695, + "logps/rejected": -67.71113586425781, + "loss": 0.8927, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6255998611450195, + "rewards/margins": 3.240257501602173, + "rewards/rejected": -0.6146580576896667, + "step": 1094 + }, + { + "epoch": 0.27, + "grad_norm": 3.8733534812927246, + "learning_rate": 9.199430091396677e-06, + "logits/chosen": -0.10859040915966034, + "logits/rejected": -0.19582140445709229, + "logps/chosen": -60.89190673828125, + "logps/rejected": -67.95636749267578, + "loss": 0.8128, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.636988639831543, + "rewards/margins": 3.13369083404541, + "rewards/rejected": -0.49670231342315674, + "step": 1095 + }, + { + "epoch": 0.27, + "grad_norm": 4.916275501251221, + "learning_rate": 9.198007617378876e-06, + "logits/chosen": -0.04038895666599274, + "logits/rejected": -0.16487447917461395, + "logps/chosen": -56.674415588378906, + "logps/rejected": -65.65460205078125, + "loss": 0.8916, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7695417404174805, + "rewards/margins": 2.928370475769043, + "rewards/rejected": -0.1588287353515625, + "step": 1096 + }, + { + "epoch": 0.27, + "grad_norm": 4.469991683959961, + "learning_rate": 9.1965839909174e-06, + "logits/chosen": -0.12566953897476196, + "logits/rejected": -0.25572699308395386, + "logps/chosen": -49.02577590942383, + "logps/rejected": -65.88700103759766, + "loss": 0.6975, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.797233819961548, + "rewards/margins": 3.7717199325561523, + "rewards/rejected": -0.9744863510131836, + "step": 1097 + }, + { + "epoch": 0.27, + "grad_norm": 4.024089813232422, + "learning_rate": 9.195159212403064e-06, + "logits/chosen": -0.08827241510152817, + "logits/rejected": -0.2766229510307312, + "logps/chosen": -57.9786376953125, + "logps/rejected": -55.14748764038086, + "loss": 0.7357, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.82323956489563, + "rewards/margins": 3.2607693672180176, + "rewards/rejected": -0.4375297427177429, + "step": 1098 + }, + { + "epoch": 0.27, + "grad_norm": 5.368036270141602, + "learning_rate": 9.193733282226997e-06, + "logits/chosen": -0.18000759184360504, + "logits/rejected": -0.31686121225357056, + "logps/chosen": -56.3790283203125, + "logps/rejected": -59.414093017578125, + "loss": 0.8801, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8973608016967773, + "rewards/margins": 3.752931594848633, + "rewards/rejected": -0.8555706739425659, + "step": 1099 + }, + { + "epoch": 0.28, + "grad_norm": 2.6845333576202393, + "learning_rate": 9.192306200780652e-06, + "logits/chosen": -0.03925419598817825, + "logits/rejected": -0.1689458042383194, + "logps/chosen": -58.86277389526367, + "logps/rejected": -75.29706573486328, + "loss": 0.7274, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.879143714904785, + "rewards/margins": 4.1948747634887695, + "rewards/rejected": -1.3157315254211426, + "step": 1100 + }, + { + "epoch": 0.28, + "grad_norm": 4.267556667327881, + "learning_rate": 9.190877968455794e-06, + "logits/chosen": -0.11555404961109161, + "logits/rejected": -0.20262077450752258, + "logps/chosen": -61.614158630371094, + "logps/rejected": -66.19100189208984, + "loss": 0.96, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7379817962646484, + "rewards/margins": 2.4622933864593506, + "rewards/rejected": 0.27568840980529785, + "step": 1101 + }, + { + "epoch": 0.28, + "grad_norm": 3.608179807662964, + "learning_rate": 9.189448585644499e-06, + "logits/chosen": -0.09131714701652527, + "logits/rejected": -0.24685880541801453, + "logps/chosen": -55.1065673828125, + "logps/rejected": -58.41002655029297, + "loss": 0.8241, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.884782075881958, + "rewards/margins": 4.134772300720215, + "rewards/rejected": -1.2499899864196777, + "step": 1102 + }, + { + "epoch": 0.28, + "grad_norm": 4.594669342041016, + "learning_rate": 9.188018052739166e-06, + "logits/chosen": -0.16660089790821075, + "logits/rejected": -0.2309606373310089, + "logps/chosen": -49.83094787597656, + "logps/rejected": -83.85640716552734, + "loss": 0.7751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.62589693069458, + "rewards/margins": 2.7041289806365967, + "rewards/rejected": -0.0782322883605957, + "step": 1103 + }, + { + "epoch": 0.28, + "grad_norm": 4.723653316497803, + "learning_rate": 9.186586370132508e-06, + "logits/chosen": -0.10531030595302582, + "logits/rejected": -0.24748429656028748, + "logps/chosen": -61.45093536376953, + "logps/rejected": -60.91629409790039, + "loss": 0.9395, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.7647705078125, + "rewards/margins": 3.0237224102020264, + "rewards/rejected": -0.2589520215988159, + "step": 1104 + }, + { + "epoch": 0.28, + "grad_norm": 4.429227828979492, + "learning_rate": 9.185153538217553e-06, + "logits/chosen": -0.10025256872177124, + "logits/rejected": -0.19261609017848969, + "logps/chosen": -69.14251708984375, + "logps/rejected": -68.91675567626953, + "loss": 0.979, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.641235113143921, + "rewards/margins": 2.9136290550231934, + "rewards/rejected": -0.2723942697048187, + "step": 1105 + }, + { + "epoch": 0.28, + "grad_norm": 3.9680936336517334, + "learning_rate": 9.18371955738764e-06, + "logits/chosen": -0.16122402250766754, + "logits/rejected": -0.2503926157951355, + "logps/chosen": -57.74052429199219, + "logps/rejected": -82.06304931640625, + "loss": 0.8525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.91633677482605, + "rewards/margins": 3.340919017791748, + "rewards/rejected": -0.424582302570343, + "step": 1106 + }, + { + "epoch": 0.28, + "grad_norm": 5.067862510681152, + "learning_rate": 9.182284428036434e-06, + "logits/chosen": -0.10225807130336761, + "logits/rejected": -0.20920220017433167, + "logps/chosen": -52.667144775390625, + "logps/rejected": -67.5865249633789, + "loss": 0.7939, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7443904876708984, + "rewards/margins": 3.011413097381592, + "rewards/rejected": -0.26702240109443665, + "step": 1107 + }, + { + "epoch": 0.28, + "grad_norm": 3.633122205734253, + "learning_rate": 9.180848150557906e-06, + "logits/chosen": -0.1131524071097374, + "logits/rejected": -0.26276111602783203, + "logps/chosen": -59.665008544921875, + "logps/rejected": -58.489070892333984, + "loss": 0.7626, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.820193290710449, + "rewards/margins": 3.024779796600342, + "rewards/rejected": -0.204586461186409, + "step": 1108 + }, + { + "epoch": 0.28, + "grad_norm": 4.362043380737305, + "learning_rate": 9.179410725346342e-06, + "logits/chosen": -0.12575168907642365, + "logits/rejected": -0.18098898231983185, + "logps/chosen": -59.019134521484375, + "logps/rejected": -82.94776153564453, + "loss": 0.8582, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8974759578704834, + "rewards/margins": 3.398603916168213, + "rewards/rejected": -0.5011278390884399, + "step": 1109 + }, + { + "epoch": 0.28, + "grad_norm": 7.098262786865234, + "learning_rate": 9.177972152796352e-06, + "logits/chosen": -0.13358192145824432, + "logits/rejected": -0.2372535616159439, + "logps/chosen": -65.40518188476562, + "logps/rejected": -63.34043884277344, + "loss": 1.0638, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.3087732791900635, + "rewards/margins": 2.427654266357422, + "rewards/rejected": -0.11888080835342407, + "step": 1110 + }, + { + "epoch": 0.28, + "grad_norm": 2.6803486347198486, + "learning_rate": 9.176532433302854e-06, + "logits/chosen": -0.02135433256626129, + "logits/rejected": -0.2040444016456604, + "logps/chosen": -70.3957748413086, + "logps/rejected": -62.471466064453125, + "loss": 0.7425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0438313484191895, + "rewards/margins": 3.7214431762695312, + "rewards/rejected": -0.6776119470596313, + "step": 1111 + }, + { + "epoch": 0.28, + "grad_norm": 5.1729631423950195, + "learning_rate": 9.175091567261078e-06, + "logits/chosen": -0.11067111045122147, + "logits/rejected": -0.1147952750325203, + "logps/chosen": -57.51948165893555, + "logps/rejected": -77.73716735839844, + "loss": 0.8781, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8668088912963867, + "rewards/margins": 2.3618807792663574, + "rewards/rejected": 0.5049278140068054, + "step": 1112 + }, + { + "epoch": 0.28, + "grad_norm": 3.5392470359802246, + "learning_rate": 9.173649555066574e-06, + "logits/chosen": -0.1083441898226738, + "logits/rejected": -0.21375255286693573, + "logps/chosen": -57.90522384643555, + "logps/rejected": -78.15821075439453, + "loss": 0.7113, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9221551418304443, + "rewards/margins": 3.8423409461975098, + "rewards/rejected": -0.9201857447624207, + "step": 1113 + }, + { + "epoch": 0.28, + "grad_norm": 5.154374122619629, + "learning_rate": 9.17220639711521e-06, + "logits/chosen": -0.25952261686325073, + "logits/rejected": -0.3492420017719269, + "logps/chosen": -56.33831024169922, + "logps/rejected": -66.74659729003906, + "loss": 0.88, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9355387687683105, + "rewards/margins": 3.184814453125, + "rewards/rejected": -0.24927553534507751, + "step": 1114 + }, + { + "epoch": 0.28, + "grad_norm": 5.202516078948975, + "learning_rate": 9.17076209380316e-06, + "logits/chosen": -0.21853423118591309, + "logits/rejected": -0.3503979444503784, + "logps/chosen": -62.73167419433594, + "logps/rejected": -67.09327697753906, + "loss": 0.9923, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.663637161254883, + "rewards/margins": 3.218475341796875, + "rewards/rejected": -0.5548384189605713, + "step": 1115 + }, + { + "epoch": 0.28, + "grad_norm": 3.987004280090332, + "learning_rate": 9.169316645526919e-06, + "logits/chosen": -0.060978472232818604, + "logits/rejected": -0.1382662057876587, + "logps/chosen": -59.53372573852539, + "logps/rejected": -85.96101379394531, + "loss": 0.8361, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9932336807250977, + "rewards/margins": 3.1214022636413574, + "rewards/rejected": -0.1281687617301941, + "step": 1116 + }, + { + "epoch": 0.28, + "grad_norm": 2.970632791519165, + "learning_rate": 9.167870052683288e-06, + "logits/chosen": -0.12965065240859985, + "logits/rejected": -0.3032756447792053, + "logps/chosen": -51.48820114135742, + "logps/rejected": -58.73661804199219, + "loss": 0.7149, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8824424743652344, + "rewards/margins": 3.8890085220336914, + "rewards/rejected": -1.0065661668777466, + "step": 1117 + }, + { + "epoch": 0.28, + "grad_norm": 4.633979320526123, + "learning_rate": 9.166422315669394e-06, + "logits/chosen": -0.1555308699607849, + "logits/rejected": -0.2551420032978058, + "logps/chosen": -51.04546356201172, + "logps/rejected": -71.32258605957031, + "loss": 0.7211, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0049498081207275, + "rewards/margins": 3.9036850929260254, + "rewards/rejected": -0.8987357020378113, + "step": 1118 + }, + { + "epoch": 0.28, + "grad_norm": 4.150485992431641, + "learning_rate": 9.16497343488267e-06, + "logits/chosen": -0.16214048862457275, + "logits/rejected": -0.2554890513420105, + "logps/chosen": -58.40482711791992, + "logps/rejected": -60.522743225097656, + "loss": 0.8129, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8133368492126465, + "rewards/margins": 2.6226727962493896, + "rewards/rejected": 0.19066408276557922, + "step": 1119 + }, + { + "epoch": 0.28, + "grad_norm": 4.219474792480469, + "learning_rate": 9.163523410720866e-06, + "logits/chosen": -0.14984604716300964, + "logits/rejected": -0.25015419721603394, + "logps/chosen": -55.947139739990234, + "logps/rejected": -68.7513198852539, + "loss": 0.8736, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6988909244537354, + "rewards/margins": 2.91770076751709, + "rewards/rejected": -0.2188096046447754, + "step": 1120 + }, + { + "epoch": 0.28, + "grad_norm": 4.3997063636779785, + "learning_rate": 9.162072243582044e-06, + "logits/chosen": -0.055669691413640976, + "logits/rejected": -0.260052889585495, + "logps/chosen": -59.84088134765625, + "logps/rejected": -54.49820327758789, + "loss": 0.8668, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7205400466918945, + "rewards/margins": 3.196380615234375, + "rewards/rejected": -0.4758407473564148, + "step": 1121 + }, + { + "epoch": 0.28, + "grad_norm": 3.031609535217285, + "learning_rate": 9.16061993386458e-06, + "logits/chosen": -0.21062886714935303, + "logits/rejected": -0.2354193478822708, + "logps/chosen": -58.38837814331055, + "logps/rejected": -74.86682891845703, + "loss": 0.7864, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.860549211502075, + "rewards/margins": 3.123487710952759, + "rewards/rejected": -0.26293885707855225, + "step": 1122 + }, + { + "epoch": 0.28, + "grad_norm": 5.609443664550781, + "learning_rate": 9.159166481967164e-06, + "logits/chosen": -0.2228144407272339, + "logits/rejected": -0.3216230869293213, + "logps/chosen": -51.15890121459961, + "logps/rejected": -74.58808898925781, + "loss": 0.9676, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.859006643295288, + "rewards/margins": 2.7980759143829346, + "rewards/rejected": 0.060930490493774414, + "step": 1123 + }, + { + "epoch": 0.28, + "grad_norm": 3.35602068901062, + "learning_rate": 9.157711888288802e-06, + "logits/chosen": -0.06785422563552856, + "logits/rejected": -0.16686153411865234, + "logps/chosen": -61.301673889160156, + "logps/rejected": -77.27195739746094, + "loss": 0.9211, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9978814125061035, + "rewards/margins": 3.0096683502197266, + "rewards/rejected": -0.01178702712059021, + "step": 1124 + }, + { + "epoch": 0.28, + "grad_norm": 5.9790449142456055, + "learning_rate": 9.156256153228811e-06, + "logits/chosen": -0.11786920577287674, + "logits/rejected": -0.21543936431407928, + "logps/chosen": -51.527732849121094, + "logps/rejected": -73.49209594726562, + "loss": 0.9135, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4619874954223633, + "rewards/margins": 3.2341861724853516, + "rewards/rejected": -0.7721985578536987, + "step": 1125 + }, + { + "epoch": 0.28, + "grad_norm": 5.264896392822266, + "learning_rate": 9.15479927718682e-06, + "logits/chosen": -0.1566023975610733, + "logits/rejected": -0.24331678450107574, + "logps/chosen": -58.839759826660156, + "logps/rejected": -65.84113311767578, + "loss": 0.8363, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.906172513961792, + "rewards/margins": 3.4002203941345215, + "rewards/rejected": -0.49404752254486084, + "step": 1126 + }, + { + "epoch": 0.28, + "grad_norm": 4.039844512939453, + "learning_rate": 9.153341260562774e-06, + "logits/chosen": -0.03918503224849701, + "logits/rejected": -0.19024795293807983, + "logps/chosen": -50.349586486816406, + "logps/rejected": -60.09646987915039, + "loss": 0.7425, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.918769359588623, + "rewards/margins": 3.2110509872436523, + "rewards/rejected": -0.2922815680503845, + "step": 1127 + }, + { + "epoch": 0.28, + "grad_norm": 4.052705764770508, + "learning_rate": 9.15188210375693e-06, + "logits/chosen": -0.0908157080411911, + "logits/rejected": -0.17031294107437134, + "logps/chosen": -56.59190368652344, + "logps/rejected": -69.20760345458984, + "loss": 0.8678, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.735543727874756, + "rewards/margins": 2.799036979675293, + "rewards/rejected": -0.0634932816028595, + "step": 1128 + }, + { + "epoch": 0.28, + "grad_norm": 5.553737163543701, + "learning_rate": 9.150421807169858e-06, + "logits/chosen": -0.059409260749816895, + "logits/rejected": -0.18039058148860931, + "logps/chosen": -51.45710754394531, + "logps/rejected": -62.185630798339844, + "loss": 0.6943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100581169128418, + "rewards/margins": 4.159060478210449, + "rewards/rejected": -1.0584793090820312, + "step": 1129 + }, + { + "epoch": 0.28, + "grad_norm": 3.8310558795928955, + "learning_rate": 9.148960371202442e-06, + "logits/chosen": -0.1703551709651947, + "logits/rejected": -0.2641439437866211, + "logps/chosen": -56.57456970214844, + "logps/rejected": -74.98146057128906, + "loss": 0.8402, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.025131940841675, + "rewards/margins": 3.050436019897461, + "rewards/rejected": -0.025304146111011505, + "step": 1130 + }, + { + "epoch": 0.28, + "grad_norm": 6.365297794342041, + "learning_rate": 9.147497796255875e-06, + "logits/chosen": -0.07238023728132248, + "logits/rejected": -0.19748805463314056, + "logps/chosen": -65.1187973022461, + "logps/rejected": -80.49304962158203, + "loss": 0.8779, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8748159408569336, + "rewards/margins": 3.298839569091797, + "rewards/rejected": -0.42402327060699463, + "step": 1131 + }, + { + "epoch": 0.28, + "grad_norm": 6.269277572631836, + "learning_rate": 9.146034082731668e-06, + "logits/chosen": -0.18060126900672913, + "logits/rejected": -0.19614848494529724, + "logps/chosen": -54.27809143066406, + "logps/rejected": -78.42192077636719, + "loss": 1.0347, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.566558599472046, + "rewards/margins": 2.2984557151794434, + "rewards/rejected": 0.26810282468795776, + "step": 1132 + }, + { + "epoch": 0.28, + "grad_norm": 7.1440019607543945, + "learning_rate": 9.144569231031642e-06, + "logits/chosen": -0.117728590965271, + "logits/rejected": -0.23861348628997803, + "logps/chosen": -62.918907165527344, + "logps/rejected": -68.490234375, + "loss": 1.0206, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.588531732559204, + "rewards/margins": 2.7626421451568604, + "rewards/rejected": -0.1741102933883667, + "step": 1133 + }, + { + "epoch": 0.28, + "grad_norm": 4.98776912689209, + "learning_rate": 9.143103241557927e-06, + "logits/chosen": -0.061340343207120895, + "logits/rejected": -0.21720924973487854, + "logps/chosen": -61.437416076660156, + "logps/rejected": -71.49703216552734, + "loss": 0.9036, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6991050243377686, + "rewards/margins": 3.0286154747009277, + "rewards/rejected": -0.3295106887817383, + "step": 1134 + }, + { + "epoch": 0.28, + "grad_norm": 3.6391305923461914, + "learning_rate": 9.141636114712972e-06, + "logits/chosen": -0.16316142678260803, + "logits/rejected": -0.25115951895713806, + "logps/chosen": -57.4339599609375, + "logps/rejected": -64.57574462890625, + "loss": 0.7913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6740424633026123, + "rewards/margins": 3.001988410949707, + "rewards/rejected": -0.3279460668563843, + "step": 1135 + }, + { + "epoch": 0.28, + "grad_norm": 4.5978684425354, + "learning_rate": 9.140167850899533e-06, + "logits/chosen": -0.16383759677410126, + "logits/rejected": -0.3062782287597656, + "logps/chosen": -67.41927337646484, + "logps/rejected": -70.81473541259766, + "loss": 0.883, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.961576223373413, + "rewards/margins": 3.3788909912109375, + "rewards/rejected": -0.4173150062561035, + "step": 1136 + }, + { + "epoch": 0.28, + "grad_norm": 5.4174275398254395, + "learning_rate": 9.138698450520683e-06, + "logits/chosen": -0.1470949500799179, + "logits/rejected": -0.18655826151371002, + "logps/chosen": -47.65970993041992, + "logps/rejected": -87.97738647460938, + "loss": 0.7948, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.644991397857666, + "rewards/margins": 3.059861183166504, + "rewards/rejected": -0.41486993432044983, + "step": 1137 + }, + { + "epoch": 0.28, + "grad_norm": 3.732017993927002, + "learning_rate": 9.1372279139798e-06, + "logits/chosen": -0.04313429817557335, + "logits/rejected": -0.2350635826587677, + "logps/chosen": -57.06056594848633, + "logps/rejected": -68.3395004272461, + "loss": 0.8844, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.666512966156006, + "rewards/margins": 3.7007124423980713, + "rewards/rejected": -1.0341991186141968, + "step": 1138 + }, + { + "epoch": 0.28, + "grad_norm": 6.2618818283081055, + "learning_rate": 9.135756241680581e-06, + "logits/chosen": -0.21980567276477814, + "logits/rejected": -0.3056294023990631, + "logps/chosen": -64.43868255615234, + "logps/rejected": -78.38345336914062, + "loss": 0.8907, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9495861530303955, + "rewards/margins": 3.489439010620117, + "rewards/rejected": -0.5398528575897217, + "step": 1139 + }, + { + "epoch": 0.29, + "grad_norm": 3.4073832035064697, + "learning_rate": 9.134283434027033e-06, + "logits/chosen": -0.11080469936132431, + "logits/rejected": -0.1847470998764038, + "logps/chosen": -46.41365051269531, + "logps/rejected": -74.55764770507812, + "loss": 0.7538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9680371284484863, + "rewards/margins": 3.7506866455078125, + "rewards/rejected": -0.7826493978500366, + "step": 1140 + }, + { + "epoch": 0.29, + "grad_norm": 3.9288337230682373, + "learning_rate": 9.13280949142347e-06, + "logits/chosen": -0.1428145319223404, + "logits/rejected": -0.23268532752990723, + "logps/chosen": -52.814849853515625, + "logps/rejected": -65.38280487060547, + "loss": 0.8462, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.852050542831421, + "rewards/margins": 2.8823137283325195, + "rewards/rejected": -0.030263014137744904, + "step": 1141 + }, + { + "epoch": 0.29, + "grad_norm": 3.858908176422119, + "learning_rate": 9.131334414274524e-06, + "logits/chosen": -0.03505343198776245, + "logits/rejected": -0.1476706564426422, + "logps/chosen": -56.4029541015625, + "logps/rejected": -75.07852935791016, + "loss": 0.8146, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6610729694366455, + "rewards/margins": 2.682389259338379, + "rewards/rejected": -0.021316736936569214, + "step": 1142 + }, + { + "epoch": 0.29, + "grad_norm": 4.440924644470215, + "learning_rate": 9.129858202985134e-06, + "logits/chosen": -0.12902238965034485, + "logits/rejected": -0.26977917551994324, + "logps/chosen": -62.599769592285156, + "logps/rejected": -56.864410400390625, + "loss": 0.8868, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7920522689819336, + "rewards/margins": 2.8690426349639893, + "rewards/rejected": -0.07699045538902283, + "step": 1143 + }, + { + "epoch": 0.29, + "grad_norm": 1.8168684244155884, + "learning_rate": 9.12838085796055e-06, + "logits/chosen": -0.11819716542959213, + "logits/rejected": -0.20005781948566437, + "logps/chosen": -54.2613639831543, + "logps/rejected": -74.46275329589844, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2130706310272217, + "rewards/margins": 3.8056013584136963, + "rewards/rejected": -0.5925307869911194, + "step": 1144 + }, + { + "epoch": 0.29, + "grad_norm": 4.409187316894531, + "learning_rate": 9.126902379606338e-06, + "logits/chosen": -0.07915953546762466, + "logits/rejected": -0.28382542729377747, + "logps/chosen": -56.072025299072266, + "logps/rejected": -57.20463943481445, + "loss": 0.883, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7447128295898438, + "rewards/margins": 3.385448694229126, + "rewards/rejected": -0.6407362222671509, + "step": 1145 + }, + { + "epoch": 0.29, + "grad_norm": 6.888025283813477, + "learning_rate": 9.125422768328371e-06, + "logits/chosen": -0.12748651206493378, + "logits/rejected": -0.27384403347969055, + "logps/chosen": -51.077327728271484, + "logps/rejected": -70.721435546875, + "loss": 0.6504, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.943089485168457, + "rewards/margins": 4.2550764083862305, + "rewards/rejected": -1.3119860887527466, + "step": 1146 + }, + { + "epoch": 0.29, + "grad_norm": 3.922492027282715, + "learning_rate": 9.123942024532835e-06, + "logits/chosen": -0.22312304377555847, + "logits/rejected": -0.3076724112033844, + "logps/chosen": -55.29267120361328, + "logps/rejected": -65.78875732421875, + "loss": 0.7898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.738938093185425, + "rewards/margins": 3.160334348678589, + "rewards/rejected": -0.4213966727256775, + "step": 1147 + }, + { + "epoch": 0.29, + "grad_norm": 3.60099196434021, + "learning_rate": 9.122460148626227e-06, + "logits/chosen": -0.10104811191558838, + "logits/rejected": -0.16401632130146027, + "logps/chosen": -60.49750900268555, + "logps/rejected": -69.58341979980469, + "loss": 0.8831, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.775156021118164, + "rewards/margins": 2.436288356781006, + "rewards/rejected": 0.3388673961162567, + "step": 1148 + }, + { + "epoch": 0.29, + "grad_norm": 4.860777378082275, + "learning_rate": 9.12097714101535e-06, + "logits/chosen": -0.05026143044233322, + "logits/rejected": -0.22057044506072998, + "logps/chosen": -56.5870361328125, + "logps/rejected": -78.09584045410156, + "loss": 0.7642, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.677057981491089, + "rewards/margins": 3.219698429107666, + "rewards/rejected": -0.5426403880119324, + "step": 1149 + }, + { + "epoch": 0.29, + "grad_norm": 3.77221941947937, + "learning_rate": 9.119493002107325e-06, + "logits/chosen": -0.0779230073094368, + "logits/rejected": -0.21806061267852783, + "logps/chosen": -55.69093322753906, + "logps/rejected": -70.98336029052734, + "loss": 0.8411, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9052414894104004, + "rewards/margins": 3.5095691680908203, + "rewards/rejected": -0.6043274998664856, + "step": 1150 + }, + { + "epoch": 0.29, + "grad_norm": 2.9721128940582275, + "learning_rate": 9.118007732309579e-06, + "logits/chosen": -0.1431182622909546, + "logits/rejected": -0.1934080272912979, + "logps/chosen": -44.94745635986328, + "logps/rejected": -70.15137481689453, + "loss": 0.7469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0095624923706055, + "rewards/margins": 4.171419620513916, + "rewards/rejected": -1.1618573665618896, + "step": 1151 + }, + { + "epoch": 0.29, + "grad_norm": 3.8589673042297363, + "learning_rate": 9.116521332029852e-06, + "logits/chosen": -0.14108867943286896, + "logits/rejected": -0.2823382318019867, + "logps/chosen": -55.550132751464844, + "logps/rejected": -54.19304656982422, + "loss": 0.7715, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.866010904312134, + "rewards/margins": 3.361710548400879, + "rewards/rejected": -0.49569955468177795, + "step": 1152 + }, + { + "epoch": 0.29, + "grad_norm": 8.375604629516602, + "learning_rate": 9.115033801676192e-06, + "logits/chosen": -0.13792277872562408, + "logits/rejected": -0.19025933742523193, + "logps/chosen": -51.9151725769043, + "logps/rejected": -76.5020751953125, + "loss": 0.8611, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7009520530700684, + "rewards/margins": 2.524745464324951, + "rewards/rejected": 0.1762068271636963, + "step": 1153 + }, + { + "epoch": 0.29, + "grad_norm": 3.7169458866119385, + "learning_rate": 9.113545141656956e-06, + "logits/chosen": -0.0922759547829628, + "logits/rejected": -0.24502131342887878, + "logps/chosen": -59.47494888305664, + "logps/rejected": -69.60740661621094, + "loss": 0.8173, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.738372325897217, + "rewards/margins": 3.122411012649536, + "rewards/rejected": -0.3840385377407074, + "step": 1154 + }, + { + "epoch": 0.29, + "grad_norm": 4.897540092468262, + "learning_rate": 9.112055352380818e-06, + "logits/chosen": -0.09638109058141708, + "logits/rejected": -0.20867595076560974, + "logps/chosen": -73.44829559326172, + "logps/rejected": -69.13725280761719, + "loss": 1.1108, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.4961187839508057, + "rewards/margins": 2.345332145690918, + "rewards/rejected": 0.15078668296337128, + "step": 1155 + }, + { + "epoch": 0.29, + "grad_norm": 2.5231165885925293, + "learning_rate": 9.110564434256752e-06, + "logits/chosen": -0.1346687525510788, + "logits/rejected": -0.2620835304260254, + "logps/chosen": -57.1035041809082, + "logps/rejected": -76.07112121582031, + "loss": 0.7527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1743650436401367, + "rewards/margins": 4.290319442749023, + "rewards/rejected": -1.1159541606903076, + "step": 1156 + }, + { + "epoch": 0.29, + "grad_norm": 5.057181358337402, + "learning_rate": 9.109072387694052e-06, + "logits/chosen": -0.003185074543580413, + "logits/rejected": -0.06745447218418121, + "logps/chosen": -60.321807861328125, + "logps/rejected": -83.32825469970703, + "loss": 0.8715, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6643285751342773, + "rewards/margins": 2.9413158893585205, + "rewards/rejected": -0.27698731422424316, + "step": 1157 + }, + { + "epoch": 0.29, + "grad_norm": 3.556302070617676, + "learning_rate": 9.107579213102312e-06, + "logits/chosen": -0.17490790784358978, + "logits/rejected": -0.23213765025138855, + "logps/chosen": -44.308563232421875, + "logps/rejected": -66.63833618164062, + "loss": 0.8205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.799299955368042, + "rewards/margins": 3.0016982555389404, + "rewards/rejected": -0.20239797234535217, + "step": 1158 + }, + { + "epoch": 0.29, + "grad_norm": 4.486227989196777, + "learning_rate": 9.106084910891446e-06, + "logits/chosen": -0.1494445949792862, + "logits/rejected": -0.26181238889694214, + "logps/chosen": -57.64137268066406, + "logps/rejected": -70.27926635742188, + "loss": 0.902, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0184855461120605, + "rewards/margins": 3.4446401596069336, + "rewards/rejected": -0.42615458369255066, + "step": 1159 + }, + { + "epoch": 0.29, + "grad_norm": 3.8439528942108154, + "learning_rate": 9.104589481471668e-06, + "logits/chosen": -0.09960262477397919, + "logits/rejected": -0.1540917456150055, + "logps/chosen": -65.36289978027344, + "logps/rejected": -81.4924087524414, + "loss": 0.9909, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.785655975341797, + "rewards/margins": 3.047189235687256, + "rewards/rejected": -0.2615334689617157, + "step": 1160 + }, + { + "epoch": 0.29, + "grad_norm": 4.151125431060791, + "learning_rate": 9.103092925253508e-06, + "logits/chosen": -0.08317280560731888, + "logits/rejected": -0.19228267669677734, + "logps/chosen": -68.00142669677734, + "logps/rejected": -76.50713348388672, + "loss": 0.8501, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8821592330932617, + "rewards/margins": 3.347773551940918, + "rewards/rejected": -0.46561431884765625, + "step": 1161 + }, + { + "epoch": 0.29, + "grad_norm": 3.885800361633301, + "learning_rate": 9.101595242647803e-06, + "logits/chosen": -0.14705190062522888, + "logits/rejected": -0.27779415249824524, + "logps/chosen": -54.59912872314453, + "logps/rejected": -65.5654525756836, + "loss": 0.8155, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.732454538345337, + "rewards/margins": 3.2474148273468018, + "rewards/rejected": -0.5149604082107544, + "step": 1162 + }, + { + "epoch": 0.29, + "grad_norm": 4.096837997436523, + "learning_rate": 9.100096434065697e-06, + "logits/chosen": -0.0629458948969841, + "logits/rejected": -0.18467923998832703, + "logps/chosen": -65.83216857910156, + "logps/rejected": -71.13912200927734, + "loss": 0.8103, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7412490844726562, + "rewards/margins": 2.7226507663726807, + "rewards/rejected": 0.01859867572784424, + "step": 1163 + }, + { + "epoch": 0.29, + "grad_norm": 3.4703330993652344, + "learning_rate": 9.098596499918648e-06, + "logits/chosen": -0.13327521085739136, + "logits/rejected": -0.19531135261058807, + "logps/chosen": -61.551727294921875, + "logps/rejected": -82.93494415283203, + "loss": 0.8781, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8844664096832275, + "rewards/margins": 3.117880344390869, + "rewards/rejected": -0.2334136962890625, + "step": 1164 + }, + { + "epoch": 0.29, + "grad_norm": 4.636369705200195, + "learning_rate": 9.097095440618419e-06, + "logits/chosen": -0.10579735040664673, + "logits/rejected": -0.23425351083278656, + "logps/chosen": -56.309486389160156, + "logps/rejected": -73.5345230102539, + "loss": 0.8805, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7880396842956543, + "rewards/margins": 3.1026451587677, + "rewards/rejected": -0.31460559368133545, + "step": 1165 + }, + { + "epoch": 0.29, + "grad_norm": 2.7246270179748535, + "learning_rate": 9.095593256577082e-06, + "logits/chosen": -0.21789507567882538, + "logits/rejected": -0.3426295816898346, + "logps/chosen": -46.375675201416016, + "logps/rejected": -63.93787384033203, + "loss": 0.7145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.083383083343506, + "rewards/margins": 4.642524242401123, + "rewards/rejected": -1.5591410398483276, + "step": 1166 + }, + { + "epoch": 0.29, + "grad_norm": 4.473315715789795, + "learning_rate": 9.094089948207021e-06, + "logits/chosen": -0.15103337168693542, + "logits/rejected": -0.28110241889953613, + "logps/chosen": -59.768714904785156, + "logps/rejected": -77.57475280761719, + "loss": 0.931, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8366522789001465, + "rewards/margins": 3.3259990215301514, + "rewards/rejected": -0.48934707045555115, + "step": 1167 + }, + { + "epoch": 0.29, + "grad_norm": 4.995269775390625, + "learning_rate": 9.092585515920926e-06, + "logits/chosen": -0.1674109697341919, + "logits/rejected": -0.3034052848815918, + "logps/chosen": -67.99974822998047, + "logps/rejected": -63.12842559814453, + "loss": 0.9645, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.675100803375244, + "rewards/margins": 3.392329216003418, + "rewards/rejected": -0.7172282934188843, + "step": 1168 + }, + { + "epoch": 0.29, + "grad_norm": 4.267284393310547, + "learning_rate": 9.091079960131794e-06, + "logits/chosen": -0.11198808252811432, + "logits/rejected": -0.20194846391677856, + "logps/chosen": -46.413204193115234, + "logps/rejected": -69.23207092285156, + "loss": 0.7484, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8245747089385986, + "rewards/margins": 3.174042224884033, + "rewards/rejected": -0.34946751594543457, + "step": 1169 + }, + { + "epoch": 0.29, + "grad_norm": 4.5799336433410645, + "learning_rate": 9.089573281252938e-06, + "logits/chosen": -0.15030387043952942, + "logits/rejected": -0.1922314316034317, + "logps/chosen": -59.2380485534668, + "logps/rejected": -83.03125762939453, + "loss": 0.8232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.096832752227783, + "rewards/margins": 3.025956153869629, + "rewards/rejected": 0.07087679207324982, + "step": 1170 + }, + { + "epoch": 0.29, + "grad_norm": 2.759432554244995, + "learning_rate": 9.088065479697968e-06, + "logits/chosen": -0.10526525229215622, + "logits/rejected": -0.2338252067565918, + "logps/chosen": -61.016483306884766, + "logps/rejected": -66.1092758178711, + "loss": 0.7173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977074146270752, + "rewards/margins": 3.800987720489502, + "rewards/rejected": -0.8239131569862366, + "step": 1171 + }, + { + "epoch": 0.29, + "grad_norm": 4.480895519256592, + "learning_rate": 9.08655655588081e-06, + "logits/chosen": -0.08830471336841583, + "logits/rejected": -0.22566953301429749, + "logps/chosen": -64.415283203125, + "logps/rejected": -68.24015808105469, + "loss": 0.9623, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9509780406951904, + "rewards/margins": 2.9277782440185547, + "rewards/rejected": 0.023199915885925293, + "step": 1172 + }, + { + "epoch": 0.29, + "grad_norm": 3.1181373596191406, + "learning_rate": 9.085046510215696e-06, + "logits/chosen": -0.175541490316391, + "logits/rejected": -0.3243364691734314, + "logps/chosen": -60.638893127441406, + "logps/rejected": -69.33922576904297, + "loss": 0.8213, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7816030979156494, + "rewards/margins": 3.0141615867614746, + "rewards/rejected": -0.23255857825279236, + "step": 1173 + }, + { + "epoch": 0.29, + "grad_norm": 3.5812268257141113, + "learning_rate": 9.083535343117168e-06, + "logits/chosen": -0.1401069015264511, + "logits/rejected": -0.27282121777534485, + "logps/chosen": -63.11756896972656, + "logps/rejected": -73.10322570800781, + "loss": 0.7215, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0262978076934814, + "rewards/margins": 3.4050259590148926, + "rewards/rejected": -0.37872830033302307, + "step": 1174 + }, + { + "epoch": 0.29, + "grad_norm": 3.9895925521850586, + "learning_rate": 9.082023055000074e-06, + "logits/chosen": -0.17594660818576813, + "logits/rejected": -0.27862513065338135, + "logps/chosen": -59.5128173828125, + "logps/rejected": -79.16771697998047, + "loss": 0.8539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7071712017059326, + "rewards/margins": 3.318558931350708, + "rewards/rejected": -0.6113876104354858, + "step": 1175 + }, + { + "epoch": 0.29, + "grad_norm": 7.425441741943359, + "learning_rate": 9.080509646279564e-06, + "logits/chosen": -0.07128514349460602, + "logits/rejected": -0.20536962151527405, + "logps/chosen": -59.79698181152344, + "logps/rejected": -72.33745574951172, + "loss": 0.9953, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.86099910736084, + "rewards/margins": 3.068676710128784, + "rewards/rejected": -0.20767775177955627, + "step": 1176 + }, + { + "epoch": 0.29, + "grad_norm": 4.878453254699707, + "learning_rate": 9.07899511737111e-06, + "logits/chosen": -0.13994891941547394, + "logits/rejected": -0.2168738842010498, + "logps/chosen": -56.27134704589844, + "logps/rejected": -76.72248077392578, + "loss": 0.9444, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6671993732452393, + "rewards/margins": 2.6338858604431152, + "rewards/rejected": 0.03331398218870163, + "step": 1177 + }, + { + "epoch": 0.29, + "grad_norm": 7.469846725463867, + "learning_rate": 9.077479468690475e-06, + "logits/chosen": -0.008778393268585205, + "logits/rejected": -0.22060486674308777, + "logps/chosen": -71.85088348388672, + "logps/rejected": -58.08681106567383, + "loss": 0.8463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.892752170562744, + "rewards/margins": 3.3383588790893555, + "rewards/rejected": -0.445606529712677, + "step": 1178 + }, + { + "epoch": 0.29, + "grad_norm": 3.986495018005371, + "learning_rate": 9.075962700653742e-06, + "logits/chosen": -0.08921583741903305, + "logits/rejected": -0.1808083951473236, + "logps/chosen": -61.91451644897461, + "logps/rejected": -88.53955841064453, + "loss": 0.8026, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7899961471557617, + "rewards/margins": 3.079211473464966, + "rewards/rejected": -0.2892155647277832, + "step": 1179 + }, + { + "epoch": 0.3, + "grad_norm": 4.161162376403809, + "learning_rate": 9.074444813677297e-06, + "logits/chosen": -0.1111430898308754, + "logits/rejected": -0.24486936628818512, + "logps/chosen": -56.335594177246094, + "logps/rejected": -58.93218231201172, + "loss": 0.8118, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8234050273895264, + "rewards/margins": 3.642244577407837, + "rewards/rejected": -0.8188397884368896, + "step": 1180 + }, + { + "epoch": 0.3, + "grad_norm": 4.475872993469238, + "learning_rate": 9.072925808177829e-06, + "logits/chosen": -0.09701419621706009, + "logits/rejected": -0.21294361352920532, + "logps/chosen": -55.31779861450195, + "logps/rejected": -56.800071716308594, + "loss": 0.8393, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7068393230438232, + "rewards/margins": 2.809267282485962, + "rewards/rejected": -0.10242804139852524, + "step": 1181 + }, + { + "epoch": 0.3, + "grad_norm": 4.714232444763184, + "learning_rate": 9.071405684572341e-06, + "logits/chosen": -0.05736532807350159, + "logits/rejected": -0.17711365222930908, + "logps/chosen": -50.17228698730469, + "logps/rejected": -65.11730194091797, + "loss": 0.868, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7575409412384033, + "rewards/margins": 2.61488676071167, + "rewards/rejected": 0.14265424013137817, + "step": 1182 + }, + { + "epoch": 0.3, + "grad_norm": 6.181714057922363, + "learning_rate": 9.069884443278139e-06, + "logits/chosen": -0.05648782104253769, + "logits/rejected": -0.21115538477897644, + "logps/chosen": -66.64152526855469, + "logps/rejected": -79.61492156982422, + "loss": 1.101, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6677491664886475, + "rewards/margins": 2.7045342922210693, + "rewards/rejected": -0.03678501397371292, + "step": 1183 + }, + { + "epoch": 0.3, + "grad_norm": 3.635890483856201, + "learning_rate": 9.068362084712835e-06, + "logits/chosen": -0.17738522589206696, + "logits/rejected": -0.21976739168167114, + "logps/chosen": -51.7534294128418, + "logps/rejected": -76.51131439208984, + "loss": 0.808, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.975034713745117, + "rewards/margins": 2.895779609680176, + "rewards/rejected": 0.07925501465797424, + "step": 1184 + }, + { + "epoch": 0.3, + "grad_norm": 3.348764181137085, + "learning_rate": 9.06683860929435e-06, + "logits/chosen": -0.06703433394432068, + "logits/rejected": -0.2585701048374176, + "logps/chosen": -54.78712844848633, + "logps/rejected": -63.28095626831055, + "loss": 0.8316, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8906168937683105, + "rewards/margins": 3.81489896774292, + "rewards/rejected": -0.9242821931838989, + "step": 1185 + }, + { + "epoch": 0.3, + "grad_norm": 4.961549282073975, + "learning_rate": 9.065314017440911e-06, + "logits/chosen": -0.13403494656085968, + "logits/rejected": -0.26222798228263855, + "logps/chosen": -59.37238693237305, + "logps/rejected": -73.83883666992188, + "loss": 0.8485, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.574495553970337, + "rewards/margins": 3.515336751937866, + "rewards/rejected": -0.9408416748046875, + "step": 1186 + }, + { + "epoch": 0.3, + "grad_norm": 3.70882511138916, + "learning_rate": 9.063788309571054e-06, + "logits/chosen": -0.17922717332839966, + "logits/rejected": -0.2508716285228729, + "logps/chosen": -55.466583251953125, + "logps/rejected": -69.14213562011719, + "loss": 0.7947, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.635542154312134, + "rewards/margins": 2.8020577430725098, + "rewards/rejected": -0.1665153205394745, + "step": 1187 + }, + { + "epoch": 0.3, + "grad_norm": 3.6061623096466064, + "learning_rate": 9.062261486103614e-06, + "logits/chosen": -0.11048530787229538, + "logits/rejected": -0.2768346071243286, + "logps/chosen": -55.36663055419922, + "logps/rejected": -62.875980377197266, + "loss": 0.7871, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6436328887939453, + "rewards/margins": 3.0474987030029297, + "rewards/rejected": -0.4038658142089844, + "step": 1188 + }, + { + "epoch": 0.3, + "grad_norm": 4.208059787750244, + "learning_rate": 9.060733547457741e-06, + "logits/chosen": -0.1659822314977646, + "logits/rejected": -0.2542930245399475, + "logps/chosen": -39.51221466064453, + "logps/rejected": -55.536773681640625, + "loss": 0.7665, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7223644256591797, + "rewards/margins": 3.0938055515289307, + "rewards/rejected": -0.3714413046836853, + "step": 1189 + }, + { + "epoch": 0.3, + "grad_norm": 3.602757215499878, + "learning_rate": 9.059204494052884e-06, + "logits/chosen": -0.09959200024604797, + "logits/rejected": -0.1756690889596939, + "logps/chosen": -57.50859069824219, + "logps/rejected": -82.98299407958984, + "loss": 0.7983, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.69795823097229, + "rewards/margins": 3.3619792461395264, + "rewards/rejected": -0.6640210747718811, + "step": 1190 + }, + { + "epoch": 0.3, + "grad_norm": 3.6974072456359863, + "learning_rate": 9.057674326308804e-06, + "logits/chosen": -0.11358387768268585, + "logits/rejected": -0.20314088463783264, + "logps/chosen": -54.72452163696289, + "logps/rejected": -64.56045532226562, + "loss": 0.8042, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6962318420410156, + "rewards/margins": 3.4245405197143555, + "rewards/rejected": -0.7283090353012085, + "step": 1191 + }, + { + "epoch": 0.3, + "grad_norm": 4.0817155838012695, + "learning_rate": 9.056143044645564e-06, + "logits/chosen": -0.1692175418138504, + "logits/rejected": -0.2854018211364746, + "logps/chosen": -59.313697814941406, + "logps/rejected": -69.75376892089844, + "loss": 0.8602, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5546481609344482, + "rewards/margins": 2.658862829208374, + "rewards/rejected": -0.10421464592218399, + "step": 1192 + }, + { + "epoch": 0.3, + "grad_norm": 3.7292277812957764, + "learning_rate": 9.054610649483533e-06, + "logits/chosen": -0.12151797115802765, + "logits/rejected": -0.20518282055854797, + "logps/chosen": -55.977134704589844, + "logps/rejected": -68.61203002929688, + "loss": 0.8313, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.007286310195923, + "rewards/margins": 3.0786631107330322, + "rewards/rejected": -0.07137695699930191, + "step": 1193 + }, + { + "epoch": 0.3, + "grad_norm": 4.04309606552124, + "learning_rate": 9.053077141243388e-06, + "logits/chosen": -0.03758931905031204, + "logits/rejected": -0.2119644582271576, + "logps/chosen": -62.953758239746094, + "logps/rejected": -63.568214416503906, + "loss": 0.8357, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.669865369796753, + "rewards/margins": 3.1599183082580566, + "rewards/rejected": -0.4900529086589813, + "step": 1194 + }, + { + "epoch": 0.3, + "grad_norm": 4.274033546447754, + "learning_rate": 9.051542520346107e-06, + "logits/chosen": -0.1311555802822113, + "logits/rejected": -0.25077390670776367, + "logps/chosen": -59.26252365112305, + "logps/rejected": -72.99332427978516, + "loss": 0.8211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5900840759277344, + "rewards/margins": 3.336580753326416, + "rewards/rejected": -0.7464969754219055, + "step": 1195 + }, + { + "epoch": 0.3, + "grad_norm": 4.023900032043457, + "learning_rate": 9.05000678721298e-06, + "logits/chosen": -0.0379440002143383, + "logits/rejected": -0.13132841885089874, + "logps/chosen": -60.774017333984375, + "logps/rejected": -62.7907600402832, + "loss": 0.8676, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9143035411834717, + "rewards/margins": 2.866664171218872, + "rewards/rejected": 0.047639328986406326, + "step": 1196 + }, + { + "epoch": 0.3, + "grad_norm": 4.758045673370361, + "learning_rate": 9.048469942265598e-06, + "logits/chosen": -0.1076357439160347, + "logits/rejected": -0.24384236335754395, + "logps/chosen": -49.89312744140625, + "logps/rejected": -64.28951263427734, + "loss": 0.6898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0782992839813232, + "rewards/margins": 3.9222679138183594, + "rewards/rejected": -0.8439686298370361, + "step": 1197 + }, + { + "epoch": 0.3, + "grad_norm": 3.8247904777526855, + "learning_rate": 9.046931985925857e-06, + "logits/chosen": -0.07353539019823074, + "logits/rejected": -0.13902297616004944, + "logps/chosen": -56.83817672729492, + "logps/rejected": -73.8423843383789, + "loss": 0.802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.762814998626709, + "rewards/margins": 2.8480935096740723, + "rewards/rejected": -0.08527883887290955, + "step": 1198 + }, + { + "epoch": 0.3, + "grad_norm": 3.5120084285736084, + "learning_rate": 9.04539291861596e-06, + "logits/chosen": -0.11617527157068253, + "logits/rejected": -0.20048858225345612, + "logps/chosen": -56.60829162597656, + "logps/rejected": -71.1646499633789, + "loss": 0.7484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6154775619506836, + "rewards/margins": 3.104236364364624, + "rewards/rejected": -0.48875877261161804, + "step": 1199 + }, + { + "epoch": 0.3, + "grad_norm": 4.498508453369141, + "learning_rate": 9.043852740758416e-06, + "logits/chosen": -0.042002785950899124, + "logits/rejected": -0.14757363498210907, + "logps/chosen": -60.45085144042969, + "logps/rejected": -72.86967468261719, + "loss": 0.9444, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.506739854812622, + "rewards/margins": 2.9167721271514893, + "rewards/rejected": -0.4100319743156433, + "step": 1200 + }, + { + "epoch": 0.3, + "grad_norm": 6.040338516235352, + "learning_rate": 9.042311452776034e-06, + "logits/chosen": -0.1734088510274887, + "logits/rejected": -0.30074062943458557, + "logps/chosen": -56.630149841308594, + "logps/rejected": -70.88642120361328, + "loss": 0.9899, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.5425519943237305, + "rewards/margins": 3.189460039138794, + "rewards/rejected": -0.6469079852104187, + "step": 1201 + }, + { + "epoch": 0.3, + "grad_norm": 3.58396577835083, + "learning_rate": 9.040769055091931e-06, + "logits/chosen": -0.09347615391016006, + "logits/rejected": -0.20148871839046478, + "logps/chosen": -56.155853271484375, + "logps/rejected": -63.47142791748047, + "loss": 0.8501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8054442405700684, + "rewards/margins": 2.854973077774048, + "rewards/rejected": -0.049529075622558594, + "step": 1202 + }, + { + "epoch": 0.3, + "grad_norm": 3.1864092350006104, + "learning_rate": 9.03922554812953e-06, + "logits/chosen": -0.16358083486557007, + "logits/rejected": -0.22807323932647705, + "logps/chosen": -53.68172073364258, + "logps/rejected": -78.73829650878906, + "loss": 0.8081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.790660858154297, + "rewards/margins": 3.4414286613464355, + "rewards/rejected": -0.6507678627967834, + "step": 1203 + }, + { + "epoch": 0.3, + "grad_norm": 6.7172932624816895, + "learning_rate": 9.037680932312557e-06, + "logits/chosen": -0.11336082965135574, + "logits/rejected": -0.31740570068359375, + "logps/chosen": -50.49774169921875, + "logps/rejected": -56.17555618286133, + "loss": 0.8236, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6828322410583496, + "rewards/margins": 3.6355881690979004, + "rewards/rejected": -0.9527561664581299, + "step": 1204 + }, + { + "epoch": 0.3, + "grad_norm": 3.5381946563720703, + "learning_rate": 9.036135208065042e-06, + "logits/chosen": -0.0003691297024488449, + "logits/rejected": -0.0996202751994133, + "logps/chosen": -52.59328079223633, + "logps/rejected": -66.48538208007812, + "loss": 0.7788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.76552414894104, + "rewards/margins": 3.092932939529419, + "rewards/rejected": -0.32740867137908936, + "step": 1205 + }, + { + "epoch": 0.3, + "grad_norm": 6.647822856903076, + "learning_rate": 9.034588375811318e-06, + "logits/chosen": -0.09464297443628311, + "logits/rejected": -0.18376049399375916, + "logps/chosen": -59.8587532043457, + "logps/rejected": -83.50569915771484, + "loss": 0.9696, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8650009632110596, + "rewards/margins": 2.927724599838257, + "rewards/rejected": -0.06272360682487488, + "step": 1206 + }, + { + "epoch": 0.3, + "grad_norm": 4.086598873138428, + "learning_rate": 9.033040435976026e-06, + "logits/chosen": -0.17638584971427917, + "logits/rejected": -0.29627978801727295, + "logps/chosen": -53.13214874267578, + "logps/rejected": -64.25433349609375, + "loss": 0.8277, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0326499938964844, + "rewards/margins": 3.4050121307373047, + "rewards/rejected": -0.37236249446868896, + "step": 1207 + }, + { + "epoch": 0.3, + "grad_norm": 3.6589150428771973, + "learning_rate": 9.031491388984108e-06, + "logits/chosen": -0.16237041354179382, + "logits/rejected": -0.26492977142333984, + "logps/chosen": -56.60255432128906, + "logps/rejected": -60.19573974609375, + "loss": 0.9125, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8585469722747803, + "rewards/margins": 2.4371790885925293, + "rewards/rejected": 0.4213681221008301, + "step": 1208 + }, + { + "epoch": 0.3, + "grad_norm": 5.982592582702637, + "learning_rate": 9.029941235260811e-06, + "logits/chosen": -0.19068631529808044, + "logits/rejected": -0.19897033274173737, + "logps/chosen": -44.20703887939453, + "logps/rejected": -72.82608795166016, + "loss": 0.7105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0010948181152344, + "rewards/margins": 3.5281100273132324, + "rewards/rejected": -0.527014970779419, + "step": 1209 + }, + { + "epoch": 0.3, + "grad_norm": 3.290144920349121, + "learning_rate": 9.028389975231687e-06, + "logits/chosen": -0.09807364642620087, + "logits/rejected": -0.30850398540496826, + "logps/chosen": -63.77085876464844, + "logps/rejected": -63.33769607543945, + "loss": 0.7651, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9177613258361816, + "rewards/margins": 3.8722152709960938, + "rewards/rejected": -0.9544536471366882, + "step": 1210 + }, + { + "epoch": 0.3, + "grad_norm": 4.5531535148620605, + "learning_rate": 9.026837609322587e-06, + "logits/chosen": -0.05956225097179413, + "logits/rejected": -0.20414932072162628, + "logps/chosen": -62.12604522705078, + "logps/rejected": -69.74327850341797, + "loss": 0.9, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8225622177124023, + "rewards/margins": 2.8673267364501953, + "rewards/rejected": -0.044764190912246704, + "step": 1211 + }, + { + "epoch": 0.3, + "grad_norm": 2.9714977741241455, + "learning_rate": 9.025284137959674e-06, + "logits/chosen": -0.06921201944351196, + "logits/rejected": -0.2726805806159973, + "logps/chosen": -58.209415435791016, + "logps/rejected": -56.654930114746094, + "loss": 0.7529, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8215394020080566, + "rewards/margins": 3.853119373321533, + "rewards/rejected": -1.0315797328948975, + "step": 1212 + }, + { + "epoch": 0.3, + "grad_norm": 3.4410336017608643, + "learning_rate": 9.023729561569404e-06, + "logits/chosen": -0.07933255285024643, + "logits/rejected": -0.24071837961673737, + "logps/chosen": -60.47179412841797, + "logps/rejected": -73.0653305053711, + "loss": 0.7372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3074889183044434, + "rewards/margins": 3.5018885135650635, + "rewards/rejected": -0.19439956545829773, + "step": 1213 + }, + { + "epoch": 0.3, + "grad_norm": 3.3138370513916016, + "learning_rate": 9.022173880578545e-06, + "logits/chosen": -0.15509772300720215, + "logits/rejected": -0.16498279571533203, + "logps/chosen": -55.59956741333008, + "logps/rejected": -83.15139770507812, + "loss": 0.8769, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2295501232147217, + "rewards/margins": 3.5029025077819824, + "rewards/rejected": -0.27335208654403687, + "step": 1214 + }, + { + "epoch": 0.3, + "grad_norm": 3.4816083908081055, + "learning_rate": 9.020617095414163e-06, + "logits/chosen": -0.08948443830013275, + "logits/rejected": -0.19532109797000885, + "logps/chosen": -48.47815704345703, + "logps/rejected": -66.21869659423828, + "loss": 0.7742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9316153526306152, + "rewards/margins": 3.2320141792297363, + "rewards/rejected": -0.3003988564014435, + "step": 1215 + }, + { + "epoch": 0.3, + "grad_norm": 6.2424516677856445, + "learning_rate": 9.019059206503632e-06, + "logits/chosen": -0.19160579144954681, + "logits/rejected": -0.32441672682762146, + "logps/chosen": -68.1823959350586, + "logps/rejected": -66.04113006591797, + "loss": 1.0227, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7962734699249268, + "rewards/margins": 2.3156323432922363, + "rewards/rejected": 0.4806411564350128, + "step": 1216 + }, + { + "epoch": 0.3, + "grad_norm": 3.325937032699585, + "learning_rate": 9.017500214274622e-06, + "logits/chosen": -0.17695602774620056, + "logits/rejected": -0.2678857445716858, + "logps/chosen": -43.663230895996094, + "logps/rejected": -67.81730651855469, + "loss": 0.7424, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8504061698913574, + "rewards/margins": 3.2730791568756104, + "rewards/rejected": -0.42267316579818726, + "step": 1217 + }, + { + "epoch": 0.3, + "grad_norm": 3.6259796619415283, + "learning_rate": 9.015940119155114e-06, + "logits/chosen": -0.15912264585494995, + "logits/rejected": -0.25264137983322144, + "logps/chosen": -60.07027816772461, + "logps/rejected": -71.8201675415039, + "loss": 0.8502, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.801466941833496, + "rewards/margins": 2.528719425201416, + "rewards/rejected": 0.27274760603904724, + "step": 1218 + }, + { + "epoch": 0.3, + "grad_norm": 4.445476055145264, + "learning_rate": 9.014378921573384e-06, + "logits/chosen": -0.11267158389091492, + "logits/rejected": -0.26584959030151367, + "logps/chosen": -65.79833221435547, + "logps/rejected": -57.77210235595703, + "loss": 0.9151, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9334635734558105, + "rewards/margins": 3.372708320617676, + "rewards/rejected": -0.4392446279525757, + "step": 1219 + }, + { + "epoch": 0.31, + "grad_norm": 3.275642156600952, + "learning_rate": 9.012816621958018e-06, + "logits/chosen": -0.0660114586353302, + "logits/rejected": -0.19953802227973938, + "logps/chosen": -61.39133834838867, + "logps/rejected": -67.39311218261719, + "loss": 0.7, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8243439197540283, + "rewards/margins": 3.493124008178711, + "rewards/rejected": -0.6687796115875244, + "step": 1220 + }, + { + "epoch": 0.31, + "grad_norm": 3.0289506912231445, + "learning_rate": 9.011253220737902e-06, + "logits/chosen": -0.19655264914035797, + "logits/rejected": -0.2941175103187561, + "logps/chosen": -55.40144348144531, + "logps/rejected": -69.77409362792969, + "loss": 0.7278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.200061559677124, + "rewards/margins": 3.601807117462158, + "rewards/rejected": -0.40174537897109985, + "step": 1221 + }, + { + "epoch": 0.31, + "grad_norm": 5.008816242218018, + "learning_rate": 9.009688718342219e-06, + "logits/chosen": -0.09572344273328781, + "logits/rejected": -0.25230494141578674, + "logps/chosen": -61.05814743041992, + "logps/rejected": -66.71083068847656, + "loss": 0.982, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.857469320297241, + "rewards/margins": 2.959237813949585, + "rewards/rejected": -0.10176832973957062, + "step": 1222 + }, + { + "epoch": 0.31, + "grad_norm": 3.8397397994995117, + "learning_rate": 9.008123115200457e-06, + "logits/chosen": -0.09960907697677612, + "logits/rejected": -0.2157888561487198, + "logps/chosen": -57.24610900878906, + "logps/rejected": -69.83055877685547, + "loss": 0.8125, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8197946548461914, + "rewards/margins": 3.8525800704956055, + "rewards/rejected": -1.0327856540679932, + "step": 1223 + }, + { + "epoch": 0.31, + "grad_norm": 5.156627655029297, + "learning_rate": 9.006556411742415e-06, + "logits/chosen": -0.19835129380226135, + "logits/rejected": -0.3072640597820282, + "logps/chosen": -51.50950241088867, + "logps/rejected": -63.81269073486328, + "loss": 0.837, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7418808937072754, + "rewards/margins": 3.443638801574707, + "rewards/rejected": -0.7017581462860107, + "step": 1224 + }, + { + "epoch": 0.31, + "grad_norm": 5.855801105499268, + "learning_rate": 9.004988608398184e-06, + "logits/chosen": -0.01972711645066738, + "logits/rejected": -0.16781650483608246, + "logps/chosen": -66.18022918701172, + "logps/rejected": -71.24441528320312, + "loss": 0.9098, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.801914691925049, + "rewards/margins": 2.8220906257629395, + "rewards/rejected": -0.020176265388727188, + "step": 1225 + }, + { + "epoch": 0.31, + "grad_norm": 4.128359794616699, + "learning_rate": 9.003419705598158e-06, + "logits/chosen": -0.14133679866790771, + "logits/rejected": -0.28437480330467224, + "logps/chosen": -70.99458312988281, + "logps/rejected": -72.13369750976562, + "loss": 1.0052, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.550062417984009, + "rewards/margins": 3.127633571624756, + "rewards/rejected": -0.577571451663971, + "step": 1226 + }, + { + "epoch": 0.31, + "grad_norm": 3.721614360809326, + "learning_rate": 9.001849703773036e-06, + "logits/chosen": -0.13323983550071716, + "logits/rejected": -0.25616905093193054, + "logps/chosen": -60.195186614990234, + "logps/rejected": -70.4715576171875, + "loss": 0.841, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.598245859146118, + "rewards/margins": 3.5171396732330322, + "rewards/rejected": -0.9188938736915588, + "step": 1227 + }, + { + "epoch": 0.31, + "grad_norm": 4.797216892242432, + "learning_rate": 9.000278603353817e-06, + "logits/chosen": -0.16406746208667755, + "logits/rejected": -0.2837717533111572, + "logps/chosen": -54.343658447265625, + "logps/rejected": -73.09915924072266, + "loss": 0.792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.001594066619873, + "rewards/margins": 3.430281400680542, + "rewards/rejected": -0.4286876618862152, + "step": 1228 + }, + { + "epoch": 0.31, + "grad_norm": 5.024661540985107, + "learning_rate": 8.998706404771803e-06, + "logits/chosen": -0.1055549681186676, + "logits/rejected": -0.2500452995300293, + "logps/chosen": -54.979400634765625, + "logps/rejected": -79.47856903076172, + "loss": 0.8697, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4142472743988037, + "rewards/margins": 3.2243919372558594, + "rewards/rejected": -0.8101444840431213, + "step": 1229 + }, + { + "epoch": 0.31, + "grad_norm": 2.102849006652832, + "learning_rate": 8.997133108458594e-06, + "logits/chosen": -0.18284666538238525, + "logits/rejected": -0.3075690269470215, + "logps/chosen": -48.33953094482422, + "logps/rejected": -66.96528625488281, + "loss": 0.6675, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.951857805252075, + "rewards/margins": 3.7698729038238525, + "rewards/rejected": -0.8180152773857117, + "step": 1230 + }, + { + "epoch": 0.31, + "grad_norm": 3.599179267883301, + "learning_rate": 8.995558714846096e-06, + "logits/chosen": -0.06679123640060425, + "logits/rejected": -0.15310056507587433, + "logps/chosen": -73.53353118896484, + "logps/rejected": -61.84735870361328, + "loss": 0.8423, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0383353233337402, + "rewards/margins": 3.1629323959350586, + "rewards/rejected": -0.12459716200828552, + "step": 1231 + }, + { + "epoch": 0.31, + "grad_norm": 5.919393062591553, + "learning_rate": 8.993983224366514e-06, + "logits/chosen": -0.07748628407716751, + "logits/rejected": -0.2081703245639801, + "logps/chosen": -64.5341796875, + "logps/rejected": -73.07779693603516, + "loss": 0.9328, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.564016819000244, + "rewards/margins": 2.830990791320801, + "rewards/rejected": -0.2669735848903656, + "step": 1232 + }, + { + "epoch": 0.31, + "grad_norm": 4.534687042236328, + "learning_rate": 8.992406637452353e-06, + "logits/chosen": -0.1948091983795166, + "logits/rejected": -0.20566269755363464, + "logps/chosen": -52.81336212158203, + "logps/rejected": -92.19775390625, + "loss": 0.9445, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.947086811065674, + "rewards/margins": 2.2220849990844727, + "rewards/rejected": 0.7250013947486877, + "step": 1233 + }, + { + "epoch": 0.31, + "grad_norm": 5.029787063598633, + "learning_rate": 8.990828954536421e-06, + "logits/chosen": -0.06793034821748734, + "logits/rejected": -0.25199270248413086, + "logps/chosen": -63.522117614746094, + "logps/rejected": -70.29898834228516, + "loss": 0.8851, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6374125480651855, + "rewards/margins": 2.8513400554656982, + "rewards/rejected": -0.2139275074005127, + "step": 1234 + }, + { + "epoch": 0.31, + "grad_norm": 2.716824531555176, + "learning_rate": 8.989250176051828e-06, + "logits/chosen": -0.20590348541736603, + "logits/rejected": -0.2996622920036316, + "logps/chosen": -50.05463790893555, + "logps/rejected": -77.6484603881836, + "loss": 0.7093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.157089948654175, + "rewards/margins": 4.297427654266357, + "rewards/rejected": -1.1403378248214722, + "step": 1235 + }, + { + "epoch": 0.31, + "grad_norm": 5.560074806213379, + "learning_rate": 8.987670302431977e-06, + "logits/chosen": -0.11338566988706589, + "logits/rejected": -0.2119026482105255, + "logps/chosen": -60.406009674072266, + "logps/rejected": -71.67196655273438, + "loss": 0.934, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.815941095352173, + "rewards/margins": 2.7141854763031006, + "rewards/rejected": 0.10175527632236481, + "step": 1236 + }, + { + "epoch": 0.31, + "grad_norm": 3.807170867919922, + "learning_rate": 8.986089334110581e-06, + "logits/chosen": -0.1123320683836937, + "logits/rejected": -0.22031459212303162, + "logps/chosen": -63.876346588134766, + "logps/rejected": -80.64683532714844, + "loss": 0.8597, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.844055414199829, + "rewards/margins": 3.515324354171753, + "rewards/rejected": -0.6712692379951477, + "step": 1237 + }, + { + "epoch": 0.31, + "grad_norm": 3.794297933578491, + "learning_rate": 8.98450727152165e-06, + "logits/chosen": -0.11316896229982376, + "logits/rejected": -0.22165526449680328, + "logps/chosen": -50.65116500854492, + "logps/rejected": -65.096923828125, + "loss": 0.7635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6094350814819336, + "rewards/margins": 3.1449649333953857, + "rewards/rejected": -0.5355297327041626, + "step": 1238 + }, + { + "epoch": 0.31, + "grad_norm": 13.05922794342041, + "learning_rate": 8.982924115099496e-06, + "logits/chosen": -0.14574050903320312, + "logits/rejected": -0.19305898249149323, + "logps/chosen": -43.887168884277344, + "logps/rejected": -70.17644500732422, + "loss": 0.8203, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7885689735412598, + "rewards/margins": 2.923229217529297, + "rewards/rejected": -0.1346602737903595, + "step": 1239 + }, + { + "epoch": 0.31, + "grad_norm": 10.083846092224121, + "learning_rate": 8.981339865278726e-06, + "logits/chosen": -0.2233521044254303, + "logits/rejected": -0.22947540879249573, + "logps/chosen": -51.889060974121094, + "logps/rejected": -81.89906311035156, + "loss": 0.9502, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.8025307655334473, + "rewards/margins": 2.5323610305786133, + "rewards/rejected": 0.2701697051525116, + "step": 1240 + }, + { + "epoch": 0.31, + "grad_norm": 2.141249179840088, + "learning_rate": 8.979754522494253e-06, + "logits/chosen": -0.08586004376411438, + "logits/rejected": -0.16556966304779053, + "logps/chosen": -52.74515151977539, + "logps/rejected": -76.35515594482422, + "loss": 0.7155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2163660526275635, + "rewards/margins": 3.9370408058166504, + "rewards/rejected": -0.7206745743751526, + "step": 1241 + }, + { + "epoch": 0.31, + "grad_norm": 3.431778907775879, + "learning_rate": 8.978168087181287e-06, + "logits/chosen": -0.10008332133293152, + "logits/rejected": -0.20905296504497528, + "logps/chosen": -58.63936233520508, + "logps/rejected": -68.98452758789062, + "loss": 0.7747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.690626382827759, + "rewards/margins": 3.048048734664917, + "rewards/rejected": -0.3574223816394806, + "step": 1242 + }, + { + "epoch": 0.31, + "grad_norm": 4.08436918258667, + "learning_rate": 8.976580559775338e-06, + "logits/chosen": -0.13265353441238403, + "logits/rejected": -0.223264679312706, + "logps/chosen": -66.11372375488281, + "logps/rejected": -70.1240234375, + "loss": 0.7429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103937864303589, + "rewards/margins": 3.6547844409942627, + "rewards/rejected": -0.5508466362953186, + "step": 1243 + }, + { + "epoch": 0.31, + "grad_norm": 5.629563331604004, + "learning_rate": 8.97499194071222e-06, + "logits/chosen": -0.16077236831188202, + "logits/rejected": -0.2727395296096802, + "logps/chosen": -51.65385437011719, + "logps/rejected": -68.942626953125, + "loss": 0.8068, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.773801326751709, + "rewards/margins": 3.4331247806549072, + "rewards/rejected": -0.6593236923217773, + "step": 1244 + }, + { + "epoch": 0.31, + "grad_norm": 3.3507585525512695, + "learning_rate": 8.973402230428039e-06, + "logits/chosen": -0.06362062692642212, + "logits/rejected": -0.19717776775360107, + "logps/chosen": -57.07680130004883, + "logps/rejected": -62.828025817871094, + "loss": 0.7205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8596041202545166, + "rewards/margins": 3.5861544609069824, + "rewards/rejected": -0.7265504598617554, + "step": 1245 + }, + { + "epoch": 0.31, + "grad_norm": 4.3833699226379395, + "learning_rate": 8.971811429359208e-06, + "logits/chosen": -0.1170688048005104, + "logits/rejected": -0.28156936168670654, + "logps/chosen": -67.10787200927734, + "logps/rejected": -82.14497375488281, + "loss": 0.7587, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.673722505569458, + "rewards/margins": 2.9715607166290283, + "rewards/rejected": -0.29783833026885986, + "step": 1246 + }, + { + "epoch": 0.31, + "grad_norm": 6.987090587615967, + "learning_rate": 8.970219537942434e-06, + "logits/chosen": -0.11019700765609741, + "logits/rejected": -0.16719499230384827, + "logps/chosen": -66.0711669921875, + "logps/rejected": -69.38623809814453, + "loss": 0.9126, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.414565086364746, + "rewards/margins": 2.847881317138672, + "rewards/rejected": -0.43331658840179443, + "step": 1247 + }, + { + "epoch": 0.31, + "grad_norm": 4.41646146774292, + "learning_rate": 8.968626556614723e-06, + "logits/chosen": -0.06553477793931961, + "logits/rejected": -0.18474557995796204, + "logps/chosen": -57.33055114746094, + "logps/rejected": -69.9216079711914, + "loss": 1.0648, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.815361738204956, + "rewards/margins": 2.5266637802124023, + "rewards/rejected": 0.2886979877948761, + "step": 1248 + }, + { + "epoch": 0.31, + "grad_norm": 5.385013103485107, + "learning_rate": 8.967032485813389e-06, + "logits/chosen": -0.16402427852153778, + "logits/rejected": -0.21641045808792114, + "logps/chosen": -49.46950912475586, + "logps/rejected": -70.44773864746094, + "loss": 0.8901, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6786305904388428, + "rewards/margins": 2.5327205657958984, + "rewards/rejected": 0.1459105908870697, + "step": 1249 + }, + { + "epoch": 0.31, + "grad_norm": 2.922623634338379, + "learning_rate": 8.965437325976036e-06, + "logits/chosen": -0.10505993664264679, + "logits/rejected": -0.1706637144088745, + "logps/chosen": -64.30036926269531, + "logps/rejected": -70.58419036865234, + "loss": 0.828, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.752138614654541, + "rewards/margins": 3.287487506866455, + "rewards/rejected": -0.535348653793335, + "step": 1250 + }, + { + "epoch": 0.31, + "grad_norm": 2.8642029762268066, + "learning_rate": 8.96384107754057e-06, + "logits/chosen": -0.10768763720989227, + "logits/rejected": -0.2890819013118744, + "logps/chosen": -66.7881088256836, + "logps/rejected": -66.41905212402344, + "loss": 0.7613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.82873272895813, + "rewards/margins": 3.658099889755249, + "rewards/rejected": -0.8293669819831848, + "step": 1251 + }, + { + "epoch": 0.31, + "grad_norm": 4.7794108390808105, + "learning_rate": 8.962243740945194e-06, + "logits/chosen": -0.06819354742765427, + "logits/rejected": -0.20215275883674622, + "logps/chosen": -60.77557373046875, + "logps/rejected": -66.90453338623047, + "loss": 0.8945, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9275803565979004, + "rewards/margins": 3.5891714096069336, + "rewards/rejected": -0.6615911722183228, + "step": 1252 + }, + { + "epoch": 0.31, + "grad_norm": 4.116332530975342, + "learning_rate": 8.960645316628414e-06, + "logits/chosen": -0.22517985105514526, + "logits/rejected": -0.30925890803337097, + "logps/chosen": -56.671661376953125, + "logps/rejected": -58.63505172729492, + "loss": 0.9328, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9341299533843994, + "rewards/margins": 2.9115042686462402, + "rewards/rejected": 0.022625546902418137, + "step": 1253 + }, + { + "epoch": 0.31, + "grad_norm": 4.580618858337402, + "learning_rate": 8.959045805029029e-06, + "logits/chosen": -0.1308826059103012, + "logits/rejected": -0.22022297978401184, + "logps/chosen": -56.552589416503906, + "logps/rejected": -62.244163513183594, + "loss": 0.9689, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.628838539123535, + "rewards/margins": 2.811687707901001, + "rewards/rejected": -0.18284925818443298, + "step": 1254 + }, + { + "epoch": 0.31, + "grad_norm": 5.672205448150635, + "learning_rate": 8.957445206586142e-06, + "logits/chosen": -0.06358082592487335, + "logits/rejected": -0.19257882237434387, + "logps/chosen": -59.895469665527344, + "logps/rejected": -73.71954345703125, + "loss": 1.005, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.780986785888672, + "rewards/margins": 2.7498786449432373, + "rewards/rejected": 0.03110814094543457, + "step": 1255 + }, + { + "epoch": 0.31, + "grad_norm": 5.164392471313477, + "learning_rate": 8.955843521739152e-06, + "logits/chosen": -0.1064804270863533, + "logits/rejected": -0.2254040539264679, + "logps/chosen": -58.322845458984375, + "logps/rejected": -74.4418716430664, + "loss": 0.87, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4336907863616943, + "rewards/margins": 3.5019774436950684, + "rewards/rejected": -1.068286418914795, + "step": 1256 + }, + { + "epoch": 0.31, + "grad_norm": 5.7404632568359375, + "learning_rate": 8.954240750927751e-06, + "logits/chosen": -0.1482044905424118, + "logits/rejected": -0.14273473620414734, + "logps/chosen": -82.7904052734375, + "logps/rejected": -87.18154907226562, + "loss": 0.8462, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8230745792388916, + "rewards/margins": 3.5226407051086426, + "rewards/rejected": -0.699566125869751, + "step": 1257 + }, + { + "epoch": 0.31, + "grad_norm": 6.546159744262695, + "learning_rate": 8.952636894591941e-06, + "logits/chosen": -0.09387907385826111, + "logits/rejected": -0.11816126108169556, + "logps/chosen": -63.527103424072266, + "logps/rejected": -86.1766128540039, + "loss": 1.0879, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.022381067276001, + "rewards/margins": 2.4421865940093994, + "rewards/rejected": 0.5801939964294434, + "step": 1258 + }, + { + "epoch": 0.31, + "grad_norm": 3.89485764503479, + "learning_rate": 8.951031953172014e-06, + "logits/chosen": -0.1142887994647026, + "logits/rejected": -0.19908441603183746, + "logps/chosen": -54.57371139526367, + "logps/rejected": -75.0950698852539, + "loss": 0.8139, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7416341304779053, + "rewards/margins": 3.4685842990875244, + "rewards/rejected": -0.7269500494003296, + "step": 1259 + }, + { + "epoch": 0.32, + "grad_norm": 2.720613956451416, + "learning_rate": 8.949425927108557e-06, + "logits/chosen": -0.09180281311273575, + "logits/rejected": -0.2999327778816223, + "logps/chosen": -58.1753044128418, + "logps/rejected": -63.985904693603516, + "loss": 0.6511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965843915939331, + "rewards/margins": 4.908709526062012, + "rewards/rejected": -1.9428658485412598, + "step": 1260 + }, + { + "epoch": 0.32, + "grad_norm": 6.0200605392456055, + "learning_rate": 8.94781881684246e-06, + "logits/chosen": -0.19257071614265442, + "logits/rejected": -0.24964122474193573, + "logps/chosen": -53.27236557006836, + "logps/rejected": -66.32084655761719, + "loss": 0.7934, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8229422569274902, + "rewards/margins": 3.218804359436035, + "rewards/rejected": -0.39586231112480164, + "step": 1261 + }, + { + "epoch": 0.32, + "grad_norm": 3.5740511417388916, + "learning_rate": 8.946210622814913e-06, + "logits/chosen": -0.09020199626684189, + "logits/rejected": -0.24833087623119354, + "logps/chosen": -67.01803588867188, + "logps/rejected": -64.94647979736328, + "loss": 0.8432, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8365137577056885, + "rewards/margins": 2.8387081623077393, + "rewards/rejected": -0.0021944046020507812, + "step": 1262 + }, + { + "epoch": 0.32, + "grad_norm": 3.2690091133117676, + "learning_rate": 8.944601345467397e-06, + "logits/chosen": -0.018750274553894997, + "logits/rejected": -0.20198258757591248, + "logps/chosen": -59.26449203491211, + "logps/rejected": -50.79292297363281, + "loss": 0.8489, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6735734939575195, + "rewards/margins": 3.351743221282959, + "rewards/rejected": -0.6781703233718872, + "step": 1263 + }, + { + "epoch": 0.32, + "grad_norm": 5.994001388549805, + "learning_rate": 8.942990985241694e-06, + "logits/chosen": -0.1516706496477127, + "logits/rejected": -0.1950926035642624, + "logps/chosen": -66.91153717041016, + "logps/rejected": -64.11122131347656, + "loss": 1.1503, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.716109275817871, + "rewards/margins": 2.262643814086914, + "rewards/rejected": 0.4534655809402466, + "step": 1264 + }, + { + "epoch": 0.32, + "grad_norm": 4.64823579788208, + "learning_rate": 8.94137954257988e-06, + "logits/chosen": -0.12384361028671265, + "logits/rejected": -0.22377675771713257, + "logps/chosen": -67.71943664550781, + "logps/rejected": -74.94404602050781, + "loss": 0.9239, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8102614879608154, + "rewards/margins": 2.5241129398345947, + "rewards/rejected": 0.2861485779285431, + "step": 1265 + }, + { + "epoch": 0.32, + "grad_norm": 3.385767936706543, + "learning_rate": 8.939767017924335e-06, + "logits/chosen": -0.06092129275202751, + "logits/rejected": -0.20347151160240173, + "logps/chosen": -65.14102935791016, + "logps/rejected": -76.18505859375, + "loss": 0.758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1924290657043457, + "rewards/margins": 3.848971366882324, + "rewards/rejected": -0.6565424203872681, + "step": 1266 + }, + { + "epoch": 0.32, + "grad_norm": 3.9272587299346924, + "learning_rate": 8.938153411717732e-06, + "logits/chosen": -0.18142536282539368, + "logits/rejected": -0.19192568957805634, + "logps/chosen": -53.25428009033203, + "logps/rejected": -72.83096313476562, + "loss": 0.7982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.88181734085083, + "rewards/margins": 2.9099044799804688, + "rewards/rejected": -0.02808704972267151, + "step": 1267 + }, + { + "epoch": 0.32, + "grad_norm": 3.6626322269439697, + "learning_rate": 8.936538724403036e-06, + "logits/chosen": -0.1739310920238495, + "logits/rejected": -0.3418735861778259, + "logps/chosen": -62.822261810302734, + "logps/rejected": -75.676513671875, + "loss": 0.7482, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.753450393676758, + "rewards/margins": 3.856966257095337, + "rewards/rejected": -1.103515863418579, + "step": 1268 + }, + { + "epoch": 0.32, + "grad_norm": 3.639564275741577, + "learning_rate": 8.934922956423517e-06, + "logits/chosen": -0.05865104869008064, + "logits/rejected": -0.16444377601146698, + "logps/chosen": -54.99855041503906, + "logps/rejected": -65.71974182128906, + "loss": 0.8283, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9655508995056152, + "rewards/margins": 3.2181010246276855, + "rewards/rejected": -0.2525504529476166, + "step": 1269 + }, + { + "epoch": 0.32, + "grad_norm": 2.8727049827575684, + "learning_rate": 8.93330610822274e-06, + "logits/chosen": -0.21048955619335175, + "logits/rejected": -0.3109707832336426, + "logps/chosen": -53.4548454284668, + "logps/rejected": -87.524658203125, + "loss": 0.6327, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75925612449646, + "rewards/margins": 4.07216215133667, + "rewards/rejected": -1.3129065036773682, + "step": 1270 + }, + { + "epoch": 0.32, + "grad_norm": 5.056536674499512, + "learning_rate": 8.931688180244559e-06, + "logits/chosen": -0.09743814170360565, + "logits/rejected": -0.2751375436782837, + "logps/chosen": -62.5680046081543, + "logps/rejected": -58.50858688354492, + "loss": 0.8392, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6405489444732666, + "rewards/margins": 3.132946729660034, + "rewards/rejected": -0.4923980236053467, + "step": 1271 + }, + { + "epoch": 0.32, + "grad_norm": 3.9615252017974854, + "learning_rate": 8.930069172933133e-06, + "logits/chosen": -0.12865757942199707, + "logits/rejected": -0.16824547946453094, + "logps/chosen": -49.84679412841797, + "logps/rejected": -68.47004699707031, + "loss": 0.8335, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.627741813659668, + "rewards/margins": 2.6819844245910645, + "rewards/rejected": -0.05424262583255768, + "step": 1272 + }, + { + "epoch": 0.32, + "grad_norm": 3.765122175216675, + "learning_rate": 8.928449086732918e-06, + "logits/chosen": -0.065720334649086, + "logits/rejected": -0.20653074979782104, + "logps/chosen": -59.21674346923828, + "logps/rejected": -65.60931396484375, + "loss": 0.8233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8854806423187256, + "rewards/margins": 3.762812614440918, + "rewards/rejected": -0.877332329750061, + "step": 1273 + }, + { + "epoch": 0.32, + "grad_norm": 5.854793071746826, + "learning_rate": 8.926827922088658e-06, + "logits/chosen": -0.10608793795108795, + "logits/rejected": -0.3069337010383606, + "logps/chosen": -62.751712799072266, + "logps/rejected": -51.437095642089844, + "loss": 0.905, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.838256597518921, + "rewards/margins": 2.9495959281921387, + "rewards/rejected": -0.11133928596973419, + "step": 1274 + }, + { + "epoch": 0.32, + "grad_norm": 3.8678812980651855, + "learning_rate": 8.9252056794454e-06, + "logits/chosen": -0.12456309795379639, + "logits/rejected": -0.16509564220905304, + "logps/chosen": -52.345703125, + "logps/rejected": -77.53907012939453, + "loss": 0.927, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.926685333251953, + "rewards/margins": 2.6402878761291504, + "rewards/rejected": 0.28639769554138184, + "step": 1275 + }, + { + "epoch": 0.32, + "grad_norm": 3.829695224761963, + "learning_rate": 8.923582359248482e-06, + "logits/chosen": -0.15913984179496765, + "logits/rejected": -0.2182452231645584, + "logps/chosen": -55.28016662597656, + "logps/rejected": -69.60665130615234, + "loss": 0.8354, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.768095016479492, + "rewards/margins": 3.270350456237793, + "rewards/rejected": -0.5022552013397217, + "step": 1276 + }, + { + "epoch": 0.32, + "grad_norm": 5.401357650756836, + "learning_rate": 8.921957961943542e-06, + "logits/chosen": -0.07407201826572418, + "logits/rejected": -0.15396994352340698, + "logps/chosen": -54.78228759765625, + "logps/rejected": -79.71731567382812, + "loss": 0.8354, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6292824745178223, + "rewards/margins": 3.0162601470947266, + "rewards/rejected": -0.3869777321815491, + "step": 1277 + }, + { + "epoch": 0.32, + "grad_norm": 4.546365261077881, + "learning_rate": 8.920332487976514e-06, + "logits/chosen": -0.07081187516450882, + "logits/rejected": -0.22830921411514282, + "logps/chosen": -63.859336853027344, + "logps/rejected": -63.33550262451172, + "loss": 0.7995, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8523221015930176, + "rewards/margins": 3.710198163986206, + "rewards/rejected": -0.8578763008117676, + "step": 1278 + }, + { + "epoch": 0.32, + "grad_norm": 5.274129867553711, + "learning_rate": 8.918705937793624e-06, + "logits/chosen": -0.05791977047920227, + "logits/rejected": -0.19630137085914612, + "logps/chosen": -59.65391159057617, + "logps/rejected": -69.10343933105469, + "loss": 0.7829, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8237462043762207, + "rewards/margins": 3.368626356124878, + "rewards/rejected": -0.5448801517486572, + "step": 1279 + }, + { + "epoch": 0.32, + "grad_norm": 4.512240886688232, + "learning_rate": 8.917078311841395e-06, + "logits/chosen": -0.17727938294410706, + "logits/rejected": -0.2669133245944977, + "logps/chosen": -60.346031188964844, + "logps/rejected": -72.77742004394531, + "loss": 0.8109, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.877870559692383, + "rewards/margins": 3.5811214447021484, + "rewards/rejected": -0.7032513618469238, + "step": 1280 + }, + { + "epoch": 0.32, + "grad_norm": 3.7041420936584473, + "learning_rate": 8.915449610566643e-06, + "logits/chosen": -0.18086941540241241, + "logits/rejected": -0.36118656396865845, + "logps/chosen": -54.14004135131836, + "logps/rejected": -66.85968780517578, + "loss": 0.7693, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.98648738861084, + "rewards/margins": 3.856001615524292, + "rewards/rejected": -0.8695142865180969, + "step": 1281 + }, + { + "epoch": 0.32, + "grad_norm": 3.908658981323242, + "learning_rate": 8.913819834416486e-06, + "logits/chosen": -0.07318058609962463, + "logits/rejected": -0.12484575808048248, + "logps/chosen": -60.36768341064453, + "logps/rejected": -83.68614196777344, + "loss": 0.79, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.016801357269287, + "rewards/margins": 3.3678083419799805, + "rewards/rejected": -0.3510064482688904, + "step": 1282 + }, + { + "epoch": 0.32, + "grad_norm": 6.1372880935668945, + "learning_rate": 8.912188983838331e-06, + "logits/chosen": -0.2116217315196991, + "logits/rejected": -0.2195865511894226, + "logps/chosen": -54.58900833129883, + "logps/rejected": -79.95579528808594, + "loss": 0.8608, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.637944459915161, + "rewards/margins": 2.890576124191284, + "rewards/rejected": -0.25263139605522156, + "step": 1283 + }, + { + "epoch": 0.32, + "grad_norm": 5.219161510467529, + "learning_rate": 8.91055705927988e-06, + "logits/chosen": -0.08228612691164017, + "logits/rejected": -0.2140231430530548, + "logps/chosen": -58.42243957519531, + "logps/rejected": -82.14086151123047, + "loss": 0.8303, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.762620687484741, + "rewards/margins": 3.1274099349975586, + "rewards/rejected": -0.3647889494895935, + "step": 1284 + }, + { + "epoch": 0.32, + "grad_norm": 3.5593576431274414, + "learning_rate": 8.908924061189134e-06, + "logits/chosen": -0.20267538726329803, + "logits/rejected": -0.32153210043907166, + "logps/chosen": -51.028079986572266, + "logps/rejected": -63.785491943359375, + "loss": 0.9065, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.689932346343994, + "rewards/margins": 3.31889271736145, + "rewards/rejected": -0.628960371017456, + "step": 1285 + }, + { + "epoch": 0.32, + "grad_norm": 4.882365703582764, + "learning_rate": 8.907289990014387e-06, + "logits/chosen": -0.17158454656600952, + "logits/rejected": -0.3039019703865051, + "logps/chosen": -53.36243438720703, + "logps/rejected": -78.90116882324219, + "loss": 0.8124, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.779329776763916, + "rewards/margins": 3.3981451988220215, + "rewards/rejected": -0.6188151836395264, + "step": 1286 + }, + { + "epoch": 0.32, + "grad_norm": 3.3667895793914795, + "learning_rate": 8.905654846204223e-06, + "logits/chosen": -0.16240395605564117, + "logits/rejected": -0.31575676798820496, + "logps/chosen": -52.10782241821289, + "logps/rejected": -55.77503967285156, + "loss": 0.7764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.656715154647827, + "rewards/margins": 3.1572341918945312, + "rewards/rejected": -0.5005190968513489, + "step": 1287 + }, + { + "epoch": 0.32, + "grad_norm": 5.804718494415283, + "learning_rate": 8.904018630207526e-06, + "logits/chosen": -0.09644803404808044, + "logits/rejected": -0.21832120418548584, + "logps/chosen": -61.567039489746094, + "logps/rejected": -77.19378662109375, + "loss": 0.9827, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8309812545776367, + "rewards/margins": 2.5113165378570557, + "rewards/rejected": 0.31966492533683777, + "step": 1288 + }, + { + "epoch": 0.32, + "grad_norm": 5.023290157318115, + "learning_rate": 8.902381342473477e-06, + "logits/chosen": -0.040750689804553986, + "logits/rejected": -0.17368032038211823, + "logps/chosen": -76.23723602294922, + "logps/rejected": -78.50442504882812, + "loss": 0.9087, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.721017599105835, + "rewards/margins": 3.731131076812744, + "rewards/rejected": -1.0101128816604614, + "step": 1289 + }, + { + "epoch": 0.32, + "grad_norm": 6.5004425048828125, + "learning_rate": 8.900742983451538e-06, + "logits/chosen": -0.08788704872131348, + "logits/rejected": -0.17277921736240387, + "logps/chosen": -68.3167953491211, + "logps/rejected": -102.31437683105469, + "loss": 0.9353, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7528116703033447, + "rewards/margins": 2.525761604309082, + "rewards/rejected": 0.2270500808954239, + "step": 1290 + }, + { + "epoch": 0.32, + "grad_norm": 4.146306037902832, + "learning_rate": 8.899103553591482e-06, + "logits/chosen": -0.14336498081684113, + "logits/rejected": -0.1881406456232071, + "logps/chosen": -54.144962310791016, + "logps/rejected": -74.18019104003906, + "loss": 0.8089, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.552863836288452, + "rewards/margins": 3.0744495391845703, + "rewards/rejected": -0.5215857028961182, + "step": 1291 + }, + { + "epoch": 0.32, + "grad_norm": 3.0153331756591797, + "learning_rate": 8.897463053343363e-06, + "logits/chosen": -0.14740586280822754, + "logits/rejected": -0.3109232485294342, + "logps/chosen": -65.64893341064453, + "logps/rejected": -57.03159713745117, + "loss": 0.7899, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.378868818283081, + "rewards/margins": 3.5519399642944336, + "rewards/rejected": -0.17307107150554657, + "step": 1292 + }, + { + "epoch": 0.32, + "grad_norm": 4.502954959869385, + "learning_rate": 8.895821483157533e-06, + "logits/chosen": -0.04912843555212021, + "logits/rejected": -0.1956421136856079, + "logps/chosen": -59.58482360839844, + "logps/rejected": -75.71621704101562, + "loss": 0.7814, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8564751148223877, + "rewards/margins": 3.835951805114746, + "rewards/rejected": -0.9794772863388062, + "step": 1293 + }, + { + "epoch": 0.32, + "grad_norm": 5.518468379974365, + "learning_rate": 8.894178843484645e-06, + "logits/chosen": -0.17407596111297607, + "logits/rejected": -0.2052711844444275, + "logps/chosen": -58.673492431640625, + "logps/rejected": -77.12854766845703, + "loss": 0.8599, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9376823902130127, + "rewards/margins": 3.655900716781616, + "rewards/rejected": -0.718218207359314, + "step": 1294 + }, + { + "epoch": 0.32, + "grad_norm": 2.7958574295043945, + "learning_rate": 8.892535134775631e-06, + "logits/chosen": -0.17609688639640808, + "logits/rejected": -0.28241193294525146, + "logps/chosen": -54.67265319824219, + "logps/rejected": -77.6305923461914, + "loss": 0.676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977351188659668, + "rewards/margins": 4.0843892097473145, + "rewards/rejected": -1.1070380210876465, + "step": 1295 + }, + { + "epoch": 0.32, + "grad_norm": 7.565179347991943, + "learning_rate": 8.89089035748173e-06, + "logits/chosen": -0.23548072576522827, + "logits/rejected": -0.2969895303249359, + "logps/chosen": -50.05009460449219, + "logps/rejected": -69.07213592529297, + "loss": 0.839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7066164016723633, + "rewards/margins": 2.5737297534942627, + "rewards/rejected": 0.1328863501548767, + "step": 1296 + }, + { + "epoch": 0.32, + "grad_norm": 3.9232711791992188, + "learning_rate": 8.889244512054466e-06, + "logits/chosen": -0.12914034724235535, + "logits/rejected": -0.2754465341567993, + "logps/chosen": -57.09978485107422, + "logps/rejected": -70.92314147949219, + "loss": 0.7272, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.774653434753418, + "rewards/margins": 4.220605373382568, + "rewards/rejected": -1.44595205783844, + "step": 1297 + }, + { + "epoch": 0.32, + "grad_norm": 3.6103458404541016, + "learning_rate": 8.887597598945662e-06, + "logits/chosen": -0.11310829222202301, + "logits/rejected": -0.17825403809547424, + "logps/chosen": -53.278953552246094, + "logps/rejected": -85.25291442871094, + "loss": 0.681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8492050170898438, + "rewards/margins": 3.1807644367218018, + "rewards/rejected": -0.33156004548072815, + "step": 1298 + }, + { + "epoch": 0.32, + "grad_norm": 3.462888717651367, + "learning_rate": 8.885949618607428e-06, + "logits/chosen": -0.11166447401046753, + "logits/rejected": -0.2058449536561966, + "logps/chosen": -58.821678161621094, + "logps/rejected": -80.81074523925781, + "loss": 0.6753, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.745344638824463, + "rewards/margins": 3.383929491043091, + "rewards/rejected": -0.6385844349861145, + "step": 1299 + }, + { + "epoch": 0.33, + "grad_norm": 4.395800590515137, + "learning_rate": 8.884300571492173e-06, + "logits/chosen": -0.10604344308376312, + "logits/rejected": -0.18407459557056427, + "logps/chosen": -55.8571891784668, + "logps/rejected": -83.26512145996094, + "loss": 0.7979, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8184540271759033, + "rewards/margins": 3.4062275886535645, + "rewards/rejected": -0.5877737998962402, + "step": 1300 + }, + { + "epoch": 0.33, + "grad_norm": 4.415127754211426, + "learning_rate": 8.882650458052592e-06, + "logits/chosen": -0.1841970682144165, + "logits/rejected": -0.267314076423645, + "logps/chosen": -53.23952865600586, + "logps/rejected": -69.35493469238281, + "loss": 0.8206, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9464609622955322, + "rewards/margins": 3.1943914890289307, + "rewards/rejected": -0.24793073534965515, + "step": 1301 + }, + { + "epoch": 0.33, + "grad_norm": 4.528595447540283, + "learning_rate": 8.880999278741683e-06, + "logits/chosen": -0.13506512343883514, + "logits/rejected": -0.29242584109306335, + "logps/chosen": -61.116859436035156, + "logps/rejected": -62.03443145751953, + "loss": 0.9826, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.471087694168091, + "rewards/margins": 3.170863151550293, + "rewards/rejected": -0.6997753381729126, + "step": 1302 + }, + { + "epoch": 0.33, + "grad_norm": 3.789250373840332, + "learning_rate": 8.879347034012722e-06, + "logits/chosen": -0.15025369822978973, + "logits/rejected": -0.2075345814228058, + "logps/chosen": -56.413150787353516, + "logps/rejected": -72.265869140625, + "loss": 0.8758, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9032211303710938, + "rewards/margins": 3.2094528675079346, + "rewards/rejected": -0.30623191595077515, + "step": 1303 + }, + { + "epoch": 0.33, + "grad_norm": 10.769689559936523, + "learning_rate": 8.877693724319294e-06, + "logits/chosen": -0.20228135585784912, + "logits/rejected": -0.3063250780105591, + "logps/chosen": -59.40853500366211, + "logps/rejected": -67.42790222167969, + "loss": 0.8794, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7432003021240234, + "rewards/margins": 3.245255470275879, + "rewards/rejected": -0.5020551085472107, + "step": 1304 + }, + { + "epoch": 0.33, + "grad_norm": 3.947503089904785, + "learning_rate": 8.876039350115263e-06, + "logits/chosen": -0.03376191109418869, + "logits/rejected": -0.19844311475753784, + "logps/chosen": -66.8920669555664, + "logps/rejected": -75.02169799804688, + "loss": 0.8048, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8126330375671387, + "rewards/margins": 3.078364849090576, + "rewards/rejected": -0.26573196053504944, + "step": 1305 + }, + { + "epoch": 0.33, + "grad_norm": 3.037034034729004, + "learning_rate": 8.874383911854792e-06, + "logits/chosen": -0.2238374650478363, + "logits/rejected": -0.3608211576938629, + "logps/chosen": -55.624298095703125, + "logps/rejected": -65.64368438720703, + "loss": 0.6997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.981987714767456, + "rewards/margins": 3.602917432785034, + "rewards/rejected": -0.6209297776222229, + "step": 1306 + }, + { + "epoch": 0.33, + "grad_norm": 6.176809310913086, + "learning_rate": 8.872727409992335e-06, + "logits/chosen": -0.217838317155838, + "logits/rejected": -0.24186952412128448, + "logps/chosen": -57.00798034667969, + "logps/rejected": -79.88400268554688, + "loss": 0.8308, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9337916374206543, + "rewards/margins": 3.000110626220703, + "rewards/rejected": -0.06631867587566376, + "step": 1307 + }, + { + "epoch": 0.33, + "grad_norm": 4.445062637329102, + "learning_rate": 8.871069844982639e-06, + "logits/chosen": -0.09327755123376846, + "logits/rejected": -0.18751609325408936, + "logps/chosen": -67.71380615234375, + "logps/rejected": -73.46354675292969, + "loss": 0.9022, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9548299312591553, + "rewards/margins": 2.947753667831421, + "rewards/rejected": 0.0070764124393463135, + "step": 1308 + }, + { + "epoch": 0.33, + "grad_norm": 4.3132710456848145, + "learning_rate": 8.869411217280736e-06, + "logits/chosen": -0.22880929708480835, + "logits/rejected": -0.2988625764846802, + "logps/chosen": -56.85089111328125, + "logps/rejected": -67.50996398925781, + "loss": 0.8505, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7747020721435547, + "rewards/margins": 3.3826894760131836, + "rewards/rejected": -0.6079874038696289, + "step": 1309 + }, + { + "epoch": 0.33, + "grad_norm": 3.0105316638946533, + "learning_rate": 8.867751527341962e-06, + "logits/chosen": -0.17613887786865234, + "logits/rejected": -0.2832082211971283, + "logps/chosen": -65.24885559082031, + "logps/rejected": -74.89085388183594, + "loss": 0.7029, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6584272384643555, + "rewards/margins": 3.490701913833618, + "rewards/rejected": -0.8322745561599731, + "step": 1310 + }, + { + "epoch": 0.33, + "grad_norm": 3.47274112701416, + "learning_rate": 8.866090775621931e-06, + "logits/chosen": -0.12483084201812744, + "logits/rejected": -0.3159692585468292, + "logps/chosen": -61.38236618041992, + "logps/rejected": -65.43782806396484, + "loss": 0.83, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.382802724838257, + "rewards/margins": 3.362518787384033, + "rewards/rejected": -0.9797159433364868, + "step": 1311 + }, + { + "epoch": 0.33, + "grad_norm": 7.772285461425781, + "learning_rate": 8.864428962576558e-06, + "logits/chosen": -0.14267678558826447, + "logits/rejected": -0.20469358563423157, + "logps/chosen": -54.599334716796875, + "logps/rejected": -72.33415985107422, + "loss": 1.202, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.567682981491089, + "rewards/margins": 2.202434539794922, + "rewards/rejected": 0.36524835228919983, + "step": 1312 + }, + { + "epoch": 0.33, + "grad_norm": 2.8208963871002197, + "learning_rate": 8.862766088662052e-06, + "logits/chosen": -0.21678996086120605, + "logits/rejected": -0.3057379722595215, + "logps/chosen": -48.973941802978516, + "logps/rejected": -68.20497131347656, + "loss": 0.808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9925668239593506, + "rewards/margins": 3.838334321975708, + "rewards/rejected": -0.8457673192024231, + "step": 1313 + }, + { + "epoch": 0.33, + "grad_norm": 7.509524822235107, + "learning_rate": 8.861102154334898e-06, + "logits/chosen": -0.11575770378112793, + "logits/rejected": -0.233211487531662, + "logps/chosen": -70.61119079589844, + "logps/rejected": -67.43603515625, + "loss": 1.0748, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.758253574371338, + "rewards/margins": 2.6601080894470215, + "rewards/rejected": 0.0981454998254776, + "step": 1314 + }, + { + "epoch": 0.33, + "grad_norm": 3.582660675048828, + "learning_rate": 8.85943716005189e-06, + "logits/chosen": -0.2559957802295685, + "logits/rejected": -0.3164220452308655, + "logps/chosen": -61.61867904663086, + "logps/rejected": -71.10794067382812, + "loss": 0.9003, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8554797172546387, + "rewards/margins": 3.0523877143859863, + "rewards/rejected": -0.1969079226255417, + "step": 1315 + }, + { + "epoch": 0.33, + "grad_norm": 3.479262590408325, + "learning_rate": 8.8577711062701e-06, + "logits/chosen": -0.11025796830654144, + "logits/rejected": -0.18804027140140533, + "logps/chosen": -53.202186584472656, + "logps/rejected": -81.64921569824219, + "loss": 0.9175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8053038120269775, + "rewards/margins": 3.52628231048584, + "rewards/rejected": -0.7209790349006653, + "step": 1316 + }, + { + "epoch": 0.33, + "grad_norm": 6.938999176025391, + "learning_rate": 8.856103993446895e-06, + "logits/chosen": -0.0899042934179306, + "logits/rejected": -0.19020234048366547, + "logps/chosen": -58.00337600708008, + "logps/rejected": -70.34976959228516, + "loss": 0.8329, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.954108715057373, + "rewards/margins": 3.3448171615600586, + "rewards/rejected": -0.390708863735199, + "step": 1317 + }, + { + "epoch": 0.33, + "grad_norm": 4.519232273101807, + "learning_rate": 8.854435822039938e-06, + "logits/chosen": -0.10711662471294403, + "logits/rejected": -0.2500491142272949, + "logps/chosen": -59.73412322998047, + "logps/rejected": -64.85607147216797, + "loss": 0.7456, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6706128120422363, + "rewards/margins": 3.2748026847839355, + "rewards/rejected": -0.604189932346344, + "step": 1318 + }, + { + "epoch": 0.33, + "grad_norm": 4.287625789642334, + "learning_rate": 8.852766592507175e-06, + "logits/chosen": -0.0850057303905487, + "logits/rejected": -0.14132362604141235, + "logps/chosen": -65.49993133544922, + "logps/rejected": -75.38569641113281, + "loss": 0.8677, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0611319541931152, + "rewards/margins": 3.3626294136047363, + "rewards/rejected": -0.30149751901626587, + "step": 1319 + }, + { + "epoch": 0.33, + "grad_norm": 9.333602905273438, + "learning_rate": 8.851096305306846e-06, + "logits/chosen": -0.14060166478157043, + "logits/rejected": -0.2714458107948303, + "logps/chosen": -55.524871826171875, + "logps/rejected": -65.1989517211914, + "loss": 0.8989, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6846189498901367, + "rewards/margins": 3.1002495288848877, + "rewards/rejected": -0.4156307578086853, + "step": 1320 + }, + { + "epoch": 0.33, + "grad_norm": 2.3499269485473633, + "learning_rate": 8.849424960897482e-06, + "logits/chosen": -0.19904205203056335, + "logits/rejected": -0.393319308757782, + "logps/chosen": -54.564369201660156, + "logps/rejected": -60.47876739501953, + "loss": 0.6644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.626253366470337, + "rewards/margins": 4.282526016235352, + "rewards/rejected": -1.6562724113464355, + "step": 1321 + }, + { + "epoch": 0.33, + "grad_norm": 3.6965367794036865, + "learning_rate": 8.847752559737902e-06, + "logits/chosen": -0.14524805545806885, + "logits/rejected": -0.27222830057144165, + "logps/chosen": -61.52315139770508, + "logps/rejected": -65.29212188720703, + "loss": 0.8501, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0375733375549316, + "rewards/margins": 3.19648814201355, + "rewards/rejected": -0.1589149683713913, + "step": 1322 + }, + { + "epoch": 0.33, + "grad_norm": 3.949253797531128, + "learning_rate": 8.846079102287215e-06, + "logits/chosen": -0.09469793736934662, + "logits/rejected": -0.202487513422966, + "logps/chosen": -57.236473083496094, + "logps/rejected": -78.14765930175781, + "loss": 0.7663, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8435380458831787, + "rewards/margins": 4.103811264038086, + "rewards/rejected": -1.2602732181549072, + "step": 1323 + }, + { + "epoch": 0.33, + "grad_norm": 7.9225993156433105, + "learning_rate": 8.844404589004825e-06, + "logits/chosen": -0.2257511168718338, + "logits/rejected": -0.3048459589481354, + "logps/chosen": -50.31788635253906, + "logps/rejected": -67.59465026855469, + "loss": 0.9714, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.842468500137329, + "rewards/margins": 3.4748551845550537, + "rewards/rejected": -0.6323869824409485, + "step": 1324 + }, + { + "epoch": 0.33, + "grad_norm": 3.4683804512023926, + "learning_rate": 8.842729020350417e-06, + "logits/chosen": -0.11994478106498718, + "logits/rejected": -0.27418604493141174, + "logps/chosen": -52.13243103027344, + "logps/rejected": -62.007877349853516, + "loss": 0.7564, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.559220552444458, + "rewards/margins": 3.7213940620422363, + "rewards/rejected": -1.1621735095977783, + "step": 1325 + }, + { + "epoch": 0.33, + "grad_norm": 3.922180652618408, + "learning_rate": 8.841052396783976e-06, + "logits/chosen": -0.16087250411510468, + "logits/rejected": -0.2905736565589905, + "logps/chosen": -57.948848724365234, + "logps/rejected": -71.42221069335938, + "loss": 0.8538, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.740936279296875, + "rewards/margins": 3.26213002204895, + "rewards/rejected": -0.5211934447288513, + "step": 1326 + }, + { + "epoch": 0.33, + "grad_norm": 4.4906206130981445, + "learning_rate": 8.839374718765766e-06, + "logits/chosen": -0.1560344696044922, + "logits/rejected": -0.3166842460632324, + "logps/chosen": -55.704139709472656, + "logps/rejected": -71.48664093017578, + "loss": 0.8197, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.535257577896118, + "rewards/margins": 3.770357370376587, + "rewards/rejected": -1.2351000308990479, + "step": 1327 + }, + { + "epoch": 0.33, + "grad_norm": 3.190633773803711, + "learning_rate": 8.837695986756351e-06, + "logits/chosen": -0.10619253665208817, + "logits/rejected": -0.2199493944644928, + "logps/chosen": -56.548152923583984, + "logps/rejected": -75.70951843261719, + "loss": 0.7331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.80183482170105, + "rewards/margins": 3.648038625717163, + "rewards/rejected": -0.8462036848068237, + "step": 1328 + }, + { + "epoch": 0.33, + "grad_norm": 4.4335198402404785, + "learning_rate": 8.836016201216575e-06, + "logits/chosen": -0.15134207904338837, + "logits/rejected": -0.2884814143180847, + "logps/chosen": -55.61899185180664, + "logps/rejected": -69.59037780761719, + "loss": 0.7562, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6949410438537598, + "rewards/margins": 3.8025970458984375, + "rewards/rejected": -1.1076561212539673, + "step": 1329 + }, + { + "epoch": 0.33, + "grad_norm": 3.415060520172119, + "learning_rate": 8.834335362607578e-06, + "logits/chosen": -0.12135857343673706, + "logits/rejected": -0.28117865324020386, + "logps/chosen": -51.35698318481445, + "logps/rejected": -65.70181274414062, + "loss": 0.7453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2457869052886963, + "rewards/margins": 4.007103443145752, + "rewards/rejected": -0.7613164782524109, + "step": 1330 + }, + { + "epoch": 0.33, + "grad_norm": 3.761690139770508, + "learning_rate": 8.832653471390783e-06, + "logits/chosen": -0.2862381339073181, + "logits/rejected": -0.37923309206962585, + "logps/chosen": -44.46415710449219, + "logps/rejected": -84.15593719482422, + "loss": 0.7373, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.657736301422119, + "rewards/margins": 4.05775260925293, + "rewards/rejected": -1.4000163078308105, + "step": 1331 + }, + { + "epoch": 0.33, + "grad_norm": 4.388571262359619, + "learning_rate": 8.830970528027912e-06, + "logits/chosen": -0.1821955442428589, + "logits/rejected": -0.2497093677520752, + "logps/chosen": -53.48994827270508, + "logps/rejected": -70.43257904052734, + "loss": 0.9201, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4152889251708984, + "rewards/margins": 3.1148195266723633, + "rewards/rejected": -0.6995310187339783, + "step": 1332 + }, + { + "epoch": 0.33, + "grad_norm": 5.135583400726318, + "learning_rate": 8.829286532980963e-06, + "logits/chosen": -0.08355457335710526, + "logits/rejected": -0.2507716417312622, + "logps/chosen": -56.89943313598633, + "logps/rejected": -64.89965057373047, + "loss": 0.8479, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6074984073638916, + "rewards/margins": 2.862720251083374, + "rewards/rejected": -0.25522178411483765, + "step": 1333 + }, + { + "epoch": 0.33, + "grad_norm": 5.825660705566406, + "learning_rate": 8.827601486712232e-06, + "logits/chosen": -0.11886465549468994, + "logits/rejected": -0.2793791890144348, + "logps/chosen": -68.78485870361328, + "logps/rejected": -67.49732208251953, + "loss": 0.9074, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5301523208618164, + "rewards/margins": 2.978753089904785, + "rewards/rejected": -0.4486006796360016, + "step": 1334 + }, + { + "epoch": 0.33, + "grad_norm": 4.342766761779785, + "learning_rate": 8.8259153896843e-06, + "logits/chosen": -0.12887519598007202, + "logits/rejected": -0.1727626472711563, + "logps/chosen": -48.451663970947266, + "logps/rejected": -78.17550659179688, + "loss": 0.7909, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.645921230316162, + "rewards/margins": 3.487417459487915, + "rewards/rejected": -0.8414966464042664, + "step": 1335 + }, + { + "epoch": 0.33, + "grad_norm": 7.577632427215576, + "learning_rate": 8.824228242360035e-06, + "logits/chosen": -0.04433093219995499, + "logits/rejected": -0.20669923722743988, + "logps/chosen": -77.52042388916016, + "logps/rejected": -80.10365295410156, + "loss": 0.9544, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.836519241333008, + "rewards/margins": 3.2678475379943848, + "rewards/rejected": -0.4313282370567322, + "step": 1336 + }, + { + "epoch": 0.33, + "grad_norm": 4.499130725860596, + "learning_rate": 8.8225400452026e-06, + "logits/chosen": -0.17251653969287872, + "logits/rejected": -0.26595836877822876, + "logps/chosen": -57.402870178222656, + "logps/rejected": -76.905029296875, + "loss": 0.813, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7132725715637207, + "rewards/margins": 3.2813847064971924, + "rewards/rejected": -0.5681116580963135, + "step": 1337 + }, + { + "epoch": 0.33, + "grad_norm": 5.05397367477417, + "learning_rate": 8.820850798675435e-06, + "logits/chosen": -0.19004350900650024, + "logits/rejected": -0.23259538412094116, + "logps/chosen": -49.09174728393555, + "logps/rejected": -73.00131225585938, + "loss": 0.8142, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7782883644104004, + "rewards/margins": 3.4634170532226562, + "rewards/rejected": -0.685128390789032, + "step": 1338 + }, + { + "epoch": 0.33, + "grad_norm": 4.873232364654541, + "learning_rate": 8.819160503242282e-06, + "logits/chosen": -0.20142383873462677, + "logits/rejected": -0.32768306136131287, + "logps/chosen": -56.44738006591797, + "logps/rejected": -60.45212936401367, + "loss": 0.9429, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.608762741088867, + "rewards/margins": 3.0713489055633545, + "rewards/rejected": -0.46258577704429626, + "step": 1339 + }, + { + "epoch": 0.34, + "grad_norm": 3.5540997982025146, + "learning_rate": 8.817469159367159e-06, + "logits/chosen": -0.19370169937610626, + "logits/rejected": -0.34305450320243835, + "logps/chosen": -52.35350036621094, + "logps/rejected": -69.44891357421875, + "loss": 0.7749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.662114381790161, + "rewards/margins": 4.345302581787109, + "rewards/rejected": -1.6831883192062378, + "step": 1340 + }, + { + "epoch": 0.34, + "grad_norm": 6.899341106414795, + "learning_rate": 8.815776767514374e-06, + "logits/chosen": -0.20507332682609558, + "logits/rejected": -0.3717234134674072, + "logps/chosen": -57.536441802978516, + "logps/rejected": -57.479854583740234, + "loss": 0.8646, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.517178773880005, + "rewards/margins": 3.1754467487335205, + "rewards/rejected": -0.6582680344581604, + "step": 1341 + }, + { + "epoch": 0.34, + "grad_norm": 4.833277702331543, + "learning_rate": 8.814083328148532e-06, + "logits/chosen": -0.09182841330766678, + "logits/rejected": -0.26285025477409363, + "logps/chosen": -63.28022766113281, + "logps/rejected": -73.32353973388672, + "loss": 0.7328, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.666076898574829, + "rewards/margins": 4.032770156860352, + "rewards/rejected": -1.3666932582855225, + "step": 1342 + }, + { + "epoch": 0.34, + "grad_norm": 4.017453193664551, + "learning_rate": 8.812388841734513e-06, + "logits/chosen": -0.2049219310283661, + "logits/rejected": -0.2744123041629791, + "logps/chosen": -57.59117126464844, + "logps/rejected": -84.29424285888672, + "loss": 0.7333, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.060549020767212, + "rewards/margins": 3.9842162132263184, + "rewards/rejected": -0.9236680269241333, + "step": 1343 + }, + { + "epoch": 0.34, + "grad_norm": 7.459676742553711, + "learning_rate": 8.810693308737493e-06, + "logits/chosen": -0.12939241528511047, + "logits/rejected": -0.2766835689544678, + "logps/chosen": -61.71961212158203, + "logps/rejected": -69.63858032226562, + "loss": 0.8817, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8087220191955566, + "rewards/margins": 2.72458553314209, + "rewards/rejected": 0.08413635939359665, + "step": 1344 + }, + { + "epoch": 0.34, + "grad_norm": 3.3798909187316895, + "learning_rate": 8.808996729622931e-06, + "logits/chosen": -0.09766839444637299, + "logits/rejected": -0.22112852334976196, + "logps/chosen": -68.21566772460938, + "logps/rejected": -82.80736541748047, + "loss": 0.7942, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7606287002563477, + "rewards/margins": 4.131595611572266, + "rewards/rejected": -1.3709664344787598, + "step": 1345 + }, + { + "epoch": 0.34, + "grad_norm": 4.175424575805664, + "learning_rate": 8.807299104856575e-06, + "logits/chosen": -0.11393021047115326, + "logits/rejected": -0.15881472826004028, + "logps/chosen": -57.72193145751953, + "logps/rejected": -82.31776428222656, + "loss": 0.8922, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7272756099700928, + "rewards/margins": 3.8571979999542236, + "rewards/rejected": -1.1299222707748413, + "step": 1346 + }, + { + "epoch": 0.34, + "grad_norm": 3.596022367477417, + "learning_rate": 8.805600434904461e-06, + "logits/chosen": -0.07082128524780273, + "logits/rejected": -0.13455910980701447, + "logps/chosen": -60.29106140136719, + "logps/rejected": -80.67330932617188, + "loss": 0.8806, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7595577239990234, + "rewards/margins": 2.8959925174713135, + "rewards/rejected": -0.13643492758274078, + "step": 1347 + }, + { + "epoch": 0.34, + "grad_norm": 4.712276935577393, + "learning_rate": 8.803900720232908e-06, + "logits/chosen": -0.10783126950263977, + "logits/rejected": -0.2378195822238922, + "logps/chosen": -54.12769317626953, + "logps/rejected": -68.90292358398438, + "loss": 0.8875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8154754638671875, + "rewards/margins": 3.041372537612915, + "rewards/rejected": -0.22589708864688873, + "step": 1348 + }, + { + "epoch": 0.34, + "grad_norm": 4.659729957580566, + "learning_rate": 8.802199961308526e-06, + "logits/chosen": 0.007344476878643036, + "logits/rejected": -0.17883272469043732, + "logps/chosen": -63.410003662109375, + "logps/rejected": -68.59333801269531, + "loss": 0.8154, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.760157585144043, + "rewards/margins": 3.670093297958374, + "rewards/rejected": -0.909935474395752, + "step": 1349 + }, + { + "epoch": 0.34, + "grad_norm": 2.197103977203369, + "learning_rate": 8.80049815859821e-06, + "logits/chosen": -0.1757555603981018, + "logits/rejected": -0.3312460780143738, + "logps/chosen": -52.1156005859375, + "logps/rejected": -72.06583404541016, + "loss": 0.6798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.065781593322754, + "rewards/margins": 4.161539554595947, + "rewards/rejected": -1.0957578420639038, + "step": 1350 + }, + { + "epoch": 0.34, + "grad_norm": 3.7720465660095215, + "learning_rate": 8.798795312569141e-06, + "logits/chosen": -0.26152634620666504, + "logits/rejected": -0.39825770258903503, + "logps/chosen": -49.98682403564453, + "logps/rejected": -71.33222961425781, + "loss": 0.7156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7866315841674805, + "rewards/margins": 4.100205421447754, + "rewards/rejected": -1.3135735988616943, + "step": 1351 + }, + { + "epoch": 0.34, + "grad_norm": 4.735084056854248, + "learning_rate": 8.797091423688787e-06, + "logits/chosen": -0.13067828118801117, + "logits/rejected": -0.18884438276290894, + "logps/chosen": -58.4080696105957, + "logps/rejected": -74.9541015625, + "loss": 0.9222, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7874343395233154, + "rewards/margins": 2.8062918186187744, + "rewards/rejected": -0.018857449293136597, + "step": 1352 + }, + { + "epoch": 0.34, + "grad_norm": 3.714585542678833, + "learning_rate": 8.795386492424902e-06, + "logits/chosen": -0.08028688281774521, + "logits/rejected": -0.2551254332065582, + "logps/chosen": -56.40964889526367, + "logps/rejected": -58.36027526855469, + "loss": 0.7771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7964277267456055, + "rewards/margins": 3.702500343322754, + "rewards/rejected": -0.9060724973678589, + "step": 1353 + }, + { + "epoch": 0.34, + "grad_norm": 4.018319129943848, + "learning_rate": 8.793680519245527e-06, + "logits/chosen": -0.07745862752199173, + "logits/rejected": -0.1548008918762207, + "logps/chosen": -47.705997467041016, + "logps/rejected": -79.30198669433594, + "loss": 0.7799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7536826133728027, + "rewards/margins": 3.4558725357055664, + "rewards/rejected": -0.7021899819374084, + "step": 1354 + }, + { + "epoch": 0.34, + "grad_norm": 3.686392307281494, + "learning_rate": 8.79197350461899e-06, + "logits/chosen": -0.20293644070625305, + "logits/rejected": -0.24671992659568787, + "logps/chosen": -56.4598274230957, + "logps/rejected": -79.7083511352539, + "loss": 0.9611, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7462005615234375, + "rewards/margins": 3.1664981842041016, + "rewards/rejected": -0.4202978312969208, + "step": 1355 + }, + { + "epoch": 0.34, + "grad_norm": 4.920022010803223, + "learning_rate": 8.790265449013899e-06, + "logits/chosen": -0.047109801322221756, + "logits/rejected": -0.12320595979690552, + "logps/chosen": -63.72962951660156, + "logps/rejected": -68.70993041992188, + "loss": 0.9128, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.532421112060547, + "rewards/margins": 2.799170970916748, + "rewards/rejected": -0.26674947142601013, + "step": 1356 + }, + { + "epoch": 0.34, + "grad_norm": 6.7821831703186035, + "learning_rate": 8.788556352899156e-06, + "logits/chosen": -0.25173789262771606, + "logits/rejected": -0.30300936102867126, + "logps/chosen": -57.035125732421875, + "logps/rejected": -85.20259094238281, + "loss": 0.9782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.722865581512451, + "rewards/margins": 2.6442782878875732, + "rewards/rejected": 0.07858766615390778, + "step": 1357 + }, + { + "epoch": 0.34, + "grad_norm": 5.9568610191345215, + "learning_rate": 8.786846216743943e-06, + "logits/chosen": -0.24118317663669586, + "logits/rejected": -0.34306007623672485, + "logps/chosen": -58.777008056640625, + "logps/rejected": -64.44780731201172, + "loss": 0.9296, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.803964614868164, + "rewards/margins": 3.513223886489868, + "rewards/rejected": -0.7092592716217041, + "step": 1358 + }, + { + "epoch": 0.34, + "grad_norm": 4.734737396240234, + "learning_rate": 8.785135041017729e-06, + "logits/chosen": -0.13383859395980835, + "logits/rejected": -0.19424079358577728, + "logps/chosen": -55.273521423339844, + "logps/rejected": -74.05965423583984, + "loss": 0.8092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.641475200653076, + "rewards/margins": 3.0609686374664307, + "rewards/rejected": -0.41949331760406494, + "step": 1359 + }, + { + "epoch": 0.34, + "grad_norm": 7.427429676055908, + "learning_rate": 8.783422826190272e-06, + "logits/chosen": -0.09979302436113358, + "logits/rejected": -0.21110273897647858, + "logps/chosen": -61.4578971862793, + "logps/rejected": -84.93951416015625, + "loss": 0.8919, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7813284397125244, + "rewards/margins": 3.153273582458496, + "rewards/rejected": -0.3719448447227478, + "step": 1360 + }, + { + "epoch": 0.34, + "grad_norm": 4.382689952850342, + "learning_rate": 8.781709572731607e-06, + "logits/chosen": -0.1182054877281189, + "logits/rejected": -0.18246904015541077, + "logps/chosen": -58.847938537597656, + "logps/rejected": -86.78826904296875, + "loss": 0.8388, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6917903423309326, + "rewards/margins": 2.8019351959228516, + "rewards/rejected": -0.11014492809772491, + "step": 1361 + }, + { + "epoch": 0.34, + "grad_norm": 7.197299957275391, + "learning_rate": 8.779995281112063e-06, + "logits/chosen": -0.15135855972766876, + "logits/rejected": -0.2093694657087326, + "logps/chosen": -62.317405700683594, + "logps/rejected": -69.854736328125, + "loss": 0.7976, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.546684503555298, + "rewards/margins": 3.090228796005249, + "rewards/rejected": -0.5435444116592407, + "step": 1362 + }, + { + "epoch": 0.34, + "grad_norm": 3.0010578632354736, + "learning_rate": 8.778279951802249e-06, + "logits/chosen": -0.08477528393268585, + "logits/rejected": -0.2328411042690277, + "logps/chosen": -71.42220306396484, + "logps/rejected": -80.07200622558594, + "loss": 0.7902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7864201068878174, + "rewards/margins": 3.463625907897949, + "rewards/rejected": -0.6772055625915527, + "step": 1363 + }, + { + "epoch": 0.34, + "grad_norm": 9.221405029296875, + "learning_rate": 8.776563585273057e-06, + "logits/chosen": -0.104836106300354, + "logits/rejected": -0.22371967136859894, + "logps/chosen": -47.87017059326172, + "logps/rejected": -69.88249969482422, + "loss": 0.8609, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0125176906585693, + "rewards/margins": 2.928727626800537, + "rewards/rejected": 0.083790123462677, + "step": 1364 + }, + { + "epoch": 0.34, + "grad_norm": 4.5029520988464355, + "learning_rate": 8.774846181995671e-06, + "logits/chosen": -0.1258850246667862, + "logits/rejected": -0.23851965367794037, + "logps/chosen": -51.12650680541992, + "logps/rejected": -66.36325073242188, + "loss": 0.9205, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6342949867248535, + "rewards/margins": 2.9360294342041016, + "rewards/rejected": -0.3017340898513794, + "step": 1365 + }, + { + "epoch": 0.34, + "grad_norm": 4.396109104156494, + "learning_rate": 8.773127742441552e-06, + "logits/chosen": -0.13464823365211487, + "logits/rejected": -0.2512248754501343, + "logps/chosen": -61.697593688964844, + "logps/rejected": -65.65930938720703, + "loss": 0.9258, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.904651403427124, + "rewards/margins": 2.777794361114502, + "rewards/rejected": 0.12685717642307281, + "step": 1366 + }, + { + "epoch": 0.34, + "grad_norm": 3.147986650466919, + "learning_rate": 8.77140826708245e-06, + "logits/chosen": -0.18648138642311096, + "logits/rejected": -0.29771482944488525, + "logps/chosen": -55.602821350097656, + "logps/rejected": -78.02184295654297, + "loss": 0.7638, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.102415084838867, + "rewards/margins": 3.6153409481048584, + "rewards/rejected": -0.5129256844520569, + "step": 1367 + }, + { + "epoch": 0.34, + "grad_norm": 3.1801791191101074, + "learning_rate": 8.769687756390401e-06, + "logits/chosen": -0.12394063174724579, + "logits/rejected": -0.2161087691783905, + "logps/chosen": -56.62104797363281, + "logps/rejected": -83.35388946533203, + "loss": 0.7666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.976139545440674, + "rewards/margins": 3.2898359298706055, + "rewards/rejected": -0.31369632482528687, + "step": 1368 + }, + { + "epoch": 0.34, + "grad_norm": 4.742763042449951, + "learning_rate": 8.767966210837715e-06, + "logits/chosen": -0.08258312940597534, + "logits/rejected": -0.17061415314674377, + "logps/chosen": -54.54634475708008, + "logps/rejected": -75.21273803710938, + "loss": 0.8078, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.753953456878662, + "rewards/margins": 3.2967212200164795, + "rewards/rejected": -0.5427678227424622, + "step": 1369 + }, + { + "epoch": 0.34, + "grad_norm": 4.674061298370361, + "learning_rate": 8.766243630897002e-06, + "logits/chosen": -0.10361266136169434, + "logits/rejected": -0.14726343750953674, + "logps/chosen": -57.21774673461914, + "logps/rejected": -82.6228256225586, + "loss": 0.9414, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5538883209228516, + "rewards/margins": 2.594125270843506, + "rewards/rejected": -0.0402371808886528, + "step": 1370 + }, + { + "epoch": 0.34, + "grad_norm": 3.448359489440918, + "learning_rate": 8.764520017041141e-06, + "logits/chosen": -0.0038866866379976273, + "logits/rejected": -0.14667727053165436, + "logps/chosen": -64.71503448486328, + "logps/rejected": -75.53404998779297, + "loss": 0.7911, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8678343296051025, + "rewards/margins": 3.25190806388855, + "rewards/rejected": -0.38407349586486816, + "step": 1371 + }, + { + "epoch": 0.34, + "grad_norm": 2.9263386726379395, + "learning_rate": 8.762795369743303e-06, + "logits/chosen": -0.15336525440216064, + "logits/rejected": -0.2615340054035187, + "logps/chosen": -55.998008728027344, + "logps/rejected": -65.16043853759766, + "loss": 0.7443, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.710265874862671, + "rewards/margins": 3.6549673080444336, + "rewards/rejected": -0.944701611995697, + "step": 1372 + }, + { + "epoch": 0.34, + "grad_norm": 3.4590396881103516, + "learning_rate": 8.761069689476942e-06, + "logits/chosen": -0.21182343363761902, + "logits/rejected": -0.3555537164211273, + "logps/chosen": -60.62527847290039, + "logps/rejected": -63.418643951416016, + "loss": 0.7656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6822781562805176, + "rewards/margins": 3.24153208732605, + "rewards/rejected": -0.5592538714408875, + "step": 1373 + }, + { + "epoch": 0.34, + "grad_norm": 5.070857524871826, + "learning_rate": 8.759342976715795e-06, + "logits/chosen": -0.0980716273188591, + "logits/rejected": -0.21992170810699463, + "logps/chosen": -58.09351348876953, + "logps/rejected": -60.943729400634766, + "loss": 0.9494, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5844876766204834, + "rewards/margins": 2.660440444946289, + "rewards/rejected": -0.07595279067754745, + "step": 1374 + }, + { + "epoch": 0.34, + "grad_norm": 2.7651567459106445, + "learning_rate": 8.757615231933879e-06, + "logits/chosen": -0.14595887064933777, + "logits/rejected": -0.3486374318599701, + "logps/chosen": -46.70328140258789, + "logps/rejected": -59.93647384643555, + "loss": 0.6743, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7954342365264893, + "rewards/margins": 4.15584659576416, + "rewards/rejected": -1.36041259765625, + "step": 1375 + }, + { + "epoch": 0.34, + "grad_norm": 3.57072114944458, + "learning_rate": 8.755886455605499e-06, + "logits/chosen": -0.17380240559577942, + "logits/rejected": -0.2989679276943207, + "logps/chosen": -65.32500457763672, + "logps/rejected": -77.20381164550781, + "loss": 0.7769, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.803584575653076, + "rewards/margins": 3.192082643508911, + "rewards/rejected": -0.3884976804256439, + "step": 1376 + }, + { + "epoch": 0.34, + "grad_norm": 4.758569240570068, + "learning_rate": 8.75415664820524e-06, + "logits/chosen": -0.07443613559007645, + "logits/rejected": -0.17264489829540253, + "logps/chosen": -56.469844818115234, + "logps/rejected": -71.72028350830078, + "loss": 0.8125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.942589044570923, + "rewards/margins": 2.819167137145996, + "rewards/rejected": 0.12342211604118347, + "step": 1377 + }, + { + "epoch": 0.34, + "grad_norm": 6.6420183181762695, + "learning_rate": 8.752425810207976e-06, + "logits/chosen": -0.09817563742399216, + "logits/rejected": -0.22027549147605896, + "logps/chosen": -56.56708526611328, + "logps/rejected": -63.45307922363281, + "loss": 0.8365, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6952965259552, + "rewards/margins": 3.1234676837921143, + "rewards/rejected": -0.4281713366508484, + "step": 1378 + }, + { + "epoch": 0.34, + "grad_norm": 4.001518249511719, + "learning_rate": 8.750693942088855e-06, + "logits/chosen": -0.20946983993053436, + "logits/rejected": -0.3560428321361542, + "logps/chosen": -56.441810607910156, + "logps/rejected": -64.69556427001953, + "loss": 0.729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9981892108917236, + "rewards/margins": 3.2037644386291504, + "rewards/rejected": -0.20557498931884766, + "step": 1379 + }, + { + "epoch": 0.35, + "grad_norm": 4.5694169998168945, + "learning_rate": 8.748961044323312e-06, + "logits/chosen": -0.10503870248794556, + "logits/rejected": -0.22409886121749878, + "logps/chosen": -59.81840515136719, + "logps/rejected": -68.5485610961914, + "loss": 0.7786, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.745093584060669, + "rewards/margins": 3.423089027404785, + "rewards/rejected": -0.6779953837394714, + "step": 1380 + }, + { + "epoch": 0.35, + "grad_norm": 4.063881874084473, + "learning_rate": 8.747227117387068e-06, + "logits/chosen": -0.1431444138288498, + "logits/rejected": -0.24831001460552216, + "logps/chosen": -50.46198272705078, + "logps/rejected": -75.19461059570312, + "loss": 0.7409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.775387763977051, + "rewards/margins": 3.4845900535583496, + "rewards/rejected": -0.7092023491859436, + "step": 1381 + }, + { + "epoch": 0.35, + "grad_norm": 4.291505813598633, + "learning_rate": 8.74549216175612e-06, + "logits/chosen": -0.019925441592931747, + "logits/rejected": -0.14839787781238556, + "logps/chosen": -62.11656951904297, + "logps/rejected": -74.65064239501953, + "loss": 0.8194, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7754063606262207, + "rewards/margins": 3.332773447036743, + "rewards/rejected": -0.5573668479919434, + "step": 1382 + }, + { + "epoch": 0.35, + "grad_norm": 4.505814552307129, + "learning_rate": 8.743756177906752e-06, + "logits/chosen": -0.20519298315048218, + "logits/rejected": -0.36795443296432495, + "logps/chosen": -51.651039123535156, + "logps/rejected": -53.64351272583008, + "loss": 0.772, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7365639209747314, + "rewards/margins": 3.209451913833618, + "rewards/rejected": -0.4728880524635315, + "step": 1383 + }, + { + "epoch": 0.35, + "grad_norm": 5.262824535369873, + "learning_rate": 8.74201916631553e-06, + "logits/chosen": -0.09688416123390198, + "logits/rejected": -0.21373571455478668, + "logps/chosen": -61.23922348022461, + "logps/rejected": -72.09506225585938, + "loss": 0.9469, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7053000926971436, + "rewards/margins": 3.219727039337158, + "rewards/rejected": -0.5144268274307251, + "step": 1384 + }, + { + "epoch": 0.35, + "grad_norm": 2.3111624717712402, + "learning_rate": 8.7402811274593e-06, + "logits/chosen": -0.14728368818759918, + "logits/rejected": -0.24214325845241547, + "logps/chosen": -54.89702606201172, + "logps/rejected": -82.88089752197266, + "loss": 0.6191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.805429220199585, + "rewards/margins": 4.485383033752441, + "rewards/rejected": -1.679954171180725, + "step": 1385 + }, + { + "epoch": 0.35, + "grad_norm": 5.343991756439209, + "learning_rate": 8.738542061815192e-06, + "logits/chosen": -0.1058264821767807, + "logits/rejected": -0.19786497950553894, + "logps/chosen": -62.06570816040039, + "logps/rejected": -73.08685302734375, + "loss": 1.0581, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.7075533866882324, + "rewards/margins": 2.2063896656036377, + "rewards/rejected": 0.5011637806892395, + "step": 1386 + }, + { + "epoch": 0.35, + "grad_norm": 3.949364423751831, + "learning_rate": 8.736801969860616e-06, + "logits/chosen": -0.18616656959056854, + "logits/rejected": -0.23060965538024902, + "logps/chosen": -58.375526428222656, + "logps/rejected": -86.88160705566406, + "loss": 0.7656, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.648435354232788, + "rewards/margins": 4.035894870758057, + "rewards/rejected": -1.3874592781066895, + "step": 1387 + }, + { + "epoch": 0.35, + "grad_norm": 3.942213535308838, + "learning_rate": 8.735060852073267e-06, + "logits/chosen": -0.15445451438426971, + "logits/rejected": -0.2398880124092102, + "logps/chosen": -46.91170883178711, + "logps/rejected": -73.75534057617188, + "loss": 0.7556, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9285500049591064, + "rewards/margins": 3.9598984718322754, + "rewards/rejected": -1.0313483476638794, + "step": 1388 + }, + { + "epoch": 0.35, + "grad_norm": 3.952366352081299, + "learning_rate": 8.733318708931117e-06, + "logits/chosen": -0.18426728248596191, + "logits/rejected": -0.23796939849853516, + "logps/chosen": -56.15251159667969, + "logps/rejected": -78.67704772949219, + "loss": 0.8061, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9206700325012207, + "rewards/margins": 3.185084581375122, + "rewards/rejected": -0.26441502571105957, + "step": 1389 + }, + { + "epoch": 0.35, + "grad_norm": 4.070239067077637, + "learning_rate": 8.731575540912422e-06, + "logits/chosen": -0.16672945022583008, + "logits/rejected": -0.3305407762527466, + "logps/chosen": -61.42167663574219, + "logps/rejected": -67.55390167236328, + "loss": 0.8039, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.821359157562256, + "rewards/margins": 3.0857725143432617, + "rewards/rejected": -0.26441311836242676, + "step": 1390 + }, + { + "epoch": 0.35, + "grad_norm": 4.108038902282715, + "learning_rate": 8.729831348495721e-06, + "logits/chosen": -0.1687307357788086, + "logits/rejected": -0.22865335643291473, + "logps/chosen": -64.75178527832031, + "logps/rejected": -82.88294982910156, + "loss": 0.9109, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5324792861938477, + "rewards/margins": 3.204188346862793, + "rewards/rejected": -0.6717085242271423, + "step": 1391 + }, + { + "epoch": 0.35, + "grad_norm": 4.942657470703125, + "learning_rate": 8.72808613215983e-06, + "logits/chosen": -0.14175717532634735, + "logits/rejected": -0.324273943901062, + "logps/chosen": -64.39058685302734, + "logps/rejected": -59.435150146484375, + "loss": 0.7279, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8380327224731445, + "rewards/margins": 3.7230429649353027, + "rewards/rejected": -0.8850104808807373, + "step": 1392 + }, + { + "epoch": 0.35, + "grad_norm": 3.7383625507354736, + "learning_rate": 8.726339892383853e-06, + "logits/chosen": -0.0037348419427871704, + "logits/rejected": -0.13855819404125214, + "logps/chosen": -59.358882904052734, + "logps/rejected": -85.89924621582031, + "loss": 0.7706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.834662914276123, + "rewards/margins": 4.115994930267334, + "rewards/rejected": -1.2813314199447632, + "step": 1393 + }, + { + "epoch": 0.35, + "grad_norm": 3.643653154373169, + "learning_rate": 8.724592629647163e-06, + "logits/chosen": -0.1984715759754181, + "logits/rejected": -0.320168673992157, + "logps/chosen": -51.280357360839844, + "logps/rejected": -66.35184478759766, + "loss": 0.7185, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.797816038131714, + "rewards/margins": 3.863720417022705, + "rewards/rejected": -1.0659042596817017, + "step": 1394 + }, + { + "epoch": 0.35, + "grad_norm": 4.098358154296875, + "learning_rate": 8.72284434442943e-06, + "logits/chosen": -0.15155410766601562, + "logits/rejected": -0.22400452196598053, + "logps/chosen": -62.742305755615234, + "logps/rejected": -80.59945678710938, + "loss": 0.8579, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.772141218185425, + "rewards/margins": 3.2775120735168457, + "rewards/rejected": -0.5053706765174866, + "step": 1395 + }, + { + "epoch": 0.35, + "grad_norm": 2.3543343544006348, + "learning_rate": 8.72109503721059e-06, + "logits/chosen": -0.1594918668270111, + "logits/rejected": -0.2772344946861267, + "logps/chosen": -52.214385986328125, + "logps/rejected": -66.36396026611328, + "loss": 0.6624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.113314628601074, + "rewards/margins": 4.16676139831543, + "rewards/rejected": -1.0534462928771973, + "step": 1396 + }, + { + "epoch": 0.35, + "grad_norm": 5.931587219238281, + "learning_rate": 8.719344708470868e-06, + "logits/chosen": -0.1555395871400833, + "logits/rejected": -0.25496840476989746, + "logps/chosen": -57.37736511230469, + "logps/rejected": -75.8989486694336, + "loss": 0.8173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.431793212890625, + "rewards/margins": 3.4987096786499023, + "rewards/rejected": -1.0669162273406982, + "step": 1397 + }, + { + "epoch": 0.35, + "grad_norm": 7.952635288238525, + "learning_rate": 8.717593358690766e-06, + "logits/chosen": -0.13290175795555115, + "logits/rejected": -0.2395264357328415, + "logps/chosen": -62.78230667114258, + "logps/rejected": -77.09613800048828, + "loss": 0.9931, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.565608501434326, + "rewards/margins": 2.9202229976654053, + "rewards/rejected": -0.35461437702178955, + "step": 1398 + }, + { + "epoch": 0.35, + "grad_norm": 8.553241729736328, + "learning_rate": 8.715840988351067e-06, + "logits/chosen": -0.022720951586961746, + "logits/rejected": -0.1924818456172943, + "logps/chosen": -78.10287475585938, + "logps/rejected": -70.24246215820312, + "loss": 1.0947, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3600947856903076, + "rewards/margins": 2.2966365814208984, + "rewards/rejected": 0.06345823407173157, + "step": 1399 + }, + { + "epoch": 0.35, + "grad_norm": 4.2806572914123535, + "learning_rate": 8.714087597932837e-06, + "logits/chosen": -0.1911955177783966, + "logits/rejected": -0.2752282917499542, + "logps/chosen": -57.46167755126953, + "logps/rejected": -83.58818054199219, + "loss": 0.875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7634668350219727, + "rewards/margins": 3.482093334197998, + "rewards/rejected": -0.7186263203620911, + "step": 1400 + }, + { + "epoch": 0.35, + "grad_norm": 3.8113291263580322, + "learning_rate": 8.712333187917415e-06, + "logits/chosen": -0.19642315804958344, + "logits/rejected": -0.325633704662323, + "logps/chosen": -61.20653533935547, + "logps/rejected": -77.1579360961914, + "loss": 0.7679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.107882261276245, + "rewards/margins": 3.777351140975952, + "rewards/rejected": -0.6694689989089966, + "step": 1401 + }, + { + "epoch": 0.35, + "grad_norm": 2.970839262008667, + "learning_rate": 8.710577758786429e-06, + "logits/chosen": -0.16700279712677002, + "logits/rejected": -0.28658682107925415, + "logps/chosen": -54.226104736328125, + "logps/rejected": -60.52460861206055, + "loss": 0.7694, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8791849613189697, + "rewards/margins": 3.9245080947875977, + "rewards/rejected": -1.0453225374221802, + "step": 1402 + }, + { + "epoch": 0.35, + "grad_norm": 5.799889087677002, + "learning_rate": 8.70882131102178e-06, + "logits/chosen": -0.1771843433380127, + "logits/rejected": -0.3048194646835327, + "logps/chosen": -59.850830078125, + "logps/rejected": -64.18954467773438, + "loss": 1.0507, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.574160575866699, + "rewards/margins": 3.295048713684082, + "rewards/rejected": -0.7208881974220276, + "step": 1403 + }, + { + "epoch": 0.35, + "grad_norm": 4.307799339294434, + "learning_rate": 8.70706384510565e-06, + "logits/chosen": -0.0931159257888794, + "logits/rejected": -0.27208593487739563, + "logps/chosen": -61.05410385131836, + "logps/rejected": -74.41612243652344, + "loss": 0.7065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.876732110977173, + "rewards/margins": 4.32694673538208, + "rewards/rejected": -1.4502149820327759, + "step": 1404 + }, + { + "epoch": 0.35, + "grad_norm": 6.7064433097839355, + "learning_rate": 8.705305361520504e-06, + "logits/chosen": -0.17278072237968445, + "logits/rejected": -0.2275591790676117, + "logps/chosen": -65.10202026367188, + "logps/rejected": -80.36837005615234, + "loss": 0.9743, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.637589931488037, + "rewards/margins": 3.0791895389556885, + "rewards/rejected": -0.44159984588623047, + "step": 1405 + }, + { + "epoch": 0.35, + "grad_norm": 4.452167510986328, + "learning_rate": 8.703545860749081e-06, + "logits/chosen": -0.22348499298095703, + "logits/rejected": -0.2750079929828644, + "logps/chosen": -62.57410430908203, + "logps/rejected": -79.55599212646484, + "loss": 0.8393, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0629289150238037, + "rewards/margins": 3.6387622356414795, + "rewards/rejected": -0.5758332014083862, + "step": 1406 + }, + { + "epoch": 0.35, + "grad_norm": 5.427219867706299, + "learning_rate": 8.701785343274404e-06, + "logits/chosen": -0.1410691738128662, + "logits/rejected": -0.2604597806930542, + "logps/chosen": -71.21129608154297, + "logps/rejected": -82.85009765625, + "loss": 0.9507, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.635305404663086, + "rewards/margins": 4.084554672241211, + "rewards/rejected": -1.449249029159546, + "step": 1407 + }, + { + "epoch": 0.35, + "grad_norm": 3.060634136199951, + "learning_rate": 8.700023809579772e-06, + "logits/chosen": -0.21015630662441254, + "logits/rejected": -0.28107309341430664, + "logps/chosen": -55.689395904541016, + "logps/rejected": -84.56681060791016, + "loss": 0.6931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8943347930908203, + "rewards/margins": 4.247999668121338, + "rewards/rejected": -1.3536651134490967, + "step": 1408 + }, + { + "epoch": 0.35, + "grad_norm": 7.872750759124756, + "learning_rate": 8.698261260148763e-06, + "logits/chosen": -0.15879957377910614, + "logits/rejected": -0.2279120832681656, + "logps/chosen": -52.01991271972656, + "logps/rejected": -79.0994644165039, + "loss": 0.8326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6926167011260986, + "rewards/margins": 3.801025152206421, + "rewards/rejected": -1.1084085702896118, + "step": 1409 + }, + { + "epoch": 0.35, + "grad_norm": 5.661749839782715, + "learning_rate": 8.696497695465237e-06, + "logits/chosen": -0.07617703825235367, + "logits/rejected": -0.21685922145843506, + "logps/chosen": -59.94987487792969, + "logps/rejected": -73.74626159667969, + "loss": 0.7511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.759394645690918, + "rewards/margins": 3.1604645252227783, + "rewards/rejected": -0.4010698199272156, + "step": 1410 + }, + { + "epoch": 0.35, + "grad_norm": 4.477454662322998, + "learning_rate": 8.694733116013327e-06, + "logits/chosen": -0.19994889199733734, + "logits/rejected": -0.25276249647140503, + "logps/chosen": -56.6119384765625, + "logps/rejected": -71.92110443115234, + "loss": 0.9017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8199872970581055, + "rewards/margins": 3.3416998386383057, + "rewards/rejected": -0.5217124819755554, + "step": 1411 + }, + { + "epoch": 0.35, + "grad_norm": 3.296726942062378, + "learning_rate": 8.692967522277453e-06, + "logits/chosen": -0.20294231176376343, + "logits/rejected": -0.33176401257514954, + "logps/chosen": -57.33112335205078, + "logps/rejected": -77.2099609375, + "loss": 0.7232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9721426963806152, + "rewards/margins": 3.9950594902038574, + "rewards/rejected": -1.0229170322418213, + "step": 1412 + }, + { + "epoch": 0.35, + "grad_norm": 4.440846920013428, + "learning_rate": 8.691200914742305e-06, + "logits/chosen": -0.1431170403957367, + "logits/rejected": -0.3069518208503723, + "logps/chosen": -54.46698760986328, + "logps/rejected": -60.44170379638672, + "loss": 0.7888, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7105281352996826, + "rewards/margins": 3.7059712409973145, + "rewards/rejected": -0.9954430460929871, + "step": 1413 + }, + { + "epoch": 0.35, + "grad_norm": 7.739573955535889, + "learning_rate": 8.689433293892857e-06, + "logits/chosen": -0.1333564817905426, + "logits/rejected": -0.26007935404777527, + "logps/chosen": -55.815860748291016, + "logps/rejected": -62.500755310058594, + "loss": 0.97, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.955418109893799, + "rewards/margins": 3.025294065475464, + "rewards/rejected": -0.06987565755844116, + "step": 1414 + }, + { + "epoch": 0.35, + "grad_norm": 5.093994617462158, + "learning_rate": 8.687664660214359e-06, + "logits/chosen": -0.1701900064945221, + "logits/rejected": -0.2963597774505615, + "logps/chosen": -69.30738830566406, + "logps/rejected": -77.50234985351562, + "loss": 0.7958, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.80116868019104, + "rewards/margins": 3.9341306686401367, + "rewards/rejected": -1.1329623460769653, + "step": 1415 + }, + { + "epoch": 0.35, + "grad_norm": 4.32901668548584, + "learning_rate": 8.685895014192336e-06, + "logits/chosen": -0.10365325212478638, + "logits/rejected": -0.1873169243335724, + "logps/chosen": -56.857234954833984, + "logps/rejected": -81.63420104980469, + "loss": 0.8193, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.081862449645996, + "rewards/margins": 3.8219454288482666, + "rewards/rejected": -0.7400832772254944, + "step": 1416 + }, + { + "epoch": 0.35, + "grad_norm": 4.110898017883301, + "learning_rate": 8.684124356312598e-06, + "logits/chosen": -0.20656950771808624, + "logits/rejected": -0.24240511655807495, + "logps/chosen": -58.887996673583984, + "logps/rejected": -78.46913146972656, + "loss": 0.8425, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1102688312530518, + "rewards/margins": 3.060563087463379, + "rewards/rejected": 0.049705833196640015, + "step": 1417 + }, + { + "epoch": 0.35, + "grad_norm": 6.019256591796875, + "learning_rate": 8.682352687061226e-06, + "logits/chosen": -0.11468639969825745, + "logits/rejected": -0.1699487566947937, + "logps/chosen": -61.9824333190918, + "logps/rejected": -86.44074249267578, + "loss": 0.8198, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.676971197128296, + "rewards/margins": 3.3385050296783447, + "rewards/rejected": -0.6615338325500488, + "step": 1418 + }, + { + "epoch": 0.35, + "grad_norm": 9.799447059631348, + "learning_rate": 8.680580006924582e-06, + "logits/chosen": -0.10170363634824753, + "logits/rejected": -0.21287758648395538, + "logps/chosen": -64.3294906616211, + "logps/rejected": -69.33575439453125, + "loss": 0.8385, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9893600940704346, + "rewards/margins": 3.2593257427215576, + "rewards/rejected": -0.2699654996395111, + "step": 1419 + }, + { + "epoch": 0.36, + "grad_norm": 5.195377826690674, + "learning_rate": 8.678806316389308e-06, + "logits/chosen": -0.20196077227592468, + "logits/rejected": -0.26070550084114075, + "logps/chosen": -59.58794403076172, + "logps/rejected": -82.38011169433594, + "loss": 0.8205, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.519120693206787, + "rewards/margins": 3.0386202335357666, + "rewards/rejected": -0.5194997191429138, + "step": 1420 + }, + { + "epoch": 0.36, + "grad_norm": 5.40983772277832, + "learning_rate": 8.677031615942315e-06, + "logits/chosen": -0.16366833448410034, + "logits/rejected": -0.19117070734500885, + "logps/chosen": -61.57881546020508, + "logps/rejected": -87.22132873535156, + "loss": 0.7714, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6257662773132324, + "rewards/margins": 3.4293227195739746, + "rewards/rejected": -0.8035565614700317, + "step": 1421 + }, + { + "epoch": 0.36, + "grad_norm": 5.256355285644531, + "learning_rate": 8.675255906070801e-06, + "logits/chosen": -0.1517990678548813, + "logits/rejected": -0.2971026599407196, + "logps/chosen": -61.07308578491211, + "logps/rejected": -73.05402374267578, + "loss": 0.8418, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.712195873260498, + "rewards/margins": 3.7545504570007324, + "rewards/rejected": -1.0423545837402344, + "step": 1422 + }, + { + "epoch": 0.36, + "grad_norm": 4.629262924194336, + "learning_rate": 8.673479187262236e-06, + "logits/chosen": -0.19026781618595123, + "logits/rejected": -0.2459002137184143, + "logps/chosen": -46.09723663330078, + "logps/rejected": -74.59028625488281, + "loss": 0.7616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7717392444610596, + "rewards/margins": 4.0276384353637695, + "rewards/rejected": -1.25589919090271, + "step": 1423 + }, + { + "epoch": 0.36, + "grad_norm": 7.052530288696289, + "learning_rate": 8.671701460004362e-06, + "logits/chosen": -0.1730794906616211, + "logits/rejected": -0.27270886301994324, + "logps/chosen": -61.03911590576172, + "logps/rejected": -72.44400024414062, + "loss": 1.0249, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.367417335510254, + "rewards/margins": 3.046008348464966, + "rewards/rejected": -0.6785909533500671, + "step": 1424 + }, + { + "epoch": 0.36, + "grad_norm": 5.599000930786133, + "learning_rate": 8.669922724785212e-06, + "logits/chosen": -0.19977425038814545, + "logits/rejected": -0.27382567524909973, + "logps/chosen": -51.65073776245117, + "logps/rejected": -77.2054214477539, + "loss": 0.9424, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.625552177429199, + "rewards/margins": 3.6006906032562256, + "rewards/rejected": -0.975138247013092, + "step": 1425 + }, + { + "epoch": 0.36, + "grad_norm": 16.173017501831055, + "learning_rate": 8.668142982093083e-06, + "logits/chosen": -0.19392743706703186, + "logits/rejected": -0.25485336780548096, + "logps/chosen": -56.32710266113281, + "logps/rejected": -71.49900817871094, + "loss": 0.9554, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4656143188476562, + "rewards/margins": 3.0200586318969727, + "rewards/rejected": -0.5544443726539612, + "step": 1426 + }, + { + "epoch": 0.36, + "grad_norm": 4.053257465362549, + "learning_rate": 8.666362232416554e-06, + "logits/chosen": -0.15236738324165344, + "logits/rejected": -0.14533394575119019, + "logps/chosen": -55.06657791137695, + "logps/rejected": -94.40882873535156, + "loss": 0.7159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8369598388671875, + "rewards/margins": 4.143340110778809, + "rewards/rejected": -1.3063801527023315, + "step": 1427 + }, + { + "epoch": 0.36, + "grad_norm": 5.664916515350342, + "learning_rate": 8.664580476244476e-06, + "logits/chosen": -0.19027352333068848, + "logits/rejected": -0.2916988730430603, + "logps/chosen": -62.867431640625, + "logps/rejected": -73.77030944824219, + "loss": 0.9579, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.531170606613159, + "rewards/margins": 3.2241005897521973, + "rewards/rejected": -0.6929300427436829, + "step": 1428 + }, + { + "epoch": 0.36, + "grad_norm": 4.804934024810791, + "learning_rate": 8.662797714065984e-06, + "logits/chosen": -0.1457231044769287, + "logits/rejected": -0.2670954167842865, + "logps/chosen": -63.9912109375, + "logps/rejected": -72.14872741699219, + "loss": 0.9237, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.81365966796875, + "rewards/margins": 3.4713551998138428, + "rewards/rejected": -0.657695472240448, + "step": 1429 + }, + { + "epoch": 0.36, + "grad_norm": 4.33294153213501, + "learning_rate": 8.66101394637048e-06, + "logits/chosen": -0.11299818754196167, + "logits/rejected": -0.29096463322639465, + "logps/chosen": -65.04242706298828, + "logps/rejected": -70.31915283203125, + "loss": 0.6835, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.746309757232666, + "rewards/margins": 4.123827934265137, + "rewards/rejected": -1.3775184154510498, + "step": 1430 + }, + { + "epoch": 0.36, + "grad_norm": 4.269730567932129, + "learning_rate": 8.659229173647652e-06, + "logits/chosen": -0.19352403283119202, + "logits/rejected": -0.33012640476226807, + "logps/chosen": -66.60519409179688, + "logps/rejected": -71.03987121582031, + "loss": 0.7151, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.989060401916504, + "rewards/margins": 3.85628342628479, + "rewards/rejected": -0.8672226667404175, + "step": 1431 + }, + { + "epoch": 0.36, + "grad_norm": 4.8596014976501465, + "learning_rate": 8.657443396387456e-06, + "logits/chosen": -0.11481721699237823, + "logits/rejected": -0.2674313485622406, + "logps/chosen": -66.98885345458984, + "logps/rejected": -73.50379943847656, + "loss": 0.7935, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9393277168273926, + "rewards/margins": 4.554174900054932, + "rewards/rejected": -1.614847183227539, + "step": 1432 + }, + { + "epoch": 0.36, + "grad_norm": 5.520590305328369, + "learning_rate": 8.655656615080124e-06, + "logits/chosen": -0.16669106483459473, + "logits/rejected": -0.28134340047836304, + "logps/chosen": -58.46883010864258, + "logps/rejected": -78.62528991699219, + "loss": 0.7386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.936723232269287, + "rewards/margins": 3.826563596725464, + "rewards/rejected": -0.8898401856422424, + "step": 1433 + }, + { + "epoch": 0.36, + "grad_norm": 6.185844898223877, + "learning_rate": 8.65386883021617e-06, + "logits/chosen": -0.12885048985481262, + "logits/rejected": -0.17867320775985718, + "logps/chosen": -62.7365837097168, + "logps/rejected": -82.20555877685547, + "loss": 0.952, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.471163272857666, + "rewards/margins": 3.336129665374756, + "rewards/rejected": -0.8649665713310242, + "step": 1434 + }, + { + "epoch": 0.36, + "grad_norm": 4.189727783203125, + "learning_rate": 8.652080042286377e-06, + "logits/chosen": -0.1723853200674057, + "logits/rejected": -0.2334875464439392, + "logps/chosen": -53.083797454833984, + "logps/rejected": -73.05107116699219, + "loss": 0.8715, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6734094619750977, + "rewards/margins": 3.299287796020508, + "rewards/rejected": -0.6258782744407654, + "step": 1435 + }, + { + "epoch": 0.36, + "grad_norm": 5.929037094116211, + "learning_rate": 8.650290251781806e-06, + "logits/chosen": -0.16788625717163086, + "logits/rejected": -0.16590219736099243, + "logps/chosen": -59.545372009277344, + "logps/rejected": -80.1422119140625, + "loss": 0.9812, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5981786251068115, + "rewards/margins": 2.631751537322998, + "rewards/rejected": -0.0335732102394104, + "step": 1436 + }, + { + "epoch": 0.36, + "grad_norm": 4.38116455078125, + "learning_rate": 8.648499459193794e-06, + "logits/chosen": -0.16977420449256897, + "logits/rejected": -0.3043690323829651, + "logps/chosen": -63.62764358520508, + "logps/rejected": -74.56963348388672, + "loss": 0.8303, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.785579204559326, + "rewards/margins": 4.002239227294922, + "rewards/rejected": -1.2166602611541748, + "step": 1437 + }, + { + "epoch": 0.36, + "grad_norm": 3.288224935531616, + "learning_rate": 8.646707665013953e-06, + "logits/chosen": -0.11208476126194, + "logits/rejected": -0.16676345467567444, + "logps/chosen": -43.46670913696289, + "logps/rejected": -80.19073486328125, + "loss": 0.7134, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9417901039123535, + "rewards/margins": 3.1472842693328857, + "rewards/rejected": -0.20549450814723969, + "step": 1438 + }, + { + "epoch": 0.36, + "grad_norm": 6.5631937980651855, + "learning_rate": 8.64491486973417e-06, + "logits/chosen": -0.13363496959209442, + "logits/rejected": -0.15686126053333282, + "logps/chosen": -71.95759582519531, + "logps/rejected": -83.37863159179688, + "loss": 1.1434, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8788437843322754, + "rewards/margins": 2.698046922683716, + "rewards/rejected": 0.1807970106601715, + "step": 1439 + }, + { + "epoch": 0.36, + "grad_norm": 4.181783676147461, + "learning_rate": 8.6431210738466e-06, + "logits/chosen": -0.19569772481918335, + "logits/rejected": -0.2847490906715393, + "logps/chosen": -62.31439208984375, + "logps/rejected": -76.59474182128906, + "loss": 0.8795, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8085923194885254, + "rewards/margins": 3.59250545501709, + "rewards/rejected": -0.7839130163192749, + "step": 1440 + }, + { + "epoch": 0.36, + "grad_norm": 6.242151260375977, + "learning_rate": 8.641326277843686e-06, + "logits/chosen": -0.1371551901102066, + "logits/rejected": -0.27274420857429504, + "logps/chosen": -67.21983337402344, + "logps/rejected": -73.21053314208984, + "loss": 0.8628, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.783237934112549, + "rewards/margins": 3.354552745819092, + "rewards/rejected": -0.5713151097297668, + "step": 1441 + }, + { + "epoch": 0.36, + "grad_norm": 5.3251261711120605, + "learning_rate": 8.639530482218133e-06, + "logits/chosen": -0.20917604863643646, + "logits/rejected": -0.3166905343532562, + "logps/chosen": -51.278724670410156, + "logps/rejected": -64.94218444824219, + "loss": 0.8084, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.836167812347412, + "rewards/margins": 4.079946994781494, + "rewards/rejected": -1.2437790632247925, + "step": 1442 + }, + { + "epoch": 0.36, + "grad_norm": 4.926841735839844, + "learning_rate": 8.63773368746293e-06, + "logits/chosen": -0.0747980922460556, + "logits/rejected": -0.17981159687042236, + "logps/chosen": -65.60541534423828, + "logps/rejected": -72.47180938720703, + "loss": 0.885, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.736398220062256, + "rewards/margins": 3.26778507232666, + "rewards/rejected": -0.5313867330551147, + "step": 1443 + }, + { + "epoch": 0.36, + "grad_norm": 3.7421844005584717, + "learning_rate": 8.635935894071332e-06, + "logits/chosen": -0.09524602442979813, + "logits/rejected": -0.24044528603553772, + "logps/chosen": -64.29698944091797, + "logps/rejected": -70.95733642578125, + "loss": 0.7315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.436903476715088, + "rewards/margins": 4.145715713500977, + "rewards/rejected": -1.7088124752044678, + "step": 1444 + }, + { + "epoch": 0.36, + "grad_norm": 5.597628593444824, + "learning_rate": 8.634137102536874e-06, + "logits/chosen": -0.21797505021095276, + "logits/rejected": -0.35327351093292236, + "logps/chosen": -46.72868347167969, + "logps/rejected": -69.81875610351562, + "loss": 0.7534, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8452064990997314, + "rewards/margins": 3.845309257507324, + "rewards/rejected": -1.0001027584075928, + "step": 1445 + }, + { + "epoch": 0.36, + "grad_norm": 4.67233419418335, + "learning_rate": 8.632337313353364e-06, + "logits/chosen": -0.1654677540063858, + "logits/rejected": -0.3015855848789215, + "logps/chosen": -54.64838409423828, + "logps/rejected": -58.91817855834961, + "loss": 0.8892, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.79714298248291, + "rewards/margins": 3.1334280967712402, + "rewards/rejected": -0.3362847566604614, + "step": 1446 + }, + { + "epoch": 0.36, + "grad_norm": 4.680841445922852, + "learning_rate": 8.63053652701488e-06, + "logits/chosen": -0.1322866827249527, + "logits/rejected": -0.22105076909065247, + "logps/chosen": -58.9608268737793, + "logps/rejected": -72.37191772460938, + "loss": 0.9372, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.0333571434020996, + "rewards/margins": 3.5501601696014404, + "rewards/rejected": -0.5168027877807617, + "step": 1447 + }, + { + "epoch": 0.36, + "grad_norm": 4.8010125160217285, + "learning_rate": 8.628734744015781e-06, + "logits/chosen": -0.09164540469646454, + "logits/rejected": -0.1492510885000229, + "logps/chosen": -51.3331298828125, + "logps/rejected": -74.39143371582031, + "loss": 0.9544, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7167558670043945, + "rewards/margins": 3.3745219707489014, + "rewards/rejected": -0.6577661037445068, + "step": 1448 + }, + { + "epoch": 0.36, + "grad_norm": 6.2550740242004395, + "learning_rate": 8.62693196485069e-06, + "logits/chosen": -0.15685196220874786, + "logits/rejected": -0.23503443598747253, + "logps/chosen": -57.13684844970703, + "logps/rejected": -87.98179626464844, + "loss": 0.8305, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8373019695281982, + "rewards/margins": 3.7442941665649414, + "rewards/rejected": -0.9069925546646118, + "step": 1449 + }, + { + "epoch": 0.36, + "grad_norm": 4.503302097320557, + "learning_rate": 8.625128190014513e-06, + "logits/chosen": -0.1651761382818222, + "logits/rejected": -0.3443151116371155, + "logps/chosen": -49.68352508544922, + "logps/rejected": -58.83428955078125, + "loss": 0.727, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.890245199203491, + "rewards/margins": 3.728088617324829, + "rewards/rejected": -0.8378435373306274, + "step": 1450 + }, + { + "epoch": 0.36, + "grad_norm": 4.4490814208984375, + "learning_rate": 8.623323420002426e-06, + "logits/chosen": -0.1241152361035347, + "logits/rejected": -0.2448757290840149, + "logps/chosen": -45.18412399291992, + "logps/rejected": -68.54523468017578, + "loss": 0.77, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7720370292663574, + "rewards/margins": 3.2921478748321533, + "rewards/rejected": -0.5201107263565063, + "step": 1451 + }, + { + "epoch": 0.36, + "grad_norm": 12.876025199890137, + "learning_rate": 8.621517655309872e-06, + "logits/chosen": -0.27357423305511475, + "logits/rejected": -0.3647936284542084, + "logps/chosen": -63.175331115722656, + "logps/rejected": -74.33076477050781, + "loss": 0.9181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.979193687438965, + "rewards/margins": 3.6039083003997803, + "rewards/rejected": -0.6247147917747498, + "step": 1452 + }, + { + "epoch": 0.36, + "grad_norm": 4.514266014099121, + "learning_rate": 8.619710896432577e-06, + "logits/chosen": -0.21548445522785187, + "logits/rejected": -0.2879717946052551, + "logps/chosen": -59.616600036621094, + "logps/rejected": -75.69493865966797, + "loss": 0.9405, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.906104564666748, + "rewards/margins": 4.028186798095703, + "rewards/rejected": -1.122081995010376, + "step": 1453 + }, + { + "epoch": 0.36, + "grad_norm": 4.258836269378662, + "learning_rate": 8.617903143866533e-06, + "logits/chosen": -0.1883542835712433, + "logits/rejected": -0.3576011657714844, + "logps/chosen": -56.39236068725586, + "logps/rejected": -63.25992202758789, + "loss": 0.8561, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8767151832580566, + "rewards/margins": 3.3949053287506104, + "rewards/rejected": -0.5181906223297119, + "step": 1454 + }, + { + "epoch": 0.36, + "grad_norm": 3.436962842941284, + "learning_rate": 8.616094398108007e-06, + "logits/chosen": -0.24862955510616302, + "logits/rejected": -0.31610584259033203, + "logps/chosen": -51.394744873046875, + "logps/rejected": -68.86705017089844, + "loss": 0.7951, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8003597259521484, + "rewards/margins": 3.060554265975952, + "rewards/rejected": -0.2601948380470276, + "step": 1455 + }, + { + "epoch": 0.36, + "grad_norm": 4.792339324951172, + "learning_rate": 8.61428465965354e-06, + "logits/chosen": -0.17884138226509094, + "logits/rejected": -0.3234335482120514, + "logps/chosen": -59.85930252075195, + "logps/rejected": -61.624698638916016, + "loss": 0.8731, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7706520557403564, + "rewards/margins": 3.2893013954162598, + "rewards/rejected": -0.5186494588851929, + "step": 1456 + }, + { + "epoch": 0.36, + "grad_norm": 6.7568182945251465, + "learning_rate": 8.612473928999945e-06, + "logits/chosen": -0.1389169991016388, + "logits/rejected": -0.21453975141048431, + "logps/chosen": -51.08754348754883, + "logps/rejected": -75.78495788574219, + "loss": 0.9047, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8625171184539795, + "rewards/margins": 3.0741307735443115, + "rewards/rejected": -0.21161341667175293, + "step": 1457 + }, + { + "epoch": 0.36, + "grad_norm": 3.0376229286193848, + "learning_rate": 8.610662206644304e-06, + "logits/chosen": -0.10176324099302292, + "logits/rejected": -0.31468430161476135, + "logps/chosen": -60.093238830566406, + "logps/rejected": -65.52218627929688, + "loss": 0.66, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8825249671936035, + "rewards/margins": 4.41499662399292, + "rewards/rejected": -1.5324718952178955, + "step": 1458 + }, + { + "epoch": 0.36, + "grad_norm": 9.260458946228027, + "learning_rate": 8.608849493083978e-06, + "logits/chosen": -0.11250746250152588, + "logits/rejected": -0.20107661187648773, + "logps/chosen": -60.93182373046875, + "logps/rejected": -92.7081527709961, + "loss": 0.8903, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4575860500335693, + "rewards/margins": 3.0839617252349854, + "rewards/rejected": -0.6263753175735474, + "step": 1459 + }, + { + "epoch": 0.37, + "grad_norm": 3.142747640609741, + "learning_rate": 8.60703578881659e-06, + "logits/chosen": -0.11771316826343536, + "logits/rejected": -0.2658587694168091, + "logps/chosen": -54.76764678955078, + "logps/rejected": -62.164852142333984, + "loss": 0.7672, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7704854011535645, + "rewards/margins": 3.935295581817627, + "rewards/rejected": -1.164810061454773, + "step": 1460 + }, + { + "epoch": 0.37, + "grad_norm": 3.9443295001983643, + "learning_rate": 8.605221094340045e-06, + "logits/chosen": -0.2054484784603119, + "logits/rejected": -0.2629393935203552, + "logps/chosen": -52.44498825073242, + "logps/rejected": -77.14298248291016, + "loss": 0.7929, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9310717582702637, + "rewards/margins": 3.343780755996704, + "rewards/rejected": -0.4127088189125061, + "step": 1461 + }, + { + "epoch": 0.37, + "grad_norm": 5.149735927581787, + "learning_rate": 8.603405410152516e-06, + "logits/chosen": -0.1038922443985939, + "logits/rejected": -0.1723577082157135, + "logps/chosen": -71.02295684814453, + "logps/rejected": -94.53309631347656, + "loss": 0.8126, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5715229511260986, + "rewards/margins": 3.709739923477173, + "rewards/rejected": -1.1382169723510742, + "step": 1462 + }, + { + "epoch": 0.37, + "grad_norm": 4.614602088928223, + "learning_rate": 8.601588736752447e-06, + "logits/chosen": -0.23017863929271698, + "logits/rejected": -0.30816179513931274, + "logps/chosen": -57.8880615234375, + "logps/rejected": -60.71472930908203, + "loss": 0.8756, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.883708953857422, + "rewards/margins": 2.957545518875122, + "rewards/rejected": -0.07383649051189423, + "step": 1463 + }, + { + "epoch": 0.37, + "grad_norm": 4.220936298370361, + "learning_rate": 8.599771074638552e-06, + "logits/chosen": -0.2240845263004303, + "logits/rejected": -0.3892505168914795, + "logps/chosen": -58.21101379394531, + "logps/rejected": -62.35749816894531, + "loss": 0.9131, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7595040798187256, + "rewards/margins": 3.204683542251587, + "rewards/rejected": -0.44517970085144043, + "step": 1464 + }, + { + "epoch": 0.37, + "grad_norm": 5.3060455322265625, + "learning_rate": 8.597952424309822e-06, + "logits/chosen": -0.07379365712404251, + "logits/rejected": -0.22379888594150543, + "logps/chosen": -69.22505950927734, + "logps/rejected": -67.63258361816406, + "loss": 0.8274, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6673145294189453, + "rewards/margins": 3.2916574478149414, + "rewards/rejected": -0.6243433952331543, + "step": 1465 + }, + { + "epoch": 0.37, + "grad_norm": 3.8992810249328613, + "learning_rate": 8.596132786265513e-06, + "logits/chosen": -0.13594549894332886, + "logits/rejected": -0.2239036113023758, + "logps/chosen": -52.54233932495117, + "logps/rejected": -65.15583038330078, + "loss": 0.8793, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8920273780822754, + "rewards/margins": 3.2021782398223877, + "rewards/rejected": -0.3101508319377899, + "step": 1466 + }, + { + "epoch": 0.37, + "grad_norm": 8.605965614318848, + "learning_rate": 8.594312161005155e-06, + "logits/chosen": -0.1863236427307129, + "logits/rejected": -0.31619563698768616, + "logps/chosen": -61.01615524291992, + "logps/rejected": -64.02977752685547, + "loss": 0.8714, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7568702697753906, + "rewards/margins": 3.1435599327087402, + "rewards/rejected": -0.38668984174728394, + "step": 1467 + }, + { + "epoch": 0.37, + "grad_norm": 5.472471237182617, + "learning_rate": 8.592490549028549e-06, + "logits/chosen": -0.17888584733009338, + "logits/rejected": -0.2699936330318451, + "logps/chosen": -62.09027099609375, + "logps/rejected": -74.74655151367188, + "loss": 0.884, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.184748649597168, + "rewards/margins": 3.5064351558685303, + "rewards/rejected": -0.32168659567832947, + "step": 1468 + }, + { + "epoch": 0.37, + "grad_norm": 8.670968055725098, + "learning_rate": 8.59066795083577e-06, + "logits/chosen": -0.1144246757030487, + "logits/rejected": -0.2637382745742798, + "logps/chosen": -81.24514770507812, + "logps/rejected": -68.99520111083984, + "loss": 1.0439, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4227137565612793, + "rewards/margins": 3.2429471015930176, + "rewards/rejected": -0.8202336430549622, + "step": 1469 + }, + { + "epoch": 0.37, + "grad_norm": 5.892585754394531, + "learning_rate": 8.588844366927156e-06, + "logits/chosen": -0.11246444284915924, + "logits/rejected": -0.2723832428455353, + "logps/chosen": -68.64299011230469, + "logps/rejected": -66.48744201660156, + "loss": 0.7754, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.716644763946533, + "rewards/margins": 3.4974701404571533, + "rewards/rejected": -0.7808253765106201, + "step": 1470 + }, + { + "epoch": 0.37, + "grad_norm": 4.456183433532715, + "learning_rate": 8.587019797803322e-06, + "logits/chosen": -0.04584264010190964, + "logits/rejected": -0.10308840870857239, + "logps/chosen": -59.248008728027344, + "logps/rejected": -89.41827392578125, + "loss": 0.838, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8491086959838867, + "rewards/margins": 3.292651653289795, + "rewards/rejected": -0.4435426592826843, + "step": 1471 + }, + { + "epoch": 0.37, + "grad_norm": 4.5902605056762695, + "learning_rate": 8.585194243965154e-06, + "logits/chosen": -0.17281100153923035, + "logits/rejected": -0.23680460453033447, + "logps/chosen": -51.5313606262207, + "logps/rejected": -87.56515502929688, + "loss": 0.7322, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6119437217712402, + "rewards/margins": 3.3179931640625, + "rewards/rejected": -0.7060494422912598, + "step": 1472 + }, + { + "epoch": 0.37, + "grad_norm": 6.25732946395874, + "learning_rate": 8.5833677059138e-06, + "logits/chosen": -0.1836945116519928, + "logits/rejected": -0.3147335648536682, + "logps/chosen": -54.31581115722656, + "logps/rejected": -64.19432830810547, + "loss": 0.9309, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.790445327758789, + "rewards/margins": 2.9019837379455566, + "rewards/rejected": -0.11153848469257355, + "step": 1473 + }, + { + "epoch": 0.37, + "grad_norm": 6.162012577056885, + "learning_rate": 8.581540184150692e-06, + "logits/chosen": -0.11026092618703842, + "logits/rejected": -0.19741573929786682, + "logps/chosen": -56.93689727783203, + "logps/rejected": -73.8751449584961, + "loss": 0.9306, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.663797378540039, + "rewards/margins": 2.6143083572387695, + "rewards/rejected": 0.04948917031288147, + "step": 1474 + }, + { + "epoch": 0.37, + "grad_norm": 4.426666736602783, + "learning_rate": 8.579711679177518e-06, + "logits/chosen": -0.1123727411031723, + "logits/rejected": -0.24333393573760986, + "logps/chosen": -54.0937614440918, + "logps/rejected": -73.10030364990234, + "loss": 0.7523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7935938835144043, + "rewards/margins": 3.9736649990081787, + "rewards/rejected": -1.1800708770751953, + "step": 1475 + }, + { + "epoch": 0.37, + "grad_norm": 5.255747318267822, + "learning_rate": 8.577882191496244e-06, + "logits/chosen": -0.19058777391910553, + "logits/rejected": -0.23448446393013, + "logps/chosen": -57.83333969116211, + "logps/rejected": -73.24365234375, + "loss": 0.8803, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7298898696899414, + "rewards/margins": 2.998688220977783, + "rewards/rejected": -0.26879799365997314, + "step": 1476 + }, + { + "epoch": 0.37, + "grad_norm": 5.238431453704834, + "learning_rate": 8.576051721609103e-06, + "logits/chosen": -0.12995587289333344, + "logits/rejected": -0.14961740374565125, + "logps/chosen": -55.218536376953125, + "logps/rejected": -72.68075561523438, + "loss": 0.9647, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7379074096679688, + "rewards/margins": 2.4659488201141357, + "rewards/rejected": 0.27195852994918823, + "step": 1477 + }, + { + "epoch": 0.37, + "grad_norm": 3.293632745742798, + "learning_rate": 8.574220270018601e-06, + "logits/chosen": -0.25088393688201904, + "logits/rejected": -0.3683813810348511, + "logps/chosen": -54.696197509765625, + "logps/rejected": -63.577571868896484, + "loss": 0.8444, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.794487953186035, + "rewards/margins": 3.151289939880371, + "rewards/rejected": -0.3568018972873688, + "step": 1478 + }, + { + "epoch": 0.37, + "grad_norm": 3.5484962463378906, + "learning_rate": 8.572387837227506e-06, + "logits/chosen": -0.1977277398109436, + "logits/rejected": -0.3067382872104645, + "logps/chosen": -73.79353332519531, + "logps/rejected": -70.58183288574219, + "loss": 0.9181, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7672572135925293, + "rewards/margins": 3.7684037685394287, + "rewards/rejected": -1.0011464357376099, + "step": 1479 + }, + { + "epoch": 0.37, + "grad_norm": 4.166703701019287, + "learning_rate": 8.570554423738865e-06, + "logits/chosen": -0.07221312075853348, + "logits/rejected": -0.14362727105617523, + "logps/chosen": -66.15105438232422, + "logps/rejected": -69.13674926757812, + "loss": 0.8887, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7885708808898926, + "rewards/margins": 2.5927791595458984, + "rewards/rejected": 0.1957915872335434, + "step": 1480 + }, + { + "epoch": 0.37, + "grad_norm": 6.47304630279541, + "learning_rate": 8.56872003005599e-06, + "logits/chosen": -0.20298632979393005, + "logits/rejected": -0.33480897545814514, + "logps/chosen": -59.89378356933594, + "logps/rejected": -59.57243347167969, + "loss": 0.7449, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.897033214569092, + "rewards/margins": 3.7770395278930664, + "rewards/rejected": -0.8800063133239746, + "step": 1481 + }, + { + "epoch": 0.37, + "grad_norm": 3.764923572540283, + "learning_rate": 8.566884656682459e-06, + "logits/chosen": -0.17988063395023346, + "logits/rejected": -0.2746809422969818, + "logps/chosen": -61.25282669067383, + "logps/rejected": -68.37406158447266, + "loss": 0.8946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.197659969329834, + "rewards/margins": 2.9803028106689453, + "rewards/rejected": 0.21735699474811554, + "step": 1482 + }, + { + "epoch": 0.37, + "grad_norm": 7.5970306396484375, + "learning_rate": 8.565048304122123e-06, + "logits/chosen": -0.04077373817563057, + "logits/rejected": -0.09581667184829712, + "logps/chosen": -66.34147644042969, + "logps/rejected": -98.91539001464844, + "loss": 1.026, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.66260027885437, + "rewards/margins": 2.7252707481384277, + "rewards/rejected": -0.06267033517360687, + "step": 1483 + }, + { + "epoch": 0.37, + "grad_norm": 4.7457990646362305, + "learning_rate": 8.563210972879099e-06, + "logits/chosen": -0.18339593708515167, + "logits/rejected": -0.28240537643432617, + "logps/chosen": -51.149688720703125, + "logps/rejected": -61.422855377197266, + "loss": 0.8831, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8102450370788574, + "rewards/margins": 2.8543357849121094, + "rewards/rejected": -0.04409068822860718, + "step": 1484 + }, + { + "epoch": 0.37, + "grad_norm": 3.65144944190979, + "learning_rate": 8.561372663457774e-06, + "logits/chosen": -0.19445465505123138, + "logits/rejected": -0.2627809941768646, + "logps/chosen": -63.54737091064453, + "logps/rejected": -94.0953140258789, + "loss": 0.8408, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9741098880767822, + "rewards/margins": 3.173173189163208, + "rewards/rejected": -0.1990634948015213, + "step": 1485 + }, + { + "epoch": 0.37, + "grad_norm": 5.093477249145508, + "learning_rate": 8.559533376362807e-06, + "logits/chosen": -0.11648060381412506, + "logits/rejected": -0.1428559124469757, + "logps/chosen": -53.66792297363281, + "logps/rejected": -80.94213104248047, + "loss": 0.8643, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1315135955810547, + "rewards/margins": 3.0380396842956543, + "rewards/rejected": 0.09347416460514069, + "step": 1486 + }, + { + "epoch": 0.37, + "grad_norm": 4.805069446563721, + "learning_rate": 8.557693112099119e-06, + "logits/chosen": -0.17223000526428223, + "logits/rejected": -0.3116995096206665, + "logps/chosen": -54.45814514160156, + "logps/rejected": -64.17191314697266, + "loss": 0.8426, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9731862545013428, + "rewards/margins": 3.7372124195098877, + "rewards/rejected": -0.764025866985321, + "step": 1487 + }, + { + "epoch": 0.37, + "grad_norm": 4.273462295532227, + "learning_rate": 8.5558518711719e-06, + "logits/chosen": -0.20300857722759247, + "logits/rejected": -0.22860433161258698, + "logps/chosen": -54.57573699951172, + "logps/rejected": -61.834571838378906, + "loss": 1.0447, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7111258506774902, + "rewards/margins": 2.496825933456421, + "rewards/rejected": 0.21429991722106934, + "step": 1488 + }, + { + "epoch": 0.37, + "grad_norm": 2.962007999420166, + "learning_rate": 8.554009654086615e-06, + "logits/chosen": -0.10139480233192444, + "logits/rejected": -0.3224431276321411, + "logps/chosen": -63.90818786621094, + "logps/rejected": -61.90108108520508, + "loss": 0.7454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.018960952758789, + "rewards/margins": 4.001062393188477, + "rewards/rejected": -0.9821015000343323, + "step": 1489 + }, + { + "epoch": 0.37, + "grad_norm": 2.8203513622283936, + "learning_rate": 8.552166461348991e-06, + "logits/chosen": -0.18964223563671112, + "logits/rejected": -0.281787246465683, + "logps/chosen": -45.92976760864258, + "logps/rejected": -67.2156982421875, + "loss": 0.7339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8306844234466553, + "rewards/margins": 4.148833274841309, + "rewards/rejected": -1.3181488513946533, + "step": 1490 + }, + { + "epoch": 0.37, + "grad_norm": 3.874851942062378, + "learning_rate": 8.550322293465022e-06, + "logits/chosen": -0.06531284749507904, + "logits/rejected": -0.22409257292747498, + "logps/chosen": -65.5389633178711, + "logps/rejected": -69.43991088867188, + "loss": 0.7625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9862136840820312, + "rewards/margins": 3.465928792953491, + "rewards/rejected": -0.4797150492668152, + "step": 1491 + }, + { + "epoch": 0.37, + "grad_norm": 3.2769134044647217, + "learning_rate": 8.548477150940976e-06, + "logits/chosen": -0.16054284572601318, + "logits/rejected": -0.23543336987495422, + "logps/chosen": -54.17474365234375, + "logps/rejected": -70.34806823730469, + "loss": 0.702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0427207946777344, + "rewards/margins": 3.3764402866363525, + "rewards/rejected": -0.33371973037719727, + "step": 1492 + }, + { + "epoch": 0.37, + "grad_norm": 4.222873687744141, + "learning_rate": 8.546631034283381e-06, + "logits/chosen": -0.11056353896856308, + "logits/rejected": -0.17378327250480652, + "logps/chosen": -61.33837890625, + "logps/rejected": -68.30961608886719, + "loss": 0.8955, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.644320249557495, + "rewards/margins": 2.6567323207855225, + "rewards/rejected": -0.01241225004196167, + "step": 1493 + }, + { + "epoch": 0.37, + "grad_norm": 4.038939476013184, + "learning_rate": 8.544783943999036e-06, + "logits/chosen": -0.20831069350242615, + "logits/rejected": -0.32171016931533813, + "logps/chosen": -53.51260757446289, + "logps/rejected": -67.23426818847656, + "loss": 0.8173, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8207812309265137, + "rewards/margins": 3.38710618019104, + "rewards/rejected": -0.5663248896598816, + "step": 1494 + }, + { + "epoch": 0.37, + "grad_norm": 6.806187629699707, + "learning_rate": 8.54293588059501e-06, + "logits/chosen": -0.1261502504348755, + "logits/rejected": -0.14726614952087402, + "logps/chosen": -58.52511215209961, + "logps/rejected": -88.99880981445312, + "loss": 0.9562, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.530245065689087, + "rewards/margins": 2.6401968002319336, + "rewards/rejected": -0.10995234549045563, + "step": 1495 + }, + { + "epoch": 0.37, + "grad_norm": 5.766871929168701, + "learning_rate": 8.541086844578632e-06, + "logits/chosen": -0.1500297337770462, + "logits/rejected": -0.24637673795223236, + "logps/chosen": -50.578983306884766, + "logps/rejected": -59.10356140136719, + "loss": 0.886, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9892330169677734, + "rewards/margins": 3.009955644607544, + "rewards/rejected": -0.02072298526763916, + "step": 1496 + }, + { + "epoch": 0.37, + "grad_norm": 5.689574241638184, + "learning_rate": 8.539236836457505e-06, + "logits/chosen": -0.20053699612617493, + "logits/rejected": -0.3010128140449524, + "logps/chosen": -50.29634094238281, + "logps/rejected": -71.45033264160156, + "loss": 0.8254, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8476498126983643, + "rewards/margins": 3.4230880737304688, + "rewards/rejected": -0.5754380226135254, + "step": 1497 + }, + { + "epoch": 0.37, + "grad_norm": 4.525900840759277, + "learning_rate": 8.537385856739495e-06, + "logits/chosen": -0.11651423573493958, + "logits/rejected": -0.24345730245113373, + "logps/chosen": -50.203155517578125, + "logps/rejected": -67.52637481689453, + "loss": 0.7278, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.980059862136841, + "rewards/margins": 3.350102424621582, + "rewards/rejected": -0.3700423836708069, + "step": 1498 + }, + { + "epoch": 0.37, + "grad_norm": 5.713040351867676, + "learning_rate": 8.535533905932739e-06, + "logits/chosen": -0.2294880747795105, + "logits/rejected": -0.25464940071105957, + "logps/chosen": -52.03513717651367, + "logps/rejected": -69.09273529052734, + "loss": 1.0084, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8755431175231934, + "rewards/margins": 2.7997262477874756, + "rewards/rejected": 0.0758167952299118, + "step": 1499 + }, + { + "epoch": 0.38, + "grad_norm": 3.528538942337036, + "learning_rate": 8.533680984545632e-06, + "logits/chosen": -0.18510864675045013, + "logits/rejected": -0.34956640005111694, + "logps/chosen": -55.851322174072266, + "logps/rejected": -59.37240982055664, + "loss": 0.729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.72711181640625, + "rewards/margins": 3.2929046154022217, + "rewards/rejected": -0.5657931566238403, + "step": 1500 + }, + { + "epoch": 0.38, + "grad_norm": 4.931457996368408, + "learning_rate": 8.531827093086846e-06, + "logits/chosen": -0.27549776434898376, + "logits/rejected": -0.3950992524623871, + "logps/chosen": -54.40738296508789, + "logps/rejected": -72.26824951171875, + "loss": 0.8394, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.80387020111084, + "rewards/margins": 3.7353007793426514, + "rewards/rejected": -0.9314303398132324, + "step": 1501 + }, + { + "epoch": 0.38, + "grad_norm": 4.761394023895264, + "learning_rate": 8.529972232065313e-06, + "logits/chosen": -0.13904736936092377, + "logits/rejected": -0.29384729266166687, + "logps/chosen": -56.61066436767578, + "logps/rejected": -63.88212966918945, + "loss": 0.7142, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7228076457977295, + "rewards/margins": 3.8284645080566406, + "rewards/rejected": -1.1056569814682007, + "step": 1502 + }, + { + "epoch": 0.38, + "grad_norm": 5.33054780960083, + "learning_rate": 8.52811640199023e-06, + "logits/chosen": -0.2125958502292633, + "logits/rejected": -0.3254641890525818, + "logps/chosen": -59.9399299621582, + "logps/rejected": -73.59593200683594, + "loss": 0.9622, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9820988178253174, + "rewards/margins": 2.8209989070892334, + "rewards/rejected": 0.16110016405582428, + "step": 1503 + }, + { + "epoch": 0.38, + "grad_norm": 6.04224157333374, + "learning_rate": 8.526259603371063e-06, + "logits/chosen": -0.1887984573841095, + "logits/rejected": -0.28951919078826904, + "logps/chosen": -66.9311294555664, + "logps/rejected": -54.44221115112305, + "loss": 1.08, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7100672721862793, + "rewards/margins": 3.0252015590667725, + "rewards/rejected": -0.3151341676712036, + "step": 1504 + }, + { + "epoch": 0.38, + "grad_norm": 7.396318435668945, + "learning_rate": 8.524401836717545e-06, + "logits/chosen": -0.23383517563343048, + "logits/rejected": -0.3160404562950134, + "logps/chosen": -55.77207946777344, + "logps/rejected": -74.92178344726562, + "loss": 0.9264, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.420950174331665, + "rewards/margins": 2.9178221225738525, + "rewards/rejected": -0.4968718886375427, + "step": 1505 + }, + { + "epoch": 0.38, + "grad_norm": 5.03704309463501, + "learning_rate": 8.522543102539672e-06, + "logits/chosen": -0.1757163107395172, + "logits/rejected": -0.28793689608573914, + "logps/chosen": -57.62643814086914, + "logps/rejected": -61.46466064453125, + "loss": 0.8259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8909530639648438, + "rewards/margins": 3.447490930557251, + "rewards/rejected": -0.5565374493598938, + "step": 1506 + }, + { + "epoch": 0.38, + "grad_norm": 6.823782920837402, + "learning_rate": 8.520683401347709e-06, + "logits/chosen": -0.16312243044376373, + "logits/rejected": -0.18520674109458923, + "logps/chosen": -57.420066833496094, + "logps/rejected": -90.6010513305664, + "loss": 0.9802, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.610042095184326, + "rewards/margins": 2.690855026245117, + "rewards/rejected": -0.08081318438053131, + "step": 1507 + }, + { + "epoch": 0.38, + "grad_norm": 4.050761699676514, + "learning_rate": 8.518822733652179e-06, + "logits/chosen": -0.1645938754081726, + "logits/rejected": -0.27956539392471313, + "logps/chosen": -48.37069320678711, + "logps/rejected": -66.80570220947266, + "loss": 0.7143, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8365490436553955, + "rewards/margins": 3.5665879249572754, + "rewards/rejected": -0.730039119720459, + "step": 1508 + }, + { + "epoch": 0.38, + "grad_norm": 6.310632705688477, + "learning_rate": 8.516961099963879e-06, + "logits/chosen": 0.01038820669054985, + "logits/rejected": -0.1133866012096405, + "logps/chosen": -63.235931396484375, + "logps/rejected": -90.8045883178711, + "loss": 0.7409, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7739274501800537, + "rewards/margins": 3.9064829349517822, + "rewards/rejected": -1.132555603981018, + "step": 1509 + }, + { + "epoch": 0.38, + "grad_norm": 6.238795757293701, + "learning_rate": 8.515098500793868e-06, + "logits/chosen": -0.18007342517375946, + "logits/rejected": -0.21966956555843353, + "logps/chosen": -54.89004898071289, + "logps/rejected": -79.039306640625, + "loss": 0.8719, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.505112648010254, + "rewards/margins": 3.110525608062744, + "rewards/rejected": -0.6054130792617798, + "step": 1510 + }, + { + "epoch": 0.38, + "grad_norm": 2.6959660053253174, + "learning_rate": 8.51323493665347e-06, + "logits/chosen": -0.15268853306770325, + "logits/rejected": -0.31120699644088745, + "logps/chosen": -48.41413879394531, + "logps/rejected": -71.76718139648438, + "loss": 0.6161, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0445616245269775, + "rewards/margins": 4.368791103363037, + "rewards/rejected": -1.3242292404174805, + "step": 1511 + }, + { + "epoch": 0.38, + "grad_norm": 6.5651397705078125, + "learning_rate": 8.511370408054269e-06, + "logits/chosen": -0.1883561611175537, + "logits/rejected": -0.22972002625465393, + "logps/chosen": -53.03938293457031, + "logps/rejected": -83.29248809814453, + "loss": 0.9228, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4600284099578857, + "rewards/margins": 3.3115813732147217, + "rewards/rejected": -0.8515530824661255, + "step": 1512 + }, + { + "epoch": 0.38, + "grad_norm": 3.582005262374878, + "learning_rate": 8.509504915508124e-06, + "logits/chosen": -0.15351267158985138, + "logits/rejected": -0.1711791455745697, + "logps/chosen": -59.79228210449219, + "logps/rejected": -94.81684875488281, + "loss": 0.7946, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.751666307449341, + "rewards/margins": 3.6300768852233887, + "rewards/rejected": -0.8784104585647583, + "step": 1513 + }, + { + "epoch": 0.38, + "grad_norm": 3.878955841064453, + "learning_rate": 8.50763845952715e-06, + "logits/chosen": -0.13757780194282532, + "logits/rejected": -0.252401739358902, + "logps/chosen": -67.90888214111328, + "logps/rejected": -69.3446044921875, + "loss": 0.8486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8897275924682617, + "rewards/margins": 3.9777112007141113, + "rewards/rejected": -1.0879836082458496, + "step": 1514 + }, + { + "epoch": 0.38, + "grad_norm": 4.237781524658203, + "learning_rate": 8.505771040623729e-06, + "logits/chosen": -0.07031288743019104, + "logits/rejected": -0.1438072770833969, + "logps/chosen": -56.2431755065918, + "logps/rejected": -77.18043518066406, + "loss": 0.7493, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.650142192840576, + "rewards/margins": 3.6298911571502686, + "rewards/rejected": -0.979749321937561, + "step": 1515 + }, + { + "epoch": 0.38, + "grad_norm": 4.964545249938965, + "learning_rate": 8.503902659310511e-06, + "logits/chosen": -0.16254718601703644, + "logits/rejected": -0.27290019392967224, + "logps/chosen": -58.23796081542969, + "logps/rejected": -77.82003784179688, + "loss": 0.8364, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.930988073348999, + "rewards/margins": 3.7921910285949707, + "rewards/rejected": -0.8612032532691956, + "step": 1516 + }, + { + "epoch": 0.38, + "grad_norm": 5.86894416809082, + "learning_rate": 8.502033316100402e-06, + "logits/chosen": -0.15205487608909607, + "logits/rejected": -0.1838068664073944, + "logps/chosen": -58.72526931762695, + "logps/rejected": -70.5073471069336, + "loss": 0.9264, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7343077659606934, + "rewards/margins": 2.3468618392944336, + "rewards/rejected": 0.38744592666625977, + "step": 1517 + }, + { + "epoch": 0.38, + "grad_norm": 3.2207565307617188, + "learning_rate": 8.500163011506581e-06, + "logits/chosen": -0.18255706131458282, + "logits/rejected": -0.2891790568828583, + "logps/chosen": -55.455772399902344, + "logps/rejected": -66.29254150390625, + "loss": 0.8325, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5910391807556152, + "rewards/margins": 3.433972120285034, + "rewards/rejected": -0.8429328203201294, + "step": 1518 + }, + { + "epoch": 0.38, + "grad_norm": 4.328484058380127, + "learning_rate": 8.498291746042486e-06, + "logits/chosen": -0.13252782821655273, + "logits/rejected": -0.2931493818759918, + "logps/chosen": -62.46278381347656, + "logps/rejected": -73.33343505859375, + "loss": 0.8316, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6577367782592773, + "rewards/margins": 3.81611967086792, + "rewards/rejected": -1.1583833694458008, + "step": 1519 + }, + { + "epoch": 0.38, + "grad_norm": 13.363031387329102, + "learning_rate": 8.49641952022182e-06, + "logits/chosen": -0.1516416072845459, + "logits/rejected": -0.2072458565235138, + "logps/chosen": -57.9066162109375, + "logps/rejected": -71.26127624511719, + "loss": 0.9582, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4719653129577637, + "rewards/margins": 2.5864508152008057, + "rewards/rejected": -0.11448553949594498, + "step": 1520 + }, + { + "epoch": 0.38, + "grad_norm": 6.647188186645508, + "learning_rate": 8.494546334558548e-06, + "logits/chosen": -0.21617136895656586, + "logits/rejected": -0.3074077069759369, + "logps/chosen": -59.804447174072266, + "logps/rejected": -63.46274185180664, + "loss": 1.0173, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.534599542617798, + "rewards/margins": 2.6219482421875, + "rewards/rejected": -0.08734886348247528, + "step": 1521 + }, + { + "epoch": 0.38, + "grad_norm": 4.114518642425537, + "learning_rate": 8.492672189566901e-06, + "logits/chosen": -0.17217521369457245, + "logits/rejected": -0.29477617144584656, + "logps/chosen": -63.870880126953125, + "logps/rejected": -72.42850494384766, + "loss": 0.72, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9929113388061523, + "rewards/margins": 4.1452555656433105, + "rewards/rejected": -1.1523443460464478, + "step": 1522 + }, + { + "epoch": 0.38, + "grad_norm": 5.077319145202637, + "learning_rate": 8.490797085761373e-06, + "logits/chosen": -0.18513552844524384, + "logits/rejected": -0.290867418050766, + "logps/chosen": -46.89965057373047, + "logps/rejected": -61.40088653564453, + "loss": 0.9148, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.49357008934021, + "rewards/margins": 2.9470837116241455, + "rewards/rejected": -0.45351386070251465, + "step": 1523 + }, + { + "epoch": 0.38, + "grad_norm": 4.486343860626221, + "learning_rate": 8.488921023656717e-06, + "logits/chosen": -0.050446897745132446, + "logits/rejected": -0.24552597105503082, + "logps/chosen": -67.63868713378906, + "logps/rejected": -70.9874038696289, + "loss": 0.8267, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.531536817550659, + "rewards/margins": 2.757854700088501, + "rewards/rejected": -0.22631792724132538, + "step": 1524 + }, + { + "epoch": 0.38, + "grad_norm": 4.442789077758789, + "learning_rate": 8.487044003767957e-06, + "logits/chosen": -0.16106760501861572, + "logits/rejected": -0.24969521164894104, + "logps/chosen": -45.94578170776367, + "logps/rejected": -68.57479858398438, + "loss": 0.8658, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.755537986755371, + "rewards/margins": 2.821958541870117, + "rewards/rejected": -0.06642018258571625, + "step": 1525 + }, + { + "epoch": 0.38, + "grad_norm": 3.699928045272827, + "learning_rate": 8.485166026610374e-06, + "logits/chosen": -0.2071828693151474, + "logits/rejected": -0.3049900233745575, + "logps/chosen": -51.596717834472656, + "logps/rejected": -75.28771209716797, + "loss": 0.7107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.964827060699463, + "rewards/margins": 4.02803373336792, + "rewards/rejected": -1.063206434249878, + "step": 1526 + }, + { + "epoch": 0.38, + "grad_norm": 4.6190385818481445, + "learning_rate": 8.483287092699513e-06, + "logits/chosen": -0.1854477971792221, + "logits/rejected": -0.27625781297683716, + "logps/chosen": -48.30677795410156, + "logps/rejected": -74.49223327636719, + "loss": 0.7638, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.856642723083496, + "rewards/margins": 2.9010355472564697, + "rewards/rejected": -0.04439307749271393, + "step": 1527 + }, + { + "epoch": 0.38, + "grad_norm": 4.058657646179199, + "learning_rate": 8.481407202551179e-06, + "logits/chosen": -0.2673295736312866, + "logits/rejected": -0.36198118329048157, + "logps/chosen": -55.25803756713867, + "logps/rejected": -70.26488494873047, + "loss": 0.8127, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8233542442321777, + "rewards/margins": 3.8391473293304443, + "rewards/rejected": -1.015792965888977, + "step": 1528 + }, + { + "epoch": 0.38, + "grad_norm": 4.772410869598389, + "learning_rate": 8.479526356681448e-06, + "logits/chosen": -0.10650459676980972, + "logits/rejected": -0.2870651185512543, + "logps/chosen": -53.89955520629883, + "logps/rejected": -66.58348846435547, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.77781081199646, + "rewards/margins": 3.9246432781219482, + "rewards/rejected": -1.1468322277069092, + "step": 1529 + }, + { + "epoch": 0.38, + "grad_norm": 2.4874706268310547, + "learning_rate": 8.477644555606647e-06, + "logits/chosen": -0.14978833496570587, + "logits/rejected": -0.22336933016777039, + "logps/chosen": -53.52433776855469, + "logps/rejected": -82.85157012939453, + "loss": 0.6515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0545783042907715, + "rewards/margins": 4.279202461242676, + "rewards/rejected": -1.2246243953704834, + "step": 1530 + }, + { + "epoch": 0.38, + "grad_norm": 4.915151596069336, + "learning_rate": 8.475761799843376e-06, + "logits/chosen": -0.13147497177124023, + "logits/rejected": -0.21251843869686127, + "logps/chosen": -58.2639274597168, + "logps/rejected": -73.26029968261719, + "loss": 0.8511, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7671456336975098, + "rewards/margins": 3.5603742599487305, + "rewards/rejected": -0.7932283878326416, + "step": 1531 + }, + { + "epoch": 0.38, + "grad_norm": 4.845232009887695, + "learning_rate": 8.47387808990849e-06, + "logits/chosen": -0.1044825091958046, + "logits/rejected": -0.26212114095687866, + "logps/chosen": -68.31173706054688, + "logps/rejected": -64.71257781982422, + "loss": 0.8771, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7867233753204346, + "rewards/margins": 2.4406609535217285, + "rewards/rejected": 0.3460622727870941, + "step": 1532 + }, + { + "epoch": 0.38, + "grad_norm": 9.019510269165039, + "learning_rate": 8.471993426319108e-06, + "logits/chosen": -0.2026948779821396, + "logits/rejected": -0.17893241345882416, + "logps/chosen": -52.390403747558594, + "logps/rejected": -82.46927642822266, + "loss": 1.1344, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.6151328086853027, + "rewards/margins": 2.0699820518493652, + "rewards/rejected": 0.5451510548591614, + "step": 1533 + }, + { + "epoch": 0.38, + "grad_norm": 5.195849895477295, + "learning_rate": 8.47010780959261e-06, + "logits/chosen": -0.09280511736869812, + "logits/rejected": -0.1719266176223755, + "logps/chosen": -60.08358383178711, + "logps/rejected": -76.23028564453125, + "loss": 0.9174, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.664031744003296, + "rewards/margins": 3.082028388977051, + "rewards/rejected": -0.41799652576446533, + "step": 1534 + }, + { + "epoch": 0.38, + "grad_norm": 7.448683738708496, + "learning_rate": 8.468221240246637e-06, + "logits/chosen": -0.19209635257720947, + "logits/rejected": -0.24664174020290375, + "logps/chosen": -61.06550598144531, + "logps/rejected": -72.87388610839844, + "loss": 0.8269, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6249217987060547, + "rewards/margins": 2.8455519676208496, + "rewards/rejected": -0.22063006460666656, + "step": 1535 + }, + { + "epoch": 0.38, + "grad_norm": 4.656616687774658, + "learning_rate": 8.466333718799097e-06, + "logits/chosen": -0.2286026030778885, + "logits/rejected": -0.2576694190502167, + "logps/chosen": -46.136817932128906, + "logps/rejected": -65.42298889160156, + "loss": 0.9284, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.607975482940674, + "rewards/margins": 3.391448497772217, + "rewards/rejected": -0.7834731936454773, + "step": 1536 + }, + { + "epoch": 0.38, + "grad_norm": 5.082802772521973, + "learning_rate": 8.464445245768156e-06, + "logits/chosen": -0.2355562150478363, + "logits/rejected": -0.35993319749832153, + "logps/chosen": -59.465354919433594, + "logps/rejected": -61.861907958984375, + "loss": 0.754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0747976303100586, + "rewards/margins": 3.4912352561950684, + "rewards/rejected": -0.41643741726875305, + "step": 1537 + }, + { + "epoch": 0.38, + "grad_norm": 2.7463791370391846, + "learning_rate": 8.462555821672236e-06, + "logits/chosen": -0.244013249874115, + "logits/rejected": -0.2944823205471039, + "logps/chosen": -46.59532165527344, + "logps/rejected": -69.8862533569336, + "loss": 0.7758, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.756613254547119, + "rewards/margins": 4.043877601623535, + "rewards/rejected": -1.2872644662857056, + "step": 1538 + }, + { + "epoch": 0.38, + "grad_norm": 3.0146896839141846, + "learning_rate": 8.460665447030028e-06, + "logits/chosen": -0.19078923761844635, + "logits/rejected": -0.26411086320877075, + "logps/chosen": -50.80876159667969, + "logps/rejected": -69.14447784423828, + "loss": 0.6983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9213802814483643, + "rewards/margins": 3.583587408065796, + "rewards/rejected": -0.6622072458267212, + "step": 1539 + }, + { + "epoch": 0.39, + "grad_norm": 3.881408452987671, + "learning_rate": 8.458774122360479e-06, + "logits/chosen": -0.1651739627122879, + "logits/rejected": -0.3668866753578186, + "logps/chosen": -58.18805694580078, + "logps/rejected": -58.183250427246094, + "loss": 0.7561, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6771228313446045, + "rewards/margins": 3.8589138984680176, + "rewards/rejected": -1.1817911863327026, + "step": 1540 + }, + { + "epoch": 0.39, + "grad_norm": 14.27798080444336, + "learning_rate": 8.456881848182796e-06, + "logits/chosen": -0.21047250926494598, + "logits/rejected": -0.27741146087646484, + "logps/chosen": -55.071773529052734, + "logps/rejected": -71.83354949951172, + "loss": 0.9809, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.826019525527954, + "rewards/margins": 2.9480066299438477, + "rewards/rejected": -0.12198704481124878, + "step": 1541 + }, + { + "epoch": 0.39, + "grad_norm": 2.595818042755127, + "learning_rate": 8.454988625016455e-06, + "logits/chosen": -0.2499745786190033, + "logits/rejected": -0.406903475522995, + "logps/chosen": -68.10578155517578, + "logps/rejected": -63.221065521240234, + "loss": 0.6918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0838611125946045, + "rewards/margins": 4.1592912673950195, + "rewards/rejected": -1.0754307508468628, + "step": 1542 + }, + { + "epoch": 0.39, + "grad_norm": 3.2293946743011475, + "learning_rate": 8.453094453381184e-06, + "logits/chosen": -0.09889832139015198, + "logits/rejected": -0.2763769030570984, + "logps/chosen": -68.63225555419922, + "logps/rejected": -62.91252899169922, + "loss": 0.8146, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6136302947998047, + "rewards/margins": 3.121859312057495, + "rewards/rejected": -0.5082287788391113, + "step": 1543 + }, + { + "epoch": 0.39, + "grad_norm": 5.920169353485107, + "learning_rate": 8.451199333796974e-06, + "logits/chosen": -0.19265800714492798, + "logits/rejected": -0.27524131536483765, + "logps/chosen": -57.59222412109375, + "logps/rejected": -73.86695098876953, + "loss": 0.7863, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.809130907058716, + "rewards/margins": 3.719895839691162, + "rewards/rejected": -0.9107642769813538, + "step": 1544 + }, + { + "epoch": 0.39, + "grad_norm": 4.337377071380615, + "learning_rate": 8.449303266784074e-06, + "logits/chosen": -0.06275825202465057, + "logits/rejected": -0.2082071602344513, + "logps/chosen": -51.7039794921875, + "logps/rejected": -62.16464614868164, + "loss": 0.7287, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.767692804336548, + "rewards/margins": 3.2696497440338135, + "rewards/rejected": -0.5019567012786865, + "step": 1545 + }, + { + "epoch": 0.39, + "grad_norm": 5.705528259277344, + "learning_rate": 8.447406252862997e-06, + "logits/chosen": -0.15658412873744965, + "logits/rejected": -0.368623822927475, + "logps/chosen": -56.806549072265625, + "logps/rejected": -58.91620635986328, + "loss": 0.7927, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6655259132385254, + "rewards/margins": 3.225964069366455, + "rewards/rejected": -0.560437798500061, + "step": 1546 + }, + { + "epoch": 0.39, + "grad_norm": 2.214728355407715, + "learning_rate": 8.445508292554517e-06, + "logits/chosen": -0.12714269757270813, + "logits/rejected": -0.26814737915992737, + "logps/chosen": -55.130516052246094, + "logps/rejected": -72.87744140625, + "loss": 0.6764, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225156784057617, + "rewards/margins": 4.400458335876465, + "rewards/rejected": -1.1753019094467163, + "step": 1547 + }, + { + "epoch": 0.39, + "grad_norm": 3.322265625, + "learning_rate": 8.44360938637966e-06, + "logits/chosen": -0.0332605354487896, + "logits/rejected": -0.21226836740970612, + "logps/chosen": -63.72255325317383, + "logps/rejected": -69.57138061523438, + "loss": 0.8445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9074606895446777, + "rewards/margins": 4.191089630126953, + "rewards/rejected": -1.2836291790008545, + "step": 1548 + }, + { + "epoch": 0.39, + "grad_norm": 4.559596538543701, + "learning_rate": 8.44170953485972e-06, + "logits/chosen": -0.18287405371665955, + "logits/rejected": -0.25615203380584717, + "logps/chosen": -57.09791564941406, + "logps/rejected": -71.82556915283203, + "loss": 0.874, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.974935531616211, + "rewards/margins": 3.678893566131592, + "rewards/rejected": -0.703957736492157, + "step": 1549 + }, + { + "epoch": 0.39, + "grad_norm": 7.502275466918945, + "learning_rate": 8.439808738516248e-06, + "logits/chosen": -0.20453768968582153, + "logits/rejected": -0.18086761236190796, + "logps/chosen": -56.47896957397461, + "logps/rejected": -78.70708465576172, + "loss": 0.9856, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.759097099304199, + "rewards/margins": 2.268831253051758, + "rewards/rejected": 0.49026602506637573, + "step": 1550 + }, + { + "epoch": 0.39, + "grad_norm": 3.6392176151275635, + "learning_rate": 8.43790699787105e-06, + "logits/chosen": -0.20647601783275604, + "logits/rejected": -0.31690749526023865, + "logps/chosen": -50.71076202392578, + "logps/rejected": -76.07039642333984, + "loss": 0.8601, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7432515621185303, + "rewards/margins": 3.4467105865478516, + "rewards/rejected": -0.7034591436386108, + "step": 1551 + }, + { + "epoch": 0.39, + "grad_norm": 4.17596435546875, + "learning_rate": 8.436004313446198e-06, + "logits/chosen": -0.1861872524023056, + "logits/rejected": -0.31731152534484863, + "logps/chosen": -55.9285888671875, + "logps/rejected": -63.48949432373047, + "loss": 0.7481, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0129644870758057, + "rewards/margins": 4.028805255889893, + "rewards/rejected": -1.0158405303955078, + "step": 1552 + }, + { + "epoch": 0.39, + "grad_norm": 5.553513526916504, + "learning_rate": 8.434100685764018e-06, + "logits/chosen": -0.14818130433559418, + "logits/rejected": -0.24430902302265167, + "logps/chosen": -63.82456970214844, + "logps/rejected": -70.4973373413086, + "loss": 0.8492, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7541775703430176, + "rewards/margins": 3.47556209564209, + "rewards/rejected": -0.7213841676712036, + "step": 1553 + }, + { + "epoch": 0.39, + "grad_norm": 4.311850547790527, + "learning_rate": 8.432196115347098e-06, + "logits/chosen": -0.15873238444328308, + "logits/rejected": -0.2595410645008087, + "logps/chosen": -55.144371032714844, + "logps/rejected": -70.46768188476562, + "loss": 0.8879, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8923895359039307, + "rewards/margins": 3.3466715812683105, + "rewards/rejected": -0.45428216457366943, + "step": 1554 + }, + { + "epoch": 0.39, + "grad_norm": 5.832128524780273, + "learning_rate": 8.430290602718283e-06, + "logits/chosen": -0.04478283226490021, + "logits/rejected": -0.2732841670513153, + "logps/chosen": -60.89125061035156, + "logps/rejected": -54.50410461425781, + "loss": 0.7785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.761178731918335, + "rewards/margins": 4.432514190673828, + "rewards/rejected": -1.6713347434997559, + "step": 1555 + }, + { + "epoch": 0.39, + "grad_norm": 3.6110308170318604, + "learning_rate": 8.428384148400679e-06, + "logits/chosen": -0.20595437288284302, + "logits/rejected": -0.246968612074852, + "logps/chosen": -47.53601837158203, + "logps/rejected": -74.75291442871094, + "loss": 0.739, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.738490343093872, + "rewards/margins": 3.487196683883667, + "rewards/rejected": -0.7487063407897949, + "step": 1556 + }, + { + "epoch": 0.39, + "grad_norm": 3.7845613956451416, + "learning_rate": 8.426476752917647e-06, + "logits/chosen": -0.19836083054542542, + "logits/rejected": -0.3214283585548401, + "logps/chosen": -47.59287643432617, + "logps/rejected": -58.323387145996094, + "loss": 0.7902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6155827045440674, + "rewards/margins": 3.783278226852417, + "rewards/rejected": -1.1676956415176392, + "step": 1557 + }, + { + "epoch": 0.39, + "grad_norm": 5.602538108825684, + "learning_rate": 8.424568416792809e-06, + "logits/chosen": -0.25328510999679565, + "logits/rejected": -0.2685777544975281, + "logps/chosen": -50.461509704589844, + "logps/rejected": -79.08008575439453, + "loss": 1.0122, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8059332370758057, + "rewards/margins": 2.879931926727295, + "rewards/rejected": -0.07399867475032806, + "step": 1558 + }, + { + "epoch": 0.39, + "grad_norm": 4.883070468902588, + "learning_rate": 8.422659140550043e-06, + "logits/chosen": -0.24457119405269623, + "logits/rejected": -0.37241286039352417, + "logps/chosen": -59.11548614501953, + "logps/rejected": -66.9946060180664, + "loss": 0.9413, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.628422260284424, + "rewards/margins": 3.429652690887451, + "rewards/rejected": -0.8012300729751587, + "step": 1559 + }, + { + "epoch": 0.39, + "grad_norm": 5.19755744934082, + "learning_rate": 8.420748924713489e-06, + "logits/chosen": -0.2619856595993042, + "logits/rejected": -0.36292213201522827, + "logps/chosen": -68.2963638305664, + "logps/rejected": -76.61982727050781, + "loss": 0.8627, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8192782402038574, + "rewards/margins": 3.1049232482910156, + "rewards/rejected": -0.2856449782848358, + "step": 1560 + }, + { + "epoch": 0.39, + "grad_norm": 4.096813201904297, + "learning_rate": 8.41883776980754e-06, + "logits/chosen": -0.24555139243602753, + "logits/rejected": -0.34291964769363403, + "logps/chosen": -44.67637634277344, + "logps/rejected": -80.12841033935547, + "loss": 0.7582, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.937209129333496, + "rewards/margins": 4.1747541427612305, + "rewards/rejected": -1.2375452518463135, + "step": 1561 + }, + { + "epoch": 0.39, + "grad_norm": 2.688549757003784, + "learning_rate": 8.416925676356853e-06, + "logits/chosen": -0.16305650770664215, + "logits/rejected": -0.25368207693099976, + "logps/chosen": -65.88739776611328, + "logps/rejected": -78.14996337890625, + "loss": 0.731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9508841037750244, + "rewards/margins": 3.8143117427825928, + "rewards/rejected": -0.8634277582168579, + "step": 1562 + }, + { + "epoch": 0.39, + "grad_norm": 3.074084758758545, + "learning_rate": 8.415012644886333e-06, + "logits/chosen": -0.09033083915710449, + "logits/rejected": -0.15172743797302246, + "logps/chosen": -58.91608810424805, + "logps/rejected": -77.38710021972656, + "loss": 0.7415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0323309898376465, + "rewards/margins": 3.5239157676696777, + "rewards/rejected": -0.4915845990180969, + "step": 1563 + }, + { + "epoch": 0.39, + "grad_norm": 3.0590672492980957, + "learning_rate": 8.413098675921154e-06, + "logits/chosen": -0.12712407112121582, + "logits/rejected": -0.2503315806388855, + "logps/chosen": -63.7874755859375, + "logps/rejected": -68.33940124511719, + "loss": 0.7697, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0646393299102783, + "rewards/margins": 4.3561296463012695, + "rewards/rejected": -1.2914903163909912, + "step": 1564 + }, + { + "epoch": 0.39, + "grad_norm": 4.21851921081543, + "learning_rate": 8.411183769986739e-06, + "logits/chosen": -0.10892730206251144, + "logits/rejected": -0.2404983788728714, + "logps/chosen": -54.01283264160156, + "logps/rejected": -70.09849548339844, + "loss": 0.7574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.69399094581604, + "rewards/margins": 3.5127291679382324, + "rewards/rejected": -0.8187381625175476, + "step": 1565 + }, + { + "epoch": 0.39, + "grad_norm": 6.189357757568359, + "learning_rate": 8.409267927608771e-06, + "logits/chosen": -0.18310274183750153, + "logits/rejected": -0.24806378781795502, + "logps/chosen": -51.626914978027344, + "logps/rejected": -71.07951354980469, + "loss": 0.9006, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7337777614593506, + "rewards/margins": 2.6854357719421387, + "rewards/rejected": 0.04834195226430893, + "step": 1566 + }, + { + "epoch": 0.39, + "grad_norm": 8.004786491394043, + "learning_rate": 8.407351149313194e-06, + "logits/chosen": -0.2272004932165146, + "logits/rejected": -0.3053188920021057, + "logps/chosen": -59.848812103271484, + "logps/rejected": -67.90340423583984, + "loss": 0.9044, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1991748809814453, + "rewards/margins": 3.469808578491211, + "rewards/rejected": -0.2706339955329895, + "step": 1567 + }, + { + "epoch": 0.39, + "grad_norm": 3.190300464630127, + "learning_rate": 8.405433435626198e-06, + "logits/chosen": -0.11406087875366211, + "logits/rejected": -0.23889927566051483, + "logps/chosen": -69.34548950195312, + "logps/rejected": -74.75140380859375, + "loss": 0.802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9673335552215576, + "rewards/margins": 3.11487078666687, + "rewards/rejected": -0.14753714203834534, + "step": 1568 + }, + { + "epoch": 0.39, + "grad_norm": 3.991095542907715, + "learning_rate": 8.403514787074241e-06, + "logits/chosen": -0.2543492019176483, + "logits/rejected": -0.3543543815612793, + "logps/chosen": -53.969566345214844, + "logps/rejected": -63.38498306274414, + "loss": 0.8264, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4953625202178955, + "rewards/margins": 2.907102346420288, + "rewards/rejected": -0.41173988580703735, + "step": 1569 + }, + { + "epoch": 0.39, + "grad_norm": 5.93062162399292, + "learning_rate": 8.401595204184035e-06, + "logits/chosen": -0.1137838214635849, + "logits/rejected": -0.19716079533100128, + "logps/chosen": -54.63205337524414, + "logps/rejected": -72.30133819580078, + "loss": 0.8623, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.789494276046753, + "rewards/margins": 3.428931474685669, + "rewards/rejected": -0.6394367814064026, + "step": 1570 + }, + { + "epoch": 0.39, + "grad_norm": 3.9440343379974365, + "learning_rate": 8.399674687482542e-06, + "logits/chosen": -0.2154322862625122, + "logits/rejected": -0.35024338960647583, + "logps/chosen": -56.205665588378906, + "logps/rejected": -65.45346069335938, + "loss": 0.788, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.74733304977417, + "rewards/margins": 3.4981019496917725, + "rewards/rejected": -0.7507693767547607, + "step": 1571 + }, + { + "epoch": 0.39, + "grad_norm": 4.021195888519287, + "learning_rate": 8.397753237496989e-06, + "logits/chosen": -0.15672950446605682, + "logits/rejected": -0.2891194820404053, + "logps/chosen": -57.26879119873047, + "logps/rejected": -63.08964538574219, + "loss": 0.8472, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8335304260253906, + "rewards/margins": 3.6263582706451416, + "rewards/rejected": -0.7928279638290405, + "step": 1572 + }, + { + "epoch": 0.39, + "grad_norm": 2.8614301681518555, + "learning_rate": 8.395830854754856e-06, + "logits/chosen": -0.17276617884635925, + "logits/rejected": -0.2825506925582886, + "logps/chosen": -53.059600830078125, + "logps/rejected": -83.75019073486328, + "loss": 0.6345, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9387881755828857, + "rewards/margins": 5.0535149574279785, + "rewards/rejected": -2.114727020263672, + "step": 1573 + }, + { + "epoch": 0.39, + "grad_norm": 4.747697353363037, + "learning_rate": 8.393907539783875e-06, + "logits/chosen": -0.18817129731178284, + "logits/rejected": -0.2902988791465759, + "logps/chosen": -58.30116271972656, + "logps/rejected": -76.91767883300781, + "loss": 0.8418, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8975889682769775, + "rewards/margins": 2.969918966293335, + "rewards/rejected": -0.07232995331287384, + "step": 1574 + }, + { + "epoch": 0.39, + "grad_norm": 4.267659664154053, + "learning_rate": 8.39198329311204e-06, + "logits/chosen": -0.09773635119199753, + "logits/rejected": -0.28217869997024536, + "logps/chosen": -68.31449890136719, + "logps/rejected": -74.72518157958984, + "loss": 0.7862, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9069888591766357, + "rewards/margins": 4.019313812255859, + "rewards/rejected": -1.112324595451355, + "step": 1575 + }, + { + "epoch": 0.39, + "grad_norm": 7.0110883712768555, + "learning_rate": 8.390058115267599e-06, + "logits/chosen": -0.06978151947259903, + "logits/rejected": -0.12783563137054443, + "logps/chosen": -58.01149368286133, + "logps/rejected": -86.28802490234375, + "loss": 0.8949, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.853564739227295, + "rewards/margins": 2.544691324234009, + "rewards/rejected": 0.30887341499328613, + "step": 1576 + }, + { + "epoch": 0.39, + "grad_norm": 3.6412813663482666, + "learning_rate": 8.388132006779053e-06, + "logits/chosen": -0.23785513639450073, + "logits/rejected": -0.34753721952438354, + "logps/chosen": -53.20229721069336, + "logps/rejected": -68.4419174194336, + "loss": 0.752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.713399887084961, + "rewards/margins": 3.864743232727051, + "rewards/rejected": -1.1513431072235107, + "step": 1577 + }, + { + "epoch": 0.39, + "grad_norm": 7.677171230316162, + "learning_rate": 8.386204968175163e-06, + "logits/chosen": -0.1381891369819641, + "logits/rejected": -0.2897898256778717, + "logps/chosen": -78.45100402832031, + "logps/rejected": -58.4774284362793, + "loss": 1.0647, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6489691734313965, + "rewards/margins": 3.2305259704589844, + "rewards/rejected": -0.581557035446167, + "step": 1578 + }, + { + "epoch": 0.39, + "grad_norm": 5.922934055328369, + "learning_rate": 8.384276999984937e-06, + "logits/chosen": -0.18556396663188934, + "logits/rejected": -0.2550049424171448, + "logps/chosen": -59.87171173095703, + "logps/rejected": -73.22190856933594, + "loss": 0.9648, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.733492374420166, + "rewards/margins": 2.4578285217285156, + "rewards/rejected": 0.27566397190093994, + "step": 1579 + }, + { + "epoch": 0.4, + "grad_norm": 2.5320558547973633, + "learning_rate": 8.38234810273765e-06, + "logits/chosen": -0.2186080813407898, + "logits/rejected": -0.3556419909000397, + "logps/chosen": -51.373355865478516, + "logps/rejected": -77.79137420654297, + "loss": 0.7491, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5808165073394775, + "rewards/margins": 4.4472455978393555, + "rewards/rejected": -1.866429328918457, + "step": 1580 + }, + { + "epoch": 0.4, + "grad_norm": 3.7103569507598877, + "learning_rate": 8.380418276962822e-06, + "logits/chosen": -0.22114911675453186, + "logits/rejected": -0.3444245457649231, + "logps/chosen": -52.1463508605957, + "logps/rejected": -67.53349304199219, + "loss": 0.7812, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.812547206878662, + "rewards/margins": 4.35759162902832, + "rewards/rejected": -1.5450443029403687, + "step": 1581 + }, + { + "epoch": 0.4, + "grad_norm": 5.320929050445557, + "learning_rate": 8.378487523190234e-06, + "logits/chosen": -0.16408537328243256, + "logits/rejected": -0.3155669867992401, + "logps/chosen": -63.847877502441406, + "logps/rejected": -69.20767211914062, + "loss": 0.7791, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.627314567565918, + "rewards/margins": 4.120928764343262, + "rewards/rejected": -1.493614673614502, + "step": 1582 + }, + { + "epoch": 0.4, + "grad_norm": 4.7253828048706055, + "learning_rate": 8.37655584194992e-06, + "logits/chosen": -0.19534511864185333, + "logits/rejected": -0.2914334535598755, + "logps/chosen": -53.14210510253906, + "logps/rejected": -75.47241973876953, + "loss": 0.7975, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6549854278564453, + "rewards/margins": 3.106166124343872, + "rewards/rejected": -0.4511811137199402, + "step": 1583 + }, + { + "epoch": 0.4, + "grad_norm": 6.563816547393799, + "learning_rate": 8.374623233772166e-06, + "logits/chosen": -0.2274252474308014, + "logits/rejected": -0.30996203422546387, + "logps/chosen": -49.43721008300781, + "logps/rejected": -54.96253204345703, + "loss": 0.8492, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.764749526977539, + "rewards/margins": 3.132887601852417, + "rewards/rejected": -0.36813807487487793, + "step": 1584 + }, + { + "epoch": 0.4, + "grad_norm": 4.5386962890625, + "learning_rate": 8.372689699187516e-06, + "logits/chosen": -0.21508465707302094, + "logits/rejected": -0.36216825246810913, + "logps/chosen": -55.721923828125, + "logps/rejected": -69.4259033203125, + "loss": 0.8496, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7956490516662598, + "rewards/margins": 3.956258773803711, + "rewards/rejected": -1.160609245300293, + "step": 1585 + }, + { + "epoch": 0.4, + "grad_norm": 3.318758249282837, + "learning_rate": 8.370755238726766e-06, + "logits/chosen": -0.17204323410987854, + "logits/rejected": -0.26720350980758667, + "logps/chosen": -54.87895202636719, + "logps/rejected": -69.74308013916016, + "loss": 0.6874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.027331590652466, + "rewards/margins": 4.308668613433838, + "rewards/rejected": -1.2813374996185303, + "step": 1586 + }, + { + "epoch": 0.4, + "grad_norm": 4.588554382324219, + "learning_rate": 8.368819852920969e-06, + "logits/chosen": -0.22440260648727417, + "logits/rejected": -0.3413034677505493, + "logps/chosen": -57.944644927978516, + "logps/rejected": -72.06288146972656, + "loss": 0.7945, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8656628131866455, + "rewards/margins": 3.7423744201660156, + "rewards/rejected": -0.876711368560791, + "step": 1587 + }, + { + "epoch": 0.4, + "grad_norm": 3.1203901767730713, + "learning_rate": 8.366883542301428e-06, + "logits/chosen": -0.18865394592285156, + "logits/rejected": -0.32387077808380127, + "logps/chosen": -59.242645263671875, + "logps/rejected": -64.56275177001953, + "loss": 0.7011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972888231277466, + "rewards/margins": 4.490920066833496, + "rewards/rejected": -1.5180315971374512, + "step": 1588 + }, + { + "epoch": 0.4, + "grad_norm": 4.191158771514893, + "learning_rate": 8.364946307399704e-06, + "logits/chosen": -0.11231140047311783, + "logits/rejected": -0.30178502202033997, + "logps/chosen": -59.20905685424805, + "logps/rejected": -68.2796401977539, + "loss": 0.6694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.517301321029663, + "rewards/margins": 4.372627258300781, + "rewards/rejected": -1.855325698852539, + "step": 1589 + }, + { + "epoch": 0.4, + "grad_norm": 4.7953619956970215, + "learning_rate": 8.363008148747606e-06, + "logits/chosen": -0.11570499837398529, + "logits/rejected": -0.22227752208709717, + "logps/chosen": -62.46152877807617, + "logps/rejected": -81.4503173828125, + "loss": 0.8627, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6868855953216553, + "rewards/margins": 3.292764663696289, + "rewards/rejected": -0.6058788299560547, + "step": 1590 + }, + { + "epoch": 0.4, + "grad_norm": 4.992486000061035, + "learning_rate": 8.361069066877207e-06, + "logits/chosen": -0.11941196769475937, + "logits/rejected": -0.16899250447750092, + "logps/chosen": -61.97197723388672, + "logps/rejected": -78.85794067382812, + "loss": 0.7664, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.523582935333252, + "rewards/margins": 3.8655507564544678, + "rewards/rejected": -1.3419679403305054, + "step": 1591 + }, + { + "epoch": 0.4, + "grad_norm": 8.71615219116211, + "learning_rate": 8.35912906232082e-06, + "logits/chosen": -0.17146417498588562, + "logits/rejected": -0.278361439704895, + "logps/chosen": -70.55577087402344, + "logps/rejected": -70.29457092285156, + "loss": 1.0696, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7100608348846436, + "rewards/margins": 3.1880664825439453, + "rewards/rejected": -0.47800537943840027, + "step": 1592 + }, + { + "epoch": 0.4, + "grad_norm": 3.883039712905884, + "learning_rate": 8.357188135611023e-06, + "logits/chosen": -0.157313272356987, + "logits/rejected": -0.2626041769981384, + "logps/chosen": -48.327606201171875, + "logps/rejected": -80.77579498291016, + "loss": 0.7208, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8265435695648193, + "rewards/margins": 3.6329290866851807, + "rewards/rejected": -0.8063856363296509, + "step": 1593 + }, + { + "epoch": 0.4, + "grad_norm": 4.825144290924072, + "learning_rate": 8.35524628728064e-06, + "logits/chosen": -0.16746190190315247, + "logits/rejected": -0.26822593808174133, + "logps/chosen": -66.97606658935547, + "logps/rejected": -98.78529357910156, + "loss": 0.8082, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9784669876098633, + "rewards/margins": 4.5876641273498535, + "rewards/rejected": -1.6091972589492798, + "step": 1594 + }, + { + "epoch": 0.4, + "grad_norm": 7.307861804962158, + "learning_rate": 8.35330351786275e-06, + "logits/chosen": -0.1556093990802765, + "logits/rejected": -0.2510421574115753, + "logps/chosen": -67.59998321533203, + "logps/rejected": -72.53213500976562, + "loss": 1.0649, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8220908641815186, + "rewards/margins": 3.2077925205230713, + "rewards/rejected": -0.38570159673690796, + "step": 1595 + }, + { + "epoch": 0.4, + "grad_norm": 9.150493621826172, + "learning_rate": 8.351359827890686e-06, + "logits/chosen": -0.19675424695014954, + "logits/rejected": -0.27170342206954956, + "logps/chosen": -58.024349212646484, + "logps/rejected": -85.86135864257812, + "loss": 1.0333, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.741816997528076, + "rewards/margins": 3.5571327209472656, + "rewards/rejected": -0.8153156638145447, + "step": 1596 + }, + { + "epoch": 0.4, + "grad_norm": 4.49639368057251, + "learning_rate": 8.349415217898034e-06, + "logits/chosen": -0.22715461254119873, + "logits/rejected": -0.2654673159122467, + "logps/chosen": -50.406341552734375, + "logps/rejected": -70.59834289550781, + "loss": 0.8583, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.978400230407715, + "rewards/margins": 3.286807060241699, + "rewards/rejected": -0.3084068298339844, + "step": 1597 + }, + { + "epoch": 0.4, + "grad_norm": 4.671712875366211, + "learning_rate": 8.347469688418628e-06, + "logits/chosen": -0.1323041021823883, + "logits/rejected": -0.2994949221611023, + "logps/chosen": -55.172706604003906, + "logps/rejected": -58.47931671142578, + "loss": 0.8052, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6554007530212402, + "rewards/margins": 3.8280816078186035, + "rewards/rejected": -1.1726806163787842, + "step": 1598 + }, + { + "epoch": 0.4, + "grad_norm": 4.301878929138184, + "learning_rate": 8.345523239986561e-06, + "logits/chosen": -0.14214998483657837, + "logits/rejected": -0.17290109395980835, + "logps/chosen": -49.43636703491211, + "logps/rejected": -77.47936248779297, + "loss": 0.8226, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7871851921081543, + "rewards/margins": 3.046510934829712, + "rewards/rejected": -0.25932586193084717, + "step": 1599 + }, + { + "epoch": 0.4, + "grad_norm": 7.970982074737549, + "learning_rate": 8.343575873136174e-06, + "logits/chosen": -0.19791311025619507, + "logits/rejected": -0.2877741754055023, + "logps/chosen": -57.154518127441406, + "logps/rejected": -80.89183044433594, + "loss": 0.8656, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0229392051696777, + "rewards/margins": 3.17889666557312, + "rewards/rejected": -0.15595725178718567, + "step": 1600 + }, + { + "epoch": 0.4, + "grad_norm": 5.035658359527588, + "learning_rate": 8.341627588402059e-06, + "logits/chosen": -0.15890216827392578, + "logits/rejected": -0.25840240716934204, + "logps/chosen": -62.49258804321289, + "logps/rejected": -61.098060607910156, + "loss": 0.8344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.878795862197876, + "rewards/margins": 3.9193637371063232, + "rewards/rejected": -1.0405681133270264, + "step": 1601 + }, + { + "epoch": 0.4, + "grad_norm": 2.950392007827759, + "learning_rate": 8.339678386319068e-06, + "logits/chosen": -0.21962440013885498, + "logits/rejected": -0.312581866979599, + "logps/chosen": -55.84292221069336, + "logps/rejected": -69.14795684814453, + "loss": 0.7202, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7807273864746094, + "rewards/margins": 3.9929656982421875, + "rewards/rejected": -1.2122383117675781, + "step": 1602 + }, + { + "epoch": 0.4, + "grad_norm": 3.9125912189483643, + "learning_rate": 8.337728267422292e-06, + "logits/chosen": -0.1577477753162384, + "logits/rejected": -0.28206467628479004, + "logps/chosen": -50.20661544799805, + "logps/rejected": -59.39472961425781, + "loss": 0.7365, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9499382972717285, + "rewards/margins": 3.445206642150879, + "rewards/rejected": -0.4952683746814728, + "step": 1603 + }, + { + "epoch": 0.4, + "grad_norm": 5.901909351348877, + "learning_rate": 8.335777232247086e-06, + "logits/chosen": -0.16531828045845032, + "logits/rejected": -0.28549230098724365, + "logps/chosen": -62.65377426147461, + "logps/rejected": -72.84735107421875, + "loss": 0.8856, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8935482501983643, + "rewards/margins": 3.5575766563415527, + "rewards/rejected": -0.6640279293060303, + "step": 1604 + }, + { + "epoch": 0.4, + "grad_norm": 4.518784999847412, + "learning_rate": 8.333825281329049e-06, + "logits/chosen": -0.21433308720588684, + "logits/rejected": -0.41796165704727173, + "logps/chosen": -62.97047424316406, + "logps/rejected": -66.34645080566406, + "loss": 0.941, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8161799907684326, + "rewards/margins": 3.538506507873535, + "rewards/rejected": -0.7223261594772339, + "step": 1605 + }, + { + "epoch": 0.4, + "grad_norm": 3.092275381088257, + "learning_rate": 8.331872415204034e-06, + "logits/chosen": -0.20091834664344788, + "logits/rejected": -0.32967984676361084, + "logps/chosen": -59.97864532470703, + "logps/rejected": -67.92095184326172, + "loss": 0.7409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1421284675598145, + "rewards/margins": 3.7028579711914062, + "rewards/rejected": -0.560729444026947, + "step": 1606 + }, + { + "epoch": 0.4, + "grad_norm": 4.398766040802002, + "learning_rate": 8.329918634408145e-06, + "logits/chosen": -0.15649953484535217, + "logits/rejected": -0.31380903720855713, + "logps/chosen": -66.31192016601562, + "logps/rejected": -78.03807830810547, + "loss": 1.0434, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7000842094421387, + "rewards/margins": 2.380974531173706, + "rewards/rejected": 0.31910964846611023, + "step": 1607 + }, + { + "epoch": 0.4, + "grad_norm": 3.0532562732696533, + "learning_rate": 8.327963939477736e-06, + "logits/chosen": -0.2541126012802124, + "logits/rejected": -0.31084996461868286, + "logps/chosen": -49.83625793457031, + "logps/rejected": -83.00448608398438, + "loss": 0.6614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7366514205932617, + "rewards/margins": 3.4814906120300293, + "rewards/rejected": -0.7448396682739258, + "step": 1608 + }, + { + "epoch": 0.4, + "grad_norm": 3.834899425506592, + "learning_rate": 8.326008330949415e-06, + "logits/chosen": -0.21866390109062195, + "logits/rejected": -0.3330104947090149, + "logps/chosen": -54.267250061035156, + "logps/rejected": -75.02326202392578, + "loss": 0.7781, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7911787033081055, + "rewards/margins": 3.9683196544647217, + "rewards/rejected": -1.1771409511566162, + "step": 1609 + }, + { + "epoch": 0.4, + "grad_norm": 4.903616428375244, + "learning_rate": 8.324051809360037e-06, + "logits/chosen": -0.16191478073596954, + "logits/rejected": -0.2727966010570526, + "logps/chosen": -56.500118255615234, + "logps/rejected": -68.71438598632812, + "loss": 0.8421, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.740661144256592, + "rewards/margins": 3.016838312149048, + "rewards/rejected": -0.27617689967155457, + "step": 1610 + }, + { + "epoch": 0.4, + "grad_norm": 4.245769500732422, + "learning_rate": 8.32209437524671e-06, + "logits/chosen": -0.23813869059085846, + "logits/rejected": -0.3070501983165741, + "logps/chosen": -47.06715774536133, + "logps/rejected": -65.2003402709961, + "loss": 0.7993, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8461554050445557, + "rewards/margins": 2.7794957160949707, + "rewards/rejected": 0.06665990501642227, + "step": 1611 + }, + { + "epoch": 0.4, + "grad_norm": 2.8953258991241455, + "learning_rate": 8.320136029146792e-06, + "logits/chosen": -0.21423304080963135, + "logits/rejected": -0.24229669570922852, + "logps/chosen": -54.825828552246094, + "logps/rejected": -76.43824768066406, + "loss": 0.75, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9950478076934814, + "rewards/margins": 3.285460948944092, + "rewards/rejected": -0.29041334986686707, + "step": 1612 + }, + { + "epoch": 0.4, + "grad_norm": 3.9262871742248535, + "learning_rate": 8.318176771597891e-06, + "logits/chosen": -0.1859867125749588, + "logits/rejected": -0.2576786279678345, + "logps/chosen": -45.015296936035156, + "logps/rejected": -72.79647827148438, + "loss": 0.7704, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.783555746078491, + "rewards/margins": 3.060340166091919, + "rewards/rejected": -0.2767844498157501, + "step": 1613 + }, + { + "epoch": 0.4, + "grad_norm": 4.026566505432129, + "learning_rate": 8.316216603137866e-06, + "logits/chosen": -0.11404536664485931, + "logits/rejected": -0.2658577263355255, + "logps/chosen": -51.79006576538086, + "logps/rejected": -74.759521484375, + "loss": 0.7244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.226184606552124, + "rewards/margins": 3.8665919303894043, + "rewards/rejected": -0.6404074430465698, + "step": 1614 + }, + { + "epoch": 0.4, + "grad_norm": 3.0253748893737793, + "learning_rate": 8.314255524304824e-06, + "logits/chosen": -0.15403971076011658, + "logits/rejected": -0.31677162647247314, + "logps/chosen": -66.75908660888672, + "logps/rejected": -67.6050796508789, + "loss": 0.8843, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.022050619125366, + "rewards/margins": 3.6059720516204834, + "rewards/rejected": -0.5839216709136963, + "step": 1615 + }, + { + "epoch": 0.4, + "grad_norm": 4.0343403816223145, + "learning_rate": 8.312293535637123e-06, + "logits/chosen": -0.1713191717863083, + "logits/rejected": -0.2673162817955017, + "logps/chosen": -55.3542366027832, + "logps/rejected": -86.22673034667969, + "loss": 0.7623, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6032636165618896, + "rewards/margins": 3.7978055477142334, + "rewards/rejected": -1.1945418119430542, + "step": 1616 + }, + { + "epoch": 0.4, + "grad_norm": 4.353291988372803, + "learning_rate": 8.310330637673377e-06, + "logits/chosen": -0.19853028655052185, + "logits/rejected": -0.27888721227645874, + "logps/chosen": -47.24437713623047, + "logps/rejected": -68.05465698242188, + "loss": 0.8073, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.60263729095459, + "rewards/margins": 3.4405155181884766, + "rewards/rejected": -0.8378779888153076, + "step": 1617 + }, + { + "epoch": 0.4, + "grad_norm": 4.313895225524902, + "learning_rate": 8.308366830952439e-06, + "logits/chosen": -0.28010499477386475, + "logits/rejected": -0.3678130507469177, + "logps/chosen": -47.20511245727539, + "logps/rejected": -73.11418151855469, + "loss": 0.87, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7439050674438477, + "rewards/margins": 3.4452991485595703, + "rewards/rejected": -0.7013939619064331, + "step": 1618 + }, + { + "epoch": 0.4, + "grad_norm": 8.521143913269043, + "learning_rate": 8.306402116013415e-06, + "logits/chosen": -0.13974419236183167, + "logits/rejected": -0.25739216804504395, + "logps/chosen": -45.360877990722656, + "logps/rejected": -69.44783782958984, + "loss": 0.7966, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9301693439483643, + "rewards/margins": 3.2058537006378174, + "rewards/rejected": -0.27568405866622925, + "step": 1619 + }, + { + "epoch": 0.41, + "grad_norm": 4.340816020965576, + "learning_rate": 8.304436493395663e-06, + "logits/chosen": -0.13689345121383667, + "logits/rejected": -0.24425290524959564, + "logps/chosen": -60.18471145629883, + "logps/rejected": -75.61183166503906, + "loss": 0.8047, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.806170701980591, + "rewards/margins": 3.585211753845215, + "rewards/rejected": -0.7790408134460449, + "step": 1620 + }, + { + "epoch": 0.41, + "grad_norm": 3.335625648498535, + "learning_rate": 8.30246996363879e-06, + "logits/chosen": -0.16382911801338196, + "logits/rejected": -0.20268669724464417, + "logps/chosen": -55.619625091552734, + "logps/rejected": -85.15352630615234, + "loss": 0.7144, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.041391611099243, + "rewards/margins": 4.103680610656738, + "rewards/rejected": -1.0622894763946533, + "step": 1621 + }, + { + "epoch": 0.41, + "grad_norm": 3.1706457138061523, + "learning_rate": 8.300502527282651e-06, + "logits/chosen": -0.20627328753471375, + "logits/rejected": -0.3551097512245178, + "logps/chosen": -58.09886932373047, + "logps/rejected": -63.92136001586914, + "loss": 0.7436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.073183059692383, + "rewards/margins": 4.249591827392578, + "rewards/rejected": -1.1764088869094849, + "step": 1622 + }, + { + "epoch": 0.41, + "grad_norm": 4.714480876922607, + "learning_rate": 8.298534184867351e-06, + "logits/chosen": -0.20279869437217712, + "logits/rejected": -0.2057167887687683, + "logps/chosen": -56.70213317871094, + "logps/rejected": -83.19881439208984, + "loss": 0.8932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7808969020843506, + "rewards/margins": 3.7189512252807617, + "rewards/rejected": -0.9380538463592529, + "step": 1623 + }, + { + "epoch": 0.41, + "grad_norm": 13.003889083862305, + "learning_rate": 8.296564936933237e-06, + "logits/chosen": -0.09659933298826218, + "logits/rejected": -0.2056264579296112, + "logps/chosen": -71.18017578125, + "logps/rejected": -69.56343078613281, + "loss": 1.0142, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.722557544708252, + "rewards/margins": 3.0812466144561768, + "rewards/rejected": -0.3586888909339905, + "step": 1624 + }, + { + "epoch": 0.41, + "grad_norm": 4.748072624206543, + "learning_rate": 8.294594784020917e-06, + "logits/chosen": -0.2821257412433624, + "logits/rejected": -0.3148282766342163, + "logps/chosen": -54.749053955078125, + "logps/rejected": -68.1790542602539, + "loss": 1.0128, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0180206298828125, + "rewards/margins": 2.864823341369629, + "rewards/rejected": 0.1531972587108612, + "step": 1625 + }, + { + "epoch": 0.41, + "grad_norm": 4.640674114227295, + "learning_rate": 8.292623726671235e-06, + "logits/chosen": -0.23180517554283142, + "logits/rejected": -0.3813406825065613, + "logps/chosen": -52.722137451171875, + "logps/rejected": -66.5992202758789, + "loss": 0.8424, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.707770586013794, + "rewards/margins": 2.9691591262817383, + "rewards/rejected": -0.2613891065120697, + "step": 1626 + }, + { + "epoch": 0.41, + "grad_norm": 4.163804054260254, + "learning_rate": 8.29065176542529e-06, + "logits/chosen": -0.17531447112560272, + "logits/rejected": -0.27298593521118164, + "logps/chosen": -71.40122985839844, + "logps/rejected": -83.3439712524414, + "loss": 0.8901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.08301043510437, + "rewards/margins": 3.1930348873138428, + "rewards/rejected": -0.11002460867166519, + "step": 1627 + }, + { + "epoch": 0.41, + "grad_norm": 7.534667491912842, + "learning_rate": 8.28867890082443e-06, + "logits/chosen": -0.08920343220233917, + "logits/rejected": -0.1708301156759262, + "logps/chosen": -64.13910675048828, + "logps/rejected": -73.32135009765625, + "loss": 1.0606, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7900357246398926, + "rewards/margins": 2.988940715789795, + "rewards/rejected": -0.1989051252603531, + "step": 1628 + }, + { + "epoch": 0.41, + "grad_norm": 4.514366149902344, + "learning_rate": 8.286705133410247e-06, + "logits/chosen": -0.1331884264945984, + "logits/rejected": -0.24359916150569916, + "logps/chosen": -56.46233367919922, + "logps/rejected": -65.39031982421875, + "loss": 0.8893, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0628280639648438, + "rewards/margins": 3.2217819690704346, + "rewards/rejected": -0.15895387530326843, + "step": 1629 + }, + { + "epoch": 0.41, + "grad_norm": 13.703529357910156, + "learning_rate": 8.284730463724585e-06, + "logits/chosen": -0.18596209585666656, + "logits/rejected": -0.34472522139549255, + "logps/chosen": -82.08980560302734, + "logps/rejected": -58.470001220703125, + "loss": 0.9718, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.761601448059082, + "rewards/margins": 3.3362271785736084, + "rewards/rejected": -0.5746257305145264, + "step": 1630 + }, + { + "epoch": 0.41, + "grad_norm": 4.293346881866455, + "learning_rate": 8.28275489230953e-06, + "logits/chosen": -0.21201622486114502, + "logits/rejected": -0.30898258090019226, + "logps/chosen": -55.23657989501953, + "logps/rejected": -73.87750244140625, + "loss": 0.7968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.179685592651367, + "rewards/margins": 3.6556801795959473, + "rewards/rejected": -0.4759945571422577, + "step": 1631 + }, + { + "epoch": 0.41, + "grad_norm": 2.525430917739868, + "learning_rate": 8.280778419707421e-06, + "logits/chosen": -0.326897531747818, + "logits/rejected": -0.3887087404727936, + "logps/chosen": -51.52537536621094, + "logps/rejected": -76.23641204833984, + "loss": 0.6684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0996158123016357, + "rewards/margins": 4.031256198883057, + "rewards/rejected": -0.9316400289535522, + "step": 1632 + }, + { + "epoch": 0.41, + "grad_norm": 5.366549968719482, + "learning_rate": 8.278801046460842e-06, + "logits/chosen": -0.24122895300388336, + "logits/rejected": -0.40019190311431885, + "logps/chosen": -58.448158264160156, + "logps/rejected": -62.346595764160156, + "loss": 1.0014, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.709754467010498, + "rewards/margins": 2.8670711517333984, + "rewards/rejected": -0.15731674432754517, + "step": 1633 + }, + { + "epoch": 0.41, + "grad_norm": 5.218432903289795, + "learning_rate": 8.276822773112626e-06, + "logits/chosen": -0.25700363516807556, + "logits/rejected": -0.4148108959197998, + "logps/chosen": -63.579078674316406, + "logps/rejected": -66.8577651977539, + "loss": 0.8034, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.892561435699463, + "rewards/margins": 3.54996919631958, + "rewards/rejected": -0.6574074029922485, + "step": 1634 + }, + { + "epoch": 0.41, + "grad_norm": 3.844224452972412, + "learning_rate": 8.274843600205849e-06, + "logits/chosen": -0.06994572281837463, + "logits/rejected": -0.1749313324689865, + "logps/chosen": -62.70248794555664, + "logps/rejected": -63.753692626953125, + "loss": 0.8896, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9726274013519287, + "rewards/margins": 2.783322811126709, + "rewards/rejected": 0.18930432200431824, + "step": 1635 + }, + { + "epoch": 0.41, + "grad_norm": 4.348442554473877, + "learning_rate": 8.27286352828384e-06, + "logits/chosen": -0.2431572526693344, + "logits/rejected": -0.3200206756591797, + "logps/chosen": -50.795143127441406, + "logps/rejected": -83.54153442382812, + "loss": 0.8018, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1032772064208984, + "rewards/margins": 3.646912097930908, + "rewards/rejected": -0.5436350107192993, + "step": 1636 + }, + { + "epoch": 0.41, + "grad_norm": 5.412968635559082, + "learning_rate": 8.270882557890168e-06, + "logits/chosen": -0.1355007141828537, + "logits/rejected": -0.3032006323337555, + "logps/chosen": -59.09581756591797, + "logps/rejected": -59.50476837158203, + "loss": 0.9108, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3546504974365234, + "rewards/margins": 3.3444085121154785, + "rewards/rejected": -0.9897581338882446, + "step": 1637 + }, + { + "epoch": 0.41, + "grad_norm": 2.4166266918182373, + "learning_rate": 8.268900689568655e-06, + "logits/chosen": -0.09284232556819916, + "logits/rejected": -0.24572598934173584, + "logps/chosen": -51.534019470214844, + "logps/rejected": -61.89712905883789, + "loss": 0.6691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8463497161865234, + "rewards/margins": 4.299954891204834, + "rewards/rejected": -1.4536046981811523, + "step": 1638 + }, + { + "epoch": 0.41, + "grad_norm": 2.81986927986145, + "learning_rate": 8.266917923863364e-06, + "logits/chosen": -0.10630609095096588, + "logits/rejected": -0.18439863622188568, + "logps/chosen": -50.704627990722656, + "logps/rejected": -82.46012878417969, + "loss": 0.6715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.880476713180542, + "rewards/margins": 4.097535610198975, + "rewards/rejected": -1.2170588970184326, + "step": 1639 + }, + { + "epoch": 0.41, + "grad_norm": 3.040609121322632, + "learning_rate": 8.26493426131861e-06, + "logits/chosen": -0.1855771243572235, + "logits/rejected": -0.25743573904037476, + "logps/chosen": -53.6263427734375, + "logps/rejected": -76.61207580566406, + "loss": 0.7462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1651759147644043, + "rewards/margins": 3.5259222984313965, + "rewards/rejected": -0.36074674129486084, + "step": 1640 + }, + { + "epoch": 0.41, + "grad_norm": 4.3896484375, + "learning_rate": 8.262949702478949e-06, + "logits/chosen": -0.18981234729290009, + "logits/rejected": -0.2658312916755676, + "logps/chosen": -54.384971618652344, + "logps/rejected": -77.56560516357422, + "loss": 0.8794, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7621963024139404, + "rewards/margins": 3.5737380981445312, + "rewards/rejected": -0.8115423917770386, + "step": 1641 + }, + { + "epoch": 0.41, + "grad_norm": 3.38480806350708, + "learning_rate": 8.260964247889185e-06, + "logits/chosen": -0.23617474734783173, + "logits/rejected": -0.32515838742256165, + "logps/chosen": -63.5872917175293, + "logps/rejected": -78.6375961303711, + "loss": 0.7271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9237866401672363, + "rewards/margins": 4.028233051300049, + "rewards/rejected": -1.104446291923523, + "step": 1642 + }, + { + "epoch": 0.41, + "grad_norm": 3.656785011291504, + "learning_rate": 8.25897789809437e-06, + "logits/chosen": -0.19028884172439575, + "logits/rejected": -0.30120399594306946, + "logps/chosen": -62.74716567993164, + "logps/rejected": -80.67390441894531, + "loss": 0.7662, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.971078395843506, + "rewards/margins": 3.7519564628601074, + "rewards/rejected": -0.7808781862258911, + "step": 1643 + }, + { + "epoch": 0.41, + "grad_norm": 6.480548858642578, + "learning_rate": 8.256990653639798e-06, + "logits/chosen": -0.26761767268180847, + "logits/rejected": -0.2845485508441925, + "logps/chosen": -48.575836181640625, + "logps/rejected": -75.8602294921875, + "loss": 0.9408, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8238039016723633, + "rewards/margins": 2.9900457859039307, + "rewards/rejected": -0.1662420630455017, + "step": 1644 + }, + { + "epoch": 0.41, + "grad_norm": 3.590409994125366, + "learning_rate": 8.255002515071012e-06, + "logits/chosen": -0.2545625567436218, + "logits/rejected": -0.33014044165611267, + "logps/chosen": -57.05370330810547, + "logps/rejected": -68.12236785888672, + "loss": 0.8293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074188232421875, + "rewards/margins": 3.351637840270996, + "rewards/rejected": -0.2774493098258972, + "step": 1645 + }, + { + "epoch": 0.41, + "grad_norm": 4.3734846115112305, + "learning_rate": 8.253013482933798e-06, + "logits/chosen": -0.1436225175857544, + "logits/rejected": -0.27369895577430725, + "logps/chosen": -57.08271789550781, + "logps/rejected": -81.9180679321289, + "loss": 0.6879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7024383544921875, + "rewards/margins": 3.5626556873321533, + "rewards/rejected": -0.8602172136306763, + "step": 1646 + }, + { + "epoch": 0.41, + "grad_norm": 3.3114545345306396, + "learning_rate": 8.251023557774187e-06, + "logits/chosen": -0.19936083257198334, + "logits/rejected": -0.2811715304851532, + "logps/chosen": -51.35980224609375, + "logps/rejected": -80.037109375, + "loss": 0.7403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.923194169998169, + "rewards/margins": 3.799147844314575, + "rewards/rejected": -0.8759537935256958, + "step": 1647 + }, + { + "epoch": 0.41, + "grad_norm": 3.940683364868164, + "learning_rate": 8.24903274013846e-06, + "logits/chosen": -0.16773197054862976, + "logits/rejected": -0.2792694568634033, + "logps/chosen": -53.41220474243164, + "logps/rejected": -68.68717193603516, + "loss": 0.8074, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9040815830230713, + "rewards/margins": 4.243581295013428, + "rewards/rejected": -1.3394999504089355, + "step": 1648 + }, + { + "epoch": 0.41, + "grad_norm": 2.858898878097534, + "learning_rate": 8.247041030573135e-06, + "logits/chosen": -0.19047006964683533, + "logits/rejected": -0.3030381500720978, + "logps/chosen": -51.382598876953125, + "logps/rejected": -81.63883972167969, + "loss": 0.6007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.115565061569214, + "rewards/margins": 4.61182975769043, + "rewards/rejected": -1.4962648153305054, + "step": 1649 + }, + { + "epoch": 0.41, + "grad_norm": 3.207477569580078, + "learning_rate": 8.245048429624983e-06, + "logits/chosen": -0.22688093781471252, + "logits/rejected": -0.3348875343799591, + "logps/chosen": -51.73900604248047, + "logps/rejected": -65.00178527832031, + "loss": 0.6854, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.087810754776001, + "rewards/margins": 4.565837383270264, + "rewards/rejected": -1.4780266284942627, + "step": 1650 + }, + { + "epoch": 0.41, + "grad_norm": 5.567611217498779, + "learning_rate": 8.243054937841013e-06, + "logits/chosen": -0.17622524499893188, + "logits/rejected": -0.23037254810333252, + "logps/chosen": -66.87863159179688, + "logps/rejected": -81.71463012695312, + "loss": 0.9512, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7706568241119385, + "rewards/margins": 3.090003728866577, + "rewards/rejected": -0.3193467855453491, + "step": 1651 + }, + { + "epoch": 0.41, + "grad_norm": 3.8958849906921387, + "learning_rate": 8.241060555768487e-06, + "logits/chosen": -0.2250332236289978, + "logits/rejected": -0.29734012484550476, + "logps/chosen": -47.979103088378906, + "logps/rejected": -82.52964782714844, + "loss": 0.6971, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.844898223876953, + "rewards/margins": 3.9983928203582764, + "rewards/rejected": -1.1534945964813232, + "step": 1652 + }, + { + "epoch": 0.41, + "grad_norm": 5.491119384765625, + "learning_rate": 8.239065283954898e-06, + "logits/chosen": -0.15308301150798798, + "logits/rejected": -0.2207605242729187, + "logps/chosen": -50.065040588378906, + "logps/rejected": -68.41184997558594, + "loss": 0.8557, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.812211036682129, + "rewards/margins": 3.3157591819763184, + "rewards/rejected": -0.5035482048988342, + "step": 1653 + }, + { + "epoch": 0.41, + "grad_norm": 4.3278093338012695, + "learning_rate": 8.237069122947996e-06, + "logits/chosen": -0.17491309344768524, + "logits/rejected": -0.24506314098834991, + "logps/chosen": -62.556243896484375, + "logps/rejected": -88.71896362304688, + "loss": 0.7519, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8259222507476807, + "rewards/margins": 3.660168170928955, + "rewards/rejected": -0.8342458605766296, + "step": 1654 + }, + { + "epoch": 0.41, + "grad_norm": 4.20912504196167, + "learning_rate": 8.235072073295769e-06, + "logits/chosen": -0.22730593383312225, + "logits/rejected": -0.32373476028442383, + "logps/chosen": -54.52595901489258, + "logps/rejected": -79.06210327148438, + "loss": 0.8192, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.748260021209717, + "rewards/margins": 3.999086856842041, + "rewards/rejected": -1.2508265972137451, + "step": 1655 + }, + { + "epoch": 0.41, + "grad_norm": 3.750927448272705, + "learning_rate": 8.23307413554645e-06, + "logits/chosen": -0.17485469579696655, + "logits/rejected": -0.2723804712295532, + "logps/chosen": -50.90540313720703, + "logps/rejected": -80.57588195800781, + "loss": 0.7197, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9757027626037598, + "rewards/margins": 4.349027633666992, + "rewards/rejected": -1.3733251094818115, + "step": 1656 + }, + { + "epoch": 0.41, + "grad_norm": 5.8988728523254395, + "learning_rate": 8.231075310248519e-06, + "logits/chosen": -0.20895542204380035, + "logits/rejected": -0.3204348683357239, + "logps/chosen": -57.971458435058594, + "logps/rejected": -69.48363494873047, + "loss": 1.0078, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.550516128540039, + "rewards/margins": 3.476685047149658, + "rewards/rejected": -0.9261690378189087, + "step": 1657 + }, + { + "epoch": 0.41, + "grad_norm": 6.5846967697143555, + "learning_rate": 8.229075597950694e-06, + "logits/chosen": -0.1944389045238495, + "logits/rejected": -0.26442068815231323, + "logps/chosen": -74.80888366699219, + "logps/rejected": -83.64085388183594, + "loss": 0.963, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.65946626663208, + "rewards/margins": 3.652866840362549, + "rewards/rejected": -0.9934004545211792, + "step": 1658 + }, + { + "epoch": 0.42, + "grad_norm": 7.135316371917725, + "learning_rate": 8.227074999201937e-06, + "logits/chosen": -0.22750766575336456, + "logits/rejected": -0.3253709673881531, + "logps/chosen": -58.49535369873047, + "logps/rejected": -61.459293365478516, + "loss": 1.0971, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.604696750640869, + "rewards/margins": 2.7943081855773926, + "rewards/rejected": -0.18961185216903687, + "step": 1659 + }, + { + "epoch": 0.42, + "grad_norm": 4.3831562995910645, + "learning_rate": 8.225073514551458e-06, + "logits/chosen": -0.2527036964893341, + "logits/rejected": -0.3177448809146881, + "logps/chosen": -47.90028762817383, + "logps/rejected": -71.71896362304688, + "loss": 0.7294, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8446996212005615, + "rewards/margins": 3.7850968837738037, + "rewards/rejected": -0.9403969049453735, + "step": 1660 + }, + { + "epoch": 0.42, + "grad_norm": 5.874752044677734, + "learning_rate": 8.223071144548708e-06, + "logits/chosen": -0.24216341972351074, + "logits/rejected": -0.34891724586486816, + "logps/chosen": -59.46095275878906, + "logps/rejected": -85.84126281738281, + "loss": 0.8569, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.972550392150879, + "rewards/margins": 3.785181760787964, + "rewards/rejected": -0.812631368637085, + "step": 1661 + }, + { + "epoch": 0.42, + "grad_norm": 12.771658897399902, + "learning_rate": 8.221067889743382e-06, + "logits/chosen": -0.19744649529457092, + "logits/rejected": -0.20067714154720306, + "logps/chosen": -51.54011535644531, + "logps/rejected": -93.55609130859375, + "loss": 0.9102, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5266733169555664, + "rewards/margins": 2.899965286254883, + "rewards/rejected": -0.3732919692993164, + "step": 1662 + }, + { + "epoch": 0.42, + "grad_norm": 3.857349157333374, + "learning_rate": 8.219063750685414e-06, + "logits/chosen": -0.292976438999176, + "logits/rejected": -0.3638157248497009, + "logps/chosen": -52.64866256713867, + "logps/rejected": -79.46673583984375, + "loss": 0.8478, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7481980323791504, + "rewards/margins": 4.858015060424805, + "rewards/rejected": -2.1098170280456543, + "step": 1663 + }, + { + "epoch": 0.42, + "grad_norm": 3.2362802028656006, + "learning_rate": 8.217058727924985e-06, + "logits/chosen": -0.23228704929351807, + "logits/rejected": -0.3450968861579895, + "logps/chosen": -57.93731689453125, + "logps/rejected": -77.62720489501953, + "loss": 0.7376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8435018062591553, + "rewards/margins": 4.171047210693359, + "rewards/rejected": -1.327545166015625, + "step": 1664 + }, + { + "epoch": 0.42, + "grad_norm": 4.740300178527832, + "learning_rate": 8.215052822012516e-06, + "logits/chosen": -0.15985165536403656, + "logits/rejected": -0.2676325738430023, + "logps/chosen": -63.88865661621094, + "logps/rejected": -81.86092376708984, + "loss": 0.8495, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5858397483825684, + "rewards/margins": 3.467026710510254, + "rewards/rejected": -0.881186842918396, + "step": 1665 + }, + { + "epoch": 0.42, + "grad_norm": 5.439769268035889, + "learning_rate": 8.213046033498672e-06, + "logits/chosen": -0.1563752293586731, + "logits/rejected": -0.23001636564731598, + "logps/chosen": -54.09965896606445, + "logps/rejected": -83.04200744628906, + "loss": 0.6279, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8233039379119873, + "rewards/margins": 4.270688533782959, + "rewards/rejected": -1.4473845958709717, + "step": 1666 + }, + { + "epoch": 0.42, + "grad_norm": 3.85552716255188, + "learning_rate": 8.21103836293436e-06, + "logits/chosen": -0.2670237123966217, + "logits/rejected": -0.4043213129043579, + "logps/chosen": -57.100128173828125, + "logps/rejected": -81.18138122558594, + "loss": 0.7792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8085100650787354, + "rewards/margins": 4.232720375061035, + "rewards/rejected": -1.4242103099822998, + "step": 1667 + }, + { + "epoch": 0.42, + "grad_norm": 4.700788974761963, + "learning_rate": 8.20902981087073e-06, + "logits/chosen": -0.21357086300849915, + "logits/rejected": -0.3651758134365082, + "logps/chosen": -56.45991134643555, + "logps/rejected": -61.944801330566406, + "loss": 0.7984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.083104372024536, + "rewards/margins": 4.3551177978515625, + "rewards/rejected": -1.2720129489898682, + "step": 1668 + }, + { + "epoch": 0.42, + "grad_norm": 4.752292156219482, + "learning_rate": 8.20702037785917e-06, + "logits/chosen": -0.17942972481250763, + "logits/rejected": -0.27252262830734253, + "logps/chosen": -52.93473434448242, + "logps/rejected": -81.21116638183594, + "loss": 0.8074, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.698871612548828, + "rewards/margins": 4.190573215484619, + "rewards/rejected": -1.4917011260986328, + "step": 1669 + }, + { + "epoch": 0.42, + "grad_norm": 4.043341636657715, + "learning_rate": 8.205010064451314e-06, + "logits/chosen": -0.30492934584617615, + "logits/rejected": -0.33543047308921814, + "logps/chosen": -54.368202209472656, + "logps/rejected": -86.43276977539062, + "loss": 0.8794, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0432071685791016, + "rewards/margins": 3.575050115585327, + "rewards/rejected": -0.5318425893783569, + "step": 1670 + }, + { + "epoch": 0.42, + "grad_norm": 5.223824977874756, + "learning_rate": 8.202998871199037e-06, + "logits/chosen": -0.20295631885528564, + "logits/rejected": -0.2799883782863617, + "logps/chosen": -65.09303283691406, + "logps/rejected": -82.30980682373047, + "loss": 0.7206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7790749073028564, + "rewards/margins": 4.020563125610352, + "rewards/rejected": -1.2414875030517578, + "step": 1671 + }, + { + "epoch": 0.42, + "grad_norm": 6.61589241027832, + "learning_rate": 8.200986798654454e-06, + "logits/chosen": -0.1638600379228592, + "logits/rejected": -0.3231179714202881, + "logps/chosen": -55.436241149902344, + "logps/rejected": -62.4598503112793, + "loss": 0.8331, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9597909450531006, + "rewards/margins": 3.852463960647583, + "rewards/rejected": -0.8926730155944824, + "step": 1672 + }, + { + "epoch": 0.42, + "grad_norm": 3.724447011947632, + "learning_rate": 8.198973847369924e-06, + "logits/chosen": -0.19083884358406067, + "logits/rejected": -0.3463481664657593, + "logps/chosen": -65.30162048339844, + "logps/rejected": -80.07644653320312, + "loss": 0.7224, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.124173879623413, + "rewards/margins": 4.22266149520874, + "rewards/rejected": -1.098487377166748, + "step": 1673 + }, + { + "epoch": 0.42, + "grad_norm": 4.380552768707275, + "learning_rate": 8.196960017898044e-06, + "logits/chosen": -0.21230363845825195, + "logits/rejected": -0.26042601466178894, + "logps/chosen": -55.05026626586914, + "logps/rejected": -75.71435546875, + "loss": 0.6809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7590866088867188, + "rewards/margins": 3.400675058364868, + "rewards/rejected": -0.6415885090827942, + "step": 1674 + }, + { + "epoch": 0.42, + "grad_norm": 5.997303485870361, + "learning_rate": 8.194945310791653e-06, + "logits/chosen": -0.2807367146015167, + "logits/rejected": -0.41298168897628784, + "logps/chosen": -62.49645233154297, + "logps/rejected": -65.62944030761719, + "loss": 0.8819, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4179553985595703, + "rewards/margins": 3.8398118019104004, + "rewards/rejected": -1.42185640335083, + "step": 1675 + }, + { + "epoch": 0.42, + "grad_norm": 4.404083728790283, + "learning_rate": 8.192929726603834e-06, + "logits/chosen": -0.22846075892448425, + "logits/rejected": -0.3556073307991028, + "logps/chosen": -50.13285446166992, + "logps/rejected": -71.41449737548828, + "loss": 0.7309, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.00817608833313, + "rewards/margins": 3.9066004753112793, + "rewards/rejected": -0.8984240889549255, + "step": 1676 + }, + { + "epoch": 0.42, + "grad_norm": 6.227643013000488, + "learning_rate": 8.190913265887908e-06, + "logits/chosen": -0.1918518990278244, + "logits/rejected": -0.2526398003101349, + "logps/chosen": -52.256038665771484, + "logps/rejected": -78.3982925415039, + "loss": 0.8084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.750201940536499, + "rewards/margins": 3.7608680725097656, + "rewards/rejected": -1.0106664896011353, + "step": 1677 + }, + { + "epoch": 0.42, + "grad_norm": 8.66251277923584, + "learning_rate": 8.188895929197435e-06, + "logits/chosen": -0.16143114864826202, + "logits/rejected": -0.23015744984149933, + "logps/chosen": -60.1259765625, + "logps/rejected": -76.90079498291016, + "loss": 0.9844, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6904475688934326, + "rewards/margins": 3.7607078552246094, + "rewards/rejected": -1.0702602863311768, + "step": 1678 + }, + { + "epoch": 0.42, + "grad_norm": 3.360677480697632, + "learning_rate": 8.18687771708622e-06, + "logits/chosen": -0.2910892069339752, + "logits/rejected": -0.30484867095947266, + "logps/chosen": -46.392215728759766, + "logps/rejected": -81.83273315429688, + "loss": 0.7694, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983464479446411, + "rewards/margins": 4.1397504806518555, + "rewards/rejected": -1.1562857627868652, + "step": 1679 + }, + { + "epoch": 0.42, + "grad_norm": 4.456235885620117, + "learning_rate": 8.184858630108301e-06, + "logits/chosen": -0.25938552618026733, + "logits/rejected": -0.3514863848686218, + "logps/chosen": -60.73794937133789, + "logps/rejected": -70.64028930664062, + "loss": 0.8725, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.442516803741455, + "rewards/margins": 3.391860008239746, + "rewards/rejected": -0.9493435621261597, + "step": 1680 + }, + { + "epoch": 0.42, + "grad_norm": 6.525341033935547, + "learning_rate": 8.182838668817965e-06, + "logits/chosen": -0.22246551513671875, + "logits/rejected": -0.28524327278137207, + "logps/chosen": -54.025596618652344, + "logps/rejected": -76.43372344970703, + "loss": 0.8683, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5145387649536133, + "rewards/margins": 3.251100540161133, + "rewards/rejected": -0.7365618348121643, + "step": 1681 + }, + { + "epoch": 0.42, + "grad_norm": 7.916150093078613, + "learning_rate": 8.180817833769736e-06, + "logits/chosen": -0.26962339878082275, + "logits/rejected": -0.342035174369812, + "logps/chosen": -69.32380676269531, + "logps/rejected": -85.25040435791016, + "loss": 1.0149, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7093231678009033, + "rewards/margins": 4.335145473480225, + "rewards/rejected": -1.6258225440979004, + "step": 1682 + }, + { + "epoch": 0.42, + "grad_norm": 3.4347329139709473, + "learning_rate": 8.178796125518373e-06, + "logits/chosen": -0.18060331046581268, + "logits/rejected": -0.28886449337005615, + "logps/chosen": -47.3290901184082, + "logps/rejected": -82.65103149414062, + "loss": 0.738, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9087042808532715, + "rewards/margins": 4.668524742126465, + "rewards/rejected": -1.7598202228546143, + "step": 1683 + }, + { + "epoch": 0.42, + "grad_norm": 5.44994592666626, + "learning_rate": 8.176773544618884e-06, + "logits/chosen": -0.30310532450675964, + "logits/rejected": -0.3896830081939697, + "logps/chosen": -54.89373016357422, + "logps/rejected": -77.25179290771484, + "loss": 1.0394, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.857252359390259, + "rewards/margins": 2.626492977142334, + "rewards/rejected": 0.23075956106185913, + "step": 1684 + }, + { + "epoch": 0.42, + "grad_norm": 3.7050838470458984, + "learning_rate": 8.174750091626505e-06, + "logits/chosen": -0.17148727178573608, + "logits/rejected": -0.24950741231441498, + "logps/chosen": -64.2018814086914, + "logps/rejected": -78.16490936279297, + "loss": 0.8287, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8550612926483154, + "rewards/margins": 4.329134941101074, + "rewards/rejected": -1.474074125289917, + "step": 1685 + }, + { + "epoch": 0.42, + "grad_norm": 8.930347442626953, + "learning_rate": 8.172725767096719e-06, + "logits/chosen": -0.0940350815653801, + "logits/rejected": -0.23282137513160706, + "logps/chosen": -64.151123046875, + "logps/rejected": -73.69303131103516, + "loss": 0.9191, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.111438035964966, + "rewards/margins": 4.299016952514648, + "rewards/rejected": -1.187578797340393, + "step": 1686 + }, + { + "epoch": 0.42, + "grad_norm": 4.332763671875, + "learning_rate": 8.170700571585249e-06, + "logits/chosen": -0.2638089060783386, + "logits/rejected": -0.35509616136550903, + "logps/chosen": -47.192970275878906, + "logps/rejected": -81.46739196777344, + "loss": 0.8459, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7235164642333984, + "rewards/margins": 3.7236971855163574, + "rewards/rejected": -1.0001804828643799, + "step": 1687 + }, + { + "epoch": 0.42, + "grad_norm": 3.427142858505249, + "learning_rate": 8.168674505648055e-06, + "logits/chosen": -0.09676828235387802, + "logits/rejected": -0.21820016205310822, + "logps/chosen": -52.809410095214844, + "logps/rejected": -75.20084381103516, + "loss": 0.7017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5416393280029297, + "rewards/margins": 3.8308846950531006, + "rewards/rejected": -1.289245367050171, + "step": 1688 + }, + { + "epoch": 0.42, + "grad_norm": 9.35930061340332, + "learning_rate": 8.166647569841333e-06, + "logits/chosen": -0.1590537428855896, + "logits/rejected": -0.2777903378009796, + "logps/chosen": -69.90304565429688, + "logps/rejected": -80.23286437988281, + "loss": 0.8932, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6914103031158447, + "rewards/margins": 2.9687387943267822, + "rewards/rejected": -0.27732858061790466, + "step": 1689 + }, + { + "epoch": 0.42, + "grad_norm": 5.927656173706055, + "learning_rate": 8.164619764721523e-06, + "logits/chosen": -0.11923323571681976, + "logits/rejected": -0.23526349663734436, + "logps/chosen": -58.14073181152344, + "logps/rejected": -74.97148895263672, + "loss": 0.8357, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.732865810394287, + "rewards/margins": 3.9838106632232666, + "rewards/rejected": -1.250944972038269, + "step": 1690 + }, + { + "epoch": 0.42, + "grad_norm": 4.965846538543701, + "learning_rate": 8.162591090845299e-06, + "logits/chosen": -0.21992744505405426, + "logits/rejected": -0.22714218497276306, + "logps/chosen": -52.213069915771484, + "logps/rejected": -80.19025421142578, + "loss": 0.8278, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8695147037506104, + "rewards/margins": 4.036870002746582, + "rewards/rejected": -1.1673552989959717, + "step": 1691 + }, + { + "epoch": 0.42, + "grad_norm": 6.059340000152588, + "learning_rate": 8.16056154876958e-06, + "logits/chosen": -0.1401171088218689, + "logits/rejected": -0.24124042689800262, + "logps/chosen": -57.215179443359375, + "logps/rejected": -66.35662841796875, + "loss": 0.8281, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.882066011428833, + "rewards/margins": 3.315901279449463, + "rewards/rejected": -0.433835506439209, + "step": 1692 + }, + { + "epoch": 0.42, + "grad_norm": 4.334751605987549, + "learning_rate": 8.158531139051515e-06, + "logits/chosen": -0.22339364886283875, + "logits/rejected": -0.32584965229034424, + "logps/chosen": -52.07951354980469, + "logps/rejected": -71.54090881347656, + "loss": 0.7463, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0368564128875732, + "rewards/margins": 3.7687594890594482, + "rewards/rejected": -0.7319029569625854, + "step": 1693 + }, + { + "epoch": 0.42, + "grad_norm": 4.309633255004883, + "learning_rate": 8.156499862248498e-06, + "logits/chosen": -0.1994941532611847, + "logits/rejected": -0.3042319416999817, + "logps/chosen": -58.054161071777344, + "logps/rejected": -72.05913543701172, + "loss": 0.801, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9305951595306396, + "rewards/margins": 3.883110761642456, + "rewards/rejected": -0.9525156617164612, + "step": 1694 + }, + { + "epoch": 0.42, + "grad_norm": 5.172971725463867, + "learning_rate": 8.154467718918155e-06, + "logits/chosen": -0.21552424132823944, + "logits/rejected": -0.3127270042896271, + "logps/chosen": -47.904205322265625, + "logps/rejected": -67.64049530029297, + "loss": 0.7955, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8804614543914795, + "rewards/margins": 3.838758945465088, + "rewards/rejected": -0.9582973718643188, + "step": 1695 + }, + { + "epoch": 0.42, + "grad_norm": 9.407977104187012, + "learning_rate": 8.152434709618355e-06, + "logits/chosen": -0.15464092791080475, + "logits/rejected": -0.25175347924232483, + "logps/chosen": -67.8198471069336, + "logps/rejected": -72.63764190673828, + "loss": 0.9642, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6274428367614746, + "rewards/margins": 2.95388126373291, + "rewards/rejected": -0.32643821835517883, + "step": 1696 + }, + { + "epoch": 0.42, + "grad_norm": 6.763296604156494, + "learning_rate": 8.150400834907202e-06, + "logits/chosen": -0.21342875063419342, + "logits/rejected": -0.2929603159427643, + "logps/chosen": -59.930233001708984, + "logps/rejected": -69.91062927246094, + "loss": 0.9046, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8481996059417725, + "rewards/margins": 3.137054920196533, + "rewards/rejected": -0.2888551950454712, + "step": 1697 + }, + { + "epoch": 0.42, + "grad_norm": 3.666844129562378, + "learning_rate": 8.148366095343043e-06, + "logits/chosen": -0.18671229481697083, + "logits/rejected": -0.29344239830970764, + "logps/chosen": -57.9406623840332, + "logps/rejected": -72.86033630371094, + "loss": 0.7917, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6593713760375977, + "rewards/margins": 2.8687329292297363, + "rewards/rejected": -0.20936152338981628, + "step": 1698 + }, + { + "epoch": 0.43, + "grad_norm": 4.286396503448486, + "learning_rate": 8.146330491484453e-06, + "logits/chosen": -0.19155153632164001, + "logits/rejected": -0.3029884994029999, + "logps/chosen": -55.381187438964844, + "logps/rejected": -66.71578979492188, + "loss": 0.763, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876697540283203, + "rewards/margins": 3.260279417037964, + "rewards/rejected": -0.3835817575454712, + "step": 1699 + }, + { + "epoch": 0.43, + "grad_norm": 3.9742791652679443, + "learning_rate": 8.144294023890249e-06, + "logits/chosen": -0.13399763405323029, + "logits/rejected": -0.26902440190315247, + "logps/chosen": -64.16744995117188, + "logps/rejected": -87.30210876464844, + "loss": 0.7659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.372387409210205, + "rewards/margins": 3.4585039615631104, + "rewards/rejected": -1.0861163139343262, + "step": 1700 + }, + { + "epoch": 0.43, + "grad_norm": 4.1638102531433105, + "learning_rate": 8.142256693119488e-06, + "logits/chosen": -0.19208085536956787, + "logits/rejected": -0.3864471912384033, + "logps/chosen": -68.47747039794922, + "logps/rejected": -71.67830657958984, + "loss": 0.7282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6737024784088135, + "rewards/margins": 4.0711212158203125, + "rewards/rejected": -1.39741849899292, + "step": 1701 + }, + { + "epoch": 0.43, + "grad_norm": 8.499794006347656, + "learning_rate": 8.140218499731461e-06, + "logits/chosen": -0.15578624606132507, + "logits/rejected": -0.19391755759716034, + "logps/chosen": -61.124027252197266, + "logps/rejected": -83.65269470214844, + "loss": 0.9697, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.93253231048584, + "rewards/margins": 2.1536664962768555, + "rewards/rejected": 0.7788663506507874, + "step": 1702 + }, + { + "epoch": 0.43, + "grad_norm": 4.462421417236328, + "learning_rate": 8.138179444285695e-06, + "logits/chosen": -0.10953328758478165, + "logits/rejected": -0.22797791659832, + "logps/chosen": -59.961219787597656, + "logps/rejected": -77.29109191894531, + "loss": 0.7251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6438417434692383, + "rewards/margins": 4.063980579376221, + "rewards/rejected": -1.4201388359069824, + "step": 1703 + }, + { + "epoch": 0.43, + "grad_norm": 5.649763584136963, + "learning_rate": 8.136139527341954e-06, + "logits/chosen": -0.21413670480251312, + "logits/rejected": -0.37460947036743164, + "logps/chosen": -69.7454605102539, + "logps/rejected": -63.304710388183594, + "loss": 0.9263, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7294745445251465, + "rewards/margins": 3.6648104190826416, + "rewards/rejected": -0.9353357553482056, + "step": 1704 + }, + { + "epoch": 0.43, + "grad_norm": 4.363309383392334, + "learning_rate": 8.134098749460239e-06, + "logits/chosen": -0.2566072344779968, + "logits/rejected": -0.33514755964279175, + "logps/chosen": -60.94669723510742, + "logps/rejected": -81.8026123046875, + "loss": 0.8255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.871917247772217, + "rewards/margins": 3.798466444015503, + "rewards/rejected": -0.9265488982200623, + "step": 1705 + }, + { + "epoch": 0.43, + "grad_norm": 7.401480674743652, + "learning_rate": 8.132057111200791e-06, + "logits/chosen": -0.19446244835853577, + "logits/rejected": -0.2752099335193634, + "logps/chosen": -61.37411880493164, + "logps/rejected": -73.33264923095703, + "loss": 0.9372, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3788695335388184, + "rewards/margins": 2.709221363067627, + "rewards/rejected": -0.3303518295288086, + "step": 1706 + }, + { + "epoch": 0.43, + "grad_norm": 3.2075469493865967, + "learning_rate": 8.130014613124082e-06, + "logits/chosen": -0.2195395827293396, + "logits/rejected": -0.33315330743789673, + "logps/chosen": -50.900150299072266, + "logps/rejected": -69.20574188232422, + "loss": 0.8044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9157488346099854, + "rewards/margins": 4.464025020599365, + "rewards/rejected": -1.548276424407959, + "step": 1707 + }, + { + "epoch": 0.43, + "grad_norm": 7.511850833892822, + "learning_rate": 8.12797125579082e-06, + "logits/chosen": -0.11716538667678833, + "logits/rejected": -0.19297900795936584, + "logps/chosen": -66.28681182861328, + "logps/rejected": -72.90188598632812, + "loss": 0.9689, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.911339282989502, + "rewards/margins": 3.472588062286377, + "rewards/rejected": -0.5612492561340332, + "step": 1708 + }, + { + "epoch": 0.43, + "grad_norm": 2.6581664085388184, + "learning_rate": 8.125927039761953e-06, + "logits/chosen": -0.20528613030910492, + "logits/rejected": -0.27849969267845154, + "logps/chosen": -54.05965042114258, + "logps/rejected": -91.14088439941406, + "loss": 0.7231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0256338119506836, + "rewards/margins": 4.599452018737793, + "rewards/rejected": -1.5738179683685303, + "step": 1709 + }, + { + "epoch": 0.43, + "grad_norm": 3.334433078765869, + "learning_rate": 8.12388196559866e-06, + "logits/chosen": -0.3173573613166809, + "logits/rejected": -0.3875806927680969, + "logps/chosen": -48.29175567626953, + "logps/rejected": -75.14707946777344, + "loss": 0.7083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089313268661499, + "rewards/margins": 4.167947769165039, + "rewards/rejected": -1.0786350965499878, + "step": 1710 + }, + { + "epoch": 0.43, + "grad_norm": 2.4684066772460938, + "learning_rate": 8.12183603386236e-06, + "logits/chosen": -0.15386073291301727, + "logits/rejected": -0.28628385066986084, + "logps/chosen": -61.97562026977539, + "logps/rejected": -74.00978088378906, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0464789867401123, + "rewards/margins": 4.233894348144531, + "rewards/rejected": -1.1874147653579712, + "step": 1711 + }, + { + "epoch": 0.43, + "grad_norm": 7.251490592956543, + "learning_rate": 8.119789245114704e-06, + "logits/chosen": -0.10139745473861694, + "logits/rejected": -0.26898297667503357, + "logps/chosen": -68.35736083984375, + "logps/rejected": -76.30107879638672, + "loss": 0.9224, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8154430389404297, + "rewards/margins": 3.8633463382720947, + "rewards/rejected": -1.0479029417037964, + "step": 1712 + }, + { + "epoch": 0.43, + "grad_norm": 7.702066421508789, + "learning_rate": 8.117741599917584e-06, + "logits/chosen": -0.12059536576271057, + "logits/rejected": -0.22826611995697021, + "logps/chosen": -64.21329498291016, + "logps/rejected": -84.8667221069336, + "loss": 0.8998, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.801651954650879, + "rewards/margins": 3.0604419708251953, + "rewards/rejected": -0.2587902545928955, + "step": 1713 + }, + { + "epoch": 0.43, + "grad_norm": 5.397154331207275, + "learning_rate": 8.115693098833116e-06, + "logits/chosen": -0.21344994008541107, + "logits/rejected": -0.2796032428741455, + "logps/chosen": -75.9455795288086, + "logps/rejected": -78.83604431152344, + "loss": 1.0107, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9858884811401367, + "rewards/margins": 2.7245335578918457, + "rewards/rejected": 0.261354923248291, + "step": 1714 + }, + { + "epoch": 0.43, + "grad_norm": 2.8690271377563477, + "learning_rate": 8.11364374242366e-06, + "logits/chosen": -0.2381945550441742, + "logits/rejected": -0.3524559438228607, + "logps/chosen": -60.2452392578125, + "logps/rejected": -63.78712463378906, + "loss": 0.7179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9568471908569336, + "rewards/margins": 3.9674766063690186, + "rewards/rejected": -1.010629415512085, + "step": 1715 + }, + { + "epoch": 0.43, + "grad_norm": 3.665308952331543, + "learning_rate": 8.111593531251812e-06, + "logits/chosen": -0.26027169823646545, + "logits/rejected": -0.39342111349105835, + "logps/chosen": -45.57676315307617, + "logps/rejected": -65.48416137695312, + "loss": 0.8375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.070686101913452, + "rewards/margins": 3.851825475692749, + "rewards/rejected": -0.7811394929885864, + "step": 1716 + }, + { + "epoch": 0.43, + "grad_norm": 3.1760263442993164, + "learning_rate": 8.109542465880395e-06, + "logits/chosen": -0.19718235731124878, + "logits/rejected": -0.3388790190219879, + "logps/chosen": -52.42033004760742, + "logps/rejected": -71.16621398925781, + "loss": 0.674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.785691261291504, + "rewards/margins": 4.2183027267456055, + "rewards/rejected": -1.4326117038726807, + "step": 1717 + }, + { + "epoch": 0.43, + "grad_norm": 4.465742588043213, + "learning_rate": 8.107490546872475e-06, + "logits/chosen": -0.20224322378635406, + "logits/rejected": -0.3716127574443817, + "logps/chosen": -53.628807067871094, + "logps/rejected": -60.167572021484375, + "loss": 0.7999, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8213422298431396, + "rewards/margins": 4.2322678565979, + "rewards/rejected": -1.4109256267547607, + "step": 1718 + }, + { + "epoch": 0.43, + "grad_norm": 4.985026836395264, + "learning_rate": 8.105437774791345e-06, + "logits/chosen": -0.2136719524860382, + "logits/rejected": -0.21119678020477295, + "logps/chosen": -50.8854866027832, + "logps/rejected": -80.77832794189453, + "loss": 0.8137, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.909759044647217, + "rewards/margins": 2.49530291557312, + "rewards/rejected": 0.4144561290740967, + "step": 1719 + }, + { + "epoch": 0.43, + "grad_norm": 5.258267402648926, + "learning_rate": 8.103384150200535e-06, + "logits/chosen": -0.07340633869171143, + "logits/rejected": -0.24971653521060944, + "logps/chosen": -59.03742980957031, + "logps/rejected": -71.9974136352539, + "loss": 0.8396, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5780251026153564, + "rewards/margins": 3.1746888160705566, + "rewards/rejected": -0.5966635942459106, + "step": 1720 + }, + { + "epoch": 0.43, + "grad_norm": 4.601618766784668, + "learning_rate": 8.10132967366381e-06, + "logits/chosen": -0.18154460191726685, + "logits/rejected": -0.23755112290382385, + "logps/chosen": -64.42005920410156, + "logps/rejected": -78.50260162353516, + "loss": 0.8912, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.839186191558838, + "rewards/margins": 2.7750675678253174, + "rewards/rejected": 0.06411843746900558, + "step": 1721 + }, + { + "epoch": 0.43, + "grad_norm": 4.4973344802856445, + "learning_rate": 8.099274345745165e-06, + "logits/chosen": -0.09521211683750153, + "logits/rejected": -0.2756202220916748, + "logps/chosen": -62.65028762817383, + "logps/rejected": -82.45591735839844, + "loss": 0.7835, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.704592704772949, + "rewards/margins": 3.6071698665618896, + "rewards/rejected": -0.9025770425796509, + "step": 1722 + }, + { + "epoch": 0.43, + "grad_norm": 6.707269191741943, + "learning_rate": 8.09721816700884e-06, + "logits/chosen": -0.2220381647348404, + "logits/rejected": -0.2915833592414856, + "logps/chosen": -52.142234802246094, + "logps/rejected": -83.8537368774414, + "loss": 0.9827, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8481500148773193, + "rewards/margins": 3.0001208782196045, + "rewards/rejected": -0.15197083353996277, + "step": 1723 + }, + { + "epoch": 0.43, + "grad_norm": 3.5791819095611572, + "learning_rate": 8.095161138019294e-06, + "logits/chosen": -0.25896990299224854, + "logits/rejected": -0.35649409890174866, + "logps/chosen": -54.60301208496094, + "logps/rejected": -65.0859146118164, + "loss": 0.7925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.975277900695801, + "rewards/margins": 4.348419666290283, + "rewards/rejected": -1.3731417655944824, + "step": 1724 + }, + { + "epoch": 0.43, + "grad_norm": 4.385493278503418, + "learning_rate": 8.093103259341226e-06, + "logits/chosen": -0.17723222076892853, + "logits/rejected": -0.2388399839401245, + "logps/chosen": -55.94923782348633, + "logps/rejected": -76.82205963134766, + "loss": 0.8449, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8591880798339844, + "rewards/margins": 3.0635530948638916, + "rewards/rejected": -0.2043646275997162, + "step": 1725 + }, + { + "epoch": 0.43, + "grad_norm": 3.274613380432129, + "learning_rate": 8.09104453153957e-06, + "logits/chosen": -0.16258403658866882, + "logits/rejected": -0.3091995418071747, + "logps/chosen": -61.685821533203125, + "logps/rejected": -75.27279663085938, + "loss": 0.7372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9299066066741943, + "rewards/margins": 4.467824935913086, + "rewards/rejected": -1.5379185676574707, + "step": 1726 + }, + { + "epoch": 0.43, + "grad_norm": 3.7845041751861572, + "learning_rate": 8.088984955179491e-06, + "logits/chosen": -0.18249636888504028, + "logits/rejected": -0.2753666043281555, + "logps/chosen": -57.07945251464844, + "logps/rejected": -70.5691146850586, + "loss": 0.8193, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7790679931640625, + "rewards/margins": 3.9052200317382812, + "rewards/rejected": -1.1261520385742188, + "step": 1727 + }, + { + "epoch": 0.43, + "grad_norm": 4.4517388343811035, + "learning_rate": 8.086924530826386e-06, + "logits/chosen": -0.15041032433509827, + "logits/rejected": -0.24343450367450714, + "logps/chosen": -67.27946472167969, + "logps/rejected": -89.1932373046875, + "loss": 0.8113, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8910460472106934, + "rewards/margins": 3.4053263664245605, + "rewards/rejected": -0.514280378818512, + "step": 1728 + }, + { + "epoch": 0.43, + "grad_norm": 4.291643142700195, + "learning_rate": 8.084863259045887e-06, + "logits/chosen": -0.2407596856355667, + "logits/rejected": -0.37954291701316833, + "logps/chosen": -62.278682708740234, + "logps/rejected": -61.111976623535156, + "loss": 0.8137, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.550132989883423, + "rewards/margins": 3.4755442142486572, + "rewards/rejected": -0.9254113435745239, + "step": 1729 + }, + { + "epoch": 0.43, + "grad_norm": 4.443802356719971, + "learning_rate": 8.082801140403857e-06, + "logits/chosen": -0.2207341194152832, + "logits/rejected": -0.31092047691345215, + "logps/chosen": -61.0257568359375, + "logps/rejected": -75.46858978271484, + "loss": 0.7844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.197813034057617, + "rewards/margins": 4.207949638366699, + "rewards/rejected": -1.0101358890533447, + "step": 1730 + }, + { + "epoch": 0.43, + "grad_norm": 4.602757453918457, + "learning_rate": 8.080738175466395e-06, + "logits/chosen": -0.12699653208255768, + "logits/rejected": -0.23769059777259827, + "logps/chosen": -57.23815155029297, + "logps/rejected": -68.70140838623047, + "loss": 0.8276, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6032211780548096, + "rewards/margins": 2.8493380546569824, + "rewards/rejected": -0.2461167722940445, + "step": 1731 + }, + { + "epoch": 0.43, + "grad_norm": 6.2059783935546875, + "learning_rate": 8.078674364799823e-06, + "logits/chosen": -0.2280762791633606, + "logits/rejected": -0.3157624304294586, + "logps/chosen": -57.356842041015625, + "logps/rejected": -70.17658233642578, + "loss": 0.8155, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.851489543914795, + "rewards/margins": 3.1800012588500977, + "rewards/rejected": -0.3285115361213684, + "step": 1732 + }, + { + "epoch": 0.43, + "grad_norm": 5.845063209533691, + "learning_rate": 8.076609708970708e-06, + "logits/chosen": -0.14570346474647522, + "logits/rejected": -0.3236366808414459, + "logps/chosen": -62.762725830078125, + "logps/rejected": -68.83012390136719, + "loss": 0.8181, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.724526882171631, + "rewards/margins": 4.214263916015625, + "rewards/rejected": -1.4897371530532837, + "step": 1733 + }, + { + "epoch": 0.43, + "grad_norm": 5.0127387046813965, + "learning_rate": 8.07454420854584e-06, + "logits/chosen": -0.20255817472934723, + "logits/rejected": -0.30203065276145935, + "logps/chosen": -56.521121978759766, + "logps/rejected": -77.82000732421875, + "loss": 0.8895, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.589146614074707, + "rewards/margins": 4.152599334716797, + "rewards/rejected": -1.5634527206420898, + "step": 1734 + }, + { + "epoch": 0.43, + "grad_norm": 4.099724769592285, + "learning_rate": 8.072477864092241e-06, + "logits/chosen": -0.26259034872055054, + "logits/rejected": -0.3538839817047119, + "logps/chosen": -58.37925720214844, + "logps/rejected": -61.095420837402344, + "loss": 0.8324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7691757678985596, + "rewards/margins": 3.091416358947754, + "rewards/rejected": -0.32224041223526, + "step": 1735 + }, + { + "epoch": 0.43, + "grad_norm": 2.4410557746887207, + "learning_rate": 8.070410676177171e-06, + "logits/chosen": -0.19941604137420654, + "logits/rejected": -0.34043821692466736, + "logps/chosen": -59.25009536743164, + "logps/rejected": -64.36759185791016, + "loss": 0.6972, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0335114002227783, + "rewards/margins": 4.505105972290039, + "rewards/rejected": -1.4715938568115234, + "step": 1736 + }, + { + "epoch": 0.43, + "grad_norm": 3.346625804901123, + "learning_rate": 8.068342645368114e-06, + "logits/chosen": -0.19727958738803864, + "logits/rejected": -0.345851868391037, + "logps/chosen": -59.2623291015625, + "logps/rejected": -69.61854553222656, + "loss": 0.7336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.628419876098633, + "rewards/margins": 4.028111457824707, + "rewards/rejected": -1.3996915817260742, + "step": 1737 + }, + { + "epoch": 0.43, + "grad_norm": 6.053718566894531, + "learning_rate": 8.066273772232796e-06, + "logits/chosen": -0.23989829421043396, + "logits/rejected": -0.310962975025177, + "logps/chosen": -56.08174514770508, + "logps/rejected": -72.07905578613281, + "loss": 0.9473, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7725634574890137, + "rewards/margins": 2.593630075454712, + "rewards/rejected": 0.17893311381340027, + "step": 1738 + }, + { + "epoch": 0.44, + "grad_norm": 5.45777702331543, + "learning_rate": 8.064204057339158e-06, + "logits/chosen": -0.27460968494415283, + "logits/rejected": -0.39610832929611206, + "logps/chosen": -57.93745803833008, + "logps/rejected": -73.06587982177734, + "loss": 0.796, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.949988842010498, + "rewards/margins": 4.043880939483643, + "rewards/rejected": -1.093892216682434, + "step": 1739 + }, + { + "epoch": 0.44, + "grad_norm": 4.9116740226745605, + "learning_rate": 8.062133501255388e-06, + "logits/chosen": -0.2887418270111084, + "logits/rejected": -0.3790338635444641, + "logps/chosen": -55.619293212890625, + "logps/rejected": -73.35218048095703, + "loss": 0.8069, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6246724128723145, + "rewards/margins": 3.887575387954712, + "rewards/rejected": -1.2629029750823975, + "step": 1740 + }, + { + "epoch": 0.44, + "grad_norm": 3.4925243854522705, + "learning_rate": 8.060062104549895e-06, + "logits/chosen": -0.1614515334367752, + "logits/rejected": -0.22973740100860596, + "logps/chosen": -51.16929244995117, + "logps/rejected": -84.38407135009766, + "loss": 0.7153, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.060431957244873, + "rewards/margins": 4.389358997344971, + "rewards/rejected": -1.3289270401000977, + "step": 1741 + }, + { + "epoch": 0.44, + "grad_norm": 5.2324371337890625, + "learning_rate": 8.057989867791326e-06, + "logits/chosen": -0.15746834874153137, + "logits/rejected": -0.25849828124046326, + "logps/chosen": -55.722801208496094, + "logps/rejected": -72.13438415527344, + "loss": 0.8202, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6842904090881348, + "rewards/margins": 3.5901918411254883, + "rewards/rejected": -0.9059015512466431, + "step": 1742 + }, + { + "epoch": 0.44, + "grad_norm": 6.12266206741333, + "learning_rate": 8.05591679154855e-06, + "logits/chosen": -0.16740311682224274, + "logits/rejected": -0.20063486695289612, + "logps/chosen": -63.604949951171875, + "logps/rejected": -83.60186004638672, + "loss": 0.7754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9790873527526855, + "rewards/margins": 3.8526546955108643, + "rewards/rejected": -0.8735675811767578, + "step": 1743 + }, + { + "epoch": 0.44, + "grad_norm": 3.375838041305542, + "learning_rate": 8.053842876390673e-06, + "logits/chosen": -0.2664903402328491, + "logits/rejected": -0.44947975873947144, + "logps/chosen": -65.44550323486328, + "logps/rejected": -66.93356323242188, + "loss": 0.8098, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8627126216888428, + "rewards/margins": 4.192815780639648, + "rewards/rejected": -1.3301030397415161, + "step": 1744 + }, + { + "epoch": 0.44, + "grad_norm": 6.3347296714782715, + "learning_rate": 8.051768122887029e-06, + "logits/chosen": -0.21366411447525024, + "logits/rejected": -0.37653812766075134, + "logps/chosen": -60.11049270629883, + "logps/rejected": -72.88800811767578, + "loss": 0.9086, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.698115587234497, + "rewards/margins": 3.301687717437744, + "rewards/rejected": -0.6035720109939575, + "step": 1745 + }, + { + "epoch": 0.44, + "grad_norm": 7.8712005615234375, + "learning_rate": 8.049692531607185e-06, + "logits/chosen": -0.26558807492256165, + "logits/rejected": -0.30334511399269104, + "logps/chosen": -53.73592758178711, + "logps/rejected": -94.5577621459961, + "loss": 0.77, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8889782428741455, + "rewards/margins": 4.759153366088867, + "rewards/rejected": -1.8701750040054321, + "step": 1746 + }, + { + "epoch": 0.44, + "grad_norm": 4.746574878692627, + "learning_rate": 8.047616103120933e-06, + "logits/chosen": -0.1845153570175171, + "logits/rejected": -0.240042582154274, + "logps/chosen": -47.09567642211914, + "logps/rejected": -87.19853210449219, + "loss": 0.7922, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6875414848327637, + "rewards/margins": 3.941659927368164, + "rewards/rejected": -1.2541186809539795, + "step": 1747 + }, + { + "epoch": 0.44, + "grad_norm": 6.868456840515137, + "learning_rate": 8.045538837998299e-06, + "logits/chosen": -0.19617000222206116, + "logits/rejected": -0.2976672649383545, + "logps/chosen": -54.73404312133789, + "logps/rejected": -79.41997528076172, + "loss": 0.8907, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6266486644744873, + "rewards/margins": 3.4718856811523438, + "rewards/rejected": -0.8452372550964355, + "step": 1748 + }, + { + "epoch": 0.44, + "grad_norm": 4.146646022796631, + "learning_rate": 8.043460736809537e-06, + "logits/chosen": -0.18252813816070557, + "logits/rejected": -0.2898898124694824, + "logps/chosen": -58.46680450439453, + "logps/rejected": -63.480491638183594, + "loss": 0.804, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.882822036743164, + "rewards/margins": 4.279842376708984, + "rewards/rejected": -1.3970203399658203, + "step": 1749 + }, + { + "epoch": 0.44, + "grad_norm": 4.939940452575684, + "learning_rate": 8.041381800125129e-06, + "logits/chosen": -0.12910196185112, + "logits/rejected": -0.2829244136810303, + "logps/chosen": -54.89267349243164, + "logps/rejected": -67.50167846679688, + "loss": 0.7291, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9697628021240234, + "rewards/margins": 3.661868095397949, + "rewards/rejected": -0.6921052932739258, + "step": 1750 + }, + { + "epoch": 0.44, + "grad_norm": 5.007732391357422, + "learning_rate": 8.03930202851579e-06, + "logits/chosen": -0.09815029799938202, + "logits/rejected": -0.27519285678863525, + "logps/chosen": -60.9217414855957, + "logps/rejected": -74.28987884521484, + "loss": 0.7432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6866979598999023, + "rewards/margins": 4.0924391746521, + "rewards/rejected": -1.4057410955429077, + "step": 1751 + }, + { + "epoch": 0.44, + "grad_norm": 5.430930137634277, + "learning_rate": 8.03722142255246e-06, + "logits/chosen": -0.20291094481945038, + "logits/rejected": -0.3292034864425659, + "logps/chosen": -57.62452697753906, + "logps/rejected": -79.68966674804688, + "loss": 0.7997, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.816448450088501, + "rewards/margins": 4.228440761566162, + "rewards/rejected": -1.4119926691055298, + "step": 1752 + }, + { + "epoch": 0.44, + "grad_norm": 5.677451133728027, + "learning_rate": 8.035139982806312e-06, + "logits/chosen": -0.20714552700519562, + "logits/rejected": -0.27105242013931274, + "logps/chosen": -70.59072875976562, + "logps/rejected": -72.93766021728516, + "loss": 1.0561, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.671372890472412, + "rewards/margins": 3.022932529449463, + "rewards/rejected": -0.35155993700027466, + "step": 1753 + }, + { + "epoch": 0.44, + "grad_norm": 2.3294975757598877, + "learning_rate": 8.033057709848745e-06, + "logits/chosen": -0.20135560631752014, + "logits/rejected": -0.28686073422431946, + "logps/chosen": -80.60452270507812, + "logps/rejected": -93.67655181884766, + "loss": 0.6509, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7183735370635986, + "rewards/margins": 4.998566627502441, + "rewards/rejected": -2.280193328857422, + "step": 1754 + }, + { + "epoch": 0.44, + "grad_norm": 4.621457576751709, + "learning_rate": 8.030974604251389e-06, + "logits/chosen": -0.26596182584762573, + "logits/rejected": -0.333158016204834, + "logps/chosen": -56.969261169433594, + "logps/rejected": -100.51460266113281, + "loss": 0.7911, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.894134283065796, + "rewards/margins": 4.141395092010498, + "rewards/rejected": -1.2472608089447021, + "step": 1755 + }, + { + "epoch": 0.44, + "grad_norm": 5.243577480316162, + "learning_rate": 8.0288906665861e-06, + "logits/chosen": -0.18753761053085327, + "logits/rejected": -0.3297141492366791, + "logps/chosen": -66.23593139648438, + "logps/rejected": -71.08926391601562, + "loss": 0.8029, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7849669456481934, + "rewards/margins": 4.052006721496582, + "rewards/rejected": -1.2670398950576782, + "step": 1756 + }, + { + "epoch": 0.44, + "grad_norm": 7.6964616775512695, + "learning_rate": 8.026805897424965e-06, + "logits/chosen": -0.20420829951763153, + "logits/rejected": -0.316916286945343, + "logps/chosen": -65.19314575195312, + "logps/rejected": -71.5252456665039, + "loss": 0.9007, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.703972816467285, + "rewards/margins": 3.8725709915161133, + "rewards/rejected": -1.168597936630249, + "step": 1757 + }, + { + "epoch": 0.44, + "grad_norm": 5.679409980773926, + "learning_rate": 8.024720297340299e-06, + "logits/chosen": -0.1448591649532318, + "logits/rejected": -0.2791324853897095, + "logps/chosen": -61.89936065673828, + "logps/rejected": -76.19466400146484, + "loss": 0.7965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8314931392669678, + "rewards/margins": 3.8800997734069824, + "rewards/rejected": -1.0486066341400146, + "step": 1758 + }, + { + "epoch": 0.44, + "grad_norm": 5.7674784660339355, + "learning_rate": 8.022633866904644e-06, + "logits/chosen": -0.17875732481479645, + "logits/rejected": -0.2655794322490692, + "logps/chosen": -54.5105094909668, + "logps/rejected": -72.00877380371094, + "loss": 0.964, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6123952865600586, + "rewards/margins": 3.1026196479797363, + "rewards/rejected": -0.49022412300109863, + "step": 1759 + }, + { + "epoch": 0.44, + "grad_norm": 5.468902587890625, + "learning_rate": 8.020546606690767e-06, + "logits/chosen": -0.22627171874046326, + "logits/rejected": -0.3437560200691223, + "logps/chosen": -60.88566589355469, + "logps/rejected": -77.1708755493164, + "loss": 0.7489, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.120708703994751, + "rewards/margins": 3.926508903503418, + "rewards/rejected": -0.8057996034622192, + "step": 1760 + }, + { + "epoch": 0.44, + "grad_norm": 5.278048515319824, + "learning_rate": 8.018458517271669e-06, + "logits/chosen": -0.21698588132858276, + "logits/rejected": -0.31799113750457764, + "logps/chosen": -48.6442756652832, + "logps/rejected": -71.10161590576172, + "loss": 0.7473, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0125365257263184, + "rewards/margins": 4.169010162353516, + "rewards/rejected": -1.156473159790039, + "step": 1761 + }, + { + "epoch": 0.44, + "grad_norm": 5.704358100891113, + "learning_rate": 8.016369599220572e-06, + "logits/chosen": -0.16124005615711212, + "logits/rejected": -0.3076803982257843, + "logps/chosen": -63.928348541259766, + "logps/rejected": -70.06449127197266, + "loss": 0.8386, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.573415756225586, + "rewards/margins": 2.576664686203003, + "rewards/rejected": -0.0032489895820617676, + "step": 1762 + }, + { + "epoch": 0.44, + "grad_norm": 5.514134883880615, + "learning_rate": 8.014279853110935e-06, + "logits/chosen": -0.17235508561134338, + "logits/rejected": -0.20416978001594543, + "logps/chosen": -55.850704193115234, + "logps/rejected": -73.12986755371094, + "loss": 0.8782, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9617626667022705, + "rewards/margins": 3.0015628337860107, + "rewards/rejected": -0.03980012983083725, + "step": 1763 + }, + { + "epoch": 0.44, + "grad_norm": 3.8114306926727295, + "learning_rate": 8.012189279516435e-06, + "logits/chosen": -0.2713758945465088, + "logits/rejected": -0.33627259731292725, + "logps/chosen": -53.74748992919922, + "logps/rejected": -79.73198699951172, + "loss": 0.7287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.18167781829834, + "rewards/margins": 4.0430378913879395, + "rewards/rejected": -0.8613599538803101, + "step": 1764 + }, + { + "epoch": 0.44, + "grad_norm": 10.939913749694824, + "learning_rate": 8.01009787901098e-06, + "logits/chosen": -0.17068253457546234, + "logits/rejected": -0.35474100708961487, + "logps/chosen": -53.62992858886719, + "logps/rejected": -67.81415557861328, + "loss": 0.7762, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6247854232788086, + "rewards/margins": 4.117390155792236, + "rewards/rejected": -1.4926047325134277, + "step": 1765 + }, + { + "epoch": 0.44, + "grad_norm": 4.648033142089844, + "learning_rate": 8.008005652168705e-06, + "logits/chosen": -0.23254191875457764, + "logits/rejected": -0.31928378343582153, + "logps/chosen": -52.12516403198242, + "logps/rejected": -83.34687805175781, + "loss": 0.6669, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7979576587677, + "rewards/margins": 4.628548622131348, + "rewards/rejected": -1.8305909633636475, + "step": 1766 + }, + { + "epoch": 0.44, + "grad_norm": 5.271640777587891, + "learning_rate": 8.00591259956397e-06, + "logits/chosen": -0.254818320274353, + "logits/rejected": -0.3822571635246277, + "logps/chosen": -54.98949432373047, + "logps/rejected": -76.6606674194336, + "loss": 0.8476, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4578464031219482, + "rewards/margins": 4.479788780212402, + "rewards/rejected": -2.021942138671875, + "step": 1767 + }, + { + "epoch": 0.44, + "grad_norm": 6.583390235900879, + "learning_rate": 8.003818721771364e-06, + "logits/chosen": -0.24667736887931824, + "logits/rejected": -0.38219791650772095, + "logps/chosen": -58.57258987426758, + "logps/rejected": -81.12173461914062, + "loss": 0.9837, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3115007877349854, + "rewards/margins": 3.432997226715088, + "rewards/rejected": -1.1214964389801025, + "step": 1768 + }, + { + "epoch": 0.44, + "grad_norm": 4.514747619628906, + "learning_rate": 8.001724019365701e-06, + "logits/chosen": -0.1853492558002472, + "logits/rejected": -0.27150458097457886, + "logps/chosen": -74.54023742675781, + "logps/rejected": -71.36424255371094, + "loss": 0.9373, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.695904493331909, + "rewards/margins": 3.2345056533813477, + "rewards/rejected": -0.5386010408401489, + "step": 1769 + }, + { + "epoch": 0.44, + "grad_norm": 3.5610828399658203, + "learning_rate": 7.999628492922022e-06, + "logits/chosen": -0.25261372327804565, + "logits/rejected": -0.3697715401649475, + "logps/chosen": -51.26913833618164, + "logps/rejected": -75.4597396850586, + "loss": 0.7148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4008588790893555, + "rewards/margins": 3.357830762863159, + "rewards/rejected": -0.9569716453552246, + "step": 1770 + }, + { + "epoch": 0.44, + "grad_norm": 9.74728775024414, + "learning_rate": 7.997532143015596e-06, + "logits/chosen": -0.2533348798751831, + "logits/rejected": -0.3348197937011719, + "logps/chosen": -53.369178771972656, + "logps/rejected": -69.98421478271484, + "loss": 0.7944, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.619368553161621, + "rewards/margins": 4.192420959472656, + "rewards/rejected": -1.5730525255203247, + "step": 1771 + }, + { + "epoch": 0.44, + "grad_norm": 7.9701128005981445, + "learning_rate": 7.995434970221915e-06, + "logits/chosen": -0.1971980333328247, + "logits/rejected": -0.2323532998561859, + "logps/chosen": -53.1256217956543, + "logps/rejected": -86.39873504638672, + "loss": 0.9918, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8042173385620117, + "rewards/margins": 2.7833430767059326, + "rewards/rejected": 0.020874381065368652, + "step": 1772 + }, + { + "epoch": 0.44, + "grad_norm": 3.8093411922454834, + "learning_rate": 7.9933369751167e-06, + "logits/chosen": -0.17181262373924255, + "logits/rejected": -0.2873757481575012, + "logps/chosen": -53.75655746459961, + "logps/rejected": -62.608646392822266, + "loss": 0.8344, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.014359712600708, + "rewards/margins": 3.6908066272735596, + "rewards/rejected": -0.6764466762542725, + "step": 1773 + }, + { + "epoch": 0.44, + "grad_norm": 4.753620624542236, + "learning_rate": 7.991238158275892e-06, + "logits/chosen": -0.21728166937828064, + "logits/rejected": -0.23698486387729645, + "logps/chosen": -59.34202194213867, + "logps/rejected": -80.15239715576172, + "loss": 1.0361, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.8567256927490234, + "rewards/margins": 2.5634729862213135, + "rewards/rejected": 0.29325249791145325, + "step": 1774 + }, + { + "epoch": 0.44, + "grad_norm": 3.113135814666748, + "learning_rate": 7.989138520275663e-06, + "logits/chosen": -0.2804839015007019, + "logits/rejected": -0.33641374111175537, + "logps/chosen": -49.31660842895508, + "logps/rejected": -80.72769927978516, + "loss": 0.724, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7891645431518555, + "rewards/margins": 4.0830159187316895, + "rewards/rejected": -1.293851375579834, + "step": 1775 + }, + { + "epoch": 0.44, + "grad_norm": 8.251978874206543, + "learning_rate": 7.987038061692412e-06, + "logits/chosen": -0.23247838020324707, + "logits/rejected": -0.3671422302722931, + "logps/chosen": -54.60115432739258, + "logps/rejected": -74.12772369384766, + "loss": 0.8624, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6770973205566406, + "rewards/margins": 3.8854317665100098, + "rewards/rejected": -1.20833420753479, + "step": 1776 + }, + { + "epoch": 0.44, + "grad_norm": 6.754026412963867, + "learning_rate": 7.984936783102755e-06, + "logits/chosen": -0.2485520839691162, + "logits/rejected": -0.39556610584259033, + "logps/chosen": -69.38522338867188, + "logps/rejected": -73.3214340209961, + "loss": 0.9843, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5912508964538574, + "rewards/margins": 2.645442247390747, + "rewards/rejected": -0.05419134348630905, + "step": 1777 + }, + { + "epoch": 0.44, + "grad_norm": 3.6722235679626465, + "learning_rate": 7.982834685083545e-06, + "logits/chosen": -0.2142447829246521, + "logits/rejected": -0.3597344756126404, + "logps/chosen": -62.02881622314453, + "logps/rejected": -70.57743072509766, + "loss": 0.8592, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1496286392211914, + "rewards/margins": 3.555525302886963, + "rewards/rejected": -0.40589678287506104, + "step": 1778 + }, + { + "epoch": 0.45, + "grad_norm": 3.379810094833374, + "learning_rate": 7.980731768211847e-06, + "logits/chosen": -0.23671334981918335, + "logits/rejected": -0.30385398864746094, + "logps/chosen": -49.630859375, + "logps/rejected": -74.79344177246094, + "loss": 0.7455, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.142608642578125, + "rewards/margins": 4.041141033172607, + "rewards/rejected": -0.8985327482223511, + "step": 1779 + }, + { + "epoch": 0.45, + "grad_norm": 5.186649799346924, + "learning_rate": 7.97862803306496e-06, + "logits/chosen": -0.29230690002441406, + "logits/rejected": -0.3847663998603821, + "logps/chosen": -50.85049819946289, + "logps/rejected": -67.43083953857422, + "loss": 0.7867, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9615015983581543, + "rewards/margins": 3.0560457706451416, + "rewards/rejected": -0.09454414248466492, + "step": 1780 + }, + { + "epoch": 0.45, + "grad_norm": 4.029677391052246, + "learning_rate": 7.976523480220403e-06, + "logits/chosen": -0.223796546459198, + "logits/rejected": -0.37169909477233887, + "logps/chosen": -50.72425079345703, + "logps/rejected": -74.15059661865234, + "loss": 0.7051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6622631549835205, + "rewards/margins": 3.6407384872436523, + "rewards/rejected": -0.9784751534461975, + "step": 1781 + }, + { + "epoch": 0.45, + "grad_norm": 4.066418647766113, + "learning_rate": 7.974418110255924e-06, + "logits/chosen": -0.2506306767463684, + "logits/rejected": -0.3636554479598999, + "logps/chosen": -50.440223693847656, + "logps/rejected": -69.27079772949219, + "loss": 0.7816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.852778434753418, + "rewards/margins": 3.4478533267974854, + "rewards/rejected": -0.5950744152069092, + "step": 1782 + }, + { + "epoch": 0.45, + "grad_norm": 3.740962266921997, + "learning_rate": 7.97231192374949e-06, + "logits/chosen": -0.19980651140213013, + "logits/rejected": -0.34064167737960815, + "logps/chosen": -54.269615173339844, + "logps/rejected": -70.67890930175781, + "loss": 0.7554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.695835828781128, + "rewards/margins": 3.8842854499816895, + "rewards/rejected": -1.1884493827819824, + "step": 1783 + }, + { + "epoch": 0.45, + "grad_norm": 3.3846724033355713, + "learning_rate": 7.970204921279296e-06, + "logits/chosen": -0.1799154132604599, + "logits/rejected": -0.2480798363685608, + "logps/chosen": -55.75230026245117, + "logps/rejected": -79.13446044921875, + "loss": 0.7508, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9937527179718018, + "rewards/margins": 3.862234115600586, + "rewards/rejected": -0.8684810996055603, + "step": 1784 + }, + { + "epoch": 0.45, + "grad_norm": 3.9113194942474365, + "learning_rate": 7.968097103423759e-06, + "logits/chosen": -0.23031604290008545, + "logits/rejected": -0.3549240827560425, + "logps/chosen": -47.82299041748047, + "logps/rejected": -64.1576919555664, + "loss": 0.7184, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8058407306671143, + "rewards/margins": 4.011551856994629, + "rewards/rejected": -1.205711007118225, + "step": 1785 + }, + { + "epoch": 0.45, + "grad_norm": 3.7651219367980957, + "learning_rate": 7.96598847076152e-06, + "logits/chosen": -0.16319303214550018, + "logits/rejected": -0.29017511010169983, + "logps/chosen": -54.704524993896484, + "logps/rejected": -70.44892883300781, + "loss": 0.8632, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8609962463378906, + "rewards/margins": 3.747863531112671, + "rewards/rejected": -0.8868668675422668, + "step": 1786 + }, + { + "epoch": 0.45, + "grad_norm": 2.971203565597534, + "learning_rate": 7.963879023871446e-06, + "logits/chosen": -0.1897648274898529, + "logits/rejected": -0.3034612536430359, + "logps/chosen": -52.50320816040039, + "logps/rejected": -77.567138671875, + "loss": 0.6582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0226523876190186, + "rewards/margins": 3.9730238914489746, + "rewards/rejected": -0.950371503829956, + "step": 1787 + }, + { + "epoch": 0.45, + "grad_norm": 3.4232773780822754, + "learning_rate": 7.961768763332624e-06, + "logits/chosen": -0.19640488922595978, + "logits/rejected": -0.3318410813808441, + "logps/chosen": -62.740692138671875, + "logps/rejected": -75.45805358886719, + "loss": 0.7456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3720808029174805, + "rewards/margins": 4.016732215881348, + "rewards/rejected": -1.6446514129638672, + "step": 1788 + }, + { + "epoch": 0.45, + "grad_norm": 4.112892150878906, + "learning_rate": 7.959657689724367e-06, + "logits/chosen": -0.17935135960578918, + "logits/rejected": -0.33189278841018677, + "logps/chosen": -59.4039306640625, + "logps/rejected": -71.46073150634766, + "loss": 0.8201, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6082067489624023, + "rewards/margins": 4.080207824707031, + "rewards/rejected": -1.472001075744629, + "step": 1789 + }, + { + "epoch": 0.45, + "grad_norm": 3.216372013092041, + "learning_rate": 7.957545803626208e-06, + "logits/chosen": -0.14917337894439697, + "logits/rejected": -0.26396358013153076, + "logps/chosen": -56.31501770019531, + "logps/rejected": -62.790557861328125, + "loss": 0.6926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0757570266723633, + "rewards/margins": 4.141897201538086, + "rewards/rejected": -1.0661406517028809, + "step": 1790 + }, + { + "epoch": 0.45, + "grad_norm": 4.556146621704102, + "learning_rate": 7.955433105617909e-06, + "logits/chosen": -0.21146361529827118, + "logits/rejected": -0.33524367213249207, + "logps/chosen": -49.376075744628906, + "logps/rejected": -79.64056396484375, + "loss": 0.7483, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8138341903686523, + "rewards/margins": 3.53389310836792, + "rewards/rejected": -0.7200589776039124, + "step": 1791 + }, + { + "epoch": 0.45, + "grad_norm": 2.753343343734741, + "learning_rate": 7.953319596279447e-06, + "logits/chosen": -0.2065325677394867, + "logits/rejected": -0.3700879216194153, + "logps/chosen": -51.18397521972656, + "logps/rejected": -63.930946350097656, + "loss": 0.6517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.907224178314209, + "rewards/margins": 3.8428986072540283, + "rewards/rejected": -0.9356745481491089, + "step": 1792 + }, + { + "epoch": 0.45, + "grad_norm": 3.4680581092834473, + "learning_rate": 7.951205276191032e-06, + "logits/chosen": -0.2882591784000397, + "logits/rejected": -0.3456876277923584, + "logps/chosen": -45.134544372558594, + "logps/rejected": -64.96417236328125, + "loss": 0.7392, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8982315063476562, + "rewards/margins": 3.6173954010009766, + "rewards/rejected": -0.7191638350486755, + "step": 1793 + }, + { + "epoch": 0.45, + "grad_norm": 3.35823655128479, + "learning_rate": 7.949090145933083e-06, + "logits/chosen": -0.19727592170238495, + "logits/rejected": -0.3068870007991791, + "logps/chosen": -59.778663635253906, + "logps/rejected": -62.991432189941406, + "loss": 0.781, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.048171043395996, + "rewards/margins": 3.756424903869629, + "rewards/rejected": -0.7082537412643433, + "step": 1794 + }, + { + "epoch": 0.45, + "grad_norm": 5.84950590133667, + "learning_rate": 7.946974206086254e-06, + "logits/chosen": -0.26962634921073914, + "logits/rejected": -0.3259945511817932, + "logps/chosen": -43.53786849975586, + "logps/rejected": -73.8988037109375, + "loss": 0.8384, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7775588035583496, + "rewards/margins": 3.0915889739990234, + "rewards/rejected": -0.3140302896499634, + "step": 1795 + }, + { + "epoch": 0.45, + "grad_norm": 5.115624904632568, + "learning_rate": 7.944857457231415e-06, + "logits/chosen": -0.2582470178604126, + "logits/rejected": -0.3732915222644806, + "logps/chosen": -55.00090789794922, + "logps/rejected": -64.00827026367188, + "loss": 0.9616, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.593506336212158, + "rewards/margins": 3.143202543258667, + "rewards/rejected": -0.5496964454650879, + "step": 1796 + }, + { + "epoch": 0.45, + "grad_norm": 2.3829472064971924, + "learning_rate": 7.942739899949658e-06, + "logits/chosen": -0.1464191973209381, + "logits/rejected": -0.25372758507728577, + "logps/chosen": -55.84355926513672, + "logps/rejected": -71.7178955078125, + "loss": 0.6614, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8423986434936523, + "rewards/margins": 4.2253828048706055, + "rewards/rejected": -1.3829842805862427, + "step": 1797 + }, + { + "epoch": 0.45, + "grad_norm": 4.649389266967773, + "learning_rate": 7.9406215348223e-06, + "logits/chosen": -0.1924501359462738, + "logits/rejected": -0.28146281838417053, + "logps/chosen": -55.06500244140625, + "logps/rejected": -89.57627868652344, + "loss": 0.8943, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8286523818969727, + "rewards/margins": 3.7351527214050293, + "rewards/rejected": -0.9065004587173462, + "step": 1798 + }, + { + "epoch": 0.45, + "grad_norm": 2.3903636932373047, + "learning_rate": 7.938502362430877e-06, + "logits/chosen": -0.18628892302513123, + "logits/rejected": -0.29852646589279175, + "logps/chosen": -53.59150695800781, + "logps/rejected": -75.9359359741211, + "loss": 0.6376, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7081871032714844, + "rewards/margins": 3.9569873809814453, + "rewards/rejected": -1.2487999200820923, + "step": 1799 + }, + { + "epoch": 0.45, + "grad_norm": 6.2723236083984375, + "learning_rate": 7.936382383357149e-06, + "logits/chosen": -0.15563161671161652, + "logits/rejected": -0.177275151014328, + "logps/chosen": -49.77653121948242, + "logps/rejected": -71.09188079833984, + "loss": 0.911, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.716541290283203, + "rewards/margins": 3.0844573974609375, + "rewards/rejected": -0.3679160475730896, + "step": 1800 + }, + { + "epoch": 0.45, + "grad_norm": 3.308387041091919, + "learning_rate": 7.934261598183093e-06, + "logits/chosen": -0.23422066867351532, + "logits/rejected": -0.24773190915584564, + "logps/chosen": -67.29147338867188, + "logps/rejected": -76.1103515625, + "loss": 0.7869, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8422963619232178, + "rewards/margins": 3.284893035888672, + "rewards/rejected": -0.44259676337242126, + "step": 1801 + }, + { + "epoch": 0.45, + "grad_norm": 5.3449273109436035, + "learning_rate": 7.932140007490911e-06, + "logits/chosen": -0.1663125604391098, + "logits/rejected": -0.3575603663921356, + "logps/chosen": -71.42021179199219, + "logps/rejected": -61.17017364501953, + "loss": 0.8945, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7158243656158447, + "rewards/margins": 3.37143874168396, + "rewards/rejected": -0.65561443567276, + "step": 1802 + }, + { + "epoch": 0.45, + "grad_norm": 5.164384365081787, + "learning_rate": 7.930017611863028e-06, + "logits/chosen": -0.19150716066360474, + "logits/rejected": -0.27599582076072693, + "logps/chosen": -58.62407302856445, + "logps/rejected": -84.3442611694336, + "loss": 0.7862, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.79618239402771, + "rewards/margins": 3.9000661373138428, + "rewards/rejected": -1.103883981704712, + "step": 1803 + }, + { + "epoch": 0.45, + "grad_norm": 4.307892799377441, + "learning_rate": 7.927894411882086e-06, + "logits/chosen": -0.21808978915214539, + "logits/rejected": -0.25061383843421936, + "logps/chosen": -55.427146911621094, + "logps/rejected": -74.36943054199219, + "loss": 0.822, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.907855987548828, + "rewards/margins": 3.449613332748413, + "rewards/rejected": -0.5417572855949402, + "step": 1804 + }, + { + "epoch": 0.45, + "grad_norm": 11.755049705505371, + "learning_rate": 7.925770408130948e-06, + "logits/chosen": -0.24120530486106873, + "logits/rejected": -0.2881939709186554, + "logps/chosen": -54.48896408081055, + "logps/rejected": -74.39122009277344, + "loss": 0.8442, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8587419986724854, + "rewards/margins": 3.2478833198547363, + "rewards/rejected": -0.38914114236831665, + "step": 1805 + }, + { + "epoch": 0.45, + "grad_norm": 3.6554994583129883, + "learning_rate": 7.9236456011927e-06, + "logits/chosen": -0.2303234487771988, + "logits/rejected": -0.35073286294937134, + "logps/chosen": -59.867244720458984, + "logps/rejected": -70.54106140136719, + "loss": 0.8178, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6976664066314697, + "rewards/margins": 4.290329933166504, + "rewards/rejected": -1.592663049697876, + "step": 1806 + }, + { + "epoch": 0.45, + "grad_norm": 10.030557632446289, + "learning_rate": 7.921519991650647e-06, + "logits/chosen": -0.18910491466522217, + "logits/rejected": -0.34018391370773315, + "logps/chosen": -59.162391662597656, + "logps/rejected": -59.90137481689453, + "loss": 0.9599, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7445921897888184, + "rewards/margins": 3.6528549194335938, + "rewards/rejected": -0.9082623720169067, + "step": 1807 + }, + { + "epoch": 0.45, + "grad_norm": 5.901556015014648, + "learning_rate": 7.919393580088317e-06, + "logits/chosen": -0.12783534824848175, + "logits/rejected": -0.3514983355998993, + "logps/chosen": -69.61103820800781, + "logps/rejected": -62.04985809326172, + "loss": 0.8936, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7503108978271484, + "rewards/margins": 3.4186840057373047, + "rewards/rejected": -0.6683732271194458, + "step": 1808 + }, + { + "epoch": 0.45, + "grad_norm": 3.6349260807037354, + "learning_rate": 7.917266367089451e-06, + "logits/chosen": -0.2177135944366455, + "logits/rejected": -0.3368738889694214, + "logps/chosen": -56.9092903137207, + "logps/rejected": -71.55924987792969, + "loss": 0.752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1022298336029053, + "rewards/margins": 4.253995418548584, + "rewards/rejected": -1.1517654657363892, + "step": 1809 + }, + { + "epoch": 0.45, + "grad_norm": 16.754179000854492, + "learning_rate": 7.915138353238018e-06, + "logits/chosen": -0.1757752150297165, + "logits/rejected": -0.2962260842323303, + "logps/chosen": -63.675758361816406, + "logps/rejected": -73.99021911621094, + "loss": 0.9794, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.537362575531006, + "rewards/margins": 3.50285005569458, + "rewards/rejected": -0.9654874801635742, + "step": 1810 + }, + { + "epoch": 0.45, + "grad_norm": 4.614346504211426, + "learning_rate": 7.913009539118204e-06, + "logits/chosen": -0.18797782063484192, + "logits/rejected": -0.2434568703174591, + "logps/chosen": -62.28858184814453, + "logps/rejected": -79.08220672607422, + "loss": 0.836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9903199672698975, + "rewards/margins": 3.3863067626953125, + "rewards/rejected": -0.3959863781929016, + "step": 1811 + }, + { + "epoch": 0.45, + "grad_norm": 9.957528114318848, + "learning_rate": 7.910879925314413e-06, + "logits/chosen": -0.17862805724143982, + "logits/rejected": -0.3028736412525177, + "logps/chosen": -58.39765167236328, + "logps/rejected": -66.98175048828125, + "loss": 0.8697, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.734787940979004, + "rewards/margins": 3.6849143505096436, + "rewards/rejected": -0.9501264691352844, + "step": 1812 + }, + { + "epoch": 0.45, + "grad_norm": 3.8188095092773438, + "learning_rate": 7.908749512411272e-06, + "logits/chosen": -0.15131433308124542, + "logits/rejected": -0.330276221036911, + "logps/chosen": -64.74278259277344, + "logps/rejected": -70.21223449707031, + "loss": 0.7137, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8975160121917725, + "rewards/margins": 3.928135395050049, + "rewards/rejected": -1.0306192636489868, + "step": 1813 + }, + { + "epoch": 0.45, + "grad_norm": 8.228631019592285, + "learning_rate": 7.906618300993623e-06, + "logits/chosen": -0.07352860271930695, + "logits/rejected": -0.20655950903892517, + "logps/chosen": -68.65430450439453, + "logps/rejected": -73.54530334472656, + "loss": 0.932, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7286009788513184, + "rewards/margins": 2.9789438247680664, + "rewards/rejected": -0.25034287571907043, + "step": 1814 + }, + { + "epoch": 0.45, + "grad_norm": 5.782932758331299, + "learning_rate": 7.90448629164653e-06, + "logits/chosen": -0.2087727040052414, + "logits/rejected": -0.32496172189712524, + "logps/chosen": -56.066978454589844, + "logps/rejected": -66.01978302001953, + "loss": 0.8966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5974628925323486, + "rewards/margins": 3.6892528533935547, + "rewards/rejected": -1.091789722442627, + "step": 1815 + }, + { + "epoch": 0.45, + "grad_norm": 4.103924751281738, + "learning_rate": 7.902353484955277e-06, + "logits/chosen": -0.22218915820121765, + "logits/rejected": -0.3558899462223053, + "logps/chosen": -52.485923767089844, + "logps/rejected": -68.96080780029297, + "loss": 0.7515, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6546783447265625, + "rewards/margins": 3.9873156547546387, + "rewards/rejected": -1.3326373100280762, + "step": 1816 + }, + { + "epoch": 0.45, + "grad_norm": 4.372605323791504, + "learning_rate": 7.900219881505365e-06, + "logits/chosen": -0.23783782124519348, + "logits/rejected": -0.29053449630737305, + "logps/chosen": -64.60260009765625, + "logps/rejected": -85.18112182617188, + "loss": 0.7895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.871121406555176, + "rewards/margins": 3.9368205070495605, + "rewards/rejected": -1.0656988620758057, + "step": 1817 + }, + { + "epoch": 0.45, + "grad_norm": 4.57191801071167, + "learning_rate": 7.898085481882513e-06, + "logits/chosen": -0.22122159600257874, + "logits/rejected": -0.32131749391555786, + "logps/chosen": -57.14823913574219, + "logps/rejected": -72.01346588134766, + "loss": 0.83, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.938863515853882, + "rewards/margins": 3.4989404678344727, + "rewards/rejected": -0.560076892375946, + "step": 1818 + }, + { + "epoch": 0.46, + "grad_norm": 23.430362701416016, + "learning_rate": 7.89595028667266e-06, + "logits/chosen": -0.17368490993976593, + "logits/rejected": -0.2704179883003235, + "logps/chosen": -62.90434265136719, + "logps/rejected": -84.50720977783203, + "loss": 1.1048, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8914878368377686, + "rewards/margins": 2.907367467880249, + "rewards/rejected": -0.015879839658737183, + "step": 1819 + }, + { + "epoch": 0.46, + "grad_norm": 7.44756555557251, + "learning_rate": 7.893814296461964e-06, + "logits/chosen": -0.24913789331912994, + "logits/rejected": -0.367879718542099, + "logps/chosen": -61.5194091796875, + "logps/rejected": -66.39997100830078, + "loss": 0.9297, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.821669816970825, + "rewards/margins": 4.277559280395508, + "rewards/rejected": -1.4558898210525513, + "step": 1820 + }, + { + "epoch": 0.46, + "grad_norm": 4.151988983154297, + "learning_rate": 7.891677511836799e-06, + "logits/chosen": -0.040311671793460846, + "logits/rejected": -0.20736216008663177, + "logps/chosen": -68.40013885498047, + "logps/rejected": -82.72209930419922, + "loss": 0.8368, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.936370372772217, + "rewards/margins": 4.453336238861084, + "rewards/rejected": -1.516965627670288, + "step": 1821 + }, + { + "epoch": 0.46, + "grad_norm": 4.387645721435547, + "learning_rate": 7.889539933383761e-06, + "logits/chosen": -0.13398411870002747, + "logits/rejected": -0.3013942837715149, + "logps/chosen": -64.92617797851562, + "logps/rejected": -63.54611587524414, + "loss": 0.8468, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.87394380569458, + "rewards/margins": 4.045333385467529, + "rewards/rejected": -1.1713895797729492, + "step": 1822 + }, + { + "epoch": 0.46, + "grad_norm": 5.516906261444092, + "learning_rate": 7.887401561689661e-06, + "logits/chosen": -0.19225548207759857, + "logits/rejected": -0.2898210883140564, + "logps/chosen": -64.20303344726562, + "logps/rejected": -75.28482818603516, + "loss": 0.9596, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.839552879333496, + "rewards/margins": 2.814361572265625, + "rewards/rejected": 0.025191277265548706, + "step": 1823 + }, + { + "epoch": 0.46, + "grad_norm": 3.6262543201446533, + "learning_rate": 7.885262397341524e-06, + "logits/chosen": -0.17538896203041077, + "logits/rejected": -0.2738551199436188, + "logps/chosen": -55.81664276123047, + "logps/rejected": -89.66853332519531, + "loss": 0.6896, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2430121898651123, + "rewards/margins": 3.8961591720581055, + "rewards/rejected": -0.6531472206115723, + "step": 1824 + }, + { + "epoch": 0.46, + "grad_norm": 8.846476554870605, + "learning_rate": 7.883122440926603e-06, + "logits/chosen": -0.20989055931568146, + "logits/rejected": -0.2654920816421509, + "logps/chosen": -59.29637908935547, + "logps/rejected": -80.7483901977539, + "loss": 0.9175, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.643254280090332, + "rewards/margins": 2.7066893577575684, + "rewards/rejected": -0.06343497335910797, + "step": 1825 + }, + { + "epoch": 0.46, + "grad_norm": 3.863471746444702, + "learning_rate": 7.880981693032357e-06, + "logits/chosen": -0.3254064619541168, + "logits/rejected": -0.3931344747543335, + "logps/chosen": -43.0505485534668, + "logps/rejected": -79.33193969726562, + "loss": 0.6391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.96414852142334, + "rewards/margins": 4.215088844299316, + "rewards/rejected": -1.2509406805038452, + "step": 1826 + }, + { + "epoch": 0.46, + "grad_norm": 5.153562068939209, + "learning_rate": 7.87884015424647e-06, + "logits/chosen": -0.12905918061733246, + "logits/rejected": -0.1944364607334137, + "logps/chosen": -62.724002838134766, + "logps/rejected": -86.22917175292969, + "loss": 0.8944, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6571297645568848, + "rewards/margins": 2.7033183574676514, + "rewards/rejected": -0.04618873819708824, + "step": 1827 + }, + { + "epoch": 0.46, + "grad_norm": 2.5489706993103027, + "learning_rate": 7.876697825156841e-06, + "logits/chosen": -0.23678390681743622, + "logits/rejected": -0.33380094170570374, + "logps/chosen": -64.52687072753906, + "logps/rejected": -87.2994384765625, + "loss": 0.81, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.027712106704712, + "rewards/margins": 4.706689834594727, + "rewards/rejected": -1.6789782047271729, + "step": 1828 + }, + { + "epoch": 0.46, + "grad_norm": 4.996366024017334, + "learning_rate": 7.874554706351585e-06, + "logits/chosen": -0.2648176848888397, + "logits/rejected": -0.3665645718574524, + "logps/chosen": -56.02958679199219, + "logps/rejected": -71.8882827758789, + "loss": 0.7595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.881730556488037, + "rewards/margins": 4.171236515045166, + "rewards/rejected": -1.289506196975708, + "step": 1829 + }, + { + "epoch": 0.46, + "grad_norm": 4.012654781341553, + "learning_rate": 7.872410798419033e-06, + "logits/chosen": -0.26015424728393555, + "logits/rejected": -0.32696646451950073, + "logps/chosen": -60.597625732421875, + "logps/rejected": -84.58758544921875, + "loss": 0.7894, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.511526107788086, + "rewards/margins": 3.6981239318847656, + "rewards/rejected": -1.1865979433059692, + "step": 1830 + }, + { + "epoch": 0.46, + "grad_norm": 3.923139810562134, + "learning_rate": 7.870266101947734e-06, + "logits/chosen": -0.193882554769516, + "logits/rejected": -0.27901315689086914, + "logps/chosen": -49.04669189453125, + "logps/rejected": -69.84347534179688, + "loss": 0.8295, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0039548873901367, + "rewards/margins": 3.665940523147583, + "rewards/rejected": -0.6619856357574463, + "step": 1831 + }, + { + "epoch": 0.46, + "grad_norm": 4.744539737701416, + "learning_rate": 7.868120617526456e-06, + "logits/chosen": -0.22052791714668274, + "logits/rejected": -0.2981380820274353, + "logps/chosen": -56.08686447143555, + "logps/rejected": -70.7549819946289, + "loss": 0.9072, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.820326805114746, + "rewards/margins": 3.145617961883545, + "rewards/rejected": -0.3252912163734436, + "step": 1832 + }, + { + "epoch": 0.46, + "grad_norm": 3.982541799545288, + "learning_rate": 7.865974345744181e-06, + "logits/chosen": -0.17611975967884064, + "logits/rejected": -0.25376462936401367, + "logps/chosen": -47.53276062011719, + "logps/rejected": -74.72203063964844, + "loss": 0.7325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7568252086639404, + "rewards/margins": 3.6438260078430176, + "rewards/rejected": -0.8870006203651428, + "step": 1833 + }, + { + "epoch": 0.46, + "grad_norm": 3.8013737201690674, + "learning_rate": 7.863827287190102e-06, + "logits/chosen": -0.18733970820903778, + "logits/rejected": -0.24653442203998566, + "logps/chosen": -47.777957916259766, + "logps/rejected": -84.87617492675781, + "loss": 0.7012, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.988046169281006, + "rewards/margins": 3.461707830429077, + "rewards/rejected": -0.4736618995666504, + "step": 1834 + }, + { + "epoch": 0.46, + "grad_norm": 4.471004009246826, + "learning_rate": 7.861679442453637e-06, + "logits/chosen": -0.1744895875453949, + "logits/rejected": -0.27415722608566284, + "logps/chosen": -62.5938720703125, + "logps/rejected": -92.50432586669922, + "loss": 0.7374, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2103188037872314, + "rewards/margins": 3.4610087871551514, + "rewards/rejected": -0.2506900429725647, + "step": 1835 + }, + { + "epoch": 0.46, + "grad_norm": 2.6822497844696045, + "learning_rate": 7.859530812124416e-06, + "logits/chosen": -0.255154013633728, + "logits/rejected": -0.3908950090408325, + "logps/chosen": -60.24970245361328, + "logps/rejected": -75.6225357055664, + "loss": 0.719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.059302806854248, + "rewards/margins": 4.511282920837402, + "rewards/rejected": -1.451980471611023, + "step": 1836 + }, + { + "epoch": 0.46, + "grad_norm": 2.587702989578247, + "learning_rate": 7.857381396792283e-06, + "logits/chosen": -0.302814245223999, + "logits/rejected": -0.4314301311969757, + "logps/chosen": -57.63097381591797, + "logps/rejected": -74.92828369140625, + "loss": 0.7728, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1024675369262695, + "rewards/margins": 4.5402936935424805, + "rewards/rejected": -1.4378262758255005, + "step": 1837 + }, + { + "epoch": 0.46, + "grad_norm": 7.06931734085083, + "learning_rate": 7.8552311970473e-06, + "logits/chosen": -0.13700200617313385, + "logits/rejected": -0.24876832962036133, + "logps/chosen": -59.09709930419922, + "logps/rejected": -85.31517028808594, + "loss": 0.7658, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7147438526153564, + "rewards/margins": 3.804847002029419, + "rewards/rejected": -1.0901029109954834, + "step": 1838 + }, + { + "epoch": 0.46, + "grad_norm": 4.4458184242248535, + "learning_rate": 7.85308021347974e-06, + "logits/chosen": -0.20226137340068817, + "logits/rejected": -0.36161473393440247, + "logps/chosen": -53.143985748291016, + "logps/rejected": -58.14398193359375, + "loss": 0.7186, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.754563331604004, + "rewards/margins": 4.310478687286377, + "rewards/rejected": -1.555915117263794, + "step": 1839 + }, + { + "epoch": 0.46, + "grad_norm": 3.7181639671325684, + "learning_rate": 7.850928446680099e-06, + "logits/chosen": -0.2748956084251404, + "logits/rejected": -0.3763326406478882, + "logps/chosen": -63.59806442260742, + "logps/rejected": -80.88626861572266, + "loss": 0.8992, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.93062686920166, + "rewards/margins": 4.2039313316345215, + "rewards/rejected": -1.27330482006073, + "step": 1840 + }, + { + "epoch": 0.46, + "grad_norm": 3.5482730865478516, + "learning_rate": 7.84877589723908e-06, + "logits/chosen": -0.19348359107971191, + "logits/rejected": -0.27629566192626953, + "logps/chosen": -54.5972900390625, + "logps/rejected": -68.09081268310547, + "loss": 0.7736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8856706619262695, + "rewards/margins": 3.9749393463134766, + "rewards/rejected": -1.0892685651779175, + "step": 1841 + }, + { + "epoch": 0.46, + "grad_norm": 5.473876953125, + "learning_rate": 7.846622565747606e-06, + "logits/chosen": -0.2656118869781494, + "logits/rejected": -0.261049747467041, + "logps/chosen": -59.04589080810547, + "logps/rejected": -79.97818756103516, + "loss": 1.0213, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.86342453956604, + "rewards/margins": 2.607069492340088, + "rewards/rejected": 0.2563549876213074, + "step": 1842 + }, + { + "epoch": 0.46, + "grad_norm": 3.757845640182495, + "learning_rate": 7.844468452796812e-06, + "logits/chosen": -0.1945541799068451, + "logits/rejected": -0.2652180790901184, + "logps/chosen": -53.63962936401367, + "logps/rejected": -90.26860046386719, + "loss": 0.719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.678623676300049, + "rewards/margins": 4.249658584594727, + "rewards/rejected": -1.5710351467132568, + "step": 1843 + }, + { + "epoch": 0.46, + "grad_norm": 4.812536716461182, + "learning_rate": 7.84231355897805e-06, + "logits/chosen": -0.15985733270645142, + "logits/rejected": -0.202803835272789, + "logps/chosen": -54.32985305786133, + "logps/rejected": -78.20576477050781, + "loss": 0.7893, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.989945650100708, + "rewards/margins": 3.9796712398529053, + "rewards/rejected": -0.9897255897521973, + "step": 1844 + }, + { + "epoch": 0.46, + "grad_norm": 3.717242479324341, + "learning_rate": 7.840157884882881e-06, + "logits/chosen": -0.19630809128284454, + "logits/rejected": -0.2984745502471924, + "logps/chosen": -54.14665222167969, + "logps/rejected": -73.63765716552734, + "loss": 0.6654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.071685791015625, + "rewards/margins": 4.118305206298828, + "rewards/rejected": -1.046619176864624, + "step": 1845 + }, + { + "epoch": 0.46, + "grad_norm": 4.3882365226745605, + "learning_rate": 7.83800143110309e-06, + "logits/chosen": -0.21467168629169464, + "logits/rejected": -0.33878612518310547, + "logps/chosen": -55.264915466308594, + "logps/rejected": -73.15159606933594, + "loss": 0.8404, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.856234550476074, + "rewards/margins": 4.102132320404053, + "rewards/rejected": -1.2458975315093994, + "step": 1846 + }, + { + "epoch": 0.46, + "grad_norm": 11.659061431884766, + "learning_rate": 7.835844198230664e-06, + "logits/chosen": -0.16680575907230377, + "logits/rejected": -0.2576248347759247, + "logps/chosen": -61.656410217285156, + "logps/rejected": -78.056640625, + "loss": 0.7911, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7870044708251953, + "rewards/margins": 3.3778600692749023, + "rewards/rejected": -0.5908557176589966, + "step": 1847 + }, + { + "epoch": 0.46, + "grad_norm": 5.020567893981934, + "learning_rate": 7.833686186857815e-06, + "logits/chosen": -0.21221792697906494, + "logits/rejected": -0.2487194836139679, + "logps/chosen": -72.42745208740234, + "logps/rejected": -83.52815246582031, + "loss": 0.9839, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4819138050079346, + "rewards/margins": 3.248063087463379, + "rewards/rejected": -0.7661494016647339, + "step": 1848 + }, + { + "epoch": 0.46, + "grad_norm": 3.610668659210205, + "learning_rate": 7.831527397576962e-06, + "logits/chosen": -0.2747556269168854, + "logits/rejected": -0.3419802784919739, + "logps/chosen": -58.57574462890625, + "logps/rejected": -84.8522720336914, + "loss": 0.8148, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.027791738510132, + "rewards/margins": 4.005544662475586, + "rewards/rejected": -0.9777527451515198, + "step": 1849 + }, + { + "epoch": 0.46, + "grad_norm": 3.893657922744751, + "learning_rate": 7.829367830980739e-06, + "logits/chosen": -0.1754305064678192, + "logits/rejected": -0.2662968635559082, + "logps/chosen": -54.16462707519531, + "logps/rejected": -70.52960968017578, + "loss": 0.7861, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8990602493286133, + "rewards/margins": 3.0391879081726074, + "rewards/rejected": -0.1401277780532837, + "step": 1850 + }, + { + "epoch": 0.46, + "grad_norm": 4.57757043838501, + "learning_rate": 7.827207487661992e-06, + "logits/chosen": -0.31416064500808716, + "logits/rejected": -0.39677631855010986, + "logps/chosen": -57.61139678955078, + "logps/rejected": -66.70502471923828, + "loss": 0.8455, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8787364959716797, + "rewards/margins": 3.585956573486328, + "rewards/rejected": -0.7072200179100037, + "step": 1851 + }, + { + "epoch": 0.46, + "grad_norm": 5.27242374420166, + "learning_rate": 7.825046368213782e-06, + "logits/chosen": -0.2812258303165436, + "logits/rejected": -0.39041030406951904, + "logps/chosen": -62.936824798583984, + "logps/rejected": -72.79993438720703, + "loss": 0.896, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4794504642486572, + "rewards/margins": 3.330573081970215, + "rewards/rejected": -0.851122260093689, + "step": 1852 + }, + { + "epoch": 0.46, + "grad_norm": 5.998598575592041, + "learning_rate": 7.822884473229387e-06, + "logits/chosen": -0.18860602378845215, + "logits/rejected": -0.28240764141082764, + "logps/chosen": -55.84553527832031, + "logps/rejected": -72.84410858154297, + "loss": 0.8645, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.58864688873291, + "rewards/margins": 3.4186766147613525, + "rewards/rejected": -0.8300291895866394, + "step": 1853 + }, + { + "epoch": 0.46, + "grad_norm": 3.8124780654907227, + "learning_rate": 7.82072180330229e-06, + "logits/chosen": -0.24873505532741547, + "logits/rejected": -0.33289027214050293, + "logps/chosen": -53.79332733154297, + "logps/rejected": -81.90797424316406, + "loss": 0.7467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9596405029296875, + "rewards/margins": 4.851418972015381, + "rewards/rejected": -1.8917778730392456, + "step": 1854 + }, + { + "epoch": 0.46, + "grad_norm": 3.1356253623962402, + "learning_rate": 7.818558359026192e-06, + "logits/chosen": -0.30155861377716064, + "logits/rejected": -0.34604334831237793, + "logps/chosen": -51.41407012939453, + "logps/rejected": -90.04000091552734, + "loss": 0.6511, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1514198780059814, + "rewards/margins": 4.3717546463012695, + "rewards/rejected": -1.220334529876709, + "step": 1855 + }, + { + "epoch": 0.46, + "grad_norm": 4.374295711517334, + "learning_rate": 7.816394140995004e-06, + "logits/chosen": -0.2773858308792114, + "logits/rejected": -0.28341394662857056, + "logps/chosen": -63.5758056640625, + "logps/rejected": -79.61814880371094, + "loss": 0.8728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7790825366973877, + "rewards/margins": 3.1555488109588623, + "rewards/rejected": -0.3764662742614746, + "step": 1856 + }, + { + "epoch": 0.46, + "grad_norm": 6.0836262702941895, + "learning_rate": 7.814229149802852e-06, + "logits/chosen": -0.26464658975601196, + "logits/rejected": -0.365934282541275, + "logps/chosen": -57.02216339111328, + "logps/rejected": -80.28082275390625, + "loss": 0.8284, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7903270721435547, + "rewards/margins": 4.193934440612793, + "rewards/rejected": -1.4036076068878174, + "step": 1857 + }, + { + "epoch": 0.46, + "grad_norm": 6.993725776672363, + "learning_rate": 7.812063386044071e-06, + "logits/chosen": -0.3247412443161011, + "logits/rejected": -0.38138800859451294, + "logps/chosen": -47.187076568603516, + "logps/rejected": -65.50738525390625, + "loss": 0.9292, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.437551498413086, + "rewards/margins": 2.9515256881713867, + "rewards/rejected": -0.5139741897583008, + "step": 1858 + }, + { + "epoch": 0.47, + "grad_norm": 5.070134162902832, + "learning_rate": 7.809896850313211e-06, + "logits/chosen": -0.22730834782123566, + "logits/rejected": -0.33061590790748596, + "logps/chosen": -60.315948486328125, + "logps/rejected": -76.99348449707031, + "loss": 0.8339, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6646883487701416, + "rewards/margins": 3.6141061782836914, + "rewards/rejected": -0.9494178295135498, + "step": 1859 + }, + { + "epoch": 0.47, + "grad_norm": 4.127013206481934, + "learning_rate": 7.807729543205035e-06, + "logits/chosen": -0.21829557418823242, + "logits/rejected": -0.3566858768463135, + "logps/chosen": -53.908538818359375, + "logps/rejected": -69.8206787109375, + "loss": 0.7946, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7555813789367676, + "rewards/margins": 3.919628858566284, + "rewards/rejected": -1.1640474796295166, + "step": 1860 + }, + { + "epoch": 0.47, + "grad_norm": 3.479271650314331, + "learning_rate": 7.805561465314512e-06, + "logits/chosen": -0.2877143323421478, + "logits/rejected": -0.38114237785339355, + "logps/chosen": -54.04362487792969, + "logps/rejected": -83.13837432861328, + "loss": 0.6954, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8996262550354004, + "rewards/margins": 4.73415994644165, + "rewards/rejected": -1.83453369140625, + "step": 1861 + }, + { + "epoch": 0.47, + "grad_norm": 4.57629919052124, + "learning_rate": 7.803392617236827e-06, + "logits/chosen": -0.13717879354953766, + "logits/rejected": -0.2876952886581421, + "logps/chosen": -67.24417114257812, + "logps/rejected": -82.197509765625, + "loss": 0.8578, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.628201961517334, + "rewards/margins": 3.7108750343322754, + "rewards/rejected": -1.082673192024231, + "step": 1862 + }, + { + "epoch": 0.47, + "grad_norm": 4.664823532104492, + "learning_rate": 7.801222999567377e-06, + "logits/chosen": -0.25327515602111816, + "logits/rejected": -0.3701566159725189, + "logps/chosen": -50.422698974609375, + "logps/rejected": -82.4579086303711, + "loss": 0.799, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7003426551818848, + "rewards/margins": 3.4948863983154297, + "rewards/rejected": -0.7945439219474792, + "step": 1863 + }, + { + "epoch": 0.47, + "grad_norm": 6.242748737335205, + "learning_rate": 7.799052612901767e-06, + "logits/chosen": -0.20528191328048706, + "logits/rejected": -0.30611342191696167, + "logps/chosen": -59.80344009399414, + "logps/rejected": -68.04634094238281, + "loss": 0.9467, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6799638271331787, + "rewards/margins": 3.3395516872406006, + "rewards/rejected": -0.6595878601074219, + "step": 1864 + }, + { + "epoch": 0.47, + "grad_norm": 4.908541679382324, + "learning_rate": 7.796881457835817e-06, + "logits/chosen": -0.2072538137435913, + "logits/rejected": -0.29564744234085083, + "logps/chosen": -55.290523529052734, + "logps/rejected": -85.57514953613281, + "loss": 0.7282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.840210199356079, + "rewards/margins": 4.263132095336914, + "rewards/rejected": -1.4229223728179932, + "step": 1865 + }, + { + "epoch": 0.47, + "grad_norm": 4.171896934509277, + "learning_rate": 7.794709534965555e-06, + "logits/chosen": -0.19726261496543884, + "logits/rejected": -0.288727343082428, + "logps/chosen": -52.635345458984375, + "logps/rejected": -77.31690979003906, + "loss": 0.7848, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078338623046875, + "rewards/margins": 4.328754425048828, + "rewards/rejected": -1.2504162788391113, + "step": 1866 + }, + { + "epoch": 0.47, + "grad_norm": 3.8946475982666016, + "learning_rate": 7.79253684488722e-06, + "logits/chosen": -0.21199779212474823, + "logits/rejected": -0.3312862515449524, + "logps/chosen": -47.53546905517578, + "logps/rejected": -68.2325210571289, + "loss": 0.7019, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.129328727722168, + "rewards/margins": 4.354333400726318, + "rewards/rejected": -1.2250044345855713, + "step": 1867 + }, + { + "epoch": 0.47, + "grad_norm": 7.674198627471924, + "learning_rate": 7.790363388197263e-06, + "logits/chosen": -0.2429915815591812, + "logits/rejected": -0.2889087200164795, + "logps/chosen": -57.808406829833984, + "logps/rejected": -83.23441314697266, + "loss": 0.9882, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4702999591827393, + "rewards/margins": 3.6332950592041016, + "rewards/rejected": -1.1629953384399414, + "step": 1868 + }, + { + "epoch": 0.47, + "grad_norm": 5.104580402374268, + "learning_rate": 7.788189165492344e-06, + "logits/chosen": -0.22196480631828308, + "logits/rejected": -0.32737797498703003, + "logps/chosen": -50.43860626220703, + "logps/rejected": -77.8105697631836, + "loss": 0.7029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8512749671936035, + "rewards/margins": 4.129324913024902, + "rewards/rejected": -1.278050184249878, + "step": 1869 + }, + { + "epoch": 0.47, + "grad_norm": 4.1199564933776855, + "learning_rate": 7.786014177369336e-06, + "logits/chosen": -0.2067677527666092, + "logits/rejected": -0.27525532245635986, + "logps/chosen": -49.5717658996582, + "logps/rejected": -86.8854751586914, + "loss": 0.6695, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.522919178009033, + "rewards/margins": 4.433608531951904, + "rewards/rejected": -1.9106892347335815, + "step": 1870 + }, + { + "epoch": 0.47, + "grad_norm": 6.482148170471191, + "learning_rate": 7.783838424425318e-06, + "logits/chosen": -0.18235723674297333, + "logits/rejected": -0.19513848423957825, + "logps/chosen": -54.286319732666016, + "logps/rejected": -77.86244201660156, + "loss": 0.88, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3240604400634766, + "rewards/margins": 3.3109612464904785, + "rewards/rejected": 0.013099439442157745, + "step": 1871 + }, + { + "epoch": 0.47, + "grad_norm": 5.65177059173584, + "learning_rate": 7.781661907257581e-06, + "logits/chosen": -0.1955094039440155, + "logits/rejected": -0.34234923124313354, + "logps/chosen": -67.52352142333984, + "logps/rejected": -74.23297119140625, + "loss": 0.8324, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.901439905166626, + "rewards/margins": 3.9790844917297363, + "rewards/rejected": -1.077644944190979, + "step": 1872 + }, + { + "epoch": 0.47, + "grad_norm": 3.8665359020233154, + "learning_rate": 7.77948462646363e-06, + "logits/chosen": -0.10527244210243225, + "logits/rejected": -0.3183535933494568, + "logps/chosen": -63.60488510131836, + "logps/rejected": -63.525421142578125, + "loss": 0.8007, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7891452312469482, + "rewards/margins": 4.381382465362549, + "rewards/rejected": -1.5922373533248901, + "step": 1873 + }, + { + "epoch": 0.47, + "grad_norm": 4.6942973136901855, + "learning_rate": 7.77730658264117e-06, + "logits/chosen": -0.2174759805202484, + "logits/rejected": -0.24727687239646912, + "logps/chosen": -55.24300003051758, + "logps/rejected": -100.98766326904297, + "loss": 0.7791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9940478801727295, + "rewards/margins": 4.011072635650635, + "rewards/rejected": -1.0170243978500366, + "step": 1874 + }, + { + "epoch": 0.47, + "grad_norm": 3.60969614982605, + "learning_rate": 7.775127776388124e-06, + "logits/chosen": -0.2672508656978607, + "logits/rejected": -0.36827483773231506, + "logps/chosen": -62.74238586425781, + "logps/rejected": -88.90653228759766, + "loss": 0.7588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.127228260040283, + "rewards/margins": 5.250990867614746, + "rewards/rejected": -2.123762369155884, + "step": 1875 + }, + { + "epoch": 0.47, + "grad_norm": 6.492301940917969, + "learning_rate": 7.77294820830262e-06, + "logits/chosen": -0.15388935804367065, + "logits/rejected": -0.22990824282169342, + "logps/chosen": -61.013954162597656, + "logps/rejected": -77.75444793701172, + "loss": 0.9809, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.872021436691284, + "rewards/margins": 3.1943905353546143, + "rewards/rejected": -0.3223692774772644, + "step": 1876 + }, + { + "epoch": 0.47, + "grad_norm": 8.038922309875488, + "learning_rate": 7.770767878983e-06, + "logits/chosen": -0.25019216537475586, + "logits/rejected": -0.3441777229309082, + "logps/chosen": -65.05680084228516, + "logps/rejected": -73.71086120605469, + "loss": 0.9258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7156925201416016, + "rewards/margins": 3.6778059005737305, + "rewards/rejected": -0.962113618850708, + "step": 1877 + }, + { + "epoch": 0.47, + "grad_norm": 3.3278677463531494, + "learning_rate": 7.768586789027806e-06, + "logits/chosen": -0.21602359414100647, + "logits/rejected": -0.3690018653869629, + "logps/chosen": -56.3840446472168, + "logps/rejected": -69.66759490966797, + "loss": 0.689, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.695470094680786, + "rewards/margins": 4.126038074493408, + "rewards/rejected": -1.4305682182312012, + "step": 1878 + }, + { + "epoch": 0.47, + "grad_norm": 6.754083156585693, + "learning_rate": 7.766404939035799e-06, + "logits/chosen": -0.20962274074554443, + "logits/rejected": -0.32034024596214294, + "logps/chosen": -64.70010375976562, + "logps/rejected": -73.59223937988281, + "loss": 0.9241, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5912373065948486, + "rewards/margins": 3.533669948577881, + "rewards/rejected": -0.942432701587677, + "step": 1879 + }, + { + "epoch": 0.47, + "grad_norm": 4.35101842880249, + "learning_rate": 7.764222329605939e-06, + "logits/chosen": -0.2472400963306427, + "logits/rejected": -0.3632031977176666, + "logps/chosen": -64.21478271484375, + "logps/rejected": -85.66201782226562, + "loss": 0.7678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.929110527038574, + "rewards/margins": 4.789971351623535, + "rewards/rejected": -1.8608604669570923, + "step": 1880 + }, + { + "epoch": 0.47, + "grad_norm": 4.297358512878418, + "learning_rate": 7.762038961337406e-06, + "logits/chosen": -0.20373916625976562, + "logits/rejected": -0.32597944140434265, + "logps/chosen": -63.53277587890625, + "logps/rejected": -77.46681213378906, + "loss": 0.7771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9347550868988037, + "rewards/margins": 4.2279558181762695, + "rewards/rejected": -1.2932007312774658, + "step": 1881 + }, + { + "epoch": 0.47, + "grad_norm": 5.544567108154297, + "learning_rate": 7.759854834829575e-06, + "logits/chosen": -0.27058547735214233, + "logits/rejected": -0.4335000813007355, + "logps/chosen": -55.96961212158203, + "logps/rejected": -74.4168701171875, + "loss": 0.6245, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1040611267089844, + "rewards/margins": 4.846240520477295, + "rewards/rejected": -1.7421793937683105, + "step": 1882 + }, + { + "epoch": 0.47, + "grad_norm": 7.099678993225098, + "learning_rate": 7.75766995068204e-06, + "logits/chosen": -0.20660533010959625, + "logits/rejected": -0.2984856963157654, + "logps/chosen": -56.15023422241211, + "logps/rejected": -78.5651626586914, + "loss": 0.8315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4331462383270264, + "rewards/margins": 4.2169928550720215, + "rewards/rejected": -1.7838464975357056, + "step": 1883 + }, + { + "epoch": 0.47, + "grad_norm": 3.1891088485717773, + "learning_rate": 7.755484309494599e-06, + "logits/chosen": -0.16775725781917572, + "logits/rejected": -0.3207022249698639, + "logps/chosen": -57.3300666809082, + "logps/rejected": -67.19377136230469, + "loss": 0.7356, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8484320640563965, + "rewards/margins": 4.104080677032471, + "rewards/rejected": -1.2556486129760742, + "step": 1884 + }, + { + "epoch": 0.47, + "grad_norm": 3.8092551231384277, + "learning_rate": 7.753297911867254e-06, + "logits/chosen": -0.19158339500427246, + "logits/rejected": -0.2313070297241211, + "logps/chosen": -55.753074645996094, + "logps/rejected": -91.27957153320312, + "loss": 0.8095, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.72192645072937, + "rewards/margins": 4.069538593292236, + "rewards/rejected": -1.3476120233535767, + "step": 1885 + }, + { + "epoch": 0.47, + "grad_norm": 4.9392008781433105, + "learning_rate": 7.751110758400223e-06, + "logits/chosen": -0.21634052693843842, + "logits/rejected": -0.32041481137275696, + "logps/chosen": -57.828369140625, + "logps/rejected": -83.2179183959961, + "loss": 0.719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.687572956085205, + "rewards/margins": 3.9018571376800537, + "rewards/rejected": -1.2142844200134277, + "step": 1886 + }, + { + "epoch": 0.47, + "grad_norm": 3.147796869277954, + "learning_rate": 7.748922849693923e-06, + "logits/chosen": -0.20669597387313843, + "logits/rejected": -0.34310001134872437, + "logps/chosen": -50.55901336669922, + "logps/rejected": -79.19438171386719, + "loss": 0.655, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8325576782226562, + "rewards/margins": 5.4138665199279785, + "rewards/rejected": -2.5813090801239014, + "step": 1887 + }, + { + "epoch": 0.47, + "grad_norm": 9.683496475219727, + "learning_rate": 7.746734186348986e-06, + "logits/chosen": -0.2882635295391083, + "logits/rejected": -0.3584939241409302, + "logps/chosen": -68.9865493774414, + "logps/rejected": -71.45169067382812, + "loss": 1.1133, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7341785430908203, + "rewards/margins": 3.0503811836242676, + "rewards/rejected": -0.3162023723125458, + "step": 1888 + }, + { + "epoch": 0.47, + "grad_norm": 4.954323768615723, + "learning_rate": 7.744544768966243e-06, + "logits/chosen": -0.21647271513938904, + "logits/rejected": -0.3286539316177368, + "logps/chosen": -53.192413330078125, + "logps/rejected": -77.8250732421875, + "loss": 0.7188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0350985527038574, + "rewards/margins": 4.389698028564453, + "rewards/rejected": -1.354599118232727, + "step": 1889 + }, + { + "epoch": 0.47, + "grad_norm": 5.560997009277344, + "learning_rate": 7.742354598146737e-06, + "logits/chosen": -0.24250377714633942, + "logits/rejected": -0.35382938385009766, + "logps/chosen": -48.988685607910156, + "logps/rejected": -69.48681640625, + "loss": 0.7823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.959524393081665, + "rewards/margins": 4.402872562408447, + "rewards/rejected": -1.4433480501174927, + "step": 1890 + }, + { + "epoch": 0.47, + "grad_norm": 6.123242378234863, + "learning_rate": 7.74016367449172e-06, + "logits/chosen": -0.1235036626458168, + "logits/rejected": -0.2117612063884735, + "logps/chosen": -57.724639892578125, + "logps/rejected": -72.52838134765625, + "loss": 0.8122, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.656219005584717, + "rewards/margins": 2.9290647506713867, + "rewards/rejected": -0.27284568548202515, + "step": 1891 + }, + { + "epoch": 0.47, + "grad_norm": 3.6028549671173096, + "learning_rate": 7.737971998602648e-06, + "logits/chosen": -0.22608403861522675, + "logits/rejected": -0.2880495488643646, + "logps/chosen": -50.397605895996094, + "logps/rejected": -81.2474365234375, + "loss": 0.7347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7115211486816406, + "rewards/margins": 4.559044361114502, + "rewards/rejected": -1.8475233316421509, + "step": 1892 + }, + { + "epoch": 0.47, + "grad_norm": 7.540833473205566, + "learning_rate": 7.735779571081179e-06, + "logits/chosen": -0.19438350200653076, + "logits/rejected": -0.34554797410964966, + "logps/chosen": -58.20322799682617, + "logps/rejected": -64.19480895996094, + "loss": 0.698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.987450122833252, + "rewards/margins": 4.614252090454102, + "rewards/rejected": -1.6268022060394287, + "step": 1893 + }, + { + "epoch": 0.47, + "grad_norm": 7.905322074890137, + "learning_rate": 7.733586392529184e-06, + "logits/chosen": -0.1958041489124298, + "logits/rejected": -0.27151310443878174, + "logps/chosen": -57.02552795410156, + "logps/rejected": -69.35284423828125, + "loss": 0.8295, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1515069007873535, + "rewards/margins": 3.479264259338379, + "rewards/rejected": -0.32775720953941345, + "step": 1894 + }, + { + "epoch": 0.47, + "grad_norm": 6.151071071624756, + "learning_rate": 7.73139246354874e-06, + "logits/chosen": -0.17776453495025635, + "logits/rejected": -0.26192182302474976, + "logps/chosen": -54.435604095458984, + "logps/rejected": -69.87916564941406, + "loss": 0.8432, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8475594520568848, + "rewards/margins": 3.4172160625457764, + "rewards/rejected": -0.5696564316749573, + "step": 1895 + }, + { + "epoch": 0.47, + "grad_norm": 3.9728081226348877, + "learning_rate": 7.729197784742125e-06, + "logits/chosen": -0.24694819748401642, + "logits/rejected": -0.38612401485443115, + "logps/chosen": -74.44841766357422, + "logps/rejected": -81.19180297851562, + "loss": 0.7589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2696475982666016, + "rewards/margins": 4.8683061599731445, + "rewards/rejected": -1.5986580848693848, + "step": 1896 + }, + { + "epoch": 0.47, + "grad_norm": 6.2821550369262695, + "learning_rate": 7.727002356711827e-06, + "logits/chosen": -0.1472373902797699, + "logits/rejected": -0.2695334851741791, + "logps/chosen": -60.45979309082031, + "logps/rejected": -75.74577331542969, + "loss": 0.7028, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.040719985961914, + "rewards/margins": 4.236011505126953, + "rewards/rejected": -1.195291519165039, + "step": 1897 + }, + { + "epoch": 0.47, + "grad_norm": 6.061825275421143, + "learning_rate": 7.724806180060538e-06, + "logits/chosen": -0.18377166986465454, + "logits/rejected": -0.3403196930885315, + "logps/chosen": -66.34261322021484, + "logps/rejected": -78.37788391113281, + "loss": 0.8122, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6665291786193848, + "rewards/margins": 4.315334320068359, + "rewards/rejected": -1.6488057374954224, + "step": 1898 + }, + { + "epoch": 0.48, + "grad_norm": 3.8209805488586426, + "learning_rate": 7.722609255391156e-06, + "logits/chosen": -0.20010024309158325, + "logits/rejected": -0.30911684036254883, + "logps/chosen": -53.45388412475586, + "logps/rejected": -81.84864044189453, + "loss": 0.7413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9410314559936523, + "rewards/margins": 4.438779830932617, + "rewards/rejected": -1.4977482557296753, + "step": 1899 + }, + { + "epoch": 0.48, + "grad_norm": 6.426652908325195, + "learning_rate": 7.720411583306784e-06, + "logits/chosen": -0.10346118360757828, + "logits/rejected": -0.18320417404174805, + "logps/chosen": -59.55072021484375, + "logps/rejected": -80.82920837402344, + "loss": 0.8707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.781102180480957, + "rewards/margins": 4.000382900238037, + "rewards/rejected": -1.2192803621292114, + "step": 1900 + }, + { + "epoch": 0.48, + "grad_norm": 5.327872276306152, + "learning_rate": 7.718213164410729e-06, + "logits/chosen": -0.27356094121932983, + "logits/rejected": -0.3307046890258789, + "logps/chosen": -47.65751647949219, + "logps/rejected": -85.54835510253906, + "loss": 0.7517, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9594554901123047, + "rewards/margins": 3.745649576187134, + "rewards/rejected": -0.78619384765625, + "step": 1901 + }, + { + "epoch": 0.48, + "grad_norm": 2.348249673843384, + "learning_rate": 7.716013999306507e-06, + "logits/chosen": -0.2314336597919464, + "logits/rejected": -0.4271140396595001, + "logps/chosen": -59.96207046508789, + "logps/rejected": -67.99049377441406, + "loss": 0.7236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6154816150665283, + "rewards/margins": 4.850067138671875, + "rewards/rejected": -2.234585762023926, + "step": 1902 + }, + { + "epoch": 0.48, + "grad_norm": 6.290485858917236, + "learning_rate": 7.713814088597838e-06, + "logits/chosen": -0.1274053305387497, + "logits/rejected": -0.23660995066165924, + "logps/chosen": -65.32221984863281, + "logps/rejected": -84.00363159179688, + "loss": 0.9825, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8357980251312256, + "rewards/margins": 3.1459767818450928, + "rewards/rejected": -0.31017881631851196, + "step": 1903 + }, + { + "epoch": 0.48, + "grad_norm": 8.478782653808594, + "learning_rate": 7.711613432888639e-06, + "logits/chosen": -0.31465214490890503, + "logits/rejected": -0.423950731754303, + "logps/chosen": -52.47235107421875, + "logps/rejected": -61.92457962036133, + "loss": 0.9217, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.819026231765747, + "rewards/margins": 3.21968412399292, + "rewards/rejected": -0.4006580412387848, + "step": 1904 + }, + { + "epoch": 0.48, + "grad_norm": 5.172289848327637, + "learning_rate": 7.709412032783042e-06, + "logits/chosen": -0.19392246007919312, + "logits/rejected": -0.2738029956817627, + "logps/chosen": -57.2745246887207, + "logps/rejected": -77.72488403320312, + "loss": 0.8036, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8799819946289062, + "rewards/margins": 3.3985214233398438, + "rewards/rejected": -0.518539547920227, + "step": 1905 + }, + { + "epoch": 0.48, + "grad_norm": 7.442145824432373, + "learning_rate": 7.707209888885376e-06, + "logits/chosen": -0.1729540079832077, + "logits/rejected": -0.28552091121673584, + "logps/chosen": -62.55741882324219, + "logps/rejected": -81.9890365600586, + "loss": 0.9308, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7364909648895264, + "rewards/margins": 3.8766911029815674, + "rewards/rejected": -1.140200138092041, + "step": 1906 + }, + { + "epoch": 0.48, + "grad_norm": 4.336467266082764, + "learning_rate": 7.70500700180018e-06, + "logits/chosen": -0.24676568806171417, + "logits/rejected": -0.4164709448814392, + "logps/chosen": -65.74465942382812, + "logps/rejected": -61.81282043457031, + "loss": 0.9248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8380441665649414, + "rewards/margins": 3.446317195892334, + "rewards/rejected": -0.608272910118103, + "step": 1907 + }, + { + "epoch": 0.48, + "grad_norm": 3.9681360721588135, + "learning_rate": 7.702803372132192e-06, + "logits/chosen": -0.12785975635051727, + "logits/rejected": -0.24258580803871155, + "logps/chosen": -57.102294921875, + "logps/rejected": -75.5986099243164, + "loss": 0.7638, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8006858825683594, + "rewards/margins": 3.9241514205932617, + "rewards/rejected": -1.1234652996063232, + "step": 1908 + }, + { + "epoch": 0.48, + "grad_norm": 5.384232044219971, + "learning_rate": 7.700599000486356e-06, + "logits/chosen": -0.18404307961463928, + "logits/rejected": -0.2535555362701416, + "logps/chosen": -68.41444396972656, + "logps/rejected": -81.48969268798828, + "loss": 0.8561, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.997387409210205, + "rewards/margins": 3.8847551345825195, + "rewards/rejected": -0.8873676061630249, + "step": 1909 + }, + { + "epoch": 0.48, + "grad_norm": 3.5005664825439453, + "learning_rate": 7.698393887467822e-06, + "logits/chosen": -0.25567203760147095, + "logits/rejected": -0.31054893136024475, + "logps/chosen": -51.57126998901367, + "logps/rejected": -87.6065902709961, + "loss": 0.854, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9871697425842285, + "rewards/margins": 3.8893136978149414, + "rewards/rejected": -0.9021435976028442, + "step": 1910 + }, + { + "epoch": 0.48, + "grad_norm": 4.294981479644775, + "learning_rate": 7.696188033681935e-06, + "logits/chosen": -0.21885672211647034, + "logits/rejected": -0.335734486579895, + "logps/chosen": -48.296295166015625, + "logps/rejected": -72.79542541503906, + "loss": 0.6709, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.011803388595581, + "rewards/margins": 3.861294746398926, + "rewards/rejected": -0.8494913578033447, + "step": 1911 + }, + { + "epoch": 0.48, + "grad_norm": 5.578243732452393, + "learning_rate": 7.693981439734257e-06, + "logits/chosen": -0.2862475514411926, + "logits/rejected": -0.3891865611076355, + "logps/chosen": -67.89933013916016, + "logps/rejected": -85.74755859375, + "loss": 0.8824, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.800219774246216, + "rewards/margins": 3.586289167404175, + "rewards/rejected": -0.7860695719718933, + "step": 1912 + }, + { + "epoch": 0.48, + "grad_norm": 4.405774116516113, + "learning_rate": 7.691774106230543e-06, + "logits/chosen": -0.3237658441066742, + "logits/rejected": -0.37583133578300476, + "logps/chosen": -58.40355682373047, + "logps/rejected": -75.68390655517578, + "loss": 0.8365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8861732482910156, + "rewards/margins": 3.4417831897735596, + "rewards/rejected": -0.5556101202964783, + "step": 1913 + }, + { + "epoch": 0.48, + "grad_norm": 4.148435115814209, + "learning_rate": 7.68956603377675e-06, + "logits/chosen": -0.20621606707572937, + "logits/rejected": -0.27882716059684753, + "logps/chosen": -59.4298210144043, + "logps/rejected": -65.65125274658203, + "loss": 0.8618, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.871779441833496, + "rewards/margins": 2.9900550842285156, + "rewards/rejected": -0.11827506870031357, + "step": 1914 + }, + { + "epoch": 0.48, + "grad_norm": 5.077398300170898, + "learning_rate": 7.687357222979046e-06, + "logits/chosen": -0.23336714506149292, + "logits/rejected": -0.43101686239242554, + "logps/chosen": -64.8909683227539, + "logps/rejected": -59.30339050292969, + "loss": 0.9215, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8175175189971924, + "rewards/margins": 4.271396636962891, + "rewards/rejected": -1.4538791179656982, + "step": 1915 + }, + { + "epoch": 0.48, + "grad_norm": 4.4375481605529785, + "learning_rate": 7.685147674443796e-06, + "logits/chosen": -0.272485613822937, + "logits/rejected": -0.3494795858860016, + "logps/chosen": -55.009029388427734, + "logps/rejected": -81.10130310058594, + "loss": 0.792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.121945381164551, + "rewards/margins": 4.375361442565918, + "rewards/rejected": -1.2534160614013672, + "step": 1916 + }, + { + "epoch": 0.48, + "grad_norm": 4.4765729904174805, + "learning_rate": 7.682937388777567e-06, + "logits/chosen": -0.2760840356349945, + "logits/rejected": -0.3633973300457001, + "logps/chosen": -50.86423873901367, + "logps/rejected": -78.81727600097656, + "loss": 0.8437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.129769802093506, + "rewards/margins": 3.772167921066284, + "rewards/rejected": -0.6423976421356201, + "step": 1917 + }, + { + "epoch": 0.48, + "grad_norm": 3.931640148162842, + "learning_rate": 7.680726366587134e-06, + "logits/chosen": -0.12712755799293518, + "logits/rejected": -0.28622519969940186, + "logps/chosen": -63.666358947753906, + "logps/rejected": -86.89054107666016, + "loss": 0.7501, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8836677074432373, + "rewards/margins": 3.9142956733703613, + "rewards/rejected": -1.030627965927124, + "step": 1918 + }, + { + "epoch": 0.48, + "grad_norm": 4.3404741287231445, + "learning_rate": 7.678514608479466e-06, + "logits/chosen": -0.23096859455108643, + "logits/rejected": -0.33042681217193604, + "logps/chosen": -59.31999206542969, + "logps/rejected": -79.53499603271484, + "loss": 0.806, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.770303249359131, + "rewards/margins": 3.763704776763916, + "rewards/rejected": -0.9934016466140747, + "step": 1919 + }, + { + "epoch": 0.48, + "grad_norm": 5.927015781402588, + "learning_rate": 7.676302115061742e-06, + "logits/chosen": -0.2178213894367218, + "logits/rejected": -0.31653034687042236, + "logps/chosen": -61.14255142211914, + "logps/rejected": -70.59346771240234, + "loss": 0.9222, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7283198833465576, + "rewards/margins": 2.7918717861175537, + "rewards/rejected": -0.06355232000350952, + "step": 1920 + }, + { + "epoch": 0.48, + "grad_norm": 5.27386999130249, + "learning_rate": 7.674088886941334e-06, + "logits/chosen": -0.2426597625017166, + "logits/rejected": -0.26906904578208923, + "logps/chosen": -70.05902099609375, + "logps/rejected": -77.26010131835938, + "loss": 0.9585, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.012634754180908, + "rewards/margins": 2.7569663524627686, + "rewards/rejected": 0.25566864013671875, + "step": 1921 + }, + { + "epoch": 0.48, + "grad_norm": 6.070827484130859, + "learning_rate": 7.671874924725827e-06, + "logits/chosen": -0.14219969511032104, + "logits/rejected": -0.1972990483045578, + "logps/chosen": -61.17020797729492, + "logps/rejected": -83.16841125488281, + "loss": 1.071, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.3939261436462402, + "rewards/margins": 1.9324369430541992, + "rewards/rejected": 0.4614892601966858, + "step": 1922 + }, + { + "epoch": 0.48, + "grad_norm": 4.706201553344727, + "learning_rate": 7.669660229022999e-06, + "logits/chosen": -0.25732383131980896, + "logits/rejected": -0.33094367384910583, + "logps/chosen": -60.429046630859375, + "logps/rejected": -84.59552001953125, + "loss": 0.8, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.949215888977051, + "rewards/margins": 4.170113563537598, + "rewards/rejected": -1.2208975553512573, + "step": 1923 + }, + { + "epoch": 0.48, + "grad_norm": 4.987563610076904, + "learning_rate": 7.66744480044083e-06, + "logits/chosen": -0.17202073335647583, + "logits/rejected": -0.29950904846191406, + "logps/chosen": -60.942665100097656, + "logps/rejected": -73.00203704833984, + "loss": 0.8646, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5210976600646973, + "rewards/margins": 3.8539071083068848, + "rewards/rejected": -1.332809329032898, + "step": 1924 + }, + { + "epoch": 0.48, + "grad_norm": 4.228361129760742, + "learning_rate": 7.665228639587505e-06, + "logits/chosen": -0.24277350306510925, + "logits/rejected": -0.2802812457084656, + "logps/chosen": -42.809322357177734, + "logps/rejected": -80.66236877441406, + "loss": 0.7613, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.039318799972534, + "rewards/margins": 4.154236793518066, + "rewards/rejected": -1.114917516708374, + "step": 1925 + }, + { + "epoch": 0.48, + "grad_norm": 4.042144775390625, + "learning_rate": 7.663011747071407e-06, + "logits/chosen": -0.197174534201622, + "logits/rejected": -0.21866722404956818, + "logps/chosen": -50.93728256225586, + "logps/rejected": -85.78203582763672, + "loss": 0.7603, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1185383796691895, + "rewards/margins": 3.2685627937316895, + "rewards/rejected": -0.15002408623695374, + "step": 1926 + }, + { + "epoch": 0.48, + "grad_norm": 2.835510015487671, + "learning_rate": 7.66079412350112e-06, + "logits/chosen": -0.2063470035791397, + "logits/rejected": -0.38934195041656494, + "logps/chosen": -45.365901947021484, + "logps/rejected": -60.4124755859375, + "loss": 0.5832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.933847188949585, + "rewards/margins": 3.791200637817383, + "rewards/rejected": -0.857353925704956, + "step": 1927 + }, + { + "epoch": 0.48, + "grad_norm": 4.686367988586426, + "learning_rate": 7.658575769485432e-06, + "logits/chosen": -0.223897323012352, + "logits/rejected": -0.3817197382450104, + "logps/chosen": -60.118324279785156, + "logps/rejected": -70.66868591308594, + "loss": 0.936, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.059828996658325, + "rewards/margins": 3.5457074642181396, + "rewards/rejected": -0.4858786165714264, + "step": 1928 + }, + { + "epoch": 0.48, + "grad_norm": 8.500815391540527, + "learning_rate": 7.656356685633328e-06, + "logits/chosen": -0.2760545313358307, + "logits/rejected": -0.38260558247566223, + "logps/chosen": -56.6505126953125, + "logps/rejected": -61.828826904296875, + "loss": 0.9214, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7177324295043945, + "rewards/margins": 3.123845100402832, + "rewards/rejected": -0.4061128795146942, + "step": 1929 + }, + { + "epoch": 0.48, + "grad_norm": 4.754215717315674, + "learning_rate": 7.654136872553994e-06, + "logits/chosen": -0.20789751410484314, + "logits/rejected": -0.34053918719291687, + "logps/chosen": -69.81266021728516, + "logps/rejected": -70.72478485107422, + "loss": 1.0539, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7163939476013184, + "rewards/margins": 3.2112038135528564, + "rewards/rejected": -0.49480998516082764, + "step": 1930 + }, + { + "epoch": 0.48, + "grad_norm": 6.460668087005615, + "learning_rate": 7.651916330856816e-06, + "logits/chosen": -0.1690518856048584, + "logits/rejected": -0.29952162504196167, + "logps/chosen": -67.31298065185547, + "logps/rejected": -73.28048706054688, + "loss": 0.9116, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5929012298583984, + "rewards/margins": 2.971651077270508, + "rewards/rejected": -0.37874987721443176, + "step": 1931 + }, + { + "epoch": 0.48, + "grad_norm": 4.090494155883789, + "learning_rate": 7.649695061151383e-06, + "logits/chosen": -0.18664094805717468, + "logits/rejected": -0.30635491013526917, + "logps/chosen": -62.483924865722656, + "logps/rejected": -68.69698333740234, + "loss": 0.8737, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8615193367004395, + "rewards/margins": 3.6791889667510986, + "rewards/rejected": -0.8176693320274353, + "step": 1932 + }, + { + "epoch": 0.48, + "grad_norm": 3.539330244064331, + "learning_rate": 7.647473064047478e-06, + "logits/chosen": -0.2470875382423401, + "logits/rejected": -0.34376829862594604, + "logps/chosen": -54.0460319519043, + "logps/rejected": -82.041259765625, + "loss": 0.7593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.114328384399414, + "rewards/margins": 3.978713274002075, + "rewards/rejected": -0.8643843531608582, + "step": 1933 + }, + { + "epoch": 0.48, + "grad_norm": 3.877798318862915, + "learning_rate": 7.64525034015509e-06, + "logits/chosen": -0.27351200580596924, + "logits/rejected": -0.3283195197582245, + "logps/chosen": -52.99517059326172, + "logps/rejected": -74.39070129394531, + "loss": 0.8091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.667947769165039, + "rewards/margins": 3.1449904441833496, + "rewards/rejected": -0.47704288363456726, + "step": 1934 + }, + { + "epoch": 0.48, + "grad_norm": 2.9689652919769287, + "learning_rate": 7.643026890084404e-06, + "logits/chosen": -0.3210107684135437, + "logits/rejected": -0.4321798086166382, + "logps/chosen": -51.01796340942383, + "logps/rejected": -71.34293365478516, + "loss": 0.6484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.84647536277771, + "rewards/margins": 4.696261405944824, + "rewards/rejected": -1.8497860431671143, + "step": 1935 + }, + { + "epoch": 0.48, + "grad_norm": 4.762024402618408, + "learning_rate": 7.640802714445803e-06, + "logits/chosen": -0.28298455476760864, + "logits/rejected": -0.355695515871048, + "logps/chosen": -49.13285827636719, + "logps/rejected": -66.52802276611328, + "loss": 0.8257, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9381000995635986, + "rewards/margins": 4.012421607971191, + "rewards/rejected": -1.07432222366333, + "step": 1936 + }, + { + "epoch": 0.48, + "grad_norm": 2.942927122116089, + "learning_rate": 7.638577813849873e-06, + "logits/chosen": -0.2461572289466858, + "logits/rejected": -0.34626534581184387, + "logps/chosen": -58.22710037231445, + "logps/rejected": -86.63983917236328, + "loss": 0.7386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7637457847595215, + "rewards/margins": 4.035621166229248, + "rewards/rejected": -1.2718756198883057, + "step": 1937 + }, + { + "epoch": 0.48, + "grad_norm": 4.301900863647461, + "learning_rate": 7.6363521889074e-06, + "logits/chosen": -0.2533873915672302, + "logits/rejected": -0.35514599084854126, + "logps/chosen": -57.12974166870117, + "logps/rejected": -80.60867309570312, + "loss": 0.7205, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6590046882629395, + "rewards/margins": 4.478847980499268, + "rewards/rejected": -1.8198431730270386, + "step": 1938 + }, + { + "epoch": 0.49, + "grad_norm": 3.5725491046905518, + "learning_rate": 7.634125840229359e-06, + "logits/chosen": -0.13019725680351257, + "logits/rejected": -0.2400582879781723, + "logps/chosen": -48.219966888427734, + "logps/rejected": -83.56256866455078, + "loss": 0.6279, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.128805637359619, + "rewards/margins": 3.7758514881134033, + "rewards/rejected": -0.6470460891723633, + "step": 1939 + }, + { + "epoch": 0.49, + "grad_norm": 2.6653013229370117, + "learning_rate": 7.631898768426938e-06, + "logits/chosen": -0.2626519799232483, + "logits/rejected": -0.31614041328430176, + "logps/chosen": -49.45024108886719, + "logps/rejected": -84.1177749633789, + "loss": 0.6868, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9145216941833496, + "rewards/margins": 3.842719316482544, + "rewards/rejected": -0.9281978011131287, + "step": 1940 + }, + { + "epoch": 0.49, + "grad_norm": 4.983346939086914, + "learning_rate": 7.629670974111512e-06, + "logits/chosen": -0.22878113389015198, + "logits/rejected": -0.31736278533935547, + "logps/chosen": -55.112606048583984, + "logps/rejected": -70.26268768310547, + "loss": 0.8408, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5344018936157227, + "rewards/margins": 3.0010628700256348, + "rewards/rejected": -0.4666614234447479, + "step": 1941 + }, + { + "epoch": 0.49, + "grad_norm": 3.6524622440338135, + "learning_rate": 7.627442457894659e-06, + "logits/chosen": -0.16908952593803406, + "logits/rejected": -0.3085087537765503, + "logps/chosen": -58.95724868774414, + "logps/rejected": -68.90016174316406, + "loss": 0.6966, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994006633758545, + "rewards/margins": 4.098779201507568, + "rewards/rejected": -1.1047725677490234, + "step": 1942 + }, + { + "epoch": 0.49, + "grad_norm": 7.91251277923584, + "learning_rate": 7.625213220388157e-06, + "logits/chosen": -0.25925564765930176, + "logits/rejected": -0.29393139481544495, + "logps/chosen": -49.755462646484375, + "logps/rejected": -77.45293426513672, + "loss": 0.7999, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.241299629211426, + "rewards/margins": 3.7599570751190186, + "rewards/rejected": -0.5186569690704346, + "step": 1943 + }, + { + "epoch": 0.49, + "grad_norm": 4.433170318603516, + "learning_rate": 7.6229832622039776e-06, + "logits/chosen": -0.23315994441509247, + "logits/rejected": -0.341437429189682, + "logps/chosen": -54.20657730102539, + "logps/rejected": -73.43356323242188, + "loss": 0.8168, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9935226440429688, + "rewards/margins": 3.9321115016937256, + "rewards/rejected": -0.9385887980461121, + "step": 1944 + }, + { + "epoch": 0.49, + "grad_norm": 9.211579322814941, + "learning_rate": 7.620752583954294e-06, + "logits/chosen": -0.19721253216266632, + "logits/rejected": -0.3742499351501465, + "logps/chosen": -60.65299987792969, + "logps/rejected": -67.34857177734375, + "loss": 0.7129, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.761500358581543, + "rewards/margins": 4.477779388427734, + "rewards/rejected": -1.7162795066833496, + "step": 1945 + }, + { + "epoch": 0.49, + "grad_norm": 6.114449977874756, + "learning_rate": 7.618521186251474e-06, + "logits/chosen": -0.18119870126247406, + "logits/rejected": -0.3280108571052551, + "logps/chosen": -57.70205307006836, + "logps/rejected": -69.18635559082031, + "loss": 0.8296, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.052969455718994, + "rewards/margins": 3.878366470336914, + "rewards/rejected": -0.8253969550132751, + "step": 1946 + }, + { + "epoch": 0.49, + "grad_norm": 4.233829975128174, + "learning_rate": 7.616289069708085e-06, + "logits/chosen": -0.16537536680698395, + "logits/rejected": -0.21296679973602295, + "logps/chosen": -65.38064575195312, + "logps/rejected": -84.77720642089844, + "loss": 0.8882, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.665855646133423, + "rewards/margins": 2.7503137588500977, + "rewards/rejected": -0.08445823192596436, + "step": 1947 + }, + { + "epoch": 0.49, + "grad_norm": 3.146697998046875, + "learning_rate": 7.61405623493689e-06, + "logits/chosen": -0.18423809111118317, + "logits/rejected": -0.3343295753002167, + "logps/chosen": -59.26106643676758, + "logps/rejected": -78.44773864746094, + "loss": 0.6855, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8126821517944336, + "rewards/margins": 4.479374408721924, + "rewards/rejected": -1.666691780090332, + "step": 1948 + }, + { + "epoch": 0.49, + "grad_norm": 3.9350109100341797, + "learning_rate": 7.611822682550856e-06, + "logits/chosen": -0.21799659729003906, + "logits/rejected": -0.3815011978149414, + "logps/chosen": -61.659542083740234, + "logps/rejected": -71.6908950805664, + "loss": 0.8015, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.529585838317871, + "rewards/margins": 4.452095985412598, + "rewards/rejected": -1.9225103855133057, + "step": 1949 + }, + { + "epoch": 0.49, + "grad_norm": 3.9095797538757324, + "learning_rate": 7.609588413163133e-06, + "logits/chosen": -0.21739158034324646, + "logits/rejected": -0.29694950580596924, + "logps/chosen": -62.732826232910156, + "logps/rejected": -81.23313903808594, + "loss": 0.8566, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.880253791809082, + "rewards/margins": 4.0527753829956055, + "rewards/rejected": -1.1725220680236816, + "step": 1950 + }, + { + "epoch": 0.49, + "grad_norm": 6.861201286315918, + "learning_rate": 7.6073534273870804e-06, + "logits/chosen": -0.2426571398973465, + "logits/rejected": -0.27493730187416077, + "logps/chosen": -70.57368469238281, + "logps/rejected": -103.94051361083984, + "loss": 0.9444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5821285247802734, + "rewards/margins": 4.70379638671875, + "rewards/rejected": -2.1216676235198975, + "step": 1951 + }, + { + "epoch": 0.49, + "grad_norm": 4.655007839202881, + "learning_rate": 7.605117725836251e-06, + "logits/chosen": -0.22302216291427612, + "logits/rejected": -0.2799334228038788, + "logps/chosen": -52.95418167114258, + "logps/rejected": -88.66413879394531, + "loss": 0.8555, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7027900218963623, + "rewards/margins": 4.128477096557617, + "rewards/rejected": -1.4256869554519653, + "step": 1952 + }, + { + "epoch": 0.49, + "grad_norm": 3.743044137954712, + "learning_rate": 7.602881309124392e-06, + "logits/chosen": -0.21200448274612427, + "logits/rejected": -0.3258962333202362, + "logps/chosen": -60.83403778076172, + "logps/rejected": -69.08464813232422, + "loss": 0.8231, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.16792631149292, + "rewards/margins": 4.636747360229492, + "rewards/rejected": -1.4688208103179932, + "step": 1953 + }, + { + "epoch": 0.49, + "grad_norm": 4.302679538726807, + "learning_rate": 7.600644177865447e-06, + "logits/chosen": -0.16387897729873657, + "logits/rejected": -0.30915313959121704, + "logps/chosen": -55.479042053222656, + "logps/rejected": -77.64979553222656, + "loss": 0.6697, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7726869583129883, + "rewards/margins": 4.092343330383301, + "rewards/rejected": -1.3196561336517334, + "step": 1954 + }, + { + "epoch": 0.49, + "grad_norm": 4.757946968078613, + "learning_rate": 7.598406332673558e-06, + "logits/chosen": -0.2269361913204193, + "logits/rejected": -0.34865710139274597, + "logps/chosen": -61.459712982177734, + "logps/rejected": -77.2875747680664, + "loss": 0.8154, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8029491901397705, + "rewards/margins": 3.7835028171539307, + "rewards/rejected": -0.980553388595581, + "step": 1955 + }, + { + "epoch": 0.49, + "grad_norm": 4.401974678039551, + "learning_rate": 7.596167774163061e-06, + "logits/chosen": -0.170942485332489, + "logits/rejected": -0.20970581471920013, + "logps/chosen": -47.92631530761719, + "logps/rejected": -88.2237319946289, + "loss": 0.7047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5765819549560547, + "rewards/margins": 3.7012155055999756, + "rewards/rejected": -1.1246337890625, + "step": 1956 + }, + { + "epoch": 0.49, + "grad_norm": 4.447781562805176, + "learning_rate": 7.593928502948491e-06, + "logits/chosen": -0.2398969978094101, + "logits/rejected": -0.3518989682197571, + "logps/chosen": -60.29056930541992, + "logps/rejected": -76.0194091796875, + "loss": 0.802, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6970462799072266, + "rewards/margins": 4.1904683113098145, + "rewards/rejected": -1.4934219121932983, + "step": 1957 + }, + { + "epoch": 0.49, + "grad_norm": 8.610234260559082, + "learning_rate": 7.5916885196445735e-06, + "logits/chosen": -0.32293835282325745, + "logits/rejected": -0.3122580349445343, + "logps/chosen": -48.163631439208984, + "logps/rejected": -81.855224609375, + "loss": 0.8302, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.548473596572876, + "rewards/margins": 3.700054168701172, + "rewards/rejected": -1.1515803337097168, + "step": 1958 + }, + { + "epoch": 0.49, + "grad_norm": 4.149646282196045, + "learning_rate": 7.589447824866232e-06, + "logits/chosen": -0.1826079934835434, + "logits/rejected": -0.35741305351257324, + "logps/chosen": -60.690040588378906, + "logps/rejected": -69.43766784667969, + "loss": 0.6909, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9358716011047363, + "rewards/margins": 4.531767845153809, + "rewards/rejected": -1.5958962440490723, + "step": 1959 + }, + { + "epoch": 0.49, + "grad_norm": 9.547279357910156, + "learning_rate": 7.587206419228587e-06, + "logits/chosen": -0.24070978164672852, + "logits/rejected": -0.37480252981185913, + "logps/chosen": -54.291141510009766, + "logps/rejected": -77.54720306396484, + "loss": 0.7295, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.006244659423828, + "rewards/margins": 4.518747806549072, + "rewards/rejected": -1.512502670288086, + "step": 1960 + }, + { + "epoch": 0.49, + "grad_norm": 2.8208158016204834, + "learning_rate": 7.584964303346953e-06, + "logits/chosen": -0.26664942502975464, + "logits/rejected": -0.3645073175430298, + "logps/chosen": -57.53052520751953, + "logps/rejected": -84.95222473144531, + "loss": 0.7202, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9438185691833496, + "rewards/margins": 4.282240390777588, + "rewards/rejected": -1.3384212255477905, + "step": 1961 + }, + { + "epoch": 0.49, + "grad_norm": 16.648910522460938, + "learning_rate": 7.582721477836837e-06, + "logits/chosen": -0.240538090467453, + "logits/rejected": -0.36544445157051086, + "logps/chosen": -67.30219268798828, + "logps/rejected": -76.24103546142578, + "loss": 1.0003, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4441211223602295, + "rewards/margins": 3.2443246841430664, + "rewards/rejected": -0.8002034425735474, + "step": 1962 + }, + { + "epoch": 0.49, + "grad_norm": 4.353855609893799, + "learning_rate": 7.580477943313948e-06, + "logits/chosen": -0.2441003918647766, + "logits/rejected": -0.32410770654678345, + "logps/chosen": -64.37535095214844, + "logps/rejected": -70.89215850830078, + "loss": 0.8412, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.057361602783203, + "rewards/margins": 3.610957145690918, + "rewards/rejected": -0.5535951852798462, + "step": 1963 + }, + { + "epoch": 0.49, + "grad_norm": 5.137190818786621, + "learning_rate": 7.578233700394178e-06, + "logits/chosen": -0.22225800156593323, + "logits/rejected": -0.36933499574661255, + "logps/chosen": -49.374839782714844, + "logps/rejected": -73.3279037475586, + "loss": 0.7565, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.959015369415283, + "rewards/margins": 3.9727325439453125, + "rewards/rejected": -1.0137172937393188, + "step": 1964 + }, + { + "epoch": 0.49, + "grad_norm": 12.20388412475586, + "learning_rate": 7.575988749693626e-06, + "logits/chosen": -0.19948552548885345, + "logits/rejected": -0.3869474232196808, + "logps/chosen": -73.77043914794922, + "logps/rejected": -72.3638916015625, + "loss": 0.9113, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.3687968254089355, + "rewards/margins": 3.675879955291748, + "rewards/rejected": -1.3070831298828125, + "step": 1965 + }, + { + "epoch": 0.49, + "grad_norm": 8.742332458496094, + "learning_rate": 7.573743091828573e-06, + "logits/chosen": -0.26182886958122253, + "logits/rejected": -0.3542460799217224, + "logps/chosen": -67.43328857421875, + "logps/rejected": -79.86711883544922, + "loss": 0.9069, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5522351264953613, + "rewards/margins": 3.6959993839263916, + "rewards/rejected": -1.1437642574310303, + "step": 1966 + }, + { + "epoch": 0.49, + "grad_norm": 11.27313232421875, + "learning_rate": 7.5714967274155086e-06, + "logits/chosen": -0.29636150598526, + "logits/rejected": -0.38845592737197876, + "logps/chosen": -58.40966796875, + "logps/rejected": -83.75686645507812, + "loss": 0.7801, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.816995859146118, + "rewards/margins": 4.4220757484436035, + "rewards/rejected": -1.6050795316696167, + "step": 1967 + }, + { + "epoch": 0.49, + "grad_norm": 4.6545305252075195, + "learning_rate": 7.569249657071102e-06, + "logits/chosen": -0.24413761496543884, + "logits/rejected": -0.3680959939956665, + "logps/chosen": -63.873085021972656, + "logps/rejected": -83.3845443725586, + "loss": 0.774, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.755586862564087, + "rewards/margins": 4.112039089202881, + "rewards/rejected": -1.3564523458480835, + "step": 1968 + }, + { + "epoch": 0.49, + "grad_norm": 4.473301410675049, + "learning_rate": 7.567001881412224e-06, + "logits/chosen": -0.16266360878944397, + "logits/rejected": -0.2658658027648926, + "logps/chosen": -70.4184799194336, + "logps/rejected": -82.56629180908203, + "loss": 0.8689, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.704517126083374, + "rewards/margins": 3.9426403045654297, + "rewards/rejected": -1.2381230592727661, + "step": 1969 + }, + { + "epoch": 0.49, + "grad_norm": 14.175238609313965, + "learning_rate": 7.564753401055938e-06, + "logits/chosen": -0.18920429050922394, + "logits/rejected": -0.2905029058456421, + "logps/chosen": -58.034610748291016, + "logps/rejected": -77.534423828125, + "loss": 0.8137, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9100914001464844, + "rewards/margins": 3.6598892211914062, + "rewards/rejected": -0.749798059463501, + "step": 1970 + }, + { + "epoch": 0.49, + "grad_norm": 7.500058650970459, + "learning_rate": 7.562504216619501e-06, + "logits/chosen": -0.23827695846557617, + "logits/rejected": -0.3138608932495117, + "logps/chosen": -60.7061767578125, + "logps/rejected": -76.17516326904297, + "loss": 0.9175, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.734337329864502, + "rewards/margins": 3.314688205718994, + "rewards/rejected": -0.5803509950637817, + "step": 1971 + }, + { + "epoch": 0.49, + "grad_norm": 6.699099540710449, + "learning_rate": 7.560254328720362e-06, + "logits/chosen": -0.2029874324798584, + "logits/rejected": -0.30037161707878113, + "logps/chosen": -51.97987747192383, + "logps/rejected": -75.996826171875, + "loss": 0.9339, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9894630908966064, + "rewards/margins": 3.5280239582061768, + "rewards/rejected": -0.5385611057281494, + "step": 1972 + }, + { + "epoch": 0.49, + "grad_norm": 2.95143723487854, + "learning_rate": 7.5580037379761654e-06, + "logits/chosen": -0.1547917127609253, + "logits/rejected": -0.3379122018814087, + "logps/chosen": -58.949859619140625, + "logps/rejected": -61.55281448364258, + "loss": 0.7268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105879068374634, + "rewards/margins": 3.9735488891601562, + "rewards/rejected": -0.8676695823669434, + "step": 1973 + }, + { + "epoch": 0.49, + "grad_norm": 6.38707971572876, + "learning_rate": 7.5557524450047424e-06, + "logits/chosen": -0.2607814371585846, + "logits/rejected": -0.2786339819431305, + "logps/chosen": -59.068458557128906, + "logps/rejected": -81.21256256103516, + "loss": 0.8595, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7716610431671143, + "rewards/margins": 3.5463485717773438, + "rewards/rejected": -0.7746878862380981, + "step": 1974 + }, + { + "epoch": 0.49, + "grad_norm": 7.233056545257568, + "learning_rate": 7.5535004504241275e-06, + "logits/chosen": -0.16527420282363892, + "logits/rejected": -0.21143367886543274, + "logps/chosen": -66.08866119384766, + "logps/rejected": -83.11415100097656, + "loss": 1.0444, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5594944953918457, + "rewards/margins": 3.349144458770752, + "rewards/rejected": -0.7896497845649719, + "step": 1975 + }, + { + "epoch": 0.49, + "grad_norm": 7.3542866706848145, + "learning_rate": 7.551247754852535e-06, + "logits/chosen": -0.26388517022132874, + "logits/rejected": -0.3181689381599426, + "logps/chosen": -58.78184509277344, + "logps/rejected": -82.09964752197266, + "loss": 0.8601, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.986496925354004, + "rewards/margins": 3.7286970615386963, + "rewards/rejected": -0.7422000169754028, + "step": 1976 + }, + { + "epoch": 0.49, + "grad_norm": 3.759206771850586, + "learning_rate": 7.548994358908386e-06, + "logits/chosen": -0.2018672376871109, + "logits/rejected": -0.3728608787059784, + "logps/chosen": -56.342899322509766, + "logps/rejected": -70.70579528808594, + "loss": 0.7579, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6527223587036133, + "rewards/margins": 4.472803592681885, + "rewards/rejected": -1.8200809955596924, + "step": 1977 + }, + { + "epoch": 0.49, + "grad_norm": 3.063650369644165, + "learning_rate": 7.54674026321028e-06, + "logits/chosen": -0.24595659971237183, + "logits/rejected": -0.32694968581199646, + "logps/chosen": -54.38386535644531, + "logps/rejected": -83.75906372070312, + "loss": 0.679, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8773367404937744, + "rewards/margins": 4.073326587677002, + "rewards/rejected": -1.1959898471832275, + "step": 1978 + }, + { + "epoch": 0.5, + "grad_norm": 3.4295566082000732, + "learning_rate": 7.544485468377018e-06, + "logits/chosen": -0.1801244467496872, + "logits/rejected": -0.29643189907073975, + "logps/chosen": -69.53437805175781, + "logps/rejected": -76.22532653808594, + "loss": 0.8157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6128334999084473, + "rewards/margins": 4.239686489105225, + "rewards/rejected": -1.6268534660339355, + "step": 1979 + }, + { + "epoch": 0.5, + "grad_norm": 5.140175819396973, + "learning_rate": 7.542229975027591e-06, + "logits/chosen": -0.24664057791233063, + "logits/rejected": -0.33334872126579285, + "logps/chosen": -66.14240264892578, + "logps/rejected": -77.18768310546875, + "loss": 0.8212, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7403974533081055, + "rewards/margins": 3.4123809337615967, + "rewards/rejected": -0.6719835996627808, + "step": 1980 + }, + { + "epoch": 0.5, + "grad_norm": 5.32132625579834, + "learning_rate": 7.539973783781176e-06, + "logits/chosen": -0.25137776136398315, + "logits/rejected": -0.28214049339294434, + "logps/chosen": -53.74224090576172, + "logps/rejected": -70.37625122070312, + "loss": 1.0121, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.708984613418579, + "rewards/margins": 2.5186827182769775, + "rewards/rejected": 0.19030196964740753, + "step": 1981 + }, + { + "epoch": 0.5, + "grad_norm": 3.750333309173584, + "learning_rate": 7.53771689525715e-06, + "logits/chosen": -0.2636469602584839, + "logits/rejected": -0.35081416368484497, + "logps/chosen": -50.28957748413086, + "logps/rejected": -72.87803649902344, + "loss": 0.8618, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8993172645568848, + "rewards/margins": 3.414079427719116, + "rewards/rejected": -0.5147623419761658, + "step": 1982 + }, + { + "epoch": 0.5, + "grad_norm": 2.7269015312194824, + "learning_rate": 7.535459310075079e-06, + "logits/chosen": -0.29355862736701965, + "logits/rejected": -0.3776263892650604, + "logps/chosen": -45.571311950683594, + "logps/rejected": -80.5390853881836, + "loss": 0.6988, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.121678590774536, + "rewards/margins": 4.53593111038208, + "rewards/rejected": -1.4142520427703857, + "step": 1983 + }, + { + "epoch": 0.5, + "grad_norm": 2.891315221786499, + "learning_rate": 7.533201028854713e-06, + "logits/chosen": -0.17890028655529022, + "logits/rejected": -0.30305442214012146, + "logps/chosen": -54.35236358642578, + "logps/rejected": -76.10565185546875, + "loss": 0.7094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.008603096008301, + "rewards/margins": 4.629825592041016, + "rewards/rejected": -1.6212220191955566, + "step": 1984 + }, + { + "epoch": 0.5, + "grad_norm": 4.909289360046387, + "learning_rate": 7.530942052216005e-06, + "logits/chosen": -0.17713956534862518, + "logits/rejected": -0.29076912999153137, + "logps/chosen": -68.60353088378906, + "logps/rejected": -60.985504150390625, + "loss": 0.8324, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.923816442489624, + "rewards/margins": 3.5763893127441406, + "rewards/rejected": -0.6525729894638062, + "step": 1985 + }, + { + "epoch": 0.5, + "grad_norm": 5.054781913757324, + "learning_rate": 7.528682380779089e-06, + "logits/chosen": -0.24975070357322693, + "logits/rejected": -0.36191216111183167, + "logps/chosen": -57.097293853759766, + "logps/rejected": -78.56855773925781, + "loss": 0.8486, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.679551839828491, + "rewards/margins": 4.1333794593811035, + "rewards/rejected": -1.4538272619247437, + "step": 1986 + }, + { + "epoch": 0.5, + "grad_norm": 3.612194776535034, + "learning_rate": 7.526422015164295e-06, + "logits/chosen": -0.23622412979602814, + "logits/rejected": -0.3078162968158722, + "logps/chosen": -49.06092834472656, + "logps/rejected": -72.3940658569336, + "loss": 0.7339, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8762950897216797, + "rewards/margins": 3.7856900691986084, + "rewards/rejected": -0.9093952178955078, + "step": 1987 + }, + { + "epoch": 0.5, + "grad_norm": 5.341261386871338, + "learning_rate": 7.524160955992142e-06, + "logits/chosen": -0.2686449885368347, + "logits/rejected": -0.30450719594955444, + "logps/chosen": -55.98866271972656, + "logps/rejected": -72.24458312988281, + "loss": 0.8717, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9292759895324707, + "rewards/margins": 3.0806362628936768, + "rewards/rejected": -0.15136036276817322, + "step": 1988 + }, + { + "epoch": 0.5, + "grad_norm": 4.470733165740967, + "learning_rate": 7.521899203883341e-06, + "logits/chosen": -0.28188151121139526, + "logits/rejected": -0.35486626625061035, + "logps/chosen": -58.66609191894531, + "logps/rejected": -78.4171142578125, + "loss": 0.7948, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.963029384613037, + "rewards/margins": 4.181676864624023, + "rewards/rejected": -1.2186470031738281, + "step": 1989 + }, + { + "epoch": 0.5, + "grad_norm": 4.466814041137695, + "learning_rate": 7.519636759458787e-06, + "logits/chosen": -0.2031155377626419, + "logits/rejected": -0.32881465554237366, + "logps/chosen": -51.167510986328125, + "logps/rejected": -57.35157012939453, + "loss": 0.8389, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8783626556396484, + "rewards/margins": 4.0243754386901855, + "rewards/rejected": -1.1460126638412476, + "step": 1990 + }, + { + "epoch": 0.5, + "grad_norm": 5.198913097381592, + "learning_rate": 7.5173736233395754e-06, + "logits/chosen": -0.24438904225826263, + "logits/rejected": -0.3957277834415436, + "logps/chosen": -57.76525115966797, + "logps/rejected": -71.98535919189453, + "loss": 0.8121, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.932093620300293, + "rewards/margins": 3.128927707672119, + "rewards/rejected": -0.19683454930782318, + "step": 1991 + }, + { + "epoch": 0.5, + "grad_norm": 4.70208215713501, + "learning_rate": 7.515109796146982e-06, + "logits/chosen": -0.16743172705173492, + "logits/rejected": -0.3660814166069031, + "logps/chosen": -65.38452911376953, + "logps/rejected": -66.28311157226562, + "loss": 0.7849, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7975940704345703, + "rewards/margins": 3.510460138320923, + "rewards/rejected": -0.7128661274909973, + "step": 1992 + }, + { + "epoch": 0.5, + "grad_norm": 3.215911388397217, + "learning_rate": 7.512845278502478e-06, + "logits/chosen": -0.24969542026519775, + "logits/rejected": -0.39145612716674805, + "logps/chosen": -53.85342788696289, + "logps/rejected": -70.245849609375, + "loss": 0.7344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98636531829834, + "rewards/margins": 4.455268383026123, + "rewards/rejected": -1.4689030647277832, + "step": 1993 + }, + { + "epoch": 0.5, + "grad_norm": 8.351296424865723, + "learning_rate": 7.510580071027724e-06, + "logits/chosen": -0.23590479791164398, + "logits/rejected": -0.4064832031726837, + "logps/chosen": -56.13319396972656, + "logps/rejected": -76.19368743896484, + "loss": 0.9508, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.432210683822632, + "rewards/margins": 4.005546569824219, + "rewards/rejected": -1.573335886001587, + "step": 1994 + }, + { + "epoch": 0.5, + "grad_norm": 4.13419771194458, + "learning_rate": 7.508314174344567e-06, + "logits/chosen": -0.1926887482404709, + "logits/rejected": -0.36861783266067505, + "logps/chosen": -64.79364013671875, + "logps/rejected": -69.04682159423828, + "loss": 0.8325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.919733762741089, + "rewards/margins": 4.923795223236084, + "rewards/rejected": -2.004061698913574, + "step": 1995 + }, + { + "epoch": 0.5, + "grad_norm": 6.209656238555908, + "learning_rate": 7.506047589075041e-06, + "logits/chosen": -0.14670690894126892, + "logits/rejected": -0.2544839084148407, + "logps/chosen": -56.42799377441406, + "logps/rejected": -74.5350341796875, + "loss": 0.8729, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.817037582397461, + "rewards/margins": 3.4461374282836914, + "rewards/rejected": -0.6290999054908752, + "step": 1996 + }, + { + "epoch": 0.5, + "grad_norm": 8.182992935180664, + "learning_rate": 7.503780315841377e-06, + "logits/chosen": -0.29502859711647034, + "logits/rejected": -0.361275851726532, + "logps/chosen": -47.723453521728516, + "logps/rejected": -77.30945587158203, + "loss": 0.9528, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3770437240600586, + "rewards/margins": 3.8222217559814453, + "rewards/rejected": -1.4451780319213867, + "step": 1997 + }, + { + "epoch": 0.5, + "grad_norm": 5.764414310455322, + "learning_rate": 7.50151235526599e-06, + "logits/chosen": -0.2242804318666458, + "logits/rejected": -0.36243993043899536, + "logps/chosen": -68.49608612060547, + "logps/rejected": -75.32843780517578, + "loss": 0.9561, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.988755464553833, + "rewards/margins": 3.5833284854888916, + "rewards/rejected": -0.594572901725769, + "step": 1998 + }, + { + "epoch": 0.5, + "grad_norm": 4.048060417175293, + "learning_rate": 7.499243707971479e-06, + "logits/chosen": -0.18065962195396423, + "logits/rejected": -0.28002509474754333, + "logps/chosen": -70.59066772460938, + "logps/rejected": -79.77536010742188, + "loss": 0.8197, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.955453395843506, + "rewards/margins": 4.114700794219971, + "rewards/rejected": -1.159247636795044, + "step": 1999 + }, + { + "epoch": 0.5, + "grad_norm": 4.869962215423584, + "learning_rate": 7.496974374580645e-06, + "logits/chosen": -0.22396790981292725, + "logits/rejected": -0.3204975426197052, + "logps/chosen": -54.651126861572266, + "logps/rejected": -89.5643310546875, + "loss": 0.7455, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6971750259399414, + "rewards/margins": 4.331631660461426, + "rewards/rejected": -1.6344571113586426, + "step": 2000 + }, + { + "epoch": 0.5, + "grad_norm": 5.084163188934326, + "learning_rate": 7.494704355716465e-06, + "logits/chosen": -0.21307972073554993, + "logits/rejected": -0.3276970684528351, + "logps/chosen": -63.59079360961914, + "logps/rejected": -81.4106674194336, + "loss": 0.8614, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.641427993774414, + "rewards/margins": 3.346381425857544, + "rewards/rejected": -0.7049536108970642, + "step": 2001 + }, + { + "epoch": 0.5, + "grad_norm": 4.391397476196289, + "learning_rate": 7.492433652002105e-06, + "logits/chosen": -0.17685478925704956, + "logits/rejected": -0.24878349900245667, + "logps/chosen": -52.55965805053711, + "logps/rejected": -74.22588348388672, + "loss": 0.7779, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.882114887237549, + "rewards/margins": 3.775358200073242, + "rewards/rejected": -0.8932433724403381, + "step": 2002 + }, + { + "epoch": 0.5, + "grad_norm": 5.322356224060059, + "learning_rate": 7.4901622640609264e-06, + "logits/chosen": -0.24240823090076447, + "logits/rejected": -0.2852243483066559, + "logps/chosen": -56.454689025878906, + "logps/rejected": -80.02412414550781, + "loss": 0.7658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5687317848205566, + "rewards/margins": 4.197616100311279, + "rewards/rejected": -1.6288843154907227, + "step": 2003 + }, + { + "epoch": 0.5, + "grad_norm": 3.8478240966796875, + "learning_rate": 7.487890192516472e-06, + "logits/chosen": -0.2522457540035248, + "logits/rejected": -0.3709235191345215, + "logps/chosen": -55.562774658203125, + "logps/rejected": -73.95464324951172, + "loss": 0.7168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7084414958953857, + "rewards/margins": 4.875396728515625, + "rewards/rejected": -2.1669552326202393, + "step": 2004 + }, + { + "epoch": 0.5, + "grad_norm": 14.224620819091797, + "learning_rate": 7.4856174379924765e-06, + "logits/chosen": -0.30973079800605774, + "logits/rejected": -0.4967919886112213, + "logps/chosen": -55.079010009765625, + "logps/rejected": -58.3954963684082, + "loss": 0.7264, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1391983032226562, + "rewards/margins": 4.6987624168396, + "rewards/rejected": -1.5595641136169434, + "step": 2005 + }, + { + "epoch": 0.5, + "grad_norm": 2.6182398796081543, + "learning_rate": 7.4833440011128585e-06, + "logits/chosen": -0.21304932236671448, + "logits/rejected": -0.3711051344871521, + "logps/chosen": -55.313629150390625, + "logps/rejected": -71.51107788085938, + "loss": 0.6699, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.083509683609009, + "rewards/margins": 4.820944309234619, + "rewards/rejected": -1.7374346256256104, + "step": 2006 + }, + { + "epoch": 0.5, + "grad_norm": 3.258535861968994, + "learning_rate": 7.481069882501726e-06, + "logits/chosen": -0.20639634132385254, + "logits/rejected": -0.28403401374816895, + "logps/chosen": -53.61602020263672, + "logps/rejected": -81.90266418457031, + "loss": 0.7089, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7364516258239746, + "rewards/margins": 3.688767671585083, + "rewards/rejected": -0.9523161053657532, + "step": 2007 + }, + { + "epoch": 0.5, + "grad_norm": 3.588179349899292, + "learning_rate": 7.478795082783374e-06, + "logits/chosen": -0.23927977681159973, + "logits/rejected": -0.35686051845550537, + "logps/chosen": -59.080841064453125, + "logps/rejected": -68.6656494140625, + "loss": 0.773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0046117305755615, + "rewards/margins": 5.220296859741211, + "rewards/rejected": -2.2156858444213867, + "step": 2008 + }, + { + "epoch": 0.5, + "grad_norm": 6.161584854125977, + "learning_rate": 7.476519602582282e-06, + "logits/chosen": -0.2120731770992279, + "logits/rejected": -0.28374600410461426, + "logps/chosen": -58.650856018066406, + "logps/rejected": -79.69637298583984, + "loss": 0.8999, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.486283779144287, + "rewards/margins": 3.120265245437622, + "rewards/rejected": -0.6339813470840454, + "step": 2009 + }, + { + "epoch": 0.5, + "grad_norm": 6.745330810546875, + "learning_rate": 7.474243442523122e-06, + "logits/chosen": -0.2610008418560028, + "logits/rejected": -0.3394772410392761, + "logps/chosen": -52.43653869628906, + "logps/rejected": -88.429931640625, + "loss": 0.8743, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.769840717315674, + "rewards/margins": 3.3811120986938477, + "rewards/rejected": -0.6112714409828186, + "step": 2010 + }, + { + "epoch": 0.5, + "grad_norm": 4.594130516052246, + "learning_rate": 7.471966603230744e-06, + "logits/chosen": -0.25094854831695557, + "logits/rejected": -0.35495254397392273, + "logps/chosen": -56.94798278808594, + "logps/rejected": -76.80574798583984, + "loss": 0.7922, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0221807956695557, + "rewards/margins": 4.420464038848877, + "rewards/rejected": -1.3982830047607422, + "step": 2011 + }, + { + "epoch": 0.5, + "grad_norm": 6.423919200897217, + "learning_rate": 7.469689085330196e-06, + "logits/chosen": -0.26726216077804565, + "logits/rejected": -0.3097223937511444, + "logps/chosen": -58.4944953918457, + "logps/rejected": -79.24488067626953, + "loss": 0.8502, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0532312393188477, + "rewards/margins": 3.864715337753296, + "rewards/rejected": -0.8114842176437378, + "step": 2012 + }, + { + "epoch": 0.5, + "grad_norm": 5.015124320983887, + "learning_rate": 7.467410889446701e-06, + "logits/chosen": -0.18824642896652222, + "logits/rejected": -0.23131033778190613, + "logps/chosen": -57.139556884765625, + "logps/rejected": -86.10012817382812, + "loss": 0.8847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8573384284973145, + "rewards/margins": 3.340397357940674, + "rewards/rejected": -0.4830586314201355, + "step": 2013 + }, + { + "epoch": 0.5, + "grad_norm": 3.6239395141601562, + "learning_rate": 7.465132016205674e-06, + "logits/chosen": -0.24478302896022797, + "logits/rejected": -0.3858999013900757, + "logps/chosen": -73.78907012939453, + "logps/rejected": -77.37078094482422, + "loss": 0.8125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7460896968841553, + "rewards/margins": 4.717555999755859, + "rewards/rejected": -1.971466302871704, + "step": 2014 + }, + { + "epoch": 0.5, + "grad_norm": 2.2107961177825928, + "learning_rate": 7.462852466232715e-06, + "logits/chosen": -0.2918417453765869, + "logits/rejected": -0.4667920768260956, + "logps/chosen": -55.14739990234375, + "logps/rejected": -73.41316223144531, + "loss": 0.5852, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8181514739990234, + "rewards/margins": 5.18372106552124, + "rewards/rejected": -2.365570068359375, + "step": 2015 + }, + { + "epoch": 0.5, + "grad_norm": 3.408130168914795, + "learning_rate": 7.4605722401536106e-06, + "logits/chosen": -0.2704653739929199, + "logits/rejected": -0.4375876486301422, + "logps/chosen": -48.685577392578125, + "logps/rejected": -66.06913757324219, + "loss": 0.6, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9840450286865234, + "rewards/margins": 4.958310127258301, + "rewards/rejected": -1.9742655754089355, + "step": 2016 + }, + { + "epoch": 0.5, + "grad_norm": 1.888656497001648, + "learning_rate": 7.458291338594331e-06, + "logits/chosen": -0.1466880440711975, + "logits/rejected": -0.3335815370082855, + "logps/chosen": -57.14665603637695, + "logps/rejected": -75.04350280761719, + "loss": 0.551, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0156846046447754, + "rewards/margins": 5.197877407073975, + "rewards/rejected": -2.1821930408477783, + "step": 2017 + }, + { + "epoch": 0.5, + "grad_norm": 4.170684337615967, + "learning_rate": 7.4560097621810346e-06, + "logits/chosen": -0.0873708724975586, + "logits/rejected": -0.27521684765815735, + "logps/chosen": -71.06947326660156, + "logps/rejected": -71.44621276855469, + "loss": 0.811, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7070071697235107, + "rewards/margins": 3.55180287361145, + "rewards/rejected": -0.8447954058647156, + "step": 2018 + }, + { + "epoch": 0.51, + "grad_norm": 5.905140399932861, + "learning_rate": 7.45372751154006e-06, + "logits/chosen": -0.2027454972267151, + "logits/rejected": -0.3279259204864502, + "logps/chosen": -61.55508804321289, + "logps/rejected": -88.31771087646484, + "loss": 0.7815, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.843630313873291, + "rewards/margins": 4.306580066680908, + "rewards/rejected": -1.4629499912261963, + "step": 2019 + }, + { + "epoch": 0.51, + "grad_norm": 3.5519421100616455, + "learning_rate": 7.451444587297937e-06, + "logits/chosen": -0.34477269649505615, + "logits/rejected": -0.40406474471092224, + "logps/chosen": -49.1024284362793, + "logps/rejected": -80.08708190917969, + "loss": 0.6931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9605164527893066, + "rewards/margins": 4.038399696350098, + "rewards/rejected": -1.0778834819793701, + "step": 2020 + }, + { + "epoch": 0.51, + "grad_norm": 7.653642654418945, + "learning_rate": 7.449160990081377e-06, + "logits/chosen": -0.32138144969940186, + "logits/rejected": -0.3939605951309204, + "logps/chosen": -52.7991828918457, + "logps/rejected": -80.5954818725586, + "loss": 0.7657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.299492120742798, + "rewards/margins": 4.483851909637451, + "rewards/rejected": -1.1843597888946533, + "step": 2021 + }, + { + "epoch": 0.51, + "grad_norm": 7.710453510284424, + "learning_rate": 7.4468767205172755e-06, + "logits/chosen": -0.346699595451355, + "logits/rejected": -0.42676255106925964, + "logps/chosen": -54.258140563964844, + "logps/rejected": -70.38383483886719, + "loss": 0.8197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7294325828552246, + "rewards/margins": 3.807925224304199, + "rewards/rejected": -1.0784924030303955, + "step": 2022 + }, + { + "epoch": 0.51, + "grad_norm": 3.7460837364196777, + "learning_rate": 7.444591779232716e-06, + "logits/chosen": -0.303888738155365, + "logits/rejected": -0.38418638706207275, + "logps/chosen": -57.67195510864258, + "logps/rejected": -77.1217269897461, + "loss": 0.8228, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.995758295059204, + "rewards/margins": 4.553310871124268, + "rewards/rejected": -1.5575522184371948, + "step": 2023 + }, + { + "epoch": 0.51, + "grad_norm": 7.034392356872559, + "learning_rate": 7.4423061668549625e-06, + "logits/chosen": -0.1754245012998581, + "logits/rejected": -0.2151910960674286, + "logps/chosen": -64.45460510253906, + "logps/rejected": -78.97660827636719, + "loss": 0.9048, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.559277057647705, + "rewards/margins": 3.2995057106018066, + "rewards/rejected": -0.7402283549308777, + "step": 2024 + }, + { + "epoch": 0.51, + "grad_norm": 4.345500946044922, + "learning_rate": 7.440019884011464e-06, + "logits/chosen": -0.2524023652076721, + "logits/rejected": -0.3447258472442627, + "logps/chosen": -64.96368408203125, + "logps/rejected": -70.57785034179688, + "loss": 0.8937, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.064932346343994, + "rewards/margins": 3.3579025268554688, + "rewards/rejected": -0.2929702699184418, + "step": 2025 + }, + { + "epoch": 0.51, + "grad_norm": 6.5379252433776855, + "learning_rate": 7.437732931329858e-06, + "logits/chosen": -0.2660989761352539, + "logits/rejected": -0.39530497789382935, + "logps/chosen": -53.4444580078125, + "logps/rejected": -77.89935302734375, + "loss": 0.8345, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7044479846954346, + "rewards/margins": 3.823922872543335, + "rewards/rejected": -1.1194744110107422, + "step": 2026 + }, + { + "epoch": 0.51, + "grad_norm": 4.046133041381836, + "learning_rate": 7.435445309437958e-06, + "logits/chosen": -0.17644785344600677, + "logits/rejected": -0.2863476276397705, + "logps/chosen": -63.99126434326172, + "logps/rejected": -78.52992248535156, + "loss": 0.882, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.635319709777832, + "rewards/margins": 3.769259452819824, + "rewards/rejected": -1.133939504623413, + "step": 2027 + }, + { + "epoch": 0.51, + "grad_norm": 3.930593252182007, + "learning_rate": 7.433157018963771e-06, + "logits/chosen": -0.22834360599517822, + "logits/rejected": -0.3532628118991852, + "logps/chosen": -52.83848571777344, + "logps/rejected": -83.9590835571289, + "loss": 0.7914, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9611408710479736, + "rewards/margins": 4.8842573165893555, + "rewards/rejected": -1.923116683959961, + "step": 2028 + }, + { + "epoch": 0.51, + "grad_norm": 3.8332674503326416, + "learning_rate": 7.4308680605354765e-06, + "logits/chosen": -0.27346840500831604, + "logits/rejected": -0.33725202083587646, + "logps/chosen": -57.455657958984375, + "logps/rejected": -77.4564437866211, + "loss": 0.6896, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2864878177642822, + "rewards/margins": 4.1062211990356445, + "rewards/rejected": -0.8197335004806519, + "step": 2029 + }, + { + "epoch": 0.51, + "grad_norm": 4.677030086517334, + "learning_rate": 7.428578434781446e-06, + "logits/chosen": -0.18795812129974365, + "logits/rejected": -0.288632333278656, + "logps/chosen": -67.26300048828125, + "logps/rejected": -82.6047592163086, + "loss": 0.8288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5953071117401123, + "rewards/margins": 3.7234034538269043, + "rewards/rejected": -1.128096580505371, + "step": 2030 + }, + { + "epoch": 0.51, + "grad_norm": 5.079602241516113, + "learning_rate": 7.42628814233023e-06, + "logits/chosen": -0.23473820090293884, + "logits/rejected": -0.32557612657546997, + "logps/chosen": -50.6103515625, + "logps/rejected": -64.73087310791016, + "loss": 0.8381, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3015096187591553, + "rewards/margins": 2.971374988555908, + "rewards/rejected": -0.6698655486106873, + "step": 2031 + }, + { + "epoch": 0.51, + "grad_norm": 7.914364814758301, + "learning_rate": 7.423997183810565e-06, + "logits/chosen": -0.28995245695114136, + "logits/rejected": -0.3833315372467041, + "logps/chosen": -60.950767517089844, + "logps/rejected": -73.50104522705078, + "loss": 0.9288, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.929222583770752, + "rewards/margins": 3.4333889484405518, + "rewards/rejected": -0.504166305065155, + "step": 2032 + }, + { + "epoch": 0.51, + "grad_norm": 5.290042877197266, + "learning_rate": 7.421705559851366e-06, + "logits/chosen": -0.2721322476863861, + "logits/rejected": -0.3076219856739044, + "logps/chosen": -47.66267395019531, + "logps/rejected": -76.56462860107422, + "loss": 0.7613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0211405754089355, + "rewards/margins": 3.7193381786346436, + "rewards/rejected": -0.6981972455978394, + "step": 2033 + }, + { + "epoch": 0.51, + "grad_norm": 4.071559429168701, + "learning_rate": 7.419413271081737e-06, + "logits/chosen": -0.2731415927410126, + "logits/rejected": -0.3746749758720398, + "logps/chosen": -49.80088806152344, + "logps/rejected": -65.42546844482422, + "loss": 0.809, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.004429340362549, + "rewards/margins": 3.7851474285125732, + "rewards/rejected": -0.7807180881500244, + "step": 2034 + }, + { + "epoch": 0.51, + "grad_norm": 9.376165390014648, + "learning_rate": 7.417120318130955e-06, + "logits/chosen": -0.27604159712791443, + "logits/rejected": -0.3416292667388916, + "logps/chosen": -44.363746643066406, + "logps/rejected": -65.56401824951172, + "loss": 0.7273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.044750928878784, + "rewards/margins": 3.40490984916687, + "rewards/rejected": -0.36015886068344116, + "step": 2035 + }, + { + "epoch": 0.51, + "grad_norm": 5.000338554382324, + "learning_rate": 7.41482670162849e-06, + "logits/chosen": -0.2368626743555069, + "logits/rejected": -0.27850911021232605, + "logps/chosen": -67.69868469238281, + "logps/rejected": -90.0087661743164, + "loss": 0.9085, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9312922954559326, + "rewards/margins": 3.944992780685425, + "rewards/rejected": -1.0137004852294922, + "step": 2036 + }, + { + "epoch": 0.51, + "grad_norm": 5.981419086456299, + "learning_rate": 7.4125324222039866e-06, + "logits/chosen": -0.23543652892112732, + "logits/rejected": -0.3320864737033844, + "logps/chosen": -53.617794036865234, + "logps/rejected": -75.68546295166016, + "loss": 0.9439, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.843459129333496, + "rewards/margins": 3.815166473388672, + "rewards/rejected": -0.9717071056365967, + "step": 2037 + }, + { + "epoch": 0.51, + "grad_norm": 7.050544738769531, + "learning_rate": 7.410237480487277e-06, + "logits/chosen": -0.2495870143175125, + "logits/rejected": -0.2746140956878662, + "logps/chosen": -65.48970031738281, + "logps/rejected": -89.18656921386719, + "loss": 0.9654, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.704893112182617, + "rewards/margins": 2.991886615753174, + "rewards/rejected": -0.28699368238449097, + "step": 2038 + }, + { + "epoch": 0.51, + "grad_norm": 2.7966976165771484, + "learning_rate": 7.4079418771083686e-06, + "logits/chosen": -0.21586616337299347, + "logits/rejected": -0.344774454832077, + "logps/chosen": -54.84006881713867, + "logps/rejected": -82.27418518066406, + "loss": 0.6372, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.312681198120117, + "rewards/margins": 4.580569744110107, + "rewards/rejected": -1.2678884267807007, + "step": 2039 + }, + { + "epoch": 0.51, + "grad_norm": 4.9612226486206055, + "learning_rate": 7.405645612697456e-06, + "logits/chosen": -0.14450372755527496, + "logits/rejected": -0.2241482138633728, + "logps/chosen": -55.6500129699707, + "logps/rejected": -90.31792449951172, + "loss": 0.6533, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0112593173980713, + "rewards/margins": 3.964576244354248, + "rewards/rejected": -0.9533169865608215, + "step": 2040 + }, + { + "epoch": 0.51, + "grad_norm": 3.212876319885254, + "learning_rate": 7.403348687884912e-06, + "logits/chosen": -0.24734817445278168, + "logits/rejected": -0.3489946126937866, + "logps/chosen": -57.44252014160156, + "logps/rejected": -72.89502716064453, + "loss": 0.7286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1976001262664795, + "rewards/margins": 4.353909492492676, + "rewards/rejected": -1.1563094854354858, + "step": 2041 + }, + { + "epoch": 0.51, + "grad_norm": 7.126574993133545, + "learning_rate": 7.401051103301294e-06, + "logits/chosen": -0.23392383754253387, + "logits/rejected": -0.2849414646625519, + "logps/chosen": -69.52767181396484, + "logps/rejected": -74.83759307861328, + "loss": 0.9504, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.935159683227539, + "rewards/margins": 2.949432134628296, + "rewards/rejected": -0.01427246630191803, + "step": 2042 + }, + { + "epoch": 0.51, + "grad_norm": 3.507089614868164, + "learning_rate": 7.398752859577338e-06, + "logits/chosen": -0.19765448570251465, + "logits/rejected": -0.25755858421325684, + "logps/chosen": -63.298030853271484, + "logps/rejected": -83.05992889404297, + "loss": 0.7426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7916667461395264, + "rewards/margins": 3.9129104614257812, + "rewards/rejected": -1.1212437152862549, + "step": 2043 + }, + { + "epoch": 0.51, + "grad_norm": 4.459366321563721, + "learning_rate": 7.396453957343961e-06, + "logits/chosen": -0.22729116678237915, + "logits/rejected": -0.3002552092075348, + "logps/chosen": -59.06534194946289, + "logps/rejected": -68.19488525390625, + "loss": 0.8124, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.845163583755493, + "rewards/margins": 3.5686845779418945, + "rewards/rejected": -0.7235209345817566, + "step": 2044 + }, + { + "epoch": 0.51, + "grad_norm": 3.5589253902435303, + "learning_rate": 7.394154397232261e-06, + "logits/chosen": -0.2273729145526886, + "logits/rejected": -0.37634652853012085, + "logps/chosen": -55.2147102355957, + "logps/rejected": -64.52738952636719, + "loss": 0.7577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.835970878601074, + "rewards/margins": 3.7305123805999756, + "rewards/rejected": -0.8945415019989014, + "step": 2045 + }, + { + "epoch": 0.51, + "grad_norm": 5.306704044342041, + "learning_rate": 7.3918541798735165e-06, + "logits/chosen": -0.3116873502731323, + "logits/rejected": -0.35925930738449097, + "logps/chosen": -45.64842987060547, + "logps/rejected": -64.21912384033203, + "loss": 0.8077, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8505539894104004, + "rewards/margins": 3.5491251945495605, + "rewards/rejected": -0.6985713839530945, + "step": 2046 + }, + { + "epoch": 0.51, + "grad_norm": 3.8150811195373535, + "learning_rate": 7.389553305899187e-06, + "logits/chosen": -0.2536656856536865, + "logits/rejected": -0.34441882371902466, + "logps/chosen": -54.885353088378906, + "logps/rejected": -67.26605987548828, + "loss": 0.7896, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6781089305877686, + "rewards/margins": 3.9322545528411865, + "rewards/rejected": -1.2541457414627075, + "step": 2047 + }, + { + "epoch": 0.51, + "grad_norm": 6.747729301452637, + "learning_rate": 7.3872517759409135e-06, + "logits/chosen": -0.24538400769233704, + "logits/rejected": -0.3308284878730774, + "logps/chosen": -65.71475982666016, + "logps/rejected": -74.9765853881836, + "loss": 0.8114, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.453991413116455, + "rewards/margins": 3.5808489322662354, + "rewards/rejected": -1.1268575191497803, + "step": 2048 + }, + { + "epoch": 0.51, + "grad_norm": 7.14964485168457, + "learning_rate": 7.384949590630516e-06, + "logits/chosen": -0.24924354255199432, + "logits/rejected": -0.37323710322380066, + "logps/chosen": -62.88913345336914, + "logps/rejected": -75.01614379882812, + "loss": 0.9141, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.577914237976074, + "rewards/margins": 3.744093656539917, + "rewards/rejected": -1.166179895401001, + "step": 2049 + }, + { + "epoch": 0.51, + "grad_norm": 6.711708068847656, + "learning_rate": 7.382646750599991e-06, + "logits/chosen": -0.2381880283355713, + "logits/rejected": -0.3786148428916931, + "logps/chosen": -51.77519226074219, + "logps/rejected": -66.15270233154297, + "loss": 0.8675, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.863072156906128, + "rewards/margins": 3.93599796295166, + "rewards/rejected": -1.0729258060455322, + "step": 2050 + }, + { + "epoch": 0.51, + "grad_norm": 3.400735855102539, + "learning_rate": 7.380343256481519e-06, + "logits/chosen": -0.24014434218406677, + "logits/rejected": -0.41069847345352173, + "logps/chosen": -61.807777404785156, + "logps/rejected": -78.17539978027344, + "loss": 0.7281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.968540668487549, + "rewards/margins": 5.153913497924805, + "rewards/rejected": -2.185373067855835, + "step": 2051 + }, + { + "epoch": 0.51, + "grad_norm": 4.923370838165283, + "learning_rate": 7.378039108907461e-06, + "logits/chosen": -0.24869967997074127, + "logits/rejected": -0.36754652857780457, + "logps/chosen": -56.96467590332031, + "logps/rejected": -67.87550354003906, + "loss": 0.8212, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6468276977539062, + "rewards/margins": 3.600038528442383, + "rewards/rejected": -0.9532108902931213, + "step": 2052 + }, + { + "epoch": 0.51, + "grad_norm": 4.52139139175415, + "learning_rate": 7.3757343085103525e-06, + "logits/chosen": -0.23878014087677002, + "logits/rejected": -0.41601869463920593, + "logps/chosen": -63.280067443847656, + "logps/rejected": -76.57746887207031, + "loss": 0.8242, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.007178783416748, + "rewards/margins": 4.296503067016602, + "rewards/rejected": -1.2893240451812744, + "step": 2053 + }, + { + "epoch": 0.51, + "grad_norm": 3.1611483097076416, + "learning_rate": 7.373428855922912e-06, + "logits/chosen": -0.3349289894104004, + "logits/rejected": -0.4890783131122589, + "logps/chosen": -50.76967239379883, + "logps/rejected": -68.35528564453125, + "loss": 0.661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.059022903442383, + "rewards/margins": 4.864102840423584, + "rewards/rejected": -1.805079698562622, + "step": 2054 + }, + { + "epoch": 0.51, + "grad_norm": 5.534727096557617, + "learning_rate": 7.371122751778037e-06, + "logits/chosen": -0.2722058594226837, + "logits/rejected": -0.33858850598335266, + "logps/chosen": -52.21108627319336, + "logps/rejected": -85.47112274169922, + "loss": 0.8046, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.931114673614502, + "rewards/margins": 3.3483643531799316, + "rewards/rejected": -0.417249858379364, + "step": 2055 + }, + { + "epoch": 0.51, + "grad_norm": 4.050187587738037, + "learning_rate": 7.368815996708801e-06, + "logits/chosen": -0.2299463301897049, + "logits/rejected": -0.3088679909706116, + "logps/chosen": -52.88954162597656, + "logps/rejected": -73.02452087402344, + "loss": 0.76, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9711766242980957, + "rewards/margins": 3.4494638442993164, + "rewards/rejected": -0.47828730940818787, + "step": 2056 + }, + { + "epoch": 0.51, + "grad_norm": 4.090572834014893, + "learning_rate": 7.366508591348458e-06, + "logits/chosen": -0.23462232947349548, + "logits/rejected": -0.3681827485561371, + "logps/chosen": -71.71769714355469, + "logps/rejected": -66.09500122070312, + "loss": 0.7735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.683533191680908, + "rewards/margins": 3.2488675117492676, + "rewards/rejected": -0.5653342008590698, + "step": 2057 + }, + { + "epoch": 0.51, + "grad_norm": 4.911806106567383, + "learning_rate": 7.364200536330444e-06, + "logits/chosen": -0.17477190494537354, + "logits/rejected": -0.22873444855213165, + "logps/chosen": -55.54302215576172, + "logps/rejected": -80.09762573242188, + "loss": 0.6779, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9648778438568115, + "rewards/margins": 3.8699746131896973, + "rewards/rejected": -0.9050968885421753, + "step": 2058 + }, + { + "epoch": 0.52, + "grad_norm": 4.488318920135498, + "learning_rate": 7.361891832288366e-06, + "logits/chosen": -0.30625271797180176, + "logits/rejected": -0.3983533978462219, + "logps/chosen": -54.064781188964844, + "logps/rejected": -73.30754089355469, + "loss": 0.7472, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0122976303100586, + "rewards/margins": 4.199054718017578, + "rewards/rejected": -1.1867570877075195, + "step": 2059 + }, + { + "epoch": 0.52, + "grad_norm": 2.9025235176086426, + "learning_rate": 7.3595824798560135e-06, + "logits/chosen": -0.16901102662086487, + "logits/rejected": -0.34812772274017334, + "logps/chosen": -48.431251525878906, + "logps/rejected": -64.00824737548828, + "loss": 0.6063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1727521419525146, + "rewards/margins": 4.774794101715088, + "rewards/rejected": -1.6020417213439941, + "step": 2060 + }, + { + "epoch": 0.52, + "grad_norm": 5.538657188415527, + "learning_rate": 7.357272479667355e-06, + "logits/chosen": -0.17894692718982697, + "logits/rejected": -0.2839222252368927, + "logps/chosen": -58.61553192138672, + "logps/rejected": -102.00946807861328, + "loss": 0.7304, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7142999172210693, + "rewards/margins": 4.3200836181640625, + "rewards/rejected": -1.6057839393615723, + "step": 2061 + }, + { + "epoch": 0.52, + "grad_norm": 6.4923095703125, + "learning_rate": 7.354961832356535e-06, + "logits/chosen": -0.20286378264427185, + "logits/rejected": -0.2832581400871277, + "logps/chosen": -66.45477294921875, + "logps/rejected": -84.80127716064453, + "loss": 0.686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.637491464614868, + "rewards/margins": 3.8889317512512207, + "rewards/rejected": -1.2514402866363525, + "step": 2062 + }, + { + "epoch": 0.52, + "grad_norm": 5.536057472229004, + "learning_rate": 7.352650538557876e-06, + "logits/chosen": -0.16792449355125427, + "logits/rejected": -0.21968349814414978, + "logps/chosen": -59.52082061767578, + "logps/rejected": -83.37074279785156, + "loss": 0.9399, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.735802173614502, + "rewards/margins": 3.1429858207702637, + "rewards/rejected": -0.4071836471557617, + "step": 2063 + }, + { + "epoch": 0.52, + "grad_norm": 7.881381988525391, + "learning_rate": 7.350338598905878e-06, + "logits/chosen": -0.25947701930999756, + "logits/rejected": -0.3511209785938263, + "logps/chosen": -50.362945556640625, + "logps/rejected": -81.897705078125, + "loss": 0.856, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.827019691467285, + "rewards/margins": 3.664013147354126, + "rewards/rejected": -0.8369938135147095, + "step": 2064 + }, + { + "epoch": 0.52, + "grad_norm": 7.310354232788086, + "learning_rate": 7.348026014035219e-06, + "logits/chosen": -0.26471689343452454, + "logits/rejected": -0.32652032375335693, + "logps/chosen": -53.4123420715332, + "logps/rejected": -82.22843170166016, + "loss": 0.8083, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.692817449569702, + "rewards/margins": 3.983654260635376, + "rewards/rejected": -1.2908364534378052, + "step": 2065 + }, + { + "epoch": 0.52, + "grad_norm": 6.767618656158447, + "learning_rate": 7.345712784580752e-06, + "logits/chosen": -0.23177585005760193, + "logits/rejected": -0.39526256918907166, + "logps/chosen": -64.7835922241211, + "logps/rejected": -70.30067443847656, + "loss": 0.7759, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.041672706604004, + "rewards/margins": 4.918066501617432, + "rewards/rejected": -1.8763936758041382, + "step": 2066 + }, + { + "epoch": 0.52, + "grad_norm": 2.245635986328125, + "learning_rate": 7.343398911177509e-06, + "logits/chosen": -0.3291325271129608, + "logits/rejected": -0.43139514327049255, + "logps/chosen": -43.02790069580078, + "logps/rejected": -65.79292297363281, + "loss": 0.7033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7586445808410645, + "rewards/margins": 4.593416690826416, + "rewards/rejected": -1.8347715139389038, + "step": 2067 + }, + { + "epoch": 0.52, + "grad_norm": 13.169415473937988, + "learning_rate": 7.341084394460698e-06, + "logits/chosen": -0.24557659029960632, + "logits/rejected": -0.3385309875011444, + "logps/chosen": -58.24976348876953, + "logps/rejected": -67.68768310546875, + "loss": 1.007, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7197506427764893, + "rewards/margins": 3.412100076675415, + "rewards/rejected": -0.6923493146896362, + "step": 2068 + }, + { + "epoch": 0.52, + "grad_norm": 8.987151145935059, + "learning_rate": 7.3387692350657054e-06, + "logits/chosen": -0.25358453392982483, + "logits/rejected": -0.3702850639820099, + "logps/chosen": -62.42345428466797, + "logps/rejected": -83.05934143066406, + "loss": 0.7213, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9678430557250977, + "rewards/margins": 4.320582389831543, + "rewards/rejected": -1.3527389764785767, + "step": 2069 + }, + { + "epoch": 0.52, + "grad_norm": 7.313377857208252, + "learning_rate": 7.33645343362809e-06, + "logits/chosen": -0.2632209360599518, + "logits/rejected": -0.3191608786582947, + "logps/chosen": -56.459110260009766, + "logps/rejected": -82.69539642333984, + "loss": 0.8117, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6175644397735596, + "rewards/margins": 4.150304794311523, + "rewards/rejected": -1.5327399969100952, + "step": 2070 + }, + { + "epoch": 0.52, + "grad_norm": 3.445343494415283, + "learning_rate": 7.334136990783591e-06, + "logits/chosen": -0.2618882656097412, + "logits/rejected": -0.3507225811481476, + "logps/chosen": -66.76499938964844, + "logps/rejected": -74.58076477050781, + "loss": 0.7744, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.940216064453125, + "rewards/margins": 4.044379711151123, + "rewards/rejected": -1.1041638851165771, + "step": 2071 + }, + { + "epoch": 0.52, + "grad_norm": 9.421112060546875, + "learning_rate": 7.331819907168121e-06, + "logits/chosen": -0.2419019341468811, + "logits/rejected": -0.351729154586792, + "logps/chosen": -50.799278259277344, + "logps/rejected": -72.44195556640625, + "loss": 0.6891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.623361825942993, + "rewards/margins": 4.150581359863281, + "rewards/rejected": -1.5272194147109985, + "step": 2072 + }, + { + "epoch": 0.52, + "grad_norm": 4.913817405700684, + "learning_rate": 7.32950218341777e-06, + "logits/chosen": -0.2205074429512024, + "logits/rejected": -0.3464016318321228, + "logps/chosen": -52.10936737060547, + "logps/rejected": -71.19570922851562, + "loss": 0.6775, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0370640754699707, + "rewards/margins": 4.446653842926025, + "rewards/rejected": -1.409590244293213, + "step": 2073 + }, + { + "epoch": 0.52, + "grad_norm": 10.670051574707031, + "learning_rate": 7.327183820168804e-06, + "logits/chosen": -0.2742867171764374, + "logits/rejected": -0.41906502842903137, + "logps/chosen": -64.97261810302734, + "logps/rejected": -65.9269027709961, + "loss": 1.0266, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7857630252838135, + "rewards/margins": 3.709426164627075, + "rewards/rejected": -0.9236629605293274, + "step": 2074 + }, + { + "epoch": 0.52, + "grad_norm": 7.159564018249512, + "learning_rate": 7.324864818057662e-06, + "logits/chosen": -0.316403329372406, + "logits/rejected": -0.34925249218940735, + "logps/chosen": -58.432525634765625, + "logps/rejected": -91.12086486816406, + "loss": 0.9962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.548617362976074, + "rewards/margins": 3.400287628173828, + "rewards/rejected": -0.8516703844070435, + "step": 2075 + }, + { + "epoch": 0.52, + "grad_norm": 5.875775337219238, + "learning_rate": 7.3225451777209585e-06, + "logits/chosen": -0.22969821095466614, + "logits/rejected": -0.3661605417728424, + "logps/chosen": -60.600013732910156, + "logps/rejected": -76.4335708618164, + "loss": 0.7811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9075794219970703, + "rewards/margins": 4.173696041107178, + "rewards/rejected": -1.2661163806915283, + "step": 2076 + }, + { + "epoch": 0.52, + "grad_norm": 2.5007433891296387, + "learning_rate": 7.32022489979549e-06, + "logits/chosen": -0.24954640865325928, + "logits/rejected": -0.38225287199020386, + "logps/chosen": -53.24699783325195, + "logps/rejected": -77.05742645263672, + "loss": 0.6266, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.871309757232666, + "rewards/margins": 4.429826736450195, + "rewards/rejected": -1.558516502380371, + "step": 2077 + }, + { + "epoch": 0.52, + "grad_norm": 4.9619669914245605, + "learning_rate": 7.3179039849182175e-06, + "logits/chosen": -0.24818600714206696, + "logits/rejected": -0.37324318289756775, + "logps/chosen": -57.5074577331543, + "logps/rejected": -77.20793914794922, + "loss": 0.8504, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.698500633239746, + "rewards/margins": 3.881077289581299, + "rewards/rejected": -1.1825768947601318, + "step": 2078 + }, + { + "epoch": 0.52, + "grad_norm": 5.662818431854248, + "learning_rate": 7.315582433726288e-06, + "logits/chosen": -0.24162662029266357, + "logits/rejected": -0.2668762803077698, + "logps/chosen": -52.495548248291016, + "logps/rejected": -84.98725891113281, + "loss": 0.8267, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9065325260162354, + "rewards/margins": 3.366579294204712, + "rewards/rejected": -0.46004703640937805, + "step": 2079 + }, + { + "epoch": 0.52, + "grad_norm": 4.602102756500244, + "learning_rate": 7.313260246857013e-06, + "logits/chosen": -0.26671767234802246, + "logits/rejected": -0.31610268354415894, + "logps/chosen": -65.49657440185547, + "logps/rejected": -81.13349914550781, + "loss": 0.9653, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7414824962615967, + "rewards/margins": 3.279559373855591, + "rewards/rejected": -0.5380772352218628, + "step": 2080 + }, + { + "epoch": 0.52, + "grad_norm": 7.0436787605285645, + "learning_rate": 7.310937424947882e-06, + "logits/chosen": -0.2006620466709137, + "logits/rejected": -0.336665540933609, + "logps/chosen": -63.8536262512207, + "logps/rejected": -72.18111419677734, + "loss": 0.7783, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8075029850006104, + "rewards/margins": 3.8261537551879883, + "rewards/rejected": -1.0186511278152466, + "step": 2081 + }, + { + "epoch": 0.52, + "grad_norm": 3.9627277851104736, + "learning_rate": 7.3086139686365635e-06, + "logits/chosen": -0.25483232736587524, + "logits/rejected": -0.29376649856567383, + "logps/chosen": -54.50377655029297, + "logps/rejected": -85.09626007080078, + "loss": 0.799, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.187960624694824, + "rewards/margins": 3.991730213165283, + "rewards/rejected": -0.8037698268890381, + "step": 2082 + }, + { + "epoch": 0.52, + "grad_norm": 4.713006019592285, + "learning_rate": 7.306289878560895e-06, + "logits/chosen": -0.21972446143627167, + "logits/rejected": -0.2961924374103546, + "logps/chosen": -60.695091247558594, + "logps/rejected": -74.36054992675781, + "loss": 0.8304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8340985774993896, + "rewards/margins": 4.114471435546875, + "rewards/rejected": -1.2803728580474854, + "step": 2083 + }, + { + "epoch": 0.52, + "grad_norm": 2.8522891998291016, + "learning_rate": 7.303965155358888e-06, + "logits/chosen": -0.3324827551841736, + "logits/rejected": -0.4174790680408478, + "logps/chosen": -55.106658935546875, + "logps/rejected": -75.32711791992188, + "loss": 0.7076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.908848285675049, + "rewards/margins": 4.370555877685547, + "rewards/rejected": -1.4617074728012085, + "step": 2084 + }, + { + "epoch": 0.52, + "grad_norm": 6.1174635887146, + "learning_rate": 7.3016397996687306e-06, + "logits/chosen": -0.1675405651330948, + "logits/rejected": -0.2973198890686035, + "logps/chosen": -56.346839904785156, + "logps/rejected": -80.32139587402344, + "loss": 0.697, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8075613975524902, + "rewards/margins": 3.8238914012908936, + "rewards/rejected": -1.0163300037384033, + "step": 2085 + }, + { + "epoch": 0.52, + "grad_norm": 5.0569658279418945, + "learning_rate": 7.299313812128782e-06, + "logits/chosen": -0.35499072074890137, + "logits/rejected": -0.40524041652679443, + "logps/chosen": -60.829769134521484, + "logps/rejected": -77.91041564941406, + "loss": 0.8511, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8584275245666504, + "rewards/margins": 4.034635066986084, + "rewards/rejected": -1.1762075424194336, + "step": 2086 + }, + { + "epoch": 0.52, + "grad_norm": 7.868639945983887, + "learning_rate": 7.296987193377578e-06, + "logits/chosen": -0.19185015559196472, + "logits/rejected": -0.34446394443511963, + "logps/chosen": -62.81184005737305, + "logps/rejected": -77.0833740234375, + "loss": 0.8884, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.743121385574341, + "rewards/margins": 4.126437187194824, + "rewards/rejected": -1.383315920829773, + "step": 2087 + }, + { + "epoch": 0.52, + "grad_norm": 8.106045722961426, + "learning_rate": 7.294659944053822e-06, + "logits/chosen": -0.22667355835437775, + "logits/rejected": -0.3412662446498871, + "logps/chosen": -59.68603515625, + "logps/rejected": -68.44670104980469, + "loss": 0.9965, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5451436042785645, + "rewards/margins": 3.1651690006256104, + "rewards/rejected": -0.6200255751609802, + "step": 2088 + }, + { + "epoch": 0.52, + "grad_norm": 4.140552520751953, + "learning_rate": 7.292332064796401e-06, + "logits/chosen": -0.3148074448108673, + "logits/rejected": -0.3884027302265167, + "logps/chosen": -46.192935943603516, + "logps/rejected": -75.75712585449219, + "loss": 0.871, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9170608520507812, + "rewards/margins": 3.6441493034362793, + "rewards/rejected": -0.7270885109901428, + "step": 2089 + }, + { + "epoch": 0.52, + "grad_norm": 6.681706428527832, + "learning_rate": 7.290003556244359e-06, + "logits/chosen": -0.28057920932769775, + "logits/rejected": -0.3176347315311432, + "logps/chosen": -45.25886154174805, + "logps/rejected": -74.42143249511719, + "loss": 0.8526, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7241508960723877, + "rewards/margins": 3.1786389350891113, + "rewards/rejected": -0.45448780059814453, + "step": 2090 + }, + { + "epoch": 0.52, + "grad_norm": 2.329613208770752, + "learning_rate": 7.287674419036928e-06, + "logits/chosen": -0.2198261171579361, + "logits/rejected": -0.38438284397125244, + "logps/chosen": -53.120540618896484, + "logps/rejected": -68.99951934814453, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019338607788086, + "rewards/margins": 4.983926773071289, + "rewards/rejected": -1.9645886421203613, + "step": 2091 + }, + { + "epoch": 0.52, + "grad_norm": 6.316582202911377, + "learning_rate": 7.285344653813505e-06, + "logits/chosen": -0.2508038580417633, + "logits/rejected": -0.379817932844162, + "logps/chosen": -53.23464584350586, + "logps/rejected": -62.820960998535156, + "loss": 0.8406, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6384835243225098, + "rewards/margins": 3.5787978172302246, + "rewards/rejected": -0.9403144717216492, + "step": 2092 + }, + { + "epoch": 0.52, + "grad_norm": 8.622722625732422, + "learning_rate": 7.283014261213661e-06, + "logits/chosen": -0.19779619574546814, + "logits/rejected": -0.3179297149181366, + "logps/chosen": -54.237464904785156, + "logps/rejected": -63.696998596191406, + "loss": 0.8211, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6637704372406006, + "rewards/margins": 2.8589606285095215, + "rewards/rejected": -0.19518986344337463, + "step": 2093 + }, + { + "epoch": 0.52, + "grad_norm": 5.761071681976318, + "learning_rate": 7.280683241877139e-06, + "logits/chosen": -0.1861533373594284, + "logits/rejected": -0.3021687865257263, + "logps/chosen": -52.256080627441406, + "logps/rejected": -71.71859741210938, + "loss": 0.9066, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7744414806365967, + "rewards/margins": 3.213606595993042, + "rewards/rejected": -0.4391651451587677, + "step": 2094 + }, + { + "epoch": 0.52, + "grad_norm": 3.4696578979492188, + "learning_rate": 7.2783515964438535e-06, + "logits/chosen": -0.21310536563396454, + "logits/rejected": -0.31019163131713867, + "logps/chosen": -54.424964904785156, + "logps/rejected": -92.39840698242188, + "loss": 0.6789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.816286563873291, + "rewards/margins": 4.931187629699707, + "rewards/rejected": -2.114901304244995, + "step": 2095 + }, + { + "epoch": 0.52, + "grad_norm": 3.5182015895843506, + "learning_rate": 7.276019325553891e-06, + "logits/chosen": -0.24464690685272217, + "logits/rejected": -0.31683722138404846, + "logps/chosen": -50.57717514038086, + "logps/rejected": -73.8388442993164, + "loss": 0.7493, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0866124629974365, + "rewards/margins": 4.171935558319092, + "rewards/rejected": -1.0853229761123657, + "step": 2096 + }, + { + "epoch": 0.52, + "grad_norm": 8.290116310119629, + "learning_rate": 7.273686429847512e-06, + "logits/chosen": -0.19941627979278564, + "logits/rejected": -0.34877246618270874, + "logps/chosen": -55.101654052734375, + "logps/rejected": -62.09992218017578, + "loss": 0.7807, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.94063663482666, + "rewards/margins": 4.1176557540893555, + "rewards/rejected": -1.1770187616348267, + "step": 2097 + }, + { + "epoch": 0.52, + "grad_norm": 4.416600704193115, + "learning_rate": 7.271352909965145e-06, + "logits/chosen": -0.24448511004447937, + "logits/rejected": -0.33857956528663635, + "logps/chosen": -60.65593719482422, + "logps/rejected": -86.94500732421875, + "loss": 0.8446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.957784652709961, + "rewards/margins": 3.9555716514587402, + "rewards/rejected": -0.9977872967720032, + "step": 2098 + }, + { + "epoch": 0.53, + "grad_norm": 3.337999105453491, + "learning_rate": 7.269018766547393e-06, + "logits/chosen": -0.19616061449050903, + "logits/rejected": -0.2813672721385956, + "logps/chosen": -60.591888427734375, + "logps/rejected": -73.23295593261719, + "loss": 0.7879, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.004932403564453, + "rewards/margins": 3.919996738433838, + "rewards/rejected": -0.9150645136833191, + "step": 2099 + }, + { + "epoch": 0.53, + "grad_norm": 4.0430216789245605, + "learning_rate": 7.266684000235028e-06, + "logits/chosen": -0.23725536465644836, + "logits/rejected": -0.3245375156402588, + "logps/chosen": -55.9874382019043, + "logps/rejected": -75.18001556396484, + "loss": 0.8255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0180001258850098, + "rewards/margins": 4.346855163574219, + "rewards/rejected": -1.3288546800613403, + "step": 2100 + }, + { + "epoch": 0.53, + "grad_norm": 4.140961170196533, + "learning_rate": 7.2643486116689946e-06, + "logits/chosen": -0.2286938726902008, + "logits/rejected": -0.2819022238254547, + "logps/chosen": -52.21028518676758, + "logps/rejected": -81.05109405517578, + "loss": 0.891, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9993019104003906, + "rewards/margins": 3.3613476753234863, + "rewards/rejected": -0.3620458245277405, + "step": 2101 + }, + { + "epoch": 0.53, + "grad_norm": 5.301571369171143, + "learning_rate": 7.262012601490404e-06, + "logits/chosen": -0.12069676071405411, + "logits/rejected": -0.17303995788097382, + "logps/chosen": -51.73942184448242, + "logps/rejected": -81.309326171875, + "loss": 0.763, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7564549446105957, + "rewards/margins": 3.2186191082000732, + "rewards/rejected": -0.4621641933917999, + "step": 2102 + }, + { + "epoch": 0.53, + "grad_norm": 7.0192389488220215, + "learning_rate": 7.259675970340545e-06, + "logits/chosen": -0.2565092146396637, + "logits/rejected": -0.3506331443786621, + "logps/chosen": -65.52970123291016, + "logps/rejected": -85.48530578613281, + "loss": 0.9182, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9034528732299805, + "rewards/margins": 3.9191770553588867, + "rewards/rejected": -1.0157241821289062, + "step": 2103 + }, + { + "epoch": 0.53, + "grad_norm": 3.5950262546539307, + "learning_rate": 7.2573387188608735e-06, + "logits/chosen": -0.25964659452438354, + "logits/rejected": -0.3612785339355469, + "logps/chosen": -60.79608154296875, + "logps/rejected": -66.95133209228516, + "loss": 0.7674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9641494750976562, + "rewards/margins": 3.9396679401397705, + "rewards/rejected": -0.9755182266235352, + "step": 2104 + }, + { + "epoch": 0.53, + "grad_norm": 3.9432358741760254, + "learning_rate": 7.255000847693012e-06, + "logits/chosen": -0.10919509828090668, + "logits/rejected": -0.22949427366256714, + "logps/chosen": -67.62755584716797, + "logps/rejected": -79.91526794433594, + "loss": 0.7382, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9052746295928955, + "rewards/margins": 3.613133430480957, + "rewards/rejected": -0.7078588604927063, + "step": 2105 + }, + { + "epoch": 0.53, + "grad_norm": 8.666848182678223, + "learning_rate": 7.2526623574787606e-06, + "logits/chosen": -0.215314120054245, + "logits/rejected": -0.27935799956321716, + "logps/chosen": -53.84457778930664, + "logps/rejected": -65.91619873046875, + "loss": 0.815, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8374180793762207, + "rewards/margins": 3.287283420562744, + "rewards/rejected": -0.449865460395813, + "step": 2106 + }, + { + "epoch": 0.53, + "grad_norm": 4.4877119064331055, + "learning_rate": 7.250323248860083e-06, + "logits/chosen": -0.30959662795066833, + "logits/rejected": -0.3172222375869751, + "logps/chosen": -59.406185150146484, + "logps/rejected": -81.44170379638672, + "loss": 0.8257, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.75163197517395, + "rewards/margins": 2.99306321144104, + "rewards/rejected": -0.24143090844154358, + "step": 2107 + }, + { + "epoch": 0.53, + "grad_norm": 2.2886993885040283, + "learning_rate": 7.247983522479114e-06, + "logits/chosen": -0.22570976614952087, + "logits/rejected": -0.2770098149776459, + "logps/chosen": -45.98332214355469, + "logps/rejected": -75.5556411743164, + "loss": 0.7201, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0105817317962646, + "rewards/margins": 4.1801910400390625, + "rewards/rejected": -1.1696089506149292, + "step": 2108 + }, + { + "epoch": 0.53, + "grad_norm": 4.4398980140686035, + "learning_rate": 7.2456431789781615e-06, + "logits/chosen": -0.26532670855522156, + "logits/rejected": -0.3551254868507385, + "logps/chosen": -57.90483093261719, + "logps/rejected": -76.79934692382812, + "loss": 0.8775, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7153000831604004, + "rewards/margins": 3.3124561309814453, + "rewards/rejected": -0.5971561074256897, + "step": 2109 + }, + { + "epoch": 0.53, + "grad_norm": 3.1345481872558594, + "learning_rate": 7.2433022189997e-06, + "logits/chosen": -0.1419697403907776, + "logits/rejected": -0.24167747795581818, + "logps/chosen": -51.9352912902832, + "logps/rejected": -72.84088897705078, + "loss": 0.7021, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.113093376159668, + "rewards/margins": 4.167756080627441, + "rewards/rejected": -1.0546622276306152, + "step": 2110 + }, + { + "epoch": 0.53, + "grad_norm": 5.723386287689209, + "learning_rate": 7.240960643186372e-06, + "logits/chosen": -0.10207544267177582, + "logits/rejected": -0.2426389455795288, + "logps/chosen": -62.152313232421875, + "logps/rejected": -80.80458068847656, + "loss": 0.9496, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6558823585510254, + "rewards/margins": 3.637558698654175, + "rewards/rejected": -0.981676459312439, + "step": 2111 + }, + { + "epoch": 0.53, + "grad_norm": 4.388839244842529, + "learning_rate": 7.238618452180991e-06, + "logits/chosen": -0.12510398030281067, + "logits/rejected": -0.2914459705352783, + "logps/chosen": -71.38643646240234, + "logps/rejected": -81.64039611816406, + "loss": 0.8722, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.606921911239624, + "rewards/margins": 3.408459424972534, + "rewards/rejected": -0.8015375733375549, + "step": 2112 + }, + { + "epoch": 0.53, + "grad_norm": 10.624415397644043, + "learning_rate": 7.23627564662654e-06, + "logits/chosen": -0.22286614775657654, + "logits/rejected": -0.2988591194152832, + "logps/chosen": -57.655208587646484, + "logps/rejected": -74.51353454589844, + "loss": 0.9372, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0337793827056885, + "rewards/margins": 3.0676488876342773, + "rewards/rejected": -0.03386949375271797, + "step": 2113 + }, + { + "epoch": 0.53, + "grad_norm": 4.199228286743164, + "learning_rate": 7.233932227166168e-06, + "logits/chosen": -0.22427275776863098, + "logits/rejected": -0.3126496374607086, + "logps/chosen": -54.39653015136719, + "logps/rejected": -70.67086029052734, + "loss": 0.842, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9078357219696045, + "rewards/margins": 3.5097947120666504, + "rewards/rejected": -0.6019591689109802, + "step": 2114 + }, + { + "epoch": 0.53, + "grad_norm": 4.036316394805908, + "learning_rate": 7.231588194443195e-06, + "logits/chosen": -0.16974757611751556, + "logits/rejected": -0.3359791338443756, + "logps/chosen": -64.6299057006836, + "logps/rejected": -75.14533233642578, + "loss": 0.7782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.917609214782715, + "rewards/margins": 4.350124835968018, + "rewards/rejected": -1.4325156211853027, + "step": 2115 + }, + { + "epoch": 0.53, + "grad_norm": 5.699647903442383, + "learning_rate": 7.22924354910111e-06, + "logits/chosen": -0.2191344052553177, + "logits/rejected": -0.32026058435440063, + "logps/chosen": -64.87940979003906, + "logps/rejected": -72.28089141845703, + "loss": 0.8559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.585909843444824, + "rewards/margins": 3.3657050132751465, + "rewards/rejected": -0.7797951102256775, + "step": 2116 + }, + { + "epoch": 0.53, + "grad_norm": 5.629717826843262, + "learning_rate": 7.226898291783567e-06, + "logits/chosen": -0.22571343183517456, + "logits/rejected": -0.3032863438129425, + "logps/chosen": -66.38634490966797, + "logps/rejected": -85.15673828125, + "loss": 0.9234, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7786054611206055, + "rewards/margins": 2.5665268898010254, + "rewards/rejected": 0.2120785117149353, + "step": 2117 + }, + { + "epoch": 0.53, + "grad_norm": 3.5435330867767334, + "learning_rate": 7.22455242313439e-06, + "logits/chosen": -0.17439977824687958, + "logits/rejected": -0.2233477532863617, + "logps/chosen": -52.54704666137695, + "logps/rejected": -80.86050415039062, + "loss": 0.7425, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.799535036087036, + "rewards/margins": 3.3217039108276367, + "rewards/rejected": -0.522168755531311, + "step": 2118 + }, + { + "epoch": 0.53, + "grad_norm": 5.471793174743652, + "learning_rate": 7.222205943797572e-06, + "logits/chosen": -0.19072861969470978, + "logits/rejected": -0.3350444436073303, + "logps/chosen": -56.996971130371094, + "logps/rejected": -62.000850677490234, + "loss": 0.788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7574243545532227, + "rewards/margins": 3.9401261806488037, + "rewards/rejected": -1.182701587677002, + "step": 2119 + }, + { + "epoch": 0.53, + "grad_norm": 3.767533302307129, + "learning_rate": 7.21985885441727e-06, + "logits/chosen": -0.18489666283130646, + "logits/rejected": -0.30737417936325073, + "logps/chosen": -53.51290512084961, + "logps/rejected": -66.7872314453125, + "loss": 0.6814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9485890865325928, + "rewards/margins": 3.7306787967681885, + "rewards/rejected": -0.7820897102355957, + "step": 2120 + }, + { + "epoch": 0.53, + "grad_norm": 3.4630043506622314, + "learning_rate": 7.2175111556378106e-06, + "logits/chosen": -0.18875053524971008, + "logits/rejected": -0.3186931908130646, + "logps/chosen": -61.43252182006836, + "logps/rejected": -65.21137237548828, + "loss": 0.8523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.940591812133789, + "rewards/margins": 3.4702413082122803, + "rewards/rejected": -0.5296494364738464, + "step": 2121 + }, + { + "epoch": 0.53, + "grad_norm": 9.572556495666504, + "learning_rate": 7.215162848103692e-06, + "logits/chosen": -0.2322601079940796, + "logits/rejected": -0.24946461617946625, + "logps/chosen": -57.701332092285156, + "logps/rejected": -80.36878204345703, + "loss": 0.8283, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7957160472869873, + "rewards/margins": 3.664440155029297, + "rewards/rejected": -0.8687242269515991, + "step": 2122 + }, + { + "epoch": 0.53, + "grad_norm": 3.082906484603882, + "learning_rate": 7.212813932459568e-06, + "logits/chosen": -0.19082987308502197, + "logits/rejected": -0.238847553730011, + "logps/chosen": -50.37446212768555, + "logps/rejected": -75.17485046386719, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.399871349334717, + "rewards/margins": 3.8262178897857666, + "rewards/rejected": -0.4263468384742737, + "step": 2123 + }, + { + "epoch": 0.53, + "grad_norm": 3.9343976974487305, + "learning_rate": 7.210464409350275e-06, + "logits/chosen": -0.20159898698329926, + "logits/rejected": -0.3357987403869629, + "logps/chosen": -68.71673583984375, + "logps/rejected": -76.17279052734375, + "loss": 0.719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8684427738189697, + "rewards/margins": 3.9444503784179688, + "rewards/rejected": -1.0760079622268677, + "step": 2124 + }, + { + "epoch": 0.53, + "grad_norm": 4.18886661529541, + "learning_rate": 7.2081142794208e-06, + "logits/chosen": -0.1919044554233551, + "logits/rejected": -0.3380545973777771, + "logps/chosen": -66.31999969482422, + "logps/rejected": -78.51263427734375, + "loss": 0.7353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8666391372680664, + "rewards/margins": 4.282486915588379, + "rewards/rejected": -1.4158486127853394, + "step": 2125 + }, + { + "epoch": 0.53, + "grad_norm": 4.428689956665039, + "learning_rate": 7.205763543316311e-06, + "logits/chosen": -0.2709237337112427, + "logits/rejected": -0.4007962644100189, + "logps/chosen": -61.39257049560547, + "logps/rejected": -70.40531158447266, + "loss": 0.8972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.505126476287842, + "rewards/margins": 3.9639365673065186, + "rewards/rejected": -1.4588102102279663, + "step": 2126 + }, + { + "epoch": 0.53, + "grad_norm": 4.370171070098877, + "learning_rate": 7.203412201682129e-06, + "logits/chosen": -0.28098219633102417, + "logits/rejected": -0.32678675651550293, + "logps/chosen": -49.819366455078125, + "logps/rejected": -77.37136840820312, + "loss": 0.9196, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.972430467605591, + "rewards/margins": 4.016095161437988, + "rewards/rejected": -1.043664813041687, + "step": 2127 + }, + { + "epoch": 0.53, + "grad_norm": 8.412707328796387, + "learning_rate": 7.201060255163755e-06, + "logits/chosen": -0.27912336587905884, + "logits/rejected": -0.3049127459526062, + "logps/chosen": -50.15874481201172, + "logps/rejected": -77.48271179199219, + "loss": 0.8497, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6165342330932617, + "rewards/margins": 3.4846928119659424, + "rewards/rejected": -0.8681586980819702, + "step": 2128 + }, + { + "epoch": 0.53, + "grad_norm": 3.95151948928833, + "learning_rate": 7.198707704406845e-06, + "logits/chosen": -0.264046311378479, + "logits/rejected": -0.3587387800216675, + "logps/chosen": -42.654335021972656, + "logps/rejected": -73.9524154663086, + "loss": 0.7394, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.843846321105957, + "rewards/margins": 3.5712103843688965, + "rewards/rejected": -0.7273642420768738, + "step": 2129 + }, + { + "epoch": 0.53, + "grad_norm": 5.143805980682373, + "learning_rate": 7.196354550057226e-06, + "logits/chosen": -0.197999507188797, + "logits/rejected": -0.2916644513607025, + "logps/chosen": -68.455810546875, + "logps/rejected": -70.31995391845703, + "loss": 0.9712, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7811179161071777, + "rewards/margins": 3.0933966636657715, + "rewards/rejected": -0.3122788965702057, + "step": 2130 + }, + { + "epoch": 0.53, + "grad_norm": 7.216501235961914, + "learning_rate": 7.194000792760889e-06, + "logits/chosen": -0.2338201254606247, + "logits/rejected": -0.37323176860809326, + "logps/chosen": -59.203956604003906, + "logps/rejected": -68.54634857177734, + "loss": 0.9512, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.652282238006592, + "rewards/margins": 3.7024893760681152, + "rewards/rejected": -1.050207257270813, + "step": 2131 + }, + { + "epoch": 0.53, + "grad_norm": 4.894639015197754, + "learning_rate": 7.191646433163992e-06, + "logits/chosen": -0.28805214166641235, + "logits/rejected": -0.38994771242141724, + "logps/chosen": -61.282752990722656, + "logps/rejected": -71.05770874023438, + "loss": 0.8075, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0813825130462646, + "rewards/margins": 3.9917774200439453, + "rewards/rejected": -0.9103948473930359, + "step": 2132 + }, + { + "epoch": 0.53, + "grad_norm": 8.654647827148438, + "learning_rate": 7.189291471912856e-06, + "logits/chosen": -0.2471623718738556, + "logits/rejected": -0.27270758152008057, + "logps/chosen": -58.01677703857422, + "logps/rejected": -113.31908416748047, + "loss": 0.7265, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.781402349472046, + "rewards/margins": 4.334849834442139, + "rewards/rejected": -1.5534476041793823, + "step": 2133 + }, + { + "epoch": 0.53, + "grad_norm": 4.466431140899658, + "learning_rate": 7.1869359096539716e-06, + "logits/chosen": -0.2284967452287674, + "logits/rejected": -0.36217430233955383, + "logps/chosen": -49.20930099487305, + "logps/rejected": -72.1107177734375, + "loss": 0.7197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7914838790893555, + "rewards/margins": 4.2633891105651855, + "rewards/rejected": -1.47190523147583, + "step": 2134 + }, + { + "epoch": 0.53, + "grad_norm": 4.652332782745361, + "learning_rate": 7.184579747033987e-06, + "logits/chosen": -0.21144980192184448, + "logits/rejected": -0.32282865047454834, + "logps/chosen": -56.33832550048828, + "logps/rejected": -86.52503967285156, + "loss": 0.6941, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6950254440307617, + "rewards/margins": 4.36305046081543, + "rewards/rejected": -1.6680248975753784, + "step": 2135 + }, + { + "epoch": 0.53, + "grad_norm": 7.6232452392578125, + "learning_rate": 7.1822229846997246e-06, + "logits/chosen": -0.257004976272583, + "logits/rejected": -0.31852102279663086, + "logps/chosen": -61.07678985595703, + "logps/rejected": -80.62193298339844, + "loss": 0.923, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6585640907287598, + "rewards/margins": 3.2361059188842773, + "rewards/rejected": -0.5775417685508728, + "step": 2136 + }, + { + "epoch": 0.53, + "grad_norm": 4.957407474517822, + "learning_rate": 7.179865623298162e-06, + "logits/chosen": -0.2846813201904297, + "logits/rejected": -0.41238701343536377, + "logps/chosen": -54.489341735839844, + "logps/rejected": -69.87989044189453, + "loss": 0.7748, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7583131790161133, + "rewards/margins": 3.788989782333374, + "rewards/rejected": -1.0306767225265503, + "step": 2137 + }, + { + "epoch": 0.53, + "grad_norm": 5.627381324768066, + "learning_rate": 7.177507663476451e-06, + "logits/chosen": -0.23077242076396942, + "logits/rejected": -0.34436988830566406, + "logps/chosen": -54.82635498046875, + "logps/rejected": -72.39103698730469, + "loss": 0.8909, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7508697509765625, + "rewards/margins": 3.524214267730713, + "rewards/rejected": -0.7733446955680847, + "step": 2138 + }, + { + "epoch": 0.54, + "grad_norm": 3.9963912963867188, + "learning_rate": 7.1751491058818975e-06, + "logits/chosen": -0.20074701309204102, + "logits/rejected": -0.2749602794647217, + "logps/chosen": -57.096553802490234, + "logps/rejected": -73.5846176147461, + "loss": 0.6963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8499667644500732, + "rewards/margins": 3.879169225692749, + "rewards/rejected": -1.0292024612426758, + "step": 2139 + }, + { + "epoch": 0.54, + "grad_norm": 3.8088302612304688, + "learning_rate": 7.172789951161979e-06, + "logits/chosen": -0.2590927183628082, + "logits/rejected": -0.3448055386543274, + "logps/chosen": -42.18486022949219, + "logps/rejected": -65.17781066894531, + "loss": 0.6957, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.989107370376587, + "rewards/margins": 3.2153971195220947, + "rewards/rejected": -0.22628965973854065, + "step": 2140 + }, + { + "epoch": 0.54, + "grad_norm": 2.981898307800293, + "learning_rate": 7.170430199964333e-06, + "logits/chosen": -0.21901080012321472, + "logits/rejected": -0.35818952322006226, + "logps/chosen": -47.747467041015625, + "logps/rejected": -63.37987518310547, + "loss": 0.74, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.010751247406006, + "rewards/margins": 4.501746654510498, + "rewards/rejected": -1.4909952878952026, + "step": 2141 + }, + { + "epoch": 0.54, + "grad_norm": 3.1739158630371094, + "learning_rate": 7.168069852936765e-06, + "logits/chosen": -0.12093103677034378, + "logits/rejected": -0.28927549719810486, + "logps/chosen": -62.663848876953125, + "logps/rejected": -64.99440002441406, + "loss": 0.7404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1756997108459473, + "rewards/margins": 3.9639720916748047, + "rewards/rejected": -0.7882722616195679, + "step": 2142 + }, + { + "epoch": 0.54, + "grad_norm": 2.2770867347717285, + "learning_rate": 7.165708910727236e-06, + "logits/chosen": -0.2009964883327484, + "logits/rejected": -0.3332411050796509, + "logps/chosen": -54.14063262939453, + "logps/rejected": -85.85527038574219, + "loss": 0.6216, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0368030071258545, + "rewards/margins": 5.0392913818359375, + "rewards/rejected": -2.0024890899658203, + "step": 2143 + }, + { + "epoch": 0.54, + "grad_norm": 6.028482437133789, + "learning_rate": 7.1633473739838824e-06, + "logits/chosen": -0.29393500089645386, + "logits/rejected": -0.41696882247924805, + "logps/chosen": -63.13322067260742, + "logps/rejected": -70.06517791748047, + "loss": 0.859, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.644477367401123, + "rewards/margins": 3.7195606231689453, + "rewards/rejected": -1.0750830173492432, + "step": 2144 + }, + { + "epoch": 0.54, + "grad_norm": 3.8740627765655518, + "learning_rate": 7.1609852433549895e-06, + "logits/chosen": -0.24101538956165314, + "logits/rejected": -0.36226576566696167, + "logps/chosen": -47.597740173339844, + "logps/rejected": -63.160133361816406, + "loss": 0.7108, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9528582096099854, + "rewards/margins": 4.475135326385498, + "rewards/rejected": -1.5222773551940918, + "step": 2145 + }, + { + "epoch": 0.54, + "grad_norm": 3.762195110321045, + "learning_rate": 7.158622519489019e-06, + "logits/chosen": -0.2716050446033478, + "logits/rejected": -0.3729082942008972, + "logps/chosen": -57.85533142089844, + "logps/rejected": -68.64582824707031, + "loss": 0.8006, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.43519926071167, + "rewards/margins": 4.481760025024414, + "rewards/rejected": -2.0465614795684814, + "step": 2146 + }, + { + "epoch": 0.54, + "grad_norm": 5.509428977966309, + "learning_rate": 7.156259203034587e-06, + "logits/chosen": -0.3162801265716553, + "logits/rejected": -0.45295965671539307, + "logps/chosen": -60.86884307861328, + "logps/rejected": -66.16267395019531, + "loss": 0.9018, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7626469135284424, + "rewards/margins": 3.714003562927246, + "rewards/rejected": -0.9513565897941589, + "step": 2147 + }, + { + "epoch": 0.54, + "grad_norm": 4.078427791595459, + "learning_rate": 7.153895294640476e-06, + "logits/chosen": -0.2405543476343155, + "logits/rejected": -0.3847839832305908, + "logps/chosen": -49.281471252441406, + "logps/rejected": -70.2813720703125, + "loss": 0.7057, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8430557250976562, + "rewards/margins": 4.46357536315918, + "rewards/rejected": -1.6205195188522339, + "step": 2148 + }, + { + "epoch": 0.54, + "grad_norm": 7.062694072723389, + "learning_rate": 7.151530794955629e-06, + "logits/chosen": -0.2735748291015625, + "logits/rejected": -0.33945128321647644, + "logps/chosen": -48.56411361694336, + "logps/rejected": -74.10926818847656, + "loss": 0.8593, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8402504920959473, + "rewards/margins": 3.658799171447754, + "rewards/rejected": -0.8185487389564514, + "step": 2149 + }, + { + "epoch": 0.54, + "grad_norm": 3.8289756774902344, + "learning_rate": 7.1491657046291526e-06, + "logits/chosen": -0.20835386216640472, + "logits/rejected": -0.35014235973358154, + "logps/chosen": -57.04189682006836, + "logps/rejected": -76.52433013916016, + "loss": 0.7189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7039906978607178, + "rewards/margins": 4.8284101486206055, + "rewards/rejected": -2.1244192123413086, + "step": 2150 + }, + { + "epoch": 0.54, + "grad_norm": 7.392413139343262, + "learning_rate": 7.146800024310314e-06, + "logits/chosen": -0.31419140100479126, + "logits/rejected": -0.442136287689209, + "logps/chosen": -61.59906005859375, + "logps/rejected": -73.51862335205078, + "loss": 0.8922, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0383095741271973, + "rewards/margins": 3.8377203941345215, + "rewards/rejected": -0.7994108200073242, + "step": 2151 + }, + { + "epoch": 0.54, + "grad_norm": 3.3992767333984375, + "learning_rate": 7.144433754648545e-06, + "logits/chosen": -0.18807341158390045, + "logits/rejected": -0.3236675262451172, + "logps/chosen": -58.775352478027344, + "logps/rejected": -66.77693176269531, + "loss": 0.7733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8022711277008057, + "rewards/margins": 4.30206823348999, + "rewards/rejected": -1.499796986579895, + "step": 2152 + }, + { + "epoch": 0.54, + "grad_norm": 4.825097560882568, + "learning_rate": 7.142066896293436e-06, + "logits/chosen": -0.27834510803222656, + "logits/rejected": -0.3489286005496979, + "logps/chosen": -55.44414138793945, + "logps/rejected": -86.18834686279297, + "loss": 0.9032, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7859833240509033, + "rewards/margins": 3.673034191131592, + "rewards/rejected": -0.887050986289978, + "step": 2153 + }, + { + "epoch": 0.54, + "grad_norm": 8.624494552612305, + "learning_rate": 7.139699449894745e-06, + "logits/chosen": -0.2855803370475769, + "logits/rejected": -0.33008256554603577, + "logps/chosen": -54.59489440917969, + "logps/rejected": -80.07552337646484, + "loss": 0.7983, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.902381420135498, + "rewards/margins": 3.7687020301818848, + "rewards/rejected": -0.8663206100463867, + "step": 2154 + }, + { + "epoch": 0.54, + "grad_norm": 5.638518333435059, + "learning_rate": 7.137331416102381e-06, + "logits/chosen": -0.21772146224975586, + "logits/rejected": -0.297832190990448, + "logps/chosen": -52.82605743408203, + "logps/rejected": -81.43048095703125, + "loss": 0.7497, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.829119920730591, + "rewards/margins": 4.006318092346191, + "rewards/rejected": -1.1771985292434692, + "step": 2155 + }, + { + "epoch": 0.54, + "grad_norm": 7.070441246032715, + "learning_rate": 7.134962795566425e-06, + "logits/chosen": -0.18664467334747314, + "logits/rejected": -0.2840312719345093, + "logps/chosen": -64.64632415771484, + "logps/rejected": -76.36978149414062, + "loss": 0.9143, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.722536087036133, + "rewards/margins": 3.303227663040161, + "rewards/rejected": -0.580691933631897, + "step": 2156 + }, + { + "epoch": 0.54, + "grad_norm": 2.7596304416656494, + "learning_rate": 7.13259358893711e-06, + "logits/chosen": -0.37264853715896606, + "logits/rejected": -0.5010896921157837, + "logps/chosen": -53.829994201660156, + "logps/rejected": -68.42576599121094, + "loss": 0.7295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.270023822784424, + "rewards/margins": 4.186154365539551, + "rewards/rejected": -0.9161303639411926, + "step": 2157 + }, + { + "epoch": 0.54, + "grad_norm": 2.519331216812134, + "learning_rate": 7.130223796864839e-06, + "logits/chosen": -0.18861275911331177, + "logits/rejected": -0.2703794240951538, + "logps/chosen": -58.20011901855469, + "logps/rejected": -84.0133056640625, + "loss": 0.6994, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.941972494125366, + "rewards/margins": 4.767979621887207, + "rewards/rejected": -1.8260070085525513, + "step": 2158 + }, + { + "epoch": 0.54, + "grad_norm": 8.844120025634766, + "learning_rate": 7.12785342000017e-06, + "logits/chosen": -0.277068167924881, + "logits/rejected": -0.3811173439025879, + "logps/chosen": -57.41020965576172, + "logps/rejected": -80.37606048583984, + "loss": 0.7321, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7589337825775146, + "rewards/margins": 4.381366729736328, + "rewards/rejected": -1.6224327087402344, + "step": 2159 + }, + { + "epoch": 0.54, + "grad_norm": 4.333135604858398, + "learning_rate": 7.125482458993821e-06, + "logits/chosen": -0.23515816032886505, + "logits/rejected": -0.2830526530742645, + "logps/chosen": -53.53828430175781, + "logps/rejected": -88.52408599853516, + "loss": 0.7572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8675529956817627, + "rewards/margins": 4.4282426834106445, + "rewards/rejected": -1.56069016456604, + "step": 2160 + }, + { + "epoch": 0.54, + "grad_norm": 4.760975360870361, + "learning_rate": 7.123110914496672e-06, + "logits/chosen": -0.21218940615653992, + "logits/rejected": -0.33642134070396423, + "logps/chosen": -66.92955780029297, + "logps/rejected": -76.63885498046875, + "loss": 0.9366, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8346874713897705, + "rewards/margins": 3.994040012359619, + "rewards/rejected": -1.1593528985977173, + "step": 2161 + }, + { + "epoch": 0.54, + "grad_norm": 3.717268943786621, + "learning_rate": 7.1207387871597644e-06, + "logits/chosen": -0.23044417798519135, + "logits/rejected": -0.28803691267967224, + "logps/chosen": -53.01698684692383, + "logps/rejected": -89.62395477294922, + "loss": 0.8259, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9002902507781982, + "rewards/margins": 4.08236026763916, + "rewards/rejected": -1.182070255279541, + "step": 2162 + }, + { + "epoch": 0.54, + "grad_norm": 6.233337879180908, + "learning_rate": 7.118366077634298e-06, + "logits/chosen": -0.22750940918922424, + "logits/rejected": -0.3739393353462219, + "logps/chosen": -64.72093963623047, + "logps/rejected": -66.60702514648438, + "loss": 0.7715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7416598796844482, + "rewards/margins": 3.791661262512207, + "rewards/rejected": -1.0500013828277588, + "step": 2163 + }, + { + "epoch": 0.54, + "grad_norm": 4.549508571624756, + "learning_rate": 7.115992786571633e-06, + "logits/chosen": -0.2770846486091614, + "logits/rejected": -0.3732168674468994, + "logps/chosen": -44.14689254760742, + "logps/rejected": -62.40007400512695, + "loss": 0.7753, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.890160322189331, + "rewards/margins": 4.347545146942139, + "rewards/rejected": -1.4573845863342285, + "step": 2164 + }, + { + "epoch": 0.54, + "grad_norm": 4.149749755859375, + "learning_rate": 7.113618914623288e-06, + "logits/chosen": -0.23536328971385956, + "logits/rejected": -0.36513885855674744, + "logps/chosen": -62.1788444519043, + "logps/rejected": -73.1451416015625, + "loss": 0.8291, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.878617286682129, + "rewards/margins": 4.339931488037109, + "rewards/rejected": -1.4613139629364014, + "step": 2165 + }, + { + "epoch": 0.54, + "grad_norm": 3.1083805561065674, + "learning_rate": 7.111244462440943e-06, + "logits/chosen": -0.24379687011241913, + "logits/rejected": -0.3196272850036621, + "logps/chosen": -62.88493347167969, + "logps/rejected": -76.40199279785156, + "loss": 0.7868, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.845424175262451, + "rewards/margins": 4.080931663513184, + "rewards/rejected": -1.2355074882507324, + "step": 2166 + }, + { + "epoch": 0.54, + "grad_norm": 5.772772312164307, + "learning_rate": 7.108869430676435e-06, + "logits/chosen": -0.28194451332092285, + "logits/rejected": -0.4249650239944458, + "logps/chosen": -62.68077850341797, + "logps/rejected": -66.68639373779297, + "loss": 0.8016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.927851438522339, + "rewards/margins": 4.424508571624756, + "rewards/rejected": -1.4966572523117065, + "step": 2167 + }, + { + "epoch": 0.54, + "grad_norm": 13.78699016571045, + "learning_rate": 7.106493819981763e-06, + "logits/chosen": -0.2186567634344101, + "logits/rejected": -0.3987939655780792, + "logps/chosen": -58.48643112182617, + "logps/rejected": -75.25631713867188, + "loss": 0.8268, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9198482036590576, + "rewards/margins": 4.839520454406738, + "rewards/rejected": -1.9196720123291016, + "step": 2168 + }, + { + "epoch": 0.54, + "grad_norm": 6.1414995193481445, + "learning_rate": 7.104117631009083e-06, + "logits/chosen": -0.26842552423477173, + "logits/rejected": -0.4160640239715576, + "logps/chosen": -49.59482192993164, + "logps/rejected": -69.37479400634766, + "loss": 0.7652, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.081556797027588, + "rewards/margins": 4.236740589141846, + "rewards/rejected": -1.1551841497421265, + "step": 2169 + }, + { + "epoch": 0.54, + "grad_norm": 7.250634670257568, + "learning_rate": 7.101740864410709e-06, + "logits/chosen": -0.1811446249485016, + "logits/rejected": -0.36432135105133057, + "logps/chosen": -63.61213302612305, + "logps/rejected": -78.4363021850586, + "loss": 0.8182, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.762444019317627, + "rewards/margins": 3.890031576156616, + "rewards/rejected": -1.1275875568389893, + "step": 2170 + }, + { + "epoch": 0.54, + "grad_norm": 4.682471752166748, + "learning_rate": 7.099363520839117e-06, + "logits/chosen": -0.2220858633518219, + "logits/rejected": -0.3111707270145416, + "logps/chosen": -57.94256591796875, + "logps/rejected": -80.72994995117188, + "loss": 0.7214, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.959247589111328, + "rewards/margins": 4.338869571685791, + "rewards/rejected": -1.379621982574463, + "step": 2171 + }, + { + "epoch": 0.54, + "grad_norm": 3.02140736579895, + "learning_rate": 7.096985600946937e-06, + "logits/chosen": -0.27161070704460144, + "logits/rejected": -0.31310778856277466, + "logps/chosen": -48.19274139404297, + "logps/rejected": -100.34103393554688, + "loss": 0.6362, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.980844259262085, + "rewards/margins": 5.0531697273254395, + "rewards/rejected": -2.0723252296447754, + "step": 2172 + }, + { + "epoch": 0.54, + "grad_norm": 6.421169281005859, + "learning_rate": 7.0946071053869616e-06, + "logits/chosen": -0.24249058961868286, + "logits/rejected": -0.2881755232810974, + "logps/chosen": -63.76639175415039, + "logps/rejected": -90.22985076904297, + "loss": 0.7479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8781850337982178, + "rewards/margins": 4.056720733642578, + "rewards/rejected": -1.1785355806350708, + "step": 2173 + }, + { + "epoch": 0.54, + "grad_norm": 2.659231424331665, + "learning_rate": 7.092228034812136e-06, + "logits/chosen": -0.2626168727874756, + "logits/rejected": -0.39786970615386963, + "logps/chosen": -53.9213981628418, + "logps/rejected": -71.05613708496094, + "loss": 0.655, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.256685972213745, + "rewards/margins": 5.325951099395752, + "rewards/rejected": -2.069265365600586, + "step": 2174 + }, + { + "epoch": 0.54, + "grad_norm": 10.308039665222168, + "learning_rate": 7.08984838987557e-06, + "logits/chosen": -0.189753919839859, + "logits/rejected": -0.35349202156066895, + "logps/chosen": -51.727691650390625, + "logps/rejected": -79.44313049316406, + "loss": 0.7555, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8632168769836426, + "rewards/margins": 4.327174186706543, + "rewards/rejected": -1.463957667350769, + "step": 2175 + }, + { + "epoch": 0.54, + "grad_norm": 4.886754035949707, + "learning_rate": 7.0874681712305236e-06, + "logits/chosen": -0.3652026355266571, + "logits/rejected": -0.4745958149433136, + "logps/chosen": -56.55002975463867, + "logps/rejected": -73.58544158935547, + "loss": 0.8174, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.097663640975952, + "rewards/margins": 4.398472309112549, + "rewards/rejected": -1.3008087873458862, + "step": 2176 + }, + { + "epoch": 0.54, + "grad_norm": 3.721088409423828, + "learning_rate": 7.085087379530422e-06, + "logits/chosen": -0.3231937289237976, + "logits/rejected": -0.43154358863830566, + "logps/chosen": -51.87694549560547, + "logps/rejected": -83.5472640991211, + "loss": 0.7183, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7532429695129395, + "rewards/margins": 4.144248008728027, + "rewards/rejected": -1.3910053968429565, + "step": 2177 + }, + { + "epoch": 0.54, + "grad_norm": 6.750102996826172, + "learning_rate": 7.0827060154288395e-06, + "logits/chosen": -0.3086169362068176, + "logits/rejected": -0.32981371879577637, + "logps/chosen": -48.6280517578125, + "logps/rejected": -80.62934112548828, + "loss": 0.688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2200210094451904, + "rewards/margins": 4.227400302886963, + "rewards/rejected": -1.0073796510696411, + "step": 2178 + }, + { + "epoch": 0.55, + "grad_norm": 7.9474310874938965, + "learning_rate": 7.080324079579518e-06, + "logits/chosen": -0.3326946794986725, + "logits/rejected": -0.433430552482605, + "logps/chosen": -63.18421173095703, + "logps/rejected": -86.07332611083984, + "loss": 0.8711, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.04946231842041, + "rewards/margins": 4.725198745727539, + "rewards/rejected": -1.6757361888885498, + "step": 2179 + }, + { + "epoch": 0.55, + "grad_norm": 4.633083343505859, + "learning_rate": 7.0779415726363446e-06, + "logits/chosen": -0.17477940022945404, + "logits/rejected": -0.251044362783432, + "logps/chosen": -58.290008544921875, + "logps/rejected": -84.39749908447266, + "loss": 0.8463, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.745133638381958, + "rewards/margins": 3.33081316947937, + "rewards/rejected": -0.5856795310974121, + "step": 2180 + }, + { + "epoch": 0.55, + "grad_norm": 7.454007625579834, + "learning_rate": 7.075558495253372e-06, + "logits/chosen": -0.37624233961105347, + "logits/rejected": -0.43457335233688354, + "logps/chosen": -52.58190155029297, + "logps/rejected": -67.47207641601562, + "loss": 0.8973, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.72153377532959, + "rewards/margins": 3.5372555255889893, + "rewards/rejected": -0.8157220482826233, + "step": 2181 + }, + { + "epoch": 0.55, + "grad_norm": 10.68105411529541, + "learning_rate": 7.073174848084804e-06, + "logits/chosen": -0.2339385449886322, + "logits/rejected": -0.3502194583415985, + "logps/chosen": -50.45697021484375, + "logps/rejected": -78.61422729492188, + "loss": 0.791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.736889123916626, + "rewards/margins": 4.8919243812561035, + "rewards/rejected": -2.1550357341766357, + "step": 2182 + }, + { + "epoch": 0.55, + "grad_norm": 6.0242767333984375, + "learning_rate": 7.070790631785006e-06, + "logits/chosen": -0.24587108194828033, + "logits/rejected": -0.3433818817138672, + "logps/chosen": -64.87098693847656, + "logps/rejected": -82.12150573730469, + "loss": 0.8716, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7731080055236816, + "rewards/margins": 4.314580917358398, + "rewards/rejected": -1.5414727926254272, + "step": 2183 + }, + { + "epoch": 0.55, + "grad_norm": 5.301903247833252, + "learning_rate": 7.0684058470084946e-06, + "logits/chosen": -0.21113917231559753, + "logits/rejected": -0.412406861782074, + "logps/chosen": -64.7618408203125, + "logps/rejected": -69.82719421386719, + "loss": 0.797, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.848310947418213, + "rewards/margins": 4.857138633728027, + "rewards/rejected": -2.0088279247283936, + "step": 2184 + }, + { + "epoch": 0.55, + "grad_norm": 4.65496301651001, + "learning_rate": 7.066020494409947e-06, + "logits/chosen": -0.39443913102149963, + "logits/rejected": -0.4536891579627991, + "logps/chosen": -46.0360221862793, + "logps/rejected": -68.53288269042969, + "loss": 0.7403, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5466742515563965, + "rewards/margins": 3.4842123985290527, + "rewards/rejected": -0.9375382661819458, + "step": 2185 + }, + { + "epoch": 0.55, + "grad_norm": 7.128180980682373, + "learning_rate": 7.0636345746441904e-06, + "logits/chosen": -0.25745534896850586, + "logits/rejected": -0.37739911675453186, + "logps/chosen": -63.966094970703125, + "logps/rejected": -74.20927429199219, + "loss": 0.9234, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8632242679595947, + "rewards/margins": 4.368405818939209, + "rewards/rejected": -1.5051816701889038, + "step": 2186 + }, + { + "epoch": 0.55, + "grad_norm": 3.9273288249969482, + "learning_rate": 7.061248088366215e-06, + "logits/chosen": -0.21304824948310852, + "logits/rejected": -0.348280668258667, + "logps/chosen": -52.355995178222656, + "logps/rejected": -72.16081237792969, + "loss": 0.7066, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.111299514770508, + "rewards/margins": 3.9090065956115723, + "rewards/rejected": -0.797707200050354, + "step": 2187 + }, + { + "epoch": 0.55, + "grad_norm": 4.868480682373047, + "learning_rate": 7.0588610362311595e-06, + "logits/chosen": -0.11799459159374237, + "logits/rejected": -0.16948696970939636, + "logps/chosen": -69.56949615478516, + "logps/rejected": -74.12286376953125, + "loss": 0.9818, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1989026069641113, + "rewards/margins": 3.3586113452911377, + "rewards/rejected": -0.1597088724374771, + "step": 2188 + }, + { + "epoch": 0.55, + "grad_norm": 5.207952499389648, + "learning_rate": 7.056473418894325e-06, + "logits/chosen": -0.24161222577095032, + "logits/rejected": -0.36678892374038696, + "logps/chosen": -60.93948745727539, + "logps/rejected": -73.89952087402344, + "loss": 0.825, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8428738117218018, + "rewards/margins": 4.537807464599609, + "rewards/rejected": -1.6949336528778076, + "step": 2189 + }, + { + "epoch": 0.55, + "grad_norm": 6.521247863769531, + "learning_rate": 7.05408523701116e-06, + "logits/chosen": -0.22969838976860046, + "logits/rejected": -0.25140219926834106, + "logps/chosen": -59.96739959716797, + "logps/rejected": -92.13648986816406, + "loss": 0.8113, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8827719688415527, + "rewards/margins": 3.4354796409606934, + "rewards/rejected": -0.552707850933075, + "step": 2190 + }, + { + "epoch": 0.55, + "grad_norm": 5.867504596710205, + "learning_rate": 7.051696491237274e-06, + "logits/chosen": -0.29990285634994507, + "logits/rejected": -0.3970067501068115, + "logps/chosen": -48.531673431396484, + "logps/rejected": -74.71993255615234, + "loss": 0.7246, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.969721555709839, + "rewards/margins": 4.321260452270508, + "rewards/rejected": -1.3515387773513794, + "step": 2191 + }, + { + "epoch": 0.55, + "grad_norm": 5.536362171173096, + "learning_rate": 7.049307182228428e-06, + "logits/chosen": -0.2805875539779663, + "logits/rejected": -0.3716208338737488, + "logps/chosen": -54.44029998779297, + "logps/rejected": -74.02240753173828, + "loss": 0.8294, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7841622829437256, + "rewards/margins": 3.7941863536834717, + "rewards/rejected": -1.010023593902588, + "step": 2192 + }, + { + "epoch": 0.55, + "grad_norm": 6.0337934494018555, + "learning_rate": 7.046917310640543e-06, + "logits/chosen": -0.27468541264533997, + "logits/rejected": -0.3618333637714386, + "logps/chosen": -58.91586685180664, + "logps/rejected": -81.01634216308594, + "loss": 0.8631, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7934436798095703, + "rewards/margins": 4.320389270782471, + "rewards/rejected": -1.5269455909729004, + "step": 2193 + }, + { + "epoch": 0.55, + "grad_norm": 7.06914758682251, + "learning_rate": 7.044526877129686e-06, + "logits/chosen": -0.23537926375865936, + "logits/rejected": -0.3598650097846985, + "logps/chosen": -68.18719482421875, + "logps/rejected": -72.40636444091797, + "loss": 0.8068, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7416880130767822, + "rewards/margins": 4.405424118041992, + "rewards/rejected": -1.663736343383789, + "step": 2194 + }, + { + "epoch": 0.55, + "grad_norm": 4.36969518661499, + "learning_rate": 7.042135882352084e-06, + "logits/chosen": -0.2788405418395996, + "logits/rejected": -0.41027435660362244, + "logps/chosen": -56.0950813293457, + "logps/rejected": -77.80047607421875, + "loss": 0.7745, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.589298963546753, + "rewards/margins": 3.968933582305908, + "rewards/rejected": -1.3796343803405762, + "step": 2195 + }, + { + "epoch": 0.55, + "grad_norm": 3.9065158367156982, + "learning_rate": 7.0397443269641155e-06, + "logits/chosen": -0.3476441502571106, + "logits/rejected": -0.36392441391944885, + "logps/chosen": -48.779258728027344, + "logps/rejected": -82.59516906738281, + "loss": 0.7036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.923116445541382, + "rewards/margins": 3.765493631362915, + "rewards/rejected": -0.8423771858215332, + "step": 2196 + }, + { + "epoch": 0.55, + "grad_norm": 4.450543403625488, + "learning_rate": 7.037352211622314e-06, + "logits/chosen": -0.3013477623462677, + "logits/rejected": -0.3733377158641815, + "logps/chosen": -50.486854553222656, + "logps/rejected": -80.50621032714844, + "loss": 0.8024, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7821204662323, + "rewards/margins": 3.6048965454101562, + "rewards/rejected": -0.8227764368057251, + "step": 2197 + }, + { + "epoch": 0.55, + "grad_norm": 4.784986972808838, + "learning_rate": 7.034959536983369e-06, + "logits/chosen": -0.2489067167043686, + "logits/rejected": -0.3656291663646698, + "logps/chosen": -62.89530944824219, + "logps/rejected": -69.04312896728516, + "loss": 0.7887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7597014904022217, + "rewards/margins": 3.725727081298828, + "rewards/rejected": -0.9660260081291199, + "step": 2198 + }, + { + "epoch": 0.55, + "grad_norm": 3.5180225372314453, + "learning_rate": 7.032566303704123e-06, + "logits/chosen": -0.32578134536743164, + "logits/rejected": -0.4116763174533844, + "logps/chosen": -52.52745056152344, + "logps/rejected": -70.50540924072266, + "loss": 0.7362, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8302345275878906, + "rewards/margins": 3.7118847370147705, + "rewards/rejected": -0.8816503882408142, + "step": 2199 + }, + { + "epoch": 0.55, + "grad_norm": 4.35419225692749, + "learning_rate": 7.030172512441563e-06, + "logits/chosen": -0.10461321473121643, + "logits/rejected": -0.3141116201877594, + "logps/chosen": -66.39462280273438, + "logps/rejected": -65.11900329589844, + "loss": 0.7277, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8153772354125977, + "rewards/margins": 3.9484872817993164, + "rewards/rejected": -1.13310968875885, + "step": 2200 + }, + { + "epoch": 0.55, + "grad_norm": 6.329372882843018, + "learning_rate": 7.027778163852843e-06, + "logits/chosen": -0.23815247416496277, + "logits/rejected": -0.358104407787323, + "logps/chosen": -58.64645004272461, + "logps/rejected": -74.32137298583984, + "loss": 0.8209, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8528618812561035, + "rewards/margins": 3.9135613441467285, + "rewards/rejected": -1.0606989860534668, + "step": 2201 + }, + { + "epoch": 0.55, + "grad_norm": 5.920349597930908, + "learning_rate": 7.02538325859526e-06, + "logits/chosen": -0.2645833194255829, + "logits/rejected": -0.3549160361289978, + "logps/chosen": -59.34248352050781, + "logps/rejected": -74.39634704589844, + "loss": 0.8671, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6451306343078613, + "rewards/margins": 3.680938720703125, + "rewards/rejected": -1.0358082056045532, + "step": 2202 + }, + { + "epoch": 0.55, + "grad_norm": 5.448188781738281, + "learning_rate": 7.022987797326269e-06, + "logits/chosen": -0.1490100473165512, + "logits/rejected": -0.2800839841365814, + "logps/chosen": -74.6780776977539, + "logps/rejected": -72.19674682617188, + "loss": 0.854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.859375476837158, + "rewards/margins": 3.6560781002044678, + "rewards/rejected": -0.7967023253440857, + "step": 2203 + }, + { + "epoch": 0.55, + "grad_norm": 2.8609862327575684, + "learning_rate": 7.020591780703474e-06, + "logits/chosen": -0.20705632865428925, + "logits/rejected": -0.30903881788253784, + "logps/chosen": -53.41268539428711, + "logps/rejected": -74.91081237792969, + "loss": 0.666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0809454917907715, + "rewards/margins": 4.066544532775879, + "rewards/rejected": -0.985599160194397, + "step": 2204 + }, + { + "epoch": 0.55, + "grad_norm": 4.08139181137085, + "learning_rate": 7.018195209384635e-06, + "logits/chosen": -0.1636236310005188, + "logits/rejected": -0.32510024309158325, + "logps/chosen": -76.7123794555664, + "logps/rejected": -77.34385681152344, + "loss": 0.8855, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.595897674560547, + "rewards/margins": 3.355273485183716, + "rewards/rejected": -0.7593759298324585, + "step": 2205 + }, + { + "epoch": 0.55, + "grad_norm": 3.9180748462677, + "learning_rate": 7.015798084027661e-06, + "logits/chosen": -0.21192902326583862, + "logits/rejected": -0.2848838269710541, + "logps/chosen": -60.94661331176758, + "logps/rejected": -82.62203216552734, + "loss": 0.8123, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7025790214538574, + "rewards/margins": 3.3677384853363037, + "rewards/rejected": -0.6651594638824463, + "step": 2206 + }, + { + "epoch": 0.55, + "grad_norm": 7.603266716003418, + "learning_rate": 7.013400405290617e-06, + "logits/chosen": -0.20427849888801575, + "logits/rejected": -0.2654078006744385, + "logps/chosen": -66.39057159423828, + "logps/rejected": -88.01193237304688, + "loss": 0.9177, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.996495485305786, + "rewards/margins": 3.7331624031066895, + "rewards/rejected": -0.7366667985916138, + "step": 2207 + }, + { + "epoch": 0.55, + "grad_norm": 7.297916889190674, + "learning_rate": 7.011002173831714e-06, + "logits/chosen": -0.15257789194583893, + "logits/rejected": -0.27365803718566895, + "logps/chosen": -56.943504333496094, + "logps/rejected": -78.54780578613281, + "loss": 0.9028, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5682919025421143, + "rewards/margins": 4.581478595733643, + "rewards/rejected": -2.013186454772949, + "step": 2208 + }, + { + "epoch": 0.55, + "grad_norm": 4.680709362030029, + "learning_rate": 7.008603390309323e-06, + "logits/chosen": -0.22289839386940002, + "logits/rejected": -0.3070744574069977, + "logps/chosen": -56.8741569519043, + "logps/rejected": -82.35144805908203, + "loss": 0.8302, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7990992069244385, + "rewards/margins": 3.8084394931793213, + "rewards/rejected": -1.0093398094177246, + "step": 2209 + }, + { + "epoch": 0.55, + "grad_norm": 4.305746555328369, + "learning_rate": 7.006204055381957e-06, + "logits/chosen": -0.18066832423210144, + "logits/rejected": -0.30746954679489136, + "logps/chosen": -51.47187042236328, + "logps/rejected": -74.52996063232422, + "loss": 0.6375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0479297637939453, + "rewards/margins": 4.8501811027526855, + "rewards/rejected": -1.8022507429122925, + "step": 2210 + }, + { + "epoch": 0.55, + "grad_norm": 4.565629005432129, + "learning_rate": 7.003804169708287e-06, + "logits/chosen": -0.2425491213798523, + "logits/rejected": -0.31444063782691956, + "logps/chosen": -60.75004577636719, + "logps/rejected": -78.08224487304688, + "loss": 0.878, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.26259708404541, + "rewards/margins": 3.706082344055176, + "rewards/rejected": -0.4434852600097656, + "step": 2211 + }, + { + "epoch": 0.55, + "grad_norm": 3.847269058227539, + "learning_rate": 7.001403733947134e-06, + "logits/chosen": -0.21519336104393005, + "logits/rejected": -0.37698978185653687, + "logps/chosen": -51.49526596069336, + "logps/rejected": -74.092041015625, + "loss": 0.6214, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7048873901367188, + "rewards/margins": 4.890185832977295, + "rewards/rejected": -2.185298204421997, + "step": 2212 + }, + { + "epoch": 0.55, + "grad_norm": 3.0763919353485107, + "learning_rate": 6.999002748757471e-06, + "logits/chosen": -0.30901798605918884, + "logits/rejected": -0.4168202877044678, + "logps/chosen": -56.226715087890625, + "logps/rejected": -61.9610710144043, + "loss": 0.711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9339945316314697, + "rewards/margins": 4.096256256103516, + "rewards/rejected": -1.1622620820999146, + "step": 2213 + }, + { + "epoch": 0.55, + "grad_norm": 9.021475791931152, + "learning_rate": 6.996601214798416e-06, + "logits/chosen": -0.20310688018798828, + "logits/rejected": -0.28175777196884155, + "logps/chosen": -60.228614807128906, + "logps/rejected": -77.10736846923828, + "loss": 0.8539, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5462121963500977, + "rewards/margins": 3.454869508743286, + "rewards/rejected": -0.9086571931838989, + "step": 2214 + }, + { + "epoch": 0.55, + "grad_norm": 11.486977577209473, + "learning_rate": 6.994199132729246e-06, + "logits/chosen": -0.18550549447536469, + "logits/rejected": -0.34383708238601685, + "logps/chosen": -68.37081146240234, + "logps/rejected": -75.70903015136719, + "loss": 0.913, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.773285388946533, + "rewards/margins": 4.132447242736816, + "rewards/rejected": -1.3591614961624146, + "step": 2215 + }, + { + "epoch": 0.55, + "grad_norm": 3.648939371109009, + "learning_rate": 6.991796503209382e-06, + "logits/chosen": -0.1286790370941162, + "logits/rejected": -0.24307343363761902, + "logps/chosen": -52.40827178955078, + "logps/rejected": -73.95986938476562, + "loss": 0.6891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9144694805145264, + "rewards/margins": 4.0601043701171875, + "rewards/rejected": -1.1456347703933716, + "step": 2216 + }, + { + "epoch": 0.55, + "grad_norm": 7.915054798126221, + "learning_rate": 6.989393326898398e-06, + "logits/chosen": -0.1763366311788559, + "logits/rejected": -0.29122093319892883, + "logps/chosen": -70.98926544189453, + "logps/rejected": -74.5962905883789, + "loss": 0.962, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.373781681060791, + "rewards/margins": 3.149521827697754, + "rewards/rejected": -0.7757401466369629, + "step": 2217 + }, + { + "epoch": 0.55, + "grad_norm": 2.2083189487457275, + "learning_rate": 6.986989604456018e-06, + "logits/chosen": -0.27736979722976685, + "logits/rejected": -0.41381996870040894, + "logps/chosen": -50.648101806640625, + "logps/rejected": -72.17912292480469, + "loss": 0.6151, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.886934995651245, + "rewards/margins": 4.982431888580322, + "rewards/rejected": -2.095496654510498, + "step": 2218 + }, + { + "epoch": 0.56, + "grad_norm": 9.989644050598145, + "learning_rate": 6.984585336542116e-06, + "logits/chosen": -0.18127265572547913, + "logits/rejected": -0.2531619668006897, + "logps/chosen": -59.81507110595703, + "logps/rejected": -92.78829956054688, + "loss": 0.8581, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5897982120513916, + "rewards/margins": 3.564244031906128, + "rewards/rejected": -0.9744458198547363, + "step": 2219 + }, + { + "epoch": 0.56, + "grad_norm": 10.765397071838379, + "learning_rate": 6.982180523816715e-06, + "logits/chosen": -0.32436662912368774, + "logits/rejected": -0.39279547333717346, + "logps/chosen": -47.21675109863281, + "logps/rejected": -74.09104919433594, + "loss": 0.8226, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6839888095855713, + "rewards/margins": 3.7942559719085693, + "rewards/rejected": -1.110267162322998, + "step": 2220 + }, + { + "epoch": 0.56, + "grad_norm": 2.364177942276001, + "learning_rate": 6.979775166939989e-06, + "logits/chosen": -0.2533315420150757, + "logits/rejected": -0.34693366289138794, + "logps/chosen": -60.00786209106445, + "logps/rejected": -79.27423095703125, + "loss": 0.6723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.996183156967163, + "rewards/margins": 4.80856990814209, + "rewards/rejected": -1.8123867511749268, + "step": 2221 + }, + { + "epoch": 0.56, + "grad_norm": 12.552780151367188, + "learning_rate": 6.9773692665722584e-06, + "logits/chosen": -0.21429891884326935, + "logits/rejected": -0.32826828956604004, + "logps/chosen": -55.76451873779297, + "logps/rejected": -73.4132080078125, + "loss": 0.8224, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.632295608520508, + "rewards/margins": 4.324836254119873, + "rewards/rejected": -1.6925408840179443, + "step": 2222 + }, + { + "epoch": 0.56, + "grad_norm": 6.204311847686768, + "learning_rate": 6.974962823373996e-06, + "logits/chosen": -0.24863241612911224, + "logits/rejected": -0.3711005449295044, + "logps/chosen": -53.34604263305664, + "logps/rejected": -77.037109375, + "loss": 0.6148, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.955456018447876, + "rewards/margins": 5.308162689208984, + "rewards/rejected": -2.35270619392395, + "step": 2223 + }, + { + "epoch": 0.56, + "grad_norm": 3.3334438800811768, + "learning_rate": 6.972555838005823e-06, + "logits/chosen": -0.2978094816207886, + "logits/rejected": -0.4613376557826996, + "logps/chosen": -47.09065246582031, + "logps/rejected": -67.50279235839844, + "loss": 0.6697, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0337460041046143, + "rewards/margins": 4.499781608581543, + "rewards/rejected": -1.466036081314087, + "step": 2224 + }, + { + "epoch": 0.56, + "grad_norm": 3.419721841812134, + "learning_rate": 6.970148311128508e-06, + "logits/chosen": -0.2391962707042694, + "logits/rejected": -0.3345002233982086, + "logps/chosen": -57.15155792236328, + "logps/rejected": -82.68122863769531, + "loss": 0.7782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.948570728302002, + "rewards/margins": 4.392740249633789, + "rewards/rejected": -1.444169521331787, + "step": 2225 + }, + { + "epoch": 0.56, + "grad_norm": 2.621757984161377, + "learning_rate": 6.967740243402968e-06, + "logits/chosen": -0.26887640357017517, + "logits/rejected": -0.34310343861579895, + "logps/chosen": -56.05082321166992, + "logps/rejected": -79.21781921386719, + "loss": 0.7506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.138838052749634, + "rewards/margins": 4.2998809814453125, + "rewards/rejected": -1.16104257106781, + "step": 2226 + }, + { + "epoch": 0.56, + "grad_norm": 6.184925079345703, + "learning_rate": 6.965331635490271e-06, + "logits/chosen": -0.21576754748821259, + "logits/rejected": -0.2990608811378479, + "logps/chosen": -48.40826416015625, + "logps/rejected": -80.50318145751953, + "loss": 0.9197, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6579556465148926, + "rewards/margins": 3.5304861068725586, + "rewards/rejected": -0.8725306391716003, + "step": 2227 + }, + { + "epoch": 0.56, + "grad_norm": 3.383727550506592, + "learning_rate": 6.962922488051632e-06, + "logits/chosen": -0.19228702783584595, + "logits/rejected": -0.3031560182571411, + "logps/chosen": -48.88413619995117, + "logps/rejected": -63.462554931640625, + "loss": 0.7405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.070570945739746, + "rewards/margins": 4.285033226013184, + "rewards/rejected": -1.2144628763198853, + "step": 2228 + }, + { + "epoch": 0.56, + "grad_norm": 4.483423233032227, + "learning_rate": 6.960512801748414e-06, + "logits/chosen": -0.2804282307624817, + "logits/rejected": -0.4130229651927948, + "logps/chosen": -53.651283264160156, + "logps/rejected": -84.91410827636719, + "loss": 0.7295, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9774954319000244, + "rewards/margins": 4.069486141204834, + "rewards/rejected": -1.0919911861419678, + "step": 2229 + }, + { + "epoch": 0.56, + "grad_norm": 3.8504793643951416, + "learning_rate": 6.958102577242128e-06, + "logits/chosen": -0.22817635536193848, + "logits/rejected": -0.3052997291088104, + "logps/chosen": -52.811946868896484, + "logps/rejected": -98.57889556884766, + "loss": 0.7647, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9287149906158447, + "rewards/margins": 4.832657814025879, + "rewards/rejected": -1.9039431810379028, + "step": 2230 + }, + { + "epoch": 0.56, + "grad_norm": 5.171489238739014, + "learning_rate": 6.9556918151944294e-06, + "logits/chosen": -0.2727546691894531, + "logits/rejected": -0.3521202802658081, + "logps/chosen": -55.6912841796875, + "logps/rejected": -69.51399230957031, + "loss": 0.8344, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8724687099456787, + "rewards/margins": 3.57951021194458, + "rewards/rejected": -0.7070415019989014, + "step": 2231 + }, + { + "epoch": 0.56, + "grad_norm": 5.1242356300354, + "learning_rate": 6.953280516267129e-06, + "logits/chosen": -0.17249993979930878, + "logits/rejected": -0.3606746792793274, + "logps/chosen": -47.32781219482422, + "logps/rejected": -67.7097396850586, + "loss": 0.651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.179311513900757, + "rewards/margins": 5.1656341552734375, + "rewards/rejected": -1.9863224029541016, + "step": 2232 + }, + { + "epoch": 0.56, + "grad_norm": 5.3315911293029785, + "learning_rate": 6.950868681122177e-06, + "logits/chosen": -0.23490546643733978, + "logits/rejected": -0.3112834692001343, + "logps/chosen": -81.16582489013672, + "logps/rejected": -73.95829010009766, + "loss": 1.052, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7748830318450928, + "rewards/margins": 2.7030720710754395, + "rewards/rejected": 0.07181093096733093, + "step": 2233 + }, + { + "epoch": 0.56, + "grad_norm": 6.562669277191162, + "learning_rate": 6.948456310421678e-06, + "logits/chosen": -0.17970781028270721, + "logits/rejected": -0.27212315797805786, + "logps/chosen": -61.395957946777344, + "logps/rejected": -79.14967346191406, + "loss": 0.8364, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.895512104034424, + "rewards/margins": 3.3949801921844482, + "rewards/rejected": -0.49946826696395874, + "step": 2234 + }, + { + "epoch": 0.56, + "grad_norm": 4.409260272979736, + "learning_rate": 6.946043404827876e-06, + "logits/chosen": -0.2964181900024414, + "logits/rejected": -0.37158265709877014, + "logps/chosen": -53.653690338134766, + "logps/rejected": -74.69507598876953, + "loss": 0.8607, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7169322967529297, + "rewards/margins": 3.9372177124023438, + "rewards/rejected": -1.2202855348587036, + "step": 2235 + }, + { + "epoch": 0.56, + "grad_norm": 2.8183515071868896, + "learning_rate": 6.943629965003167e-06, + "logits/chosen": -0.09079708158969879, + "logits/rejected": -0.19922089576721191, + "logps/chosen": -61.47840881347656, + "logps/rejected": -83.61289978027344, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.630587577819824, + "rewards/margins": 4.528114318847656, + "rewards/rejected": -1.8975270986557007, + "step": 2236 + }, + { + "epoch": 0.56, + "grad_norm": 3.4520533084869385, + "learning_rate": 6.9412159916100905e-06, + "logits/chosen": -0.18814192712306976, + "logits/rejected": -0.2664529085159302, + "logps/chosen": -62.8154411315918, + "logps/rejected": -86.67208099365234, + "loss": 0.7372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9641263484954834, + "rewards/margins": 4.167428016662598, + "rewards/rejected": -1.2033014297485352, + "step": 2237 + }, + { + "epoch": 0.56, + "grad_norm": 5.516094207763672, + "learning_rate": 6.938801485311339e-06, + "logits/chosen": -0.30349618196487427, + "logits/rejected": -0.360537052154541, + "logps/chosen": -70.32898712158203, + "logps/rejected": -75.26567077636719, + "loss": 0.9109, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6296396255493164, + "rewards/margins": 3.8437659740448, + "rewards/rejected": -1.2141263484954834, + "step": 2238 + }, + { + "epoch": 0.56, + "grad_norm": 6.67809534072876, + "learning_rate": 6.936386446769742e-06, + "logits/chosen": -0.21971584856510162, + "logits/rejected": -0.23294022679328918, + "logps/chosen": -47.81827926635742, + "logps/rejected": -77.67259979248047, + "loss": 0.6924, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8696560859680176, + "rewards/margins": 3.2139670848846436, + "rewards/rejected": -0.3443112075328827, + "step": 2239 + }, + { + "epoch": 0.56, + "grad_norm": 5.947567462921143, + "learning_rate": 6.933970876648284e-06, + "logits/chosen": -0.2799491286277771, + "logits/rejected": -0.3454265892505646, + "logps/chosen": -53.856807708740234, + "logps/rejected": -77.29074096679688, + "loss": 0.8024, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0805373191833496, + "rewards/margins": 4.34078311920166, + "rewards/rejected": -1.2602459192276, + "step": 2240 + }, + { + "epoch": 0.56, + "grad_norm": 6.558019161224365, + "learning_rate": 6.931554775610086e-06, + "logits/chosen": -0.19645626842975616, + "logits/rejected": -0.22361895442008972, + "logps/chosen": -57.603694915771484, + "logps/rejected": -87.89879608154297, + "loss": 0.9116, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0218026638031006, + "rewards/margins": 3.3340349197387695, + "rewards/rejected": -0.31223201751708984, + "step": 2241 + }, + { + "epoch": 0.56, + "grad_norm": 8.605561256408691, + "learning_rate": 6.929138144318424e-06, + "logits/chosen": -0.29182615876197815, + "logits/rejected": -0.3357090950012207, + "logps/chosen": -54.33101272583008, + "logps/rejected": -74.97106170654297, + "loss": 0.9683, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.760561227798462, + "rewards/margins": 2.908121347427368, + "rewards/rejected": -0.14756019413471222, + "step": 2242 + }, + { + "epoch": 0.56, + "grad_norm": 3.2700576782226562, + "learning_rate": 6.926720983436712e-06, + "logits/chosen": -0.307039737701416, + "logits/rejected": -0.38608312606811523, + "logps/chosen": -50.76862335205078, + "logps/rejected": -64.68865203857422, + "loss": 0.7553, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9897594451904297, + "rewards/margins": 3.4263992309570312, + "rewards/rejected": -0.43663978576660156, + "step": 2243 + }, + { + "epoch": 0.56, + "grad_norm": 4.85791015625, + "learning_rate": 6.924303293628517e-06, + "logits/chosen": -0.31072068214416504, + "logits/rejected": -0.34648412466049194, + "logps/chosen": -59.06928253173828, + "logps/rejected": -84.62761688232422, + "loss": 1.0048, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6004116535186768, + "rewards/margins": 3.19364595413208, + "rewards/rejected": -0.5932343006134033, + "step": 2244 + }, + { + "epoch": 0.56, + "grad_norm": 3.8257129192352295, + "learning_rate": 6.921885075557543e-06, + "logits/chosen": -0.17935681343078613, + "logits/rejected": -0.30803266167640686, + "logps/chosen": -66.65669250488281, + "logps/rejected": -71.45838165283203, + "loss": 0.8834, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0215632915496826, + "rewards/margins": 3.6789441108703613, + "rewards/rejected": -0.6573807597160339, + "step": 2245 + }, + { + "epoch": 0.56, + "grad_norm": 3.553978443145752, + "learning_rate": 6.919466329887645e-06, + "logits/chosen": -0.31809455156326294, + "logits/rejected": -0.37143489718437195, + "logps/chosen": -54.55828094482422, + "logps/rejected": -73.64391326904297, + "loss": 0.8003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8188886642456055, + "rewards/margins": 3.279010772705078, + "rewards/rejected": -0.4601220190525055, + "step": 2246 + }, + { + "epoch": 0.56, + "grad_norm": 4.289100170135498, + "learning_rate": 6.917047057282821e-06, + "logits/chosen": -0.3114086389541626, + "logits/rejected": -0.4251461923122406, + "logps/chosen": -53.2240104675293, + "logps/rejected": -73.16344451904297, + "loss": 0.8682, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8479483127593994, + "rewards/margins": 4.123493194580078, + "rewards/rejected": -1.275545358657837, + "step": 2247 + }, + { + "epoch": 0.56, + "grad_norm": 6.536381244659424, + "learning_rate": 6.914627258407214e-06, + "logits/chosen": -0.33847764134407043, + "logits/rejected": -0.4149314761161804, + "logps/chosen": -60.258514404296875, + "logps/rejected": -72.78759765625, + "loss": 0.8625, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.931680679321289, + "rewards/margins": 3.7346343994140625, + "rewards/rejected": -0.8029536604881287, + "step": 2248 + }, + { + "epoch": 0.56, + "grad_norm": 11.290375709533691, + "learning_rate": 6.912206933925108e-06, + "logits/chosen": -0.3050363063812256, + "logits/rejected": -0.4409130811691284, + "logps/chosen": -54.30811309814453, + "logps/rejected": -73.84073638916016, + "loss": 0.7796, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8596713542938232, + "rewards/margins": 4.423455238342285, + "rewards/rejected": -1.5637836456298828, + "step": 2249 + }, + { + "epoch": 0.56, + "grad_norm": 4.96183967590332, + "learning_rate": 6.909786084500939e-06, + "logits/chosen": -0.24495045840740204, + "logits/rejected": -0.2354004979133606, + "logps/chosen": -51.59889602661133, + "logps/rejected": -89.1943588256836, + "loss": 0.8661, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.511828660964966, + "rewards/margins": 3.296863317489624, + "rewards/rejected": -0.7850350141525269, + "step": 2250 + }, + { + "epoch": 0.56, + "grad_norm": 14.950433731079102, + "learning_rate": 6.907364710799278e-06, + "logits/chosen": -0.2819380760192871, + "logits/rejected": -0.2955304682254791, + "logps/chosen": -56.231258392333984, + "logps/rejected": -85.55622863769531, + "loss": 0.9019, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6986613273620605, + "rewards/margins": 3.5697081089019775, + "rewards/rejected": -0.8710470795631409, + "step": 2251 + }, + { + "epoch": 0.56, + "grad_norm": 1.0878527164459229, + "learning_rate": 6.9049428134848475e-06, + "logits/chosen": -0.2774292826652527, + "logits/rejected": -0.5087088346481323, + "logps/chosen": -54.62467956542969, + "logps/rejected": -70.02761840820312, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.675208568572998, + "rewards/margins": 5.853962421417236, + "rewards/rejected": -3.178753614425659, + "step": 2252 + }, + { + "epoch": 0.56, + "grad_norm": 19.58792495727539, + "learning_rate": 6.902520393222507e-06, + "logits/chosen": -0.280648410320282, + "logits/rejected": -0.44405224919319153, + "logps/chosen": -54.12734603881836, + "logps/rejected": -67.10392761230469, + "loss": 0.8086, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7105469703674316, + "rewards/margins": 4.143403053283691, + "rewards/rejected": -1.4328560829162598, + "step": 2253 + }, + { + "epoch": 0.56, + "grad_norm": 3.598259210586548, + "learning_rate": 6.900097450677269e-06, + "logits/chosen": -0.18709959089756012, + "logits/rejected": -0.33484524488449097, + "logps/chosen": -66.55802917480469, + "logps/rejected": -69.66138458251953, + "loss": 0.7516, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.833217144012451, + "rewards/margins": 3.7691810131073, + "rewards/rejected": -0.9359638690948486, + "step": 2254 + }, + { + "epoch": 0.56, + "grad_norm": 14.155569076538086, + "learning_rate": 6.897673986514277e-06, + "logits/chosen": -0.2690243124961853, + "logits/rejected": -0.3775947093963623, + "logps/chosen": -58.20574951171875, + "logps/rejected": -78.88951873779297, + "loss": 0.9385, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2656936645507812, + "rewards/margins": 3.8537991046905518, + "rewards/rejected": -0.5881050825119019, + "step": 2255 + }, + { + "epoch": 0.56, + "grad_norm": 6.059490203857422, + "learning_rate": 6.895250001398828e-06, + "logits/chosen": -0.19664126634597778, + "logits/rejected": -0.3336173892021179, + "logps/chosen": -67.78443908691406, + "logps/rejected": -93.13632202148438, + "loss": 0.852, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.864274501800537, + "rewards/margins": 3.8447093963623047, + "rewards/rejected": -0.980434775352478, + "step": 2256 + }, + { + "epoch": 0.56, + "grad_norm": 5.498383522033691, + "learning_rate": 6.892825495996357e-06, + "logits/chosen": -0.2122190296649933, + "logits/rejected": -0.3865102231502533, + "logps/chosen": -63.47053527832031, + "logps/rejected": -56.33842468261719, + "loss": 0.8852, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.624380588531494, + "rewards/margins": 3.7637836933135986, + "rewards/rejected": -1.1394034624099731, + "step": 2257 + }, + { + "epoch": 0.56, + "grad_norm": 3.3527257442474365, + "learning_rate": 6.890400470972443e-06, + "logits/chosen": -0.24105219542980194, + "logits/rejected": -0.379492849111557, + "logps/chosen": -63.49933624267578, + "logps/rejected": -71.53952026367188, + "loss": 0.7124, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8359146118164062, + "rewards/margins": 3.983682632446289, + "rewards/rejected": -1.1477675437927246, + "step": 2258 + }, + { + "epoch": 0.57, + "grad_norm": 6.381855487823486, + "learning_rate": 6.88797492699281e-06, + "logits/chosen": -0.23447933793067932, + "logits/rejected": -0.33704113960266113, + "logps/chosen": -46.52033615112305, + "logps/rejected": -66.94044494628906, + "loss": 0.7026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.612597942352295, + "rewards/margins": 4.040402412414551, + "rewards/rejected": -1.4278048276901245, + "step": 2259 + }, + { + "epoch": 0.57, + "grad_norm": 7.357161521911621, + "learning_rate": 6.885548864723319e-06, + "logits/chosen": -0.21988558769226074, + "logits/rejected": -0.33300265669822693, + "logps/chosen": -69.9413833618164, + "logps/rejected": -79.47920227050781, + "loss": 0.8649, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8473470211029053, + "rewards/margins": 3.7800939083099365, + "rewards/rejected": -0.9327467679977417, + "step": 2260 + }, + { + "epoch": 0.57, + "grad_norm": 6.307981014251709, + "learning_rate": 6.883122284829977e-06, + "logits/chosen": -0.28301891684532166, + "logits/rejected": -0.4488978683948517, + "logps/chosen": -57.110538482666016, + "logps/rejected": -62.46773147583008, + "loss": 0.8308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.913529634475708, + "rewards/margins": 4.6332011222839355, + "rewards/rejected": -1.7196714878082275, + "step": 2261 + }, + { + "epoch": 0.57, + "grad_norm": 4.289108753204346, + "learning_rate": 6.880695187978935e-06, + "logits/chosen": -0.2660656273365021, + "logits/rejected": -0.4107843041419983, + "logps/chosen": -64.1595230102539, + "logps/rejected": -82.8469467163086, + "loss": 0.7069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9733121395111084, + "rewards/margins": 4.681445121765137, + "rewards/rejected": -1.7081334590911865, + "step": 2262 + }, + { + "epoch": 0.57, + "grad_norm": 5.418886184692383, + "learning_rate": 6.87826757483648e-06, + "logits/chosen": -0.23192274570465088, + "logits/rejected": -0.33738595247268677, + "logps/chosen": -71.299560546875, + "logps/rejected": -77.31781005859375, + "loss": 0.942, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8143770694732666, + "rewards/margins": 3.807258367538452, + "rewards/rejected": -0.9928812980651855, + "step": 2263 + }, + { + "epoch": 0.57, + "grad_norm": 9.662459373474121, + "learning_rate": 6.875839446069048e-06, + "logits/chosen": -0.21122491359710693, + "logits/rejected": -0.36643046140670776, + "logps/chosen": -51.33163070678711, + "logps/rejected": -71.860107421875, + "loss": 0.823, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7086856365203857, + "rewards/margins": 4.105876445770264, + "rewards/rejected": -1.397190809249878, + "step": 2264 + }, + { + "epoch": 0.57, + "grad_norm": 3.1975576877593994, + "learning_rate": 6.8734108023432086e-06, + "logits/chosen": -0.29815638065338135, + "logits/rejected": -0.39052513241767883, + "logps/chosen": -78.27008819580078, + "logps/rejected": -94.15592956542969, + "loss": 0.7986, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9791722297668457, + "rewards/margins": 4.079360008239746, + "rewards/rejected": -1.1001875400543213, + "step": 2265 + }, + { + "epoch": 0.57, + "grad_norm": 4.691897869110107, + "learning_rate": 6.870981644325681e-06, + "logits/chosen": -0.31758347153663635, + "logits/rejected": -0.47075650095939636, + "logps/chosen": -61.161067962646484, + "logps/rejected": -72.97759246826172, + "loss": 0.7116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.937119483947754, + "rewards/margins": 4.8276567459106445, + "rewards/rejected": -1.8905370235443115, + "step": 2266 + }, + { + "epoch": 0.57, + "grad_norm": 3.981083869934082, + "learning_rate": 6.868551972683316e-06, + "logits/chosen": -0.2034275084733963, + "logits/rejected": -0.263744056224823, + "logps/chosen": -46.036216735839844, + "logps/rejected": -67.14610290527344, + "loss": 0.8179, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.974419116973877, + "rewards/margins": 3.188397169113159, + "rewards/rejected": -0.21397821605205536, + "step": 2267 + }, + { + "epoch": 0.57, + "grad_norm": 3.981735944747925, + "learning_rate": 6.866121788083118e-06, + "logits/chosen": -0.1998559981584549, + "logits/rejected": -0.30495506525039673, + "logps/chosen": -51.39120864868164, + "logps/rejected": -83.35626983642578, + "loss": 0.732, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.932328224182129, + "rewards/margins": 4.165550708770752, + "rewards/rejected": -1.2332221269607544, + "step": 2268 + }, + { + "epoch": 0.57, + "grad_norm": 4.0076093673706055, + "learning_rate": 6.86369109119222e-06, + "logits/chosen": -0.23023860156536102, + "logits/rejected": -0.28574132919311523, + "logps/chosen": -69.30136108398438, + "logps/rejected": -93.02330017089844, + "loss": 0.8648, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8048150539398193, + "rewards/margins": 4.606062412261963, + "rewards/rejected": -1.8012473583221436, + "step": 2269 + }, + { + "epoch": 0.57, + "grad_norm": 5.578742980957031, + "learning_rate": 6.861259882677902e-06, + "logits/chosen": -0.2072431445121765, + "logits/rejected": -0.3181793689727783, + "logps/chosen": -65.41407775878906, + "logps/rejected": -72.87474060058594, + "loss": 0.9087, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7022695541381836, + "rewards/margins": 3.5519797801971436, + "rewards/rejected": -0.8497103452682495, + "step": 2270 + }, + { + "epoch": 0.57, + "grad_norm": 7.729179382324219, + "learning_rate": 6.8588281632075824e-06, + "logits/chosen": -0.2546984851360321, + "logits/rejected": -0.39705246686935425, + "logps/chosen": -46.005104064941406, + "logps/rejected": -72.29354858398438, + "loss": 0.8108, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.840963840484619, + "rewards/margins": 3.9138002395629883, + "rewards/rejected": -1.0728362798690796, + "step": 2271 + }, + { + "epoch": 0.57, + "grad_norm": 3.629775047302246, + "learning_rate": 6.856395933448823e-06, + "logits/chosen": -0.26219674944877625, + "logits/rejected": -0.3935016989707947, + "logps/chosen": -60.16495895385742, + "logps/rejected": -82.29276275634766, + "loss": 0.7221, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.862889289855957, + "rewards/margins": 4.188898086547852, + "rewards/rejected": -1.3260087966918945, + "step": 2272 + }, + { + "epoch": 0.57, + "grad_norm": 2.6047799587249756, + "learning_rate": 6.85396319406932e-06, + "logits/chosen": -0.2270825356245041, + "logits/rejected": -0.3858655095100403, + "logps/chosen": -63.8133430480957, + "logps/rejected": -64.93549346923828, + "loss": 0.7723, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.887511730194092, + "rewards/margins": 4.222175598144531, + "rewards/rejected": -1.3346638679504395, + "step": 2273 + }, + { + "epoch": 0.57, + "grad_norm": 3.707465648651123, + "learning_rate": 6.851529945736918e-06, + "logits/chosen": -0.24050889909267426, + "logits/rejected": -0.33086445927619934, + "logps/chosen": -65.19241333007812, + "logps/rejected": -91.35579681396484, + "loss": 0.7906, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8234469890594482, + "rewards/margins": 4.3947319984436035, + "rewards/rejected": -1.5712852478027344, + "step": 2274 + }, + { + "epoch": 0.57, + "grad_norm": 5.076174259185791, + "learning_rate": 6.8490961891195916e-06, + "logits/chosen": -0.2155458629131317, + "logits/rejected": -0.41770702600479126, + "logps/chosen": -54.0674934387207, + "logps/rejected": -69.56893920898438, + "loss": 0.7022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.975140333175659, + "rewards/margins": 4.896241188049316, + "rewards/rejected": -1.921100378036499, + "step": 2275 + }, + { + "epoch": 0.57, + "grad_norm": 4.303147315979004, + "learning_rate": 6.846661924885461e-06, + "logits/chosen": -0.15960252285003662, + "logits/rejected": -0.3314881920814514, + "logps/chosen": -68.13273620605469, + "logps/rejected": -84.76637268066406, + "loss": 0.6905, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.080249071121216, + "rewards/margins": 4.602632522583008, + "rewards/rejected": -1.5223835706710815, + "step": 2276 + }, + { + "epoch": 0.57, + "grad_norm": 5.966677665710449, + "learning_rate": 6.8442271537027826e-06, + "logits/chosen": -0.1818888783454895, + "logits/rejected": -0.29903683066368103, + "logps/chosen": -55.22060012817383, + "logps/rejected": -74.7247314453125, + "loss": 0.9521, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.723262071609497, + "rewards/margins": 3.251934289932251, + "rewards/rejected": -0.5286722183227539, + "step": 2277 + }, + { + "epoch": 0.57, + "grad_norm": 4.449918270111084, + "learning_rate": 6.8417918762399585e-06, + "logits/chosen": -0.2747299373149872, + "logits/rejected": -0.36485373973846436, + "logps/chosen": -51.490970611572266, + "logps/rejected": -78.91574096679688, + "loss": 0.7666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7611052989959717, + "rewards/margins": 3.9130876064300537, + "rewards/rejected": -1.151982307434082, + "step": 2278 + }, + { + "epoch": 0.57, + "grad_norm": 4.238665580749512, + "learning_rate": 6.839356093165519e-06, + "logits/chosen": -0.25503766536712646, + "logits/rejected": -0.3116533160209656, + "logps/chosen": -47.39862823486328, + "logps/rejected": -74.51339721679688, + "loss": 0.8147, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7374260425567627, + "rewards/margins": 3.5779364109039307, + "rewards/rejected": -0.8405102491378784, + "step": 2279 + }, + { + "epoch": 0.57, + "grad_norm": 4.927570343017578, + "learning_rate": 6.836919805148142e-06, + "logits/chosen": -0.2922721803188324, + "logits/rejected": -0.3643825352191925, + "logps/chosen": -47.35123825073242, + "logps/rejected": -72.94136810302734, + "loss": 0.8846, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9238691329956055, + "rewards/margins": 3.4269227981567383, + "rewards/rejected": -0.5030536651611328, + "step": 2280 + }, + { + "epoch": 0.57, + "grad_norm": 5.318206787109375, + "learning_rate": 6.834483012856642e-06, + "logits/chosen": -0.241342693567276, + "logits/rejected": -0.3131634593009949, + "logps/chosen": -56.956932067871094, + "logps/rejected": -81.43738555908203, + "loss": 0.7267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7746920585632324, + "rewards/margins": 3.7225732803344727, + "rewards/rejected": -0.9478808641433716, + "step": 2281 + }, + { + "epoch": 0.57, + "grad_norm": 11.184977531433105, + "learning_rate": 6.832045716959969e-06, + "logits/chosen": -0.24493442475795746, + "logits/rejected": -0.321582168340683, + "logps/chosen": -55.47127914428711, + "logps/rejected": -67.8532943725586, + "loss": 0.9616, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6874775886535645, + "rewards/margins": 3.047912359237671, + "rewards/rejected": -0.3604346215724945, + "step": 2282 + }, + { + "epoch": 0.57, + "grad_norm": 7.445114612579346, + "learning_rate": 6.829607918127215e-06, + "logits/chosen": -0.2047497034072876, + "logits/rejected": -0.3096594214439392, + "logps/chosen": -52.38456726074219, + "logps/rejected": -73.58134460449219, + "loss": 0.7157, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.025275945663452, + "rewards/margins": 4.207504749298096, + "rewards/rejected": -1.1822291612625122, + "step": 2283 + }, + { + "epoch": 0.57, + "grad_norm": 6.626951694488525, + "learning_rate": 6.827169617027607e-06, + "logits/chosen": -0.29365307092666626, + "logits/rejected": -0.4104301333427429, + "logps/chosen": -61.17181396484375, + "logps/rejected": -77.48443603515625, + "loss": 0.8241, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0164170265197754, + "rewards/margins": 3.727768898010254, + "rewards/rejected": -0.711351752281189, + "step": 2284 + }, + { + "epoch": 0.57, + "grad_norm": 4.168869972229004, + "learning_rate": 6.824730814330513e-06, + "logits/chosen": -0.3155412971973419, + "logits/rejected": -0.4002313017845154, + "logps/chosen": -61.93138122558594, + "logps/rejected": -81.78014373779297, + "loss": 0.8065, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.795738458633423, + "rewards/margins": 3.615561008453369, + "rewards/rejected": -0.8198227882385254, + "step": 2285 + }, + { + "epoch": 0.57, + "grad_norm": 4.182807922363281, + "learning_rate": 6.822291510705434e-06, + "logits/chosen": -0.2745720446109772, + "logits/rejected": -0.2779427766799927, + "logps/chosen": -54.17859649658203, + "logps/rejected": -87.06804656982422, + "loss": 0.7674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.738795518875122, + "rewards/margins": 3.785823106765747, + "rewards/rejected": -1.0470277070999146, + "step": 2286 + }, + { + "epoch": 0.57, + "grad_norm": 4.807878494262695, + "learning_rate": 6.819851706822015e-06, + "logits/chosen": -0.15392334759235382, + "logits/rejected": -0.25817716121673584, + "logps/chosen": -56.77204132080078, + "logps/rejected": -74.625244140625, + "loss": 0.8385, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.877878189086914, + "rewards/margins": 3.514498233795166, + "rewards/rejected": -0.6366196870803833, + "step": 2287 + }, + { + "epoch": 0.57, + "grad_norm": 4.497987270355225, + "learning_rate": 6.817411403350031e-06, + "logits/chosen": -0.18717172741889954, + "logits/rejected": -0.2957853674888611, + "logps/chosen": -61.54710388183594, + "logps/rejected": -90.16958618164062, + "loss": 0.8483, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.979658842086792, + "rewards/margins": 4.145641326904297, + "rewards/rejected": -1.1659823656082153, + "step": 2288 + }, + { + "epoch": 0.57, + "grad_norm": 11.523336410522461, + "learning_rate": 6.814970600959404e-06, + "logits/chosen": -0.20322462916374207, + "logits/rejected": -0.2769932150840759, + "logps/chosen": -76.9164047241211, + "logps/rejected": -85.64633178710938, + "loss": 0.9797, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8735899925231934, + "rewards/margins": 3.5266377925872803, + "rewards/rejected": -0.6530481576919556, + "step": 2289 + }, + { + "epoch": 0.57, + "grad_norm": 7.871955871582031, + "learning_rate": 6.8125293003201806e-06, + "logits/chosen": -0.2606789469718933, + "logits/rejected": -0.3427082896232605, + "logps/chosen": -56.93289566040039, + "logps/rejected": -86.07916259765625, + "loss": 0.6987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.952755928039551, + "rewards/margins": 4.629875659942627, + "rewards/rejected": -1.6771196126937866, + "step": 2290 + }, + { + "epoch": 0.57, + "grad_norm": 2.3572804927825928, + "learning_rate": 6.810087502102554e-06, + "logits/chosen": -0.23916202783584595, + "logits/rejected": -0.28195300698280334, + "logps/chosen": -47.40843963623047, + "logps/rejected": -74.71632385253906, + "loss": 0.684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8253469467163086, + "rewards/margins": 3.9018259048461914, + "rewards/rejected": -1.0764788389205933, + "step": 2291 + }, + { + "epoch": 0.57, + "grad_norm": 3.9817707538604736, + "learning_rate": 6.807645206976847e-06, + "logits/chosen": -0.2553868293762207, + "logits/rejected": -0.37495219707489014, + "logps/chosen": -53.30366516113281, + "logps/rejected": -68.7001953125, + "loss": 0.8127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9619383811950684, + "rewards/margins": 3.5567965507507324, + "rewards/rejected": -0.5948580503463745, + "step": 2292 + }, + { + "epoch": 0.57, + "grad_norm": 5.7680253982543945, + "learning_rate": 6.805202415613528e-06, + "logits/chosen": -0.2712360918521881, + "logits/rejected": -0.29817306995391846, + "logps/chosen": -65.4447021484375, + "logps/rejected": -75.24821472167969, + "loss": 0.9942, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1140360832214355, + "rewards/margins": 3.067620277404785, + "rewards/rejected": 0.04641532897949219, + "step": 2293 + }, + { + "epoch": 0.57, + "grad_norm": 3.736534833908081, + "learning_rate": 6.802759128683191e-06, + "logits/chosen": -0.14840397238731384, + "logits/rejected": -0.223671093583107, + "logps/chosen": -65.97126770019531, + "logps/rejected": -82.31190490722656, + "loss": 0.7497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7798690795898438, + "rewards/margins": 4.155898094177246, + "rewards/rejected": -1.3760286569595337, + "step": 2294 + }, + { + "epoch": 0.57, + "grad_norm": 6.171814918518066, + "learning_rate": 6.800315346856573e-06, + "logits/chosen": -0.3014931082725525, + "logits/rejected": -0.37598857283592224, + "logps/chosen": -67.9521484375, + "logps/rejected": -78.02714538574219, + "loss": 0.9072, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.599740505218506, + "rewards/margins": 3.2794241905212402, + "rewards/rejected": -0.6796832084655762, + "step": 2295 + }, + { + "epoch": 0.57, + "grad_norm": 4.28226900100708, + "learning_rate": 6.797871070804543e-06, + "logits/chosen": -0.24542933702468872, + "logits/rejected": -0.38991668820381165, + "logps/chosen": -63.077144622802734, + "logps/rejected": -81.04048156738281, + "loss": 0.8778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.93316388130188, + "rewards/margins": 5.1177659034729, + "rewards/rejected": -2.1846017837524414, + "step": 2296 + }, + { + "epoch": 0.57, + "grad_norm": 3.6855947971343994, + "learning_rate": 6.7954263011981115e-06, + "logits/chosen": -0.2967504560947418, + "logits/rejected": -0.3440488576889038, + "logps/chosen": -55.50800323486328, + "logps/rejected": -101.15290832519531, + "loss": 0.7501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7041585445404053, + "rewards/margins": 4.541876792907715, + "rewards/rejected": -1.83771812915802, + "step": 2297 + }, + { + "epoch": 0.57, + "grad_norm": 2.500722885131836, + "learning_rate": 6.792981038708417e-06, + "logits/chosen": -0.22562013566493988, + "logits/rejected": -0.40982764959335327, + "logps/chosen": -62.086334228515625, + "logps/rejected": -69.63956451416016, + "loss": 0.6846, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.922883987426758, + "rewards/margins": 5.335667133331299, + "rewards/rejected": -2.412783145904541, + "step": 2298 + }, + { + "epoch": 0.58, + "grad_norm": 3.179993152618408, + "learning_rate": 6.790535284006738e-06, + "logits/chosen": -0.20223671197891235, + "logits/rejected": -0.2847859561443329, + "logps/chosen": -57.99312210083008, + "logps/rejected": -93.44390106201172, + "loss": 0.7325, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1191418170928955, + "rewards/margins": 4.800265789031982, + "rewards/rejected": -1.6811240911483765, + "step": 2299 + }, + { + "epoch": 0.58, + "grad_norm": 4.301145553588867, + "learning_rate": 6.788089037764487e-06, + "logits/chosen": -0.20627641677856445, + "logits/rejected": -0.3088917136192322, + "logps/chosen": -55.723697662353516, + "logps/rejected": -71.86735534667969, + "loss": 0.7917, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8301706314086914, + "rewards/margins": 3.701223134994507, + "rewards/rejected": -0.8710526823997498, + "step": 2300 + }, + { + "epoch": 0.58, + "grad_norm": 3.1970534324645996, + "learning_rate": 6.785642300653211e-06, + "logits/chosen": -0.22671552002429962, + "logits/rejected": -0.2604409456253052, + "logps/chosen": -55.262020111083984, + "logps/rejected": -73.8447265625, + "loss": 0.8013, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7395670413970947, + "rewards/margins": 3.3547985553741455, + "rewards/rejected": -0.6152316927909851, + "step": 2301 + }, + { + "epoch": 0.58, + "grad_norm": 4.564061164855957, + "learning_rate": 6.7831950733445916e-06, + "logits/chosen": -0.14584939181804657, + "logits/rejected": -0.1770719736814499, + "logps/chosen": -70.17156982421875, + "logps/rejected": -96.47877502441406, + "loss": 0.831, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.981241226196289, + "rewards/margins": 3.7656126022338867, + "rewards/rejected": -0.7843711376190186, + "step": 2302 + }, + { + "epoch": 0.58, + "grad_norm": 6.2469635009765625, + "learning_rate": 6.780747356510448e-06, + "logits/chosen": -0.3065480589866638, + "logits/rejected": -0.3732335865497589, + "logps/chosen": -56.02752685546875, + "logps/rejected": -80.51123046875, + "loss": 0.9995, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3907265663146973, + "rewards/margins": 3.263607978820801, + "rewards/rejected": -0.8728814721107483, + "step": 2303 + }, + { + "epoch": 0.58, + "grad_norm": 6.039930820465088, + "learning_rate": 6.7782991508227295e-06, + "logits/chosen": -0.27815455198287964, + "logits/rejected": -0.3804636001586914, + "logps/chosen": -61.13496780395508, + "logps/rejected": -79.35000610351562, + "loss": 1.01, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.801753520965576, + "rewards/margins": 3.260389566421509, + "rewards/rejected": -0.4586362838745117, + "step": 2304 + }, + { + "epoch": 0.58, + "grad_norm": 9.893735885620117, + "learning_rate": 6.77585045695352e-06, + "logits/chosen": -0.27579599618911743, + "logits/rejected": -0.390455961227417, + "logps/chosen": -66.86557006835938, + "logps/rejected": -70.24195098876953, + "loss": 1.097, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.290013074874878, + "rewards/margins": 3.4053661823272705, + "rewards/rejected": -1.115352988243103, + "step": 2305 + }, + { + "epoch": 0.58, + "grad_norm": 5.311185359954834, + "learning_rate": 6.77340127557504e-06, + "logits/chosen": -0.28183820843696594, + "logits/rejected": -0.3970247805118561, + "logps/chosen": -55.81182098388672, + "logps/rejected": -72.44236755371094, + "loss": 0.7466, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.790647506713867, + "rewards/margins": 4.067066669464111, + "rewards/rejected": -1.2764191627502441, + "step": 2306 + }, + { + "epoch": 0.58, + "grad_norm": 6.037176132202148, + "learning_rate": 6.770951607359644e-06, + "logits/chosen": -0.24293671548366547, + "logits/rejected": -0.3475426137447357, + "logps/chosen": -70.03440856933594, + "logps/rejected": -74.52963256835938, + "loss": 0.9385, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.753650426864624, + "rewards/margins": 3.108720541000366, + "rewards/rejected": -0.355070024728775, + "step": 2307 + }, + { + "epoch": 0.58, + "grad_norm": 7.302443981170654, + "learning_rate": 6.768501452979817e-06, + "logits/chosen": -0.3302965462207794, + "logits/rejected": -0.3966856002807617, + "logps/chosen": -51.118682861328125, + "logps/rejected": -81.60063934326172, + "loss": 0.8171, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.741283655166626, + "rewards/margins": 3.6407947540283203, + "rewards/rejected": -0.8995110988616943, + "step": 2308 + }, + { + "epoch": 0.58, + "grad_norm": 7.276098728179932, + "learning_rate": 6.766050813108182e-06, + "logits/chosen": -0.1965871900320053, + "logits/rejected": -0.30517590045928955, + "logps/chosen": -47.124996185302734, + "logps/rejected": -67.29063415527344, + "loss": 0.7912, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4659264087677, + "rewards/margins": 3.5961809158325195, + "rewards/rejected": -1.1302547454833984, + "step": 2309 + }, + { + "epoch": 0.58, + "grad_norm": 4.47252082824707, + "learning_rate": 6.763599688417487e-06, + "logits/chosen": -0.30058038234710693, + "logits/rejected": -0.3814864754676819, + "logps/chosen": -60.33155059814453, + "logps/rejected": -79.40241241455078, + "loss": 0.8216, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.002552032470703, + "rewards/margins": 4.435670375823975, + "rewards/rejected": -1.433118462562561, + "step": 2310 + }, + { + "epoch": 0.58, + "grad_norm": 3.5719828605651855, + "learning_rate": 6.761148079580624e-06, + "logits/chosen": -0.2542354166507721, + "logits/rejected": -0.3657970428466797, + "logps/chosen": -65.16372680664062, + "logps/rejected": -91.33060455322266, + "loss": 0.7063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8049778938293457, + "rewards/margins": 4.875582695007324, + "rewards/rejected": -2.0706052780151367, + "step": 2311 + }, + { + "epoch": 0.58, + "grad_norm": 4.644759178161621, + "learning_rate": 6.758695987270609e-06, + "logits/chosen": -0.2170868217945099, + "logits/rejected": -0.33837321400642395, + "logps/chosen": -61.32481002807617, + "logps/rejected": -83.30747985839844, + "loss": 0.8574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7227554321289062, + "rewards/margins": 3.856597661972046, + "rewards/rejected": -1.1338424682617188, + "step": 2312 + }, + { + "epoch": 0.58, + "grad_norm": 3.964076280593872, + "learning_rate": 6.756243412160598e-06, + "logits/chosen": -0.26101166009902954, + "logits/rejected": -0.31145113706588745, + "logps/chosen": -54.995304107666016, + "logps/rejected": -87.29012298583984, + "loss": 0.7624, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.012789011001587, + "rewards/margins": 4.229609489440918, + "rewards/rejected": -1.2168207168579102, + "step": 2313 + }, + { + "epoch": 0.58, + "grad_norm": 4.73489236831665, + "learning_rate": 6.753790354923872e-06, + "logits/chosen": -0.27614152431488037, + "logits/rejected": -0.4204583764076233, + "logps/chosen": -56.63085174560547, + "logps/rejected": -60.903995513916016, + "loss": 0.8264, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.978865623474121, + "rewards/margins": 3.9257965087890625, + "rewards/rejected": -0.9469310641288757, + "step": 2314 + }, + { + "epoch": 0.58, + "grad_norm": 4.138552665710449, + "learning_rate": 6.751336816233852e-06, + "logits/chosen": -0.24936607480049133, + "logits/rejected": -0.354300320148468, + "logps/chosen": -63.61201095581055, + "logps/rejected": -75.91291809082031, + "loss": 0.8342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.038404941558838, + "rewards/margins": 4.10561466217041, + "rewards/rejected": -1.067209243774414, + "step": 2315 + }, + { + "epoch": 0.58, + "grad_norm": 5.908290386199951, + "learning_rate": 6.748882796764083e-06, + "logits/chosen": -0.2649608552455902, + "logits/rejected": -0.389688640832901, + "logps/chosen": -60.12038040161133, + "logps/rejected": -63.74466323852539, + "loss": 0.8351, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.978682041168213, + "rewards/margins": 3.5570433139801025, + "rewards/rejected": -0.5783613324165344, + "step": 2316 + }, + { + "epoch": 0.58, + "grad_norm": 4.405956268310547, + "learning_rate": 6.746428297188249e-06, + "logits/chosen": -0.25040334463119507, + "logits/rejected": -0.3526550829410553, + "logps/chosen": -58.67744064331055, + "logps/rejected": -64.7685775756836, + "loss": 0.8623, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.866544485092163, + "rewards/margins": 3.087425947189331, + "rewards/rejected": -0.22088144719600677, + "step": 2317 + }, + { + "epoch": 0.58, + "grad_norm": 4.323265552520752, + "learning_rate": 6.7439733181801615e-06, + "logits/chosen": -0.24428990483283997, + "logits/rejected": -0.276833713054657, + "logps/chosen": -54.86597442626953, + "logps/rejected": -74.5811538696289, + "loss": 0.9057, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.748030185699463, + "rewards/margins": 3.1053123474121094, + "rewards/rejected": -0.357281893491745, + "step": 2318 + }, + { + "epoch": 0.58, + "grad_norm": 6.9624481201171875, + "learning_rate": 6.7415178604137686e-06, + "logits/chosen": -0.23163992166519165, + "logits/rejected": -0.3328404724597931, + "logps/chosen": -58.726566314697266, + "logps/rejected": -71.56261444091797, + "loss": 0.8711, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.662544012069702, + "rewards/margins": 3.54032826423645, + "rewards/rejected": -0.8777843713760376, + "step": 2319 + }, + { + "epoch": 0.58, + "grad_norm": 3.957343816757202, + "learning_rate": 6.739061924563141e-06, + "logits/chosen": -0.2642049491405487, + "logits/rejected": -0.3672357201576233, + "logps/chosen": -53.96847152709961, + "logps/rejected": -66.2410888671875, + "loss": 0.8206, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7986998558044434, + "rewards/margins": 3.6635751724243164, + "rewards/rejected": -0.8648752570152283, + "step": 2320 + }, + { + "epoch": 0.58, + "grad_norm": 4.163896560668945, + "learning_rate": 6.736605511302492e-06, + "logits/chosen": -0.1894742250442505, + "logits/rejected": -0.2734220623970032, + "logps/chosen": -56.70046615600586, + "logps/rejected": -77.3481674194336, + "loss": 0.7046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.140850305557251, + "rewards/margins": 4.047544479370117, + "rewards/rejected": -0.9066943526268005, + "step": 2321 + }, + { + "epoch": 0.58, + "grad_norm": 3.5969743728637695, + "learning_rate": 6.734148621306155e-06, + "logits/chosen": -0.21797631680965424, + "logits/rejected": -0.35869142413139343, + "logps/chosen": -55.542625427246094, + "logps/rejected": -69.4615478515625, + "loss": 0.7494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83038330078125, + "rewards/margins": 4.431549549102783, + "rewards/rejected": -1.6011663675308228, + "step": 2322 + }, + { + "epoch": 0.58, + "grad_norm": 3.104339122772217, + "learning_rate": 6.731691255248602e-06, + "logits/chosen": -0.2270834743976593, + "logits/rejected": -0.28533241152763367, + "logps/chosen": -58.47134017944336, + "logps/rejected": -87.93716430664062, + "loss": 0.8185, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0109543800354004, + "rewards/margins": 4.27298641204834, + "rewards/rejected": -1.262032389640808, + "step": 2323 + }, + { + "epoch": 0.58, + "grad_norm": 3.7142670154571533, + "learning_rate": 6.729233413804434e-06, + "logits/chosen": -0.33854609727859497, + "logits/rejected": -0.41751477122306824, + "logps/chosen": -63.50868606567383, + "logps/rejected": -75.58417510986328, + "loss": 0.83, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0680174827575684, + "rewards/margins": 4.115274429321289, + "rewards/rejected": -1.0472570657730103, + "step": 2324 + }, + { + "epoch": 0.58, + "grad_norm": 6.161027908325195, + "learning_rate": 6.726775097648378e-06, + "logits/chosen": -0.2321314811706543, + "logits/rejected": -0.3183329701423645, + "logps/chosen": -65.9196548461914, + "logps/rejected": -70.44296264648438, + "loss": 0.9826, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.791567802429199, + "rewards/margins": 2.6182994842529297, + "rewards/rejected": 0.1732684075832367, + "step": 2325 + }, + { + "epoch": 0.58, + "grad_norm": 3.886491537094116, + "learning_rate": 6.724316307455297e-06, + "logits/chosen": -0.19925294816493988, + "logits/rejected": -0.3407626450061798, + "logps/chosen": -65.19623565673828, + "logps/rejected": -76.48274993896484, + "loss": 0.7683, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.781468629837036, + "rewards/margins": 3.7315402030944824, + "rewards/rejected": -0.9500715136528015, + "step": 2326 + }, + { + "epoch": 0.58, + "grad_norm": 3.1536951065063477, + "learning_rate": 6.721857043900183e-06, + "logits/chosen": -0.22354337573051453, + "logits/rejected": -0.3161129951477051, + "logps/chosen": -60.63908004760742, + "logps/rejected": -84.06692504882812, + "loss": 0.66, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057011842727661, + "rewards/margins": 4.3701491355896, + "rewards/rejected": -1.3131372928619385, + "step": 2327 + }, + { + "epoch": 0.58, + "grad_norm": 3.3605892658233643, + "learning_rate": 6.719397307658154e-06, + "logits/chosen": -0.2174544483423233, + "logits/rejected": -0.3500569760799408, + "logps/chosen": -65.60147094726562, + "logps/rejected": -75.74195098876953, + "loss": 0.7651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.98840069770813, + "rewards/margins": 3.987560272216797, + "rewards/rejected": -0.9991592168807983, + "step": 2328 + }, + { + "epoch": 0.58, + "grad_norm": 3.3668158054351807, + "learning_rate": 6.716937099404463e-06, + "logits/chosen": -0.23932963609695435, + "logits/rejected": -0.3364477753639221, + "logps/chosen": -61.26926803588867, + "logps/rejected": -69.58575439453125, + "loss": 0.8044, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.847261428833008, + "rewards/margins": 3.548192024230957, + "rewards/rejected": -0.7009304761886597, + "step": 2329 + }, + { + "epoch": 0.58, + "grad_norm": 8.608647346496582, + "learning_rate": 6.714476419814492e-06, + "logits/chosen": -0.18750417232513428, + "logits/rejected": -0.2813730239868164, + "logps/chosen": -61.691532135009766, + "logps/rejected": -80.81163787841797, + "loss": 0.9454, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6756415367126465, + "rewards/margins": 3.414072275161743, + "rewards/rejected": -0.7384306192398071, + "step": 2330 + }, + { + "epoch": 0.58, + "grad_norm": 3.469139337539673, + "learning_rate": 6.712015269563745e-06, + "logits/chosen": -0.2552376985549927, + "logits/rejected": -0.24327123165130615, + "logps/chosen": -55.44352340698242, + "logps/rejected": -87.32891845703125, + "loss": 0.775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.866004467010498, + "rewards/margins": 3.515296220779419, + "rewards/rejected": -0.6492918729782104, + "step": 2331 + }, + { + "epoch": 0.58, + "grad_norm": 3.046760082244873, + "learning_rate": 6.709553649327865e-06, + "logits/chosen": -0.20595380663871765, + "logits/rejected": -0.3991485834121704, + "logps/chosen": -56.824275970458984, + "logps/rejected": -71.3918228149414, + "loss": 0.6942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8560726642608643, + "rewards/margins": 4.581584930419922, + "rewards/rejected": -1.7255122661590576, + "step": 2332 + }, + { + "epoch": 0.58, + "grad_norm": 4.750133991241455, + "learning_rate": 6.707091559782621e-06, + "logits/chosen": -0.24545758962631226, + "logits/rejected": -0.37435609102249146, + "logps/chosen": -56.6243782043457, + "logps/rejected": -84.1644287109375, + "loss": 0.6527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8351597785949707, + "rewards/margins": 5.084596157073975, + "rewards/rejected": -2.249436378479004, + "step": 2333 + }, + { + "epoch": 0.58, + "grad_norm": 8.656519889831543, + "learning_rate": 6.704629001603906e-06, + "logits/chosen": -0.2156500518321991, + "logits/rejected": -0.30008235573768616, + "logps/chosen": -63.89897918701172, + "logps/rejected": -74.76335144042969, + "loss": 0.849, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7673070430755615, + "rewards/margins": 3.4415860176086426, + "rewards/rejected": -0.6742787957191467, + "step": 2334 + }, + { + "epoch": 0.58, + "grad_norm": 4.954250335693359, + "learning_rate": 6.702165975467747e-06, + "logits/chosen": -0.2514744699001312, + "logits/rejected": -0.3673059046268463, + "logps/chosen": -51.38825607299805, + "logps/rejected": -79.23851013183594, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.751462936401367, + "rewards/margins": 4.219707489013672, + "rewards/rejected": -1.4682445526123047, + "step": 2335 + }, + { + "epoch": 0.58, + "grad_norm": 4.926753997802734, + "learning_rate": 6.699702482050298e-06, + "logits/chosen": -0.24680598080158234, + "logits/rejected": -0.388717919588089, + "logps/chosen": -49.53788375854492, + "logps/rejected": -63.13505935668945, + "loss": 0.7962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.594228506088257, + "rewards/margins": 3.6134018898010254, + "rewards/rejected": -1.0191733837127686, + "step": 2336 + }, + { + "epoch": 0.58, + "grad_norm": 3.8852884769439697, + "learning_rate": 6.69723852202784e-06, + "logits/chosen": -0.1809719204902649, + "logits/rejected": -0.29469379782676697, + "logps/chosen": -62.23057556152344, + "logps/rejected": -75.998779296875, + "loss": 0.7188, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.795074701309204, + "rewards/margins": 3.791659116744995, + "rewards/rejected": -0.9965842962265015, + "step": 2337 + }, + { + "epoch": 0.58, + "grad_norm": 6.241151809692383, + "learning_rate": 6.694774096076786e-06, + "logits/chosen": -0.13687679171562195, + "logits/rejected": -0.31410345435142517, + "logps/chosen": -64.63123321533203, + "logps/rejected": -65.34974670410156, + "loss": 0.7782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6252782344818115, + "rewards/margins": 4.259078025817871, + "rewards/rejected": -1.6337995529174805, + "step": 2338 + }, + { + "epoch": 0.59, + "grad_norm": 10.09404182434082, + "learning_rate": 6.692309204873672e-06, + "logits/chosen": -0.19252918660640717, + "logits/rejected": -0.22109951078891754, + "logps/chosen": -51.695804595947266, + "logps/rejected": -76.97731018066406, + "loss": 0.9552, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8759047985076904, + "rewards/margins": 3.2088005542755127, + "rewards/rejected": -0.3328954577445984, + "step": 2339 + }, + { + "epoch": 0.59, + "grad_norm": 4.213667869567871, + "learning_rate": 6.689843849095164e-06, + "logits/chosen": -0.32527583837509155, + "logits/rejected": -0.3974604904651642, + "logps/chosen": -59.584617614746094, + "logps/rejected": -96.03350830078125, + "loss": 0.8469, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.00114369392395, + "rewards/margins": 4.216712474822998, + "rewards/rejected": -1.2155687808990479, + "step": 2340 + }, + { + "epoch": 0.59, + "grad_norm": 2.9298036098480225, + "learning_rate": 6.687378029418054e-06, + "logits/chosen": -0.23156428337097168, + "logits/rejected": -0.4180980324745178, + "logps/chosen": -55.075626373291016, + "logps/rejected": -70.17648315429688, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0003533363342285, + "rewards/margins": 4.986567497253418, + "rewards/rejected": -1.986214280128479, + "step": 2341 + }, + { + "epoch": 0.59, + "grad_norm": 3.4707798957824707, + "learning_rate": 6.684911746519267e-06, + "logits/chosen": -0.2135552167892456, + "logits/rejected": -0.35002097487449646, + "logps/chosen": -65.64598846435547, + "logps/rejected": -79.29388427734375, + "loss": 0.7949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.601618528366089, + "rewards/margins": 4.507610321044922, + "rewards/rejected": -1.9059916734695435, + "step": 2342 + }, + { + "epoch": 0.59, + "grad_norm": 4.82611083984375, + "learning_rate": 6.682445001075846e-06, + "logits/chosen": -0.25418204069137573, + "logits/rejected": -0.3062707781791687, + "logps/chosen": -44.60671615600586, + "logps/rejected": -81.67996978759766, + "loss": 0.8597, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8575878143310547, + "rewards/margins": 3.8209025859832764, + "rewards/rejected": -0.9633145332336426, + "step": 2343 + }, + { + "epoch": 0.59, + "grad_norm": 3.303199291229248, + "learning_rate": 6.679977793764969e-06, + "logits/chosen": -0.2396218329668045, + "logits/rejected": -0.3736666142940521, + "logps/chosen": -72.25897216796875, + "logps/rejected": -78.20608520507812, + "loss": 0.7218, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8795406818389893, + "rewards/margins": 4.243851184844971, + "rewards/rejected": -1.364310622215271, + "step": 2344 + }, + { + "epoch": 0.59, + "grad_norm": 3.630422830581665, + "learning_rate": 6.677510125263935e-06, + "logits/chosen": -0.19946670532226562, + "logits/rejected": -0.2531070113182068, + "logps/chosen": -54.65922546386719, + "logps/rejected": -95.17095947265625, + "loss": 0.749, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7249033451080322, + "rewards/margins": 3.4525370597839355, + "rewards/rejected": -0.7276340126991272, + "step": 2345 + }, + { + "epoch": 0.59, + "grad_norm": 3.366577625274658, + "learning_rate": 6.675041996250175e-06, + "logits/chosen": -0.29597020149230957, + "logits/rejected": -0.3476721942424774, + "logps/chosen": -46.99615478515625, + "logps/rejected": -85.60021209716797, + "loss": 0.6476, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.13350510597229, + "rewards/margins": 4.051357746124268, + "rewards/rejected": -0.9178525805473328, + "step": 2346 + }, + { + "epoch": 0.59, + "grad_norm": 8.658427238464355, + "learning_rate": 6.672573407401241e-06, + "logits/chosen": -0.2050398290157318, + "logits/rejected": -0.31570154428482056, + "logps/chosen": -66.3660659790039, + "logps/rejected": -73.82298278808594, + "loss": 1.1188, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4909560680389404, + "rewards/margins": 2.98172664642334, + "rewards/rejected": -0.4907706081867218, + "step": 2347 + }, + { + "epoch": 0.59, + "grad_norm": 5.16353702545166, + "learning_rate": 6.6701043593948174e-06, + "logits/chosen": -0.240912064909935, + "logits/rejected": -0.28216639161109924, + "logps/chosen": -71.96672821044922, + "logps/rejected": -96.31605529785156, + "loss": 0.8564, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.980090618133545, + "rewards/margins": 3.5157358646392822, + "rewards/rejected": -0.5356451869010925, + "step": 2348 + }, + { + "epoch": 0.59, + "grad_norm": 4.173378944396973, + "learning_rate": 6.667634852908709e-06, + "logits/chosen": -0.20851609110832214, + "logits/rejected": -0.3049042820930481, + "logps/chosen": -64.70338439941406, + "logps/rejected": -82.5193099975586, + "loss": 0.8299, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1172709465026855, + "rewards/margins": 4.192423343658447, + "rewards/rejected": -1.0751525163650513, + "step": 2349 + }, + { + "epoch": 0.59, + "grad_norm": 4.483341693878174, + "learning_rate": 6.665164888620848e-06, + "logits/chosen": -0.233147993683815, + "logits/rejected": -0.36025890707969666, + "logps/chosen": -62.753456115722656, + "logps/rejected": -82.70891571044922, + "loss": 0.6949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.76224684715271, + "rewards/margins": 4.510880947113037, + "rewards/rejected": -1.7486339807510376, + "step": 2350 + }, + { + "epoch": 0.59, + "grad_norm": 3.872311592102051, + "learning_rate": 6.662694467209293e-06, + "logits/chosen": -0.28984948992729187, + "logits/rejected": -0.37431976199150085, + "logps/chosen": -54.03924560546875, + "logps/rejected": -91.86058807373047, + "loss": 0.7131, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7838289737701416, + "rewards/margins": 4.693930149078369, + "rewards/rejected": -1.9101003408432007, + "step": 2351 + }, + { + "epoch": 0.59, + "grad_norm": 4.810640335083008, + "learning_rate": 6.6602235893522294e-06, + "logits/chosen": -0.3099077045917511, + "logits/rejected": -0.3473871350288391, + "logps/chosen": -53.551246643066406, + "logps/rejected": -88.35556030273438, + "loss": 0.9113, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8924758434295654, + "rewards/margins": 3.92451810836792, + "rewards/rejected": -1.032042384147644, + "step": 2352 + }, + { + "epoch": 0.59, + "grad_norm": 3.0007033348083496, + "learning_rate": 6.657752255727965e-06, + "logits/chosen": -0.2535855770111084, + "logits/rejected": -0.41138723492622375, + "logps/chosen": -56.8116569519043, + "logps/rejected": -85.77323913574219, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9020633697509766, + "rewards/margins": 5.652226448059082, + "rewards/rejected": -2.7501633167266846, + "step": 2353 + }, + { + "epoch": 0.59, + "grad_norm": 4.397826671600342, + "learning_rate": 6.6552804670149374e-06, + "logits/chosen": -0.1513954997062683, + "logits/rejected": -0.24675273895263672, + "logps/chosen": -68.90606689453125, + "logps/rejected": -80.08294677734375, + "loss": 0.7736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9103987216949463, + "rewards/margins": 3.3809618949890137, + "rewards/rejected": -0.4705633819103241, + "step": 2354 + }, + { + "epoch": 0.59, + "grad_norm": 4.291917324066162, + "learning_rate": 6.6528082238917e-06, + "logits/chosen": -0.2686416208744049, + "logits/rejected": -0.35509562492370605, + "logps/chosen": -51.4677734375, + "logps/rejected": -83.33488464355469, + "loss": 0.845, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9671871662139893, + "rewards/margins": 3.8820087909698486, + "rewards/rejected": -0.9148219227790833, + "step": 2355 + }, + { + "epoch": 0.59, + "grad_norm": 4.712212085723877, + "learning_rate": 6.650335527036943e-06, + "logits/chosen": -0.2638486623764038, + "logits/rejected": -0.34015408158302307, + "logps/chosen": -55.92709732055664, + "logps/rejected": -88.25981140136719, + "loss": 0.6859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.704240322113037, + "rewards/margins": 4.585728645324707, + "rewards/rejected": -1.8814880847930908, + "step": 2356 + }, + { + "epoch": 0.59, + "grad_norm": 4.124452590942383, + "learning_rate": 6.64786237712947e-06, + "logits/chosen": -0.25116461515426636, + "logits/rejected": -0.3584229052066803, + "logps/chosen": -61.53289794921875, + "logps/rejected": -89.62702178955078, + "loss": 0.7174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7477383613586426, + "rewards/margins": 4.190291881561279, + "rewards/rejected": -1.4425537586212158, + "step": 2357 + }, + { + "epoch": 0.59, + "grad_norm": 11.777384757995605, + "learning_rate": 6.645388774848218e-06, + "logits/chosen": -0.17113491892814636, + "logits/rejected": -0.3351094126701355, + "logps/chosen": -62.25340270996094, + "logps/rejected": -66.29595184326172, + "loss": 0.8732, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5726490020751953, + "rewards/margins": 3.9417505264282227, + "rewards/rejected": -1.3691015243530273, + "step": 2358 + }, + { + "epoch": 0.59, + "grad_norm": 7.714550018310547, + "learning_rate": 6.642914720872244e-06, + "logits/chosen": -0.3220186233520508, + "logits/rejected": -0.3485202193260193, + "logps/chosen": -53.02643585205078, + "logps/rejected": -81.02090454101562, + "loss": 1.0439, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8435206413269043, + "rewards/margins": 3.908665180206299, + "rewards/rejected": -1.0651445388793945, + "step": 2359 + }, + { + "epoch": 0.59, + "grad_norm": 9.611409187316895, + "learning_rate": 6.640440215880726e-06, + "logits/chosen": -0.2154613584280014, + "logits/rejected": -0.282553493976593, + "logps/chosen": -65.25334930419922, + "logps/rejected": -85.42216491699219, + "loss": 0.8915, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.500338315963745, + "rewards/margins": 3.9902491569519043, + "rewards/rejected": -1.4899108409881592, + "step": 2360 + }, + { + "epoch": 0.59, + "grad_norm": 3.1560983657836914, + "learning_rate": 6.63796526055297e-06, + "logits/chosen": -0.22958427667617798, + "logits/rejected": -0.36238959431648254, + "logps/chosen": -52.57307052612305, + "logps/rejected": -71.26551055908203, + "loss": 0.625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8195443153381348, + "rewards/margins": 4.268289566040039, + "rewards/rejected": -1.4487451314926147, + "step": 2361 + }, + { + "epoch": 0.59, + "grad_norm": 6.444140434265137, + "learning_rate": 6.635489855568407e-06, + "logits/chosen": -0.17283305525779724, + "logits/rejected": -0.24491798877716064, + "logps/chosen": -64.43939971923828, + "logps/rejected": -96.42276000976562, + "loss": 0.8362, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4971258640289307, + "rewards/margins": 4.026914119720459, + "rewards/rejected": -1.529787540435791, + "step": 2362 + }, + { + "epoch": 0.59, + "grad_norm": 4.092118263244629, + "learning_rate": 6.633014001606586e-06, + "logits/chosen": -0.2924876809120178, + "logits/rejected": -0.36559581756591797, + "logps/chosen": -46.03392028808594, + "logps/rejected": -78.82596588134766, + "loss": 0.6779, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9527220726013184, + "rewards/margins": 4.95905065536499, + "rewards/rejected": -2.006328821182251, + "step": 2363 + }, + { + "epoch": 0.59, + "grad_norm": 4.649460792541504, + "learning_rate": 6.630537699347187e-06, + "logits/chosen": -0.22894786298274994, + "logits/rejected": -0.32099634408950806, + "logps/chosen": -56.408203125, + "logps/rejected": -85.89360809326172, + "loss": 0.707, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.142228841781616, + "rewards/margins": 4.935342788696289, + "rewards/rejected": -1.7931139469146729, + "step": 2364 + }, + { + "epoch": 0.59, + "grad_norm": 16.183656692504883, + "learning_rate": 6.628060949470002e-06, + "logits/chosen": -0.3126731812953949, + "logits/rejected": -0.45854485034942627, + "logps/chosen": -49.936424255371094, + "logps/rejected": -72.37430572509766, + "loss": 0.7067, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7091574668884277, + "rewards/margins": 4.997995853424072, + "rewards/rejected": -2.2888383865356445, + "step": 2365 + }, + { + "epoch": 0.59, + "grad_norm": 8.313422203063965, + "learning_rate": 6.625583752654958e-06, + "logits/chosen": -0.21671992540359497, + "logits/rejected": -0.3026522397994995, + "logps/chosen": -52.378509521484375, + "logps/rejected": -83.86202239990234, + "loss": 0.7779, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.988852024078369, + "rewards/margins": 3.8828229904174805, + "rewards/rejected": -0.8939708471298218, + "step": 2366 + }, + { + "epoch": 0.59, + "grad_norm": 5.843348503112793, + "learning_rate": 6.623106109582094e-06, + "logits/chosen": -0.2828335464000702, + "logits/rejected": -0.3796948790550232, + "logps/chosen": -65.84241485595703, + "logps/rejected": -84.44859313964844, + "loss": 0.8162, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.883910655975342, + "rewards/margins": 4.9272847175598145, + "rewards/rejected": -2.0433740615844727, + "step": 2367 + }, + { + "epoch": 0.59, + "grad_norm": 11.171675682067871, + "learning_rate": 6.620628020931581e-06, + "logits/chosen": -0.23408563435077667, + "logits/rejected": -0.3027798533439636, + "logps/chosen": -58.737144470214844, + "logps/rejected": -83.35490417480469, + "loss": 0.8089, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.451662302017212, + "rewards/margins": 3.7385077476501465, + "rewards/rejected": -1.286845326423645, + "step": 2368 + }, + { + "epoch": 0.59, + "grad_norm": 4.247323513031006, + "learning_rate": 6.618149487383706e-06, + "logits/chosen": -0.29844123125076294, + "logits/rejected": -0.3281051218509674, + "logps/chosen": -45.72072219848633, + "logps/rejected": -99.07455444335938, + "loss": 0.6088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8989310264587402, + "rewards/margins": 5.23196268081665, + "rewards/rejected": -2.333031177520752, + "step": 2369 + }, + { + "epoch": 0.59, + "grad_norm": 4.935944080352783, + "learning_rate": 6.615670509618879e-06, + "logits/chosen": -0.23666970431804657, + "logits/rejected": -0.3324103057384491, + "logps/chosen": -62.17732620239258, + "logps/rejected": -71.35295867919922, + "loss": 0.8617, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6027321815490723, + "rewards/margins": 3.8228585720062256, + "rewards/rejected": -1.2201268672943115, + "step": 2370 + }, + { + "epoch": 0.59, + "grad_norm": 4.390990734100342, + "learning_rate": 6.6131910883176335e-06, + "logits/chosen": -0.25482985377311707, + "logits/rejected": -0.34812748432159424, + "logps/chosen": -58.475826263427734, + "logps/rejected": -73.05303192138672, + "loss": 0.8366, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8806023597717285, + "rewards/margins": 3.0852503776550293, + "rewards/rejected": -0.20464780926704407, + "step": 2371 + }, + { + "epoch": 0.59, + "grad_norm": 3.5615222454071045, + "learning_rate": 6.6107112241606255e-06, + "logits/chosen": -0.20274507999420166, + "logits/rejected": -0.22809062898159027, + "logps/chosen": -62.201316833496094, + "logps/rejected": -93.07960510253906, + "loss": 0.8485, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.151181936264038, + "rewards/margins": 4.031282901763916, + "rewards/rejected": -0.880101203918457, + "step": 2372 + }, + { + "epoch": 0.59, + "grad_norm": 4.150888442993164, + "learning_rate": 6.6082309178286285e-06, + "logits/chosen": -0.32208359241485596, + "logits/rejected": -0.41170692443847656, + "logps/chosen": -55.09674835205078, + "logps/rejected": -77.91519165039062, + "loss": 0.7987, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.977924346923828, + "rewards/margins": 4.238765716552734, + "rewards/rejected": -1.2608416080474854, + "step": 2373 + }, + { + "epoch": 0.59, + "grad_norm": 5.400059223175049, + "learning_rate": 6.605750170002543e-06, + "logits/chosen": -0.23605842888355255, + "logits/rejected": -0.3418237864971161, + "logps/chosen": -62.92330551147461, + "logps/rejected": -85.70604705810547, + "loss": 0.8756, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2842819690704346, + "rewards/margins": 3.7768144607543945, + "rewards/rejected": -1.4925328493118286, + "step": 2374 + }, + { + "epoch": 0.59, + "grad_norm": 8.285250663757324, + "learning_rate": 6.603268981363386e-06, + "logits/chosen": -0.3055391311645508, + "logits/rejected": -0.380536288022995, + "logps/chosen": -55.894737243652344, + "logps/rejected": -77.31905364990234, + "loss": 0.9021, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6644883155822754, + "rewards/margins": 3.5043063163757324, + "rewards/rejected": -0.8398177027702332, + "step": 2375 + }, + { + "epoch": 0.59, + "grad_norm": 4.915406227111816, + "learning_rate": 6.600787352592297e-06, + "logits/chosen": -0.18464112281799316, + "logits/rejected": -0.2846842408180237, + "logps/chosen": -56.86688995361328, + "logps/rejected": -73.7296371459961, + "loss": 0.7097, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.905752658843994, + "rewards/margins": 4.0183539390563965, + "rewards/rejected": -1.1126009225845337, + "step": 2376 + }, + { + "epoch": 0.59, + "grad_norm": 4.08867883682251, + "learning_rate": 6.598305284370539e-06, + "logits/chosen": -0.22710438072681427, + "logits/rejected": -0.33819079399108887, + "logps/chosen": -57.690311431884766, + "logps/rejected": -76.51728057861328, + "loss": 0.7068, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.894763946533203, + "rewards/margins": 3.8573875427246094, + "rewards/rejected": -0.9626232981681824, + "step": 2377 + }, + { + "epoch": 0.59, + "grad_norm": 8.721590042114258, + "learning_rate": 6.595822777379491e-06, + "logits/chosen": -0.28497636318206787, + "logits/rejected": -0.32392776012420654, + "logps/chosen": -55.29808044433594, + "logps/rejected": -80.32003784179688, + "loss": 0.9589, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7438697814941406, + "rewards/margins": 3.6737117767333984, + "rewards/rejected": -0.9298422336578369, + "step": 2378 + }, + { + "epoch": 0.6, + "grad_norm": 8.794150352478027, + "learning_rate": 6.593339832300655e-06, + "logits/chosen": -0.279500275850296, + "logits/rejected": -0.40792161226272583, + "logps/chosen": -54.913814544677734, + "logps/rejected": -69.62933349609375, + "loss": 0.8751, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.800292491912842, + "rewards/margins": 4.276015281677246, + "rewards/rejected": -1.4757227897644043, + "step": 2379 + }, + { + "epoch": 0.6, + "grad_norm": 4.160419464111328, + "learning_rate": 6.590856449815654e-06, + "logits/chosen": -0.2892672121524811, + "logits/rejected": -0.3998187482357025, + "logps/chosen": -55.86741638183594, + "logps/rejected": -78.43852996826172, + "loss": 0.7077, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1122798919677734, + "rewards/margins": 4.517875671386719, + "rewards/rejected": -1.4055960178375244, + "step": 2380 + }, + { + "epoch": 0.6, + "grad_norm": 4.3855299949646, + "learning_rate": 6.588372630606229e-06, + "logits/chosen": -0.16314053535461426, + "logits/rejected": -0.31361711025238037, + "logps/chosen": -54.4995231628418, + "logps/rejected": -71.56626892089844, + "loss": 0.7532, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8072879314422607, + "rewards/margins": 4.1992597579956055, + "rewards/rejected": -1.3919720649719238, + "step": 2381 + }, + { + "epoch": 0.6, + "grad_norm": 5.855231761932373, + "learning_rate": 6.585888375354243e-06, + "logits/chosen": -0.2683250904083252, + "logits/rejected": -0.3258088231086731, + "logps/chosen": -62.32560348510742, + "logps/rejected": -78.68628692626953, + "loss": 0.9714, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8064894676208496, + "rewards/margins": 3.303499937057495, + "rewards/rejected": -0.49701040983200073, + "step": 2382 + }, + { + "epoch": 0.6, + "grad_norm": 18.90546417236328, + "learning_rate": 6.583403684741676e-06, + "logits/chosen": -0.21249637007713318, + "logits/rejected": -0.2851840853691101, + "logps/chosen": -60.440513610839844, + "logps/rejected": -88.40709686279297, + "loss": 0.9018, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.858013868331909, + "rewards/margins": 3.635146379470825, + "rewards/rejected": -0.777132511138916, + "step": 2383 + }, + { + "epoch": 0.6, + "grad_norm": 4.338751792907715, + "learning_rate": 6.580918559450632e-06, + "logits/chosen": -0.29021307826042175, + "logits/rejected": -0.3659554719924927, + "logps/chosen": -44.26591873168945, + "logps/rejected": -70.87190246582031, + "loss": 0.7766, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8791394233703613, + "rewards/margins": 3.5918359756469727, + "rewards/rejected": -0.7126966714859009, + "step": 2384 + }, + { + "epoch": 0.6, + "grad_norm": 6.48972749710083, + "learning_rate": 6.57843300016333e-06, + "logits/chosen": -0.3221474885940552, + "logits/rejected": -0.3903815746307373, + "logps/chosen": -56.80942153930664, + "logps/rejected": -82.2088394165039, + "loss": 0.7356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0116748809814453, + "rewards/margins": 4.588281154632568, + "rewards/rejected": -1.576606273651123, + "step": 2385 + }, + { + "epoch": 0.6, + "grad_norm": 4.67816162109375, + "learning_rate": 6.575947007562108e-06, + "logits/chosen": -0.22096699476242065, + "logits/rejected": -0.3263801634311676, + "logps/chosen": -70.15016174316406, + "logps/rejected": -102.69819641113281, + "loss": 0.7807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0597667694091797, + "rewards/margins": 4.209478378295898, + "rewards/rejected": -1.1497117280960083, + "step": 2386 + }, + { + "epoch": 0.6, + "grad_norm": 5.7324700355529785, + "learning_rate": 6.573460582329427e-06, + "logits/chosen": -0.20966430008411407, + "logits/rejected": -0.27028775215148926, + "logps/chosen": -50.38083267211914, + "logps/rejected": -84.61055755615234, + "loss": 0.7044, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9465456008911133, + "rewards/margins": 3.6344852447509766, + "rewards/rejected": -0.6879398822784424, + "step": 2387 + }, + { + "epoch": 0.6, + "grad_norm": 7.954919815063477, + "learning_rate": 6.5709737251478646e-06, + "logits/chosen": -0.17929428815841675, + "logits/rejected": -0.31304359436035156, + "logps/chosen": -61.604339599609375, + "logps/rejected": -78.63749694824219, + "loss": 0.9034, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5471906661987305, + "rewards/margins": 3.4323954582214355, + "rewards/rejected": -0.8852046728134155, + "step": 2388 + }, + { + "epoch": 0.6, + "grad_norm": 4.299665451049805, + "learning_rate": 6.5684864367001145e-06, + "logits/chosen": -0.19657695293426514, + "logits/rejected": -0.27795878052711487, + "logps/chosen": -58.11832046508789, + "logps/rejected": -77.94427490234375, + "loss": 0.7292, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6903393268585205, + "rewards/margins": 3.8642544746398926, + "rewards/rejected": -1.1739152669906616, + "step": 2389 + }, + { + "epoch": 0.6, + "grad_norm": 4.2332539558410645, + "learning_rate": 6.565998717668992e-06, + "logits/chosen": -0.30838412046432495, + "logits/rejected": -0.4065301716327667, + "logps/chosen": -59.27647399902344, + "logps/rejected": -83.45962524414062, + "loss": 0.7566, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.722543716430664, + "rewards/margins": 5.024869441986084, + "rewards/rejected": -2.30232572555542, + "step": 2390 + }, + { + "epoch": 0.6, + "grad_norm": 10.185470581054688, + "learning_rate": 6.563510568737431e-06, + "logits/chosen": -0.3287534713745117, + "logits/rejected": -0.4256255328655243, + "logps/chosen": -58.23342514038086, + "logps/rejected": -89.94759368896484, + "loss": 0.9032, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.69577956199646, + "rewards/margins": 5.740190029144287, + "rewards/rejected": -3.044410228729248, + "step": 2391 + }, + { + "epoch": 0.6, + "grad_norm": 4.491815567016602, + "learning_rate": 6.561021990588479e-06, + "logits/chosen": -0.2775972783565521, + "logits/rejected": -0.38202157616615295, + "logps/chosen": -60.6087646484375, + "logps/rejected": -77.94232940673828, + "loss": 0.7447, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83028507232666, + "rewards/margins": 4.222383499145508, + "rewards/rejected": -1.3920984268188477, + "step": 2392 + }, + { + "epoch": 0.6, + "grad_norm": 10.300070762634277, + "learning_rate": 6.5585329839053095e-06, + "logits/chosen": -0.33554044365882874, + "logits/rejected": -0.4304906725883484, + "logps/chosen": -62.554931640625, + "logps/rejected": -76.58583068847656, + "loss": 1.0787, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.784233570098877, + "rewards/margins": 2.563183546066284, + "rewards/rejected": 0.2210501730442047, + "step": 2393 + }, + { + "epoch": 0.6, + "grad_norm": 6.490281105041504, + "learning_rate": 6.556043549371204e-06, + "logits/chosen": -0.15167203545570374, + "logits/rejected": -0.2565819323062897, + "logps/chosen": -63.25520706176758, + "logps/rejected": -88.09512329101562, + "loss": 0.7146, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.752533435821533, + "rewards/margins": 4.648679733276367, + "rewards/rejected": -1.896146297454834, + "step": 2394 + }, + { + "epoch": 0.6, + "grad_norm": 4.906703948974609, + "learning_rate": 6.553553687669567e-06, + "logits/chosen": -0.29614177346229553, + "logits/rejected": -0.36629605293273926, + "logps/chosen": -63.60292053222656, + "logps/rejected": -93.14718627929688, + "loss": 0.8743, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.654162883758545, + "rewards/margins": 4.161883354187012, + "rewards/rejected": -1.507720947265625, + "step": 2395 + }, + { + "epoch": 0.6, + "grad_norm": 8.033626556396484, + "learning_rate": 6.551063399483919e-06, + "logits/chosen": -0.2355738878250122, + "logits/rejected": -0.35056963562965393, + "logps/chosen": -50.690528869628906, + "logps/rejected": -72.62014770507812, + "loss": 0.8604, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5719170570373535, + "rewards/margins": 4.347813606262207, + "rewards/rejected": -1.7758960723876953, + "step": 2396 + }, + { + "epoch": 0.6, + "grad_norm": 9.128534317016602, + "learning_rate": 6.5485726854979006e-06, + "logits/chosen": -0.2588077783584595, + "logits/rejected": -0.3316742181777954, + "logps/chosen": -59.777591705322266, + "logps/rejected": -73.4776611328125, + "loss": 0.9807, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6911888122558594, + "rewards/margins": 3.324800491333008, + "rewards/rejected": -0.6336113214492798, + "step": 2397 + }, + { + "epoch": 0.6, + "grad_norm": 9.809260368347168, + "learning_rate": 6.546081546395262e-06, + "logits/chosen": -0.33951956033706665, + "logits/rejected": -0.36140817403793335, + "logps/chosen": -54.761924743652344, + "logps/rejected": -104.27528381347656, + "loss": 0.9102, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.964110851287842, + "rewards/margins": 3.815018653869629, + "rewards/rejected": -0.8509082198143005, + "step": 2398 + }, + { + "epoch": 0.6, + "grad_norm": 5.741252899169922, + "learning_rate": 6.543589982859879e-06, + "logits/chosen": -0.21055558323860168, + "logits/rejected": -0.35064566135406494, + "logps/chosen": -52.37772750854492, + "logps/rejected": -72.65480041503906, + "loss": 0.6383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.684095621109009, + "rewards/margins": 4.73469877243042, + "rewards/rejected": -2.050602436065674, + "step": 2399 + }, + { + "epoch": 0.6, + "grad_norm": 5.674532413482666, + "learning_rate": 6.541097995575737e-06, + "logits/chosen": -0.22431473433971405, + "logits/rejected": -0.35525286197662354, + "logps/chosen": -63.56126403808594, + "logps/rejected": -82.60285186767578, + "loss": 0.8274, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.522883892059326, + "rewards/margins": 3.84080171585083, + "rewards/rejected": -1.317918062210083, + "step": 2400 + }, + { + "epoch": 0.6, + "grad_norm": 7.19002103805542, + "learning_rate": 6.538605585226941e-06, + "logits/chosen": -0.22190359234809875, + "logits/rejected": -0.3090305030345917, + "logps/chosen": -65.43272399902344, + "logps/rejected": -74.72377014160156, + "loss": 0.8404, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.361898899078369, + "rewards/margins": 3.5935518741607666, + "rewards/rejected": -1.2316526174545288, + "step": 2401 + }, + { + "epoch": 0.6, + "grad_norm": 4.840889930725098, + "learning_rate": 6.536112752497711e-06, + "logits/chosen": -0.21977686882019043, + "logits/rejected": -0.2941407263278961, + "logps/chosen": -55.717620849609375, + "logps/rejected": -71.17484283447266, + "loss": 0.911, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.647245407104492, + "rewards/margins": 3.6368300914764404, + "rewards/rejected": -0.9895846843719482, + "step": 2402 + }, + { + "epoch": 0.6, + "grad_norm": 3.9790310859680176, + "learning_rate": 6.533619498072385e-06, + "logits/chosen": -0.2648182213306427, + "logits/rejected": -0.35885417461395264, + "logps/chosen": -56.645835876464844, + "logps/rejected": -77.71710968017578, + "loss": 0.7453, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.116147756576538, + "rewards/margins": 4.419142723083496, + "rewards/rejected": -1.302995204925537, + "step": 2403 + }, + { + "epoch": 0.6, + "grad_norm": 7.6784868240356445, + "learning_rate": 6.531125822635413e-06, + "logits/chosen": -0.3028249442577362, + "logits/rejected": -0.31620901823043823, + "logps/chosen": -48.663970947265625, + "logps/rejected": -89.99842834472656, + "loss": 0.8786, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.949086904525757, + "rewards/margins": 3.48695707321167, + "rewards/rejected": -0.5378704071044922, + "step": 2404 + }, + { + "epoch": 0.6, + "grad_norm": 7.371305465698242, + "learning_rate": 6.528631726871364e-06, + "logits/chosen": -0.22391405701637268, + "logits/rejected": -0.31416407227516174, + "logps/chosen": -53.57503890991211, + "logps/rejected": -88.4188003540039, + "loss": 0.726, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.584639549255371, + "rewards/margins": 3.7514684200286865, + "rewards/rejected": -1.166828989982605, + "step": 2405 + }, + { + "epoch": 0.6, + "grad_norm": 4.138017177581787, + "learning_rate": 6.526137211464919e-06, + "logits/chosen": -0.12347894161939621, + "logits/rejected": -0.2561458647251129, + "logps/chosen": -65.36508178710938, + "logps/rejected": -76.06813049316406, + "loss": 0.8407, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8799149990081787, + "rewards/margins": 3.7222323417663574, + "rewards/rejected": -0.8423174619674683, + "step": 2406 + }, + { + "epoch": 0.6, + "grad_norm": 4.8543853759765625, + "learning_rate": 6.523642277100879e-06, + "logits/chosen": -0.25054115056991577, + "logits/rejected": -0.37377816438674927, + "logps/chosen": -60.5808219909668, + "logps/rejected": -81.04822540283203, + "loss": 0.8049, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6065752506256104, + "rewards/margins": 4.285365104675293, + "rewards/rejected": -1.6787893772125244, + "step": 2407 + }, + { + "epoch": 0.6, + "grad_norm": 7.9444403648376465, + "learning_rate": 6.521146924464157e-06, + "logits/chosen": -0.1885499209165573, + "logits/rejected": -0.2842620313167572, + "logps/chosen": -56.611289978027344, + "logps/rejected": -84.63892364501953, + "loss": 0.7741, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8022546768188477, + "rewards/margins": 4.606776237487793, + "rewards/rejected": -1.8045216798782349, + "step": 2408 + }, + { + "epoch": 0.6, + "grad_norm": 4.792812824249268, + "learning_rate": 6.518651154239781e-06, + "logits/chosen": -0.28780093789100647, + "logits/rejected": -0.3642820715904236, + "logps/chosen": -55.76398849487305, + "logps/rejected": -65.36564636230469, + "loss": 0.992, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.8651304244995117, + "rewards/margins": 2.8514933586120605, + "rewards/rejected": 0.013637281954288483, + "step": 2409 + }, + { + "epoch": 0.6, + "grad_norm": 8.626462936401367, + "learning_rate": 6.516154967112891e-06, + "logits/chosen": -0.2583797574043274, + "logits/rejected": -0.3852226734161377, + "logps/chosen": -66.86747741699219, + "logps/rejected": -92.0395736694336, + "loss": 0.867, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8980276584625244, + "rewards/margins": 4.684231758117676, + "rewards/rejected": -1.78620445728302, + "step": 2410 + }, + { + "epoch": 0.6, + "grad_norm": 8.20740032196045, + "learning_rate": 6.513658363768749e-06, + "logits/chosen": -0.274433434009552, + "logits/rejected": -0.36847835779190063, + "logps/chosen": -49.142173767089844, + "logps/rejected": -88.79109191894531, + "loss": 0.7658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.951467752456665, + "rewards/margins": 4.308877944946289, + "rewards/rejected": -1.3574098348617554, + "step": 2411 + }, + { + "epoch": 0.6, + "grad_norm": 7.088037490844727, + "learning_rate": 6.511161344892721e-06, + "logits/chosen": -0.15786416828632355, + "logits/rejected": -0.30849704146385193, + "logps/chosen": -60.97871780395508, + "logps/rejected": -70.99143981933594, + "loss": 0.8833, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5040011405944824, + "rewards/margins": 3.0440151691436768, + "rewards/rejected": -0.5400140285491943, + "step": 2412 + }, + { + "epoch": 0.6, + "grad_norm": 5.310237884521484, + "learning_rate": 6.508663911170299e-06, + "logits/chosen": -0.27912387251853943, + "logits/rejected": -0.3714727759361267, + "logps/chosen": -46.165340423583984, + "logps/rejected": -80.39584350585938, + "loss": 0.9327, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6648812294006348, + "rewards/margins": 3.59566593170166, + "rewards/rejected": -0.930785059928894, + "step": 2413 + }, + { + "epoch": 0.6, + "grad_norm": 2.732088565826416, + "learning_rate": 6.506166063287077e-06, + "logits/chosen": -0.36695849895477295, + "logits/rejected": -0.36706483364105225, + "logps/chosen": -53.71345520019531, + "logps/rejected": -82.1866226196289, + "loss": 0.8427, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.83054780960083, + "rewards/margins": 4.161114692687988, + "rewards/rejected": -1.330566644668579, + "step": 2414 + }, + { + "epoch": 0.6, + "grad_norm": 6.082529544830322, + "learning_rate": 6.5036678019287704e-06, + "logits/chosen": -0.21758028864860535, + "logits/rejected": -0.24261191487312317, + "logps/chosen": -57.15068817138672, + "logps/rejected": -78.2997817993164, + "loss": 0.9959, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6723053455352783, + "rewards/margins": 3.2146267890930176, + "rewards/rejected": -0.5423212647438049, + "step": 2415 + }, + { + "epoch": 0.6, + "grad_norm": 5.426101207733154, + "learning_rate": 6.501169127781205e-06, + "logits/chosen": -0.15559443831443787, + "logits/rejected": -0.23664839565753937, + "logps/chosen": -68.16444396972656, + "logps/rejected": -86.34134674072266, + "loss": 0.9798, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.985217809677124, + "rewards/margins": 4.404209136962891, + "rewards/rejected": -1.4189908504486084, + "step": 2416 + }, + { + "epoch": 0.6, + "grad_norm": 5.534124374389648, + "learning_rate": 6.498670041530322e-06, + "logits/chosen": -0.3262655436992645, + "logits/rejected": -0.44049981236457825, + "logps/chosen": -49.382347106933594, + "logps/rejected": -75.41656494140625, + "loss": 0.7064, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7963032722473145, + "rewards/margins": 5.101690292358398, + "rewards/rejected": -2.305386781692505, + "step": 2417 + }, + { + "epoch": 0.6, + "grad_norm": 12.391228675842285, + "learning_rate": 6.496170543862174e-06, + "logits/chosen": -0.22578495740890503, + "logits/rejected": -0.2605697810649872, + "logps/chosen": -60.09058380126953, + "logps/rejected": -76.99080657958984, + "loss": 0.9254, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8956949710845947, + "rewards/margins": 3.458791732788086, + "rewards/rejected": -0.5630964636802673, + "step": 2418 + }, + { + "epoch": 0.61, + "grad_norm": 5.007035255432129, + "learning_rate": 6.493670635462928e-06, + "logits/chosen": -0.3126664161682129, + "logits/rejected": -0.40736252069473267, + "logps/chosen": -58.4188346862793, + "logps/rejected": -76.46121978759766, + "loss": 0.8163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.856734037399292, + "rewards/margins": 3.605692148208618, + "rewards/rejected": -0.7489579916000366, + "step": 2419 + }, + { + "epoch": 0.61, + "grad_norm": 19.28598976135254, + "learning_rate": 6.491170317018859e-06, + "logits/chosen": -0.26994869112968445, + "logits/rejected": -0.3304896652698517, + "logps/chosen": -41.675193786621094, + "logps/rejected": -82.68899536132812, + "loss": 0.8417, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.206049919128418, + "rewards/margins": 3.87298846244812, + "rewards/rejected": -0.6669384837150574, + "step": 2420 + }, + { + "epoch": 0.61, + "grad_norm": 3.8569259643554688, + "learning_rate": 6.4886695892163635e-06, + "logits/chosen": -0.20162567496299744, + "logits/rejected": -0.33626896142959595, + "logps/chosen": -60.69471740722656, + "logps/rejected": -76.53243255615234, + "loss": 0.7779, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9876961708068848, + "rewards/margins": 4.511584281921387, + "rewards/rejected": -1.5238878726959229, + "step": 2421 + }, + { + "epoch": 0.61, + "grad_norm": 3.163719654083252, + "learning_rate": 6.486168452741941e-06, + "logits/chosen": -0.21143600344657898, + "logits/rejected": -0.35380271077156067, + "logps/chosen": -54.94413757324219, + "logps/rejected": -77.59105682373047, + "loss": 0.8053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0776443481445312, + "rewards/margins": 3.8878586292266846, + "rewards/rejected": -0.8102144002914429, + "step": 2422 + }, + { + "epoch": 0.61, + "grad_norm": 4.055624008178711, + "learning_rate": 6.48366690828221e-06, + "logits/chosen": -0.2949312627315521, + "logits/rejected": -0.3187795877456665, + "logps/chosen": -49.37270736694336, + "logps/rejected": -80.01837158203125, + "loss": 0.7893, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7449207305908203, + "rewards/margins": 3.7594776153564453, + "rewards/rejected": -1.0145570039749146, + "step": 2423 + }, + { + "epoch": 0.61, + "grad_norm": 3.807116985321045, + "learning_rate": 6.481164956523898e-06, + "logits/chosen": -0.2597799599170685, + "logits/rejected": -0.32131436467170715, + "logps/chosen": -48.35491180419922, + "logps/rejected": -84.17831420898438, + "loss": 0.7228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7633392810821533, + "rewards/margins": 3.9891159534454346, + "rewards/rejected": -1.2257764339447021, + "step": 2424 + }, + { + "epoch": 0.61, + "grad_norm": 8.238213539123535, + "learning_rate": 6.4786625981538444e-06, + "logits/chosen": -0.24231114983558655, + "logits/rejected": -0.3053175210952759, + "logps/chosen": -57.95757293701172, + "logps/rejected": -79.92666625976562, + "loss": 0.773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.829397678375244, + "rewards/margins": 3.9037253856658936, + "rewards/rejected": -1.0743275880813599, + "step": 2425 + }, + { + "epoch": 0.61, + "grad_norm": 5.181244373321533, + "learning_rate": 6.476159833858999e-06, + "logits/chosen": -0.19661684334278107, + "logits/rejected": -0.2835017740726471, + "logps/chosen": -58.731903076171875, + "logps/rejected": -77.55364227294922, + "loss": 0.8609, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0078814029693604, + "rewards/margins": 3.7279129028320312, + "rewards/rejected": -0.7200316190719604, + "step": 2426 + }, + { + "epoch": 0.61, + "grad_norm": 3.98372745513916, + "learning_rate": 6.473656664326429e-06, + "logits/chosen": -0.1945028454065323, + "logits/rejected": -0.21828925609588623, + "logps/chosen": -59.255943298339844, + "logps/rejected": -84.6368179321289, + "loss": 0.8086, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8448421955108643, + "rewards/margins": 3.7651426792144775, + "rewards/rejected": -0.9203006029129028, + "step": 2427 + }, + { + "epoch": 0.61, + "grad_norm": 4.770023822784424, + "learning_rate": 6.4711530902433024e-06, + "logits/chosen": -0.3152971565723419, + "logits/rejected": -0.3219850957393646, + "logps/chosen": -47.3079719543457, + "logps/rejected": -68.63494873046875, + "loss": 0.8959, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.05439829826355, + "rewards/margins": 3.277801036834717, + "rewards/rejected": -0.22340261936187744, + "step": 2428 + }, + { + "epoch": 0.61, + "grad_norm": 6.702280521392822, + "learning_rate": 6.468649112296911e-06, + "logits/chosen": -0.2187333106994629, + "logits/rejected": -0.2928038239479065, + "logps/chosen": -60.86482238769531, + "logps/rejected": -76.14309692382812, + "loss": 0.7092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.900578498840332, + "rewards/margins": 4.060443878173828, + "rewards/rejected": -1.159865379333496, + "step": 2429 + }, + { + "epoch": 0.61, + "grad_norm": 5.058351516723633, + "learning_rate": 6.466144731174645e-06, + "logits/chosen": -0.25795868039131165, + "logits/rejected": -0.3408740162849426, + "logps/chosen": -48.83511734008789, + "logps/rejected": -78.22271728515625, + "loss": 0.7477, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912571907043457, + "rewards/margins": 4.199935436248779, + "rewards/rejected": -1.2873637676239014, + "step": 2430 + }, + { + "epoch": 0.61, + "grad_norm": 5.418821811676025, + "learning_rate": 6.463639947564014e-06, + "logits/chosen": -0.2825169861316681, + "logits/rejected": -0.3613624572753906, + "logps/chosen": -65.44898986816406, + "logps/rejected": -91.17466735839844, + "loss": 0.7458, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9974799156188965, + "rewards/margins": 4.354318618774414, + "rewards/rejected": -1.356838345527649, + "step": 2431 + }, + { + "epoch": 0.61, + "grad_norm": 4.709342002868652, + "learning_rate": 6.461134762152634e-06, + "logits/chosen": -0.21832126379013062, + "logits/rejected": -0.32192057371139526, + "logps/chosen": -67.01234436035156, + "logps/rejected": -80.79904174804688, + "loss": 0.8068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.017327308654785, + "rewards/margins": 4.175724983215332, + "rewards/rejected": -1.1583976745605469, + "step": 2432 + }, + { + "epoch": 0.61, + "grad_norm": 3.153473138809204, + "learning_rate": 6.458629175628234e-06, + "logits/chosen": -0.2718341052532196, + "logits/rejected": -0.4373210668563843, + "logps/chosen": -49.51150131225586, + "logps/rejected": -68.08372497558594, + "loss": 0.6992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6701676845550537, + "rewards/margins": 4.562232971191406, + "rewards/rejected": -1.8920656442642212, + "step": 2433 + }, + { + "epoch": 0.61, + "grad_norm": 4.255213260650635, + "learning_rate": 6.45612318867865e-06, + "logits/chosen": -0.18944098055362701, + "logits/rejected": -0.2346736341714859, + "logps/chosen": -59.26005935668945, + "logps/rejected": -79.01571655273438, + "loss": 0.9061, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8688063621520996, + "rewards/margins": 3.644162654876709, + "rewards/rejected": -0.7753560543060303, + "step": 2434 + }, + { + "epoch": 0.61, + "grad_norm": 4.387005805969238, + "learning_rate": 6.453616801991831e-06, + "logits/chosen": -0.27433592081069946, + "logits/rejected": -0.3624337315559387, + "logps/chosen": -56.24699401855469, + "logps/rejected": -66.65327453613281, + "loss": 0.8191, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.977382183074951, + "rewards/margins": 4.2198686599731445, + "rewards/rejected": -1.2424867153167725, + "step": 2435 + }, + { + "epoch": 0.61, + "grad_norm": 3.8279430866241455, + "learning_rate": 6.45111001625583e-06, + "logits/chosen": -0.24462704360485077, + "logits/rejected": -0.38665515184402466, + "logps/chosen": -50.42778778076172, + "logps/rejected": -78.26808166503906, + "loss": 0.6894, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.590968132019043, + "rewards/margins": 4.569403648376465, + "rewards/rejected": -1.9784348011016846, + "step": 2436 + }, + { + "epoch": 0.61, + "grad_norm": 9.067312240600586, + "learning_rate": 6.448602832158821e-06, + "logits/chosen": -0.3302440643310547, + "logits/rejected": -0.4534315764904022, + "logps/chosen": -55.3427848815918, + "logps/rejected": -84.22933959960938, + "loss": 0.7553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.732687473297119, + "rewards/margins": 4.611333847045898, + "rewards/rejected": -1.8786462545394897, + "step": 2437 + }, + { + "epoch": 0.61, + "grad_norm": 7.029847145080566, + "learning_rate": 6.446095250389074e-06, + "logits/chosen": -0.15859033167362213, + "logits/rejected": -0.2605246305465698, + "logps/chosen": -68.04682159423828, + "logps/rejected": -83.99497985839844, + "loss": 0.8463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6749465465545654, + "rewards/margins": 3.9673080444335938, + "rewards/rejected": -1.2923616170883179, + "step": 2438 + }, + { + "epoch": 0.61, + "grad_norm": 3.5291988849639893, + "learning_rate": 6.443587271634977e-06, + "logits/chosen": -0.11493740230798721, + "logits/rejected": -0.2484094500541687, + "logps/chosen": -69.88185119628906, + "logps/rejected": -72.52799987792969, + "loss": 0.7092, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.927586555480957, + "rewards/margins": 4.288020133972168, + "rewards/rejected": -1.3604332208633423, + "step": 2439 + }, + { + "epoch": 0.61, + "grad_norm": 6.009391784667969, + "learning_rate": 6.441078896585024e-06, + "logits/chosen": -0.22961241006851196, + "logits/rejected": -0.3477604389190674, + "logps/chosen": -59.031707763671875, + "logps/rejected": -69.0917739868164, + "loss": 0.9309, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8253965377807617, + "rewards/margins": 3.004178524017334, + "rewards/rejected": -0.1787824034690857, + "step": 2440 + }, + { + "epoch": 0.61, + "grad_norm": 8.257542610168457, + "learning_rate": 6.4385701259278175e-06, + "logits/chosen": -0.18388907611370087, + "logits/rejected": -0.27226683497428894, + "logps/chosen": -67.04208374023438, + "logps/rejected": -80.5355224609375, + "loss": 0.9015, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.779256820678711, + "rewards/margins": 3.5308854579925537, + "rewards/rejected": -0.751628577709198, + "step": 2441 + }, + { + "epoch": 0.61, + "grad_norm": 8.49451732635498, + "learning_rate": 6.436060960352071e-06, + "logits/chosen": -0.2598230242729187, + "logits/rejected": -0.2901688814163208, + "logps/chosen": -60.15061569213867, + "logps/rejected": -82.79423522949219, + "loss": 0.9481, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5832700729370117, + "rewards/margins": 2.92104434967041, + "rewards/rejected": -0.33777424693107605, + "step": 2442 + }, + { + "epoch": 0.61, + "grad_norm": 4.501804351806641, + "learning_rate": 6.433551400546602e-06, + "logits/chosen": -0.2298850566148758, + "logits/rejected": -0.30461007356643677, + "logps/chosen": -49.195220947265625, + "logps/rejected": -85.13348388671875, + "loss": 0.8385, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0129034519195557, + "rewards/margins": 3.5508956909179688, + "rewards/rejected": -0.537992000579834, + "step": 2443 + }, + { + "epoch": 0.61, + "grad_norm": 4.934782028198242, + "learning_rate": 6.431041447200339e-06, + "logits/chosen": -0.2139630913734436, + "logits/rejected": -0.2774721086025238, + "logps/chosen": -58.106353759765625, + "logps/rejected": -74.56243896484375, + "loss": 0.8476, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0382907390594482, + "rewards/margins": 3.8859424591064453, + "rewards/rejected": -0.8476513624191284, + "step": 2444 + }, + { + "epoch": 0.61, + "grad_norm": 6.528410911560059, + "learning_rate": 6.42853110100232e-06, + "logits/chosen": -0.26726388931274414, + "logits/rejected": -0.2967085540294647, + "logps/chosen": -67.22618103027344, + "logps/rejected": -84.72791290283203, + "loss": 0.888, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7345521450042725, + "rewards/margins": 3.230426788330078, + "rewards/rejected": -0.4958745241165161, + "step": 2445 + }, + { + "epoch": 0.61, + "grad_norm": 4.698214530944824, + "learning_rate": 6.426020362641689e-06, + "logits/chosen": -0.15554480254650116, + "logits/rejected": -0.3377452492713928, + "logps/chosen": -69.41051483154297, + "logps/rejected": -84.64791870117188, + "loss": 0.7353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.735239267349243, + "rewards/margins": 4.597095966339111, + "rewards/rejected": -1.8618566989898682, + "step": 2446 + }, + { + "epoch": 0.61, + "grad_norm": 3.1943271160125732, + "learning_rate": 6.423509232807697e-06, + "logits/chosen": -0.18223930895328522, + "logits/rejected": -0.2590015232563019, + "logps/chosen": -56.501991271972656, + "logps/rejected": -86.20854187011719, + "loss": 0.7422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.809420108795166, + "rewards/margins": 4.418047904968262, + "rewards/rejected": -1.6086279153823853, + "step": 2447 + }, + { + "epoch": 0.61, + "grad_norm": 3.5639472007751465, + "learning_rate": 6.4209977121897025e-06, + "logits/chosen": -0.3053293228149414, + "logits/rejected": -0.40383175015449524, + "logps/chosen": -58.60816192626953, + "logps/rejected": -70.26841735839844, + "loss": 0.7581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.79556941986084, + "rewards/margins": 3.7246921062469482, + "rewards/rejected": -0.9291228652000427, + "step": 2448 + }, + { + "epoch": 0.61, + "grad_norm": 3.2825632095336914, + "learning_rate": 6.418485801477175e-06, + "logits/chosen": -0.2372443974018097, + "logits/rejected": -0.38742169737815857, + "logps/chosen": -59.44062805175781, + "logps/rejected": -78.23002624511719, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.803997039794922, + "rewards/margins": 4.413264751434326, + "rewards/rejected": -1.6092679500579834, + "step": 2449 + }, + { + "epoch": 0.61, + "grad_norm": 4.213153839111328, + "learning_rate": 6.415973501359687e-06, + "logits/chosen": -0.23122625052928925, + "logits/rejected": -0.33393582701683044, + "logps/chosen": -46.71833038330078, + "logps/rejected": -94.3465576171875, + "loss": 0.6588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2355477809906006, + "rewards/margins": 5.793526649475098, + "rewards/rejected": -2.557978630065918, + "step": 2450 + }, + { + "epoch": 0.61, + "grad_norm": 4.065317630767822, + "learning_rate": 6.413460812526917e-06, + "logits/chosen": -0.180665522813797, + "logits/rejected": -0.26794755458831787, + "logps/chosen": -61.18805694580078, + "logps/rejected": -91.70899200439453, + "loss": 0.712, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.788665771484375, + "rewards/margins": 4.413288116455078, + "rewards/rejected": -1.6246228218078613, + "step": 2451 + }, + { + "epoch": 0.61, + "grad_norm": 7.885544300079346, + "learning_rate": 6.4109477356686545e-06, + "logits/chosen": -0.311413049697876, + "logits/rejected": -0.3560791015625, + "logps/chosen": -55.8195686340332, + "logps/rejected": -77.06678009033203, + "loss": 0.9675, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5392041206359863, + "rewards/margins": 3.655488967895508, + "rewards/rejected": -1.1162853240966797, + "step": 2452 + }, + { + "epoch": 0.61, + "grad_norm": 4.953274726867676, + "learning_rate": 6.408434271474792e-06, + "logits/chosen": -0.27961456775665283, + "logits/rejected": -0.421653151512146, + "logps/chosen": -67.96266174316406, + "logps/rejected": -84.0965576171875, + "loss": 0.7801, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8247456550598145, + "rewards/margins": 4.83259391784668, + "rewards/rejected": -2.0078485012054443, + "step": 2453 + }, + { + "epoch": 0.61, + "grad_norm": 5.941098213195801, + "learning_rate": 6.405920420635332e-06, + "logits/chosen": -0.29617878794670105, + "logits/rejected": -0.3867623507976532, + "logps/chosen": -64.34432220458984, + "logps/rejected": -72.20877838134766, + "loss": 0.9456, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7731916904449463, + "rewards/margins": 3.701061487197876, + "rewards/rejected": -0.9278696775436401, + "step": 2454 + }, + { + "epoch": 0.61, + "grad_norm": 3.9605748653411865, + "learning_rate": 6.403406183840378e-06, + "logits/chosen": -0.19294102489948273, + "logits/rejected": -0.3547998070716858, + "logps/chosen": -70.0020523071289, + "logps/rejected": -78.62980651855469, + "loss": 0.7434, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1278324127197266, + "rewards/margins": 4.243292331695557, + "rewards/rejected": -1.1154600381851196, + "step": 2455 + }, + { + "epoch": 0.61, + "grad_norm": 5.194540977478027, + "learning_rate": 6.400891561780144e-06, + "logits/chosen": -0.23890718817710876, + "logits/rejected": -0.350868284702301, + "logps/chosen": -50.7793083190918, + "logps/rejected": -80.74018859863281, + "loss": 0.7904, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.681326389312744, + "rewards/margins": 4.670687675476074, + "rewards/rejected": -1.9893608093261719, + "step": 2456 + }, + { + "epoch": 0.61, + "grad_norm": 5.302389621734619, + "learning_rate": 6.398376555144946e-06, + "logits/chosen": -0.28730636835098267, + "logits/rejected": -0.4066827595233917, + "logps/chosen": -47.46380615234375, + "logps/rejected": -67.39097595214844, + "loss": 0.6415, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.244935989379883, + "rewards/margins": 4.986268997192383, + "rewards/rejected": -1.7413331270217896, + "step": 2457 + }, + { + "epoch": 0.61, + "grad_norm": 9.51457691192627, + "learning_rate": 6.395861164625211e-06, + "logits/chosen": -0.25048309564590454, + "logits/rejected": -0.38889047503471375, + "logps/chosen": -59.042388916015625, + "logps/rejected": -56.56617736816406, + "loss": 0.8194, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8981411457061768, + "rewards/margins": 3.295210838317871, + "rewards/rejected": -0.3970694839954376, + "step": 2458 + }, + { + "epoch": 0.62, + "grad_norm": 3.551375389099121, + "learning_rate": 6.393345390911464e-06, + "logits/chosen": -0.18981170654296875, + "logits/rejected": -0.2964600622653961, + "logps/chosen": -63.904335021972656, + "logps/rejected": -85.3945541381836, + "loss": 0.7301, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8481388092041016, + "rewards/margins": 4.75338888168335, + "rewards/rejected": -1.9052503108978271, + "step": 2459 + }, + { + "epoch": 0.62, + "grad_norm": 4.550394058227539, + "learning_rate": 6.390829234694345e-06, + "logits/chosen": -0.2609187662601471, + "logits/rejected": -0.34580159187316895, + "logps/chosen": -51.919158935546875, + "logps/rejected": -72.95922088623047, + "loss": 0.8027, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.788546562194824, + "rewards/margins": 4.264743804931641, + "rewards/rejected": -1.476197361946106, + "step": 2460 + }, + { + "epoch": 0.62, + "grad_norm": 5.897933483123779, + "learning_rate": 6.388312696664584e-06, + "logits/chosen": -0.23687715828418732, + "logits/rejected": -0.32461273670196533, + "logps/chosen": -58.17579650878906, + "logps/rejected": -91.34515380859375, + "loss": 0.7906, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.90629243850708, + "rewards/margins": 4.471574306488037, + "rewards/rejected": -1.5652822256088257, + "step": 2461 + }, + { + "epoch": 0.62, + "grad_norm": 3.140495538711548, + "learning_rate": 6.3857957775130345e-06, + "logits/chosen": -0.22652149200439453, + "logits/rejected": -0.35232943296432495, + "logps/chosen": -51.8906135559082, + "logps/rejected": -66.0038070678711, + "loss": 0.6927, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9273202419281006, + "rewards/margins": 4.718843460083008, + "rewards/rejected": -1.7915226221084595, + "step": 2462 + }, + { + "epoch": 0.62, + "grad_norm": 12.3125, + "learning_rate": 6.383278477930639e-06, + "logits/chosen": -0.2509213089942932, + "logits/rejected": -0.3524366617202759, + "logps/chosen": -54.3116569519043, + "logps/rejected": -77.22181701660156, + "loss": 0.7849, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9374656677246094, + "rewards/margins": 4.434358596801758, + "rewards/rejected": -1.4968934059143066, + "step": 2463 + }, + { + "epoch": 0.62, + "grad_norm": 3.3620712757110596, + "learning_rate": 6.380760798608452e-06, + "logits/chosen": -0.32389289140701294, + "logits/rejected": -0.45685046911239624, + "logps/chosen": -55.5223388671875, + "logps/rejected": -71.6745834350586, + "loss": 0.7108, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6455769538879395, + "rewards/margins": 4.797577857971191, + "rewards/rejected": -2.1520004272460938, + "step": 2464 + }, + { + "epoch": 0.62, + "grad_norm": 5.0890936851501465, + "learning_rate": 6.378242740237633e-06, + "logits/chosen": -0.3377023935317993, + "logits/rejected": -0.4282413125038147, + "logps/chosen": -43.7579345703125, + "logps/rejected": -70.94622802734375, + "loss": 0.7456, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0342702865600586, + "rewards/margins": 4.319050312042236, + "rewards/rejected": -1.2847800254821777, + "step": 2465 + }, + { + "epoch": 0.62, + "grad_norm": 4.831173896789551, + "learning_rate": 6.375724303509441e-06, + "logits/chosen": -0.2877717912197113, + "logits/rejected": -0.3193126916885376, + "logps/chosen": -54.28137969970703, + "logps/rejected": -85.91289520263672, + "loss": 0.7778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.032059907913208, + "rewards/margins": 2.976515293121338, + "rewards/rejected": 0.05554461106657982, + "step": 2466 + }, + { + "epoch": 0.62, + "grad_norm": 4.738738059997559, + "learning_rate": 6.3732054891152415e-06, + "logits/chosen": -0.2544039189815521, + "logits/rejected": -0.26620662212371826, + "logps/chosen": -53.80023956298828, + "logps/rejected": -101.44368743896484, + "loss": 0.7661, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.958434581756592, + "rewards/margins": 4.650696277618408, + "rewards/rejected": -1.6922621726989746, + "step": 2467 + }, + { + "epoch": 0.62, + "grad_norm": 5.123055934906006, + "learning_rate": 6.370686297746504e-06, + "logits/chosen": -0.16371217370033264, + "logits/rejected": -0.26035135984420776, + "logps/chosen": -67.20189666748047, + "logps/rejected": -78.29209899902344, + "loss": 0.8857, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.740664482116699, + "rewards/margins": 3.818586587905884, + "rewards/rejected": -1.0779221057891846, + "step": 2468 + }, + { + "epoch": 0.62, + "grad_norm": 4.518092632293701, + "learning_rate": 6.3681667300948004e-06, + "logits/chosen": -0.19680869579315186, + "logits/rejected": -0.28853094577789307, + "logps/chosen": -53.692527770996094, + "logps/rejected": -68.07998657226562, + "loss": 0.7076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.948420524597168, + "rewards/margins": 4.027789115905762, + "rewards/rejected": -1.079368233680725, + "step": 2469 + }, + { + "epoch": 0.62, + "grad_norm": 3.7668375968933105, + "learning_rate": 6.365646786851809e-06, + "logits/chosen": -0.25137853622436523, + "logits/rejected": -0.33911624550819397, + "logps/chosen": -64.21783447265625, + "logps/rejected": -85.80901336669922, + "loss": 0.7652, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1788904666900635, + "rewards/margins": 4.193728446960449, + "rewards/rejected": -1.0148382186889648, + "step": 2470 + }, + { + "epoch": 0.62, + "grad_norm": 4.5282135009765625, + "learning_rate": 6.363126468709303e-06, + "logits/chosen": -0.18705244362354279, + "logits/rejected": -0.2760741114616394, + "logps/chosen": -51.43250274658203, + "logps/rejected": -70.92162322998047, + "loss": 0.7884, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7780468463897705, + "rewards/margins": 4.092350006103516, + "rewards/rejected": -1.3143032789230347, + "step": 2471 + }, + { + "epoch": 0.62, + "grad_norm": 4.606449604034424, + "learning_rate": 6.36060577635917e-06, + "logits/chosen": -0.3434377908706665, + "logits/rejected": -0.4258526563644409, + "logps/chosen": -62.127140045166016, + "logps/rejected": -70.03004455566406, + "loss": 0.9248, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0777153968811035, + "rewards/margins": 3.863558769226074, + "rewards/rejected": -0.7858433723449707, + "step": 2472 + }, + { + "epoch": 0.62, + "grad_norm": 3.9585280418395996, + "learning_rate": 6.358084710493388e-06, + "logits/chosen": -0.2598208785057068, + "logits/rejected": -0.34912291169166565, + "logps/chosen": -57.14427185058594, + "logps/rejected": -84.8580551147461, + "loss": 0.8185, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9461452960968018, + "rewards/margins": 4.194855213165283, + "rewards/rejected": -1.24871027469635, + "step": 2473 + }, + { + "epoch": 0.62, + "grad_norm": 5.440154552459717, + "learning_rate": 6.355563271804052e-06, + "logits/chosen": -0.263266384601593, + "logits/rejected": -0.35706213116645813, + "logps/chosen": -65.58332061767578, + "logps/rejected": -72.39985656738281, + "loss": 0.8908, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9479939937591553, + "rewards/margins": 3.840301275253296, + "rewards/rejected": -0.8923070430755615, + "step": 2474 + }, + { + "epoch": 0.62, + "grad_norm": 12.39706802368164, + "learning_rate": 6.353041460983346e-06, + "logits/chosen": -0.23905010521411896, + "logits/rejected": -0.32039839029312134, + "logps/chosen": -63.138267517089844, + "logps/rejected": -81.5567626953125, + "loss": 0.9201, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5607988834381104, + "rewards/margins": 3.130967140197754, + "rewards/rejected": -0.570168137550354, + "step": 2475 + }, + { + "epoch": 0.62, + "grad_norm": 6.351542949676514, + "learning_rate": 6.350519278723563e-06, + "logits/chosen": -0.18208369612693787, + "logits/rejected": -0.3349577784538269, + "logps/chosen": -60.190269470214844, + "logps/rejected": -64.92530059814453, + "loss": 0.7717, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.027101755142212, + "rewards/margins": 3.9000611305236816, + "rewards/rejected": -0.8729592561721802, + "step": 2476 + }, + { + "epoch": 0.62, + "grad_norm": 4.898034572601318, + "learning_rate": 6.347996725717094e-06, + "logits/chosen": -0.3308722972869873, + "logits/rejected": -0.3855017125606537, + "logps/chosen": -50.855865478515625, + "logps/rejected": -64.90697479248047, + "loss": 0.9804, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8863158226013184, + "rewards/margins": 3.3472025394439697, + "rewards/rejected": -0.4608864188194275, + "step": 2477 + }, + { + "epoch": 0.62, + "grad_norm": 4.151493549346924, + "learning_rate": 6.345473802656438e-06, + "logits/chosen": -0.2175900787115097, + "logits/rejected": -0.2659575045108795, + "logps/chosen": -54.38637924194336, + "logps/rejected": -88.52959442138672, + "loss": 0.8228, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.904787302017212, + "rewards/margins": 3.6919844150543213, + "rewards/rejected": -0.7871968150138855, + "step": 2478 + }, + { + "epoch": 0.62, + "grad_norm": 4.471075057983398, + "learning_rate": 6.342950510234189e-06, + "logits/chosen": -0.2546154260635376, + "logits/rejected": -0.30109843611717224, + "logps/chosen": -52.819522857666016, + "logps/rejected": -89.6247787475586, + "loss": 0.8089, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9232099056243896, + "rewards/margins": 3.992831230163574, + "rewards/rejected": -1.0696214437484741, + "step": 2479 + }, + { + "epoch": 0.62, + "grad_norm": 4.614304542541504, + "learning_rate": 6.340426849143048e-06, + "logits/chosen": -0.18134352564811707, + "logits/rejected": -0.24041634798049927, + "logps/chosen": -59.77754211425781, + "logps/rejected": -92.13560485839844, + "loss": 0.7569, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.835160493850708, + "rewards/margins": 4.300533771514893, + "rewards/rejected": -1.465372920036316, + "step": 2480 + }, + { + "epoch": 0.62, + "grad_norm": 6.0198073387146, + "learning_rate": 6.3379028200758115e-06, + "logits/chosen": -0.2696770131587982, + "logits/rejected": -0.3322456479072571, + "logps/chosen": -58.04494094848633, + "logps/rejected": -82.42523956298828, + "loss": 0.8176, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.897840738296509, + "rewards/margins": 3.9140288829803467, + "rewards/rejected": -1.0161882638931274, + "step": 2481 + }, + { + "epoch": 0.62, + "grad_norm": 4.7419915199279785, + "learning_rate": 6.335378423725383e-06, + "logits/chosen": -0.2756797969341278, + "logits/rejected": -0.36236223578453064, + "logps/chosen": -57.00312805175781, + "logps/rejected": -85.42178344726562, + "loss": 0.8201, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0753369331359863, + "rewards/margins": 4.738410949707031, + "rewards/rejected": -1.6630738973617554, + "step": 2482 + }, + { + "epoch": 0.62, + "grad_norm": 4.411188125610352, + "learning_rate": 6.332853660784759e-06, + "logits/chosen": -0.23843659460544586, + "logits/rejected": -0.32869404554367065, + "logps/chosen": -62.234703063964844, + "logps/rejected": -83.17208099365234, + "loss": 0.7506, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.249789237976074, + "rewards/margins": 4.707516670227051, + "rewards/rejected": -1.4577271938323975, + "step": 2483 + }, + { + "epoch": 0.62, + "grad_norm": 8.032968521118164, + "learning_rate": 6.330328531947045e-06, + "logits/chosen": -0.2710195779800415, + "logits/rejected": -0.37368345260620117, + "logps/chosen": -53.96358108520508, + "logps/rejected": -70.99688720703125, + "loss": 0.7955, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.740830659866333, + "rewards/margins": 4.194461822509766, + "rewards/rejected": -1.4536309242248535, + "step": 2484 + }, + { + "epoch": 0.62, + "grad_norm": 4.6212615966796875, + "learning_rate": 6.327803037905445e-06, + "logits/chosen": -0.2687760591506958, + "logits/rejected": -0.34658703207969666, + "logps/chosen": -50.851654052734375, + "logps/rejected": -68.96320343017578, + "loss": 0.7595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.085505962371826, + "rewards/margins": 4.440272331237793, + "rewards/rejected": -1.3547661304473877, + "step": 2485 + }, + { + "epoch": 0.62, + "grad_norm": 3.5600321292877197, + "learning_rate": 6.325277179353258e-06, + "logits/chosen": -0.21021854877471924, + "logits/rejected": -0.33156001567840576, + "logps/chosen": -66.1597671508789, + "logps/rejected": -70.78436279296875, + "loss": 0.7688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.826737403869629, + "rewards/margins": 3.6821470260620117, + "rewards/rejected": -0.8554098606109619, + "step": 2486 + }, + { + "epoch": 0.62, + "grad_norm": 6.430885314941406, + "learning_rate": 6.3227509569838874e-06, + "logits/chosen": -0.2342405915260315, + "logits/rejected": -0.34998008608818054, + "logps/chosen": -58.51868438720703, + "logps/rejected": -87.29048156738281, + "loss": 0.8119, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6273813247680664, + "rewards/margins": 4.52006721496582, + "rewards/rejected": -1.8926857709884644, + "step": 2487 + }, + { + "epoch": 0.62, + "grad_norm": 4.292456150054932, + "learning_rate": 6.3202243714908374e-06, + "logits/chosen": -0.2699451744556427, + "logits/rejected": -0.3157005310058594, + "logps/chosen": -52.147300720214844, + "logps/rejected": -76.3331069946289, + "loss": 0.787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.94058895111084, + "rewards/margins": 4.727906703948975, + "rewards/rejected": -1.7873177528381348, + "step": 2488 + }, + { + "epoch": 0.62, + "grad_norm": 4.643594741821289, + "learning_rate": 6.317697423567708e-06, + "logits/chosen": -0.2165040671825409, + "logits/rejected": -0.3371052145957947, + "logps/chosen": -63.65340805053711, + "logps/rejected": -76.32736206054688, + "loss": 0.8063, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.839832305908203, + "rewards/margins": 4.213180065155029, + "rewards/rejected": -1.3733477592468262, + "step": 2489 + }, + { + "epoch": 0.62, + "grad_norm": 4.6084136962890625, + "learning_rate": 6.315170113908204e-06, + "logits/chosen": -0.2410271018743515, + "logits/rejected": -0.32776808738708496, + "logps/chosen": -59.50868225097656, + "logps/rejected": -77.55738830566406, + "loss": 0.8649, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9046764373779297, + "rewards/margins": 3.7721636295318604, + "rewards/rejected": -0.867487370967865, + "step": 2490 + }, + { + "epoch": 0.62, + "grad_norm": 7.185351371765137, + "learning_rate": 6.312642443206124e-06, + "logits/chosen": -0.24935247004032135, + "logits/rejected": -0.38169166445732117, + "logps/chosen": -70.67989349365234, + "logps/rejected": -65.54434967041016, + "loss": 0.9822, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.857988119125366, + "rewards/margins": 4.508753776550293, + "rewards/rejected": -1.6507651805877686, + "step": 2491 + }, + { + "epoch": 0.62, + "grad_norm": 8.747049331665039, + "learning_rate": 6.310114412155369e-06, + "logits/chosen": -0.2058296501636505, + "logits/rejected": -0.2880744934082031, + "logps/chosen": -69.85150146484375, + "logps/rejected": -95.60475158691406, + "loss": 0.8329, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7972846031188965, + "rewards/margins": 4.327661037445068, + "rewards/rejected": -1.5303765535354614, + "step": 2492 + }, + { + "epoch": 0.62, + "grad_norm": 3.5883121490478516, + "learning_rate": 6.307586021449937e-06, + "logits/chosen": -0.2649957239627838, + "logits/rejected": -0.36767178773880005, + "logps/chosen": -61.18926239013672, + "logps/rejected": -78.71459197998047, + "loss": 0.7165, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.976560115814209, + "rewards/margins": 4.170707702636719, + "rewards/rejected": -1.1941479444503784, + "step": 2493 + }, + { + "epoch": 0.62, + "grad_norm": 6.185720443725586, + "learning_rate": 6.305057271783926e-06, + "logits/chosen": -0.19815826416015625, + "logits/rejected": -0.3230036199092865, + "logps/chosen": -75.07048034667969, + "logps/rejected": -71.84459686279297, + "loss": 1.0422, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6931445598602295, + "rewards/margins": 3.298456907272339, + "rewards/rejected": -0.6053119897842407, + "step": 2494 + }, + { + "epoch": 0.62, + "grad_norm": 4.98475980758667, + "learning_rate": 6.302528163851536e-06, + "logits/chosen": -0.2073267698287964, + "logits/rejected": -0.2530997097492218, + "logps/chosen": -52.007057189941406, + "logps/rejected": -80.57344818115234, + "loss": 0.8035, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8323750495910645, + "rewards/margins": 4.321345806121826, + "rewards/rejected": -1.4889707565307617, + "step": 2495 + }, + { + "epoch": 0.62, + "grad_norm": 3.824929714202881, + "learning_rate": 6.299998698347055e-06, + "logits/chosen": -0.16109329462051392, + "logits/rejected": -0.2925594449043274, + "logps/chosen": -63.12300109863281, + "logps/rejected": -67.45510864257812, + "loss": 0.7979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7578554153442383, + "rewards/margins": 3.946831464767456, + "rewards/rejected": -1.1889761686325073, + "step": 2496 + }, + { + "epoch": 0.62, + "grad_norm": 3.8453028202056885, + "learning_rate": 6.297468875964882e-06, + "logits/chosen": -0.16919785737991333, + "logits/rejected": -0.30836692452430725, + "logps/chosen": -75.07313537597656, + "logps/rejected": -88.03411865234375, + "loss": 0.8179, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9907913208007812, + "rewards/margins": 4.4882378578186035, + "rewards/rejected": -1.4974464178085327, + "step": 2497 + }, + { + "epoch": 0.62, + "grad_norm": 4.565946102142334, + "learning_rate": 6.2949386973995045e-06, + "logits/chosen": -0.27186596393585205, + "logits/rejected": -0.3082585632801056, + "logps/chosen": -50.702579498291016, + "logps/rejected": -77.27102661132812, + "loss": 0.7825, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1097681522369385, + "rewards/margins": 3.848580837249756, + "rewards/rejected": -0.7388127446174622, + "step": 2498 + }, + { + "epoch": 0.63, + "grad_norm": 5.672173500061035, + "learning_rate": 6.292408163345512e-06, + "logits/chosen": -0.2086588442325592, + "logits/rejected": -0.30355164408683777, + "logps/chosen": -60.662200927734375, + "logps/rejected": -67.95794677734375, + "loss": 0.8345, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0074121952056885, + "rewards/margins": 3.040813446044922, + "rewards/rejected": -0.03340107202529907, + "step": 2499 + }, + { + "epoch": 0.63, + "grad_norm": 10.226999282836914, + "learning_rate": 6.28987727449759e-06, + "logits/chosen": -0.20029222965240479, + "logits/rejected": -0.21228653192520142, + "logps/chosen": -51.23951721191406, + "logps/rejected": -94.0301513671875, + "loss": 0.715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0763916969299316, + "rewards/margins": 3.9550650119781494, + "rewards/rejected": -0.8786731362342834, + "step": 2500 + }, + { + "epoch": 0.63, + "grad_norm": 4.928595542907715, + "learning_rate": 6.2873460315505245e-06, + "logits/chosen": -0.20518247783184052, + "logits/rejected": -0.31116247177124023, + "logps/chosen": -57.83543014526367, + "logps/rejected": -63.74083709716797, + "loss": 0.8561, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8694345951080322, + "rewards/margins": 3.62858247756958, + "rewards/rejected": -0.7591478824615479, + "step": 2501 + }, + { + "epoch": 0.63, + "grad_norm": 9.24068546295166, + "learning_rate": 6.284814435199193e-06, + "logits/chosen": -0.241682767868042, + "logits/rejected": -0.31351175904273987, + "logps/chosen": -63.337669372558594, + "logps/rejected": -74.96753692626953, + "loss": 1.0179, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.683311939239502, + "rewards/margins": 2.62488055229187, + "rewards/rejected": 0.058431476354599, + "step": 2502 + }, + { + "epoch": 0.63, + "grad_norm": 6.300940036773682, + "learning_rate": 6.282282486138576e-06, + "logits/chosen": -0.23960113525390625, + "logits/rejected": -0.31815987825393677, + "logps/chosen": -64.5316390991211, + "logps/rejected": -84.85819244384766, + "loss": 0.7882, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0719492435455322, + "rewards/margins": 4.609708786010742, + "rewards/rejected": -1.5377600193023682, + "step": 2503 + }, + { + "epoch": 0.63, + "grad_norm": 12.267195701599121, + "learning_rate": 6.2797501850637465e-06, + "logits/chosen": -0.3471893072128296, + "logits/rejected": -0.4595613181591034, + "logps/chosen": -50.201045989990234, + "logps/rejected": -64.48454284667969, + "loss": 0.6891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0205399990081787, + "rewards/margins": 4.044516563415527, + "rewards/rejected": -1.0239763259887695, + "step": 2504 + }, + { + "epoch": 0.63, + "grad_norm": 5.1644287109375, + "learning_rate": 6.277217532669876e-06, + "logits/chosen": -0.20218439400196075, + "logits/rejected": -0.257079154253006, + "logps/chosen": -54.214569091796875, + "logps/rejected": -74.3613052368164, + "loss": 0.8892, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8588874340057373, + "rewards/margins": 3.7767462730407715, + "rewards/rejected": -0.9178592562675476, + "step": 2505 + }, + { + "epoch": 0.63, + "grad_norm": 13.34088134765625, + "learning_rate": 6.274684529652233e-06, + "logits/chosen": -0.20314793288707733, + "logits/rejected": -0.3065309524536133, + "logps/chosen": -52.512290954589844, + "logps/rejected": -71.82075500488281, + "loss": 0.7817, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.770439386367798, + "rewards/margins": 3.9254088401794434, + "rewards/rejected": -1.1549696922302246, + "step": 2506 + }, + { + "epoch": 0.63, + "grad_norm": 6.226156711578369, + "learning_rate": 6.2721511767061805e-06, + "logits/chosen": -0.2616078853607178, + "logits/rejected": -0.3560646176338196, + "logps/chosen": -58.54632568359375, + "logps/rejected": -80.25849914550781, + "loss": 0.8014, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.847703456878662, + "rewards/margins": 3.9562606811523438, + "rewards/rejected": -1.1085572242736816, + "step": 2507 + }, + { + "epoch": 0.63, + "grad_norm": 10.102713584899902, + "learning_rate": 6.2696174745271795e-06, + "logits/chosen": -0.3163183927536011, + "logits/rejected": -0.35704493522644043, + "logps/chosen": -57.370079040527344, + "logps/rejected": -87.15995788574219, + "loss": 0.8134, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8384294509887695, + "rewards/margins": 3.8367676734924316, + "rewards/rejected": -0.9983385801315308, + "step": 2508 + }, + { + "epoch": 0.63, + "grad_norm": 5.427992820739746, + "learning_rate": 6.267083423810787e-06, + "logits/chosen": -0.2633402347564697, + "logits/rejected": -0.37318915128707886, + "logps/chosen": -53.912906646728516, + "logps/rejected": -57.574066162109375, + "loss": 0.8132, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1055796146392822, + "rewards/margins": 3.7863149642944336, + "rewards/rejected": -0.6807350516319275, + "step": 2509 + }, + { + "epoch": 0.63, + "grad_norm": 4.171212196350098, + "learning_rate": 6.264549025252652e-06, + "logits/chosen": -0.2738204598426819, + "logits/rejected": -0.30640068650245667, + "logps/chosen": -54.220069885253906, + "logps/rejected": -89.95343017578125, + "loss": 0.6822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1874616146087646, + "rewards/margins": 3.655777931213379, + "rewards/rejected": -0.46831637620925903, + "step": 2510 + }, + { + "epoch": 0.63, + "grad_norm": 4.575974464416504, + "learning_rate": 6.262014279548523e-06, + "logits/chosen": -0.24460388720035553, + "logits/rejected": -0.38584405183792114, + "logps/chosen": -58.465755462646484, + "logps/rejected": -67.040283203125, + "loss": 0.81, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8130040168762207, + "rewards/margins": 4.454723358154297, + "rewards/rejected": -1.6417187452316284, + "step": 2511 + }, + { + "epoch": 0.63, + "grad_norm": 3.4715890884399414, + "learning_rate": 6.25947918739424e-06, + "logits/chosen": -0.23375482857227325, + "logits/rejected": -0.3483830988407135, + "logps/chosen": -49.808074951171875, + "logps/rejected": -76.59195709228516, + "loss": 0.6782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8705029487609863, + "rewards/margins": 3.873687505722046, + "rewards/rejected": -1.0031846761703491, + "step": 2512 + }, + { + "epoch": 0.63, + "grad_norm": 5.374179840087891, + "learning_rate": 6.256943749485746e-06, + "logits/chosen": -0.39031994342803955, + "logits/rejected": -0.4889484941959381, + "logps/chosen": -44.95145034790039, + "logps/rejected": -87.9690933227539, + "loss": 0.7113, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0140578746795654, + "rewards/margins": 5.167712211608887, + "rewards/rejected": -2.1536545753479004, + "step": 2513 + }, + { + "epoch": 0.63, + "grad_norm": 5.593556880950928, + "learning_rate": 6.254407966519067e-06, + "logits/chosen": -0.20203906297683716, + "logits/rejected": -0.2374977171421051, + "logps/chosen": -73.58345031738281, + "logps/rejected": -80.87744140625, + "loss": 0.9441, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7038376331329346, + "rewards/margins": 2.7477071285247803, + "rewards/rejected": -0.04386957734823227, + "step": 2514 + }, + { + "epoch": 0.63, + "grad_norm": 5.762692451477051, + "learning_rate": 6.251871839190336e-06, + "logits/chosen": -0.23988665640354156, + "logits/rejected": -0.34755444526672363, + "logps/chosen": -64.09085845947266, + "logps/rejected": -82.23237609863281, + "loss": 0.9967, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7259535789489746, + "rewards/margins": 3.5878567695617676, + "rewards/rejected": -0.8619035482406616, + "step": 2515 + }, + { + "epoch": 0.63, + "grad_norm": 6.282260894775391, + "learning_rate": 6.249335368195771e-06, + "logits/chosen": -0.29712939262390137, + "logits/rejected": -0.32756415009498596, + "logps/chosen": -57.33379364013672, + "logps/rejected": -100.72103118896484, + "loss": 0.9483, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.83611798286438, + "rewards/margins": 4.364340782165527, + "rewards/rejected": -1.5282230377197266, + "step": 2516 + }, + { + "epoch": 0.63, + "grad_norm": 6.474961757659912, + "learning_rate": 6.246798554231689e-06, + "logits/chosen": -0.2500773072242737, + "logits/rejected": -0.3460853397846222, + "logps/chosen": -61.92824935913086, + "logps/rejected": -84.1977767944336, + "loss": 0.9409, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9924449920654297, + "rewards/margins": 3.6221890449523926, + "rewards/rejected": -0.6297441720962524, + "step": 2517 + }, + { + "epoch": 0.63, + "grad_norm": 5.522187232971191, + "learning_rate": 6.244261397994499e-06, + "logits/chosen": -0.24930620193481445, + "logits/rejected": -0.28927141427993774, + "logps/chosen": -62.79267120361328, + "logps/rejected": -89.66744995117188, + "loss": 0.7766, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.812119483947754, + "rewards/margins": 3.7864623069763184, + "rewards/rejected": -0.9743428230285645, + "step": 2518 + }, + { + "epoch": 0.63, + "grad_norm": 4.32465124130249, + "learning_rate": 6.2417239001807075e-06, + "logits/chosen": -0.3042967915534973, + "logits/rejected": -0.4039044976234436, + "logps/chosen": -63.72386932373047, + "logps/rejected": -66.74266052246094, + "loss": 0.8932, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8949756622314453, + "rewards/margins": 3.5494463443756104, + "rewards/rejected": -0.654470682144165, + "step": 2519 + }, + { + "epoch": 0.63, + "grad_norm": 7.600937843322754, + "learning_rate": 6.239186061486911e-06, + "logits/chosen": -0.2331470549106598, + "logits/rejected": -0.29309555888175964, + "logps/chosen": -61.751953125, + "logps/rejected": -75.02578735351562, + "loss": 1.0572, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.839632749557495, + "rewards/margins": 2.7673773765563965, + "rewards/rejected": 0.072255939245224, + "step": 2520 + }, + { + "epoch": 0.63, + "grad_norm": 4.941147327423096, + "learning_rate": 6.236647882609801e-06, + "logits/chosen": -0.18275006115436554, + "logits/rejected": -0.2842336893081665, + "logps/chosen": -61.81420135498047, + "logps/rejected": -76.43171691894531, + "loss": 0.8838, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.1325178146362305, + "rewards/margins": 4.183694362640381, + "rewards/rejected": -1.0511763095855713, + "step": 2521 + }, + { + "epoch": 0.63, + "grad_norm": 4.144866466522217, + "learning_rate": 6.23410936424616e-06, + "logits/chosen": -0.32411858439445496, + "logits/rejected": -0.3643660247325897, + "logps/chosen": -51.95708465576172, + "logps/rejected": -86.92516326904297, + "loss": 0.7324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0430843830108643, + "rewards/margins": 4.143488883972168, + "rewards/rejected": -1.1004043817520142, + "step": 2522 + }, + { + "epoch": 0.63, + "grad_norm": 3.455517530441284, + "learning_rate": 6.231570507092871e-06, + "logits/chosen": -0.3018924593925476, + "logits/rejected": -0.38619786500930786, + "logps/chosen": -62.30526351928711, + "logps/rejected": -72.45301055908203, + "loss": 0.6959, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.575732469558716, + "rewards/margins": 3.754624366760254, + "rewards/rejected": -1.1788920164108276, + "step": 2523 + }, + { + "epoch": 0.63, + "grad_norm": 17.839815139770508, + "learning_rate": 6.229031311846902e-06, + "logits/chosen": -0.23142307996749878, + "logits/rejected": -0.30728042125701904, + "logps/chosen": -54.809120178222656, + "logps/rejected": -89.32672119140625, + "loss": 0.7665, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.888533115386963, + "rewards/margins": 3.3905396461486816, + "rewards/rejected": -0.5020062327384949, + "step": 2524 + }, + { + "epoch": 0.63, + "grad_norm": 4.988797187805176, + "learning_rate": 6.2264917792053195e-06, + "logits/chosen": -0.16327276825904846, + "logits/rejected": -0.302992582321167, + "logps/chosen": -81.3333969116211, + "logps/rejected": -74.06578826904297, + "loss": 0.8432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.770375967025757, + "rewards/margins": 3.26823353767395, + "rewards/rejected": -0.49785739183425903, + "step": 2525 + }, + { + "epoch": 0.63, + "grad_norm": 4.5871453285217285, + "learning_rate": 6.2239519098652755e-06, + "logits/chosen": -0.2500019073486328, + "logits/rejected": -0.31167304515838623, + "logps/chosen": -53.51496505737305, + "logps/rejected": -87.46410369873047, + "loss": 0.7938, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9764418601989746, + "rewards/margins": 4.267643928527832, + "rewards/rejected": -1.2912019491195679, + "step": 2526 + }, + { + "epoch": 0.63, + "grad_norm": 5.263805389404297, + "learning_rate": 6.2214117045240215e-06, + "logits/chosen": -0.19199253618717194, + "logits/rejected": -0.29700469970703125, + "logps/chosen": -52.81937789916992, + "logps/rejected": -86.6796646118164, + "loss": 0.8197, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7730588912963867, + "rewards/margins": 4.216723442077637, + "rewards/rejected": -1.4436649084091187, + "step": 2527 + }, + { + "epoch": 0.63, + "grad_norm": 4.224605083465576, + "learning_rate": 6.218871163878899e-06, + "logits/chosen": -0.2917421758174896, + "logits/rejected": -0.37809231877326965, + "logps/chosen": -46.985496520996094, + "logps/rejected": -88.13868713378906, + "loss": 0.7612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7712764739990234, + "rewards/margins": 4.182352066040039, + "rewards/rejected": -1.4110755920410156, + "step": 2528 + }, + { + "epoch": 0.63, + "grad_norm": 8.042765617370605, + "learning_rate": 6.216330288627341e-06, + "logits/chosen": -0.1553756445646286, + "logits/rejected": -0.2454218566417694, + "logps/chosen": -73.1299057006836, + "logps/rejected": -78.81812286376953, + "loss": 0.9927, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.4390459060668945, + "rewards/margins": 3.094226360321045, + "rewards/rejected": -0.6551807522773743, + "step": 2529 + }, + { + "epoch": 0.63, + "grad_norm": 6.693915843963623, + "learning_rate": 6.213789079466873e-06, + "logits/chosen": -0.2956125736236572, + "logits/rejected": -0.34764257073402405, + "logps/chosen": -52.33220672607422, + "logps/rejected": -90.91842651367188, + "loss": 0.7725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.761449098587036, + "rewards/margins": 4.184696674346924, + "rewards/rejected": -1.4232476949691772, + "step": 2530 + }, + { + "epoch": 0.63, + "grad_norm": 5.196319103240967, + "learning_rate": 6.211247537095112e-06, + "logits/chosen": -0.26278597116470337, + "logits/rejected": -0.34599336981773376, + "logps/chosen": -57.46031951904297, + "logps/rejected": -83.01988983154297, + "loss": 0.8549, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5493345260620117, + "rewards/margins": 3.6681344509124756, + "rewards/rejected": -1.1187999248504639, + "step": 2531 + }, + { + "epoch": 0.63, + "grad_norm": 4.960145950317383, + "learning_rate": 6.208705662209763e-06, + "logits/chosen": -0.2498701512813568, + "logits/rejected": -0.3200080692768097, + "logps/chosen": -45.161006927490234, + "logps/rejected": -81.21220397949219, + "loss": 0.7438, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0019524097442627, + "rewards/margins": 3.6076407432556152, + "rewards/rejected": -0.6056880950927734, + "step": 2532 + }, + { + "epoch": 0.63, + "grad_norm": 7.675584316253662, + "learning_rate": 6.206163455508629e-06, + "logits/chosen": -0.2625683546066284, + "logits/rejected": -0.39750680327415466, + "logps/chosen": -64.83353424072266, + "logps/rejected": -66.27915954589844, + "loss": 1.0614, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.603148937225342, + "rewards/margins": 3.173056125640869, + "rewards/rejected": -0.5699073076248169, + "step": 2533 + }, + { + "epoch": 0.63, + "grad_norm": 6.158703804016113, + "learning_rate": 6.203620917689599e-06, + "logits/chosen": -0.23063033819198608, + "logits/rejected": -0.4116741418838501, + "logps/chosen": -68.08891296386719, + "logps/rejected": -61.33454132080078, + "loss": 0.8965, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.464427947998047, + "rewards/margins": 3.346972703933716, + "rewards/rejected": -0.882544755935669, + "step": 2534 + }, + { + "epoch": 0.63, + "grad_norm": 7.754306793212891, + "learning_rate": 6.201078049450657e-06, + "logits/chosen": -0.21523429453372955, + "logits/rejected": -0.26646214723587036, + "logps/chosen": -49.590980529785156, + "logps/rejected": -74.99258422851562, + "loss": 0.8839, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8456661701202393, + "rewards/margins": 3.121771812438965, + "rewards/rejected": -0.2761056423187256, + "step": 2535 + }, + { + "epoch": 0.63, + "grad_norm": 20.19062042236328, + "learning_rate": 6.198534851489872e-06, + "logits/chosen": -0.21872906386852264, + "logits/rejected": -0.3086845874786377, + "logps/chosen": -63.80682373046875, + "logps/rejected": -91.05599975585938, + "loss": 0.8561, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6120846271514893, + "rewards/margins": 4.624955654144287, + "rewards/rejected": -2.012871265411377, + "step": 2536 + }, + { + "epoch": 0.63, + "grad_norm": 4.412131309509277, + "learning_rate": 6.195991324505407e-06, + "logits/chosen": -0.2597125768661499, + "logits/rejected": -0.35986411571502686, + "logps/chosen": -59.12506866455078, + "logps/rejected": -96.9564437866211, + "loss": 0.879, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7147135734558105, + "rewards/margins": 4.736982345581055, + "rewards/rejected": -2.022268295288086, + "step": 2537 + }, + { + "epoch": 0.63, + "grad_norm": 4.638238906860352, + "learning_rate": 6.193447469195516e-06, + "logits/chosen": -0.27546969056129456, + "logits/rejected": -0.37393245100975037, + "logps/chosen": -52.12300109863281, + "logps/rejected": -75.01632690429688, + "loss": 0.7565, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8356292247772217, + "rewards/margins": 4.228917598724365, + "rewards/rejected": -1.3932886123657227, + "step": 2538 + }, + { + "epoch": 0.64, + "grad_norm": 2.8319592475891113, + "learning_rate": 6.190903286258543e-06, + "logits/chosen": -0.3635895252227783, + "logits/rejected": -0.41799551248550415, + "logps/chosen": -50.33144760131836, + "logps/rejected": -88.08879089355469, + "loss": 0.7313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.030034303665161, + "rewards/margins": 4.344851016998291, + "rewards/rejected": -1.3148170709609985, + "step": 2539 + }, + { + "epoch": 0.64, + "grad_norm": 3.2392499446868896, + "learning_rate": 6.188358776392921e-06, + "logits/chosen": -0.2510261833667755, + "logits/rejected": -0.3636649250984192, + "logps/chosen": -62.323326110839844, + "logps/rejected": -75.72357940673828, + "loss": 0.6671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9841372966766357, + "rewards/margins": 4.316438674926758, + "rewards/rejected": -1.3323010206222534, + "step": 2540 + }, + { + "epoch": 0.64, + "grad_norm": 6.165319442749023, + "learning_rate": 6.185813940297172e-06, + "logits/chosen": -0.2673300802707672, + "logits/rejected": -0.3548751771450043, + "logps/chosen": -51.57410430908203, + "logps/rejected": -71.84262084960938, + "loss": 0.7418, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7591712474823, + "rewards/margins": 3.2657153606414795, + "rewards/rejected": -0.5065439939498901, + "step": 2541 + }, + { + "epoch": 0.64, + "grad_norm": 4.203352451324463, + "learning_rate": 6.183268778669906e-06, + "logits/chosen": -0.21904700994491577, + "logits/rejected": -0.4057634174823761, + "logps/chosen": -53.46393585205078, + "logps/rejected": -62.77016067504883, + "loss": 0.7323, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9457433223724365, + "rewards/margins": 4.2158403396606445, + "rewards/rejected": -1.270097017288208, + "step": 2542 + }, + { + "epoch": 0.64, + "grad_norm": 6.079718112945557, + "learning_rate": 6.180723292209829e-06, + "logits/chosen": -0.25856727361679077, + "logits/rejected": -0.42570915818214417, + "logps/chosen": -71.44061279296875, + "logps/rejected": -69.71308898925781, + "loss": 0.8459, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7153854370117188, + "rewards/margins": 3.7145931720733643, + "rewards/rejected": -0.9992079734802246, + "step": 2543 + }, + { + "epoch": 0.64, + "grad_norm": 4.8270978927612305, + "learning_rate": 6.178177481615731e-06, + "logits/chosen": -0.21725532412528992, + "logits/rejected": -0.2973659336566925, + "logps/chosen": -50.00048065185547, + "logps/rejected": -80.21562957763672, + "loss": 0.7436, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1929616928100586, + "rewards/margins": 3.9249870777130127, + "rewards/rejected": -0.732025146484375, + "step": 2544 + }, + { + "epoch": 0.64, + "grad_norm": 2.4174306392669678, + "learning_rate": 6.175631347586492e-06, + "logits/chosen": -0.21571683883666992, + "logits/rejected": -0.3592641353607178, + "logps/chosen": -62.476112365722656, + "logps/rejected": -92.49702453613281, + "loss": 0.6679, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.890817880630493, + "rewards/margins": 5.15739631652832, + "rewards/rejected": -2.266578435897827, + "step": 2545 + }, + { + "epoch": 0.64, + "grad_norm": 6.80381965637207, + "learning_rate": 6.173084890821078e-06, + "logits/chosen": -0.2937919497489929, + "logits/rejected": -0.4397062659263611, + "logps/chosen": -60.70286560058594, + "logps/rejected": -71.77213287353516, + "loss": 0.7659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.078193426132202, + "rewards/margins": 4.102634429931641, + "rewards/rejected": -1.0244405269622803, + "step": 2546 + }, + { + "epoch": 0.64, + "grad_norm": 4.883707046508789, + "learning_rate": 6.170538112018548e-06, + "logits/chosen": -0.261066734790802, + "logits/rejected": -0.33498692512512207, + "logps/chosen": -52.36735153198242, + "logps/rejected": -85.31742858886719, + "loss": 0.7448, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.814481735229492, + "rewards/margins": 4.56474494934082, + "rewards/rejected": -1.75026273727417, + "step": 2547 + }, + { + "epoch": 0.64, + "grad_norm": 5.406548976898193, + "learning_rate": 6.1679910118780485e-06, + "logits/chosen": -0.30330830812454224, + "logits/rejected": -0.4110085368156433, + "logps/chosen": -59.70018768310547, + "logps/rejected": -74.86060333251953, + "loss": 0.8568, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.817504644393921, + "rewards/margins": 3.9357457160949707, + "rewards/rejected": -1.118241310119629, + "step": 2548 + }, + { + "epoch": 0.64, + "grad_norm": 3.434602975845337, + "learning_rate": 6.165443591098813e-06, + "logits/chosen": -0.2790365219116211, + "logits/rejected": -0.38939690589904785, + "logps/chosen": -48.925697326660156, + "logps/rejected": -70.28971099853516, + "loss": 0.7153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9153013229370117, + "rewards/margins": 3.816365957260132, + "rewards/rejected": -0.9010647535324097, + "step": 2549 + }, + { + "epoch": 0.64, + "grad_norm": 9.013968467712402, + "learning_rate": 6.1628958503801635e-06, + "logits/chosen": -0.28429922461509705, + "logits/rejected": -0.3293779194355011, + "logps/chosen": -54.04032897949219, + "logps/rejected": -83.56109619140625, + "loss": 0.8998, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7272450923919678, + "rewards/margins": 3.4636030197143555, + "rewards/rejected": -0.736357569694519, + "step": 2550 + }, + { + "epoch": 0.64, + "grad_norm": 6.143036365509033, + "learning_rate": 6.16034779042151e-06, + "logits/chosen": -0.2725609540939331, + "logits/rejected": -0.37046536803245544, + "logps/chosen": -52.608734130859375, + "logps/rejected": -75.09123229980469, + "loss": 0.8288, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.488274335861206, + "rewards/margins": 3.9969966411590576, + "rewards/rejected": -1.5087223052978516, + "step": 2551 + }, + { + "epoch": 0.64, + "grad_norm": 5.49657678604126, + "learning_rate": 6.1577994119223505e-06, + "logits/chosen": -0.27077868580818176, + "logits/rejected": -0.32888665795326233, + "logps/chosen": -50.84010314941406, + "logps/rejected": -83.41497802734375, + "loss": 0.7891, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9667389392852783, + "rewards/margins": 4.337296962738037, + "rewards/rejected": -1.3705579042434692, + "step": 2552 + }, + { + "epoch": 0.64, + "grad_norm": 17.934040069580078, + "learning_rate": 6.1552507155822685e-06, + "logits/chosen": -0.29803285002708435, + "logits/rejected": -0.27836135029792786, + "logps/chosen": -48.03799819946289, + "logps/rejected": -91.37285614013672, + "loss": 0.9825, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8837618827819824, + "rewards/margins": 3.828540325164795, + "rewards/rejected": -0.9447779059410095, + "step": 2553 + }, + { + "epoch": 0.64, + "grad_norm": 5.179437160491943, + "learning_rate": 6.152701702100936e-06, + "logits/chosen": -0.10199615359306335, + "logits/rejected": -0.25586050748825073, + "logps/chosen": -61.49018478393555, + "logps/rejected": -79.07317352294922, + "loss": 0.7193, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3609180450439453, + "rewards/margins": 4.397169589996338, + "rewards/rejected": -2.0362515449523926, + "step": 2554 + }, + { + "epoch": 0.64, + "grad_norm": 5.4708099365234375, + "learning_rate": 6.1501523721781145e-06, + "logits/chosen": -0.18783822655677795, + "logits/rejected": -0.29895317554473877, + "logps/chosen": -56.23081970214844, + "logps/rejected": -79.15106201171875, + "loss": 0.7389, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.789160966873169, + "rewards/margins": 4.3183441162109375, + "rewards/rejected": -1.5291835069656372, + "step": 2555 + }, + { + "epoch": 0.64, + "grad_norm": 3.3463962078094482, + "learning_rate": 6.147602726513648e-06, + "logits/chosen": -0.3439393937587738, + "logits/rejected": -0.40280207991600037, + "logps/chosen": -51.27448654174805, + "logps/rejected": -89.90278625488281, + "loss": 0.6615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.797344923019409, + "rewards/margins": 4.427221298217773, + "rewards/rejected": -1.6298763751983643, + "step": 2556 + }, + { + "epoch": 0.64, + "grad_norm": 4.754143238067627, + "learning_rate": 6.1450527658074675e-06, + "logits/chosen": -0.2322288155555725, + "logits/rejected": -0.31490182876586914, + "logps/chosen": -49.535484313964844, + "logps/rejected": -74.28010559082031, + "loss": 0.6768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.655144691467285, + "rewards/margins": 4.2847723960876465, + "rewards/rejected": -1.6296277046203613, + "step": 2557 + }, + { + "epoch": 0.64, + "grad_norm": 6.889032363891602, + "learning_rate": 6.142502490759597e-06, + "logits/chosen": -0.2176055908203125, + "logits/rejected": -0.32870006561279297, + "logps/chosen": -58.48577117919922, + "logps/rejected": -78.0177230834961, + "loss": 0.8663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9703187942504883, + "rewards/margins": 4.089543342590332, + "rewards/rejected": -1.1192244291305542, + "step": 2558 + }, + { + "epoch": 0.64, + "grad_norm": 20.396615982055664, + "learning_rate": 6.139951902070139e-06, + "logits/chosen": -0.23294447362422943, + "logits/rejected": -0.34224945306777954, + "logps/chosen": -61.723777770996094, + "logps/rejected": -65.03317260742188, + "loss": 0.8941, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.616072416305542, + "rewards/margins": 3.3952159881591797, + "rewards/rejected": -0.7791434526443481, + "step": 2559 + }, + { + "epoch": 0.64, + "grad_norm": 8.269461631774902, + "learning_rate": 6.137401000439286e-06, + "logits/chosen": -0.2867090106010437, + "logits/rejected": -0.37691694498062134, + "logps/chosen": -47.18983459472656, + "logps/rejected": -72.34258270263672, + "loss": 0.7773, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0972177982330322, + "rewards/margins": 4.134111404418945, + "rewards/rejected": -1.0368938446044922, + "step": 2560 + }, + { + "epoch": 0.64, + "grad_norm": 4.558012008666992, + "learning_rate": 6.134849786567314e-06, + "logits/chosen": -0.26243653893470764, + "logits/rejected": -0.3268650472164154, + "logps/chosen": -58.42709732055664, + "logps/rejected": -82.55743408203125, + "loss": 0.7347, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.887878179550171, + "rewards/margins": 4.70267391204834, + "rewards/rejected": -1.8147954940795898, + "step": 2561 + }, + { + "epoch": 0.64, + "grad_norm": 13.20406723022461, + "learning_rate": 6.132298261154588e-06, + "logits/chosen": -0.33119648694992065, + "logits/rejected": -0.3688468635082245, + "logps/chosen": -51.536964416503906, + "logps/rejected": -87.48653411865234, + "loss": 0.824, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6536738872528076, + "rewards/margins": 3.878720998764038, + "rewards/rejected": -1.2250471115112305, + "step": 2562 + }, + { + "epoch": 0.64, + "grad_norm": 5.441410541534424, + "learning_rate": 6.129746424901556e-06, + "logits/chosen": -0.22619417309761047, + "logits/rejected": -0.2940923273563385, + "logps/chosen": -55.97880935668945, + "logps/rejected": -94.75865936279297, + "loss": 0.8739, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.157106637954712, + "rewards/margins": 4.176260471343994, + "rewards/rejected": -1.0191541910171509, + "step": 2563 + }, + { + "epoch": 0.64, + "grad_norm": 4.872075080871582, + "learning_rate": 6.127194278508753e-06, + "logits/chosen": -0.2227785885334015, + "logits/rejected": -0.3470878303050995, + "logps/chosen": -65.4013671875, + "logps/rejected": -79.18958282470703, + "loss": 0.8504, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0119423866271973, + "rewards/margins": 3.9129481315612793, + "rewards/rejected": -0.9010058045387268, + "step": 2564 + }, + { + "epoch": 0.64, + "grad_norm": 6.802582263946533, + "learning_rate": 6.124641822676798e-06, + "logits/chosen": -0.38263773918151855, + "logits/rejected": -0.5491223335266113, + "logps/chosen": -48.565887451171875, + "logps/rejected": -66.41732025146484, + "loss": 0.7413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6959547996520996, + "rewards/margins": 4.835335731506348, + "rewards/rejected": -2.139380693435669, + "step": 2565 + }, + { + "epoch": 0.64, + "grad_norm": 3.339768409729004, + "learning_rate": 6.122089058106394e-06, + "logits/chosen": -0.32210564613342285, + "logits/rejected": -0.4047650098800659, + "logps/chosen": -48.694664001464844, + "logps/rejected": -66.10985565185547, + "loss": 0.7219, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.204261541366577, + "rewards/margins": 3.800636053085327, + "rewards/rejected": -0.5963743925094604, + "step": 2566 + }, + { + "epoch": 0.64, + "grad_norm": 7.779882431030273, + "learning_rate": 6.119535985498331e-06, + "logits/chosen": -0.217898428440094, + "logits/rejected": -0.29061850905418396, + "logps/chosen": -64.96106719970703, + "logps/rejected": -90.10723114013672, + "loss": 0.8665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8047635555267334, + "rewards/margins": 3.4131228923797607, + "rewards/rejected": -0.6083594560623169, + "step": 2567 + }, + { + "epoch": 0.64, + "grad_norm": 5.2337493896484375, + "learning_rate": 6.116982605553482e-06, + "logits/chosen": -0.14796459674835205, + "logits/rejected": -0.2718147337436676, + "logps/chosen": -65.24369812011719, + "logps/rejected": -61.89164352416992, + "loss": 0.9363, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7876811027526855, + "rewards/margins": 3.2179975509643555, + "rewards/rejected": -0.4303162693977356, + "step": 2568 + }, + { + "epoch": 0.64, + "grad_norm": 6.203577041625977, + "learning_rate": 6.114428918972804e-06, + "logits/chosen": -0.18107545375823975, + "logits/rejected": -0.33842119574546814, + "logps/chosen": -52.909568786621094, + "logps/rejected": -67.3675537109375, + "loss": 0.82, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8720767498016357, + "rewards/margins": 3.6240456104278564, + "rewards/rejected": -0.7519686818122864, + "step": 2569 + }, + { + "epoch": 0.64, + "grad_norm": 3.0166313648223877, + "learning_rate": 6.111874926457344e-06, + "logits/chosen": -0.2439262866973877, + "logits/rejected": -0.34384042024612427, + "logps/chosen": -48.504737854003906, + "logps/rejected": -75.94255828857422, + "loss": 0.7338, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.735569715499878, + "rewards/margins": 4.169593811035156, + "rewards/rejected": -1.4340238571166992, + "step": 2570 + }, + { + "epoch": 0.64, + "grad_norm": 4.308104991912842, + "learning_rate": 6.109320628708221e-06, + "logits/chosen": -0.21393883228302002, + "logits/rejected": -0.22391851246356964, + "logps/chosen": -64.30043029785156, + "logps/rejected": -87.57521057128906, + "loss": 0.8883, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7319891452789307, + "rewards/margins": 2.968207836151123, + "rewards/rejected": -0.23621872067451477, + "step": 2571 + }, + { + "epoch": 0.64, + "grad_norm": 6.558363914489746, + "learning_rate": 6.1067660264266496e-06, + "logits/chosen": -0.26409316062927246, + "logits/rejected": -0.3900579512119293, + "logps/chosen": -50.236305236816406, + "logps/rejected": -87.3222427368164, + "loss": 0.6922, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9403247833251953, + "rewards/margins": 4.9856977462768555, + "rewards/rejected": -2.04537296295166, + "step": 2572 + }, + { + "epoch": 0.64, + "grad_norm": 4.71142578125, + "learning_rate": 6.104211120313921e-06, + "logits/chosen": -0.25949403643608093, + "logits/rejected": -0.3250305652618408, + "logps/chosen": -59.8671989440918, + "logps/rejected": -83.59735107421875, + "loss": 0.7502, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8317971229553223, + "rewards/margins": 4.333851337432861, + "rewards/rejected": -1.5020536184310913, + "step": 2573 + }, + { + "epoch": 0.64, + "grad_norm": 2.751565933227539, + "learning_rate": 6.1016559110714136e-06, + "logits/chosen": -0.2909882068634033, + "logits/rejected": -0.40159666538238525, + "logps/chosen": -51.31596755981445, + "logps/rejected": -66.9110336303711, + "loss": 0.7198, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7446186542510986, + "rewards/margins": 3.8997154235839844, + "rewards/rejected": -1.1550966501235962, + "step": 2574 + }, + { + "epoch": 0.64, + "grad_norm": 2.6861672401428223, + "learning_rate": 6.099100399400586e-06, + "logits/chosen": -0.19678562879562378, + "logits/rejected": -0.32214921712875366, + "logps/chosen": -64.76042175292969, + "logps/rejected": -74.35960388183594, + "loss": 0.712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939415693283081, + "rewards/margins": 3.710533380508423, + "rewards/rejected": -0.7711181640625, + "step": 2575 + }, + { + "epoch": 0.64, + "grad_norm": 3.29819655418396, + "learning_rate": 6.096544586002983e-06, + "logits/chosen": -0.1889992654323578, + "logits/rejected": -0.36753618717193604, + "logps/chosen": -63.28816223144531, + "logps/rejected": -71.03672790527344, + "loss": 0.7023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4741737842559814, + "rewards/margins": 4.0793538093566895, + "rewards/rejected": -1.6051801443099976, + "step": 2576 + }, + { + "epoch": 0.64, + "grad_norm": 3.4569547176361084, + "learning_rate": 6.0939884715802275e-06, + "logits/chosen": -0.22678513824939728, + "logits/rejected": -0.23805275559425354, + "logps/chosen": -47.944435119628906, + "logps/rejected": -91.97036743164062, + "loss": 0.6941, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0310230255126953, + "rewards/margins": 4.676429748535156, + "rewards/rejected": -1.6454061269760132, + "step": 2577 + }, + { + "epoch": 0.64, + "grad_norm": 5.227230548858643, + "learning_rate": 6.091432056834033e-06, + "logits/chosen": -0.23108145594596863, + "logits/rejected": -0.3203011155128479, + "logps/chosen": -58.62926483154297, + "logps/rejected": -87.1308822631836, + "loss": 0.8233, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0540778636932373, + "rewards/margins": 3.8907017707824707, + "rewards/rejected": -0.8366237878799438, + "step": 2578 + }, + { + "epoch": 0.65, + "grad_norm": 4.658964157104492, + "learning_rate": 6.088875342466185e-06, + "logits/chosen": -0.22449913620948792, + "logits/rejected": -0.32881784439086914, + "logps/chosen": -57.96587371826172, + "logps/rejected": -86.12873840332031, + "loss": 0.8532, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8544399738311768, + "rewards/margins": 3.9129607677459717, + "rewards/rejected": -1.058520793914795, + "step": 2579 + }, + { + "epoch": 0.65, + "grad_norm": 8.069110870361328, + "learning_rate": 6.0863183291785626e-06, + "logits/chosen": -0.31043747067451477, + "logits/rejected": -0.3847716450691223, + "logps/chosen": -51.565155029296875, + "logps/rejected": -73.70573425292969, + "loss": 0.7988, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1198935508728027, + "rewards/margins": 4.135083198547363, + "rewards/rejected": -1.015189528465271, + "step": 2580 + }, + { + "epoch": 0.65, + "grad_norm": 6.040144920349121, + "learning_rate": 6.0837610176731155e-06, + "logits/chosen": -0.24565473198890686, + "logits/rejected": -0.38450729846954346, + "logps/chosen": -59.79881286621094, + "logps/rejected": -63.321197509765625, + "loss": 0.8811, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7507212162017822, + "rewards/margins": 3.569024085998535, + "rewards/rejected": -0.8183032870292664, + "step": 2581 + }, + { + "epoch": 0.65, + "grad_norm": 3.471097946166992, + "learning_rate": 6.081203408651884e-06, + "logits/chosen": -0.26848307251930237, + "logits/rejected": -0.3797609508037567, + "logps/chosen": -62.309383392333984, + "logps/rejected": -75.97416687011719, + "loss": 0.8564, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9719436168670654, + "rewards/margins": 4.568103790283203, + "rewards/rejected": -1.5961604118347168, + "step": 2582 + }, + { + "epoch": 0.65, + "grad_norm": 4.725557804107666, + "learning_rate": 6.078645502816985e-06, + "logits/chosen": -0.38195422291755676, + "logits/rejected": -0.42867544293403625, + "logps/chosen": -83.40007019042969, + "logps/rejected": -83.86563110351562, + "loss": 0.892, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.870347499847412, + "rewards/margins": 4.685753345489502, + "rewards/rejected": -1.815406084060669, + "step": 2583 + }, + { + "epoch": 0.65, + "grad_norm": 9.650558471679688, + "learning_rate": 6.076087300870622e-06, + "logits/chosen": -0.23137424886226654, + "logits/rejected": -0.26464200019836426, + "logps/chosen": -58.93777847290039, + "logps/rejected": -74.23051452636719, + "loss": 1.013, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6344075202941895, + "rewards/margins": 3.6964380741119385, + "rewards/rejected": -1.0620307922363281, + "step": 2584 + }, + { + "epoch": 0.65, + "grad_norm": 6.481294631958008, + "learning_rate": 6.073528803515076e-06, + "logits/chosen": -0.23270955681800842, + "logits/rejected": -0.22372424602508545, + "logps/chosen": -52.847259521484375, + "logps/rejected": -89.98001098632812, + "loss": 0.857, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.90106201171875, + "rewards/margins": 4.159103870391846, + "rewards/rejected": -1.2580418586730957, + "step": 2585 + }, + { + "epoch": 0.65, + "grad_norm": 5.870649337768555, + "learning_rate": 6.070970011452706e-06, + "logits/chosen": -0.21402306854724884, + "logits/rejected": -0.3487168550491333, + "logps/chosen": -68.13644409179688, + "logps/rejected": -78.03800201416016, + "loss": 0.8473, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.706174373626709, + "rewards/margins": 3.2863519191741943, + "rewards/rejected": -0.5801773071289062, + "step": 2586 + }, + { + "epoch": 0.65, + "grad_norm": 6.149914741516113, + "learning_rate": 6.068410925385958e-06, + "logits/chosen": -0.2390850931406021, + "logits/rejected": -0.39301055669784546, + "logps/chosen": -68.70006561279297, + "logps/rejected": -78.96680450439453, + "loss": 0.7901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.902555465698242, + "rewards/margins": 4.519831657409668, + "rewards/rejected": -1.6172764301300049, + "step": 2587 + }, + { + "epoch": 0.65, + "grad_norm": 4.195746421813965, + "learning_rate": 6.065851546017357e-06, + "logits/chosen": -0.232837975025177, + "logits/rejected": -0.290401816368103, + "logps/chosen": -64.38036346435547, + "logps/rejected": -87.5641860961914, + "loss": 0.7851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.800118923187256, + "rewards/margins": 3.813490867614746, + "rewards/rejected": -1.0133723020553589, + "step": 2588 + }, + { + "epoch": 0.65, + "grad_norm": 6.4736175537109375, + "learning_rate": 6.063291874049507e-06, + "logits/chosen": -0.27591654658317566, + "logits/rejected": -0.35187193751335144, + "logps/chosen": -52.90541076660156, + "logps/rejected": -65.51947021484375, + "loss": 1.01, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8028931617736816, + "rewards/margins": 2.9096131324768066, + "rewards/rejected": -0.10672017931938171, + "step": 2589 + }, + { + "epoch": 0.65, + "grad_norm": 4.88045072555542, + "learning_rate": 6.0607319101850955e-06, + "logits/chosen": -0.22430630028247833, + "logits/rejected": -0.3685073256492615, + "logps/chosen": -59.24736785888672, + "logps/rejected": -71.0860824584961, + "loss": 0.7623, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2936105728149414, + "rewards/margins": 4.282142639160156, + "rewards/rejected": -0.9885317087173462, + "step": 2590 + }, + { + "epoch": 0.65, + "grad_norm": 6.081259727478027, + "learning_rate": 6.058171655126884e-06, + "logits/chosen": -0.2504575550556183, + "logits/rejected": -0.34220272302627563, + "logps/chosen": -46.618900299072266, + "logps/rejected": -71.79879760742188, + "loss": 0.6862, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.949695110321045, + "rewards/margins": 4.375556945800781, + "rewards/rejected": -1.4258619546890259, + "step": 2591 + }, + { + "epoch": 0.65, + "grad_norm": 4.344744682312012, + "learning_rate": 6.055611109577722e-06, + "logits/chosen": -0.2865486443042755, + "logits/rejected": -0.29102692008018494, + "logps/chosen": -53.14684295654297, + "logps/rejected": -89.79661560058594, + "loss": 0.8475, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0061938762664795, + "rewards/margins": 3.0553393363952637, + "rewards/rejected": -0.04914574325084686, + "step": 2592 + }, + { + "epoch": 0.65, + "grad_norm": 4.982793807983398, + "learning_rate": 6.053050274240528e-06, + "logits/chosen": -0.1941547691822052, + "logits/rejected": -0.3278743326663971, + "logps/chosen": -60.44442367553711, + "logps/rejected": -68.65272521972656, + "loss": 0.7651, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9591176509857178, + "rewards/margins": 3.2179040908813477, + "rewards/rejected": -0.25878646969795227, + "step": 2593 + }, + { + "epoch": 0.65, + "grad_norm": 4.274339199066162, + "learning_rate": 6.050489149818314e-06, + "logits/chosen": -0.17773950099945068, + "logits/rejected": -0.28102850914001465, + "logps/chosen": -61.32072448730469, + "logps/rejected": -76.10165405273438, + "loss": 0.8302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8945350646972656, + "rewards/margins": 3.8521037101745605, + "rewards/rejected": -0.9575686454772949, + "step": 2594 + }, + { + "epoch": 0.65, + "grad_norm": 6.2907280921936035, + "learning_rate": 6.04792773701416e-06, + "logits/chosen": -0.3175300061702728, + "logits/rejected": -0.4271746277809143, + "logps/chosen": -52.67941665649414, + "logps/rejected": -67.12039184570312, + "loss": 0.9113, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8389556407928467, + "rewards/margins": 3.2787909507751465, + "rewards/rejected": -0.4398356080055237, + "step": 2595 + }, + { + "epoch": 0.65, + "grad_norm": 2.2539613246917725, + "learning_rate": 6.045366036531229e-06, + "logits/chosen": -0.26876547932624817, + "logits/rejected": -0.3949669599533081, + "logps/chosen": -56.54308319091797, + "logps/rejected": -72.50090026855469, + "loss": 0.7122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.971344232559204, + "rewards/margins": 4.565530776977539, + "rewards/rejected": -1.594186782836914, + "step": 2596 + }, + { + "epoch": 0.65, + "grad_norm": 7.104804515838623, + "learning_rate": 6.042804049072763e-06, + "logits/chosen": -0.15927112102508545, + "logits/rejected": -0.2917381823062897, + "logps/chosen": -59.210113525390625, + "logps/rejected": -80.02955627441406, + "loss": 0.8203, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.886261463165283, + "rewards/margins": 4.345773220062256, + "rewards/rejected": -1.459511399269104, + "step": 2597 + }, + { + "epoch": 0.65, + "grad_norm": 6.961991786956787, + "learning_rate": 6.040241775342086e-06, + "logits/chosen": -0.19216029345989227, + "logits/rejected": -0.20207835733890533, + "logps/chosen": -65.18778228759766, + "logps/rejected": -81.42608642578125, + "loss": 0.9692, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7254838943481445, + "rewards/margins": 3.4683051109313965, + "rewards/rejected": -0.7428210973739624, + "step": 2598 + }, + { + "epoch": 0.65, + "grad_norm": 5.201050281524658, + "learning_rate": 6.037679216042592e-06, + "logits/chosen": -0.29051876068115234, + "logits/rejected": -0.40923815965652466, + "logps/chosen": -56.28729248046875, + "logps/rejected": -76.64763641357422, + "loss": 0.8579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6760923862457275, + "rewards/margins": 3.6094751358032227, + "rewards/rejected": -0.9333828687667847, + "step": 2599 + }, + { + "epoch": 0.65, + "grad_norm": 10.40021800994873, + "learning_rate": 6.035116371877766e-06, + "logits/chosen": -0.2698400318622589, + "logits/rejected": -0.35048893094062805, + "logps/chosen": -46.74573516845703, + "logps/rejected": -76.9150390625, + "loss": 0.8125, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0185744762420654, + "rewards/margins": 4.700194835662842, + "rewards/rejected": -1.6816202402114868, + "step": 2600 + }, + { + "epoch": 0.65, + "grad_norm": 6.170834541320801, + "learning_rate": 6.032553243551155e-06, + "logits/chosen": -0.3034352660179138, + "logits/rejected": -0.4024248719215393, + "logps/chosen": -63.69032287597656, + "logps/rejected": -81.21875, + "loss": 0.8444, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8707306385040283, + "rewards/margins": 4.174290657043457, + "rewards/rejected": -1.3035600185394287, + "step": 2601 + }, + { + "epoch": 0.65, + "grad_norm": 5.727986812591553, + "learning_rate": 6.029989831766402e-06, + "logits/chosen": -0.23025520145893097, + "logits/rejected": -0.2881200313568115, + "logps/chosen": -54.799560546875, + "logps/rejected": -76.5248031616211, + "loss": 0.872, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8706729412078857, + "rewards/margins": 3.7525107860565186, + "rewards/rejected": -0.881838321685791, + "step": 2602 + }, + { + "epoch": 0.65, + "grad_norm": 6.618283748626709, + "learning_rate": 6.027426137227213e-06, + "logits/chosen": -0.34828421473503113, + "logits/rejected": -0.4672386944293976, + "logps/chosen": -53.04855728149414, + "logps/rejected": -76.02262878417969, + "loss": 0.8106, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.989741325378418, + "rewards/margins": 4.176693916320801, + "rewards/rejected": -1.186952829360962, + "step": 2603 + }, + { + "epoch": 0.65, + "grad_norm": 3.107074022293091, + "learning_rate": 6.024862160637379e-06, + "logits/chosen": -0.2076360136270523, + "logits/rejected": -0.2734302878379822, + "logps/chosen": -66.78511047363281, + "logps/rejected": -87.34577178955078, + "loss": 0.7456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.946521282196045, + "rewards/margins": 4.1917643547058105, + "rewards/rejected": -1.2452430725097656, + "step": 2604 + }, + { + "epoch": 0.65, + "grad_norm": 5.0452961921691895, + "learning_rate": 6.0222979027007674e-06, + "logits/chosen": -0.31008127331733704, + "logits/rejected": -0.40800008177757263, + "logps/chosen": -50.375579833984375, + "logps/rejected": -82.75563049316406, + "loss": 0.7575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.933305025100708, + "rewards/margins": 5.1439337730407715, + "rewards/rejected": -2.2106289863586426, + "step": 2605 + }, + { + "epoch": 0.65, + "grad_norm": 3.049812078475952, + "learning_rate": 6.019733364121322e-06, + "logits/chosen": -0.28460967540740967, + "logits/rejected": -0.38967782258987427, + "logps/chosen": -62.300079345703125, + "logps/rejected": -83.57774353027344, + "loss": 0.7777, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9051077365875244, + "rewards/margins": 4.582918167114258, + "rewards/rejected": -1.6778103113174438, + "step": 2606 + }, + { + "epoch": 0.65, + "grad_norm": 9.430989265441895, + "learning_rate": 6.017168545603064e-06, + "logits/chosen": -0.261846125125885, + "logits/rejected": -0.3188549280166626, + "logps/chosen": -58.1511116027832, + "logps/rejected": -85.79058074951172, + "loss": 0.877, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.815713882446289, + "rewards/margins": 3.4929592609405518, + "rewards/rejected": -0.6772456169128418, + "step": 2607 + }, + { + "epoch": 0.65, + "grad_norm": 3.5333378314971924, + "learning_rate": 6.014603447850091e-06, + "logits/chosen": -0.25484904646873474, + "logits/rejected": -0.3143368363380432, + "logps/chosen": -50.24706268310547, + "logps/rejected": -103.62349700927734, + "loss": 0.7882, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.795074939727783, + "rewards/margins": 4.2558088302612305, + "rewards/rejected": -1.460734248161316, + "step": 2608 + }, + { + "epoch": 0.65, + "grad_norm": 5.514252185821533, + "learning_rate": 6.012038071566579e-06, + "logits/chosen": -0.2357824742794037, + "logits/rejected": -0.40401318669319153, + "logps/chosen": -59.27204132080078, + "logps/rejected": -65.06719970703125, + "loss": 0.8352, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6990158557891846, + "rewards/margins": 4.954904556274414, + "rewards/rejected": -2.2558884620666504, + "step": 2609 + }, + { + "epoch": 0.65, + "grad_norm": 3.9561288356781006, + "learning_rate": 6.009472417456779e-06, + "logits/chosen": -0.30032145977020264, + "logits/rejected": -0.3332715630531311, + "logps/chosen": -53.46410369873047, + "logps/rejected": -100.46156311035156, + "loss": 0.7113, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.95174241065979, + "rewards/margins": 5.2893595695495605, + "rewards/rejected": -2.3376169204711914, + "step": 2610 + }, + { + "epoch": 0.65, + "grad_norm": 3.8236210346221924, + "learning_rate": 6.006906486225016e-06, + "logits/chosen": -0.2877217233181, + "logits/rejected": -0.43414461612701416, + "logps/chosen": -72.16753387451172, + "logps/rejected": -75.61458587646484, + "loss": 0.7591, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8899717330932617, + "rewards/margins": 4.906352996826172, + "rewards/rejected": -2.0163822174072266, + "step": 2611 + }, + { + "epoch": 0.65, + "grad_norm": 5.140329360961914, + "learning_rate": 6.004340278575695e-06, + "logits/chosen": -0.31627991795539856, + "logits/rejected": -0.39769044518470764, + "logps/chosen": -51.56874084472656, + "logps/rejected": -83.81553649902344, + "loss": 0.7747, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.792886734008789, + "rewards/margins": 4.8023786544799805, + "rewards/rejected": -2.0094921588897705, + "step": 2612 + }, + { + "epoch": 0.65, + "grad_norm": 4.320090293884277, + "learning_rate": 6.001773795213297e-06, + "logits/chosen": -0.24445778131484985, + "logits/rejected": -0.39698728919029236, + "logps/chosen": -60.32893753051758, + "logps/rejected": -76.46537780761719, + "loss": 0.7954, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0988974571228027, + "rewards/margins": 4.369699478149414, + "rewards/rejected": -1.27080237865448, + "step": 2613 + }, + { + "epoch": 0.65, + "grad_norm": 3.6449124813079834, + "learning_rate": 5.999207036842376e-06, + "logits/chosen": -0.19974760711193085, + "logits/rejected": -0.3210347890853882, + "logps/chosen": -58.22532272338867, + "logps/rejected": -79.38725280761719, + "loss": 0.6911, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9938488006591797, + "rewards/margins": 4.192319869995117, + "rewards/rejected": -1.1984705924987793, + "step": 2614 + }, + { + "epoch": 0.65, + "grad_norm": 5.256712436676025, + "learning_rate": 5.996640004167562e-06, + "logits/chosen": -0.2687053978443146, + "logits/rejected": -0.4105020761489868, + "logps/chosen": -52.97184371948242, + "logps/rejected": -81.30855560302734, + "loss": 0.7566, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.737727642059326, + "rewards/margins": 3.6579108238220215, + "rewards/rejected": -0.9201831221580505, + "step": 2615 + }, + { + "epoch": 0.65, + "grad_norm": 5.621128559112549, + "learning_rate": 5.994072697893559e-06, + "logits/chosen": -0.2777644991874695, + "logits/rejected": -0.44379329681396484, + "logps/chosen": -62.67521286010742, + "logps/rejected": -76.53607940673828, + "loss": 0.8756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.886580228805542, + "rewards/margins": 4.211307048797607, + "rewards/rejected": -1.3247270584106445, + "step": 2616 + }, + { + "epoch": 0.65, + "grad_norm": 7.770283222198486, + "learning_rate": 5.991505118725152e-06, + "logits/chosen": -0.22549663484096527, + "logits/rejected": -0.33137914538383484, + "logps/chosen": -66.6775894165039, + "logps/rejected": -71.9771499633789, + "loss": 0.9917, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.635141611099243, + "rewards/margins": 3.9074366092681885, + "rewards/rejected": -1.2722952365875244, + "step": 2617 + }, + { + "epoch": 0.65, + "grad_norm": 5.018031597137451, + "learning_rate": 5.988937267367194e-06, + "logits/chosen": -0.3485340178012848, + "logits/rejected": -0.34661081433296204, + "logps/chosen": -45.309814453125, + "logps/rejected": -87.53254699707031, + "loss": 0.7703, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.944864273071289, + "rewards/margins": 4.248068809509277, + "rewards/rejected": -1.3032046556472778, + "step": 2618 + }, + { + "epoch": 0.66, + "grad_norm": 5.701935768127441, + "learning_rate": 5.9863691445246165e-06, + "logits/chosen": -0.31308549642562866, + "logits/rejected": -0.4019659459590912, + "logps/chosen": -55.00234603881836, + "logps/rejected": -75.52407836914062, + "loss": 0.8257, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.887075185775757, + "rewards/margins": 3.445072889328003, + "rewards/rejected": -0.5579978823661804, + "step": 2619 + }, + { + "epoch": 0.66, + "grad_norm": 4.701308727264404, + "learning_rate": 5.983800750902425e-06, + "logits/chosen": -0.2211841642856598, + "logits/rejected": -0.209224134683609, + "logps/chosen": -52.5363655090332, + "logps/rejected": -91.54376983642578, + "loss": 0.7608, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9351556301116943, + "rewards/margins": 2.996323347091675, + "rewards/rejected": -0.06116768717765808, + "step": 2620 + }, + { + "epoch": 0.66, + "grad_norm": 5.303420066833496, + "learning_rate": 5.9812320872057e-06, + "logits/chosen": -0.2024436891078949, + "logits/rejected": -0.3342861235141754, + "logps/chosen": -67.38866424560547, + "logps/rejected": -79.641357421875, + "loss": 0.7641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.75329852104187, + "rewards/margins": 4.341157913208008, + "rewards/rejected": -1.5878592729568481, + "step": 2621 + }, + { + "epoch": 0.66, + "grad_norm": 12.740730285644531, + "learning_rate": 5.978663154139591e-06, + "logits/chosen": -0.28832247853279114, + "logits/rejected": -0.40775609016418457, + "logps/chosen": -56.98994445800781, + "logps/rejected": -75.69842529296875, + "loss": 0.856, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6077568531036377, + "rewards/margins": 3.460174560546875, + "rewards/rejected": -0.8524178862571716, + "step": 2622 + }, + { + "epoch": 0.66, + "grad_norm": 4.855856418609619, + "learning_rate": 5.976093952409331e-06, + "logits/chosen": -0.3072342276573181, + "logits/rejected": -0.38045650720596313, + "logps/chosen": -62.315486907958984, + "logps/rejected": -81.38423919677734, + "loss": 0.8632, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.558271646499634, + "rewards/margins": 3.5642595291137695, + "rewards/rejected": -1.0059878826141357, + "step": 2623 + }, + { + "epoch": 0.66, + "grad_norm": 4.86749267578125, + "learning_rate": 5.973524482720216e-06, + "logits/chosen": -0.3367909789085388, + "logits/rejected": -0.4653750956058502, + "logps/chosen": -60.174705505371094, + "logps/rejected": -61.058902740478516, + "loss": 0.797, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8536837100982666, + "rewards/margins": 3.75716495513916, + "rewards/rejected": -0.9034810066223145, + "step": 2624 + }, + { + "epoch": 0.66, + "grad_norm": 5.481653690338135, + "learning_rate": 5.970954745777626e-06, + "logits/chosen": -0.2137438803911209, + "logits/rejected": -0.28249526023864746, + "logps/chosen": -59.656742095947266, + "logps/rejected": -85.88082122802734, + "loss": 0.8303, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.681293249130249, + "rewards/margins": 3.3917102813720703, + "rewards/rejected": -0.7104169130325317, + "step": 2625 + }, + { + "epoch": 0.66, + "grad_norm": 13.794976234436035, + "learning_rate": 5.9683847422870055e-06, + "logits/chosen": -0.3408610224723816, + "logits/rejected": -0.39771631360054016, + "logps/chosen": -61.04291915893555, + "logps/rejected": -76.95500183105469, + "loss": 0.9138, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.736131191253662, + "rewards/margins": 3.5276479721069336, + "rewards/rejected": -0.7915171384811401, + "step": 2626 + }, + { + "epoch": 0.66, + "grad_norm": 4.462400913238525, + "learning_rate": 5.965814472953877e-06, + "logits/chosen": -0.23399850726127625, + "logits/rejected": -0.34688499569892883, + "logps/chosen": -62.84617233276367, + "logps/rejected": -73.92051696777344, + "loss": 0.8847, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.003019094467163, + "rewards/margins": 3.5595571994781494, + "rewards/rejected": -0.5565377473831177, + "step": 2627 + }, + { + "epoch": 0.66, + "grad_norm": 5.881984233856201, + "learning_rate": 5.963243938483834e-06, + "logits/chosen": -0.18864910304546356, + "logits/rejected": -0.3059803545475006, + "logps/chosen": -56.71966552734375, + "logps/rejected": -72.95004272460938, + "loss": 0.954, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.021406412124634, + "rewards/margins": 4.25866174697876, + "rewards/rejected": -1.237255334854126, + "step": 2628 + }, + { + "epoch": 0.66, + "grad_norm": 4.961138725280762, + "learning_rate": 5.9606731395825465e-06, + "logits/chosen": -0.2003607600927353, + "logits/rejected": -0.24538812041282654, + "logps/chosen": -61.32056427001953, + "logps/rejected": -84.9591064453125, + "loss": 0.8769, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8445048332214355, + "rewards/margins": 2.960472345352173, + "rewards/rejected": -0.11596761643886566, + "step": 2629 + }, + { + "epoch": 0.66, + "grad_norm": 2.728152275085449, + "learning_rate": 5.958102076955753e-06, + "logits/chosen": -0.3291171193122864, + "logits/rejected": -0.43185895681381226, + "logps/chosen": -52.43351745605469, + "logps/rejected": -74.32630920410156, + "loss": 0.732, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.961134433746338, + "rewards/margins": 5.205204486846924, + "rewards/rejected": -2.244070291519165, + "step": 2630 + }, + { + "epoch": 0.66, + "grad_norm": 25.098281860351562, + "learning_rate": 5.955530751309264e-06, + "logits/chosen": -0.23926487565040588, + "logits/rejected": -0.33240342140197754, + "logps/chosen": -75.9124755859375, + "logps/rejected": -92.47148132324219, + "loss": 0.8601, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8718020915985107, + "rewards/margins": 4.306736946105957, + "rewards/rejected": -1.4349347352981567, + "step": 2631 + }, + { + "epoch": 0.66, + "grad_norm": 3.167421340942383, + "learning_rate": 5.952959163348965e-06, + "logits/chosen": -0.3160240948200226, + "logits/rejected": -0.45705685019493103, + "logps/chosen": -52.88304901123047, + "logps/rejected": -77.43553161621094, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.976400136947632, + "rewards/margins": 5.414956092834473, + "rewards/rejected": -2.4385557174682617, + "step": 2632 + }, + { + "epoch": 0.66, + "grad_norm": 5.7157979011535645, + "learning_rate": 5.950387313780812e-06, + "logits/chosen": -0.25834032893180847, + "logits/rejected": -0.3702907860279083, + "logps/chosen": -54.43462371826172, + "logps/rejected": -80.22996520996094, + "loss": 0.8741, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7351009845733643, + "rewards/margins": 3.63423228263855, + "rewards/rejected": -0.8991311192512512, + "step": 2633 + }, + { + "epoch": 0.66, + "grad_norm": 3.335139751434326, + "learning_rate": 5.947815203310834e-06, + "logits/chosen": -0.23810550570487976, + "logits/rejected": -0.37511396408081055, + "logps/chosen": -58.818328857421875, + "logps/rejected": -70.22655487060547, + "loss": 0.7654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8884854316711426, + "rewards/margins": 4.44864559173584, + "rewards/rejected": -1.5601601600646973, + "step": 2634 + }, + { + "epoch": 0.66, + "grad_norm": 6.25617790222168, + "learning_rate": 5.945242832645133e-06, + "logits/chosen": -0.32220616936683655, + "logits/rejected": -0.4253625273704529, + "logps/chosen": -53.40373611450195, + "logps/rejected": -82.65352630615234, + "loss": 0.8609, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.550734758377075, + "rewards/margins": 4.658227920532227, + "rewards/rejected": -2.107492685317993, + "step": 2635 + }, + { + "epoch": 0.66, + "grad_norm": 5.264156818389893, + "learning_rate": 5.942670202489874e-06, + "logits/chosen": -0.18384218215942383, + "logits/rejected": -0.2904718220233917, + "logps/chosen": -68.29400634765625, + "logps/rejected": -84.76643371582031, + "loss": 0.7834, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.97479248046875, + "rewards/margins": 4.82545280456543, + "rewards/rejected": -1.8506603240966797, + "step": 2636 + }, + { + "epoch": 0.66, + "grad_norm": 6.10153865814209, + "learning_rate": 5.940097313551305e-06, + "logits/chosen": -0.28684693574905396, + "logits/rejected": -0.3388480544090271, + "logps/chosen": -51.07239532470703, + "logps/rejected": -77.06668090820312, + "loss": 0.7708, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.869431257247925, + "rewards/margins": 3.05904221534729, + "rewards/rejected": -0.18961083889007568, + "step": 2637 + }, + { + "epoch": 0.66, + "grad_norm": 3.960679292678833, + "learning_rate": 5.937524166535737e-06, + "logits/chosen": -0.26360970735549927, + "logits/rejected": -0.38146865367889404, + "logps/chosen": -59.32931137084961, + "logps/rejected": -75.83467102050781, + "loss": 0.7983, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.728214740753174, + "rewards/margins": 4.55014181137085, + "rewards/rejected": -1.8219268321990967, + "step": 2638 + }, + { + "epoch": 0.66, + "grad_norm": 2.882956027984619, + "learning_rate": 5.934950762149555e-06, + "logits/chosen": -0.32771870493888855, + "logits/rejected": -0.42562636733055115, + "logps/chosen": -49.22129821777344, + "logps/rejected": -79.81380462646484, + "loss": 0.602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1608641147613525, + "rewards/margins": 4.600874423980713, + "rewards/rejected": -1.44001042842865, + "step": 2639 + }, + { + "epoch": 0.66, + "grad_norm": 9.542099952697754, + "learning_rate": 5.932377101099215e-06, + "logits/chosen": -0.33457425236701965, + "logits/rejected": -0.4623042345046997, + "logps/chosen": -55.15964126586914, + "logps/rejected": -60.382389068603516, + "loss": 0.8146, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.780722141265869, + "rewards/margins": 3.685231924057007, + "rewards/rejected": -0.9045097231864929, + "step": 2640 + }, + { + "epoch": 0.66, + "grad_norm": 9.657191276550293, + "learning_rate": 5.929803184091242e-06, + "logits/chosen": -0.31603938341140747, + "logits/rejected": -0.38168683648109436, + "logps/chosen": -60.0321044921875, + "logps/rejected": -68.24407196044922, + "loss": 0.9062, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.970771312713623, + "rewards/margins": 3.0793163776397705, + "rewards/rejected": -0.10854494571685791, + "step": 2641 + }, + { + "epoch": 0.66, + "grad_norm": 3.840500593185425, + "learning_rate": 5.9272290118322294e-06, + "logits/chosen": -0.2683895230293274, + "logits/rejected": -0.30461621284484863, + "logps/chosen": -54.318077087402344, + "logps/rejected": -81.44927978515625, + "loss": 0.7465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8249764442443848, + "rewards/margins": 3.443542003631592, + "rewards/rejected": -0.6185652613639832, + "step": 2642 + }, + { + "epoch": 0.66, + "grad_norm": 10.05594539642334, + "learning_rate": 5.924654585028846e-06, + "logits/chosen": -0.30417749285697937, + "logits/rejected": -0.38410264253616333, + "logps/chosen": -64.21858215332031, + "logps/rejected": -94.46783447265625, + "loss": 0.909, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.623505115509033, + "rewards/margins": 4.110620975494385, + "rewards/rejected": -1.4871158599853516, + "step": 2643 + }, + { + "epoch": 0.66, + "grad_norm": 5.583017349243164, + "learning_rate": 5.922079904387826e-06, + "logits/chosen": -0.21030208468437195, + "logits/rejected": -0.3406023383140564, + "logps/chosen": -62.3822135925293, + "logps/rejected": -74.72261810302734, + "loss": 0.8696, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6530559062957764, + "rewards/margins": 3.8135650157928467, + "rewards/rejected": -1.1605089902877808, + "step": 2644 + }, + { + "epoch": 0.66, + "grad_norm": 7.896036148071289, + "learning_rate": 5.919504970615978e-06, + "logits/chosen": -0.2564079761505127, + "logits/rejected": -0.2926669418811798, + "logps/chosen": -55.2130126953125, + "logps/rejected": -86.73928833007812, + "loss": 0.9746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.505836248397827, + "rewards/margins": 3.562971830368042, + "rewards/rejected": -1.0571355819702148, + "step": 2645 + }, + { + "epoch": 0.66, + "grad_norm": 4.749951362609863, + "learning_rate": 5.916929784420171e-06, + "logits/chosen": -0.29166319966316223, + "logits/rejected": -0.4428222179412842, + "logps/chosen": -54.381675720214844, + "logps/rejected": -73.46941375732422, + "loss": 0.7705, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0007052421569824, + "rewards/margins": 4.351312637329102, + "rewards/rejected": -1.3506076335906982, + "step": 2646 + }, + { + "epoch": 0.66, + "grad_norm": 4.480696201324463, + "learning_rate": 5.914354346507355e-06, + "logits/chosen": -0.2867961823940277, + "logits/rejected": -0.3933469355106354, + "logps/chosen": -54.461002349853516, + "logps/rejected": -72.27994537353516, + "loss": 0.7509, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7389883995056152, + "rewards/margins": 4.502493381500244, + "rewards/rejected": -1.763505458831787, + "step": 2647 + }, + { + "epoch": 0.66, + "grad_norm": 5.1556925773620605, + "learning_rate": 5.911778657584538e-06, + "logits/chosen": -0.30468568205833435, + "logits/rejected": -0.40246614813804626, + "logps/chosen": -64.26347351074219, + "logps/rejected": -81.42813873291016, + "loss": 0.9063, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.621208429336548, + "rewards/margins": 3.5592691898345947, + "rewards/rejected": -0.9380608797073364, + "step": 2648 + }, + { + "epoch": 0.66, + "grad_norm": 6.45966911315918, + "learning_rate": 5.909202718358807e-06, + "logits/chosen": -0.2462451457977295, + "logits/rejected": -0.3565473258495331, + "logps/chosen": -65.40435791015625, + "logps/rejected": -74.01016235351562, + "loss": 0.8089, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.886092185974121, + "rewards/margins": 3.610712766647339, + "rewards/rejected": -0.7246206998825073, + "step": 2649 + }, + { + "epoch": 0.66, + "grad_norm": 5.13409948348999, + "learning_rate": 5.906626529537311e-06, + "logits/chosen": -0.19024600088596344, + "logits/rejected": -0.28591668605804443, + "logps/chosen": -71.63763427734375, + "logps/rejected": -76.85001373291016, + "loss": 0.8258, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5966298580169678, + "rewards/margins": 3.8625128269195557, + "rewards/rejected": -1.265883445739746, + "step": 2650 + }, + { + "epoch": 0.66, + "grad_norm": 5.119041442871094, + "learning_rate": 5.904050091827268e-06, + "logits/chosen": -0.28250864148139954, + "logits/rejected": -0.38301995396614075, + "logps/chosen": -57.690513610839844, + "logps/rejected": -71.47825622558594, + "loss": 0.8228, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.691082000732422, + "rewards/margins": 4.276388168334961, + "rewards/rejected": -1.585306167602539, + "step": 2651 + }, + { + "epoch": 0.66, + "grad_norm": 5.093367099761963, + "learning_rate": 5.9014734059359665e-06, + "logits/chosen": -0.1805417835712433, + "logits/rejected": -0.21793633699417114, + "logps/chosen": -58.31645965576172, + "logps/rejected": -103.62594604492188, + "loss": 0.8307, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9277286529541016, + "rewards/margins": 4.005168914794922, + "rewards/rejected": -1.0774402618408203, + "step": 2652 + }, + { + "epoch": 0.66, + "grad_norm": 4.993836879730225, + "learning_rate": 5.898896472570763e-06, + "logits/chosen": -0.27889856696128845, + "logits/rejected": -0.39961177110671997, + "logps/chosen": -53.0211181640625, + "logps/rejected": -73.55310821533203, + "loss": 0.7572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.796808958053589, + "rewards/margins": 4.026041030883789, + "rewards/rejected": -1.2292314767837524, + "step": 2653 + }, + { + "epoch": 0.66, + "grad_norm": 11.108967781066895, + "learning_rate": 5.8963192924390814e-06, + "logits/chosen": -0.32827091217041016, + "logits/rejected": -0.4468502700328827, + "logps/chosen": -52.4607048034668, + "logps/rejected": -82.23958587646484, + "loss": 0.7061, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8611690998077393, + "rewards/margins": 5.1570048332214355, + "rewards/rejected": -2.2958362102508545, + "step": 2654 + }, + { + "epoch": 0.66, + "grad_norm": 9.42678451538086, + "learning_rate": 5.893741866248414e-06, + "logits/chosen": -0.2729994058609009, + "logits/rejected": -0.32973620295524597, + "logps/chosen": -52.777679443359375, + "logps/rejected": -75.17276000976562, + "loss": 0.7653, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.628230094909668, + "rewards/margins": 4.015626430511475, + "rewards/rejected": -1.3873968124389648, + "step": 2655 + }, + { + "epoch": 0.66, + "grad_norm": 17.17561912536621, + "learning_rate": 5.891164194706318e-06, + "logits/chosen": -0.2704141438007355, + "logits/rejected": -0.3634262979030609, + "logps/chosen": -61.552303314208984, + "logps/rejected": -89.73262023925781, + "loss": 0.9075, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3488504886627197, + "rewards/margins": 4.405829906463623, + "rewards/rejected": -2.0569794178009033, + "step": 2656 + }, + { + "epoch": 0.66, + "grad_norm": 5.2903947830200195, + "learning_rate": 5.888586278520421e-06, + "logits/chosen": -0.20314233005046844, + "logits/rejected": -0.2783381938934326, + "logps/chosen": -57.54365158081055, + "logps/rejected": -83.87841033935547, + "loss": 0.917, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5509634017944336, + "rewards/margins": 3.3829119205474854, + "rewards/rejected": -0.8319483995437622, + "step": 2657 + }, + { + "epoch": 0.66, + "grad_norm": 12.881206512451172, + "learning_rate": 5.8860081183984156e-06, + "logits/chosen": -0.2802942097187042, + "logits/rejected": -0.3988359570503235, + "logps/chosen": -61.25290298461914, + "logps/rejected": -74.58283996582031, + "loss": 0.9573, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5058584213256836, + "rewards/margins": 4.180344581604004, + "rewards/rejected": -1.6744863986968994, + "step": 2658 + }, + { + "epoch": 0.67, + "grad_norm": 6.646353721618652, + "learning_rate": 5.883429715048064e-06, + "logits/chosen": -0.2570275664329529, + "logits/rejected": -0.34712228178977966, + "logps/chosen": -70.56774139404297, + "logps/rejected": -88.57742309570312, + "loss": 0.9219, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.711526393890381, + "rewards/margins": 3.877286672592163, + "rewards/rejected": -1.1657601594924927, + "step": 2659 + }, + { + "epoch": 0.67, + "grad_norm": 6.1358256340026855, + "learning_rate": 5.880851069177193e-06, + "logits/chosen": -0.2503822147846222, + "logits/rejected": -0.2504452168941498, + "logps/chosen": -53.145748138427734, + "logps/rejected": -98.13395690917969, + "loss": 0.7128, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.91510009765625, + "rewards/margins": 5.006555080413818, + "rewards/rejected": -2.0914552211761475, + "step": 2660 + }, + { + "epoch": 0.67, + "grad_norm": 15.308517456054688, + "learning_rate": 5.878272181493694e-06, + "logits/chosen": -0.2827919125556946, + "logits/rejected": -0.3074829578399658, + "logps/chosen": -56.63972854614258, + "logps/rejected": -76.16740417480469, + "loss": 1.0267, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.655078172683716, + "rewards/margins": 3.010504722595215, + "rewards/rejected": -0.3554263114929199, + "step": 2661 + }, + { + "epoch": 0.67, + "grad_norm": 3.246549129486084, + "learning_rate": 5.875693052705532e-06, + "logits/chosen": -0.32125914096832275, + "logits/rejected": -0.4082317352294922, + "logps/chosen": -52.06718444824219, + "logps/rejected": -75.67676544189453, + "loss": 0.7404, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8960235118865967, + "rewards/margins": 4.220019340515137, + "rewards/rejected": -1.3239961862564087, + "step": 2662 + }, + { + "epoch": 0.67, + "grad_norm": 19.69570541381836, + "learning_rate": 5.87311368352073e-06, + "logits/chosen": -0.2664299011230469, + "logits/rejected": -0.3740397095680237, + "logps/chosen": -54.34031295776367, + "logps/rejected": -75.48566436767578, + "loss": 0.8552, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6702470779418945, + "rewards/margins": 4.044428825378418, + "rewards/rejected": -1.374182105064392, + "step": 2663 + }, + { + "epoch": 0.67, + "grad_norm": 6.973392486572266, + "learning_rate": 5.870534074647382e-06, + "logits/chosen": -0.22660864889621735, + "logits/rejected": -0.3305951952934265, + "logps/chosen": -65.22389221191406, + "logps/rejected": -84.00736999511719, + "loss": 0.95, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8049941062927246, + "rewards/margins": 4.220174312591553, + "rewards/rejected": -1.4151798486709595, + "step": 2664 + }, + { + "epoch": 0.67, + "grad_norm": 9.960790634155273, + "learning_rate": 5.867954226793646e-06, + "logits/chosen": -0.3011628985404968, + "logits/rejected": -0.382500559091568, + "logps/chosen": -50.99094772338867, + "logps/rejected": -64.89025115966797, + "loss": 1.007, + "rewards/accuracies": 0.65625, + "rewards/chosen": 3.003718614578247, + "rewards/margins": 3.5990757942199707, + "rewards/rejected": -0.5953569412231445, + "step": 2665 + }, + { + "epoch": 0.67, + "grad_norm": 4.839418888092041, + "learning_rate": 5.865374140667745e-06, + "logits/chosen": -0.15555429458618164, + "logits/rejected": -0.2820635437965393, + "logps/chosen": -69.17688751220703, + "logps/rejected": -77.93329620361328, + "loss": 0.7702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.750476598739624, + "rewards/margins": 4.036717414855957, + "rewards/rejected": -1.2862409353256226, + "step": 2666 + }, + { + "epoch": 0.67, + "grad_norm": 4.284943580627441, + "learning_rate": 5.8627938169779694e-06, + "logits/chosen": -0.2633183002471924, + "logits/rejected": -0.3633948862552643, + "logps/chosen": -60.12696838378906, + "logps/rejected": -86.89810943603516, + "loss": 0.8599, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.854555606842041, + "rewards/margins": 4.316365718841553, + "rewards/rejected": -1.4618096351623535, + "step": 2667 + }, + { + "epoch": 0.67, + "grad_norm": 4.918830394744873, + "learning_rate": 5.860213256432674e-06, + "logits/chosen": -0.22207331657409668, + "logits/rejected": -0.3573134243488312, + "logps/chosen": -67.64903259277344, + "logps/rejected": -95.77155303955078, + "loss": 0.791, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.944260358810425, + "rewards/margins": 4.884308815002441, + "rewards/rejected": -1.9400488138198853, + "step": 2668 + }, + { + "epoch": 0.67, + "grad_norm": 3.9237101078033447, + "learning_rate": 5.857632459740278e-06, + "logits/chosen": -0.32518693804740906, + "logits/rejected": -0.48216259479522705, + "logps/chosen": -49.35855484008789, + "logps/rejected": -79.52017974853516, + "loss": 0.6384, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5960142612457275, + "rewards/margins": 4.978638648986816, + "rewards/rejected": -2.3826241493225098, + "step": 2669 + }, + { + "epoch": 0.67, + "grad_norm": 6.074036598205566, + "learning_rate": 5.855051427609265e-06, + "logits/chosen": -0.25282567739486694, + "logits/rejected": -0.3510434031486511, + "logps/chosen": -55.534061431884766, + "logps/rejected": -77.27437591552734, + "loss": 0.7635, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.634488105773926, + "rewards/margins": 3.7686634063720703, + "rewards/rejected": -1.1341753005981445, + "step": 2670 + }, + { + "epoch": 0.67, + "grad_norm": 2.954113245010376, + "learning_rate": 5.852470160748184e-06, + "logits/chosen": -0.3470766246318817, + "logits/rejected": -0.4153882563114166, + "logps/chosen": -49.93156433105469, + "logps/rejected": -78.17434692382812, + "loss": 0.6836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.076220750808716, + "rewards/margins": 4.398258686065674, + "rewards/rejected": -1.3220382928848267, + "step": 2671 + }, + { + "epoch": 0.67, + "grad_norm": 7.284067153930664, + "learning_rate": 5.849888659865649e-06, + "logits/chosen": -0.21483631432056427, + "logits/rejected": -0.3341207504272461, + "logps/chosen": -58.852500915527344, + "logps/rejected": -70.86199951171875, + "loss": 0.9002, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.662170886993408, + "rewards/margins": 4.010262489318848, + "rewards/rejected": -1.3480918407440186, + "step": 2672 + }, + { + "epoch": 0.67, + "grad_norm": 3.215108633041382, + "learning_rate": 5.847306925670337e-06, + "logits/chosen": -0.37531018257141113, + "logits/rejected": -0.4718508720397949, + "logps/chosen": -41.60902404785156, + "logps/rejected": -67.5217514038086, + "loss": 0.7074, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.057664632797241, + "rewards/margins": 4.180478572845459, + "rewards/rejected": -1.1228137016296387, + "step": 2673 + }, + { + "epoch": 0.67, + "grad_norm": 5.100039958953857, + "learning_rate": 5.844724958870993e-06, + "logits/chosen": -0.32273226976394653, + "logits/rejected": -0.36773669719696045, + "logps/chosen": -54.00288772583008, + "logps/rejected": -78.43934631347656, + "loss": 0.8466, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7787985801696777, + "rewards/margins": 3.8228111267089844, + "rewards/rejected": -1.0440125465393066, + "step": 2674 + }, + { + "epoch": 0.67, + "grad_norm": 7.162332057952881, + "learning_rate": 5.842142760176419e-06, + "logits/chosen": -0.2887071371078491, + "logits/rejected": -0.377164363861084, + "logps/chosen": -59.622840881347656, + "logps/rejected": -81.75181579589844, + "loss": 0.8649, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7601921558380127, + "rewards/margins": 3.9715592861175537, + "rewards/rejected": -1.211367130279541, + "step": 2675 + }, + { + "epoch": 0.67, + "grad_norm": 12.303272247314453, + "learning_rate": 5.839560330295485e-06, + "logits/chosen": -0.2957260310649872, + "logits/rejected": -0.3838386535644531, + "logps/chosen": -49.93128204345703, + "logps/rejected": -72.09062194824219, + "loss": 0.8568, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.625430107116699, + "rewards/margins": 4.264768600463867, + "rewards/rejected": -1.639338493347168, + "step": 2676 + }, + { + "epoch": 0.67, + "grad_norm": 3.888554811477661, + "learning_rate": 5.836977669937124e-06, + "logits/chosen": -0.20126822590827942, + "logits/rejected": -0.35801321268081665, + "logps/chosen": -58.38359451293945, + "logps/rejected": -82.96926879882812, + "loss": 0.7481, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.777526378631592, + "rewards/margins": 4.723810195922852, + "rewards/rejected": -1.9462835788726807, + "step": 2677 + }, + { + "epoch": 0.67, + "grad_norm": 5.474559783935547, + "learning_rate": 5.834394779810332e-06, + "logits/chosen": -0.23564976453781128, + "logits/rejected": -0.380298376083374, + "logps/chosen": -61.53886413574219, + "logps/rejected": -78.16507720947266, + "loss": 0.7205, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0752477645874023, + "rewards/margins": 4.3171491622924805, + "rewards/rejected": -1.2419016361236572, + "step": 2678 + }, + { + "epoch": 0.67, + "grad_norm": 5.4745707511901855, + "learning_rate": 5.831811660624167e-06, + "logits/chosen": -0.18924298882484436, + "logits/rejected": -0.2970581650733948, + "logps/chosen": -64.21805572509766, + "logps/rejected": -86.67326354980469, + "loss": 0.8745, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.902226686477661, + "rewards/margins": 3.9920461177825928, + "rewards/rejected": -1.0898196697235107, + "step": 2679 + }, + { + "epoch": 0.67, + "grad_norm": 4.454422473907471, + "learning_rate": 5.829228313087756e-06, + "logits/chosen": -0.28100571036338806, + "logits/rejected": -0.39628371596336365, + "logps/chosen": -56.05036163330078, + "logps/rejected": -80.19674682617188, + "loss": 0.6966, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7192893028259277, + "rewards/margins": 4.9527587890625, + "rewards/rejected": -2.233469247817993, + "step": 2680 + }, + { + "epoch": 0.67, + "grad_norm": 5.578094959259033, + "learning_rate": 5.826644737910275e-06, + "logits/chosen": -0.33579951524734497, + "logits/rejected": -0.4628533124923706, + "logps/chosen": -66.73555755615234, + "logps/rejected": -76.1073226928711, + "loss": 0.7773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9701271057128906, + "rewards/margins": 4.832947731018066, + "rewards/rejected": -1.8628206253051758, + "step": 2681 + }, + { + "epoch": 0.67, + "grad_norm": 7.175793647766113, + "learning_rate": 5.824060935800979e-06, + "logits/chosen": -0.15820296108722687, + "logits/rejected": -0.362815260887146, + "logps/chosen": -61.10015106201172, + "logps/rejected": -80.83750915527344, + "loss": 0.8371, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7617673873901367, + "rewards/margins": 4.64404296875, + "rewards/rejected": -1.8822752237319946, + "step": 2682 + }, + { + "epoch": 0.67, + "grad_norm": 4.599946975708008, + "learning_rate": 5.821476907469173e-06, + "logits/chosen": -0.28347811102867126, + "logits/rejected": -0.4835282266139984, + "logps/chosen": -63.80168914794922, + "logps/rejected": -64.99656677246094, + "loss": 0.7862, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.96215558052063, + "rewards/margins": 4.426211357116699, + "rewards/rejected": -1.4640557765960693, + "step": 2683 + }, + { + "epoch": 0.67, + "grad_norm": 2.6934258937835693, + "learning_rate": 5.818892653624229e-06, + "logits/chosen": -0.26223626732826233, + "logits/rejected": -0.4257259964942932, + "logps/chosen": -49.79087829589844, + "logps/rejected": -71.32091522216797, + "loss": 0.6393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9328551292419434, + "rewards/margins": 4.499953269958496, + "rewards/rejected": -1.5670982599258423, + "step": 2684 + }, + { + "epoch": 0.67, + "grad_norm": 4.124790191650391, + "learning_rate": 5.816308174975584e-06, + "logits/chosen": -0.2675005793571472, + "logits/rejected": -0.42525744438171387, + "logps/chosen": -68.51560974121094, + "logps/rejected": -61.3480224609375, + "loss": 0.7396, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1342978477478027, + "rewards/margins": 4.893152713775635, + "rewards/rejected": -1.7588553428649902, + "step": 2685 + }, + { + "epoch": 0.67, + "grad_norm": 4.087627410888672, + "learning_rate": 5.813723472232729e-06, + "logits/chosen": -0.2795971632003784, + "logits/rejected": -0.3424884080886841, + "logps/chosen": -50.000213623046875, + "logps/rejected": -98.2143783569336, + "loss": 0.732, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9013419151306152, + "rewards/margins": 4.75199031829834, + "rewards/rejected": -1.8506488800048828, + "step": 2686 + }, + { + "epoch": 0.67, + "grad_norm": 5.317717552185059, + "learning_rate": 5.81113854610522e-06, + "logits/chosen": -0.32237881422042847, + "logits/rejected": -0.40525317192077637, + "logps/chosen": -50.91497039794922, + "logps/rejected": -82.37825012207031, + "loss": 0.7789, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8178634643554688, + "rewards/margins": 4.106235027313232, + "rewards/rejected": -1.2883718013763428, + "step": 2687 + }, + { + "epoch": 0.67, + "grad_norm": 4.547678470611572, + "learning_rate": 5.80855339730268e-06, + "logits/chosen": -0.2747650444507599, + "logits/rejected": -0.2909901440143585, + "logps/chosen": -45.63663864135742, + "logps/rejected": -89.40685272216797, + "loss": 0.7095, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.961503267288208, + "rewards/margins": 3.441403865814209, + "rewards/rejected": -0.4799007177352905, + "step": 2688 + }, + { + "epoch": 0.67, + "grad_norm": 4.795703411102295, + "learning_rate": 5.8059680265347825e-06, + "logits/chosen": -0.24935196340084076, + "logits/rejected": -0.32232916355133057, + "logps/chosen": -55.90775680541992, + "logps/rejected": -83.9455795288086, + "loss": 0.7379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.93163800239563, + "rewards/margins": 4.12489128112793, + "rewards/rejected": -1.1932532787322998, + "step": 2689 + }, + { + "epoch": 0.67, + "grad_norm": 2.71272873878479, + "learning_rate": 5.803382434511274e-06, + "logits/chosen": -0.27090519666671753, + "logits/rejected": -0.353086918592453, + "logps/chosen": -52.35041809082031, + "logps/rejected": -92.26530456542969, + "loss": 0.6182, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1098949909210205, + "rewards/margins": 4.65391206741333, + "rewards/rejected": -1.5440170764923096, + "step": 2690 + }, + { + "epoch": 0.67, + "grad_norm": 4.441038131713867, + "learning_rate": 5.800796621941949e-06, + "logits/chosen": -0.21802440285682678, + "logits/rejected": -0.3696003556251526, + "logps/chosen": -60.709712982177734, + "logps/rejected": -80.69161224365234, + "loss": 0.6596, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.976323366165161, + "rewards/margins": 4.527423858642578, + "rewards/rejected": -1.551100492477417, + "step": 2691 + }, + { + "epoch": 0.67, + "grad_norm": 5.978572845458984, + "learning_rate": 5.7982105895366725e-06, + "logits/chosen": -0.2622532248497009, + "logits/rejected": -0.3358396291732788, + "logps/chosen": -62.284706115722656, + "logps/rejected": -72.01959228515625, + "loss": 0.9036, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9166808128356934, + "rewards/margins": 3.3698840141296387, + "rewards/rejected": -0.4532034397125244, + "step": 2692 + }, + { + "epoch": 0.67, + "grad_norm": 3.3313825130462646, + "learning_rate": 5.795624338005364e-06, + "logits/chosen": -0.32667532563209534, + "logits/rejected": -0.48068466782569885, + "logps/chosen": -53.071502685546875, + "logps/rejected": -66.47366333007812, + "loss": 0.6613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8281641006469727, + "rewards/margins": 4.908420562744141, + "rewards/rejected": -2.080256223678589, + "step": 2693 + }, + { + "epoch": 0.67, + "grad_norm": 4.8685712814331055, + "learning_rate": 5.793037868058008e-06, + "logits/chosen": -0.25282371044158936, + "logits/rejected": -0.31968459486961365, + "logps/chosen": -55.56336212158203, + "logps/rejected": -85.14933776855469, + "loss": 0.7298, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.969036817550659, + "rewards/margins": 4.47388219833374, + "rewards/rejected": -1.5048454999923706, + "step": 2694 + }, + { + "epoch": 0.67, + "grad_norm": 3.6611576080322266, + "learning_rate": 5.790451180404644e-06, + "logits/chosen": -0.2950460910797119, + "logits/rejected": -0.43556785583496094, + "logps/chosen": -44.373477935791016, + "logps/rejected": -63.533103942871094, + "loss": 0.6643, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.625222682952881, + "rewards/margins": 4.509173393249512, + "rewards/rejected": -1.8839507102966309, + "step": 2695 + }, + { + "epoch": 0.67, + "grad_norm": 5.331439018249512, + "learning_rate": 5.787864275755375e-06, + "logits/chosen": -0.2839667499065399, + "logits/rejected": -0.3690812289714813, + "logps/chosen": -55.280731201171875, + "logps/rejected": -80.7630615234375, + "loss": 0.7527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.973072052001953, + "rewards/margins": 4.346845626831055, + "rewards/rejected": -1.3737735748291016, + "step": 2696 + }, + { + "epoch": 0.67, + "grad_norm": 7.764094829559326, + "learning_rate": 5.78527715482036e-06, + "logits/chosen": -0.30304741859436035, + "logits/rejected": -0.3300946056842804, + "logps/chosen": -52.29938507080078, + "logps/rejected": -88.1371841430664, + "loss": 0.7782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.932926893234253, + "rewards/margins": 3.7858428955078125, + "rewards/rejected": -0.8529160618782043, + "step": 2697 + }, + { + "epoch": 0.67, + "grad_norm": 7.923892974853516, + "learning_rate": 5.7826898183098225e-06, + "logits/chosen": -0.20861777663230896, + "logits/rejected": -0.3324388861656189, + "logps/chosen": -62.8353271484375, + "logps/rejected": -77.96434020996094, + "loss": 0.9538, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.108104705810547, + "rewards/margins": 2.9301605224609375, + "rewards/rejected": 0.17794400453567505, + "step": 2698 + }, + { + "epoch": 0.68, + "grad_norm": 3.531519889831543, + "learning_rate": 5.780102266934039e-06, + "logits/chosen": -0.17803700268268585, + "logits/rejected": -0.26315078139305115, + "logps/chosen": -51.02355194091797, + "logps/rejected": -86.61990356445312, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.647221803665161, + "rewards/margins": 4.9037322998046875, + "rewards/rejected": -2.2565104961395264, + "step": 2699 + }, + { + "epoch": 0.68, + "grad_norm": 2.62168550491333, + "learning_rate": 5.777514501403353e-06, + "logits/chosen": -0.22056330740451813, + "logits/rejected": -0.34667766094207764, + "logps/chosen": -59.78632354736328, + "logps/rejected": -78.93509674072266, + "loss": 0.6056, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0016629695892334, + "rewards/margins": 4.587688446044922, + "rewards/rejected": -1.586025595664978, + "step": 2700 + }, + { + "epoch": 0.68, + "grad_norm": 8.172914505004883, + "learning_rate": 5.7749265224281555e-06, + "logits/chosen": -0.2770344018936157, + "logits/rejected": -0.3845813274383545, + "logps/chosen": -63.31332778930664, + "logps/rejected": -77.22635650634766, + "loss": 0.8716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.775379180908203, + "rewards/margins": 4.3437910079956055, + "rewards/rejected": -1.5684123039245605, + "step": 2701 + }, + { + "epoch": 0.68, + "grad_norm": 6.459545135498047, + "learning_rate": 5.772338330718909e-06, + "logits/chosen": -0.30223512649536133, + "logits/rejected": -0.38995108008384705, + "logps/chosen": -51.079185485839844, + "logps/rejected": -82.97378540039062, + "loss": 0.8035, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8160815238952637, + "rewards/margins": 4.730459213256836, + "rewards/rejected": -1.9143779277801514, + "step": 2702 + }, + { + "epoch": 0.68, + "grad_norm": 5.896755218505859, + "learning_rate": 5.769749926986123e-06, + "logits/chosen": -0.311445027589798, + "logits/rejected": -0.41705775260925293, + "logps/chosen": -64.67344665527344, + "logps/rejected": -84.04795837402344, + "loss": 0.857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6930904388427734, + "rewards/margins": 4.400396347045898, + "rewards/rejected": -1.707305908203125, + "step": 2703 + }, + { + "epoch": 0.68, + "grad_norm": 8.009054183959961, + "learning_rate": 5.767161311940372e-06, + "logits/chosen": -0.20721258223056793, + "logits/rejected": -0.257249653339386, + "logps/chosen": -66.30533599853516, + "logps/rejected": -91.81754302978516, + "loss": 0.8667, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.864043951034546, + "rewards/margins": 3.8808481693267822, + "rewards/rejected": -1.016804575920105, + "step": 2704 + }, + { + "epoch": 0.68, + "grad_norm": 6.842408657073975, + "learning_rate": 5.764572486292288e-06, + "logits/chosen": -0.24641433358192444, + "logits/rejected": -0.2783711552619934, + "logps/chosen": -55.11616897583008, + "logps/rejected": -75.54194641113281, + "loss": 0.8205, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4749977588653564, + "rewards/margins": 2.7009053230285645, + "rewards/rejected": -0.22590772807598114, + "step": 2705 + }, + { + "epoch": 0.68, + "grad_norm": 5.285826683044434, + "learning_rate": 5.761983450752558e-06, + "logits/chosen": -0.22555918991565704, + "logits/rejected": -0.32483479380607605, + "logps/chosen": -64.53926086425781, + "logps/rejected": -82.67862701416016, + "loss": 0.8835, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0079383850097656, + "rewards/margins": 4.325674057006836, + "rewards/rejected": -1.3177356719970703, + "step": 2706 + }, + { + "epoch": 0.68, + "grad_norm": 4.463963031768799, + "learning_rate": 5.759394206031929e-06, + "logits/chosen": -0.337560772895813, + "logits/rejected": -0.457414835691452, + "logps/chosen": -54.59480285644531, + "logps/rejected": -67.6686782836914, + "loss": 0.7212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.558806896209717, + "rewards/margins": 3.778660297393799, + "rewards/rejected": -1.2198535203933716, + "step": 2707 + }, + { + "epoch": 0.68, + "grad_norm": 9.063739776611328, + "learning_rate": 5.756804752841203e-06, + "logits/chosen": -0.2159050852060318, + "logits/rejected": -0.36951470375061035, + "logps/chosen": -50.316776275634766, + "logps/rejected": -70.73713684082031, + "loss": 0.6835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8045921325683594, + "rewards/margins": 4.276679039001465, + "rewards/rejected": -1.472087025642395, + "step": 2708 + }, + { + "epoch": 0.68, + "grad_norm": 3.721897602081299, + "learning_rate": 5.754215091891241e-06, + "logits/chosen": -0.22090822458267212, + "logits/rejected": -0.3808037042617798, + "logps/chosen": -53.457122802734375, + "logps/rejected": -74.05354309082031, + "loss": 0.7284, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9757866859436035, + "rewards/margins": 4.504749774932861, + "rewards/rejected": -1.5289627313613892, + "step": 2709 + }, + { + "epoch": 0.68, + "grad_norm": 8.347140312194824, + "learning_rate": 5.751625223892964e-06, + "logits/chosen": -0.2222711443901062, + "logits/rejected": -0.38485562801361084, + "logps/chosen": -60.44895553588867, + "logps/rejected": -74.16770935058594, + "loss": 0.7736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8667056560516357, + "rewards/margins": 4.630934238433838, + "rewards/rejected": -1.7642292976379395, + "step": 2710 + }, + { + "epoch": 0.68, + "grad_norm": 4.008055686950684, + "learning_rate": 5.7490351495573424e-06, + "logits/chosen": -0.32660728693008423, + "logits/rejected": -0.3361719250679016, + "logps/chosen": -46.74715042114258, + "logps/rejected": -100.00886535644531, + "loss": 0.6996, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6565794944763184, + "rewards/margins": 5.911740779876709, + "rewards/rejected": -3.2551610469818115, + "step": 2711 + }, + { + "epoch": 0.68, + "grad_norm": 4.434991836547852, + "learning_rate": 5.746444869595411e-06, + "logits/chosen": -0.2788456678390503, + "logits/rejected": -0.30454403162002563, + "logps/chosen": -50.553592681884766, + "logps/rejected": -76.86841583251953, + "loss": 0.8349, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.080232620239258, + "rewards/margins": 3.7064456939697266, + "rewards/rejected": -0.6262130737304688, + "step": 2712 + }, + { + "epoch": 0.68, + "grad_norm": 6.077460765838623, + "learning_rate": 5.743854384718253e-06, + "logits/chosen": -0.22432728111743927, + "logits/rejected": -0.29717761278152466, + "logps/chosen": -55.473358154296875, + "logps/rejected": -74.15685272216797, + "loss": 0.8615, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.068484306335449, + "rewards/margins": 3.565638303756714, + "rewards/rejected": -0.49715396761894226, + "step": 2713 + }, + { + "epoch": 0.68, + "grad_norm": 5.91323184967041, + "learning_rate": 5.7412636956370184e-06, + "logits/chosen": -0.2794516682624817, + "logits/rejected": -0.3709081709384918, + "logps/chosen": -66.36141967773438, + "logps/rejected": -89.2283935546875, + "loss": 1.003, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5998668670654297, + "rewards/margins": 3.9824769496917725, + "rewards/rejected": -1.3826098442077637, + "step": 2714 + }, + { + "epoch": 0.68, + "grad_norm": 4.287428379058838, + "learning_rate": 5.7386728030629036e-06, + "logits/chosen": -0.37606701254844666, + "logits/rejected": -0.4751163423061371, + "logps/chosen": -51.67311477661133, + "logps/rejected": -67.5966567993164, + "loss": 0.7776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6107726097106934, + "rewards/margins": 3.997523307800293, + "rewards/rejected": -1.3867506980895996, + "step": 2715 + }, + { + "epoch": 0.68, + "grad_norm": 4.706434726715088, + "learning_rate": 5.736081707707163e-06, + "logits/chosen": -0.25287872552871704, + "logits/rejected": -0.34700891375541687, + "logps/chosen": -65.6083755493164, + "logps/rejected": -80.31169128417969, + "loss": 0.893, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.116713047027588, + "rewards/margins": 3.1199514865875244, + "rewards/rejected": -0.0032385215163230896, + "step": 2716 + }, + { + "epoch": 0.68, + "grad_norm": 11.129504203796387, + "learning_rate": 5.733490410281112e-06, + "logits/chosen": -0.2657892107963562, + "logits/rejected": -0.3075506091117859, + "logps/chosen": -49.559112548828125, + "logps/rejected": -84.78488159179688, + "loss": 0.9778, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.690178155899048, + "rewards/margins": 3.179856777191162, + "rewards/rejected": -0.4896785318851471, + "step": 2717 + }, + { + "epoch": 0.68, + "grad_norm": 3.714327812194824, + "learning_rate": 5.730898911496115e-06, + "logits/chosen": -0.21953052282333374, + "logits/rejected": -0.32760536670684814, + "logps/chosen": -57.950233459472656, + "logps/rejected": -77.28926086425781, + "loss": 0.7861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8671836853027344, + "rewards/margins": 4.35579776763916, + "rewards/rejected": -1.4886142015457153, + "step": 2718 + }, + { + "epoch": 0.68, + "grad_norm": 4.320084095001221, + "learning_rate": 5.728307212063596e-06, + "logits/chosen": -0.33304014801979065, + "logits/rejected": -0.38457122445106506, + "logps/chosen": -54.91083908081055, + "logps/rejected": -81.16299438476562, + "loss": 0.8249, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.860583782196045, + "rewards/margins": 4.273001670837402, + "rewards/rejected": -1.4124181270599365, + "step": 2719 + }, + { + "epoch": 0.68, + "grad_norm": 6.172819137573242, + "learning_rate": 5.725715312695032e-06, + "logits/chosen": -0.25203216075897217, + "logits/rejected": -0.32907477021217346, + "logps/chosen": -53.58102035522461, + "logps/rejected": -80.03560638427734, + "loss": 0.8056, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6368091106414795, + "rewards/margins": 4.200719356536865, + "rewards/rejected": -1.563909888267517, + "step": 2720 + }, + { + "epoch": 0.68, + "grad_norm": 5.50250768661499, + "learning_rate": 5.723123214101954e-06, + "logits/chosen": -0.323674738407135, + "logits/rejected": -0.36344432830810547, + "logps/chosen": -53.92403793334961, + "logps/rejected": -106.7809066772461, + "loss": 0.7813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.970022201538086, + "rewards/margins": 3.9507088661193848, + "rewards/rejected": -0.9806861877441406, + "step": 2721 + }, + { + "epoch": 0.68, + "grad_norm": 3.442066192626953, + "learning_rate": 5.72053091699595e-06, + "logits/chosen": -0.2698493003845215, + "logits/rejected": -0.38648608326911926, + "logps/chosen": -49.54069137573242, + "logps/rejected": -77.64861297607422, + "loss": 0.6226, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.834259510040283, + "rewards/margins": 4.732665061950684, + "rewards/rejected": -1.8984057903289795, + "step": 2722 + }, + { + "epoch": 0.68, + "grad_norm": 5.4581828117370605, + "learning_rate": 5.717938422088662e-06, + "logits/chosen": -0.27959752082824707, + "logits/rejected": -0.3559824228286743, + "logps/chosen": -61.28850555419922, + "logps/rejected": -74.41252899169922, + "loss": 0.8655, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7203612327575684, + "rewards/margins": 2.6878318786621094, + "rewards/rejected": 0.032529789954423904, + "step": 2723 + }, + { + "epoch": 0.68, + "grad_norm": 4.4964799880981445, + "learning_rate": 5.715345730091786e-06, + "logits/chosen": -0.3475259244441986, + "logits/rejected": -0.4349293112754822, + "logps/chosen": -66.05671691894531, + "logps/rejected": -74.9930419921875, + "loss": 0.8645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7751755714416504, + "rewards/margins": 3.8533735275268555, + "rewards/rejected": -1.0781983137130737, + "step": 2724 + }, + { + "epoch": 0.68, + "grad_norm": 5.1156110763549805, + "learning_rate": 5.712752841717069e-06, + "logits/chosen": -0.19695723056793213, + "logits/rejected": -0.3763984143733978, + "logps/chosen": -61.12993621826172, + "logps/rejected": -67.95079040527344, + "loss": 0.849, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7097535133361816, + "rewards/margins": 4.101850509643555, + "rewards/rejected": -1.3920968770980835, + "step": 2725 + }, + { + "epoch": 0.68, + "grad_norm": 4.436155796051025, + "learning_rate": 5.710159757676318e-06, + "logits/chosen": -0.2521924376487732, + "logits/rejected": -0.34314239025115967, + "logps/chosen": -47.35499572753906, + "logps/rejected": -75.63864135742188, + "loss": 0.6922, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8655333518981934, + "rewards/margins": 4.873384475708008, + "rewards/rejected": -2.0078511238098145, + "step": 2726 + }, + { + "epoch": 0.68, + "grad_norm": 5.606225967407227, + "learning_rate": 5.70756647868139e-06, + "logits/chosen": -0.2699412405490875, + "logits/rejected": -0.3826550245285034, + "logps/chosen": -72.53103637695312, + "logps/rejected": -86.26289367675781, + "loss": 0.867, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.914167642593384, + "rewards/margins": 3.583813190460205, + "rewards/rejected": -0.6696454286575317, + "step": 2727 + }, + { + "epoch": 0.68, + "grad_norm": 2.8301808834075928, + "learning_rate": 5.704973005444194e-06, + "logits/chosen": -0.34710121154785156, + "logits/rejected": -0.40766477584838867, + "logps/chosen": -67.87203979492188, + "logps/rejected": -78.74182891845703, + "loss": 0.7834, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0875377655029297, + "rewards/margins": 4.5672149658203125, + "rewards/rejected": -1.4796770811080933, + "step": 2728 + }, + { + "epoch": 0.68, + "grad_norm": 6.344775199890137, + "learning_rate": 5.702379338676698e-06, + "logits/chosen": -0.2808108329772949, + "logits/rejected": -0.39671042561531067, + "logps/chosen": -65.36180114746094, + "logps/rejected": -86.26504516601562, + "loss": 0.9573, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.5825812816619873, + "rewards/margins": 3.315333604812622, + "rewards/rejected": -0.73275226354599, + "step": 2729 + }, + { + "epoch": 0.68, + "grad_norm": 12.94282054901123, + "learning_rate": 5.699785479090917e-06, + "logits/chosen": -0.30303284525871277, + "logits/rejected": -0.3788343071937561, + "logps/chosen": -50.553565979003906, + "logps/rejected": -74.6680679321289, + "loss": 0.7246, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0216007232666016, + "rewards/margins": 4.462188720703125, + "rewards/rejected": -1.4405878782272339, + "step": 2730 + }, + { + "epoch": 0.68, + "grad_norm": 6.3361992835998535, + "learning_rate": 5.69719142739892e-06, + "logits/chosen": -0.2476377934217453, + "logits/rejected": -0.42441216111183167, + "logps/chosen": -59.17102813720703, + "logps/rejected": -68.87123107910156, + "loss": 0.8014, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9350106716156006, + "rewards/margins": 4.149417877197266, + "rewards/rejected": -1.214406967163086, + "step": 2731 + }, + { + "epoch": 0.68, + "grad_norm": 4.95121955871582, + "learning_rate": 5.6945971843128334e-06, + "logits/chosen": -0.34771299362182617, + "logits/rejected": -0.41597306728363037, + "logps/chosen": -47.477108001708984, + "logps/rejected": -70.41630554199219, + "loss": 0.8662, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9047465324401855, + "rewards/margins": 3.016921043395996, + "rewards/rejected": -0.11217471212148666, + "step": 2732 + }, + { + "epoch": 0.68, + "grad_norm": 14.09959888458252, + "learning_rate": 5.692002750544832e-06, + "logits/chosen": -0.2975146174430847, + "logits/rejected": -0.3714989125728607, + "logps/chosen": -49.71614456176758, + "logps/rejected": -107.56085968017578, + "loss": 0.734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.937239408493042, + "rewards/margins": 4.344747066497803, + "rewards/rejected": -1.4075076580047607, + "step": 2733 + }, + { + "epoch": 0.68, + "grad_norm": 4.981692790985107, + "learning_rate": 5.689408126807141e-06, + "logits/chosen": -0.239328533411026, + "logits/rejected": -0.40387648344039917, + "logps/chosen": -62.44953918457031, + "logps/rejected": -80.64177703857422, + "loss": 0.737, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5649638175964355, + "rewards/margins": 5.303467273712158, + "rewards/rejected": -2.7385034561157227, + "step": 2734 + }, + { + "epoch": 0.68, + "grad_norm": 3.3811473846435547, + "learning_rate": 5.686813313812046e-06, + "logits/chosen": -0.21498221158981323, + "logits/rejected": -0.2875533103942871, + "logps/chosen": -72.83755493164062, + "logps/rejected": -83.91680145263672, + "loss": 0.7367, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7630515098571777, + "rewards/margins": 4.080052375793457, + "rewards/rejected": -1.3170005083084106, + "step": 2735 + }, + { + "epoch": 0.68, + "grad_norm": 2.9315521717071533, + "learning_rate": 5.684218312271874e-06, + "logits/chosen": -0.3293561637401581, + "logits/rejected": -0.4011853039264679, + "logps/chosen": -66.36834716796875, + "logps/rejected": -72.68119049072266, + "loss": 0.7239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9882686138153076, + "rewards/margins": 4.293519496917725, + "rewards/rejected": -1.3052510023117065, + "step": 2736 + }, + { + "epoch": 0.68, + "grad_norm": 7.286871910095215, + "learning_rate": 5.681623122899012e-06, + "logits/chosen": -0.24237120151519775, + "logits/rejected": -0.3331664800643921, + "logps/chosen": -61.305450439453125, + "logps/rejected": -80.66996765136719, + "loss": 0.8749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5039515495300293, + "rewards/margins": 3.9153897762298584, + "rewards/rejected": -1.4114384651184082, + "step": 2737 + }, + { + "epoch": 0.68, + "grad_norm": 3.295893430709839, + "learning_rate": 5.679027746405894e-06, + "logits/chosen": -0.24229474365711212, + "logits/rejected": -0.39323690533638, + "logps/chosen": -54.42536926269531, + "logps/rejected": -74.77933502197266, + "loss": 0.7281, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.697436809539795, + "rewards/margins": 4.778620719909668, + "rewards/rejected": -2.081183910369873, + "step": 2738 + }, + { + "epoch": 0.69, + "grad_norm": 6.0672736167907715, + "learning_rate": 5.676432183505008e-06, + "logits/chosen": -0.32845139503479004, + "logits/rejected": -0.42379915714263916, + "logps/chosen": -65.06053924560547, + "logps/rejected": -74.59015655517578, + "loss": 0.9307, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.872953176498413, + "rewards/margins": 4.420430660247803, + "rewards/rejected": -1.5474778413772583, + "step": 2739 + }, + { + "epoch": 0.69, + "grad_norm": 3.4127471446990967, + "learning_rate": 5.6738364349088905e-06, + "logits/chosen": -0.2180243730545044, + "logits/rejected": -0.28502127528190613, + "logps/chosen": -51.68706130981445, + "logps/rejected": -85.347900390625, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.897430181503296, + "rewards/margins": 4.297460556030273, + "rewards/rejected": -1.4000298976898193, + "step": 2740 + }, + { + "epoch": 0.69, + "grad_norm": 3.4378464221954346, + "learning_rate": 5.671240501330132e-06, + "logits/chosen": -0.35396111011505127, + "logits/rejected": -0.43631070852279663, + "logps/chosen": -59.88655471801758, + "logps/rejected": -78.29557037353516, + "loss": 0.811, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9086496829986572, + "rewards/margins": 3.792363166809082, + "rewards/rejected": -0.8837133646011353, + "step": 2741 + }, + { + "epoch": 0.69, + "grad_norm": 5.1016974449157715, + "learning_rate": 5.6686443834813705e-06, + "logits/chosen": -0.30259549617767334, + "logits/rejected": -0.4417557716369629, + "logps/chosen": -57.00810241699219, + "logps/rejected": -80.51151275634766, + "loss": 0.7471, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7321019172668457, + "rewards/margins": 4.242718696594238, + "rewards/rejected": -1.5106172561645508, + "step": 2742 + }, + { + "epoch": 0.69, + "grad_norm": 2.1522903442382812, + "learning_rate": 5.6660480820753e-06, + "logits/chosen": -0.26871591806411743, + "logits/rejected": -0.4303090274333954, + "logps/chosen": -49.71424865722656, + "logps/rejected": -66.20625305175781, + "loss": 0.6479, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7655837535858154, + "rewards/margins": 5.112886428833008, + "rewards/rejected": -2.3473024368286133, + "step": 2743 + }, + { + "epoch": 0.69, + "grad_norm": 3.6250863075256348, + "learning_rate": 5.663451597824655e-06, + "logits/chosen": -0.27562442421913147, + "logits/rejected": -0.3586081564426422, + "logps/chosen": -64.07119750976562, + "logps/rejected": -77.14716339111328, + "loss": 0.8184, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9557065963745117, + "rewards/margins": 4.4209394454956055, + "rewards/rejected": -1.4652330875396729, + "step": 2744 + }, + { + "epoch": 0.69, + "grad_norm": 2.3270747661590576, + "learning_rate": 5.660854931442235e-06, + "logits/chosen": -0.2907789349555969, + "logits/rejected": -0.422026127576828, + "logps/chosen": -55.767181396484375, + "logps/rejected": -77.61080932617188, + "loss": 0.6947, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.185103178024292, + "rewards/margins": 4.32659912109375, + "rewards/rejected": -1.1414960622787476, + "step": 2745 + }, + { + "epoch": 0.69, + "grad_norm": 6.004953861236572, + "learning_rate": 5.658258083640872e-06, + "logits/chosen": -0.22888202965259552, + "logits/rejected": -0.35100048780441284, + "logps/chosen": -57.663726806640625, + "logps/rejected": -69.07513427734375, + "loss": 0.8757, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9920694828033447, + "rewards/margins": 3.1667325496673584, + "rewards/rejected": -0.17466312646865845, + "step": 2746 + }, + { + "epoch": 0.69, + "grad_norm": 3.8533458709716797, + "learning_rate": 5.655661055133463e-06, + "logits/chosen": -0.31347930431365967, + "logits/rejected": -0.3845909833908081, + "logps/chosen": -54.88081359863281, + "logps/rejected": -70.12715148925781, + "loss": 0.8395, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.098339080810547, + "rewards/margins": 3.115863561630249, + "rewards/rejected": -0.017524808645248413, + "step": 2747 + }, + { + "epoch": 0.69, + "grad_norm": 4.855789661407471, + "learning_rate": 5.653063846632945e-06, + "logits/chosen": -0.289690762758255, + "logits/rejected": -0.3742157518863678, + "logps/chosen": -57.692832946777344, + "logps/rejected": -93.04145812988281, + "loss": 0.7397, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.786487579345703, + "rewards/margins": 4.531589508056641, + "rewards/rejected": -1.7451024055480957, + "step": 2748 + }, + { + "epoch": 0.69, + "grad_norm": 6.335375785827637, + "learning_rate": 5.65046645885231e-06, + "logits/chosen": -0.18996740877628326, + "logits/rejected": -0.29067325592041016, + "logps/chosen": -51.10524368286133, + "logps/rejected": -68.6501235961914, + "loss": 0.7061, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0525832176208496, + "rewards/margins": 3.8745880126953125, + "rewards/rejected": -0.8220047950744629, + "step": 2749 + }, + { + "epoch": 0.69, + "grad_norm": 2.665719985961914, + "learning_rate": 5.647868892504597e-06, + "logits/chosen": -0.28156498074531555, + "logits/rejected": -0.4101550579071045, + "logps/chosen": -53.0802116394043, + "logps/rejected": -58.93701934814453, + "loss": 0.7759, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1893129348754883, + "rewards/margins": 4.038333892822266, + "rewards/rejected": -0.8490211963653564, + "step": 2750 + }, + { + "epoch": 0.69, + "grad_norm": 3.2616591453552246, + "learning_rate": 5.645271148302892e-06, + "logits/chosen": -0.2999941408634186, + "logits/rejected": -0.38350632786750793, + "logps/chosen": -57.3421745300293, + "logps/rejected": -80.70761108398438, + "loss": 0.7573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0090792179107666, + "rewards/margins": 4.208228588104248, + "rewards/rejected": -1.199149489402771, + "step": 2751 + }, + { + "epoch": 0.69, + "grad_norm": 9.841734886169434, + "learning_rate": 5.642673226960332e-06, + "logits/chosen": -0.22586604952812195, + "logits/rejected": -0.40084508061408997, + "logps/chosen": -63.42292022705078, + "logps/rejected": -64.55009460449219, + "loss": 0.8918, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.837958812713623, + "rewards/margins": 4.457900047302246, + "rewards/rejected": -1.6199419498443604, + "step": 2752 + }, + { + "epoch": 0.69, + "grad_norm": 2.8245577812194824, + "learning_rate": 5.640075129190106e-06, + "logits/chosen": -0.3284633755683899, + "logits/rejected": -0.41761109232902527, + "logps/chosen": -45.95001983642578, + "logps/rejected": -81.52743530273438, + "loss": 0.6198, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.882784843444824, + "rewards/margins": 4.481623649597168, + "rewards/rejected": -1.5988394021987915, + "step": 2753 + }, + { + "epoch": 0.69, + "grad_norm": 3.5100932121276855, + "learning_rate": 5.637476855705442e-06, + "logits/chosen": -0.2544110417366028, + "logits/rejected": -0.35786527395248413, + "logps/chosen": -67.32240295410156, + "logps/rejected": -70.74784851074219, + "loss": 0.776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0818886756896973, + "rewards/margins": 3.6153488159179688, + "rewards/rejected": -0.5334599614143372, + "step": 2754 + }, + { + "epoch": 0.69, + "grad_norm": 4.929476261138916, + "learning_rate": 5.634878407219629e-06, + "logits/chosen": -0.27889689803123474, + "logits/rejected": -0.37757954001426697, + "logps/chosen": -53.85990905761719, + "logps/rejected": -65.96554565429688, + "loss": 0.7642, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9583346843719482, + "rewards/margins": 3.763765811920166, + "rewards/rejected": -0.8054314255714417, + "step": 2755 + }, + { + "epoch": 0.69, + "grad_norm": 5.531796932220459, + "learning_rate": 5.632279784445991e-06, + "logits/chosen": -0.2976698875427246, + "logits/rejected": -0.36608704924583435, + "logps/chosen": -63.09885787963867, + "logps/rejected": -78.44320678710938, + "loss": 0.9358, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.232142925262451, + "rewards/margins": 2.990389347076416, + "rewards/rejected": 0.24175360798835754, + "step": 2756 + }, + { + "epoch": 0.69, + "grad_norm": 13.885335922241211, + "learning_rate": 5.62968098809791e-06, + "logits/chosen": -0.33063527941703796, + "logits/rejected": -0.43904802203178406, + "logps/chosen": -58.408111572265625, + "logps/rejected": -80.13095092773438, + "loss": 0.8757, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.868018865585327, + "rewards/margins": 4.286470413208008, + "rewards/rejected": -1.4184515476226807, + "step": 2757 + }, + { + "epoch": 0.69, + "grad_norm": 3.292397975921631, + "learning_rate": 5.62708201888881e-06, + "logits/chosen": -0.229023277759552, + "logits/rejected": -0.28394201397895813, + "logps/chosen": -60.381988525390625, + "logps/rejected": -82.17244720458984, + "loss": 0.7471, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7312166690826416, + "rewards/margins": 3.5419888496398926, + "rewards/rejected": -0.8107722401618958, + "step": 2758 + }, + { + "epoch": 0.69, + "grad_norm": 3.359884738922119, + "learning_rate": 5.624482877532163e-06, + "logits/chosen": -0.22047945857048035, + "logits/rejected": -0.30255693197250366, + "logps/chosen": -63.17852783203125, + "logps/rejected": -75.86324310302734, + "loss": 0.8412, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2943403720855713, + "rewards/margins": 4.555432319641113, + "rewards/rejected": -1.2610924243927002, + "step": 2759 + }, + { + "epoch": 0.69, + "grad_norm": 4.294152736663818, + "learning_rate": 5.621883564741494e-06, + "logits/chosen": -0.2735764980316162, + "logits/rejected": -0.3719469904899597, + "logps/chosen": -49.89202880859375, + "logps/rejected": -80.603271484375, + "loss": 0.7758, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4537270069122314, + "rewards/margins": 4.548964977264404, + "rewards/rejected": -2.0952377319335938, + "step": 2760 + }, + { + "epoch": 0.69, + "grad_norm": 4.441330432891846, + "learning_rate": 5.619284081230365e-06, + "logits/chosen": -0.3442048132419586, + "logits/rejected": -0.4503527283668518, + "logps/chosen": -53.69041442871094, + "logps/rejected": -72.91158294677734, + "loss": 0.8732, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.2363157272338867, + "rewards/margins": 3.611996650695801, + "rewards/rejected": -0.37568119168281555, + "step": 2761 + }, + { + "epoch": 0.69, + "grad_norm": 3.464425802230835, + "learning_rate": 5.616684427712392e-06, + "logits/chosen": -0.24837586283683777, + "logits/rejected": -0.3844926357269287, + "logps/chosen": -55.83854675292969, + "logps/rejected": -80.64663696289062, + "loss": 0.7568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6247236728668213, + "rewards/margins": 3.8214111328125, + "rewards/rejected": -1.1966874599456787, + "step": 2762 + }, + { + "epoch": 0.69, + "grad_norm": 2.4232101440429688, + "learning_rate": 5.614084604901239e-06, + "logits/chosen": -0.32475560903549194, + "logits/rejected": -0.42622363567352295, + "logps/chosen": -61.93511962890625, + "logps/rejected": -84.14604949951172, + "loss": 0.688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.335219144821167, + "rewards/margins": 5.672542572021484, + "rewards/rejected": -2.3373234272003174, + "step": 2763 + }, + { + "epoch": 0.69, + "grad_norm": 4.413216590881348, + "learning_rate": 5.611484613510608e-06, + "logits/chosen": -0.26866599917411804, + "logits/rejected": -0.36917397379875183, + "logps/chosen": -48.284934997558594, + "logps/rejected": -77.58775329589844, + "loss": 0.7514, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9393367767333984, + "rewards/margins": 4.36331844329834, + "rewards/rejected": -1.4239816665649414, + "step": 2764 + }, + { + "epoch": 0.69, + "grad_norm": 4.303645610809326, + "learning_rate": 5.6088844542542585e-06, + "logits/chosen": -0.24155573546886444, + "logits/rejected": -0.36645716428756714, + "logps/chosen": -61.288169860839844, + "logps/rejected": -68.84232330322266, + "loss": 0.8076, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.098355531692505, + "rewards/margins": 4.004024028778076, + "rewards/rejected": -0.9056686758995056, + "step": 2765 + }, + { + "epoch": 0.69, + "grad_norm": 6.7309112548828125, + "learning_rate": 5.6062841278459866e-06, + "logits/chosen": -0.27060577273368835, + "logits/rejected": -0.3033824563026428, + "logps/chosen": -55.21635818481445, + "logps/rejected": -85.00382995605469, + "loss": 0.9348, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.883100748062134, + "rewards/margins": 3.7128095626831055, + "rewards/rejected": -0.8297087550163269, + "step": 2766 + }, + { + "epoch": 0.69, + "grad_norm": 5.61694860458374, + "learning_rate": 5.6036836349996394e-06, + "logits/chosen": -0.17683622241020203, + "logits/rejected": -0.23308944702148438, + "logps/chosen": -65.19013214111328, + "logps/rejected": -85.86271667480469, + "loss": 0.8233, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7625904083251953, + "rewards/margins": 3.2405385971069336, + "rewards/rejected": -0.4779479503631592, + "step": 2767 + }, + { + "epoch": 0.69, + "grad_norm": 6.662501335144043, + "learning_rate": 5.6010829764291074e-06, + "logits/chosen": -0.25919193029403687, + "logits/rejected": -0.3670097291469574, + "logps/chosen": -65.55572509765625, + "logps/rejected": -82.20174407958984, + "loss": 0.7839, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6985886096954346, + "rewards/margins": 4.10060453414917, + "rewards/rejected": -1.4020156860351562, + "step": 2768 + }, + { + "epoch": 0.69, + "grad_norm": 4.73153018951416, + "learning_rate": 5.598482152848328e-06, + "logits/chosen": -0.2173205018043518, + "logits/rejected": -0.28781256079673767, + "logps/chosen": -61.83443832397461, + "logps/rejected": -77.59779357910156, + "loss": 0.9217, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9256787300109863, + "rewards/margins": 3.7121927738189697, + "rewards/rejected": -0.7865142226219177, + "step": 2769 + }, + { + "epoch": 0.69, + "grad_norm": 4.169668674468994, + "learning_rate": 5.595881164971284e-06, + "logits/chosen": -0.2882085144519806, + "logits/rejected": -0.39445367455482483, + "logps/chosen": -54.15386962890625, + "logps/rejected": -80.54483032226562, + "loss": 0.6794, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.915764331817627, + "rewards/margins": 5.033810615539551, + "rewards/rejected": -2.1180460453033447, + "step": 2770 + }, + { + "epoch": 0.69, + "grad_norm": 5.73460054397583, + "learning_rate": 5.5932800135120015e-06, + "logits/chosen": -0.30778831243515015, + "logits/rejected": -0.4391663670539856, + "logps/chosen": -49.358184814453125, + "logps/rejected": -73.36727905273438, + "loss": 0.6765, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0431647300720215, + "rewards/margins": 4.889808177947998, + "rewards/rejected": -1.8466439247131348, + "step": 2771 + }, + { + "epoch": 0.69, + "grad_norm": 2.8955771923065186, + "learning_rate": 5.590678699184553e-06, + "logits/chosen": -0.29026007652282715, + "logits/rejected": -0.46671608090400696, + "logps/chosen": -54.18803405761719, + "logps/rejected": -70.53074645996094, + "loss": 0.6598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.728813886642456, + "rewards/margins": 4.989696025848389, + "rewards/rejected": -2.2608823776245117, + "step": 2772 + }, + { + "epoch": 0.69, + "grad_norm": 3.683088779449463, + "learning_rate": 5.5880772227030565e-06, + "logits/chosen": -0.15668246150016785, + "logits/rejected": -0.24277673661708832, + "logps/chosen": -54.42861557006836, + "logps/rejected": -87.91048431396484, + "loss": 0.7258, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9228901863098145, + "rewards/margins": 3.8920164108276367, + "rewards/rejected": -0.9691261053085327, + "step": 2773 + }, + { + "epoch": 0.69, + "grad_norm": 3.709352731704712, + "learning_rate": 5.585475584781672e-06, + "logits/chosen": -0.31767547130584717, + "logits/rejected": -0.4113433361053467, + "logps/chosen": -65.74115753173828, + "logps/rejected": -80.79893493652344, + "loss": 0.7993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858579158782959, + "rewards/margins": 4.607346057891846, + "rewards/rejected": -1.748766541481018, + "step": 2774 + }, + { + "epoch": 0.69, + "grad_norm": 7.553737163543701, + "learning_rate": 5.582873786134607e-06, + "logits/chosen": -0.24951788783073425, + "logits/rejected": -0.34621965885162354, + "logps/chosen": -69.6484603881836, + "logps/rejected": -89.06185913085938, + "loss": 0.7364, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.941255807876587, + "rewards/margins": 4.889581680297852, + "rewards/rejected": -1.948325753211975, + "step": 2775 + }, + { + "epoch": 0.69, + "grad_norm": 3.7680909633636475, + "learning_rate": 5.580271827476111e-06, + "logits/chosen": -0.25508278608322144, + "logits/rejected": -0.3270558714866638, + "logps/chosen": -53.499629974365234, + "logps/rejected": -69.68091583251953, + "loss": 0.8178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.143540859222412, + "rewards/margins": 3.280460834503174, + "rewards/rejected": -0.1369197964668274, + "step": 2776 + }, + { + "epoch": 0.69, + "grad_norm": 7.8997931480407715, + "learning_rate": 5.577669709520474e-06, + "logits/chosen": -0.26492127776145935, + "logits/rejected": -0.33937984704971313, + "logps/chosen": -51.91690444946289, + "logps/rejected": -73.66439819335938, + "loss": 0.9043, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7129783630371094, + "rewards/margins": 3.2556488513946533, + "rewards/rejected": -0.5426705479621887, + "step": 2777 + }, + { + "epoch": 0.69, + "grad_norm": 7.290090084075928, + "learning_rate": 5.575067432982039e-06, + "logits/chosen": -0.22175383567810059, + "logits/rejected": -0.3708910346031189, + "logps/chosen": -57.416343688964844, + "logps/rejected": -72.33441925048828, + "loss": 0.7734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8584060668945312, + "rewards/margins": 4.758057594299316, + "rewards/rejected": -1.899651050567627, + "step": 2778 + }, + { + "epoch": 0.7, + "grad_norm": 7.721799850463867, + "learning_rate": 5.5724649985751835e-06, + "logits/chosen": -0.27168017625808716, + "logits/rejected": -0.34707847237586975, + "logps/chosen": -58.55352020263672, + "logps/rejected": -77.024658203125, + "loss": 0.9869, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7218711376190186, + "rewards/margins": 3.1350810527801514, + "rewards/rejected": -0.4132100045681, + "step": 2779 + }, + { + "epoch": 0.7, + "grad_norm": 8.466852188110352, + "learning_rate": 5.569862407014334e-06, + "logits/chosen": -0.2101348340511322, + "logits/rejected": -0.3604104518890381, + "logps/chosen": -53.63972473144531, + "logps/rejected": -72.18154907226562, + "loss": 0.7829, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9069933891296387, + "rewards/margins": 4.168671607971191, + "rewards/rejected": -1.2616779804229736, + "step": 2780 + }, + { + "epoch": 0.7, + "grad_norm": 3.8337957859039307, + "learning_rate": 5.567259659013956e-06, + "logits/chosen": -0.27880412340164185, + "logits/rejected": -0.3142540156841278, + "logps/chosen": -60.69603729248047, + "logps/rejected": -102.84404754638672, + "loss": 0.779, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.086005687713623, + "rewards/margins": 4.67868185043335, + "rewards/rejected": -1.592676043510437, + "step": 2781 + }, + { + "epoch": 0.7, + "grad_norm": 4.116815090179443, + "learning_rate": 5.564656755288562e-06, + "logits/chosen": -0.28495633602142334, + "logits/rejected": -0.27936407923698425, + "logps/chosen": -52.4936408996582, + "logps/rejected": -94.66899108886719, + "loss": 0.7822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8408257961273193, + "rewards/margins": 3.787951946258545, + "rewards/rejected": -0.9471256732940674, + "step": 2782 + }, + { + "epoch": 0.7, + "grad_norm": 3.492387056350708, + "learning_rate": 5.562053696552704e-06, + "logits/chosen": -0.28294360637664795, + "logits/rejected": -0.3633377254009247, + "logps/chosen": -43.55708694458008, + "logps/rejected": -79.36543273925781, + "loss": 0.6692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939444065093994, + "rewards/margins": 4.8473944664001465, + "rewards/rejected": -1.9079508781433105, + "step": 2783 + }, + { + "epoch": 0.7, + "grad_norm": 3.4371471405029297, + "learning_rate": 5.5594504835209786e-06, + "logits/chosen": -0.24950578808784485, + "logits/rejected": -0.3617490530014038, + "logps/chosen": -54.423187255859375, + "logps/rejected": -63.18277359008789, + "loss": 0.7774, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.067469358444214, + "rewards/margins": 4.008002281188965, + "rewards/rejected": -0.9405327439308167, + "step": 2784 + }, + { + "epoch": 0.7, + "grad_norm": 3.606114387512207, + "learning_rate": 5.556847116908023e-06, + "logits/chosen": -0.24668700993061066, + "logits/rejected": -0.3688584864139557, + "logps/chosen": -59.73535919189453, + "logps/rejected": -76.47584533691406, + "loss": 0.7877, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8595056533813477, + "rewards/margins": 4.742863178253174, + "rewards/rejected": -1.883358120918274, + "step": 2785 + }, + { + "epoch": 0.7, + "grad_norm": 3.2733664512634277, + "learning_rate": 5.554243597428518e-06, + "logits/chosen": -0.22784526646137238, + "logits/rejected": -0.3504765033721924, + "logps/chosen": -53.82316207885742, + "logps/rejected": -81.01375579833984, + "loss": 0.6351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00105619430542, + "rewards/margins": 4.410263538360596, + "rewards/rejected": -1.4092075824737549, + "step": 2786 + }, + { + "epoch": 0.7, + "grad_norm": 5.536989212036133, + "learning_rate": 5.551639925797185e-06, + "logits/chosen": -0.338347852230072, + "logits/rejected": -0.4094947874546051, + "logps/chosen": -49.7553596496582, + "logps/rejected": -76.52046203613281, + "loss": 0.7459, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6891536712646484, + "rewards/margins": 4.094974994659424, + "rewards/rejected": -1.4058213233947754, + "step": 2787 + }, + { + "epoch": 0.7, + "grad_norm": 3.774416208267212, + "learning_rate": 5.549036102728791e-06, + "logits/chosen": -0.23652757704257965, + "logits/rejected": -0.35176023840904236, + "logps/chosen": -67.92544555664062, + "logps/rejected": -81.80760192871094, + "loss": 0.7897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8739423751831055, + "rewards/margins": 4.137165069580078, + "rewards/rejected": -1.2632228136062622, + "step": 2788 + }, + { + "epoch": 0.7, + "grad_norm": 3.7040464878082275, + "learning_rate": 5.546432128938138e-06, + "logits/chosen": -0.25235334038734436, + "logits/rejected": -0.39200645685195923, + "logps/chosen": -58.12028503417969, + "logps/rejected": -75.53691101074219, + "loss": 0.6959, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9229347705841064, + "rewards/margins": 4.074070930480957, + "rewards/rejected": -1.1511361598968506, + "step": 2789 + }, + { + "epoch": 0.7, + "grad_norm": 3.8628759384155273, + "learning_rate": 5.543828005140076e-06, + "logits/chosen": -0.1877691149711609, + "logits/rejected": -0.3273371458053589, + "logps/chosen": -57.09735870361328, + "logps/rejected": -78.38887786865234, + "loss": 0.7313, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6950995922088623, + "rewards/margins": 4.9874420166015625, + "rewards/rejected": -2.2923426628112793, + "step": 2790 + }, + { + "epoch": 0.7, + "grad_norm": 6.147007465362549, + "learning_rate": 5.54122373204949e-06, + "logits/chosen": -0.28166550397872925, + "logits/rejected": -0.3303152322769165, + "logps/chosen": -57.89848327636719, + "logps/rejected": -81.78709411621094, + "loss": 0.9474, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.709876537322998, + "rewards/margins": 3.0579192638397217, + "rewards/rejected": -0.34804239869117737, + "step": 2791 + }, + { + "epoch": 0.7, + "grad_norm": 9.165631294250488, + "learning_rate": 5.538619310381313e-06, + "logits/chosen": -0.30193576216697693, + "logits/rejected": -0.4157409965991974, + "logps/chosen": -55.85403060913086, + "logps/rejected": -74.98136901855469, + "loss": 0.826, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.692692995071411, + "rewards/margins": 3.7496514320373535, + "rewards/rejected": -1.0569586753845215, + "step": 2792 + }, + { + "epoch": 0.7, + "grad_norm": 5.743026256561279, + "learning_rate": 5.53601474085051e-06, + "logits/chosen": -0.23226937651634216, + "logits/rejected": -0.31691065430641174, + "logps/chosen": -61.162445068359375, + "logps/rejected": -90.51185607910156, + "loss": 0.7236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0498743057250977, + "rewards/margins": 3.7321972846984863, + "rewards/rejected": -0.6823229193687439, + "step": 2793 + }, + { + "epoch": 0.7, + "grad_norm": 4.543135643005371, + "learning_rate": 5.533410024172095e-06, + "logits/chosen": -0.329474538564682, + "logits/rejected": -0.428413063287735, + "logps/chosen": -62.04939651489258, + "logps/rejected": -78.18106842041016, + "loss": 0.8591, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9989874362945557, + "rewards/margins": 4.187645435333252, + "rewards/rejected": -1.1886577606201172, + "step": 2794 + }, + { + "epoch": 0.7, + "grad_norm": 2.5719151496887207, + "learning_rate": 5.53080516106112e-06, + "logits/chosen": -0.292829304933548, + "logits/rejected": -0.4590306878089905, + "logps/chosen": -58.57171630859375, + "logps/rejected": -73.2189712524414, + "loss": 0.6414, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880392551422119, + "rewards/margins": 5.092073440551758, + "rewards/rejected": -2.2116806507110596, + "step": 2795 + }, + { + "epoch": 0.7, + "grad_norm": 5.279204368591309, + "learning_rate": 5.528200152232674e-06, + "logits/chosen": -0.28117913007736206, + "logits/rejected": -0.38599616289138794, + "logps/chosen": -64.27201080322266, + "logps/rejected": -87.91600799560547, + "loss": 0.891, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.551151990890503, + "rewards/margins": 4.499148368835449, + "rewards/rejected": -1.9479963779449463, + "step": 2796 + }, + { + "epoch": 0.7, + "grad_norm": 8.587461471557617, + "learning_rate": 5.525594998401887e-06, + "logits/chosen": -0.24456515908241272, + "logits/rejected": -0.36271655559539795, + "logps/chosen": -59.73145294189453, + "logps/rejected": -85.04736328125, + "loss": 0.8739, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8675408363342285, + "rewards/margins": 4.065208435058594, + "rewards/rejected": -1.197667121887207, + "step": 2797 + }, + { + "epoch": 0.7, + "grad_norm": 4.238657474517822, + "learning_rate": 5.522989700283933e-06, + "logits/chosen": -0.2265099585056305, + "logits/rejected": -0.3337503671646118, + "logps/chosen": -48.350074768066406, + "logps/rejected": -69.58074951171875, + "loss": 0.7261, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.985874652862549, + "rewards/margins": 3.6550824642181396, + "rewards/rejected": -0.6692078709602356, + "step": 2798 + }, + { + "epoch": 0.7, + "grad_norm": 5.308276176452637, + "learning_rate": 5.520384258594019e-06, + "logits/chosen": -0.3006325960159302, + "logits/rejected": -0.37714993953704834, + "logps/chosen": -55.40867233276367, + "logps/rejected": -91.3948974609375, + "loss": 0.771, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8902740478515625, + "rewards/margins": 3.686544418334961, + "rewards/rejected": -0.7962703704833984, + "step": 2799 + }, + { + "epoch": 0.7, + "grad_norm": 2.897186517715454, + "learning_rate": 5.5177786740474e-06, + "logits/chosen": -0.27778708934783936, + "logits/rejected": -0.34882915019989014, + "logps/chosen": -50.32892608642578, + "logps/rejected": -93.47750854492188, + "loss": 0.5564, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8708279132843018, + "rewards/margins": 4.6491594314575195, + "rewards/rejected": -1.7783315181732178, + "step": 2800 + }, + { + "epoch": 0.7, + "grad_norm": 4.823211669921875, + "learning_rate": 5.515172947359361e-06, + "logits/chosen": -0.20814242959022522, + "logits/rejected": -0.316440612077713, + "logps/chosen": -70.60221099853516, + "logps/rejected": -89.52676391601562, + "loss": 0.8188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8689277172088623, + "rewards/margins": 4.095756530761719, + "rewards/rejected": -1.2268288135528564, + "step": 2801 + }, + { + "epoch": 0.7, + "grad_norm": 5.483523845672607, + "learning_rate": 5.512567079245231e-06, + "logits/chosen": -0.2593022584915161, + "logits/rejected": -0.3828117549419403, + "logps/chosen": -52.616859436035156, + "logps/rejected": -76.97686004638672, + "loss": 0.6635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.106812000274658, + "rewards/margins": 4.514823913574219, + "rewards/rejected": -1.4080119132995605, + "step": 2802 + }, + { + "epoch": 0.7, + "grad_norm": 4.882467746734619, + "learning_rate": 5.509961070420377e-06, + "logits/chosen": -0.21450872719287872, + "logits/rejected": -0.3293997347354889, + "logps/chosen": -68.10994720458984, + "logps/rejected": -74.28157043457031, + "loss": 0.8162, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.086728811264038, + "rewards/margins": 4.128758430480957, + "rewards/rejected": -1.0420293807983398, + "step": 2803 + }, + { + "epoch": 0.7, + "grad_norm": 4.660257339477539, + "learning_rate": 5.507354921600205e-06, + "logits/chosen": -0.2305106222629547, + "logits/rejected": -0.3217957317829132, + "logps/chosen": -52.73008728027344, + "logps/rejected": -80.39676666259766, + "loss": 0.7948, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.071315050125122, + "rewards/margins": 4.0649847984313965, + "rewards/rejected": -0.9936696290969849, + "step": 2804 + }, + { + "epoch": 0.7, + "grad_norm": 6.95418119430542, + "learning_rate": 5.504748633500161e-06, + "logits/chosen": -0.3034842610359192, + "logits/rejected": -0.4107702970504761, + "logps/chosen": -63.71675491333008, + "logps/rejected": -90.4009780883789, + "loss": 0.7157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2330570220947266, + "rewards/margins": 5.562137126922607, + "rewards/rejected": -2.329080104827881, + "step": 2805 + }, + { + "epoch": 0.7, + "grad_norm": 5.953375339508057, + "learning_rate": 5.5021422068357244e-06, + "logits/chosen": -0.27348798513412476, + "logits/rejected": -0.33458077907562256, + "logps/chosen": -54.5091438293457, + "logps/rejected": -64.45844268798828, + "loss": 0.9335, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.020359992980957, + "rewards/margins": 2.7977819442749023, + "rewards/rejected": 0.22257791459560394, + "step": 2806 + }, + { + "epoch": 0.7, + "grad_norm": 4.389054298400879, + "learning_rate": 5.499535642322415e-06, + "logits/chosen": -0.21855047345161438, + "logits/rejected": -0.35672304034233093, + "logps/chosen": -57.185462951660156, + "logps/rejected": -82.35564422607422, + "loss": 0.6854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.676280975341797, + "rewards/margins": 4.889578819274902, + "rewards/rejected": -2.2132973670959473, + "step": 2807 + }, + { + "epoch": 0.7, + "grad_norm": 3.3001444339752197, + "learning_rate": 5.496928940675795e-06, + "logits/chosen": -0.27152836322784424, + "logits/rejected": -0.3439643979072571, + "logps/chosen": -59.2743034362793, + "logps/rejected": -80.38017272949219, + "loss": 0.6928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055595874786377, + "rewards/margins": 4.341270446777344, + "rewards/rejected": -1.2856743335723877, + "step": 2808 + }, + { + "epoch": 0.7, + "grad_norm": 3.7320775985717773, + "learning_rate": 5.494322102611457e-06, + "logits/chosen": -0.21756437420845032, + "logits/rejected": -0.3471546471118927, + "logps/chosen": -65.23038482666016, + "logps/rejected": -78.21099090576172, + "loss": 0.7475, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8246750831604004, + "rewards/margins": 3.5800461769104004, + "rewards/rejected": -0.7553708553314209, + "step": 2809 + }, + { + "epoch": 0.7, + "grad_norm": 4.365367889404297, + "learning_rate": 5.491715128845037e-06, + "logits/chosen": -0.2409743070602417, + "logits/rejected": -0.3393404483795166, + "logps/chosen": -56.726463317871094, + "logps/rejected": -81.16034698486328, + "loss": 0.8435, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.888200283050537, + "rewards/margins": 4.314857482910156, + "rewards/rejected": -1.4266573190689087, + "step": 2810 + }, + { + "epoch": 0.7, + "grad_norm": 4.1363067626953125, + "learning_rate": 5.4891080200922e-06, + "logits/chosen": -0.18603307008743286, + "logits/rejected": -0.32263273000717163, + "logps/chosen": -49.29545593261719, + "logps/rejected": -79.8858871459961, + "loss": 0.7141, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.817422866821289, + "rewards/margins": 4.433371543884277, + "rewards/rejected": -1.6159486770629883, + "step": 2811 + }, + { + "epoch": 0.7, + "grad_norm": 4.477903842926025, + "learning_rate": 5.486500777068659e-06, + "logits/chosen": -0.259769469499588, + "logits/rejected": -0.38670530915260315, + "logps/chosen": -71.63240051269531, + "logps/rejected": -77.22180938720703, + "loss": 0.8352, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7631444931030273, + "rewards/margins": 4.30509090423584, + "rewards/rejected": -1.5419464111328125, + "step": 2812 + }, + { + "epoch": 0.7, + "grad_norm": 5.270361423492432, + "learning_rate": 5.4838934004901575e-06, + "logits/chosen": -0.19784784317016602, + "logits/rejected": -0.23484230041503906, + "logps/chosen": -54.263916015625, + "logps/rejected": -86.60733032226562, + "loss": 0.8511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.739388942718506, + "rewards/margins": 2.900812864303589, + "rewards/rejected": -0.16142362356185913, + "step": 2813 + }, + { + "epoch": 0.7, + "grad_norm": 3.416290760040283, + "learning_rate": 5.4812858910724765e-06, + "logits/chosen": -0.335374116897583, + "logits/rejected": -0.4170306324958801, + "logps/chosen": -47.907981872558594, + "logps/rejected": -86.14915466308594, + "loss": 0.7378, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1398797035217285, + "rewards/margins": 4.9982500076293945, + "rewards/rejected": -1.858370065689087, + "step": 2814 + }, + { + "epoch": 0.7, + "grad_norm": 7.273072242736816, + "learning_rate": 5.478678249531431e-06, + "logits/chosen": -0.24819304049015045, + "logits/rejected": -0.3577951192855835, + "logps/chosen": -85.75377655029297, + "logps/rejected": -98.46204376220703, + "loss": 1.0141, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.2878432273864746, + "rewards/margins": 4.111649036407471, + "rewards/rejected": -1.823805570602417, + "step": 2815 + }, + { + "epoch": 0.7, + "grad_norm": 4.818975925445557, + "learning_rate": 5.47607047658288e-06, + "logits/chosen": -0.29149338603019714, + "logits/rejected": -0.41278040409088135, + "logps/chosen": -62.01109313964844, + "logps/rejected": -81.96543884277344, + "loss": 0.6895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.971334457397461, + "rewards/margins": 4.646450519561768, + "rewards/rejected": -1.6751165390014648, + "step": 2816 + }, + { + "epoch": 0.7, + "grad_norm": 5.725890159606934, + "learning_rate": 5.473462572942707e-06, + "logits/chosen": -0.2663618326187134, + "logits/rejected": -0.3972441554069519, + "logps/chosen": -60.07971954345703, + "logps/rejected": -64.92943572998047, + "loss": 0.7729, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.208387851715088, + "rewards/margins": 4.368744850158691, + "rewards/rejected": -1.1603573560714722, + "step": 2817 + }, + { + "epoch": 0.7, + "grad_norm": 5.707344055175781, + "learning_rate": 5.4708545393268445e-06, + "logits/chosen": -0.21544219553470612, + "logits/rejected": -0.2655305564403534, + "logps/chosen": -59.11492919921875, + "logps/rejected": -97.98969268798828, + "loss": 0.8877, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8991377353668213, + "rewards/margins": 3.7881689071655273, + "rewards/rejected": -0.8890313506126404, + "step": 2818 + }, + { + "epoch": 0.71, + "grad_norm": 1.9851175546646118, + "learning_rate": 5.468246376451248e-06, + "logits/chosen": -0.3029557466506958, + "logits/rejected": -0.44445323944091797, + "logps/chosen": -41.606319427490234, + "logps/rejected": -75.7922592163086, + "loss": 0.5008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.869304895401001, + "rewards/margins": 5.660238265991211, + "rewards/rejected": -2.790933609008789, + "step": 2819 + }, + { + "epoch": 0.71, + "grad_norm": 10.848751068115234, + "learning_rate": 5.465638085031918e-06, + "logits/chosen": -0.2398422360420227, + "logits/rejected": -0.4014996886253357, + "logps/chosen": -72.90376281738281, + "logps/rejected": -85.78473663330078, + "loss": 0.8875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9958229064941406, + "rewards/margins": 4.300818920135498, + "rewards/rejected": -1.3049960136413574, + "step": 2820 + }, + { + "epoch": 0.71, + "grad_norm": 5.580363750457764, + "learning_rate": 5.4630296657848865e-06, + "logits/chosen": -0.26255568861961365, + "logits/rejected": -0.2939246594905853, + "logps/chosen": -58.820011138916016, + "logps/rejected": -91.12340545654297, + "loss": 0.8016, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7973577976226807, + "rewards/margins": 4.693004608154297, + "rewards/rejected": -1.8956472873687744, + "step": 2821 + }, + { + "epoch": 0.71, + "grad_norm": 3.406134843826294, + "learning_rate": 5.4604211194262205e-06, + "logits/chosen": -0.19432717561721802, + "logits/rejected": -0.27919524908065796, + "logps/chosen": -57.482112884521484, + "logps/rejected": -85.80217742919922, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8406243324279785, + "rewards/margins": 4.441525936126709, + "rewards/rejected": -1.6009019613265991, + "step": 2822 + }, + { + "epoch": 0.71, + "grad_norm": 4.474498748779297, + "learning_rate": 5.457812446672021e-06, + "logits/chosen": -0.28886574506759644, + "logits/rejected": -0.4579761028289795, + "logps/chosen": -61.49492645263672, + "logps/rejected": -62.1033935546875, + "loss": 0.915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.098904609680176, + "rewards/margins": 4.248169422149658, + "rewards/rejected": -1.149264931678772, + "step": 2823 + }, + { + "epoch": 0.71, + "grad_norm": 8.720470428466797, + "learning_rate": 5.4552036482384275e-06, + "logits/chosen": -0.2603107988834381, + "logits/rejected": -0.3873884975910187, + "logps/chosen": -66.74283599853516, + "logps/rejected": -76.53473663330078, + "loss": 1.0445, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3627524375915527, + "rewards/margins": 3.5412538051605225, + "rewards/rejected": -1.1785016059875488, + "step": 2824 + }, + { + "epoch": 0.71, + "grad_norm": 14.73303508758545, + "learning_rate": 5.452594724841608e-06, + "logits/chosen": -0.23151686787605286, + "logits/rejected": -0.2814774215221405, + "logps/chosen": -61.23655700683594, + "logps/rejected": -81.35974884033203, + "loss": 1.0328, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7346982955932617, + "rewards/margins": 3.0532944202423096, + "rewards/rejected": -0.3185960054397583, + "step": 2825 + }, + { + "epoch": 0.71, + "grad_norm": 7.740044593811035, + "learning_rate": 5.449985677197772e-06, + "logits/chosen": -0.259524405002594, + "logits/rejected": -0.39982351660728455, + "logps/chosen": -61.136417388916016, + "logps/rejected": -87.41976165771484, + "loss": 0.8191, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8212685585021973, + "rewards/margins": 4.755370616912842, + "rewards/rejected": -1.934101939201355, + "step": 2826 + }, + { + "epoch": 0.71, + "grad_norm": 6.067266464233398, + "learning_rate": 5.447376506023158e-06, + "logits/chosen": -0.1608574539422989, + "logits/rejected": -0.22434446215629578, + "logps/chosen": -62.2615852355957, + "logps/rejected": -98.47723388671875, + "loss": 0.8639, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.599400520324707, + "rewards/margins": 3.905367851257324, + "rewards/rejected": -1.3059675693511963, + "step": 2827 + }, + { + "epoch": 0.71, + "grad_norm": 5.48815393447876, + "learning_rate": 5.444767212034039e-06, + "logits/chosen": -0.3183467984199524, + "logits/rejected": -0.42889559268951416, + "logps/chosen": -66.24181365966797, + "logps/rejected": -65.07035827636719, + "loss": 0.8648, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.023308038711548, + "rewards/margins": 4.365368843078613, + "rewards/rejected": -1.3420604467391968, + "step": 2828 + }, + { + "epoch": 0.71, + "grad_norm": 5.1385416984558105, + "learning_rate": 5.442157795946722e-06, + "logits/chosen": -0.2547774314880371, + "logits/rejected": -0.4142606854438782, + "logps/chosen": -61.02204895019531, + "logps/rejected": -73.9490966796875, + "loss": 0.8225, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.64823579788208, + "rewards/margins": 4.116336345672607, + "rewards/rejected": -1.468100905418396, + "step": 2829 + }, + { + "epoch": 0.71, + "grad_norm": 3.331733226776123, + "learning_rate": 5.439548258477549e-06, + "logits/chosen": -0.2530086636543274, + "logits/rejected": -0.4018007218837738, + "logps/chosen": -57.41359329223633, + "logps/rejected": -67.56887817382812, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0821151733398438, + "rewards/margins": 4.309067726135254, + "rewards/rejected": -1.2269525527954102, + "step": 2830 + }, + { + "epoch": 0.71, + "grad_norm": 3.981607437133789, + "learning_rate": 5.436938600342896e-06, + "logits/chosen": -0.30487313866615295, + "logits/rejected": -0.3924633264541626, + "logps/chosen": -61.05720138549805, + "logps/rejected": -87.30960845947266, + "loss": 0.7793, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9253928661346436, + "rewards/margins": 4.711539268493652, + "rewards/rejected": -1.7861460447311401, + "step": 2831 + }, + { + "epoch": 0.71, + "grad_norm": 8.2328519821167, + "learning_rate": 5.434328822259167e-06, + "logits/chosen": -0.3053210377693176, + "logits/rejected": -0.4012514650821686, + "logps/chosen": -65.65404510498047, + "logps/rejected": -71.91615295410156, + "loss": 0.8851, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.757678508758545, + "rewards/margins": 3.5380361080169678, + "rewards/rejected": -0.7803574204444885, + "step": 2832 + }, + { + "epoch": 0.71, + "grad_norm": 2.334660291671753, + "learning_rate": 5.431718924942806e-06, + "logits/chosen": -0.22977003455162048, + "logits/rejected": -0.3932211399078369, + "logps/chosen": -71.60643768310547, + "logps/rejected": -85.261474609375, + "loss": 0.6438, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.890298843383789, + "rewards/margins": 5.131758213043213, + "rewards/rejected": -2.241459608078003, + "step": 2833 + }, + { + "epoch": 0.71, + "grad_norm": 4.327956676483154, + "learning_rate": 5.429108909110283e-06, + "logits/chosen": -0.3886215090751648, + "logits/rejected": -0.4780173599720001, + "logps/chosen": -47.821083068847656, + "logps/rejected": -79.76942443847656, + "loss": 0.6722, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.994901180267334, + "rewards/margins": 4.9275641441345215, + "rewards/rejected": -1.9326624870300293, + "step": 2834 + }, + { + "epoch": 0.71, + "grad_norm": 3.743079662322998, + "learning_rate": 5.426498775478105e-06, + "logits/chosen": -0.3725106716156006, + "logits/rejected": -0.4204014837741852, + "logps/chosen": -38.70680618286133, + "logps/rejected": -77.89148712158203, + "loss": 0.7278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8603994846343994, + "rewards/margins": 4.075432300567627, + "rewards/rejected": -1.2150330543518066, + "step": 2835 + }, + { + "epoch": 0.71, + "grad_norm": 6.1573486328125, + "learning_rate": 5.423888524762812e-06, + "logits/chosen": -0.26795515418052673, + "logits/rejected": -0.34765762090682983, + "logps/chosen": -63.54472732543945, + "logps/rejected": -73.36591339111328, + "loss": 0.924, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.601649761199951, + "rewards/margins": 3.292440176010132, + "rewards/rejected": -0.6907902956008911, + "step": 2836 + }, + { + "epoch": 0.71, + "grad_norm": 4.213156223297119, + "learning_rate": 5.42127815768097e-06, + "logits/chosen": -0.3127574622631073, + "logits/rejected": -0.4481913447380066, + "logps/chosen": -50.38462829589844, + "logps/rejected": -64.51715087890625, + "loss": 0.7999, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8899171352386475, + "rewards/margins": 3.9301626682281494, + "rewards/rejected": -1.0402456521987915, + "step": 2837 + }, + { + "epoch": 0.71, + "grad_norm": 4.053149223327637, + "learning_rate": 5.418667674949183e-06, + "logits/chosen": -0.32230156660079956, + "logits/rejected": -0.4752282500267029, + "logps/chosen": -70.25773620605469, + "logps/rejected": -71.41351318359375, + "loss": 0.7776, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.941629648208618, + "rewards/margins": 4.70877742767334, + "rewards/rejected": -1.7671481370925903, + "step": 2838 + }, + { + "epoch": 0.71, + "grad_norm": 2.8165481090545654, + "learning_rate": 5.416057077284086e-06, + "logits/chosen": -0.2994973063468933, + "logits/rejected": -0.33214011788368225, + "logps/chosen": -50.979454040527344, + "logps/rejected": -82.3619613647461, + "loss": 0.7203, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.993726968765259, + "rewards/margins": 4.467446327209473, + "rewards/rejected": -1.473719596862793, + "step": 2839 + }, + { + "epoch": 0.71, + "grad_norm": 3.9576611518859863, + "learning_rate": 5.413446365402344e-06, + "logits/chosen": -0.20630550384521484, + "logits/rejected": -0.3400459885597229, + "logps/chosen": -54.27949905395508, + "logps/rejected": -64.76434326171875, + "loss": 0.6844, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.840742349624634, + "rewards/margins": 4.496584415435791, + "rewards/rejected": -1.6558425426483154, + "step": 2840 + }, + { + "epoch": 0.71, + "grad_norm": 4.9635443687438965, + "learning_rate": 5.410835540020654e-06, + "logits/chosen": -0.22278454899787903, + "logits/rejected": -0.30155855417251587, + "logps/chosen": -71.50330352783203, + "logps/rejected": -87.00068664550781, + "loss": 0.8979, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9916656017303467, + "rewards/margins": 3.6288111209869385, + "rewards/rejected": -0.6371455192565918, + "step": 2841 + }, + { + "epoch": 0.71, + "grad_norm": 4.3918375968933105, + "learning_rate": 5.40822460185574e-06, + "logits/chosen": -0.19237996637821198, + "logits/rejected": -0.39404839277267456, + "logps/chosen": -65.91226959228516, + "logps/rejected": -76.07504272460938, + "loss": 0.7373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8080551624298096, + "rewards/margins": 3.8010201454162598, + "rewards/rejected": -0.9929651021957397, + "step": 2842 + }, + { + "epoch": 0.71, + "grad_norm": 3.2697222232818604, + "learning_rate": 5.405613551624367e-06, + "logits/chosen": -0.2698063850402832, + "logits/rejected": -0.39725610613822937, + "logps/chosen": -51.844661712646484, + "logps/rejected": -80.5174789428711, + "loss": 0.6764, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1266376972198486, + "rewards/margins": 5.05477237701416, + "rewards/rejected": -1.928134799003601, + "step": 2843 + }, + { + "epoch": 0.71, + "grad_norm": 3.676898956298828, + "learning_rate": 5.403002390043319e-06, + "logits/chosen": -0.3830936849117279, + "logits/rejected": -0.4554797410964966, + "logps/chosen": -47.87030792236328, + "logps/rejected": -83.79545593261719, + "loss": 0.7409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.766765594482422, + "rewards/margins": 3.9553046226501465, + "rewards/rejected": -1.1885392665863037, + "step": 2844 + }, + { + "epoch": 0.71, + "grad_norm": 3.327007293701172, + "learning_rate": 5.400391117829421e-06, + "logits/chosen": -0.3040286600589752, + "logits/rejected": -0.40618279576301575, + "logps/chosen": -55.20924758911133, + "logps/rejected": -79.91937255859375, + "loss": 0.8228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9850356578826904, + "rewards/margins": 4.545913219451904, + "rewards/rejected": -1.5608773231506348, + "step": 2845 + }, + { + "epoch": 0.71, + "grad_norm": 9.1427583694458, + "learning_rate": 5.3977797356995206e-06, + "logits/chosen": -0.26977235078811646, + "logits/rejected": -0.429146945476532, + "logps/chosen": -64.8375244140625, + "logps/rejected": -78.7135009765625, + "loss": 0.8507, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.519667625427246, + "rewards/margins": 3.9547970294952393, + "rewards/rejected": -1.4351292848587036, + "step": 2846 + }, + { + "epoch": 0.71, + "grad_norm": 4.338391304016113, + "learning_rate": 5.3951682443705e-06, + "logits/chosen": -0.3603042960166931, + "logits/rejected": -0.4742734432220459, + "logps/chosen": -46.15153884887695, + "logps/rejected": -62.62230682373047, + "loss": 0.7445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8550050258636475, + "rewards/margins": 4.4965410232543945, + "rewards/rejected": -1.6415361166000366, + "step": 2847 + }, + { + "epoch": 0.71, + "grad_norm": 4.623550891876221, + "learning_rate": 5.3925566445592684e-06, + "logits/chosen": -0.21867482364177704, + "logits/rejected": -0.33032143115997314, + "logps/chosen": -66.01785278320312, + "logps/rejected": -76.56596374511719, + "loss": 0.7816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8489344120025635, + "rewards/margins": 4.651574611663818, + "rewards/rejected": -1.802640438079834, + "step": 2848 + }, + { + "epoch": 0.71, + "grad_norm": 5.593756675720215, + "learning_rate": 5.389944936982767e-06, + "logits/chosen": -0.34626829624176025, + "logits/rejected": -0.4226139783859253, + "logps/chosen": -70.42416381835938, + "logps/rejected": -79.68898010253906, + "loss": 0.9814, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8403360843658447, + "rewards/margins": 3.6735446453094482, + "rewards/rejected": -0.8332079648971558, + "step": 2849 + }, + { + "epoch": 0.71, + "grad_norm": 5.80548095703125, + "learning_rate": 5.387333122357966e-06, + "logits/chosen": -0.2852804958820343, + "logits/rejected": -0.3694967031478882, + "logps/chosen": -60.50685501098633, + "logps/rejected": -83.9622802734375, + "loss": 0.8984, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7988526821136475, + "rewards/margins": 3.8766374588012695, + "rewards/rejected": -1.0777850151062012, + "step": 2850 + }, + { + "epoch": 0.71, + "grad_norm": 7.177895545959473, + "learning_rate": 5.384721201401867e-06, + "logits/chosen": -0.2040184587240219, + "logits/rejected": -0.3161970376968384, + "logps/chosen": -60.87141418457031, + "logps/rejected": -64.14421844482422, + "loss": 0.853, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.066718339920044, + "rewards/margins": 3.5893824100494385, + "rewards/rejected": -0.5226641297340393, + "step": 2851 + }, + { + "epoch": 0.71, + "grad_norm": 14.032357215881348, + "learning_rate": 5.382109174831493e-06, + "logits/chosen": -0.2206079214811325, + "logits/rejected": -0.36816439032554626, + "logps/chosen": -56.874244689941406, + "logps/rejected": -84.4105224609375, + "loss": 0.7073, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.848782539367676, + "rewards/margins": 5.118204593658447, + "rewards/rejected": -2.2694225311279297, + "step": 2852 + }, + { + "epoch": 0.71, + "grad_norm": 3.9348411560058594, + "learning_rate": 5.3794970433639085e-06, + "logits/chosen": -0.3393353819847107, + "logits/rejected": -0.44314906001091003, + "logps/chosen": -57.83454513549805, + "logps/rejected": -69.77547454833984, + "loss": 0.8487, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.164503574371338, + "rewards/margins": 4.33046817779541, + "rewards/rejected": -1.1659647226333618, + "step": 2853 + }, + { + "epoch": 0.71, + "grad_norm": 6.868438720703125, + "learning_rate": 5.376884807716195e-06, + "logits/chosen": -0.25426122546195984, + "logits/rejected": -0.364361435174942, + "logps/chosen": -61.627410888671875, + "logps/rejected": -96.55684661865234, + "loss": 0.8743, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.985461711883545, + "rewards/margins": 4.187994480133057, + "rewards/rejected": -1.2025330066680908, + "step": 2854 + }, + { + "epoch": 0.71, + "grad_norm": 6.22898006439209, + "learning_rate": 5.374272468605469e-06, + "logits/chosen": -0.24857386946678162, + "logits/rejected": -0.37844282388687134, + "logps/chosen": -51.48332977294922, + "logps/rejected": -77.01360321044922, + "loss": 0.7989, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.958341360092163, + "rewards/margins": 4.1212544441223145, + "rewards/rejected": -1.16291344165802, + "step": 2855 + }, + { + "epoch": 0.71, + "grad_norm": 5.958951950073242, + "learning_rate": 5.3716600267488764e-06, + "logits/chosen": -0.1484205573797226, + "logits/rejected": -0.181283101439476, + "logps/chosen": -71.51872253417969, + "logps/rejected": -90.43760681152344, + "loss": 0.9483, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8384158611297607, + "rewards/margins": 3.4546058177948, + "rewards/rejected": -0.6161900162696838, + "step": 2856 + }, + { + "epoch": 0.71, + "grad_norm": 6.031160354614258, + "learning_rate": 5.3690474828635855e-06, + "logits/chosen": -0.33708587288856506, + "logits/rejected": -0.4451373517513275, + "logps/chosen": -93.4432601928711, + "logps/rejected": -76.97809600830078, + "loss": 1.0004, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9244754314422607, + "rewards/margins": 4.286827087402344, + "rewards/rejected": -1.3623512983322144, + "step": 2857 + }, + { + "epoch": 0.71, + "grad_norm": 5.873049736022949, + "learning_rate": 5.366434837666795e-06, + "logits/chosen": -0.2956219017505646, + "logits/rejected": -0.4516445994377136, + "logps/chosen": -58.86860656738281, + "logps/rejected": -76.64334869384766, + "loss": 0.7307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9221179485321045, + "rewards/margins": 4.0132670402526855, + "rewards/rejected": -1.091148853302002, + "step": 2858 + }, + { + "epoch": 0.72, + "grad_norm": 4.0987019538879395, + "learning_rate": 5.363822091875736e-06, + "logits/chosen": -0.24827075004577637, + "logits/rejected": -0.3060002624988556, + "logps/chosen": -59.189247131347656, + "logps/rejected": -93.59523010253906, + "loss": 0.7696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8550360202789307, + "rewards/margins": 4.463405132293701, + "rewards/rejected": -1.6083687543869019, + "step": 2859 + }, + { + "epoch": 0.72, + "grad_norm": 6.926389217376709, + "learning_rate": 5.361209246207662e-06, + "logits/chosen": -0.2218908667564392, + "logits/rejected": -0.2472648173570633, + "logps/chosen": -59.88911437988281, + "logps/rejected": -100.41836547851562, + "loss": 0.8093, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0052199363708496, + "rewards/margins": 3.2951204776763916, + "rewards/rejected": -0.28990086913108826, + "step": 2860 + }, + { + "epoch": 0.72, + "grad_norm": 6.560847759246826, + "learning_rate": 5.358596301379858e-06, + "logits/chosen": -0.23469077050685883, + "logits/rejected": -0.3667217493057251, + "logps/chosen": -62.63316345214844, + "logps/rejected": -76.23602294921875, + "loss": 0.8467, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6221771240234375, + "rewards/margins": 4.0757880210876465, + "rewards/rejected": -1.4536112546920776, + "step": 2861 + }, + { + "epoch": 0.72, + "grad_norm": 3.0321767330169678, + "learning_rate": 5.355983258109629e-06, + "logits/chosen": -0.28431832790374756, + "logits/rejected": -0.3853820860385895, + "logps/chosen": -57.53982925415039, + "logps/rejected": -83.17699432373047, + "loss": 0.6384, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.828728675842285, + "rewards/margins": 5.292852401733398, + "rewards/rejected": -2.4641237258911133, + "step": 2862 + }, + { + "epoch": 0.72, + "grad_norm": 4.7938127517700195, + "learning_rate": 5.353370117114315e-06, + "logits/chosen": -0.1805281937122345, + "logits/rejected": -0.31822261214256287, + "logps/chosen": -68.76099395751953, + "logps/rejected": -85.7876205444336, + "loss": 0.8314, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7081761360168457, + "rewards/margins": 4.482479572296143, + "rewards/rejected": -1.7743031978607178, + "step": 2863 + }, + { + "epoch": 0.72, + "grad_norm": 3.9916090965270996, + "learning_rate": 5.350756879111278e-06, + "logits/chosen": -0.3218843638896942, + "logits/rejected": -0.4761596620082855, + "logps/chosen": -63.91004943847656, + "logps/rejected": -65.86100769042969, + "loss": 0.8312, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.044168710708618, + "rewards/margins": 4.6388936042785645, + "rewards/rejected": -1.594724416732788, + "step": 2864 + }, + { + "epoch": 0.72, + "grad_norm": 5.131894111633301, + "learning_rate": 5.348143544817909e-06, + "logits/chosen": -0.4147537648677826, + "logits/rejected": -0.4689149558544159, + "logps/chosen": -57.8258171081543, + "logps/rejected": -84.25779724121094, + "loss": 0.8298, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.79414701461792, + "rewards/margins": 4.687534332275391, + "rewards/rejected": -1.8933876752853394, + "step": 2865 + }, + { + "epoch": 0.72, + "grad_norm": 2.1521453857421875, + "learning_rate": 5.345530114951625e-06, + "logits/chosen": -0.32249903678894043, + "logits/rejected": -0.454475998878479, + "logps/chosen": -52.29509735107422, + "logps/rejected": -70.07410430908203, + "loss": 0.6366, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176203727722168, + "rewards/margins": 4.913479328155518, + "rewards/rejected": -1.7372760772705078, + "step": 2866 + }, + { + "epoch": 0.72, + "grad_norm": 3.788524627685547, + "learning_rate": 5.342916590229868e-06, + "logits/chosen": -0.3316941559314728, + "logits/rejected": -0.3520967364311218, + "logps/chosen": -48.99536895751953, + "logps/rejected": -102.2657699584961, + "loss": 0.6825, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3534111976623535, + "rewards/margins": 5.209319591522217, + "rewards/rejected": -1.8559083938598633, + "step": 2867 + }, + { + "epoch": 0.72, + "grad_norm": 6.453124523162842, + "learning_rate": 5.340302971370108e-06, + "logits/chosen": -0.3341631293296814, + "logits/rejected": -0.3758694529533386, + "logps/chosen": -49.871952056884766, + "logps/rejected": -72.6478271484375, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1866455078125, + "rewards/margins": 3.9767231941223145, + "rewards/rejected": -0.7900773882865906, + "step": 2868 + }, + { + "epoch": 0.72, + "grad_norm": 8.373204231262207, + "learning_rate": 5.337689259089839e-06, + "logits/chosen": -0.25841182470321655, + "logits/rejected": -0.32927384972572327, + "logps/chosen": -52.200927734375, + "logps/rejected": -88.99071502685547, + "loss": 0.7328, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9909708499908447, + "rewards/margins": 4.473287105560303, + "rewards/rejected": -1.482316493988037, + "step": 2869 + }, + { + "epoch": 0.72, + "grad_norm": 4.064604759216309, + "learning_rate": 5.3350754541065805e-06, + "logits/chosen": -0.2315141260623932, + "logits/rejected": -0.3877009451389313, + "logps/chosen": -58.58448028564453, + "logps/rejected": -70.08878326416016, + "loss": 0.7042, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7633914947509766, + "rewards/margins": 4.181623935699463, + "rewards/rejected": -1.4182322025299072, + "step": 2870 + }, + { + "epoch": 0.72, + "grad_norm": 4.756444931030273, + "learning_rate": 5.332461557137882e-06, + "logits/chosen": -0.27241480350494385, + "logits/rejected": -0.3210923373699188, + "logps/chosen": -54.781551361083984, + "logps/rejected": -90.90523529052734, + "loss": 0.7445, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9930148124694824, + "rewards/margins": 4.653497219085693, + "rewards/rejected": -1.6604828834533691, + "step": 2871 + }, + { + "epoch": 0.72, + "grad_norm": 6.504068851470947, + "learning_rate": 5.329847568901311e-06, + "logits/chosen": -0.1847158819437027, + "logits/rejected": -0.34282156825065613, + "logps/chosen": -56.30391311645508, + "logps/rejected": -78.956298828125, + "loss": 0.6776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7792444229125977, + "rewards/margins": 4.401383399963379, + "rewards/rejected": -1.6221387386322021, + "step": 2872 + }, + { + "epoch": 0.72, + "grad_norm": 7.104898452758789, + "learning_rate": 5.327233490114464e-06, + "logits/chosen": -0.2749258875846863, + "logits/rejected": -0.4704415202140808, + "logps/chosen": -67.77266693115234, + "logps/rejected": -68.27449035644531, + "loss": 0.8423, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7026917934417725, + "rewards/margins": 4.607289791107178, + "rewards/rejected": -1.9045976400375366, + "step": 2873 + }, + { + "epoch": 0.72, + "grad_norm": 7.035367012023926, + "learning_rate": 5.324619321494963e-06, + "logits/chosen": -0.277157723903656, + "logits/rejected": -0.4179953634738922, + "logps/chosen": -70.28244018554688, + "logps/rejected": -75.96583557128906, + "loss": 0.8374, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.430034637451172, + "rewards/margins": 4.1926703453063965, + "rewards/rejected": -1.762635588645935, + "step": 2874 + }, + { + "epoch": 0.72, + "grad_norm": 4.268209457397461, + "learning_rate": 5.322005063760454e-06, + "logits/chosen": -0.38363468647003174, + "logits/rejected": -0.3875373303890228, + "logps/chosen": -43.99019241333008, + "logps/rejected": -88.64421844482422, + "loss": 0.7549, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.857170343399048, + "rewards/margins": 4.113522529602051, + "rewards/rejected": -1.256352186203003, + "step": 2875 + }, + { + "epoch": 0.72, + "grad_norm": 8.686541557312012, + "learning_rate": 5.319390717628607e-06, + "logits/chosen": -0.27579566836357117, + "logits/rejected": -0.3966784179210663, + "logps/chosen": -59.80488586425781, + "logps/rejected": -84.90469360351562, + "loss": 0.8616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7524797916412354, + "rewards/margins": 3.87217378616333, + "rewards/rejected": -1.1196941137313843, + "step": 2876 + }, + { + "epoch": 0.72, + "grad_norm": 4.7415571212768555, + "learning_rate": 5.316776283817116e-06, + "logits/chosen": -0.25705841183662415, + "logits/rejected": -0.3068249821662903, + "logps/chosen": -52.20174789428711, + "logps/rejected": -77.36539459228516, + "loss": 0.7301, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8061347007751465, + "rewards/margins": 4.330670356750488, + "rewards/rejected": -1.5245360136032104, + "step": 2877 + }, + { + "epoch": 0.72, + "grad_norm": 2.8524861335754395, + "learning_rate": 5.314161763043698e-06, + "logits/chosen": -0.2515701651573181, + "logits/rejected": -0.4244222044944763, + "logps/chosen": -60.344627380371094, + "logps/rejected": -87.364013671875, + "loss": 0.6801, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8146374225616455, + "rewards/margins": 5.808663845062256, + "rewards/rejected": -2.9940268993377686, + "step": 2878 + }, + { + "epoch": 0.72, + "grad_norm": 9.015199661254883, + "learning_rate": 5.311547156026099e-06, + "logits/chosen": -0.2798353433609009, + "logits/rejected": -0.31488147377967834, + "logps/chosen": -55.0454216003418, + "logps/rejected": -85.12276458740234, + "loss": 0.8691, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.042161464691162, + "rewards/margins": 3.2189230918884277, + "rewards/rejected": -0.17676135897636414, + "step": 2879 + }, + { + "epoch": 0.72, + "grad_norm": 6.122841835021973, + "learning_rate": 5.308932463482082e-06, + "logits/chosen": -0.27665776014328003, + "logits/rejected": -0.3504815101623535, + "logps/chosen": -51.45150375366211, + "logps/rejected": -86.80915832519531, + "loss": 0.8427, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7076094150543213, + "rewards/margins": 4.076397895812988, + "rewards/rejected": -1.368788719177246, + "step": 2880 + }, + { + "epoch": 0.72, + "grad_norm": 6.590558052062988, + "learning_rate": 5.306317686129437e-06, + "logits/chosen": -0.3583000600337982, + "logits/rejected": -0.4767257571220398, + "logps/chosen": -53.66687774658203, + "logps/rejected": -76.78887176513672, + "loss": 0.9274, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6758875846862793, + "rewards/margins": 3.631336212158203, + "rewards/rejected": -0.9554486870765686, + "step": 2881 + }, + { + "epoch": 0.72, + "grad_norm": 11.88197135925293, + "learning_rate": 5.303702824685979e-06, + "logits/chosen": -0.3017479479312897, + "logits/rejected": -0.489724338054657, + "logps/chosen": -62.846961975097656, + "logps/rejected": -62.9890022277832, + "loss": 0.7876, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8150644302368164, + "rewards/margins": 4.3971099853515625, + "rewards/rejected": -1.5820454359054565, + "step": 2882 + }, + { + "epoch": 0.72, + "grad_norm": 4.827563285827637, + "learning_rate": 5.301087879869541e-06, + "logits/chosen": -0.2983522415161133, + "logits/rejected": -0.46498504281044006, + "logps/chosen": -50.250450134277344, + "logps/rejected": -76.98235321044922, + "loss": 0.7096, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.164802312850952, + "rewards/margins": 5.105395317077637, + "rewards/rejected": -1.9405930042266846, + "step": 2883 + }, + { + "epoch": 0.72, + "grad_norm": 5.140795707702637, + "learning_rate": 5.298472852397983e-06, + "logits/chosen": -0.2583574056625366, + "logits/rejected": -0.37959593534469604, + "logps/chosen": -53.80388259887695, + "logps/rejected": -75.35989379882812, + "loss": 0.6729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6809964179992676, + "rewards/margins": 4.4879021644592285, + "rewards/rejected": -1.8069058656692505, + "step": 2884 + }, + { + "epoch": 0.72, + "grad_norm": 4.564888000488281, + "learning_rate": 5.295857742989187e-06, + "logits/chosen": -0.303562730550766, + "logits/rejected": -0.39701783657073975, + "logps/chosen": -75.67998504638672, + "logps/rejected": -83.81942749023438, + "loss": 0.8264, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.645273447036743, + "rewards/margins": 4.226018905639648, + "rewards/rejected": -1.5807451009750366, + "step": 2885 + }, + { + "epoch": 0.72, + "grad_norm": 4.7197771072387695, + "learning_rate": 5.293242552361056e-06, + "logits/chosen": -0.2282174527645111, + "logits/rejected": -0.34769919514656067, + "logps/chosen": -59.74958801269531, + "logps/rejected": -94.08626556396484, + "loss": 0.7238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9038798809051514, + "rewards/margins": 4.663952827453613, + "rewards/rejected": -1.7600734233856201, + "step": 2886 + }, + { + "epoch": 0.72, + "grad_norm": 2.9854843616485596, + "learning_rate": 5.290627281231516e-06, + "logits/chosen": -0.2869781255722046, + "logits/rejected": -0.4208456873893738, + "logps/chosen": -52.96248245239258, + "logps/rejected": -77.03958129882812, + "loss": 0.6846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.072709560394287, + "rewards/margins": 5.0570387840271, + "rewards/rejected": -1.9843292236328125, + "step": 2887 + }, + { + "epoch": 0.72, + "grad_norm": 2.783041000366211, + "learning_rate": 5.288011930318518e-06, + "logits/chosen": -0.35085368156433105, + "logits/rejected": -0.4246392250061035, + "logps/chosen": -50.535308837890625, + "logps/rejected": -83.95614624023438, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9595935344696045, + "rewards/margins": 5.22418737411499, + "rewards/rejected": -2.2645936012268066, + "step": 2888 + }, + { + "epoch": 0.72, + "grad_norm": 4.670507431030273, + "learning_rate": 5.28539650034003e-06, + "logits/chosen": -0.3964754045009613, + "logits/rejected": -0.4537167549133301, + "logps/chosen": -58.71852493286133, + "logps/rejected": -64.61233520507812, + "loss": 0.742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.885206460952759, + "rewards/margins": 4.023334980010986, + "rewards/rejected": -1.1381282806396484, + "step": 2889 + }, + { + "epoch": 0.72, + "grad_norm": 6.497567653656006, + "learning_rate": 5.282780992014043e-06, + "logits/chosen": -0.203996479511261, + "logits/rejected": -0.3149411380290985, + "logps/chosen": -49.15053176879883, + "logps/rejected": -70.89100646972656, + "loss": 0.7468, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8219785690307617, + "rewards/margins": 3.7791309356689453, + "rewards/rejected": -0.9571523666381836, + "step": 2890 + }, + { + "epoch": 0.72, + "grad_norm": 10.681902885437012, + "learning_rate": 5.280165406058573e-06, + "logits/chosen": -0.32142552733421326, + "logits/rejected": -0.40872058272361755, + "logps/chosen": -51.482513427734375, + "logps/rejected": -78.08727264404297, + "loss": 0.8311, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.968424081802368, + "rewards/margins": 4.888383865356445, + "rewards/rejected": -1.9199597835540771, + "step": 2891 + }, + { + "epoch": 0.72, + "grad_norm": 13.916271209716797, + "learning_rate": 5.277549743191653e-06, + "logits/chosen": -0.3448847532272339, + "logits/rejected": -0.47171780467033386, + "logps/chosen": -51.00798797607422, + "logps/rejected": -80.68124389648438, + "loss": 0.6865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.764665126800537, + "rewards/margins": 4.089561462402344, + "rewards/rejected": -1.324896216392517, + "step": 2892 + }, + { + "epoch": 0.72, + "grad_norm": 3.046363115310669, + "learning_rate": 5.274934004131338e-06, + "logits/chosen": -0.22360336780548096, + "logits/rejected": -0.3129144310951233, + "logps/chosen": -52.8192138671875, + "logps/rejected": -75.1193618774414, + "loss": 0.7027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0174407958984375, + "rewards/margins": 4.355450630187988, + "rewards/rejected": -1.3380097150802612, + "step": 2893 + }, + { + "epoch": 0.72, + "grad_norm": 5.25040340423584, + "learning_rate": 5.272318189595707e-06, + "logits/chosen": -0.2684840261936188, + "logits/rejected": -0.3916071653366089, + "logps/chosen": -43.01762771606445, + "logps/rejected": -58.527099609375, + "loss": 0.7534, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.863881826400757, + "rewards/margins": 4.107774257659912, + "rewards/rejected": -1.2438926696777344, + "step": 2894 + }, + { + "epoch": 0.72, + "grad_norm": 3.5964651107788086, + "learning_rate": 5.269702300302859e-06, + "logits/chosen": -0.30995413661003113, + "logits/rejected": -0.4794287085533142, + "logps/chosen": -51.104881286621094, + "logps/rejected": -67.50733947753906, + "loss": 0.7259, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9432320594787598, + "rewards/margins": 4.677792549133301, + "rewards/rejected": -1.7345603704452515, + "step": 2895 + }, + { + "epoch": 0.72, + "grad_norm": 3.379427909851074, + "learning_rate": 5.267086336970908e-06, + "logits/chosen": -0.30757206678390503, + "logits/rejected": -0.42731207609176636, + "logps/chosen": -66.59371948242188, + "logps/rejected": -66.7746353149414, + "loss": 0.7299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.661672592163086, + "rewards/margins": 4.227315902709961, + "rewards/rejected": -1.565643310546875, + "step": 2896 + }, + { + "epoch": 0.72, + "grad_norm": 3.921530246734619, + "learning_rate": 5.264470300317994e-06, + "logits/chosen": -0.23964965343475342, + "logits/rejected": -0.32598036527633667, + "logps/chosen": -55.589195251464844, + "logps/rejected": -78.65036010742188, + "loss": 0.7427, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0870156288146973, + "rewards/margins": 4.397225379943848, + "rewards/rejected": -1.3102097511291504, + "step": 2897 + }, + { + "epoch": 0.72, + "grad_norm": 3.9033215045928955, + "learning_rate": 5.261854191062276e-06, + "logits/chosen": -0.3030111491680145, + "logits/rejected": -0.40748441219329834, + "logps/chosen": -57.984832763671875, + "logps/rejected": -83.48810577392578, + "loss": 0.7246, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.800633430480957, + "rewards/margins": 4.106573581695557, + "rewards/rejected": -1.3059396743774414, + "step": 2898 + }, + { + "epoch": 0.73, + "grad_norm": 10.915602684020996, + "learning_rate": 5.259238009921932e-06, + "logits/chosen": -0.255338579416275, + "logits/rejected": -0.4167666733264923, + "logps/chosen": -62.160030364990234, + "logps/rejected": -73.46392822265625, + "loss": 0.8474, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.931380271911621, + "rewards/margins": 5.024898052215576, + "rewards/rejected": -2.093517780303955, + "step": 2899 + }, + { + "epoch": 0.73, + "grad_norm": 3.7616190910339355, + "learning_rate": 5.256621757615161e-06, + "logits/chosen": -0.3613582253456116, + "logits/rejected": -0.42553722858428955, + "logps/chosen": -47.3275146484375, + "logps/rejected": -73.57540893554688, + "loss": 0.7696, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8312435150146484, + "rewards/margins": 3.9500935077667236, + "rewards/rejected": -1.1188496351242065, + "step": 2900 + }, + { + "epoch": 0.73, + "grad_norm": 2.9707531929016113, + "learning_rate": 5.254005434860181e-06, + "logits/chosen": -0.23872873187065125, + "logits/rejected": -0.25411927700042725, + "logps/chosen": -59.13111114501953, + "logps/rejected": -85.4354019165039, + "loss": 0.7408, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.820923089981079, + "rewards/margins": 3.523263454437256, + "rewards/rejected": -0.7023400664329529, + "step": 2901 + }, + { + "epoch": 0.73, + "grad_norm": 4.391684532165527, + "learning_rate": 5.251389042375226e-06, + "logits/chosen": -0.2688668966293335, + "logits/rejected": -0.4155951738357544, + "logps/chosen": -60.1358642578125, + "logps/rejected": -84.64828491210938, + "loss": 0.7069, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.948756694793701, + "rewards/margins": 4.654595375061035, + "rewards/rejected": -1.705838680267334, + "step": 2902 + }, + { + "epoch": 0.73, + "grad_norm": 3.5113189220428467, + "learning_rate": 5.248772580878555e-06, + "logits/chosen": -0.2910064458847046, + "logits/rejected": -0.3005146384239197, + "logps/chosen": -45.720420837402344, + "logps/rejected": -79.15711975097656, + "loss": 0.6607, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8642618656158447, + "rewards/margins": 3.756577968597412, + "rewards/rejected": -0.8923158645629883, + "step": 2903 + }, + { + "epoch": 0.73, + "grad_norm": 5.8703293800354, + "learning_rate": 5.2461560510884436e-06, + "logits/chosen": -0.1505277305841446, + "logits/rejected": -0.22631628811359406, + "logps/chosen": -68.76533508300781, + "logps/rejected": -83.97197723388672, + "loss": 0.784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8386480808258057, + "rewards/margins": 4.24495267868042, + "rewards/rejected": -1.4063044786453247, + "step": 2904 + }, + { + "epoch": 0.73, + "grad_norm": 3.7823028564453125, + "learning_rate": 5.243539453723183e-06, + "logits/chosen": -0.33803072571754456, + "logits/rejected": -0.422804594039917, + "logps/chosen": -59.143585205078125, + "logps/rejected": -84.59140014648438, + "loss": 0.7959, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.206429958343506, + "rewards/margins": 3.987884044647217, + "rewards/rejected": -0.7814540863037109, + "step": 2905 + }, + { + "epoch": 0.73, + "grad_norm": 2.76009464263916, + "learning_rate": 5.240922789501089e-06, + "logits/chosen": -0.2630036473274231, + "logits/rejected": -0.4137500822544098, + "logps/chosen": -58.24089050292969, + "logps/rejected": -68.10639953613281, + "loss": 0.6284, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1444883346557617, + "rewards/margins": 4.839620590209961, + "rewards/rejected": -1.6951321363449097, + "step": 2906 + }, + { + "epoch": 0.73, + "grad_norm": 5.134281158447266, + "learning_rate": 5.238306059140489e-06, + "logits/chosen": -0.27622899413108826, + "logits/rejected": -0.463813841342926, + "logps/chosen": -58.978668212890625, + "logps/rejected": -59.05828857421875, + "loss": 0.7904, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8108139038085938, + "rewards/margins": 4.228947162628174, + "rewards/rejected": -1.4181333780288696, + "step": 2907 + }, + { + "epoch": 0.73, + "grad_norm": 3.3888673782348633, + "learning_rate": 5.235689263359734e-06, + "logits/chosen": -0.22667473554611206, + "logits/rejected": -0.37304484844207764, + "logps/chosen": -51.94270324707031, + "logps/rejected": -71.50171661376953, + "loss": 0.6736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1212987899780273, + "rewards/margins": 4.770318984985352, + "rewards/rejected": -1.6490205526351929, + "step": 2908 + }, + { + "epoch": 0.73, + "grad_norm": 5.386788845062256, + "learning_rate": 5.233072402877189e-06, + "logits/chosen": -0.3135760426521301, + "logits/rejected": -0.35000643134117126, + "logps/chosen": -51.016578674316406, + "logps/rejected": -84.8111572265625, + "loss": 0.7561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9904329776763916, + "rewards/margins": 4.340306758880615, + "rewards/rejected": -1.3498740196228027, + "step": 2909 + }, + { + "epoch": 0.73, + "grad_norm": 5.080260753631592, + "learning_rate": 5.230455478411243e-06, + "logits/chosen": -0.316375195980072, + "logits/rejected": -0.4285851716995239, + "logps/chosen": -53.7784423828125, + "logps/rejected": -71.53016662597656, + "loss": 0.7996, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.892840623855591, + "rewards/margins": 3.2136788368225098, + "rewards/rejected": -0.32083848118782043, + "step": 2910 + }, + { + "epoch": 0.73, + "grad_norm": 5.9536614418029785, + "learning_rate": 5.2278384906802935e-06, + "logits/chosen": -0.26130199432373047, + "logits/rejected": -0.4147193431854248, + "logps/chosen": -52.72380065917969, + "logps/rejected": -72.56208801269531, + "loss": 0.7314, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6933581829071045, + "rewards/margins": 4.844354629516602, + "rewards/rejected": -2.150996208190918, + "step": 2911 + }, + { + "epoch": 0.73, + "grad_norm": 8.004846572875977, + "learning_rate": 5.2252214404027636e-06, + "logits/chosen": -0.34236961603164673, + "logits/rejected": -0.4491702914237976, + "logps/chosen": -56.15776824951172, + "logps/rejected": -74.19839477539062, + "loss": 0.728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0406339168548584, + "rewards/margins": 4.447443008422852, + "rewards/rejected": -1.4068094491958618, + "step": 2912 + }, + { + "epoch": 0.73, + "grad_norm": 2.7817275524139404, + "learning_rate": 5.222604328297086e-06, + "logits/chosen": -0.3549501895904541, + "logits/rejected": -0.46796831488609314, + "logps/chosen": -51.139732360839844, + "logps/rejected": -76.643798828125, + "loss": 0.6668, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6863980293273926, + "rewards/margins": 4.687912940979004, + "rewards/rejected": -2.0015153884887695, + "step": 2913 + }, + { + "epoch": 0.73, + "grad_norm": 9.530006408691406, + "learning_rate": 5.2199871550817184e-06, + "logits/chosen": -0.32539135217666626, + "logits/rejected": -0.4154561758041382, + "logps/chosen": -50.05657958984375, + "logps/rejected": -72.12155151367188, + "loss": 0.8374, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8774852752685547, + "rewards/margins": 4.162259101867676, + "rewards/rejected": -1.284773349761963, + "step": 2914 + }, + { + "epoch": 0.73, + "grad_norm": 4.439727783203125, + "learning_rate": 5.217369921475129e-06, + "logits/chosen": -0.35164904594421387, + "logits/rejected": -0.45697933435440063, + "logps/chosen": -49.19853210449219, + "logps/rejected": -79.25360107421875, + "loss": 0.7011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0966978073120117, + "rewards/margins": 4.884563446044922, + "rewards/rejected": -1.787865161895752, + "step": 2915 + }, + { + "epoch": 0.73, + "grad_norm": 4.155093193054199, + "learning_rate": 5.214752628195807e-06, + "logits/chosen": -0.33697959780693054, + "logits/rejected": -0.4340222477912903, + "logps/chosen": -55.54656982421875, + "logps/rejected": -79.34424591064453, + "loss": 0.7728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.102530002593994, + "rewards/margins": 4.984873294830322, + "rewards/rejected": -1.8823432922363281, + "step": 2916 + }, + { + "epoch": 0.73, + "grad_norm": 6.557300567626953, + "learning_rate": 5.212135275962252e-06, + "logits/chosen": -0.32743462920188904, + "logits/rejected": -0.36597639322280884, + "logps/chosen": -49.04771423339844, + "logps/rejected": -76.67341613769531, + "loss": 0.9205, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.914644718170166, + "rewards/margins": 2.869324207305908, + "rewards/rejected": 0.045320749282836914, + "step": 2917 + }, + { + "epoch": 0.73, + "grad_norm": 7.697874069213867, + "learning_rate": 5.209517865492989e-06, + "logits/chosen": -0.27945467829704285, + "logits/rejected": -0.33463597297668457, + "logps/chosen": -49.72102737426758, + "logps/rejected": -85.56918334960938, + "loss": 0.7503, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.827054738998413, + "rewards/margins": 3.7771859169006348, + "rewards/rejected": -0.9501317143440247, + "step": 2918 + }, + { + "epoch": 0.73, + "grad_norm": 7.95044469833374, + "learning_rate": 5.206900397506549e-06, + "logits/chosen": -0.29325228929519653, + "logits/rejected": -0.35953056812286377, + "logps/chosen": -62.55583572387695, + "logps/rejected": -84.69705963134766, + "loss": 1.0038, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.835836410522461, + "rewards/margins": 3.805328369140625, + "rewards/rejected": -0.9694918990135193, + "step": 2919 + }, + { + "epoch": 0.73, + "grad_norm": 5.98389196395874, + "learning_rate": 5.2042828727214866e-06, + "logits/chosen": -0.25781816244125366, + "logits/rejected": -0.3672142028808594, + "logps/chosen": -68.82453918457031, + "logps/rejected": -97.38024139404297, + "loss": 0.9312, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.35868501663208, + "rewards/margins": 4.369026184082031, + "rewards/rejected": -2.010341167449951, + "step": 2920 + }, + { + "epoch": 0.73, + "grad_norm": 6.468847274780273, + "learning_rate": 5.201665291856367e-06, + "logits/chosen": -0.2672887444496155, + "logits/rejected": -0.2988058626651764, + "logps/chosen": -57.02789306640625, + "logps/rejected": -93.49857330322266, + "loss": 0.7874, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7791478633880615, + "rewards/margins": 3.491206169128418, + "rewards/rejected": -0.7120580673217773, + "step": 2921 + }, + { + "epoch": 0.73, + "grad_norm": 5.359180927276611, + "learning_rate": 5.199047655629774e-06, + "logits/chosen": -0.29793423414230347, + "logits/rejected": -0.4269193708896637, + "logps/chosen": -54.549774169921875, + "logps/rejected": -76.48729705810547, + "loss": 0.7242, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.967360734939575, + "rewards/margins": 4.5438947677612305, + "rewards/rejected": -1.5765334367752075, + "step": 2922 + }, + { + "epoch": 0.73, + "grad_norm": 7.365172863006592, + "learning_rate": 5.1964299647603025e-06, + "logits/chosen": -0.15987414121627808, + "logits/rejected": -0.21732427179813385, + "logps/chosen": -63.60091781616211, + "logps/rejected": -91.40656280517578, + "loss": 0.8586, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.597327470779419, + "rewards/margins": 3.4194681644439697, + "rewards/rejected": -0.8221408128738403, + "step": 2923 + }, + { + "epoch": 0.73, + "grad_norm": 4.390432834625244, + "learning_rate": 5.1938122199665685e-06, + "logits/chosen": -0.31315672397613525, + "logits/rejected": -0.36914771795272827, + "logps/chosen": -48.58375549316406, + "logps/rejected": -78.82901763916016, + "loss": 0.7115, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9401144981384277, + "rewards/margins": 4.373589038848877, + "rewards/rejected": -1.4334743022918701, + "step": 2924 + }, + { + "epoch": 0.73, + "grad_norm": 15.033819198608398, + "learning_rate": 5.191194421967198e-06, + "logits/chosen": -0.3271224796772003, + "logits/rejected": -0.4030264914035797, + "logps/chosen": -54.414794921875, + "logps/rejected": -72.25594329833984, + "loss": 0.8033, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7283267974853516, + "rewards/margins": 3.692439079284668, + "rewards/rejected": -0.964112401008606, + "step": 2925 + }, + { + "epoch": 0.73, + "grad_norm": 4.6686906814575195, + "learning_rate": 5.188576571480833e-06, + "logits/chosen": -0.28364458680152893, + "logits/rejected": -0.41566744446754456, + "logps/chosen": -66.55086517333984, + "logps/rejected": -87.73307037353516, + "loss": 0.7602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6295313835144043, + "rewards/margins": 4.298244476318359, + "rewards/rejected": -1.6687133312225342, + "step": 2926 + }, + { + "epoch": 0.73, + "grad_norm": 9.393442153930664, + "learning_rate": 5.185958669226129e-06, + "logits/chosen": -0.23611469566822052, + "logits/rejected": -0.3126585781574249, + "logps/chosen": -60.496028900146484, + "logps/rejected": -80.80120086669922, + "loss": 0.8101, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7773489952087402, + "rewards/margins": 4.237039566040039, + "rewards/rejected": -1.459690809249878, + "step": 2927 + }, + { + "epoch": 0.73, + "grad_norm": 4.121209621429443, + "learning_rate": 5.183340715921759e-06, + "logits/chosen": -0.2761422097682953, + "logits/rejected": -0.4507679045200348, + "logps/chosen": -70.36833953857422, + "logps/rejected": -67.8285903930664, + "loss": 0.8161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0194854736328125, + "rewards/margins": 4.709859848022461, + "rewards/rejected": -1.690374493598938, + "step": 2928 + }, + { + "epoch": 0.73, + "grad_norm": 14.133914947509766, + "learning_rate": 5.180722712286404e-06, + "logits/chosen": -0.2810303568840027, + "logits/rejected": -0.35129567980766296, + "logps/chosen": -62.530792236328125, + "logps/rejected": -77.97270965576172, + "loss": 0.9172, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.442685127258301, + "rewards/margins": 3.652963638305664, + "rewards/rejected": -1.2102783918380737, + "step": 2929 + }, + { + "epoch": 0.73, + "grad_norm": 7.652806758880615, + "learning_rate": 5.1781046590387675e-06, + "logits/chosen": -0.2864728569984436, + "logits/rejected": -0.40976205468177795, + "logps/chosen": -57.42046356201172, + "logps/rejected": -69.86787414550781, + "loss": 0.732, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7750086784362793, + "rewards/margins": 4.1095452308654785, + "rewards/rejected": -1.3345367908477783, + "step": 2930 + }, + { + "epoch": 0.73, + "grad_norm": 9.731801986694336, + "learning_rate": 5.175486556897557e-06, + "logits/chosen": -0.21505729854106903, + "logits/rejected": -0.32522132992744446, + "logps/chosen": -65.4896240234375, + "logps/rejected": -81.13825225830078, + "loss": 0.7967, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9327597618103027, + "rewards/margins": 4.45320987701416, + "rewards/rejected": -1.520450234413147, + "step": 2931 + }, + { + "epoch": 0.73, + "grad_norm": 18.23439598083496, + "learning_rate": 5.172868406581502e-06, + "logits/chosen": -0.34278813004493713, + "logits/rejected": -0.4077521562576294, + "logps/chosen": -49.18756866455078, + "logps/rejected": -89.87535858154297, + "loss": 0.8662, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9014453887939453, + "rewards/margins": 4.619087219238281, + "rewards/rejected": -1.717642068862915, + "step": 2932 + }, + { + "epoch": 0.73, + "grad_norm": 5.848930358886719, + "learning_rate": 5.170250208809337e-06, + "logits/chosen": -0.3658580482006073, + "logits/rejected": -0.40912458300590515, + "logps/chosen": -42.296600341796875, + "logps/rejected": -65.1319351196289, + "loss": 0.7682, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.66477108001709, + "rewards/margins": 3.3687496185302734, + "rewards/rejected": -0.7039783000946045, + "step": 2933 + }, + { + "epoch": 0.73, + "grad_norm": 5.180313587188721, + "learning_rate": 5.167631964299817e-06, + "logits/chosen": -0.2592776119709015, + "logits/rejected": -0.37724411487579346, + "logps/chosen": -60.52986145019531, + "logps/rejected": -75.06431579589844, + "loss": 0.787, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.744594097137451, + "rewards/margins": 3.667329788208008, + "rewards/rejected": -0.9227357506752014, + "step": 2934 + }, + { + "epoch": 0.73, + "grad_norm": 4.010045051574707, + "learning_rate": 5.165013673771705e-06, + "logits/chosen": -0.3597034215927124, + "logits/rejected": -0.411629855632782, + "logps/chosen": -52.57682418823242, + "logps/rejected": -80.44065856933594, + "loss": 0.8006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7534921169281006, + "rewards/margins": 4.337504863739014, + "rewards/rejected": -1.584012746810913, + "step": 2935 + }, + { + "epoch": 0.73, + "grad_norm": 8.074520111083984, + "learning_rate": 5.16239533794378e-06, + "logits/chosen": -0.3296111226081848, + "logits/rejected": -0.4120875895023346, + "logps/chosen": -53.34086227416992, + "logps/rejected": -80.50288391113281, + "loss": 0.7348, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.695540428161621, + "rewards/margins": 4.131801605224609, + "rewards/rejected": -1.4362603425979614, + "step": 2936 + }, + { + "epoch": 0.73, + "grad_norm": 8.32206916809082, + "learning_rate": 5.159776957534829e-06, + "logits/chosen": -0.26881733536720276, + "logits/rejected": -0.3540366291999817, + "logps/chosen": -46.16437530517578, + "logps/rejected": -72.92404174804688, + "loss": 0.7283, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0029096603393555, + "rewards/margins": 3.743697166442871, + "rewards/rejected": -0.7407875061035156, + "step": 2937 + }, + { + "epoch": 0.73, + "grad_norm": 7.138663291931152, + "learning_rate": 5.1571585332636566e-06, + "logits/chosen": -0.2933461368083954, + "logits/rejected": -0.4243531823158264, + "logps/chosen": -71.3326416015625, + "logps/rejected": -73.9839859008789, + "loss": 0.8024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2306580543518066, + "rewards/margins": 4.9257893562316895, + "rewards/rejected": -1.6951311826705933, + "step": 2938 + }, + { + "epoch": 0.74, + "grad_norm": 6.535924434661865, + "learning_rate": 5.1545400658490755e-06, + "logits/chosen": -0.25069016218185425, + "logits/rejected": -0.3324652314186096, + "logps/chosen": -51.85710525512695, + "logps/rejected": -70.64714050292969, + "loss": 0.8764, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.007307529449463, + "rewards/margins": 3.4186198711395264, + "rewards/rejected": -0.41131263971328735, + "step": 2939 + }, + { + "epoch": 0.74, + "grad_norm": 3.8600234985351562, + "learning_rate": 5.1519215560099116e-06, + "logits/chosen": -0.3326963186264038, + "logits/rejected": -0.4069267213344574, + "logps/chosen": -46.951324462890625, + "logps/rejected": -70.83428192138672, + "loss": 0.7276, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991175651550293, + "rewards/margins": 4.213071823120117, + "rewards/rejected": -1.2218958139419556, + "step": 2940 + }, + { + "epoch": 0.74, + "grad_norm": 12.451398849487305, + "learning_rate": 5.149303004465002e-06, + "logits/chosen": -0.35636162757873535, + "logits/rejected": -0.46363627910614014, + "logps/chosen": -67.47420501708984, + "logps/rejected": -78.14439392089844, + "loss": 0.9854, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.849896192550659, + "rewards/margins": 3.2079105377197266, + "rewards/rejected": -0.358014315366745, + "step": 2941 + }, + { + "epoch": 0.74, + "grad_norm": 6.571239471435547, + "learning_rate": 5.1466844119331945e-06, + "logits/chosen": -0.3609806299209595, + "logits/rejected": -0.47518786787986755, + "logps/chosen": -50.772037506103516, + "logps/rejected": -65.2954330444336, + "loss": 0.7644, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.875330686569214, + "rewards/margins": 4.135157108306885, + "rewards/rejected": -1.259826898574829, + "step": 2942 + }, + { + "epoch": 0.74, + "grad_norm": 3.7269835472106934, + "learning_rate": 5.1440657791333536e-06, + "logits/chosen": -0.17015290260314941, + "logits/rejected": -0.2993389666080475, + "logps/chosen": -56.220794677734375, + "logps/rejected": -81.54788208007812, + "loss": 0.6284, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7326722145080566, + "rewards/margins": 4.4038801193237305, + "rewards/rejected": -1.671208381652832, + "step": 2943 + }, + { + "epoch": 0.74, + "grad_norm": 4.8598103523254395, + "learning_rate": 5.141447106784347e-06, + "logits/chosen": -0.33408111333847046, + "logits/rejected": -0.4312123954296112, + "logps/chosen": -58.18785858154297, + "logps/rejected": -78.1939697265625, + "loss": 0.8919, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.850263833999634, + "rewards/margins": 3.874072551727295, + "rewards/rejected": -1.0238085985183716, + "step": 2944 + }, + { + "epoch": 0.74, + "grad_norm": 4.04213809967041, + "learning_rate": 5.1388283956050556e-06, + "logits/chosen": -0.2348935902118683, + "logits/rejected": -0.3647507429122925, + "logps/chosen": -66.88618469238281, + "logps/rejected": -77.28690338134766, + "loss": 0.8061, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.78060245513916, + "rewards/margins": 3.932116746902466, + "rewards/rejected": -1.1515146493911743, + "step": 2945 + }, + { + "epoch": 0.74, + "grad_norm": 3.0044617652893066, + "learning_rate": 5.136209646314375e-06, + "logits/chosen": -0.2704944908618927, + "logits/rejected": -0.4089536666870117, + "logps/chosen": -64.27574157714844, + "logps/rejected": -74.71054077148438, + "loss": 0.7405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9909520149230957, + "rewards/margins": 4.847046852111816, + "rewards/rejected": -1.8560951948165894, + "step": 2946 + }, + { + "epoch": 0.74, + "grad_norm": 4.639011383056641, + "learning_rate": 5.1335908596312075e-06, + "logits/chosen": -0.3210272192955017, + "logits/rejected": -0.4316015839576721, + "logps/chosen": -59.532630920410156, + "logps/rejected": -82.681396484375, + "loss": 0.8521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0517940521240234, + "rewards/margins": 5.4786224365234375, + "rewards/rejected": -2.426827907562256, + "step": 2947 + }, + { + "epoch": 0.74, + "grad_norm": 4.322412967681885, + "learning_rate": 5.130972036274466e-06, + "logits/chosen": -0.2528221607208252, + "logits/rejected": -0.3763106167316437, + "logps/chosen": -57.41674041748047, + "logps/rejected": -79.69278717041016, + "loss": 0.7998, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9017536640167236, + "rewards/margins": 4.378820419311523, + "rewards/rejected": -1.4770667552947998, + "step": 2948 + }, + { + "epoch": 0.74, + "grad_norm": 4.9999895095825195, + "learning_rate": 5.128353176963074e-06, + "logits/chosen": -0.33398059010505676, + "logits/rejected": -0.42310404777526855, + "logps/chosen": -55.99258804321289, + "logps/rejected": -70.73893737792969, + "loss": 0.8742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7569801807403564, + "rewards/margins": 3.832672119140625, + "rewards/rejected": -1.0756916999816895, + "step": 2949 + }, + { + "epoch": 0.74, + "grad_norm": 5.1316704750061035, + "learning_rate": 5.125734282415967e-06, + "logits/chosen": -0.285688579082489, + "logits/rejected": -0.3206040859222412, + "logps/chosen": -49.33244323730469, + "logps/rejected": -74.95526885986328, + "loss": 0.7328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1493048667907715, + "rewards/margins": 4.075868606567383, + "rewards/rejected": -0.92656409740448, + "step": 2950 + }, + { + "epoch": 0.74, + "grad_norm": 3.365851879119873, + "learning_rate": 5.123115353352086e-06, + "logits/chosen": -0.26218852400779724, + "logits/rejected": -0.4112749695777893, + "logps/chosen": -63.33475875854492, + "logps/rejected": -82.22549438476562, + "loss": 0.6605, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9260237216949463, + "rewards/margins": 5.24836540222168, + "rewards/rejected": -2.3223419189453125, + "step": 2951 + }, + { + "epoch": 0.74, + "grad_norm": 3.140488624572754, + "learning_rate": 5.120496390490382e-06, + "logits/chosen": -0.30424895882606506, + "logits/rejected": -0.49260738492012024, + "logps/chosen": -62.62126541137695, + "logps/rejected": -76.63034057617188, + "loss": 0.6776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114319086074829, + "rewards/margins": 5.261959075927734, + "rewards/rejected": -2.147639513015747, + "step": 2952 + }, + { + "epoch": 0.74, + "grad_norm": 4.216053485870361, + "learning_rate": 5.117877394549821e-06, + "logits/chosen": -0.23399707674980164, + "logits/rejected": -0.3307426869869232, + "logps/chosen": -42.49800109863281, + "logps/rejected": -74.72356414794922, + "loss": 0.6975, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8660144805908203, + "rewards/margins": 3.551253318786621, + "rewards/rejected": -0.6852390766143799, + "step": 2953 + }, + { + "epoch": 0.74, + "grad_norm": 4.028669357299805, + "learning_rate": 5.115258366249369e-06, + "logits/chosen": -0.3626914620399475, + "logits/rejected": -0.49710386991500854, + "logps/chosen": -58.0020637512207, + "logps/rejected": -67.86618041992188, + "loss": 0.7712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.042618989944458, + "rewards/margins": 4.379493236541748, + "rewards/rejected": -1.336874008178711, + "step": 2954 + }, + { + "epoch": 0.74, + "grad_norm": 3.7933220863342285, + "learning_rate": 5.11263930630801e-06, + "logits/chosen": -0.3117920458316803, + "logits/rejected": -0.5115566849708557, + "logps/chosen": -46.526947021484375, + "logps/rejected": -68.92620086669922, + "loss": 0.6349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0344290733337402, + "rewards/margins": 5.175626754760742, + "rewards/rejected": -2.14119815826416, + "step": 2955 + }, + { + "epoch": 0.74, + "grad_norm": 8.664752006530762, + "learning_rate": 5.110020215444731e-06, + "logits/chosen": -0.22556494176387787, + "logits/rejected": -0.36036497354507446, + "logps/chosen": -61.451847076416016, + "logps/rejected": -84.0935287475586, + "loss": 0.8327, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.169517755508423, + "rewards/margins": 4.471704006195068, + "rewards/rejected": -1.3021866083145142, + "step": 2956 + }, + { + "epoch": 0.74, + "grad_norm": 6.284876346588135, + "learning_rate": 5.107401094378529e-06, + "logits/chosen": -0.3247361481189728, + "logits/rejected": -0.4256904423236847, + "logps/chosen": -58.61643600463867, + "logps/rejected": -74.8463363647461, + "loss": 0.8509, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9107463359832764, + "rewards/margins": 4.013683319091797, + "rewards/rejected": -1.102936863899231, + "step": 2957 + }, + { + "epoch": 0.74, + "grad_norm": 7.219917297363281, + "learning_rate": 5.104781943828408e-06, + "logits/chosen": -0.3993567228317261, + "logits/rejected": -0.38693028688430786, + "logps/chosen": -51.81650924682617, + "logps/rejected": -92.84725952148438, + "loss": 0.8262, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8387203216552734, + "rewards/margins": 3.487935781478882, + "rewards/rejected": -0.6492155194282532, + "step": 2958 + }, + { + "epoch": 0.74, + "grad_norm": 5.888704776763916, + "learning_rate": 5.102162764513383e-06, + "logits/chosen": -0.2635709345340729, + "logits/rejected": -0.3734865188598633, + "logps/chosen": -75.38015747070312, + "logps/rejected": -77.76826477050781, + "loss": 0.8615, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0319936275482178, + "rewards/margins": 4.682644844055176, + "rewards/rejected": -1.6506513357162476, + "step": 2959 + }, + { + "epoch": 0.74, + "grad_norm": 3.62021803855896, + "learning_rate": 5.099543557152474e-06, + "logits/chosen": -0.26109403371810913, + "logits/rejected": -0.314894437789917, + "logps/chosen": -62.64452362060547, + "logps/rejected": -92.73590087890625, + "loss": 0.7474, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.757467031478882, + "rewards/margins": 4.600407600402832, + "rewards/rejected": -1.8429405689239502, + "step": 2960 + }, + { + "epoch": 0.74, + "grad_norm": 9.927331924438477, + "learning_rate": 5.096924322464712e-06, + "logits/chosen": -0.37401339411735535, + "logits/rejected": -0.44306033849716187, + "logps/chosen": -48.612632751464844, + "logps/rejected": -80.82659912109375, + "loss": 0.8181, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.927459955215454, + "rewards/margins": 3.599506139755249, + "rewards/rejected": -0.6720463037490845, + "step": 2961 + }, + { + "epoch": 0.74, + "grad_norm": 3.852257490158081, + "learning_rate": 5.0943050611691314e-06, + "logits/chosen": -0.25656914710998535, + "logits/rejected": -0.44032031297683716, + "logps/chosen": -55.30506134033203, + "logps/rejected": -68.01628875732422, + "loss": 0.6081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9358034133911133, + "rewards/margins": 4.6060471534729, + "rewards/rejected": -1.6702436208724976, + "step": 2962 + }, + { + "epoch": 0.74, + "grad_norm": 6.344740867614746, + "learning_rate": 5.091685773984777e-06, + "logits/chosen": -0.23094584047794342, + "logits/rejected": -0.3326396346092224, + "logps/chosen": -50.19969177246094, + "logps/rejected": -83.09176635742188, + "loss": 0.6812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1856701374053955, + "rewards/margins": 5.264514923095703, + "rewards/rejected": -2.0788450241088867, + "step": 2963 + }, + { + "epoch": 0.74, + "grad_norm": 3.5270042419433594, + "learning_rate": 5.089066461630698e-06, + "logits/chosen": -0.314117968082428, + "logits/rejected": -0.4783551096916199, + "logps/chosen": -63.341583251953125, + "logps/rejected": -85.93084716796875, + "loss": 0.7189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.144207715988159, + "rewards/margins": 5.7874274253845215, + "rewards/rejected": -2.643219470977783, + "step": 2964 + }, + { + "epoch": 0.74, + "grad_norm": 5.068674087524414, + "learning_rate": 5.086447124825954e-06, + "logits/chosen": -0.29535675048828125, + "logits/rejected": -0.4107073247432709, + "logps/chosen": -61.18754577636719, + "logps/rejected": -80.15103912353516, + "loss": 0.7348, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2131595611572266, + "rewards/margins": 4.386255741119385, + "rewards/rejected": -1.1730960607528687, + "step": 2965 + }, + { + "epoch": 0.74, + "grad_norm": 4.496212482452393, + "learning_rate": 5.083827764289608e-06, + "logits/chosen": -0.2604827284812927, + "logits/rejected": -0.408539354801178, + "logps/chosen": -59.57378387451172, + "logps/rejected": -72.6666488647461, + "loss": 0.7368, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.020714282989502, + "rewards/margins": 4.917097091674805, + "rewards/rejected": -1.8963826894760132, + "step": 2966 + }, + { + "epoch": 0.74, + "grad_norm": 5.1537885665893555, + "learning_rate": 5.081208380740735e-06, + "logits/chosen": -0.3169940114021301, + "logits/rejected": -0.41115236282348633, + "logps/chosen": -73.18207550048828, + "logps/rejected": -87.24983215332031, + "loss": 0.8195, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.008317470550537, + "rewards/margins": 4.9844841957092285, + "rewards/rejected": -1.9761662483215332, + "step": 2967 + }, + { + "epoch": 0.74, + "grad_norm": 7.169623851776123, + "learning_rate": 5.0785889748984055e-06, + "logits/chosen": -0.27515095472335815, + "logits/rejected": -0.3627171516418457, + "logps/chosen": -56.054176330566406, + "logps/rejected": -88.96066284179688, + "loss": 0.7734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4168570041656494, + "rewards/margins": 3.723666191101074, + "rewards/rejected": -1.3068091869354248, + "step": 2968 + }, + { + "epoch": 0.74, + "grad_norm": 6.802195072174072, + "learning_rate": 5.075969547481708e-06, + "logits/chosen": -0.26790493726730347, + "logits/rejected": -0.393189013004303, + "logps/chosen": -64.97826385498047, + "logps/rejected": -75.46493530273438, + "loss": 0.7672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.062227964401245, + "rewards/margins": 5.115974426269531, + "rewards/rejected": -2.053746461868286, + "step": 2969 + }, + { + "epoch": 0.74, + "grad_norm": 4.041318893432617, + "learning_rate": 5.0733500992097295e-06, + "logits/chosen": -0.28670215606689453, + "logits/rejected": -0.3736035227775574, + "logps/chosen": -53.52467346191406, + "logps/rejected": -73.90916442871094, + "loss": 0.7627, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.804277181625366, + "rewards/margins": 3.933065414428711, + "rewards/rejected": -1.1287884712219238, + "step": 2970 + }, + { + "epoch": 0.74, + "grad_norm": 6.2491655349731445, + "learning_rate": 5.070730630801568e-06, + "logits/chosen": -0.2547723352909088, + "logits/rejected": -0.4064614772796631, + "logps/chosen": -60.17451477050781, + "logps/rejected": -72.26681518554688, + "loss": 0.7429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.086679458618164, + "rewards/margins": 5.339383125305176, + "rewards/rejected": -2.252703905105591, + "step": 2971 + }, + { + "epoch": 0.74, + "grad_norm": 6.901508808135986, + "learning_rate": 5.06811114297632e-06, + "logits/chosen": -0.28834718465805054, + "logits/rejected": -0.38062846660614014, + "logps/chosen": -53.31899642944336, + "logps/rejected": -80.20184326171875, + "loss": 0.793, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7299718856811523, + "rewards/margins": 3.4283013343811035, + "rewards/rejected": -0.6983298659324646, + "step": 2972 + }, + { + "epoch": 0.74, + "grad_norm": 3.527250289916992, + "learning_rate": 5.065491636453095e-06, + "logits/chosen": -0.288032203912735, + "logits/rejected": -0.3546670079231262, + "logps/chosen": -56.633323669433594, + "logps/rejected": -87.05691528320312, + "loss": 0.7323, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9742565155029297, + "rewards/margins": 5.007637977600098, + "rewards/rejected": -2.033381938934326, + "step": 2973 + }, + { + "epoch": 0.74, + "grad_norm": 4.962469100952148, + "learning_rate": 5.062872111951002e-06, + "logits/chosen": -0.2608291804790497, + "logits/rejected": -0.32537925243377686, + "logps/chosen": -67.68292236328125, + "logps/rejected": -98.338134765625, + "loss": 0.798, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.80293607711792, + "rewards/margins": 4.250290393829346, + "rewards/rejected": -1.4473543167114258, + "step": 2974 + }, + { + "epoch": 0.74, + "grad_norm": 4.5134968757629395, + "learning_rate": 5.060252570189157e-06, + "logits/chosen": -0.4013795256614685, + "logits/rejected": -0.5004841685295105, + "logps/chosen": -46.900604248046875, + "logps/rejected": -74.38151550292969, + "loss": 0.7244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8710403442382812, + "rewards/margins": 4.270406723022461, + "rewards/rejected": -1.399366855621338, + "step": 2975 + }, + { + "epoch": 0.74, + "grad_norm": 8.33946418762207, + "learning_rate": 5.057633011886683e-06, + "logits/chosen": -0.27731484174728394, + "logits/rejected": -0.2980390191078186, + "logps/chosen": -55.83200454711914, + "logps/rejected": -83.56331634521484, + "loss": 0.8865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.910094976425171, + "rewards/margins": 3.5721492767333984, + "rewards/rejected": -0.6620539426803589, + "step": 2976 + }, + { + "epoch": 0.74, + "grad_norm": 3.6807498931884766, + "learning_rate": 5.055013437762703e-06, + "logits/chosen": -0.26420462131500244, + "logits/rejected": -0.29221147298812866, + "logps/chosen": -49.82624816894531, + "logps/rejected": -87.04524993896484, + "loss": 0.7399, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.103379249572754, + "rewards/margins": 3.919475793838501, + "rewards/rejected": -0.8160967826843262, + "step": 2977 + }, + { + "epoch": 0.74, + "grad_norm": 12.308053016662598, + "learning_rate": 5.052393848536345e-06, + "logits/chosen": -0.23718789219856262, + "logits/rejected": -0.43104881048202515, + "logps/chosen": -63.44776916503906, + "logps/rejected": -81.2222900390625, + "loss": 0.8652, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6864402294158936, + "rewards/margins": 5.022850036621094, + "rewards/rejected": -2.3364098072052, + "step": 2978 + }, + { + "epoch": 0.75, + "grad_norm": 3.676379919052124, + "learning_rate": 5.049774244926747e-06, + "logits/chosen": -0.4140721261501312, + "logits/rejected": -0.46978771686553955, + "logps/chosen": -54.39003372192383, + "logps/rejected": -92.78829956054688, + "loss": 0.6745, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3041396141052246, + "rewards/margins": 5.918825149536133, + "rewards/rejected": -2.6146860122680664, + "step": 2979 + }, + { + "epoch": 0.75, + "grad_norm": 3.8206541538238525, + "learning_rate": 5.047154627653045e-06, + "logits/chosen": -0.39745521545410156, + "logits/rejected": -0.4455854296684265, + "logps/chosen": -55.30635452270508, + "logps/rejected": -75.86820220947266, + "loss": 0.8675, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.929236888885498, + "rewards/margins": 3.902036428451538, + "rewards/rejected": -0.9727996587753296, + "step": 2980 + }, + { + "epoch": 0.75, + "grad_norm": 12.605094909667969, + "learning_rate": 5.044534997434381e-06, + "logits/chosen": -0.3645178973674774, + "logits/rejected": -0.4169554114341736, + "logps/chosen": -48.129032135009766, + "logps/rejected": -89.93587493896484, + "loss": 0.8358, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5609233379364014, + "rewards/margins": 3.274953842163086, + "rewards/rejected": -0.7140308618545532, + "step": 2981 + }, + { + "epoch": 0.75, + "grad_norm": 5.631014347076416, + "learning_rate": 5.041915354989897e-06, + "logits/chosen": -0.3538818955421448, + "logits/rejected": -0.4443129301071167, + "logps/chosen": -50.37594985961914, + "logps/rejected": -95.88802337646484, + "loss": 0.6863, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.651215076446533, + "rewards/margins": 4.982149600982666, + "rewards/rejected": -2.330934524536133, + "step": 2982 + }, + { + "epoch": 0.75, + "grad_norm": 7.893601894378662, + "learning_rate": 5.039295701038745e-06, + "logits/chosen": -0.23852664232254028, + "logits/rejected": -0.2963041663169861, + "logps/chosen": -65.16282653808594, + "logps/rejected": -93.98949432373047, + "loss": 0.76, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1334574222564697, + "rewards/margins": 4.517352104187012, + "rewards/rejected": -1.3838943243026733, + "step": 2983 + }, + { + "epoch": 0.75, + "grad_norm": 4.7895283699035645, + "learning_rate": 5.036676036300074e-06, + "logits/chosen": -0.3264959454536438, + "logits/rejected": -0.39617177844047546, + "logps/chosen": -48.41461944580078, + "logps/rejected": -81.44449615478516, + "loss": 0.6798, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9429712295532227, + "rewards/margins": 4.205837249755859, + "rewards/rejected": -1.2628660202026367, + "step": 2984 + }, + { + "epoch": 0.75, + "grad_norm": 5.048844814300537, + "learning_rate": 5.03405636149304e-06, + "logits/chosen": -0.3198080062866211, + "logits/rejected": -0.44225308299064636, + "logps/chosen": -55.32288360595703, + "logps/rejected": -76.18962860107422, + "loss": 0.8142, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.159564971923828, + "rewards/margins": 4.387272834777832, + "rewards/rejected": -1.2277084589004517, + "step": 2985 + }, + { + "epoch": 0.75, + "grad_norm": 11.110570907592773, + "learning_rate": 5.0314366773368e-06, + "logits/chosen": -0.3299991190433502, + "logits/rejected": -0.3720197379589081, + "logps/chosen": -56.07337188720703, + "logps/rejected": -84.27177429199219, + "loss": 0.9008, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9442250728607178, + "rewards/margins": 4.1374382972717285, + "rewards/rejected": -1.1932132244110107, + "step": 2986 + }, + { + "epoch": 0.75, + "grad_norm": 3.567546844482422, + "learning_rate": 5.028816984550515e-06, + "logits/chosen": -0.36019372940063477, + "logits/rejected": -0.3884311616420746, + "logps/chosen": -50.461971282958984, + "logps/rejected": -85.47592163085938, + "loss": 0.7318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2081804275512695, + "rewards/margins": 4.957740783691406, + "rewards/rejected": -1.7495601177215576, + "step": 2987 + }, + { + "epoch": 0.75, + "grad_norm": 6.024531841278076, + "learning_rate": 5.026197283853345e-06, + "logits/chosen": -0.30112671852111816, + "logits/rejected": -0.3312672972679138, + "logps/chosen": -68.14301300048828, + "logps/rejected": -76.34302520751953, + "loss": 0.954, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5635673999786377, + "rewards/margins": 3.2930891513824463, + "rewards/rejected": -0.729521632194519, + "step": 2988 + }, + { + "epoch": 0.75, + "grad_norm": 4.038647174835205, + "learning_rate": 5.023577575964455e-06, + "logits/chosen": -0.27538418769836426, + "logits/rejected": -0.40638622641563416, + "logps/chosen": -54.18159484863281, + "logps/rejected": -86.67167663574219, + "loss": 0.8031, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8140931129455566, + "rewards/margins": 5.31114387512207, + "rewards/rejected": -2.4970510005950928, + "step": 2989 + }, + { + "epoch": 0.75, + "grad_norm": 25.396121978759766, + "learning_rate": 5.020957861603011e-06, + "logits/chosen": -0.30565333366394043, + "logits/rejected": -0.3563494384288788, + "logps/chosen": -50.40304946899414, + "logps/rejected": -73.43391418457031, + "loss": 0.8996, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.702777862548828, + "rewards/margins": 3.4730117321014404, + "rewards/rejected": -0.7702338695526123, + "step": 2990 + }, + { + "epoch": 0.75, + "grad_norm": 9.489278793334961, + "learning_rate": 5.018338141488182e-06, + "logits/chosen": -0.2172682285308838, + "logits/rejected": -0.29427576065063477, + "logps/chosen": -57.18115997314453, + "logps/rejected": -79.44259643554688, + "loss": 0.8097, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.925875186920166, + "rewards/margins": 3.8819990158081055, + "rewards/rejected": -0.956123948097229, + "step": 2991 + }, + { + "epoch": 0.75, + "grad_norm": 5.651737213134766, + "learning_rate": 5.015718416339139e-06, + "logits/chosen": -0.3460127115249634, + "logits/rejected": -0.4347396492958069, + "logps/chosen": -52.88710021972656, + "logps/rejected": -80.27510833740234, + "loss": 0.858, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.788351058959961, + "rewards/margins": 4.1977081298828125, + "rewards/rejected": -1.4093574285507202, + "step": 2992 + }, + { + "epoch": 0.75, + "grad_norm": 4.622745990753174, + "learning_rate": 5.013098686875051e-06, + "logits/chosen": -0.24597978591918945, + "logits/rejected": -0.3535315692424774, + "logps/chosen": -44.99045944213867, + "logps/rejected": -59.036617279052734, + "loss": 0.6977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.982036828994751, + "rewards/margins": 4.013937473297119, + "rewards/rejected": -1.0319007635116577, + "step": 2993 + }, + { + "epoch": 0.75, + "grad_norm": 3.2134697437286377, + "learning_rate": 5.010478953815089e-06, + "logits/chosen": -0.3434136211872101, + "logits/rejected": -0.45212700963020325, + "logps/chosen": -51.981075286865234, + "logps/rejected": -74.08501434326172, + "loss": 0.7209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.793365001678467, + "rewards/margins": 4.676076412200928, + "rewards/rejected": -1.8827111721038818, + "step": 2994 + }, + { + "epoch": 0.75, + "grad_norm": 3.6709885597229004, + "learning_rate": 5.0078592178784305e-06, + "logits/chosen": -0.3704683780670166, + "logits/rejected": -0.3963223993778229, + "logps/chosen": -50.6022834777832, + "logps/rejected": -90.21556091308594, + "loss": 0.7543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8704521656036377, + "rewards/margins": 3.70631742477417, + "rewards/rejected": -0.8358653783798218, + "step": 2995 + }, + { + "epoch": 0.75, + "grad_norm": 6.470154762268066, + "learning_rate": 5.005239479784246e-06, + "logits/chosen": -0.3171199560165405, + "logits/rejected": -0.4472517967224121, + "logps/chosen": -62.42390441894531, + "logps/rejected": -67.968017578125, + "loss": 0.7965, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9081079959869385, + "rewards/margins": 4.372992515563965, + "rewards/rejected": -1.4648844003677368, + "step": 2996 + }, + { + "epoch": 0.75, + "grad_norm": 5.165666103363037, + "learning_rate": 5.0026197402517105e-06, + "logits/chosen": -0.2801518440246582, + "logits/rejected": -0.41433146595954895, + "logps/chosen": -57.385009765625, + "logps/rejected": -67.62122344970703, + "loss": 0.7422, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9514424800872803, + "rewards/margins": 4.3741679191589355, + "rewards/rejected": -1.4227254390716553, + "step": 2997 + }, + { + "epoch": 0.75, + "grad_norm": 4.311984062194824, + "learning_rate": 5e-06, + "logits/chosen": -0.2994457185268402, + "logits/rejected": -0.3410252630710602, + "logps/chosen": -52.853824615478516, + "logps/rejected": -84.86810302734375, + "loss": 0.8719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.921678304672241, + "rewards/margins": 4.078214168548584, + "rewards/rejected": -1.1565361022949219, + "step": 2998 + }, + { + "epoch": 0.75, + "grad_norm": 19.417131423950195, + "learning_rate": 4.997380259748291e-06, + "logits/chosen": -0.25666263699531555, + "logits/rejected": -0.37611156702041626, + "logps/chosen": -75.52969360351562, + "logps/rejected": -83.12753295898438, + "loss": 0.9398, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7506535053253174, + "rewards/margins": 4.058938026428223, + "rewards/rejected": -1.3082845211029053, + "step": 2999 + }, + { + "epoch": 0.75, + "grad_norm": 9.710296630859375, + "learning_rate": 4.994760520215755e-06, + "logits/chosen": -0.3036300837993622, + "logits/rejected": -0.3906691074371338, + "logps/chosen": -45.0434684753418, + "logps/rejected": -87.82166290283203, + "loss": 0.8026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.737203598022461, + "rewards/margins": 4.988407611846924, + "rewards/rejected": -2.251204490661621, + "step": 3000 + }, + { + "epoch": 0.75, + "grad_norm": 3.519116163253784, + "learning_rate": 4.992140782121571e-06, + "logits/chosen": -0.2422206997871399, + "logits/rejected": -0.392616331577301, + "logps/chosen": -59.07563781738281, + "logps/rejected": -80.20679473876953, + "loss": 0.7878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.703364133834839, + "rewards/margins": 5.305666923522949, + "rewards/rejected": -2.6023025512695312, + "step": 3001 + }, + { + "epoch": 0.75, + "grad_norm": 4.9356231689453125, + "learning_rate": 4.989521046184911e-06, + "logits/chosen": -0.2629053592681885, + "logits/rejected": -0.3688313364982605, + "logps/chosen": -61.05861282348633, + "logps/rejected": -88.27207946777344, + "loss": 0.8922, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5810186862945557, + "rewards/margins": 4.6235127449035645, + "rewards/rejected": -2.0424938201904297, + "step": 3002 + }, + { + "epoch": 0.75, + "grad_norm": 4.249786376953125, + "learning_rate": 4.9869013131249505e-06, + "logits/chosen": -0.2995379865169525, + "logits/rejected": -0.46498048305511475, + "logps/chosen": -57.05510330200195, + "logps/rejected": -82.32244873046875, + "loss": 0.8739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7684006690979004, + "rewards/margins": 4.668545722961426, + "rewards/rejected": -1.9001449346542358, + "step": 3003 + }, + { + "epoch": 0.75, + "grad_norm": 5.794661045074463, + "learning_rate": 4.984281583660864e-06, + "logits/chosen": -0.3777688145637512, + "logits/rejected": -0.46558886766433716, + "logps/chosen": -60.31829833984375, + "logps/rejected": -61.2696533203125, + "loss": 0.7976, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9429214000701904, + "rewards/margins": 3.7896499633789062, + "rewards/rejected": -0.8467288017272949, + "step": 3004 + }, + { + "epoch": 0.75, + "grad_norm": 3.422565221786499, + "learning_rate": 4.981661858511818e-06, + "logits/chosen": -0.31014081835746765, + "logits/rejected": -0.3739880621433258, + "logps/chosen": -55.94728088378906, + "logps/rejected": -90.22254180908203, + "loss": 0.7345, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.130552053451538, + "rewards/margins": 4.414545059204102, + "rewards/rejected": -1.2839930057525635, + "step": 3005 + }, + { + "epoch": 0.75, + "grad_norm": 4.460886001586914, + "learning_rate": 4.97904213839699e-06, + "logits/chosen": -0.22840113937854767, + "logits/rejected": -0.3573639392852783, + "logps/chosen": -58.89793395996094, + "logps/rejected": -79.15730285644531, + "loss": 0.8173, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.827208995819092, + "rewards/margins": 4.639745712280273, + "rewards/rejected": -1.8125368356704712, + "step": 3006 + }, + { + "epoch": 0.75, + "grad_norm": 5.403207302093506, + "learning_rate": 4.976422424035547e-06, + "logits/chosen": -0.29125189781188965, + "logits/rejected": -0.3616922199726105, + "logps/chosen": -48.16242980957031, + "logps/rejected": -79.9442367553711, + "loss": 0.6614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.685011625289917, + "rewards/margins": 3.382061719894409, + "rewards/rejected": -0.6970499157905579, + "step": 3007 + }, + { + "epoch": 0.75, + "grad_norm": 8.381304740905762, + "learning_rate": 4.973802716146658e-06, + "logits/chosen": -0.2964540719985962, + "logits/rejected": -0.3326111137866974, + "logps/chosen": -51.60662078857422, + "logps/rejected": -77.59556579589844, + "loss": 0.8617, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8889427185058594, + "rewards/margins": 3.6363368034362793, + "rewards/rejected": -0.7473944425582886, + "step": 3008 + }, + { + "epoch": 0.75, + "grad_norm": 5.6808037757873535, + "learning_rate": 4.971183015449487e-06, + "logits/chosen": -0.18497195839881897, + "logits/rejected": -0.3317018747329712, + "logps/chosen": -65.66349029541016, + "logps/rejected": -73.55870819091797, + "loss": 0.764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.869290590286255, + "rewards/margins": 4.367864608764648, + "rewards/rejected": -1.4985740184783936, + "step": 3009 + }, + { + "epoch": 0.75, + "grad_norm": 2.9818360805511475, + "learning_rate": 4.9685633226632004e-06, + "logits/chosen": -0.30845245718955994, + "logits/rejected": -0.4375746250152588, + "logps/chosen": -57.562408447265625, + "logps/rejected": -82.96178436279297, + "loss": 0.6846, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3038430213928223, + "rewards/margins": 4.963503837585449, + "rewards/rejected": -1.6596603393554688, + "step": 3010 + }, + { + "epoch": 0.75, + "grad_norm": 3.6575236320495605, + "learning_rate": 4.9659436385069605e-06, + "logits/chosen": -0.29406964778900146, + "logits/rejected": -0.39081230759620667, + "logps/chosen": -46.62651443481445, + "logps/rejected": -79.20695495605469, + "loss": 0.7543, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.111405849456787, + "rewards/margins": 4.078797340393066, + "rewards/rejected": -0.9673919677734375, + "step": 3011 + }, + { + "epoch": 0.75, + "grad_norm": 5.501311302185059, + "learning_rate": 4.963323963699926e-06, + "logits/chosen": -0.3530530333518982, + "logits/rejected": -0.46598291397094727, + "logps/chosen": -56.728843688964844, + "logps/rejected": -85.27090454101562, + "loss": 0.7213, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9565377235412598, + "rewards/margins": 5.094977855682373, + "rewards/rejected": -2.1384401321411133, + "step": 3012 + }, + { + "epoch": 0.75, + "grad_norm": 4.622067928314209, + "learning_rate": 4.960704298961257e-06, + "logits/chosen": -0.30301713943481445, + "logits/rejected": -0.41439202427864075, + "logps/chosen": -60.69865417480469, + "logps/rejected": -86.2589111328125, + "loss": 0.7624, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.607546806335449, + "rewards/margins": 4.478132724761963, + "rewards/rejected": -1.870585560798645, + "step": 3013 + }, + { + "epoch": 0.75, + "grad_norm": 4.648158073425293, + "learning_rate": 4.958084645010105e-06, + "logits/chosen": -0.3093584179878235, + "logits/rejected": -0.40607890486717224, + "logps/chosen": -59.24372100830078, + "logps/rejected": -83.81013488769531, + "loss": 0.6967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.761017322540283, + "rewards/margins": 4.158686637878418, + "rewards/rejected": -1.3976689577102661, + "step": 3014 + }, + { + "epoch": 0.75, + "grad_norm": 3.6903653144836426, + "learning_rate": 4.955465002565621e-06, + "logits/chosen": -0.2703171968460083, + "logits/rejected": -0.4096642732620239, + "logps/chosen": -63.33079528808594, + "logps/rejected": -84.20736694335938, + "loss": 0.7226, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.969630718231201, + "rewards/margins": 4.6913323402404785, + "rewards/rejected": -1.7217016220092773, + "step": 3015 + }, + { + "epoch": 0.75, + "grad_norm": 10.313974380493164, + "learning_rate": 4.9528453723469575e-06, + "logits/chosen": -0.32433629035949707, + "logits/rejected": -0.3889032006263733, + "logps/chosen": -60.962745666503906, + "logps/rejected": -84.3000717163086, + "loss": 0.8684, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.691221237182617, + "rewards/margins": 3.5808265209198, + "rewards/rejected": -0.8896050453186035, + "step": 3016 + }, + { + "epoch": 0.75, + "grad_norm": 7.506126403808594, + "learning_rate": 4.950225755073252e-06, + "logits/chosen": -0.2871345579624176, + "logits/rejected": -0.41628119349479675, + "logps/chosen": -62.26585388183594, + "logps/rejected": -69.9645004272461, + "loss": 0.7517, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.044135808944702, + "rewards/margins": 4.838805675506592, + "rewards/rejected": -1.7946701049804688, + "step": 3017 + }, + { + "epoch": 0.75, + "grad_norm": 4.127213478088379, + "learning_rate": 4.9476061514636555e-06, + "logits/chosen": -0.2657882571220398, + "logits/rejected": -0.37325721979141235, + "logps/chosen": -59.412086486816406, + "logps/rejected": -86.27450561523438, + "loss": 0.7721, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.997199773788452, + "rewards/margins": 4.638891696929932, + "rewards/rejected": -1.641692042350769, + "step": 3018 + }, + { + "epoch": 0.76, + "grad_norm": 6.611828804016113, + "learning_rate": 4.9449865622373e-06, + "logits/chosen": -0.2628169357776642, + "logits/rejected": -0.3787982165813446, + "logps/chosen": -65.17220306396484, + "logps/rejected": -78.08349609375, + "loss": 0.803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1215648651123047, + "rewards/margins": 4.139617443084717, + "rewards/rejected": -1.018052577972412, + "step": 3019 + }, + { + "epoch": 0.76, + "grad_norm": 4.7067718505859375, + "learning_rate": 4.942366988113319e-06, + "logits/chosen": -0.32820039987564087, + "logits/rejected": -0.47559136152267456, + "logps/chosen": -51.18568420410156, + "logps/rejected": -71.31220245361328, + "loss": 0.686, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6815974712371826, + "rewards/margins": 4.833612442016602, + "rewards/rejected": -2.15201473236084, + "step": 3020 + }, + { + "epoch": 0.76, + "grad_norm": 4.913428783416748, + "learning_rate": 4.939747429810845e-06, + "logits/chosen": -0.28912031650543213, + "logits/rejected": -0.42037975788116455, + "logps/chosen": -55.43324279785156, + "logps/rejected": -71.83350372314453, + "loss": 0.744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7251720428466797, + "rewards/margins": 4.578120708465576, + "rewards/rejected": -1.8529484272003174, + "step": 3021 + }, + { + "epoch": 0.76, + "grad_norm": 4.67929220199585, + "learning_rate": 4.937127888048999e-06, + "logits/chosen": -0.3458598256111145, + "logits/rejected": -0.4653015732765198, + "logps/chosen": -57.32166290283203, + "logps/rejected": -72.48709869384766, + "loss": 0.778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1053054332733154, + "rewards/margins": 4.466564178466797, + "rewards/rejected": -1.3612592220306396, + "step": 3022 + }, + { + "epoch": 0.76, + "grad_norm": 4.819404602050781, + "learning_rate": 4.934508363546906e-06, + "logits/chosen": -0.2373536229133606, + "logits/rejected": -0.4450695514678955, + "logps/chosen": -56.78292465209961, + "logps/rejected": -59.1710205078125, + "loss": 0.7152, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7646148204803467, + "rewards/margins": 4.495580673217773, + "rewards/rejected": -1.7309657335281372, + "step": 3023 + }, + { + "epoch": 0.76, + "grad_norm": 6.603288173675537, + "learning_rate": 4.931888857023682e-06, + "logits/chosen": -0.2881511449813843, + "logits/rejected": -0.37689828872680664, + "logps/chosen": -46.56312942504883, + "logps/rejected": -76.51705932617188, + "loss": 0.696, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.842698335647583, + "rewards/margins": 4.467310905456543, + "rewards/rejected": -1.6246129274368286, + "step": 3024 + }, + { + "epoch": 0.76, + "grad_norm": 6.274590015411377, + "learning_rate": 4.929269369198433e-06, + "logits/chosen": -0.29056599736213684, + "logits/rejected": -0.4462814927101135, + "logps/chosen": -59.25848388671875, + "logps/rejected": -72.80863952636719, + "loss": 0.6173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.634605646133423, + "rewards/margins": 4.086395740509033, + "rewards/rejected": -1.4517897367477417, + "step": 3025 + }, + { + "epoch": 0.76, + "grad_norm": 11.199725151062012, + "learning_rate": 4.926649900790272e-06, + "logits/chosen": -0.16545455157756805, + "logits/rejected": -0.2720740735530853, + "logps/chosen": -58.2507438659668, + "logps/rejected": -83.83932495117188, + "loss": 0.7229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965888023376465, + "rewards/margins": 4.379280090332031, + "rewards/rejected": -1.413392186164856, + "step": 3026 + }, + { + "epoch": 0.76, + "grad_norm": 4.973666191101074, + "learning_rate": 4.924030452518292e-06, + "logits/chosen": -0.3630087673664093, + "logits/rejected": -0.4451223611831665, + "logps/chosen": -52.56803894042969, + "logps/rejected": -80.1173324584961, + "loss": 0.8267, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9186692237854004, + "rewards/margins": 4.241790771484375, + "rewards/rejected": -1.3231213092803955, + "step": 3027 + }, + { + "epoch": 0.76, + "grad_norm": 3.785090208053589, + "learning_rate": 4.921411025101597e-06, + "logits/chosen": -0.3898302912712097, + "logits/rejected": -0.47044092416763306, + "logps/chosen": -57.373779296875, + "logps/rejected": -75.63418579101562, + "loss": 0.8087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9522511959075928, + "rewards/margins": 4.121056079864502, + "rewards/rejected": -1.1688048839569092, + "step": 3028 + }, + { + "epoch": 0.76, + "grad_norm": 4.952892780303955, + "learning_rate": 4.918791619259269e-06, + "logits/chosen": -0.30413469672203064, + "logits/rejected": -0.42822280526161194, + "logps/chosen": -58.375221252441406, + "logps/rejected": -67.13993835449219, + "loss": 0.7986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.552339792251587, + "rewards/margins": 4.500758647918701, + "rewards/rejected": -1.9484186172485352, + "step": 3029 + }, + { + "epoch": 0.76, + "grad_norm": 4.217815399169922, + "learning_rate": 4.916172235710393e-06, + "logits/chosen": -0.36179664731025696, + "logits/rejected": -0.5268373489379883, + "logps/chosen": -57.1559944152832, + "logps/rejected": -82.5537338256836, + "loss": 0.7732, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.118374824523926, + "rewards/margins": 5.129663944244385, + "rewards/rejected": -2.011289119720459, + "step": 3030 + }, + { + "epoch": 0.76, + "grad_norm": 5.535085678100586, + "learning_rate": 4.913552875174048e-06, + "logits/chosen": -0.29595303535461426, + "logits/rejected": -0.3636290729045868, + "logps/chosen": -55.68000411987305, + "logps/rejected": -89.10393524169922, + "loss": 0.7894, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.845510721206665, + "rewards/margins": 4.313895225524902, + "rewards/rejected": -1.4683845043182373, + "step": 3031 + }, + { + "epoch": 0.76, + "grad_norm": 5.077549934387207, + "learning_rate": 4.910933538369303e-06, + "logits/chosen": -0.31825825572013855, + "logits/rejected": -0.40416163206100464, + "logps/chosen": -47.10080337524414, + "logps/rejected": -77.78056335449219, + "loss": 0.7017, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8072729110717773, + "rewards/margins": 4.37066650390625, + "rewards/rejected": -1.5633937120437622, + "step": 3032 + }, + { + "epoch": 0.76, + "grad_norm": 9.929214477539062, + "learning_rate": 4.908314226015225e-06, + "logits/chosen": -0.2738085687160492, + "logits/rejected": -0.38749462366104126, + "logps/chosen": -66.44918060302734, + "logps/rejected": -83.83612823486328, + "loss": 0.9234, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.832751750946045, + "rewards/margins": 4.442568778991699, + "rewards/rejected": -1.6098171472549438, + "step": 3033 + }, + { + "epoch": 0.76, + "grad_norm": 4.617720127105713, + "learning_rate": 4.90569493883087e-06, + "logits/chosen": -0.285666823387146, + "logits/rejected": -0.3810884356498718, + "logps/chosen": -61.23303985595703, + "logps/rejected": -85.83927917480469, + "loss": 0.7952, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.021097183227539, + "rewards/margins": 3.753995656967163, + "rewards/rejected": -0.732898473739624, + "step": 3034 + }, + { + "epoch": 0.76, + "grad_norm": 2.8808133602142334, + "learning_rate": 4.903075677535289e-06, + "logits/chosen": -0.3540939688682556, + "logits/rejected": -0.396492600440979, + "logps/chosen": -49.31529235839844, + "logps/rejected": -88.18680572509766, + "loss": 0.6397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.759725332260132, + "rewards/margins": 4.515213489532471, + "rewards/rejected": -1.7554881572723389, + "step": 3035 + }, + { + "epoch": 0.76, + "grad_norm": 3.988612174987793, + "learning_rate": 4.900456442847528e-06, + "logits/chosen": -0.22239309549331665, + "logits/rejected": -0.31539028882980347, + "logps/chosen": -53.67955017089844, + "logps/rejected": -86.35813903808594, + "loss": 0.6963, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5654029846191406, + "rewards/margins": 4.185908317565918, + "rewards/rejected": -1.6205052137374878, + "step": 3036 + }, + { + "epoch": 0.76, + "grad_norm": 6.667260646820068, + "learning_rate": 4.8978372354866175e-06, + "logits/chosen": -0.3344370126724243, + "logits/rejected": -0.44301798939704895, + "logps/chosen": -55.39439010620117, + "logps/rejected": -81.9665298461914, + "loss": 0.736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.075498104095459, + "rewards/margins": 4.852550029754639, + "rewards/rejected": -1.7770516872406006, + "step": 3037 + }, + { + "epoch": 0.76, + "grad_norm": 3.189878225326538, + "learning_rate": 4.895218056171593e-06, + "logits/chosen": -0.3302624523639679, + "logits/rejected": -0.45072248578071594, + "logps/chosen": -47.818328857421875, + "logps/rejected": -70.35010528564453, + "loss": 0.7171, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108004093170166, + "rewards/margins": 4.82924222946167, + "rewards/rejected": -1.7212387323379517, + "step": 3038 + }, + { + "epoch": 0.76, + "grad_norm": 3.8328516483306885, + "learning_rate": 4.892598905621473e-06, + "logits/chosen": -0.32460182905197144, + "logits/rejected": -0.391448050737381, + "logps/chosen": -47.785491943359375, + "logps/rejected": -69.70697784423828, + "loss": 0.7387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1151788234710693, + "rewards/margins": 4.078667640686035, + "rewards/rejected": -0.9634888768196106, + "step": 3039 + }, + { + "epoch": 0.76, + "grad_norm": 6.02952241897583, + "learning_rate": 4.88997978455527e-06, + "logits/chosen": -0.2905171811580658, + "logits/rejected": -0.41780662536621094, + "logps/chosen": -65.30786895751953, + "logps/rejected": -72.17972564697266, + "loss": 0.8039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9560775756835938, + "rewards/margins": 4.618526935577393, + "rewards/rejected": -1.6624493598937988, + "step": 3040 + }, + { + "epoch": 0.76, + "grad_norm": 7.036392688751221, + "learning_rate": 4.887360693691991e-06, + "logits/chosen": -0.28564366698265076, + "logits/rejected": -0.3931255340576172, + "logps/chosen": -59.09593963623047, + "logps/rejected": -91.66423034667969, + "loss": 0.8555, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7498931884765625, + "rewards/margins": 5.075676441192627, + "rewards/rejected": -2.3257830142974854, + "step": 3041 + }, + { + "epoch": 0.76, + "grad_norm": 5.03404426574707, + "learning_rate": 4.884741633750632e-06, + "logits/chosen": -0.39324769377708435, + "logits/rejected": -0.4294118881225586, + "logps/chosen": -51.581520080566406, + "logps/rejected": -101.82337951660156, + "loss": 0.7083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9465689659118652, + "rewards/margins": 5.445178985595703, + "rewards/rejected": -2.498610258102417, + "step": 3042 + }, + { + "epoch": 0.76, + "grad_norm": 12.609029769897461, + "learning_rate": 4.882122605450181e-06, + "logits/chosen": -0.30937373638153076, + "logits/rejected": -0.5152533650398254, + "logps/chosen": -62.24604034423828, + "logps/rejected": -73.3188705444336, + "loss": 0.6138, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8024630546569824, + "rewards/margins": 5.554919719696045, + "rewards/rejected": -2.7524566650390625, + "step": 3043 + }, + { + "epoch": 0.76, + "grad_norm": 7.231804847717285, + "learning_rate": 4.879503609509619e-06, + "logits/chosen": -0.29334232211112976, + "logits/rejected": -0.3776787519454956, + "logps/chosen": -59.198150634765625, + "logps/rejected": -71.38232421875, + "loss": 0.9154, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6330454349517822, + "rewards/margins": 3.2273290157318115, + "rewards/rejected": -0.594283938407898, + "step": 3044 + }, + { + "epoch": 0.76, + "grad_norm": 7.147055149078369, + "learning_rate": 4.876884646647916e-06, + "logits/chosen": -0.34484872221946716, + "logits/rejected": -0.35920751094818115, + "logps/chosen": -50.20205307006836, + "logps/rejected": -78.82990264892578, + "loss": 0.825, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.648454427719116, + "rewards/margins": 3.3039073944091797, + "rewards/rejected": -0.6554528474807739, + "step": 3045 + }, + { + "epoch": 0.76, + "grad_norm": 12.312824249267578, + "learning_rate": 4.874265717584034e-06, + "logits/chosen": -0.3549245297908783, + "logits/rejected": -0.43661561608314514, + "logps/chosen": -44.52496337890625, + "logps/rejected": -65.931884765625, + "loss": 0.9572, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.673553228378296, + "rewards/margins": 2.928032159805298, + "rewards/rejected": -0.25447878241539, + "step": 3046 + }, + { + "epoch": 0.76, + "grad_norm": 4.748405933380127, + "learning_rate": 4.871646823036925e-06, + "logits/chosen": -0.4139266610145569, + "logits/rejected": -0.5088695883750916, + "logps/chosen": -61.019561767578125, + "logps/rejected": -69.51554107666016, + "loss": 0.8092, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9406979084014893, + "rewards/margins": 3.999101161956787, + "rewards/rejected": -1.0584032535552979, + "step": 3047 + }, + { + "epoch": 0.76, + "grad_norm": 4.650622844696045, + "learning_rate": 4.869027963725536e-06, + "logits/chosen": -0.36925506591796875, + "logits/rejected": -0.4294235110282898, + "logps/chosen": -57.47101593017578, + "logps/rejected": -102.94839477539062, + "loss": 0.7853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965609312057495, + "rewards/margins": 4.996096134185791, + "rewards/rejected": -2.030486583709717, + "step": 3048 + }, + { + "epoch": 0.76, + "grad_norm": 3.6264193058013916, + "learning_rate": 4.866409140368795e-06, + "logits/chosen": -0.2803688943386078, + "logits/rejected": -0.3561522364616394, + "logps/chosen": -62.94910430908203, + "logps/rejected": -94.46192932128906, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7238965034484863, + "rewards/margins": 4.750492095947266, + "rewards/rejected": -2.0265960693359375, + "step": 3049 + }, + { + "epoch": 0.76, + "grad_norm": 4.557307720184326, + "learning_rate": 4.8637903536856266e-06, + "logits/chosen": -0.25923818349838257, + "logits/rejected": -0.36370038986206055, + "logps/chosen": -57.289649963378906, + "logps/rejected": -85.24275970458984, + "loss": 0.7241, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.044424057006836, + "rewards/margins": 4.692355155944824, + "rewards/rejected": -1.6479308605194092, + "step": 3050 + }, + { + "epoch": 0.76, + "grad_norm": 5.408829689025879, + "learning_rate": 4.861171604394946e-06, + "logits/chosen": -0.3077453374862671, + "logits/rejected": -0.38759931921958923, + "logps/chosen": -57.82349395751953, + "logps/rejected": -83.83961486816406, + "loss": 0.724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.799727439880371, + "rewards/margins": 3.67311429977417, + "rewards/rejected": -0.873386800289154, + "step": 3051 + }, + { + "epoch": 0.76, + "grad_norm": 6.315060138702393, + "learning_rate": 4.858552893215655e-06, + "logits/chosen": -0.3898613452911377, + "logits/rejected": -0.41285598278045654, + "logps/chosen": -44.28948974609375, + "logps/rejected": -81.288818359375, + "loss": 0.7767, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8509910106658936, + "rewards/margins": 4.0556111335754395, + "rewards/rejected": -1.2046202421188354, + "step": 3052 + }, + { + "epoch": 0.76, + "grad_norm": 4.684748649597168, + "learning_rate": 4.855934220866648e-06, + "logits/chosen": -0.2624739408493042, + "logits/rejected": -0.45128196477890015, + "logps/chosen": -60.84051513671875, + "logps/rejected": -86.57749938964844, + "loss": 0.636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.558971405029297, + "rewards/margins": 5.056859016418457, + "rewards/rejected": -2.4978880882263184, + "step": 3053 + }, + { + "epoch": 0.76, + "grad_norm": 4.400240898132324, + "learning_rate": 4.853315588066806e-06, + "logits/chosen": -0.27569258213043213, + "logits/rejected": -0.36796092987060547, + "logps/chosen": -64.43128967285156, + "logps/rejected": -80.66788482666016, + "loss": 0.8958, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.811776638031006, + "rewards/margins": 4.014388084411621, + "rewards/rejected": -1.2026116847991943, + "step": 3054 + }, + { + "epoch": 0.76, + "grad_norm": 3.4538848400115967, + "learning_rate": 4.850696995535e-06, + "logits/chosen": -0.3036878705024719, + "logits/rejected": -0.4249902367591858, + "logps/chosen": -51.223426818847656, + "logps/rejected": -84.04381561279297, + "loss": 0.7363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056471824645996, + "rewards/margins": 4.763962745666504, + "rewards/rejected": -1.707491159439087, + "step": 3055 + }, + { + "epoch": 0.76, + "grad_norm": 5.690296173095703, + "learning_rate": 4.84807844399009e-06, + "logits/chosen": -0.28220003843307495, + "logits/rejected": -0.3825848698616028, + "logps/chosen": -59.117286682128906, + "logps/rejected": -72.3951416015625, + "loss": 0.8317, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.673948287963867, + "rewards/margins": 3.344970941543579, + "rewards/rejected": -0.6710226535797119, + "step": 3056 + }, + { + "epoch": 0.76, + "grad_norm": 9.625473976135254, + "learning_rate": 4.845459934150925e-06, + "logits/chosen": -0.2597830593585968, + "logits/rejected": -0.398674875497818, + "logps/chosen": -69.2138442993164, + "logps/rejected": -77.79917907714844, + "loss": 1.2704, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4133331775665283, + "rewards/margins": 3.1490895748138428, + "rewards/rejected": -0.7357565760612488, + "step": 3057 + }, + { + "epoch": 0.76, + "grad_norm": 3.541346549987793, + "learning_rate": 4.842841466736344e-06, + "logits/chosen": -0.3215515613555908, + "logits/rejected": -0.5201030969619751, + "logps/chosen": -47.87913513183594, + "logps/rejected": -61.3140983581543, + "loss": 0.5955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.925403594970703, + "rewards/margins": 5.37601375579834, + "rewards/rejected": -2.4506101608276367, + "step": 3058 + }, + { + "epoch": 0.77, + "grad_norm": 4.316867828369141, + "learning_rate": 4.840223042465173e-06, + "logits/chosen": -0.28839677572250366, + "logits/rejected": -0.4097106158733368, + "logps/chosen": -61.78440856933594, + "logps/rejected": -72.1072998046875, + "loss": 0.7048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.659897565841675, + "rewards/margins": 3.8177363872528076, + "rewards/rejected": -1.1578385829925537, + "step": 3059 + }, + { + "epoch": 0.77, + "grad_norm": 4.679884433746338, + "learning_rate": 4.837604662056222e-06, + "logits/chosen": -0.31547582149505615, + "logits/rejected": -0.43389004468917847, + "logps/chosen": -52.12153625488281, + "logps/rejected": -73.04979705810547, + "loss": 0.6538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0240068435668945, + "rewards/margins": 4.417196273803711, + "rewards/rejected": -1.3931890726089478, + "step": 3060 + }, + { + "epoch": 0.77, + "grad_norm": 3.8911640644073486, + "learning_rate": 4.834986326228297e-06, + "logits/chosen": -0.36634474992752075, + "logits/rejected": -0.3771263062953949, + "logps/chosen": -43.136131286621094, + "logps/rejected": -95.9066162109375, + "loss": 0.7844, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0068581104278564, + "rewards/margins": 4.935697078704834, + "rewards/rejected": -1.928838849067688, + "step": 3061 + }, + { + "epoch": 0.77, + "grad_norm": 6.487798690795898, + "learning_rate": 4.832368035700184e-06, + "logits/chosen": -0.28250622749328613, + "logits/rejected": -0.38125962018966675, + "logps/chosen": -56.75364303588867, + "logps/rejected": -69.78699493408203, + "loss": 0.7098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8013455867767334, + "rewards/margins": 3.714404821395874, + "rewards/rejected": -0.9130592942237854, + "step": 3062 + }, + { + "epoch": 0.77, + "grad_norm": 8.341302871704102, + "learning_rate": 4.829749791190665e-06, + "logits/chosen": -0.3876707851886749, + "logits/rejected": -0.45215916633605957, + "logps/chosen": -45.25009536743164, + "logps/rejected": -78.620849609375, + "loss": 0.6799, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.159377098083496, + "rewards/margins": 4.448867321014404, + "rewards/rejected": -1.28948974609375, + "step": 3063 + }, + { + "epoch": 0.77, + "grad_norm": 12.922962188720703, + "learning_rate": 4.827131593418501e-06, + "logits/chosen": -0.3340527415275574, + "logits/rejected": -0.4132305979728699, + "logps/chosen": -55.04671859741211, + "logps/rejected": -78.9515609741211, + "loss": 0.8351, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.906078338623047, + "rewards/margins": 4.261179447174072, + "rewards/rejected": -1.3551013469696045, + "step": 3064 + }, + { + "epoch": 0.77, + "grad_norm": 5.096838474273682, + "learning_rate": 4.8245134431024434e-06, + "logits/chosen": -0.2821447253227234, + "logits/rejected": -0.3152148723602295, + "logps/chosen": -55.40129852294922, + "logps/rejected": -86.72409057617188, + "loss": 0.6997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.775125026702881, + "rewards/margins": 4.146474361419678, + "rewards/rejected": -1.371349811553955, + "step": 3065 + }, + { + "epoch": 0.77, + "grad_norm": 4.702424049377441, + "learning_rate": 4.821895340961235e-06, + "logits/chosen": -0.29263460636138916, + "logits/rejected": -0.3354506492614746, + "logps/chosen": -67.05650329589844, + "logps/rejected": -99.45486450195312, + "loss": 0.8418, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7468655109405518, + "rewards/margins": 4.708756446838379, + "rewards/rejected": -1.9618908166885376, + "step": 3066 + }, + { + "epoch": 0.77, + "grad_norm": 4.386317253112793, + "learning_rate": 4.8192772877135965e-06, + "logits/chosen": -0.30750927329063416, + "logits/rejected": -0.3765814006328583, + "logps/chosen": -48.67847442626953, + "logps/rejected": -73.625244140625, + "loss": 0.7831, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.628052234649658, + "rewards/margins": 4.0755934715271, + "rewards/rejected": -1.4475409984588623, + "step": 3067 + }, + { + "epoch": 0.77, + "grad_norm": 6.332527160644531, + "learning_rate": 4.816659284078243e-06, + "logits/chosen": -0.32549849152565, + "logits/rejected": -0.34717151522636414, + "logps/chosen": -53.16516876220703, + "logps/rejected": -92.77716064453125, + "loss": 0.7919, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8733584880828857, + "rewards/margins": 4.194353103637695, + "rewards/rejected": -1.3209946155548096, + "step": 3068 + }, + { + "epoch": 0.77, + "grad_norm": 7.635319232940674, + "learning_rate": 4.814041330773874e-06, + "logits/chosen": -0.24707865715026855, + "logits/rejected": -0.42107051610946655, + "logps/chosen": -58.63449478149414, + "logps/rejected": -76.36845397949219, + "loss": 0.8881, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.428755760192871, + "rewards/margins": 4.377598762512207, + "rewards/rejected": -1.9488435983657837, + "step": 3069 + }, + { + "epoch": 0.77, + "grad_norm": 5.349026203155518, + "learning_rate": 4.811423428519169e-06, + "logits/chosen": -0.16569046676158905, + "logits/rejected": -0.3045768439769745, + "logps/chosen": -64.18519592285156, + "logps/rejected": -63.524131774902344, + "loss": 0.7929, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.709116220474243, + "rewards/margins": 3.331395149230957, + "rewards/rejected": -0.6222788095474243, + "step": 3070 + }, + { + "epoch": 0.77, + "grad_norm": 8.538989067077637, + "learning_rate": 4.808805578032805e-06, + "logits/chosen": -0.20247666537761688, + "logits/rejected": -0.31260785460472107, + "logps/chosen": -59.01909255981445, + "logps/rejected": -81.29428100585938, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.795309066772461, + "rewards/margins": 4.656279563903809, + "rewards/rejected": -1.8609704971313477, + "step": 3071 + }, + { + "epoch": 0.77, + "grad_norm": 3.896792411804199, + "learning_rate": 4.8061877800334315e-06, + "logits/chosen": -0.25572994351387024, + "logits/rejected": -0.4383924603462219, + "logps/chosen": -60.6844596862793, + "logps/rejected": -77.14994812011719, + "loss": 0.8356, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6701605319976807, + "rewards/margins": 4.763697624206543, + "rewards/rejected": -2.0935373306274414, + "step": 3072 + }, + { + "epoch": 0.77, + "grad_norm": 30.70270347595215, + "learning_rate": 4.803570035239699e-06, + "logits/chosen": -0.28292936086654663, + "logits/rejected": -0.4065239429473877, + "logps/chosen": -52.27139663696289, + "logps/rejected": -66.59255981445312, + "loss": 0.799, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.865339756011963, + "rewards/margins": 4.403210163116455, + "rewards/rejected": -1.5378706455230713, + "step": 3073 + }, + { + "epoch": 0.77, + "grad_norm": 8.869002342224121, + "learning_rate": 4.800952344370229e-06, + "logits/chosen": -0.3144509792327881, + "logits/rejected": -0.46177130937576294, + "logps/chosen": -58.75611877441406, + "logps/rejected": -66.45309448242188, + "loss": 0.7347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7994728088378906, + "rewards/margins": 4.512420654296875, + "rewards/rejected": -1.7129476070404053, + "step": 3074 + }, + { + "epoch": 0.77, + "grad_norm": 3.8868935108184814, + "learning_rate": 4.798334708143634e-06, + "logits/chosen": -0.27132463455200195, + "logits/rejected": -0.39172685146331787, + "logps/chosen": -51.70927429199219, + "logps/rejected": -75.92920684814453, + "loss": 0.7843, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.024597644805908, + "rewards/margins": 4.437369346618652, + "rewards/rejected": -1.4127713441848755, + "step": 3075 + }, + { + "epoch": 0.77, + "grad_norm": 9.117644309997559, + "learning_rate": 4.795717127278515e-06, + "logits/chosen": -0.40935730934143066, + "logits/rejected": -0.4743596911430359, + "logps/chosen": -55.861019134521484, + "logps/rejected": -76.78842163085938, + "loss": 0.8068, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0279252529144287, + "rewards/margins": 4.204817295074463, + "rewards/rejected": -1.1768920421600342, + "step": 3076 + }, + { + "epoch": 0.77, + "grad_norm": 4.5502753257751465, + "learning_rate": 4.793099602493451e-06, + "logits/chosen": -0.25519078969955444, + "logits/rejected": -0.34118878841400146, + "logps/chosen": -66.39208984375, + "logps/rejected": -94.05956268310547, + "loss": 0.8034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.755559206008911, + "rewards/margins": 4.227481842041016, + "rewards/rejected": -1.4719229936599731, + "step": 3077 + }, + { + "epoch": 0.77, + "grad_norm": 5.909065246582031, + "learning_rate": 4.790482134507013e-06, + "logits/chosen": -0.3481157720088959, + "logits/rejected": -0.4090951681137085, + "logps/chosen": -58.53630447387695, + "logps/rejected": -73.86305236816406, + "loss": 0.8907, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.974273681640625, + "rewards/margins": 3.405432939529419, + "rewards/rejected": -0.4311588704586029, + "step": 3078 + }, + { + "epoch": 0.77, + "grad_norm": 3.460092067718506, + "learning_rate": 4.78786472403775e-06, + "logits/chosen": -0.34681323170661926, + "logits/rejected": -0.39904195070266724, + "logps/chosen": -46.26806640625, + "logps/rejected": -68.61272430419922, + "loss": 0.8499, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9336302280426025, + "rewards/margins": 3.622803211212158, + "rewards/rejected": -0.6891731023788452, + "step": 3079 + }, + { + "epoch": 0.77, + "grad_norm": 6.600325107574463, + "learning_rate": 4.785247371804195e-06, + "logits/chosen": -0.33152568340301514, + "logits/rejected": -0.3758848309516907, + "logps/chosen": -51.54472351074219, + "logps/rejected": -78.86487579345703, + "loss": 0.859, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.718305826187134, + "rewards/margins": 3.742231607437134, + "rewards/rejected": -1.02392578125, + "step": 3080 + }, + { + "epoch": 0.77, + "grad_norm": 7.307646751403809, + "learning_rate": 4.782630078524873e-06, + "logits/chosen": -0.21448290348052979, + "logits/rejected": -0.36163297295570374, + "logps/chosen": -66.01589965820312, + "logps/rejected": -86.68146514892578, + "loss": 0.7759, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8403308391571045, + "rewards/margins": 4.219036102294922, + "rewards/rejected": -1.3787050247192383, + "step": 3081 + }, + { + "epoch": 0.77, + "grad_norm": 3.514364719390869, + "learning_rate": 4.780012844918282e-06, + "logits/chosen": -0.25059375166893005, + "logits/rejected": -0.3588123917579651, + "logps/chosen": -55.53975296020508, + "logps/rejected": -84.5465087890625, + "loss": 0.6583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8712496757507324, + "rewards/margins": 4.539581775665283, + "rewards/rejected": -1.6683317422866821, + "step": 3082 + }, + { + "epoch": 0.77, + "grad_norm": 4.650937080383301, + "learning_rate": 4.777395671702916e-06, + "logits/chosen": -0.35031789541244507, + "logits/rejected": -0.44631606340408325, + "logps/chosen": -52.15183639526367, + "logps/rejected": -73.32642364501953, + "loss": 0.796, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.948740243911743, + "rewards/margins": 3.5262959003448486, + "rewards/rejected": -0.5775558948516846, + "step": 3083 + }, + { + "epoch": 0.77, + "grad_norm": 6.3605523109436035, + "learning_rate": 4.77477855959724e-06, + "logits/chosen": -0.18456843495368958, + "logits/rejected": -0.38369041681289673, + "logps/chosen": -62.34592819213867, + "logps/rejected": -71.77067565917969, + "loss": 0.8129, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.864537000656128, + "rewards/margins": 4.603054523468018, + "rewards/rejected": -1.7385177612304688, + "step": 3084 + }, + { + "epoch": 0.77, + "grad_norm": 3.5743565559387207, + "learning_rate": 4.772161509319707e-06, + "logits/chosen": -0.2926687002182007, + "logits/rejected": -0.38470250368118286, + "logps/chosen": -56.793853759765625, + "logps/rejected": -82.52705383300781, + "loss": 0.6603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00020694732666, + "rewards/margins": 4.241044998168945, + "rewards/rejected": -1.2408380508422852, + "step": 3085 + }, + { + "epoch": 0.77, + "grad_norm": 5.695867538452148, + "learning_rate": 4.769544521588759e-06, + "logits/chosen": -0.26986947655677795, + "logits/rejected": -0.44573938846588135, + "logps/chosen": -59.84601974487305, + "logps/rejected": -67.73031616210938, + "loss": 0.7403, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.706047534942627, + "rewards/margins": 4.978918075561523, + "rewards/rejected": -2.272871255874634, + "step": 3086 + }, + { + "epoch": 0.77, + "grad_norm": 6.967496871948242, + "learning_rate": 4.7669275971228106e-06, + "logits/chosen": -0.40791943669319153, + "logits/rejected": -0.5053626894950867, + "logps/chosen": -49.98096466064453, + "logps/rejected": -71.70336151123047, + "loss": 0.8218, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8243560791015625, + "rewards/margins": 4.114754676818848, + "rewards/rejected": -1.2903985977172852, + "step": 3087 + }, + { + "epoch": 0.77, + "grad_norm": 7.8823981285095215, + "learning_rate": 4.764310736640267e-06, + "logits/chosen": -0.3702043890953064, + "logits/rejected": -0.5007522702217102, + "logps/chosen": -55.666900634765625, + "logps/rejected": -59.55112838745117, + "loss": 0.8154, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.137849807739258, + "rewards/margins": 4.339187145233154, + "rewards/rejected": -1.201337456703186, + "step": 3088 + }, + { + "epoch": 0.77, + "grad_norm": 5.098115921020508, + "learning_rate": 4.761693940859512e-06, + "logits/chosen": -0.24356865882873535, + "logits/rejected": -0.41142815351486206, + "logps/chosen": -54.84083557128906, + "logps/rejected": -80.02799987792969, + "loss": 0.6993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.856417655944824, + "rewards/margins": 4.504225730895996, + "rewards/rejected": -1.6478075981140137, + "step": 3089 + }, + { + "epoch": 0.77, + "grad_norm": 6.308203220367432, + "learning_rate": 4.759077210498913e-06, + "logits/chosen": -0.32591861486434937, + "logits/rejected": -0.4461280405521393, + "logps/chosen": -62.0037727355957, + "logps/rejected": -67.25125122070312, + "loss": 0.7337, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1056697368621826, + "rewards/margins": 3.9704341888427734, + "rewards/rejected": -0.8647643327713013, + "step": 3090 + }, + { + "epoch": 0.77, + "grad_norm": 3.463392972946167, + "learning_rate": 4.756460546276819e-06, + "logits/chosen": -0.3146510720252991, + "logits/rejected": -0.41097599267959595, + "logps/chosen": -51.757957458496094, + "logps/rejected": -76.10989379882812, + "loss": 0.6869, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017322540283203, + "rewards/margins": 4.3395490646362305, + "rewards/rejected": -1.3222264051437378, + "step": 3091 + }, + { + "epoch": 0.77, + "grad_norm": 7.312197685241699, + "learning_rate": 4.753843948911556e-06, + "logits/chosen": -0.2785179018974304, + "logits/rejected": -0.4852733314037323, + "logps/chosen": -57.61671447753906, + "logps/rejected": -65.9403305053711, + "loss": 0.6737, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0463101863861084, + "rewards/margins": 4.854257106781006, + "rewards/rejected": -1.8079464435577393, + "step": 3092 + }, + { + "epoch": 0.77, + "grad_norm": 4.122835636138916, + "learning_rate": 4.751227419121446e-06, + "logits/chosen": -0.29138821363449097, + "logits/rejected": -0.4236949682235718, + "logps/chosen": -54.92835235595703, + "logps/rejected": -72.52478790283203, + "loss": 0.7734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.065276622772217, + "rewards/margins": 5.3576154708862305, + "rewards/rejected": -2.2923388481140137, + "step": 3093 + }, + { + "epoch": 0.77, + "grad_norm": 5.857145309448242, + "learning_rate": 4.748610957624776e-06, + "logits/chosen": -0.29559075832366943, + "logits/rejected": -0.3522030711174011, + "logps/chosen": -71.68971252441406, + "logps/rejected": -96.1700210571289, + "loss": 0.8644, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6479105949401855, + "rewards/margins": 4.3492937088012695, + "rewards/rejected": -1.7013828754425049, + "step": 3094 + }, + { + "epoch": 0.77, + "grad_norm": 4.905324459075928, + "learning_rate": 4.745994565139821e-06, + "logits/chosen": -0.2838480472564697, + "logits/rejected": -0.36746442317962646, + "logps/chosen": -52.370948791503906, + "logps/rejected": -72.14820098876953, + "loss": 0.8124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7207798957824707, + "rewards/margins": 3.5624213218688965, + "rewards/rejected": -0.8416415452957153, + "step": 3095 + }, + { + "epoch": 0.77, + "grad_norm": 2.9703564643859863, + "learning_rate": 4.74337824238484e-06, + "logits/chosen": -0.23211662471294403, + "logits/rejected": -0.36734476685523987, + "logps/chosen": -63.50848388671875, + "logps/rejected": -73.94234466552734, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9508183002471924, + "rewards/margins": 4.617962837219238, + "rewards/rejected": -1.667144775390625, + "step": 3096 + }, + { + "epoch": 0.77, + "grad_norm": 2.471972703933716, + "learning_rate": 4.7407619900780685e-06, + "logits/chosen": -0.4000677764415741, + "logits/rejected": -0.48190295696258545, + "logps/chosen": -58.389434814453125, + "logps/rejected": -85.36673736572266, + "loss": 0.6232, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8468055725097656, + "rewards/margins": 4.8854570388793945, + "rewards/rejected": -2.038651466369629, + "step": 3097 + }, + { + "epoch": 0.77, + "grad_norm": 6.063811779022217, + "learning_rate": 4.7381458089377245e-06, + "logits/chosen": -0.3119436204433441, + "logits/rejected": -0.3986780047416687, + "logps/chosen": -64.98129272460938, + "logps/rejected": -74.67269134521484, + "loss": 0.9589, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.722703456878662, + "rewards/margins": 3.9775259494781494, + "rewards/rejected": -1.2548226118087769, + "step": 3098 + }, + { + "epoch": 0.78, + "grad_norm": 5.04144811630249, + "learning_rate": 4.735529699682007e-06, + "logits/chosen": -0.2942078411579132, + "logits/rejected": -0.37750986218452454, + "logps/chosen": -70.88455200195312, + "logps/rejected": -77.39306640625, + "loss": 0.8807, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.913065195083618, + "rewards/margins": 3.9370977878570557, + "rewards/rejected": -1.0240328311920166, + "step": 3099 + }, + { + "epoch": 0.78, + "grad_norm": 5.173310279846191, + "learning_rate": 4.732913663029093e-06, + "logits/chosen": -0.3672761619091034, + "logits/rejected": -0.4500340521335602, + "logps/chosen": -57.67046356201172, + "logps/rejected": -74.94441223144531, + "loss": 0.7827, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.380950689315796, + "rewards/margins": 3.902573347091675, + "rewards/rejected": -1.5216225385665894, + "step": 3100 + }, + { + "epoch": 0.78, + "grad_norm": 5.994065761566162, + "learning_rate": 4.7302976996971425e-06, + "logits/chosen": -0.34529826045036316, + "logits/rejected": -0.5035682916641235, + "logps/chosen": -56.67772674560547, + "logps/rejected": -70.79972839355469, + "loss": 0.8831, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.950953960418701, + "rewards/margins": 4.06153678894043, + "rewards/rejected": -1.1105823516845703, + "step": 3101 + }, + { + "epoch": 0.78, + "grad_norm": 5.315736293792725, + "learning_rate": 4.727681810404292e-06, + "logits/chosen": -0.35924676060676575, + "logits/rejected": -0.41970011591911316, + "logps/chosen": -54.083438873291016, + "logps/rejected": -78.76278686523438, + "loss": 0.9022, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.792571783065796, + "rewards/margins": 3.8314523696899414, + "rewards/rejected": -1.0388808250427246, + "step": 3102 + }, + { + "epoch": 0.78, + "grad_norm": 5.8685197830200195, + "learning_rate": 4.725065995868663e-06, + "logits/chosen": -0.2712559401988983, + "logits/rejected": -0.3321723937988281, + "logps/chosen": -50.21714401245117, + "logps/rejected": -85.21699523925781, + "loss": 0.7307, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.698665142059326, + "rewards/margins": 4.354088306427002, + "rewards/rejected": -1.6554230451583862, + "step": 3103 + }, + { + "epoch": 0.78, + "grad_norm": 5.27936315536499, + "learning_rate": 4.72245025680835e-06, + "logits/chosen": -0.3030937910079956, + "logits/rejected": -0.40874728560447693, + "logps/chosen": -53.178226470947266, + "logps/rejected": -63.673805236816406, + "loss": 0.8326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.503817319869995, + "rewards/margins": 3.883639097213745, + "rewards/rejected": -1.3798216581344604, + "step": 3104 + }, + { + "epoch": 0.78, + "grad_norm": 4.6001739501953125, + "learning_rate": 4.719834593941429e-06, + "logits/chosen": -0.3417809009552002, + "logits/rejected": -0.345931738615036, + "logps/chosen": -57.85498809814453, + "logps/rejected": -108.66709899902344, + "loss": 0.8137, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.911283493041992, + "rewards/margins": 5.439757823944092, + "rewards/rejected": -2.5284738540649414, + "step": 3105 + }, + { + "epoch": 0.78, + "grad_norm": 10.098719596862793, + "learning_rate": 4.7172190079859586e-06, + "logits/chosen": -0.3190375566482544, + "logits/rejected": -0.4159260392189026, + "logps/chosen": -51.28656005859375, + "logps/rejected": -74.91496276855469, + "loss": 0.7803, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.061062812805176, + "rewards/margins": 4.13105583190918, + "rewards/rejected": -1.0699927806854248, + "step": 3106 + }, + { + "epoch": 0.78, + "grad_norm": 4.53516960144043, + "learning_rate": 4.7146034996599715e-06, + "logits/chosen": -0.4356548488140106, + "logits/rejected": -0.504984438419342, + "logps/chosen": -52.781070709228516, + "logps/rejected": -81.17759704589844, + "loss": 0.7359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.733769416809082, + "rewards/margins": 3.568084478378296, + "rewards/rejected": -0.8343154191970825, + "step": 3107 + }, + { + "epoch": 0.78, + "grad_norm": 8.795157432556152, + "learning_rate": 4.7119880696814835e-06, + "logits/chosen": -0.32944655418395996, + "logits/rejected": -0.451093852519989, + "logps/chosen": -66.16978454589844, + "logps/rejected": -71.3012466430664, + "loss": 1.0143, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4816408157348633, + "rewards/margins": 3.5087451934814453, + "rewards/rejected": -1.027104377746582, + "step": 3108 + }, + { + "epoch": 0.78, + "grad_norm": 3.999239921569824, + "learning_rate": 4.709372718768485e-06, + "logits/chosen": -0.22460244596004486, + "logits/rejected": -0.26290765404701233, + "logps/chosen": -68.02654266357422, + "logps/rejected": -96.86044311523438, + "loss": 0.8173, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9082510471343994, + "rewards/margins": 4.309776782989502, + "rewards/rejected": -1.4015253782272339, + "step": 3109 + }, + { + "epoch": 0.78, + "grad_norm": 5.647953033447266, + "learning_rate": 4.706757447638945e-06, + "logits/chosen": -0.3273419737815857, + "logits/rejected": -0.39757806062698364, + "logps/chosen": -52.907100677490234, + "logps/rejected": -80.29244995117188, + "loss": 0.7694, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6566781997680664, + "rewards/margins": 3.8760998249053955, + "rewards/rejected": -1.21942138671875, + "step": 3110 + }, + { + "epoch": 0.78, + "grad_norm": 4.415513515472412, + "learning_rate": 4.704142257010814e-06, + "logits/chosen": -0.32069119811058044, + "logits/rejected": -0.3845372200012207, + "logps/chosen": -47.82819366455078, + "logps/rejected": -86.2443618774414, + "loss": 0.6267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9145448207855225, + "rewards/margins": 4.826516628265381, + "rewards/rejected": -1.9119715690612793, + "step": 3111 + }, + { + "epoch": 0.78, + "grad_norm": 4.005977153778076, + "learning_rate": 4.701527147602018e-06, + "logits/chosen": -0.2968183755874634, + "logits/rejected": -0.4333871006965637, + "logps/chosen": -61.05113220214844, + "logps/rejected": -94.24861145019531, + "loss": 0.6932, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7601873874664307, + "rewards/margins": 4.464561462402344, + "rewards/rejected": -1.7043745517730713, + "step": 3112 + }, + { + "epoch": 0.78, + "grad_norm": 7.257612228393555, + "learning_rate": 4.69891212013046e-06, + "logits/chosen": -0.24900202453136444, + "logits/rejected": -0.34062281250953674, + "logps/chosen": -55.678504943847656, + "logps/rejected": -82.54544067382812, + "loss": 0.7379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8891890048980713, + "rewards/margins": 4.008955478668213, + "rewards/rejected": -1.1197669506072998, + "step": 3113 + }, + { + "epoch": 0.78, + "grad_norm": 7.475027561187744, + "learning_rate": 4.696297175314024e-06, + "logits/chosen": -0.21047347784042358, + "logits/rejected": -0.31165608763694763, + "logps/chosen": -60.64242172241211, + "logps/rejected": -84.75183868408203, + "loss": 0.8897, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5378806591033936, + "rewards/margins": 4.041058540344238, + "rewards/rejected": -1.5031777620315552, + "step": 3114 + }, + { + "epoch": 0.78, + "grad_norm": 5.359520435333252, + "learning_rate": 4.693682313870564e-06, + "logits/chosen": -0.29456230998039246, + "logits/rejected": -0.3277309536933899, + "logps/chosen": -58.77156066894531, + "logps/rejected": -83.90581512451172, + "loss": 0.8036, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0371689796447754, + "rewards/margins": 4.252773284912109, + "rewards/rejected": -1.2156041860580444, + "step": 3115 + }, + { + "epoch": 0.78, + "grad_norm": 6.426365852355957, + "learning_rate": 4.691067536517921e-06, + "logits/chosen": -0.2649955749511719, + "logits/rejected": -0.3574727177619934, + "logps/chosen": -62.392948150634766, + "logps/rejected": -80.00394439697266, + "loss": 0.7268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.095529556274414, + "rewards/margins": 3.9721291065216064, + "rewards/rejected": -0.8765997290611267, + "step": 3116 + }, + { + "epoch": 0.78, + "grad_norm": 2.715935230255127, + "learning_rate": 4.688452843973903e-06, + "logits/chosen": -0.29466474056243896, + "logits/rejected": -0.35561686754226685, + "logps/chosen": -59.247047424316406, + "logps/rejected": -95.93138885498047, + "loss": 0.7151, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9791781902313232, + "rewards/margins": 4.7457594871521, + "rewards/rejected": -1.7665820121765137, + "step": 3117 + }, + { + "epoch": 0.78, + "grad_norm": 7.563657760620117, + "learning_rate": 4.685838236956303e-06, + "logits/chosen": -0.2146448791027069, + "logits/rejected": -0.33678603172302246, + "logps/chosen": -67.30660247802734, + "logps/rejected": -69.72264099121094, + "loss": 0.9231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4793179035186768, + "rewards/margins": 3.8155317306518555, + "rewards/rejected": -1.3362139463424683, + "step": 3118 + }, + { + "epoch": 0.78, + "grad_norm": 4.955010414123535, + "learning_rate": 4.683223716182887e-06, + "logits/chosen": -0.28441762924194336, + "logits/rejected": -0.37798166275024414, + "logps/chosen": -54.360809326171875, + "logps/rejected": -81.03216552734375, + "loss": 0.777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1281182765960693, + "rewards/margins": 5.0128397941589355, + "rewards/rejected": -1.8847219944000244, + "step": 3119 + }, + { + "epoch": 0.78, + "grad_norm": 3.676501512527466, + "learning_rate": 4.6806092823713944e-06, + "logits/chosen": -0.23920023441314697, + "logits/rejected": -0.31499239802360535, + "logps/chosen": -56.82616424560547, + "logps/rejected": -90.83965301513672, + "loss": 0.7297, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7927794456481934, + "rewards/margins": 4.760974884033203, + "rewards/rejected": -1.9681956768035889, + "step": 3120 + }, + { + "epoch": 0.78, + "grad_norm": 4.313656330108643, + "learning_rate": 4.677994936239547e-06, + "logits/chosen": -0.3071957528591156, + "logits/rejected": -0.42952868342399597, + "logps/chosen": -60.12025451660156, + "logps/rejected": -72.93370056152344, + "loss": 0.7817, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9711976051330566, + "rewards/margins": 4.046980381011963, + "rewards/rejected": -1.0757824182510376, + "step": 3121 + }, + { + "epoch": 0.78, + "grad_norm": 6.934418201446533, + "learning_rate": 4.675380678505037e-06, + "logits/chosen": -0.3563922047615051, + "logits/rejected": -0.48440733551979065, + "logps/chosen": -58.057411193847656, + "logps/rejected": -65.58959197998047, + "loss": 0.9121, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.637016534805298, + "rewards/margins": 3.862025737762451, + "rewards/rejected": -1.225009560585022, + "step": 3122 + }, + { + "epoch": 0.78, + "grad_norm": 5.163082122802734, + "learning_rate": 4.6727665098855364e-06, + "logits/chosen": -0.2915027141571045, + "logits/rejected": -0.4020020067691803, + "logps/chosen": -52.98881530761719, + "logps/rejected": -77.33212280273438, + "loss": 0.7757, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8454465866088867, + "rewards/margins": 4.048549652099609, + "rewards/rejected": -1.2031034231185913, + "step": 3123 + }, + { + "epoch": 0.78, + "grad_norm": 6.664052486419678, + "learning_rate": 4.670152431098692e-06, + "logits/chosen": -0.2581811547279358, + "logits/rejected": -0.36193957924842834, + "logps/chosen": -52.617088317871094, + "logps/rejected": -76.22510528564453, + "loss": 0.805, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.002319812774658, + "rewards/margins": 3.940492868423462, + "rewards/rejected": -0.9381731152534485, + "step": 3124 + }, + { + "epoch": 0.78, + "grad_norm": 6.722942352294922, + "learning_rate": 4.667538442862119e-06, + "logits/chosen": -0.23889943957328796, + "logits/rejected": -0.3184435963630676, + "logps/chosen": -68.77538299560547, + "logps/rejected": -77.2042465209961, + "loss": 0.8009, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8805315494537354, + "rewards/margins": 3.9970428943634033, + "rewards/rejected": -1.116511344909668, + "step": 3125 + }, + { + "epoch": 0.78, + "grad_norm": 4.298229694366455, + "learning_rate": 4.66492454589342e-06, + "logits/chosen": -0.31655076146125793, + "logits/rejected": -0.36328017711639404, + "logps/chosen": -56.05772399902344, + "logps/rejected": -82.26740264892578, + "loss": 0.7435, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.592496871948242, + "rewards/margins": 4.790867328643799, + "rewards/rejected": -2.1983699798583984, + "step": 3126 + }, + { + "epoch": 0.78, + "grad_norm": 9.945462226867676, + "learning_rate": 4.662310740910161e-06, + "logits/chosen": -0.30974912643432617, + "logits/rejected": -0.4135570824146271, + "logps/chosen": -62.141563415527344, + "logps/rejected": -75.47612762451172, + "loss": 0.7136, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1671416759490967, + "rewards/margins": 4.238826751708984, + "rewards/rejected": -1.0716850757598877, + "step": 3127 + }, + { + "epoch": 0.78, + "grad_norm": 2.1084728240966797, + "learning_rate": 4.659697028629894e-06, + "logits/chosen": -0.3497476875782013, + "logits/rejected": -0.47273361682891846, + "logps/chosen": -58.532752990722656, + "logps/rejected": -94.78217315673828, + "loss": 0.6179, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7620022296905518, + "rewards/margins": 6.034872531890869, + "rewards/rejected": -3.2728703022003174, + "step": 3128 + }, + { + "epoch": 0.78, + "grad_norm": 7.220322608947754, + "learning_rate": 4.6570834097701335e-06, + "logits/chosen": -0.3167579174041748, + "logits/rejected": -0.3636332154273987, + "logps/chosen": -61.39026641845703, + "logps/rejected": -89.46692657470703, + "loss": 0.8819, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.886197328567505, + "rewards/margins": 3.990143060684204, + "rewards/rejected": -1.1039458513259888, + "step": 3129 + }, + { + "epoch": 0.78, + "grad_norm": 3.015408754348755, + "learning_rate": 4.654469885048376e-06, + "logits/chosen": -0.2827966511249542, + "logits/rejected": -0.41141477227211, + "logps/chosen": -54.34307861328125, + "logps/rejected": -87.69451904296875, + "loss": 0.6427, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75944185256958, + "rewards/margins": 5.546976566314697, + "rewards/rejected": -2.7875351905822754, + "step": 3130 + }, + { + "epoch": 0.78, + "grad_norm": 4.647951602935791, + "learning_rate": 4.651856455182092e-06, + "logits/chosen": -0.26934361457824707, + "logits/rejected": -0.29612043499946594, + "logps/chosen": -47.998443603515625, + "logps/rejected": -81.55045318603516, + "loss": 0.8305, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.014504909515381, + "rewards/margins": 3.6268744468688965, + "rewards/rejected": -0.6123691201210022, + "step": 3131 + }, + { + "epoch": 0.78, + "grad_norm": 4.867238998413086, + "learning_rate": 4.649243120888723e-06, + "logits/chosen": -0.35296082496643066, + "logits/rejected": -0.43440645933151245, + "logps/chosen": -56.802066802978516, + "logps/rejected": -85.5505599975586, + "loss": 0.8258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7732832431793213, + "rewards/margins": 4.459170341491699, + "rewards/rejected": -1.6858872175216675, + "step": 3132 + }, + { + "epoch": 0.78, + "grad_norm": 4.828614234924316, + "learning_rate": 4.646629882885687e-06, + "logits/chosen": -0.33779463171958923, + "logits/rejected": -0.40431055426597595, + "logps/chosen": -52.70521545410156, + "logps/rejected": -86.61119842529297, + "loss": 0.7804, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6831040382385254, + "rewards/margins": 4.995476245880127, + "rewards/rejected": -2.3123719692230225, + "step": 3133 + }, + { + "epoch": 0.78, + "grad_norm": 4.888737678527832, + "learning_rate": 4.6440167418903735e-06, + "logits/chosen": -0.1689225733280182, + "logits/rejected": -0.3202771544456482, + "logps/chosen": -60.78763961791992, + "logps/rejected": -85.2085189819336, + "loss": 0.6898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0683441162109375, + "rewards/margins": 5.208978652954102, + "rewards/rejected": -2.140634298324585, + "step": 3134 + }, + { + "epoch": 0.78, + "grad_norm": 4.76914119720459, + "learning_rate": 4.641403698620143e-06, + "logits/chosen": -0.3040210008621216, + "logits/rejected": -0.4147643446922302, + "logps/chosen": -55.49628448486328, + "logps/rejected": -78.40560913085938, + "loss": 0.7771, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8997035026550293, + "rewards/margins": 4.621053218841553, + "rewards/rejected": -1.7213499546051025, + "step": 3135 + }, + { + "epoch": 0.78, + "grad_norm": 4.583505630493164, + "learning_rate": 4.6387907537923385e-06, + "logits/chosen": -0.30832359194755554, + "logits/rejected": -0.4154598116874695, + "logps/chosen": -57.49189758300781, + "logps/rejected": -86.4151382446289, + "loss": 0.8239, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.047499656677246, + "rewards/margins": 5.122016906738281, + "rewards/rejected": -2.074517011642456, + "step": 3136 + }, + { + "epoch": 0.78, + "grad_norm": 6.006283760070801, + "learning_rate": 4.636177908124263e-06, + "logits/chosen": -0.28001776337623596, + "logits/rejected": -0.37300577759742737, + "logps/chosen": -41.7215576171875, + "logps/rejected": -80.78173065185547, + "loss": 0.7078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.860593318939209, + "rewards/margins": 4.892687797546387, + "rewards/rejected": -2.0320940017700195, + "step": 3137 + }, + { + "epoch": 0.78, + "grad_norm": 4.751398086547852, + "learning_rate": 4.6335651623332054e-06, + "logits/chosen": -0.2736169397830963, + "logits/rejected": -0.4059566855430603, + "logps/chosen": -60.89522171020508, + "logps/rejected": -71.79371643066406, + "loss": 0.7414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8611578941345215, + "rewards/margins": 4.332876682281494, + "rewards/rejected": -1.4717192649841309, + "step": 3138 + }, + { + "epoch": 0.79, + "grad_norm": 15.151618003845215, + "learning_rate": 4.630952517136418e-06, + "logits/chosen": -0.2694738209247589, + "logits/rejected": -0.3116624355316162, + "logps/chosen": -52.42958068847656, + "logps/rejected": -95.03189086914062, + "loss": 0.7658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6917049884796143, + "rewards/margins": 4.225916385650635, + "rewards/rejected": -1.53421151638031, + "step": 3139 + }, + { + "epoch": 0.79, + "grad_norm": 5.908226490020752, + "learning_rate": 4.628339973251126e-06, + "logits/chosen": -0.2422686517238617, + "logits/rejected": -0.33199450373649597, + "logps/chosen": -56.83325958251953, + "logps/rejected": -90.08624267578125, + "loss": 0.7612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8640379905700684, + "rewards/margins": 4.580672264099121, + "rewards/rejected": -1.716634750366211, + "step": 3140 + }, + { + "epoch": 0.79, + "grad_norm": 5.021437644958496, + "learning_rate": 4.6257275313945315e-06, + "logits/chosen": -0.2949216067790985, + "logits/rejected": -0.4099626839160919, + "logps/chosen": -69.19248962402344, + "logps/rejected": -74.11961364746094, + "loss": 0.8433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0157976150512695, + "rewards/margins": 3.4234230518341064, + "rewards/rejected": -0.4076251685619354, + "step": 3141 + }, + { + "epoch": 0.79, + "grad_norm": 3.9306890964508057, + "learning_rate": 4.623115192283806e-06, + "logits/chosen": -0.30773311853408813, + "logits/rejected": -0.4268559515476227, + "logps/chosen": -60.22665023803711, + "logps/rejected": -93.2336196899414, + "loss": 0.7282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.974107503890991, + "rewards/margins": 5.224464416503906, + "rewards/rejected": -2.250357151031494, + "step": 3142 + }, + { + "epoch": 0.79, + "grad_norm": 6.228579521179199, + "learning_rate": 4.620502956636093e-06, + "logits/chosen": -0.3068433105945587, + "logits/rejected": -0.41273587942123413, + "logps/chosen": -54.051544189453125, + "logps/rejected": -79.71770477294922, + "loss": 0.7492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.868577003479004, + "rewards/margins": 4.323030948638916, + "rewards/rejected": -1.454453706741333, + "step": 3143 + }, + { + "epoch": 0.79, + "grad_norm": 2.7013185024261475, + "learning_rate": 4.617890825168507e-06, + "logits/chosen": -0.2773860692977905, + "logits/rejected": -0.41453930735588074, + "logps/chosen": -53.39454650878906, + "logps/rejected": -82.86997985839844, + "loss": 0.6471, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8662893772125244, + "rewards/margins": 5.110174655914307, + "rewards/rejected": -2.2438852787017822, + "step": 3144 + }, + { + "epoch": 0.79, + "grad_norm": 3.9396047592163086, + "learning_rate": 4.615278798598135e-06, + "logits/chosen": -0.28024160861968994, + "logits/rejected": -0.39860132336616516, + "logps/chosen": -58.856056213378906, + "logps/rejected": -85.02763366699219, + "loss": 0.7199, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.833786964416504, + "rewards/margins": 4.771003246307373, + "rewards/rejected": -1.9372162818908691, + "step": 3145 + }, + { + "epoch": 0.79, + "grad_norm": 3.3215675354003906, + "learning_rate": 4.612666877642036e-06, + "logits/chosen": -0.33315426111221313, + "logits/rejected": -0.4130297303199768, + "logps/chosen": -58.315757751464844, + "logps/rejected": -80.85409545898438, + "loss": 0.6748, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049569606781006, + "rewards/margins": 4.282837390899658, + "rewards/rejected": -1.2332682609558105, + "step": 3146 + }, + { + "epoch": 0.79, + "grad_norm": 4.429026126861572, + "learning_rate": 4.610055063017233e-06, + "logits/chosen": -0.37217843532562256, + "logits/rejected": -0.43026119470596313, + "logps/chosen": -53.06080627441406, + "logps/rejected": -79.92642974853516, + "loss": 0.7489, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.174062490463257, + "rewards/margins": 3.9837284088134766, + "rewards/rejected": -0.8096659779548645, + "step": 3147 + }, + { + "epoch": 0.79, + "grad_norm": 5.544900417327881, + "learning_rate": 4.607443355440734e-06, + "logits/chosen": -0.27765506505966187, + "logits/rejected": -0.4332231283187866, + "logps/chosen": -48.416019439697266, + "logps/rejected": -72.80516052246094, + "loss": 0.8168, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.786576509475708, + "rewards/margins": 4.493323802947998, + "rewards/rejected": -1.70674729347229, + "step": 3148 + }, + { + "epoch": 0.79, + "grad_norm": 4.006649971008301, + "learning_rate": 4.604831755629503e-06, + "logits/chosen": -0.3516707420349121, + "logits/rejected": -0.46434691548347473, + "logps/chosen": -61.23950958251953, + "logps/rejected": -73.97561645507812, + "loss": 0.8117, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0855982303619385, + "rewards/margins": 3.846280336380005, + "rewards/rejected": -0.7606820464134216, + "step": 3149 + }, + { + "epoch": 0.79, + "grad_norm": 4.394798278808594, + "learning_rate": 4.602220264300481e-06, + "logits/chosen": -0.2787788510322571, + "logits/rejected": -0.30477988719940186, + "logps/chosen": -54.21572494506836, + "logps/rejected": -95.90234375, + "loss": 0.8166, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9995949268341064, + "rewards/margins": 3.66593074798584, + "rewards/rejected": -0.6663357615470886, + "step": 3150 + }, + { + "epoch": 0.79, + "grad_norm": 3.636813163757324, + "learning_rate": 4.599608882170581e-06, + "logits/chosen": -0.27282822132110596, + "logits/rejected": -0.4490308463573456, + "logps/chosen": -68.18598937988281, + "logps/rejected": -79.69358825683594, + "loss": 0.743, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7895941734313965, + "rewards/margins": 4.9978108406066895, + "rewards/rejected": -2.208217144012451, + "step": 3151 + }, + { + "epoch": 0.79, + "grad_norm": 4.141483783721924, + "learning_rate": 4.596997609956682e-06, + "logits/chosen": -0.35508549213409424, + "logits/rejected": -0.4218153953552246, + "logps/chosen": -54.575679779052734, + "logps/rejected": -84.48968505859375, + "loss": 0.8202, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6950786113739014, + "rewards/margins": 4.147313594818115, + "rewards/rejected": -1.452235221862793, + "step": 3152 + }, + { + "epoch": 0.79, + "grad_norm": 7.504097938537598, + "learning_rate": 4.594386448375635e-06, + "logits/chosen": -0.3283625841140747, + "logits/rejected": -0.42725133895874023, + "logps/chosen": -58.800296783447266, + "logps/rejected": -81.38726043701172, + "loss": 0.7967, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7399368286132812, + "rewards/margins": 4.321460247039795, + "rewards/rejected": -1.581523060798645, + "step": 3153 + }, + { + "epoch": 0.79, + "grad_norm": 4.405930995941162, + "learning_rate": 4.591775398144261e-06, + "logits/chosen": -0.27360644936561584, + "logits/rejected": -0.3634253442287445, + "logps/chosen": -68.13088989257812, + "logps/rejected": -86.29121398925781, + "loss": 0.7838, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5488085746765137, + "rewards/margins": 3.968411922454834, + "rewards/rejected": -1.4196035861968994, + "step": 3154 + }, + { + "epoch": 0.79, + "grad_norm": 6.793346881866455, + "learning_rate": 4.589164459979348e-06, + "logits/chosen": -0.3340021073818207, + "logits/rejected": -0.43384531140327454, + "logps/chosen": -51.28031539916992, + "logps/rejected": -75.22908020019531, + "loss": 0.7486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.683103322982788, + "rewards/margins": 4.2045207023620605, + "rewards/rejected": -1.5214178562164307, + "step": 3155 + }, + { + "epoch": 0.79, + "grad_norm": 5.102126598358154, + "learning_rate": 4.5865536345976576e-06, + "logits/chosen": -0.38050520420074463, + "logits/rejected": -0.45067793130874634, + "logps/chosen": -56.84663772583008, + "logps/rejected": -78.91697692871094, + "loss": 0.7079, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.175812005996704, + "rewards/margins": 4.59144401550293, + "rewards/rejected": -1.4156320095062256, + "step": 3156 + }, + { + "epoch": 0.79, + "grad_norm": 4.337998390197754, + "learning_rate": 4.583942922715914e-06, + "logits/chosen": -0.3064877390861511, + "logits/rejected": -0.4113721251487732, + "logps/chosen": -58.66271209716797, + "logps/rejected": -87.18484497070312, + "loss": 0.7178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7670414447784424, + "rewards/margins": 4.699244976043701, + "rewards/rejected": -1.9322036504745483, + "step": 3157 + }, + { + "epoch": 0.79, + "grad_norm": 7.496462821960449, + "learning_rate": 4.5813323250508185e-06, + "logits/chosen": -0.35332292318344116, + "logits/rejected": -0.4861626923084259, + "logps/chosen": -44.48860168457031, + "logps/rejected": -68.6004638671875, + "loss": 0.787, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7899298667907715, + "rewards/margins": 4.258841514587402, + "rewards/rejected": -1.4689116477966309, + "step": 3158 + }, + { + "epoch": 0.79, + "grad_norm": 6.083308219909668, + "learning_rate": 4.5787218423190326e-06, + "logits/chosen": -0.30134841799736023, + "logits/rejected": -0.3608262836933136, + "logps/chosen": -47.569461822509766, + "logps/rejected": -84.54833984375, + "loss": 0.7454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.750124931335449, + "rewards/margins": 4.384366512298584, + "rewards/rejected": -1.6342413425445557, + "step": 3159 + }, + { + "epoch": 0.79, + "grad_norm": 5.741901874542236, + "learning_rate": 4.576111475237191e-06, + "logits/chosen": -0.2704910635948181, + "logits/rejected": -0.3827875256538391, + "logps/chosen": -60.14230728149414, + "logps/rejected": -85.66400146484375, + "loss": 0.714, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.693979024887085, + "rewards/margins": 4.685215950012207, + "rewards/rejected": -1.991236925125122, + "step": 3160 + }, + { + "epoch": 0.79, + "grad_norm": 4.6930108070373535, + "learning_rate": 4.5735012245218965e-06, + "logits/chosen": -0.2653471529483795, + "logits/rejected": -0.4221211075782776, + "logps/chosen": -63.968624114990234, + "logps/rejected": -76.3163833618164, + "loss": 0.6692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8748929500579834, + "rewards/margins": 4.981176376342773, + "rewards/rejected": -2.106282949447632, + "step": 3161 + }, + { + "epoch": 0.79, + "grad_norm": 5.417236328125, + "learning_rate": 4.570891090889718e-06, + "logits/chosen": -0.34941089153289795, + "logits/rejected": -0.4232935607433319, + "logps/chosen": -52.07868957519531, + "logps/rejected": -82.2348403930664, + "loss": 0.6938, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1035878658294678, + "rewards/margins": 4.9402079582214355, + "rewards/rejected": -1.8366199731826782, + "step": 3162 + }, + { + "epoch": 0.79, + "grad_norm": 5.232279300689697, + "learning_rate": 4.568281075057196e-06, + "logits/chosen": -0.3118170499801636, + "logits/rejected": -0.3930598497390747, + "logps/chosen": -60.62199401855469, + "logps/rejected": -86.38935852050781, + "loss": 0.7434, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.807415008544922, + "rewards/margins": 4.084277153015137, + "rewards/rejected": -1.2768627405166626, + "step": 3163 + }, + { + "epoch": 0.79, + "grad_norm": 6.190615653991699, + "learning_rate": 4.565671177740834e-06, + "logits/chosen": -0.25512757897377014, + "logits/rejected": -0.47174838185310364, + "logps/chosen": -57.314388275146484, + "logps/rejected": -74.10173034667969, + "loss": 0.6752, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.615424394607544, + "rewards/margins": 5.224410533905029, + "rewards/rejected": -2.6089859008789062, + "step": 3164 + }, + { + "epoch": 0.79, + "grad_norm": 7.826780796051025, + "learning_rate": 4.563061399657105e-06, + "logits/chosen": -0.35633838176727295, + "logits/rejected": -0.4779496192932129, + "logps/chosen": -61.811485290527344, + "logps/rejected": -88.60406494140625, + "loss": 0.8, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5561113357543945, + "rewards/margins": 5.140608787536621, + "rewards/rejected": -2.5844974517822266, + "step": 3165 + }, + { + "epoch": 0.79, + "grad_norm": 8.786886215209961, + "learning_rate": 4.5604517415224516e-06, + "logits/chosen": -0.41057562828063965, + "logits/rejected": -0.4476601779460907, + "logps/chosen": -51.83378219604492, + "logps/rejected": -98.0117416381836, + "loss": 0.9147, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5740432739257812, + "rewards/margins": 4.344837188720703, + "rewards/rejected": -1.7707942724227905, + "step": 3166 + }, + { + "epoch": 0.79, + "grad_norm": 2.1415340900421143, + "learning_rate": 4.557842204053279e-06, + "logits/chosen": -0.3715389668941498, + "logits/rejected": -0.46233367919921875, + "logps/chosen": -55.67587661743164, + "logps/rejected": -88.5085220336914, + "loss": 0.5858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0952255725860596, + "rewards/margins": 5.958600997924805, + "rewards/rejected": -2.863375425338745, + "step": 3167 + }, + { + "epoch": 0.79, + "grad_norm": 8.074989318847656, + "learning_rate": 4.555232787965963e-06, + "logits/chosen": -0.31638264656066895, + "logits/rejected": -0.43773430585861206, + "logps/chosen": -59.18285369873047, + "logps/rejected": -84.75886535644531, + "loss": 0.8608, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6783273220062256, + "rewards/margins": 4.229948043823242, + "rewards/rejected": -1.5516209602355957, + "step": 3168 + }, + { + "epoch": 0.79, + "grad_norm": 5.947497844696045, + "learning_rate": 4.552623493976845e-06, + "logits/chosen": -0.357940673828125, + "logits/rejected": -0.4241527318954468, + "logps/chosen": -61.56277847290039, + "logps/rejected": -70.94486999511719, + "loss": 0.831, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8075315952301025, + "rewards/margins": 4.166093826293945, + "rewards/rejected": -1.3585628271102905, + "step": 3169 + }, + { + "epoch": 0.79, + "grad_norm": 2.764798402786255, + "learning_rate": 4.55001432280223e-06, + "logits/chosen": -0.32893693447113037, + "logits/rejected": -0.41331911087036133, + "logps/chosen": -53.00796890258789, + "logps/rejected": -79.79716491699219, + "loss": 0.6204, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.838387966156006, + "rewards/margins": 4.367629528045654, + "rewards/rejected": -1.5292413234710693, + "step": 3170 + }, + { + "epoch": 0.79, + "grad_norm": 2.647707462310791, + "learning_rate": 4.547405275158394e-06, + "logits/chosen": -0.31310468912124634, + "logits/rejected": -0.4181271195411682, + "logps/chosen": -50.43903350830078, + "logps/rejected": -73.8033218383789, + "loss": 0.6546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.032057762145996, + "rewards/margins": 4.249370574951172, + "rewards/rejected": -1.217313289642334, + "step": 3171 + }, + { + "epoch": 0.79, + "grad_norm": 4.9444122314453125, + "learning_rate": 4.544796351761574e-06, + "logits/chosen": -0.3051932752132416, + "logits/rejected": -0.42838069796562195, + "logps/chosen": -59.12540817260742, + "logps/rejected": -70.4928207397461, + "loss": 0.8649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.039419651031494, + "rewards/margins": 4.053775310516357, + "rewards/rejected": -1.0143557786941528, + "step": 3172 + }, + { + "epoch": 0.79, + "grad_norm": 7.147032737731934, + "learning_rate": 4.542187553327981e-06, + "logits/chosen": -0.37932029366493225, + "logits/rejected": -0.42391252517700195, + "logps/chosen": -47.28681564331055, + "logps/rejected": -88.51851654052734, + "loss": 0.7439, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0418245792388916, + "rewards/margins": 4.917843818664551, + "rewards/rejected": -1.876018762588501, + "step": 3173 + }, + { + "epoch": 0.79, + "grad_norm": 3.9404296875, + "learning_rate": 4.539578880573782e-06, + "logits/chosen": -0.2983015179634094, + "logits/rejected": -0.3516613245010376, + "logps/chosen": -60.98970413208008, + "logps/rejected": -86.40406799316406, + "loss": 0.7758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.839437484741211, + "rewards/margins": 3.7127342224121094, + "rewards/rejected": -0.8732969760894775, + "step": 3174 + }, + { + "epoch": 0.79, + "grad_norm": 3.3093924522399902, + "learning_rate": 4.536970334215115e-06, + "logits/chosen": -0.339039146900177, + "logits/rejected": -0.43292272090911865, + "logps/chosen": -58.174503326416016, + "logps/rejected": -78.04448699951172, + "loss": 0.6784, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9888994693756104, + "rewards/margins": 4.449357986450195, + "rewards/rejected": -1.460458755493164, + "step": 3175 + }, + { + "epoch": 0.79, + "grad_norm": 4.078344345092773, + "learning_rate": 4.534361914968083e-06, + "logits/chosen": -0.3094772398471832, + "logits/rejected": -0.4501318633556366, + "logps/chosen": -60.70256805419922, + "logps/rejected": -77.3115463256836, + "loss": 0.8178, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8154497146606445, + "rewards/margins": 4.597800254821777, + "rewards/rejected": -1.782351016998291, + "step": 3176 + }, + { + "epoch": 0.79, + "grad_norm": 2.4591667652130127, + "learning_rate": 4.531753623548753e-06, + "logits/chosen": -0.3781759440898895, + "logits/rejected": -0.49267786741256714, + "logps/chosen": -53.58842468261719, + "logps/rejected": -78.70276641845703, + "loss": 0.5938, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8991458415985107, + "rewards/margins": 4.782358169555664, + "rewards/rejected": -1.8832119703292847, + "step": 3177 + }, + { + "epoch": 0.79, + "grad_norm": 3.8625481128692627, + "learning_rate": 4.529145460673158e-06, + "logits/chosen": -0.24751001596450806, + "logits/rejected": -0.3238067328929901, + "logps/chosen": -59.422523498535156, + "logps/rejected": -95.4852066040039, + "loss": 0.7443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8612453937530518, + "rewards/margins": 4.766201019287109, + "rewards/rejected": -1.9049553871154785, + "step": 3178 + }, + { + "epoch": 0.8, + "grad_norm": 3.643618583679199, + "learning_rate": 4.526537427057295e-06, + "logits/chosen": -0.2594892680644989, + "logits/rejected": -0.40202924609184265, + "logps/chosen": -55.05159378051758, + "logps/rejected": -68.08285522460938, + "loss": 0.7393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108227252960205, + "rewards/margins": 4.767777442932129, + "rewards/rejected": -1.6595499515533447, + "step": 3179 + }, + { + "epoch": 0.8, + "grad_norm": 6.547048568725586, + "learning_rate": 4.523929523417122e-06, + "logits/chosen": -0.3204531967639923, + "logits/rejected": -0.4061836302280426, + "logps/chosen": -59.81816864013672, + "logps/rejected": -70.8143310546875, + "loss": 1.0665, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.746581554412842, + "rewards/margins": 3.2509961128234863, + "rewards/rejected": -0.5044143199920654, + "step": 3180 + }, + { + "epoch": 0.8, + "grad_norm": 13.544836044311523, + "learning_rate": 4.52132175046857e-06, + "logits/chosen": -0.22921061515808105, + "logits/rejected": -0.3067929148674011, + "logps/chosen": -71.24620819091797, + "logps/rejected": -93.7730941772461, + "loss": 1.1227, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6183786392211914, + "rewards/margins": 3.8960299491882324, + "rewards/rejected": -1.2776514291763306, + "step": 3181 + }, + { + "epoch": 0.8, + "grad_norm": 6.890996932983398, + "learning_rate": 4.518714108927524e-06, + "logits/chosen": -0.34673064947128296, + "logits/rejected": -0.4201420247554779, + "logps/chosen": -59.363765716552734, + "logps/rejected": -84.24424743652344, + "loss": 0.8833, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0696518421173096, + "rewards/margins": 4.246393203735352, + "rewards/rejected": -1.176741361618042, + "step": 3182 + }, + { + "epoch": 0.8, + "grad_norm": 3.284674882888794, + "learning_rate": 4.516106599509844e-06, + "logits/chosen": -0.24739518761634827, + "logits/rejected": -0.2908446788787842, + "logps/chosen": -66.14994812011719, + "logps/rejected": -105.6968994140625, + "loss": 0.7338, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8624563217163086, + "rewards/margins": 5.064159393310547, + "rewards/rejected": -2.20170259475708, + "step": 3183 + }, + { + "epoch": 0.8, + "grad_norm": 7.652886867523193, + "learning_rate": 4.513499222931342e-06, + "logits/chosen": -0.3000064790248871, + "logits/rejected": -0.4069823622703552, + "logps/chosen": -59.55492401123047, + "logps/rejected": -82.45426940917969, + "loss": 0.7829, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.838197708129883, + "rewards/margins": 4.66117000579834, + "rewards/rejected": -1.8229724168777466, + "step": 3184 + }, + { + "epoch": 0.8, + "grad_norm": 2.6450462341308594, + "learning_rate": 4.510891979907801e-06, + "logits/chosen": -0.2913987636566162, + "logits/rejected": -0.3648783564567566, + "logps/chosen": -55.63920593261719, + "logps/rejected": -87.17803192138672, + "loss": 0.6505, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.929677963256836, + "rewards/margins": 4.69586181640625, + "rewards/rejected": -1.7661842107772827, + "step": 3185 + }, + { + "epoch": 0.8, + "grad_norm": 5.309201240539551, + "learning_rate": 4.508284871154967e-06, + "logits/chosen": -0.3449075520038605, + "logits/rejected": -0.38827866315841675, + "logps/chosen": -51.8079833984375, + "logps/rejected": -78.44058227539062, + "loss": 0.8156, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8945560455322266, + "rewards/margins": 4.670445442199707, + "rewards/rejected": -1.7758898735046387, + "step": 3186 + }, + { + "epoch": 0.8, + "grad_norm": 5.65134859085083, + "learning_rate": 4.505677897388544e-06, + "logits/chosen": -0.24558599293231964, + "logits/rejected": -0.33198660612106323, + "logps/chosen": -64.70159912109375, + "logps/rejected": -93.54813385009766, + "loss": 0.8555, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.678945302963257, + "rewards/margins": 4.9078688621521, + "rewards/rejected": -2.22892427444458, + "step": 3187 + }, + { + "epoch": 0.8, + "grad_norm": 6.169495582580566, + "learning_rate": 4.503071059324206e-06, + "logits/chosen": -0.28385356068611145, + "logits/rejected": -0.3473871350288391, + "logps/chosen": -59.76909255981445, + "logps/rejected": -72.40165710449219, + "loss": 0.8441, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8575873374938965, + "rewards/margins": 3.2062458992004395, + "rewards/rejected": -0.3486586809158325, + "step": 3188 + }, + { + "epoch": 0.8, + "grad_norm": 5.203793048858643, + "learning_rate": 4.500464357677587e-06, + "logits/chosen": -0.30459851026535034, + "logits/rejected": -0.4170384407043457, + "logps/chosen": -58.666847229003906, + "logps/rejected": -71.76508331298828, + "loss": 0.7981, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.910753011703491, + "rewards/margins": 4.141082763671875, + "rewards/rejected": -1.2303295135498047, + "step": 3189 + }, + { + "epoch": 0.8, + "grad_norm": 9.324180603027344, + "learning_rate": 4.497857793164277e-06, + "logits/chosen": -0.3355814516544342, + "logits/rejected": -0.44159135222435, + "logps/chosen": -69.93990325927734, + "logps/rejected": -84.67707061767578, + "loss": 0.9922, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3314287662506104, + "rewards/margins": 3.636535167694092, + "rewards/rejected": -1.305106282234192, + "step": 3190 + }, + { + "epoch": 0.8, + "grad_norm": 5.153910160064697, + "learning_rate": 4.495251366499842e-06, + "logits/chosen": -0.2987782061100006, + "logits/rejected": -0.37902888655662537, + "logps/chosen": -63.610260009765625, + "logps/rejected": -72.02999114990234, + "loss": 0.8476, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0045411586761475, + "rewards/margins": 4.0038838386535645, + "rewards/rejected": -0.9993423223495483, + "step": 3191 + }, + { + "epoch": 0.8, + "grad_norm": 4.456375598907471, + "learning_rate": 4.492645078399795e-06, + "logits/chosen": -0.28150108456611633, + "logits/rejected": -0.4186263084411621, + "logps/chosen": -43.630592346191406, + "logps/rejected": -68.20609283447266, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9122154712677, + "rewards/margins": 5.004469871520996, + "rewards/rejected": -2.092254400253296, + "step": 3192 + }, + { + "epoch": 0.8, + "grad_norm": 3.665797710418701, + "learning_rate": 4.490038929579625e-06, + "logits/chosen": -0.2526264488697052, + "logits/rejected": -0.4050018787384033, + "logps/chosen": -61.665245056152344, + "logps/rejected": -88.09153747558594, + "loss": 0.714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.040572166442871, + "rewards/margins": 5.1393513679504395, + "rewards/rejected": -2.0987789630889893, + "step": 3193 + }, + { + "epoch": 0.8, + "grad_norm": 2.7350282669067383, + "learning_rate": 4.487432920754772e-06, + "logits/chosen": -0.3067425489425659, + "logits/rejected": -0.44161686301231384, + "logps/chosen": -85.59967041015625, + "logps/rejected": -81.89311218261719, + "loss": 0.8003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9518816471099854, + "rewards/margins": 4.899327754974365, + "rewards/rejected": -1.9474458694458008, + "step": 3194 + }, + { + "epoch": 0.8, + "grad_norm": 5.829957485198975, + "learning_rate": 4.484827052640642e-06, + "logits/chosen": -0.2700934410095215, + "logits/rejected": -0.39857596158981323, + "logps/chosen": -68.6694107055664, + "logps/rejected": -71.3271484375, + "loss": 0.9992, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4381067752838135, + "rewards/margins": 2.620150327682495, + "rewards/rejected": -0.1820433884859085, + "step": 3195 + }, + { + "epoch": 0.8, + "grad_norm": 6.554303169250488, + "learning_rate": 4.482221325952602e-06, + "logits/chosen": -0.27993273735046387, + "logits/rejected": -0.3495629131793976, + "logps/chosen": -56.79043960571289, + "logps/rejected": -75.67584228515625, + "loss": 0.8122, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.750596284866333, + "rewards/margins": 3.9020533561706543, + "rewards/rejected": -1.1514569520950317, + "step": 3196 + }, + { + "epoch": 0.8, + "grad_norm": 7.019300937652588, + "learning_rate": 4.479615741405981e-06, + "logits/chosen": -0.2462122142314911, + "logits/rejected": -0.26299741864204407, + "logps/chosen": -52.004852294921875, + "logps/rejected": -83.1473159790039, + "loss": 0.7136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0017294883728027, + "rewards/margins": 3.7974445819854736, + "rewards/rejected": -0.7957152724266052, + "step": 3197 + }, + { + "epoch": 0.8, + "grad_norm": 8.140166282653809, + "learning_rate": 4.477010299716069e-06, + "logits/chosen": -0.3130335211753845, + "logits/rejected": -0.40168339014053345, + "logps/chosen": -55.49043273925781, + "logps/rejected": -84.35247039794922, + "loss": 0.7809, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.080893039703369, + "rewards/margins": 4.4854817390441895, + "rewards/rejected": -1.4045886993408203, + "step": 3198 + }, + { + "epoch": 0.8, + "grad_norm": 6.696409702301025, + "learning_rate": 4.474405001598114e-06, + "logits/chosen": -0.36932528018951416, + "logits/rejected": -0.44608741998672485, + "logps/chosen": -60.73983383178711, + "logps/rejected": -85.33998107910156, + "loss": 0.8881, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.789839029312134, + "rewards/margins": 3.971635580062866, + "rewards/rejected": -1.1817965507507324, + "step": 3199 + }, + { + "epoch": 0.8, + "grad_norm": 4.267550945281982, + "learning_rate": 4.471799847767328e-06, + "logits/chosen": -0.2912604808807373, + "logits/rejected": -0.3742569386959076, + "logps/chosen": -56.781097412109375, + "logps/rejected": -90.64994812011719, + "loss": 0.7611, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.869204521179199, + "rewards/margins": 5.6147003173828125, + "rewards/rejected": -2.7454960346221924, + "step": 3200 + }, + { + "epoch": 0.8, + "grad_norm": 14.427129745483398, + "learning_rate": 4.469194838938883e-06, + "logits/chosen": -0.3149391710758209, + "logits/rejected": -0.42308223247528076, + "logps/chosen": -50.845550537109375, + "logps/rejected": -71.38481140136719, + "loss": 0.7625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9690747261047363, + "rewards/margins": 4.1054487228393555, + "rewards/rejected": -1.1363739967346191, + "step": 3201 + }, + { + "epoch": 0.8, + "grad_norm": 6.738971710205078, + "learning_rate": 4.466589975827905e-06, + "logits/chosen": -0.2452181875705719, + "logits/rejected": -0.32872509956359863, + "logps/chosen": -59.570838928222656, + "logps/rejected": -77.27196502685547, + "loss": 0.9473, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.089914083480835, + "rewards/margins": 4.011771202087402, + "rewards/rejected": -0.9218573570251465, + "step": 3202 + }, + { + "epoch": 0.8, + "grad_norm": 4.682988166809082, + "learning_rate": 4.463985259149492e-06, + "logits/chosen": -0.32633912563323975, + "logits/rejected": -0.4197673499584198, + "logps/chosen": -54.35230255126953, + "logps/rejected": -88.18732452392578, + "loss": 0.7966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.649838924407959, + "rewards/margins": 4.090736389160156, + "rewards/rejected": -1.4408974647521973, + "step": 3203 + }, + { + "epoch": 0.8, + "grad_norm": 3.931321382522583, + "learning_rate": 4.4613806896186906e-06, + "logits/chosen": -0.3131193220615387, + "logits/rejected": -0.4658612012863159, + "logps/chosen": -55.890663146972656, + "logps/rejected": -73.35920715332031, + "loss": 0.7023, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9725239276885986, + "rewards/margins": 5.632133960723877, + "rewards/rejected": -2.6596100330352783, + "step": 3204 + }, + { + "epoch": 0.8, + "grad_norm": 4.933537483215332, + "learning_rate": 4.4587762679505115e-06, + "logits/chosen": -0.31460410356521606, + "logits/rejected": -0.4059039056301117, + "logps/chosen": -70.19265747070312, + "logps/rejected": -79.21766662597656, + "loss": 0.8363, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6516714096069336, + "rewards/margins": 4.160384654998779, + "rewards/rejected": -1.5087130069732666, + "step": 3205 + }, + { + "epoch": 0.8, + "grad_norm": 8.260817527770996, + "learning_rate": 4.456171994859926e-06, + "logits/chosen": -0.32671377062797546, + "logits/rejected": -0.3659014403820038, + "logps/chosen": -50.838401794433594, + "logps/rejected": -78.42019653320312, + "loss": 0.9032, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.710588216781616, + "rewards/margins": 3.9035754203796387, + "rewards/rejected": -1.192987322807312, + "step": 3206 + }, + { + "epoch": 0.8, + "grad_norm": 12.044652938842773, + "learning_rate": 4.453567871061862e-06, + "logits/chosen": -0.2850152552127838, + "logits/rejected": -0.4230421781539917, + "logps/chosen": -67.3717269897461, + "logps/rejected": -80.8861083984375, + "loss": 0.8302, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.992443561553955, + "rewards/margins": 4.0553154945373535, + "rewards/rejected": -1.0628719329833984, + "step": 3207 + }, + { + "epoch": 0.8, + "grad_norm": 6.107736587524414, + "learning_rate": 4.450963897271211e-06, + "logits/chosen": -0.2794431149959564, + "logits/rejected": -0.3596750795841217, + "logps/chosen": -66.05785369873047, + "logps/rejected": -76.76300048828125, + "loss": 0.747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7827868461608887, + "rewards/margins": 3.531259536743164, + "rewards/rejected": -0.7484728693962097, + "step": 3208 + }, + { + "epoch": 0.8, + "grad_norm": 3.9356064796447754, + "learning_rate": 4.4483600742028155e-06, + "logits/chosen": -0.32826006412506104, + "logits/rejected": -0.3505997955799103, + "logps/chosen": -61.07130432128906, + "logps/rejected": -97.90679931640625, + "loss": 0.8179, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.84739089012146, + "rewards/margins": 4.595808506011963, + "rewards/rejected": -1.748417615890503, + "step": 3209 + }, + { + "epoch": 0.8, + "grad_norm": 4.501946449279785, + "learning_rate": 4.445756402571483e-06, + "logits/chosen": -0.29777792096138, + "logits/rejected": -0.34428098797798157, + "logps/chosen": -59.440128326416016, + "logps/rejected": -92.39866638183594, + "loss": 0.8617, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.774791717529297, + "rewards/margins": 4.172682762145996, + "rewards/rejected": -1.3978911638259888, + "step": 3210 + }, + { + "epoch": 0.8, + "grad_norm": 10.214826583862305, + "learning_rate": 4.443152883091979e-06, + "logits/chosen": -0.2924306392669678, + "logits/rejected": -0.3619616627693176, + "logps/chosen": -65.8036880493164, + "logps/rejected": -88.06932830810547, + "loss": 0.7638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.897524833679199, + "rewards/margins": 4.10182523727417, + "rewards/rejected": -1.2043001651763916, + "step": 3211 + }, + { + "epoch": 0.8, + "grad_norm": 11.23413372039795, + "learning_rate": 4.440549516479022e-06, + "logits/chosen": -0.2269173264503479, + "logits/rejected": -0.3888404369354248, + "logps/chosen": -62.38603973388672, + "logps/rejected": -70.95999908447266, + "loss": 0.8911, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6098885536193848, + "rewards/margins": 3.803224802017212, + "rewards/rejected": -1.1933361291885376, + "step": 3212 + }, + { + "epoch": 0.8, + "grad_norm": 4.851478576660156, + "learning_rate": 4.437946303447298e-06, + "logits/chosen": -0.33551931381225586, + "logits/rejected": -0.4200597107410431, + "logps/chosen": -52.09714126586914, + "logps/rejected": -70.5899887084961, + "loss": 0.6966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1318142414093018, + "rewards/margins": 3.89674973487854, + "rewards/rejected": -0.7649353742599487, + "step": 3213 + }, + { + "epoch": 0.8, + "grad_norm": 4.746203422546387, + "learning_rate": 4.43534324471144e-06, + "logits/chosen": -0.29145488142967224, + "logits/rejected": -0.40923649072647095, + "logps/chosen": -50.88202667236328, + "logps/rejected": -78.90866088867188, + "loss": 0.6913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8517487049102783, + "rewards/margins": 4.056921482086182, + "rewards/rejected": -1.205173373222351, + "step": 3214 + }, + { + "epoch": 0.8, + "grad_norm": 6.118863582611084, + "learning_rate": 4.432740340986046e-06, + "logits/chosen": -0.3640396296977997, + "logits/rejected": -0.4323795437812805, + "logps/chosen": -54.94331359863281, + "logps/rejected": -87.30585479736328, + "loss": 0.7794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8788416385650635, + "rewards/margins": 4.191152095794678, + "rewards/rejected": -1.3123105764389038, + "step": 3215 + }, + { + "epoch": 0.8, + "grad_norm": 7.814754962921143, + "learning_rate": 4.430137592985669e-06, + "logits/chosen": -0.4265969693660736, + "logits/rejected": -0.4473329186439514, + "logps/chosen": -50.7874870300293, + "logps/rejected": -75.87006378173828, + "loss": 0.8814, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.924031972885132, + "rewards/margins": 3.091651439666748, + "rewards/rejected": -0.16761967539787292, + "step": 3216 + }, + { + "epoch": 0.8, + "grad_norm": 3.773841619491577, + "learning_rate": 4.427535001424817e-06, + "logits/chosen": -0.2822129428386688, + "logits/rejected": -0.4382786452770233, + "logps/chosen": -54.51744842529297, + "logps/rejected": -86.46621704101562, + "loss": 0.6129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9861581325531006, + "rewards/margins": 5.6522932052612305, + "rewards/rejected": -2.66613507270813, + "step": 3217 + }, + { + "epoch": 0.8, + "grad_norm": 4.524022102355957, + "learning_rate": 4.424932567017963e-06, + "logits/chosen": -0.32049596309661865, + "logits/rejected": -0.45703017711639404, + "logps/chosen": -54.39418029785156, + "logps/rejected": -71.28042602539062, + "loss": 0.7276, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8903894424438477, + "rewards/margins": 4.647736549377441, + "rewards/rejected": -1.7573472261428833, + "step": 3218 + }, + { + "epoch": 0.81, + "grad_norm": 4.16290283203125, + "learning_rate": 4.422330290479527e-06, + "logits/chosen": -0.2410324364900589, + "logits/rejected": -0.4265527129173279, + "logps/chosen": -56.73065948486328, + "logps/rejected": -81.38105010986328, + "loss": 0.6843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.067276954650879, + "rewards/margins": 5.056260108947754, + "rewards/rejected": -1.988983392715454, + "step": 3219 + }, + { + "epoch": 0.81, + "grad_norm": 4.57896089553833, + "learning_rate": 4.419728172523892e-06, + "logits/chosen": -0.33940836787223816, + "logits/rejected": -0.43535539507865906, + "logps/chosen": -53.35246658325195, + "logps/rejected": -70.9531478881836, + "loss": 0.8303, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9945478439331055, + "rewards/margins": 4.171505928039551, + "rewards/rejected": -1.1769585609436035, + "step": 3220 + }, + { + "epoch": 0.81, + "grad_norm": 10.335412979125977, + "learning_rate": 4.417126213865395e-06, + "logits/chosen": -0.38458967208862305, + "logits/rejected": -0.5651134252548218, + "logps/chosen": -65.74711608886719, + "logps/rejected": -64.98747253417969, + "loss": 0.9057, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.61332368850708, + "rewards/margins": 4.101660251617432, + "rewards/rejected": -1.4883370399475098, + "step": 3221 + }, + { + "epoch": 0.81, + "grad_norm": 6.270942687988281, + "learning_rate": 4.414524415218328e-06, + "logits/chosen": -0.2968303859233856, + "logits/rejected": -0.3663575351238251, + "logps/chosen": -54.554161071777344, + "logps/rejected": -93.47655487060547, + "loss": 0.7698, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8109240531921387, + "rewards/margins": 4.387218952178955, + "rewards/rejected": -1.5762946605682373, + "step": 3222 + }, + { + "epoch": 0.81, + "grad_norm": 6.758150100708008, + "learning_rate": 4.411922777296944e-06, + "logits/chosen": -0.34918248653411865, + "logits/rejected": -0.41716378927230835, + "logps/chosen": -45.748416900634766, + "logps/rejected": -85.24855041503906, + "loss": 0.7904, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.947138547897339, + "rewards/margins": 4.764669418334961, + "rewards/rejected": -1.8175307512283325, + "step": 3223 + }, + { + "epoch": 0.81, + "grad_norm": 4.185750961303711, + "learning_rate": 4.409321300815449e-06, + "logits/chosen": -0.41686955094337463, + "logits/rejected": -0.5261354446411133, + "logps/chosen": -52.79216766357422, + "logps/rejected": -72.28772735595703, + "loss": 0.695, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8188369274139404, + "rewards/margins": 4.665644645690918, + "rewards/rejected": -1.846807837486267, + "step": 3224 + }, + { + "epoch": 0.81, + "grad_norm": 4.624680519104004, + "learning_rate": 4.406719986488e-06, + "logits/chosen": -0.32895684242248535, + "logits/rejected": -0.423713743686676, + "logps/chosen": -57.03172302246094, + "logps/rejected": -85.72660827636719, + "loss": 0.7756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9489920139312744, + "rewards/margins": 4.504095077514648, + "rewards/rejected": -1.5551029443740845, + "step": 3225 + }, + { + "epoch": 0.81, + "grad_norm": 7.505025386810303, + "learning_rate": 4.404118835028718e-06, + "logits/chosen": -0.31286266446113586, + "logits/rejected": -0.29427823424339294, + "logps/chosen": -53.46699523925781, + "logps/rejected": -83.23906707763672, + "loss": 0.9234, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7454988956451416, + "rewards/margins": 2.8200645446777344, + "rewards/rejected": -0.07456538081169128, + "step": 3226 + }, + { + "epoch": 0.81, + "grad_norm": 4.802574157714844, + "learning_rate": 4.401517847151673e-06, + "logits/chosen": -0.3524186611175537, + "logits/rejected": -0.4682151675224304, + "logps/chosen": -42.696739196777344, + "logps/rejected": -78.47541046142578, + "loss": 0.7204, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0169262886047363, + "rewards/margins": 5.101250171661377, + "rewards/rejected": -2.0843241214752197, + "step": 3227 + }, + { + "epoch": 0.81, + "grad_norm": 4.900545597076416, + "learning_rate": 4.398917023570894e-06, + "logits/chosen": -0.3591374456882477, + "logits/rejected": -0.39117875695228577, + "logps/chosen": -51.47370910644531, + "logps/rejected": -89.6981201171875, + "loss": 0.7151, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.900106430053711, + "rewards/margins": 4.658337593078613, + "rewards/rejected": -1.758231520652771, + "step": 3228 + }, + { + "epoch": 0.81, + "grad_norm": 7.316983699798584, + "learning_rate": 4.396316365000362e-06, + "logits/chosen": -0.32654866576194763, + "logits/rejected": -0.3927823603153229, + "logps/chosen": -57.43888854980469, + "logps/rejected": -74.7543716430664, + "loss": 0.7272, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9013419151306152, + "rewards/margins": 3.8324873447418213, + "rewards/rejected": -0.9311453104019165, + "step": 3229 + }, + { + "epoch": 0.81, + "grad_norm": 3.9016928672790527, + "learning_rate": 4.393715872154014e-06, + "logits/chosen": -0.2740315794944763, + "logits/rejected": -0.3894695043563843, + "logps/chosen": -65.55146789550781, + "logps/rejected": -100.23355102539062, + "loss": 0.716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7514286041259766, + "rewards/margins": 4.673294544219971, + "rewards/rejected": -1.9218666553497314, + "step": 3230 + }, + { + "epoch": 0.81, + "grad_norm": 13.78514575958252, + "learning_rate": 4.391115545745743e-06, + "logits/chosen": -0.32910510897636414, + "logits/rejected": -0.37529870867729187, + "logps/chosen": -63.806514739990234, + "logps/rejected": -84.46919250488281, + "loss": 0.9353, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.968618631362915, + "rewards/margins": 3.448435068130493, + "rewards/rejected": -0.4798164367675781, + "step": 3231 + }, + { + "epoch": 0.81, + "grad_norm": 9.923236846923828, + "learning_rate": 4.388515386489391e-06, + "logits/chosen": -0.3987615704536438, + "logits/rejected": -0.4814112186431885, + "logps/chosen": -52.444419860839844, + "logps/rejected": -73.22882843017578, + "loss": 0.8192, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1295511722564697, + "rewards/margins": 3.9207818508148193, + "rewards/rejected": -0.7912309169769287, + "step": 3232 + }, + { + "epoch": 0.81, + "grad_norm": 4.656767845153809, + "learning_rate": 4.385915395098763e-06, + "logits/chosen": -0.30699077248573303, + "logits/rejected": -0.41042450070381165, + "logps/chosen": -60.15101623535156, + "logps/rejected": -82.77951049804688, + "loss": 0.7165, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8114919662475586, + "rewards/margins": 4.735920429229736, + "rewards/rejected": -1.9244282245635986, + "step": 3233 + }, + { + "epoch": 0.81, + "grad_norm": 7.539966583251953, + "learning_rate": 4.383315572287609e-06, + "logits/chosen": -0.3724254369735718, + "logits/rejected": -0.42673036456108093, + "logps/chosen": -62.382835388183594, + "logps/rejected": -95.0137710571289, + "loss": 0.8695, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9378433227539062, + "rewards/margins": 4.258655071258545, + "rewards/rejected": -1.3208116292953491, + "step": 3234 + }, + { + "epoch": 0.81, + "grad_norm": 26.42540168762207, + "learning_rate": 4.380715918769636e-06, + "logits/chosen": -0.22517873346805573, + "logits/rejected": -0.27952447533607483, + "logps/chosen": -56.65437316894531, + "logps/rejected": -95.68661499023438, + "loss": 0.8964, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.600389003753662, + "rewards/margins": 3.9210000038146973, + "rewards/rejected": -1.3206112384796143, + "step": 3235 + }, + { + "epoch": 0.81, + "grad_norm": 16.474069595336914, + "learning_rate": 4.378116435258509e-06, + "logits/chosen": -0.390647828578949, + "logits/rejected": -0.5031808614730835, + "logps/chosen": -57.676727294921875, + "logps/rejected": -93.46092987060547, + "loss": 0.8033, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.315507173538208, + "rewards/margins": 5.153567790985107, + "rewards/rejected": -2.8380606174468994, + "step": 3236 + }, + { + "epoch": 0.81, + "grad_norm": 6.63691520690918, + "learning_rate": 4.375517122467836e-06, + "logits/chosen": -0.33435767889022827, + "logits/rejected": -0.4011067748069763, + "logps/chosen": -61.41770935058594, + "logps/rejected": -78.74662780761719, + "loss": 0.8925, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0976014137268066, + "rewards/margins": 3.6136507987976074, + "rewards/rejected": -0.5160492062568665, + "step": 3237 + }, + { + "epoch": 0.81, + "grad_norm": 4.661809921264648, + "learning_rate": 4.3729179811111925e-06, + "logits/chosen": -0.3240166902542114, + "logits/rejected": -0.45116573572158813, + "logps/chosen": -52.041934967041016, + "logps/rejected": -67.28008270263672, + "loss": 0.8309, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7582614421844482, + "rewards/margins": 4.439234256744385, + "rewards/rejected": -1.6809725761413574, + "step": 3238 + }, + { + "epoch": 0.81, + "grad_norm": 5.424771785736084, + "learning_rate": 4.3703190119020926e-06, + "logits/chosen": -0.38843420147895813, + "logits/rejected": -0.4715733230113983, + "logps/chosen": -54.473445892333984, + "logps/rejected": -77.47419738769531, + "loss": 0.8376, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5807549953460693, + "rewards/margins": 4.454634666442871, + "rewards/rejected": -1.8738799095153809, + "step": 3239 + }, + { + "epoch": 0.81, + "grad_norm": 9.881586074829102, + "learning_rate": 4.36772021555401e-06, + "logits/chosen": -0.30395835638046265, + "logits/rejected": -0.4406259059906006, + "logps/chosen": -56.37934112548828, + "logps/rejected": -83.72157287597656, + "loss": 0.7359, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9037108421325684, + "rewards/margins": 5.168896198272705, + "rewards/rejected": -2.2651853561401367, + "step": 3240 + }, + { + "epoch": 0.81, + "grad_norm": 10.710871696472168, + "learning_rate": 4.3651215927803735e-06, + "logits/chosen": -0.365424245595932, + "logits/rejected": -0.4329984784126282, + "logps/chosen": -55.86139678955078, + "logps/rejected": -79.18896484375, + "loss": 0.7905, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6442172527313232, + "rewards/margins": 4.006774425506592, + "rewards/rejected": -1.3625571727752686, + "step": 3241 + }, + { + "epoch": 0.81, + "grad_norm": 3.8792850971221924, + "learning_rate": 4.362523144294558e-06, + "logits/chosen": -0.2756160497665405, + "logits/rejected": -0.37346625328063965, + "logps/chosen": -58.55925750732422, + "logps/rejected": -79.11708068847656, + "loss": 0.7944, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8486533164978027, + "rewards/margins": 4.346016883850098, + "rewards/rejected": -1.497363567352295, + "step": 3242 + }, + { + "epoch": 0.81, + "grad_norm": 4.029684543609619, + "learning_rate": 4.359924870809896e-06, + "logits/chosen": -0.3537447452545166, + "logits/rejected": -0.41022375226020813, + "logps/chosen": -54.54392623901367, + "logps/rejected": -76.33882141113281, + "loss": 0.7463, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7419211864471436, + "rewards/margins": 4.466089248657227, + "rewards/rejected": -1.7241681814193726, + "step": 3243 + }, + { + "epoch": 0.81, + "grad_norm": 7.056362152099609, + "learning_rate": 4.357326773039669e-06, + "logits/chosen": -0.32662296295166016, + "logits/rejected": -0.3790261745452881, + "logps/chosen": -59.79531478881836, + "logps/rejected": -86.192138671875, + "loss": 0.8453, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7801544666290283, + "rewards/margins": 4.192391395568848, + "rewards/rejected": -1.4122366905212402, + "step": 3244 + }, + { + "epoch": 0.81, + "grad_norm": 3.9345622062683105, + "learning_rate": 4.3547288516971095e-06, + "logits/chosen": -0.2586422264575958, + "logits/rejected": -0.3969593942165375, + "logps/chosen": -60.310115814208984, + "logps/rejected": -75.40679931640625, + "loss": 0.7564, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0844194889068604, + "rewards/margins": 5.008038520812988, + "rewards/rejected": -1.923619270324707, + "step": 3245 + }, + { + "epoch": 0.81, + "grad_norm": 9.143858909606934, + "learning_rate": 4.3521311074954055e-06, + "logits/chosen": -0.2973885238170624, + "logits/rejected": -0.43755558133125305, + "logps/chosen": -51.87357711791992, + "logps/rejected": -87.35493469238281, + "loss": 0.7063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.811054229736328, + "rewards/margins": 5.522879600524902, + "rewards/rejected": -2.711825370788574, + "step": 3246 + }, + { + "epoch": 0.81, + "grad_norm": 7.2450056076049805, + "learning_rate": 4.34953354114769e-06, + "logits/chosen": -0.25566861033439636, + "logits/rejected": -0.37082746624946594, + "logps/chosen": -63.034690856933594, + "logps/rejected": -82.254150390625, + "loss": 0.8194, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8092803955078125, + "rewards/margins": 4.1369123458862305, + "rewards/rejected": -1.3276318311691284, + "step": 3247 + }, + { + "epoch": 0.81, + "grad_norm": 3.9393861293792725, + "learning_rate": 4.346936153367056e-06, + "logits/chosen": -0.3239867389202118, + "logits/rejected": -0.42834556102752686, + "logps/chosen": -53.853843688964844, + "logps/rejected": -82.2969970703125, + "loss": 0.8038, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8397982120513916, + "rewards/margins": 4.457448959350586, + "rewards/rejected": -1.6176506280899048, + "step": 3248 + }, + { + "epoch": 0.81, + "grad_norm": 7.878835201263428, + "learning_rate": 4.34433894486654e-06, + "logits/chosen": -0.25393688678741455, + "logits/rejected": -0.3519510328769684, + "logps/chosen": -63.00361633300781, + "logps/rejected": -79.2540512084961, + "loss": 0.8014, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.46004319190979, + "rewards/margins": 3.8936307430267334, + "rewards/rejected": -1.433587670326233, + "step": 3249 + }, + { + "epoch": 0.81, + "grad_norm": 13.659318923950195, + "learning_rate": 4.3417419163591296e-06, + "logits/chosen": -0.23389285802841187, + "logits/rejected": -0.34831055998802185, + "logps/chosen": -56.45633316040039, + "logps/rejected": -83.89790344238281, + "loss": 0.7466, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7006454467773438, + "rewards/margins": 4.210093975067139, + "rewards/rejected": -1.5094482898712158, + "step": 3250 + }, + { + "epoch": 0.81, + "grad_norm": 11.367833137512207, + "learning_rate": 4.339145068557769e-06, + "logits/chosen": -0.26422804594039917, + "logits/rejected": -0.36339032649993896, + "logps/chosen": -50.7752571105957, + "logps/rejected": -94.52713012695312, + "loss": 0.676, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.619518518447876, + "rewards/margins": 4.967195987701416, + "rewards/rejected": -2.347677707672119, + "step": 3251 + }, + { + "epoch": 0.81, + "grad_norm": 3.6613855361938477, + "learning_rate": 4.336548402175345e-06, + "logits/chosen": -0.38048774003982544, + "logits/rejected": -0.45580780506134033, + "logps/chosen": -54.45246124267578, + "logps/rejected": -85.79788970947266, + "loss": 0.7633, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.772260904312134, + "rewards/margins": 4.541068077087402, + "rewards/rejected": -1.768807053565979, + "step": 3252 + }, + { + "epoch": 0.81, + "grad_norm": 6.0502190589904785, + "learning_rate": 4.333951917924703e-06, + "logits/chosen": -0.30830684304237366, + "logits/rejected": -0.42065268754959106, + "logps/chosen": -59.59995651245117, + "logps/rejected": -72.38555145263672, + "loss": 0.8064, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.20595645904541, + "rewards/margins": 4.632212162017822, + "rewards/rejected": -1.426255464553833, + "step": 3253 + }, + { + "epoch": 0.81, + "grad_norm": 47.02488708496094, + "learning_rate": 4.331355616518631e-06, + "logits/chosen": -0.3274150788784027, + "logits/rejected": -0.43194860219955444, + "logps/chosen": -59.96408462524414, + "logps/rejected": -74.81310272216797, + "loss": 0.8011, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.964604377746582, + "rewards/margins": 3.707533597946167, + "rewards/rejected": -0.7429290413856506, + "step": 3254 + }, + { + "epoch": 0.81, + "grad_norm": 6.751739978790283, + "learning_rate": 4.3287594986698694e-06, + "logits/chosen": -0.2581571042537689, + "logits/rejected": -0.39712250232696533, + "logps/chosen": -58.63792419433594, + "logps/rejected": -69.02127838134766, + "loss": 0.9499, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7254228591918945, + "rewards/margins": 3.5752968788146973, + "rewards/rejected": -0.8498739004135132, + "step": 3255 + }, + { + "epoch": 0.81, + "grad_norm": 6.265187740325928, + "learning_rate": 4.326163565091112e-06, + "logits/chosen": -0.3121238648891449, + "logits/rejected": -0.43132689595222473, + "logps/chosen": -49.56338882446289, + "logps/rejected": -74.8160400390625, + "loss": 0.735, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.786464214324951, + "rewards/margins": 4.732926845550537, + "rewards/rejected": -1.9464625120162964, + "step": 3256 + }, + { + "epoch": 0.81, + "grad_norm": 4.830085754394531, + "learning_rate": 4.323567816494993e-06, + "logits/chosen": -0.30219322443008423, + "logits/rejected": -0.4106855094432831, + "logps/chosen": -57.448577880859375, + "logps/rejected": -86.71192932128906, + "loss": 0.8134, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5414960384368896, + "rewards/margins": 4.409964561462402, + "rewards/rejected": -1.8684687614440918, + "step": 3257 + }, + { + "epoch": 0.81, + "grad_norm": 4.43763542175293, + "learning_rate": 4.320972253594108e-06, + "logits/chosen": -0.30642956495285034, + "logits/rejected": -0.41614365577697754, + "logps/chosen": -57.79146957397461, + "logps/rejected": -92.13858795166016, + "loss": 0.767, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.048248767852783, + "rewards/margins": 4.699695587158203, + "rewards/rejected": -1.6514465808868408, + "step": 3258 + }, + { + "epoch": 0.82, + "grad_norm": 6.398218631744385, + "learning_rate": 4.318376877100991e-06, + "logits/chosen": -0.2719752788543701, + "logits/rejected": -0.41707897186279297, + "logps/chosen": -61.15513610839844, + "logps/rejected": -71.63069152832031, + "loss": 0.7483, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.971156120300293, + "rewards/margins": 5.097894668579102, + "rewards/rejected": -2.1267383098602295, + "step": 3259 + }, + { + "epoch": 0.82, + "grad_norm": 14.591238021850586, + "learning_rate": 4.315781687728127e-06, + "logits/chosen": -0.34217318892478943, + "logits/rejected": -0.36306291818618774, + "logps/chosen": -54.952293395996094, + "logps/rejected": -71.88127136230469, + "loss": 0.9788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0990383625030518, + "rewards/margins": 3.138298511505127, + "rewards/rejected": -0.03926023840904236, + "step": 3260 + }, + { + "epoch": 0.82, + "grad_norm": 8.264537811279297, + "learning_rate": 4.3131866861879564e-06, + "logits/chosen": -0.3001382648944855, + "logits/rejected": -0.3715839087963104, + "logps/chosen": -62.53778839111328, + "logps/rejected": -78.05607604980469, + "loss": 0.859, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7477571964263916, + "rewards/margins": 3.8740813732147217, + "rewards/rejected": -1.126323938369751, + "step": 3261 + }, + { + "epoch": 0.82, + "grad_norm": 4.434957981109619, + "learning_rate": 4.31059187319286e-06, + "logits/chosen": -0.2907206118106842, + "logits/rejected": -0.4090273380279541, + "logps/chosen": -52.18798828125, + "logps/rejected": -74.27012634277344, + "loss": 0.6763, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.667137622833252, + "rewards/margins": 4.107693672180176, + "rewards/rejected": -1.4405560493469238, + "step": 3262 + }, + { + "epoch": 0.82, + "grad_norm": 3.91195011138916, + "learning_rate": 4.30799724945517e-06, + "logits/chosen": -0.30521637201309204, + "logits/rejected": -0.40119484066963196, + "logps/chosen": -50.45227813720703, + "logps/rejected": -79.75122833251953, + "loss": 0.7509, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9734549522399902, + "rewards/margins": 4.583122253417969, + "rewards/rejected": -1.609667181968689, + "step": 3263 + }, + { + "epoch": 0.82, + "grad_norm": 3.5557548999786377, + "learning_rate": 4.305402815687168e-06, + "logits/chosen": -0.3309289216995239, + "logits/rejected": -0.3739183843135834, + "logps/chosen": -59.424049377441406, + "logps/rejected": -102.81085968017578, + "loss": 0.689, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.307201385498047, + "rewards/margins": 5.566436290740967, + "rewards/rejected": -2.2592344284057617, + "step": 3264 + }, + { + "epoch": 0.82, + "grad_norm": 3.621021270751953, + "learning_rate": 4.302808572601081e-06, + "logits/chosen": -0.24992336332798004, + "logits/rejected": -0.35775527358055115, + "logps/chosen": -59.694557189941406, + "logps/rejected": -81.9240493774414, + "loss": 0.7207, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0262508392333984, + "rewards/margins": 4.939427375793457, + "rewards/rejected": -1.9131766557693481, + "step": 3265 + }, + { + "epoch": 0.82, + "grad_norm": 2.942608118057251, + "learning_rate": 4.300214520909085e-06, + "logits/chosen": -0.3646374046802521, + "logits/rejected": -0.46289771795272827, + "logps/chosen": -46.079673767089844, + "logps/rejected": -84.87518310546875, + "loss": 0.6478, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.957441806793213, + "rewards/margins": 5.04515266418457, + "rewards/rejected": -2.0877108573913574, + "step": 3266 + }, + { + "epoch": 0.82, + "grad_norm": 3.6468136310577393, + "learning_rate": 4.297620661323303e-06, + "logits/chosen": -0.3317136764526367, + "logits/rejected": -0.39484623074531555, + "logps/chosen": -49.92444610595703, + "logps/rejected": -86.55059814453125, + "loss": 0.7227, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0475411415100098, + "rewards/margins": 4.325298309326172, + "rewards/rejected": -1.2777576446533203, + "step": 3267 + }, + { + "epoch": 0.82, + "grad_norm": 6.515221118927002, + "learning_rate": 4.295026994555807e-06, + "logits/chosen": -0.2937811613082886, + "logits/rejected": -0.40006911754608154, + "logps/chosen": -58.234718322753906, + "logps/rejected": -80.42809295654297, + "loss": 0.7826, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.577342987060547, + "rewards/margins": 3.848816394805908, + "rewards/rejected": -1.27147376537323, + "step": 3268 + }, + { + "epoch": 0.82, + "grad_norm": 3.1892855167388916, + "learning_rate": 4.292433521318613e-06, + "logits/chosen": -0.327534019947052, + "logits/rejected": -0.4611174762248993, + "logps/chosen": -59.48370361328125, + "logps/rejected": -69.60501861572266, + "loss": 0.778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.521117925643921, + "rewards/margins": 4.559216499328613, + "rewards/rejected": -2.0380983352661133, + "step": 3269 + }, + { + "epoch": 0.82, + "grad_norm": 3.4202332496643066, + "learning_rate": 4.2898402423236835e-06, + "logits/chosen": -0.29619553685188293, + "logits/rejected": -0.3962576389312744, + "logps/chosen": -56.607906341552734, + "logps/rejected": -77.5106201171875, + "loss": 0.7249, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.913013219833374, + "rewards/margins": 4.514589309692383, + "rewards/rejected": -1.6015760898590088, + "step": 3270 + }, + { + "epoch": 0.82, + "grad_norm": 3.321626663208008, + "learning_rate": 4.287247158282933e-06, + "logits/chosen": -0.32478082180023193, + "logits/rejected": -0.40175968408584595, + "logps/chosen": -59.61494445800781, + "logps/rejected": -75.0197982788086, + "loss": 0.7288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.811995029449463, + "rewards/margins": 4.218114852905273, + "rewards/rejected": -1.4061195850372314, + "step": 3271 + }, + { + "epoch": 0.82, + "grad_norm": 4.06953763961792, + "learning_rate": 4.284654269908216e-06, + "logits/chosen": -0.41154932975769043, + "logits/rejected": -0.4690619707107544, + "logps/chosen": -46.98773193359375, + "logps/rejected": -90.75374603271484, + "loss": 0.6808, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9343082904815674, + "rewards/margins": 5.308559894561768, + "rewards/rejected": -2.3742525577545166, + "step": 3272 + }, + { + "epoch": 0.82, + "grad_norm": 3.6002395153045654, + "learning_rate": 4.28206157791134e-06, + "logits/chosen": -0.32585573196411133, + "logits/rejected": -0.3966725468635559, + "logps/chosen": -47.39275360107422, + "logps/rejected": -84.33633422851562, + "loss": 0.6682, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2705836296081543, + "rewards/margins": 5.242301940917969, + "rewards/rejected": -1.971718430519104, + "step": 3273 + }, + { + "epoch": 0.82, + "grad_norm": 6.389831066131592, + "learning_rate": 4.279469083004052e-06, + "logits/chosen": -0.3835816979408264, + "logits/rejected": -0.4450950026512146, + "logps/chosen": -50.615638732910156, + "logps/rejected": -74.73429870605469, + "loss": 0.8104, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.719968557357788, + "rewards/margins": 3.7746596336364746, + "rewards/rejected": -1.0546905994415283, + "step": 3274 + }, + { + "epoch": 0.82, + "grad_norm": 6.689310073852539, + "learning_rate": 4.276876785898047e-06, + "logits/chosen": -0.2538072466850281, + "logits/rejected": -0.34989190101623535, + "logps/chosen": -69.48221588134766, + "logps/rejected": -70.93891906738281, + "loss": 0.9673, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.649406671524048, + "rewards/margins": 3.7137069702148438, + "rewards/rejected": -1.0643001794815063, + "step": 3275 + }, + { + "epoch": 0.82, + "grad_norm": 2.634950876235962, + "learning_rate": 4.27428468730497e-06, + "logits/chosen": -0.3717503845691681, + "logits/rejected": -0.46460697054862976, + "logps/chosen": -56.80355453491211, + "logps/rejected": -89.88572692871094, + "loss": 0.575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9698312282562256, + "rewards/margins": 5.134102821350098, + "rewards/rejected": -2.1642723083496094, + "step": 3276 + }, + { + "epoch": 0.82, + "grad_norm": 2.876688003540039, + "learning_rate": 4.2716927879364046e-06, + "logits/chosen": -0.16277524828910828, + "logits/rejected": -0.358408659696579, + "logps/chosen": -65.95820617675781, + "logps/rejected": -78.1024398803711, + "loss": 0.6451, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0115792751312256, + "rewards/margins": 5.51349401473999, + "rewards/rejected": -2.5019145011901855, + "step": 3277 + }, + { + "epoch": 0.82, + "grad_norm": 4.091349124908447, + "learning_rate": 4.2691010885038856e-06, + "logits/chosen": -0.28718364238739014, + "logits/rejected": -0.3565191626548767, + "logps/chosen": -52.60171127319336, + "logps/rejected": -81.99225616455078, + "loss": 0.7269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1276302337646484, + "rewards/margins": 4.875370025634766, + "rewards/rejected": -1.747739553451538, + "step": 3278 + }, + { + "epoch": 0.82, + "grad_norm": 7.023463726043701, + "learning_rate": 4.26650958971889e-06, + "logits/chosen": -0.2777288556098938, + "logits/rejected": -0.2714115083217621, + "logps/chosen": -67.33601379394531, + "logps/rejected": -96.16779327392578, + "loss": 0.9432, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7326974868774414, + "rewards/margins": 3.677663803100586, + "rewards/rejected": -0.9449663758277893, + "step": 3279 + }, + { + "epoch": 0.82, + "grad_norm": 2.9717183113098145, + "learning_rate": 4.263918292292838e-06, + "logits/chosen": -0.3326184153556824, + "logits/rejected": -0.4461490511894226, + "logps/chosen": -48.643253326416016, + "logps/rejected": -85.2696304321289, + "loss": 0.6011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.936797618865967, + "rewards/margins": 4.613631725311279, + "rewards/rejected": -1.6768341064453125, + "step": 3280 + }, + { + "epoch": 0.82, + "grad_norm": 4.381846904754639, + "learning_rate": 4.2613271969371e-06, + "logits/chosen": -0.23873256146907806, + "logits/rejected": -0.41228947043418884, + "logps/chosen": -55.900184631347656, + "logps/rejected": -73.60813903808594, + "loss": 0.7821, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2439494132995605, + "rewards/margins": 4.493776321411133, + "rewards/rejected": -1.249826431274414, + "step": 3281 + }, + { + "epoch": 0.82, + "grad_norm": 3.376042604446411, + "learning_rate": 4.258736304362983e-06, + "logits/chosen": -0.33423855900764465, + "logits/rejected": -0.40626609325408936, + "logps/chosen": -52.74665069580078, + "logps/rejected": -85.57356262207031, + "loss": 0.6892, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85509991645813, + "rewards/margins": 4.162177085876465, + "rewards/rejected": -1.3070769309997559, + "step": 3282 + }, + { + "epoch": 0.82, + "grad_norm": 5.355032444000244, + "learning_rate": 4.2561456152817475e-06, + "logits/chosen": -0.2918359041213989, + "logits/rejected": -0.3979620933532715, + "logps/chosen": -54.99555587768555, + "logps/rejected": -78.01239776611328, + "loss": 0.6982, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.685868263244629, + "rewards/margins": 4.602303504943848, + "rewards/rejected": -1.9164353609085083, + "step": 3283 + }, + { + "epoch": 0.82, + "grad_norm": 5.058257102966309, + "learning_rate": 4.253555130404592e-06, + "logits/chosen": -0.2707209587097168, + "logits/rejected": -0.30722057819366455, + "logps/chosen": -53.414459228515625, + "logps/rejected": -92.88760375976562, + "loss": 0.7392, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1153435707092285, + "rewards/margins": 3.950326442718506, + "rewards/rejected": -0.8349831104278564, + "step": 3284 + }, + { + "epoch": 0.82, + "grad_norm": 4.332162380218506, + "learning_rate": 4.250964850442658e-06, + "logits/chosen": -0.3229028284549713, + "logits/rejected": -0.4343816936016083, + "logps/chosen": -51.78630828857422, + "logps/rejected": -76.71533203125, + "loss": 0.7171, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7177295684814453, + "rewards/margins": 4.968813896179199, + "rewards/rejected": -2.251084566116333, + "step": 3285 + }, + { + "epoch": 0.82, + "grad_norm": 4.9597487449646, + "learning_rate": 4.2483747761070385e-06, + "logits/chosen": -0.33812665939331055, + "logits/rejected": -0.4175111651420593, + "logps/chosen": -57.440879821777344, + "logps/rejected": -90.43289947509766, + "loss": 0.7812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8029654026031494, + "rewards/margins": 4.6519341468811035, + "rewards/rejected": -1.8489688634872437, + "step": 3286 + }, + { + "epoch": 0.82, + "grad_norm": 11.471761703491211, + "learning_rate": 4.245784908108759e-06, + "logits/chosen": -0.25831329822540283, + "logits/rejected": -0.420304536819458, + "logps/chosen": -58.613792419433594, + "logps/rejected": -78.27503204345703, + "loss": 0.7603, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6213014125823975, + "rewards/margins": 5.135101318359375, + "rewards/rejected": -2.5137999057769775, + "step": 3287 + }, + { + "epoch": 0.82, + "grad_norm": 5.946494102478027, + "learning_rate": 4.243195247158798e-06, + "logits/chosen": -0.3243466913700104, + "logits/rejected": -0.40054231882095337, + "logps/chosen": -58.523189544677734, + "logps/rejected": -76.1644058227539, + "loss": 0.7511, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.745912551879883, + "rewards/margins": 4.436108112335205, + "rewards/rejected": -1.6901955604553223, + "step": 3288 + }, + { + "epoch": 0.82, + "grad_norm": 5.881018161773682, + "learning_rate": 4.240605793968075e-06, + "logits/chosen": -0.3474053740501404, + "logits/rejected": -0.4110928773880005, + "logps/chosen": -70.9202880859375, + "logps/rejected": -78.36480712890625, + "loss": 0.8572, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.797543525695801, + "rewards/margins": 4.215104103088379, + "rewards/rejected": -1.4175605773925781, + "step": 3289 + }, + { + "epoch": 0.82, + "grad_norm": 21.46819496154785, + "learning_rate": 4.238016549247443e-06, + "logits/chosen": -0.3367011845111847, + "logits/rejected": -0.4734993577003479, + "logps/chosen": -66.8614501953125, + "logps/rejected": -72.33858489990234, + "loss": 0.8698, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.730656147003174, + "rewards/margins": 4.120810508728027, + "rewards/rejected": -1.3901543617248535, + "step": 3290 + }, + { + "epoch": 0.82, + "grad_norm": 3.118258237838745, + "learning_rate": 4.235427513707714e-06, + "logits/chosen": -0.31202420592308044, + "logits/rejected": -0.41068345308303833, + "logps/chosen": -50.24861526489258, + "logps/rejected": -87.58659362792969, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.959171772003174, + "rewards/margins": 5.905083179473877, + "rewards/rejected": -2.945911407470703, + "step": 3291 + }, + { + "epoch": 0.82, + "grad_norm": 9.501091957092285, + "learning_rate": 4.232838688059628e-06, + "logits/chosen": -0.39749157428741455, + "logits/rejected": -0.47510719299316406, + "logps/chosen": -48.399391174316406, + "logps/rejected": -78.23344421386719, + "loss": 0.6725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8811793327331543, + "rewards/margins": 4.549811840057373, + "rewards/rejected": -1.6686322689056396, + "step": 3292 + }, + { + "epoch": 0.82, + "grad_norm": 6.657238960266113, + "learning_rate": 4.230250073013879e-06, + "logits/chosen": -0.28180333971977234, + "logits/rejected": -0.36871111392974854, + "logps/chosen": -62.19576644897461, + "logps/rejected": -93.45142364501953, + "loss": 0.7685, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.651731491088867, + "rewards/margins": 4.624480724334717, + "rewards/rejected": -1.9727493524551392, + "step": 3293 + }, + { + "epoch": 0.82, + "grad_norm": 8.912755012512207, + "learning_rate": 4.227661669281094e-06, + "logits/chosen": -0.47886142134666443, + "logits/rejected": -0.5378298163414001, + "logps/chosen": -62.487205505371094, + "logps/rejected": -89.76307678222656, + "loss": 0.7926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7122392654418945, + "rewards/margins": 4.661676406860352, + "rewards/rejected": -1.9494366645812988, + "step": 3294 + }, + { + "epoch": 0.82, + "grad_norm": 5.707769393920898, + "learning_rate": 4.225073477571845e-06, + "logits/chosen": -0.33004361391067505, + "logits/rejected": -0.4155294895172119, + "logps/chosen": -53.34256362915039, + "logps/rejected": -93.88365936279297, + "loss": 0.7792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.882319450378418, + "rewards/margins": 4.851572036743164, + "rewards/rejected": -1.9692524671554565, + "step": 3295 + }, + { + "epoch": 0.82, + "grad_norm": 6.217495918273926, + "learning_rate": 4.2224854985966495e-06, + "logits/chosen": -0.3491297662258148, + "logits/rejected": -0.37033891677856445, + "logps/chosen": -53.69940948486328, + "logps/rejected": -101.05865478515625, + "loss": 0.8615, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.752145290374756, + "rewards/margins": 4.194507598876953, + "rewards/rejected": -1.4423624277114868, + "step": 3296 + }, + { + "epoch": 0.82, + "grad_norm": 17.06060028076172, + "learning_rate": 4.219897733065961e-06, + "logits/chosen": -0.22494041919708252, + "logits/rejected": -0.37832850217819214, + "logps/chosen": -60.126380920410156, + "logps/rejected": -74.86380004882812, + "loss": 0.9676, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5383377075195312, + "rewards/margins": 4.326070785522461, + "rewards/rejected": -1.7877328395843506, + "step": 3297 + }, + { + "epoch": 0.83, + "grad_norm": 4.612310409545898, + "learning_rate": 4.217310181690179e-06, + "logits/chosen": -0.3855774402618408, + "logits/rejected": -0.4827297031879425, + "logps/chosen": -57.544456481933594, + "logps/rejected": -83.34221649169922, + "loss": 0.8768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.836656332015991, + "rewards/margins": 4.676800727844238, + "rewards/rejected": -1.840144157409668, + "step": 3298 + }, + { + "epoch": 0.83, + "grad_norm": 10.606658935546875, + "learning_rate": 4.214722845179643e-06, + "logits/chosen": -0.3928362727165222, + "logits/rejected": -0.5765625238418579, + "logps/chosen": -65.8216781616211, + "logps/rejected": -78.60552215576172, + "loss": 0.8463, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8292229175567627, + "rewards/margins": 3.6589438915252686, + "rewards/rejected": -0.829721212387085, + "step": 3299 + }, + { + "epoch": 0.83, + "grad_norm": 4.04857063293457, + "learning_rate": 4.212135724244627e-06, + "logits/chosen": -0.3837841749191284, + "logits/rejected": -0.46708840131759644, + "logps/chosen": -40.77635955810547, + "logps/rejected": -75.43049621582031, + "loss": 0.5853, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0799002647399902, + "rewards/margins": 5.107787609100342, + "rewards/rejected": -2.0278868675231934, + "step": 3300 + }, + { + "epoch": 0.83, + "grad_norm": 27.873971939086914, + "learning_rate": 4.2095488195953585e-06, + "logits/chosen": -0.3075941801071167, + "logits/rejected": -0.3665499985218048, + "logps/chosen": -52.33761978149414, + "logps/rejected": -79.9240951538086, + "loss": 0.8896, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.041062593460083, + "rewards/margins": 3.755284547805786, + "rewards/rejected": -0.7142220735549927, + "step": 3301 + }, + { + "epoch": 0.83, + "grad_norm": 5.06494665145874, + "learning_rate": 4.206962131941993e-06, + "logits/chosen": -0.2922227084636688, + "logits/rejected": -0.37533777952194214, + "logps/chosen": -55.34006118774414, + "logps/rejected": -83.65668487548828, + "loss": 0.8118, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6003763675689697, + "rewards/margins": 4.526150703430176, + "rewards/rejected": -1.9257739782333374, + "step": 3302 + }, + { + "epoch": 0.83, + "grad_norm": 6.998075008392334, + "learning_rate": 4.204375661994637e-06, + "logits/chosen": -0.31289729475975037, + "logits/rejected": -0.4395056962966919, + "logps/chosen": -57.66719436645508, + "logps/rejected": -100.70990753173828, + "loss": 0.7279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7678213119506836, + "rewards/margins": 5.6494832038879395, + "rewards/rejected": -2.881662130355835, + "step": 3303 + }, + { + "epoch": 0.83, + "grad_norm": 4.042408466339111, + "learning_rate": 4.20178941046333e-06, + "logits/chosen": -0.39612582325935364, + "logits/rejected": -0.4966962933540344, + "logps/chosen": -46.184776306152344, + "logps/rejected": -81.83733367919922, + "loss": 0.586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.734196901321411, + "rewards/margins": 5.4953718185424805, + "rewards/rejected": -2.7611749172210693, + "step": 3304 + }, + { + "epoch": 0.83, + "grad_norm": 4.706082820892334, + "learning_rate": 4.199203378058052e-06, + "logits/chosen": -0.22385944426059723, + "logits/rejected": -0.42821311950683594, + "logps/chosen": -63.010887145996094, + "logps/rejected": -81.68766021728516, + "loss": 0.7585, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.560810089111328, + "rewards/margins": 4.9486985206604, + "rewards/rejected": -2.387888193130493, + "step": 3305 + }, + { + "epoch": 0.83, + "grad_norm": 6.574129581451416, + "learning_rate": 4.196617565488728e-06, + "logits/chosen": -0.2861632704734802, + "logits/rejected": -0.42558759450912476, + "logps/chosen": -61.58720397949219, + "logps/rejected": -74.7790298461914, + "loss": 0.7721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0066874027252197, + "rewards/margins": 4.310561180114746, + "rewards/rejected": -1.303873896598816, + "step": 3306 + }, + { + "epoch": 0.83, + "grad_norm": 4.37584114074707, + "learning_rate": 4.194031973465217e-06, + "logits/chosen": -0.3118456304073334, + "logits/rejected": -0.48431897163391113, + "logps/chosen": -60.263267517089844, + "logps/rejected": -71.42911529541016, + "loss": 0.7882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.010908365249634, + "rewards/margins": 4.657073020935059, + "rewards/rejected": -1.6461641788482666, + "step": 3307 + }, + { + "epoch": 0.83, + "grad_norm": 13.68326187133789, + "learning_rate": 4.191446602697321e-06, + "logits/chosen": -0.3749805986881256, + "logits/rejected": -0.37638115882873535, + "logps/chosen": -88.43719482421875, + "logps/rejected": -95.78742218017578, + "loss": 1.0007, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.572394371032715, + "rewards/margins": 3.829054832458496, + "rewards/rejected": -1.2566604614257812, + "step": 3308 + }, + { + "epoch": 0.83, + "grad_norm": 2.80002760887146, + "learning_rate": 4.188861453894781e-06, + "logits/chosen": -0.4085686206817627, + "logits/rejected": -0.5035088062286377, + "logps/chosen": -43.9675407409668, + "logps/rejected": -79.06549835205078, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.914949655532837, + "rewards/margins": 5.481174468994141, + "rewards/rejected": -2.5662243366241455, + "step": 3309 + }, + { + "epoch": 0.83, + "grad_norm": 5.296900749206543, + "learning_rate": 4.186276527767273e-06, + "logits/chosen": -0.3827342689037323, + "logits/rejected": -0.4193241000175476, + "logps/chosen": -55.47004699707031, + "logps/rejected": -88.8700942993164, + "loss": 0.9428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8936922550201416, + "rewards/margins": 3.7995848655700684, + "rewards/rejected": -0.9058924913406372, + "step": 3310 + }, + { + "epoch": 0.83, + "grad_norm": 4.181336402893066, + "learning_rate": 4.183691825024419e-06, + "logits/chosen": -0.30803820490837097, + "logits/rejected": -0.44295862317085266, + "logps/chosen": -58.59702682495117, + "logps/rejected": -93.75555419921875, + "loss": 0.7996, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7774782180786133, + "rewards/margins": 5.248005390167236, + "rewards/rejected": -2.4705276489257812, + "step": 3311 + }, + { + "epoch": 0.83, + "grad_norm": 6.112049102783203, + "learning_rate": 4.181107346375771e-06, + "logits/chosen": -0.26476627588272095, + "logits/rejected": -0.2983597218990326, + "logps/chosen": -54.24671936035156, + "logps/rejected": -80.40992736816406, + "loss": 0.8634, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9664487838745117, + "rewards/margins": 3.5128273963928223, + "rewards/rejected": -0.5463786721229553, + "step": 3312 + }, + { + "epoch": 0.83, + "grad_norm": 10.667548179626465, + "learning_rate": 4.178523092530829e-06, + "logits/chosen": -0.3525296449661255, + "logits/rejected": -0.36840569972991943, + "logps/chosen": -67.91935729980469, + "logps/rejected": -87.5671157836914, + "loss": 1.0331, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.878453493118286, + "rewards/margins": 3.6745800971984863, + "rewards/rejected": -0.7961268424987793, + "step": 3313 + }, + { + "epoch": 0.83, + "grad_norm": 6.243203163146973, + "learning_rate": 4.1759390641990235e-06, + "logits/chosen": -0.32662469148635864, + "logits/rejected": -0.4148326814174652, + "logps/chosen": -59.92475891113281, + "logps/rejected": -90.50310516357422, + "loss": 0.7568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7094764709472656, + "rewards/margins": 5.053670406341553, + "rewards/rejected": -2.344193935394287, + "step": 3314 + }, + { + "epoch": 0.83, + "grad_norm": 4.809047222137451, + "learning_rate": 4.173355262089726e-06, + "logits/chosen": -0.3534550666809082, + "logits/rejected": -0.3844924569129944, + "logps/chosen": -51.74704360961914, + "logps/rejected": -87.78267669677734, + "loss": 0.7387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7559618949890137, + "rewards/margins": 4.504871845245361, + "rewards/rejected": -1.748909831047058, + "step": 3315 + }, + { + "epoch": 0.83, + "grad_norm": 4.403275012969971, + "learning_rate": 4.170771686912247e-06, + "logits/chosen": -0.27594107389450073, + "logits/rejected": -0.39197537302970886, + "logps/chosen": -59.4066047668457, + "logps/rejected": -67.47064971923828, + "loss": 0.7497, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1019227504730225, + "rewards/margins": 4.461843490600586, + "rewards/rejected": -1.359920859336853, + "step": 3316 + }, + { + "epoch": 0.83, + "grad_norm": 6.170682907104492, + "learning_rate": 4.168188339375833e-06, + "logits/chosen": -0.3735157251358032, + "logits/rejected": -0.3983519375324249, + "logps/chosen": -60.12969207763672, + "logps/rejected": -100.41413879394531, + "loss": 1.0136, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.854538917541504, + "rewards/margins": 3.9806408882141113, + "rewards/rejected": -1.1261013746261597, + "step": 3317 + }, + { + "epoch": 0.83, + "grad_norm": 6.152069568634033, + "learning_rate": 4.165605220189669e-06, + "logits/chosen": -0.3550572395324707, + "logits/rejected": -0.36090975999832153, + "logps/chosen": -42.957923889160156, + "logps/rejected": -75.80577850341797, + "loss": 0.8185, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.922189474105835, + "rewards/margins": 3.2609641551971436, + "rewards/rejected": -0.33877456188201904, + "step": 3318 + }, + { + "epoch": 0.83, + "grad_norm": 5.716320037841797, + "learning_rate": 4.163022330062878e-06, + "logits/chosen": -0.2620483338832855, + "logits/rejected": -0.38660892844200134, + "logps/chosen": -51.67377471923828, + "logps/rejected": -71.66858673095703, + "loss": 0.7814, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.788796901702881, + "rewards/margins": 4.275944232940674, + "rewards/rejected": -1.487147331237793, + "step": 3319 + }, + { + "epoch": 0.83, + "grad_norm": 4.287824630737305, + "learning_rate": 4.160439669704516e-06, + "logits/chosen": -0.3160906136035919, + "logits/rejected": -0.43533506989479065, + "logps/chosen": -56.01547622680664, + "logps/rejected": -77.93592071533203, + "loss": 0.7377, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0198652744293213, + "rewards/margins": 4.511427402496338, + "rewards/rejected": -1.491561770439148, + "step": 3320 + }, + { + "epoch": 0.83, + "grad_norm": 7.182918548583984, + "learning_rate": 4.157857239823583e-06, + "logits/chosen": -0.22112531960010529, + "logits/rejected": -0.29582107067108154, + "logps/chosen": -59.76123809814453, + "logps/rejected": -79.40644836425781, + "loss": 0.7716, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.792567729949951, + "rewards/margins": 4.173297882080078, + "rewards/rejected": -1.3807297945022583, + "step": 3321 + }, + { + "epoch": 0.83, + "grad_norm": 5.466817378997803, + "learning_rate": 4.155275041129008e-06, + "logits/chosen": -0.26919692754745483, + "logits/rejected": -0.2866671681404114, + "logps/chosen": -56.59039306640625, + "logps/rejected": -82.49214935302734, + "loss": 0.8359, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7507262229919434, + "rewards/margins": 3.020081043243408, + "rewards/rejected": -0.26935461163520813, + "step": 3322 + }, + { + "epoch": 0.83, + "grad_norm": 4.781553268432617, + "learning_rate": 4.152693074329664e-06, + "logits/chosen": -0.2749805450439453, + "logits/rejected": -0.42913931608200073, + "logps/chosen": -56.56785583496094, + "logps/rejected": -80.2679214477539, + "loss": 0.7098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.809854507446289, + "rewards/margins": 5.144346237182617, + "rewards/rejected": -2.334491729736328, + "step": 3323 + }, + { + "epoch": 0.83, + "grad_norm": 3.0405800342559814, + "learning_rate": 4.150111340134353e-06, + "logits/chosen": -0.34444373846054077, + "logits/rejected": -0.4527534246444702, + "logps/chosen": -48.803470611572266, + "logps/rejected": -84.34175872802734, + "loss": 0.6769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.873624563217163, + "rewards/margins": 5.311270713806152, + "rewards/rejected": -2.4376463890075684, + "step": 3324 + }, + { + "epoch": 0.83, + "grad_norm": 4.653281211853027, + "learning_rate": 4.147529839251818e-06, + "logits/chosen": -0.29665839672088623, + "logits/rejected": -0.39503195881843567, + "logps/chosen": -57.12781524658203, + "logps/rejected": -73.9998779296875, + "loss": 0.7368, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8662819862365723, + "rewards/margins": 4.785120964050293, + "rewards/rejected": -1.9188382625579834, + "step": 3325 + }, + { + "epoch": 0.83, + "grad_norm": 2.644608497619629, + "learning_rate": 4.1449485723907375e-06, + "logits/chosen": -0.3162221312522888, + "logits/rejected": -0.3452519476413727, + "logps/chosen": -58.98676681518555, + "logps/rejected": -105.34900665283203, + "loss": 0.634, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.961045503616333, + "rewards/margins": 4.843053817749023, + "rewards/rejected": -1.8820079565048218, + "step": 3326 + }, + { + "epoch": 0.83, + "grad_norm": 4.741249084472656, + "learning_rate": 4.142367540259724e-06, + "logits/chosen": -0.28156334161758423, + "logits/rejected": -0.42765843868255615, + "logps/chosen": -51.075782775878906, + "logps/rejected": -83.42942810058594, + "loss": 0.6966, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031745672225952, + "rewards/margins": 4.929967880249023, + "rewards/rejected": -1.8982219696044922, + "step": 3327 + }, + { + "epoch": 0.83, + "grad_norm": 8.15688419342041, + "learning_rate": 4.139786743567328e-06, + "logits/chosen": -0.28568604588508606, + "logits/rejected": -0.3851662874221802, + "logps/chosen": -60.16222381591797, + "logps/rejected": -101.53111267089844, + "loss": 0.8036, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.875223159790039, + "rewards/margins": 4.7840399742126465, + "rewards/rejected": -1.9088176488876343, + "step": 3328 + }, + { + "epoch": 0.83, + "grad_norm": 5.811701774597168, + "learning_rate": 4.137206183022032e-06, + "logits/chosen": -0.2744927406311035, + "logits/rejected": -0.3539283275604248, + "logps/chosen": -47.129486083984375, + "logps/rejected": -90.19454956054688, + "loss": 0.7123, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5032894611358643, + "rewards/margins": 4.295154571533203, + "rewards/rejected": -1.7918651103973389, + "step": 3329 + }, + { + "epoch": 0.83, + "grad_norm": 5.234736919403076, + "learning_rate": 4.1346258593322555e-06, + "logits/chosen": -0.24197694659233093, + "logits/rejected": -0.34375685453414917, + "logps/chosen": -63.58612060546875, + "logps/rejected": -87.89051818847656, + "loss": 0.7861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.672685384750366, + "rewards/margins": 4.602415084838867, + "rewards/rejected": -1.92972993850708, + "step": 3330 + }, + { + "epoch": 0.83, + "grad_norm": 2.7138874530792236, + "learning_rate": 4.132045773206355e-06, + "logits/chosen": -0.26892825961112976, + "logits/rejected": -0.38654518127441406, + "logps/chosen": -51.02491760253906, + "logps/rejected": -87.69758605957031, + "loss": 0.6457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8961973190307617, + "rewards/margins": 5.076758861541748, + "rewards/rejected": -2.1805615425109863, + "step": 3331 + }, + { + "epoch": 0.83, + "grad_norm": 4.94822883605957, + "learning_rate": 4.129465925352619e-06, + "logits/chosen": -0.33047616481781006, + "logits/rejected": -0.44278019666671753, + "logps/chosen": -47.47076416015625, + "logps/rejected": -75.64006042480469, + "loss": 0.7742, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.531737804412842, + "rewards/margins": 4.191462516784668, + "rewards/rejected": -1.6597247123718262, + "step": 3332 + }, + { + "epoch": 0.83, + "grad_norm": 1.8934412002563477, + "learning_rate": 4.126886316479271e-06, + "logits/chosen": -0.31449639797210693, + "logits/rejected": -0.4542529881000519, + "logps/chosen": -52.66801452636719, + "logps/rejected": -72.4451904296875, + "loss": 0.6358, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.285160779953003, + "rewards/margins": 5.923716068267822, + "rewards/rejected": -2.6385555267333984, + "step": 3333 + }, + { + "epoch": 0.83, + "grad_norm": 4.011206150054932, + "learning_rate": 4.1243069472944705e-06, + "logits/chosen": -0.26246604323387146, + "logits/rejected": -0.34078824520111084, + "logps/chosen": -66.05714416503906, + "logps/rejected": -90.42951965332031, + "loss": 0.779, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7830429077148438, + "rewards/margins": 3.700894355773926, + "rewards/rejected": -0.917851448059082, + "step": 3334 + }, + { + "epoch": 0.83, + "grad_norm": 3.7713770866394043, + "learning_rate": 4.121727818506307e-06, + "logits/chosen": -0.3552249073982239, + "logits/rejected": -0.505845844745636, + "logps/chosen": -55.390071868896484, + "logps/rejected": -76.15563201904297, + "loss": 0.683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8308022022247314, + "rewards/margins": 4.718406677246094, + "rewards/rejected": -1.8876041173934937, + "step": 3335 + }, + { + "epoch": 0.83, + "grad_norm": 7.912765979766846, + "learning_rate": 4.11914893082281e-06, + "logits/chosen": -0.2317667156457901, + "logits/rejected": -0.3868582248687744, + "logps/chosen": -63.53472137451172, + "logps/rejected": -74.71782684326172, + "loss": 0.8406, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8426947593688965, + "rewards/margins": 3.987844228744507, + "rewards/rejected": -1.1451497077941895, + "step": 3336 + }, + { + "epoch": 0.83, + "grad_norm": 6.955915927886963, + "learning_rate": 4.116570284951938e-06, + "logits/chosen": -0.3167744576931, + "logits/rejected": -0.3695457875728607, + "logps/chosen": -60.72956848144531, + "logps/rejected": -86.98652648925781, + "loss": 0.9019, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3895633220672607, + "rewards/margins": 3.632580041885376, + "rewards/rejected": -1.2430164813995361, + "step": 3337 + }, + { + "epoch": 0.84, + "grad_norm": 5.410691261291504, + "learning_rate": 4.113991881601586e-06, + "logits/chosen": -0.2898018956184387, + "logits/rejected": -0.3937774896621704, + "logps/chosen": -64.8072280883789, + "logps/rejected": -79.28458404541016, + "loss": 0.8347, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.643878698348999, + "rewards/margins": 4.075623512268066, + "rewards/rejected": -1.4317444562911987, + "step": 3338 + }, + { + "epoch": 0.84, + "grad_norm": 8.43017864227295, + "learning_rate": 4.111413721479581e-06, + "logits/chosen": -0.29944688081741333, + "logits/rejected": -0.4044466018676758, + "logps/chosen": -63.46845626831055, + "logps/rejected": -84.71491241455078, + "loss": 0.8303, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7777469158172607, + "rewards/margins": 4.637063503265381, + "rewards/rejected": -1.8593162298202515, + "step": 3339 + }, + { + "epoch": 0.84, + "grad_norm": 4.786588191986084, + "learning_rate": 4.108835805293684e-06, + "logits/chosen": -0.4052220582962036, + "logits/rejected": -0.4764496684074402, + "logps/chosen": -45.23675537109375, + "logps/rejected": -75.1406478881836, + "loss": 0.8525, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.876636505126953, + "rewards/margins": 3.0627264976501465, + "rewards/rejected": -0.18608993291854858, + "step": 3340 + }, + { + "epoch": 0.84, + "grad_norm": 4.7003655433654785, + "learning_rate": 4.106258133751588e-06, + "logits/chosen": -0.37240591645240784, + "logits/rejected": -0.4714084267616272, + "logps/chosen": -54.19984436035156, + "logps/rejected": -68.65509796142578, + "loss": 0.8152, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9315223693847656, + "rewards/margins": 4.1163330078125, + "rewards/rejected": -1.1848101615905762, + "step": 3341 + }, + { + "epoch": 0.84, + "grad_norm": 11.486217498779297, + "learning_rate": 4.103680707560919e-06, + "logits/chosen": -0.3053732216358185, + "logits/rejected": -0.4762319326400757, + "logps/chosen": -57.00684356689453, + "logps/rejected": -81.26296997070312, + "loss": 0.6641, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9159419536590576, + "rewards/margins": 5.365809440612793, + "rewards/rejected": -2.449866771697998, + "step": 3342 + }, + { + "epoch": 0.84, + "grad_norm": 3.2816996574401855, + "learning_rate": 4.1011035274292375e-06, + "logits/chosen": -0.34923744201660156, + "logits/rejected": -0.4462534189224243, + "logps/chosen": -55.67559051513672, + "logps/rejected": -80.52315521240234, + "loss": 0.6729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9884259700775146, + "rewards/margins": 4.556291103363037, + "rewards/rejected": -1.5678651332855225, + "step": 3343 + }, + { + "epoch": 0.84, + "grad_norm": 8.552159309387207, + "learning_rate": 4.098526594064036e-06, + "logits/chosen": -0.292147696018219, + "logits/rejected": -0.32206442952156067, + "logps/chosen": -51.489017486572266, + "logps/rejected": -87.84791564941406, + "loss": 0.8885, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.905395984649658, + "rewards/margins": 3.702124834060669, + "rewards/rejected": -0.7967286705970764, + "step": 3344 + }, + { + "epoch": 0.84, + "grad_norm": 5.146547794342041, + "learning_rate": 4.095949908172734e-06, + "logits/chosen": -0.27848073840141296, + "logits/rejected": -0.35620564222335815, + "logps/chosen": -61.066646575927734, + "logps/rejected": -87.19287872314453, + "loss": 0.889, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7028005123138428, + "rewards/margins": 4.483572959899902, + "rewards/rejected": -1.78077232837677, + "step": 3345 + }, + { + "epoch": 0.84, + "grad_norm": 4.699113845825195, + "learning_rate": 4.093373470462692e-06, + "logits/chosen": -0.32462018728256226, + "logits/rejected": -0.40737923979759216, + "logps/chosen": -54.87642288208008, + "logps/rejected": -88.46554565429688, + "loss": 0.66, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8621585369110107, + "rewards/margins": 5.156276226043701, + "rewards/rejected": -2.2941176891326904, + "step": 3346 + }, + { + "epoch": 0.84, + "grad_norm": 4.128557205200195, + "learning_rate": 4.090797281641193e-06, + "logits/chosen": -0.37769949436187744, + "logits/rejected": -0.46476492285728455, + "logps/chosen": -48.556705474853516, + "logps/rejected": -72.929443359375, + "loss": 0.7904, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5767602920532227, + "rewards/margins": 4.372598648071289, + "rewards/rejected": -1.795838475227356, + "step": 3347 + }, + { + "epoch": 0.84, + "grad_norm": 4.8613715171813965, + "learning_rate": 4.0882213424154635e-06, + "logits/chosen": -0.3725219666957855, + "logits/rejected": -0.44933968782424927, + "logps/chosen": -57.54050827026367, + "logps/rejected": -82.11798858642578, + "loss": 0.9887, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8270063400268555, + "rewards/margins": 3.8482165336608887, + "rewards/rejected": -1.0212106704711914, + "step": 3348 + }, + { + "epoch": 0.84, + "grad_norm": 6.660611152648926, + "learning_rate": 4.085645653492648e-06, + "logits/chosen": -0.24869635701179504, + "logits/rejected": -0.36024534702301025, + "logps/chosen": -67.5379867553711, + "logps/rejected": -79.61558532714844, + "loss": 0.7992, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7747199535369873, + "rewards/margins": 3.40901780128479, + "rewards/rejected": -0.6342976093292236, + "step": 3349 + }, + { + "epoch": 0.84, + "grad_norm": 8.621200561523438, + "learning_rate": 4.08307021557983e-06, + "logits/chosen": -0.2891254425048828, + "logits/rejected": -0.4241340756416321, + "logps/chosen": -61.0968132019043, + "logps/rejected": -80.3844223022461, + "loss": 0.823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6245343685150146, + "rewards/margins": 4.016127109527588, + "rewards/rejected": -1.3915927410125732, + "step": 3350 + }, + { + "epoch": 0.84, + "grad_norm": 5.151454925537109, + "learning_rate": 4.080495029384024e-06, + "logits/chosen": -0.19795726239681244, + "logits/rejected": -0.3258003294467926, + "logps/chosen": -68.47456359863281, + "logps/rejected": -80.93769836425781, + "loss": 0.8438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.965930700302124, + "rewards/margins": 3.53824782371521, + "rewards/rejected": -0.572317361831665, + "step": 3351 + }, + { + "epoch": 0.84, + "grad_norm": 10.300162315368652, + "learning_rate": 4.077920095612174e-06, + "logits/chosen": -0.25790417194366455, + "logits/rejected": -0.30533096194267273, + "logps/chosen": -61.27796936035156, + "logps/rejected": -94.71955871582031, + "loss": 0.8686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.741572141647339, + "rewards/margins": 4.217671871185303, + "rewards/rejected": -1.4760993719100952, + "step": 3352 + }, + { + "epoch": 0.84, + "grad_norm": 5.876389503479004, + "learning_rate": 4.075345414971155e-06, + "logits/chosen": -0.3284982442855835, + "logits/rejected": -0.4136679768562317, + "logps/chosen": -45.168006896972656, + "logps/rejected": -79.08447265625, + "loss": 0.803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9039618968963623, + "rewards/margins": 4.454245090484619, + "rewards/rejected": -1.5502827167510986, + "step": 3353 + }, + { + "epoch": 0.84, + "grad_norm": 4.881089687347412, + "learning_rate": 4.072770988167773e-06, + "logits/chosen": -0.31415048241615295, + "logits/rejected": -0.38038355112075806, + "logps/chosen": -53.23414993286133, + "logps/rejected": -90.47230529785156, + "loss": 0.7199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019731044769287, + "rewards/margins": 4.976905822753906, + "rewards/rejected": -1.957174301147461, + "step": 3354 + }, + { + "epoch": 0.84, + "grad_norm": 4.6863508224487305, + "learning_rate": 4.07019681590876e-06, + "logits/chosen": -0.38230469822883606, + "logits/rejected": -0.38286346197128296, + "logps/chosen": -49.14556884765625, + "logps/rejected": -86.15510559082031, + "loss": 0.824, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8319251537323, + "rewards/margins": 3.6123642921447754, + "rewards/rejected": -0.780439019203186, + "step": 3355 + }, + { + "epoch": 0.84, + "grad_norm": 6.435615539550781, + "learning_rate": 4.067622898900788e-06, + "logits/chosen": -0.3812143802642822, + "logits/rejected": -0.45457011461257935, + "logps/chosen": -49.294944763183594, + "logps/rejected": -78.09832763671875, + "loss": 0.6849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8149430751800537, + "rewards/margins": 4.440464973449707, + "rewards/rejected": -1.6255221366882324, + "step": 3356 + }, + { + "epoch": 0.84, + "grad_norm": 11.78372573852539, + "learning_rate": 4.0650492378504455e-06, + "logits/chosen": -0.3291940987110138, + "logits/rejected": -0.4172162711620331, + "logps/chosen": -66.46858215332031, + "logps/rejected": -67.58089447021484, + "loss": 1.0398, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7152099609375, + "rewards/margins": 2.8659584522247314, + "rewards/rejected": -0.1507485806941986, + "step": 3357 + }, + { + "epoch": 0.84, + "grad_norm": 7.680607795715332, + "learning_rate": 4.062475833464265e-06, + "logits/chosen": -0.3106101155281067, + "logits/rejected": -0.4109709858894348, + "logps/chosen": -63.57318115234375, + "logps/rejected": -90.24111938476562, + "loss": 0.8581, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.82674503326416, + "rewards/margins": 4.40289306640625, + "rewards/rejected": -1.5761483907699585, + "step": 3358 + }, + { + "epoch": 0.84, + "grad_norm": 5.930662631988525, + "learning_rate": 4.059902686448698e-06, + "logits/chosen": -0.32416272163391113, + "logits/rejected": -0.3914717137813568, + "logps/chosen": -64.91802215576172, + "logps/rejected": -88.13812255859375, + "loss": 0.8939, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5169517993927, + "rewards/margins": 3.553201675415039, + "rewards/rejected": -1.0362502336502075, + "step": 3359 + }, + { + "epoch": 0.84, + "grad_norm": 11.462692260742188, + "learning_rate": 4.057329797510128e-06, + "logits/chosen": -0.4112272262573242, + "logits/rejected": -0.5069305896759033, + "logps/chosen": -52.125511169433594, + "logps/rejected": -84.26966094970703, + "loss": 0.7983, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7060039043426514, + "rewards/margins": 4.786389350891113, + "rewards/rejected": -2.080385446548462, + "step": 3360 + }, + { + "epoch": 0.84, + "grad_norm": 4.525661468505859, + "learning_rate": 4.05475716735487e-06, + "logits/chosen": -0.3488124907016754, + "logits/rejected": -0.470163494348526, + "logps/chosen": -53.845237731933594, + "logps/rejected": -92.58377075195312, + "loss": 0.6502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7280080318450928, + "rewards/margins": 5.579023361206055, + "rewards/rejected": -2.851015329360962, + "step": 3361 + }, + { + "epoch": 0.84, + "grad_norm": 3.89447283744812, + "learning_rate": 4.052184796689166e-06, + "logits/chosen": -0.3249996602535248, + "logits/rejected": -0.38002413511276245, + "logps/chosen": -56.72969055175781, + "logps/rejected": -102.92124938964844, + "loss": 0.7034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5305700302124023, + "rewards/margins": 4.555217266082764, + "rewards/rejected": -2.024646759033203, + "step": 3362 + }, + { + "epoch": 0.84, + "grad_norm": 4.892177104949951, + "learning_rate": 4.049612686219189e-06, + "logits/chosen": -0.24913664162158966, + "logits/rejected": -0.34626320004463196, + "logps/chosen": -56.83365249633789, + "logps/rejected": -92.46136474609375, + "loss": 0.8232, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5571129322052, + "rewards/margins": 4.153261184692383, + "rewards/rejected": -1.5961487293243408, + "step": 3363 + }, + { + "epoch": 0.84, + "grad_norm": 5.772363185882568, + "learning_rate": 4.047040836651037e-06, + "logits/chosen": -0.4149855375289917, + "logits/rejected": -0.4390837550163269, + "logps/chosen": -53.06031799316406, + "logps/rejected": -89.14863586425781, + "loss": 0.7604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9893369674682617, + "rewards/margins": 4.77593469619751, + "rewards/rejected": -1.786597728729248, + "step": 3364 + }, + { + "epoch": 0.84, + "grad_norm": 8.776297569274902, + "learning_rate": 4.044469248690737e-06, + "logits/chosen": -0.3533290922641754, + "logits/rejected": -0.4745543599128723, + "logps/chosen": -49.011104583740234, + "logps/rejected": -80.04396057128906, + "loss": 0.6189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6853835582733154, + "rewards/margins": 5.158247947692871, + "rewards/rejected": -2.4728646278381348, + "step": 3365 + }, + { + "epoch": 0.84, + "grad_norm": 5.974641799926758, + "learning_rate": 4.041897923044249e-06, + "logits/chosen": -0.3586554229259491, + "logits/rejected": -0.5359422564506531, + "logps/chosen": -61.52168273925781, + "logps/rejected": -65.5470962524414, + "loss": 0.7188, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0999808311462402, + "rewards/margins": 4.850979804992676, + "rewards/rejected": -1.7509995698928833, + "step": 3366 + }, + { + "epoch": 0.84, + "grad_norm": 4.266636848449707, + "learning_rate": 4.0393268604174535e-06, + "logits/chosen": -0.2778058648109436, + "logits/rejected": -0.3513715863227844, + "logps/chosen": -61.0274658203125, + "logps/rejected": -92.77261352539062, + "loss": 0.6877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9155526161193848, + "rewards/margins": 4.831392288208008, + "rewards/rejected": -1.9158393144607544, + "step": 3367 + }, + { + "epoch": 0.84, + "grad_norm": 6.67026424407959, + "learning_rate": 4.036756061516166e-06, + "logits/chosen": -0.2802722454071045, + "logits/rejected": -0.43249937891960144, + "logps/chosen": -69.58187866210938, + "logps/rejected": -76.76844787597656, + "loss": 0.841, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6251206398010254, + "rewards/margins": 3.5468618869781494, + "rewards/rejected": -0.921741247177124, + "step": 3368 + }, + { + "epoch": 0.84, + "grad_norm": 4.118321895599365, + "learning_rate": 4.034185527046125e-06, + "logits/chosen": -0.32990995049476624, + "logits/rejected": -0.3999679684638977, + "logps/chosen": -61.867576599121094, + "logps/rejected": -87.32656860351562, + "loss": 0.6442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.134193181991577, + "rewards/margins": 5.1705827713012695, + "rewards/rejected": -2.0363893508911133, + "step": 3369 + }, + { + "epoch": 0.84, + "grad_norm": 6.086511135101318, + "learning_rate": 4.031615257712996e-06, + "logits/chosen": -0.3245519697666168, + "logits/rejected": -0.4208415746688843, + "logps/chosen": -55.42348861694336, + "logps/rejected": -70.45304870605469, + "loss": 0.8276, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.843240261077881, + "rewards/margins": 4.136701583862305, + "rewards/rejected": -1.293461799621582, + "step": 3370 + }, + { + "epoch": 0.84, + "grad_norm": 4.436947345733643, + "learning_rate": 4.0290452542223766e-06, + "logits/chosen": -0.31825074553489685, + "logits/rejected": -0.3768029808998108, + "logps/chosen": -60.87152862548828, + "logps/rejected": -83.3409652709961, + "loss": 0.7174, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0537314414978027, + "rewards/margins": 4.107483863830566, + "rewards/rejected": -1.0537524223327637, + "step": 3371 + }, + { + "epoch": 0.84, + "grad_norm": 6.14866304397583, + "learning_rate": 4.026475517279785e-06, + "logits/chosen": -0.3196459114551544, + "logits/rejected": -0.4072754979133606, + "logps/chosen": -52.7575569152832, + "logps/rejected": -77.3038101196289, + "loss": 0.8713, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3758351802825928, + "rewards/margins": 3.970146656036377, + "rewards/rejected": -1.5943113565444946, + "step": 3372 + }, + { + "epoch": 0.84, + "grad_norm": 5.922863960266113, + "learning_rate": 4.023906047590671e-06, + "logits/chosen": -0.40086525678634644, + "logits/rejected": -0.5058702826499939, + "logps/chosen": -71.3837661743164, + "logps/rejected": -76.7750244140625, + "loss": 0.935, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8011369705200195, + "rewards/margins": 4.792013645172119, + "rewards/rejected": -1.9908766746520996, + "step": 3373 + }, + { + "epoch": 0.84, + "grad_norm": 5.160863876342773, + "learning_rate": 4.021336845860409e-06, + "logits/chosen": -0.4085557758808136, + "logits/rejected": -0.5258634686470032, + "logps/chosen": -50.24049377441406, + "logps/rejected": -78.27701568603516, + "loss": 0.7016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094423770904541, + "rewards/margins": 5.197552680969238, + "rewards/rejected": -2.10312819480896, + "step": 3374 + }, + { + "epoch": 0.84, + "grad_norm": 7.53198766708374, + "learning_rate": 4.018767912794302e-06, + "logits/chosen": -0.2963959574699402, + "logits/rejected": -0.46420586109161377, + "logps/chosen": -60.45198059082031, + "logps/rejected": -66.5620346069336, + "loss": 0.7507, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.548506498336792, + "rewards/margins": 4.155868053436279, + "rewards/rejected": -1.607361078262329, + "step": 3375 + }, + { + "epoch": 0.84, + "grad_norm": 8.077814102172852, + "learning_rate": 4.0161992490975756e-06, + "logits/chosen": -0.32793480157852173, + "logits/rejected": -0.44498977065086365, + "logps/chosen": -60.48921585083008, + "logps/rejected": -77.39283752441406, + "loss": 0.9789, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7042534351348877, + "rewards/margins": 4.217395305633545, + "rewards/rejected": -1.5131416320800781, + "step": 3376 + }, + { + "epoch": 0.84, + "grad_norm": 5.367419719696045, + "learning_rate": 4.0136308554753835e-06, + "logits/chosen": -0.3748238980770111, + "logits/rejected": -0.48194214701652527, + "logps/chosen": -50.736698150634766, + "logps/rejected": -71.34417724609375, + "loss": 0.7161, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7894821166992188, + "rewards/margins": 4.312671184539795, + "rewards/rejected": -1.5231887102127075, + "step": 3377 + }, + { + "epoch": 0.85, + "grad_norm": 17.803003311157227, + "learning_rate": 4.011062732632808e-06, + "logits/chosen": -0.2947288751602173, + "logits/rejected": -0.4489964246749878, + "logps/chosen": -57.36589813232422, + "logps/rejected": -68.46285247802734, + "loss": 0.8398, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.644984722137451, + "rewards/margins": 4.41335391998291, + "rewards/rejected": -1.7683697938919067, + "step": 3378 + }, + { + "epoch": 0.85, + "grad_norm": 4.874030590057373, + "learning_rate": 4.008494881274848e-06, + "logits/chosen": -0.37770724296569824, + "logits/rejected": -0.4615169167518616, + "logps/chosen": -47.78837966918945, + "logps/rejected": -77.20829010009766, + "loss": 0.711, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.831968069076538, + "rewards/margins": 4.157811164855957, + "rewards/rejected": -1.3258432149887085, + "step": 3379 + }, + { + "epoch": 0.85, + "grad_norm": 5.415175437927246, + "learning_rate": 4.005927302106442e-06, + "logits/chosen": -0.3808458745479584, + "logits/rejected": -0.4430771470069885, + "logps/chosen": -51.54395294189453, + "logps/rejected": -93.1248779296875, + "loss": 0.7381, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8217999935150146, + "rewards/margins": 4.712202072143555, + "rewards/rejected": -1.8904023170471191, + "step": 3380 + }, + { + "epoch": 0.85, + "grad_norm": 19.7506160736084, + "learning_rate": 4.003359995832441e-06, + "logits/chosen": -0.3467826545238495, + "logits/rejected": -0.35840386152267456, + "logps/chosen": -62.375877380371094, + "logps/rejected": -83.74520874023438, + "loss": 0.9789, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5889344215393066, + "rewards/margins": 2.845055341720581, + "rewards/rejected": -0.2561207115650177, + "step": 3381 + }, + { + "epoch": 0.85, + "grad_norm": 4.271739959716797, + "learning_rate": 4.000792963157626e-06, + "logits/chosen": -0.24584029614925385, + "logits/rejected": -0.364349365234375, + "logps/chosen": -60.70964050292969, + "logps/rejected": -70.54666900634766, + "loss": 0.7144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6998775005340576, + "rewards/margins": 3.756455898284912, + "rewards/rejected": -1.0565781593322754, + "step": 3382 + }, + { + "epoch": 0.85, + "grad_norm": 7.102136135101318, + "learning_rate": 3.998226204786704e-06, + "logits/chosen": -0.2186303734779358, + "logits/rejected": -0.31563490629196167, + "logps/chosen": -58.798465728759766, + "logps/rejected": -76.86709594726562, + "loss": 0.7905, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.131074905395508, + "rewards/margins": 3.27069091796875, + "rewards/rejected": -0.13961616158485413, + "step": 3383 + }, + { + "epoch": 0.85, + "grad_norm": 20.587501525878906, + "learning_rate": 3.995659721424305e-06, + "logits/chosen": -0.3294673562049866, + "logits/rejected": -0.3880752921104431, + "logps/chosen": -55.003299713134766, + "logps/rejected": -84.40373229980469, + "loss": 0.8514, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6356475353240967, + "rewards/margins": 4.033058166503906, + "rewards/rejected": -1.3974103927612305, + "step": 3384 + }, + { + "epoch": 0.85, + "grad_norm": 5.881772994995117, + "learning_rate": 3.993093513774985e-06, + "logits/chosen": -0.36272791028022766, + "logits/rejected": -0.434571236371994, + "logps/chosen": -61.44660949707031, + "logps/rejected": -85.5503921508789, + "loss": 0.7355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9845547676086426, + "rewards/margins": 4.251899719238281, + "rewards/rejected": -1.2673450708389282, + "step": 3385 + }, + { + "epoch": 0.85, + "grad_norm": 7.520561218261719, + "learning_rate": 3.990527582543223e-06, + "logits/chosen": -0.25562453269958496, + "logits/rejected": -0.4049248695373535, + "logps/chosen": -64.43286895751953, + "logps/rejected": -71.82527923583984, + "loss": 0.717, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.665205955505371, + "rewards/margins": 3.9544565677642822, + "rewards/rejected": -1.2892507314682007, + "step": 3386 + }, + { + "epoch": 0.85, + "grad_norm": 6.323330402374268, + "learning_rate": 3.987961928433421e-06, + "logits/chosen": -0.3271462321281433, + "logits/rejected": -0.44445720314979553, + "logps/chosen": -55.357261657714844, + "logps/rejected": -76.41256713867188, + "loss": 0.6042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.933676242828369, + "rewards/margins": 4.432621955871582, + "rewards/rejected": -1.4989458322525024, + "step": 3387 + }, + { + "epoch": 0.85, + "grad_norm": 16.450145721435547, + "learning_rate": 3.98539655214991e-06, + "logits/chosen": -0.31460368633270264, + "logits/rejected": -0.3851792812347412, + "logps/chosen": -48.835506439208984, + "logps/rejected": -88.87843322753906, + "loss": 0.7665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9507789611816406, + "rewards/margins": 5.187107086181641, + "rewards/rejected": -2.236327648162842, + "step": 3388 + }, + { + "epoch": 0.85, + "grad_norm": 6.026069164276123, + "learning_rate": 3.982831454396936e-06, + "logits/chosen": -0.33555567264556885, + "logits/rejected": -0.44625288248062134, + "logps/chosen": -49.72029495239258, + "logps/rejected": -66.62635803222656, + "loss": 0.7506, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.897571325302124, + "rewards/margins": 4.391274452209473, + "rewards/rejected": -1.4937031269073486, + "step": 3389 + }, + { + "epoch": 0.85, + "grad_norm": 3.410661220550537, + "learning_rate": 3.98026663587868e-06, + "logits/chosen": -0.29305166006088257, + "logits/rejected": -0.413607656955719, + "logps/chosen": -49.95259094238281, + "logps/rejected": -80.30154418945312, + "loss": 0.6561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.997290849685669, + "rewards/margins": 4.868797302246094, + "rewards/rejected": -1.8715059757232666, + "step": 3390 + }, + { + "epoch": 0.85, + "grad_norm": 6.199704647064209, + "learning_rate": 3.977702097299235e-06, + "logits/chosen": -0.45549824833869934, + "logits/rejected": -0.4892239570617676, + "logps/chosen": -54.0549430847168, + "logps/rejected": -95.16967010498047, + "loss": 0.8306, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6361494064331055, + "rewards/margins": 3.995739459991455, + "rewards/rejected": -1.359589695930481, + "step": 3391 + }, + { + "epoch": 0.85, + "grad_norm": 18.776958465576172, + "learning_rate": 3.9751378393626224e-06, + "logits/chosen": -0.3086016774177551, + "logits/rejected": -0.5112230777740479, + "logps/chosen": -55.458866119384766, + "logps/rejected": -68.01343536376953, + "loss": 0.7217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3763506412506104, + "rewards/margins": 4.787423610687256, + "rewards/rejected": -2.4110729694366455, + "step": 3392 + }, + { + "epoch": 0.85, + "grad_norm": 6.627609729766846, + "learning_rate": 3.972573862772789e-06, + "logits/chosen": -0.3203366994857788, + "logits/rejected": -0.46938472986221313, + "logps/chosen": -70.52804565429688, + "logps/rejected": -79.69235229492188, + "loss": 0.7956, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8863418102264404, + "rewards/margins": 4.627599716186523, + "rewards/rejected": -1.741257667541504, + "step": 3393 + }, + { + "epoch": 0.85, + "grad_norm": 3.8142480850219727, + "learning_rate": 3.9700101682335995e-06, + "logits/chosen": -0.25641849637031555, + "logits/rejected": -0.37237393856048584, + "logps/chosen": -59.06256103515625, + "logps/rejected": -92.17738342285156, + "loss": 0.7271, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7029776573181152, + "rewards/margins": 5.394498348236084, + "rewards/rejected": -2.691521167755127, + "step": 3394 + }, + { + "epoch": 0.85, + "grad_norm": 3.4298367500305176, + "learning_rate": 3.967446756448845e-06, + "logits/chosen": -0.2803874611854553, + "logits/rejected": -0.3932644724845886, + "logps/chosen": -59.1992073059082, + "logps/rejected": -86.19258880615234, + "loss": 0.6907, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9353675842285156, + "rewards/margins": 5.146238327026367, + "rewards/rejected": -2.2108707427978516, + "step": 3395 + }, + { + "epoch": 0.85, + "grad_norm": 2.7991323471069336, + "learning_rate": 3.964883628122237e-06, + "logits/chosen": -0.3020744025707245, + "logits/rejected": -0.44595545530319214, + "logps/chosen": -58.053802490234375, + "logps/rejected": -89.45480346679688, + "loss": 0.6778, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0200600624084473, + "rewards/margins": 5.505305290222168, + "rewards/rejected": -2.4852449893951416, + "step": 3396 + }, + { + "epoch": 0.85, + "grad_norm": 3.5016894340515137, + "learning_rate": 3.962320783957407e-06, + "logits/chosen": -0.3012779653072357, + "logits/rejected": -0.4117613136768341, + "logps/chosen": -56.61344909667969, + "logps/rejected": -72.03958129882812, + "loss": 0.6818, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.15594482421875, + "rewards/margins": 4.868605136871338, + "rewards/rejected": -1.7126604318618774, + "step": 3397 + }, + { + "epoch": 0.85, + "grad_norm": 2.7800843715667725, + "learning_rate": 3.959758224657916e-06, + "logits/chosen": -0.2963286340236664, + "logits/rejected": -0.351695716381073, + "logps/chosen": -53.644161224365234, + "logps/rejected": -83.71084594726562, + "loss": 0.6341, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9678902626037598, + "rewards/margins": 5.131593704223633, + "rewards/rejected": -2.163703203201294, + "step": 3398 + }, + { + "epoch": 0.85, + "grad_norm": 4.65484619140625, + "learning_rate": 3.957195950927236e-06, + "logits/chosen": -0.3947924077510834, + "logits/rejected": -0.44542911648750305, + "logps/chosen": -58.66464614868164, + "logps/rejected": -82.94842529296875, + "loss": 0.7835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.08567214012146, + "rewards/margins": 4.05739688873291, + "rewards/rejected": -0.9717249274253845, + "step": 3399 + }, + { + "epoch": 0.85, + "grad_norm": 3.8868422508239746, + "learning_rate": 3.954633963468772e-06, + "logits/chosen": -0.3533472716808319, + "logits/rejected": -0.39501523971557617, + "logps/chosen": -54.735172271728516, + "logps/rejected": -85.27769470214844, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9472463130950928, + "rewards/margins": 4.7949066162109375, + "rewards/rejected": -1.8476604223251343, + "step": 3400 + }, + { + "epoch": 0.85, + "grad_norm": 6.8223652839660645, + "learning_rate": 3.952072262985842e-06, + "logits/chosen": -0.39148154854774475, + "logits/rejected": -0.4985395669937134, + "logps/chosen": -64.96839141845703, + "logps/rejected": -90.57988739013672, + "loss": 0.9452, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9703032970428467, + "rewards/margins": 4.637556552886963, + "rewards/rejected": -1.6672532558441162, + "step": 3401 + }, + { + "epoch": 0.85, + "grad_norm": 22.848012924194336, + "learning_rate": 3.949510850181686e-06, + "logits/chosen": -0.24843832850456238, + "logits/rejected": -0.31142735481262207, + "logps/chosen": -65.24899291992188, + "logps/rejected": -78.90441131591797, + "loss": 0.913, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9298996925354004, + "rewards/margins": 3.2287588119506836, + "rewards/rejected": -0.2988589406013489, + "step": 3402 + }, + { + "epoch": 0.85, + "grad_norm": 6.169217109680176, + "learning_rate": 3.946949725759473e-06, + "logits/chosen": -0.33614465594291687, + "logits/rejected": -0.3945074677467346, + "logps/chosen": -53.465946197509766, + "logps/rejected": -87.61822509765625, + "loss": 0.7716, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.761523485183716, + "rewards/margins": 5.540320873260498, + "rewards/rejected": -2.7787973880767822, + "step": 3403 + }, + { + "epoch": 0.85, + "grad_norm": 6.773406028747559, + "learning_rate": 3.944388890422281e-06, + "logits/chosen": -0.21216139197349548, + "logits/rejected": -0.32240259647369385, + "logps/chosen": -59.77775955200195, + "logps/rejected": -70.67134094238281, + "loss": 0.8628, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0615828037261963, + "rewards/margins": 3.8446807861328125, + "rewards/rejected": -0.783098042011261, + "step": 3404 + }, + { + "epoch": 0.85, + "grad_norm": 10.646727561950684, + "learning_rate": 3.941828344873117e-06, + "logits/chosen": -0.252359539270401, + "logits/rejected": -0.28685081005096436, + "logps/chosen": -75.74647521972656, + "logps/rejected": -67.18168640136719, + "loss": 1.063, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.908144235610962, + "rewards/margins": 2.8879644870758057, + "rewards/rejected": 0.020179979503154755, + "step": 3405 + }, + { + "epoch": 0.85, + "grad_norm": 5.615136623382568, + "learning_rate": 3.939268089814907e-06, + "logits/chosen": -0.2571248412132263, + "logits/rejected": -0.3515778183937073, + "logps/chosen": -53.326683044433594, + "logps/rejected": -82.92935180664062, + "loss": 0.7361, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.009572744369507, + "rewards/margins": 3.254514694213867, + "rewards/rejected": -0.24494236707687378, + "step": 3406 + }, + { + "epoch": 0.85, + "grad_norm": 4.434447288513184, + "learning_rate": 3.936708125950493e-06, + "logits/chosen": -0.3595748543739319, + "logits/rejected": -0.46671542525291443, + "logps/chosen": -65.54524230957031, + "logps/rejected": -81.93276977539062, + "loss": 0.7151, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7694501876831055, + "rewards/margins": 4.634866237640381, + "rewards/rejected": -1.8654158115386963, + "step": 3407 + }, + { + "epoch": 0.85, + "grad_norm": 5.354745864868164, + "learning_rate": 3.934148453982643e-06, + "logits/chosen": -0.3624608814716339, + "logits/rejected": -0.3763556182384491, + "logps/chosen": -57.626731872558594, + "logps/rejected": -92.55791473388672, + "loss": 0.899, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.71303129196167, + "rewards/margins": 3.62644100189209, + "rewards/rejected": -0.913409411907196, + "step": 3408 + }, + { + "epoch": 0.85, + "grad_norm": 2.9311444759368896, + "learning_rate": 3.931589074614042e-06, + "logits/chosen": -0.2773810625076294, + "logits/rejected": -0.4227041006088257, + "logps/chosen": -54.17102813720703, + "logps/rejected": -68.9804916381836, + "loss": 0.6069, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1301751136779785, + "rewards/margins": 4.8885626792907715, + "rewards/rejected": -1.7583876848220825, + "step": 3409 + }, + { + "epoch": 0.85, + "grad_norm": 9.748279571533203, + "learning_rate": 3.929029988547296e-06, + "logits/chosen": -0.34818875789642334, + "logits/rejected": -0.39810824394226074, + "logps/chosen": -40.55400466918945, + "logps/rejected": -86.38844299316406, + "loss": 0.657, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0823802947998047, + "rewards/margins": 5.160775184631348, + "rewards/rejected": -2.078394889831543, + "step": 3410 + }, + { + "epoch": 0.85, + "grad_norm": 8.938821792602539, + "learning_rate": 3.926471196484928e-06, + "logits/chosen": -0.31903693079948425, + "logits/rejected": -0.42131173610687256, + "logps/chosen": -56.181884765625, + "logps/rejected": -70.14913177490234, + "loss": 0.9234, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.767009973526001, + "rewards/margins": 3.8624444007873535, + "rewards/rejected": -1.0954347848892212, + "step": 3411 + }, + { + "epoch": 0.85, + "grad_norm": 5.309558868408203, + "learning_rate": 3.923912699129378e-06, + "logits/chosen": -0.27338922023773193, + "logits/rejected": -0.4363255798816681, + "logps/chosen": -58.89216995239258, + "logps/rejected": -77.47534942626953, + "loss": 0.7566, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.799215316772461, + "rewards/margins": 5.331908702850342, + "rewards/rejected": -2.53269362449646, + "step": 3412 + }, + { + "epoch": 0.85, + "grad_norm": 5.1578898429870605, + "learning_rate": 3.921354497183016e-06, + "logits/chosen": -0.32064008712768555, + "logits/rejected": -0.3767349421977997, + "logps/chosen": -57.44077682495117, + "logps/rejected": -85.42674255371094, + "loss": 0.7194, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.728883981704712, + "rewards/margins": 4.010683536529541, + "rewards/rejected": -1.281799554824829, + "step": 3413 + }, + { + "epoch": 0.85, + "grad_norm": 7.284006118774414, + "learning_rate": 3.918796591348117e-06, + "logits/chosen": -0.3940570056438446, + "logits/rejected": -0.46803873777389526, + "logps/chosen": -59.009620666503906, + "logps/rejected": -76.35772705078125, + "loss": 0.7669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6130764484405518, + "rewards/margins": 4.480632305145264, + "rewards/rejected": -1.867555856704712, + "step": 3414 + }, + { + "epoch": 0.85, + "grad_norm": 4.775644302368164, + "learning_rate": 3.916238982326886e-06, + "logits/chosen": -0.2952653169631958, + "logits/rejected": -0.4009947180747986, + "logps/chosen": -57.434146881103516, + "logps/rejected": -85.27944946289062, + "loss": 0.7513, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8917737007141113, + "rewards/margins": 4.297481060028076, + "rewards/rejected": -1.405707597732544, + "step": 3415 + }, + { + "epoch": 0.85, + "grad_norm": 5.649609088897705, + "learning_rate": 3.91368167082144e-06, + "logits/chosen": -0.2784387469291687, + "logits/rejected": -0.4085908830165863, + "logps/chosen": -59.61465072631836, + "logps/rejected": -79.62091064453125, + "loss": 0.7513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.117737054824829, + "rewards/margins": 4.908046722412109, + "rewards/rejected": -1.7903099060058594, + "step": 3416 + }, + { + "epoch": 0.85, + "grad_norm": 6.666743755340576, + "learning_rate": 3.911124657533814e-06, + "logits/chosen": -0.2997947335243225, + "logits/rejected": -0.40252479910850525, + "logps/chosen": -63.65355682373047, + "logps/rejected": -86.94236755371094, + "loss": 0.8718, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.580054759979248, + "rewards/margins": 4.535136699676514, + "rewards/rejected": -1.9550821781158447, + "step": 3417 + }, + { + "epoch": 0.86, + "grad_norm": 3.923257827758789, + "learning_rate": 3.908567943165968e-06, + "logits/chosen": -0.2539800703525543, + "logits/rejected": -0.3483607769012451, + "logps/chosen": -73.20854949951172, + "logps/rejected": -78.82576751708984, + "loss": 0.8076, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0429069995880127, + "rewards/margins": 3.957505702972412, + "rewards/rejected": -0.9145985245704651, + "step": 3418 + }, + { + "epoch": 0.86, + "grad_norm": 5.585988521575928, + "learning_rate": 3.906011528419772e-06, + "logits/chosen": -0.3104659914970398, + "logits/rejected": -0.3739687204360962, + "logps/chosen": -76.63793182373047, + "logps/rejected": -85.81416320800781, + "loss": 0.8998, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9273900985717773, + "rewards/margins": 4.2559638023376465, + "rewards/rejected": -1.3285741806030273, + "step": 3419 + }, + { + "epoch": 0.86, + "grad_norm": 6.175220966339111, + "learning_rate": 3.903455413997018e-06, + "logits/chosen": -0.26119571924209595, + "logits/rejected": -0.30183711647987366, + "logps/chosen": -58.56443786621094, + "logps/rejected": -93.19913482666016, + "loss": 0.8488, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8060355186462402, + "rewards/margins": 4.580651760101318, + "rewards/rejected": -1.7746162414550781, + "step": 3420 + }, + { + "epoch": 0.86, + "grad_norm": 5.859825611114502, + "learning_rate": 3.900899600599417e-06, + "logits/chosen": -0.40576857328414917, + "logits/rejected": -0.48429134488105774, + "logps/chosen": -50.45088195800781, + "logps/rejected": -78.4251708984375, + "loss": 0.747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7643821239471436, + "rewards/margins": 4.08051872253418, + "rewards/rejected": -1.316136121749878, + "step": 3421 + }, + { + "epoch": 0.86, + "grad_norm": 5.423647403717041, + "learning_rate": 3.898344088928587e-06, + "logits/chosen": -0.2490084022283554, + "logits/rejected": -0.3410794138908386, + "logps/chosen": -70.42078399658203, + "logps/rejected": -80.12371826171875, + "loss": 0.9599, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.833756923675537, + "rewards/margins": 4.263152599334717, + "rewards/rejected": -1.429395079612732, + "step": 3422 + }, + { + "epoch": 0.86, + "grad_norm": 5.394958972930908, + "learning_rate": 3.895788879686081e-06, + "logits/chosen": -0.32858771085739136, + "logits/rejected": -0.44490739703178406, + "logps/chosen": -59.421241760253906, + "logps/rejected": -72.87864685058594, + "loss": 0.7051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6532251834869385, + "rewards/margins": 4.724504470825195, + "rewards/rejected": -2.0712790489196777, + "step": 3423 + }, + { + "epoch": 0.86, + "grad_norm": 2.5375514030456543, + "learning_rate": 3.893233973573351e-06, + "logits/chosen": -0.34278738498687744, + "logits/rejected": -0.4473985433578491, + "logps/chosen": -46.44664764404297, + "logps/rejected": -69.61956024169922, + "loss": 0.6256, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0568666458129883, + "rewards/margins": 5.155766010284424, + "rewards/rejected": -2.0988998413085938, + "step": 3424 + }, + { + "epoch": 0.86, + "grad_norm": 5.1526899337768555, + "learning_rate": 3.8906793712917805e-06, + "logits/chosen": -0.3327134847640991, + "logits/rejected": -0.3884190022945404, + "logps/chosen": -58.92958450317383, + "logps/rejected": -78.69783782958984, + "loss": 0.8947, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.570338726043701, + "rewards/margins": 2.898451805114746, + "rewards/rejected": -0.32811325788497925, + "step": 3425 + }, + { + "epoch": 0.86, + "grad_norm": 2.312309503555298, + "learning_rate": 3.888125073542659e-06, + "logits/chosen": -0.2662968933582306, + "logits/rejected": -0.4463733434677124, + "logps/chosen": -65.49640655517578, + "logps/rejected": -76.92916870117188, + "loss": 0.6757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.042306661605835, + "rewards/margins": 5.346231937408447, + "rewards/rejected": -2.3039255142211914, + "step": 3426 + }, + { + "epoch": 0.86, + "grad_norm": 3.498697280883789, + "learning_rate": 3.885571081027196e-06, + "logits/chosen": -0.3691861033439636, + "logits/rejected": -0.5019162893295288, + "logps/chosen": -59.583717346191406, + "logps/rejected": -80.8743667602539, + "loss": 0.7154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7666404247283936, + "rewards/margins": 5.794564247131348, + "rewards/rejected": -3.027924060821533, + "step": 3427 + }, + { + "epoch": 0.86, + "grad_norm": 4.049836158752441, + "learning_rate": 3.88301739444652e-06, + "logits/chosen": -0.277714341878891, + "logits/rejected": -0.4065132141113281, + "logps/chosen": -64.09880828857422, + "logps/rejected": -77.26644134521484, + "loss": 0.7477, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.049968719482422, + "rewards/margins": 3.539463996887207, + "rewards/rejected": -0.48949530720710754, + "step": 3428 + }, + { + "epoch": 0.86, + "grad_norm": 8.487495422363281, + "learning_rate": 3.88046401450167e-06, + "logits/chosen": -0.3040195405483246, + "logits/rejected": -0.33986347913742065, + "logps/chosen": -59.220619201660156, + "logps/rejected": -84.59613037109375, + "loss": 0.8631, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6896090507507324, + "rewards/margins": 3.3136634826660156, + "rewards/rejected": -0.6240546703338623, + "step": 3429 + }, + { + "epoch": 0.86, + "grad_norm": 4.278233528137207, + "learning_rate": 3.8779109418936074e-06, + "logits/chosen": -0.2486463189125061, + "logits/rejected": -0.4270178973674774, + "logps/chosen": -66.44871520996094, + "logps/rejected": -76.88794708251953, + "loss": 0.713, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.767894744873047, + "rewards/margins": 5.251605987548828, + "rewards/rejected": -2.483710527420044, + "step": 3430 + }, + { + "epoch": 0.86, + "grad_norm": 4.869420528411865, + "learning_rate": 3.8753581773232045e-06, + "logits/chosen": -0.2943327724933624, + "logits/rejected": -0.4314563274383545, + "logps/chosen": -49.95556640625, + "logps/rejected": -71.00659942626953, + "loss": 0.7878, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.798168420791626, + "rewards/margins": 4.493934631347656, + "rewards/rejected": -1.695766568183899, + "step": 3431 + }, + { + "epoch": 0.86, + "grad_norm": 2.2340192794799805, + "learning_rate": 3.872805721491247e-06, + "logits/chosen": -0.21932452917099, + "logits/rejected": -0.4171297252178192, + "logps/chosen": -59.93279266357422, + "logps/rejected": -80.54713439941406, + "loss": 0.6419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8327016830444336, + "rewards/margins": 5.665917873382568, + "rewards/rejected": -2.8332161903381348, + "step": 3432 + }, + { + "epoch": 0.86, + "grad_norm": 5.43721866607666, + "learning_rate": 3.870253575098446e-06, + "logits/chosen": -0.3694063127040863, + "logits/rejected": -0.46572941541671753, + "logps/chosen": -50.564910888671875, + "logps/rejected": -87.4061279296875, + "loss": 0.7903, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7191858291625977, + "rewards/margins": 4.867195129394531, + "rewards/rejected": -2.1480090618133545, + "step": 3433 + }, + { + "epoch": 0.86, + "grad_norm": 3.0251550674438477, + "learning_rate": 3.867701738845412e-06, + "logits/chosen": -0.28689685463905334, + "logits/rejected": -0.4124877154827118, + "logps/chosen": -53.924320220947266, + "logps/rejected": -84.92166900634766, + "loss": 0.6395, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2006542682647705, + "rewards/margins": 4.453437805175781, + "rewards/rejected": -1.2527834177017212, + "step": 3434 + }, + { + "epoch": 0.86, + "grad_norm": 6.672414779663086, + "learning_rate": 3.865150213432688e-06, + "logits/chosen": -0.3009588420391083, + "logits/rejected": -0.3389435112476349, + "logps/chosen": -58.143985748291016, + "logps/rejected": -98.31915283203125, + "loss": 0.8123, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.066953182220459, + "rewards/margins": 4.28579044342041, + "rewards/rejected": -1.218837022781372, + "step": 3435 + }, + { + "epoch": 0.86, + "grad_norm": 2.7326529026031494, + "learning_rate": 3.862598999560717e-06, + "logits/chosen": -0.33434411883354187, + "logits/rejected": -0.47505488991737366, + "logps/chosen": -63.03047180175781, + "logps/rejected": -82.2012939453125, + "loss": 0.6229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.144955635070801, + "rewards/margins": 4.675856113433838, + "rewards/rejected": -1.5309003591537476, + "step": 3436 + }, + { + "epoch": 0.86, + "grad_norm": 5.244316577911377, + "learning_rate": 3.860048097929862e-06, + "logits/chosen": -0.34571316838264465, + "logits/rejected": -0.3416616916656494, + "logps/chosen": -49.654197692871094, + "logps/rejected": -88.04381561279297, + "loss": 0.7238, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8109469413757324, + "rewards/margins": 4.074038505554199, + "rewards/rejected": -1.2630913257598877, + "step": 3437 + }, + { + "epoch": 0.86, + "grad_norm": 4.628462791442871, + "learning_rate": 3.857497509240404e-06, + "logits/chosen": -0.2534717321395874, + "logits/rejected": -0.4173694849014282, + "logps/chosen": -56.93363571166992, + "logps/rejected": -78.22472381591797, + "loss": 0.647, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8453338146209717, + "rewards/margins": 4.654468059539795, + "rewards/rejected": -1.8091340065002441, + "step": 3438 + }, + { + "epoch": 0.86, + "grad_norm": 6.049118518829346, + "learning_rate": 3.8549472341925325e-06, + "logits/chosen": -0.28341948986053467, + "logits/rejected": -0.36095407605171204, + "logps/chosen": -66.69203186035156, + "logps/rejected": -85.11327362060547, + "loss": 0.9375, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.858828544616699, + "rewards/margins": 3.0079712867736816, + "rewards/rejected": -0.14914260804653168, + "step": 3439 + }, + { + "epoch": 0.86, + "grad_norm": 5.080698490142822, + "learning_rate": 3.852397273486354e-06, + "logits/chosen": -0.3364952504634857, + "logits/rejected": -0.45448118448257446, + "logps/chosen": -58.66551208496094, + "logps/rejected": -80.03038787841797, + "loss": 0.8848, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6872947216033936, + "rewards/margins": 4.337760925292969, + "rewards/rejected": -1.6504662036895752, + "step": 3440 + }, + { + "epoch": 0.86, + "grad_norm": 3.3792505264282227, + "learning_rate": 3.849847627821888e-06, + "logits/chosen": -0.3490430414676666, + "logits/rejected": -0.43291884660720825, + "logps/chosen": -51.388999938964844, + "logps/rejected": -74.94188690185547, + "loss": 0.686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7503201961517334, + "rewards/margins": 5.118420124053955, + "rewards/rejected": -2.3680999279022217, + "step": 3441 + }, + { + "epoch": 0.86, + "grad_norm": 1.839744210243225, + "learning_rate": 3.847298297899064e-06, + "logits/chosen": -0.3184376358985901, + "logits/rejected": -0.46795910596847534, + "logps/chosen": -52.629966735839844, + "logps/rejected": -77.43830108642578, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.143658399581909, + "rewards/margins": 5.036122798919678, + "rewards/rejected": -1.8924640417099, + "step": 3442 + }, + { + "epoch": 0.86, + "grad_norm": 4.421519756317139, + "learning_rate": 3.844749284417732e-06, + "logits/chosen": -0.31708070635795593, + "logits/rejected": -0.5103400945663452, + "logps/chosen": -69.40835571289062, + "logps/rejected": -68.77102661132812, + "loss": 0.7542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.525162696838379, + "rewards/margins": 5.016836643218994, + "rewards/rejected": -2.4916741847991943, + "step": 3443 + }, + { + "epoch": 0.86, + "grad_norm": 19.271787643432617, + "learning_rate": 3.8422005880776495e-06, + "logits/chosen": -0.2550586760044098, + "logits/rejected": -0.3504805564880371, + "logps/chosen": -58.916839599609375, + "logps/rejected": -74.07281494140625, + "loss": 0.8326, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9683661460876465, + "rewards/margins": 4.201441764831543, + "rewards/rejected": -1.2330758571624756, + "step": 3444 + }, + { + "epoch": 0.86, + "grad_norm": 6.8414435386657715, + "learning_rate": 3.839652209578491e-06, + "logits/chosen": -0.2652539312839508, + "logits/rejected": -0.3943449854850769, + "logps/chosen": -59.30138397216797, + "logps/rejected": -85.91896057128906, + "loss": 0.7236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.845337390899658, + "rewards/margins": 4.517240047454834, + "rewards/rejected": -1.6719024181365967, + "step": 3445 + }, + { + "epoch": 0.86, + "grad_norm": 3.357316017150879, + "learning_rate": 3.837104149619838e-06, + "logits/chosen": -0.29852133989334106, + "logits/rejected": -0.437347412109375, + "logps/chosen": -54.08466339111328, + "logps/rejected": -71.10449981689453, + "loss": 0.6656, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0074987411499023, + "rewards/margins": 5.360290050506592, + "rewards/rejected": -2.3527913093566895, + "step": 3446 + }, + { + "epoch": 0.86, + "grad_norm": 7.401638031005859, + "learning_rate": 3.834556408901188e-06, + "logits/chosen": -0.2449808269739151, + "logits/rejected": -0.34974056482315063, + "logps/chosen": -57.97361755371094, + "logps/rejected": -76.08753967285156, + "loss": 0.8069, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.758277416229248, + "rewards/margins": 3.601264476776123, + "rewards/rejected": -0.8429867029190063, + "step": 3447 + }, + { + "epoch": 0.86, + "grad_norm": 19.424732208251953, + "learning_rate": 3.832008988121953e-06, + "logits/chosen": -0.37935009598731995, + "logits/rejected": -0.4285085201263428, + "logps/chosen": -67.54254150390625, + "logps/rejected": -89.99433135986328, + "loss": 0.9946, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.703988552093506, + "rewards/margins": 3.264683246612549, + "rewards/rejected": -0.5606951713562012, + "step": 3448 + }, + { + "epoch": 0.86, + "grad_norm": 3.806882619857788, + "learning_rate": 3.8294618879814535e-06, + "logits/chosen": -0.27318331599235535, + "logits/rejected": -0.4239528477191925, + "logps/chosen": -79.10293579101562, + "logps/rejected": -93.66775512695312, + "loss": 0.8266, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.771360158920288, + "rewards/margins": 4.397802829742432, + "rewards/rejected": -1.6264429092407227, + "step": 3449 + }, + { + "epoch": 0.86, + "grad_norm": 21.99333381652832, + "learning_rate": 3.826915109178925e-06, + "logits/chosen": -0.34388038516044617, + "logits/rejected": -0.4555141031742096, + "logps/chosen": -47.63896179199219, + "logps/rejected": -61.57361602783203, + "loss": 0.9541, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.554309606552124, + "rewards/margins": 3.673654794692993, + "rewards/rejected": -1.1193455457687378, + "step": 3450 + }, + { + "epoch": 0.86, + "grad_norm": 9.385088920593262, + "learning_rate": 3.824368652413512e-06, + "logits/chosen": -0.2773086726665497, + "logits/rejected": -0.43662506341934204, + "logps/chosen": -59.78974151611328, + "logps/rejected": -77.70084381103516, + "loss": 0.7376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.571078300476074, + "rewards/margins": 4.440420627593994, + "rewards/rejected": -1.8693422079086304, + "step": 3451 + }, + { + "epoch": 0.86, + "grad_norm": 4.333817958831787, + "learning_rate": 3.82182251838427e-06, + "logits/chosen": -0.23559406399726868, + "logits/rejected": -0.38442620635032654, + "logps/chosen": -54.599273681640625, + "logps/rejected": -80.79999542236328, + "loss": 0.7582, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.677961826324463, + "rewards/margins": 4.515880107879639, + "rewards/rejected": -1.8379186391830444, + "step": 3452 + }, + { + "epoch": 0.86, + "grad_norm": 2.401996374130249, + "learning_rate": 3.819276707790172e-06, + "logits/chosen": -0.2909601926803589, + "logits/rejected": -0.39196670055389404, + "logps/chosen": -59.43016815185547, + "logps/rejected": -87.76582336425781, + "loss": 0.5809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9551732540130615, + "rewards/margins": 5.229028701782227, + "rewards/rejected": -2.2738561630249023, + "step": 3453 + }, + { + "epoch": 0.86, + "grad_norm": 3.205209970474243, + "learning_rate": 3.8167312213300946e-06, + "logits/chosen": -0.24625976383686066, + "logits/rejected": -0.41331055760383606, + "logps/chosen": -73.98822021484375, + "logps/rejected": -86.884765625, + "loss": 0.6502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3979883193969727, + "rewards/margins": 5.088328838348389, + "rewards/rejected": -1.690340280532837, + "step": 3454 + }, + { + "epoch": 0.86, + "grad_norm": 6.547616481781006, + "learning_rate": 3.81418605970283e-06, + "logits/chosen": -0.337299108505249, + "logits/rejected": -0.43578335642814636, + "logps/chosen": -46.01691436767578, + "logps/rejected": -70.63652038574219, + "loss": 0.8102, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8965582847595215, + "rewards/margins": 4.625949382781982, + "rewards/rejected": -1.72939133644104, + "step": 3455 + }, + { + "epoch": 0.86, + "grad_norm": 3.6642942428588867, + "learning_rate": 3.8116412236070823e-06, + "logits/chosen": -0.38299500942230225, + "logits/rejected": -0.5117090344429016, + "logps/chosen": -65.78822326660156, + "logps/rejected": -74.30714416503906, + "loss": 0.6467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.968569278717041, + "rewards/margins": 4.638729572296143, + "rewards/rejected": -1.6701604127883911, + "step": 3456 + }, + { + "epoch": 0.86, + "grad_norm": 4.974480628967285, + "learning_rate": 3.809096713741457e-06, + "logits/chosen": -0.32898372411727905, + "logits/rejected": -0.3213036060333252, + "logps/chosen": -50.01944351196289, + "logps/rejected": -92.62471771240234, + "loss": 0.7837, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.740459680557251, + "rewards/margins": 4.844037055969238, + "rewards/rejected": -2.1035773754119873, + "step": 3457 + }, + { + "epoch": 0.87, + "grad_norm": 8.901095390319824, + "learning_rate": 3.8065525308044853e-06, + "logits/chosen": -0.24187752604484558, + "logits/rejected": -0.3693273663520813, + "logps/chosen": -59.04227066040039, + "logps/rejected": -80.25269317626953, + "loss": 0.9497, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7968332767486572, + "rewards/margins": 4.288172721862793, + "rewards/rejected": -1.4913395643234253, + "step": 3458 + }, + { + "epoch": 0.87, + "grad_norm": 4.190587520599365, + "learning_rate": 3.804008675494594e-06, + "logits/chosen": -0.4023590683937073, + "logits/rejected": -0.4776647090911865, + "logps/chosen": -43.32160568237305, + "logps/rejected": -93.595947265625, + "loss": 0.6012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.064610242843628, + "rewards/margins": 5.191132068634033, + "rewards/rejected": -2.1265218257904053, + "step": 3459 + }, + { + "epoch": 0.87, + "grad_norm": 15.090627670288086, + "learning_rate": 3.8014651485101304e-06, + "logits/chosen": -0.30857306718826294, + "logits/rejected": -0.4471679627895355, + "logps/chosen": -63.890106201171875, + "logps/rejected": -88.20735168457031, + "loss": 0.8656, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.663818597793579, + "rewards/margins": 5.081604957580566, + "rewards/rejected": -2.417786121368408, + "step": 3460 + }, + { + "epoch": 0.87, + "grad_norm": 10.585310935974121, + "learning_rate": 3.7989219505493453e-06, + "logits/chosen": -0.3412000238895416, + "logits/rejected": -0.47466108202934265, + "logps/chosen": -62.0039176940918, + "logps/rejected": -82.91578674316406, + "loss": 0.7928, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7747392654418945, + "rewards/margins": 4.432860374450684, + "rewards/rejected": -1.6581217050552368, + "step": 3461 + }, + { + "epoch": 0.87, + "grad_norm": 4.467348575592041, + "learning_rate": 3.7963790823104015e-06, + "logits/chosen": -0.2931095361709595, + "logits/rejected": -0.4185553789138794, + "logps/chosen": -56.43141174316406, + "logps/rejected": -78.98544311523438, + "loss": 0.7396, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6146721839904785, + "rewards/margins": 5.337801933288574, + "rewards/rejected": -2.7231295108795166, + "step": 3462 + }, + { + "epoch": 0.87, + "grad_norm": 9.328959465026855, + "learning_rate": 3.7938365444913723e-06, + "logits/chosen": -0.3240944743156433, + "logits/rejected": -0.4057156443595886, + "logps/chosen": -57.808326721191406, + "logps/rejected": -73.27586364746094, + "loss": 0.8782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.681674003601074, + "rewards/margins": 3.3624448776245117, + "rewards/rejected": -0.680770993232727, + "step": 3463 + }, + { + "epoch": 0.87, + "grad_norm": 5.72704553604126, + "learning_rate": 3.7912943377902374e-06, + "logits/chosen": -0.32054805755615234, + "logits/rejected": -0.4394562542438507, + "logps/chosen": -61.23638153076172, + "logps/rejected": -90.79190063476562, + "loss": 0.7102, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0982232093811035, + "rewards/margins": 4.139186859130859, + "rewards/rejected": -1.0409636497497559, + "step": 3464 + }, + { + "epoch": 0.87, + "grad_norm": 8.94210147857666, + "learning_rate": 3.78875246290489e-06, + "logits/chosen": -0.27113330364227295, + "logits/rejected": -0.40376555919647217, + "logps/chosen": -65.83180236816406, + "logps/rejected": -89.18179321289062, + "loss": 0.7865, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7876663208007812, + "rewards/margins": 4.572454452514648, + "rewards/rejected": -1.7847881317138672, + "step": 3465 + }, + { + "epoch": 0.87, + "grad_norm": 19.39277458190918, + "learning_rate": 3.786210920533129e-06, + "logits/chosen": -0.3715846538543701, + "logits/rejected": -0.4545120894908905, + "logps/chosen": -55.11053466796875, + "logps/rejected": -75.0928726196289, + "loss": 0.8537, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7038118839263916, + "rewards/margins": 4.3107781410217285, + "rewards/rejected": -1.606966495513916, + "step": 3466 + }, + { + "epoch": 0.87, + "grad_norm": 6.478857517242432, + "learning_rate": 3.7836697113726587e-06, + "logits/chosen": -0.24693061411380768, + "logits/rejected": -0.2989217936992645, + "logps/chosen": -56.33685302734375, + "logps/rejected": -73.98284912109375, + "loss": 0.8167, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9585719108581543, + "rewards/margins": 3.5028998851776123, + "rewards/rejected": -0.5443280339241028, + "step": 3467 + }, + { + "epoch": 0.87, + "grad_norm": 3.929063320159912, + "learning_rate": 3.7811288361211025e-06, + "logits/chosen": -0.300137460231781, + "logits/rejected": -0.41922053694725037, + "logps/chosen": -61.76227569580078, + "logps/rejected": -89.75520324707031, + "loss": 0.6583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9886322021484375, + "rewards/margins": 4.593547344207764, + "rewards/rejected": -1.6049151420593262, + "step": 3468 + }, + { + "epoch": 0.87, + "grad_norm": 9.309918403625488, + "learning_rate": 3.7785882954759784e-06, + "logits/chosen": -0.24833878874778748, + "logits/rejected": -0.34941351413726807, + "logps/chosen": -63.84226608276367, + "logps/rejected": -76.8746566772461, + "loss": 0.972, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7850019931793213, + "rewards/margins": 3.7142133712768555, + "rewards/rejected": -0.9292113780975342, + "step": 3469 + }, + { + "epoch": 0.87, + "grad_norm": 4.536198616027832, + "learning_rate": 3.776048090134726e-06, + "logits/chosen": -0.2514592409133911, + "logits/rejected": -0.40992292761802673, + "logps/chosen": -62.230464935302734, + "logps/rejected": -72.95797729492188, + "loss": 0.7171, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8405399322509766, + "rewards/margins": 4.652190208435059, + "rewards/rejected": -1.811650037765503, + "step": 3470 + }, + { + "epoch": 0.87, + "grad_norm": 4.0503926277160645, + "learning_rate": 3.7735082207946835e-06, + "logits/chosen": -0.4120821952819824, + "logits/rejected": -0.4460379481315613, + "logps/chosen": -55.68617248535156, + "logps/rejected": -87.72386169433594, + "loss": 0.8717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1456775665283203, + "rewards/margins": 4.711128234863281, + "rewards/rejected": -1.5654505491256714, + "step": 3471 + }, + { + "epoch": 0.87, + "grad_norm": 5.856356143951416, + "learning_rate": 3.770968688153098e-06, + "logits/chosen": -0.3249519467353821, + "logits/rejected": -0.30161580443382263, + "logps/chosen": -49.633628845214844, + "logps/rejected": -97.08784484863281, + "loss": 0.7795, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9262890815734863, + "rewards/margins": 3.8977627754211426, + "rewards/rejected": -0.9714736938476562, + "step": 3472 + }, + { + "epoch": 0.87, + "grad_norm": 9.299866676330566, + "learning_rate": 3.7684294929071296e-06, + "logits/chosen": -0.41895365715026855, + "logits/rejected": -0.37943604588508606, + "logps/chosen": -85.14418029785156, + "logps/rejected": -88.04698944091797, + "loss": 1.0206, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.592524528503418, + "rewards/margins": 3.2063372135162354, + "rewards/rejected": -0.6138127446174622, + "step": 3473 + }, + { + "epoch": 0.87, + "grad_norm": 7.7582011222839355, + "learning_rate": 3.7658906357538392e-06, + "logits/chosen": -0.391019344329834, + "logits/rejected": -0.5046471357345581, + "logps/chosen": -40.655277252197266, + "logps/rejected": -79.28194427490234, + "loss": 0.6495, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8191146850585938, + "rewards/margins": 5.197195529937744, + "rewards/rejected": -2.3780808448791504, + "step": 3474 + }, + { + "epoch": 0.87, + "grad_norm": 3.606212615966797, + "learning_rate": 3.763352117390201e-06, + "logits/chosen": -0.3031253218650818, + "logits/rejected": -0.3573324978351593, + "logps/chosen": -53.76990509033203, + "logps/rejected": -84.82192993164062, + "loss": 0.7975, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.347614288330078, + "rewards/margins": 3.8549067974090576, + "rewards/rejected": -0.5072923898696899, + "step": 3475 + }, + { + "epoch": 0.87, + "grad_norm": 6.259392261505127, + "learning_rate": 3.760813938513092e-06, + "logits/chosen": -0.31858521699905396, + "logits/rejected": -0.4448999762535095, + "logps/chosen": -52.391380310058594, + "logps/rejected": -74.34808349609375, + "loss": 0.7685, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9049651622772217, + "rewards/margins": 3.8023464679718018, + "rewards/rejected": -0.8973814845085144, + "step": 3476 + }, + { + "epoch": 0.87, + "grad_norm": 3.9983246326446533, + "learning_rate": 3.7582760998192934e-06, + "logits/chosen": -0.27989462018013, + "logits/rejected": -0.4441525340080261, + "logps/chosen": -56.37080383300781, + "logps/rejected": -54.19593811035156, + "loss": 0.8305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.628328561782837, + "rewards/margins": 3.905792236328125, + "rewards/rejected": -1.277463436126709, + "step": 3477 + }, + { + "epoch": 0.87, + "grad_norm": 3.4891865253448486, + "learning_rate": 3.7557386020055027e-06, + "logits/chosen": -0.23797942698001862, + "logits/rejected": -0.3899744153022766, + "logps/chosen": -51.672340393066406, + "logps/rejected": -62.974029541015625, + "loss": 0.6776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.238124370574951, + "rewards/margins": 4.653635025024414, + "rewards/rejected": -1.415511131286621, + "step": 3478 + }, + { + "epoch": 0.87, + "grad_norm": 6.649990558624268, + "learning_rate": 3.7532014457683114e-06, + "logits/chosen": -0.33105283975601196, + "logits/rejected": -0.43471014499664307, + "logps/chosen": -65.04393005371094, + "logps/rejected": -71.94367218017578, + "loss": 0.9681, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.733076810836792, + "rewards/margins": 3.5840158462524414, + "rewards/rejected": -0.8509389758110046, + "step": 3479 + }, + { + "epoch": 0.87, + "grad_norm": 2.988840103149414, + "learning_rate": 3.7506646318042307e-06, + "logits/chosen": -0.2741341292858124, + "logits/rejected": -0.41767552495002747, + "logps/chosen": -44.97246551513672, + "logps/rejected": -72.74608612060547, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.901867389678955, + "rewards/margins": 4.899415969848633, + "rewards/rejected": -1.9975485801696777, + "step": 3480 + }, + { + "epoch": 0.87, + "grad_norm": 3.874796152114868, + "learning_rate": 3.7481281608096655e-06, + "logits/chosen": -0.30661657452583313, + "logits/rejected": -0.367927610874176, + "logps/chosen": -65.25635528564453, + "logps/rejected": -82.37411499023438, + "loss": 0.8018, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.891052484512329, + "rewards/margins": 4.025605201721191, + "rewards/rejected": -1.1345528364181519, + "step": 3481 + }, + { + "epoch": 0.87, + "grad_norm": 5.253555774688721, + "learning_rate": 3.7455920334809326e-06, + "logits/chosen": -0.31604260206222534, + "logits/rejected": -0.31051474809646606, + "logps/chosen": -58.40364074707031, + "logps/rejected": -94.82015991210938, + "loss": 0.7912, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5677011013031006, + "rewards/margins": 3.467254400253296, + "rewards/rejected": -0.8995532989501953, + "step": 3482 + }, + { + "epoch": 0.87, + "grad_norm": 6.372125625610352, + "learning_rate": 3.7430562505142554e-06, + "logits/chosen": -0.4005153179168701, + "logits/rejected": -0.4895213544368744, + "logps/chosen": -61.250518798828125, + "logps/rejected": -70.91133117675781, + "loss": 0.7979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.848100423812866, + "rewards/margins": 4.654064178466797, + "rewards/rejected": -1.8059637546539307, + "step": 3483 + }, + { + "epoch": 0.87, + "grad_norm": 4.243200302124023, + "learning_rate": 3.740520812605759e-06, + "logits/chosen": -0.3394244313240051, + "logits/rejected": -0.34876155853271484, + "logps/chosen": -45.95703887939453, + "logps/rejected": -93.22247314453125, + "loss": 0.7473, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.856994867324829, + "rewards/margins": 4.051732540130615, + "rewards/rejected": -1.1947375535964966, + "step": 3484 + }, + { + "epoch": 0.87, + "grad_norm": 3.130094528198242, + "learning_rate": 3.7379857204514786e-06, + "logits/chosen": -0.2995290160179138, + "logits/rejected": -0.3511515259742737, + "logps/chosen": -43.14106750488281, + "logps/rejected": -77.27610778808594, + "loss": 0.6911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.766571521759033, + "rewards/margins": 4.236175537109375, + "rewards/rejected": -1.4696038961410522, + "step": 3485 + }, + { + "epoch": 0.87, + "grad_norm": 6.664662837982178, + "learning_rate": 3.7354509747473498e-06, + "logits/chosen": -0.18648934364318848, + "logits/rejected": -0.2671803832054138, + "logps/chosen": -58.27548599243164, + "logps/rejected": -94.19944763183594, + "loss": 1.0037, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.829639434814453, + "rewards/margins": 3.2979695796966553, + "rewards/rejected": -0.4683302044868469, + "step": 3486 + }, + { + "epoch": 0.87, + "grad_norm": 5.240157127380371, + "learning_rate": 3.732916576189214e-06, + "logits/chosen": -0.38514119386672974, + "logits/rejected": -0.49651414155960083, + "logps/chosen": -79.78739929199219, + "logps/rejected": -112.12208557128906, + "loss": 0.7772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9678938388824463, + "rewards/margins": 4.711273193359375, + "rewards/rejected": -1.7433795928955078, + "step": 3487 + }, + { + "epoch": 0.87, + "grad_norm": 4.08035135269165, + "learning_rate": 3.730382525472822e-06, + "logits/chosen": -0.28115102648735046, + "logits/rejected": -0.4474923014640808, + "logps/chosen": -62.69837951660156, + "logps/rejected": -76.0567855834961, + "loss": 0.7268, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8047428131103516, + "rewards/margins": 4.131551742553711, + "rewards/rejected": -1.3268085718154907, + "step": 3488 + }, + { + "epoch": 0.87, + "grad_norm": 4.837747097015381, + "learning_rate": 3.727848823293819e-06, + "logits/chosen": -0.2296670526266098, + "logits/rejected": -0.3774956166744232, + "logps/chosen": -57.531455993652344, + "logps/rejected": -79.23072052001953, + "loss": 0.6727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8522419929504395, + "rewards/margins": 4.247058868408203, + "rewards/rejected": -1.3948171138763428, + "step": 3489 + }, + { + "epoch": 0.87, + "grad_norm": 2.8550119400024414, + "learning_rate": 3.725315470347769e-06, + "logits/chosen": -0.29703423380851746, + "logits/rejected": -0.4508664011955261, + "logps/chosen": -59.10201644897461, + "logps/rejected": -82.21398162841797, + "loss": 0.6717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0360965728759766, + "rewards/margins": 6.082952976226807, + "rewards/rejected": -3.046856164932251, + "step": 3490 + }, + { + "epoch": 0.87, + "grad_norm": 6.670107841491699, + "learning_rate": 3.722782467330126e-06, + "logits/chosen": -0.31703051924705505, + "logits/rejected": -0.37740951776504517, + "logps/chosen": -47.421749114990234, + "logps/rejected": -88.33712768554688, + "loss": 0.7533, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7125847339630127, + "rewards/margins": 4.454017639160156, + "rewards/rejected": -1.7414332628250122, + "step": 3491 + }, + { + "epoch": 0.87, + "grad_norm": 11.218138694763184, + "learning_rate": 3.720249814936255e-06, + "logits/chosen": -0.3348383903503418, + "logits/rejected": -0.4229448437690735, + "logps/chosen": -67.98369598388672, + "logps/rejected": -78.98426818847656, + "loss": 0.9775, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.566838502883911, + "rewards/margins": 3.5407700538635254, + "rewards/rejected": -0.9739318490028381, + "step": 3492 + }, + { + "epoch": 0.87, + "grad_norm": 8.206247329711914, + "learning_rate": 3.7177175138614265e-06, + "logits/chosen": -0.30368125438690186, + "logits/rejected": -0.39959579706192017, + "logps/chosen": -55.468536376953125, + "logps/rejected": -70.53802490234375, + "loss": 0.8764, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.664902687072754, + "rewards/margins": 3.6590194702148438, + "rewards/rejected": -0.9941169619560242, + "step": 3493 + }, + { + "epoch": 0.87, + "grad_norm": 7.855775833129883, + "learning_rate": 3.7151855648008084e-06, + "logits/chosen": -0.3712747395038605, + "logits/rejected": -0.4599372148513794, + "logps/chosen": -47.39116287231445, + "logps/rejected": -84.46080017089844, + "loss": 0.7527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.805471658706665, + "rewards/margins": 4.366310119628906, + "rewards/rejected": -1.560837984085083, + "step": 3494 + }, + { + "epoch": 0.87, + "grad_norm": 8.816280364990234, + "learning_rate": 3.712653968449478e-06, + "logits/chosen": -0.3047787547111511, + "logits/rejected": -0.40366822481155396, + "logps/chosen": -57.66780090332031, + "logps/rejected": -83.20928955078125, + "loss": 0.8762, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.910612106323242, + "rewards/margins": 4.0868401527404785, + "rewards/rejected": -1.1762285232543945, + "step": 3495 + }, + { + "epoch": 0.87, + "grad_norm": 3.937053680419922, + "learning_rate": 3.7101227255024113e-06, + "logits/chosen": -0.30273422598838806, + "logits/rejected": -0.4415338933467865, + "logps/chosen": -48.919525146484375, + "logps/rejected": -77.65996551513672, + "loss": 0.6553, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.916745662689209, + "rewards/margins": 4.8274431228637695, + "rewards/rejected": -1.9106978178024292, + "step": 3496 + }, + { + "epoch": 0.87, + "grad_norm": 3.5768985748291016, + "learning_rate": 3.7075918366544896e-06, + "logits/chosen": -0.30243393778800964, + "logits/rejected": -0.3821539282798767, + "logps/chosen": -50.49551010131836, + "logps/rejected": -83.9909439086914, + "loss": 0.6855, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.149742603302002, + "rewards/margins": 5.225965976715088, + "rewards/rejected": -2.076223373413086, + "step": 3497 + }, + { + "epoch": 0.88, + "grad_norm": 8.373273849487305, + "learning_rate": 3.705061302600497e-06, + "logits/chosen": -0.2742936909198761, + "logits/rejected": -0.38813456892967224, + "logps/chosen": -58.755088806152344, + "logps/rejected": -76.70240020751953, + "loss": 0.9614, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.860058546066284, + "rewards/margins": 3.9782474040985107, + "rewards/rejected": -1.1181890964508057, + "step": 3498 + }, + { + "epoch": 0.88, + "grad_norm": 3.278663396835327, + "learning_rate": 3.702531124035119e-06, + "logits/chosen": -0.3876282572746277, + "logits/rejected": -0.4502994120121002, + "logps/chosen": -52.639739990234375, + "logps/rejected": -80.0301742553711, + "loss": 0.7308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8562746047973633, + "rewards/margins": 4.466184139251709, + "rewards/rejected": -1.6099092960357666, + "step": 3499 + }, + { + "epoch": 0.88, + "grad_norm": 5.610363483428955, + "learning_rate": 3.700001301652947e-06, + "logits/chosen": -0.2854800820350647, + "logits/rejected": -0.32248374819755554, + "logps/chosen": -52.363853454589844, + "logps/rejected": -96.9234619140625, + "loss": 0.7121, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9491019248962402, + "rewards/margins": 4.7550950050354, + "rewards/rejected": -1.8059930801391602, + "step": 3500 + }, + { + "epoch": 0.88, + "grad_norm": 4.279600620269775, + "learning_rate": 3.6974718361484676e-06, + "logits/chosen": -0.37316226959228516, + "logits/rejected": -0.42930206656455994, + "logps/chosen": -53.087303161621094, + "logps/rejected": -81.76951599121094, + "loss": 0.7405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9861559867858887, + "rewards/margins": 4.5678606033325195, + "rewards/rejected": -1.5817043781280518, + "step": 3501 + }, + { + "epoch": 0.88, + "grad_norm": 4.554892063140869, + "learning_rate": 3.6949427282160743e-06, + "logits/chosen": -0.36627721786499023, + "logits/rejected": -0.46267810463905334, + "logps/chosen": -50.776268005371094, + "logps/rejected": -60.70083999633789, + "loss": 0.6454, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0208020210266113, + "rewards/margins": 4.056958198547363, + "rewards/rejected": -1.0361566543579102, + "step": 3502 + }, + { + "epoch": 0.88, + "grad_norm": 4.211069107055664, + "learning_rate": 3.6924139785500646e-06, + "logits/chosen": -0.33179131150245667, + "logits/rejected": -0.36747559905052185, + "logps/chosen": -48.994110107421875, + "logps/rejected": -81.52519226074219, + "loss": 0.6759, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0806119441986084, + "rewards/margins": 4.491289138793945, + "rewards/rejected": -1.4106773138046265, + "step": 3503 + }, + { + "epoch": 0.88, + "grad_norm": 7.196694850921631, + "learning_rate": 3.689885587844633e-06, + "logits/chosen": -0.29822999238967896, + "logits/rejected": -0.4222060441970825, + "logps/chosen": -61.21969223022461, + "logps/rejected": -73.12566375732422, + "loss": 0.7499, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.880906581878662, + "rewards/margins": 4.299982070922852, + "rewards/rejected": -1.419075846672058, + "step": 3504 + }, + { + "epoch": 0.88, + "grad_norm": 3.6941494941711426, + "learning_rate": 3.6873575567938772e-06, + "logits/chosen": -0.40352365374565125, + "logits/rejected": -0.49812084436416626, + "logps/chosen": -61.653076171875, + "logps/rejected": -67.0084457397461, + "loss": 0.7616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9466195106506348, + "rewards/margins": 4.095901966094971, + "rewards/rejected": -1.1492829322814941, + "step": 3505 + }, + { + "epoch": 0.88, + "grad_norm": 3.8444607257843018, + "learning_rate": 3.6848298860917977e-06, + "logits/chosen": -0.31470543146133423, + "logits/rejected": -0.4337671399116516, + "logps/chosen": -47.85147476196289, + "logps/rejected": -72.72488403320312, + "loss": 0.6514, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1475956439971924, + "rewards/margins": 4.547244548797607, + "rewards/rejected": -1.3996491432189941, + "step": 3506 + }, + { + "epoch": 0.88, + "grad_norm": 4.139969825744629, + "learning_rate": 3.6823025764322916e-06, + "logits/chosen": -0.31569206714630127, + "logits/rejected": -0.4353911578655243, + "logps/chosen": -53.91411590576172, + "logps/rejected": -68.19276428222656, + "loss": 0.72, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.117966651916504, + "rewards/margins": 3.9603562355041504, + "rewards/rejected": -0.8423895239830017, + "step": 3507 + }, + { + "epoch": 0.88, + "grad_norm": 6.971879482269287, + "learning_rate": 3.6797756285091634e-06, + "logits/chosen": -0.29508861899375916, + "logits/rejected": -0.42384520173072815, + "logps/chosen": -59.03086471557617, + "logps/rejected": -87.96796417236328, + "loss": 0.8504, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.584151268005371, + "rewards/margins": 4.643465995788574, + "rewards/rejected": -2.0593149662017822, + "step": 3508 + }, + { + "epoch": 0.88, + "grad_norm": 4.506887912750244, + "learning_rate": 3.677249043016112e-06, + "logits/chosen": -0.26678040623664856, + "logits/rejected": -0.3443481922149658, + "logps/chosen": -64.61492156982422, + "logps/rejected": -89.03206634521484, + "loss": 0.7807, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.028127908706665, + "rewards/margins": 4.163874626159668, + "rewards/rejected": -1.135746717453003, + "step": 3509 + }, + { + "epoch": 0.88, + "grad_norm": 6.149580001831055, + "learning_rate": 3.6747228206467435e-06, + "logits/chosen": -0.3260325789451599, + "logits/rejected": -0.4026145935058594, + "logps/chosen": -67.39076232910156, + "logps/rejected": -77.1422119140625, + "loss": 0.9588, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5968568325042725, + "rewards/margins": 3.4232330322265625, + "rewards/rejected": -0.8263763189315796, + "step": 3510 + }, + { + "epoch": 0.88, + "grad_norm": 5.320631504058838, + "learning_rate": 3.6721969620945576e-06, + "logits/chosen": -0.3576504588127136, + "logits/rejected": -0.39461398124694824, + "logps/chosen": -50.7601203918457, + "logps/rejected": -86.48844146728516, + "loss": 0.8489, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.760010242462158, + "rewards/margins": 3.3732094764709473, + "rewards/rejected": -0.6131995320320129, + "step": 3511 + }, + { + "epoch": 0.88, + "grad_norm": 2.63000750541687, + "learning_rate": 3.6696714680529544e-06, + "logits/chosen": -0.33860883116722107, + "logits/rejected": -0.5124734044075012, + "logps/chosen": -49.281803131103516, + "logps/rejected": -61.40614700317383, + "loss": 0.6126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.920405149459839, + "rewards/margins": 4.888105869293213, + "rewards/rejected": -1.9677013158798218, + "step": 3512 + }, + { + "epoch": 0.88, + "grad_norm": 3.4230539798736572, + "learning_rate": 3.667146339215243e-06, + "logits/chosen": -0.2893487513065338, + "logits/rejected": -0.4899197816848755, + "logps/chosen": -53.92600631713867, + "logps/rejected": -64.10723876953125, + "loss": 0.6714, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0359814167022705, + "rewards/margins": 5.4082441329956055, + "rewards/rejected": -2.372262716293335, + "step": 3513 + }, + { + "epoch": 0.88, + "grad_norm": 5.1037678718566895, + "learning_rate": 3.6646215762746195e-06, + "logits/chosen": -0.26127609610557556, + "logits/rejected": -0.44327354431152344, + "logps/chosen": -56.06285858154297, + "logps/rejected": -70.34561157226562, + "loss": 0.7348, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9729628562927246, + "rewards/margins": 4.60502290725708, + "rewards/rejected": -1.6320598125457764, + "step": 3514 + }, + { + "epoch": 0.88, + "grad_norm": 6.887115955352783, + "learning_rate": 3.66209717992419e-06, + "logits/chosen": -0.32578980922698975, + "logits/rejected": -0.4223591685295105, + "logps/chosen": -62.042686462402344, + "logps/rejected": -71.42112731933594, + "loss": 0.9279, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8505361080169678, + "rewards/margins": 3.676422119140625, + "rewards/rejected": -0.8258858919143677, + "step": 3515 + }, + { + "epoch": 0.88, + "grad_norm": 4.658977031707764, + "learning_rate": 3.659573150856954e-06, + "logits/chosen": -0.3192289471626282, + "logits/rejected": -0.4495075047016144, + "logps/chosen": -62.30008316040039, + "logps/rejected": -78.16649627685547, + "loss": 0.8947, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.967804431915283, + "rewards/margins": 4.503647804260254, + "rewards/rejected": -1.5358433723449707, + "step": 3516 + }, + { + "epoch": 0.88, + "grad_norm": 7.974287986755371, + "learning_rate": 3.6570494897658115e-06, + "logits/chosen": -0.20604491233825684, + "logits/rejected": -0.33082932233810425, + "logps/chosen": -60.73265075683594, + "logps/rejected": -89.0035629272461, + "loss": 0.9264, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.786956548690796, + "rewards/margins": 4.356729507446289, + "rewards/rejected": -1.5697729587554932, + "step": 3517 + }, + { + "epoch": 0.88, + "grad_norm": 2.981083869934082, + "learning_rate": 3.6545261973435637e-06, + "logits/chosen": -0.24557770788669586, + "logits/rejected": -0.3510575294494629, + "logps/chosen": -53.68278503417969, + "logps/rejected": -81.03034973144531, + "loss": 0.683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2943994998931885, + "rewards/margins": 4.54748010635376, + "rewards/rejected": -1.2530807256698608, + "step": 3518 + }, + { + "epoch": 0.88, + "grad_norm": 20.075992584228516, + "learning_rate": 3.6520032742829066e-06, + "logits/chosen": -0.33087682723999023, + "logits/rejected": -0.5154093503952026, + "logps/chosen": -67.26468658447266, + "logps/rejected": -75.87738800048828, + "loss": 0.7393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4789552688598633, + "rewards/margins": 5.654702663421631, + "rewards/rejected": -3.1757471561431885, + "step": 3519 + }, + { + "epoch": 0.88, + "grad_norm": 17.81089210510254, + "learning_rate": 3.6494807212764396e-06, + "logits/chosen": -0.3394409120082855, + "logits/rejected": -0.44041669368743896, + "logps/chosen": -58.749088287353516, + "logps/rejected": -70.96087646484375, + "loss": 0.8884, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.642998456954956, + "rewards/margins": 3.4572129249572754, + "rewards/rejected": -0.8142144680023193, + "step": 3520 + }, + { + "epoch": 0.88, + "grad_norm": 4.1622314453125, + "learning_rate": 3.646958539016657e-06, + "logits/chosen": -0.2190442532300949, + "logits/rejected": -0.3523394763469696, + "logps/chosen": -54.74028396606445, + "logps/rejected": -74.94500732421875, + "loss": 0.6715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.878884792327881, + "rewards/margins": 4.753100395202637, + "rewards/rejected": -1.874215841293335, + "step": 3521 + }, + { + "epoch": 0.88, + "grad_norm": 3.9755611419677734, + "learning_rate": 3.644436728195948e-06, + "logits/chosen": -0.34282296895980835, + "logits/rejected": -0.44403865933418274, + "logps/chosen": -59.843997955322266, + "logps/rejected": -88.46739196777344, + "loss": 0.7132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0167160034179688, + "rewards/margins": 5.138199806213379, + "rewards/rejected": -2.12148380279541, + "step": 3522 + }, + { + "epoch": 0.88, + "grad_norm": 4.461757183074951, + "learning_rate": 3.641915289506612e-06, + "logits/chosen": -0.31010812520980835, + "logits/rejected": -0.45578205585479736, + "logps/chosen": -61.11772155761719, + "logps/rejected": -85.66596984863281, + "loss": 0.7085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0807089805603027, + "rewards/margins": 4.998818874359131, + "rewards/rejected": -1.9181101322174072, + "step": 3523 + }, + { + "epoch": 0.88, + "grad_norm": 8.150958061218262, + "learning_rate": 3.639394223640831e-06, + "logits/chosen": -0.3581623435020447, + "logits/rejected": -0.3838873505592346, + "logps/chosen": -47.53865051269531, + "logps/rejected": -79.7879409790039, + "loss": 0.9663, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8929972648620605, + "rewards/margins": 3.5652709007263184, + "rewards/rejected": -0.6722736954689026, + "step": 3524 + }, + { + "epoch": 0.88, + "grad_norm": 5.6128249168396, + "learning_rate": 3.6368735312906984e-06, + "logits/chosen": -0.384179949760437, + "logits/rejected": -0.46833425760269165, + "logps/chosen": -52.965370178222656, + "logps/rejected": -74.9340591430664, + "loss": 0.8831, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.827986717224121, + "rewards/margins": 3.9813544750213623, + "rewards/rejected": -1.1533679962158203, + "step": 3525 + }, + { + "epoch": 0.88, + "grad_norm": 4.747976779937744, + "learning_rate": 3.634353213148194e-06, + "logits/chosen": -0.2670110762119293, + "logits/rejected": -0.3884279131889343, + "logps/chosen": -66.71513366699219, + "logps/rejected": -88.73182678222656, + "loss": 0.7625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0569229125976562, + "rewards/margins": 4.161048412322998, + "rewards/rejected": -1.1041253805160522, + "step": 3526 + }, + { + "epoch": 0.88, + "grad_norm": 6.287786960601807, + "learning_rate": 3.6318332699051995e-06, + "logits/chosen": -0.29772841930389404, + "logits/rejected": -0.37929704785346985, + "logps/chosen": -73.22882843017578, + "logps/rejected": -88.93058776855469, + "loss": 0.7556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0160164833068848, + "rewards/margins": 5.080040454864502, + "rewards/rejected": -2.064023971557617, + "step": 3527 + }, + { + "epoch": 0.88, + "grad_norm": 4.210752010345459, + "learning_rate": 3.6293137022534967e-06, + "logits/chosen": -0.33238399028778076, + "logits/rejected": -0.41796186566352844, + "logps/chosen": -53.09605407714844, + "logps/rejected": -95.61593627929688, + "loss": 0.6624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2009363174438477, + "rewards/margins": 5.87520694732666, + "rewards/rejected": -2.674269676208496, + "step": 3528 + }, + { + "epoch": 0.88, + "grad_norm": 3.2895402908325195, + "learning_rate": 3.626794510884759e-06, + "logits/chosen": -0.23998968303203583, + "logits/rejected": -0.3424574136734009, + "logps/chosen": -64.62813568115234, + "logps/rejected": -96.62482452392578, + "loss": 0.68, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0879952907562256, + "rewards/margins": 5.702777862548828, + "rewards/rejected": -2.6147828102111816, + "step": 3529 + }, + { + "epoch": 0.88, + "grad_norm": 3.850520133972168, + "learning_rate": 3.62427569649056e-06, + "logits/chosen": -0.4327390491962433, + "logits/rejected": -0.4977278411388397, + "logps/chosen": -48.50770568847656, + "logps/rejected": -76.23629760742188, + "loss": 0.8401, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8949294090270996, + "rewards/margins": 4.357543468475342, + "rewards/rejected": -1.4626137018203735, + "step": 3530 + }, + { + "epoch": 0.88, + "grad_norm": 9.623108863830566, + "learning_rate": 3.6217572597623697e-06, + "logits/chosen": -0.39722415804862976, + "logits/rejected": -0.4726453721523285, + "logps/chosen": -54.097450256347656, + "logps/rejected": -75.83487701416016, + "loss": 0.8432, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8083369731903076, + "rewards/margins": 4.200750350952148, + "rewards/rejected": -1.3924131393432617, + "step": 3531 + }, + { + "epoch": 0.88, + "grad_norm": 4.334911823272705, + "learning_rate": 3.619239201391548e-06, + "logits/chosen": -0.3388962745666504, + "logits/rejected": -0.40408745408058167, + "logps/chosen": -52.30253219604492, + "logps/rejected": -91.13018798828125, + "loss": 0.8206, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.893439292907715, + "rewards/margins": 4.830422401428223, + "rewards/rejected": -1.9369832277297974, + "step": 3532 + }, + { + "epoch": 0.88, + "grad_norm": 8.59410285949707, + "learning_rate": 3.616721522069363e-06, + "logits/chosen": -0.2977360785007477, + "logits/rejected": -0.3328157961368561, + "logps/chosen": -53.72834396362305, + "logps/rejected": -87.94768524169922, + "loss": 0.73, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8961198329925537, + "rewards/margins": 4.771578788757324, + "rewards/rejected": -1.875458836555481, + "step": 3533 + }, + { + "epoch": 0.88, + "grad_norm": 4.2169880867004395, + "learning_rate": 3.614204222486966e-06, + "logits/chosen": -0.32909858226776123, + "logits/rejected": -0.42539215087890625, + "logps/chosen": -56.87757110595703, + "logps/rejected": -74.72158813476562, + "loss": 0.7067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7434136867523193, + "rewards/margins": 4.566518783569336, + "rewards/rejected": -1.8231048583984375, + "step": 3534 + }, + { + "epoch": 0.88, + "grad_norm": 4.677512168884277, + "learning_rate": 3.6116873033354164e-06, + "logits/chosen": -0.3601039946079254, + "logits/rejected": -0.5311319231987, + "logps/chosen": -61.35584259033203, + "logps/rejected": -72.90653228759766, + "loss": 0.7904, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5643157958984375, + "rewards/margins": 5.006951332092285, + "rewards/rejected": -2.4426352977752686, + "step": 3535 + }, + { + "epoch": 0.88, + "grad_norm": 7.086791515350342, + "learning_rate": 3.6091707653056584e-06, + "logits/chosen": -0.31059888005256653, + "logits/rejected": -0.36502528190612793, + "logps/chosen": -60.82992935180664, + "logps/rejected": -109.35118865966797, + "loss": 0.7997, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.566185712814331, + "rewards/margins": 4.710829734802246, + "rewards/rejected": -2.144644021987915, + "step": 3536 + }, + { + "epoch": 0.88, + "grad_norm": 15.572346687316895, + "learning_rate": 3.606654609088536e-06, + "logits/chosen": -0.26233312487602234, + "logits/rejected": -0.3924172520637512, + "logps/chosen": -67.45067596435547, + "logps/rejected": -85.74925994873047, + "loss": 0.8163, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.661938428878784, + "rewards/margins": 4.222092628479004, + "rewards/rejected": -1.5601537227630615, + "step": 3537 + }, + { + "epoch": 0.89, + "grad_norm": 8.645262718200684, + "learning_rate": 3.6041388353747908e-06, + "logits/chosen": -0.29337409138679504, + "logits/rejected": -0.4020698070526123, + "logps/chosen": -60.8870964050293, + "logps/rejected": -88.68328094482422, + "loss": 0.7822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6332309246063232, + "rewards/margins": 5.328564643859863, + "rewards/rejected": -2.695333480834961, + "step": 3538 + }, + { + "epoch": 0.89, + "grad_norm": 5.014599800109863, + "learning_rate": 3.6016234448550534e-06, + "logits/chosen": -0.28756484389305115, + "logits/rejected": -0.35858458280563354, + "logps/chosen": -54.71067810058594, + "logps/rejected": -85.76599884033203, + "loss": 0.7099, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.14967679977417, + "rewards/margins": 5.170358657836914, + "rewards/rejected": -2.020681619644165, + "step": 3539 + }, + { + "epoch": 0.89, + "grad_norm": 5.6147141456604, + "learning_rate": 3.599108438219857e-06, + "logits/chosen": -0.27597206830978394, + "logits/rejected": -0.3844517469406128, + "logps/chosen": -56.5318489074707, + "logps/rejected": -77.70457458496094, + "loss": 0.7699, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.788201332092285, + "rewards/margins": 4.209607124328613, + "rewards/rejected": -1.4214057922363281, + "step": 3540 + }, + { + "epoch": 0.89, + "grad_norm": 3.8050103187561035, + "learning_rate": 3.596593816159624e-06, + "logits/chosen": -0.2616625130176544, + "logits/rejected": -0.38121822476387024, + "logps/chosen": -56.85978698730469, + "logps/rejected": -85.57728576660156, + "loss": 0.6698, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7156624794006348, + "rewards/margins": 5.163731098175049, + "rewards/rejected": -2.448068618774414, + "step": 3541 + }, + { + "epoch": 0.89, + "grad_norm": 10.50909423828125, + "learning_rate": 3.5940795793646686e-06, + "logits/chosen": -0.4123989939689636, + "logits/rejected": -0.5417221784591675, + "logps/chosen": -52.74374008178711, + "logps/rejected": -77.9727783203125, + "loss": 0.7215, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9460561275482178, + "rewards/margins": 4.559076309204102, + "rewards/rejected": -1.6130200624465942, + "step": 3542 + }, + { + "epoch": 0.89, + "grad_norm": 5.4935503005981445, + "learning_rate": 3.5915657285252094e-06, + "logits/chosen": -0.37735578417778015, + "logits/rejected": -0.45601147413253784, + "logps/chosen": -47.073509216308594, + "logps/rejected": -75.40750122070312, + "loss": 0.7124, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.799893379211426, + "rewards/margins": 4.753591060638428, + "rewards/rejected": -1.9536972045898438, + "step": 3543 + }, + { + "epoch": 0.89, + "grad_norm": 9.811630249023438, + "learning_rate": 3.5890522643313454e-06, + "logits/chosen": -0.3636443316936493, + "logits/rejected": -0.5155429840087891, + "logps/chosen": -52.33674621582031, + "logps/rejected": -69.53089904785156, + "loss": 0.7096, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9751620292663574, + "rewards/margins": 5.5027570724487305, + "rewards/rejected": -2.527595043182373, + "step": 3544 + }, + { + "epoch": 0.89, + "grad_norm": 9.770499229431152, + "learning_rate": 3.5865391874730847e-06, + "logits/chosen": -0.311756432056427, + "logits/rejected": -0.44023430347442627, + "logps/chosen": -71.01618957519531, + "logps/rejected": -76.83143615722656, + "loss": 0.8676, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1114673614501953, + "rewards/margins": 4.011867523193359, + "rewards/rejected": -0.9004002809524536, + "step": 3545 + }, + { + "epoch": 0.89, + "grad_norm": 10.582890510559082, + "learning_rate": 3.584026498640315e-06, + "logits/chosen": -0.2195788323879242, + "logits/rejected": -0.3730939030647278, + "logps/chosen": -59.37263870239258, + "logps/rejected": -89.96440887451172, + "loss": 0.7247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5620269775390625, + "rewards/margins": 4.704606533050537, + "rewards/rejected": -2.1425795555114746, + "step": 3546 + }, + { + "epoch": 0.89, + "grad_norm": 5.896821975708008, + "learning_rate": 3.581514198522826e-06, + "logits/chosen": -0.3169393241405487, + "logits/rejected": -0.4192698895931244, + "logps/chosen": -57.92019271850586, + "logps/rejected": -87.67727661132812, + "loss": 0.6869, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991680145263672, + "rewards/margins": 5.729123592376709, + "rewards/rejected": -2.737443447113037, + "step": 3547 + }, + { + "epoch": 0.89, + "grad_norm": 8.726127624511719, + "learning_rate": 3.5790022878102983e-06, + "logits/chosen": -0.3565412163734436, + "logits/rejected": -0.3705637454986572, + "logps/chosen": -47.06145477294922, + "logps/rejected": -88.7219467163086, + "loss": 0.8479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980978488922119, + "rewards/margins": 3.7342095375061035, + "rewards/rejected": -0.7532309293746948, + "step": 3548 + }, + { + "epoch": 0.89, + "grad_norm": 10.020931243896484, + "learning_rate": 3.5764907671923045e-06, + "logits/chosen": -0.2842997610569, + "logits/rejected": -0.30143678188323975, + "logps/chosen": -53.444427490234375, + "logps/rejected": -86.03225708007812, + "loss": 0.7251, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.801151990890503, + "rewards/margins": 4.178455829620361, + "rewards/rejected": -1.3773040771484375, + "step": 3549 + }, + { + "epoch": 0.89, + "grad_norm": 4.140770435333252, + "learning_rate": 3.5739796373583124e-06, + "logits/chosen": -0.3252311944961548, + "logits/rejected": -0.46770787239074707, + "logps/chosen": -54.68116760253906, + "logps/rejected": -76.42469787597656, + "loss": 0.7173, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8005316257476807, + "rewards/margins": 4.748000621795654, + "rewards/rejected": -1.9474694728851318, + "step": 3550 + }, + { + "epoch": 0.89, + "grad_norm": 4.944102764129639, + "learning_rate": 3.5714688989976816e-06, + "logits/chosen": -0.2984216809272766, + "logits/rejected": -0.4320840537548065, + "logps/chosen": -62.78063201904297, + "logps/rejected": -74.19038391113281, + "loss": 0.7316, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0325927734375, + "rewards/margins": 4.80495548248291, + "rewards/rejected": -1.7723628282546997, + "step": 3551 + }, + { + "epoch": 0.89, + "grad_norm": 6.183071613311768, + "learning_rate": 3.568958552799662e-06, + "logits/chosen": -0.22489286959171295, + "logits/rejected": -0.3293370306491852, + "logps/chosen": -58.39658737182617, + "logps/rejected": -79.22415924072266, + "loss": 0.6927, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.195969343185425, + "rewards/margins": 4.837103366851807, + "rewards/rejected": -1.641134262084961, + "step": 3552 + }, + { + "epoch": 0.89, + "grad_norm": 3.35351300239563, + "learning_rate": 3.5664485994534003e-06, + "logits/chosen": -0.4045470058917999, + "logits/rejected": -0.4761594831943512, + "logps/chosen": -53.21468734741211, + "logps/rejected": -91.78240203857422, + "loss": 0.7035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8720691204071045, + "rewards/margins": 4.994905948638916, + "rewards/rejected": -2.1228365898132324, + "step": 3553 + }, + { + "epoch": 0.89, + "grad_norm": 4.379897594451904, + "learning_rate": 3.56393903964793e-06, + "logits/chosen": -0.2707763612270355, + "logits/rejected": -0.3839870095252991, + "logps/chosen": -58.760223388671875, + "logps/rejected": -90.83423614501953, + "loss": 0.7217, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8754634857177734, + "rewards/margins": 5.261960506439209, + "rewards/rejected": -2.3864967823028564, + "step": 3554 + }, + { + "epoch": 0.89, + "grad_norm": 3.0244622230529785, + "learning_rate": 3.561429874072184e-06, + "logits/chosen": -0.3202625513076782, + "logits/rejected": -0.445200651884079, + "logps/chosen": -47.09545135498047, + "logps/rejected": -74.42916107177734, + "loss": 0.6576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9966177940368652, + "rewards/margins": 5.106924533843994, + "rewards/rejected": -2.110306739807129, + "step": 3555 + }, + { + "epoch": 0.89, + "grad_norm": 5.945516109466553, + "learning_rate": 3.5589211034149783e-06, + "logits/chosen": -0.32323288917541504, + "logits/rejected": -0.3750922679901123, + "logps/chosen": -54.54111099243164, + "logps/rejected": -78.49817657470703, + "loss": 0.7784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9846324920654297, + "rewards/margins": 4.444354057312012, + "rewards/rejected": -1.459721326828003, + "step": 3556 + }, + { + "epoch": 0.89, + "grad_norm": 8.173820495605469, + "learning_rate": 3.556412728365024e-06, + "logits/chosen": -0.35641756653785706, + "logits/rejected": -0.4521770477294922, + "logps/chosen": -58.079532623291016, + "logps/rejected": -80.12909698486328, + "loss": 0.7677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.141401529312134, + "rewards/margins": 4.430367946624756, + "rewards/rejected": -1.2889665365219116, + "step": 3557 + }, + { + "epoch": 0.89, + "grad_norm": 4.7436203956604, + "learning_rate": 3.553904749610928e-06, + "logits/chosen": -0.32509881258010864, + "logits/rejected": -0.3587707281112671, + "logps/chosen": -64.93060302734375, + "logps/rejected": -89.26065826416016, + "loss": 0.8966, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.124451160430908, + "rewards/margins": 4.132857322692871, + "rewards/rejected": -1.0084060430526733, + "step": 3558 + }, + { + "epoch": 0.89, + "grad_norm": 4.419124126434326, + "learning_rate": 3.5513971678411808e-06, + "logits/chosen": -0.27131932973861694, + "logits/rejected": -0.35451987385749817, + "logps/chosen": -59.348106384277344, + "logps/rejected": -85.31829833984375, + "loss": 0.8227, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5745632648468018, + "rewards/margins": 4.95052433013916, + "rewards/rejected": -2.3759610652923584, + "step": 3559 + }, + { + "epoch": 0.89, + "grad_norm": 4.202325344085693, + "learning_rate": 3.5488899837441703e-06, + "logits/chosen": -0.32771769165992737, + "logits/rejected": -0.4387012720108032, + "logps/chosen": -60.365230560302734, + "logps/rejected": -94.20598602294922, + "loss": 0.7212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6033196449279785, + "rewards/margins": 4.6422343254089355, + "rewards/rejected": -2.038914442062378, + "step": 3560 + }, + { + "epoch": 0.89, + "grad_norm": 4.575601577758789, + "learning_rate": 3.546383198008172e-06, + "logits/chosen": -0.2578160762786865, + "logits/rejected": -0.43849658966064453, + "logps/chosen": -65.91311645507812, + "logps/rejected": -87.06611633300781, + "loss": 0.7163, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9174487590789795, + "rewards/margins": 5.205932140350342, + "rewards/rejected": -2.2884833812713623, + "step": 3561 + }, + { + "epoch": 0.89, + "grad_norm": 2.88921856880188, + "learning_rate": 3.543876811321352e-06, + "logits/chosen": -0.24771972000598907, + "logits/rejected": -0.3290710151195526, + "logps/chosen": -57.06778335571289, + "logps/rejected": -83.42398834228516, + "loss": 0.6425, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.113286018371582, + "rewards/margins": 4.121833801269531, + "rewards/rejected": -1.0085476636886597, + "step": 3562 + }, + { + "epoch": 0.89, + "grad_norm": 4.836394309997559, + "learning_rate": 3.5413708243717676e-06, + "logits/chosen": -0.32494473457336426, + "logits/rejected": -0.4311492443084717, + "logps/chosen": -46.89023208618164, + "logps/rejected": -72.17372131347656, + "loss": 0.6231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1424190998077393, + "rewards/margins": 4.720170021057129, + "rewards/rejected": -1.577750563621521, + "step": 3563 + }, + { + "epoch": 0.89, + "grad_norm": 5.616461753845215, + "learning_rate": 3.5388652378473664e-06, + "logits/chosen": -0.35995638370513916, + "logits/rejected": -0.4996228814125061, + "logps/chosen": -63.75463104248047, + "logps/rejected": -78.9709243774414, + "loss": 0.7821, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9085538387298584, + "rewards/margins": 5.003339767456055, + "rewards/rejected": -2.0947861671447754, + "step": 3564 + }, + { + "epoch": 0.89, + "grad_norm": 5.009089946746826, + "learning_rate": 3.5363600524359874e-06, + "logits/chosen": -0.3210696578025818, + "logits/rejected": -0.41059577465057373, + "logps/chosen": -56.6977653503418, + "logps/rejected": -89.73863220214844, + "loss": 0.8854, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8893933296203613, + "rewards/margins": 4.24077844619751, + "rewards/rejected": -1.351385235786438, + "step": 3565 + }, + { + "epoch": 0.89, + "grad_norm": 6.420635223388672, + "learning_rate": 3.533855268825358e-06, + "logits/chosen": -0.30994558334350586, + "logits/rejected": -0.36291956901550293, + "logps/chosen": -47.77398681640625, + "logps/rejected": -75.8951187133789, + "loss": 0.7442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8354315757751465, + "rewards/margins": 3.7336816787719727, + "rewards/rejected": -0.8982502222061157, + "step": 3566 + }, + { + "epoch": 0.89, + "grad_norm": 12.29133415222168, + "learning_rate": 3.5313508877030907e-06, + "logits/chosen": -0.46350905299186707, + "logits/rejected": -0.5355633497238159, + "logps/chosen": -60.032508850097656, + "logps/rejected": -87.18898010253906, + "loss": 0.7669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0668628215789795, + "rewards/margins": 4.597502708435059, + "rewards/rejected": -1.530639410018921, + "step": 3567 + }, + { + "epoch": 0.89, + "grad_norm": 5.886276721954346, + "learning_rate": 3.5288469097566992e-06, + "logits/chosen": -0.2787512540817261, + "logits/rejected": -0.4261326491832733, + "logps/chosen": -55.774635314941406, + "logps/rejected": -86.41732788085938, + "loss": 0.7701, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.71514630317688, + "rewards/margins": 5.025588512420654, + "rewards/rejected": -2.3104422092437744, + "step": 3568 + }, + { + "epoch": 0.89, + "grad_norm": 5.915582656860352, + "learning_rate": 3.526343335673573e-06, + "logits/chosen": -0.3081630766391754, + "logits/rejected": -0.3898216784000397, + "logps/chosen": -54.76791763305664, + "logps/rejected": -87.96306610107422, + "loss": 0.7218, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7929470539093018, + "rewards/margins": 4.54376220703125, + "rewards/rejected": -1.75081467628479, + "step": 3569 + }, + { + "epoch": 0.89, + "grad_norm": 11.046170234680176, + "learning_rate": 3.5238401661410016e-06, + "logits/chosen": -0.3110209107398987, + "logits/rejected": -0.3995549976825714, + "logps/chosen": -55.63154602050781, + "logps/rejected": -92.05232238769531, + "loss": 0.8114, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6724071502685547, + "rewards/margins": 4.639831066131592, + "rewards/rejected": -1.9674240350723267, + "step": 3570 + }, + { + "epoch": 0.89, + "grad_norm": 5.054980278015137, + "learning_rate": 3.521337401846158e-06, + "logits/chosen": -0.33339595794677734, + "logits/rejected": -0.3982199430465698, + "logps/chosen": -65.88631439208984, + "logps/rejected": -80.02971649169922, + "loss": 0.7752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.901693344116211, + "rewards/margins": 4.308835983276367, + "rewards/rejected": -1.4071428775787354, + "step": 3571 + }, + { + "epoch": 0.89, + "grad_norm": 4.775091648101807, + "learning_rate": 3.518835043476103e-06, + "logits/chosen": -0.3205023407936096, + "logits/rejected": -0.44656801223754883, + "logps/chosen": -58.400047302246094, + "logps/rejected": -91.65972900390625, + "loss": 0.7506, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6819992065429688, + "rewards/margins": 5.2407636642456055, + "rewards/rejected": -2.558764934539795, + "step": 3572 + }, + { + "epoch": 0.89, + "grad_norm": 18.691957473754883, + "learning_rate": 3.516333091717792e-06, + "logits/chosen": -0.39789360761642456, + "logits/rejected": -0.48832499980926514, + "logps/chosen": -56.04457092285156, + "logps/rejected": -82.6270523071289, + "loss": 0.8627, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.923457622528076, + "rewards/margins": 4.438371658325195, + "rewards/rejected": -1.51491379737854, + "step": 3573 + }, + { + "epoch": 0.89, + "grad_norm": 4.671407222747803, + "learning_rate": 3.51383154725806e-06, + "logits/chosen": -0.2674921452999115, + "logits/rejected": -0.3478483557701111, + "logps/chosen": -59.69335174560547, + "logps/rejected": -94.36395263671875, + "loss": 0.7324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0015389919281006, + "rewards/margins": 4.7762017250061035, + "rewards/rejected": -1.7746626138687134, + "step": 3574 + }, + { + "epoch": 0.89, + "grad_norm": 2.8237738609313965, + "learning_rate": 3.5113304107836386e-06, + "logits/chosen": -0.3453029990196228, + "logits/rejected": -0.48975318670272827, + "logps/chosen": -56.747215270996094, + "logps/rejected": -78.26986694335938, + "loss": 0.646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0345299243927, + "rewards/margins": 5.41951847076416, + "rewards/rejected": -2.384988784790039, + "step": 3575 + }, + { + "epoch": 0.89, + "grad_norm": 15.713354110717773, + "learning_rate": 3.508829682981143e-06, + "logits/chosen": -0.2429179698228836, + "logits/rejected": -0.2989910840988159, + "logps/chosen": -55.703582763671875, + "logps/rejected": -74.56555938720703, + "loss": 0.7049, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6806211471557617, + "rewards/margins": 3.7776224613189697, + "rewards/rejected": -1.097001314163208, + "step": 3576 + }, + { + "epoch": 0.89, + "grad_norm": 4.454945087432861, + "learning_rate": 3.506329364537074e-06, + "logits/chosen": -0.3317177891731262, + "logits/rejected": -0.42183154821395874, + "logps/chosen": -52.59981918334961, + "logps/rejected": -79.14543914794922, + "loss": 0.7149, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.044391632080078, + "rewards/margins": 4.220727920532227, + "rewards/rejected": -1.1763356924057007, + "step": 3577 + }, + { + "epoch": 0.9, + "grad_norm": 3.1869001388549805, + "learning_rate": 3.5038294561378283e-06, + "logits/chosen": -0.2839418947696686, + "logits/rejected": -0.4039354920387268, + "logps/chosen": -53.26649475097656, + "logps/rejected": -69.4798583984375, + "loss": 0.7271, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2301292419433594, + "rewards/margins": 4.770484924316406, + "rewards/rejected": -1.540356159210205, + "step": 3578 + }, + { + "epoch": 0.9, + "grad_norm": 5.987605094909668, + "learning_rate": 3.501329958469678e-06, + "logits/chosen": -0.4156437814235687, + "logits/rejected": -0.5294138193130493, + "logps/chosen": -51.98729705810547, + "logps/rejected": -76.53563690185547, + "loss": 0.7816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0332891941070557, + "rewards/margins": 4.418083667755127, + "rewards/rejected": -1.3847944736480713, + "step": 3579 + }, + { + "epoch": 0.9, + "grad_norm": 5.559040546417236, + "learning_rate": 3.4988308722187963e-06, + "logits/chosen": -0.17147500813007355, + "logits/rejected": -0.32089561223983765, + "logps/chosen": -65.92162322998047, + "logps/rejected": -96.71284484863281, + "loss": 0.7776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.729038715362549, + "rewards/margins": 4.504322528839111, + "rewards/rejected": -1.7752838134765625, + "step": 3580 + }, + { + "epoch": 0.9, + "grad_norm": 3.377053737640381, + "learning_rate": 3.4963321980712317e-06, + "logits/chosen": -0.299020379781723, + "logits/rejected": -0.34636828303337097, + "logps/chosen": -51.72996520996094, + "logps/rejected": -85.28878021240234, + "loss": 0.6965, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0143356323242188, + "rewards/margins": 4.787107467651367, + "rewards/rejected": -1.772771954536438, + "step": 3581 + }, + { + "epoch": 0.9, + "grad_norm": 7.238025188446045, + "learning_rate": 3.493833936712925e-06, + "logits/chosen": -0.34279659390449524, + "logits/rejected": -0.4588525891304016, + "logps/chosen": -53.431636810302734, + "logps/rejected": -87.0582504272461, + "loss": 0.7021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.885624647140503, + "rewards/margins": 5.622858047485352, + "rewards/rejected": -2.7372331619262695, + "step": 3582 + }, + { + "epoch": 0.9, + "grad_norm": 7.7391862869262695, + "learning_rate": 3.491336088829703e-06, + "logits/chosen": -0.32039767503738403, + "logits/rejected": -0.40895527601242065, + "logps/chosen": -54.74591064453125, + "logps/rejected": -73.66819763183594, + "loss": 0.8248, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.52061128616333, + "rewards/margins": 4.295165538787842, + "rewards/rejected": -1.7745548486709595, + "step": 3583 + }, + { + "epoch": 0.9, + "grad_norm": 4.477625370025635, + "learning_rate": 3.488838655107279e-06, + "logits/chosen": -0.28209102153778076, + "logits/rejected": -0.39050403237342834, + "logps/chosen": -70.83198547363281, + "logps/rejected": -84.93045043945312, + "loss": 0.8134, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.21040940284729, + "rewards/margins": 5.484251976013184, + "rewards/rejected": -2.2738423347473145, + "step": 3584 + }, + { + "epoch": 0.9, + "grad_norm": 4.694389820098877, + "learning_rate": 3.486341636231253e-06, + "logits/chosen": -0.3234049379825592, + "logits/rejected": -0.34971311688423157, + "logps/chosen": -61.88931655883789, + "logps/rejected": -88.7817153930664, + "loss": 0.861, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9332115650177, + "rewards/margins": 3.9904651641845703, + "rewards/rejected": -1.0572532415390015, + "step": 3585 + }, + { + "epoch": 0.9, + "grad_norm": 3.409897804260254, + "learning_rate": 3.483845032887111e-06, + "logits/chosen": -0.375265896320343, + "logits/rejected": -0.49495929479599, + "logps/chosen": -65.51490020751953, + "logps/rejected": -71.25393676757812, + "loss": 0.6552, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.932673454284668, + "rewards/margins": 4.568192481994629, + "rewards/rejected": -1.6355191469192505, + "step": 3586 + }, + { + "epoch": 0.9, + "grad_norm": 4.58223295211792, + "learning_rate": 3.4813488457602206e-06, + "logits/chosen": -0.3393198847770691, + "logits/rejected": -0.46703314781188965, + "logps/chosen": -60.459983825683594, + "logps/rejected": -81.00927734375, + "loss": 0.7925, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.855954885482788, + "rewards/margins": 4.87074089050293, + "rewards/rejected": -2.0147862434387207, + "step": 3587 + }, + { + "epoch": 0.9, + "grad_norm": 6.715905666351318, + "learning_rate": 3.4788530755358453e-06, + "logits/chosen": -0.2622228264808655, + "logits/rejected": -0.3737719655036926, + "logps/chosen": -63.76835632324219, + "logps/rejected": -81.24891662597656, + "loss": 0.6937, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.085285186767578, + "rewards/margins": 4.953472137451172, + "rewards/rejected": -1.8681869506835938, + "step": 3588 + }, + { + "epoch": 0.9, + "grad_norm": 4.368579387664795, + "learning_rate": 3.476357722899121e-06, + "logits/chosen": -0.3993898034095764, + "logits/rejected": -0.41823071241378784, + "logps/chosen": -67.28242492675781, + "logps/rejected": -94.61982727050781, + "loss": 0.8184, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9438562393188477, + "rewards/margins": 4.635744094848633, + "rewards/rejected": -1.6918878555297852, + "step": 3589 + }, + { + "epoch": 0.9, + "grad_norm": 14.845057487487793, + "learning_rate": 3.473862788535083e-06, + "logits/chosen": -0.34609636664390564, + "logits/rejected": -0.3489644229412079, + "logps/chosen": -53.20240020751953, + "logps/rejected": -92.41754150390625, + "loss": 0.7578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7871768474578857, + "rewards/margins": 4.308737754821777, + "rewards/rejected": -1.5215611457824707, + "step": 3590 + }, + { + "epoch": 0.9, + "grad_norm": 10.96911334991455, + "learning_rate": 3.4713682731286392e-06, + "logits/chosen": -0.32697704434394836, + "logits/rejected": -0.4086584150791168, + "logps/chosen": -55.0831298828125, + "logps/rejected": -81.28895568847656, + "loss": 0.7751, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4652180671691895, + "rewards/margins": 4.348709583282471, + "rewards/rejected": -1.8834913969039917, + "step": 3591 + }, + { + "epoch": 0.9, + "grad_norm": 6.012779235839844, + "learning_rate": 3.468874177364589e-06, + "logits/chosen": -0.2792898714542389, + "logits/rejected": -0.3287406265735626, + "logps/chosen": -47.98699951171875, + "logps/rejected": -90.11445617675781, + "loss": 0.9221, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9070277214050293, + "rewards/margins": 3.7055726051330566, + "rewards/rejected": -0.7985448837280273, + "step": 3592 + }, + { + "epoch": 0.9, + "grad_norm": 18.56855583190918, + "learning_rate": 3.4663805019276174e-06, + "logits/chosen": -0.3356884717941284, + "logits/rejected": -0.43711504340171814, + "logps/chosen": -62.32204055786133, + "logps/rejected": -74.85983276367188, + "loss": 1.018, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.741732120513916, + "rewards/margins": 2.957535743713379, + "rewards/rejected": -0.21580353379249573, + "step": 3593 + }, + { + "epoch": 0.9, + "grad_norm": 3.477623701095581, + "learning_rate": 3.46388724750229e-06, + "logits/chosen": -0.23399712145328522, + "logits/rejected": -0.3234517276287079, + "logps/chosen": -57.36677551269531, + "logps/rejected": -91.76289367675781, + "loss": 0.6871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7753028869628906, + "rewards/margins": 4.558207035064697, + "rewards/rejected": -1.7829041481018066, + "step": 3594 + }, + { + "epoch": 0.9, + "grad_norm": 8.653770446777344, + "learning_rate": 3.4613944147730606e-06, + "logits/chosen": -0.24936562776565552, + "logits/rejected": -0.33796849846839905, + "logps/chosen": -63.900917053222656, + "logps/rejected": -73.43199920654297, + "loss": 0.9429, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.541501998901367, + "rewards/margins": 3.5443320274353027, + "rewards/rejected": -1.0028302669525146, + "step": 3595 + }, + { + "epoch": 0.9, + "grad_norm": 5.142205238342285, + "learning_rate": 3.458902004424265e-06, + "logits/chosen": -0.3689662218093872, + "logits/rejected": -0.41783562302589417, + "logps/chosen": -51.811431884765625, + "logps/rejected": -76.48197937011719, + "loss": 0.8057, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.987213134765625, + "rewards/margins": 3.9044928550720215, + "rewards/rejected": -0.9172797203063965, + "step": 3596 + }, + { + "epoch": 0.9, + "grad_norm": 8.0028715133667, + "learning_rate": 3.4564100171401216e-06, + "logits/chosen": -0.3951188921928406, + "logits/rejected": -0.46898338198661804, + "logps/chosen": -62.698360443115234, + "logps/rejected": -92.04549407958984, + "loss": 0.8166, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6415648460388184, + "rewards/margins": 4.541203498840332, + "rewards/rejected": -1.8996386528015137, + "step": 3597 + }, + { + "epoch": 0.9, + "grad_norm": 6.684895038604736, + "learning_rate": 3.45391845360474e-06, + "logits/chosen": -0.33596739172935486, + "logits/rejected": -0.49808183312416077, + "logps/chosen": -71.72731018066406, + "logps/rejected": -79.15230560302734, + "loss": 0.8633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.952712059020996, + "rewards/margins": 4.707842826843262, + "rewards/rejected": -1.7551305294036865, + "step": 3598 + }, + { + "epoch": 0.9, + "grad_norm": 4.898581504821777, + "learning_rate": 3.4514273145021003e-06, + "logits/chosen": -0.39077135920524597, + "logits/rejected": -0.4725147485733032, + "logps/chosen": -58.910091400146484, + "logps/rejected": -79.79264068603516, + "loss": 0.8003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.868612289428711, + "rewards/margins": 3.6637330055236816, + "rewards/rejected": -0.7951210737228394, + "step": 3599 + }, + { + "epoch": 0.9, + "grad_norm": 7.940664291381836, + "learning_rate": 3.4489366005160823e-06, + "logits/chosen": -0.3650195002555847, + "logits/rejected": -0.5297693014144897, + "logps/chosen": -60.091712951660156, + "logps/rejected": -82.45594024658203, + "loss": 0.6848, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.942894458770752, + "rewards/margins": 4.945642948150635, + "rewards/rejected": -2.002748966217041, + "step": 3600 + }, + { + "epoch": 0.9, + "grad_norm": 10.12127685546875, + "learning_rate": 3.446446312330435e-06, + "logits/chosen": -0.3017216920852661, + "logits/rejected": -0.36028623580932617, + "logps/chosen": -68.87674713134766, + "logps/rejected": -80.88387298583984, + "loss": 0.8827, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5772106647491455, + "rewards/margins": 3.743497848510742, + "rewards/rejected": -1.1662871837615967, + "step": 3601 + }, + { + "epoch": 0.9, + "grad_norm": 3.1837947368621826, + "learning_rate": 3.443956450628798e-06, + "logits/chosen": -0.2899550497531891, + "logits/rejected": -0.40692785382270813, + "logps/chosen": -60.898277282714844, + "logps/rejected": -87.0475082397461, + "loss": 0.6428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8107519149780273, + "rewards/margins": 4.588687419891357, + "rewards/rejected": -1.77793550491333, + "step": 3602 + }, + { + "epoch": 0.9, + "grad_norm": 4.523251056671143, + "learning_rate": 3.441467016094693e-06, + "logits/chosen": -0.31061309576034546, + "logits/rejected": -0.40957796573638916, + "logps/chosen": -60.34979248046875, + "logps/rejected": -78.69747924804688, + "loss": 0.7157, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9774692058563232, + "rewards/margins": 4.233269214630127, + "rewards/rejected": -1.2558001279830933, + "step": 3603 + }, + { + "epoch": 0.9, + "grad_norm": 11.060761451721191, + "learning_rate": 3.4389780094115206e-06, + "logits/chosen": -0.3528088331222534, + "logits/rejected": -0.39655205607414246, + "logps/chosen": -52.44121551513672, + "logps/rejected": -83.39900970458984, + "loss": 0.7826, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8559885025024414, + "rewards/margins": 4.161036014556885, + "rewards/rejected": -1.3050473928451538, + "step": 3604 + }, + { + "epoch": 0.9, + "grad_norm": 4.190938472747803, + "learning_rate": 3.436489431262571e-06, + "logits/chosen": -0.33710092306137085, + "logits/rejected": -0.3664979636669159, + "logps/chosen": -55.98324203491211, + "logps/rejected": -102.42904663085938, + "loss": 0.7571, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7363641262054443, + "rewards/margins": 4.347066879272461, + "rewards/rejected": -1.6107028722763062, + "step": 3605 + }, + { + "epoch": 0.9, + "grad_norm": 5.015069961547852, + "learning_rate": 3.434001282331009e-06, + "logits/chosen": -0.2957160770893097, + "logits/rejected": -0.46592986583709717, + "logps/chosen": -54.36188507080078, + "logps/rejected": -76.01228332519531, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8809947967529297, + "rewards/margins": 5.823605537414551, + "rewards/rejected": -2.942610502243042, + "step": 3606 + }, + { + "epoch": 0.9, + "grad_norm": 4.495609283447266, + "learning_rate": 3.4315135632998864e-06, + "logits/chosen": -0.373443067073822, + "logits/rejected": -0.5621671676635742, + "logps/chosen": -63.93406295776367, + "logps/rejected": -81.18556213378906, + "loss": 0.7339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8327081203460693, + "rewards/margins": 4.9913649559021, + "rewards/rejected": -2.1586568355560303, + "step": 3607 + }, + { + "epoch": 0.9, + "grad_norm": 4.941033363342285, + "learning_rate": 3.429026274852137e-06, + "logits/chosen": -0.41635820269584656, + "logits/rejected": -0.518997848033905, + "logps/chosen": -58.87406921386719, + "logps/rejected": -88.09632873535156, + "loss": 0.742, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.910590171813965, + "rewards/margins": 5.864732265472412, + "rewards/rejected": -2.9541420936584473, + "step": 3608 + }, + { + "epoch": 0.9, + "grad_norm": 10.502106666564941, + "learning_rate": 3.426539417670573e-06, + "logits/chosen": -0.26107698678970337, + "logits/rejected": -0.40055590867996216, + "logps/chosen": -59.8648567199707, + "logps/rejected": -86.53748321533203, + "loss": 0.8001, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5773160457611084, + "rewards/margins": 5.0053205490112305, + "rewards/rejected": -2.428004741668701, + "step": 3609 + }, + { + "epoch": 0.9, + "grad_norm": 5.1443986892700195, + "learning_rate": 3.4240529924378933e-06, + "logits/chosen": -0.4085453450679779, + "logits/rejected": -0.4997497797012329, + "logps/chosen": -60.804325103759766, + "logps/rejected": -77.20211029052734, + "loss": 0.8191, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5792009830474854, + "rewards/margins": 4.3785858154296875, + "rewards/rejected": -1.7993849515914917, + "step": 3610 + }, + { + "epoch": 0.9, + "grad_norm": 4.897305011749268, + "learning_rate": 3.421566999836673e-06, + "logits/chosen": -0.34064555168151855, + "logits/rejected": -0.44688498973846436, + "logps/chosen": -64.02410125732422, + "logps/rejected": -80.57550811767578, + "loss": 0.8432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.014618158340454, + "rewards/margins": 5.131062030792236, + "rewards/rejected": -2.1164441108703613, + "step": 3611 + }, + { + "epoch": 0.9, + "grad_norm": 4.418508529663086, + "learning_rate": 3.4190814405493687e-06, + "logits/chosen": -0.3794991970062256, + "logits/rejected": -0.4630519151687622, + "logps/chosen": -59.604942321777344, + "logps/rejected": -73.95005798339844, + "loss": 0.794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9067468643188477, + "rewards/margins": 3.9274163246154785, + "rewards/rejected": -1.0206693410873413, + "step": 3612 + }, + { + "epoch": 0.9, + "grad_norm": 3.9192473888397217, + "learning_rate": 3.4165963152583246e-06, + "logits/chosen": -0.477183073759079, + "logits/rejected": -0.5403723120689392, + "logps/chosen": -52.958553314208984, + "logps/rejected": -75.15569305419922, + "loss": 0.7281, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.061358690261841, + "rewards/margins": 4.511743545532227, + "rewards/rejected": -1.4503847360610962, + "step": 3613 + }, + { + "epoch": 0.9, + "grad_norm": 5.109883785247803, + "learning_rate": 3.4141116246457583e-06, + "logits/chosen": -0.36993396282196045, + "logits/rejected": -0.4231865108013153, + "logps/chosen": -58.450721740722656, + "logps/rejected": -84.98841857910156, + "loss": 0.7068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.756671905517578, + "rewards/margins": 4.300900459289551, + "rewards/rejected": -1.5442283153533936, + "step": 3614 + }, + { + "epoch": 0.9, + "grad_norm": 14.04738998413086, + "learning_rate": 3.411627369393773e-06, + "logits/chosen": -0.29840514063835144, + "logits/rejected": -0.37950778007507324, + "logps/chosen": -66.75674438476562, + "logps/rejected": -89.53557586669922, + "loss": 0.9565, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.653846502304077, + "rewards/margins": 4.422951698303223, + "rewards/rejected": -1.7691049575805664, + "step": 3615 + }, + { + "epoch": 0.9, + "grad_norm": 30.482629776000977, + "learning_rate": 3.4091435501843486e-06, + "logits/chosen": -0.2879839837551117, + "logits/rejected": -0.42080092430114746, + "logps/chosen": -69.15933990478516, + "logps/rejected": -75.59193420410156, + "loss": 0.9646, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.603348731994629, + "rewards/margins": 4.229432106018066, + "rewards/rejected": -1.626083254814148, + "step": 3616 + }, + { + "epoch": 0.9, + "grad_norm": 3.6012027263641357, + "learning_rate": 3.4066601676993468e-06, + "logits/chosen": -0.2463988959789276, + "logits/rejected": -0.3792761266231537, + "logps/chosen": -62.69147491455078, + "logps/rejected": -87.343505859375, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9610371589660645, + "rewards/margins": 4.689510822296143, + "rewards/rejected": -1.72847318649292, + "step": 3617 + }, + { + "epoch": 0.91, + "grad_norm": 4.869377613067627, + "learning_rate": 3.404177222620511e-06, + "logits/chosen": -0.38197389245033264, + "logits/rejected": -0.4432428181171417, + "logps/chosen": -53.79975509643555, + "logps/rejected": -84.30317687988281, + "loss": 0.7575, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.777914524078369, + "rewards/margins": 4.254757404327393, + "rewards/rejected": -1.4768425226211548, + "step": 3618 + }, + { + "epoch": 0.91, + "grad_norm": 8.683509826660156, + "learning_rate": 3.4016947156294623e-06, + "logits/chosen": -0.28094491362571716, + "logits/rejected": -0.42576077580451965, + "logps/chosen": -60.88224792480469, + "logps/rejected": -81.92324829101562, + "loss": 0.7499, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.970325469970703, + "rewards/margins": 4.81912899017334, + "rewards/rejected": -1.8488037586212158, + "step": 3619 + }, + { + "epoch": 0.91, + "grad_norm": 3.677720785140991, + "learning_rate": 3.3992126474077035e-06, + "logits/chosen": -0.37630903720855713, + "logits/rejected": -0.4881896674633026, + "logps/chosen": -53.14991760253906, + "logps/rejected": -91.67304992675781, + "loss": 0.7699, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.964290142059326, + "rewards/margins": 5.250143051147461, + "rewards/rejected": -2.285853385925293, + "step": 3620 + }, + { + "epoch": 0.91, + "grad_norm": 7.681219577789307, + "learning_rate": 3.396731018636617e-06, + "logits/chosen": -0.326478511095047, + "logits/rejected": -0.4262702763080597, + "logps/chosen": -58.264610290527344, + "logps/rejected": -68.56370544433594, + "loss": 0.859, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.913302421569824, + "rewards/margins": 3.5659775733947754, + "rewards/rejected": -0.6526749134063721, + "step": 3621 + }, + { + "epoch": 0.91, + "grad_norm": 3.3632006645202637, + "learning_rate": 3.394249829997458e-06, + "logits/chosen": -0.3042030334472656, + "logits/rejected": -0.42749056220054626, + "logps/chosen": -60.08019256591797, + "logps/rejected": -88.89118957519531, + "loss": 0.6574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763272762298584, + "rewards/margins": 5.047625541687012, + "rewards/rejected": -2.2843525409698486, + "step": 3622 + }, + { + "epoch": 0.91, + "grad_norm": 5.662586212158203, + "learning_rate": 3.3917690821713736e-06, + "logits/chosen": -0.34807804226875305, + "logits/rejected": -0.47581517696380615, + "logps/chosen": -62.479312896728516, + "logps/rejected": -84.47982788085938, + "loss": 0.779, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9807801246643066, + "rewards/margins": 3.9656691551208496, + "rewards/rejected": -0.9848892688751221, + "step": 3623 + }, + { + "epoch": 0.91, + "grad_norm": 6.822195529937744, + "learning_rate": 3.3892887758393766e-06, + "logits/chosen": -0.3638896942138672, + "logits/rejected": -0.4151526093482971, + "logps/chosen": -51.515811920166016, + "logps/rejected": -85.0691146850586, + "loss": 0.7958, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8895394802093506, + "rewards/margins": 5.146076679229736, + "rewards/rejected": -2.256537437438965, + "step": 3624 + }, + { + "epoch": 0.91, + "grad_norm": 10.203275680541992, + "learning_rate": 3.3868089116823678e-06, + "logits/chosen": -0.4134622812271118, + "logits/rejected": -0.46203169226646423, + "logps/chosen": -46.35633850097656, + "logps/rejected": -83.71985626220703, + "loss": 0.7572, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9806487560272217, + "rewards/margins": 4.196814060211182, + "rewards/rejected": -1.2161654233932495, + "step": 3625 + }, + { + "epoch": 0.91, + "grad_norm": 10.202530860900879, + "learning_rate": 3.384329490381123e-06, + "logits/chosen": -0.3809303641319275, + "logits/rejected": -0.43418172001838684, + "logps/chosen": -60.232749938964844, + "logps/rejected": -74.63572692871094, + "loss": 0.9758, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6384668350219727, + "rewards/margins": 3.813778877258301, + "rewards/rejected": -1.1753119230270386, + "step": 3626 + }, + { + "epoch": 0.91, + "grad_norm": 3.883472204208374, + "learning_rate": 3.381850512616296e-06, + "logits/chosen": -0.34442421793937683, + "logits/rejected": -0.45191389322280884, + "logps/chosen": -65.35476684570312, + "logps/rejected": -72.15989685058594, + "loss": 0.7691, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6796839237213135, + "rewards/margins": 4.375411033630371, + "rewards/rejected": -1.6957279443740845, + "step": 3627 + }, + { + "epoch": 0.91, + "grad_norm": 5.658717632293701, + "learning_rate": 3.37937197906842e-06, + "logits/chosen": -0.36742231249809265, + "logits/rejected": -0.4991547465324402, + "logps/chosen": -52.25732421875, + "logps/rejected": -76.90892791748047, + "loss": 0.7955, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.731550931930542, + "rewards/margins": 4.433035850524902, + "rewards/rejected": -1.7014849185943604, + "step": 3628 + }, + { + "epoch": 0.91, + "grad_norm": 6.265352725982666, + "learning_rate": 3.376893890417906e-06, + "logits/chosen": -0.2740567624568939, + "logits/rejected": -0.36790338158607483, + "logps/chosen": -58.79853057861328, + "logps/rejected": -88.30130004882812, + "loss": 0.7595, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7778191566467285, + "rewards/margins": 3.9627702236175537, + "rewards/rejected": -1.1849507093429565, + "step": 3629 + }, + { + "epoch": 0.91, + "grad_norm": 7.719736099243164, + "learning_rate": 3.3744162473450435e-06, + "logits/chosen": -0.3316044211387634, + "logits/rejected": -0.4452883303165436, + "logps/chosen": -60.11357879638672, + "logps/rejected": -83.75342559814453, + "loss": 0.7211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.696366786956787, + "rewards/margins": 4.7567338943481445, + "rewards/rejected": -2.0603668689727783, + "step": 3630 + }, + { + "epoch": 0.91, + "grad_norm": 3.9632184505462646, + "learning_rate": 3.37193905053e-06, + "logits/chosen": -0.35864758491516113, + "logits/rejected": -0.46628671884536743, + "logps/chosen": -56.04530334472656, + "logps/rejected": -81.84422302246094, + "loss": 0.6981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.989119052886963, + "rewards/margins": 5.133507251739502, + "rewards/rejected": -2.144388198852539, + "step": 3631 + }, + { + "epoch": 0.91, + "grad_norm": 10.406654357910156, + "learning_rate": 3.3694623006528148e-06, + "logits/chosen": -0.3789587616920471, + "logits/rejected": -0.5087695717811584, + "logps/chosen": -68.17086791992188, + "logps/rejected": -96.72962188720703, + "loss": 0.7825, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.2523632049560547, + "rewards/margins": 5.5797624588012695, + "rewards/rejected": -3.3273985385894775, + "step": 3632 + }, + { + "epoch": 0.91, + "grad_norm": 7.611649036407471, + "learning_rate": 3.366985998393415e-06, + "logits/chosen": -0.2998586595058441, + "logits/rejected": -0.428069144487381, + "logps/chosen": -68.28792572021484, + "logps/rejected": -85.35871887207031, + "loss": 0.7428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8820688724517822, + "rewards/margins": 4.914510250091553, + "rewards/rejected": -2.0324411392211914, + "step": 3633 + }, + { + "epoch": 0.91, + "grad_norm": 6.5320210456848145, + "learning_rate": 3.364510144431593e-06, + "logits/chosen": -0.25861284136772156, + "logits/rejected": -0.4054839611053467, + "logps/chosen": -64.15054321289062, + "logps/rejected": -68.34751892089844, + "loss": 0.8094, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6096649169921875, + "rewards/margins": 3.836345911026001, + "rewards/rejected": -1.226680874824524, + "step": 3634 + }, + { + "epoch": 0.91, + "grad_norm": 7.403895378112793, + "learning_rate": 3.362034739447031e-06, + "logits/chosen": -0.23235571384429932, + "logits/rejected": -0.332383394241333, + "logps/chosen": -65.85675048828125, + "logps/rejected": -77.58464050292969, + "loss": 0.8636, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.788055896759033, + "rewards/margins": 3.051980972290039, + "rewards/rejected": -0.2639250159263611, + "step": 3635 + }, + { + "epoch": 0.91, + "grad_norm": 3.5523107051849365, + "learning_rate": 3.359559784119277e-06, + "logits/chosen": -0.2625308930873871, + "logits/rejected": -0.3726518154144287, + "logps/chosen": -67.00260925292969, + "logps/rejected": -93.59017181396484, + "loss": 0.7398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.575531005859375, + "rewards/margins": 4.880173206329346, + "rewards/rejected": -2.3046422004699707, + "step": 3636 + }, + { + "epoch": 0.91, + "grad_norm": 5.528594970703125, + "learning_rate": 3.357085279127758e-06, + "logits/chosen": -0.26024916768074036, + "logits/rejected": -0.34187257289886475, + "logps/chosen": -61.65449905395508, + "logps/rejected": -102.1510009765625, + "loss": 0.8211, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.752347469329834, + "rewards/margins": 4.253911018371582, + "rewards/rejected": -1.5015639066696167, + "step": 3637 + }, + { + "epoch": 0.91, + "grad_norm": 3.9677934646606445, + "learning_rate": 3.354611225151783e-06, + "logits/chosen": -0.3271873891353607, + "logits/rejected": -0.44166243076324463, + "logps/chosen": -51.61951446533203, + "logps/rejected": -80.70550537109375, + "loss": 0.6304, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.953127145767212, + "rewards/margins": 5.544256687164307, + "rewards/rejected": -2.5911295413970947, + "step": 3638 + }, + { + "epoch": 0.91, + "grad_norm": 7.6629533767700195, + "learning_rate": 3.35213762287053e-06, + "logits/chosen": -0.34308579564094543, + "logits/rejected": -0.34603065252304077, + "logps/chosen": -57.481605529785156, + "logps/rejected": -87.41044616699219, + "loss": 0.8059, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5150136947631836, + "rewards/margins": 3.172762393951416, + "rewards/rejected": -0.6577486395835876, + "step": 3639 + }, + { + "epoch": 0.91, + "grad_norm": 3.0613174438476562, + "learning_rate": 3.349664472963059e-06, + "logits/chosen": -0.3033255636692047, + "logits/rejected": -0.3844870328903198, + "logps/chosen": -57.21216583251953, + "logps/rejected": -88.17301940917969, + "loss": 0.6752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6614058017730713, + "rewards/margins": 5.854578018188477, + "rewards/rejected": -3.1931722164154053, + "step": 3640 + }, + { + "epoch": 0.91, + "grad_norm": 4.175985336303711, + "learning_rate": 3.3471917761083018e-06, + "logits/chosen": -0.3265061378479004, + "logits/rejected": -0.4327229857444763, + "logps/chosen": -61.791954040527344, + "logps/rejected": -88.29901885986328, + "loss": 0.7354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.820911407470703, + "rewards/margins": 4.91734504699707, + "rewards/rejected": -2.0964341163635254, + "step": 3641 + }, + { + "epoch": 0.91, + "grad_norm": 5.888408660888672, + "learning_rate": 3.3447195329850646e-06, + "logits/chosen": -0.36866050958633423, + "logits/rejected": -0.4488529562950134, + "logps/chosen": -52.53285598754883, + "logps/rejected": -76.04269409179688, + "loss": 0.7562, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.811837911605835, + "rewards/margins": 4.397380828857422, + "rewards/rejected": -1.5855425596237183, + "step": 3642 + }, + { + "epoch": 0.91, + "grad_norm": 17.521869659423828, + "learning_rate": 3.3422477442720364e-06, + "logits/chosen": -0.30850088596343994, + "logits/rejected": -0.4494099020957947, + "logps/chosen": -72.63729095458984, + "logps/rejected": -71.49189758300781, + "loss": 1.0299, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.625044584274292, + "rewards/margins": 4.344025135040283, + "rewards/rejected": -1.7189807891845703, + "step": 3643 + }, + { + "epoch": 0.91, + "grad_norm": 5.483885288238525, + "learning_rate": 3.339776410647771e-06, + "logits/chosen": -0.33474066853523254, + "logits/rejected": -0.4483897089958191, + "logps/chosen": -61.11760711669922, + "logps/rejected": -71.99775695800781, + "loss": 0.6967, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874659299850464, + "rewards/margins": 4.373928546905518, + "rewards/rejected": -1.4992693662643433, + "step": 3644 + }, + { + "epoch": 0.91, + "grad_norm": 2.9101052284240723, + "learning_rate": 3.337305532790709e-06, + "logits/chosen": -0.3381780982017517, + "logits/rejected": -0.41440027952194214, + "logps/chosen": -70.33719635009766, + "logps/rejected": -98.04146575927734, + "loss": 0.6677, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.963134288787842, + "rewards/margins": 5.920056343078613, + "rewards/rejected": -2.9569225311279297, + "step": 3645 + }, + { + "epoch": 0.91, + "grad_norm": 6.057355880737305, + "learning_rate": 3.334835111379154e-06, + "logits/chosen": -0.34439292550086975, + "logits/rejected": -0.4244931936264038, + "logps/chosen": -55.432498931884766, + "logps/rejected": -87.94634246826172, + "loss": 0.7936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7921693325042725, + "rewards/margins": 4.304164409637451, + "rewards/rejected": -1.51199471950531, + "step": 3646 + }, + { + "epoch": 0.91, + "grad_norm": 5.849173545837402, + "learning_rate": 3.332365147091293e-06, + "logits/chosen": -0.19823017716407776, + "logits/rejected": -0.28861600160598755, + "logps/chosen": -64.5243911743164, + "logps/rejected": -87.35615539550781, + "loss": 0.9269, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8236958980560303, + "rewards/margins": 3.5336852073669434, + "rewards/rejected": -0.709989070892334, + "step": 3647 + }, + { + "epoch": 0.91, + "grad_norm": 9.092010498046875, + "learning_rate": 3.3298956406051842e-06, + "logits/chosen": -0.3367716073989868, + "logits/rejected": -0.44498294591903687, + "logps/chosen": -44.58681106567383, + "logps/rejected": -72.2774887084961, + "loss": 0.7414, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.79557466506958, + "rewards/margins": 4.349620819091797, + "rewards/rejected": -1.5540456771850586, + "step": 3648 + }, + { + "epoch": 0.91, + "grad_norm": 5.569833278656006, + "learning_rate": 3.327426592598759e-06, + "logits/chosen": -0.35558435320854187, + "logits/rejected": -0.4240880012512207, + "logps/chosen": -54.059654235839844, + "logps/rejected": -78.42703247070312, + "loss": 0.7666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.197277545928955, + "rewards/margins": 4.33951997756958, + "rewards/rejected": -1.1422423124313354, + "step": 3649 + }, + { + "epoch": 0.91, + "grad_norm": 12.720684051513672, + "learning_rate": 3.3249580037498263e-06, + "logits/chosen": -0.35886287689208984, + "logits/rejected": -0.4479137659072876, + "logps/chosen": -44.10771942138672, + "logps/rejected": -86.60038757324219, + "loss": 0.6885, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.72945499420166, + "rewards/margins": 5.072827339172363, + "rewards/rejected": -2.343372344970703, + "step": 3650 + }, + { + "epoch": 0.91, + "grad_norm": 13.182815551757812, + "learning_rate": 3.322489874736067e-06, + "logits/chosen": -0.33384615182876587, + "logits/rejected": -0.4618857204914093, + "logps/chosen": -69.78804779052734, + "logps/rejected": -83.4916763305664, + "loss": 1.0553, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.529924154281616, + "rewards/margins": 3.9575090408325195, + "rewards/rejected": -1.4275850057601929, + "step": 3651 + }, + { + "epoch": 0.91, + "grad_norm": 11.185956954956055, + "learning_rate": 3.320022206235033e-06, + "logits/chosen": -0.31308773159980774, + "logits/rejected": -0.4011031687259674, + "logps/chosen": -60.085750579833984, + "logps/rejected": -83.82710266113281, + "loss": 0.8191, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.624361276626587, + "rewards/margins": 4.3557634353637695, + "rewards/rejected": -1.7314022779464722, + "step": 3652 + }, + { + "epoch": 0.91, + "grad_norm": 7.391114711761475, + "learning_rate": 3.3175549989241564e-06, + "logits/chosen": -0.365245521068573, + "logits/rejected": -0.4600730836391449, + "logps/chosen": -52.39322280883789, + "logps/rejected": -83.78202819824219, + "loss": 0.7232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8956432342529297, + "rewards/margins": 4.692034721374512, + "rewards/rejected": -1.7963913679122925, + "step": 3653 + }, + { + "epoch": 0.91, + "grad_norm": 8.751916885375977, + "learning_rate": 3.315088253480734e-06, + "logits/chosen": -0.3192363381385803, + "logits/rejected": -0.38518890738487244, + "logps/chosen": -62.350074768066406, + "logps/rejected": -91.65782165527344, + "loss": 0.9194, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6728322505950928, + "rewards/margins": 4.529815673828125, + "rewards/rejected": -1.8569831848144531, + "step": 3654 + }, + { + "epoch": 0.91, + "grad_norm": 7.313480377197266, + "learning_rate": 3.312621970581946e-06, + "logits/chosen": -0.2180245816707611, + "logits/rejected": -0.3209184408187866, + "logps/chosen": -67.88557434082031, + "logps/rejected": -91.98402404785156, + "loss": 0.9106, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0174450874328613, + "rewards/margins": 4.464407920837402, + "rewards/rejected": -1.446962594985962, + "step": 3655 + }, + { + "epoch": 0.91, + "grad_norm": 9.368597984313965, + "learning_rate": 3.3101561509048383e-06, + "logits/chosen": -0.24277469515800476, + "logits/rejected": -0.3387444019317627, + "logps/chosen": -55.817298889160156, + "logps/rejected": -77.09809875488281, + "loss": 0.7023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7197694778442383, + "rewards/margins": 4.36444616317749, + "rewards/rejected": -1.6446765661239624, + "step": 3656 + }, + { + "epoch": 0.91, + "grad_norm": 6.465939044952393, + "learning_rate": 3.3076907951263293e-06, + "logits/chosen": -0.39284059405326843, + "logits/rejected": -0.5215213298797607, + "logps/chosen": -57.3061637878418, + "logps/rejected": -87.31037139892578, + "loss": 0.7866, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.832437038421631, + "rewards/margins": 4.51216983795166, + "rewards/rejected": -1.679732322692871, + "step": 3657 + }, + { + "epoch": 0.92, + "grad_norm": 4.242185115814209, + "learning_rate": 3.305225903923215e-06, + "logits/chosen": -0.19562804698944092, + "logits/rejected": -0.31990835070610046, + "logps/chosen": -62.17583465576172, + "logps/rejected": -84.34027862548828, + "loss": 0.853, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.872208595275879, + "rewards/margins": 3.9444046020507812, + "rewards/rejected": -1.0721962451934814, + "step": 3658 + }, + { + "epoch": 0.92, + "grad_norm": 2.8585431575775146, + "learning_rate": 3.3027614779721595e-06, + "logits/chosen": -0.31156235933303833, + "logits/rejected": -0.4257189631462097, + "logps/chosen": -61.68156814575195, + "logps/rejected": -79.8001708984375, + "loss": 0.7021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.012117862701416, + "rewards/margins": 5.442873477935791, + "rewards/rejected": -2.430755376815796, + "step": 3659 + }, + { + "epoch": 0.92, + "grad_norm": 5.227640151977539, + "learning_rate": 3.3002975179497033e-06, + "logits/chosen": -0.29625430703163147, + "logits/rejected": -0.3782324194908142, + "logps/chosen": -51.11548614501953, + "logps/rejected": -73.33816528320312, + "loss": 0.7914, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.977947235107422, + "rewards/margins": 3.4545531272888184, + "rewards/rejected": -0.4766061305999756, + "step": 3660 + }, + { + "epoch": 0.92, + "grad_norm": 5.279911041259766, + "learning_rate": 3.297834024532255e-06, + "logits/chosen": -0.3946895897388458, + "logits/rejected": -0.4175792634487152, + "logps/chosen": -55.160606384277344, + "logps/rejected": -83.11114501953125, + "loss": 0.886, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.118311882019043, + "rewards/margins": 3.846694231033325, + "rewards/rejected": -0.7283825278282166, + "step": 3661 + }, + { + "epoch": 0.92, + "grad_norm": 6.028151035308838, + "learning_rate": 3.2953709983960953e-06, + "logits/chosen": -0.3039431869983673, + "logits/rejected": -0.3875008225440979, + "logps/chosen": -53.837745666503906, + "logps/rejected": -85.29110717773438, + "loss": 0.7107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.825526475906372, + "rewards/margins": 3.9027693271636963, + "rewards/rejected": -1.0772430896759033, + "step": 3662 + }, + { + "epoch": 0.92, + "grad_norm": 3.632319450378418, + "learning_rate": 3.2929084402173804e-06, + "logits/chosen": -0.3053358197212219, + "logits/rejected": -0.45277148485183716, + "logps/chosen": -63.80837631225586, + "logps/rejected": -71.8170166015625, + "loss": 0.7716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6890504360198975, + "rewards/margins": 4.133169651031494, + "rewards/rejected": -1.4441189765930176, + "step": 3663 + }, + { + "epoch": 0.92, + "grad_norm": 6.9557576179504395, + "learning_rate": 3.290446350672134e-06, + "logits/chosen": -0.29458603262901306, + "logits/rejected": -0.357359379529953, + "logps/chosen": -64.90752410888672, + "logps/rejected": -80.22348022460938, + "loss": 0.9075, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.804152250289917, + "rewards/margins": 3.407186508178711, + "rewards/rejected": -0.6030346751213074, + "step": 3664 + }, + { + "epoch": 0.92, + "grad_norm": 3.2358052730560303, + "learning_rate": 3.2879847304362567e-06, + "logits/chosen": -0.2932857573032379, + "logits/rejected": -0.42825669050216675, + "logps/chosen": -64.78581237792969, + "logps/rejected": -76.39022827148438, + "loss": 0.6953, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9833812713623047, + "rewards/margins": 4.385564804077148, + "rewards/rejected": -1.4021834135055542, + "step": 3665 + }, + { + "epoch": 0.92, + "grad_norm": 3.923285722732544, + "learning_rate": 3.2855235801855114e-06, + "logits/chosen": -0.3121274411678314, + "logits/rejected": -0.40486860275268555, + "logps/chosen": -55.50969696044922, + "logps/rejected": -87.18476867675781, + "loss": 0.7339, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9784700870513916, + "rewards/margins": 5.026458740234375, + "rewards/rejected": -2.0479893684387207, + "step": 3666 + }, + { + "epoch": 0.92, + "grad_norm": 3.4072277545928955, + "learning_rate": 3.283062900595537e-06, + "logits/chosen": -0.3663749098777771, + "logits/rejected": -0.46982595324516296, + "logps/chosen": -49.39923095703125, + "logps/rejected": -78.0171127319336, + "loss": 0.6392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8227500915527344, + "rewards/margins": 5.073791980743408, + "rewards/rejected": -2.2510414123535156, + "step": 3667 + }, + { + "epoch": 0.92, + "grad_norm": 4.86316442489624, + "learning_rate": 3.2806026923418473e-06, + "logits/chosen": -0.2717691957950592, + "logits/rejected": -0.3538534343242645, + "logps/chosen": -65.84483337402344, + "logps/rejected": -106.34552764892578, + "loss": 0.7625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.034289836883545, + "rewards/margins": 5.102540016174316, + "rewards/rejected": -2.0682501792907715, + "step": 3668 + }, + { + "epoch": 0.92, + "grad_norm": 9.189984321594238, + "learning_rate": 3.2781429560998186e-06, + "logits/chosen": -0.33603939414024353, + "logits/rejected": -0.39388132095336914, + "logps/chosen": -57.860107421875, + "logps/rejected": -105.8018798828125, + "loss": 0.8997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.77579927444458, + "rewards/margins": 4.12173318862915, + "rewards/rejected": -1.3459335565567017, + "step": 3669 + }, + { + "epoch": 0.92, + "grad_norm": 5.20235013961792, + "learning_rate": 3.2756836925447044e-06, + "logits/chosen": -0.3016410171985626, + "logits/rejected": -0.4594961404800415, + "logps/chosen": -71.09430694580078, + "logps/rejected": -77.26764678955078, + "loss": 0.8991, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.833409309387207, + "rewards/margins": 4.878690719604492, + "rewards/rejected": -2.045280933380127, + "step": 3670 + }, + { + "epoch": 0.92, + "grad_norm": 11.01628303527832, + "learning_rate": 3.273224902351624e-06, + "logits/chosen": -0.29586243629455566, + "logits/rejected": -0.3978070318698883, + "logps/chosen": -60.21669006347656, + "logps/rejected": -92.71572875976562, + "loss": 0.7493, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.841763973236084, + "rewards/margins": 5.227731227874756, + "rewards/rejected": -2.385967254638672, + "step": 3671 + }, + { + "epoch": 0.92, + "grad_norm": 4.643463611602783, + "learning_rate": 3.270766586195568e-06, + "logits/chosen": -0.2851705551147461, + "logits/rejected": -0.4126850664615631, + "logps/chosen": -55.730716705322266, + "logps/rejected": -74.99217224121094, + "loss": 0.7571, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.711965322494507, + "rewards/margins": 4.735392093658447, + "rewards/rejected": -2.0234270095825195, + "step": 3672 + }, + { + "epoch": 0.92, + "grad_norm": 3.213921308517456, + "learning_rate": 3.2683087447513988e-06, + "logits/chosen": -0.30755913257598877, + "logits/rejected": -0.4258072078227997, + "logps/chosen": -64.94432830810547, + "logps/rejected": -95.75506591796875, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7981393337249756, + "rewards/margins": 5.518588066101074, + "rewards/rejected": -2.7204487323760986, + "step": 3673 + }, + { + "epoch": 0.92, + "grad_norm": 3.9618985652923584, + "learning_rate": 3.2658513786938457e-06, + "logits/chosen": -0.37379977107048035, + "logits/rejected": -0.45587682723999023, + "logps/chosen": -53.19570541381836, + "logps/rejected": -86.3143310546875, + "loss": 0.683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.116276264190674, + "rewards/margins": 4.859501361846924, + "rewards/rejected": -1.7432254552841187, + "step": 3674 + }, + { + "epoch": 0.92, + "grad_norm": 5.2651824951171875, + "learning_rate": 3.26339448869751e-06, + "logits/chosen": -0.2796274423599243, + "logits/rejected": -0.42644357681274414, + "logps/chosen": -78.18159484863281, + "logps/rejected": -80.46906280517578, + "loss": 0.8552, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.2708544731140137, + "rewards/margins": 4.385244369506836, + "rewards/rejected": -2.1143901348114014, + "step": 3675 + }, + { + "epoch": 0.92, + "grad_norm": 9.310309410095215, + "learning_rate": 3.2609380754368603e-06, + "logits/chosen": -0.3198932707309723, + "logits/rejected": -0.3764604926109314, + "logps/chosen": -59.511474609375, + "logps/rejected": -87.75609588623047, + "loss": 0.9133, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.749295949935913, + "rewards/margins": 4.08540153503418, + "rewards/rejected": -1.3361057043075562, + "step": 3676 + }, + { + "epoch": 0.92, + "grad_norm": 2.9950296878814697, + "learning_rate": 3.2584821395862327e-06, + "logits/chosen": -0.3415944278240204, + "logits/rejected": -0.45666852593421936, + "logps/chosen": -62.63319396972656, + "logps/rejected": -85.9903335571289, + "loss": 0.6565, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019270658493042, + "rewards/margins": 5.3137688636779785, + "rewards/rejected": -2.2944984436035156, + "step": 3677 + }, + { + "epoch": 0.92, + "grad_norm": 4.094259262084961, + "learning_rate": 3.2560266818198393e-06, + "logits/chosen": -0.31745749711990356, + "logits/rejected": -0.3916466534137726, + "logps/chosen": -55.102577209472656, + "logps/rejected": -83.50154113769531, + "loss": 0.705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.856140613555908, + "rewards/margins": 4.349672317504883, + "rewards/rejected": -1.4935317039489746, + "step": 3678 + }, + { + "epoch": 0.92, + "grad_norm": 12.842799186706543, + "learning_rate": 3.253571702811752e-06, + "logits/chosen": -0.3305876851081848, + "logits/rejected": -0.41443875432014465, + "logps/chosen": -62.554969787597656, + "logps/rejected": -92.6219482421875, + "loss": 0.7163, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.557346820831299, + "rewards/margins": 5.53925085067749, + "rewards/rejected": -2.9819045066833496, + "step": 3679 + }, + { + "epoch": 0.92, + "grad_norm": 5.424794673919678, + "learning_rate": 3.2511172032359185e-06, + "logits/chosen": -0.36264175176620483, + "logits/rejected": -0.48464301228523254, + "logps/chosen": -57.55436706542969, + "logps/rejected": -98.44612884521484, + "loss": 0.7198, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8284687995910645, + "rewards/margins": 5.660130977630615, + "rewards/rejected": -2.831662178039551, + "step": 3680 + }, + { + "epoch": 0.92, + "grad_norm": 5.556285381317139, + "learning_rate": 3.248663183766151e-06, + "logits/chosen": -0.3383157253265381, + "logits/rejected": -0.4027268886566162, + "logps/chosen": -67.64727020263672, + "logps/rejected": -98.02568054199219, + "loss": 0.9243, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.822781562805176, + "rewards/margins": 3.91848087310791, + "rewards/rejected": -1.095699667930603, + "step": 3681 + }, + { + "epoch": 0.92, + "grad_norm": 4.226680755615234, + "learning_rate": 3.246209645076128e-06, + "logits/chosen": -0.3287877142429352, + "logits/rejected": -0.4562340974807739, + "logps/chosen": -67.85411071777344, + "logps/rejected": -76.19950103759766, + "loss": 0.7253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8288748264312744, + "rewards/margins": 5.406898498535156, + "rewards/rejected": -2.578023910522461, + "step": 3682 + }, + { + "epoch": 0.92, + "grad_norm": 8.740547180175781, + "learning_rate": 3.243756587839403e-06, + "logits/chosen": -0.3997584581375122, + "logits/rejected": -0.45235586166381836, + "logps/chosen": -48.03504943847656, + "logps/rejected": -81.03901672363281, + "loss": 0.7456, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.862722873687744, + "rewards/margins": 4.469566345214844, + "rewards/rejected": -1.6068434715270996, + "step": 3683 + }, + { + "epoch": 0.92, + "grad_norm": 5.813603401184082, + "learning_rate": 3.2413040127293906e-06, + "logits/chosen": -0.4045329689979553, + "logits/rejected": -0.5061410069465637, + "logps/chosen": -48.779823303222656, + "logps/rejected": -70.40873718261719, + "loss": 0.8047, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.930926561355591, + "rewards/margins": 4.892034530639648, + "rewards/rejected": -1.9611085653305054, + "step": 3684 + }, + { + "epoch": 0.92, + "grad_norm": 7.858485698699951, + "learning_rate": 3.2388519204193767e-06, + "logits/chosen": -0.3128889799118042, + "logits/rejected": -0.3987874984741211, + "logps/chosen": -58.56022262573242, + "logps/rejected": -87.89340209960938, + "loss": 0.7041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0332717895507812, + "rewards/margins": 4.196479320526123, + "rewards/rejected": -1.1632071733474731, + "step": 3685 + }, + { + "epoch": 0.92, + "grad_norm": 22.900869369506836, + "learning_rate": 3.2364003115825147e-06, + "logits/chosen": -0.4174039363861084, + "logits/rejected": -0.5359629392623901, + "logps/chosen": -56.66719055175781, + "logps/rejected": -75.35987854003906, + "loss": 0.8829, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.782519578933716, + "rewards/margins": 4.934279441833496, + "rewards/rejected": -2.1517601013183594, + "step": 3686 + }, + { + "epoch": 0.92, + "grad_norm": 4.305857181549072, + "learning_rate": 3.2339491868918206e-06, + "logits/chosen": -0.33345308899879456, + "logits/rejected": -0.43997299671173096, + "logps/chosen": -48.669185638427734, + "logps/rejected": -76.99362182617188, + "loss": 0.6275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8500382900238037, + "rewards/margins": 4.712022304534912, + "rewards/rejected": -1.8619844913482666, + "step": 3687 + }, + { + "epoch": 0.92, + "grad_norm": 11.013124465942383, + "learning_rate": 3.231498547020184e-06, + "logits/chosen": -0.2898947596549988, + "logits/rejected": -0.3811650276184082, + "logps/chosen": -48.082679748535156, + "logps/rejected": -85.98645782470703, + "loss": 0.601, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8619446754455566, + "rewards/margins": 4.8981099128723145, + "rewards/rejected": -2.0361649990081787, + "step": 3688 + }, + { + "epoch": 0.92, + "grad_norm": 6.5349555015563965, + "learning_rate": 3.2290483926403558e-06, + "logits/chosen": -0.2625776529312134, + "logits/rejected": -0.3728780746459961, + "logps/chosen": -49.252220153808594, + "logps/rejected": -80.94310760498047, + "loss": 0.7783, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0117697715759277, + "rewards/margins": 4.815027236938477, + "rewards/rejected": -1.8032574653625488, + "step": 3689 + }, + { + "epoch": 0.92, + "grad_norm": 7.613153457641602, + "learning_rate": 3.226598724424961e-06, + "logits/chosen": -0.41256359219551086, + "logits/rejected": -0.5517184734344482, + "logps/chosen": -53.60816192626953, + "logps/rejected": -78.69865417480469, + "loss": 0.7121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904641628265381, + "rewards/margins": 5.473649501800537, + "rewards/rejected": -2.569007635116577, + "step": 3690 + }, + { + "epoch": 0.92, + "grad_norm": 6.7032151222229, + "learning_rate": 3.224149543046482e-06, + "logits/chosen": -0.34469327330589294, + "logits/rejected": -0.45355862379074097, + "logps/chosen": -51.26371765136719, + "logps/rejected": -74.6582260131836, + "loss": 0.7859, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.787281036376953, + "rewards/margins": 4.734930038452148, + "rewards/rejected": -1.9476490020751953, + "step": 3691 + }, + { + "epoch": 0.92, + "grad_norm": 6.913449287414551, + "learning_rate": 3.2217008491772726e-06, + "logits/chosen": -0.34264934062957764, + "logits/rejected": -0.4567033350467682, + "logps/chosen": -61.11491775512695, + "logps/rejected": -71.8813705444336, + "loss": 0.9557, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1424975395202637, + "rewards/margins": 4.3733649253845215, + "rewards/rejected": -1.230867624282837, + "step": 3692 + }, + { + "epoch": 0.92, + "grad_norm": 5.093822479248047, + "learning_rate": 3.2192526434895534e-06, + "logits/chosen": -0.331604540348053, + "logits/rejected": -0.4262164831161499, + "logps/chosen": -55.307655334472656, + "logps/rejected": -76.08731842041016, + "loss": 0.8504, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.680689573287964, + "rewards/margins": 5.091299057006836, + "rewards/rejected": -2.4106101989746094, + "step": 3693 + }, + { + "epoch": 0.92, + "grad_norm": 26.388513565063477, + "learning_rate": 3.216804926655408e-06, + "logits/chosen": -0.29922452569007874, + "logits/rejected": -0.39752864837646484, + "logps/chosen": -60.62729263305664, + "logps/rejected": -83.1027603149414, + "loss": 0.8574, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9851396083831787, + "rewards/margins": 4.938747406005859, + "rewards/rejected": -1.9536075592041016, + "step": 3694 + }, + { + "epoch": 0.92, + "grad_norm": 5.618992328643799, + "learning_rate": 3.21435769934679e-06, + "logits/chosen": -0.28594252467155457, + "logits/rejected": -0.37101152539253235, + "logps/chosen": -61.54051971435547, + "logps/rejected": -75.95892333984375, + "loss": 0.785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.791456937789917, + "rewards/margins": 3.5534369945526123, + "rewards/rejected": -0.7619801759719849, + "step": 3695 + }, + { + "epoch": 0.92, + "grad_norm": 8.588932991027832, + "learning_rate": 3.2119109622355157e-06, + "logits/chosen": -0.37232694029808044, + "logits/rejected": -0.43407467007637024, + "logps/chosen": -42.54117965698242, + "logps/rejected": -66.46067810058594, + "loss": 0.7163, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.830388307571411, + "rewards/margins": 4.340921401977539, + "rewards/rejected": -1.5105328559875488, + "step": 3696 + }, + { + "epoch": 0.92, + "grad_norm": 3.087970495223999, + "learning_rate": 3.2094647159932625e-06, + "logits/chosen": -0.27340367436408997, + "logits/rejected": -0.43074271082878113, + "logps/chosen": -70.31680297851562, + "logps/rejected": -83.81990051269531, + "loss": 0.7312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.966519594192505, + "rewards/margins": 4.5522685050964355, + "rewards/rejected": -1.5857491493225098, + "step": 3697 + }, + { + "epoch": 0.93, + "grad_norm": 7.043747901916504, + "learning_rate": 3.2070189612915848e-06, + "logits/chosen": -0.2989586889743805, + "logits/rejected": -0.39443787932395935, + "logps/chosen": -63.588958740234375, + "logps/rejected": -81.69571685791016, + "loss": 0.899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.004817008972168, + "rewards/margins": 3.7181596755981445, + "rewards/rejected": -0.7133426070213318, + "step": 3698 + }, + { + "epoch": 0.93, + "grad_norm": 12.294357299804688, + "learning_rate": 3.204573698801888e-06, + "logits/chosen": -0.33503589034080505, + "logits/rejected": -0.4048628509044647, + "logps/chosen": -51.097999572753906, + "logps/rejected": -80.28031921386719, + "loss": 0.9414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.549431085586548, + "rewards/margins": 3.7224998474121094, + "rewards/rejected": -1.1730687618255615, + "step": 3699 + }, + { + "epoch": 0.93, + "grad_norm": 3.126194477081299, + "learning_rate": 3.202128929195457e-06, + "logits/chosen": -0.2862987220287323, + "logits/rejected": -0.37923288345336914, + "logps/chosen": -46.66265106201172, + "logps/rejected": -73.45272827148438, + "loss": 0.6383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.936513900756836, + "rewards/margins": 4.662703514099121, + "rewards/rejected": -1.726189374923706, + "step": 3700 + }, + { + "epoch": 0.93, + "grad_norm": 7.616822719573975, + "learning_rate": 3.199684653143429e-06, + "logits/chosen": -0.30618777871131897, + "logits/rejected": -0.44258570671081543, + "logps/chosen": -64.20378112792969, + "logps/rejected": -87.49540710449219, + "loss": 0.7561, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1851022243499756, + "rewards/margins": 4.682547569274902, + "rewards/rejected": -1.497444987297058, + "step": 3701 + }, + { + "epoch": 0.93, + "grad_norm": 12.028350830078125, + "learning_rate": 3.19724087131681e-06, + "logits/chosen": -0.23516173660755157, + "logits/rejected": -0.28032639622688293, + "logps/chosen": -59.997169494628906, + "logps/rejected": -98.5616455078125, + "loss": 0.9256, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4794702529907227, + "rewards/margins": 3.5859057903289795, + "rewards/rejected": -1.1064358949661255, + "step": 3702 + }, + { + "epoch": 0.93, + "grad_norm": 4.183585166931152, + "learning_rate": 3.194797584386474e-06, + "logits/chosen": -0.4145621657371521, + "logits/rejected": -0.514950692653656, + "logps/chosen": -46.99787139892578, + "logps/rejected": -77.64024353027344, + "loss": 0.5865, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.011164903640747, + "rewards/margins": 5.1127119064331055, + "rewards/rejected": -2.1015467643737793, + "step": 3703 + }, + { + "epoch": 0.93, + "grad_norm": 3.3315374851226807, + "learning_rate": 3.1923547930231523e-06, + "logits/chosen": -0.3471750020980835, + "logits/rejected": -0.48396870493888855, + "logps/chosen": -50.05221939086914, + "logps/rejected": -73.7499771118164, + "loss": 0.6752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0097761154174805, + "rewards/margins": 5.25115442276001, + "rewards/rejected": -2.2413785457611084, + "step": 3704 + }, + { + "epoch": 0.93, + "grad_norm": 3.288339376449585, + "learning_rate": 3.1899124978974476e-06, + "logits/chosen": -0.32549217343330383, + "logits/rejected": -0.44950801134109497, + "logps/chosen": -45.27118682861328, + "logps/rejected": -76.14275360107422, + "loss": 0.6698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9894931316375732, + "rewards/margins": 4.770901203155518, + "rewards/rejected": -1.7814079523086548, + "step": 3705 + }, + { + "epoch": 0.93, + "grad_norm": 4.613010406494141, + "learning_rate": 3.187470699679821e-06, + "logits/chosen": -0.3637784719467163, + "logits/rejected": -0.4600706100463867, + "logps/chosen": -59.17266082763672, + "logps/rejected": -85.8351058959961, + "loss": 0.6898, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.888822555541992, + "rewards/margins": 5.327999114990234, + "rewards/rejected": -2.439176559448242, + "step": 3706 + }, + { + "epoch": 0.93, + "grad_norm": 7.485780239105225, + "learning_rate": 3.1850293990405966e-06, + "logits/chosen": -0.3518419861793518, + "logits/rejected": -0.43475255370140076, + "logps/chosen": -61.777679443359375, + "logps/rejected": -79.09329986572266, + "loss": 0.8192, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.653546094894409, + "rewards/margins": 4.225443363189697, + "rewards/rejected": -1.5718977451324463, + "step": 3707 + }, + { + "epoch": 0.93, + "grad_norm": 3.387375593185425, + "learning_rate": 3.1825885966499694e-06, + "logits/chosen": -0.3797149658203125, + "logits/rejected": -0.47200289368629456, + "logps/chosen": -50.858131408691406, + "logps/rejected": -73.61078643798828, + "loss": 0.7069, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1773409843444824, + "rewards/margins": 4.537153244018555, + "rewards/rejected": -1.3598124980926514, + "step": 3708 + }, + { + "epoch": 0.93, + "grad_norm": 9.586811065673828, + "learning_rate": 3.180148293177985e-06, + "logits/chosen": -0.33186954259872437, + "logits/rejected": -0.39973846077919006, + "logps/chosen": -49.04104232788086, + "logps/rejected": -73.1383056640625, + "loss": 0.9742, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0242323875427246, + "rewards/margins": 3.7524733543395996, + "rewards/rejected": -0.7282412052154541, + "step": 3709 + }, + { + "epoch": 0.93, + "grad_norm": 3.9138669967651367, + "learning_rate": 3.177708489294567e-06, + "logits/chosen": -0.3172772526741028, + "logits/rejected": -0.3690429925918579, + "logps/chosen": -48.68751907348633, + "logps/rejected": -98.33535766601562, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0969483852386475, + "rewards/margins": 5.087625026702881, + "rewards/rejected": -1.9906764030456543, + "step": 3710 + }, + { + "epoch": 0.93, + "grad_norm": 5.204544544219971, + "learning_rate": 3.1752691856694896e-06, + "logits/chosen": -0.35709500312805176, + "logits/rejected": -0.44950243830680847, + "logps/chosen": -62.97910690307617, + "logps/rejected": -86.00321197509766, + "loss": 0.7844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1569299697875977, + "rewards/margins": 5.59016227722168, + "rewards/rejected": -2.4332330226898193, + "step": 3711 + }, + { + "epoch": 0.93, + "grad_norm": 14.675281524658203, + "learning_rate": 3.172830382972394e-06, + "logits/chosen": -0.35729286074638367, + "logits/rejected": -0.45864903926849365, + "logps/chosen": -56.42734909057617, + "logps/rejected": -81.19872283935547, + "loss": 1.0759, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.4642109870910645, + "rewards/margins": 3.542841911315918, + "rewards/rejected": -1.0786309242248535, + "step": 3712 + }, + { + "epoch": 0.93, + "grad_norm": 5.42630672454834, + "learning_rate": 3.1703920818727863e-06, + "logits/chosen": -0.3891884982585907, + "logits/rejected": -0.5229112505912781, + "logps/chosen": -64.15553283691406, + "logps/rejected": -70.02862548828125, + "loss": 0.8036, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9087398052215576, + "rewards/margins": 4.926510810852051, + "rewards/rejected": -2.0177714824676514, + "step": 3713 + }, + { + "epoch": 0.93, + "grad_norm": 4.224147319793701, + "learning_rate": 3.167954283040031e-06, + "logits/chosen": -0.3356105089187622, + "logits/rejected": -0.46708235144615173, + "logps/chosen": -62.465576171875, + "logps/rejected": -81.18949127197266, + "loss": 0.7244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1012086868286133, + "rewards/margins": 4.885336399078369, + "rewards/rejected": -1.7841277122497559, + "step": 3714 + }, + { + "epoch": 0.93, + "grad_norm": 9.215744018554688, + "learning_rate": 3.1655169871433594e-06, + "logits/chosen": -0.3256918787956238, + "logits/rejected": -0.3389310836791992, + "logps/chosen": -60.63311767578125, + "logps/rejected": -100.52203369140625, + "loss": 0.854, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1112329959869385, + "rewards/margins": 4.949779510498047, + "rewards/rejected": -1.8385465145111084, + "step": 3715 + }, + { + "epoch": 0.93, + "grad_norm": 6.069101333618164, + "learning_rate": 3.163080194851859e-06, + "logits/chosen": -0.35447022318840027, + "logits/rejected": -0.43602073192596436, + "logps/chosen": -52.30873107910156, + "logps/rejected": -76.47977447509766, + "loss": 0.7397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7469418048858643, + "rewards/margins": 4.57579231262207, + "rewards/rejected": -1.8288501501083374, + "step": 3716 + }, + { + "epoch": 0.93, + "grad_norm": 6.950138092041016, + "learning_rate": 3.1606439068344817e-06, + "logits/chosen": -0.346930593252182, + "logits/rejected": -0.4904547333717346, + "logps/chosen": -51.173240661621094, + "logps/rejected": -68.8526840209961, + "loss": 0.7021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7433533668518066, + "rewards/margins": 4.373810291290283, + "rewards/rejected": -1.6304571628570557, + "step": 3717 + }, + { + "epoch": 0.93, + "grad_norm": 2.8600645065307617, + "learning_rate": 3.1582081237600435e-06, + "logits/chosen": -0.284074991941452, + "logits/rejected": -0.39661264419555664, + "logps/chosen": -51.6988410949707, + "logps/rejected": -79.42816925048828, + "loss": 0.6157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.957003355026245, + "rewards/margins": 5.0045671463012695, + "rewards/rejected": -2.0475635528564453, + "step": 3718 + }, + { + "epoch": 0.93, + "grad_norm": 7.342613697052002, + "learning_rate": 3.155772846297217e-06, + "logits/chosen": -0.3348504602909088, + "logits/rejected": -0.42729640007019043, + "logps/chosen": -59.01234436035156, + "logps/rejected": -86.43325805664062, + "loss": 0.7352, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.970445156097412, + "rewards/margins": 5.1923828125, + "rewards/rejected": -2.221937656402588, + "step": 3719 + }, + { + "epoch": 0.93, + "grad_norm": 4.163210868835449, + "learning_rate": 3.1533380751145414e-06, + "logits/chosen": -0.3262189030647278, + "logits/rejected": -0.47008660435676575, + "logps/chosen": -62.77238082885742, + "logps/rejected": -90.44290161132812, + "loss": 0.745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.727600574493408, + "rewards/margins": 4.964616298675537, + "rewards/rejected": -2.237015962600708, + "step": 3720 + }, + { + "epoch": 0.93, + "grad_norm": 3.1235439777374268, + "learning_rate": 3.1509038108804114e-06, + "logits/chosen": -0.30886468291282654, + "logits/rejected": -0.4036693572998047, + "logps/chosen": -52.088443756103516, + "logps/rejected": -81.91632843017578, + "loss": 0.6556, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.108391523361206, + "rewards/margins": 4.939290523529053, + "rewards/rejected": -1.8308991193771362, + "step": 3721 + }, + { + "epoch": 0.93, + "grad_norm": 5.210404396057129, + "learning_rate": 3.148470054263084e-06, + "logits/chosen": -0.35125547647476196, + "logits/rejected": -0.434622198343277, + "logps/chosen": -58.97780990600586, + "logps/rejected": -86.67498779296875, + "loss": 0.7605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.231370210647583, + "rewards/margins": 4.556047439575195, + "rewards/rejected": -1.3246773481369019, + "step": 3722 + }, + { + "epoch": 0.93, + "grad_norm": 10.99396800994873, + "learning_rate": 3.1460368059306804e-06, + "logits/chosen": -0.2722841203212738, + "logits/rejected": -0.369503915309906, + "logps/chosen": -52.57809066772461, + "logps/rejected": -79.7255859375, + "loss": 0.832, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8823513984680176, + "rewards/margins": 4.943793773651123, + "rewards/rejected": -2.0614423751831055, + "step": 3723 + }, + { + "epoch": 0.93, + "grad_norm": 10.75314712524414, + "learning_rate": 3.1436040665511787e-06, + "logits/chosen": -0.31440818309783936, + "logits/rejected": -0.40856266021728516, + "logps/chosen": -58.9405632019043, + "logps/rejected": -73.69476318359375, + "loss": 0.996, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5084211826324463, + "rewards/margins": 3.7703804969787598, + "rewards/rejected": -1.2619590759277344, + "step": 3724 + }, + { + "epoch": 0.93, + "grad_norm": 4.5283708572387695, + "learning_rate": 3.1411718367924192e-06, + "logits/chosen": -0.3309307396411896, + "logits/rejected": -0.40058377385139465, + "logps/chosen": -63.844947814941406, + "logps/rejected": -101.74211883544922, + "loss": 0.6794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0933189392089844, + "rewards/margins": 5.288733005523682, + "rewards/rejected": -2.19541335105896, + "step": 3725 + }, + { + "epoch": 0.93, + "grad_norm": 9.866449356079102, + "learning_rate": 3.138740117322101e-06, + "logits/chosen": -0.24868375062942505, + "logits/rejected": -0.3128734827041626, + "logps/chosen": -66.10248565673828, + "logps/rejected": -107.10690307617188, + "loss": 0.851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9706554412841797, + "rewards/margins": 4.2471232414245605, + "rewards/rejected": -1.2764676809310913, + "step": 3726 + }, + { + "epoch": 0.93, + "grad_norm": 3.669680118560791, + "learning_rate": 3.1363089088077824e-06, + "logits/chosen": -0.3342035412788391, + "logits/rejected": -0.4514535069465637, + "logps/chosen": -46.40262985229492, + "logps/rejected": -73.41617584228516, + "loss": 0.5531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.802924156188965, + "rewards/margins": 5.0701680183410645, + "rewards/rejected": -2.2672438621520996, + "step": 3727 + }, + { + "epoch": 0.93, + "grad_norm": 9.815400123596191, + "learning_rate": 3.1338782119168844e-06, + "logits/chosen": -0.32222506403923035, + "logits/rejected": -0.4123540222644806, + "logps/chosen": -70.46773529052734, + "logps/rejected": -81.3639144897461, + "loss": 1.0257, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5437002182006836, + "rewards/margins": 4.216268539428711, + "rewards/rejected": -1.6725683212280273, + "step": 3728 + }, + { + "epoch": 0.93, + "grad_norm": 8.31549072265625, + "learning_rate": 3.131448027316684e-06, + "logits/chosen": -0.3606296479701996, + "logits/rejected": -0.3897000849246979, + "logps/chosen": -50.62186050415039, + "logps/rejected": -77.07498931884766, + "loss": 0.8665, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7032246589660645, + "rewards/margins": 2.809701681137085, + "rewards/rejected": -0.10647709667682648, + "step": 3729 + }, + { + "epoch": 0.93, + "grad_norm": 9.426265716552734, + "learning_rate": 3.1290183556743215e-06, + "logits/chosen": -0.30447766184806824, + "logits/rejected": -0.3774580955505371, + "logps/chosen": -60.121726989746094, + "logps/rejected": -76.99055480957031, + "loss": 0.7863, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7596895694732666, + "rewards/margins": 5.264236927032471, + "rewards/rejected": -2.504547357559204, + "step": 3730 + }, + { + "epoch": 0.93, + "grad_norm": 5.208091735839844, + "learning_rate": 3.126589197656794e-06, + "logits/chosen": -0.2988613247871399, + "logits/rejected": -0.3703181743621826, + "logps/chosen": -64.69945526123047, + "logps/rejected": -88.98921203613281, + "loss": 0.8863, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.93074369430542, + "rewards/margins": 5.148190498352051, + "rewards/rejected": -2.217446804046631, + "step": 3731 + }, + { + "epoch": 0.93, + "grad_norm": 4.654784202575684, + "learning_rate": 3.124160553930953e-06, + "logits/chosen": -0.28915950655937195, + "logits/rejected": -0.4276937246322632, + "logps/chosen": -54.195556640625, + "logps/rejected": -69.14871978759766, + "loss": 0.7111, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.960066318511963, + "rewards/margins": 5.0280561447143555, + "rewards/rejected": -2.0679893493652344, + "step": 3732 + }, + { + "epoch": 0.93, + "grad_norm": 13.75394058227539, + "learning_rate": 3.1217324251635217e-06, + "logits/chosen": -0.37953412532806396, + "logits/rejected": -0.5015056729316711, + "logps/chosen": -55.75530242919922, + "logps/rejected": -83.09414672851562, + "loss": 0.7386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0593745708465576, + "rewards/margins": 5.479578971862793, + "rewards/rejected": -2.4202044010162354, + "step": 3733 + }, + { + "epoch": 0.93, + "grad_norm": 4.917469024658203, + "learning_rate": 3.1193048120210666e-06, + "logits/chosen": -0.43380287289619446, + "logits/rejected": -0.46583443880081177, + "logps/chosen": -54.25755310058594, + "logps/rejected": -95.1795883178711, + "loss": 0.7366, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0012433528900146, + "rewards/margins": 4.577376365661621, + "rewards/rejected": -1.576133370399475, + "step": 3734 + }, + { + "epoch": 0.93, + "grad_norm": 6.182195663452148, + "learning_rate": 3.1168777151700236e-06, + "logits/chosen": -0.4256289303302765, + "logits/rejected": -0.4726638197898865, + "logps/chosen": -41.73752975463867, + "logps/rejected": -80.92476654052734, + "loss": 0.7254, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8509206771850586, + "rewards/margins": 4.587123394012451, + "rewards/rejected": -1.7362024784088135, + "step": 3735 + }, + { + "epoch": 0.93, + "grad_norm": 3.791977643966675, + "learning_rate": 3.1144511352766828e-06, + "logits/chosen": -0.2910081744194031, + "logits/rejected": -0.35692521929740906, + "logps/chosen": -58.65694046020508, + "logps/rejected": -82.3568115234375, + "loss": 0.7184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0216424465179443, + "rewards/margins": 5.14137077331543, + "rewards/rejected": -2.1197283267974854, + "step": 3736 + }, + { + "epoch": 0.93, + "grad_norm": 17.70396614074707, + "learning_rate": 3.112025073007191e-06, + "logits/chosen": -0.37512511014938354, + "logits/rejected": -0.4681274890899658, + "logps/chosen": -57.00530242919922, + "logps/rejected": -80.74317169189453, + "loss": 0.7509, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8132004737854004, + "rewards/margins": 4.942811489105225, + "rewards/rejected": -2.129610538482666, + "step": 3737 + }, + { + "epoch": 0.94, + "grad_norm": 4.5132155418396, + "learning_rate": 3.109599529027557e-06, + "logits/chosen": -0.325765997171402, + "logits/rejected": -0.4189207851886749, + "logps/chosen": -63.64208984375, + "logps/rejected": -85.92620086669922, + "loss": 0.7009, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5025696754455566, + "rewards/margins": 4.551461219787598, + "rewards/rejected": -2.048892021179199, + "step": 3738 + }, + { + "epoch": 0.94, + "grad_norm": 10.603256225585938, + "learning_rate": 3.107174504003644e-06, + "logits/chosen": -0.3589702844619751, + "logits/rejected": -0.44295549392700195, + "logps/chosen": -61.979766845703125, + "logps/rejected": -94.43974304199219, + "loss": 0.7954, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.000643014907837, + "rewards/margins": 4.563290596008301, + "rewards/rejected": -1.5626472234725952, + "step": 3739 + }, + { + "epoch": 0.94, + "grad_norm": 8.180760383605957, + "learning_rate": 3.104749998601173e-06, + "logits/chosen": -0.3750963807106018, + "logits/rejected": -0.4803502559661865, + "logps/chosen": -63.29070281982422, + "logps/rejected": -82.19719696044922, + "loss": 1.0028, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8999459743499756, + "rewards/margins": 4.907851219177246, + "rewards/rejected": -2.0079057216644287, + "step": 3740 + }, + { + "epoch": 0.94, + "grad_norm": 4.344812870025635, + "learning_rate": 3.1023260134857257e-06, + "logits/chosen": -0.31948673725128174, + "logits/rejected": -0.4095748960971832, + "logps/chosen": -58.04267501831055, + "logps/rejected": -84.36717224121094, + "loss": 0.7969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1010355949401855, + "rewards/margins": 5.03801155090332, + "rewards/rejected": -1.9369754791259766, + "step": 3741 + }, + { + "epoch": 0.94, + "grad_norm": 4.0382466316223145, + "learning_rate": 3.099902549322733e-06, + "logits/chosen": -0.24543455243110657, + "logits/rejected": -0.3377106785774231, + "logps/chosen": -61.80350112915039, + "logps/rejected": -112.619873046875, + "loss": 0.6543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.893714427947998, + "rewards/margins": 5.3808393478393555, + "rewards/rejected": -2.4871249198913574, + "step": 3742 + }, + { + "epoch": 0.94, + "grad_norm": 7.289346694946289, + "learning_rate": 3.0974796067774936e-06, + "logits/chosen": -0.32895615696907043, + "logits/rejected": -0.41108208894729614, + "logps/chosen": -63.538902282714844, + "logps/rejected": -95.2685546875, + "loss": 1.0091, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6641387939453125, + "rewards/margins": 3.829501152038574, + "rewards/rejected": -1.165361762046814, + "step": 3743 + }, + { + "epoch": 0.94, + "grad_norm": 6.247248649597168, + "learning_rate": 3.0950571865151537e-06, + "logits/chosen": -0.3268333673477173, + "logits/rejected": -0.41192206740379333, + "logps/chosen": -59.876197814941406, + "logps/rejected": -88.3818588256836, + "loss": 0.7554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.821610689163208, + "rewards/margins": 4.945894241333008, + "rewards/rejected": -2.124283790588379, + "step": 3744 + }, + { + "epoch": 0.94, + "grad_norm": 13.077284812927246, + "learning_rate": 3.0926352892007238e-06, + "logits/chosen": -0.32648682594299316, + "logits/rejected": -0.41358426213264465, + "logps/chosen": -61.47896194458008, + "logps/rejected": -66.84089660644531, + "loss": 0.7602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9422712326049805, + "rewards/margins": 3.251800060272217, + "rewards/rejected": -0.309528648853302, + "step": 3745 + }, + { + "epoch": 0.94, + "grad_norm": 5.643627643585205, + "learning_rate": 3.0902139154990636e-06, + "logits/chosen": -0.3751232624053955, + "logits/rejected": -0.48367393016815186, + "logps/chosen": -83.26471710205078, + "logps/rejected": -111.18573760986328, + "loss": 0.8112, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.83186674118042, + "rewards/margins": 4.8619608879089355, + "rewards/rejected": -2.0300941467285156, + "step": 3746 + }, + { + "epoch": 0.94, + "grad_norm": 4.537824630737305, + "learning_rate": 3.087793066074892e-06, + "logits/chosen": -0.3111046850681305, + "logits/rejected": -0.40332210063934326, + "logps/chosen": -56.08230209350586, + "logps/rejected": -84.73260498046875, + "loss": 0.7631, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6432371139526367, + "rewards/margins": 5.165122985839844, + "rewards/rejected": -2.521885871887207, + "step": 3747 + }, + { + "epoch": 0.94, + "grad_norm": 7.677070140838623, + "learning_rate": 3.0853727415927882e-06, + "logits/chosen": -0.3866282105445862, + "logits/rejected": -0.4763518273830414, + "logps/chosen": -51.22810745239258, + "logps/rejected": -91.90704345703125, + "loss": 0.7415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.884871482849121, + "rewards/margins": 5.609399318695068, + "rewards/rejected": -2.72452712059021, + "step": 3748 + }, + { + "epoch": 0.94, + "grad_norm": 8.066813468933105, + "learning_rate": 3.0829529427171796e-06, + "logits/chosen": -0.28378885984420776, + "logits/rejected": -0.4709036350250244, + "logps/chosen": -67.89891815185547, + "logps/rejected": -69.64765167236328, + "loss": 0.7415, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6686995029449463, + "rewards/margins": 4.835658073425293, + "rewards/rejected": -2.166958808898926, + "step": 3749 + }, + { + "epoch": 0.94, + "grad_norm": 8.042781829833984, + "learning_rate": 3.0805336701123555e-06, + "logits/chosen": -0.3494041860103607, + "logits/rejected": -0.4277160167694092, + "logps/chosen": -56.60304260253906, + "logps/rejected": -91.81183624267578, + "loss": 0.8091, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.547699451446533, + "rewards/margins": 4.199337482452393, + "rewards/rejected": -1.6516379117965698, + "step": 3750 + }, + { + "epoch": 0.94, + "grad_norm": 6.1742072105407715, + "learning_rate": 3.0781149244424592e-06, + "logits/chosen": -0.3823961913585663, + "logits/rejected": -0.3955131769180298, + "logps/chosen": -52.56368637084961, + "logps/rejected": -83.57044219970703, + "loss": 0.7918, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.938697338104248, + "rewards/margins": 3.766983985900879, + "rewards/rejected": -0.8282864689826965, + "step": 3751 + }, + { + "epoch": 0.94, + "grad_norm": 8.330467224121094, + "learning_rate": 3.075696706371484e-06, + "logits/chosen": -0.38966119289398193, + "logits/rejected": -0.4380924701690674, + "logps/chosen": -51.23884582519531, + "logps/rejected": -81.21436309814453, + "loss": 0.8026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.688319206237793, + "rewards/margins": 4.179562091827393, + "rewards/rejected": -1.4912422895431519, + "step": 3752 + }, + { + "epoch": 0.94, + "grad_norm": 6.728740215301514, + "learning_rate": 3.073279016563289e-06, + "logits/chosen": -0.29417330026626587, + "logits/rejected": -0.4267468750476837, + "logps/chosen": -63.22807312011719, + "logps/rejected": -81.513916015625, + "loss": 0.7505, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.893463134765625, + "rewards/margins": 5.399681091308594, + "rewards/rejected": -2.5062179565429688, + "step": 3753 + }, + { + "epoch": 0.94, + "grad_norm": 6.0843119621276855, + "learning_rate": 3.070861855681577e-06, + "logits/chosen": -0.3346590995788574, + "logits/rejected": -0.45294803380966187, + "logps/chosen": -58.91783905029297, + "logps/rejected": -75.30347442626953, + "loss": 0.7755, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.835575819015503, + "rewards/margins": 4.781477451324463, + "rewards/rejected": -1.9459013938903809, + "step": 3754 + }, + { + "epoch": 0.94, + "grad_norm": 14.432183265686035, + "learning_rate": 3.0684452243899154e-06, + "logits/chosen": -0.3270636796951294, + "logits/rejected": -0.3948861360549927, + "logps/chosen": -61.83855438232422, + "logps/rejected": -84.05561828613281, + "loss": 1.0278, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.564491033554077, + "rewards/margins": 3.783263683319092, + "rewards/rejected": -1.218772530555725, + "step": 3755 + }, + { + "epoch": 0.94, + "grad_norm": 4.943869590759277, + "learning_rate": 3.066029123351718e-06, + "logits/chosen": -0.4375532865524292, + "logits/rejected": -0.4440235495567322, + "logps/chosen": -46.93022537231445, + "logps/rejected": -84.75942993164062, + "loss": 0.7721, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0222702026367188, + "rewards/margins": 4.66115140914917, + "rewards/rejected": -1.6388816833496094, + "step": 3756 + }, + { + "epoch": 0.94, + "grad_norm": 11.309433937072754, + "learning_rate": 3.063613553230258e-06, + "logits/chosen": -0.31639984250068665, + "logits/rejected": -0.388910710811615, + "logps/chosen": -63.280303955078125, + "logps/rejected": -76.46749114990234, + "loss": 0.9105, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.874011993408203, + "rewards/margins": 3.5180139541625977, + "rewards/rejected": -0.6440016627311707, + "step": 3757 + }, + { + "epoch": 0.94, + "grad_norm": 7.607973098754883, + "learning_rate": 3.0611985146886623e-06, + "logits/chosen": -0.30598151683807373, + "logits/rejected": -0.31199511885643005, + "logps/chosen": -48.071563720703125, + "logps/rejected": -101.043212890625, + "loss": 0.6681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7512917518615723, + "rewards/margins": 5.190202236175537, + "rewards/rejected": -2.438910722732544, + "step": 3758 + }, + { + "epoch": 0.94, + "grad_norm": 7.469743251800537, + "learning_rate": 3.058784008389909e-06, + "logits/chosen": -0.39135971665382385, + "logits/rejected": -0.41213759779930115, + "logps/chosen": -60.214561462402344, + "logps/rejected": -102.34591674804688, + "loss": 0.8612, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7926621437072754, + "rewards/margins": 5.32163143157959, + "rewards/rejected": -2.5289692878723145, + "step": 3759 + }, + { + "epoch": 0.94, + "grad_norm": 5.394297122955322, + "learning_rate": 3.056370034996835e-06, + "logits/chosen": -0.29907727241516113, + "logits/rejected": -0.39619237184524536, + "logps/chosen": -70.69602966308594, + "logps/rejected": -83.75244140625, + "loss": 0.8686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7060089111328125, + "rewards/margins": 4.212270736694336, + "rewards/rejected": -1.5062620639801025, + "step": 3760 + }, + { + "epoch": 0.94, + "grad_norm": 3.826740026473999, + "learning_rate": 3.053956595172126e-06, + "logits/chosen": -0.3466664254665375, + "logits/rejected": -0.42054107785224915, + "logps/chosen": -49.69358444213867, + "logps/rejected": -90.39341735839844, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1065003871917725, + "rewards/margins": 4.39179801940918, + "rewards/rejected": -1.2852981090545654, + "step": 3761 + }, + { + "epoch": 0.94, + "grad_norm": 5.660646438598633, + "learning_rate": 3.0515436895783235e-06, + "logits/chosen": -0.347139447927475, + "logits/rejected": -0.3997698724269867, + "logps/chosen": -55.92286682128906, + "logps/rejected": -95.53433990478516, + "loss": 0.643, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.142970085144043, + "rewards/margins": 5.431352138519287, + "rewards/rejected": -2.288381814956665, + "step": 3762 + }, + { + "epoch": 0.94, + "grad_norm": 4.7994232177734375, + "learning_rate": 3.0491313188778243e-06, + "logits/chosen": -0.2908630669116974, + "logits/rejected": -0.3498433828353882, + "logps/chosen": -49.51701736450195, + "logps/rejected": -105.66094207763672, + "loss": 0.6521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.777940034866333, + "rewards/margins": 5.554072380065918, + "rewards/rejected": -2.7761318683624268, + "step": 3763 + }, + { + "epoch": 0.94, + "grad_norm": 26.564247131347656, + "learning_rate": 3.0467194837328716e-06, + "logits/chosen": -0.3631289303302765, + "logits/rejected": -0.42214471101760864, + "logps/chosen": -46.714073181152344, + "logps/rejected": -88.54182434082031, + "loss": 0.7894, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.914374828338623, + "rewards/margins": 3.923696756362915, + "rewards/rejected": -1.009321928024292, + "step": 3764 + }, + { + "epoch": 0.94, + "grad_norm": 10.280341148376465, + "learning_rate": 3.0443081848055722e-06, + "logits/chosen": -0.3399718999862671, + "logits/rejected": -0.3847803771495819, + "logps/chosen": -54.92490768432617, + "logps/rejected": -91.36863708496094, + "loss": 0.7742, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9035556316375732, + "rewards/margins": 4.353499889373779, + "rewards/rejected": -1.4499439001083374, + "step": 3765 + }, + { + "epoch": 0.94, + "grad_norm": 6.182958126068115, + "learning_rate": 3.0418974227578758e-06, + "logits/chosen": -0.361807644367218, + "logits/rejected": -0.46403568983078003, + "logps/chosen": -55.33759689331055, + "logps/rejected": -74.64734649658203, + "loss": 0.6686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0230326652526855, + "rewards/margins": 5.191249847412109, + "rewards/rejected": -2.1682169437408447, + "step": 3766 + }, + { + "epoch": 0.94, + "grad_norm": 3.058469295501709, + "learning_rate": 3.0394871982515872e-06, + "logits/chosen": -0.196886345744133, + "logits/rejected": -0.31802985072135925, + "logps/chosen": -57.021549224853516, + "logps/rejected": -91.6423110961914, + "loss": 0.636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.875635862350464, + "rewards/margins": 5.2219061851501465, + "rewards/rejected": -2.3462703227996826, + "step": 3767 + }, + { + "epoch": 0.94, + "grad_norm": 6.310364723205566, + "learning_rate": 3.0370775119483694e-06, + "logits/chosen": -0.2887657880783081, + "logits/rejected": -0.31803563237190247, + "logps/chosen": -48.6427001953125, + "logps/rejected": -85.95732879638672, + "loss": 0.7962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9156103134155273, + "rewards/margins": 3.640659809112549, + "rewards/rejected": -0.7250493168830872, + "step": 3768 + }, + { + "epoch": 0.94, + "grad_norm": 5.438907623291016, + "learning_rate": 3.0346683645097295e-06, + "logits/chosen": -0.28143492341041565, + "logits/rejected": -0.38961800932884216, + "logps/chosen": -57.78651428222656, + "logps/rejected": -80.18772888183594, + "loss": 0.7772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.005653142929077, + "rewards/margins": 3.985320568084717, + "rewards/rejected": -0.9796675443649292, + "step": 3769 + }, + { + "epoch": 0.94, + "grad_norm": 5.650923728942871, + "learning_rate": 3.0322597565970336e-06, + "logits/chosen": -0.3060489296913147, + "logits/rejected": -0.39655759930610657, + "logps/chosen": -52.55238723754883, + "logps/rejected": -76.09634399414062, + "loss": 0.7579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1874845027923584, + "rewards/margins": 4.549149990081787, + "rewards/rejected": -1.3616652488708496, + "step": 3770 + }, + { + "epoch": 0.94, + "grad_norm": 7.2006378173828125, + "learning_rate": 3.0298516888714945e-06, + "logits/chosen": -0.3086432218551636, + "logits/rejected": -0.4208548665046692, + "logps/chosen": -61.58253860473633, + "logps/rejected": -78.15507507324219, + "loss": 0.76, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.067248821258545, + "rewards/margins": 4.32861328125, + "rewards/rejected": -1.261364459991455, + "step": 3771 + }, + { + "epoch": 0.94, + "grad_norm": 4.38254976272583, + "learning_rate": 3.0274441619941787e-06, + "logits/chosen": -0.32473331689834595, + "logits/rejected": -0.41645628213882446, + "logps/chosen": -62.68513488769531, + "logps/rejected": -105.0147705078125, + "loss": 0.6956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7689473628997803, + "rewards/margins": 5.454349517822266, + "rewards/rejected": -2.6854023933410645, + "step": 3772 + }, + { + "epoch": 0.94, + "grad_norm": 7.494043827056885, + "learning_rate": 3.0250371766260052e-06, + "logits/chosen": -0.39959028363227844, + "logits/rejected": -0.5290098190307617, + "logps/chosen": -53.12472152709961, + "logps/rejected": -62.53815460205078, + "loss": 0.7946, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.66570782661438, + "rewards/margins": 4.767916679382324, + "rewards/rejected": -2.1022090911865234, + "step": 3773 + }, + { + "epoch": 0.94, + "grad_norm": 3.9154889583587646, + "learning_rate": 3.022630733427742e-06, + "logits/chosen": -0.3552631139755249, + "logits/rejected": -0.4583752453327179, + "logps/chosen": -54.73106002807617, + "logps/rejected": -84.20622253417969, + "loss": 0.7301, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.940587043762207, + "rewards/margins": 5.037073135375977, + "rewards/rejected": -2.0964865684509277, + "step": 3774 + }, + { + "epoch": 0.94, + "grad_norm": 7.8164963722229, + "learning_rate": 3.0202248330600124e-06, + "logits/chosen": -0.38458895683288574, + "logits/rejected": -0.40703389048576355, + "logps/chosen": -74.92439270019531, + "logps/rejected": -90.14974975585938, + "loss": 0.8724, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8640055656433105, + "rewards/margins": 4.5398454666137695, + "rewards/rejected": -1.6758397817611694, + "step": 3775 + }, + { + "epoch": 0.94, + "grad_norm": 5.787387371063232, + "learning_rate": 3.017819476183287e-06, + "logits/chosen": -0.3148944675922394, + "logits/rejected": -0.4137865900993347, + "logps/chosen": -59.79444122314453, + "logps/rejected": -79.97228240966797, + "loss": 0.8332, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.975191116333008, + "rewards/margins": 3.877220392227173, + "rewards/rejected": -0.9020292162895203, + "step": 3776 + }, + { + "epoch": 0.94, + "grad_norm": 5.605940341949463, + "learning_rate": 3.015414663457885e-06, + "logits/chosen": -0.3477088212966919, + "logits/rejected": -0.4270760118961334, + "logps/chosen": -54.67593765258789, + "logps/rejected": -87.09284973144531, + "loss": 0.7783, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.098745107650757, + "rewards/margins": 4.758076190948486, + "rewards/rejected": -1.65933096408844, + "step": 3777 + }, + { + "epoch": 0.95, + "grad_norm": 12.489049911499023, + "learning_rate": 3.013010395543984e-06, + "logits/chosen": -0.27221396565437317, + "logits/rejected": -0.41256365180015564, + "logps/chosen": -69.59554290771484, + "logps/rejected": -79.2733383178711, + "loss": 0.8992, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.753225088119507, + "rewards/margins": 3.5559935569763184, + "rewards/rejected": -0.8027685284614563, + "step": 3778 + }, + { + "epoch": 0.95, + "grad_norm": 5.427285671234131, + "learning_rate": 3.0106066731016037e-06, + "logits/chosen": -0.3701324164867401, + "logits/rejected": -0.4632960259914398, + "logps/chosen": -51.76519775390625, + "logps/rejected": -75.91496276855469, + "loss": 0.7375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1565592288970947, + "rewards/margins": 5.175139427185059, + "rewards/rejected": -2.018580913543701, + "step": 3779 + }, + { + "epoch": 0.95, + "grad_norm": 6.136619567871094, + "learning_rate": 3.0082034967906204e-06, + "logits/chosen": -0.33767783641815186, + "logits/rejected": -0.4312670826911926, + "logps/chosen": -53.84700393676758, + "logps/rejected": -79.27165222167969, + "loss": 0.7287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.019648790359497, + "rewards/margins": 4.791197299957275, + "rewards/rejected": -1.7715487480163574, + "step": 3780 + }, + { + "epoch": 0.95, + "grad_norm": 9.779881477355957, + "learning_rate": 3.005800867270756e-06, + "logits/chosen": -0.3506811261177063, + "logits/rejected": -0.46885332465171814, + "logps/chosen": -49.2796745300293, + "logps/rejected": -65.50542449951172, + "loss": 0.9038, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9637715816497803, + "rewards/margins": 4.142272472381592, + "rewards/rejected": -1.178501009941101, + "step": 3781 + }, + { + "epoch": 0.95, + "grad_norm": 4.893030643463135, + "learning_rate": 3.003398785201585e-06, + "logits/chosen": -0.3336635231971741, + "logits/rejected": -0.44449901580810547, + "logps/chosen": -58.06059265136719, + "logps/rejected": -76.12554931640625, + "loss": 0.7593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.787518262863159, + "rewards/margins": 4.150815486907959, + "rewards/rejected": -1.3632973432540894, + "step": 3782 + }, + { + "epoch": 0.95, + "grad_norm": 4.780821323394775, + "learning_rate": 3.000997251242531e-06, + "logits/chosen": -0.3046867251396179, + "logits/rejected": -0.4300934672355652, + "logps/chosen": -77.04824829101562, + "logps/rejected": -82.48646545410156, + "loss": 0.786, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9832849502563477, + "rewards/margins": 4.9023847579956055, + "rewards/rejected": -1.9190996885299683, + "step": 3783 + }, + { + "epoch": 0.95, + "grad_norm": 5.651740074157715, + "learning_rate": 2.998596266052866e-06, + "logits/chosen": -0.25656771659851074, + "logits/rejected": -0.419461727142334, + "logps/chosen": -56.49625778198242, + "logps/rejected": -76.16364288330078, + "loss": 0.7686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.881399154663086, + "rewards/margins": 4.411309719085693, + "rewards/rejected": -1.5299108028411865, + "step": 3784 + }, + { + "epoch": 0.95, + "grad_norm": 6.747636795043945, + "learning_rate": 2.9961958302917136e-06, + "logits/chosen": -0.29949235916137695, + "logits/rejected": -0.39175945520401, + "logps/chosen": -52.126861572265625, + "logps/rejected": -76.59791564941406, + "loss": 0.8806, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.809081554412842, + "rewards/margins": 4.861101150512695, + "rewards/rejected": -2.0520198345184326, + "step": 3785 + }, + { + "epoch": 0.95, + "grad_norm": 6.19291877746582, + "learning_rate": 2.9937959446180466e-06, + "logits/chosen": -0.3130291700363159, + "logits/rejected": -0.46023818850517273, + "logps/chosen": -70.6555404663086, + "logps/rejected": -90.315673828125, + "loss": 0.7381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0743825435638428, + "rewards/margins": 5.626067161560059, + "rewards/rejected": -2.551684856414795, + "step": 3786 + }, + { + "epoch": 0.95, + "grad_norm": 5.5503692626953125, + "learning_rate": 2.991396609690679e-06, + "logits/chosen": -0.3556341826915741, + "logits/rejected": -0.41277211904525757, + "logps/chosen": -52.88324737548828, + "logps/rejected": -86.3954849243164, + "loss": 0.8159, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.614295244216919, + "rewards/margins": 3.682999610900879, + "rewards/rejected": -1.0687042474746704, + "step": 3787 + }, + { + "epoch": 0.95, + "grad_norm": 4.906284809112549, + "learning_rate": 2.9889978261682873e-06, + "logits/chosen": -0.30096179246902466, + "logits/rejected": -0.38354572653770447, + "logps/chosen": -55.732845306396484, + "logps/rejected": -81.21546173095703, + "loss": 0.7908, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.848065137863159, + "rewards/margins": 3.6915345191955566, + "rewards/rejected": -0.8434693813323975, + "step": 3788 + }, + { + "epoch": 0.95, + "grad_norm": 4.654969692230225, + "learning_rate": 2.986599594709385e-06, + "logits/chosen": -0.3308258056640625, + "logits/rejected": -0.4089624285697937, + "logps/chosen": -59.07405090332031, + "logps/rejected": -79.2205810546875, + "loss": 0.8295, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7827253341674805, + "rewards/margins": 3.979945659637451, + "rewards/rejected": -1.1972205638885498, + "step": 3789 + }, + { + "epoch": 0.95, + "grad_norm": 7.900893688201904, + "learning_rate": 2.98420191597234e-06, + "logits/chosen": -0.3830510675907135, + "logits/rejected": -0.502564013004303, + "logps/chosen": -63.15348815917969, + "logps/rejected": -73.42240905761719, + "loss": 0.7913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763697385787964, + "rewards/margins": 4.072882175445557, + "rewards/rejected": -1.3091844320297241, + "step": 3790 + }, + { + "epoch": 0.95, + "grad_norm": 4.395874500274658, + "learning_rate": 2.9818047906153667e-06, + "logits/chosen": -0.31827229261398315, + "logits/rejected": -0.3948930501937866, + "logps/chosen": -61.554954528808594, + "logps/rejected": -81.99944305419922, + "loss": 0.762, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.955683946609497, + "rewards/margins": 4.556382656097412, + "rewards/rejected": -1.6006988286972046, + "step": 3791 + }, + { + "epoch": 0.95, + "grad_norm": 5.733980178833008, + "learning_rate": 2.9794082192965272e-06, + "logits/chosen": -0.24177655577659607, + "logits/rejected": -0.4035027027130127, + "logps/chosen": -56.30251693725586, + "logps/rejected": -74.43771362304688, + "loss": 0.7988, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.866818904876709, + "rewards/margins": 4.570559501647949, + "rewards/rejected": -1.7037410736083984, + "step": 3792 + }, + { + "epoch": 0.95, + "grad_norm": 3.6566569805145264, + "learning_rate": 2.9770122026737334e-06, + "logits/chosen": -0.2932141423225403, + "logits/rejected": -0.35950109362602234, + "logps/chosen": -51.371192932128906, + "logps/rejected": -83.50338745117188, + "loss": 0.6739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1425185203552246, + "rewards/margins": 4.545924186706543, + "rewards/rejected": -1.4034066200256348, + "step": 3793 + }, + { + "epoch": 0.95, + "grad_norm": 7.146121978759766, + "learning_rate": 2.974616741404741e-06, + "logits/chosen": -0.2849506139755249, + "logits/rejected": -0.4436948597431183, + "logps/chosen": -60.456871032714844, + "logps/rejected": -69.81001281738281, + "loss": 0.9053, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.094398260116577, + "rewards/margins": 3.9965217113494873, + "rewards/rejected": -0.9021233320236206, + "step": 3794 + }, + { + "epoch": 0.95, + "grad_norm": 11.179561614990234, + "learning_rate": 2.972221836147159e-06, + "logits/chosen": -0.4015997350215912, + "logits/rejected": -0.5490033030509949, + "logps/chosen": -59.78152084350586, + "logps/rejected": -67.19586944580078, + "loss": 0.8677, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8624515533447266, + "rewards/margins": 4.745610237121582, + "rewards/rejected": -1.8831584453582764, + "step": 3795 + }, + { + "epoch": 0.95, + "grad_norm": 3.0702121257781982, + "learning_rate": 2.9698274875584393e-06, + "logits/chosen": -0.29672861099243164, + "logits/rejected": -0.3649584650993347, + "logps/chosen": -56.066776275634766, + "logps/rejected": -91.71395874023438, + "loss": 0.6497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.012582540512085, + "rewards/margins": 5.203785419464111, + "rewards/rejected": -2.1912026405334473, + "step": 3796 + }, + { + "epoch": 0.95, + "grad_norm": 2.8188843727111816, + "learning_rate": 2.967433696295879e-06, + "logits/chosen": -0.37406325340270996, + "logits/rejected": -0.49284136295318604, + "logps/chosen": -44.594642639160156, + "logps/rejected": -94.83565521240234, + "loss": 0.6315, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8345823287963867, + "rewards/margins": 5.926360130310059, + "rewards/rejected": -3.091777801513672, + "step": 3797 + }, + { + "epoch": 0.95, + "grad_norm": 3.26916766166687, + "learning_rate": 2.965040463016632e-06, + "logits/chosen": -0.24297060072422028, + "logits/rejected": -0.3620541989803314, + "logps/chosen": -60.96215057373047, + "logps/rejected": -85.0335464477539, + "loss": 0.6944, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.96848201751709, + "rewards/margins": 4.610718727111816, + "rewards/rejected": -1.6422362327575684, + "step": 3798 + }, + { + "epoch": 0.95, + "grad_norm": 5.9038214683532715, + "learning_rate": 2.9626477883776856e-06, + "logits/chosen": -0.34615135192871094, + "logits/rejected": -0.3739319443702698, + "logps/chosen": -53.125301361083984, + "logps/rejected": -80.1693115234375, + "loss": 0.8845, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0082035064697266, + "rewards/margins": 3.441254138946533, + "rewards/rejected": -0.43305081129074097, + "step": 3799 + }, + { + "epoch": 0.95, + "grad_norm": 5.3790974617004395, + "learning_rate": 2.9602556730358865e-06, + "logits/chosen": -0.37679436802864075, + "logits/rejected": -0.4458041787147522, + "logps/chosen": -62.84394073486328, + "logps/rejected": -77.24591827392578, + "loss": 0.8084, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0310580730438232, + "rewards/margins": 4.2779860496521, + "rewards/rejected": -1.2469278573989868, + "step": 3800 + }, + { + "epoch": 0.95, + "grad_norm": 6.157062530517578, + "learning_rate": 2.957864117647919e-06, + "logits/chosen": -0.2751784026622772, + "logits/rejected": -0.3825095593929291, + "logps/chosen": -56.681392669677734, + "logps/rejected": -90.00182342529297, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8602170944213867, + "rewards/margins": 4.864542484283447, + "rewards/rejected": -2.0043249130249023, + "step": 3801 + }, + { + "epoch": 0.95, + "grad_norm": 7.424831390380859, + "learning_rate": 2.955473122870316e-06, + "logits/chosen": -0.37607789039611816, + "logits/rejected": -0.43523678183555603, + "logps/chosen": -48.92489242553711, + "logps/rejected": -90.93439483642578, + "loss": 0.7266, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7184252738952637, + "rewards/margins": 5.121807098388672, + "rewards/rejected": -2.403381586074829, + "step": 3802 + }, + { + "epoch": 0.95, + "grad_norm": 3.9488885402679443, + "learning_rate": 2.9530826893594587e-06, + "logits/chosen": -0.28294309973716736, + "logits/rejected": -0.34183233976364136, + "logps/chosen": -53.82809066772461, + "logps/rejected": -88.32798767089844, + "loss": 0.7226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.649070978164673, + "rewards/margins": 4.5408453941345215, + "rewards/rejected": -1.8917747735977173, + "step": 3803 + }, + { + "epoch": 0.95, + "grad_norm": 5.000871181488037, + "learning_rate": 2.950692817771571e-06, + "logits/chosen": -0.3594011664390564, + "logits/rejected": -0.40155020356178284, + "logps/chosen": -53.46880340576172, + "logps/rejected": -99.63250732421875, + "loss": 0.8304, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7122063636779785, + "rewards/margins": 4.365447521209717, + "rewards/rejected": -1.6532413959503174, + "step": 3804 + }, + { + "epoch": 0.95, + "grad_norm": 7.655115604400635, + "learning_rate": 2.9483035087627267e-06, + "logits/chosen": -0.3235973119735718, + "logits/rejected": -0.45300009846687317, + "logps/chosen": -52.364891052246094, + "logps/rejected": -75.72594451904297, + "loss": 0.7629, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8772568702697754, + "rewards/margins": 4.429251194000244, + "rewards/rejected": -1.551994800567627, + "step": 3805 + }, + { + "epoch": 0.95, + "grad_norm": 10.407288551330566, + "learning_rate": 2.9459147629888424e-06, + "logits/chosen": -0.3222067356109619, + "logits/rejected": -0.40440547466278076, + "logps/chosen": -66.65674591064453, + "logps/rejected": -83.07213592529297, + "loss": 0.9211, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4824631214141846, + "rewards/margins": 3.922107219696045, + "rewards/rejected": -1.439644455909729, + "step": 3806 + }, + { + "epoch": 0.95, + "grad_norm": 5.7840352058410645, + "learning_rate": 2.9435265811056763e-06, + "logits/chosen": -0.28896206617355347, + "logits/rejected": -0.3936450481414795, + "logps/chosen": -62.50385665893555, + "logps/rejected": -67.19789123535156, + "loss": 0.8253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.026034355163574, + "rewards/margins": 4.2295026779174805, + "rewards/rejected": -1.2034685611724854, + "step": 3807 + }, + { + "epoch": 0.95, + "grad_norm": 10.573287010192871, + "learning_rate": 2.941138963768842e-06, + "logits/chosen": -0.33683353662490845, + "logits/rejected": -0.48469096422195435, + "logps/chosen": -64.56292724609375, + "logps/rejected": -73.1091537475586, + "loss": 0.9154, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7870538234710693, + "rewards/margins": 4.54000997543335, + "rewards/rejected": -1.7529561519622803, + "step": 3808 + }, + { + "epoch": 0.95, + "grad_norm": 3.22337007522583, + "learning_rate": 2.938751911633786e-06, + "logits/chosen": -0.38371607661247253, + "logits/rejected": -0.453293114900589, + "logps/chosen": -70.17549133300781, + "logps/rejected": -81.17459106445312, + "loss": 0.7547, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.866323709487915, + "rewards/margins": 4.629254341125488, + "rewards/rejected": -1.7629303932189941, + "step": 3809 + }, + { + "epoch": 0.95, + "grad_norm": 2.915976047515869, + "learning_rate": 2.9363654253558112e-06, + "logits/chosen": -0.3242451548576355, + "logits/rejected": -0.42449092864990234, + "logps/chosen": -49.320552825927734, + "logps/rejected": -68.96732330322266, + "loss": 0.6477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1605746746063232, + "rewards/margins": 4.747382164001465, + "rewards/rejected": -1.5868077278137207, + "step": 3810 + }, + { + "epoch": 0.95, + "grad_norm": 4.17093563079834, + "learning_rate": 2.9339795055900557e-06, + "logits/chosen": -0.2944141626358032, + "logits/rejected": -0.38240140676498413, + "logps/chosen": -57.57568359375, + "logps/rejected": -95.48381042480469, + "loss": 0.7185, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.852403163909912, + "rewards/margins": 4.986471176147461, + "rewards/rejected": -2.1340675354003906, + "step": 3811 + }, + { + "epoch": 0.95, + "grad_norm": 6.353901386260986, + "learning_rate": 2.931594152991506e-06, + "logits/chosen": -0.34598037600517273, + "logits/rejected": -0.4493251144886017, + "logps/chosen": -66.4325942993164, + "logps/rejected": -75.55435180664062, + "loss": 0.8302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.86515212059021, + "rewards/margins": 3.8923802375793457, + "rewards/rejected": -1.0272281169891357, + "step": 3812 + }, + { + "epoch": 0.95, + "grad_norm": 5.099472522735596, + "learning_rate": 2.929209368214996e-06, + "logits/chosen": -0.36198127269744873, + "logits/rejected": -0.4819965660572052, + "logps/chosen": -57.992801666259766, + "logps/rejected": -88.1620864868164, + "loss": 0.6551, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7952964305877686, + "rewards/margins": 4.0357184410095215, + "rewards/rejected": -1.240422010421753, + "step": 3813 + }, + { + "epoch": 0.95, + "grad_norm": 5.119798183441162, + "learning_rate": 2.926825151915196e-06, + "logits/chosen": -0.30955371260643005, + "logits/rejected": -0.4079340398311615, + "logps/chosen": -60.82061004638672, + "logps/rejected": -73.38929748535156, + "loss": 0.7928, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8274424076080322, + "rewards/margins": 3.5810961723327637, + "rewards/rejected": -0.7536535859107971, + "step": 3814 + }, + { + "epoch": 0.95, + "grad_norm": 3.133974313735962, + "learning_rate": 2.9244415047466296e-06, + "logits/chosen": -0.32196107506752014, + "logits/rejected": -0.3866761028766632, + "logps/chosen": -46.11245346069336, + "logps/rejected": -94.25743103027344, + "loss": 0.6659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2430574893951416, + "rewards/margins": 6.225385665893555, + "rewards/rejected": -2.982328414916992, + "step": 3815 + }, + { + "epoch": 0.95, + "grad_norm": 13.037921905517578, + "learning_rate": 2.9220584273636567e-06, + "logits/chosen": -0.31073614954948425, + "logits/rejected": -0.4185372591018677, + "logps/chosen": -57.82914733886719, + "logps/rejected": -79.5503921508789, + "loss": 0.7571, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6776654720306396, + "rewards/margins": 4.3812761306762695, + "rewards/rejected": -1.7036101818084717, + "step": 3816 + }, + { + "epoch": 0.95, + "grad_norm": 5.534129619598389, + "learning_rate": 2.919675920420483e-06, + "logits/chosen": -0.2367970496416092, + "logits/rejected": -0.31647199392318726, + "logps/chosen": -66.37744140625, + "logps/rejected": -90.9068374633789, + "loss": 0.8247, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.842045545578003, + "rewards/margins": 4.373006343841553, + "rewards/rejected": -1.5309603214263916, + "step": 3817 + }, + { + "epoch": 0.96, + "grad_norm": 7.260521411895752, + "learning_rate": 2.91729398457116e-06, + "logits/chosen": -0.4292484521865845, + "logits/rejected": -0.49105092883110046, + "logps/chosen": -59.42991638183594, + "logps/rejected": -79.86515808105469, + "loss": 0.7483, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.075150966644287, + "rewards/margins": 4.84368896484375, + "rewards/rejected": -1.7685376405715942, + "step": 3818 + }, + { + "epoch": 0.96, + "grad_norm": 7.508386135101318, + "learning_rate": 2.91491262046958e-06, + "logits/chosen": -0.27809086441993713, + "logits/rejected": -0.4450472593307495, + "logps/chosen": -66.02375030517578, + "logps/rejected": -79.05557250976562, + "loss": 0.8735, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7573742866516113, + "rewards/margins": 4.340963363647461, + "rewards/rejected": -1.5835891962051392, + "step": 3819 + }, + { + "epoch": 0.96, + "grad_norm": 6.048198223114014, + "learning_rate": 2.9125318287694777e-06, + "logits/chosen": -0.37959858775138855, + "logits/rejected": -0.45990490913391113, + "logps/chosen": -54.45167922973633, + "logps/rejected": -76.69036102294922, + "loss": 0.8681, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.794311761856079, + "rewards/margins": 3.9858810901641846, + "rewards/rejected": -1.191569209098816, + "step": 3820 + }, + { + "epoch": 0.96, + "grad_norm": 4.833902835845947, + "learning_rate": 2.9101516101244322e-06, + "logits/chosen": -0.2906225919723511, + "logits/rejected": -0.41262561082839966, + "logps/chosen": -64.63727569580078, + "logps/rejected": -87.68950653076172, + "loss": 0.7766, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.960822582244873, + "rewards/margins": 4.8043107986450195, + "rewards/rejected": -1.843488335609436, + "step": 3821 + }, + { + "epoch": 0.96, + "grad_norm": 16.820425033569336, + "learning_rate": 2.907771965187863e-06, + "logits/chosen": -0.34216251969337463, + "logits/rejected": -0.398881196975708, + "logps/chosen": -52.61117172241211, + "logps/rejected": -74.02015686035156, + "loss": 0.9225, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.75604248046875, + "rewards/margins": 3.5811421871185303, + "rewards/rejected": -0.8250995874404907, + "step": 3822 + }, + { + "epoch": 0.96, + "grad_norm": 6.914705753326416, + "learning_rate": 2.9053928946130393e-06, + "logits/chosen": -0.3710523247718811, + "logits/rejected": -0.44519859552383423, + "logps/chosen": -63.04100036621094, + "logps/rejected": -82.33079528808594, + "loss": 0.7882, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.884488105773926, + "rewards/margins": 4.321550369262695, + "rewards/rejected": -1.43706214427948, + "step": 3823 + }, + { + "epoch": 0.96, + "grad_norm": 14.565485954284668, + "learning_rate": 2.903014399053063e-06, + "logits/chosen": -0.27760475873947144, + "logits/rejected": -0.45015111565589905, + "logps/chosen": -70.19149780273438, + "logps/rejected": -80.70409393310547, + "loss": 0.8818, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.696037769317627, + "rewards/margins": 4.291129112243652, + "rewards/rejected": -1.5950912237167358, + "step": 3824 + }, + { + "epoch": 0.96, + "grad_norm": 5.199642658233643, + "learning_rate": 2.900636479160884e-06, + "logits/chosen": -0.3465014398097992, + "logits/rejected": -0.44412899017333984, + "logps/chosen": -49.1579704284668, + "logps/rejected": -79.892333984375, + "loss": 0.7312, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1408839225769043, + "rewards/margins": 4.102546215057373, + "rewards/rejected": -0.9616629481315613, + "step": 3825 + }, + { + "epoch": 0.96, + "grad_norm": 2.7701382637023926, + "learning_rate": 2.8982591355892918e-06, + "logits/chosen": -0.34027189016342163, + "logits/rejected": -0.4009034037590027, + "logps/chosen": -56.291839599609375, + "logps/rejected": -83.86949157714844, + "loss": 0.6333, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.116752862930298, + "rewards/margins": 4.79871129989624, + "rewards/rejected": -1.681958556175232, + "step": 3826 + }, + { + "epoch": 0.96, + "grad_norm": 4.573342323303223, + "learning_rate": 2.895882368990919e-06, + "logits/chosen": -0.28866130113601685, + "logits/rejected": -0.42005491256713867, + "logps/chosen": -57.51907730102539, + "logps/rejected": -78.19571685791016, + "loss": 0.6904, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2098796367645264, + "rewards/margins": 5.5707478523254395, + "rewards/rejected": -2.360868215560913, + "step": 3827 + }, + { + "epoch": 0.96, + "grad_norm": 4.857256889343262, + "learning_rate": 2.8935061800182395e-06, + "logits/chosen": -0.359203964471817, + "logits/rejected": -0.4682771861553192, + "logps/chosen": -56.15304946899414, + "logps/rejected": -67.53740692138672, + "loss": 0.7423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7393805980682373, + "rewards/margins": 3.7540507316589355, + "rewards/rejected": -1.0146702527999878, + "step": 3828 + }, + { + "epoch": 0.96, + "grad_norm": 6.961142539978027, + "learning_rate": 2.8911305693235657e-06, + "logits/chosen": -0.4468744695186615, + "logits/rejected": -0.4847412109375, + "logps/chosen": -60.70436096191406, + "logps/rejected": -83.18732452392578, + "loss": 0.9703, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2805192470550537, + "rewards/margins": 4.065825939178467, + "rewards/rejected": -1.785306692123413, + "step": 3829 + }, + { + "epoch": 0.96, + "grad_norm": 7.257394313812256, + "learning_rate": 2.888755537559059e-06, + "logits/chosen": -0.28622058033943176, + "logits/rejected": -0.37698251008987427, + "logps/chosen": -57.77703094482422, + "logps/rejected": -82.694091796875, + "loss": 0.8019, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6551918983459473, + "rewards/margins": 4.252471446990967, + "rewards/rejected": -1.5972790718078613, + "step": 3830 + }, + { + "epoch": 0.96, + "grad_norm": 4.409084796905518, + "learning_rate": 2.886381085376714e-06, + "logits/chosen": -0.387400358915329, + "logits/rejected": -0.4945349097251892, + "logps/chosen": -45.4938850402832, + "logps/rejected": -91.08476257324219, + "loss": 0.7305, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.659733295440674, + "rewards/margins": 5.331941604614258, + "rewards/rejected": -2.672208309173584, + "step": 3831 + }, + { + "epoch": 0.96, + "grad_norm": 4.477411270141602, + "learning_rate": 2.884007213428369e-06, + "logits/chosen": -0.32701608538627625, + "logits/rejected": -0.4289809465408325, + "logps/chosen": -64.44027709960938, + "logps/rejected": -84.7315902709961, + "loss": 0.7035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0348591804504395, + "rewards/margins": 4.784243583679199, + "rewards/rejected": -1.7493841648101807, + "step": 3832 + }, + { + "epoch": 0.96, + "grad_norm": 4.844639778137207, + "learning_rate": 2.8816339223657036e-06, + "logits/chosen": -0.2891901433467865, + "logits/rejected": -0.3910399377346039, + "logps/chosen": -50.300880432128906, + "logps/rejected": -65.7414779663086, + "loss": 0.7189, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7649433612823486, + "rewards/margins": 4.079578876495361, + "rewards/rejected": -1.3146357536315918, + "step": 3833 + }, + { + "epoch": 0.96, + "grad_norm": 2.5069797039031982, + "learning_rate": 2.879261212840235e-06, + "logits/chosen": -0.27761226892471313, + "logits/rejected": -0.31653323769569397, + "logps/chosen": -53.24944305419922, + "logps/rejected": -114.40363311767578, + "loss": 0.6732, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2180120944976807, + "rewards/margins": 5.601471900939941, + "rewards/rejected": -2.3834593296051025, + "step": 3834 + }, + { + "epoch": 0.96, + "grad_norm": 3.0137550830841064, + "learning_rate": 2.876889085503328e-06, + "logits/chosen": -0.35268354415893555, + "logits/rejected": -0.43688350915908813, + "logps/chosen": -45.2738037109375, + "logps/rejected": -83.42073822021484, + "loss": 0.6698, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2322258949279785, + "rewards/margins": 5.180370807647705, + "rewards/rejected": -1.9481452703475952, + "step": 3835 + }, + { + "epoch": 0.96, + "grad_norm": 5.536947727203369, + "learning_rate": 2.8745175410061822e-06, + "logits/chosen": -0.3341810405254364, + "logits/rejected": -0.3667867183685303, + "logps/chosen": -56.24226379394531, + "logps/rejected": -80.98777770996094, + "loss": 0.8381, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.654245138168335, + "rewards/margins": 3.1291697025299072, + "rewards/rejected": -0.47492462396621704, + "step": 3836 + }, + { + "epoch": 0.96, + "grad_norm": 5.558230400085449, + "learning_rate": 2.8721465799998314e-06, + "logits/chosen": -0.3374551236629486, + "logits/rejected": -0.4016721248626709, + "logps/chosen": -55.34592056274414, + "logps/rejected": -90.20272064208984, + "loss": 0.7876, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2477850914001465, + "rewards/margins": 3.82804536819458, + "rewards/rejected": -0.580260157585144, + "step": 3837 + }, + { + "epoch": 0.96, + "grad_norm": 2.32987904548645, + "learning_rate": 2.8697762031351617e-06, + "logits/chosen": -0.34118330478668213, + "logits/rejected": -0.4774937033653259, + "logps/chosen": -50.38734436035156, + "logps/rejected": -83.14073181152344, + "loss": 0.5816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.110853910446167, + "rewards/margins": 5.586941242218018, + "rewards/rejected": -2.4760875701904297, + "step": 3838 + }, + { + "epoch": 0.96, + "grad_norm": 8.326977729797363, + "learning_rate": 2.867406411062891e-06, + "logits/chosen": -0.3715175986289978, + "logits/rejected": -0.38687223196029663, + "logps/chosen": -54.25874710083008, + "logps/rejected": -92.07292175292969, + "loss": 0.8126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9257924556732178, + "rewards/margins": 4.404987335205078, + "rewards/rejected": -1.4791951179504395, + "step": 3839 + }, + { + "epoch": 0.96, + "grad_norm": 3.356687545776367, + "learning_rate": 2.8650372044335783e-06, + "logits/chosen": -0.3539304733276367, + "logits/rejected": -0.47002965211868286, + "logps/chosen": -53.679176330566406, + "logps/rejected": -79.16727447509766, + "loss": 0.7167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.812204122543335, + "rewards/margins": 4.92760705947876, + "rewards/rejected": -2.115403175354004, + "step": 3840 + }, + { + "epoch": 0.96, + "grad_norm": 6.47113037109375, + "learning_rate": 2.862668583897622e-06, + "logits/chosen": -0.29765692353248596, + "logits/rejected": -0.43235522508621216, + "logps/chosen": -55.40052795410156, + "logps/rejected": -75.81411743164062, + "loss": 0.6678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.842825174331665, + "rewards/margins": 5.083060264587402, + "rewards/rejected": -2.240234613418579, + "step": 3841 + }, + { + "epoch": 0.96, + "grad_norm": 6.899448871612549, + "learning_rate": 2.860300550105257e-06, + "logits/chosen": -0.2308189868927002, + "logits/rejected": -0.360485702753067, + "logps/chosen": -57.0380744934082, + "logps/rejected": -81.34577178955078, + "loss": 0.8244, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5134353637695312, + "rewards/margins": 4.3914079666137695, + "rewards/rejected": -1.8779722452163696, + "step": 3842 + }, + { + "epoch": 0.96, + "grad_norm": 3.6750168800354004, + "learning_rate": 2.8579331037065637e-06, + "logits/chosen": -0.3581947088241577, + "logits/rejected": -0.37766093015670776, + "logps/chosen": -53.3455696105957, + "logps/rejected": -90.24543762207031, + "loss": 0.8017, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.001251220703125, + "rewards/margins": 4.1071648597717285, + "rewards/rejected": -1.1059136390686035, + "step": 3843 + }, + { + "epoch": 0.96, + "grad_norm": 7.176151275634766, + "learning_rate": 2.8555662453514565e-06, + "logits/chosen": -0.4811022877693176, + "logits/rejected": -0.5792738199234009, + "logps/chosen": -51.94645309448242, + "logps/rejected": -86.33316802978516, + "loss": 0.8224, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0297534465789795, + "rewards/margins": 5.399526596069336, + "rewards/rejected": -2.369772434234619, + "step": 3844 + }, + { + "epoch": 0.96, + "grad_norm": 6.2477192878723145, + "learning_rate": 2.8531999756896878e-06, + "logits/chosen": -0.39561113715171814, + "logits/rejected": -0.428907573223114, + "logps/chosen": -54.792808532714844, + "logps/rejected": -107.91695404052734, + "loss": 0.7582, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.016201972961426, + "rewards/margins": 4.496581554412842, + "rewards/rejected": -1.4803797006607056, + "step": 3845 + }, + { + "epoch": 0.96, + "grad_norm": 6.618109226226807, + "learning_rate": 2.85083429537085e-06, + "logits/chosen": -0.20487308502197266, + "logits/rejected": -0.2562985122203827, + "logps/chosen": -60.452613830566406, + "logps/rejected": -97.00584411621094, + "loss": 0.8251, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.742767333984375, + "rewards/margins": 3.304403781890869, + "rewards/rejected": -0.5616368651390076, + "step": 3846 + }, + { + "epoch": 0.96, + "grad_norm": 2.905181646347046, + "learning_rate": 2.8484692050443717e-06, + "logits/chosen": -0.33074286580085754, + "logits/rejected": -0.45600712299346924, + "logps/chosen": -52.37150192260742, + "logps/rejected": -63.118263244628906, + "loss": 0.6046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0317747592926025, + "rewards/margins": 4.8319830894470215, + "rewards/rejected": -1.8002079725265503, + "step": 3847 + }, + { + "epoch": 0.96, + "grad_norm": 17.049175262451172, + "learning_rate": 2.846104705359527e-06, + "logits/chosen": -0.2611117959022522, + "logits/rejected": -0.3571631610393524, + "logps/chosen": -59.31228256225586, + "logps/rejected": -79.75076293945312, + "loss": 0.8164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8319013118743896, + "rewards/margins": 4.0838446617126465, + "rewards/rejected": -1.2519441843032837, + "step": 3848 + }, + { + "epoch": 0.96, + "grad_norm": 4.169990062713623, + "learning_rate": 2.8437407969654136e-06, + "logits/chosen": -0.4172156751155853, + "logits/rejected": -0.5098539590835571, + "logps/chosen": -47.1096076965332, + "logps/rejected": -78.12503051757812, + "loss": 0.7246, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7992148399353027, + "rewards/margins": 5.0278472900390625, + "rewards/rejected": -2.2286324501037598, + "step": 3849 + }, + { + "epoch": 0.96, + "grad_norm": 3.0182387828826904, + "learning_rate": 2.8413774805109816e-06, + "logits/chosen": -0.38377806544303894, + "logits/rejected": -0.4454111158847809, + "logps/chosen": -48.96623229980469, + "logps/rejected": -86.02433776855469, + "loss": 0.7144, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0788679122924805, + "rewards/margins": 4.856288909912109, + "rewards/rejected": -1.7774211168289185, + "step": 3850 + }, + { + "epoch": 0.96, + "grad_norm": 5.641568660736084, + "learning_rate": 2.839014756645011e-06, + "logits/chosen": -0.4164395034313202, + "logits/rejected": -0.5245938301086426, + "logps/chosen": -49.22856140136719, + "logps/rejected": -78.7356185913086, + "loss": 0.7836, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.739331007003784, + "rewards/margins": 4.417378902435303, + "rewards/rejected": -1.678047776222229, + "step": 3851 + }, + { + "epoch": 0.96, + "grad_norm": 4.311782360076904, + "learning_rate": 2.836652626016121e-06, + "logits/chosen": -0.2725089192390442, + "logits/rejected": -0.4207252562046051, + "logps/chosen": -58.837486267089844, + "logps/rejected": -74.31940460205078, + "loss": 0.6712, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8555099964141846, + "rewards/margins": 4.555299758911133, + "rewards/rejected": -1.6997895240783691, + "step": 3852 + }, + { + "epoch": 0.96, + "grad_norm": 7.727230548858643, + "learning_rate": 2.8342910892727652e-06, + "logits/chosen": -0.4397808611392975, + "logits/rejected": -0.5320838689804077, + "logps/chosen": -47.04241180419922, + "logps/rejected": -63.57773971557617, + "loss": 0.7819, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6228365898132324, + "rewards/margins": 3.8208847045898438, + "rewards/rejected": -1.1980479955673218, + "step": 3853 + }, + { + "epoch": 0.96, + "grad_norm": 6.084052562713623, + "learning_rate": 2.8319301470632367e-06, + "logits/chosen": -0.2848557233810425, + "logits/rejected": -0.433405339717865, + "logps/chosen": -68.37823486328125, + "logps/rejected": -80.26007080078125, + "loss": 0.8615, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.806891679763794, + "rewards/margins": 3.767482280731201, + "rewards/rejected": -0.9605903625488281, + "step": 3854 + }, + { + "epoch": 0.96, + "grad_norm": 4.761147499084473, + "learning_rate": 2.8295698000356674e-06, + "logits/chosen": -0.40452897548675537, + "logits/rejected": -0.4290328621864319, + "logps/chosen": -45.494895935058594, + "logps/rejected": -78.70084381103516, + "loss": 0.7469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9068078994750977, + "rewards/margins": 3.7235755920410156, + "rewards/rejected": -0.8167677521705627, + "step": 3855 + }, + { + "epoch": 0.96, + "grad_norm": 4.046993255615234, + "learning_rate": 2.8272100488380237e-06, + "logits/chosen": -0.3735789954662323, + "logits/rejected": -0.4684009253978729, + "logps/chosen": -58.33439254760742, + "logps/rejected": -85.40476989746094, + "loss": 0.652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9048924446105957, + "rewards/margins": 4.684530258178711, + "rewards/rejected": -1.7796375751495361, + "step": 3856 + }, + { + "epoch": 0.96, + "grad_norm": 6.0416579246521, + "learning_rate": 2.824850894118104e-06, + "logits/chosen": -0.3847060799598694, + "logits/rejected": -0.4695051312446594, + "logps/chosen": -53.06580352783203, + "logps/rejected": -78.62588500976562, + "loss": 0.9616, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.794098138809204, + "rewards/margins": 4.385961532592773, + "rewards/rejected": -1.5918641090393066, + "step": 3857 + }, + { + "epoch": 0.97, + "grad_norm": 5.3521013259887695, + "learning_rate": 2.8224923365235513e-06, + "logits/chosen": -0.4211730360984802, + "logits/rejected": -0.5010238885879517, + "logps/chosen": -53.10960006713867, + "logps/rejected": -77.44674682617188, + "loss": 0.7441, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.126871347427368, + "rewards/margins": 4.461725234985352, + "rewards/rejected": -1.3348548412322998, + "step": 3858 + }, + { + "epoch": 0.97, + "grad_norm": 7.326679229736328, + "learning_rate": 2.8201343767018368e-06, + "logits/chosen": -0.2952539026737213, + "logits/rejected": -0.4305776357650757, + "logps/chosen": -62.645538330078125, + "logps/rejected": -79.46797180175781, + "loss": 0.9866, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.001936435699463, + "rewards/margins": 4.379882335662842, + "rewards/rejected": -1.377945899963379, + "step": 3859 + }, + { + "epoch": 0.97, + "grad_norm": 4.375850677490234, + "learning_rate": 2.817777015300278e-06, + "logits/chosen": -0.3021295368671417, + "logits/rejected": -0.38936054706573486, + "logps/chosen": -53.71471405029297, + "logps/rejected": -89.34195709228516, + "loss": 0.6993, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.828977108001709, + "rewards/margins": 4.463118076324463, + "rewards/rejected": -1.6341403722763062, + "step": 3860 + }, + { + "epoch": 0.97, + "grad_norm": 6.6942243576049805, + "learning_rate": 2.815420252966015e-06, + "logits/chosen": -0.37134429812431335, + "logits/rejected": -0.4312260150909424, + "logps/chosen": -52.996726989746094, + "logps/rejected": -85.83809661865234, + "loss": 0.7404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0652449131011963, + "rewards/margins": 4.6366376876831055, + "rewards/rejected": -1.5713930130004883, + "step": 3861 + }, + { + "epoch": 0.97, + "grad_norm": 2.9850375652313232, + "learning_rate": 2.8130640903460305e-06, + "logits/chosen": -0.2312156856060028, + "logits/rejected": -0.3718431293964386, + "logps/chosen": -71.42424011230469, + "logps/rejected": -78.2056884765625, + "loss": 0.7848, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9488444328308105, + "rewards/margins": 4.715677738189697, + "rewards/rejected": -1.766833782196045, + "step": 3862 + }, + { + "epoch": 0.97, + "grad_norm": 5.413836479187012, + "learning_rate": 2.810708528087145e-06, + "logits/chosen": -0.43230384588241577, + "logits/rejected": -0.5394826531410217, + "logps/chosen": -52.20027542114258, + "logps/rejected": -71.34477233886719, + "loss": 0.9507, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8161864280700684, + "rewards/margins": 3.7649717330932617, + "rewards/rejected": -0.9487852454185486, + "step": 3863 + }, + { + "epoch": 0.97, + "grad_norm": 23.31391143798828, + "learning_rate": 2.80835356683601e-06, + "logits/chosen": -0.35058391094207764, + "logits/rejected": -0.4131218492984772, + "logps/chosen": -65.37330627441406, + "logps/rejected": -86.3395004272461, + "loss": 0.7633, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0274040699005127, + "rewards/margins": 4.787491321563721, + "rewards/rejected": -1.7600871324539185, + "step": 3864 + }, + { + "epoch": 0.97, + "grad_norm": 5.697617053985596, + "learning_rate": 2.805999207239113e-06, + "logits/chosen": -0.23494696617126465, + "logits/rejected": -0.42034345865249634, + "logps/chosen": -59.71894073486328, + "logps/rejected": -66.53701782226562, + "loss": 0.8122, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7307522296905518, + "rewards/margins": 4.315727233886719, + "rewards/rejected": -1.5849751234054565, + "step": 3865 + }, + { + "epoch": 0.97, + "grad_norm": 3.817793607711792, + "learning_rate": 2.803645449942776e-06, + "logits/chosen": -0.3661665916442871, + "logits/rejected": -0.48371031880378723, + "logps/chosen": -62.170684814453125, + "logps/rejected": -79.46187591552734, + "loss": 0.686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.074681282043457, + "rewards/margins": 5.080376148223877, + "rewards/rejected": -2.005695343017578, + "step": 3866 + }, + { + "epoch": 0.97, + "grad_norm": 4.5818915367126465, + "learning_rate": 2.8012922955931554e-06, + "logits/chosen": -0.21897733211517334, + "logits/rejected": -0.3451104462146759, + "logps/chosen": -63.98914337158203, + "logps/rejected": -81.71636962890625, + "loss": 0.759, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.198986053466797, + "rewards/margins": 4.078911304473877, + "rewards/rejected": -0.8799257278442383, + "step": 3867 + }, + { + "epoch": 0.97, + "grad_norm": 7.969133377075195, + "learning_rate": 2.798939744836247e-06, + "logits/chosen": -0.24729031324386597, + "logits/rejected": -0.34385284781455994, + "logps/chosen": -62.87958908081055, + "logps/rejected": -88.23777770996094, + "loss": 0.7641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8416450023651123, + "rewards/margins": 4.788026809692383, + "rewards/rejected": -1.9463821649551392, + "step": 3868 + }, + { + "epoch": 0.97, + "grad_norm": 7.932593822479248, + "learning_rate": 2.7965877983178703e-06, + "logits/chosen": -0.3687456548213959, + "logits/rejected": -0.42551010847091675, + "logps/chosen": -60.325374603271484, + "logps/rejected": -92.22218322753906, + "loss": 0.7947, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.738964080810547, + "rewards/margins": 4.257696151733398, + "rewards/rejected": -1.5187320709228516, + "step": 3869 + }, + { + "epoch": 0.97, + "grad_norm": 6.2391839027404785, + "learning_rate": 2.794236456683691e-06, + "logits/chosen": -0.4377691149711609, + "logits/rejected": -0.5336549282073975, + "logps/chosen": -51.101417541503906, + "logps/rejected": -84.11466979980469, + "loss": 0.7616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7378923892974854, + "rewards/margins": 4.519821643829346, + "rewards/rejected": -1.7819292545318604, + "step": 3870 + }, + { + "epoch": 0.97, + "grad_norm": 3.1294193267822266, + "learning_rate": 2.7918857205792005e-06, + "logits/chosen": -0.3526727855205536, + "logits/rejected": -0.47573122382164, + "logps/chosen": -53.08019256591797, + "logps/rejected": -84.43376159667969, + "loss": 0.7147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0885376930236816, + "rewards/margins": 4.683422565460205, + "rewards/rejected": -1.5948851108551025, + "step": 3871 + }, + { + "epoch": 0.97, + "grad_norm": 7.680455684661865, + "learning_rate": 2.789535590649728e-06, + "logits/chosen": -0.29278451204299927, + "logits/rejected": -0.3746013939380646, + "logps/chosen": -51.66456604003906, + "logps/rejected": -67.5208511352539, + "loss": 0.7841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.936004877090454, + "rewards/margins": 4.085213661193848, + "rewards/rejected": -1.1492085456848145, + "step": 3872 + }, + { + "epoch": 0.97, + "grad_norm": 6.346756458282471, + "learning_rate": 2.7871860675404326e-06, + "logits/chosen": -0.3486759066581726, + "logits/rejected": -0.5148454308509827, + "logps/chosen": -64.66292572021484, + "logps/rejected": -83.75946044921875, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.879051685333252, + "rewards/margins": 5.282118320465088, + "rewards/rejected": -2.403066873550415, + "step": 3873 + }, + { + "epoch": 0.97, + "grad_norm": 2.577881097793579, + "learning_rate": 2.7848371518963093e-06, + "logits/chosen": -0.3298412561416626, + "logits/rejected": -0.39997708797454834, + "logps/chosen": -54.46519470214844, + "logps/rejected": -75.57526397705078, + "loss": 0.681, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.327195644378662, + "rewards/margins": 4.3737311363220215, + "rewards/rejected": -1.0465354919433594, + "step": 3874 + }, + { + "epoch": 0.97, + "grad_norm": 3.1020774841308594, + "learning_rate": 2.782488844362189e-06, + "logits/chosen": -0.34993427991867065, + "logits/rejected": -0.45112770795822144, + "logps/chosen": -55.690185546875, + "logps/rejected": -64.67207336425781, + "loss": 0.8103, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9415407180786133, + "rewards/margins": 4.711136817932129, + "rewards/rejected": -1.769595742225647, + "step": 3875 + }, + { + "epoch": 0.97, + "grad_norm": 3.7250592708587646, + "learning_rate": 2.780141145582731e-06, + "logits/chosen": -0.25213637948036194, + "logits/rejected": -0.3302410840988159, + "logps/chosen": -54.84402847290039, + "logps/rejected": -105.8291244506836, + "loss": 0.6567, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.784179925918579, + "rewards/margins": 4.331906318664551, + "rewards/rejected": -1.5477262735366821, + "step": 3876 + }, + { + "epoch": 0.97, + "grad_norm": 4.146730899810791, + "learning_rate": 2.77779405620243e-06, + "logits/chosen": -0.30830878019332886, + "logits/rejected": -0.44133713841438293, + "logps/chosen": -48.901283264160156, + "logps/rejected": -67.71540069580078, + "loss": 0.7382, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.816033124923706, + "rewards/margins": 4.444389820098877, + "rewards/rejected": -1.6283563375473022, + "step": 3877 + }, + { + "epoch": 0.97, + "grad_norm": 5.327227592468262, + "learning_rate": 2.775447576865611e-06, + "logits/chosen": -0.2877851724624634, + "logits/rejected": -0.3640408515930176, + "logps/chosen": -60.888572692871094, + "logps/rejected": -93.94930267333984, + "loss": 1.0057, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9058022499084473, + "rewards/margins": 4.118518829345703, + "rewards/rejected": -1.2127165794372559, + "step": 3878 + }, + { + "epoch": 0.97, + "grad_norm": 3.086681365966797, + "learning_rate": 2.7731017082164326e-06, + "logits/chosen": -0.30341610312461853, + "logits/rejected": -0.36141321063041687, + "logps/chosen": -49.7723274230957, + "logps/rejected": -88.39234161376953, + "loss": 0.6135, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.167429208755493, + "rewards/margins": 4.292945861816406, + "rewards/rejected": -1.1255168914794922, + "step": 3879 + }, + { + "epoch": 0.97, + "grad_norm": 13.313218116760254, + "learning_rate": 2.7707564508988917e-06, + "logits/chosen": -0.3120426535606384, + "logits/rejected": -0.4427691400051117, + "logps/chosen": -53.72687530517578, + "logps/rejected": -70.83236694335938, + "loss": 0.7535, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9364473819732666, + "rewards/margins": 4.558290481567383, + "rewards/rejected": -1.621842861175537, + "step": 3880 + }, + { + "epoch": 0.97, + "grad_norm": 3.7952427864074707, + "learning_rate": 2.7684118055568064e-06, + "logits/chosen": -0.26687076687812805, + "logits/rejected": -0.35960614681243896, + "logps/chosen": -67.85078430175781, + "logps/rejected": -114.62982177734375, + "loss": 0.7574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8804640769958496, + "rewards/margins": 5.163261413574219, + "rewards/rejected": -2.282797336578369, + "step": 3881 + }, + { + "epoch": 0.97, + "grad_norm": 8.167878150939941, + "learning_rate": 2.7660677728338324e-06, + "logits/chosen": -0.3443077504634857, + "logits/rejected": -0.49480944871902466, + "logps/chosen": -62.81861877441406, + "logps/rejected": -66.07820892333984, + "loss": 0.8439, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0687716007232666, + "rewards/margins": 4.251245498657227, + "rewards/rejected": -1.18247389793396, + "step": 3882 + }, + { + "epoch": 0.97, + "grad_norm": 8.027339935302734, + "learning_rate": 2.7637243533734615e-06, + "logits/chosen": -0.27901139855384827, + "logits/rejected": -0.4265490174293518, + "logps/chosen": -58.1176643371582, + "logps/rejected": -91.0285873413086, + "loss": 0.7401, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.009525775909424, + "rewards/margins": 5.526054859161377, + "rewards/rejected": -2.516528844833374, + "step": 3883 + }, + { + "epoch": 0.97, + "grad_norm": 8.67383098602295, + "learning_rate": 2.761381547819011e-06, + "logits/chosen": -0.23933552205562592, + "logits/rejected": -0.3218597173690796, + "logps/chosen": -66.87394714355469, + "logps/rejected": -87.22560119628906, + "loss": 0.7765, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.942487955093384, + "rewards/margins": 4.140592575073242, + "rewards/rejected": -1.1981048583984375, + "step": 3884 + }, + { + "epoch": 0.97, + "grad_norm": 5.696136951446533, + "learning_rate": 2.7590393568136304e-06, + "logits/chosen": -0.30858755111694336, + "logits/rejected": -0.3802568018436432, + "logps/chosen": -56.980995178222656, + "logps/rejected": -82.05806732177734, + "loss": 0.8004, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.626689910888672, + "rewards/margins": 4.264531135559082, + "rewards/rejected": -1.637840747833252, + "step": 3885 + }, + { + "epoch": 0.97, + "grad_norm": 3.9909801483154297, + "learning_rate": 2.7566977810003025e-06, + "logits/chosen": -0.2871167063713074, + "logits/rejected": -0.4000956118106842, + "logps/chosen": -67.82551574707031, + "logps/rejected": -84.3858871459961, + "loss": 0.7667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7201321125030518, + "rewards/margins": 4.181797027587891, + "rewards/rejected": -1.4616646766662598, + "step": 3886 + }, + { + "epoch": 0.97, + "grad_norm": 5.147113800048828, + "learning_rate": 2.7543568210218384e-06, + "logits/chosen": -0.46821969747543335, + "logits/rejected": -0.5573586821556091, + "logps/chosen": -60.75239181518555, + "logps/rejected": -89.55591583251953, + "loss": 0.8942, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7053158283233643, + "rewards/margins": 4.58860445022583, + "rewards/rejected": -1.883288860321045, + "step": 3887 + }, + { + "epoch": 0.97, + "grad_norm": 4.494164943695068, + "learning_rate": 2.7520164775208867e-06, + "logits/chosen": -0.4210880398750305, + "logits/rejected": -0.47044190764427185, + "logps/chosen": -44.23954772949219, + "logps/rejected": -87.05375671386719, + "loss": 0.6069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8720734119415283, + "rewards/margins": 4.813425064086914, + "rewards/rejected": -1.9413515329360962, + "step": 3888 + }, + { + "epoch": 0.97, + "grad_norm": 4.441624164581299, + "learning_rate": 2.749676751139919e-06, + "logits/chosen": -0.23353910446166992, + "logits/rejected": -0.3890846073627472, + "logps/chosen": -65.39503479003906, + "logps/rejected": -79.93931579589844, + "loss": 0.7268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6998422145843506, + "rewards/margins": 4.39689302444458, + "rewards/rejected": -1.697050929069519, + "step": 3889 + }, + { + "epoch": 0.97, + "grad_norm": 5.519229888916016, + "learning_rate": 2.7473376425212415e-06, + "logits/chosen": -0.3699894845485687, + "logits/rejected": -0.4493323862552643, + "logps/chosen": -41.96424102783203, + "logps/rejected": -89.61441040039062, + "loss": 0.6984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9996867179870605, + "rewards/margins": 5.822863578796387, + "rewards/rejected": -2.8231773376464844, + "step": 3890 + }, + { + "epoch": 0.97, + "grad_norm": 5.624720573425293, + "learning_rate": 2.7449991523069896e-06, + "logits/chosen": -0.40031886100769043, + "logits/rejected": -0.4596194922924042, + "logps/chosen": -67.66547393798828, + "logps/rejected": -78.66539764404297, + "loss": 0.8454, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.841547966003418, + "rewards/margins": 3.849534511566162, + "rewards/rejected": -1.0079865455627441, + "step": 3891 + }, + { + "epoch": 0.97, + "grad_norm": 4.56769323348999, + "learning_rate": 2.7426612811391295e-06, + "logits/chosen": -0.32697996497154236, + "logits/rejected": -0.4146256148815155, + "logps/chosen": -60.096405029296875, + "logps/rejected": -83.90303802490234, + "loss": 0.7722, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9727931022644043, + "rewards/margins": 5.388924598693848, + "rewards/rejected": -2.4161324501037598, + "step": 3892 + }, + { + "epoch": 0.97, + "grad_norm": 13.036590576171875, + "learning_rate": 2.7403240296594567e-06, + "logits/chosen": -0.3635137677192688, + "logits/rejected": -0.3962080478668213, + "logps/chosen": -54.01325607299805, + "logps/rejected": -87.34196472167969, + "loss": 0.794, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8432655334472656, + "rewards/margins": 3.954284906387329, + "rewards/rejected": -1.1110193729400635, + "step": 3893 + }, + { + "epoch": 0.97, + "grad_norm": 3.333181142807007, + "learning_rate": 2.7379873985095967e-06, + "logits/chosen": -0.2703353464603424, + "logits/rejected": -0.3956136405467987, + "logps/chosen": -60.60424041748047, + "logps/rejected": -85.72818756103516, + "loss": 0.6661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.983386754989624, + "rewards/margins": 4.845707416534424, + "rewards/rejected": -1.8623203039169312, + "step": 3894 + }, + { + "epoch": 0.97, + "grad_norm": 4.62307596206665, + "learning_rate": 2.7356513883310075e-06, + "logits/chosen": -0.3456781506538391, + "logits/rejected": -0.3723486661911011, + "logps/chosen": -59.94864273071289, + "logps/rejected": -97.80950164794922, + "loss": 0.7578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5690722465515137, + "rewards/margins": 4.593751430511475, + "rewards/rejected": -2.0246787071228027, + "step": 3895 + }, + { + "epoch": 0.97, + "grad_norm": 4.8045430183410645, + "learning_rate": 2.733315999764974e-06, + "logits/chosen": -0.34838205575942993, + "logits/rejected": -0.4305901825428009, + "logps/chosen": -56.6619987487793, + "logps/rejected": -77.45761108398438, + "loss": 0.7033, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.109551429748535, + "rewards/margins": 3.5761821269989014, + "rewards/rejected": -0.466630756855011, + "step": 3896 + }, + { + "epoch": 0.97, + "grad_norm": 9.428738594055176, + "learning_rate": 2.730981233452609e-06, + "logits/chosen": -0.2621924579143524, + "logits/rejected": -0.35810863971710205, + "logps/chosen": -65.76725006103516, + "logps/rejected": -76.73744201660156, + "loss": 0.8912, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.640897750854492, + "rewards/margins": 4.30533504486084, + "rewards/rejected": -1.6644374132156372, + "step": 3897 + }, + { + "epoch": 0.98, + "grad_norm": 3.4952752590179443, + "learning_rate": 2.728647090034856e-06, + "logits/chosen": -0.2954210937023163, + "logits/rejected": -0.38804978132247925, + "logps/chosen": -59.131446838378906, + "logps/rejected": -76.45292663574219, + "loss": 0.8184, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.851062774658203, + "rewards/margins": 4.097494602203369, + "rewards/rejected": -1.2464319467544556, + "step": 3898 + }, + { + "epoch": 0.98, + "grad_norm": 4.62460470199585, + "learning_rate": 2.726313570152488e-06, + "logits/chosen": -0.3008948266506195, + "logits/rejected": -0.4026862680912018, + "logps/chosen": -51.303985595703125, + "logps/rejected": -79.29444122314453, + "loss": 0.7249, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.870995283126831, + "rewards/margins": 4.4021477699279785, + "rewards/rejected": -1.5311527252197266, + "step": 3899 + }, + { + "epoch": 0.98, + "grad_norm": 5.4033708572387695, + "learning_rate": 2.7239806744461095e-06, + "logits/chosen": -0.4041638970375061, + "logits/rejected": -0.4642544388771057, + "logps/chosen": -55.659427642822266, + "logps/rejected": -87.62017822265625, + "loss": 0.8083, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9212324619293213, + "rewards/margins": 4.435013771057129, + "rewards/rejected": -1.513781189918518, + "step": 3900 + }, + { + "epoch": 0.98, + "grad_norm": 6.811942100524902, + "learning_rate": 2.72164840355615e-06, + "logits/chosen": -0.267699271440506, + "logits/rejected": -0.31854259967803955, + "logps/chosen": -56.847476959228516, + "logps/rejected": -82.34284210205078, + "loss": 0.6743, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1161437034606934, + "rewards/margins": 4.415144920349121, + "rewards/rejected": -1.299000859260559, + "step": 3901 + }, + { + "epoch": 0.98, + "grad_norm": 3.8354949951171875, + "learning_rate": 2.719316758122862e-06, + "logits/chosen": -0.35149896144866943, + "logits/rejected": -0.44075268507003784, + "logps/chosen": -56.829402923583984, + "logps/rejected": -72.57931518554688, + "loss": 0.7292, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.815917491912842, + "rewards/margins": 4.313839912414551, + "rewards/rejected": -1.4979231357574463, + "step": 3902 + }, + { + "epoch": 0.98, + "grad_norm": 5.086442470550537, + "learning_rate": 2.7169857387863398e-06, + "logits/chosen": -0.22415193915367126, + "logits/rejected": -0.3717500567436218, + "logps/chosen": -67.71563720703125, + "logps/rejected": -66.79845428466797, + "loss": 0.7634, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.039464235305786, + "rewards/margins": 5.313612937927246, + "rewards/rejected": -2.274148941040039, + "step": 3903 + }, + { + "epoch": 0.98, + "grad_norm": 7.040485382080078, + "learning_rate": 2.7146553461864943e-06, + "logits/chosen": -0.3906569480895996, + "logits/rejected": -0.41306424140930176, + "logps/chosen": -49.46440505981445, + "logps/rejected": -81.89336395263672, + "loss": 1.0054, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0605273246765137, + "rewards/margins": 3.899658203125, + "rewards/rejected": -0.8391305208206177, + "step": 3904 + }, + { + "epoch": 0.98, + "grad_norm": 4.434136390686035, + "learning_rate": 2.7123255809630734e-06, + "logits/chosen": -0.2945947051048279, + "logits/rejected": -0.3890344500541687, + "logps/chosen": -53.800048828125, + "logps/rejected": -91.80010223388672, + "loss": 0.7203, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6908555030822754, + "rewards/margins": 4.580103397369385, + "rewards/rejected": -1.889248013496399, + "step": 3905 + }, + { + "epoch": 0.98, + "grad_norm": 2.7239651679992676, + "learning_rate": 2.709996443755643e-06, + "logits/chosen": -0.38396185636520386, + "logits/rejected": -0.4710436463356018, + "logps/chosen": -47.3280029296875, + "logps/rejected": -80.92851257324219, + "loss": 0.6621, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.252105236053467, + "rewards/margins": 5.661241054534912, + "rewards/rejected": -2.4091365337371826, + "step": 3906 + }, + { + "epoch": 0.98, + "grad_norm": 9.307403564453125, + "learning_rate": 2.707667935203601e-06, + "logits/chosen": -0.3338965177536011, + "logits/rejected": -0.5100255608558655, + "logps/chosen": -63.61023712158203, + "logps/rejected": -57.591941833496094, + "loss": 0.972, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7959516048431396, + "rewards/margins": 4.240954875946045, + "rewards/rejected": -1.4450032711029053, + "step": 3907 + }, + { + "epoch": 0.98, + "grad_norm": 5.0556206703186035, + "learning_rate": 2.705340055946177e-06, + "logits/chosen": -0.2660349905490875, + "logits/rejected": -0.4073478579521179, + "logps/chosen": -51.62147903442383, + "logps/rejected": -72.58528137207031, + "loss": 0.7621, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6656558513641357, + "rewards/margins": 5.0615739822387695, + "rewards/rejected": -2.395918369293213, + "step": 3908 + }, + { + "epoch": 0.98, + "grad_norm": 4.733033180236816, + "learning_rate": 2.703012806622424e-06, + "logits/chosen": -0.33826255798339844, + "logits/rejected": -0.41718050837516785, + "logps/chosen": -49.1987190246582, + "logps/rejected": -96.36176300048828, + "loss": 0.6967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.161926746368408, + "rewards/margins": 4.785607814788818, + "rewards/rejected": -1.623680591583252, + "step": 3909 + }, + { + "epoch": 0.98, + "grad_norm": 3.365797519683838, + "learning_rate": 2.7006861878712193e-06, + "logits/chosen": -0.27263158559799194, + "logits/rejected": -0.3159589469432831, + "logps/chosen": -56.211151123046875, + "logps/rejected": -82.71585083007812, + "loss": 0.6408, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.969391107559204, + "rewards/margins": 4.7119011878967285, + "rewards/rejected": -1.7425105571746826, + "step": 3910 + }, + { + "epoch": 0.98, + "grad_norm": 5.53394079208374, + "learning_rate": 2.6983602003312715e-06, + "logits/chosen": -0.3064153492450714, + "logits/rejected": -0.44561779499053955, + "logps/chosen": -55.30731201171875, + "logps/rejected": -70.80542755126953, + "loss": 0.7682, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.83784818649292, + "rewards/margins": 4.286174297332764, + "rewards/rejected": -1.4483262300491333, + "step": 3911 + }, + { + "epoch": 0.98, + "grad_norm": 4.871384143829346, + "learning_rate": 2.6960348446411123e-06, + "logits/chosen": -0.3815695643424988, + "logits/rejected": -0.47565940022468567, + "logps/chosen": -59.208396911621094, + "logps/rejected": -81.90988159179688, + "loss": 0.8462, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8249025344848633, + "rewards/margins": 4.479709625244141, + "rewards/rejected": -1.6548073291778564, + "step": 3912 + }, + { + "epoch": 0.98, + "grad_norm": 4.948055744171143, + "learning_rate": 2.693710121439108e-06, + "logits/chosen": -0.3384394645690918, + "logits/rejected": -0.43872496485710144, + "logps/chosen": -50.22688674926758, + "logps/rejected": -77.55081939697266, + "loss": 0.7377, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.863957405090332, + "rewards/margins": 4.537585258483887, + "rewards/rejected": -1.673627257347107, + "step": 3913 + }, + { + "epoch": 0.98, + "grad_norm": 5.95556640625, + "learning_rate": 2.6913860313634373e-06, + "logits/chosen": -0.2280440330505371, + "logits/rejected": -0.32976317405700684, + "logps/chosen": -50.514251708984375, + "logps/rejected": -74.43892669677734, + "loss": 0.8206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.145895481109619, + "rewards/margins": 4.166338920593262, + "rewards/rejected": -1.0204434394836426, + "step": 3914 + }, + { + "epoch": 0.98, + "grad_norm": 4.709837913513184, + "learning_rate": 2.689062575052119e-06, + "logits/chosen": -0.2448655068874359, + "logits/rejected": -0.343113511800766, + "logps/chosen": -65.50311279296875, + "logps/rejected": -72.40284729003906, + "loss": 0.8539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9700686931610107, + "rewards/margins": 3.9333012104034424, + "rewards/rejected": -0.9632322788238525, + "step": 3915 + }, + { + "epoch": 0.98, + "grad_norm": 6.3516526222229, + "learning_rate": 2.6867397531429895e-06, + "logits/chosen": -0.3843154311180115, + "logits/rejected": -0.45842134952545166, + "logps/chosen": -54.212669372558594, + "logps/rejected": -88.63102722167969, + "loss": 0.9923, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0166659355163574, + "rewards/margins": 3.2982351779937744, + "rewards/rejected": -0.281569242477417, + "step": 3916 + }, + { + "epoch": 0.98, + "grad_norm": 4.714919567108154, + "learning_rate": 2.6844175662737137e-06, + "logits/chosen": -0.39237239956855774, + "logits/rejected": -0.48651033639907837, + "logps/chosen": -74.51728820800781, + "logps/rejected": -84.90557861328125, + "loss": 0.776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2070746421813965, + "rewards/margins": 4.1313395500183105, + "rewards/rejected": -0.9242650866508484, + "step": 3917 + }, + { + "epoch": 0.98, + "grad_norm": 5.737483978271484, + "learning_rate": 2.6820960150817833e-06, + "logits/chosen": -0.3072114586830139, + "logits/rejected": -0.3782065510749817, + "logps/chosen": -63.24221420288086, + "logps/rejected": -78.23504638671875, + "loss": 0.9452, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8810606002807617, + "rewards/margins": 3.6512131690979004, + "rewards/rejected": -0.7701525092124939, + "step": 3918 + }, + { + "epoch": 0.98, + "grad_norm": 2.9290802478790283, + "learning_rate": 2.6797751002045102e-06, + "logits/chosen": -0.288163959980011, + "logits/rejected": -0.2974386513233185, + "logps/chosen": -47.048099517822266, + "logps/rejected": -84.25821685791016, + "loss": 0.6901, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2224535942077637, + "rewards/margins": 4.282464981079102, + "rewards/rejected": -1.0600111484527588, + "step": 3919 + }, + { + "epoch": 0.98, + "grad_norm": 4.979907512664795, + "learning_rate": 2.6774548222790415e-06, + "logits/chosen": -0.34307220578193665, + "logits/rejected": -0.46160271763801575, + "logps/chosen": -65.0989990234375, + "logps/rejected": -89.50191497802734, + "loss": 0.7387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.944276809692383, + "rewards/margins": 5.0329437255859375, + "rewards/rejected": -2.0886666774749756, + "step": 3920 + }, + { + "epoch": 0.98, + "grad_norm": 9.306276321411133, + "learning_rate": 2.67513518194234e-06, + "logits/chosen": -0.267194539308548, + "logits/rejected": -0.40464651584625244, + "logps/chosen": -49.56206512451172, + "logps/rejected": -69.98443603515625, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1488821506500244, + "rewards/margins": 4.948721885681152, + "rewards/rejected": -1.7998393774032593, + "step": 3921 + }, + { + "epoch": 0.98, + "grad_norm": 4.182176113128662, + "learning_rate": 2.672816179831198e-06, + "logits/chosen": -0.32570651173591614, + "logits/rejected": -0.4423300623893738, + "logps/chosen": -47.28030776977539, + "logps/rejected": -74.5639877319336, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.117202043533325, + "rewards/margins": 5.398626327514648, + "rewards/rejected": -2.2814245223999023, + "step": 3922 + }, + { + "epoch": 0.98, + "grad_norm": 18.162233352661133, + "learning_rate": 2.6704978165822308e-06, + "logits/chosen": -0.3565097451210022, + "logits/rejected": -0.4591807425022125, + "logps/chosen": -54.37044906616211, + "logps/rejected": -70.7437515258789, + "loss": 0.7021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7957630157470703, + "rewards/margins": 4.654142379760742, + "rewards/rejected": -1.858379602432251, + "step": 3923 + }, + { + "epoch": 0.98, + "grad_norm": 4.698780059814453, + "learning_rate": 2.6681800928318778e-06, + "logits/chosen": -0.43298831582069397, + "logits/rejected": -0.4405500888824463, + "logps/chosen": -45.53565979003906, + "logps/rejected": -91.98675537109375, + "loss": 0.8092, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020048141479492, + "rewards/margins": 4.541470050811768, + "rewards/rejected": -1.5214221477508545, + "step": 3924 + }, + { + "epoch": 0.98, + "grad_norm": 4.830526828765869, + "learning_rate": 2.6658630092164107e-06, + "logits/chosen": -0.28096142411231995, + "logits/rejected": -0.41933709383010864, + "logps/chosen": -54.270729064941406, + "logps/rejected": -80.4736328125, + "loss": 0.704, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1324009895324707, + "rewards/margins": 5.016627311706543, + "rewards/rejected": -1.8842264413833618, + "step": 3925 + }, + { + "epoch": 0.98, + "grad_norm": 4.432183265686035, + "learning_rate": 2.663546566371912e-06, + "logits/chosen": -0.3766115605831146, + "logits/rejected": -0.4361581802368164, + "logps/chosen": -63.4067268371582, + "logps/rejected": -108.74026489257812, + "loss": 0.8229, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0022647380828857, + "rewards/margins": 5.189311504364014, + "rewards/rejected": -2.187046527862549, + "step": 3926 + }, + { + "epoch": 0.98, + "grad_norm": 3.858459234237671, + "learning_rate": 2.661230764934295e-06, + "logits/chosen": -0.33540648221969604, + "logits/rejected": -0.3853702247142792, + "logps/chosen": -45.34238052368164, + "logps/rejected": -81.55985260009766, + "loss": 0.6278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9492132663726807, + "rewards/margins": 3.8017420768737793, + "rewards/rejected": -0.8525286912918091, + "step": 3927 + }, + { + "epoch": 0.98, + "grad_norm": 9.088179588317871, + "learning_rate": 2.6589156055393023e-06, + "logits/chosen": -0.21308191120624542, + "logits/rejected": -0.3055000603199005, + "logps/chosen": -72.55746459960938, + "logps/rejected": -82.98151397705078, + "loss": 1.0943, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.9924824237823486, + "rewards/margins": 3.6661577224731445, + "rewards/rejected": -0.6736751794815063, + "step": 3928 + }, + { + "epoch": 0.98, + "grad_norm": 3.704348564147949, + "learning_rate": 2.6566010888224925e-06, + "logits/chosen": -0.2738979160785675, + "logits/rejected": -0.3909013867378235, + "logps/chosen": -53.82808303833008, + "logps/rejected": -84.34142303466797, + "loss": 0.7003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.915006399154663, + "rewards/margins": 4.770271301269531, + "rewards/rejected": -1.8552649021148682, + "step": 3929 + }, + { + "epoch": 0.98, + "grad_norm": 7.571136474609375, + "learning_rate": 2.65428721541925e-06, + "logits/chosen": -0.2794751822948456, + "logits/rejected": -0.3624188303947449, + "logps/chosen": -68.42938232421875, + "logps/rejected": -97.64373779296875, + "loss": 0.9062, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.775496006011963, + "rewards/margins": 3.692596435546875, + "rewards/rejected": -0.9171005487442017, + "step": 3930 + }, + { + "epoch": 0.98, + "grad_norm": 4.422749996185303, + "learning_rate": 2.651973985964783e-06, + "logits/chosen": -0.38444775342941284, + "logits/rejected": -0.3925573229789734, + "logps/chosen": -65.49039459228516, + "logps/rejected": -95.3531494140625, + "loss": 0.8111, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8788485527038574, + "rewards/margins": 4.529945373535156, + "rewards/rejected": -1.651097059249878, + "step": 3931 + }, + { + "epoch": 0.98, + "grad_norm": 5.831296443939209, + "learning_rate": 2.6496614010941217e-06, + "logits/chosen": -0.24971367418766022, + "logits/rejected": -0.39612701535224915, + "logps/chosen": -73.03684997558594, + "logps/rejected": -61.39824295043945, + "loss": 0.9055, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.891963243484497, + "rewards/margins": 4.305593013763428, + "rewards/rejected": -1.4136295318603516, + "step": 3932 + }, + { + "epoch": 0.98, + "grad_norm": 5.7906975746154785, + "learning_rate": 2.6473494614421247e-06, + "logits/chosen": -0.2867732048034668, + "logits/rejected": -0.3824058771133423, + "logps/chosen": -54.822322845458984, + "logps/rejected": -65.41609191894531, + "loss": 0.9687, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.880092144012451, + "rewards/margins": 3.331603527069092, + "rewards/rejected": -0.4515113830566406, + "step": 3933 + }, + { + "epoch": 0.98, + "grad_norm": 4.243691444396973, + "learning_rate": 2.6450381676434657e-06, + "logits/chosen": -0.33157044649124146, + "logits/rejected": -0.41377848386764526, + "logps/chosen": -45.50914001464844, + "logps/rejected": -69.28155517578125, + "loss": 0.6751, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.039412260055542, + "rewards/margins": 4.092952728271484, + "rewards/rejected": -1.0535403490066528, + "step": 3934 + }, + { + "epoch": 0.98, + "grad_norm": 4.256883144378662, + "learning_rate": 2.6427275203326464e-06, + "logits/chosen": -0.33592283725738525, + "logits/rejected": -0.41835108399391174, + "logps/chosen": -45.085025787353516, + "logps/rejected": -75.20965576171875, + "loss": 0.6212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1135809421539307, + "rewards/margins": 5.240833282470703, + "rewards/rejected": -2.1272518634796143, + "step": 3935 + }, + { + "epoch": 0.98, + "grad_norm": 4.528068542480469, + "learning_rate": 2.6404175201439886e-06, + "logits/chosen": -0.3149064779281616, + "logits/rejected": -0.3047659397125244, + "logps/chosen": -52.55104064941406, + "logps/rejected": -118.23285675048828, + "loss": 0.6605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7502572536468506, + "rewards/margins": 5.1187968254089355, + "rewards/rejected": -2.368539333343506, + "step": 3936 + }, + { + "epoch": 0.98, + "grad_norm": 8.717130661010742, + "learning_rate": 2.638108167711637e-06, + "logits/chosen": -0.4042336344718933, + "logits/rejected": -0.47347795963287354, + "logps/chosen": -45.08266067504883, + "logps/rejected": -92.96857452392578, + "loss": 0.7754, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.76401948928833, + "rewards/margins": 4.889532089233398, + "rewards/rejected": -2.1255125999450684, + "step": 3937 + }, + { + "epoch": 0.99, + "grad_norm": 4.277319431304932, + "learning_rate": 2.6357994636695593e-06, + "logits/chosen": -0.35102516412734985, + "logits/rejected": -0.46290287375450134, + "logps/chosen": -58.24762725830078, + "logps/rejected": -76.9495620727539, + "loss": 0.7946, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.065387725830078, + "rewards/margins": 3.893686294555664, + "rewards/rejected": -0.8282986283302307, + "step": 3938 + }, + { + "epoch": 0.99, + "grad_norm": 4.854193210601807, + "learning_rate": 2.6334914086515416e-06, + "logits/chosen": -0.4027852416038513, + "logits/rejected": -0.48290354013442993, + "logps/chosen": -53.53771209716797, + "logps/rejected": -88.227294921875, + "loss": 0.6636, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.946931838989258, + "rewards/margins": 4.716638088226318, + "rewards/rejected": -1.76970636844635, + "step": 3939 + }, + { + "epoch": 0.99, + "grad_norm": 7.99240779876709, + "learning_rate": 2.6311840032912006e-06, + "logits/chosen": -0.3117947578430176, + "logits/rejected": -0.4103972613811493, + "logps/chosen": -52.89501190185547, + "logps/rejected": -89.01131439208984, + "loss": 0.6964, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.942519187927246, + "rewards/margins": 4.883347988128662, + "rewards/rejected": -1.9408286809921265, + "step": 3940 + }, + { + "epoch": 0.99, + "grad_norm": 7.931282043457031, + "learning_rate": 2.628877248221965e-06, + "logits/chosen": -0.38206636905670166, + "logits/rejected": -0.44908612966537476, + "logps/chosen": -49.6075439453125, + "logps/rejected": -77.0862808227539, + "loss": 0.7233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.863751173019409, + "rewards/margins": 4.1485090255737305, + "rewards/rejected": -1.2847576141357422, + "step": 3941 + }, + { + "epoch": 0.99, + "grad_norm": 4.770322799682617, + "learning_rate": 2.6265711440770892e-06, + "logits/chosen": -0.31776151061058044, + "logits/rejected": -0.4433063566684723, + "logps/chosen": -50.568904876708984, + "logps/rejected": -71.06962585449219, + "loss": 0.7506, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9614334106445312, + "rewards/margins": 4.33534574508667, + "rewards/rejected": -1.3739120960235596, + "step": 3942 + }, + { + "epoch": 0.99, + "grad_norm": 4.378464698791504, + "learning_rate": 2.624265691489649e-06, + "logits/chosen": -0.2629612982273102, + "logits/rejected": -0.3845723867416382, + "logps/chosen": -53.77960968017578, + "logps/rejected": -89.8378677368164, + "loss": 0.608, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7591400146484375, + "rewards/margins": 4.965418815612793, + "rewards/rejected": -2.2062783241271973, + "step": 3943 + }, + { + "epoch": 0.99, + "grad_norm": 6.360785484313965, + "learning_rate": 2.62196089109254e-06, + "logits/chosen": -0.32740023732185364, + "logits/rejected": -0.43486708402633667, + "logps/chosen": -54.46201705932617, + "logps/rejected": -78.33613586425781, + "loss": 0.7875, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.855102062225342, + "rewards/margins": 4.04360294342041, + "rewards/rejected": -1.1885008811950684, + "step": 3944 + }, + { + "epoch": 0.99, + "grad_norm": 9.240236282348633, + "learning_rate": 2.619656743518481e-06, + "logits/chosen": -0.2974206209182739, + "logits/rejected": -0.4083203673362732, + "logps/chosen": -54.622596740722656, + "logps/rejected": -82.7138671875, + "loss": 0.8181, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.891580581665039, + "rewards/margins": 5.10971736907959, + "rewards/rejected": -2.2181365489959717, + "step": 3945 + }, + { + "epoch": 0.99, + "grad_norm": 5.702579975128174, + "learning_rate": 2.617353249400012e-06, + "logits/chosen": -0.37227922677993774, + "logits/rejected": -0.42257627844810486, + "logps/chosen": -60.1019287109375, + "logps/rejected": -85.4301528930664, + "loss": 0.8243, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2163374423980713, + "rewards/margins": 4.911679267883301, + "rewards/rejected": -1.69534170627594, + "step": 3946 + }, + { + "epoch": 0.99, + "grad_norm": 17.73641014099121, + "learning_rate": 2.615050409369486e-06, + "logits/chosen": -0.30866754055023193, + "logits/rejected": -0.418753981590271, + "logps/chosen": -64.7701644897461, + "logps/rejected": -73.01991271972656, + "loss": 1.0497, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.733901023864746, + "rewards/margins": 3.6885182857513428, + "rewards/rejected": -0.954617440700531, + "step": 3947 + }, + { + "epoch": 0.99, + "grad_norm": 7.59683084487915, + "learning_rate": 2.6127482240590873e-06, + "logits/chosen": -0.23602858185768127, + "logits/rejected": -0.39490842819213867, + "logps/chosen": -63.2906494140625, + "logps/rejected": -78.08984375, + "loss": 0.7497, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.692549467086792, + "rewards/margins": 4.494142055511475, + "rewards/rejected": -1.801592469215393, + "step": 3948 + }, + { + "epoch": 0.99, + "grad_norm": 5.005329608917236, + "learning_rate": 2.6104466941008145e-06, + "logits/chosen": -0.32144272327423096, + "logits/rejected": -0.39955225586891174, + "logps/chosen": -50.92123031616211, + "logps/rejected": -93.52949523925781, + "loss": 0.7274, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6828508377075195, + "rewards/margins": 5.227349758148193, + "rewards/rejected": -2.544499635696411, + "step": 3949 + }, + { + "epoch": 0.99, + "grad_norm": 5.077554225921631, + "learning_rate": 2.608145820126486e-06, + "logits/chosen": -0.33307287096977234, + "logits/rejected": -0.4308764934539795, + "logps/chosen": -46.6797981262207, + "logps/rejected": -78.40126037597656, + "loss": 0.7513, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.93778657913208, + "rewards/margins": 4.706129550933838, + "rewards/rejected": -1.7683430910110474, + "step": 3950 + }, + { + "epoch": 0.99, + "grad_norm": 4.762372016906738, + "learning_rate": 2.6058456027677428e-06, + "logits/chosen": -0.3520478308200836, + "logits/rejected": -0.40292418003082275, + "logps/chosen": -59.79728317260742, + "logps/rejected": -87.77571105957031, + "loss": 0.7565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.717597246170044, + "rewards/margins": 4.313478469848633, + "rewards/rejected": -1.5958813428878784, + "step": 3951 + }, + { + "epoch": 0.99, + "grad_norm": 5.709710121154785, + "learning_rate": 2.6035460426560407e-06, + "logits/chosen": -0.2955566644668579, + "logits/rejected": -0.3299587666988373, + "logps/chosen": -47.44623947143555, + "logps/rejected": -85.56781768798828, + "loss": 0.7801, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9115657806396484, + "rewards/margins": 4.61860466003418, + "rewards/rejected": -1.7070386409759521, + "step": 3952 + }, + { + "epoch": 0.99, + "grad_norm": 4.666372299194336, + "learning_rate": 2.6012471404226636e-06, + "logits/chosen": -0.37595322728157043, + "logits/rejected": -0.5036236643791199, + "logps/chosen": -54.64133071899414, + "logps/rejected": -87.38142395019531, + "loss": 0.7884, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7459747791290283, + "rewards/margins": 4.955821514129639, + "rewards/rejected": -2.2098464965820312, + "step": 3953 + }, + { + "epoch": 0.99, + "grad_norm": 11.251038551330566, + "learning_rate": 2.598948896698707e-06, + "logits/chosen": -0.28319454193115234, + "logits/rejected": -0.35370874404907227, + "logps/chosen": -57.71654510498047, + "logps/rejected": -94.35215759277344, + "loss": 0.7347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7252562046051025, + "rewards/margins": 4.368077754974365, + "rewards/rejected": -1.642821192741394, + "step": 3954 + }, + { + "epoch": 0.99, + "grad_norm": 4.181229591369629, + "learning_rate": 2.5966513121150894e-06, + "logits/chosen": -0.37056785821914673, + "logits/rejected": -0.4613371789455414, + "logps/chosen": -58.11311340332031, + "logps/rejected": -73.26325225830078, + "loss": 0.7951, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8763952255249023, + "rewards/margins": 4.577820777893066, + "rewards/rejected": -1.7014251947402954, + "step": 3955 + }, + { + "epoch": 0.99, + "grad_norm": 11.372893333435059, + "learning_rate": 2.5943543873025465e-06, + "logits/chosen": -0.3493478000164032, + "logits/rejected": -0.403405100107193, + "logps/chosen": -58.85442352294922, + "logps/rejected": -85.92526245117188, + "loss": 1.0191, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.705801248550415, + "rewards/margins": 3.4825944900512695, + "rewards/rejected": -0.7767928838729858, + "step": 3956 + }, + { + "epoch": 0.99, + "grad_norm": 6.1771016120910645, + "learning_rate": 2.592058122891632e-06, + "logits/chosen": -0.35696330666542053, + "logits/rejected": -0.4829951226711273, + "logps/chosen": -62.76747131347656, + "logps/rejected": -70.57715606689453, + "loss": 0.8755, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6877219676971436, + "rewards/margins": 3.5756094455718994, + "rewards/rejected": -0.8878875970840454, + "step": 3957 + }, + { + "epoch": 0.99, + "grad_norm": 4.4698991775512695, + "learning_rate": 2.5897625195127263e-06, + "logits/chosen": -0.4604101777076721, + "logits/rejected": -0.4872085154056549, + "logps/chosen": -49.16835021972656, + "logps/rejected": -92.39891052246094, + "loss": 0.7388, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2659480571746826, + "rewards/margins": 5.175011157989502, + "rewards/rejected": -1.909062385559082, + "step": 3958 + }, + { + "epoch": 0.99, + "grad_norm": 7.134530544281006, + "learning_rate": 2.5874675777960134e-06, + "logits/chosen": -0.4235917329788208, + "logits/rejected": -0.5132007002830505, + "logps/chosen": -54.11018371582031, + "logps/rejected": -78.60784912109375, + "loss": 0.9924, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.684717893600464, + "rewards/margins": 4.636783599853516, + "rewards/rejected": -1.9520655870437622, + "step": 3959 + }, + { + "epoch": 0.99, + "grad_norm": 7.722177028656006, + "learning_rate": 2.585173298371511e-06, + "logits/chosen": -0.33886948227882385, + "logits/rejected": -0.39245855808258057, + "logps/chosen": -61.71527099609375, + "logps/rejected": -85.53230285644531, + "loss": 0.7953, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8185629844665527, + "rewards/margins": 4.029900074005127, + "rewards/rejected": -1.2113368511199951, + "step": 3960 + }, + { + "epoch": 0.99, + "grad_norm": 4.60955810546875, + "learning_rate": 2.5828796818690462e-06, + "logits/chosen": -0.3061503767967224, + "logits/rejected": -0.37661272287368774, + "logps/chosen": -62.96848678588867, + "logps/rejected": -86.45533752441406, + "loss": 0.7798, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0126655101776123, + "rewards/margins": 4.45127534866333, + "rewards/rejected": -1.4386093616485596, + "step": 3961 + }, + { + "epoch": 0.99, + "grad_norm": 5.238865852355957, + "learning_rate": 2.5805867289182663e-06, + "logits/chosen": -0.32559746503829956, + "logits/rejected": -0.41857144236564636, + "logps/chosen": -52.410987854003906, + "logps/rejected": -75.44471740722656, + "loss": 0.7409, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.740893602371216, + "rewards/margins": 4.216926574707031, + "rewards/rejected": -1.4760327339172363, + "step": 3962 + }, + { + "epoch": 0.99, + "grad_norm": 7.765301704406738, + "learning_rate": 2.5782944401486353e-06, + "logits/chosen": -0.3234136700630188, + "logits/rejected": -0.3947564959526062, + "logps/chosen": -57.13744354248047, + "logps/rejected": -93.12696838378906, + "loss": 0.7162, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5421407222747803, + "rewards/margins": 4.624061584472656, + "rewards/rejected": -2.081921339035034, + "step": 3963 + }, + { + "epoch": 0.99, + "grad_norm": 2.601322889328003, + "learning_rate": 2.5760028161894356e-06, + "logits/chosen": -0.37227344512939453, + "logits/rejected": -0.45903050899505615, + "logps/chosen": -44.32841491699219, + "logps/rejected": -80.4843521118164, + "loss": 0.6075, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1162447929382324, + "rewards/margins": 5.615565776824951, + "rewards/rejected": -2.499321222305298, + "step": 3964 + }, + { + "epoch": 0.99, + "grad_norm": 5.0008440017700195, + "learning_rate": 2.57371185766977e-06, + "logits/chosen": -0.37300920486450195, + "logits/rejected": -0.5187044739723206, + "logps/chosen": -51.417789459228516, + "logps/rejected": -87.38082885742188, + "loss": 0.6759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0546679496765137, + "rewards/margins": 5.122807025909424, + "rewards/rejected": -2.0681395530700684, + "step": 3965 + }, + { + "epoch": 0.99, + "grad_norm": 5.6767072677612305, + "learning_rate": 2.571421565218557e-06, + "logits/chosen": -0.39085590839385986, + "logits/rejected": -0.5060492157936096, + "logps/chosen": -48.140899658203125, + "logps/rejected": -90.51437377929688, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.995258331298828, + "rewards/margins": 5.7006402015686035, + "rewards/rejected": -2.7053816318511963, + "step": 3966 + }, + { + "epoch": 0.99, + "grad_norm": 8.983372688293457, + "learning_rate": 2.5691319394645247e-06, + "logits/chosen": -0.3625097870826721, + "logits/rejected": -0.48492270708084106, + "logps/chosen": -63.492340087890625, + "logps/rejected": -71.17562103271484, + "loss": 0.8719, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.658053398132324, + "rewards/margins": 3.7685680389404297, + "rewards/rejected": -1.1105144023895264, + "step": 3967 + }, + { + "epoch": 0.99, + "grad_norm": 8.519917488098145, + "learning_rate": 2.566842981036231e-06, + "logits/chosen": -0.3431704044342041, + "logits/rejected": -0.39648690819740295, + "logps/chosen": -54.146080017089844, + "logps/rejected": -81.34188079833984, + "loss": 0.8075, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7210326194763184, + "rewards/margins": 4.339089870452881, + "rewards/rejected": -1.6180572509765625, + "step": 3968 + }, + { + "epoch": 0.99, + "grad_norm": 17.52022361755371, + "learning_rate": 2.5645546905620404e-06, + "logits/chosen": -0.34068363904953003, + "logits/rejected": -0.4791790843009949, + "logps/chosen": -65.4262466430664, + "logps/rejected": -71.7327880859375, + "loss": 0.8501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.978429079055786, + "rewards/margins": 4.152409553527832, + "rewards/rejected": -1.173980474472046, + "step": 3969 + }, + { + "epoch": 0.99, + "grad_norm": 8.864716529846191, + "learning_rate": 2.5622670686701445e-06, + "logits/chosen": -0.3277845084667206, + "logits/rejected": -0.4515298902988434, + "logps/chosen": -59.648746490478516, + "logps/rejected": -90.57411193847656, + "loss": 0.828, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9079957008361816, + "rewards/margins": 4.897191524505615, + "rewards/rejected": -1.9891960620880127, + "step": 3970 + }, + { + "epoch": 0.99, + "grad_norm": 4.614034652709961, + "learning_rate": 2.5599801159885374e-06, + "logits/chosen": -0.28328749537467957, + "logits/rejected": -0.43224674463272095, + "logps/chosen": -54.7840461730957, + "logps/rejected": -77.97039031982422, + "loss": 0.7377, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0052223205566406, + "rewards/margins": 5.244810581207275, + "rewards/rejected": -2.2395882606506348, + "step": 3971 + }, + { + "epoch": 0.99, + "grad_norm": 5.802002906799316, + "learning_rate": 2.5576938331450384e-06, + "logits/chosen": -0.3140682280063629, + "logits/rejected": -0.38626155257225037, + "logps/chosen": -61.17048645019531, + "logps/rejected": -94.19263458251953, + "loss": 0.6643, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1433699131011963, + "rewards/margins": 4.942506790161133, + "rewards/rejected": -1.7991368770599365, + "step": 3972 + }, + { + "epoch": 0.99, + "grad_norm": 4.248480796813965, + "learning_rate": 2.555408220767286e-06, + "logits/chosen": -0.293648898601532, + "logits/rejected": -0.36231371760368347, + "logps/chosen": -52.60871124267578, + "logps/rejected": -83.06941986083984, + "loss": 0.7362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8181371688842773, + "rewards/margins": 5.087731838226318, + "rewards/rejected": -2.2695953845977783, + "step": 3973 + }, + { + "epoch": 0.99, + "grad_norm": 8.369975090026855, + "learning_rate": 2.5531232794827258e-06, + "logits/chosen": -0.23531073331832886, + "logits/rejected": -0.29828348755836487, + "logps/chosen": -54.12261199951172, + "logps/rejected": -81.29331970214844, + "loss": 0.9096, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8003783226013184, + "rewards/margins": 2.800234794616699, + "rewards/rejected": 0.00014334917068481445, + "step": 3974 + }, + { + "epoch": 0.99, + "grad_norm": 6.785473823547363, + "learning_rate": 2.550839009918625e-06, + "logits/chosen": -0.39802828431129456, + "logits/rejected": -0.46373116970062256, + "logps/chosen": -41.53450393676758, + "logps/rejected": -74.50686645507812, + "loss": 0.7044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.935776710510254, + "rewards/margins": 4.638100624084473, + "rewards/rejected": -1.7023239135742188, + "step": 3975 + }, + { + "epoch": 0.99, + "grad_norm": 18.85883140563965, + "learning_rate": 2.548555412702065e-06, + "logits/chosen": -0.34357231855392456, + "logits/rejected": -0.4700168967247009, + "logps/chosen": -59.63007354736328, + "logps/rejected": -81.36143493652344, + "loss": 0.792, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051013469696045, + "rewards/margins": 4.564274787902832, + "rewards/rejected": -1.5132614374160767, + "step": 3976 + }, + { + "epoch": 0.99, + "grad_norm": 13.869919776916504, + "learning_rate": 2.5462724884599404e-06, + "logits/chosen": -0.3476656377315521, + "logits/rejected": -0.3805522918701172, + "logps/chosen": -54.31181335449219, + "logps/rejected": -97.28743743896484, + "loss": 0.7125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5849995613098145, + "rewards/margins": 5.527186393737793, + "rewards/rejected": -2.9421870708465576, + "step": 3977 + }, + { + "epoch": 1.0, + "grad_norm": 9.160558700561523, + "learning_rate": 2.5439902378189684e-06, + "logits/chosen": -0.4175277054309845, + "logits/rejected": -0.4808599054813385, + "logps/chosen": -49.39190673828125, + "logps/rejected": -81.02708435058594, + "loss": 0.7746, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.021022081375122, + "rewards/margins": 4.352046966552734, + "rewards/rejected": -1.3310251235961914, + "step": 3978 + }, + { + "epoch": 1.0, + "grad_norm": 4.99199104309082, + "learning_rate": 2.5417086614056696e-06, + "logits/chosen": -0.26448601484298706, + "logits/rejected": -0.3642612099647522, + "logps/chosen": -62.430091857910156, + "logps/rejected": -98.40560913085938, + "loss": 0.6917, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.590845823287964, + "rewards/margins": 4.925851821899414, + "rewards/rejected": -2.3350062370300293, + "step": 3979 + }, + { + "epoch": 1.0, + "grad_norm": 6.806528091430664, + "learning_rate": 2.5394277598463903e-06, + "logits/chosen": -0.358371764421463, + "logits/rejected": -0.4213893413543701, + "logps/chosen": -63.663543701171875, + "logps/rejected": -74.0125961303711, + "loss": 0.8587, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8773984909057617, + "rewards/margins": 3.347766399383545, + "rewards/rejected": -0.47036781907081604, + "step": 3980 + }, + { + "epoch": 1.0, + "grad_norm": 3.126136302947998, + "learning_rate": 2.537147533767286e-06, + "logits/chosen": -0.38199442625045776, + "logits/rejected": -0.44881367683410645, + "logps/chosen": -59.84286880493164, + "logps/rejected": -105.42862701416016, + "loss": 0.6946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7037875652313232, + "rewards/margins": 5.902953147888184, + "rewards/rejected": -3.1991660594940186, + "step": 3981 + }, + { + "epoch": 1.0, + "grad_norm": 5.849802494049072, + "learning_rate": 2.534867983794328e-06, + "logits/chosen": -0.36057621240615845, + "logits/rejected": -0.48001828789711, + "logps/chosen": -53.503170013427734, + "logps/rejected": -70.21327209472656, + "loss": 0.7095, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.030102491378784, + "rewards/margins": 4.757241725921631, + "rewards/rejected": -1.7271392345428467, + "step": 3982 + }, + { + "epoch": 1.0, + "grad_norm": 7.2147536277771, + "learning_rate": 2.532589110553301e-06, + "logits/chosen": -0.347011536359787, + "logits/rejected": -0.4237575829029083, + "logps/chosen": -54.97065734863281, + "logps/rejected": -82.23614501953125, + "loss": 0.813, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9916915893554688, + "rewards/margins": 5.472186088562012, + "rewards/rejected": -2.480494976043701, + "step": 3983 + }, + { + "epoch": 1.0, + "grad_norm": 6.190946102142334, + "learning_rate": 2.5303109146698046e-06, + "logits/chosen": -0.2848092019557953, + "logits/rejected": -0.4421289265155792, + "logps/chosen": -53.86073684692383, + "logps/rejected": -88.45740509033203, + "loss": 0.6674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0118556022644043, + "rewards/margins": 5.459551811218262, + "rewards/rejected": -2.4476962089538574, + "step": 3984 + }, + { + "epoch": 1.0, + "grad_norm": 5.591342449188232, + "learning_rate": 2.5280333967692556e-06, + "logits/chosen": -0.3739665746688843, + "logits/rejected": -0.42884325981140137, + "logps/chosen": -58.964874267578125, + "logps/rejected": -81.81857299804688, + "loss": 0.8027, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.809420347213745, + "rewards/margins": 4.262594223022461, + "rewards/rejected": -1.4531738758087158, + "step": 3985 + }, + { + "epoch": 1.0, + "grad_norm": 5.841887474060059, + "learning_rate": 2.52575655747688e-06, + "logits/chosen": -0.3210599422454834, + "logits/rejected": -0.4425964951515198, + "logps/chosen": -50.260807037353516, + "logps/rejected": -83.94160461425781, + "loss": 0.6691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1217541694641113, + "rewards/margins": 5.291696548461914, + "rewards/rejected": -2.1699421405792236, + "step": 3986 + }, + { + "epoch": 1.0, + "grad_norm": 5.784961223602295, + "learning_rate": 2.523480397417719e-06, + "logits/chosen": -0.35087037086486816, + "logits/rejected": -0.43127089738845825, + "logps/chosen": -67.01488494873047, + "logps/rejected": -79.7734603881836, + "loss": 0.9933, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.786872625350952, + "rewards/margins": 3.966318130493164, + "rewards/rejected": -1.1794456243515015, + "step": 3987 + }, + { + "epoch": 1.0, + "grad_norm": 4.3430495262146, + "learning_rate": 2.5212049172166287e-06, + "logits/chosen": -0.27448543906211853, + "logits/rejected": -0.471762478351593, + "logps/chosen": -56.243873596191406, + "logps/rejected": -70.64132690429688, + "loss": 0.6467, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.922999620437622, + "rewards/margins": 5.602659702301025, + "rewards/rejected": -2.679659366607666, + "step": 3988 + }, + { + "epoch": 1.0, + "grad_norm": 26.324243545532227, + "learning_rate": 2.518930117498274e-06, + "logits/chosen": -0.2725372910499573, + "logits/rejected": -0.4323597848415375, + "logps/chosen": -68.67556762695312, + "logps/rejected": -75.77647399902344, + "loss": 0.8941, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.749992609024048, + "rewards/margins": 3.7262251377105713, + "rewards/rejected": -0.97623211145401, + "step": 3989 + }, + { + "epoch": 1.0, + "grad_norm": 5.9488043785095215, + "learning_rate": 2.5166559988871436e-06, + "logits/chosen": -0.35810786485671997, + "logits/rejected": -0.4013749957084656, + "logps/chosen": -49.2012825012207, + "logps/rejected": -78.60591125488281, + "loss": 0.6889, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89568829536438, + "rewards/margins": 3.895312547683716, + "rewards/rejected": -0.9996238946914673, + "step": 3990 + }, + { + "epoch": 1.0, + "grad_norm": 4.267927646636963, + "learning_rate": 2.5143825620075255e-06, + "logits/chosen": -0.4091894328594208, + "logits/rejected": -0.5322617292404175, + "logps/chosen": -47.34133529663086, + "logps/rejected": -76.9367904663086, + "loss": 0.6971, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.886820077896118, + "rewards/margins": 4.986185073852539, + "rewards/rejected": -2.099365711212158, + "step": 3991 + }, + { + "epoch": 1.0, + "grad_norm": 8.020086288452148, + "learning_rate": 2.512109807483528e-06, + "logits/chosen": -0.39032426476478577, + "logits/rejected": -0.42060011625289917, + "logps/chosen": -58.71881103515625, + "logps/rejected": -93.53240966796875, + "loss": 0.9612, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5645103454589844, + "rewards/margins": 3.8891429901123047, + "rewards/rejected": -1.3246324062347412, + "step": 3992 + }, + { + "epoch": 1.0, + "grad_norm": 8.524876594543457, + "learning_rate": 2.5098377359390752e-06, + "logits/chosen": -0.29978540539741516, + "logits/rejected": -0.4114219546318054, + "logps/chosen": -54.91923141479492, + "logps/rejected": -70.10466003417969, + "loss": 0.8945, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.06884765625, + "rewards/margins": 4.212095737457275, + "rewards/rejected": -1.1432480812072754, + "step": 3993 + }, + { + "epoch": 1.0, + "grad_norm": 4.158654689788818, + "learning_rate": 2.5075663479978966e-06, + "logits/chosen": -0.3735409379005432, + "logits/rejected": -0.4389592111110687, + "logps/chosen": -50.51386260986328, + "logps/rejected": -83.34427642822266, + "loss": 0.6704, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.923912286758423, + "rewards/margins": 4.519001007080078, + "rewards/rejected": -1.5950886011123657, + "step": 3994 + }, + { + "epoch": 1.0, + "grad_norm": 14.255072593688965, + "learning_rate": 2.5052956442835386e-06, + "logits/chosen": -0.39908933639526367, + "logits/rejected": -0.4675670266151428, + "logps/chosen": -57.014225006103516, + "logps/rejected": -65.72610473632812, + "loss": 0.8763, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6710782051086426, + "rewards/margins": 3.6876842975616455, + "rewards/rejected": -1.0166058540344238, + "step": 3995 + }, + { + "epoch": 1.0, + "grad_norm": 17.12480926513672, + "learning_rate": 2.5030256254193576e-06, + "logits/chosen": -0.4192488193511963, + "logits/rejected": -0.4626590311527252, + "logps/chosen": -51.95772933959961, + "logps/rejected": -91.15442657470703, + "loss": 0.8022, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0731701850891113, + "rewards/margins": 4.082122325897217, + "rewards/rejected": -1.0089521408081055, + "step": 3996 + }, + { + "epoch": 1.0, + "grad_norm": 7.265211582183838, + "learning_rate": 2.50075629202852e-06, + "logits/chosen": -0.40998685359954834, + "logits/rejected": -0.5109131932258606, + "logps/chosen": -49.43071365356445, + "logps/rejected": -64.70682525634766, + "loss": 0.8819, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8004424571990967, + "rewards/margins": 3.7088098526000977, + "rewards/rejected": -0.9083675742149353, + "step": 3997 + }, + { + "epoch": 1.0, + "grad_norm": 8.313267707824707, + "learning_rate": 2.4984876447340124e-06, + "logits/chosen": -0.3586435317993164, + "logits/rejected": -0.494004487991333, + "logps/chosen": -60.66578674316406, + "logps/rejected": -86.72149658203125, + "loss": 0.761, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0154876708984375, + "rewards/margins": 4.745316505432129, + "rewards/rejected": -1.7298285961151123, + "step": 3998 + }, + { + "epoch": 1.0, + "grad_norm": 2.6055827140808105, + "learning_rate": 2.4962196841586245e-06, + "logits/chosen": -0.36195483803749084, + "logits/rejected": -0.4720311760902405, + "logps/chosen": -63.51190185546875, + "logps/rejected": -68.62223815917969, + "loss": 0.7421, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8818793296813965, + "rewards/margins": 4.674654960632324, + "rewards/rejected": -1.7927756309509277, + "step": 3999 + }, + { + "epoch": 1.0, + "grad_norm": 4.680634021759033, + "learning_rate": 2.493952410924961e-06, + "logits/chosen": -0.3545898199081421, + "logits/rejected": -0.4654167592525482, + "logps/chosen": -58.22848892211914, + "logps/rejected": -74.90921020507812, + "loss": 0.73, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8903353214263916, + "rewards/margins": 4.738225936889648, + "rewards/rejected": -1.8478907346725464, + "step": 4000 + }, + { + "epoch": 1.0, + "grad_norm": 3.8498475551605225, + "learning_rate": 2.491685825655436e-06, + "logits/chosen": -0.3326835334300995, + "logits/rejected": -0.46360117197036743, + "logps/chosen": -50.2817268371582, + "logps/rejected": -75.92484283447266, + "loss": 0.6808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1121408939361572, + "rewards/margins": 5.342770099639893, + "rewards/rejected": -2.230628728866577, + "step": 4001 + }, + { + "epoch": 1.0, + "grad_norm": 6.714812278747559, + "learning_rate": 2.4894199289722783e-06, + "logits/chosen": -0.38541311025619507, + "logits/rejected": -0.44937658309936523, + "logps/chosen": -52.28934097290039, + "logps/rejected": -89.77862548828125, + "loss": 0.8473, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8907487392425537, + "rewards/margins": 4.21545934677124, + "rewards/rejected": -1.324710726737976, + "step": 4002 + }, + { + "epoch": 1.0, + "grad_norm": 8.136967658996582, + "learning_rate": 2.4871547214975235e-06, + "logits/chosen": -0.3532468378543854, + "logits/rejected": -0.41958653926849365, + "logps/chosen": -57.565460205078125, + "logps/rejected": -77.73214721679688, + "loss": 0.7332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0034661293029785, + "rewards/margins": 4.275293827056885, + "rewards/rejected": -1.2718278169631958, + "step": 4003 + }, + { + "epoch": 1.0, + "grad_norm": 5.150571346282959, + "learning_rate": 2.4848902038530184e-06, + "logits/chosen": -0.328269898891449, + "logits/rejected": -0.4056588411331177, + "logps/chosen": -55.54508590698242, + "logps/rejected": -67.57561492919922, + "loss": 0.87, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.384005069732666, + "rewards/margins": 4.337178707122803, + "rewards/rejected": -0.9531735181808472, + "step": 4004 + }, + { + "epoch": 1.0, + "grad_norm": 4.944064617156982, + "learning_rate": 2.482626376660426e-06, + "logits/chosen": -0.3200632333755493, + "logits/rejected": -0.4509027898311615, + "logps/chosen": -55.59958267211914, + "logps/rejected": -81.76278686523438, + "loss": 0.6549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7671289443969727, + "rewards/margins": 5.265327453613281, + "rewards/rejected": -2.4981987476348877, + "step": 4005 + }, + { + "epoch": 1.0, + "grad_norm": 5.6877360343933105, + "learning_rate": 2.4803632405412143e-06, + "logits/chosen": -0.295655757188797, + "logits/rejected": -0.3392978608608246, + "logps/chosen": -59.81465148925781, + "logps/rejected": -91.90733337402344, + "loss": 0.9119, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.606330156326294, + "rewards/margins": 4.007341384887695, + "rewards/rejected": -1.401011347770691, + "step": 4006 + }, + { + "epoch": 1.0, + "grad_norm": 2.7603530883789062, + "learning_rate": 2.478100796116662e-06, + "logits/chosen": -0.37549498677253723, + "logits/rejected": -0.45296329259872437, + "logps/chosen": -57.94264221191406, + "logps/rejected": -93.10005187988281, + "loss": 0.6625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9470741748809814, + "rewards/margins": 5.133294582366943, + "rewards/rejected": -2.18622088432312, + "step": 4007 + }, + { + "epoch": 1.0, + "grad_norm": 5.464948654174805, + "learning_rate": 2.47583904400786e-06, + "logits/chosen": -0.30164727568626404, + "logits/rejected": -0.43598106503486633, + "logps/chosen": -57.89659881591797, + "logps/rejected": -74.04009246826172, + "loss": 0.792, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9796829223632812, + "rewards/margins": 4.538458347320557, + "rewards/rejected": -1.5587749481201172, + "step": 4008 + }, + { + "epoch": 1.0, + "grad_norm": 4.0727105140686035, + "learning_rate": 2.4735779848357057e-06, + "logits/chosen": -0.42178216576576233, + "logits/rejected": -0.49529358744621277, + "logps/chosen": -55.6235237121582, + "logps/rejected": -88.12614440917969, + "loss": 0.7158, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3398361206054688, + "rewards/margins": 5.07916259765625, + "rewards/rejected": -1.7393265962600708, + "step": 4009 + }, + { + "epoch": 1.0, + "grad_norm": 4.377815246582031, + "learning_rate": 2.4713176192209117e-06, + "logits/chosen": -0.32095208764076233, + "logits/rejected": -0.4665818214416504, + "logps/chosen": -62.671165466308594, + "logps/rejected": -84.11772155761719, + "loss": 0.7175, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.927933692932129, + "rewards/margins": 4.826716423034668, + "rewards/rejected": -1.8987823724746704, + "step": 4010 + }, + { + "epoch": 1.0, + "grad_norm": 4.551270961761475, + "learning_rate": 2.4690579477839983e-06, + "logits/chosen": -0.4271305799484253, + "logits/rejected": -0.4965217113494873, + "logps/chosen": -58.20014953613281, + "logps/rejected": -76.11139678955078, + "loss": 0.8178, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8472845554351807, + "rewards/margins": 3.682992696762085, + "rewards/rejected": -0.8357083797454834, + "step": 4011 + }, + { + "epoch": 1.0, + "grad_norm": 5.242484092712402, + "learning_rate": 2.4667989711452878e-06, + "logits/chosen": -0.36847206950187683, + "logits/rejected": -0.42429888248443604, + "logps/chosen": -48.07366943359375, + "logps/rejected": -91.00045013427734, + "loss": 0.7668, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.741227149963379, + "rewards/margins": 4.875319004058838, + "rewards/rejected": -2.134091854095459, + "step": 4012 + }, + { + "epoch": 1.0, + "grad_norm": 7.557177543640137, + "learning_rate": 2.4645406899249235e-06, + "logits/chosen": -0.44036373496055603, + "logits/rejected": -0.513913631439209, + "logps/chosen": -51.08951187133789, + "logps/rejected": -83.3009033203125, + "loss": 0.849, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.4765586853027344, + "rewards/margins": 4.101110458374023, + "rewards/rejected": -1.6245518922805786, + "step": 4013 + }, + { + "epoch": 1.0, + "grad_norm": 5.024114608764648, + "learning_rate": 2.462283104742849e-06, + "logits/chosen": -0.2950069010257721, + "logits/rejected": -0.36841318011283875, + "logps/chosen": -56.756370544433594, + "logps/rejected": -86.32148742675781, + "loss": 0.7084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9340431690216064, + "rewards/margins": 4.58499813079834, + "rewards/rejected": -1.6509546041488647, + "step": 4014 + }, + { + "epoch": 1.0, + "grad_norm": 7.124791622161865, + "learning_rate": 2.4600262162188254e-06, + "logits/chosen": -0.2737029194831848, + "logits/rejected": -0.2520906925201416, + "logps/chosen": -51.429351806640625, + "logps/rejected": -99.37701416015625, + "loss": 0.7991, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7580583095550537, + "rewards/margins": 3.64731502532959, + "rewards/rejected": -0.8892569541931152, + "step": 4015 + }, + { + "epoch": 1.0, + "grad_norm": 8.16289234161377, + "learning_rate": 2.457770024972412e-06, + "logits/chosen": -0.27688369154930115, + "logits/rejected": -0.3354605734348297, + "logps/chosen": -66.72830200195312, + "logps/rejected": -95.20491790771484, + "loss": 0.8116, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8886561393737793, + "rewards/margins": 4.384547233581543, + "rewards/rejected": -1.495890736579895, + "step": 4016 + }, + { + "epoch": 1.0, + "grad_norm": 4.755925178527832, + "learning_rate": 2.455514531622982e-06, + "logits/chosen": -0.28210049867630005, + "logits/rejected": -0.4599357545375824, + "logps/chosen": -54.77172088623047, + "logps/rejected": -69.4161376953125, + "loss": 0.7596, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.977412700653076, + "rewards/margins": 4.389449119567871, + "rewards/rejected": -1.4120365381240845, + "step": 4017 + }, + { + "epoch": 1.01, + "grad_norm": 5.100514888763428, + "learning_rate": 2.453259736789721e-06, + "logits/chosen": -0.37119659781455994, + "logits/rejected": -0.40766608715057373, + "logps/chosen": -48.51804733276367, + "logps/rejected": -81.7118148803711, + "loss": 0.7875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8000166416168213, + "rewards/margins": 3.6266160011291504, + "rewards/rejected": -0.8265991806983948, + "step": 4018 + }, + { + "epoch": 1.01, + "grad_norm": 5.728190898895264, + "learning_rate": 2.4510056410916163e-06, + "logits/chosen": -0.3343636095523834, + "logits/rejected": -0.4831651449203491, + "logps/chosen": -66.01618194580078, + "logps/rejected": -77.85186767578125, + "loss": 0.8504, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7935070991516113, + "rewards/margins": 4.533315658569336, + "rewards/rejected": -1.7398087978363037, + "step": 4019 + }, + { + "epoch": 1.01, + "grad_norm": 4.065576553344727, + "learning_rate": 2.4487522451474655e-06, + "logits/chosen": -0.368543803691864, + "logits/rejected": -0.39952486753463745, + "logps/chosen": -59.20655822753906, + "logps/rejected": -103.06116485595703, + "loss": 0.6465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0748114585876465, + "rewards/margins": 5.192011833190918, + "rewards/rejected": -2.1172001361846924, + "step": 4020 + }, + { + "epoch": 1.01, + "grad_norm": 5.402865409851074, + "learning_rate": 2.446499549575876e-06, + "logits/chosen": -0.3437115550041199, + "logits/rejected": -0.4157712757587433, + "logps/chosen": -63.21125793457031, + "logps/rejected": -95.26525115966797, + "loss": 0.8195, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0253171920776367, + "rewards/margins": 4.517601490020752, + "rewards/rejected": -1.4922845363616943, + "step": 4021 + }, + { + "epoch": 1.01, + "grad_norm": 3.400400161743164, + "learning_rate": 2.4442475549952575e-06, + "logits/chosen": -0.3828481137752533, + "logits/rejected": -0.44491705298423767, + "logps/chosen": -54.266746520996094, + "logps/rejected": -82.95047760009766, + "loss": 0.7377, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108798027038574, + "rewards/margins": 5.502188682556152, + "rewards/rejected": -2.3933916091918945, + "step": 4022 + }, + { + "epoch": 1.01, + "grad_norm": 3.8753020763397217, + "learning_rate": 2.441996262023838e-06, + "logits/chosen": -0.311862587928772, + "logits/rejected": -0.39686596393585205, + "logps/chosen": -53.20431137084961, + "logps/rejected": -79.50740051269531, + "loss": 0.8891, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8650691509246826, + "rewards/margins": 4.157641410827637, + "rewards/rejected": -1.2925726175308228, + "step": 4023 + }, + { + "epoch": 1.01, + "grad_norm": 3.174513816833496, + "learning_rate": 2.4397456712796385e-06, + "logits/chosen": -0.39689648151397705, + "logits/rejected": -0.5268464684486389, + "logps/chosen": -55.53905487060547, + "logps/rejected": -80.85358428955078, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.02935528755188, + "rewards/margins": 5.224787712097168, + "rewards/rejected": -2.1954329013824463, + "step": 4024 + }, + { + "epoch": 1.01, + "grad_norm": 5.178155899047852, + "learning_rate": 2.4374957833804995e-06, + "logits/chosen": -0.426601380109787, + "logits/rejected": -0.5047202706336975, + "logps/chosen": -69.79508972167969, + "logps/rejected": -84.73516082763672, + "loss": 0.899, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.109015464782715, + "rewards/margins": 3.9313271045684814, + "rewards/rejected": -0.8223117589950562, + "step": 4025 + }, + { + "epoch": 1.01, + "grad_norm": 3.502145767211914, + "learning_rate": 2.4352465989440625e-06, + "logits/chosen": -0.24719415605068207, + "logits/rejected": -0.41920050978660583, + "logps/chosen": -71.5591049194336, + "logps/rejected": -76.395263671875, + "loss": 0.7159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.640070915222168, + "rewards/margins": 5.221068859100342, + "rewards/rejected": -2.5809974670410156, + "step": 4026 + }, + { + "epoch": 1.01, + "grad_norm": 3.662403106689453, + "learning_rate": 2.4329981185877778e-06, + "logits/chosen": -0.31918948888778687, + "logits/rejected": -0.34409061074256897, + "logps/chosen": -59.16687774658203, + "logps/rejected": -109.01435852050781, + "loss": 0.7028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.839993953704834, + "rewards/margins": 4.7351603507995605, + "rewards/rejected": -1.8951663970947266, + "step": 4027 + }, + { + "epoch": 1.01, + "grad_norm": 4.492412567138672, + "learning_rate": 2.430750342928901e-06, + "logits/chosen": -0.4530060887336731, + "logits/rejected": -0.5357270836830139, + "logps/chosen": -60.80088806152344, + "logps/rejected": -87.98051452636719, + "loss": 0.8032, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9502358436584473, + "rewards/margins": 4.899910926818848, + "rewards/rejected": -1.9496753215789795, + "step": 4028 + }, + { + "epoch": 1.01, + "grad_norm": 6.096278190612793, + "learning_rate": 2.4285032725844927e-06, + "logits/chosen": -0.39372825622558594, + "logits/rejected": -0.532512903213501, + "logps/chosen": -58.79517364501953, + "logps/rejected": -77.10346221923828, + "loss": 0.7494, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7042858600616455, + "rewards/margins": 4.029901027679443, + "rewards/rejected": -1.3256151676177979, + "step": 4029 + }, + { + "epoch": 1.01, + "grad_norm": 8.415237426757812, + "learning_rate": 2.4262569081714267e-06, + "logits/chosen": -0.34374064207077026, + "logits/rejected": -0.4025729298591614, + "logps/chosen": -55.092994689941406, + "logps/rejected": -94.645751953125, + "loss": 0.8711, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8254292011260986, + "rewards/margins": 4.547224998474121, + "rewards/rejected": -1.7217954397201538, + "step": 4030 + }, + { + "epoch": 1.01, + "grad_norm": 4.158161640167236, + "learning_rate": 2.424011250306376e-06, + "logits/chosen": -0.34084993600845337, + "logits/rejected": -0.4896714985370636, + "logps/chosen": -47.37221145629883, + "logps/rejected": -81.9911880493164, + "loss": 0.6087, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.981713056564331, + "rewards/margins": 5.654242038726807, + "rewards/rejected": -2.6725287437438965, + "step": 4031 + }, + { + "epoch": 1.01, + "grad_norm": 6.368144989013672, + "learning_rate": 2.4217662996058226e-06, + "logits/chosen": -0.3403162360191345, + "logits/rejected": -0.4295256435871124, + "logps/chosen": -49.98624038696289, + "logps/rejected": -90.89347839355469, + "loss": 0.6877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9348948001861572, + "rewards/margins": 4.783134460449219, + "rewards/rejected": -1.8482398986816406, + "step": 4032 + }, + { + "epoch": 1.01, + "grad_norm": 7.087159633636475, + "learning_rate": 2.4195220566860546e-06, + "logits/chosen": -0.3338489234447479, + "logits/rejected": -0.46463602781295776, + "logps/chosen": -63.90033721923828, + "logps/rejected": -81.20128631591797, + "loss": 0.7298, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.959102153778076, + "rewards/margins": 5.029551029205322, + "rewards/rejected": -2.070449113845825, + "step": 4033 + }, + { + "epoch": 1.01, + "grad_norm": 6.051983833312988, + "learning_rate": 2.4172785221631618e-06, + "logits/chosen": -0.35010653734207153, + "logits/rejected": -0.4470791220664978, + "logps/chosen": -47.08592224121094, + "logps/rejected": -95.37732696533203, + "loss": 0.7544, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0262646675109863, + "rewards/margins": 5.774448871612549, + "rewards/rejected": -2.7481844425201416, + "step": 4034 + }, + { + "epoch": 1.01, + "grad_norm": 6.307795524597168, + "learning_rate": 2.4150356966530493e-06, + "logits/chosen": -0.2684860825538635, + "logits/rejected": -0.31753554940223694, + "logps/chosen": -57.800254821777344, + "logps/rejected": -85.39559173583984, + "loss": 0.7395, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.947770595550537, + "rewards/margins": 4.682678699493408, + "rewards/rejected": -1.7349082231521606, + "step": 4035 + }, + { + "epoch": 1.01, + "grad_norm": 3.413083076477051, + "learning_rate": 2.4127935807714154e-06, + "logits/chosen": -0.432775616645813, + "logits/rejected": -0.5241453647613525, + "logps/chosen": -46.888702392578125, + "logps/rejected": -91.14079284667969, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1660268306732178, + "rewards/margins": 5.6300787925720215, + "rewards/rejected": -2.4640517234802246, + "step": 4036 + }, + { + "epoch": 1.01, + "grad_norm": 5.279262065887451, + "learning_rate": 2.410552175133769e-06, + "logits/chosen": -0.3536298871040344, + "logits/rejected": -0.39923781156539917, + "logps/chosen": -52.27066421508789, + "logps/rejected": -80.49283599853516, + "loss": 0.8345, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0307416915893555, + "rewards/margins": 4.552551746368408, + "rewards/rejected": -1.5218101739883423, + "step": 4037 + }, + { + "epoch": 1.01, + "grad_norm": 5.991480350494385, + "learning_rate": 2.4083114803554286e-06, + "logits/chosen": -0.33481597900390625, + "logits/rejected": -0.38259536027908325, + "logps/chosen": -48.64750289916992, + "logps/rejected": -93.34263610839844, + "loss": 0.7666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.788372755050659, + "rewards/margins": 3.790060520172119, + "rewards/rejected": -1.0016875267028809, + "step": 4038 + }, + { + "epoch": 1.01, + "grad_norm": 6.659294605255127, + "learning_rate": 2.406071497051511e-06, + "logits/chosen": -0.39597925543785095, + "logits/rejected": -0.5662111639976501, + "logps/chosen": -59.75446319580078, + "logps/rejected": -62.53631591796875, + "loss": 0.7229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980994462966919, + "rewards/margins": 4.828941822052002, + "rewards/rejected": -1.8479474782943726, + "step": 4039 + }, + { + "epoch": 1.01, + "grad_norm": 5.485104084014893, + "learning_rate": 2.40383222583694e-06, + "logits/chosen": -0.40092694759368896, + "logits/rejected": -0.5195984244346619, + "logps/chosen": -62.82788848876953, + "logps/rejected": -75.536865234375, + "loss": 0.8618, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.947169542312622, + "rewards/margins": 3.2511134147644043, + "rewards/rejected": -0.30394381284713745, + "step": 4040 + }, + { + "epoch": 1.01, + "grad_norm": 5.405158519744873, + "learning_rate": 2.401593667326444e-06, + "logits/chosen": -0.30628421902656555, + "logits/rejected": -0.3851810097694397, + "logps/chosen": -54.191280364990234, + "logps/rejected": -80.77297973632812, + "loss": 0.7503, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.995546817779541, + "rewards/margins": 4.135165691375732, + "rewards/rejected": -1.1396187543869019, + "step": 4041 + }, + { + "epoch": 1.01, + "grad_norm": 3.457041025161743, + "learning_rate": 2.3993558221345536e-06, + "logits/chosen": -0.36574167013168335, + "logits/rejected": -0.4583739638328552, + "logps/chosen": -54.36090850830078, + "logps/rejected": -80.27102661132812, + "loss": 0.7614, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9932210445404053, + "rewards/margins": 5.109601020812988, + "rewards/rejected": -2.1163792610168457, + "step": 4042 + }, + { + "epoch": 1.01, + "grad_norm": 5.167543411254883, + "learning_rate": 2.397118690875609e-06, + "logits/chosen": -0.2921118140220642, + "logits/rejected": -0.4030376970767975, + "logps/chosen": -60.09571838378906, + "logps/rejected": -85.7128677368164, + "loss": 0.8032, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.021883010864258, + "rewards/margins": 5.054987907409668, + "rewards/rejected": -2.033104419708252, + "step": 4043 + }, + { + "epoch": 1.01, + "grad_norm": 3.3044121265411377, + "learning_rate": 2.39488227416375e-06, + "logits/chosen": -0.21883639693260193, + "logits/rejected": -0.33594587445259094, + "logps/chosen": -51.02859115600586, + "logps/rejected": -103.04761505126953, + "loss": 0.6222, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.945437431335449, + "rewards/margins": 4.679776668548584, + "rewards/rejected": -1.7343391180038452, + "step": 4044 + }, + { + "epoch": 1.01, + "grad_norm": 6.6062912940979, + "learning_rate": 2.3926465726129204e-06, + "logits/chosen": -0.27516308426856995, + "logits/rejected": -0.40818649530410767, + "logps/chosen": -59.12116622924805, + "logps/rejected": -79.8180923461914, + "loss": 0.766, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.903393268585205, + "rewards/margins": 4.292896747589111, + "rewards/rejected": -1.389503836631775, + "step": 4045 + }, + { + "epoch": 1.01, + "grad_norm": 5.11784029006958, + "learning_rate": 2.390411586836869e-06, + "logits/chosen": -0.3545638918876648, + "logits/rejected": -0.4360044598579407, + "logps/chosen": -62.2249755859375, + "logps/rejected": -95.60964965820312, + "loss": 0.8024, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9679946899414062, + "rewards/margins": 5.021399974822998, + "rewards/rejected": -2.053405284881592, + "step": 4046 + }, + { + "epoch": 1.01, + "grad_norm": 3.3710622787475586, + "learning_rate": 2.3881773174491473e-06, + "logits/chosen": -0.3952336013317108, + "logits/rejected": -0.5287188291549683, + "logps/chosen": -58.879119873046875, + "logps/rejected": -76.46746063232422, + "loss": 0.711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939937114715576, + "rewards/margins": 4.756857395172119, + "rewards/rejected": -1.816920518875122, + "step": 4047 + }, + { + "epoch": 1.01, + "grad_norm": 3.124732494354248, + "learning_rate": 2.3859437650631105e-06, + "logits/chosen": -0.34720879793167114, + "logits/rejected": -0.41986221075057983, + "logps/chosen": -57.5172233581543, + "logps/rejected": -83.32622528076172, + "loss": 0.6847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.164191722869873, + "rewards/margins": 4.899749755859375, + "rewards/rejected": -1.7355583906173706, + "step": 4048 + }, + { + "epoch": 1.01, + "grad_norm": 8.98610782623291, + "learning_rate": 2.3837109302919157e-06, + "logits/chosen": -0.3037949800491333, + "logits/rejected": -0.41714662313461304, + "logps/chosen": -72.00997924804688, + "logps/rejected": -79.41828918457031, + "loss": 0.9732, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9770989418029785, + "rewards/margins": 3.905423641204834, + "rewards/rejected": -0.9283246397972107, + "step": 4049 + }, + { + "epoch": 1.01, + "grad_norm": 9.414366722106934, + "learning_rate": 2.3814788137485274e-06, + "logits/chosen": -0.30862176418304443, + "logits/rejected": -0.46644407510757446, + "logps/chosen": -64.4096908569336, + "logps/rejected": -83.38475799560547, + "loss": 0.6926, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9454617500305176, + "rewards/margins": 5.015877723693848, + "rewards/rejected": -2.07041597366333, + "step": 4050 + }, + { + "epoch": 1.01, + "grad_norm": 8.194337844848633, + "learning_rate": 2.379247416045708e-06, + "logits/chosen": -0.3674298822879791, + "logits/rejected": -0.45644646883010864, + "logps/chosen": -47.33512878417969, + "logps/rejected": -88.52119445800781, + "loss": 0.7852, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.549602508544922, + "rewards/margins": 4.999418258666992, + "rewards/rejected": -2.4498164653778076, + "step": 4051 + }, + { + "epoch": 1.01, + "grad_norm": 7.133405685424805, + "learning_rate": 2.377016737796024e-06, + "logits/chosen": -0.3519180417060852, + "logits/rejected": -0.4456981122493744, + "logps/chosen": -59.49700164794922, + "logps/rejected": -71.07212829589844, + "loss": 0.8424, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8792340755462646, + "rewards/margins": 4.164117813110352, + "rewards/rejected": -1.2848834991455078, + "step": 4052 + }, + { + "epoch": 1.01, + "grad_norm": 5.267831802368164, + "learning_rate": 2.374786779611845e-06, + "logits/chosen": -0.4039905369281769, + "logits/rejected": -0.4914305806159973, + "logps/chosen": -47.23479080200195, + "logps/rejected": -80.57066345214844, + "loss": 0.6807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.716780185699463, + "rewards/margins": 4.577901840209961, + "rewards/rejected": -1.861121416091919, + "step": 4053 + }, + { + "epoch": 1.01, + "grad_norm": 4.3091230392456055, + "learning_rate": 2.372557542105341e-06, + "logits/chosen": -0.387079656124115, + "logits/rejected": -0.48467952013015747, + "logps/chosen": -77.13819885253906, + "logps/rejected": -73.26992797851562, + "loss": 0.8536, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2291622161865234, + "rewards/margins": 5.431126117706299, + "rewards/rejected": -2.2019639015197754, + "step": 4054 + }, + { + "epoch": 1.01, + "grad_norm": 27.971532821655273, + "learning_rate": 2.3703290258884894e-06, + "logits/chosen": -0.38529330492019653, + "logits/rejected": -0.506701648235321, + "logps/chosen": -65.79680633544922, + "logps/rejected": -86.22710418701172, + "loss": 0.7384, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7612411975860596, + "rewards/margins": 5.258504390716553, + "rewards/rejected": -2.497263193130493, + "step": 4055 + }, + { + "epoch": 1.01, + "grad_norm": 3.612875461578369, + "learning_rate": 2.368101231573066e-06, + "logits/chosen": -0.3780440092086792, + "logits/rejected": -0.46894749999046326, + "logps/chosen": -52.960140228271484, + "logps/rejected": -82.00473022460938, + "loss": 0.6896, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0848684310913086, + "rewards/margins": 5.133703708648682, + "rewards/rejected": -2.0488357543945312, + "step": 4056 + }, + { + "epoch": 1.01, + "grad_norm": 17.69611358642578, + "learning_rate": 2.365874159770642e-06, + "logits/chosen": -0.2694445252418518, + "logits/rejected": -0.3288274109363556, + "logps/chosen": -52.261775970458984, + "logps/rejected": -83.32012939453125, + "loss": 0.636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0016303062438965, + "rewards/margins": 4.213425636291504, + "rewards/rejected": -1.2117949724197388, + "step": 4057 + }, + { + "epoch": 1.02, + "grad_norm": 5.226274013519287, + "learning_rate": 2.363647811092603e-06, + "logits/chosen": -0.3447166383266449, + "logits/rejected": -0.43455207347869873, + "logps/chosen": -59.91672134399414, + "logps/rejected": -97.42284393310547, + "loss": 0.6825, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9956250190734863, + "rewards/margins": 5.007022380828857, + "rewards/rejected": -2.011397361755371, + "step": 4058 + }, + { + "epoch": 1.02, + "grad_norm": 9.648425102233887, + "learning_rate": 2.361422186150128e-06, + "logits/chosen": -0.2730417251586914, + "logits/rejected": -0.4273627996444702, + "logps/chosen": -57.78986358642578, + "logps/rejected": -79.73716735839844, + "loss": 0.7216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0794663429260254, + "rewards/margins": 4.368569374084473, + "rewards/rejected": -1.2891030311584473, + "step": 4059 + }, + { + "epoch": 1.02, + "grad_norm": 2.9986350536346436, + "learning_rate": 2.3591972855541993e-06, + "logits/chosen": -0.32917091250419617, + "logits/rejected": -0.3868591785430908, + "logps/chosen": -50.92161560058594, + "logps/rejected": -86.66964721679688, + "loss": 0.635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.688260555267334, + "rewards/margins": 4.895002365112305, + "rewards/rejected": -2.2067413330078125, + "step": 4060 + }, + { + "epoch": 1.02, + "grad_norm": 3.1724977493286133, + "learning_rate": 2.3569731099155996e-06, + "logits/chosen": -0.4161686599254608, + "logits/rejected": -0.4863571226596832, + "logps/chosen": -66.33612823486328, + "logps/rejected": -101.38125610351562, + "loss": 0.7028, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.847590446472168, + "rewards/margins": 5.119153022766113, + "rewards/rejected": -2.271562099456787, + "step": 4061 + }, + { + "epoch": 1.02, + "grad_norm": 4.000991344451904, + "learning_rate": 2.354749659844911e-06, + "logits/chosen": -0.37324258685112, + "logits/rejected": -0.4318758249282837, + "logps/chosen": -62.126670837402344, + "logps/rejected": -83.20646667480469, + "loss": 0.8307, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0257468223571777, + "rewards/margins": 4.765321731567383, + "rewards/rejected": -1.739574909210205, + "step": 4062 + }, + { + "epoch": 1.02, + "grad_norm": 4.96914005279541, + "learning_rate": 2.352526935952523e-06, + "logits/chosen": -0.28137490153312683, + "logits/rejected": -0.3233639895915985, + "logps/chosen": -51.228485107421875, + "logps/rejected": -96.4028091430664, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.031116247177124, + "rewards/margins": 5.307428359985352, + "rewards/rejected": -2.2763118743896484, + "step": 4063 + }, + { + "epoch": 1.02, + "grad_norm": 3.6822080612182617, + "learning_rate": 2.3503049388486187e-06, + "logits/chosen": -0.37889277935028076, + "logits/rejected": -0.4632391333580017, + "logps/chosen": -53.878177642822266, + "logps/rejected": -89.24783325195312, + "loss": 0.6946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074411392211914, + "rewards/margins": 4.967636585235596, + "rewards/rejected": -1.8932249546051025, + "step": 4064 + }, + { + "epoch": 1.02, + "grad_norm": 3.9502809047698975, + "learning_rate": 2.3480836691431854e-06, + "logits/chosen": -0.37900590896606445, + "logits/rejected": -0.48372596502304077, + "logps/chosen": -45.579410552978516, + "logps/rejected": -70.33756256103516, + "loss": 0.7282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8804121017456055, + "rewards/margins": 3.8957748413085938, + "rewards/rejected": -1.0153627395629883, + "step": 4065 + }, + { + "epoch": 1.02, + "grad_norm": 6.931635856628418, + "learning_rate": 2.345863127446008e-06, + "logits/chosen": -0.3184575140476227, + "logits/rejected": -0.38950568437576294, + "logps/chosen": -53.35325622558594, + "logps/rejected": -87.5173568725586, + "loss": 0.7181, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.84065318107605, + "rewards/margins": 4.433566570281982, + "rewards/rejected": -1.5929136276245117, + "step": 4066 + }, + { + "epoch": 1.02, + "grad_norm": 8.664100646972656, + "learning_rate": 2.343643314366672e-06, + "logits/chosen": -0.3736443817615509, + "logits/rejected": -0.46977418661117554, + "logps/chosen": -48.27593231201172, + "logps/rejected": -74.02499389648438, + "loss": 0.7202, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.975834369659424, + "rewards/margins": 4.356278419494629, + "rewards/rejected": -1.380443811416626, + "step": 4067 + }, + { + "epoch": 1.02, + "grad_norm": 4.465970993041992, + "learning_rate": 2.34142423051457e-06, + "logits/chosen": -0.25591179728507996, + "logits/rejected": -0.36308813095092773, + "logps/chosen": -46.86041259765625, + "logps/rejected": -87.65174865722656, + "loss": 0.6512, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9616522789001465, + "rewards/margins": 4.728907108306885, + "rewards/rejected": -1.7672548294067383, + "step": 4068 + }, + { + "epoch": 1.02, + "grad_norm": 6.337660789489746, + "learning_rate": 2.3392058764988805e-06, + "logits/chosen": -0.3769773244857788, + "logits/rejected": -0.4408966302871704, + "logps/chosen": -64.13134765625, + "logps/rejected": -76.27877807617188, + "loss": 0.7847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5442144870758057, + "rewards/margins": 4.148015975952148, + "rewards/rejected": -1.6038013696670532, + "step": 4069 + }, + { + "epoch": 1.02, + "grad_norm": 6.935546398162842, + "learning_rate": 2.3369882529285946e-06, + "logits/chosen": -0.33171167969703674, + "logits/rejected": -0.3320518732070923, + "logps/chosen": -57.33274841308594, + "logps/rejected": -122.46054077148438, + "loss": 0.6966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9216604232788086, + "rewards/margins": 5.257732391357422, + "rewards/rejected": -2.3360722064971924, + "step": 4070 + }, + { + "epoch": 1.02, + "grad_norm": 3.30686092376709, + "learning_rate": 2.3347713604124974e-06, + "logits/chosen": -0.3657850921154022, + "logits/rejected": -0.4986596703529358, + "logps/chosen": -59.48918533325195, + "logps/rejected": -67.92536163330078, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9529449939727783, + "rewards/margins": 4.38077449798584, + "rewards/rejected": -1.4278290271759033, + "step": 4071 + }, + { + "epoch": 1.02, + "grad_norm": 7.218693256378174, + "learning_rate": 2.3325551995591723e-06, + "logits/chosen": -0.3516230583190918, + "logits/rejected": -0.46293115615844727, + "logps/chosen": -55.218849182128906, + "logps/rejected": -80.84640502929688, + "loss": 0.74, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8561911582946777, + "rewards/margins": 4.857409477233887, + "rewards/rejected": -2.001217842102051, + "step": 4072 + }, + { + "epoch": 1.02, + "grad_norm": 7.563626289367676, + "learning_rate": 2.330339770977004e-06, + "logits/chosen": -0.3032362461090088, + "logits/rejected": -0.4167560338973999, + "logps/chosen": -58.401466369628906, + "logps/rejected": -88.17765808105469, + "loss": 0.687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.893967866897583, + "rewards/margins": 4.169914722442627, + "rewards/rejected": -1.2759466171264648, + "step": 4073 + }, + { + "epoch": 1.02, + "grad_norm": 2.081511974334717, + "learning_rate": 2.3281250752741733e-06, + "logits/chosen": -0.3571929633617401, + "logits/rejected": -0.4649218022823334, + "logps/chosen": -51.87548828125, + "logps/rejected": -79.41793823242188, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9533417224884033, + "rewards/margins": 5.686556816101074, + "rewards/rejected": -2.73321533203125, + "step": 4074 + }, + { + "epoch": 1.02, + "grad_norm": 10.426419258117676, + "learning_rate": 2.325911113058666e-06, + "logits/chosen": -0.23502367734909058, + "logits/rejected": -0.3273492157459259, + "logps/chosen": -63.97001266479492, + "logps/rejected": -91.17549896240234, + "loss": 0.8565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.635807752609253, + "rewards/margins": 4.329265117645264, + "rewards/rejected": -1.6934572458267212, + "step": 4075 + }, + { + "epoch": 1.02, + "grad_norm": 5.308237552642822, + "learning_rate": 2.3236978849382625e-06, + "logits/chosen": -0.35553163290023804, + "logits/rejected": -0.43366754055023193, + "logps/chosen": -52.38019561767578, + "logps/rejected": -81.2983627319336, + "loss": 0.7647, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6740365028381348, + "rewards/margins": 5.012574672698975, + "rewards/rejected": -2.338538646697998, + "step": 4076 + }, + { + "epoch": 1.02, + "grad_norm": 6.2609734535217285, + "learning_rate": 2.3214853915205354e-06, + "logits/chosen": -0.3859490752220154, + "logits/rejected": -0.4513683021068573, + "logps/chosen": -52.40361404418945, + "logps/rejected": -87.2580795288086, + "loss": 0.7734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6709072589874268, + "rewards/margins": 4.182131290435791, + "rewards/rejected": -1.5112242698669434, + "step": 4077 + }, + { + "epoch": 1.02, + "grad_norm": 5.826469421386719, + "learning_rate": 2.319273633412868e-06, + "logits/chosen": -0.400007963180542, + "logits/rejected": -0.5224422812461853, + "logps/chosen": -82.63654327392578, + "logps/rejected": -87.62826538085938, + "loss": 0.7804, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0657598972320557, + "rewards/margins": 5.663646697998047, + "rewards/rejected": -2.597886323928833, + "step": 4078 + }, + { + "epoch": 1.02, + "grad_norm": 6.750417709350586, + "learning_rate": 2.317062611222432e-06, + "logits/chosen": -0.273526132106781, + "logits/rejected": -0.3779486119747162, + "logps/chosen": -59.943660736083984, + "logps/rejected": -80.60049438476562, + "loss": 0.7014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8217012882232666, + "rewards/margins": 4.964323997497559, + "rewards/rejected": -2.14262318611145, + "step": 4079 + }, + { + "epoch": 1.02, + "grad_norm": 6.013363361358643, + "learning_rate": 2.3148523255562065e-06, + "logits/chosen": -0.34530869126319885, + "logits/rejected": -0.4961278736591339, + "logps/chosen": -65.07321166992188, + "logps/rejected": -71.75636291503906, + "loss": 0.7865, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.863084077835083, + "rewards/margins": 4.88156795501709, + "rewards/rejected": -2.0184836387634277, + "step": 4080 + }, + { + "epoch": 1.02, + "grad_norm": 3.8977441787719727, + "learning_rate": 2.3126427770209566e-06, + "logits/chosen": -0.42473459243774414, + "logits/rejected": -0.46461912989616394, + "logps/chosen": -57.93301010131836, + "logps/rejected": -95.98870849609375, + "loss": 0.7431, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094318389892578, + "rewards/margins": 4.648161888122559, + "rewards/rejected": -1.5538439750671387, + "step": 4081 + }, + { + "epoch": 1.02, + "grad_norm": 3.8297667503356934, + "learning_rate": 2.3104339662232505e-06, + "logits/chosen": -0.4012719988822937, + "logits/rejected": -0.4759848713874817, + "logps/chosen": -53.46444320678711, + "logps/rejected": -79.7930908203125, + "loss": 0.7537, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8947372436523438, + "rewards/margins": 3.5531911849975586, + "rewards/rejected": -0.6584544777870178, + "step": 4082 + }, + { + "epoch": 1.02, + "grad_norm": 4.620519161224365, + "learning_rate": 2.3082258937694595e-06, + "logits/chosen": -0.33593612909317017, + "logits/rejected": -0.4303229749202728, + "logps/chosen": -52.41803741455078, + "logps/rejected": -81.40461730957031, + "loss": 0.6681, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.982687473297119, + "rewards/margins": 4.580512046813965, + "rewards/rejected": -1.5978243350982666, + "step": 4083 + }, + { + "epoch": 1.02, + "grad_norm": 5.416861057281494, + "learning_rate": 2.3060185602657443e-06, + "logits/chosen": -0.3547574281692505, + "logits/rejected": -0.4773028492927551, + "logps/chosen": -58.09184646606445, + "logps/rejected": -81.42816162109375, + "loss": 0.7332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.942230463027954, + "rewards/margins": 4.846207618713379, + "rewards/rejected": -1.9039771556854248, + "step": 4084 + }, + { + "epoch": 1.02, + "grad_norm": 4.423385143280029, + "learning_rate": 2.303811966318066e-06, + "logits/chosen": -0.42699143290519714, + "logits/rejected": -0.5509343147277832, + "logps/chosen": -51.00933074951172, + "logps/rejected": -68.69122314453125, + "loss": 0.6781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0654683113098145, + "rewards/margins": 5.150141716003418, + "rewards/rejected": -2.0846734046936035, + "step": 4085 + }, + { + "epoch": 1.02, + "grad_norm": 3.4759280681610107, + "learning_rate": 2.301606112532182e-06, + "logits/chosen": -0.27079033851623535, + "logits/rejected": -0.396257609128952, + "logps/chosen": -73.35487365722656, + "logps/rejected": -80.1622314453125, + "loss": 0.8114, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.680612802505493, + "rewards/margins": 4.286731243133545, + "rewards/rejected": -1.6061184406280518, + "step": 4086 + }, + { + "epoch": 1.02, + "grad_norm": 4.443544387817383, + "learning_rate": 2.2994009995136445e-06, + "logits/chosen": -0.32152244448661804, + "logits/rejected": -0.4035792350769043, + "logps/chosen": -66.58106994628906, + "logps/rejected": -94.30323791503906, + "loss": 0.7253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3129842281341553, + "rewards/margins": 5.9078450202941895, + "rewards/rejected": -2.594860553741455, + "step": 4087 + }, + { + "epoch": 1.02, + "grad_norm": 7.339483261108398, + "learning_rate": 2.2971966278678115e-06, + "logits/chosen": -0.3598395884037018, + "logits/rejected": -0.48072415590286255, + "logps/chosen": -63.43123245239258, + "logps/rejected": -87.87820434570312, + "loss": 0.847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.866699695587158, + "rewards/margins": 5.304788589477539, + "rewards/rejected": -2.4380886554718018, + "step": 4088 + }, + { + "epoch": 1.02, + "grad_norm": 5.765407085418701, + "learning_rate": 2.294992998199822e-06, + "logits/chosen": -0.32309776544570923, + "logits/rejected": -0.4959891140460968, + "logps/chosen": -54.785621643066406, + "logps/rejected": -67.9972915649414, + "loss": 0.7964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.673736095428467, + "rewards/margins": 4.705352306365967, + "rewards/rejected": -2.031616687774658, + "step": 4089 + }, + { + "epoch": 1.02, + "grad_norm": 3.2828683853149414, + "learning_rate": 2.2927901111146257e-06, + "logits/chosen": -0.37284550070762634, + "logits/rejected": -0.49787282943725586, + "logps/chosen": -50.113162994384766, + "logps/rejected": -69.8764419555664, + "loss": 0.642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2427244186401367, + "rewards/margins": 5.5309739112854, + "rewards/rejected": -2.2882497310638428, + "step": 4090 + }, + { + "epoch": 1.02, + "grad_norm": 3.688939332962036, + "learning_rate": 2.2905879672169605e-06, + "logits/chosen": -0.3568817377090454, + "logits/rejected": -0.4648313522338867, + "logps/chosen": -70.39507293701172, + "logps/rejected": -89.82024383544922, + "loss": 0.7244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.773221969604492, + "rewards/margins": 4.937830924987793, + "rewards/rejected": -2.164608955383301, + "step": 4091 + }, + { + "epoch": 1.02, + "grad_norm": 3.9207763671875, + "learning_rate": 2.2883865671113637e-06, + "logits/chosen": -0.3270101547241211, + "logits/rejected": -0.43826690316200256, + "logps/chosen": -55.140655517578125, + "logps/rejected": -83.57240295410156, + "loss": 0.6367, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7154226303100586, + "rewards/margins": 5.011415958404541, + "rewards/rejected": -2.2959930896759033, + "step": 4092 + }, + { + "epoch": 1.02, + "grad_norm": 5.000509262084961, + "learning_rate": 2.286185911402166e-06, + "logits/chosen": -0.3448050916194916, + "logits/rejected": -0.3943999111652374, + "logps/chosen": -49.77492141723633, + "logps/rejected": -85.54107666015625, + "loss": 0.772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8260645866394043, + "rewards/margins": 4.527230262756348, + "rewards/rejected": -1.7011656761169434, + "step": 4093 + }, + { + "epoch": 1.02, + "grad_norm": 4.115973472595215, + "learning_rate": 2.2839860006934927e-06, + "logits/chosen": -0.33553990721702576, + "logits/rejected": -0.4529806673526764, + "logps/chosen": -69.03898620605469, + "logps/rejected": -97.04075622558594, + "loss": 0.7107, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7161169052124023, + "rewards/margins": 5.0961809158325195, + "rewards/rejected": -2.380063772201538, + "step": 4094 + }, + { + "epoch": 1.02, + "grad_norm": 3.1868414878845215, + "learning_rate": 2.281786835589271e-06, + "logits/chosen": -0.29785439372062683, + "logits/rejected": -0.3796120285987854, + "logps/chosen": -56.053306579589844, + "logps/rejected": -91.6962890625, + "loss": 0.6143, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.004878520965576, + "rewards/margins": 5.074637413024902, + "rewards/rejected": -2.0697591304779053, + "step": 4095 + }, + { + "epoch": 1.02, + "grad_norm": 11.513951301574707, + "learning_rate": 2.279588416693218e-06, + "logits/chosen": -0.34063565731048584, + "logits/rejected": -0.416692316532135, + "logps/chosen": -53.518775939941406, + "logps/rejected": -82.82579803466797, + "loss": 0.8901, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8664793968200684, + "rewards/margins": 3.876232862472534, + "rewards/rejected": -1.0097535848617554, + "step": 4096 + }, + { + "epoch": 1.02, + "grad_norm": 3.4506750106811523, + "learning_rate": 2.2773907446088463e-06, + "logits/chosen": -0.37158629298210144, + "logits/rejected": -0.4461785554885864, + "logps/chosen": -50.70790100097656, + "logps/rejected": -77.28119659423828, + "loss": 0.6569, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.874911069869995, + "rewards/margins": 4.627082824707031, + "rewards/rejected": -1.7521719932556152, + "step": 4097 + }, + { + "epoch": 1.03, + "grad_norm": 3.5933804512023926, + "learning_rate": 2.275193819939464e-06, + "logits/chosen": -0.34161150455474854, + "logits/rejected": -0.45619097352027893, + "logps/chosen": -63.60596466064453, + "logps/rejected": -96.4728012084961, + "loss": 0.7273, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9830262660980225, + "rewards/margins": 5.951068878173828, + "rewards/rejected": -2.968043327331543, + "step": 4098 + }, + { + "epoch": 1.03, + "grad_norm": 9.70871353149414, + "learning_rate": 2.2729976432881734e-06, + "logits/chosen": -0.33276036381721497, + "logits/rejected": -0.4423249065876007, + "logps/chosen": -52.79666519165039, + "logps/rejected": -83.07176208496094, + "loss": 0.7557, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.733238697052002, + "rewards/margins": 5.2599616050720215, + "rewards/rejected": -2.526723623275757, + "step": 4099 + }, + { + "epoch": 1.03, + "grad_norm": 3.997067928314209, + "learning_rate": 2.2708022152578775e-06, + "logits/chosen": -0.3218766152858734, + "logits/rejected": -0.3981335163116455, + "logps/chosen": -57.51849365234375, + "logps/rejected": -92.73844909667969, + "loss": 0.6764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9503815174102783, + "rewards/margins": 5.321134567260742, + "rewards/rejected": -2.370753526687622, + "step": 4100 + }, + { + "epoch": 1.03, + "grad_norm": 3.273138999938965, + "learning_rate": 2.2686075364512627e-06, + "logits/chosen": -0.4443615972995758, + "logits/rejected": -0.5443221926689148, + "logps/chosen": -53.702144622802734, + "logps/rejected": -87.22135162353516, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.964353561401367, + "rewards/margins": 5.1495208740234375, + "rewards/rejected": -2.1851673126220703, + "step": 4101 + }, + { + "epoch": 1.03, + "grad_norm": 7.125552654266357, + "learning_rate": 2.2664136074708164e-06, + "logits/chosen": -0.21037763357162476, + "logits/rejected": -0.31216150522232056, + "logps/chosen": -65.47149658203125, + "logps/rejected": -99.7937240600586, + "loss": 0.7819, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.751026153564453, + "rewards/margins": 5.883332252502441, + "rewards/rejected": -3.1323063373565674, + "step": 4102 + }, + { + "epoch": 1.03, + "grad_norm": 6.6626152992248535, + "learning_rate": 2.264220428918823e-06, + "logits/chosen": -0.30379095673561096, + "logits/rejected": -0.3867248296737671, + "logps/chosen": -50.76104736328125, + "logps/rejected": -94.8573226928711, + "loss": 0.6594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.986731767654419, + "rewards/margins": 6.2556376457214355, + "rewards/rejected": -3.2689058780670166, + "step": 4103 + }, + { + "epoch": 1.03, + "grad_norm": 5.412411689758301, + "learning_rate": 2.262028001397355e-06, + "logits/chosen": -0.3746134638786316, + "logits/rejected": -0.47387373447418213, + "logps/chosen": -60.82938766479492, + "logps/rejected": -68.85711669921875, + "loss": 0.701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1375679969787598, + "rewards/margins": 5.074136734008789, + "rewards/rejected": -1.9365684986114502, + "step": 4104 + }, + { + "epoch": 1.03, + "grad_norm": 5.5950117111206055, + "learning_rate": 2.2598363255082815e-06, + "logits/chosen": -0.3495180010795593, + "logits/rejected": -0.4970080256462097, + "logps/chosen": -59.25565719604492, + "logps/rejected": -73.33199310302734, + "loss": 0.8005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6479828357696533, + "rewards/margins": 4.647428512573242, + "rewards/rejected": -1.9994455575942993, + "step": 4105 + }, + { + "epoch": 1.03, + "grad_norm": 7.606433868408203, + "learning_rate": 2.257645401853265e-06, + "logits/chosen": -0.3206104636192322, + "logits/rejected": -0.4050351083278656, + "logps/chosen": -61.97241973876953, + "logps/rejected": -94.20598602294922, + "loss": 0.7764, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9405879974365234, + "rewards/margins": 5.472664833068848, + "rewards/rejected": -2.532076835632324, + "step": 4106 + }, + { + "epoch": 1.03, + "grad_norm": 5.002629280090332, + "learning_rate": 2.255455231033759e-06, + "logits/chosen": -0.39618492126464844, + "logits/rejected": -0.4964873790740967, + "logps/chosen": -51.14209747314453, + "logps/rejected": -90.62644958496094, + "loss": 0.6286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8174076080322266, + "rewards/margins": 4.920251369476318, + "rewards/rejected": -2.10284423828125, + "step": 4107 + }, + { + "epoch": 1.03, + "grad_norm": 13.149866104125977, + "learning_rate": 2.2532658136510165e-06, + "logits/chosen": -0.33387282490730286, + "logits/rejected": -0.4795650839805603, + "logps/chosen": -65.89013671875, + "logps/rejected": -75.86221313476562, + "loss": 0.8948, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.920433759689331, + "rewards/margins": 4.71696662902832, + "rewards/rejected": -1.7965329885482788, + "step": 4108 + }, + { + "epoch": 1.03, + "grad_norm": 3.2278189659118652, + "learning_rate": 2.251077150306078e-06, + "logits/chosen": -0.409171998500824, + "logits/rejected": -0.4789864718914032, + "logps/chosen": -47.07027816772461, + "logps/rejected": -96.02194213867188, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.898836135864258, + "rewards/margins": 5.776297569274902, + "rewards/rejected": -2.8774611949920654, + "step": 4109 + }, + { + "epoch": 1.03, + "grad_norm": 3.4530017375946045, + "learning_rate": 2.2488892415997785e-06, + "logits/chosen": -0.417996346950531, + "logits/rejected": -0.4862291216850281, + "logps/chosen": -63.69236373901367, + "logps/rejected": -99.04438781738281, + "loss": 0.8716, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.750607490539551, + "rewards/margins": 5.141853332519531, + "rewards/rejected": -2.3912458419799805, + "step": 4110 + }, + { + "epoch": 1.03, + "grad_norm": 5.447269439697266, + "learning_rate": 2.2467020881327466e-06, + "logits/chosen": -0.320120245218277, + "logits/rejected": -0.403949499130249, + "logps/chosen": -55.07048797607422, + "logps/rejected": -97.06875610351562, + "loss": 0.6961, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8588244915008545, + "rewards/margins": 5.246224403381348, + "rewards/rejected": -2.387399673461914, + "step": 4111 + }, + { + "epoch": 1.03, + "grad_norm": 3.938511848449707, + "learning_rate": 2.244515690505403e-06, + "logits/chosen": -0.3364545702934265, + "logits/rejected": -0.37621867656707764, + "logps/chosen": -53.646728515625, + "logps/rejected": -100.87525939941406, + "loss": 0.7069, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1323814392089844, + "rewards/margins": 5.540377616882324, + "rewards/rejected": -2.4079957008361816, + "step": 4112 + }, + { + "epoch": 1.03, + "grad_norm": 3.121925115585327, + "learning_rate": 2.2423300493179615e-06, + "logits/chosen": -0.3744293451309204, + "logits/rejected": -0.4226524233818054, + "logps/chosen": -67.40327453613281, + "logps/rejected": -93.06208038330078, + "loss": 0.6538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2223472595214844, + "rewards/margins": 5.407113075256348, + "rewards/rejected": -2.1847660541534424, + "step": 4113 + }, + { + "epoch": 1.03, + "grad_norm": 5.061100006103516, + "learning_rate": 2.2401451651704253e-06, + "logits/chosen": -0.38161900639533997, + "logits/rejected": -0.3952069878578186, + "logps/chosen": -55.75657653808594, + "logps/rejected": -105.48961639404297, + "loss": 0.6983, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.077991008758545, + "rewards/margins": 4.69648551940918, + "rewards/rejected": -1.6184945106506348, + "step": 4114 + }, + { + "epoch": 1.03, + "grad_norm": 6.610019683837891, + "learning_rate": 2.2379610386625957e-06, + "logits/chosen": -0.3737969994544983, + "logits/rejected": -0.47332486510276794, + "logps/chosen": -61.055477142333984, + "logps/rejected": -98.00570678710938, + "loss": 0.8048, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4872162342071533, + "rewards/margins": 5.42588996887207, + "rewards/rejected": -2.938673257827759, + "step": 4115 + }, + { + "epoch": 1.03, + "grad_norm": 4.165802955627441, + "learning_rate": 2.2357776703940613e-06, + "logits/chosen": -0.28979870676994324, + "logits/rejected": -0.3570300340652466, + "logps/chosen": -64.9753189086914, + "logps/rejected": -81.29130554199219, + "loss": 0.776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.037431478500366, + "rewards/margins": 4.027215003967285, + "rewards/rejected": -0.9897834658622742, + "step": 4116 + }, + { + "epoch": 1.03, + "grad_norm": 4.458885669708252, + "learning_rate": 2.233595060964204e-06, + "logits/chosen": -0.3156268298625946, + "logits/rejected": -0.340398371219635, + "logps/chosen": -52.10276794433594, + "logps/rejected": -89.41600799560547, + "loss": 0.8028, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.744396686553955, + "rewards/margins": 3.9308841228485107, + "rewards/rejected": -1.1864877939224243, + "step": 4117 + }, + { + "epoch": 1.03, + "grad_norm": 14.54315185546875, + "learning_rate": 2.231413210972196e-06, + "logits/chosen": -0.3326749801635742, + "logits/rejected": -0.460489422082901, + "logps/chosen": -64.77777099609375, + "logps/rejected": -83.91336822509766, + "loss": 0.8148, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.589742660522461, + "rewards/margins": 4.0077643394470215, + "rewards/rejected": -1.4180214405059814, + "step": 4118 + }, + { + "epoch": 1.03, + "grad_norm": 11.943549156188965, + "learning_rate": 2.2292321210170014e-06, + "logits/chosen": -0.4522472023963928, + "logits/rejected": -0.5062606334686279, + "logps/chosen": -50.425987243652344, + "logps/rejected": -86.4163818359375, + "loss": 0.9652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6000657081604004, + "rewards/margins": 4.432316780090332, + "rewards/rejected": -1.8322508335113525, + "step": 4119 + }, + { + "epoch": 1.03, + "grad_norm": 4.22039270401001, + "learning_rate": 2.22705179169738e-06, + "logits/chosen": -0.25326433777809143, + "logits/rejected": -0.37524545192718506, + "logps/chosen": -68.8040771484375, + "logps/rejected": -97.75885009765625, + "loss": 0.7086, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.942049026489258, + "rewards/margins": 5.53109884262085, + "rewards/rejected": -2.589049816131592, + "step": 4120 + }, + { + "epoch": 1.03, + "grad_norm": 3.925295114517212, + "learning_rate": 2.224872223611879e-06, + "logits/chosen": -0.3084923028945923, + "logits/rejected": -0.4123461842536926, + "logps/chosen": -67.58338928222656, + "logps/rejected": -78.3921890258789, + "loss": 0.7149, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0890069007873535, + "rewards/margins": 5.096078872680664, + "rewards/rejected": -2.0070722103118896, + "step": 4121 + }, + { + "epoch": 1.03, + "grad_norm": 4.001131057739258, + "learning_rate": 2.2226934173588316e-06, + "logits/chosen": -0.31611692905426025, + "logits/rejected": -0.36373722553253174, + "logps/chosen": -56.27910232543945, + "logps/rejected": -94.39705657958984, + "loss": 0.7492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9727277755737305, + "rewards/margins": 4.579760551452637, + "rewards/rejected": -1.6070330142974854, + "step": 4122 + }, + { + "epoch": 1.03, + "grad_norm": 3.6555397510528564, + "learning_rate": 2.220515373536372e-06, + "logits/chosen": -0.3274868428707123, + "logits/rejected": -0.46131202578544617, + "logps/chosen": -61.69904327392578, + "logps/rejected": -87.45417785644531, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8392951488494873, + "rewards/margins": 5.541188716888428, + "rewards/rejected": -2.7018935680389404, + "step": 4123 + }, + { + "epoch": 1.03, + "grad_norm": 7.650447368621826, + "learning_rate": 2.218338092742418e-06, + "logits/chosen": -0.37276044487953186, + "logits/rejected": -0.5220146775245667, + "logps/chosen": -63.10110855102539, + "logps/rejected": -74.25638580322266, + "loss": 0.8085, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.898510217666626, + "rewards/margins": 4.909988880157471, + "rewards/rejected": -2.0114784240722656, + "step": 4124 + }, + { + "epoch": 1.03, + "grad_norm": 4.995783805847168, + "learning_rate": 2.216161575574684e-06, + "logits/chosen": -0.4021490216255188, + "logits/rejected": -0.47211283445358276, + "logps/chosen": -90.60594177246094, + "logps/rejected": -83.6807861328125, + "loss": 0.7734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7250356674194336, + "rewards/margins": 4.139408111572266, + "rewards/rejected": -1.4143729209899902, + "step": 4125 + }, + { + "epoch": 1.03, + "grad_norm": 3.743713855743408, + "learning_rate": 2.2139858226306647e-06, + "logits/chosen": -0.3498159945011139, + "logits/rejected": -0.4178895652294159, + "logps/chosen": -61.90266418457031, + "logps/rejected": -90.67408752441406, + "loss": 0.6801, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2491614818573, + "rewards/margins": 4.731655120849609, + "rewards/rejected": -1.4824938774108887, + "step": 4126 + }, + { + "epoch": 1.03, + "grad_norm": 4.339898109436035, + "learning_rate": 2.211810834507656e-06, + "logits/chosen": -0.3621629476547241, + "logits/rejected": -0.4475990831851959, + "logps/chosen": -49.07460021972656, + "logps/rejected": -95.76795196533203, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7265424728393555, + "rewards/margins": 4.996551513671875, + "rewards/rejected": -2.2700090408325195, + "step": 4127 + }, + { + "epoch": 1.03, + "grad_norm": 5.339222431182861, + "learning_rate": 2.2096366118027385e-06, + "logits/chosen": -0.3909538686275482, + "logits/rejected": -0.4894045889377594, + "logps/chosen": -54.401187896728516, + "logps/rejected": -85.09837341308594, + "loss": 0.8079, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.040086269378662, + "rewards/margins": 4.919959545135498, + "rewards/rejected": -1.8798737525939941, + "step": 4128 + }, + { + "epoch": 1.03, + "grad_norm": 6.18966007232666, + "learning_rate": 2.207463155112781e-06, + "logits/chosen": -0.3538641631603241, + "logits/rejected": -0.41904857754707336, + "logps/chosen": -48.3302116394043, + "logps/rejected": -85.03683471679688, + "loss": 0.7551, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0464208126068115, + "rewards/margins": 4.190606594085693, + "rewards/rejected": -1.1441861391067505, + "step": 4129 + }, + { + "epoch": 1.03, + "grad_norm": 4.138105869293213, + "learning_rate": 2.205290465034447e-06, + "logits/chosen": -0.31196504831314087, + "logits/rejected": -0.4496763348579407, + "logps/chosen": -59.089576721191406, + "logps/rejected": -69.32833099365234, + "loss": 0.6509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.141392230987549, + "rewards/margins": 5.362713813781738, + "rewards/rejected": -2.2213220596313477, + "step": 4130 + }, + { + "epoch": 1.03, + "grad_norm": 5.1737213134765625, + "learning_rate": 2.2031185421641828e-06, + "logits/chosen": -0.3785479962825775, + "logits/rejected": -0.4871390461921692, + "logps/chosen": -60.129600524902344, + "logps/rejected": -68.8163833618164, + "loss": 0.7708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8176345825195312, + "rewards/margins": 4.952867031097412, + "rewards/rejected": -2.135233163833618, + "step": 4131 + }, + { + "epoch": 1.03, + "grad_norm": 12.671142578125, + "learning_rate": 2.200947387098232e-06, + "logits/chosen": -0.38000062108039856, + "logits/rejected": -0.43242037296295166, + "logps/chosen": -60.304569244384766, + "logps/rejected": -82.14684295654297, + "loss": 0.8891, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9266486167907715, + "rewards/margins": 3.4181673526763916, + "rewards/rejected": -0.4915183186531067, + "step": 4132 + }, + { + "epoch": 1.03, + "grad_norm": 4.225491046905518, + "learning_rate": 2.1987770004326255e-06, + "logits/chosen": -0.35716503858566284, + "logits/rejected": -0.5000286102294922, + "logps/chosen": -66.88997650146484, + "logps/rejected": -79.97227478027344, + "loss": 0.8266, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.929128885269165, + "rewards/margins": 4.2589640617370605, + "rewards/rejected": -1.329835057258606, + "step": 4133 + }, + { + "epoch": 1.03, + "grad_norm": 3.5144436359405518, + "learning_rate": 2.1966073827631735e-06, + "logits/chosen": -0.3080625832080841, + "logits/rejected": -0.37267935276031494, + "logps/chosen": -62.029502868652344, + "logps/rejected": -87.37566375732422, + "loss": 0.7411, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1995511054992676, + "rewards/margins": 4.825107574462891, + "rewards/rejected": -1.6255567073822021, + "step": 4134 + }, + { + "epoch": 1.03, + "grad_norm": 3.5622200965881348, + "learning_rate": 2.194438534685489e-06, + "logits/chosen": -0.24539898335933685, + "logits/rejected": -0.3615560829639435, + "logps/chosen": -56.994876861572266, + "logps/rejected": -93.02241516113281, + "loss": 0.6658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.083305835723877, + "rewards/margins": 5.54764461517334, + "rewards/rejected": -2.4643383026123047, + "step": 4135 + }, + { + "epoch": 1.03, + "grad_norm": 11.75109577178955, + "learning_rate": 2.1922704567949643e-06, + "logits/chosen": -0.3340548276901245, + "logits/rejected": -0.3860870599746704, + "logps/chosen": -54.941619873046875, + "logps/rejected": -90.05782318115234, + "loss": 0.8962, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.686572551727295, + "rewards/margins": 4.318207740783691, + "rewards/rejected": -1.6316354274749756, + "step": 4136 + }, + { + "epoch": 1.03, + "grad_norm": 13.658679008483887, + "learning_rate": 2.190103149686789e-06, + "logits/chosen": -0.3325754702091217, + "logits/rejected": -0.43731623888015747, + "logps/chosen": -67.94276428222656, + "logps/rejected": -82.38220977783203, + "loss": 0.8329, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.608771324157715, + "rewards/margins": 4.192944526672363, + "rewards/rejected": -1.5841732025146484, + "step": 4137 + }, + { + "epoch": 1.04, + "grad_norm": 4.288328170776367, + "learning_rate": 2.1879366139559305e-06, + "logits/chosen": -0.275593101978302, + "logits/rejected": -0.3814713656902313, + "logps/chosen": -71.2483901977539, + "logps/rejected": -80.5688705444336, + "loss": 0.8805, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.678597927093506, + "rewards/margins": 3.3467442989349365, + "rewards/rejected": -0.6681460738182068, + "step": 4138 + }, + { + "epoch": 1.04, + "grad_norm": 4.547238826751709, + "learning_rate": 2.1857708501971484e-06, + "logits/chosen": -0.3213430643081665, + "logits/rejected": -0.4354906678199768, + "logps/chosen": -61.17266082763672, + "logps/rejected": -84.19218444824219, + "loss": 0.815, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.874485492706299, + "rewards/margins": 4.647216320037842, + "rewards/rejected": -1.7727313041687012, + "step": 4139 + }, + { + "epoch": 1.04, + "grad_norm": 4.584372520446777, + "learning_rate": 2.183605859004997e-06, + "logits/chosen": -0.3728688955307007, + "logits/rejected": -0.4435475468635559, + "logps/chosen": -58.299583435058594, + "logps/rejected": -97.24237823486328, + "loss": 0.6282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.901855945587158, + "rewards/margins": 4.869230270385742, + "rewards/rejected": -1.967374324798584, + "step": 4140 + }, + { + "epoch": 1.04, + "grad_norm": 3.6346349716186523, + "learning_rate": 2.1814416409738095e-06, + "logits/chosen": -0.3508185148239136, + "logits/rejected": -0.46538296341896057, + "logps/chosen": -71.47541046142578, + "logps/rejected": -87.45848083496094, + "loss": 0.7186, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.850447654724121, + "rewards/margins": 5.1550068855285645, + "rewards/rejected": -2.3045594692230225, + "step": 4141 + }, + { + "epoch": 1.04, + "grad_norm": 8.914406776428223, + "learning_rate": 2.179278196697711e-06, + "logits/chosen": -0.3439009189605713, + "logits/rejected": -0.45300212502479553, + "logps/chosen": -49.29076385498047, + "logps/rejected": -80.60540008544922, + "loss": 0.7369, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.931118965148926, + "rewards/margins": 4.739101409912109, + "rewards/rejected": -1.8079825639724731, + "step": 4142 + }, + { + "epoch": 1.04, + "grad_norm": 8.230378150939941, + "learning_rate": 2.1771155267706146e-06, + "logits/chosen": -0.32763129472732544, + "logits/rejected": -0.4102312922477722, + "logps/chosen": -64.9258041381836, + "logps/rejected": -91.56468963623047, + "loss": 0.7963, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6529791355133057, + "rewards/margins": 4.70556116104126, + "rewards/rejected": -2.052581787109375, + "step": 4143 + }, + { + "epoch": 1.04, + "grad_norm": 11.560543060302734, + "learning_rate": 2.174953631786217e-06, + "logits/chosen": -0.4247477352619171, + "logits/rejected": -0.44985896348953247, + "logps/chosen": -45.176788330078125, + "logps/rejected": -94.65486145019531, + "loss": 0.7182, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.11786150932312, + "rewards/margins": 5.742973804473877, + "rewards/rejected": -2.625112533569336, + "step": 4144 + }, + { + "epoch": 1.04, + "grad_norm": 5.93513822555542, + "learning_rate": 2.1727925123380107e-06, + "logits/chosen": -0.3954806923866272, + "logits/rejected": -0.49747440218925476, + "logps/chosen": -61.200836181640625, + "logps/rejected": -77.64501190185547, + "loss": 0.8585, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.015397787094116, + "rewards/margins": 4.3912248611450195, + "rewards/rejected": -1.3758269548416138, + "step": 4145 + }, + { + "epoch": 1.04, + "grad_norm": 3.0903823375701904, + "learning_rate": 2.170632169019263e-06, + "logits/chosen": -0.3728451430797577, + "logits/rejected": -0.4814179539680481, + "logps/chosen": -47.43834686279297, + "logps/rejected": -85.17073059082031, + "loss": 0.5915, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1735951900482178, + "rewards/margins": 5.337952136993408, + "rewards/rejected": -2.1643574237823486, + "step": 4146 + }, + { + "epoch": 1.04, + "grad_norm": 5.172139644622803, + "learning_rate": 2.168472602423039e-06, + "logits/chosen": -0.3145068883895874, + "logits/rejected": -0.39077529311180115, + "logps/chosen": -49.15575408935547, + "logps/rejected": -91.49626159667969, + "loss": 0.7271, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.133267641067505, + "rewards/margins": 5.48801326751709, + "rewards/rejected": -2.354745388031006, + "step": 4147 + }, + { + "epoch": 1.04, + "grad_norm": 3.3746798038482666, + "learning_rate": 2.166313813142185e-06, + "logits/chosen": -0.32293692231178284, + "logits/rejected": -0.4533790946006775, + "logps/chosen": -60.11024475097656, + "logps/rejected": -91.3715591430664, + "loss": 0.7167, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2054991722106934, + "rewards/margins": 5.7996063232421875, + "rewards/rejected": -2.594106674194336, + "step": 4148 + }, + { + "epoch": 1.04, + "grad_norm": 3.154540777206421, + "learning_rate": 2.1641558017693363e-06, + "logits/chosen": -0.39699146151542664, + "logits/rejected": -0.39510419964790344, + "logps/chosen": -50.916786193847656, + "logps/rejected": -89.86465454101562, + "loss": 0.698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3104865550994873, + "rewards/margins": 4.678181171417236, + "rewards/rejected": -1.3676942586898804, + "step": 4149 + }, + { + "epoch": 1.04, + "grad_norm": 3.5347535610198975, + "learning_rate": 2.161998568896912e-06, + "logits/chosen": -0.3034802973270416, + "logits/rejected": -0.4233543872833252, + "logps/chosen": -71.42929077148438, + "logps/rejected": -76.02400970458984, + "loss": 0.7511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8643953800201416, + "rewards/margins": 4.112462997436523, + "rewards/rejected": -1.2480676174163818, + "step": 4150 + }, + { + "epoch": 1.04, + "grad_norm": 4.006984233856201, + "learning_rate": 2.1598421151171184e-06, + "logits/chosen": -0.28836172819137573, + "logits/rejected": -0.41694432497024536, + "logps/chosen": -60.54470443725586, + "logps/rejected": -71.0143814086914, + "loss": 0.7196, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.620131254196167, + "rewards/margins": 4.501465320587158, + "rewards/rejected": -1.8813344240188599, + "step": 4151 + }, + { + "epoch": 1.04, + "grad_norm": 10.842724800109863, + "learning_rate": 2.1576864410219517e-06, + "logits/chosen": -0.33605167269706726, + "logits/rejected": -0.5317645072937012, + "logps/chosen": -65.4944839477539, + "logps/rejected": -67.840087890625, + "loss": 0.736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.035085678100586, + "rewards/margins": 4.738214492797852, + "rewards/rejected": -1.7031292915344238, + "step": 4152 + }, + { + "epoch": 1.04, + "grad_norm": 11.603544235229492, + "learning_rate": 2.155531547203189e-06, + "logits/chosen": -0.2921138107776642, + "logits/rejected": -0.37932223081588745, + "logps/chosen": -59.34735870361328, + "logps/rejected": -92.25411987304688, + "loss": 0.6942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9750564098358154, + "rewards/margins": 4.912825107574463, + "rewards/rejected": -1.937768578529358, + "step": 4153 + }, + { + "epoch": 1.04, + "grad_norm": 5.078220367431641, + "learning_rate": 2.1533774342523956e-06, + "logits/chosen": -0.3353888988494873, + "logits/rejected": -0.4329654574394226, + "logps/chosen": -54.84810256958008, + "logps/rejected": -86.65013122558594, + "loss": 0.6639, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.765748977661133, + "rewards/margins": 4.721843719482422, + "rewards/rejected": -1.9560948610305786, + "step": 4154 + }, + { + "epoch": 1.04, + "grad_norm": 3.7095787525177, + "learning_rate": 2.1512241027609217e-06, + "logits/chosen": -0.3246159851551056, + "logits/rejected": -0.3104925751686096, + "logps/chosen": -57.1494255065918, + "logps/rejected": -98.77151489257812, + "loss": 0.7673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0330450534820557, + "rewards/margins": 4.378637790679932, + "rewards/rejected": -1.345592975616455, + "step": 4155 + }, + { + "epoch": 1.04, + "grad_norm": 4.96579122543335, + "learning_rate": 2.1490715533199014e-06, + "logits/chosen": -0.29522526264190674, + "logits/rejected": -0.3980656564235687, + "logps/chosen": -62.033363342285156, + "logps/rejected": -79.72857666015625, + "loss": 0.7204, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5923755168914795, + "rewards/margins": 4.101178169250488, + "rewards/rejected": -1.5088024139404297, + "step": 4156 + }, + { + "epoch": 1.04, + "grad_norm": 3.632737636566162, + "learning_rate": 2.1469197865202617e-06, + "logits/chosen": -0.37676069140434265, + "logits/rejected": -0.4606528580188751, + "logps/chosen": -52.033260345458984, + "logps/rejected": -81.6336669921875, + "loss": 0.7103, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9861490726470947, + "rewards/margins": 4.093897819519043, + "rewards/rejected": -1.1077487468719482, + "step": 4157 + }, + { + "epoch": 1.04, + "grad_norm": 4.462652206420898, + "learning_rate": 2.144768802952703e-06, + "logits/chosen": -0.30606645345687866, + "logits/rejected": -0.4277248978614807, + "logps/chosen": -46.892459869384766, + "logps/rejected": -78.48682403564453, + "loss": 0.719, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9596452713012695, + "rewards/margins": 4.584859848022461, + "rewards/rejected": -1.6252148151397705, + "step": 4158 + }, + { + "epoch": 1.04, + "grad_norm": 5.561644554138184, + "learning_rate": 2.1426186032077178e-06, + "logits/chosen": -0.38171282410621643, + "logits/rejected": -0.4589618742465973, + "logps/chosen": -48.34014129638672, + "logps/rejected": -106.31683349609375, + "loss": 0.6685, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6379244327545166, + "rewards/margins": 4.760229110717773, + "rewards/rejected": -2.122304916381836, + "step": 4159 + }, + { + "epoch": 1.04, + "grad_norm": 2.7485733032226562, + "learning_rate": 2.1404691878755845e-06, + "logits/chosen": -0.3596116900444031, + "logits/rejected": -0.48454490303993225, + "logps/chosen": -62.59458923339844, + "logps/rejected": -91.61709594726562, + "loss": 0.6516, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.992306709289551, + "rewards/margins": 5.921721458435059, + "rewards/rejected": -2.929414749145508, + "step": 4160 + }, + { + "epoch": 1.04, + "grad_norm": 3.582667827606201, + "learning_rate": 2.138320557546363e-06, + "logits/chosen": -0.3497160077095032, + "logits/rejected": -0.44550618529319763, + "logps/chosen": -56.98736572265625, + "logps/rejected": -82.40211486816406, + "loss": 0.7044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8939340114593506, + "rewards/margins": 5.332921028137207, + "rewards/rejected": -2.4389865398406982, + "step": 4161 + }, + { + "epoch": 1.04, + "grad_norm": 4.5257487297058105, + "learning_rate": 2.1361727128098995e-06, + "logits/chosen": -0.37532368302345276, + "logits/rejected": -0.4680456817150116, + "logps/chosen": -44.52360153198242, + "logps/rejected": -67.86209106445312, + "loss": 0.6646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9105124473571777, + "rewards/margins": 4.837687969207764, + "rewards/rejected": -1.9271750450134277, + "step": 4162 + }, + { + "epoch": 1.04, + "grad_norm": 8.200583457946777, + "learning_rate": 2.1340256542558225e-06, + "logits/chosen": -0.3160116672515869, + "logits/rejected": -0.3809966742992401, + "logps/chosen": -61.0956916809082, + "logps/rejected": -77.58572387695312, + "loss": 0.6859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.955216407775879, + "rewards/margins": 4.6715989112854, + "rewards/rejected": -1.7163827419281006, + "step": 4163 + }, + { + "epoch": 1.04, + "grad_norm": 8.409833908081055, + "learning_rate": 2.131879382473544e-06, + "logits/chosen": -0.3234427869319916, + "logits/rejected": -0.45651760697364807, + "logps/chosen": -62.46489715576172, + "logps/rejected": -77.67230224609375, + "loss": 0.8147, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9678597450256348, + "rewards/margins": 4.281187057495117, + "rewards/rejected": -1.3133273124694824, + "step": 4164 + }, + { + "epoch": 1.04, + "grad_norm": 3.589265823364258, + "learning_rate": 2.1297338980522662e-06, + "logits/chosen": -0.3084392845630646, + "logits/rejected": -0.4418063759803772, + "logps/chosen": -57.45445251464844, + "logps/rejected": -76.89800262451172, + "loss": 0.6293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.883523464202881, + "rewards/margins": 4.8454203605651855, + "rewards/rejected": -1.9618971347808838, + "step": 4165 + }, + { + "epoch": 1.04, + "grad_norm": 6.306605339050293, + "learning_rate": 2.127589201580969e-06, + "logits/chosen": -0.3752853274345398, + "logits/rejected": -0.48642438650131226, + "logps/chosen": -55.12358474731445, + "logps/rejected": -84.41864776611328, + "loss": 0.6931, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5890326499938965, + "rewards/margins": 5.0793280601501465, + "rewards/rejected": -2.490295886993408, + "step": 4166 + }, + { + "epoch": 1.04, + "grad_norm": 5.612700939178467, + "learning_rate": 2.1254452936484176e-06, + "logits/chosen": -0.23847723007202148, + "logits/rejected": -0.3575088381767273, + "logps/chosen": -59.32097625732422, + "logps/rejected": -82.16294860839844, + "loss": 0.7789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.919572353363037, + "rewards/margins": 4.258949279785156, + "rewards/rejected": -1.33937668800354, + "step": 4167 + }, + { + "epoch": 1.04, + "grad_norm": 6.960265636444092, + "learning_rate": 2.123302174843161e-06, + "logits/chosen": -0.3392004072666168, + "logits/rejected": -0.42829275131225586, + "logps/chosen": -51.03608703613281, + "logps/rejected": -98.16966247558594, + "loss": 0.8032, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.756892681121826, + "rewards/margins": 5.038879871368408, + "rewards/rejected": -2.281986951828003, + "step": 4168 + }, + { + "epoch": 1.04, + "grad_norm": 3.8749449253082275, + "learning_rate": 2.1211598457535316e-06, + "logits/chosen": -0.346048504114151, + "logits/rejected": -0.44990479946136475, + "logps/chosen": -58.53242492675781, + "logps/rejected": -80.33580017089844, + "loss": 0.6921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1854865550994873, + "rewards/margins": 5.269083499908447, + "rewards/rejected": -2.083597183227539, + "step": 4169 + }, + { + "epoch": 1.04, + "grad_norm": 7.786284923553467, + "learning_rate": 2.119018306967645e-06, + "logits/chosen": -0.3214455246925354, + "logits/rejected": -0.37108153104782104, + "logps/chosen": -68.58012390136719, + "logps/rejected": -90.4285659790039, + "loss": 0.7302, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.940979242324829, + "rewards/margins": 4.609997749328613, + "rewards/rejected": -1.6690189838409424, + "step": 4170 + }, + { + "epoch": 1.04, + "grad_norm": 4.094120502471924, + "learning_rate": 2.116877559073398e-06, + "logits/chosen": -0.37957799434661865, + "logits/rejected": -0.47344115376472473, + "logps/chosen": -56.676414489746094, + "logps/rejected": -78.84463500976562, + "loss": 0.6599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0359268188476562, + "rewards/margins": 5.000887870788574, + "rewards/rejected": -1.9649611711502075, + "step": 4171 + }, + { + "epoch": 1.04, + "grad_norm": 4.493586540222168, + "learning_rate": 2.114737602658476e-06, + "logits/chosen": -0.2891175448894501, + "logits/rejected": -0.44534918665885925, + "logps/chosen": -76.47052001953125, + "logps/rejected": -92.30755615234375, + "loss": 0.7851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9399971961975098, + "rewards/margins": 5.245966911315918, + "rewards/rejected": -2.3059699535369873, + "step": 4172 + }, + { + "epoch": 1.04, + "grad_norm": 5.934084415435791, + "learning_rate": 2.112598438310341e-06, + "logits/chosen": -0.34768807888031006, + "logits/rejected": -0.4919470548629761, + "logps/chosen": -52.456565856933594, + "logps/rejected": -74.87299346923828, + "loss": 0.6506, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8215198516845703, + "rewards/margins": 5.849993705749512, + "rewards/rejected": -3.0284738540649414, + "step": 4173 + }, + { + "epoch": 1.04, + "grad_norm": 2.1742498874664307, + "learning_rate": 2.11046006661624e-06, + "logits/chosen": -0.31672537326812744, + "logits/rejected": -0.442527174949646, + "logps/chosen": -57.636863708496094, + "logps/rejected": -72.01589965820312, + "loss": 0.6349, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.704946994781494, + "rewards/margins": 4.8811821937561035, + "rewards/rejected": -2.1762359142303467, + "step": 4174 + }, + { + "epoch": 1.04, + "grad_norm": 3.217273235321045, + "learning_rate": 2.108322488163202e-06, + "logits/chosen": -0.28799670934677124, + "logits/rejected": -0.43574047088623047, + "logps/chosen": -62.94721984863281, + "logps/rejected": -88.25334167480469, + "loss": 0.6127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055209159851074, + "rewards/margins": 5.941475868225098, + "rewards/rejected": -2.8862662315368652, + "step": 4175 + }, + { + "epoch": 1.04, + "grad_norm": 5.093685150146484, + "learning_rate": 2.1061857035380364e-06, + "logits/chosen": -0.2784438133239746, + "logits/rejected": -0.41150832176208496, + "logps/chosen": -67.73887634277344, + "logps/rejected": -84.68043518066406, + "loss": 0.8559, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.665292739868164, + "rewards/margins": 5.099391460418701, + "rewards/rejected": -2.434098958969116, + "step": 4176 + }, + { + "epoch": 1.04, + "grad_norm": 3.090667963027954, + "learning_rate": 2.104049713327341e-06, + "logits/chosen": -0.2928708493709564, + "logits/rejected": -0.44881972670555115, + "logps/chosen": -65.01295471191406, + "logps/rejected": -75.57090759277344, + "loss": 0.7308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.913581371307373, + "rewards/margins": 5.451318740844727, + "rewards/rejected": -2.5377376079559326, + "step": 4177 + }, + { + "epoch": 1.05, + "grad_norm": 4.471789360046387, + "learning_rate": 2.10191451811749e-06, + "logits/chosen": -0.33912327885627747, + "logits/rejected": -0.3739122748374939, + "logps/chosen": -51.992122650146484, + "logps/rejected": -89.21601867675781, + "loss": 0.723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8241915702819824, + "rewards/margins": 4.482143402099609, + "rewards/rejected": -1.6579523086547852, + "step": 4178 + }, + { + "epoch": 1.05, + "grad_norm": 3.1203038692474365, + "learning_rate": 2.0997801184946366e-06, + "logits/chosen": -0.40619078278541565, + "logits/rejected": -0.4433519244194031, + "logps/chosen": -50.893341064453125, + "logps/rejected": -88.94049072265625, + "loss": 0.674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.080563545227051, + "rewards/margins": 4.927964210510254, + "rewards/rejected": -1.8474011421203613, + "step": 4179 + }, + { + "epoch": 1.05, + "grad_norm": 5.910259246826172, + "learning_rate": 2.097646515044724e-06, + "logits/chosen": -0.3566378653049469, + "logits/rejected": -0.4213758409023285, + "logps/chosen": -62.815582275390625, + "logps/rejected": -100.29981231689453, + "loss": 0.7454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.744126319885254, + "rewards/margins": 4.802120208740234, + "rewards/rejected": -2.0579938888549805, + "step": 4180 + }, + { + "epoch": 1.05, + "grad_norm": 4.6536712646484375, + "learning_rate": 2.095513708353471e-06, + "logits/chosen": -0.2993577718734741, + "logits/rejected": -0.43386486172676086, + "logps/chosen": -61.605934143066406, + "logps/rejected": -76.00145721435547, + "loss": 0.8316, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7599234580993652, + "rewards/margins": 4.958704948425293, + "rewards/rejected": -2.198781728744507, + "step": 4181 + }, + { + "epoch": 1.05, + "grad_norm": 8.18682861328125, + "learning_rate": 2.0933816990063794e-06, + "logits/chosen": -0.33426645398139954, + "logits/rejected": -0.41124263405799866, + "logps/chosen": -61.980133056640625, + "logps/rejected": -90.60639953613281, + "loss": 0.8012, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.126755475997925, + "rewards/margins": 5.162972927093506, + "rewards/rejected": -2.036216974258423, + "step": 4182 + }, + { + "epoch": 1.05, + "grad_norm": 7.916745662689209, + "learning_rate": 2.0912504875887305e-06, + "logits/chosen": -0.34804579615592957, + "logits/rejected": -0.4978841245174408, + "logps/chosen": -54.26038360595703, + "logps/rejected": -82.68053436279297, + "loss": 0.7396, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.927147150039673, + "rewards/margins": 5.692778587341309, + "rewards/rejected": -2.7656311988830566, + "step": 4183 + }, + { + "epoch": 1.05, + "grad_norm": 5.262049198150635, + "learning_rate": 2.089120074685587e-06, + "logits/chosen": -0.3739336431026459, + "logits/rejected": -0.4550902545452118, + "logps/chosen": -54.59886932373047, + "logps/rejected": -87.95634460449219, + "loss": 0.7028, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0854015350341797, + "rewards/margins": 5.224196910858154, + "rewards/rejected": -2.138796091079712, + "step": 4184 + }, + { + "epoch": 1.05, + "grad_norm": 5.095002174377441, + "learning_rate": 2.086990460881797e-06, + "logits/chosen": -0.35664239525794983, + "logits/rejected": -0.44610321521759033, + "logps/chosen": -63.93337631225586, + "logps/rejected": -92.11275482177734, + "loss": 0.7793, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.927539110183716, + "rewards/margins": 4.483980178833008, + "rewards/rejected": -1.556441307067871, + "step": 4185 + }, + { + "epoch": 1.05, + "grad_norm": 11.652328491210938, + "learning_rate": 2.084861646761983e-06, + "logits/chosen": -0.33025532960891724, + "logits/rejected": -0.47741901874542236, + "logps/chosen": -50.39448165893555, + "logps/rejected": -75.0260238647461, + "loss": 0.6644, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.566524028778076, + "rewards/margins": 4.840060710906982, + "rewards/rejected": -2.2735371589660645, + "step": 4186 + }, + { + "epoch": 1.05, + "grad_norm": 4.55387020111084, + "learning_rate": 2.0827336329105507e-06, + "logits/chosen": -0.39776673913002014, + "logits/rejected": -0.43137478828430176, + "logps/chosen": -38.73616027832031, + "logps/rejected": -90.02761840820312, + "loss": 0.5564, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1801774501800537, + "rewards/margins": 5.205816268920898, + "rewards/rejected": -2.0256388187408447, + "step": 4187 + }, + { + "epoch": 1.05, + "grad_norm": 5.513501167297363, + "learning_rate": 2.080606419911686e-06, + "logits/chosen": -0.31532806158065796, + "logits/rejected": -0.43250373005867004, + "logps/chosen": -70.57172393798828, + "logps/rejected": -85.34720611572266, + "loss": 0.8864, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.805739641189575, + "rewards/margins": 4.65475606918335, + "rewards/rejected": -1.849016547203064, + "step": 4188 + }, + { + "epoch": 1.05, + "grad_norm": 5.4516448974609375, + "learning_rate": 2.0784800083493524e-06, + "logits/chosen": -0.4291974604129791, + "logits/rejected": -0.4754793643951416, + "logps/chosen": -65.23515319824219, + "logps/rejected": -82.59233856201172, + "loss": 0.6961, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1693458557128906, + "rewards/margins": 4.6669511795043945, + "rewards/rejected": -1.4976056814193726, + "step": 4189 + }, + { + "epoch": 1.05, + "grad_norm": 7.660024642944336, + "learning_rate": 2.076354398807302e-06, + "logits/chosen": -0.2931584119796753, + "logits/rejected": -0.3505650758743286, + "logps/chosen": -57.37297821044922, + "logps/rejected": -105.69898223876953, + "loss": 0.6665, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.522533893585205, + "rewards/margins": 5.573154449462891, + "rewards/rejected": -3.0506207942962646, + "step": 4190 + }, + { + "epoch": 1.05, + "grad_norm": 5.970008373260498, + "learning_rate": 2.0742295918690527e-06, + "logits/chosen": -0.22791561484336853, + "logits/rejected": -0.3845522403717041, + "logps/chosen": -64.86534118652344, + "logps/rejected": -76.38709259033203, + "loss": 0.6288, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.369487762451172, + "rewards/margins": 5.256533622741699, + "rewards/rejected": -1.8870460987091064, + "step": 4191 + }, + { + "epoch": 1.05, + "grad_norm": 6.986112594604492, + "learning_rate": 2.0721055881179154e-06, + "logits/chosen": -0.3087846040725708, + "logits/rejected": -0.4008265733718872, + "logps/chosen": -61.60113525390625, + "logps/rejected": -80.5491714477539, + "loss": 0.7799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.053537368774414, + "rewards/margins": 4.1971564292907715, + "rewards/rejected": -1.143619418144226, + "step": 4192 + }, + { + "epoch": 1.05, + "grad_norm": 3.0572409629821777, + "learning_rate": 2.0699823881369726e-06, + "logits/chosen": -0.3713614046573639, + "logits/rejected": -0.5542166233062744, + "logps/chosen": -58.56684875488281, + "logps/rejected": -73.12386322021484, + "loss": 0.7483, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0059823989868164, + "rewards/margins": 5.584698677062988, + "rewards/rejected": -2.578716278076172, + "step": 4193 + }, + { + "epoch": 1.05, + "grad_norm": 5.621934413909912, + "learning_rate": 2.067859992509089e-06, + "logits/chosen": -0.4130127429962158, + "logits/rejected": -0.4950541853904724, + "logps/chosen": -49.09922790527344, + "logps/rejected": -89.678955078125, + "loss": 0.6709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0694568157196045, + "rewards/margins": 6.1418609619140625, + "rewards/rejected": -3.072403907775879, + "step": 4194 + }, + { + "epoch": 1.05, + "grad_norm": 5.872781753540039, + "learning_rate": 2.065738401816909e-06, + "logits/chosen": -0.3713805079460144, + "logits/rejected": -0.5063568353652954, + "logps/chosen": -64.17755126953125, + "logps/rejected": -80.80305480957031, + "loss": 0.7821, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1857457160949707, + "rewards/margins": 4.771900653839111, + "rewards/rejected": -1.5861549377441406, + "step": 4195 + }, + { + "epoch": 1.05, + "grad_norm": 3.6312150955200195, + "learning_rate": 2.063617616642852e-06, + "logits/chosen": -0.3502538800239563, + "logits/rejected": -0.42979249358177185, + "logps/chosen": -54.7042236328125, + "logps/rejected": -79.05887603759766, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8295884132385254, + "rewards/margins": 4.798058032989502, + "rewards/rejected": -1.968469500541687, + "step": 4196 + }, + { + "epoch": 1.05, + "grad_norm": 5.3960490226745605, + "learning_rate": 2.0614976375691226e-06, + "logits/chosen": -0.35436558723449707, + "logits/rejected": -0.46384677290916443, + "logps/chosen": -65.50621032714844, + "logps/rejected": -79.58280944824219, + "loss": 0.7071, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8952131271362305, + "rewards/margins": 4.809220314025879, + "rewards/rejected": -1.9140070676803589, + "step": 4197 + }, + { + "epoch": 1.05, + "grad_norm": 7.231687545776367, + "learning_rate": 2.0593784651777014e-06, + "logits/chosen": -0.35105660557746887, + "logits/rejected": -0.46192219853401184, + "logps/chosen": -61.991268157958984, + "logps/rejected": -63.96434783935547, + "loss": 0.8389, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6856393814086914, + "rewards/margins": 4.355037212371826, + "rewards/rejected": -1.669398307800293, + "step": 4198 + }, + { + "epoch": 1.05, + "grad_norm": 3.847219228744507, + "learning_rate": 2.0572601000503424e-06, + "logits/chosen": -0.37060609459877014, + "logits/rejected": -0.48520588874816895, + "logps/chosen": -53.15912628173828, + "logps/rejected": -89.75264739990234, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9945120811462402, + "rewards/margins": 4.762970447540283, + "rewards/rejected": -1.7684577703475952, + "step": 4199 + }, + { + "epoch": 1.05, + "grad_norm": 5.579522609710693, + "learning_rate": 2.0551425427685862e-06, + "logits/chosen": -0.3167015314102173, + "logits/rejected": -0.4387477934360504, + "logps/chosen": -69.35149383544922, + "logps/rejected": -73.76427459716797, + "loss": 0.7502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.920677661895752, + "rewards/margins": 4.888302803039551, + "rewards/rejected": -1.9676251411437988, + "step": 4200 + }, + { + "epoch": 1.05, + "grad_norm": 3.2539849281311035, + "learning_rate": 2.0530257939137456e-06, + "logits/chosen": -0.386353462934494, + "logits/rejected": -0.4873509109020233, + "logps/chosen": -54.886932373046875, + "logps/rejected": -80.67740631103516, + "loss": 0.7246, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.73503041267395, + "rewards/margins": 4.703081130981445, + "rewards/rejected": -1.9680505990982056, + "step": 4201 + }, + { + "epoch": 1.05, + "grad_norm": 8.747539520263672, + "learning_rate": 2.0509098540669187e-06, + "logits/chosen": -0.3450492322444916, + "logits/rejected": -0.4097300171852112, + "logps/chosen": -53.60139083862305, + "logps/rejected": -89.6034927368164, + "loss": 0.8295, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8586618900299072, + "rewards/margins": 4.067398548126221, + "rewards/rejected": -1.2087361812591553, + "step": 4202 + }, + { + "epoch": 1.05, + "grad_norm": 2.9583187103271484, + "learning_rate": 2.0487947238089713e-06, + "logits/chosen": -0.37214046716690063, + "logits/rejected": -0.43906739354133606, + "logps/chosen": -50.787193298339844, + "logps/rejected": -96.3465347290039, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.811215877532959, + "rewards/margins": 4.457350254058838, + "rewards/rejected": -1.6461342573165894, + "step": 4203 + }, + { + "epoch": 1.05, + "grad_norm": 4.526684284210205, + "learning_rate": 2.0466804037205527e-06, + "logits/chosen": -0.3243870437145233, + "logits/rejected": -0.4939122200012207, + "logps/chosen": -64.79532623291016, + "logps/rejected": -76.23798370361328, + "loss": 0.8771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1051559448242188, + "rewards/margins": 4.611639976501465, + "rewards/rejected": -1.5064842700958252, + "step": 4204 + }, + { + "epoch": 1.05, + "grad_norm": 5.465481758117676, + "learning_rate": 2.044566894382092e-06, + "logits/chosen": -0.35260215401649475, + "logits/rejected": -0.4567059278488159, + "logps/chosen": -58.75065994262695, + "logps/rejected": -82.87142181396484, + "loss": 0.7086, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2177977561950684, + "rewards/margins": 5.708988666534424, + "rewards/rejected": -2.4911909103393555, + "step": 4205 + }, + { + "epoch": 1.05, + "grad_norm": 5.6962666511535645, + "learning_rate": 2.0424541963737933e-06, + "logits/chosen": -0.3813154399394989, + "logits/rejected": -0.454261839389801, + "logps/chosen": -50.691253662109375, + "logps/rejected": -83.39189147949219, + "loss": 0.7083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0967676639556885, + "rewards/margins": 5.100179672241211, + "rewards/rejected": -2.0034120082855225, + "step": 4206 + }, + { + "epoch": 1.05, + "grad_norm": 3.734311580657959, + "learning_rate": 2.0403423102756352e-06, + "logits/chosen": -0.4591209888458252, + "logits/rejected": -0.5418011546134949, + "logps/chosen": -53.79823684692383, + "logps/rejected": -77.7030029296875, + "loss": 0.7577, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4687139987945557, + "rewards/margins": 4.9282450675964355, + "rewards/rejected": -2.459531307220459, + "step": 4207 + }, + { + "epoch": 1.05, + "grad_norm": 6.3014116287231445, + "learning_rate": 2.0382312366673777e-06, + "logits/chosen": -0.39658021926879883, + "logits/rejected": -0.40025395154953003, + "logps/chosen": -49.971981048583984, + "logps/rejected": -92.32331085205078, + "loss": 0.8688, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.948512315750122, + "rewards/margins": 4.455679893493652, + "rewards/rejected": -1.5071673393249512, + "step": 4208 + }, + { + "epoch": 1.05, + "grad_norm": 5.668275833129883, + "learning_rate": 2.0361209761285543e-06, + "logits/chosen": -0.3134738802909851, + "logits/rejected": -0.4173049032688141, + "logps/chosen": -69.77069091796875, + "logps/rejected": -94.99938201904297, + "loss": 0.7235, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9126036167144775, + "rewards/margins": 4.199008464813232, + "rewards/rejected": -1.286405086517334, + "step": 4209 + }, + { + "epoch": 1.05, + "grad_norm": 7.416377067565918, + "learning_rate": 2.0340115292384823e-06, + "logits/chosen": -0.3002015948295593, + "logits/rejected": -0.4080890119075775, + "logps/chosen": -61.987693786621094, + "logps/rejected": -80.94891357421875, + "loss": 0.727, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.989243984222412, + "rewards/margins": 5.41463041305542, + "rewards/rejected": -2.425386667251587, + "step": 4210 + }, + { + "epoch": 1.05, + "grad_norm": 4.41240119934082, + "learning_rate": 2.0319028965762423e-06, + "logits/chosen": -0.2764563262462616, + "logits/rejected": -0.4038674533367157, + "logps/chosen": -57.3919563293457, + "logps/rejected": -75.10200500488281, + "loss": 0.7559, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.658966064453125, + "rewards/margins": 5.04898738861084, + "rewards/rejected": -2.390021800994873, + "step": 4211 + }, + { + "epoch": 1.05, + "grad_norm": 6.5182366371154785, + "learning_rate": 2.029795078720705e-06, + "logits/chosen": -0.34868448972702026, + "logits/rejected": -0.43597209453582764, + "logps/chosen": -61.0897216796875, + "logps/rejected": -84.75, + "loss": 0.7692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7929294109344482, + "rewards/margins": 4.862147331237793, + "rewards/rejected": -2.069218397140503, + "step": 4212 + }, + { + "epoch": 1.05, + "grad_norm": 3.465420722961426, + "learning_rate": 2.0276880762505113e-06, + "logits/chosen": -0.27309736609458923, + "logits/rejected": -0.4045567512512207, + "logps/chosen": -63.53649139404297, + "logps/rejected": -78.63056945800781, + "loss": 0.6992, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.109426975250244, + "rewards/margins": 5.00368595123291, + "rewards/rejected": -1.8942586183547974, + "step": 4213 + }, + { + "epoch": 1.05, + "grad_norm": 4.5645246505737305, + "learning_rate": 2.0255818897440778e-06, + "logits/chosen": -0.3410949110984802, + "logits/rejected": -0.4483347535133362, + "logps/chosen": -69.03636169433594, + "logps/rejected": -84.15873718261719, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8679356575012207, + "rewards/margins": 4.880005836486816, + "rewards/rejected": -2.0120699405670166, + "step": 4214 + }, + { + "epoch": 1.05, + "grad_norm": 8.980814933776855, + "learning_rate": 2.0234765197795988e-06, + "logits/chosen": -0.2532375752925873, + "logits/rejected": -0.4252324104309082, + "logps/chosen": -72.21200561523438, + "logps/rejected": -72.40104675292969, + "loss": 0.8383, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6640498638153076, + "rewards/margins": 4.3259758949279785, + "rewards/rejected": -1.6619261503219604, + "step": 4215 + }, + { + "epoch": 1.05, + "grad_norm": 3.081732749938965, + "learning_rate": 2.0213719669350414e-06, + "logits/chosen": -0.31860804557800293, + "logits/rejected": -0.46023160219192505, + "logps/chosen": -54.47492980957031, + "logps/rejected": -99.13365173339844, + "loss": 0.6623, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9813098907470703, + "rewards/margins": 6.240540027618408, + "rewards/rejected": -3.259229898452759, + "step": 4216 + }, + { + "epoch": 1.05, + "grad_norm": 7.034800052642822, + "learning_rate": 2.019268231788154e-06, + "logits/chosen": -0.37530556321144104, + "logits/rejected": -0.4305632412433624, + "logps/chosen": -47.84425354003906, + "logps/rejected": -93.14311218261719, + "loss": 0.7034, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9437077045440674, + "rewards/margins": 5.449385643005371, + "rewards/rejected": -2.5056779384613037, + "step": 4217 + }, + { + "epoch": 1.06, + "grad_norm": 9.496491432189941, + "learning_rate": 2.0171653149164567e-06, + "logits/chosen": -0.43971192836761475, + "logits/rejected": -0.5527587532997131, + "logps/chosen": -48.01076889038086, + "logps/rejected": -75.63565826416016, + "loss": 0.6891, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7392418384552, + "rewards/margins": 5.488905429840088, + "rewards/rejected": -2.749663829803467, + "step": 4218 + }, + { + "epoch": 1.06, + "grad_norm": 7.356509685516357, + "learning_rate": 2.015063216897245e-06, + "logits/chosen": -0.27624380588531494, + "logits/rejected": -0.4067908525466919, + "logps/chosen": -60.14994430541992, + "logps/rejected": -79.80345916748047, + "loss": 0.648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.802800416946411, + "rewards/margins": 5.506402969360352, + "rewards/rejected": -2.7036027908325195, + "step": 4219 + }, + { + "epoch": 1.06, + "grad_norm": 3.8976292610168457, + "learning_rate": 2.01296193830759e-06, + "logits/chosen": -0.24968449771404266, + "logits/rejected": -0.3418295085430145, + "logps/chosen": -53.46535873413086, + "logps/rejected": -76.36051940917969, + "loss": 0.6871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.783693313598633, + "rewards/margins": 4.170723915100098, + "rewards/rejected": -1.3870303630828857, + "step": 4220 + }, + { + "epoch": 1.06, + "grad_norm": 4.344912528991699, + "learning_rate": 2.0108614797243366e-06, + "logits/chosen": -0.35053855180740356, + "logits/rejected": -0.43434691429138184, + "logps/chosen": -58.03929138183594, + "logps/rejected": -82.09040832519531, + "loss": 0.7007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9436843395233154, + "rewards/margins": 4.548053741455078, + "rewards/rejected": -1.6043694019317627, + "step": 4221 + }, + { + "epoch": 1.06, + "grad_norm": 2.626922607421875, + "learning_rate": 2.008761841724111e-06, + "logits/chosen": -0.3428693413734436, + "logits/rejected": -0.46652752161026, + "logps/chosen": -66.52533721923828, + "logps/rejected": -67.44196319580078, + "loss": 0.6965, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1890151500701904, + "rewards/margins": 5.022493362426758, + "rewards/rejected": -1.8334779739379883, + "step": 4222 + }, + { + "epoch": 1.06, + "grad_norm": 6.497470378875732, + "learning_rate": 2.0066630248833036e-06, + "logits/chosen": -0.3013148307800293, + "logits/rejected": -0.4254869520664215, + "logps/chosen": -77.54335021972656, + "logps/rejected": -72.88448333740234, + "loss": 0.7985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8622169494628906, + "rewards/margins": 4.919721603393555, + "rewards/rejected": -2.057504653930664, + "step": 4223 + }, + { + "epoch": 1.06, + "grad_norm": 5.715848445892334, + "learning_rate": 2.004565029778085e-06, + "logits/chosen": -0.3668522834777832, + "logits/rejected": -0.4166325330734253, + "logps/chosen": -54.32516098022461, + "logps/rejected": -75.21451568603516, + "loss": 0.79, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.663729190826416, + "rewards/margins": 4.1951494216918945, + "rewards/rejected": -1.5314207077026367, + "step": 4224 + }, + { + "epoch": 1.06, + "grad_norm": 6.975030422210693, + "learning_rate": 2.002467856984404e-06, + "logits/chosen": -0.37897348403930664, + "logits/rejected": -0.4008736312389374, + "logps/chosen": -55.562034606933594, + "logps/rejected": -108.15376281738281, + "loss": 0.7877, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.961214303970337, + "rewards/margins": 4.658725738525391, + "rewards/rejected": -1.6975115537643433, + "step": 4225 + }, + { + "epoch": 1.06, + "grad_norm": 3.5831193923950195, + "learning_rate": 2.0003715070779783e-06, + "logits/chosen": -0.29327234625816345, + "logits/rejected": -0.30503490567207336, + "logps/chosen": -63.03934860229492, + "logps/rejected": -89.38764953613281, + "loss": 0.7646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.062291145324707, + "rewards/margins": 4.833263874053955, + "rewards/rejected": -1.7709729671478271, + "step": 4226 + }, + { + "epoch": 1.06, + "grad_norm": 2.956294059753418, + "learning_rate": 1.9982759806343003e-06, + "logits/chosen": -0.39893069863319397, + "logits/rejected": -0.5353729724884033, + "logps/chosen": -66.31201171875, + "logps/rejected": -93.7181167602539, + "loss": 0.6555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930490016937256, + "rewards/margins": 5.1387481689453125, + "rewards/rejected": -2.2082581520080566, + "step": 4227 + }, + { + "epoch": 1.06, + "grad_norm": 5.982693672180176, + "learning_rate": 1.9961812782286384e-06, + "logits/chosen": -0.38380861282348633, + "logits/rejected": -0.4407498836517334, + "logps/chosen": -51.179473876953125, + "logps/rejected": -89.3271484375, + "loss": 0.6997, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6671178340911865, + "rewards/margins": 5.068367958068848, + "rewards/rejected": -2.4012503623962402, + "step": 4228 + }, + { + "epoch": 1.06, + "grad_norm": 3.5190069675445557, + "learning_rate": 1.9940874004360307e-06, + "logits/chosen": -0.31082260608673096, + "logits/rejected": -0.4022633731365204, + "logps/chosen": -59.432411193847656, + "logps/rejected": -88.6788558959961, + "loss": 0.6651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7840843200683594, + "rewards/margins": 4.863615989685059, + "rewards/rejected": -2.079531192779541, + "step": 4229 + }, + { + "epoch": 1.06, + "grad_norm": 3.8741021156311035, + "learning_rate": 1.991994347831296e-06, + "logits/chosen": -0.30577147006988525, + "logits/rejected": -0.4103730320930481, + "logps/chosen": -54.935585021972656, + "logps/rejected": -96.3592529296875, + "loss": 0.6577, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.073054552078247, + "rewards/margins": 5.77653694152832, + "rewards/rejected": -2.7034823894500732, + "step": 4230 + }, + { + "epoch": 1.06, + "grad_norm": 5.913698196411133, + "learning_rate": 1.98990212098902e-06, + "logits/chosen": -0.3404174745082855, + "logits/rejected": -0.43745821714401245, + "logps/chosen": -71.71145629882812, + "logps/rejected": -76.18279266357422, + "loss": 0.6758, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.877901077270508, + "rewards/margins": 4.33026647567749, + "rewards/rejected": -1.4523651599884033, + "step": 4231 + }, + { + "epoch": 1.06, + "grad_norm": 3.5556681156158447, + "learning_rate": 1.987810720483566e-06, + "logits/chosen": -0.4128049910068512, + "logits/rejected": -0.5513189435005188, + "logps/chosen": -51.551021575927734, + "logps/rejected": -76.53687286376953, + "loss": 0.6004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0222489833831787, + "rewards/margins": 5.7192816734313965, + "rewards/rejected": -2.6970324516296387, + "step": 4232 + }, + { + "epoch": 1.06, + "grad_norm": 4.8788018226623535, + "learning_rate": 1.985720146889065e-06, + "logits/chosen": -0.3685965836048126, + "logits/rejected": -0.4306629002094269, + "logps/chosen": -46.277462005615234, + "logps/rejected": -88.85394287109375, + "loss": 0.6965, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1917471885681152, + "rewards/margins": 4.764066219329834, + "rewards/rejected": -1.5723185539245605, + "step": 4233 + }, + { + "epoch": 1.06, + "grad_norm": 5.8661980628967285, + "learning_rate": 1.9836304007794267e-06, + "logits/chosen": -0.3123489022254944, + "logits/rejected": -0.3877847492694855, + "logps/chosen": -61.3140754699707, + "logps/rejected": -92.04638671875, + "loss": 0.7961, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8473620414733887, + "rewards/margins": 4.187758922576904, + "rewards/rejected": -1.3403964042663574, + "step": 4234 + }, + { + "epoch": 1.06, + "grad_norm": 4.122550964355469, + "learning_rate": 1.9815414827283343e-06, + "logits/chosen": -0.3817985951900482, + "logits/rejected": -0.40828561782836914, + "logps/chosen": -44.30525588989258, + "logps/rejected": -87.38440704345703, + "loss": 0.7142, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0131449699401855, + "rewards/margins": 4.366415500640869, + "rewards/rejected": -1.353271245956421, + "step": 4235 + }, + { + "epoch": 1.06, + "grad_norm": 3.3644204139709473, + "learning_rate": 1.979453393309235e-06, + "logits/chosen": -0.4021822214126587, + "logits/rejected": -0.49592941999435425, + "logps/chosen": -52.08270263671875, + "logps/rejected": -85.20948791503906, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.235530138015747, + "rewards/margins": 5.042586803436279, + "rewards/rejected": -1.8070565462112427, + "step": 4236 + }, + { + "epoch": 1.06, + "grad_norm": 4.951471328735352, + "learning_rate": 1.977366133095358e-06, + "logits/chosen": -0.3780467212200165, + "logits/rejected": -0.5134600400924683, + "logps/chosen": -55.40147018432617, + "logps/rejected": -81.0656967163086, + "loss": 0.6584, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8042678833007812, + "rewards/margins": 5.706135272979736, + "rewards/rejected": -2.901867151260376, + "step": 4237 + }, + { + "epoch": 1.06, + "grad_norm": 5.331997394561768, + "learning_rate": 1.975279702659702e-06, + "logits/chosen": -0.33978381752967834, + "logits/rejected": -0.4589914381504059, + "logps/chosen": -50.79798889160156, + "logps/rejected": -84.94429016113281, + "loss": 0.7653, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8850886821746826, + "rewards/margins": 4.804281711578369, + "rewards/rejected": -1.919192910194397, + "step": 4238 + }, + { + "epoch": 1.06, + "grad_norm": 14.007445335388184, + "learning_rate": 1.9731941025750354e-06, + "logits/chosen": -0.43574827909469604, + "logits/rejected": -0.44549235701560974, + "logps/chosen": -49.02457046508789, + "logps/rejected": -85.58538055419922, + "loss": 0.7113, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7928826808929443, + "rewards/margins": 4.698892593383789, + "rewards/rejected": -1.9060102701187134, + "step": 4239 + }, + { + "epoch": 1.06, + "grad_norm": 8.0934476852417, + "learning_rate": 1.9711093334139013e-06, + "logits/chosen": -0.3594510853290558, + "logits/rejected": -0.3779701590538025, + "logps/chosen": -48.312992095947266, + "logps/rejected": -98.27159881591797, + "loss": 0.9026, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6328511238098145, + "rewards/margins": 3.844329357147217, + "rewards/rejected": -1.2114777565002441, + "step": 4240 + }, + { + "epoch": 1.06, + "grad_norm": 5.9229416847229, + "learning_rate": 1.9690253957486116e-06, + "logits/chosen": -0.4420034885406494, + "logits/rejected": -0.5086857676506042, + "logps/chosen": -48.3040885925293, + "logps/rejected": -82.92781066894531, + "loss": 0.8092, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7227306365966797, + "rewards/margins": 5.167177200317383, + "rewards/rejected": -2.4444468021392822, + "step": 4241 + }, + { + "epoch": 1.06, + "grad_norm": 4.631107807159424, + "learning_rate": 1.966942290151256e-06, + "logits/chosen": -0.39047881960868835, + "logits/rejected": -0.48805734515190125, + "logps/chosen": -57.3701286315918, + "logps/rejected": -78.39305114746094, + "loss": 0.8102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7248928546905518, + "rewards/margins": 4.9613213539123535, + "rewards/rejected": -2.2364282608032227, + "step": 4242 + }, + { + "epoch": 1.06, + "grad_norm": 25.084598541259766, + "learning_rate": 1.964860017193691e-06, + "logits/chosen": -0.42459022998809814, + "logits/rejected": -0.48186826705932617, + "logps/chosen": -53.923492431640625, + "logps/rejected": -80.508544921875, + "loss": 0.8103, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5248661041259766, + "rewards/margins": 3.296215772628784, + "rewards/rejected": -0.7713495492935181, + "step": 4243 + }, + { + "epoch": 1.06, + "grad_norm": 4.015562534332275, + "learning_rate": 1.9627785774475415e-06, + "logits/chosen": -0.39636939764022827, + "logits/rejected": -0.4971121847629547, + "logps/chosen": -49.64512634277344, + "logps/rejected": -78.03958892822266, + "loss": 0.6046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9573488235473633, + "rewards/margins": 4.569887638092041, + "rewards/rejected": -1.6125386953353882, + "step": 4244 + }, + { + "epoch": 1.06, + "grad_norm": 7.088048458099365, + "learning_rate": 1.9606979714842124e-06, + "logits/chosen": -0.398098349571228, + "logits/rejected": -0.409315288066864, + "logps/chosen": -57.50801086425781, + "logps/rejected": -95.08065795898438, + "loss": 0.7602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.034271717071533, + "rewards/margins": 4.70993709564209, + "rewards/rejected": -1.675665259361267, + "step": 4245 + }, + { + "epoch": 1.06, + "grad_norm": 2.693011999130249, + "learning_rate": 1.958618199874871e-06, + "logits/chosen": -0.27541762590408325, + "logits/rejected": -0.3747778534889221, + "logps/chosen": -62.519500732421875, + "logps/rejected": -88.06879425048828, + "loss": 0.7378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9838688373565674, + "rewards/margins": 5.288022041320801, + "rewards/rejected": -2.304152727127075, + "step": 4246 + }, + { + "epoch": 1.06, + "grad_norm": 7.3076558113098145, + "learning_rate": 1.956539263190465e-06, + "logits/chosen": -0.3516691029071808, + "logits/rejected": -0.4923996925354004, + "logps/chosen": -55.70928955078125, + "logps/rejected": -73.74468994140625, + "loss": 0.7251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7538602352142334, + "rewards/margins": 4.698493957519531, + "rewards/rejected": -1.9446336030960083, + "step": 4247 + }, + { + "epoch": 1.06, + "grad_norm": 9.996378898620605, + "learning_rate": 1.954461162001703e-06, + "logits/chosen": -0.4065600037574768, + "logits/rejected": -0.4593852162361145, + "logps/chosen": -59.69013214111328, + "logps/rejected": -81.66999816894531, + "loss": 0.8328, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.72664475440979, + "rewards/margins": 4.0491557121276855, + "rewards/rejected": -1.3225111961364746, + "step": 4248 + }, + { + "epoch": 1.06, + "grad_norm": 14.79979133605957, + "learning_rate": 1.952383896879067e-06, + "logits/chosen": -0.32477331161499023, + "logits/rejected": -0.4800765812397003, + "logps/chosen": -67.11639404296875, + "logps/rejected": -77.30207061767578, + "loss": 0.779, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.877033233642578, + "rewards/margins": 4.21546745300293, + "rewards/rejected": -1.3384343385696411, + "step": 4249 + }, + { + "epoch": 1.06, + "grad_norm": 4.74691104888916, + "learning_rate": 1.950307468392816e-06, + "logits/chosen": -0.36656054854393005, + "logits/rejected": -0.5042107105255127, + "logps/chosen": -52.65644454956055, + "logps/rejected": -78.32839965820312, + "loss": 0.6788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7740907669067383, + "rewards/margins": 5.376935005187988, + "rewards/rejected": -2.602843999862671, + "step": 4250 + }, + { + "epoch": 1.06, + "grad_norm": 9.87234878540039, + "learning_rate": 1.9482318771129715e-06, + "logits/chosen": -0.3249528408050537, + "logits/rejected": -0.3942694664001465, + "logps/chosen": -69.07438659667969, + "logps/rejected": -85.09703826904297, + "loss": 0.8735, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.944664478302002, + "rewards/margins": 4.412812232971191, + "rewards/rejected": -1.4681476354599, + "step": 4251 + }, + { + "epoch": 1.06, + "grad_norm": 4.592178821563721, + "learning_rate": 1.946157123609329e-06, + "logits/chosen": -0.2757207453250885, + "logits/rejected": -0.3548077344894409, + "logps/chosen": -64.70665740966797, + "logps/rejected": -93.76066589355469, + "loss": 0.7434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7631449699401855, + "rewards/margins": 4.331523895263672, + "rewards/rejected": -1.5683780908584595, + "step": 4252 + }, + { + "epoch": 1.06, + "grad_norm": 21.88436508178711, + "learning_rate": 1.9440832084514527e-06, + "logits/chosen": -0.3371504545211792, + "logits/rejected": -0.42769068479537964, + "logps/chosen": -53.05015563964844, + "logps/rejected": -84.77001190185547, + "loss": 0.7617, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.85136079788208, + "rewards/margins": 4.392456531524658, + "rewards/rejected": -1.5410957336425781, + "step": 4253 + }, + { + "epoch": 1.06, + "grad_norm": 5.11411190032959, + "learning_rate": 1.9420101322086756e-06, + "logits/chosen": -0.34248340129852295, + "logits/rejected": -0.44600746035575867, + "logps/chosen": -53.65550994873047, + "logps/rejected": -92.04095458984375, + "loss": 0.7041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.058448553085327, + "rewards/margins": 5.183839797973633, + "rewards/rejected": -2.1253910064697266, + "step": 4254 + }, + { + "epoch": 1.06, + "grad_norm": 5.758103847503662, + "learning_rate": 1.9399378954501067e-06, + "logits/chosen": -0.24320945143699646, + "logits/rejected": -0.4048892855644226, + "logps/chosen": -71.89044189453125, + "logps/rejected": -102.7971420288086, + "loss": 0.6768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7379794120788574, + "rewards/margins": 4.748260021209717, + "rewards/rejected": -2.0102806091308594, + "step": 4255 + }, + { + "epoch": 1.06, + "grad_norm": 2.6962080001831055, + "learning_rate": 1.9378664987446135e-06, + "logits/chosen": -0.3495941162109375, + "logits/rejected": -0.3703433871269226, + "logps/chosen": -52.25103759765625, + "logps/rejected": -90.16893005371094, + "loss": 0.6684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2061595916748047, + "rewards/margins": 5.138517379760742, + "rewards/rejected": -1.932357668876648, + "step": 4256 + }, + { + "epoch": 1.06, + "grad_norm": 3.1125240325927734, + "learning_rate": 1.935795942660843e-06, + "logits/chosen": -0.3953377306461334, + "logits/rejected": -0.5377448201179504, + "logps/chosen": -60.23933792114258, + "logps/rejected": -75.56596374511719, + "loss": 0.7194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7619426250457764, + "rewards/margins": 5.132803440093994, + "rewards/rejected": -2.3708603382110596, + "step": 4257 + }, + { + "epoch": 1.07, + "grad_norm": 5.081161022186279, + "learning_rate": 1.933726227767207e-06, + "logits/chosen": -0.4042113423347473, + "logits/rejected": -0.49369189143180847, + "logps/chosen": -68.22039031982422, + "logps/rejected": -87.2086410522461, + "loss": 0.6943, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1510844230651855, + "rewards/margins": 5.197333335876465, + "rewards/rejected": -2.0462493896484375, + "step": 4258 + }, + { + "epoch": 1.07, + "grad_norm": 6.107014179229736, + "learning_rate": 1.9316573546318863e-06, + "logits/chosen": -0.31858089566230774, + "logits/rejected": -0.40870559215545654, + "logps/chosen": -57.042449951171875, + "logps/rejected": -88.15119171142578, + "loss": 0.9221, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.799342632293701, + "rewards/margins": 4.014825820922852, + "rewards/rejected": -1.2154828310012817, + "step": 4259 + }, + { + "epoch": 1.07, + "grad_norm": 2.9562525749206543, + "learning_rate": 1.9295893238228316e-06, + "logits/chosen": -0.3356732726097107, + "logits/rejected": -0.4352226257324219, + "logps/chosen": -47.078792572021484, + "logps/rejected": -86.12049865722656, + "loss": 0.6167, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.055905342102051, + "rewards/margins": 5.67655086517334, + "rewards/rejected": -2.6206459999084473, + "step": 4260 + }, + { + "epoch": 1.07, + "grad_norm": 10.845499992370605, + "learning_rate": 1.9275221359077594e-06, + "logits/chosen": -0.3182484209537506, + "logits/rejected": -0.42084378004074097, + "logps/chosen": -60.85720443725586, + "logps/rejected": -79.61268615722656, + "loss": 0.9376, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.892523765563965, + "rewards/margins": 3.6411051750183105, + "rewards/rejected": -0.7485814690589905, + "step": 4261 + }, + { + "epoch": 1.07, + "grad_norm": 5.148682594299316, + "learning_rate": 1.925455791454162e-06, + "logits/chosen": -0.3423851430416107, + "logits/rejected": -0.4992641806602478, + "logps/chosen": -71.08154296875, + "logps/rejected": -82.56492614746094, + "loss": 0.7563, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.880725383758545, + "rewards/margins": 4.359825134277344, + "rewards/rejected": -1.4791001081466675, + "step": 4262 + }, + { + "epoch": 1.07, + "grad_norm": 5.46238899230957, + "learning_rate": 1.923390291029293e-06, + "logits/chosen": -0.38539543747901917, + "logits/rejected": -0.4801410436630249, + "logps/chosen": -56.42168045043945, + "logps/rejected": -70.93600463867188, + "loss": 0.7311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.050330400466919, + "rewards/margins": 5.061906337738037, + "rewards/rejected": -2.01157546043396, + "step": 4263 + }, + { + "epoch": 1.07, + "grad_norm": 4.836381912231445, + "learning_rate": 1.9213256352001777e-06, + "logits/chosen": -0.3965969979763031, + "logits/rejected": -0.4846331477165222, + "logps/chosen": -52.505348205566406, + "logps/rejected": -84.75848388671875, + "loss": 0.8027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051054000854492, + "rewards/margins": 5.6917619705200195, + "rewards/rejected": -2.640707492828369, + "step": 4264 + }, + { + "epoch": 1.07, + "grad_norm": 4.665035724639893, + "learning_rate": 1.9192618245336075e-06, + "logits/chosen": -0.3882979452610016, + "logits/rejected": -0.4343700706958771, + "logps/chosen": -65.44915008544922, + "logps/rejected": -84.9524917602539, + "loss": 0.7972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.596116065979004, + "rewards/margins": 4.37267541885376, + "rewards/rejected": -1.7765593528747559, + "step": 4265 + }, + { + "epoch": 1.07, + "grad_norm": 4.738111972808838, + "learning_rate": 1.9171988595961416e-06, + "logits/chosen": -0.295387327671051, + "logits/rejected": -0.3824092745780945, + "logps/chosen": -52.3939208984375, + "logps/rejected": -88.05642700195312, + "loss": 0.5971, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9879682064056396, + "rewards/margins": 5.262635707855225, + "rewards/rejected": -2.2746665477752686, + "step": 4266 + }, + { + "epoch": 1.07, + "grad_norm": 3.2119855880737305, + "learning_rate": 1.9151367409541144e-06, + "logits/chosen": -0.3848578631877899, + "logits/rejected": -0.49542349576950073, + "logps/chosen": -59.47362518310547, + "logps/rejected": -97.99498748779297, + "loss": 0.6188, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.898040294647217, + "rewards/margins": 6.092189788818359, + "rewards/rejected": -3.194149971008301, + "step": 4267 + }, + { + "epoch": 1.07, + "grad_norm": 7.767975330352783, + "learning_rate": 1.9130754691736157e-06, + "logits/chosen": -0.383720725774765, + "logits/rejected": -0.4202462434768677, + "logps/chosen": -47.847660064697266, + "logps/rejected": -91.3631362915039, + "loss": 0.7043, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8047428131103516, + "rewards/margins": 4.367187023162842, + "rewards/rejected": -1.5624440908432007, + "step": 4268 + }, + { + "epoch": 1.07, + "grad_norm": 5.5272979736328125, + "learning_rate": 1.91101504482051e-06, + "logits/chosen": -0.2623923718929291, + "logits/rejected": -0.3883925676345825, + "logps/chosen": -62.52242660522461, + "logps/rejected": -76.17061614990234, + "loss": 0.7885, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.770749568939209, + "rewards/margins": 4.903054237365723, + "rewards/rejected": -2.1323041915893555, + "step": 4269 + }, + { + "epoch": 1.07, + "grad_norm": 4.068314075469971, + "learning_rate": 1.908955468460431e-06, + "logits/chosen": -0.2857966125011444, + "logits/rejected": -0.44623780250549316, + "logps/chosen": -63.13214874267578, + "logps/rejected": -74.85073852539062, + "loss": 0.7742, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9533846378326416, + "rewards/margins": 4.717973709106445, + "rewards/rejected": -1.7645890712738037, + "step": 4270 + }, + { + "epoch": 1.07, + "grad_norm": 4.048321723937988, + "learning_rate": 1.9068967406587758e-06, + "logits/chosen": -0.38119739294052124, + "logits/rejected": -0.5186019539833069, + "logps/chosen": -52.638023376464844, + "logps/rejected": -76.68241119384766, + "loss": 0.8366, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7249557971954346, + "rewards/margins": 4.4379496574401855, + "rewards/rejected": -1.7129935026168823, + "step": 4271 + }, + { + "epoch": 1.07, + "grad_norm": 2.8139803409576416, + "learning_rate": 1.9048388619807085e-06, + "logits/chosen": -0.4189426302909851, + "logits/rejected": -0.4458586871623993, + "logps/chosen": -48.20924377441406, + "logps/rejected": -89.55435943603516, + "loss": 0.64, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2730660438537598, + "rewards/margins": 4.6832275390625, + "rewards/rejected": -1.4101614952087402, + "step": 4272 + }, + { + "epoch": 1.07, + "grad_norm": 6.639122009277344, + "learning_rate": 1.9027818329911624e-06, + "logits/chosen": -0.3828415870666504, + "logits/rejected": -0.48259007930755615, + "logps/chosen": -49.08848190307617, + "logps/rejected": -95.06155395507812, + "loss": 0.6322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.889948606491089, + "rewards/margins": 4.869836330413818, + "rewards/rejected": -1.97988760471344, + "step": 4273 + }, + { + "epoch": 1.07, + "grad_norm": 6.289495944976807, + "learning_rate": 1.9007256542548336e-06, + "logits/chosen": -0.3455362915992737, + "logits/rejected": -0.39369240403175354, + "logps/chosen": -59.819602966308594, + "logps/rejected": -77.95378112792969, + "loss": 0.9947, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.615243911743164, + "rewards/margins": 3.710583448410034, + "rewards/rejected": -1.095339298248291, + "step": 4274 + }, + { + "epoch": 1.07, + "grad_norm": 11.029495239257812, + "learning_rate": 1.898670326336192e-06, + "logits/chosen": -0.3250043988227844, + "logits/rejected": -0.4000464081764221, + "logps/chosen": -48.05841064453125, + "logps/rejected": -81.91423034667969, + "loss": 0.7837, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.940706729888916, + "rewards/margins": 4.628846645355225, + "rewards/rejected": -1.6881399154663086, + "step": 4275 + }, + { + "epoch": 1.07, + "grad_norm": 6.8037285804748535, + "learning_rate": 1.896615849799467e-06, + "logits/chosen": -0.4051516056060791, + "logits/rejected": -0.5002291202545166, + "logps/chosen": -52.33017349243164, + "logps/rejected": -82.18434143066406, + "loss": 0.8138, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8918848037719727, + "rewards/margins": 5.104574203491211, + "rewards/rejected": -2.21268892288208, + "step": 4276 + }, + { + "epoch": 1.07, + "grad_norm": 13.340690612792969, + "learning_rate": 1.8945622252086572e-06, + "logits/chosen": -0.34509921073913574, + "logits/rejected": -0.39511343836784363, + "logps/chosen": -59.032745361328125, + "logps/rejected": -80.5696029663086, + "loss": 0.8952, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5851805210113525, + "rewards/margins": 3.5081288814544678, + "rewards/rejected": -0.9229480028152466, + "step": 4277 + }, + { + "epoch": 1.07, + "grad_norm": 3.9102773666381836, + "learning_rate": 1.8925094531275267e-06, + "logits/chosen": -0.4523518979549408, + "logits/rejected": -0.4771498739719391, + "logps/chosen": -38.74897003173828, + "logps/rejected": -87.40139770507812, + "loss": 0.6752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98028302192688, + "rewards/margins": 5.188351154327393, + "rewards/rejected": -2.2080678939819336, + "step": 4278 + }, + { + "epoch": 1.07, + "grad_norm": 6.100168704986572, + "learning_rate": 1.8904575341196051e-06, + "logits/chosen": -0.28407227993011475, + "logits/rejected": -0.36655354499816895, + "logps/chosen": -62.857704162597656, + "logps/rejected": -83.93897247314453, + "loss": 0.8173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.041236162185669, + "rewards/margins": 3.7903926372528076, + "rewards/rejected": -0.7491563558578491, + "step": 4279 + }, + { + "epoch": 1.07, + "grad_norm": 12.746659278869629, + "learning_rate": 1.8884064687481897e-06, + "logits/chosen": -0.37300264835357666, + "logits/rejected": -0.39803755283355713, + "logps/chosen": -55.48870849609375, + "logps/rejected": -104.49856567382812, + "loss": 0.6837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.231943130493164, + "rewards/margins": 6.757919788360596, + "rewards/rejected": -3.5259761810302734, + "step": 4280 + }, + { + "epoch": 1.07, + "grad_norm": 4.822226047515869, + "learning_rate": 1.8863562575763395e-06, + "logits/chosen": -0.2837267220020294, + "logits/rejected": -0.29104045033454895, + "logps/chosen": -58.2873420715332, + "logps/rejected": -101.30927276611328, + "loss": 0.7579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.77917218208313, + "rewards/margins": 4.0828166007995605, + "rewards/rejected": -1.3036442995071411, + "step": 4281 + }, + { + "epoch": 1.07, + "grad_norm": 4.046550750732422, + "learning_rate": 1.8843069011668852e-06, + "logits/chosen": -0.3181288242340088, + "logits/rejected": -0.43992912769317627, + "logps/chosen": -51.092506408691406, + "logps/rejected": -89.51984405517578, + "loss": 0.769, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.481052875518799, + "rewards/margins": 4.5324578285217285, + "rewards/rejected": -2.051405429840088, + "step": 4282 + }, + { + "epoch": 1.07, + "grad_norm": 5.498443603515625, + "learning_rate": 1.8822584000824185e-06, + "logits/chosen": -0.29967865347862244, + "logits/rejected": -0.4102630615234375, + "logps/chosen": -59.011680603027344, + "logps/rejected": -78.15145111083984, + "loss": 0.7837, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.00907301902771, + "rewards/margins": 4.632571220397949, + "rewards/rejected": -1.6234983205795288, + "step": 4283 + }, + { + "epoch": 1.07, + "grad_norm": 2.1567132472991943, + "learning_rate": 1.880210754885296e-06, + "logits/chosen": -0.2964555621147156, + "logits/rejected": -0.42924976348876953, + "logps/chosen": -52.73323059082031, + "logps/rejected": -92.01016235351562, + "loss": 0.5561, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9903783798217773, + "rewards/margins": 5.896836757659912, + "rewards/rejected": -2.9064579010009766, + "step": 4284 + }, + { + "epoch": 1.07, + "grad_norm": 5.210458278656006, + "learning_rate": 1.878163966137641e-06, + "logits/chosen": -0.42860180139541626, + "logits/rejected": -0.4744255840778351, + "logps/chosen": -46.3361930847168, + "logps/rejected": -89.18301391601562, + "loss": 0.6503, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.850167989730835, + "rewards/margins": 4.883763790130615, + "rewards/rejected": -2.033595561981201, + "step": 4285 + }, + { + "epoch": 1.07, + "grad_norm": 3.8240461349487305, + "learning_rate": 1.8761180344013396e-06, + "logits/chosen": -0.29255110025405884, + "logits/rejected": -0.38489317893981934, + "logps/chosen": -57.837440490722656, + "logps/rejected": -104.29619598388672, + "loss": 0.7097, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.144939422607422, + "rewards/margins": 5.696811199188232, + "rewards/rejected": -2.5518712997436523, + "step": 4286 + }, + { + "epoch": 1.07, + "grad_norm": 3.596763849258423, + "learning_rate": 1.8740729602380475e-06, + "logits/chosen": -0.3025357723236084, + "logits/rejected": -0.4381581246852875, + "logps/chosen": -54.40412902832031, + "logps/rejected": -74.35880279541016, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1385159492492676, + "rewards/margins": 5.513885974884033, + "rewards/rejected": -2.3753702640533447, + "step": 4287 + }, + { + "epoch": 1.07, + "grad_norm": 4.9908647537231445, + "learning_rate": 1.8720287442091823e-06, + "logits/chosen": -0.43048882484436035, + "logits/rejected": -0.5444349050521851, + "logps/chosen": -46.190406799316406, + "logps/rejected": -73.53666687011719, + "loss": 0.6716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1451544761657715, + "rewards/margins": 4.996098041534424, + "rewards/rejected": -1.8509438037872314, + "step": 4288 + }, + { + "epoch": 1.07, + "grad_norm": 5.783740997314453, + "learning_rate": 1.8699853868759187e-06, + "logits/chosen": -0.33257004618644714, + "logits/rejected": -0.3965173363685608, + "logps/chosen": -66.56655883789062, + "logps/rejected": -112.22560119628906, + "loss": 0.7623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8457648754119873, + "rewards/margins": 5.290046215057373, + "rewards/rejected": -2.444281816482544, + "step": 4289 + }, + { + "epoch": 1.07, + "grad_norm": 3.5679636001586914, + "learning_rate": 1.8679428887992091e-06, + "logits/chosen": -0.320984423160553, + "logits/rejected": -0.3344154953956604, + "logps/chosen": -60.54183578491211, + "logps/rejected": -100.34117126464844, + "loss": 0.7105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0450572967529297, + "rewards/margins": 4.504884243011475, + "rewards/rejected": -1.4598267078399658, + "step": 4290 + }, + { + "epoch": 1.07, + "grad_norm": 3.933840751647949, + "learning_rate": 1.865901250539761e-06, + "logits/chosen": -0.3698577880859375, + "logits/rejected": -0.45607873797416687, + "logps/chosen": -58.95207595825195, + "logps/rejected": -81.5202407836914, + "loss": 0.6598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0135815143585205, + "rewards/margins": 5.398331642150879, + "rewards/rejected": -2.3847496509552, + "step": 4291 + }, + { + "epoch": 1.07, + "grad_norm": 3.660691022872925, + "learning_rate": 1.8638604726580479e-06, + "logits/chosen": -0.2509588897228241, + "logits/rejected": -0.4081679880619049, + "logps/chosen": -71.34708404541016, + "logps/rejected": -91.63602447509766, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.769810676574707, + "rewards/margins": 5.622673034667969, + "rewards/rejected": -2.852862596511841, + "step": 4292 + }, + { + "epoch": 1.07, + "grad_norm": 5.558195114135742, + "learning_rate": 1.8618205557143076e-06, + "logits/chosen": -0.3373267352581024, + "logits/rejected": -0.4136412739753723, + "logps/chosen": -63.4478759765625, + "logps/rejected": -71.2224349975586, + "loss": 0.8578, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.790694236755371, + "rewards/margins": 4.148000240325928, + "rewards/rejected": -1.357305645942688, + "step": 4293 + }, + { + "epoch": 1.07, + "grad_norm": 4.674361228942871, + "learning_rate": 1.85978150026854e-06, + "logits/chosen": -0.3396419286727905, + "logits/rejected": -0.3549252152442932, + "logps/chosen": -61.808841705322266, + "logps/rejected": -82.63941955566406, + "loss": 0.8343, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.895127773284912, + "rewards/margins": 3.705713987350464, + "rewards/rejected": -0.810586154460907, + "step": 4294 + }, + { + "epoch": 1.07, + "grad_norm": 6.570921897888184, + "learning_rate": 1.8577433068805123e-06, + "logits/chosen": -0.3561078906059265, + "logits/rejected": -0.42247912287712097, + "logps/chosen": -59.09708786010742, + "logps/rejected": -85.12653350830078, + "loss": 0.8633, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9798049926757812, + "rewards/margins": 5.254847526550293, + "rewards/rejected": -2.2750420570373535, + "step": 4295 + }, + { + "epoch": 1.07, + "grad_norm": 4.796471118927002, + "learning_rate": 1.8557059761097517e-06, + "logits/chosen": -0.37296122312545776, + "logits/rejected": -0.5407288074493408, + "logps/chosen": -57.34330749511719, + "logps/rejected": -68.23600769042969, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.251270294189453, + "rewards/margins": 5.3216986656188965, + "rewards/rejected": -2.0704288482666016, + "step": 4296 + }, + { + "epoch": 1.07, + "grad_norm": 6.479856014251709, + "learning_rate": 1.8536695085155492e-06, + "logits/chosen": -0.36895373463630676, + "logits/rejected": -0.47683438658714294, + "logps/chosen": -53.76844024658203, + "logps/rejected": -72.41902923583984, + "loss": 0.8029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9764232635498047, + "rewards/margins": 4.041849613189697, + "rewards/rejected": -1.0654264688491821, + "step": 4297 + }, + { + "epoch": 1.08, + "grad_norm": 9.738960266113281, + "learning_rate": 1.8516339046569593e-06, + "logits/chosen": -0.38172173500061035, + "logits/rejected": -0.4736010432243347, + "logps/chosen": -42.958404541015625, + "logps/rejected": -76.24970245361328, + "loss": 0.7558, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.590911626815796, + "rewards/margins": 4.127469062805176, + "rewards/rejected": -1.5365574359893799, + "step": 4298 + }, + { + "epoch": 1.08, + "grad_norm": 12.741836547851562, + "learning_rate": 1.849599165092797e-06, + "logits/chosen": -0.3175913989543915, + "logits/rejected": -0.46951043605804443, + "logps/chosen": -58.54474639892578, + "logps/rejected": -87.17284393310547, + "loss": 0.7384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.405367136001587, + "rewards/margins": 4.249624729156494, + "rewards/rejected": -1.8442574739456177, + "step": 4299 + }, + { + "epoch": 1.08, + "grad_norm": 10.722638130187988, + "learning_rate": 1.8475652903816476e-06, + "logits/chosen": -0.29745277762413025, + "logits/rejected": -0.4113485813140869, + "logps/chosen": -55.46245193481445, + "logps/rejected": -78.06926727294922, + "loss": 0.8966, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.792207956314087, + "rewards/margins": 3.6927130222320557, + "rewards/rejected": -0.9005048274993896, + "step": 4300 + }, + { + "epoch": 1.08, + "grad_norm": 7.328588485717773, + "learning_rate": 1.8455322810818465e-06, + "logits/chosen": -0.3373364210128784, + "logits/rejected": -0.37588170170783997, + "logps/chosen": -50.922916412353516, + "logps/rejected": -87.73965454101562, + "loss": 0.6407, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.776352643966675, + "rewards/margins": 4.983638286590576, + "rewards/rejected": -2.2072854042053223, + "step": 4301 + }, + { + "epoch": 1.08, + "grad_norm": 9.961030006408691, + "learning_rate": 1.8435001377515043e-06, + "logits/chosen": -0.30576619505882263, + "logits/rejected": -0.3835889399051666, + "logps/chosen": -58.11951446533203, + "logps/rejected": -85.12628173828125, + "loss": 0.7267, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9463534355163574, + "rewards/margins": 4.440688133239746, + "rewards/rejected": -1.4943349361419678, + "step": 4302 + }, + { + "epoch": 1.08, + "grad_norm": 3.6939406394958496, + "learning_rate": 1.841468860948486e-06, + "logits/chosen": -0.3636822998523712, + "logits/rejected": -0.43921881914138794, + "logps/chosen": -56.84022521972656, + "logps/rejected": -82.34068298339844, + "loss": 0.7543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8885281085968018, + "rewards/margins": 4.641109466552734, + "rewards/rejected": -1.7525814771652222, + "step": 4303 + }, + { + "epoch": 1.08, + "grad_norm": 5.18402624130249, + "learning_rate": 1.8394384512304215e-06, + "logits/chosen": -0.27171263098716736, + "logits/rejected": -0.36855560541152954, + "logps/chosen": -64.698486328125, + "logps/rejected": -86.03614044189453, + "loss": 0.8499, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8596954345703125, + "rewards/margins": 4.649720668792725, + "rewards/rejected": -1.7900248765945435, + "step": 4304 + }, + { + "epoch": 1.08, + "grad_norm": 4.850376605987549, + "learning_rate": 1.8374089091547015e-06, + "logits/chosen": -0.27791067957878113, + "logits/rejected": -0.40298181772232056, + "logps/chosen": -54.763580322265625, + "logps/rejected": -82.60540771484375, + "loss": 0.6357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.903088331222534, + "rewards/margins": 4.711276531219482, + "rewards/rejected": -1.8081881999969482, + "step": 4305 + }, + { + "epoch": 1.08, + "grad_norm": 3.3017377853393555, + "learning_rate": 1.8353802352784771e-06, + "logits/chosen": -0.38510048389434814, + "logits/rejected": -0.42542287707328796, + "logps/chosen": -42.807212829589844, + "logps/rejected": -89.57720184326172, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.97452974319458, + "rewards/margins": 4.718329906463623, + "rewards/rejected": -1.7437999248504639, + "step": 4306 + }, + { + "epoch": 1.08, + "grad_norm": 7.2593159675598145, + "learning_rate": 1.8333524301586674e-06, + "logits/chosen": -0.40484002232551575, + "logits/rejected": -0.4645845890045166, + "logps/chosen": -53.889610290527344, + "logps/rejected": -82.08123779296875, + "loss": 0.846, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4938907623291016, + "rewards/margins": 4.277196884155273, + "rewards/rejected": -1.7833061218261719, + "step": 4307 + }, + { + "epoch": 1.08, + "grad_norm": 12.347452163696289, + "learning_rate": 1.8313254943519482e-06, + "logits/chosen": -0.2898283302783966, + "logits/rejected": -0.3547540009021759, + "logps/chosen": -67.75878143310547, + "logps/rejected": -103.2301025390625, + "loss": 0.8283, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5157580375671387, + "rewards/margins": 4.86597204208374, + "rewards/rejected": -2.3502135276794434, + "step": 4308 + }, + { + "epoch": 1.08, + "grad_norm": 4.286431789398193, + "learning_rate": 1.8292994284147518e-06, + "logits/chosen": -0.33717164397239685, + "logits/rejected": -0.42122358083724976, + "logps/chosen": -50.71016311645508, + "logps/rejected": -87.86711120605469, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9665846824645996, + "rewards/margins": 4.834501266479492, + "rewards/rejected": -1.8679167032241821, + "step": 4309 + }, + { + "epoch": 1.08, + "grad_norm": 4.419230937957764, + "learning_rate": 1.8272742329032823e-06, + "logits/chosen": -0.48753228783607483, + "logits/rejected": -0.5890979170799255, + "logps/chosen": -47.153411865234375, + "logps/rejected": -69.92047882080078, + "loss": 0.7151, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9804627895355225, + "rewards/margins": 4.966002941131592, + "rewards/rejected": -1.9855399131774902, + "step": 4310 + }, + { + "epoch": 1.08, + "grad_norm": 9.763239860534668, + "learning_rate": 1.8252499083734965e-06, + "logits/chosen": -0.30787432193756104, + "logits/rejected": -0.44683200120925903, + "logps/chosen": -52.80400848388672, + "logps/rejected": -72.00856018066406, + "loss": 0.6875, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.165475606918335, + "rewards/margins": 4.832712650299072, + "rewards/rejected": -1.6672375202178955, + "step": 4311 + }, + { + "epoch": 1.08, + "grad_norm": 9.746024131774902, + "learning_rate": 1.8232264553811196e-06, + "logits/chosen": -0.32701343297958374, + "logits/rejected": -0.416673481464386, + "logps/chosen": -42.61476516723633, + "logps/rejected": -73.41584777832031, + "loss": 0.7374, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8640594482421875, + "rewards/margins": 3.8773391246795654, + "rewards/rejected": -1.013279676437378, + "step": 4312 + }, + { + "epoch": 1.08, + "grad_norm": 6.762356281280518, + "learning_rate": 1.8212038744816285e-06, + "logits/chosen": -0.4032091498374939, + "logits/rejected": -0.46823394298553467, + "logps/chosen": -54.535526275634766, + "logps/rejected": -94.2080078125, + "loss": 0.7329, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.914933204650879, + "rewards/margins": 5.571695327758789, + "rewards/rejected": -2.656761646270752, + "step": 4313 + }, + { + "epoch": 1.08, + "grad_norm": 3.6051406860351562, + "learning_rate": 1.819182166230265e-06, + "logits/chosen": -0.3260958194732666, + "logits/rejected": -0.42241084575653076, + "logps/chosen": -68.92745208740234, + "logps/rejected": -85.60269165039062, + "loss": 0.7359, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2428770065307617, + "rewards/margins": 5.064455509185791, + "rewards/rejected": -1.821578860282898, + "step": 4314 + }, + { + "epoch": 1.08, + "grad_norm": 9.051567077636719, + "learning_rate": 1.8171613311820352e-06, + "logits/chosen": -0.35177475214004517, + "logits/rejected": -0.4421645998954773, + "logps/chosen": -57.97444534301758, + "logps/rejected": -83.8482894897461, + "loss": 0.7772, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.918386936187744, + "rewards/margins": 4.852485656738281, + "rewards/rejected": -1.9340981245040894, + "step": 4315 + }, + { + "epoch": 1.08, + "grad_norm": 7.748473167419434, + "learning_rate": 1.8151413698917004e-06, + "logits/chosen": -0.3456704020500183, + "logits/rejected": -0.4986134171485901, + "logps/chosen": -69.6048355102539, + "logps/rejected": -69.55635070800781, + "loss": 0.8762, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.520408868789673, + "rewards/margins": 3.3006250858306885, + "rewards/rejected": -0.7802163362503052, + "step": 4316 + }, + { + "epoch": 1.08, + "grad_norm": 4.707952976226807, + "learning_rate": 1.8131222829137834e-06, + "logits/chosen": -0.43010905385017395, + "logits/rejected": -0.5647386908531189, + "logps/chosen": -56.68257522583008, + "logps/rejected": -78.99724578857422, + "loss": 0.6467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.884251594543457, + "rewards/margins": 5.272233009338379, + "rewards/rejected": -2.3879811763763428, + "step": 4317 + }, + { + "epoch": 1.08, + "grad_norm": 3.768106460571289, + "learning_rate": 1.8111040708025674e-06, + "logits/chosen": -0.29178038239479065, + "logits/rejected": -0.3986718952655792, + "logps/chosen": -57.0810432434082, + "logps/rejected": -90.62157440185547, + "loss": 0.7301, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.86853289604187, + "rewards/margins": 5.033315658569336, + "rewards/rejected": -2.164782762527466, + "step": 4318 + }, + { + "epoch": 1.08, + "grad_norm": 4.966537952423096, + "learning_rate": 1.8090867341120927e-06, + "logits/chosen": -0.4365142285823822, + "logits/rejected": -0.46880796551704407, + "logps/chosen": -55.17828369140625, + "logps/rejected": -82.8293228149414, + "loss": 0.7989, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8513762950897217, + "rewards/margins": 4.020130634307861, + "rewards/rejected": -1.1687544584274292, + "step": 4319 + }, + { + "epoch": 1.08, + "grad_norm": 4.880061149597168, + "learning_rate": 1.8070702733961676e-06, + "logits/chosen": -0.3388059139251709, + "logits/rejected": -0.4519963562488556, + "logps/chosen": -50.50436782836914, + "logps/rejected": -85.4823226928711, + "loss": 0.7575, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.560828685760498, + "rewards/margins": 4.415598392486572, + "rewards/rejected": -1.8547699451446533, + "step": 4320 + }, + { + "epoch": 1.08, + "grad_norm": 3.8403940200805664, + "learning_rate": 1.805054689208347e-06, + "logits/chosen": -0.3510761857032776, + "logits/rejected": -0.4676557779312134, + "logps/chosen": -52.11238479614258, + "logps/rejected": -84.01376342773438, + "loss": 0.658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2311949729919434, + "rewards/margins": 6.4222092628479, + "rewards/rejected": -3.191014289855957, + "step": 4321 + }, + { + "epoch": 1.08, + "grad_norm": 6.051425933837891, + "learning_rate": 1.8030399821019573e-06, + "logits/chosen": -0.3601188361644745, + "logits/rejected": -0.4077412486076355, + "logps/chosen": -54.165077209472656, + "logps/rejected": -99.2523422241211, + "loss": 0.7485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9905505180358887, + "rewards/margins": 5.5404157638549805, + "rewards/rejected": -2.549865484237671, + "step": 4322 + }, + { + "epoch": 1.08, + "grad_norm": 4.728820323944092, + "learning_rate": 1.8010261526300777e-06, + "logits/chosen": -0.32967114448547363, + "logits/rejected": -0.4217227101325989, + "logps/chosen": -54.455299377441406, + "logps/rejected": -80.92823028564453, + "loss": 0.6314, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98190975189209, + "rewards/margins": 3.9727630615234375, + "rewards/rejected": -0.9908534288406372, + "step": 4323 + }, + { + "epoch": 1.08, + "grad_norm": 5.567877292633057, + "learning_rate": 1.7990132013455475e-06, + "logits/chosen": -0.26924264430999756, + "logits/rejected": -0.3591269552707672, + "logps/chosen": -56.061485290527344, + "logps/rejected": -80.65924835205078, + "loss": 0.7712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5812020301818848, + "rewards/margins": 4.149500370025635, + "rewards/rejected": -1.5682984590530396, + "step": 4324 + }, + { + "epoch": 1.08, + "grad_norm": 3.378732681274414, + "learning_rate": 1.7970011288009653e-06, + "logits/chosen": -0.3539874851703644, + "logits/rejected": -0.31979528069496155, + "logps/chosen": -45.16477966308594, + "logps/rejected": -85.84956359863281, + "loss": 0.6862, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9291534423828125, + "rewards/margins": 4.125537395477295, + "rewards/rejected": -1.196384310722351, + "step": 4325 + }, + { + "epoch": 1.08, + "grad_norm": 3.3571178913116455, + "learning_rate": 1.7949899355486872e-06, + "logits/chosen": -0.4888840317726135, + "logits/rejected": -0.5494822263717651, + "logps/chosen": -46.073158264160156, + "logps/rejected": -91.09889221191406, + "loss": 0.6708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8891513347625732, + "rewards/margins": 5.586385726928711, + "rewards/rejected": -2.6972341537475586, + "step": 4326 + }, + { + "epoch": 1.08, + "grad_norm": 5.659530162811279, + "learning_rate": 1.7929796221408313e-06, + "logits/chosen": -0.28823649883270264, + "logits/rejected": -0.39670270681381226, + "logps/chosen": -50.43073272705078, + "logps/rejected": -82.19410705566406, + "loss": 0.7444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6848526000976562, + "rewards/margins": 4.258472919464111, + "rewards/rejected": -1.5736198425292969, + "step": 4327 + }, + { + "epoch": 1.08, + "grad_norm": 5.575730800628662, + "learning_rate": 1.7909701891292719e-06, + "logits/chosen": -0.433369904756546, + "logits/rejected": -0.5006628036499023, + "logps/chosen": -58.90916061401367, + "logps/rejected": -90.77407836914062, + "loss": 0.8397, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0062804222106934, + "rewards/margins": 4.596789836883545, + "rewards/rejected": -1.5905098915100098, + "step": 4328 + }, + { + "epoch": 1.08, + "grad_norm": 5.700982093811035, + "learning_rate": 1.788961637065641e-06, + "logits/chosen": -0.46711266040802, + "logits/rejected": -0.6103179454803467, + "logps/chosen": -60.82350540161133, + "logps/rejected": -87.50182342529297, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9985952377319336, + "rewards/margins": 5.277953624725342, + "rewards/rejected": -2.279358386993408, + "step": 4329 + }, + { + "epoch": 1.08, + "grad_norm": 5.414691925048828, + "learning_rate": 1.786953966501329e-06, + "logits/chosen": -0.31024712324142456, + "logits/rejected": -0.40888434648513794, + "logps/chosen": -74.19212341308594, + "logps/rejected": -92.43026733398438, + "loss": 0.7567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0818140506744385, + "rewards/margins": 4.832927227020264, + "rewards/rejected": -1.7511130571365356, + "step": 4330 + }, + { + "epoch": 1.08, + "grad_norm": 5.440053462982178, + "learning_rate": 1.784947177987484e-06, + "logits/chosen": -0.2788594365119934, + "logits/rejected": -0.39466392993927, + "logps/chosen": -59.44473648071289, + "logps/rejected": -79.63233947753906, + "loss": 0.6852, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.872731924057007, + "rewards/margins": 4.5942769050598145, + "rewards/rejected": -1.721544861793518, + "step": 4331 + }, + { + "epoch": 1.08, + "grad_norm": 7.150735378265381, + "learning_rate": 1.7829412720750172e-06, + "logits/chosen": -0.4410989284515381, + "logits/rejected": -0.556016743183136, + "logps/chosen": -54.83028030395508, + "logps/rejected": -70.9614486694336, + "loss": 0.7981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8796112537384033, + "rewards/margins": 4.54063606262207, + "rewards/rejected": -1.661025047302246, + "step": 4332 + }, + { + "epoch": 1.08, + "grad_norm": 5.7640886306762695, + "learning_rate": 1.780936249314588e-06, + "logits/chosen": -0.26274728775024414, + "logits/rejected": -0.41874024271965027, + "logps/chosen": -66.74420166015625, + "logps/rejected": -77.38636016845703, + "loss": 0.7364, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9821412563323975, + "rewards/margins": 4.803743362426758, + "rewards/rejected": -1.8216021060943604, + "step": 4333 + }, + { + "epoch": 1.08, + "grad_norm": 5.316488742828369, + "learning_rate": 1.7789321102566186e-06, + "logits/chosen": -0.38151606917381287, + "logits/rejected": -0.4396129250526428, + "logps/chosen": -47.15241622924805, + "logps/rejected": -90.13505554199219, + "loss": 0.6902, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.87398624420166, + "rewards/margins": 4.754898548126221, + "rewards/rejected": -1.8809125423431396, + "step": 4334 + }, + { + "epoch": 1.08, + "grad_norm": 3.6997952461242676, + "learning_rate": 1.776928855451292e-06, + "logits/chosen": -0.368301659822464, + "logits/rejected": -0.4716050624847412, + "logps/chosen": -50.0079345703125, + "logps/rejected": -92.33578491210938, + "loss": 0.685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9584579467773438, + "rewards/margins": 4.652417182922363, + "rewards/rejected": -1.6939598321914673, + "step": 4335 + }, + { + "epoch": 1.08, + "grad_norm": 3.346937894821167, + "learning_rate": 1.774926485448543e-06, + "logits/chosen": -0.3304590582847595, + "logits/rejected": -0.5079087615013123, + "logps/chosen": -51.072444915771484, + "logps/rejected": -65.5193099975586, + "loss": 0.617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9572975635528564, + "rewards/margins": 5.515392303466797, + "rewards/rejected": -2.5580945014953613, + "step": 4336 + }, + { + "epoch": 1.08, + "grad_norm": 4.757516384124756, + "learning_rate": 1.7729250007980654e-06, + "logits/chosen": -0.3595079183578491, + "logits/rejected": -0.4523591995239258, + "logps/chosen": -48.52558898925781, + "logps/rejected": -87.20347595214844, + "loss": 0.6558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015429735183716, + "rewards/margins": 4.771672248840332, + "rewards/rejected": -1.7562425136566162, + "step": 4337 + }, + { + "epoch": 1.09, + "grad_norm": 5.282137870788574, + "learning_rate": 1.7709244020493099e-06, + "logits/chosen": -0.3050632178783417, + "logits/rejected": -0.32494527101516724, + "logps/chosen": -54.943565368652344, + "logps/rejected": -101.49571228027344, + "loss": 0.7165, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.964984893798828, + "rewards/margins": 5.589410781860352, + "rewards/rejected": -2.6244254112243652, + "step": 4338 + }, + { + "epoch": 1.09, + "grad_norm": 3.186539888381958, + "learning_rate": 1.7689246897514823e-06, + "logits/chosen": -0.37040671706199646, + "logits/rejected": -0.4571588635444641, + "logps/chosen": -54.74642562866211, + "logps/rejected": -93.8100357055664, + "loss": 0.6707, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2190537452697754, + "rewards/margins": 5.942620277404785, + "rewards/rejected": -2.7235665321350098, + "step": 4339 + }, + { + "epoch": 1.09, + "grad_norm": 7.321242332458496, + "learning_rate": 1.7669258644535497e-06, + "logits/chosen": -0.40150684118270874, + "logits/rejected": -0.5418638586997986, + "logps/chosen": -78.69136810302734, + "logps/rejected": -85.62638854980469, + "loss": 0.8017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8978936672210693, + "rewards/margins": 5.224686145782471, + "rewards/rejected": -2.3267924785614014, + "step": 4340 + }, + { + "epoch": 1.09, + "grad_norm": 3.1213603019714355, + "learning_rate": 1.7649279267042318e-06, + "logits/chosen": -0.34228578209877014, + "logits/rejected": -0.46280568838119507, + "logps/chosen": -60.56975555419922, + "logps/rejected": -71.75636291503906, + "loss": 0.7957, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.123129367828369, + "rewards/margins": 5.083145618438721, + "rewards/rejected": -1.960015892982483, + "step": 4341 + }, + { + "epoch": 1.09, + "grad_norm": 4.212617874145508, + "learning_rate": 1.7629308770520055e-06, + "logits/chosen": -0.410835325717926, + "logits/rejected": -0.45470017194747925, + "logps/chosen": -61.837039947509766, + "logps/rejected": -84.05569458007812, + "loss": 0.6834, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032613515853882, + "rewards/margins": 5.090123653411865, + "rewards/rejected": -2.0575101375579834, + "step": 4342 + }, + { + "epoch": 1.09, + "grad_norm": 9.059163093566895, + "learning_rate": 1.7609347160451035e-06, + "logits/chosen": -0.33996039628982544, + "logits/rejected": -0.3847673535346985, + "logps/chosen": -50.51546859741211, + "logps/rejected": -93.41029357910156, + "loss": 0.7248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.413621425628662, + "rewards/margins": 4.154068946838379, + "rewards/rejected": -0.7404472231864929, + "step": 4343 + }, + { + "epoch": 1.09, + "grad_norm": 7.092114448547363, + "learning_rate": 1.7589394442315144e-06, + "logits/chosen": -0.3379443883895874, + "logits/rejected": -0.3691287636756897, + "logps/chosen": -49.31148910522461, + "logps/rejected": -94.85696411132812, + "loss": 0.7022, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.820852041244507, + "rewards/margins": 4.6294989585876465, + "rewards/rejected": -1.8086460828781128, + "step": 4344 + }, + { + "epoch": 1.09, + "grad_norm": 3.5938401222229004, + "learning_rate": 1.7569450621589873e-06, + "logits/chosen": -0.3644940257072449, + "logits/rejected": -0.501899242401123, + "logps/chosen": -50.065738677978516, + "logps/rejected": -67.20964050292969, + "loss": 0.6354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8352465629577637, + "rewards/margins": 5.370548248291016, + "rewards/rejected": -2.535301685333252, + "step": 4345 + }, + { + "epoch": 1.09, + "grad_norm": 3.472343921661377, + "learning_rate": 1.7549515703750169e-06, + "logits/chosen": -0.3392828404903412, + "logits/rejected": -0.37035635113716125, + "logps/chosen": -55.328216552734375, + "logps/rejected": -95.75529479980469, + "loss": 0.7363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9580237865448, + "rewards/margins": 4.462126731872559, + "rewards/rejected": -1.5041027069091797, + "step": 4346 + }, + { + "epoch": 1.09, + "grad_norm": 5.937455654144287, + "learning_rate": 1.7529589694268651e-06, + "logits/chosen": -0.33656173944473267, + "logits/rejected": -0.4606149196624756, + "logps/chosen": -57.774017333984375, + "logps/rejected": -73.87275695800781, + "loss": 0.85, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0187501907348633, + "rewards/margins": 5.052582263946533, + "rewards/rejected": -2.0338315963745117, + "step": 4347 + }, + { + "epoch": 1.09, + "grad_norm": 6.251147270202637, + "learning_rate": 1.7509672598615417e-06, + "logits/chosen": -0.3831096589565277, + "logits/rejected": -0.5055981874465942, + "logps/chosen": -58.241817474365234, + "logps/rejected": -82.53707122802734, + "loss": 0.7393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5201315879821777, + "rewards/margins": 5.053904056549072, + "rewards/rejected": -2.5337724685668945, + "step": 4348 + }, + { + "epoch": 1.09, + "grad_norm": 3.3261804580688477, + "learning_rate": 1.7489764422258137e-06, + "logits/chosen": -0.43988722562789917, + "logits/rejected": -0.49161776900291443, + "logps/chosen": -51.99200439453125, + "logps/rejected": -90.63177490234375, + "loss": 0.7187, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.986027717590332, + "rewards/margins": 4.776462554931641, + "rewards/rejected": -1.7904353141784668, + "step": 4349 + }, + { + "epoch": 1.09, + "grad_norm": 6.189578056335449, + "learning_rate": 1.7469865170662042e-06, + "logits/chosen": -0.4302252233028412, + "logits/rejected": -0.46291661262512207, + "logps/chosen": -51.27503967285156, + "logps/rejected": -89.79275512695312, + "loss": 0.6968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.682569980621338, + "rewards/margins": 5.034679412841797, + "rewards/rejected": -2.352109432220459, + "step": 4350 + }, + { + "epoch": 1.09, + "grad_norm": 4.078624248504639, + "learning_rate": 1.7449974849289885e-06, + "logits/chosen": -0.32120949029922485, + "logits/rejected": -0.47716644406318665, + "logps/chosen": -51.56945037841797, + "logps/rejected": -76.5293960571289, + "loss": 0.6136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9383468627929688, + "rewards/margins": 5.260363578796387, + "rewards/rejected": -2.322016716003418, + "step": 4351 + }, + { + "epoch": 1.09, + "grad_norm": 6.140355587005615, + "learning_rate": 1.7430093463602021e-06, + "logits/chosen": -0.2636801600456238, + "logits/rejected": -0.36617371439933777, + "logps/chosen": -69.34463500976562, + "logps/rejected": -95.61083221435547, + "loss": 0.8579, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6781699657440186, + "rewards/margins": 3.99286150932312, + "rewards/rejected": -1.3146921396255493, + "step": 4352 + }, + { + "epoch": 1.09, + "grad_norm": 5.790975093841553, + "learning_rate": 1.7410221019056323e-06, + "logits/chosen": -0.35804492235183716, + "logits/rejected": -0.39071086049079895, + "logps/chosen": -49.884761810302734, + "logps/rejected": -81.01823425292969, + "loss": 0.7545, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.720825433731079, + "rewards/margins": 4.373956680297852, + "rewards/rejected": -1.653131127357483, + "step": 4353 + }, + { + "epoch": 1.09, + "grad_norm": 15.44758129119873, + "learning_rate": 1.7390357521108153e-06, + "logits/chosen": -0.3275454640388489, + "logits/rejected": -0.4636835753917694, + "logps/chosen": -75.0985336303711, + "logps/rejected": -84.93878936767578, + "loss": 0.9233, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8013665676116943, + "rewards/margins": 4.946373462677002, + "rewards/rejected": -2.1450066566467285, + "step": 4354 + }, + { + "epoch": 1.09, + "grad_norm": 4.740457057952881, + "learning_rate": 1.7370502975210524e-06, + "logits/chosen": -0.35955238342285156, + "logits/rejected": -0.46332108974456787, + "logps/chosen": -60.003074645996094, + "logps/rejected": -84.94735717773438, + "loss": 0.7799, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.567298412322998, + "rewards/margins": 4.778829574584961, + "rewards/rejected": -2.211531162261963, + "step": 4355 + }, + { + "epoch": 1.09, + "grad_norm": 5.0828752517700195, + "learning_rate": 1.7350657386813897e-06, + "logits/chosen": -0.2648630738258362, + "logits/rejected": -0.3212405741214752, + "logps/chosen": -65.91840362548828, + "logps/rejected": -90.0909194946289, + "loss": 0.7674, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.919161558151245, + "rewards/margins": 4.48206901550293, + "rewards/rejected": -1.5629076957702637, + "step": 4356 + }, + { + "epoch": 1.09, + "grad_norm": 3.7509453296661377, + "learning_rate": 1.7330820761366368e-06, + "logits/chosen": -0.2833424210548401, + "logits/rejected": -0.42402002215385437, + "logps/chosen": -60.047882080078125, + "logps/rejected": -77.77119445800781, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.649219036102295, + "rewards/margins": 4.8612446784973145, + "rewards/rejected": -2.2120256423950195, + "step": 4357 + }, + { + "epoch": 1.09, + "grad_norm": 8.310955047607422, + "learning_rate": 1.7310993104313474e-06, + "logits/chosen": -0.3603251576423645, + "logits/rejected": -0.4046911597251892, + "logps/chosen": -61.20201873779297, + "logps/rejected": -99.35111236572266, + "loss": 0.8132, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6049532890319824, + "rewards/margins": 4.95029354095459, + "rewards/rejected": -2.3453400135040283, + "step": 4358 + }, + { + "epoch": 1.09, + "grad_norm": 9.158114433288574, + "learning_rate": 1.7291174421098323e-06, + "logits/chosen": -0.15781481564044952, + "logits/rejected": -0.3003552556037903, + "logps/chosen": -60.817718505859375, + "logps/rejected": -78.81962585449219, + "loss": 0.7279, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.858646869659424, + "rewards/margins": 4.257028579711914, + "rewards/rejected": -1.3983817100524902, + "step": 4359 + }, + { + "epoch": 1.09, + "grad_norm": 5.638869762420654, + "learning_rate": 1.7271364717161609e-06, + "logits/chosen": -0.39113712310791016, + "logits/rejected": -0.4872915744781494, + "logps/chosen": -50.01997375488281, + "logps/rejected": -86.65217590332031, + "loss": 0.6816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8707520961761475, + "rewards/margins": 4.998485088348389, + "rewards/rejected": -2.1277332305908203, + "step": 4360 + }, + { + "epoch": 1.09, + "grad_norm": 5.545263767242432, + "learning_rate": 1.7251563997941518e-06, + "logits/chosen": -0.25830399990081787, + "logits/rejected": -0.3461996018886566, + "logps/chosen": -61.74217224121094, + "logps/rejected": -80.096435546875, + "loss": 0.8076, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8859505653381348, + "rewards/margins": 4.245774269104004, + "rewards/rejected": -1.3598238229751587, + "step": 4361 + }, + { + "epoch": 1.09, + "grad_norm": 5.962203025817871, + "learning_rate": 1.7231772268873753e-06, + "logits/chosen": -0.3598785400390625, + "logits/rejected": -0.43215951323509216, + "logps/chosen": -52.47177505493164, + "logps/rejected": -101.02066040039062, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0750648975372314, + "rewards/margins": 5.279020309448242, + "rewards/rejected": -2.203955888748169, + "step": 4362 + }, + { + "epoch": 1.09, + "grad_norm": 4.794460773468018, + "learning_rate": 1.7211989535391594e-06, + "logits/chosen": -0.41157209873199463, + "logits/rejected": -0.5025576949119568, + "logps/chosen": -55.59567642211914, + "logps/rejected": -78.44477844238281, + "loss": 0.6828, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8701331615448, + "rewards/margins": 4.694683074951172, + "rewards/rejected": -1.824549913406372, + "step": 4363 + }, + { + "epoch": 1.09, + "grad_norm": 3.8803486824035645, + "learning_rate": 1.7192215802925793e-06, + "logits/chosen": -0.40481245517730713, + "logits/rejected": -0.48426395654678345, + "logps/chosen": -57.27467346191406, + "logps/rejected": -89.1300277709961, + "loss": 0.7063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.768332004547119, + "rewards/margins": 5.826608180999756, + "rewards/rejected": -3.0582756996154785, + "step": 4364 + }, + { + "epoch": 1.09, + "grad_norm": 4.959232807159424, + "learning_rate": 1.7172451076904728e-06, + "logits/chosen": -0.41357290744781494, + "logits/rejected": -0.4747314453125, + "logps/chosen": -55.60435104370117, + "logps/rejected": -96.13871765136719, + "loss": 0.7489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9793949127197266, + "rewards/margins": 5.24060583114624, + "rewards/rejected": -2.2612104415893555, + "step": 4365 + }, + { + "epoch": 1.09, + "grad_norm": 5.548723220825195, + "learning_rate": 1.7152695362754167e-06, + "logits/chosen": -0.29714804887771606, + "logits/rejected": -0.41212183237075806, + "logps/chosen": -61.10671615600586, + "logps/rejected": -99.04341125488281, + "loss": 0.7953, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7967183589935303, + "rewards/margins": 5.317073822021484, + "rewards/rejected": -2.520355224609375, + "step": 4366 + }, + { + "epoch": 1.09, + "grad_norm": 2.4914329051971436, + "learning_rate": 1.7132948665897537e-06, + "logits/chosen": -0.3494619131088257, + "logits/rejected": -0.4419100880622864, + "logps/chosen": -59.24474334716797, + "logps/rejected": -107.63134002685547, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031829357147217, + "rewards/margins": 6.490437984466553, + "rewards/rejected": -3.4586081504821777, + "step": 4367 + }, + { + "epoch": 1.09, + "grad_norm": 4.374066352844238, + "learning_rate": 1.7113210991755713e-06, + "logits/chosen": -0.28898847103118896, + "logits/rejected": -0.33887526392936707, + "logps/chosen": -65.00881958007812, + "logps/rejected": -88.85484313964844, + "loss": 0.7652, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7706875801086426, + "rewards/margins": 3.8345654010772705, + "rewards/rejected": -1.0638777017593384, + "step": 4368 + }, + { + "epoch": 1.09, + "grad_norm": 5.568042755126953, + "learning_rate": 1.7093482345747114e-06, + "logits/chosen": -0.26981014013290405, + "logits/rejected": -0.3595695197582245, + "logps/chosen": -57.64312744140625, + "logps/rejected": -80.50875091552734, + "loss": 0.7654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0951621532440186, + "rewards/margins": 4.437163829803467, + "rewards/rejected": -1.3420019149780273, + "step": 4369 + }, + { + "epoch": 1.09, + "grad_norm": 4.031356334686279, + "learning_rate": 1.7073762733287673e-06, + "logits/chosen": -0.32680392265319824, + "logits/rejected": -0.45278772711753845, + "logps/chosen": -46.7896614074707, + "logps/rejected": -85.6598892211914, + "loss": 0.6334, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.968088150024414, + "rewards/margins": 5.615499973297119, + "rewards/rejected": -2.647412061691284, + "step": 4370 + }, + { + "epoch": 1.09, + "grad_norm": 4.317344665527344, + "learning_rate": 1.705405215979084e-06, + "logits/chosen": -0.35613685846328735, + "logits/rejected": -0.4396822452545166, + "logps/chosen": -59.235572814941406, + "logps/rejected": -88.44548034667969, + "loss": 0.6667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9750444889068604, + "rewards/margins": 5.2718505859375, + "rewards/rejected": -2.2968060970306396, + "step": 4371 + }, + { + "epoch": 1.09, + "grad_norm": 5.9874267578125, + "learning_rate": 1.7034350630667628e-06, + "logits/chosen": -0.3344227373600006, + "logits/rejected": -0.4226868748664856, + "logps/chosen": -64.11257934570312, + "logps/rejected": -89.61241912841797, + "loss": 0.849, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7691967487335205, + "rewards/margins": 4.137617588043213, + "rewards/rejected": -1.3684206008911133, + "step": 4372 + }, + { + "epoch": 1.09, + "grad_norm": 14.795798301696777, + "learning_rate": 1.7014658151326508e-06, + "logits/chosen": -0.3479735553264618, + "logits/rejected": -0.4385257363319397, + "logps/chosen": -59.52843475341797, + "logps/rejected": -79.7967300415039, + "loss": 0.7527, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9007930755615234, + "rewards/margins": 4.491662979125977, + "rewards/rejected": -1.5908699035644531, + "step": 4373 + }, + { + "epoch": 1.09, + "grad_norm": 3.876495122909546, + "learning_rate": 1.6994974727173492e-06, + "logits/chosen": -0.38075894117355347, + "logits/rejected": -0.5017331838607788, + "logps/chosen": -70.35737609863281, + "logps/rejected": -91.79365539550781, + "loss": 0.6948, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.690890073776245, + "rewards/margins": 4.8072381019592285, + "rewards/rejected": -2.1163482666015625, + "step": 4374 + }, + { + "epoch": 1.09, + "grad_norm": 3.4367103576660156, + "learning_rate": 1.6975300363612102e-06, + "logits/chosen": -0.4285007119178772, + "logits/rejected": -0.46418678760528564, + "logps/chosen": -46.65144348144531, + "logps/rejected": -102.2450180053711, + "loss": 0.743, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0692076683044434, + "rewards/margins": 4.923167705535889, + "rewards/rejected": -1.8539597988128662, + "step": 4375 + }, + { + "epoch": 1.09, + "grad_norm": 11.328611373901367, + "learning_rate": 1.6955635066043363e-06, + "logits/chosen": -0.34435898065567017, + "logits/rejected": -0.4382195472717285, + "logps/chosen": -59.41496658325195, + "logps/rejected": -79.5605697631836, + "loss": 0.6599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.805746078491211, + "rewards/margins": 4.899909496307373, + "rewards/rejected": -2.094163656234741, + "step": 4376 + }, + { + "epoch": 1.09, + "grad_norm": 3.830057144165039, + "learning_rate": 1.693597883986588e-06, + "logits/chosen": -0.33098381757736206, + "logits/rejected": -0.39683797955513, + "logps/chosen": -53.904170989990234, + "logps/rejected": -87.2027816772461, + "loss": 0.634, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9329137802124023, + "rewards/margins": 5.1572394371032715, + "rewards/rejected": -2.224325656890869, + "step": 4377 + }, + { + "epoch": 1.1, + "grad_norm": 10.623871803283691, + "learning_rate": 1.6916331690475647e-06, + "logits/chosen": -0.3420356512069702, + "logits/rejected": -0.42314577102661133, + "logps/chosen": -55.410850524902344, + "logps/rejected": -96.59271240234375, + "loss": 0.7972, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7712037563323975, + "rewards/margins": 5.017549991607666, + "rewards/rejected": -2.2463462352752686, + "step": 4378 + }, + { + "epoch": 1.1, + "grad_norm": 4.696936130523682, + "learning_rate": 1.689669362326624e-06, + "logits/chosen": -0.3948839008808136, + "logits/rejected": -0.4930024743080139, + "logps/chosen": -54.986087799072266, + "logps/rejected": -83.58517456054688, + "loss": 0.6664, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.911595344543457, + "rewards/margins": 5.71467924118042, + "rewards/rejected": -2.803083658218384, + "step": 4379 + }, + { + "epoch": 1.1, + "grad_norm": 8.058449745178223, + "learning_rate": 1.6877064643628766e-06, + "logits/chosen": -0.18966737389564514, + "logits/rejected": -0.2518808841705322, + "logps/chosen": -67.18143463134766, + "logps/rejected": -107.79328918457031, + "loss": 0.6911, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9312002658843994, + "rewards/margins": 5.356529235839844, + "rewards/rejected": -2.4253296852111816, + "step": 4380 + }, + { + "epoch": 1.1, + "grad_norm": 5.727046966552734, + "learning_rate": 1.685744475695178e-06, + "logits/chosen": -0.3754030168056488, + "logits/rejected": -0.42542991042137146, + "logps/chosen": -49.171661376953125, + "logps/rejected": -99.49553680419922, + "loss": 0.7101, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.053102493286133, + "rewards/margins": 5.125307559967041, + "rewards/rejected": -2.0722053050994873, + "step": 4381 + }, + { + "epoch": 1.1, + "grad_norm": 4.371768951416016, + "learning_rate": 1.683783396862137e-06, + "logits/chosen": -0.390146404504776, + "logits/rejected": -0.4747466444969177, + "logps/chosen": -62.443668365478516, + "logps/rejected": -78.7060546875, + "loss": 0.6941, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.027404546737671, + "rewards/margins": 4.920226097106934, + "rewards/rejected": -1.8928213119506836, + "step": 4382 + }, + { + "epoch": 1.1, + "grad_norm": 4.367823123931885, + "learning_rate": 1.6818232284021112e-06, + "logits/chosen": -0.3322974741458893, + "logits/rejected": -0.4310213327407837, + "logps/chosen": -49.154327392578125, + "logps/rejected": -89.18203735351562, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7647924423217773, + "rewards/margins": 6.19492769241333, + "rewards/rejected": -3.4301350116729736, + "step": 4383 + }, + { + "epoch": 1.1, + "grad_norm": 5.080240249633789, + "learning_rate": 1.6798639708532088e-06, + "logits/chosen": -0.27308347821235657, + "logits/rejected": -0.29897239804267883, + "logps/chosen": -49.71345520019531, + "logps/rejected": -98.94183349609375, + "loss": 0.6753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9157843589782715, + "rewards/margins": 4.783969402313232, + "rewards/rejected": -1.8681843280792236, + "step": 4384 + }, + { + "epoch": 1.1, + "grad_norm": 6.272966384887695, + "learning_rate": 1.677905624753291e-06, + "logits/chosen": -0.27659252285957336, + "logits/rejected": -0.37547895312309265, + "logps/chosen": -46.77989959716797, + "logps/rejected": -76.50507354736328, + "loss": 0.654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.763634204864502, + "rewards/margins": 4.550804138183594, + "rewards/rejected": -1.7871698141098022, + "step": 4385 + }, + { + "epoch": 1.1, + "grad_norm": 5.0027971267700195, + "learning_rate": 1.6759481906399643e-06, + "logits/chosen": -0.3132094740867615, + "logits/rejected": -0.46589601039886475, + "logps/chosen": -69.24114990234375, + "logps/rejected": -89.46208190917969, + "loss": 0.7625, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8473947048187256, + "rewards/margins": 5.213715076446533, + "rewards/rejected": -2.3663206100463867, + "step": 4386 + }, + { + "epoch": 1.1, + "grad_norm": 12.37731647491455, + "learning_rate": 1.6739916690505864e-06, + "logits/chosen": -0.31834733486175537, + "logits/rejected": -0.44355177879333496, + "logps/chosen": -56.96834945678711, + "logps/rejected": -94.07258605957031, + "loss": 0.741, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8337550163269043, + "rewards/margins": 4.619509220123291, + "rewards/rejected": -1.7857542037963867, + "step": 4387 + }, + { + "epoch": 1.1, + "grad_norm": 2.699755907058716, + "learning_rate": 1.6720360605222653e-06, + "logits/chosen": -0.4211987257003784, + "logits/rejected": -0.44895511865615845, + "logps/chosen": -43.551666259765625, + "logps/rejected": -85.86529541015625, + "loss": 0.6633, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0088047981262207, + "rewards/margins": 5.367271423339844, + "rewards/rejected": -2.358466863632202, + "step": 4388 + }, + { + "epoch": 1.1, + "grad_norm": 3.103151798248291, + "learning_rate": 1.6700813655918575e-06, + "logits/chosen": -0.4259761571884155, + "logits/rejected": -0.4857392907142639, + "logps/chosen": -55.74003219604492, + "logps/rejected": -92.85470581054688, + "loss": 0.6655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.759058713912964, + "rewards/margins": 5.477415084838867, + "rewards/rejected": -2.7183563709259033, + "step": 4389 + }, + { + "epoch": 1.1, + "grad_norm": 4.108617782592773, + "learning_rate": 1.6681275847959682e-06, + "logits/chosen": -0.39760562777519226, + "logits/rejected": -0.4845333695411682, + "logps/chosen": -47.827186584472656, + "logps/rejected": -75.9204330444336, + "loss": 0.6996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.960580825805664, + "rewards/margins": 4.313116073608398, + "rewards/rejected": -1.3525351285934448, + "step": 4390 + }, + { + "epoch": 1.1, + "grad_norm": 5.579718589782715, + "learning_rate": 1.6661747186709515e-06, + "logits/chosen": -0.4140624403953552, + "logits/rejected": -0.5091043710708618, + "logps/chosen": -47.334617614746094, + "logps/rejected": -78.73422241210938, + "loss": 0.6514, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.802908420562744, + "rewards/margins": 5.2023749351501465, + "rewards/rejected": -2.3994665145874023, + "step": 4391 + }, + { + "epoch": 1.1, + "grad_norm": 2.7854843139648438, + "learning_rate": 1.6642227677529149e-06, + "logits/chosen": -0.3284953832626343, + "logits/rejected": -0.4713062644004822, + "logps/chosen": -52.70359420776367, + "logps/rejected": -79.42237854003906, + "loss": 0.6061, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.199498414993286, + "rewards/margins": 6.147832870483398, + "rewards/rejected": -2.948333978652954, + "step": 4392 + }, + { + "epoch": 1.1, + "grad_norm": 10.607608795166016, + "learning_rate": 1.6622717325777088e-06, + "logits/chosen": -0.29136893153190613, + "logits/rejected": -0.4107690155506134, + "logps/chosen": -51.409034729003906, + "logps/rejected": -80.45994567871094, + "loss": 0.6904, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6336066722869873, + "rewards/margins": 4.601040363311768, + "rewards/rejected": -1.9674339294433594, + "step": 4393 + }, + { + "epoch": 1.1, + "grad_norm": 6.664549827575684, + "learning_rate": 1.6603216136809342e-06, + "logits/chosen": -0.417321115732193, + "logits/rejected": -0.5029060244560242, + "logps/chosen": -66.38394165039062, + "logps/rejected": -83.12509155273438, + "loss": 0.8113, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5585639476776123, + "rewards/margins": 4.365856170654297, + "rewards/rejected": -1.807292103767395, + "step": 4394 + }, + { + "epoch": 1.1, + "grad_norm": 20.93379783630371, + "learning_rate": 1.6583724115979416e-06, + "logits/chosen": -0.3527584671974182, + "logits/rejected": -0.4087565541267395, + "logps/chosen": -66.49290466308594, + "logps/rejected": -94.53988647460938, + "loss": 0.8994, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9939260482788086, + "rewards/margins": 4.41171932220459, + "rewards/rejected": -1.4177932739257812, + "step": 4395 + }, + { + "epoch": 1.1, + "grad_norm": 5.218023777008057, + "learning_rate": 1.656424126863827e-06, + "logits/chosen": -0.35413265228271484, + "logits/rejected": -0.46134012937545776, + "logps/chosen": -58.775909423828125, + "logps/rejected": -87.99690246582031, + "loss": 0.7135, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2201709747314453, + "rewards/margins": 5.238266468048096, + "rewards/rejected": -2.018095016479492, + "step": 4396 + }, + { + "epoch": 1.1, + "grad_norm": 3.8259968757629395, + "learning_rate": 1.6544767600134398e-06, + "logits/chosen": -0.30834075808525085, + "logits/rejected": -0.40247642993927, + "logps/chosen": -51.05644607543945, + "logps/rejected": -91.30471801757812, + "loss": 0.717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.601423501968384, + "rewards/margins": 5.533207893371582, + "rewards/rejected": -2.931784152984619, + "step": 4397 + }, + { + "epoch": 1.1, + "grad_norm": 5.231020450592041, + "learning_rate": 1.652530311581374e-06, + "logits/chosen": -0.4131592810153961, + "logits/rejected": -0.5451380014419556, + "logps/chosen": -53.054046630859375, + "logps/rejected": -68.95941162109375, + "loss": 0.6133, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0487265586853027, + "rewards/margins": 4.732434272766113, + "rewards/rejected": -1.6837077140808105, + "step": 4398 + }, + { + "epoch": 1.1, + "grad_norm": 5.354122638702393, + "learning_rate": 1.6505847821019677e-06, + "logits/chosen": -0.31368309259414673, + "logits/rejected": -0.4518449306488037, + "logps/chosen": -66.04086303710938, + "logps/rejected": -90.29693603515625, + "loss": 0.7689, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.943593740463257, + "rewards/margins": 5.369275093078613, + "rewards/rejected": -2.4256811141967773, + "step": 4399 + }, + { + "epoch": 1.1, + "grad_norm": 5.441140651702881, + "learning_rate": 1.6486401721093143e-06, + "logits/chosen": -0.3593716621398926, + "logits/rejected": -0.5280951857566833, + "logps/chosen": -64.77816009521484, + "logps/rejected": -67.83834838867188, + "loss": 0.7825, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.950948476791382, + "rewards/margins": 4.6802873611450195, + "rewards/rejected": -1.7293391227722168, + "step": 4400 + }, + { + "epoch": 1.1, + "grad_norm": 5.142650604248047, + "learning_rate": 1.646696482137251e-06, + "logits/chosen": -0.28347185254096985, + "logits/rejected": -0.3911832571029663, + "logps/chosen": -54.64549255371094, + "logps/rejected": -83.32819366455078, + "loss": 0.7466, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7700767517089844, + "rewards/margins": 4.757029056549072, + "rewards/rejected": -1.986952543258667, + "step": 4401 + }, + { + "epoch": 1.1, + "grad_norm": 2.8329524993896484, + "learning_rate": 1.6447537127193613e-06, + "logits/chosen": -0.2633349299430847, + "logits/rejected": -0.340157151222229, + "logps/chosen": -54.241851806640625, + "logps/rejected": -92.54743957519531, + "loss": 0.5965, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0938315391540527, + "rewards/margins": 5.331429481506348, + "rewards/rejected": -2.237597703933716, + "step": 4402 + }, + { + "epoch": 1.1, + "grad_norm": 10.178383827209473, + "learning_rate": 1.6428118643889785e-06, + "logits/chosen": -0.33363041281700134, + "logits/rejected": -0.39292192459106445, + "logps/chosen": -53.84540557861328, + "logps/rejected": -81.96897888183594, + "loss": 0.6493, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.029280185699463, + "rewards/margins": 4.573098182678223, + "rewards/rejected": -1.5438179969787598, + "step": 4403 + }, + { + "epoch": 1.1, + "grad_norm": 6.136125564575195, + "learning_rate": 1.6408709376791798e-06, + "logits/chosen": -0.3636936545372009, + "logits/rejected": -0.4566817283630371, + "logps/chosen": -50.453914642333984, + "logps/rejected": -87.28627014160156, + "loss": 0.6996, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8975558280944824, + "rewards/margins": 4.830306053161621, + "rewards/rejected": -1.9327502250671387, + "step": 4404 + }, + { + "epoch": 1.1, + "grad_norm": 5.085086822509766, + "learning_rate": 1.6389309331227943e-06, + "logits/chosen": -0.304058313369751, + "logits/rejected": -0.45588991045951843, + "logps/chosen": -67.99200439453125, + "logps/rejected": -70.82275390625, + "loss": 0.7505, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.170478582382202, + "rewards/margins": 4.559691905975342, + "rewards/rejected": -1.3892133235931396, + "step": 4405 + }, + { + "epoch": 1.1, + "grad_norm": 6.084384918212891, + "learning_rate": 1.6369918512523935e-06, + "logits/chosen": -0.311210572719574, + "logits/rejected": -0.39842942357063293, + "logps/chosen": -61.6552619934082, + "logps/rejected": -76.72991943359375, + "loss": 0.8702, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.823201894760132, + "rewards/margins": 4.465239524841309, + "rewards/rejected": -1.642037272453308, + "step": 4406 + }, + { + "epoch": 1.1, + "grad_norm": 24.53538703918457, + "learning_rate": 1.635053692600298e-06, + "logits/chosen": -0.36744725704193115, + "logits/rejected": -0.4439102113246918, + "logps/chosen": -57.40061950683594, + "logps/rejected": -75.06932067871094, + "loss": 0.7914, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8603837490081787, + "rewards/margins": 3.8395233154296875, + "rewards/rejected": -0.9791396856307983, + "step": 4407 + }, + { + "epoch": 1.1, + "grad_norm": 3.4066290855407715, + "learning_rate": 1.6331164576985737e-06, + "logits/chosen": -0.2979719936847687, + "logits/rejected": -0.3884795010089874, + "logps/chosen": -61.20808410644531, + "logps/rejected": -99.60205078125, + "loss": 0.6547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.784656524658203, + "rewards/margins": 5.120426177978516, + "rewards/rejected": -2.3357691764831543, + "step": 4408 + }, + { + "epoch": 1.1, + "grad_norm": 9.730195999145508, + "learning_rate": 1.6311801470790318e-06, + "logits/chosen": -0.351761132478714, + "logits/rejected": -0.38116806745529175, + "logps/chosen": -50.18598556518555, + "logps/rejected": -86.8309326171875, + "loss": 0.6254, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.926750421524048, + "rewards/margins": 4.4220170974731445, + "rewards/rejected": -1.4952665567398071, + "step": 4409 + }, + { + "epoch": 1.1, + "grad_norm": 6.044548511505127, + "learning_rate": 1.629244761273236e-06, + "logits/chosen": -0.39538541436195374, + "logits/rejected": -0.46392735838890076, + "logps/chosen": -66.17469787597656, + "logps/rejected": -83.83430480957031, + "loss": 0.8606, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7957375049591064, + "rewards/margins": 4.36226749420166, + "rewards/rejected": -1.5665299892425537, + "step": 4410 + }, + { + "epoch": 1.1, + "grad_norm": 2.607187509536743, + "learning_rate": 1.6273103008124845e-06, + "logits/chosen": -0.28646087646484375, + "logits/rejected": -0.35324475169181824, + "logps/chosen": -56.87366485595703, + "logps/rejected": -83.39933776855469, + "loss": 0.6351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0919675827026367, + "rewards/margins": 5.347754001617432, + "rewards/rejected": -2.255786895751953, + "step": 4411 + }, + { + "epoch": 1.1, + "grad_norm": 8.907339096069336, + "learning_rate": 1.6253767662278346e-06, + "logits/chosen": -0.36892321705818176, + "logits/rejected": -0.4051060080528259, + "logps/chosen": -54.27094268798828, + "logps/rejected": -93.16026306152344, + "loss": 0.7632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6509222984313965, + "rewards/margins": 4.2674031257629395, + "rewards/rejected": -1.6164809465408325, + "step": 4412 + }, + { + "epoch": 1.1, + "grad_norm": 5.87972354888916, + "learning_rate": 1.6234441580500815e-06, + "logits/chosen": -0.25017523765563965, + "logits/rejected": -0.33222872018814087, + "logps/chosen": -86.95026397705078, + "logps/rejected": -103.0099105834961, + "loss": 0.9916, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4658236503601074, + "rewards/margins": 4.019622325897217, + "rewards/rejected": -1.553798794746399, + "step": 4413 + }, + { + "epoch": 1.1, + "grad_norm": 7.923832893371582, + "learning_rate": 1.6215124768097667e-06, + "logits/chosen": -0.3204413950443268, + "logits/rejected": -0.41388505697250366, + "logps/chosen": -58.76296615600586, + "logps/rejected": -88.21023559570312, + "loss": 0.7272, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7454073429107666, + "rewards/margins": 4.195938587188721, + "rewards/rejected": -1.4505313634872437, + "step": 4414 + }, + { + "epoch": 1.1, + "grad_norm": 4.563758850097656, + "learning_rate": 1.619581723037179e-06, + "logits/chosen": -0.37724196910858154, + "logits/rejected": -0.44771140813827515, + "logps/chosen": -54.43333435058594, + "logps/rejected": -86.23136901855469, + "loss": 0.7369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.815890312194824, + "rewards/margins": 4.8715033531188965, + "rewards/rejected": -2.0556135177612305, + "step": 4415 + }, + { + "epoch": 1.1, + "grad_norm": 3.948580026626587, + "learning_rate": 1.6176518972623505e-06, + "logits/chosen": -0.3214266300201416, + "logits/rejected": -0.3795386850833893, + "logps/chosen": -67.28076934814453, + "logps/rejected": -99.90386199951172, + "loss": 0.6645, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7075717449188232, + "rewards/margins": 4.955377578735352, + "rewards/rejected": -2.247805118560791, + "step": 4416 + }, + { + "epoch": 1.1, + "grad_norm": 4.417116641998291, + "learning_rate": 1.615723000015063e-06, + "logits/chosen": -0.34025341272354126, + "logits/rejected": -0.46967166662216187, + "logps/chosen": -54.935428619384766, + "logps/rejected": -93.88958740234375, + "loss": 0.6602, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.874912738800049, + "rewards/margins": 5.259859085083008, + "rewards/rejected": -2.384946823120117, + "step": 4417 + }, + { + "epoch": 1.11, + "grad_norm": 6.70435094833374, + "learning_rate": 1.6137950318248407e-06, + "logits/chosen": -0.3125823140144348, + "logits/rejected": -0.33971577882766724, + "logps/chosen": -62.57488250732422, + "logps/rejected": -90.10057067871094, + "loss": 0.8285, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9384608268737793, + "rewards/margins": 4.4020304679870605, + "rewards/rejected": -1.4635698795318604, + "step": 4418 + }, + { + "epoch": 1.11, + "grad_norm": 4.522684097290039, + "learning_rate": 1.6118679932209475e-06, + "logits/chosen": -0.34312704205513, + "logits/rejected": -0.46995094418525696, + "logps/chosen": -53.03290939331055, + "logps/rejected": -66.5785903930664, + "loss": 0.6193, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.001253604888916, + "rewards/margins": 4.745085716247559, + "rewards/rejected": -1.7438325881958008, + "step": 4419 + }, + { + "epoch": 1.11, + "grad_norm": 8.031723976135254, + "learning_rate": 1.6099418847324022e-06, + "logits/chosen": -0.3870304226875305, + "logits/rejected": -0.4735583961009979, + "logps/chosen": -69.10652923583984, + "logps/rejected": -79.33090209960938, + "loss": 0.8433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.987562417984009, + "rewards/margins": 5.080045223236084, + "rewards/rejected": -2.092482566833496, + "step": 4420 + }, + { + "epoch": 1.11, + "grad_norm": 5.603567123413086, + "learning_rate": 1.6080167068879594e-06, + "logits/chosen": -0.36518627405166626, + "logits/rejected": -0.3468689024448395, + "logps/chosen": -48.68411636352539, + "logps/rejected": -89.76228332519531, + "loss": 0.7145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.259669065475464, + "rewards/margins": 4.596979141235352, + "rewards/rejected": -1.3373100757598877, + "step": 4421 + }, + { + "epoch": 1.11, + "grad_norm": 8.048866271972656, + "learning_rate": 1.6060924602161265e-06, + "logits/chosen": -0.3709699511528015, + "logits/rejected": -0.4400106370449066, + "logps/chosen": -66.30890655517578, + "logps/rejected": -81.97370910644531, + "loss": 0.7983, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.553239345550537, + "rewards/margins": 3.8030595779418945, + "rewards/rejected": -1.2498199939727783, + "step": 4422 + }, + { + "epoch": 1.11, + "grad_norm": 6.356619358062744, + "learning_rate": 1.6041691452451468e-06, + "logits/chosen": -0.302760511636734, + "logits/rejected": -0.3919345438480377, + "logps/chosen": -61.663246154785156, + "logps/rejected": -77.23968505859375, + "loss": 0.7542, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.734487771987915, + "rewards/margins": 3.9109089374542236, + "rewards/rejected": -1.1764212846755981, + "step": 4423 + }, + { + "epoch": 1.11, + "grad_norm": 3.7893805503845215, + "learning_rate": 1.6022467625030115e-06, + "logits/chosen": -0.292881041765213, + "logits/rejected": -0.3924241065979004, + "logps/chosen": -54.283931732177734, + "logps/rejected": -81.90884399414062, + "loss": 0.6934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3611626625061035, + "rewards/margins": 5.357604026794434, + "rewards/rejected": -1.9964407682418823, + "step": 4424 + }, + { + "epoch": 1.11, + "grad_norm": 6.0368123054504395, + "learning_rate": 1.6003253125174589e-06, + "logits/chosen": -0.2894129753112793, + "logits/rejected": -0.39119938015937805, + "logps/chosen": -55.35554504394531, + "logps/rejected": -79.87832641601562, + "loss": 0.7649, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7750444412231445, + "rewards/margins": 4.229273319244385, + "rewards/rejected": -1.4542289972305298, + "step": 4425 + }, + { + "epoch": 1.11, + "grad_norm": 4.024890899658203, + "learning_rate": 1.5984047958159675e-06, + "logits/chosen": -0.34718120098114014, + "logits/rejected": -0.4806796908378601, + "logps/chosen": -58.62075424194336, + "logps/rejected": -88.54391479492188, + "loss": 0.6754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0466437339782715, + "rewards/margins": 5.834056377410889, + "rewards/rejected": -2.787412405014038, + "step": 4426 + }, + { + "epoch": 1.11, + "grad_norm": 5.456916332244873, + "learning_rate": 1.5964852129257602e-06, + "logits/chosen": -0.4008343815803528, + "logits/rejected": -0.5120975971221924, + "logps/chosen": -56.70730972290039, + "logps/rejected": -90.25938415527344, + "loss": 0.8008, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8947675228118896, + "rewards/margins": 5.002627849578857, + "rewards/rejected": -2.107860565185547, + "step": 4427 + }, + { + "epoch": 1.11, + "grad_norm": 6.515064239501953, + "learning_rate": 1.5945665643738039e-06, + "logits/chosen": -0.36881959438323975, + "logits/rejected": -0.48978716135025024, + "logps/chosen": -59.84049987792969, + "logps/rejected": -85.5130615234375, + "loss": 0.7678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6808063983917236, + "rewards/margins": 4.517231464385986, + "rewards/rejected": -1.8364254236221313, + "step": 4428 + }, + { + "epoch": 1.11, + "grad_norm": 11.468729019165039, + "learning_rate": 1.5926488506868077e-06, + "logits/chosen": -0.42209917306900024, + "logits/rejected": -0.49533969163894653, + "logps/chosen": -56.577152252197266, + "logps/rejected": -75.87583923339844, + "loss": 0.915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.790308713912964, + "rewards/margins": 4.339274883270264, + "rewards/rejected": -1.548966646194458, + "step": 4429 + }, + { + "epoch": 1.11, + "grad_norm": 3.9139204025268555, + "learning_rate": 1.59073207239123e-06, + "logits/chosen": -0.3635142743587494, + "logits/rejected": -0.46110135316848755, + "logps/chosen": -55.33171844482422, + "logps/rejected": -87.07447052001953, + "loss": 0.731, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9608969688415527, + "rewards/margins": 4.563366889953613, + "rewards/rejected": -1.6024696826934814, + "step": 4430 + }, + { + "epoch": 1.11, + "grad_norm": 3.7130510807037354, + "learning_rate": 1.5888162300132615e-06, + "logits/chosen": -0.37367695569992065, + "logits/rejected": -0.4730725884437561, + "logps/chosen": -41.77154541015625, + "logps/rejected": -68.30441284179688, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.927006721496582, + "rewards/margins": 5.117729187011719, + "rewards/rejected": -2.1907224655151367, + "step": 4431 + }, + { + "epoch": 1.11, + "grad_norm": 3.575655460357666, + "learning_rate": 1.5869013240788466e-06, + "logits/chosen": -0.3380770683288574, + "logits/rejected": -0.3515332341194153, + "logps/chosen": -49.49021530151367, + "logps/rejected": -90.36579895019531, + "loss": 0.6236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9587783813476562, + "rewards/margins": 4.155653953552246, + "rewards/rejected": -1.1968756914138794, + "step": 4432 + }, + { + "epoch": 1.11, + "grad_norm": 4.633801460266113, + "learning_rate": 1.5849873551136674e-06, + "logits/chosen": -0.40842586755752563, + "logits/rejected": -0.5037158727645874, + "logps/chosen": -56.212379455566406, + "logps/rejected": -97.47574615478516, + "loss": 0.6807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7067224979400635, + "rewards/margins": 5.882025241851807, + "rewards/rejected": -3.175302505493164, + "step": 4433 + }, + { + "epoch": 1.11, + "grad_norm": 3.4745004177093506, + "learning_rate": 1.5830743236431494e-06, + "logits/chosen": -0.3282816410064697, + "logits/rejected": -0.3983330726623535, + "logps/chosen": -58.844261169433594, + "logps/rejected": -96.02381896972656, + "loss": 0.6664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0591659545898438, + "rewards/margins": 4.9649200439453125, + "rewards/rejected": -1.9057537317276, + "step": 4434 + }, + { + "epoch": 1.11, + "grad_norm": 4.752220630645752, + "learning_rate": 1.5811622301924611e-06, + "logits/chosen": -0.36417415738105774, + "logits/rejected": -0.46412742137908936, + "logps/chosen": -50.66087341308594, + "logps/rejected": -88.4651107788086, + "loss": 0.6666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9295856952667236, + "rewards/margins": 5.143673896789551, + "rewards/rejected": -2.214087963104248, + "step": 4435 + }, + { + "epoch": 1.11, + "grad_norm": 4.443364143371582, + "learning_rate": 1.5792510752865114e-06, + "logits/chosen": -0.30310118198394775, + "logits/rejected": -0.4023453891277313, + "logps/chosen": -61.12198257446289, + "logps/rejected": -78.26904296875, + "loss": 0.73, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.94148850440979, + "rewards/margins": 3.928342342376709, + "rewards/rejected": -0.9868541955947876, + "step": 4436 + }, + { + "epoch": 1.11, + "grad_norm": 15.911147117614746, + "learning_rate": 1.5773408594499572e-06, + "logits/chosen": -0.3513518273830414, + "logits/rejected": -0.44238170981407166, + "logps/chosen": -60.701168060302734, + "logps/rejected": -86.76343536376953, + "loss": 0.7268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8777036666870117, + "rewards/margins": 4.717183589935303, + "rewards/rejected": -1.8394802808761597, + "step": 4437 + }, + { + "epoch": 1.11, + "grad_norm": 9.92546272277832, + "learning_rate": 1.5754315832071926e-06, + "logits/chosen": -0.3492463231086731, + "logits/rejected": -0.44844698905944824, + "logps/chosen": -45.11328887939453, + "logps/rejected": -93.11308288574219, + "loss": 0.535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0430104732513428, + "rewards/margins": 5.922935485839844, + "rewards/rejected": -2.879924774169922, + "step": 4438 + }, + { + "epoch": 1.11, + "grad_norm": 3.488107681274414, + "learning_rate": 1.5735232470823547e-06, + "logits/chosen": -0.2841607332229614, + "logits/rejected": -0.37095338106155396, + "logps/chosen": -58.193328857421875, + "logps/rejected": -83.24053192138672, + "loss": 0.6807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7861905097961426, + "rewards/margins": 4.178165912628174, + "rewards/rejected": -1.3919754028320312, + "step": 4439 + }, + { + "epoch": 1.11, + "grad_norm": 4.703873634338379, + "learning_rate": 1.5716158515993223e-06, + "logits/chosen": -0.34799063205718994, + "logits/rejected": -0.37375882267951965, + "logps/chosen": -56.14617156982422, + "logps/rejected": -85.51827239990234, + "loss": 0.7144, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9443795680999756, + "rewards/margins": 3.944974660873413, + "rewards/rejected": -1.0005950927734375, + "step": 4440 + }, + { + "epoch": 1.11, + "grad_norm": 4.133157730102539, + "learning_rate": 1.5697093972817163e-06, + "logits/chosen": -0.3292989134788513, + "logits/rejected": -0.4105397164821625, + "logps/chosen": -69.4564437866211, + "logps/rejected": -80.24684143066406, + "loss": 0.8432, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.23774790763855, + "rewards/margins": 4.4156718254089355, + "rewards/rejected": -1.1779239177703857, + "step": 4441 + }, + { + "epoch": 1.11, + "grad_norm": 4.6700615882873535, + "learning_rate": 1.5678038846529037e-06, + "logits/chosen": -0.3031418025493622, + "logits/rejected": -0.41086041927337646, + "logps/chosen": -58.26359176635742, + "logps/rejected": -88.11466217041016, + "loss": 0.721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8486671447753906, + "rewards/margins": 4.897909164428711, + "rewards/rejected": -2.0492422580718994, + "step": 4442 + }, + { + "epoch": 1.11, + "grad_norm": 6.409544944763184, + "learning_rate": 1.565899314235984e-06, + "logits/chosen": -0.3304237127304077, + "logits/rejected": -0.37045466899871826, + "logps/chosen": -58.877159118652344, + "logps/rejected": -72.24054718017578, + "loss": 0.7904, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2039265632629395, + "rewards/margins": 3.4023656845092773, + "rewards/rejected": -0.19843892753124237, + "step": 4443 + }, + { + "epoch": 1.11, + "grad_norm": 12.04220199584961, + "learning_rate": 1.563995686553803e-06, + "logits/chosen": -0.32896560430526733, + "logits/rejected": -0.4709393084049225, + "logps/chosen": -68.72977447509766, + "logps/rejected": -67.81986999511719, + "loss": 0.9281, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.797804594039917, + "rewards/margins": 4.164000988006592, + "rewards/rejected": -1.3661961555480957, + "step": 4444 + }, + { + "epoch": 1.11, + "grad_norm": 5.64880895614624, + "learning_rate": 1.5620930021289509e-06, + "logits/chosen": -0.3333706855773926, + "logits/rejected": -0.46800321340560913, + "logps/chosen": -58.49060821533203, + "logps/rejected": -75.26055908203125, + "loss": 0.6514, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6639821529388428, + "rewards/margins": 4.558923721313477, + "rewards/rejected": -1.8949412107467651, + "step": 4445 + }, + { + "epoch": 1.11, + "grad_norm": 3.6073787212371826, + "learning_rate": 1.560191261483754e-06, + "logits/chosen": -0.35327836871147156, + "logits/rejected": -0.3777444660663605, + "logps/chosen": -56.67072296142578, + "logps/rejected": -104.05757141113281, + "loss": 0.7205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.915497303009033, + "rewards/margins": 5.15519380569458, + "rewards/rejected": -2.239696502685547, + "step": 4446 + }, + { + "epoch": 1.11, + "grad_norm": 5.093123912811279, + "learning_rate": 1.5582904651402807e-06, + "logits/chosen": -0.27656131982803345, + "logits/rejected": -0.3384213149547577, + "logps/chosen": -53.11421203613281, + "logps/rejected": -78.1388931274414, + "loss": 0.7771, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8018178939819336, + "rewards/margins": 4.288242340087891, + "rewards/rejected": -1.486424207687378, + "step": 4447 + }, + { + "epoch": 1.11, + "grad_norm": 3.5938265323638916, + "learning_rate": 1.556390613620341e-06, + "logits/chosen": -0.24233339726924896, + "logits/rejected": -0.3403266668319702, + "logps/chosen": -69.92640686035156, + "logps/rejected": -99.93770599365234, + "loss": 0.7807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.288818120956421, + "rewards/margins": 4.393735408782959, + "rewards/rejected": -1.104917287826538, + "step": 4448 + }, + { + "epoch": 1.11, + "grad_norm": 3.2176928520202637, + "learning_rate": 1.554491707445484e-06, + "logits/chosen": -0.3671697974205017, + "logits/rejected": -0.4546121656894684, + "logps/chosen": -51.1987190246582, + "logps/rejected": -75.57758331298828, + "loss": 0.6809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1264734268188477, + "rewards/margins": 5.348092079162598, + "rewards/rejected": -2.221618413925171, + "step": 4449 + }, + { + "epoch": 1.11, + "grad_norm": 4.422471523284912, + "learning_rate": 1.5525937471370028e-06, + "logits/chosen": -0.34669029712677, + "logits/rejected": -0.42529797554016113, + "logps/chosen": -44.86939239501953, + "logps/rejected": -72.2042007446289, + "loss": 0.6694, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.67271089553833, + "rewards/margins": 3.8708748817443848, + "rewards/rejected": -1.1981638669967651, + "step": 4450 + }, + { + "epoch": 1.11, + "grad_norm": 10.531079292297363, + "learning_rate": 1.5506967332159266e-06, + "logits/chosen": -0.3050258755683899, + "logits/rejected": -0.3912295699119568, + "logps/chosen": -57.061798095703125, + "logps/rejected": -90.9332046508789, + "loss": 0.971, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.780911445617676, + "rewards/margins": 4.618707656860352, + "rewards/rejected": -1.8377964496612549, + "step": 4451 + }, + { + "epoch": 1.11, + "grad_norm": 5.062699794769287, + "learning_rate": 1.548800666203028e-06, + "logits/chosen": -0.3611386716365814, + "logits/rejected": -0.46439874172210693, + "logps/chosen": -52.96234893798828, + "logps/rejected": -92.60292053222656, + "loss": 0.6957, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9537882804870605, + "rewards/margins": 5.6970038414001465, + "rewards/rejected": -2.7432165145874023, + "step": 4452 + }, + { + "epoch": 1.11, + "grad_norm": 8.281811714172363, + "learning_rate": 1.546905546618817e-06, + "logits/chosen": -0.3133040964603424, + "logits/rejected": -0.4286770522594452, + "logps/chosen": -60.551658630371094, + "logps/rejected": -83.24174499511719, + "loss": 0.7844, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8806309700012207, + "rewards/margins": 4.618902683258057, + "rewards/rejected": -1.738271713256836, + "step": 4453 + }, + { + "epoch": 1.11, + "grad_norm": 3.4410574436187744, + "learning_rate": 1.5450113749835444e-06, + "logits/chosen": -0.4399048984050751, + "logits/rejected": -0.4864692687988281, + "logps/chosen": -51.202484130859375, + "logps/rejected": -88.23895263671875, + "loss": 0.6666, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.056267023086548, + "rewards/margins": 5.2099151611328125, + "rewards/rejected": -2.1536481380462646, + "step": 4454 + }, + { + "epoch": 1.11, + "grad_norm": 4.885331630706787, + "learning_rate": 1.543118151817205e-06, + "logits/chosen": -0.3663216829299927, + "logits/rejected": -0.48192572593688965, + "logps/chosen": -67.38915252685547, + "logps/rejected": -105.44949340820312, + "loss": 0.7835, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9775137901306152, + "rewards/margins": 5.148646831512451, + "rewards/rejected": -2.171133279800415, + "step": 4455 + }, + { + "epoch": 1.11, + "grad_norm": 5.534002304077148, + "learning_rate": 1.5412258776395233e-06, + "logits/chosen": -0.37627914547920227, + "logits/rejected": -0.448275625705719, + "logps/chosen": -54.999717712402344, + "logps/rejected": -81.43444061279297, + "loss": 0.728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9832041263580322, + "rewards/margins": 3.810281276702881, + "rewards/rejected": -0.8270769715309143, + "step": 4456 + }, + { + "epoch": 1.11, + "grad_norm": 5.553347587585449, + "learning_rate": 1.5393345529699743e-06, + "logits/chosen": -0.28967544436454773, + "logits/rejected": -0.3200855255126953, + "logps/chosen": -56.75117492675781, + "logps/rejected": -93.0791244506836, + "loss": 0.7561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.00654673576355, + "rewards/margins": 4.275190830230713, + "rewards/rejected": -1.268644094467163, + "step": 4457 + }, + { + "epoch": 1.12, + "grad_norm": 7.836155414581299, + "learning_rate": 1.5374441783277656e-06, + "logits/chosen": -0.31990164518356323, + "logits/rejected": -0.39095866680145264, + "logps/chosen": -62.097572326660156, + "logps/rejected": -85.0839614868164, + "loss": 0.8038, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7246623039245605, + "rewards/margins": 4.266100883483887, + "rewards/rejected": -1.5414382219314575, + "step": 4458 + }, + { + "epoch": 1.12, + "grad_norm": 5.053718566894531, + "learning_rate": 1.535554754231846e-06, + "logits/chosen": -0.3976019620895386, + "logits/rejected": -0.5155780911445618, + "logps/chosen": -58.502437591552734, + "logps/rejected": -68.0837631225586, + "loss": 0.7503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5388619899749756, + "rewards/margins": 4.671195983886719, + "rewards/rejected": -2.132333755493164, + "step": 4459 + }, + { + "epoch": 1.12, + "grad_norm": 4.430649757385254, + "learning_rate": 1.5336662812009035e-06, + "logits/chosen": -0.3023119568824768, + "logits/rejected": -0.47753986716270447, + "logps/chosen": -63.281375885009766, + "logps/rejected": -74.48239135742188, + "loss": 0.7134, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5541558265686035, + "rewards/margins": 4.9170145988464355, + "rewards/rejected": -2.362858772277832, + "step": 4460 + }, + { + "epoch": 1.12, + "grad_norm": 5.679043292999268, + "learning_rate": 1.5317787597533623e-06, + "logits/chosen": -0.3012605905532837, + "logits/rejected": -0.42628759145736694, + "logps/chosen": -59.45018768310547, + "logps/rejected": -76.55574035644531, + "loss": 0.7002, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6736416816711426, + "rewards/margins": 4.6445441246032715, + "rewards/rejected": -1.9709025621414185, + "step": 4461 + }, + { + "epoch": 1.12, + "grad_norm": 9.353181838989258, + "learning_rate": 1.5298921904073921e-06, + "logits/chosen": -0.3336891829967499, + "logits/rejected": -0.4770545959472656, + "logps/chosen": -64.13755798339844, + "logps/rejected": -78.08082580566406, + "loss": 0.7427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7394609451293945, + "rewards/margins": 5.0096435546875, + "rewards/rejected": -2.2701828479766846, + "step": 4462 + }, + { + "epoch": 1.12, + "grad_norm": 4.533186912536621, + "learning_rate": 1.5280065736808957e-06, + "logits/chosen": -0.37965041399002075, + "logits/rejected": -0.4544525146484375, + "logps/chosen": -51.251670837402344, + "logps/rejected": -88.37605285644531, + "loss": 0.6752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1751961708068848, + "rewards/margins": 4.673139572143555, + "rewards/rejected": -1.4979432821273804, + "step": 4463 + }, + { + "epoch": 1.12, + "grad_norm": 4.129032611846924, + "learning_rate": 1.5261219100915115e-06, + "logits/chosen": -0.3246613144874573, + "logits/rejected": -0.39115428924560547, + "logps/chosen": -52.158180236816406, + "logps/rejected": -85.5875244140625, + "loss": 0.69, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9721596240997314, + "rewards/margins": 4.669988632202148, + "rewards/rejected": -1.697829246520996, + "step": 4464 + }, + { + "epoch": 1.12, + "grad_norm": 18.475467681884766, + "learning_rate": 1.5242382001566253e-06, + "logits/chosen": -0.32064980268478394, + "logits/rejected": -0.40853142738342285, + "logps/chosen": -53.04148864746094, + "logps/rejected": -75.71659851074219, + "loss": 0.7001, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5707366466522217, + "rewards/margins": 5.081023693084717, + "rewards/rejected": -2.510287284851074, + "step": 4465 + }, + { + "epoch": 1.12, + "grad_norm": 10.081520080566406, + "learning_rate": 1.522355444393352e-06, + "logits/chosen": -0.37786614894866943, + "logits/rejected": -0.4901158809661865, + "logps/chosen": -47.02236557006836, + "logps/rejected": -85.38249969482422, + "loss": 0.6079, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0834667682647705, + "rewards/margins": 5.177465915679932, + "rewards/rejected": -2.0939993858337402, + "step": 4466 + }, + { + "epoch": 1.12, + "grad_norm": 5.9933085441589355, + "learning_rate": 1.5204736433185546e-06, + "logits/chosen": -0.3828568756580353, + "logits/rejected": -0.47937965393066406, + "logps/chosen": -58.3802604675293, + "logps/rejected": -88.99530792236328, + "loss": 0.8243, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.868940830230713, + "rewards/margins": 4.8601837158203125, + "rewards/rejected": -1.9912431240081787, + "step": 4467 + }, + { + "epoch": 1.12, + "grad_norm": 7.081972122192383, + "learning_rate": 1.5185927974488224e-06, + "logits/chosen": -0.277968168258667, + "logits/rejected": -0.42186006903648376, + "logps/chosen": -56.839080810546875, + "logps/rejected": -84.77029418945312, + "loss": 0.6701, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3656768798828125, + "rewards/margins": 5.265214443206787, + "rewards/rejected": -2.8995375633239746, + "step": 4468 + }, + { + "epoch": 1.12, + "grad_norm": 4.196554660797119, + "learning_rate": 1.516712907300489e-06, + "logits/chosen": -0.4786282479763031, + "logits/rejected": -0.5370029211044312, + "logps/chosen": -53.87424850463867, + "logps/rejected": -92.47669982910156, + "loss": 0.7457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.819960594177246, + "rewards/margins": 5.259853363037109, + "rewards/rejected": -2.4398930072784424, + "step": 4469 + }, + { + "epoch": 1.12, + "grad_norm": 3.7816357612609863, + "learning_rate": 1.5148339733896273e-06, + "logits/chosen": -0.2907851040363312, + "logits/rejected": -0.4179548919200897, + "logps/chosen": -57.10840606689453, + "logps/rejected": -89.78166198730469, + "loss": 0.7175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.849447011947632, + "rewards/margins": 5.931065559387207, + "rewards/rejected": -3.081618547439575, + "step": 4470 + }, + { + "epoch": 1.12, + "grad_norm": 5.735619068145752, + "learning_rate": 1.5129559962320434e-06, + "logits/chosen": -0.30665403604507446, + "logits/rejected": -0.39838048815727234, + "logps/chosen": -55.7236328125, + "logps/rejected": -91.85696411132812, + "loss": 0.7169, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6737616062164307, + "rewards/margins": 4.860828399658203, + "rewards/rejected": -2.1870665550231934, + "step": 4471 + }, + { + "epoch": 1.12, + "grad_norm": 6.976410865783691, + "learning_rate": 1.5110789763432832e-06, + "logits/chosen": -0.3304617404937744, + "logits/rejected": -0.42130348086357117, + "logps/chosen": -52.5281982421875, + "logps/rejected": -104.25944519042969, + "loss": 0.8961, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.600314140319824, + "rewards/margins": 3.5884616374969482, + "rewards/rejected": -0.9881476759910583, + "step": 4472 + }, + { + "epoch": 1.12, + "grad_norm": 3.995742082595825, + "learning_rate": 1.5092029142386294e-06, + "logits/chosen": -0.3703608810901642, + "logits/rejected": -0.42256292700767517, + "logps/chosen": -48.77545166015625, + "logps/rejected": -103.49917602539062, + "loss": 0.6544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.013373374938965, + "rewards/margins": 4.995274543762207, + "rewards/rejected": -1.9819011688232422, + "step": 4473 + }, + { + "epoch": 1.12, + "grad_norm": 7.968585968017578, + "learning_rate": 1.507327810433099e-06, + "logits/chosen": -0.36285465955734253, + "logits/rejected": -0.4636435806751251, + "logps/chosen": -57.99388885498047, + "logps/rejected": -80.98938751220703, + "loss": 0.7338, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.631232261657715, + "rewards/margins": 4.63150691986084, + "rewards/rejected": -2.000274896621704, + "step": 4474 + }, + { + "epoch": 1.12, + "grad_norm": 5.863402366638184, + "learning_rate": 1.5054536654414543e-06, + "logits/chosen": -0.357761949300766, + "logits/rejected": -0.43842098116874695, + "logps/chosen": -55.34452819824219, + "logps/rejected": -90.81034088134766, + "loss": 0.781, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.251737117767334, + "rewards/margins": 5.225742340087891, + "rewards/rejected": -1.974005103111267, + "step": 4475 + }, + { + "epoch": 1.12, + "grad_norm": 5.441147327423096, + "learning_rate": 1.5035804797781811e-06, + "logits/chosen": -0.3946985900402069, + "logits/rejected": -0.47642165422439575, + "logps/chosen": -57.542320251464844, + "logps/rejected": -81.53321075439453, + "loss": 0.7415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.964632987976074, + "rewards/margins": 4.431842803955078, + "rewards/rejected": -1.467209815979004, + "step": 4476 + }, + { + "epoch": 1.12, + "grad_norm": 9.801491737365723, + "learning_rate": 1.501708253957515e-06, + "logits/chosen": -0.38106343150138855, + "logits/rejected": -0.5441086888313293, + "logps/chosen": -50.86177444458008, + "logps/rejected": -79.85554504394531, + "loss": 0.7912, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.047780752182007, + "rewards/margins": 5.7715229988098145, + "rewards/rejected": -2.7237424850463867, + "step": 4477 + }, + { + "epoch": 1.12, + "grad_norm": 3.5550272464752197, + "learning_rate": 1.49983698849342e-06, + "logits/chosen": -0.3588532507419586, + "logits/rejected": -0.46305394172668457, + "logps/chosen": -56.60044860839844, + "logps/rejected": -86.88838958740234, + "loss": 0.6883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.289212226867676, + "rewards/margins": 6.1114397048950195, + "rewards/rejected": -2.822227716445923, + "step": 4478 + }, + { + "epoch": 1.12, + "grad_norm": 5.925077438354492, + "learning_rate": 1.4979666838995992e-06, + "logits/chosen": -0.34951889514923096, + "logits/rejected": -0.42454928159713745, + "logps/chosen": -50.018836975097656, + "logps/rejected": -73.47293090820312, + "loss": 0.8327, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7973196506500244, + "rewards/margins": 4.504039764404297, + "rewards/rejected": -1.7067203521728516, + "step": 4479 + }, + { + "epoch": 1.12, + "grad_norm": 5.567927360534668, + "learning_rate": 1.4960973406894919e-06, + "logits/chosen": -0.33480721712112427, + "logits/rejected": -0.3642924129962921, + "logps/chosen": -61.70721435546875, + "logps/rejected": -87.31488037109375, + "loss": 0.8621, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.153594493865967, + "rewards/margins": 4.123947620391846, + "rewards/rejected": -0.9703528881072998, + "step": 4480 + }, + { + "epoch": 1.12, + "grad_norm": 5.308713436126709, + "learning_rate": 1.494228959376271e-06, + "logits/chosen": -0.3412958085536957, + "logits/rejected": -0.44083866477012634, + "logps/chosen": -66.50105285644531, + "logps/rejected": -73.5483169555664, + "loss": 0.6934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0977401733398438, + "rewards/margins": 5.406761169433594, + "rewards/rejected": -2.309021472930908, + "step": 4481 + }, + { + "epoch": 1.12, + "grad_norm": 5.065948963165283, + "learning_rate": 1.4923615404728514e-06, + "logits/chosen": -0.3845272660255432, + "logits/rejected": -0.4783765971660614, + "logps/chosen": -51.6090087890625, + "logps/rejected": -73.18504333496094, + "loss": 0.7252, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.06988525390625, + "rewards/margins": 4.530974864959717, + "rewards/rejected": -1.461089015007019, + "step": 4482 + }, + { + "epoch": 1.12, + "grad_norm": 5.348272800445557, + "learning_rate": 1.4904950844918775e-06, + "logits/chosen": -0.3077831268310547, + "logits/rejected": -0.36733555793762207, + "logps/chosen": -55.59980010986328, + "logps/rejected": -92.57249450683594, + "loss": 0.6686, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.983678102493286, + "rewards/margins": 4.235353946685791, + "rewards/rejected": -1.2516759634017944, + "step": 4483 + }, + { + "epoch": 1.12, + "grad_norm": 4.8209428787231445, + "learning_rate": 1.4886295919457317e-06, + "logits/chosen": -0.2784004211425781, + "logits/rejected": -0.40224775671958923, + "logps/chosen": -52.863739013671875, + "logps/rejected": -73.67267608642578, + "loss": 0.724, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.853827953338623, + "rewards/margins": 4.535919189453125, + "rewards/rejected": -1.682091236114502, + "step": 4484 + }, + { + "epoch": 1.12, + "grad_norm": 6.119071960449219, + "learning_rate": 1.4867650633465325e-06, + "logits/chosen": -0.37997928261756897, + "logits/rejected": -0.4752989113330841, + "logps/chosen": -57.005836486816406, + "logps/rejected": -69.87147521972656, + "loss": 0.7128, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7625937461853027, + "rewards/margins": 3.971731662750244, + "rewards/rejected": -1.2091381549835205, + "step": 4485 + }, + { + "epoch": 1.12, + "grad_norm": 5.525485515594482, + "learning_rate": 1.4849014992061316e-06, + "logits/chosen": -0.33926820755004883, + "logits/rejected": -0.39846107363700867, + "logps/chosen": -48.488189697265625, + "logps/rejected": -72.69963836669922, + "loss": 0.8106, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8386499881744385, + "rewards/margins": 3.8387868404388428, + "rewards/rejected": -1.0001370906829834, + "step": 4486 + }, + { + "epoch": 1.12, + "grad_norm": 4.554540634155273, + "learning_rate": 1.4830389000361223e-06, + "logits/chosen": -0.4089178442955017, + "logits/rejected": -0.4377375841140747, + "logps/chosen": -44.49706268310547, + "logps/rejected": -88.27519989013672, + "loss": 0.7075, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.309971332550049, + "rewards/margins": 5.739005088806152, + "rewards/rejected": -2.4290339946746826, + "step": 4487 + }, + { + "epoch": 1.12, + "grad_norm": 3.046135663986206, + "learning_rate": 1.481177266347823e-06, + "logits/chosen": -0.2788293659687042, + "logits/rejected": -0.3519558012485504, + "logps/chosen": -58.05596923828125, + "logps/rejected": -92.42454528808594, + "loss": 0.7187, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8707637786865234, + "rewards/margins": 4.879362106323242, + "rewards/rejected": -2.0085980892181396, + "step": 4488 + }, + { + "epoch": 1.12, + "grad_norm": 5.624329566955566, + "learning_rate": 1.479316598652293e-06, + "logits/chosen": -0.40985873341560364, + "logits/rejected": -0.4316641688346863, + "logps/chosen": -48.291297912597656, + "logps/rejected": -93.559814453125, + "loss": 0.7264, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9818131923675537, + "rewards/margins": 5.225864887237549, + "rewards/rejected": -2.244051694869995, + "step": 4489 + }, + { + "epoch": 1.12, + "grad_norm": 7.095492839813232, + "learning_rate": 1.4774568974603288e-06, + "logits/chosen": -0.3835124969482422, + "logits/rejected": -0.4640114903450012, + "logps/chosen": -62.74442672729492, + "logps/rejected": -90.41463470458984, + "loss": 0.7954, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.727450132369995, + "rewards/margins": 4.949645042419434, + "rewards/rejected": -2.2221951484680176, + "step": 4490 + }, + { + "epoch": 1.12, + "grad_norm": 3.744816303253174, + "learning_rate": 1.4755981632824562e-06, + "logits/chosen": -0.38404417037963867, + "logits/rejected": -0.5125284194946289, + "logps/chosen": -54.06745529174805, + "logps/rejected": -69.2630386352539, + "loss": 0.6375, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9849276542663574, + "rewards/margins": 5.182670593261719, + "rewards/rejected": -2.1977434158325195, + "step": 4491 + }, + { + "epoch": 1.12, + "grad_norm": 6.2285332679748535, + "learning_rate": 1.4737403966289387e-06, + "logits/chosen": -0.40625515580177307, + "logits/rejected": -0.49448907375335693, + "logps/chosen": -49.89889907836914, + "logps/rejected": -77.68794250488281, + "loss": 0.7873, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7538506984710693, + "rewards/margins": 4.126439571380615, + "rewards/rejected": -1.372589349746704, + "step": 4492 + }, + { + "epoch": 1.12, + "grad_norm": 6.094670295715332, + "learning_rate": 1.471883598009773e-06, + "logits/chosen": -0.2538633346557617, + "logits/rejected": -0.3990899324417114, + "logps/chosen": -77.1436538696289, + "logps/rejected": -82.47296905517578, + "loss": 0.8931, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.907977819442749, + "rewards/margins": 4.180635452270508, + "rewards/rejected": -1.2726577520370483, + "step": 4493 + }, + { + "epoch": 1.12, + "grad_norm": 4.8173747062683105, + "learning_rate": 1.4700277679346885e-06, + "logits/chosen": -0.28343465924263, + "logits/rejected": -0.38569968938827515, + "logps/chosen": -58.717926025390625, + "logps/rejected": -89.77037048339844, + "loss": 0.7809, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9844810962677, + "rewards/margins": 5.036481857299805, + "rewards/rejected": -2.0520009994506836, + "step": 4494 + }, + { + "epoch": 1.12, + "grad_norm": 12.224637985229492, + "learning_rate": 1.4681729069131544e-06, + "logits/chosen": -0.27336373925209045, + "logits/rejected": -0.40394487977027893, + "logps/chosen": -68.0528793334961, + "logps/rejected": -86.23792266845703, + "loss": 0.961, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6400601863861084, + "rewards/margins": 4.088018417358398, + "rewards/rejected": -1.4479585886001587, + "step": 4495 + }, + { + "epoch": 1.12, + "grad_norm": 6.115976810455322, + "learning_rate": 1.4663190154543683e-06, + "logits/chosen": -0.24279721081256866, + "logits/rejected": -0.3382014334201813, + "logps/chosen": -61.40651321411133, + "logps/rejected": -86.70891571044922, + "loss": 0.6658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.013361930847168, + "rewards/margins": 4.4455485343933105, + "rewards/rejected": -1.4321866035461426, + "step": 4496 + }, + { + "epoch": 1.12, + "grad_norm": 2.4658031463623047, + "learning_rate": 1.4644660940672628e-06, + "logits/chosen": -0.3472750186920166, + "logits/rejected": -0.43723952770233154, + "logps/chosen": -66.6992416381836, + "logps/rejected": -94.98121643066406, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.851973295211792, + "rewards/margins": 5.20509147644043, + "rewards/rejected": -2.3531181812286377, + "step": 4497 + }, + { + "epoch": 1.13, + "grad_norm": 4.8404765129089355, + "learning_rate": 1.4626141432605058e-06, + "logits/chosen": -0.3670569062232971, + "logits/rejected": -0.4168154001235962, + "logps/chosen": -48.58634948730469, + "logps/rejected": -79.86996459960938, + "loss": 0.7766, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.915813446044922, + "rewards/margins": 4.117086887359619, + "rewards/rejected": -1.2012730836868286, + "step": 4498 + }, + { + "epoch": 1.13, + "grad_norm": 10.243102073669434, + "learning_rate": 1.4607631635424968e-06, + "logits/chosen": -0.3805806040763855, + "logits/rejected": -0.49326831102371216, + "logps/chosen": -48.93988037109375, + "logps/rejected": -91.4434814453125, + "loss": 0.6432, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8415403366088867, + "rewards/margins": 5.771031379699707, + "rewards/rejected": -2.929490566253662, + "step": 4499 + }, + { + "epoch": 1.13, + "grad_norm": 13.027438163757324, + "learning_rate": 1.4589131554213704e-06, + "logits/chosen": -0.29742416739463806, + "logits/rejected": -0.40539586544036865, + "logps/chosen": -55.58606719970703, + "logps/rejected": -88.6683349609375, + "loss": 0.8636, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.829148292541504, + "rewards/margins": 5.637715816497803, + "rewards/rejected": -2.8085672855377197, + "step": 4500 + }, + { + "epoch": 1.13, + "grad_norm": 6.44842004776001, + "learning_rate": 1.4570641194049916e-06, + "logits/chosen": -0.3282884359359741, + "logits/rejected": -0.413730263710022, + "logps/chosen": -51.07051467895508, + "logps/rejected": -83.10784149169922, + "loss": 0.6705, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4875171184539795, + "rewards/margins": 4.733848571777344, + "rewards/rejected": -2.2463319301605225, + "step": 4501 + }, + { + "epoch": 1.13, + "grad_norm": 7.936162948608398, + "learning_rate": 1.4552160560009642e-06, + "logits/chosen": -0.25876861810684204, + "logits/rejected": -0.34781432151794434, + "logps/chosen": -61.46883773803711, + "logps/rejected": -86.03445434570312, + "loss": 0.7531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9059722423553467, + "rewards/margins": 4.831234931945801, + "rewards/rejected": -1.9252629280090332, + "step": 4502 + }, + { + "epoch": 1.13, + "grad_norm": 6.408934116363525, + "learning_rate": 1.45336896571662e-06, + "logits/chosen": -0.33016863465309143, + "logits/rejected": -0.49878811836242676, + "logps/chosen": -62.562557220458984, + "logps/rejected": -77.90904235839844, + "loss": 0.7729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6743240356445312, + "rewards/margins": 5.689528942108154, + "rewards/rejected": -3.015204668045044, + "step": 4503 + }, + { + "epoch": 1.13, + "grad_norm": 5.970655918121338, + "learning_rate": 1.451522849059025e-06, + "logits/chosen": -0.4350453019142151, + "logits/rejected": -0.511436939239502, + "logps/chosen": -54.62342834472656, + "logps/rejected": -74.45863342285156, + "loss": 0.6967, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098381280899048, + "rewards/margins": 4.49820613861084, + "rewards/rejected": -1.3998249769210815, + "step": 4504 + }, + { + "epoch": 1.13, + "grad_norm": 5.854491233825684, + "learning_rate": 1.4496777065349783e-06, + "logits/chosen": -0.35229411721229553, + "logits/rejected": -0.37213438749313354, + "logps/chosen": -43.42579650878906, + "logps/rejected": -98.44979858398438, + "loss": 0.731, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0089919567108154, + "rewards/margins": 5.019488334655762, + "rewards/rejected": -2.0104966163635254, + "step": 4505 + }, + { + "epoch": 1.13, + "grad_norm": 20.027429580688477, + "learning_rate": 1.447833538651009e-06, + "logits/chosen": -0.3126734793186188, + "logits/rejected": -0.37006843090057373, + "logps/chosen": -55.3729133605957, + "logps/rejected": -87.11398315429688, + "loss": 0.8127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9670841693878174, + "rewards/margins": 4.942331314086914, + "rewards/rejected": -1.9752470254898071, + "step": 4506 + }, + { + "epoch": 1.13, + "grad_norm": 3.558424949645996, + "learning_rate": 1.4459903459133845e-06, + "logits/chosen": -0.36415421962738037, + "logits/rejected": -0.48489829897880554, + "logps/chosen": -48.70866775512695, + "logps/rejected": -96.99301147460938, + "loss": 0.6214, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.879730701446533, + "rewards/margins": 5.990728378295898, + "rewards/rejected": -3.1109986305236816, + "step": 4507 + }, + { + "epoch": 1.13, + "grad_norm": 6.918638706207275, + "learning_rate": 1.444148128828101e-06, + "logits/chosen": -0.3429895341396332, + "logits/rejected": -0.42789220809936523, + "logps/chosen": -65.33304595947266, + "logps/rejected": -103.41900634765625, + "loss": 0.7742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8582229614257812, + "rewards/margins": 5.9980974197387695, + "rewards/rejected": -3.1398744583129883, + "step": 4508 + }, + { + "epoch": 1.13, + "grad_norm": 4.137506484985352, + "learning_rate": 1.442306887900883e-06, + "logits/chosen": -0.3609074354171753, + "logits/rejected": -0.44663071632385254, + "logps/chosen": -55.18307876586914, + "logps/rejected": -83.99302673339844, + "loss": 0.7065, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.136249303817749, + "rewards/margins": 4.68936014175415, + "rewards/rejected": -1.55311119556427, + "step": 4509 + }, + { + "epoch": 1.13, + "grad_norm": 4.6258978843688965, + "learning_rate": 1.4404666236371945e-06, + "logits/chosen": -0.4207655191421509, + "logits/rejected": -0.44226881861686707, + "logps/chosen": -65.35623931884766, + "logps/rejected": -95.9000473022461, + "loss": 0.7946, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2480568885803223, + "rewards/margins": 4.662343978881836, + "rewards/rejected": -1.4142868518829346, + "step": 4510 + }, + { + "epoch": 1.13, + "grad_norm": 3.3563754558563232, + "learning_rate": 1.438627336542226e-06, + "logits/chosen": -0.3077186346054077, + "logits/rejected": -0.4285402297973633, + "logps/chosen": -64.85523223876953, + "logps/rejected": -96.99395751953125, + "loss": 0.7731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7683944702148438, + "rewards/margins": 6.415643215179443, + "rewards/rejected": -3.6472482681274414, + "step": 4511 + }, + { + "epoch": 1.13, + "grad_norm": 3.4346961975097656, + "learning_rate": 1.4367890271209024e-06, + "logits/chosen": -0.39941272139549255, + "logits/rejected": -0.46891316771507263, + "logps/chosen": -52.362953186035156, + "logps/rejected": -91.57271575927734, + "loss": 0.6909, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.283755302429199, + "rewards/margins": 5.163596153259277, + "rewards/rejected": -1.8798407316207886, + "step": 4512 + }, + { + "epoch": 1.13, + "grad_norm": 2.973238706588745, + "learning_rate": 1.4349516958778787e-06, + "logits/chosen": -0.37765443325042725, + "logits/rejected": -0.520491898059845, + "logps/chosen": -53.321434020996094, + "logps/rejected": -71.63671112060547, + "loss": 0.6741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.088609218597412, + "rewards/margins": 5.29556131362915, + "rewards/rejected": -2.2069520950317383, + "step": 4513 + }, + { + "epoch": 1.13, + "grad_norm": 5.657891273498535, + "learning_rate": 1.433115343317541e-06, + "logits/chosen": -0.48568660020828247, + "logits/rejected": -0.5271167755126953, + "logps/chosen": -52.1526985168457, + "logps/rejected": -92.19696807861328, + "loss": 0.8098, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.814453125, + "rewards/margins": 4.21169900894165, + "rewards/rejected": -1.3972458839416504, + "step": 4514 + }, + { + "epoch": 1.13, + "grad_norm": 2.7691054344177246, + "learning_rate": 1.43127996994401e-06, + "logits/chosen": -0.33762428164482117, + "logits/rejected": -0.5093206763267517, + "logps/chosen": -51.89057922363281, + "logps/rejected": -73.1817398071289, + "loss": 0.5862, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1192002296447754, + "rewards/margins": 5.975776195526123, + "rewards/rejected": -2.8565754890441895, + "step": 4515 + }, + { + "epoch": 1.13, + "grad_norm": 4.479691982269287, + "learning_rate": 1.4294455762611343e-06, + "logits/chosen": -0.2808005213737488, + "logits/rejected": -0.46687406301498413, + "logps/chosen": -60.35823440551758, + "logps/rejected": -71.09522247314453, + "loss": 0.6862, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.911928176879883, + "rewards/margins": 4.838388442993164, + "rewards/rejected": -1.9264601469039917, + "step": 4516 + }, + { + "epoch": 1.13, + "grad_norm": 5.812745094299316, + "learning_rate": 1.4276121627724942e-06, + "logits/chosen": -0.3409135937690735, + "logits/rejected": -0.403425931930542, + "logps/chosen": -45.787681579589844, + "logps/rejected": -85.96910858154297, + "loss": 0.6258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.744386911392212, + "rewards/margins": 4.819891929626465, + "rewards/rejected": -2.0755045413970947, + "step": 4517 + }, + { + "epoch": 1.13, + "grad_norm": 5.487311840057373, + "learning_rate": 1.4257797299814018e-06, + "logits/chosen": -0.428783118724823, + "logits/rejected": -0.5111976861953735, + "logps/chosen": -56.474185943603516, + "logps/rejected": -74.59442901611328, + "loss": 0.8637, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.710482358932495, + "rewards/margins": 4.336973190307617, + "rewards/rejected": -1.6264913082122803, + "step": 4518 + }, + { + "epoch": 1.13, + "grad_norm": 3.3277289867401123, + "learning_rate": 1.4239482783908975e-06, + "logits/chosen": -0.3396646976470947, + "logits/rejected": -0.42731234431266785, + "logps/chosen": -58.12031555175781, + "logps/rejected": -91.74163818359375, + "loss": 0.6499, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9943480491638184, + "rewards/margins": 5.608951568603516, + "rewards/rejected": -2.614603281021118, + "step": 4519 + }, + { + "epoch": 1.13, + "grad_norm": 5.8413591384887695, + "learning_rate": 1.422117808503759e-06, + "logits/chosen": -0.42431604862213135, + "logits/rejected": -0.45175403356552124, + "logps/chosen": -46.014556884765625, + "logps/rejected": -94.40325927734375, + "loss": 0.6425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0623905658721924, + "rewards/margins": 4.503589153289795, + "rewards/rejected": -1.4411985874176025, + "step": 4520 + }, + { + "epoch": 1.13, + "grad_norm": 2.3847501277923584, + "learning_rate": 1.4202883208224831e-06, + "logits/chosen": -0.354532927274704, + "logits/rejected": -0.47321560978889465, + "logps/chosen": -59.317779541015625, + "logps/rejected": -77.0479965209961, + "loss": 0.7003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8939356803894043, + "rewards/margins": 4.92374324798584, + "rewards/rejected": -2.0298075675964355, + "step": 4521 + }, + { + "epoch": 1.13, + "grad_norm": 2.621631622314453, + "learning_rate": 1.4184598158493096e-06, + "logits/chosen": -0.394226610660553, + "logits/rejected": -0.4858064651489258, + "logps/chosen": -52.52812576293945, + "logps/rejected": -79.19385528564453, + "loss": 0.6232, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9344916343688965, + "rewards/margins": 5.155887603759766, + "rewards/rejected": -2.22139573097229, + "step": 4522 + }, + { + "epoch": 1.13, + "grad_norm": 12.126519203186035, + "learning_rate": 1.4166322940861993e-06, + "logits/chosen": -0.3764680027961731, + "logits/rejected": -0.49129021167755127, + "logps/chosen": -51.170074462890625, + "logps/rejected": -82.40560913085938, + "loss": 0.7428, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5322771072387695, + "rewards/margins": 4.563731670379639, + "rewards/rejected": -2.031454086303711, + "step": 4523 + }, + { + "epoch": 1.13, + "grad_norm": 27.62506675720215, + "learning_rate": 1.4148057560348477e-06, + "logits/chosen": -0.3185562193393707, + "logits/rejected": -0.4015198349952698, + "logps/chosen": -71.36119842529297, + "logps/rejected": -101.9668960571289, + "loss": 0.9014, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6886346340179443, + "rewards/margins": 5.51270866394043, + "rewards/rejected": -2.8240737915039062, + "step": 4524 + }, + { + "epoch": 1.13, + "grad_norm": 12.4021635055542, + "learning_rate": 1.4129802021966788e-06, + "logits/chosen": -0.37150445580482483, + "logits/rejected": -0.5008035898208618, + "logps/chosen": -58.130306243896484, + "logps/rejected": -87.37144470214844, + "loss": 0.8975, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.807921886444092, + "rewards/margins": 5.096440315246582, + "rewards/rejected": -2.2885186672210693, + "step": 4525 + }, + { + "epoch": 1.13, + "grad_norm": 4.048476696014404, + "learning_rate": 1.4111556330728442e-06, + "logits/chosen": -0.33150309324264526, + "logits/rejected": -0.4156202971935272, + "logps/chosen": -58.41639709472656, + "logps/rejected": -96.62381744384766, + "loss": 0.7628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9902310371398926, + "rewards/margins": 4.892111301422119, + "rewards/rejected": -1.9018797874450684, + "step": 4526 + }, + { + "epoch": 1.13, + "grad_norm": 17.149301528930664, + "learning_rate": 1.409332049164231e-06, + "logits/chosen": -0.3243063688278198, + "logits/rejected": -0.42387616634368896, + "logps/chosen": -58.363494873046875, + "logps/rejected": -85.86795043945312, + "loss": 0.6786, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0306811332702637, + "rewards/margins": 4.960126876831055, + "rewards/rejected": -1.9294452667236328, + "step": 4527 + }, + { + "epoch": 1.13, + "grad_norm": 3.8275346755981445, + "learning_rate": 1.4075094509714526e-06, + "logits/chosen": -0.3381049633026123, + "logits/rejected": -0.4575501084327698, + "logps/chosen": -69.64210510253906, + "logps/rejected": -91.22464752197266, + "loss": 0.6791, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6773312091827393, + "rewards/margins": 4.340249061584473, + "rewards/rejected": -1.6629173755645752, + "step": 4528 + }, + { + "epoch": 1.13, + "grad_norm": 5.88734769821167, + "learning_rate": 1.4056878389948464e-06, + "logits/chosen": -0.4615185260772705, + "logits/rejected": -0.5507268905639648, + "logps/chosen": -53.509342193603516, + "logps/rejected": -75.38838195800781, + "loss": 0.7287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.921323537826538, + "rewards/margins": 4.8855719566345215, + "rewards/rejected": -1.9642479419708252, + "step": 4529 + }, + { + "epoch": 1.13, + "grad_norm": 7.280580520629883, + "learning_rate": 1.403867213734489e-06, + "logits/chosen": -0.3089190721511841, + "logits/rejected": -0.43284931778907776, + "logps/chosen": -48.172401428222656, + "logps/rejected": -86.80615234375, + "loss": 0.7667, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.931046485900879, + "rewards/margins": 6.050477981567383, + "rewards/rejected": -3.119431257247925, + "step": 4530 + }, + { + "epoch": 1.13, + "grad_norm": 4.0021071434021, + "learning_rate": 1.4020475756901785e-06, + "logits/chosen": -0.384609580039978, + "logits/rejected": -0.49429425597190857, + "logps/chosen": -52.57707977294922, + "logps/rejected": -82.22484588623047, + "loss": 0.7112, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9535984992980957, + "rewards/margins": 4.769151210784912, + "rewards/rejected": -1.8155525922775269, + "step": 4531 + }, + { + "epoch": 1.13, + "grad_norm": 6.359450817108154, + "learning_rate": 1.4002289253614492e-06, + "logits/chosen": -0.3221016228199005, + "logits/rejected": -0.42134392261505127, + "logps/chosen": -64.00142669677734, + "logps/rejected": -91.49527740478516, + "loss": 0.7372, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.995260238647461, + "rewards/margins": 5.070483684539795, + "rewards/rejected": -2.075223922729492, + "step": 4532 + }, + { + "epoch": 1.13, + "grad_norm": 5.251776695251465, + "learning_rate": 1.3984112632475555e-06, + "logits/chosen": -0.3407341539859772, + "logits/rejected": -0.45011889934539795, + "logps/chosen": -55.7945556640625, + "logps/rejected": -79.75226593017578, + "loss": 0.7429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841792106628418, + "rewards/margins": 5.1580424308776855, + "rewards/rejected": -2.3162498474121094, + "step": 4533 + }, + { + "epoch": 1.13, + "grad_norm": 4.630913734436035, + "learning_rate": 1.3965945898474843e-06, + "logits/chosen": -0.44697996973991394, + "logits/rejected": -0.48710763454437256, + "logps/chosen": -58.52508544921875, + "logps/rejected": -108.52633666992188, + "loss": 0.6601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.352989673614502, + "rewards/margins": 5.348198413848877, + "rewards/rejected": -1.9952086210250854, + "step": 4534 + }, + { + "epoch": 1.13, + "grad_norm": 6.143916130065918, + "learning_rate": 1.3947789056599553e-06, + "logits/chosen": -0.4694092869758606, + "logits/rejected": -0.4487308859825134, + "logps/chosen": -45.380672454833984, + "logps/rejected": -97.68001556396484, + "loss": 0.7721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0310676097869873, + "rewards/margins": 4.374330043792725, + "rewards/rejected": -1.34326171875, + "step": 4535 + }, + { + "epoch": 1.13, + "grad_norm": 4.249520301818848, + "learning_rate": 1.3929642111834114e-06, + "logits/chosen": -0.372850239276886, + "logits/rejected": -0.4755510091781616, + "logps/chosen": -57.70537185668945, + "logps/rejected": -90.70622253417969, + "loss": 0.6546, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.684682846069336, + "rewards/margins": 5.296142578125, + "rewards/rejected": -2.611459970474243, + "step": 4536 + }, + { + "epoch": 1.13, + "grad_norm": 11.946486473083496, + "learning_rate": 1.3911505069160248e-06, + "logits/chosen": -0.32137784361839294, + "logits/rejected": -0.39582759141921997, + "logps/chosen": -62.73587417602539, + "logps/rejected": -95.52645874023438, + "loss": 0.8918, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9817757606506348, + "rewards/margins": 4.326846599578857, + "rewards/rejected": -1.3450710773468018, + "step": 4537 + }, + { + "epoch": 1.14, + "grad_norm": 3.924583911895752, + "learning_rate": 1.3893377933556973e-06, + "logits/chosen": -0.31337738037109375, + "logits/rejected": -0.3765227198600769, + "logps/chosen": -55.62588119506836, + "logps/rejected": -99.18460083007812, + "loss": 0.5995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1324305534362793, + "rewards/margins": 5.322784900665283, + "rewards/rejected": -2.190354585647583, + "step": 4538 + }, + { + "epoch": 1.14, + "grad_norm": 3.873400926589966, + "learning_rate": 1.3875260710000553e-06, + "logits/chosen": -0.31870728731155396, + "logits/rejected": -0.4167783856391907, + "logps/chosen": -58.1618537902832, + "logps/rejected": -96.53414916992188, + "loss": 0.5988, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176870822906494, + "rewards/margins": 6.035082817077637, + "rewards/rejected": -2.8582122325897217, + "step": 4539 + }, + { + "epoch": 1.14, + "grad_norm": 8.237269401550293, + "learning_rate": 1.3857153403464613e-06, + "logits/chosen": -0.3497597277164459, + "logits/rejected": -0.4709393382072449, + "logps/chosen": -55.1546630859375, + "logps/rejected": -69.36565399169922, + "loss": 0.8875, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.733092784881592, + "rewards/margins": 4.853913307189941, + "rewards/rejected": -2.1208205223083496, + "step": 4540 + }, + { + "epoch": 1.14, + "grad_norm": 3.6575381755828857, + "learning_rate": 1.3839056018919938e-06, + "logits/chosen": -0.4471139907836914, + "logits/rejected": -0.5518310070037842, + "logps/chosen": -46.08978271484375, + "logps/rejected": -77.45109558105469, + "loss": 0.6286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1317105293273926, + "rewards/margins": 5.187170505523682, + "rewards/rejected": -2.0554606914520264, + "step": 4541 + }, + { + "epoch": 1.14, + "grad_norm": 7.446661949157715, + "learning_rate": 1.3820968561334686e-06, + "logits/chosen": -0.29676157236099243, + "logits/rejected": -0.3835899233818054, + "logps/chosen": -51.01648712158203, + "logps/rejected": -92.07988739013672, + "loss": 0.6854, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9607458114624023, + "rewards/margins": 5.081228733062744, + "rewards/rejected": -2.1204824447631836, + "step": 4542 + }, + { + "epoch": 1.14, + "grad_norm": 6.900048732757568, + "learning_rate": 1.380289103567425e-06, + "logits/chosen": -0.3311585783958435, + "logits/rejected": -0.4080060124397278, + "logps/chosen": -45.95939254760742, + "logps/rejected": -80.11640930175781, + "loss": 0.7066, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7188446521759033, + "rewards/margins": 5.137346267700195, + "rewards/rejected": -2.4185023307800293, + "step": 4543 + }, + { + "epoch": 1.14, + "grad_norm": 5.028526782989502, + "learning_rate": 1.3784823446901295e-06, + "logits/chosen": -0.31578025221824646, + "logits/rejected": -0.44134435057640076, + "logps/chosen": -54.63429641723633, + "logps/rejected": -85.73922729492188, + "loss": 0.6157, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000967502593994, + "rewards/margins": 5.794185161590576, + "rewards/rejected": -2.7932167053222656, + "step": 4544 + }, + { + "epoch": 1.14, + "grad_norm": 3.323929786682129, + "learning_rate": 1.3766765799975767e-06, + "logits/chosen": -0.4303409457206726, + "logits/rejected": -0.5209549069404602, + "logps/chosen": -48.753665924072266, + "logps/rejected": -88.07709503173828, + "loss": 0.6475, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.950507879257202, + "rewards/margins": 4.979236602783203, + "rewards/rejected": -2.028728485107422, + "step": 4545 + }, + { + "epoch": 1.14, + "grad_norm": 5.300804138183594, + "learning_rate": 1.3748718099854863e-06, + "logits/chosen": -0.3633880913257599, + "logits/rejected": -0.4450266361236572, + "logps/chosen": -56.84803771972656, + "logps/rejected": -85.73052215576172, + "loss": 0.7979, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.93501877784729, + "rewards/margins": 4.81174898147583, + "rewards/rejected": -1.8767296075820923, + "step": 4546 + }, + { + "epoch": 1.14, + "grad_norm": 4.168286323547363, + "learning_rate": 1.3730680351493097e-06, + "logits/chosen": -0.39351728558540344, + "logits/rejected": -0.45533522963523865, + "logps/chosen": -52.02341079711914, + "logps/rejected": -89.18302917480469, + "loss": 0.7228, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9099249839782715, + "rewards/margins": 4.698136329650879, + "rewards/rejected": -1.7882118225097656, + "step": 4547 + }, + { + "epoch": 1.14, + "grad_norm": 5.947946548461914, + "learning_rate": 1.3712652559842205e-06, + "logits/chosen": -0.26558271050453186, + "logits/rejected": -0.39920979738235474, + "logps/chosen": -51.57594299316406, + "logps/rejected": -70.95113372802734, + "loss": 0.6866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8817296028137207, + "rewards/margins": 5.1220479011535645, + "rewards/rejected": -2.240318536758423, + "step": 4548 + }, + { + "epoch": 1.14, + "grad_norm": 4.559998035430908, + "learning_rate": 1.3694634729851203e-06, + "logits/chosen": -0.40421348810195923, + "logits/rejected": -0.4922958314418793, + "logps/chosen": -50.91832733154297, + "logps/rejected": -82.95692443847656, + "loss": 0.6681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.767045736312866, + "rewards/margins": 5.010403156280518, + "rewards/rejected": -2.2433574199676514, + "step": 4549 + }, + { + "epoch": 1.14, + "grad_norm": 7.156162261962891, + "learning_rate": 1.3676626866466375e-06, + "logits/chosen": -0.35423529148101807, + "logits/rejected": -0.4394490420818329, + "logps/chosen": -44.71033477783203, + "logps/rejected": -89.88148498535156, + "loss": 0.7257, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.926331043243408, + "rewards/margins": 5.065995693206787, + "rewards/rejected": -2.139664649963379, + "step": 4550 + }, + { + "epoch": 1.14, + "grad_norm": 3.7232632637023926, + "learning_rate": 1.3658628974631255e-06, + "logits/chosen": -0.30201002955436707, + "logits/rejected": -0.4444723427295685, + "logps/chosen": -62.66411209106445, + "logps/rejected": -94.73856353759766, + "loss": 0.6621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.718010663986206, + "rewards/margins": 6.303552627563477, + "rewards/rejected": -3.5855419635772705, + "step": 4551 + }, + { + "epoch": 1.14, + "grad_norm": 4.077831745147705, + "learning_rate": 1.364064105928668e-06, + "logits/chosen": -0.32931286096572876, + "logits/rejected": -0.4581999182701111, + "logps/chosen": -60.44533920288086, + "logps/rejected": -94.73413848876953, + "loss": 0.7512, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9471614360809326, + "rewards/margins": 5.350954532623291, + "rewards/rejected": -2.403794050216675, + "step": 4552 + }, + { + "epoch": 1.14, + "grad_norm": 6.354383945465088, + "learning_rate": 1.3622663125370723e-06, + "logits/chosen": -0.3743363618850708, + "logits/rejected": -0.40842148661613464, + "logps/chosen": -58.94347381591797, + "logps/rejected": -114.85647583007812, + "loss": 0.7583, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8197808265686035, + "rewards/margins": 4.555314540863037, + "rewards/rejected": -1.735533595085144, + "step": 4553 + }, + { + "epoch": 1.14, + "grad_norm": 11.563969612121582, + "learning_rate": 1.360469517781867e-06, + "logits/chosen": -0.37554165720939636, + "logits/rejected": -0.4494175910949707, + "logps/chosen": -58.03627014160156, + "logps/rejected": -79.38612365722656, + "loss": 0.9795, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6897308826446533, + "rewards/margins": 3.273153781890869, + "rewards/rejected": -0.5834227800369263, + "step": 4554 + }, + { + "epoch": 1.14, + "grad_norm": 23.589229583740234, + "learning_rate": 1.3586737221563157e-06, + "logits/chosen": -0.3872718811035156, + "logits/rejected": -0.4699290096759796, + "logps/chosen": -45.642723083496094, + "logps/rejected": -85.71121215820312, + "loss": 0.747, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808558464050293, + "rewards/margins": 5.033276081085205, + "rewards/rejected": -2.2247180938720703, + "step": 4555 + }, + { + "epoch": 1.14, + "grad_norm": 3.856734037399292, + "learning_rate": 1.356878926153401e-06, + "logits/chosen": -0.3093429207801819, + "logits/rejected": -0.4420956075191498, + "logps/chosen": -60.37019729614258, + "logps/rejected": -71.35579681396484, + "loss": 0.6857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1801252365112305, + "rewards/margins": 5.008896827697754, + "rewards/rejected": -1.8287711143493652, + "step": 4556 + }, + { + "epoch": 1.14, + "grad_norm": 3.6537022590637207, + "learning_rate": 1.355085130265833e-06, + "logits/chosen": -0.26514971256256104, + "logits/rejected": -0.41230034828186035, + "logps/chosen": -64.89044189453125, + "logps/rejected": -72.73509979248047, + "loss": 0.6927, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.805879592895508, + "rewards/margins": 4.588738441467285, + "rewards/rejected": -1.7828588485717773, + "step": 4557 + }, + { + "epoch": 1.14, + "grad_norm": 5.327727794647217, + "learning_rate": 1.3532923349860484e-06, + "logits/chosen": -0.4079034924507141, + "logits/rejected": -0.5152263045310974, + "logps/chosen": -59.41883087158203, + "logps/rejected": -89.28633117675781, + "loss": 0.6376, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8837995529174805, + "rewards/margins": 4.822551727294922, + "rewards/rejected": -1.9387524127960205, + "step": 4558 + }, + { + "epoch": 1.14, + "grad_norm": 5.900505542755127, + "learning_rate": 1.351500540806206e-06, + "logits/chosen": -0.3229542076587677, + "logits/rejected": -0.3646531105041504, + "logps/chosen": -57.53841781616211, + "logps/rejected": -90.04398345947266, + "loss": 0.7814, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.88769268989563, + "rewards/margins": 5.07960844039917, + "rewards/rejected": -2.19191575050354, + "step": 4559 + }, + { + "epoch": 1.14, + "grad_norm": 7.130000591278076, + "learning_rate": 1.3497097482181948e-06, + "logits/chosen": -0.38956063985824585, + "logits/rejected": -0.4401867389678955, + "logps/chosen": -55.06891632080078, + "logps/rejected": -86.34297180175781, + "loss": 0.6903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7883849143981934, + "rewards/margins": 4.5291218757629395, + "rewards/rejected": -1.7407370805740356, + "step": 4560 + }, + { + "epoch": 1.14, + "grad_norm": 16.99489402770996, + "learning_rate": 1.3479199577136247e-06, + "logits/chosen": -0.3730658292770386, + "logits/rejected": -0.39460811018943787, + "logps/chosen": -52.095115661621094, + "logps/rejected": -92.52970123291016, + "loss": 0.7719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6902811527252197, + "rewards/margins": 4.802701473236084, + "rewards/rejected": -2.1124203205108643, + "step": 4561 + }, + { + "epoch": 1.14, + "grad_norm": 4.614891052246094, + "learning_rate": 1.3461311697838324e-06, + "logits/chosen": -0.3378838300704956, + "logits/rejected": -0.4279978275299072, + "logps/chosen": -63.522178649902344, + "logps/rejected": -85.19605255126953, + "loss": 0.791, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.783170700073242, + "rewards/margins": 4.596975326538086, + "rewards/rejected": -1.8138049840927124, + "step": 4562 + }, + { + "epoch": 1.14, + "grad_norm": 5.953861713409424, + "learning_rate": 1.3443433849198778e-06, + "logits/chosen": -0.25822392106056213, + "logits/rejected": -0.4430113732814789, + "logps/chosen": -61.391761779785156, + "logps/rejected": -76.94913482666016, + "loss": 0.6734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.853801727294922, + "rewards/margins": 4.932474136352539, + "rewards/rejected": -2.078672170639038, + "step": 4563 + }, + { + "epoch": 1.14, + "grad_norm": 7.27044153213501, + "learning_rate": 1.342556603612546e-06, + "logits/chosen": -0.4148843288421631, + "logits/rejected": -0.45289409160614014, + "logps/chosen": -60.52922439575195, + "logps/rejected": -89.60740661621094, + "loss": 0.7945, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931670904159546, + "rewards/margins": 4.2663421630859375, + "rewards/rejected": -1.3346714973449707, + "step": 4564 + }, + { + "epoch": 1.14, + "grad_norm": 4.4469075202941895, + "learning_rate": 1.34077082635235e-06, + "logits/chosen": -0.3358062505722046, + "logits/rejected": -0.5008573532104492, + "logps/chosen": -75.83676147460938, + "logps/rejected": -92.54621887207031, + "loss": 0.8332, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.698939800262451, + "rewards/margins": 5.086522102355957, + "rewards/rejected": -2.387582302093506, + "step": 4565 + }, + { + "epoch": 1.14, + "grad_norm": 4.375369071960449, + "learning_rate": 1.3389860536295197e-06, + "logits/chosen": -0.3586195111274719, + "logits/rejected": -0.4750468134880066, + "logps/chosen": -62.43644714355469, + "logps/rejected": -76.2974853515625, + "loss": 0.7453, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9039506912231445, + "rewards/margins": 5.011155605316162, + "rewards/rejected": -2.1072049140930176, + "step": 4566 + }, + { + "epoch": 1.14, + "grad_norm": 4.332516670227051, + "learning_rate": 1.3372022859340177e-06, + "logits/chosen": -0.40688657760620117, + "logits/rejected": -0.5012537837028503, + "logps/chosen": -52.24504852294922, + "logps/rejected": -91.15354919433594, + "loss": 0.7003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.921168804168701, + "rewards/margins": 5.297190189361572, + "rewards/rejected": -2.376020908355713, + "step": 4567 + }, + { + "epoch": 1.14, + "grad_norm": 3.603083610534668, + "learning_rate": 1.3354195237555245e-06, + "logits/chosen": -0.39105188846588135, + "logits/rejected": -0.47186538577079773, + "logps/chosen": -54.036903381347656, + "logps/rejected": -87.8680648803711, + "loss": 0.6561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0379369258880615, + "rewards/margins": 6.261789321899414, + "rewards/rejected": -3.2238526344299316, + "step": 4568 + }, + { + "epoch": 1.14, + "grad_norm": 3.734602689743042, + "learning_rate": 1.3336377675834483e-06, + "logits/chosen": -0.34768620133399963, + "logits/rejected": -0.48015886545181274, + "logps/chosen": -46.4518928527832, + "logps/rejected": -69.22040557861328, + "loss": 0.614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0691137313842773, + "rewards/margins": 5.471343517303467, + "rewards/rejected": -2.4022305011749268, + "step": 4569 + }, + { + "epoch": 1.14, + "grad_norm": 3.552309274673462, + "learning_rate": 1.3318570179069185e-06, + "logits/chosen": -0.33658379316329956, + "logits/rejected": -0.5156843066215515, + "logps/chosen": -52.90544891357422, + "logps/rejected": -69.46784973144531, + "loss": 0.7147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.778345823287964, + "rewards/margins": 5.317432403564453, + "rewards/rejected": -2.53908634185791, + "step": 4570 + }, + { + "epoch": 1.14, + "grad_norm": 4.009077548980713, + "learning_rate": 1.3300772752147879e-06, + "logits/chosen": -0.4470210373401642, + "logits/rejected": -0.5453963875770569, + "logps/chosen": -57.9420166015625, + "logps/rejected": -94.31856536865234, + "loss": 0.7278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904589891433716, + "rewards/margins": 5.986252784729004, + "rewards/rejected": -3.08166241645813, + "step": 4571 + }, + { + "epoch": 1.14, + "grad_norm": 6.5615763664245605, + "learning_rate": 1.3282985399956372e-06, + "logits/chosen": -0.4383818507194519, + "logits/rejected": -0.4249095022678375, + "logps/chosen": -41.34992218017578, + "logps/rejected": -91.18907165527344, + "loss": 0.7897, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.059983491897583, + "rewards/margins": 4.447214126586914, + "rewards/rejected": -1.3872309923171997, + "step": 4572 + }, + { + "epoch": 1.14, + "grad_norm": 6.785642623901367, + "learning_rate": 1.3265208127377683e-06, + "logits/chosen": -0.407912015914917, + "logits/rejected": -0.4810068607330322, + "logps/chosen": -58.8446159362793, + "logps/rejected": -91.88619995117188, + "loss": 0.8804, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.931471586227417, + "rewards/margins": 4.5488080978393555, + "rewards/rejected": -1.6173368692398071, + "step": 4573 + }, + { + "epoch": 1.14, + "grad_norm": 4.91384744644165, + "learning_rate": 1.3247440939292e-06, + "logits/chosen": -0.3640131950378418, + "logits/rejected": -0.4532972574234009, + "logps/chosen": -54.954715728759766, + "logps/rejected": -80.3824462890625, + "loss": 0.7563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.861060857772827, + "rewards/margins": 5.085590362548828, + "rewards/rejected": -2.22452974319458, + "step": 4574 + }, + { + "epoch": 1.14, + "grad_norm": 4.59459924697876, + "learning_rate": 1.3229683840576857e-06, + "logits/chosen": -0.38541918992996216, + "logits/rejected": -0.4652095437049866, + "logps/chosen": -54.62894058227539, + "logps/rejected": -95.5285415649414, + "loss": 0.6797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7875475883483887, + "rewards/margins": 4.927282810211182, + "rewards/rejected": -2.1397347450256348, + "step": 4575 + }, + { + "epoch": 1.14, + "grad_norm": 6.824947834014893, + "learning_rate": 1.3211936836106926e-06, + "logits/chosen": -0.3528635501861572, + "logits/rejected": -0.5058858394622803, + "logps/chosen": -59.87232208251953, + "logps/rejected": -74.25939178466797, + "loss": 0.7643, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8543410301208496, + "rewards/margins": 4.4586687088012695, + "rewards/rejected": -1.6043274402618408, + "step": 4576 + }, + { + "epoch": 1.14, + "grad_norm": 5.117753982543945, + "learning_rate": 1.3194199930754188e-06, + "logits/chosen": -0.40581634640693665, + "logits/rejected": -0.4657813310623169, + "logps/chosen": -55.11602020263672, + "logps/rejected": -84.28441619873047, + "loss": 0.8028, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.015890121459961, + "rewards/margins": 4.554310321807861, + "rewards/rejected": -1.5384202003479004, + "step": 4577 + }, + { + "epoch": 1.15, + "grad_norm": 6.612566947937012, + "learning_rate": 1.3176473129387761e-06, + "logits/chosen": -0.40243738889694214, + "logits/rejected": -0.5412687659263611, + "logps/chosen": -45.53407287597656, + "logps/rejected": -61.84007263183594, + "loss": 0.7491, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0488874912261963, + "rewards/margins": 4.83936071395874, + "rewards/rejected": -1.7904729843139648, + "step": 4578 + }, + { + "epoch": 1.15, + "grad_norm": 7.7368927001953125, + "learning_rate": 1.3158756436874033e-06, + "logits/chosen": -0.37338003516197205, + "logits/rejected": -0.3889956474304199, + "logps/chosen": -45.05077362060547, + "logps/rejected": -91.28816986083984, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8079190254211426, + "rewards/margins": 5.051177024841309, + "rewards/rejected": -2.243257999420166, + "step": 4579 + }, + { + "epoch": 1.15, + "grad_norm": 4.200965404510498, + "learning_rate": 1.3141049858076648e-06, + "logits/chosen": -0.31871819496154785, + "logits/rejected": -0.4516480565071106, + "logps/chosen": -56.715614318847656, + "logps/rejected": -81.02708435058594, + "loss": 0.6613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912814140319824, + "rewards/margins": 5.260335445404053, + "rewards/rejected": -2.3475213050842285, + "step": 4580 + }, + { + "epoch": 1.15, + "grad_norm": 5.457111835479736, + "learning_rate": 1.312335339785643e-06, + "logits/chosen": -0.3907936215400696, + "logits/rejected": -0.4697054922580719, + "logps/chosen": -46.90523147583008, + "logps/rejected": -83.04114532470703, + "loss": 0.6067, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1046574115753174, + "rewards/margins": 4.833102703094482, + "rewards/rejected": -1.7284454107284546, + "step": 4581 + }, + { + "epoch": 1.15, + "grad_norm": 6.256730556488037, + "learning_rate": 1.3105667061071443e-06, + "logits/chosen": -0.2958267033100128, + "logits/rejected": -0.3800800144672394, + "logps/chosen": -64.43460845947266, + "logps/rejected": -94.94197845458984, + "loss": 0.7604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8054280281066895, + "rewards/margins": 4.631322860717773, + "rewards/rejected": -1.8258947134017944, + "step": 4582 + }, + { + "epoch": 1.15, + "grad_norm": 5.636295318603516, + "learning_rate": 1.308799085257696e-06, + "logits/chosen": -0.32513269782066345, + "logits/rejected": -0.44460469484329224, + "logps/chosen": -77.51356506347656, + "logps/rejected": -78.99686431884766, + "loss": 0.7461, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0689971446990967, + "rewards/margins": 4.647758960723877, + "rewards/rejected": -1.5787616968154907, + "step": 4583 + }, + { + "epoch": 1.15, + "grad_norm": 8.92662525177002, + "learning_rate": 1.3070324777225474e-06, + "logits/chosen": -0.3374442756175995, + "logits/rejected": -0.45124655961990356, + "logps/chosen": -62.95146560668945, + "logps/rejected": -76.78841400146484, + "loss": 0.896, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8031833171844482, + "rewards/margins": 4.755957126617432, + "rewards/rejected": -1.9527740478515625, + "step": 4584 + }, + { + "epoch": 1.15, + "grad_norm": 9.789247512817383, + "learning_rate": 1.3052668839866744e-06, + "logits/chosen": -0.2752203047275543, + "logits/rejected": -0.4237271845340729, + "logps/chosen": -75.65364074707031, + "logps/rejected": -76.46928405761719, + "loss": 0.9246, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.749891757965088, + "rewards/margins": 4.257094860076904, + "rewards/rejected": -1.507203221321106, + "step": 4585 + }, + { + "epoch": 1.15, + "grad_norm": 7.8708577156066895, + "learning_rate": 1.303502304534765e-06, + "logits/chosen": -0.30820587277412415, + "logits/rejected": -0.3670942783355713, + "logps/chosen": -61.726219177246094, + "logps/rejected": -87.77442932128906, + "loss": 0.8046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5508322715759277, + "rewards/margins": 4.152217388153076, + "rewards/rejected": -1.601385235786438, + "step": 4586 + }, + { + "epoch": 1.15, + "grad_norm": 7.281486988067627, + "learning_rate": 1.3017387398512389e-06, + "logits/chosen": -0.34646448493003845, + "logits/rejected": -0.46282413601875305, + "logps/chosen": -69.88783264160156, + "logps/rejected": -86.54834747314453, + "loss": 0.8295, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1670188903808594, + "rewards/margins": 4.504012107849121, + "rewards/rejected": -1.3369930982589722, + "step": 4587 + }, + { + "epoch": 1.15, + "grad_norm": 5.930618762969971, + "learning_rate": 1.29997619042023e-06, + "logits/chosen": -0.29747653007507324, + "logits/rejected": -0.3886669874191284, + "logps/chosen": -50.44718933105469, + "logps/rejected": -90.09539794921875, + "loss": 0.7375, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.906742811203003, + "rewards/margins": 4.659356594085693, + "rewards/rejected": -1.7526137828826904, + "step": 4588 + }, + { + "epoch": 1.15, + "grad_norm": 4.53478479385376, + "learning_rate": 1.2982146567255982e-06, + "logits/chosen": -0.2809530198574066, + "logits/rejected": -0.44447267055511475, + "logps/chosen": -57.69438934326172, + "logps/rejected": -74.71449279785156, + "loss": 0.6906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8909237384796143, + "rewards/margins": 4.919989585876465, + "rewards/rejected": -2.0290658473968506, + "step": 4589 + }, + { + "epoch": 1.15, + "grad_norm": 4.9247589111328125, + "learning_rate": 1.2964541392509206e-06, + "logits/chosen": -0.2614210546016693, + "logits/rejected": -0.3188573122024536, + "logps/chosen": -64.48390197753906, + "logps/rejected": -96.18791961669922, + "loss": 0.7546, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6633331775665283, + "rewards/margins": 5.109617233276367, + "rewards/rejected": -2.446284770965576, + "step": 4590 + }, + { + "epoch": 1.15, + "grad_norm": 3.9204344749450684, + "learning_rate": 1.2946946384794968e-06, + "logits/chosen": -0.4109416604042053, + "logits/rejected": -0.4772539734840393, + "logps/chosen": -59.17146301269531, + "logps/rejected": -85.23602294921875, + "loss": 0.6799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1616241931915283, + "rewards/margins": 5.651305198669434, + "rewards/rejected": -2.489680528640747, + "step": 4591 + }, + { + "epoch": 1.15, + "grad_norm": 3.8774781227111816, + "learning_rate": 1.2929361548943503e-06, + "logits/chosen": -0.3819202184677124, + "logits/rejected": -0.4239380955696106, + "logps/chosen": -55.395538330078125, + "logps/rejected": -88.28182220458984, + "loss": 0.7634, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0059852600097656, + "rewards/margins": 4.233498573303223, + "rewards/rejected": -1.2275128364562988, + "step": 4592 + }, + { + "epoch": 1.15, + "grad_norm": 3.3871259689331055, + "learning_rate": 1.2911786889782212e-06, + "logits/chosen": -0.3460366725921631, + "logits/rejected": -0.44898495078086853, + "logps/chosen": -56.227474212646484, + "logps/rejected": -81.55997467041016, + "loss": 0.7624, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.160977840423584, + "rewards/margins": 4.90138053894043, + "rewards/rejected": -1.7404028177261353, + "step": 4593 + }, + { + "epoch": 1.15, + "grad_norm": 11.380084037780762, + "learning_rate": 1.2894222412135721e-06, + "logits/chosen": -0.4420326054096222, + "logits/rejected": -0.5631298422813416, + "logps/chosen": -49.03618621826172, + "logps/rejected": -89.36822509765625, + "loss": 0.7027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9989969730377197, + "rewards/margins": 5.716889381408691, + "rewards/rejected": -2.7178921699523926, + "step": 4594 + }, + { + "epoch": 1.15, + "grad_norm": 5.696811199188232, + "learning_rate": 1.2876668120825858e-06, + "logits/chosen": -0.3321326673030853, + "logits/rejected": -0.4556041657924652, + "logps/chosen": -65.2970199584961, + "logps/rejected": -72.02622985839844, + "loss": 0.8537, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.673044204711914, + "rewards/margins": 4.023805141448975, + "rewards/rejected": -1.3507609367370605, + "step": 4595 + }, + { + "epoch": 1.15, + "grad_norm": 7.864801406860352, + "learning_rate": 1.2859124020671643e-06, + "logits/chosen": -0.3332957327365875, + "logits/rejected": -0.34253430366516113, + "logps/chosen": -50.08335876464844, + "logps/rejected": -94.73674011230469, + "loss": 0.8088, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6954095363616943, + "rewards/margins": 4.207767486572266, + "rewards/rejected": -1.5123577117919922, + "step": 4596 + }, + { + "epoch": 1.15, + "grad_norm": 4.363083362579346, + "learning_rate": 1.284159011648935e-06, + "logits/chosen": -0.2781941890716553, + "logits/rejected": -0.35889989137649536, + "logps/chosen": -56.44230651855469, + "logps/rejected": -84.79512786865234, + "loss": 0.6631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2517147064208984, + "rewards/margins": 5.021681308746338, + "rewards/rejected": -1.7699670791625977, + "step": 4597 + }, + { + "epoch": 1.15, + "grad_norm": 1.3636066913604736, + "learning_rate": 1.2824066413092367e-06, + "logits/chosen": -0.43016186356544495, + "logits/rejected": -0.573762059211731, + "logps/chosen": -56.87134552001953, + "logps/rejected": -79.63532257080078, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100970506668091, + "rewards/margins": 6.3132429122924805, + "rewards/rejected": -3.2122726440429688, + "step": 4598 + }, + { + "epoch": 1.15, + "grad_norm": 4.30381965637207, + "learning_rate": 1.2806552915291332e-06, + "logits/chosen": -0.3279903531074524, + "logits/rejected": -0.38384178280830383, + "logps/chosen": -51.609161376953125, + "logps/rejected": -95.04185485839844, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0404515266418457, + "rewards/margins": 4.843101501464844, + "rewards/rejected": -1.8026494979858398, + "step": 4599 + }, + { + "epoch": 1.15, + "grad_norm": 4.330972194671631, + "learning_rate": 1.2789049627894111e-06, + "logits/chosen": -0.3664552569389343, + "logits/rejected": -0.458743691444397, + "logps/chosen": -48.027103424072266, + "logps/rejected": -91.88053131103516, + "loss": 0.6725, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.749493360519409, + "rewards/margins": 6.181055068969727, + "rewards/rejected": -3.4315617084503174, + "step": 4600 + }, + { + "epoch": 1.15, + "grad_norm": 5.339662551879883, + "learning_rate": 1.2771556555705711e-06, + "logits/chosen": -0.4991813898086548, + "logits/rejected": -0.5596664547920227, + "logps/chosen": -54.96959686279297, + "logps/rejected": -83.62089538574219, + "loss": 0.7814, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.673496723175049, + "rewards/margins": 5.639531135559082, + "rewards/rejected": -2.9660346508026123, + "step": 4601 + }, + { + "epoch": 1.15, + "grad_norm": 9.118671417236328, + "learning_rate": 1.2754073703528368e-06, + "logits/chosen": -0.2914576232433319, + "logits/rejected": -0.3183715045452118, + "logps/chosen": -57.57659149169922, + "logps/rejected": -88.54048919677734, + "loss": 0.8276, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0428037643432617, + "rewards/margins": 3.933318614959717, + "rewards/rejected": -0.8905147314071655, + "step": 4602 + }, + { + "epoch": 1.15, + "grad_norm": 5.62148380279541, + "learning_rate": 1.2736601076161498e-06, + "logits/chosen": -0.26461324095726013, + "logits/rejected": -0.3870891034603119, + "logps/chosen": -66.47493743896484, + "logps/rejected": -85.26189422607422, + "loss": 0.7813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6614277362823486, + "rewards/margins": 4.55366849899292, + "rewards/rejected": -1.8922410011291504, + "step": 4603 + }, + { + "epoch": 1.15, + "grad_norm": 4.567074298858643, + "learning_rate": 1.2719138678401693e-06, + "logits/chosen": -0.33534491062164307, + "logits/rejected": -0.4162581264972687, + "logps/chosen": -65.56944274902344, + "logps/rejected": -100.30682373046875, + "loss": 0.7795, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.728799343109131, + "rewards/margins": 5.541713237762451, + "rewards/rejected": -2.8129141330718994, + "step": 4604 + }, + { + "epoch": 1.15, + "grad_norm": 3.522376775741577, + "learning_rate": 1.2701686515042798e-06, + "logits/chosen": -0.3465118706226349, + "logits/rejected": -0.3932461440563202, + "logps/chosen": -65.62686157226562, + "logps/rejected": -89.36259460449219, + "loss": 0.7452, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1427547931671143, + "rewards/margins": 4.645031929016113, + "rewards/rejected": -1.50227689743042, + "step": 4605 + }, + { + "epoch": 1.15, + "grad_norm": 6.5448994636535645, + "learning_rate": 1.2684244590875789e-06, + "logits/chosen": -0.345718115568161, + "logits/rejected": -0.40664026141166687, + "logps/chosen": -58.855224609375, + "logps/rejected": -82.61168670654297, + "loss": 0.8725, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8359878063201904, + "rewards/margins": 4.494312763214111, + "rewards/rejected": -1.6583250761032104, + "step": 4606 + }, + { + "epoch": 1.15, + "grad_norm": 4.401620864868164, + "learning_rate": 1.2666812910688846e-06, + "logits/chosen": -0.30211177468299866, + "logits/rejected": -0.35856392979621887, + "logps/chosen": -60.9139518737793, + "logps/rejected": -91.1257095336914, + "loss": 0.759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6882846355438232, + "rewards/margins": 4.682010173797607, + "rewards/rejected": -1.9937257766723633, + "step": 4607 + }, + { + "epoch": 1.15, + "grad_norm": 3.862304210662842, + "learning_rate": 1.2649391479267347e-06, + "logits/chosen": -0.3564310371875763, + "logits/rejected": -0.44964340329170227, + "logps/chosen": -45.0209846496582, + "logps/rejected": -71.36679077148438, + "loss": 0.7058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0537309646606445, + "rewards/margins": 4.161773204803467, + "rewards/rejected": -1.1080424785614014, + "step": 4608 + }, + { + "epoch": 1.15, + "grad_norm": 6.195433616638184, + "learning_rate": 1.2631980301393849e-06, + "logits/chosen": -0.3855421543121338, + "logits/rejected": -0.4953503906726837, + "logps/chosen": -50.512916564941406, + "logps/rejected": -67.60790252685547, + "loss": 0.7631, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9675540924072266, + "rewards/margins": 4.488150119781494, + "rewards/rejected": -1.5205955505371094, + "step": 4609 + }, + { + "epoch": 1.15, + "grad_norm": 7.547142505645752, + "learning_rate": 1.2614579381848097e-06, + "logits/chosen": -0.3269283175468445, + "logits/rejected": -0.3687562048435211, + "logps/chosen": -53.55329513549805, + "logps/rejected": -84.54276275634766, + "loss": 0.8143, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7525835037231445, + "rewards/margins": 4.129917621612549, + "rewards/rejected": -1.3773338794708252, + "step": 4610 + }, + { + "epoch": 1.15, + "grad_norm": 4.187046527862549, + "learning_rate": 1.2597188725407006e-06, + "logits/chosen": -0.3774375915527344, + "logits/rejected": -0.43807369470596313, + "logps/chosen": -56.771549224853516, + "logps/rejected": -95.18174743652344, + "loss": 0.6286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0479893684387207, + "rewards/margins": 5.014756679534912, + "rewards/rejected": -1.9667670726776123, + "step": 4611 + }, + { + "epoch": 1.15, + "grad_norm": 5.409719467163086, + "learning_rate": 1.2579808336844711e-06, + "logits/chosen": -0.38473740220069885, + "logits/rejected": -0.47003763914108276, + "logps/chosen": -53.797515869140625, + "logps/rejected": -80.8294906616211, + "loss": 0.8501, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8903284072875977, + "rewards/margins": 4.90735387802124, + "rewards/rejected": -2.0170254707336426, + "step": 4612 + }, + { + "epoch": 1.15, + "grad_norm": 3.3259832859039307, + "learning_rate": 1.2562438220932493e-06, + "logits/chosen": -0.32415881752967834, + "logits/rejected": -0.4167638123035431, + "logps/chosen": -62.947914123535156, + "logps/rejected": -81.20893096923828, + "loss": 0.7164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.072357654571533, + "rewards/margins": 4.944326400756836, + "rewards/rejected": -1.8719689846038818, + "step": 4613 + }, + { + "epoch": 1.15, + "grad_norm": 3.508173704147339, + "learning_rate": 1.2545078382438825e-06, + "logits/chosen": -0.4166281521320343, + "logits/rejected": -0.5390325784683228, + "logps/chosen": -55.184444427490234, + "logps/rejected": -92.50387573242188, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.183483123779297, + "rewards/margins": 5.683672904968262, + "rewards/rejected": -2.5001890659332275, + "step": 4614 + }, + { + "epoch": 1.15, + "grad_norm": 4.407756805419922, + "learning_rate": 1.2527728826129348e-06, + "logits/chosen": -0.3271139860153198, + "logits/rejected": -0.4978703260421753, + "logps/chosen": -56.322059631347656, + "logps/rejected": -77.96215057373047, + "loss": 0.6725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9001870155334473, + "rewards/margins": 5.397824287414551, + "rewards/rejected": -2.497637987136841, + "step": 4615 + }, + { + "epoch": 1.15, + "grad_norm": 4.33517599105835, + "learning_rate": 1.2510389556766884e-06, + "logits/chosen": -0.39930540323257446, + "logits/rejected": -0.40232518315315247, + "logps/chosen": -39.654396057128906, + "logps/rejected": -100.79670715332031, + "loss": 0.6184, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9900717735290527, + "rewards/margins": 4.245791912078857, + "rewards/rejected": -1.255719780921936, + "step": 4616 + }, + { + "epoch": 1.15, + "grad_norm": 6.102812767028809, + "learning_rate": 1.2493060579111465e-06, + "logits/chosen": -0.3065851628780365, + "logits/rejected": -0.3662540912628174, + "logps/chosen": -66.60323333740234, + "logps/rejected": -90.47260284423828, + "loss": 0.7482, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.889112710952759, + "rewards/margins": 4.3370680809021, + "rewards/rejected": -1.4479553699493408, + "step": 4617 + }, + { + "epoch": 1.16, + "grad_norm": 4.234259605407715, + "learning_rate": 1.2475741897920268e-06, + "logits/chosen": -0.3497813940048218, + "logits/rejected": -0.40603023767471313, + "logps/chosen": -57.66928482055664, + "logps/rejected": -89.16256713867188, + "loss": 0.7647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9254250526428223, + "rewards/margins": 4.313353538513184, + "rewards/rejected": -1.3879286050796509, + "step": 4618 + }, + { + "epoch": 1.16, + "grad_norm": 7.096617698669434, + "learning_rate": 1.2458433517947598e-06, + "logits/chosen": -0.3273087739944458, + "logits/rejected": -0.40927833318710327, + "logps/chosen": -60.42485046386719, + "logps/rejected": -98.98927307128906, + "loss": 0.7901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5562570095062256, + "rewards/margins": 4.294582843780518, + "rewards/rejected": -1.738326072692871, + "step": 4619 + }, + { + "epoch": 1.16, + "grad_norm": 5.28247594833374, + "learning_rate": 1.2441135443945023e-06, + "logits/chosen": -0.3181743621826172, + "logits/rejected": -0.4169609546661377, + "logps/chosen": -75.39942932128906, + "logps/rejected": -81.52670288085938, + "loss": 0.9028, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.118396282196045, + "rewards/margins": 4.00589656829834, + "rewards/rejected": -0.8875002264976501, + "step": 4620 + }, + { + "epoch": 1.16, + "grad_norm": 3.3361685276031494, + "learning_rate": 1.2423847680661227e-06, + "logits/chosen": -0.4133186638355255, + "logits/rejected": -0.4896460175514221, + "logps/chosen": -44.843318939208984, + "logps/rejected": -75.08815002441406, + "loss": 0.7041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0062570571899414, + "rewards/margins": 4.521251201629639, + "rewards/rejected": -1.5149943828582764, + "step": 4621 + }, + { + "epoch": 1.16, + "grad_norm": 6.597933769226074, + "learning_rate": 1.2406570232842074e-06, + "logits/chosen": -0.3397875726222992, + "logits/rejected": -0.3955111503601074, + "logps/chosen": -51.02701950073242, + "logps/rejected": -84.7189712524414, + "loss": 0.7099, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9068288803100586, + "rewards/margins": 5.152662754058838, + "rewards/rejected": -2.2458341121673584, + "step": 4622 + }, + { + "epoch": 1.16, + "grad_norm": 10.255518913269043, + "learning_rate": 1.2389303105230593e-06, + "logits/chosen": -0.38848787546157837, + "logits/rejected": -0.48986300826072693, + "logps/chosen": -55.0911979675293, + "logps/rejected": -81.3170166015625, + "loss": 0.942, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9083480834960938, + "rewards/margins": 4.216084003448486, + "rewards/rejected": -1.307735800743103, + "step": 4623 + }, + { + "epoch": 1.16, + "grad_norm": 11.539434432983398, + "learning_rate": 1.237204630256697e-06, + "logits/chosen": -0.2941695749759674, + "logits/rejected": -0.40311548113822937, + "logps/chosen": -55.070369720458984, + "logps/rejected": -86.92977142333984, + "loss": 0.8413, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0643486976623535, + "rewards/margins": 4.701203346252441, + "rewards/rejected": -1.636854648590088, + "step": 4624 + }, + { + "epoch": 1.16, + "grad_norm": 4.682463645935059, + "learning_rate": 1.2354799829588598e-06, + "logits/chosen": -0.4691731333732605, + "logits/rejected": -0.5780725479125977, + "logps/chosen": -50.33662414550781, + "logps/rejected": -71.6830062866211, + "loss": 0.6998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1946828365325928, + "rewards/margins": 5.714175224304199, + "rewards/rejected": -2.5194919109344482, + "step": 4625 + }, + { + "epoch": 1.16, + "grad_norm": 7.5800042152404785, + "learning_rate": 1.2337563691029996e-06, + "logits/chosen": -0.38520658016204834, + "logits/rejected": -0.49398374557495117, + "logps/chosen": -57.05324172973633, + "logps/rejected": -85.63566589355469, + "loss": 0.8286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0145463943481445, + "rewards/margins": 5.387046813964844, + "rewards/rejected": -2.37250018119812, + "step": 4626 + }, + { + "epoch": 1.16, + "grad_norm": 11.28612995147705, + "learning_rate": 1.2320337891622846e-06, + "logits/chosen": -0.3081722557544708, + "logits/rejected": -0.3450281023979187, + "logps/chosen": -61.71070098876953, + "logps/rejected": -97.60166931152344, + "loss": 0.77, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.940725803375244, + "rewards/margins": 4.92434549331665, + "rewards/rejected": -1.9836198091506958, + "step": 4627 + }, + { + "epoch": 1.16, + "grad_norm": 5.834625244140625, + "learning_rate": 1.2303122436096015e-06, + "logits/chosen": -0.37037283182144165, + "logits/rejected": -0.4701533615589142, + "logps/chosen": -60.77156066894531, + "logps/rejected": -71.91238403320312, + "loss": 0.8186, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.835845470428467, + "rewards/margins": 3.578043222427368, + "rewards/rejected": -0.7421978712081909, + "step": 4628 + }, + { + "epoch": 1.16, + "grad_norm": 13.511666297912598, + "learning_rate": 1.228591732917549e-06, + "logits/chosen": -0.27221864461898804, + "logits/rejected": -0.36605599522590637, + "logps/chosen": -60.874908447265625, + "logps/rejected": -79.7548828125, + "loss": 0.7388, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1229867935180664, + "rewards/margins": 3.849726915359497, + "rewards/rejected": -0.7267403602600098, + "step": 4629 + }, + { + "epoch": 1.16, + "grad_norm": 3.444054365158081, + "learning_rate": 1.2268722575584491e-06, + "logits/chosen": -0.3143637776374817, + "logits/rejected": -0.4567105174064636, + "logps/chosen": -55.05785369873047, + "logps/rejected": -73.58882904052734, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8199830055236816, + "rewards/margins": 4.98787784576416, + "rewards/rejected": -2.1678948402404785, + "step": 4630 + }, + { + "epoch": 1.16, + "grad_norm": 6.04915189743042, + "learning_rate": 1.22515381800433e-06, + "logits/chosen": -0.2940753996372223, + "logits/rejected": -0.3591708540916443, + "logps/chosen": -67.76944732666016, + "logps/rejected": -86.03179168701172, + "loss": 0.7858, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7864229679107666, + "rewards/margins": 3.5485482215881348, + "rewards/rejected": -0.7621251940727234, + "step": 4631 + }, + { + "epoch": 1.16, + "grad_norm": 11.634200096130371, + "learning_rate": 1.2234364147269433e-06, + "logits/chosen": -0.317865252494812, + "logits/rejected": -0.4376354217529297, + "logps/chosen": -61.550048828125, + "logps/rejected": -71.140869140625, + "loss": 0.8484, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.869105339050293, + "rewards/margins": 4.5637125968933105, + "rewards/rejected": -1.6946073770523071, + "step": 4632 + }, + { + "epoch": 1.16, + "grad_norm": 4.303318500518799, + "learning_rate": 1.2217200481977532e-06, + "logits/chosen": -0.29271554946899414, + "logits/rejected": -0.4499126374721527, + "logps/chosen": -66.11444854736328, + "logps/rejected": -67.5859375, + "loss": 0.7958, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9941763877868652, + "rewards/margins": 4.425057411193848, + "rewards/rejected": -1.4308810234069824, + "step": 4633 + }, + { + "epoch": 1.16, + "grad_norm": 4.087196350097656, + "learning_rate": 1.2200047188879382e-06, + "logits/chosen": -0.39523130655288696, + "logits/rejected": -0.4512297511100769, + "logps/chosen": -49.75841522216797, + "logps/rejected": -69.18348693847656, + "loss": 0.7749, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.045795202255249, + "rewards/margins": 3.6525654792785645, + "rewards/rejected": -0.6067702770233154, + "step": 4634 + }, + { + "epoch": 1.16, + "grad_norm": 10.624655723571777, + "learning_rate": 1.2182904272683938e-06, + "logits/chosen": -0.36159902811050415, + "logits/rejected": -0.4539092779159546, + "logps/chosen": -52.464698791503906, + "logps/rejected": -88.21263122558594, + "loss": 0.8204, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9932005405426025, + "rewards/margins": 4.199958324432373, + "rewards/rejected": -1.206757664680481, + "step": 4635 + }, + { + "epoch": 1.16, + "grad_norm": 4.744399547576904, + "learning_rate": 1.2165771738097288e-06, + "logits/chosen": -0.3029477596282959, + "logits/rejected": -0.4260686933994293, + "logps/chosen": -67.47162628173828, + "logps/rejected": -78.83271026611328, + "loss": 0.73, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.193331241607666, + "rewards/margins": 5.066708564758301, + "rewards/rejected": -1.8733773231506348, + "step": 4636 + }, + { + "epoch": 1.16, + "grad_norm": 5.327393531799316, + "learning_rate": 1.2148649589822708e-06, + "logits/chosen": -0.26041489839553833, + "logits/rejected": -0.3692517876625061, + "logps/chosen": -59.740272521972656, + "logps/rejected": -83.04004669189453, + "loss": 0.7343, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7228870391845703, + "rewards/margins": 4.335486888885498, + "rewards/rejected": -1.6125997304916382, + "step": 4637 + }, + { + "epoch": 1.16, + "grad_norm": 4.993445873260498, + "learning_rate": 1.2131537832560598e-06, + "logits/chosen": -0.29324057698249817, + "logits/rejected": -0.40397125482559204, + "logps/chosen": -70.2232666015625, + "logps/rejected": -76.08956909179688, + "loss": 0.6985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6470766067504883, + "rewards/margins": 4.8156208992004395, + "rewards/rejected": -2.168543815612793, + "step": 4638 + }, + { + "epoch": 1.16, + "grad_norm": 7.202304363250732, + "learning_rate": 1.2114436471008456e-06, + "logits/chosen": -0.32722118496894836, + "logits/rejected": -0.4366355240345001, + "logps/chosen": -47.73722839355469, + "logps/rejected": -67.93399047851562, + "loss": 0.7242, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0878560543060303, + "rewards/margins": 4.282768726348877, + "rewards/rejected": -1.1949126720428467, + "step": 4639 + }, + { + "epoch": 1.16, + "grad_norm": 3.127650260925293, + "learning_rate": 1.209734550986103e-06, + "logits/chosen": -0.3765006959438324, + "logits/rejected": -0.46257656812667847, + "logps/chosen": -61.57658386230469, + "logps/rejected": -87.33702850341797, + "loss": 0.6814, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.191246747970581, + "rewards/margins": 4.910991668701172, + "rewards/rejected": -1.7197452783584595, + "step": 4640 + }, + { + "epoch": 1.16, + "grad_norm": 5.506709575653076, + "learning_rate": 1.2080264953810117e-06, + "logits/chosen": -0.3671160936355591, + "logits/rejected": -0.4729549288749695, + "logps/chosen": -48.65248107910156, + "logps/rejected": -100.26100158691406, + "loss": 0.7101, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.909318208694458, + "rewards/margins": 5.520568370819092, + "rewards/rejected": -2.611250162124634, + "step": 4641 + }, + { + "epoch": 1.16, + "grad_norm": 3.824166774749756, + "learning_rate": 1.2063194807544748e-06, + "logits/chosen": -0.35472846031188965, + "logits/rejected": -0.40282654762268066, + "logps/chosen": -51.967628479003906, + "logps/rejected": -77.78021240234375, + "loss": 0.7678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.081812858581543, + "rewards/margins": 5.06043815612793, + "rewards/rejected": -1.9786251783370972, + "step": 4642 + }, + { + "epoch": 1.16, + "grad_norm": 5.1719841957092285, + "learning_rate": 1.2046135075751003e-06, + "logits/chosen": -0.2256195992231369, + "logits/rejected": -0.33657145500183105, + "logps/chosen": -56.344688415527344, + "logps/rejected": -79.67623138427734, + "loss": 0.8133, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.865004539489746, + "rewards/margins": 5.108559608459473, + "rewards/rejected": -2.2435553073883057, + "step": 4643 + }, + { + "epoch": 1.16, + "grad_norm": 5.62833309173584, + "learning_rate": 1.2029085763112142e-06, + "logits/chosen": -0.31421321630477905, + "logits/rejected": -0.39639031887054443, + "logps/chosen": -46.94294357299805, + "logps/rejected": -79.58988189697266, + "loss": 0.6971, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8285903930664062, + "rewards/margins": 4.881009101867676, + "rewards/rejected": -2.052417755126953, + "step": 4644 + }, + { + "epoch": 1.16, + "grad_norm": 9.628883361816406, + "learning_rate": 1.2012046874308602e-06, + "logits/chosen": -0.28342461585998535, + "logits/rejected": -0.3730226755142212, + "logps/chosen": -68.51190185546875, + "logps/rejected": -79.40584564208984, + "loss": 0.9042, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.631380796432495, + "rewards/margins": 3.476478338241577, + "rewards/rejected": -0.845097541809082, + "step": 4645 + }, + { + "epoch": 1.16, + "grad_norm": 6.256176948547363, + "learning_rate": 1.199501841401791e-06, + "logits/chosen": -0.3480464220046997, + "logits/rejected": -0.3880981206893921, + "logps/chosen": -52.436492919921875, + "logps/rejected": -100.34124755859375, + "loss": 0.7035, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8297529220581055, + "rewards/margins": 4.278768062591553, + "rewards/rejected": -1.4490152597427368, + "step": 4646 + }, + { + "epoch": 1.16, + "grad_norm": 4.145107746124268, + "learning_rate": 1.197800038691475e-06, + "logits/chosen": -0.30546218156814575, + "logits/rejected": -0.4490029215812683, + "logps/chosen": -64.1956787109375, + "logps/rejected": -64.60930633544922, + "loss": 0.8201, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.761786937713623, + "rewards/margins": 4.231475353240967, + "rewards/rejected": -1.4696886539459229, + "step": 4647 + }, + { + "epoch": 1.16, + "grad_norm": 6.683682441711426, + "learning_rate": 1.1960992797670935e-06, + "logits/chosen": -0.2827014923095703, + "logits/rejected": -0.33141854405403137, + "logps/chosen": -62.24189758300781, + "logps/rejected": -99.90293884277344, + "loss": 0.8214, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8312668800354004, + "rewards/margins": 4.38671350479126, + "rewards/rejected": -1.5554468631744385, + "step": 4648 + }, + { + "epoch": 1.16, + "grad_norm": 4.493462562561035, + "learning_rate": 1.1943995650955391e-06, + "logits/chosen": -0.31474974751472473, + "logits/rejected": -0.3594641387462616, + "logps/chosen": -55.92140197753906, + "logps/rejected": -79.97888946533203, + "loss": 0.8353, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8706657886505127, + "rewards/margins": 3.5583977699279785, + "rewards/rejected": -0.6877323389053345, + "step": 4649 + }, + { + "epoch": 1.16, + "grad_norm": 2.684497117996216, + "learning_rate": 1.192700895143426e-06, + "logits/chosen": -0.3584514856338501, + "logits/rejected": -0.39936405420303345, + "logps/chosen": -46.835174560546875, + "logps/rejected": -97.44135284423828, + "loss": 0.6278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1798839569091797, + "rewards/margins": 5.014939308166504, + "rewards/rejected": -1.8350552320480347, + "step": 4650 + }, + { + "epoch": 1.16, + "grad_norm": 3.1946160793304443, + "learning_rate": 1.1910032703770691e-06, + "logits/chosen": -0.28722912073135376, + "logits/rejected": -0.4228818416595459, + "logps/chosen": -47.44419860839844, + "logps/rejected": -81.89623260498047, + "loss": 0.565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.062920570373535, + "rewards/margins": 5.181759357452393, + "rewards/rejected": -2.1188387870788574, + "step": 4651 + }, + { + "epoch": 1.16, + "grad_norm": 8.73465633392334, + "learning_rate": 1.1893066912625078e-06, + "logits/chosen": -0.37649402022361755, + "logits/rejected": -0.45804563164711, + "logps/chosen": -59.67935562133789, + "logps/rejected": -78.2119140625, + "loss": 0.7964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.641148805618286, + "rewards/margins": 4.146108627319336, + "rewards/rejected": -1.5049598217010498, + "step": 4652 + }, + { + "epoch": 1.16, + "grad_norm": 4.789882183074951, + "learning_rate": 1.1876111582654882e-06, + "logits/chosen": -0.27642959356307983, + "logits/rejected": -0.3235277235507965, + "logps/chosen": -73.10368347167969, + "logps/rejected": -87.56282043457031, + "loss": 0.7752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.927605628967285, + "rewards/margins": 4.0016584396362305, + "rewards/rejected": -1.0740524530410767, + "step": 4653 + }, + { + "epoch": 1.16, + "grad_norm": 4.796889305114746, + "learning_rate": 1.1859166718514703e-06, + "logits/chosen": -0.3453756272792816, + "logits/rejected": -0.5020222663879395, + "logps/chosen": -55.868507385253906, + "logps/rejected": -73.68705749511719, + "loss": 0.7481, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0794787406921387, + "rewards/margins": 4.360586643218994, + "rewards/rejected": -1.2811076641082764, + "step": 4654 + }, + { + "epoch": 1.16, + "grad_norm": 3.3164889812469482, + "learning_rate": 1.1842232324856273e-06, + "logits/chosen": -0.3960185945034027, + "logits/rejected": -0.5561850666999817, + "logps/chosen": -44.63701629638672, + "logps/rejected": -75.21668243408203, + "loss": 0.5966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0664846897125244, + "rewards/margins": 5.844412803649902, + "rewards/rejected": -2.777928352355957, + "step": 4655 + }, + { + "epoch": 1.16, + "grad_norm": 8.04580020904541, + "learning_rate": 1.1825308406328435e-06, + "logits/chosen": -0.2831278443336487, + "logits/rejected": -0.3795597553253174, + "logps/chosen": -60.330360412597656, + "logps/rejected": -86.30549621582031, + "loss": 0.7252, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.975350856781006, + "rewards/margins": 5.466432571411133, + "rewards/rejected": -2.491081476211548, + "step": 4656 + }, + { + "epoch": 1.16, + "grad_norm": 6.167526721954346, + "learning_rate": 1.1808394967577198e-06, + "logits/chosen": -0.27101102471351624, + "logits/rejected": -0.43604469299316406, + "logps/chosen": -60.75697708129883, + "logps/rejected": -82.53030395507812, + "loss": 0.8302, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.828023910522461, + "rewards/margins": 4.545347213745117, + "rewards/rejected": -1.7173231840133667, + "step": 4657 + }, + { + "epoch": 1.17, + "grad_norm": 5.4654645919799805, + "learning_rate": 1.1791492013245654e-06, + "logits/chosen": -0.2926133871078491, + "logits/rejected": -0.38676249980926514, + "logps/chosen": -54.41941452026367, + "logps/rejected": -79.69746398925781, + "loss": 0.7799, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9168803691864014, + "rewards/margins": 4.275561809539795, + "rewards/rejected": -1.3586819171905518, + "step": 4658 + }, + { + "epoch": 1.17, + "grad_norm": 7.58158016204834, + "learning_rate": 1.1774599547974025e-06, + "logits/chosen": -0.3420430123806, + "logits/rejected": -0.4502304494380951, + "logps/chosen": -59.990291595458984, + "logps/rejected": -98.45317840576172, + "loss": 0.737, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.14058518409729, + "rewards/margins": 6.164752006530762, + "rewards/rejected": -3.0241665840148926, + "step": 4659 + }, + { + "epoch": 1.17, + "grad_norm": 7.7920122146606445, + "learning_rate": 1.1757717576399658e-06, + "logits/chosen": -0.351311594247818, + "logits/rejected": -0.4561803638935089, + "logps/chosen": -59.93364715576172, + "logps/rejected": -88.7212142944336, + "loss": 0.7473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.923149824142456, + "rewards/margins": 5.181567668914795, + "rewards/rejected": -2.258418083190918, + "step": 4660 + }, + { + "epoch": 1.17, + "grad_norm": 4.025489330291748, + "learning_rate": 1.174084610315701e-06, + "logits/chosen": -0.3523383140563965, + "logits/rejected": -0.4205430746078491, + "logps/chosen": -45.56758117675781, + "logps/rejected": -77.59913635253906, + "loss": 0.6539, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7473058700561523, + "rewards/margins": 4.907289981842041, + "rewards/rejected": -2.159984588623047, + "step": 4661 + }, + { + "epoch": 1.17, + "grad_norm": 3.5861799716949463, + "learning_rate": 1.1723985132877686e-06, + "logits/chosen": -0.32093775272369385, + "logits/rejected": -0.4032716751098633, + "logps/chosen": -56.12674331665039, + "logps/rejected": -81.98025512695312, + "loss": 0.7206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.073751449584961, + "rewards/margins": 4.860232353210449, + "rewards/rejected": -1.786481261253357, + "step": 4662 + }, + { + "epoch": 1.17, + "grad_norm": 4.193518161773682, + "learning_rate": 1.170713467019039e-06, + "logits/chosen": -0.420340895652771, + "logits/rejected": -0.49537110328674316, + "logps/chosen": -47.279380798339844, + "logps/rejected": -85.27863311767578, + "loss": 0.6332, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9452052116394043, + "rewards/margins": 5.340099811553955, + "rewards/rejected": -2.394894599914551, + "step": 4663 + }, + { + "epoch": 1.17, + "grad_norm": 6.774667739868164, + "learning_rate": 1.1690294719720891e-06, + "logits/chosen": -0.428268164396286, + "logits/rejected": -0.4932052195072174, + "logps/chosen": -60.66913986206055, + "logps/rejected": -77.7186508178711, + "loss": 0.7893, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.755768299102783, + "rewards/margins": 3.5606577396392822, + "rewards/rejected": -0.8048895597457886, + "step": 4664 + }, + { + "epoch": 1.17, + "grad_norm": 4.232960224151611, + "learning_rate": 1.1673465286092162e-06, + "logits/chosen": -0.274838387966156, + "logits/rejected": -0.35261180996894836, + "logps/chosen": -70.86811828613281, + "logps/rejected": -82.3357162475586, + "loss": 0.7814, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9957621097564697, + "rewards/margins": 4.5173444747924805, + "rewards/rejected": -1.5215827226638794, + "step": 4665 + }, + { + "epoch": 1.17, + "grad_norm": 4.231084823608398, + "learning_rate": 1.1656646373924234e-06, + "logits/chosen": -0.2959027588367462, + "logits/rejected": -0.42902904748916626, + "logps/chosen": -49.32643127441406, + "logps/rejected": -72.82645416259766, + "loss": 0.6912, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1145899295806885, + "rewards/margins": 4.738258361816406, + "rewards/rejected": -1.623668909072876, + "step": 4666 + }, + { + "epoch": 1.17, + "grad_norm": 7.951078414916992, + "learning_rate": 1.1639837987834268e-06, + "logits/chosen": -0.39903169870376587, + "logits/rejected": -0.5065258145332336, + "logps/chosen": -67.49833679199219, + "logps/rejected": -78.76951599121094, + "loss": 0.6808, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3389437198638916, + "rewards/margins": 4.71760368347168, + "rewards/rejected": -1.3786598443984985, + "step": 4667 + }, + { + "epoch": 1.17, + "grad_norm": 4.116170883178711, + "learning_rate": 1.1623040132436515e-06, + "logits/chosen": -0.3132786452770233, + "logits/rejected": -0.38896435499191284, + "logps/chosen": -51.15443420410156, + "logps/rejected": -94.0441665649414, + "loss": 0.6522, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9022812843322754, + "rewards/margins": 5.450749397277832, + "rewards/rejected": -2.5484681129455566, + "step": 4668 + }, + { + "epoch": 1.17, + "grad_norm": 3.5149424076080322, + "learning_rate": 1.160625281234234e-06, + "logits/chosen": -0.29963913559913635, + "logits/rejected": -0.4359464645385742, + "logps/chosen": -58.21388626098633, + "logps/rejected": -89.09762573242188, + "loss": 0.7295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858731269836426, + "rewards/margins": 5.497477054595947, + "rewards/rejected": -2.6387455463409424, + "step": 4669 + }, + { + "epoch": 1.17, + "grad_norm": 4.459079265594482, + "learning_rate": 1.1589476032160258e-06, + "logits/chosen": -0.35674169659614563, + "logits/rejected": -0.4702287018299103, + "logps/chosen": -53.06187057495117, + "logps/rejected": -77.81462097167969, + "loss": 0.6837, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9339239597320557, + "rewards/margins": 5.01372766494751, + "rewards/rejected": -2.0798041820526123, + "step": 4670 + }, + { + "epoch": 1.17, + "grad_norm": 8.744780540466309, + "learning_rate": 1.1572709796495834e-06, + "logits/chosen": -0.3455684781074524, + "logits/rejected": -0.46324992179870605, + "logps/chosen": -58.51951599121094, + "logps/rejected": -89.11577606201172, + "loss": 0.6534, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3076012134552, + "rewards/margins": 5.241516590118408, + "rewards/rejected": -1.9339154958724976, + "step": 4671 + }, + { + "epoch": 1.17, + "grad_norm": 4.1415324211120605, + "learning_rate": 1.1555954109951772e-06, + "logits/chosen": -0.3113911747932434, + "logits/rejected": -0.42619821429252625, + "logps/chosen": -52.489986419677734, + "logps/rejected": -86.57174682617188, + "loss": 0.636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.816701889038086, + "rewards/margins": 5.312322616577148, + "rewards/rejected": -2.4956204891204834, + "step": 4672 + }, + { + "epoch": 1.17, + "grad_norm": 6.2603912353515625, + "learning_rate": 1.1539208977127858e-06, + "logits/chosen": -0.29659318923950195, + "logits/rejected": -0.40297165513038635, + "logps/chosen": -59.129493713378906, + "logps/rejected": -94.50939178466797, + "loss": 0.7108, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7338404655456543, + "rewards/margins": 5.523255825042725, + "rewards/rejected": -2.7894153594970703, + "step": 4673 + }, + { + "epoch": 1.17, + "grad_norm": 5.792788505554199, + "learning_rate": 1.1522474402620988e-06, + "logits/chosen": -0.28304341435432434, + "logits/rejected": -0.3446826934814453, + "logps/chosen": -55.99653625488281, + "logps/rejected": -81.0463638305664, + "loss": 0.6805, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8014719486236572, + "rewards/margins": 3.987567663192749, + "rewards/rejected": -1.1860955953598022, + "step": 4674 + }, + { + "epoch": 1.17, + "grad_norm": 7.791999816894531, + "learning_rate": 1.1505750391025204e-06, + "logits/chosen": -0.31432563066482544, + "logits/rejected": -0.4011569619178772, + "logps/chosen": -56.23414993286133, + "logps/rejected": -92.56192779541016, + "loss": 0.8025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0595898628234863, + "rewards/margins": 4.456531524658203, + "rewards/rejected": -1.3969416618347168, + "step": 4675 + }, + { + "epoch": 1.17, + "grad_norm": 5.3987579345703125, + "learning_rate": 1.1489036946931548e-06, + "logits/chosen": -0.3155052661895752, + "logits/rejected": -0.45505645871162415, + "logps/chosen": -50.177799224853516, + "logps/rejected": -71.00051879882812, + "loss": 0.769, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8322231769561768, + "rewards/margins": 4.824934005737305, + "rewards/rejected": -1.9927107095718384, + "step": 4676 + }, + { + "epoch": 1.17, + "grad_norm": 5.9807024002075195, + "learning_rate": 1.147233407492826e-06, + "logits/chosen": -0.28666943311691284, + "logits/rejected": -0.38219472765922546, + "logps/chosen": -55.02061462402344, + "logps/rejected": -97.68213653564453, + "loss": 0.7635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.53408145904541, + "rewards/margins": 4.5122833251953125, + "rewards/rejected": -1.9782017469406128, + "step": 4677 + }, + { + "epoch": 1.17, + "grad_norm": 4.454338073730469, + "learning_rate": 1.1455641779600634e-06, + "logits/chosen": -0.40067052841186523, + "logits/rejected": -0.5247626900672913, + "logps/chosen": -52.631980895996094, + "logps/rejected": -66.71260070800781, + "loss": 0.8052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9296305179595947, + "rewards/margins": 4.967466354370117, + "rewards/rejected": -2.0378355979919434, + "step": 4678 + }, + { + "epoch": 1.17, + "grad_norm": 4.4102911949157715, + "learning_rate": 1.1438960065531063e-06, + "logits/chosen": -0.3460443317890167, + "logits/rejected": -0.4303760528564453, + "logps/chosen": -47.517730712890625, + "logps/rejected": -71.30594635009766, + "loss": 0.746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.897432327270508, + "rewards/margins": 4.679620742797852, + "rewards/rejected": -1.7821887731552124, + "step": 4679 + }, + { + "epoch": 1.17, + "grad_norm": 7.911617755889893, + "learning_rate": 1.1422288937299037e-06, + "logits/chosen": -0.3767828643321991, + "logits/rejected": -0.5191869139671326, + "logps/chosen": -71.19717407226562, + "logps/rejected": -75.39688873291016, + "loss": 0.6954, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799710750579834, + "rewards/margins": 4.131855487823486, + "rewards/rejected": -1.3321444988250732, + "step": 4680 + }, + { + "epoch": 1.17, + "grad_norm": 3.996574640274048, + "learning_rate": 1.1405628399481123e-06, + "logits/chosen": -0.32885101437568665, + "logits/rejected": -0.45310619473457336, + "logps/chosen": -54.81096267700195, + "logps/rejected": -78.99916076660156, + "loss": 0.6557, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9971261024475098, + "rewards/margins": 4.483733654022217, + "rewards/rejected": -1.486607551574707, + "step": 4681 + }, + { + "epoch": 1.17, + "grad_norm": 6.328601360321045, + "learning_rate": 1.1388978456651023e-06, + "logits/chosen": -0.3928048014640808, + "logits/rejected": -0.49583256244659424, + "logps/chosen": -58.87079620361328, + "logps/rejected": -98.19581604003906, + "loss": 0.7173, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0058774948120117, + "rewards/margins": 6.033227920532227, + "rewards/rejected": -3.0273501873016357, + "step": 4682 + }, + { + "epoch": 1.17, + "grad_norm": 5.108282566070557, + "learning_rate": 1.1372339113379515e-06, + "logits/chosen": -0.29698771238327026, + "logits/rejected": -0.31794604659080505, + "logps/chosen": -52.60880661010742, + "logps/rejected": -91.2740249633789, + "loss": 0.7635, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.070044755935669, + "rewards/margins": 4.186040878295898, + "rewards/rejected": -1.1159958839416504, + "step": 4683 + }, + { + "epoch": 1.17, + "grad_norm": 13.686917304992676, + "learning_rate": 1.1355710374234414e-06, + "logits/chosen": -0.30052292346954346, + "logits/rejected": -0.3633517622947693, + "logps/chosen": -48.6317138671875, + "logps/rejected": -85.38965606689453, + "loss": 0.658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6546425819396973, + "rewards/margins": 4.821776866912842, + "rewards/rejected": -2.1671342849731445, + "step": 4684 + }, + { + "epoch": 1.17, + "grad_norm": 3.5011067390441895, + "learning_rate": 1.1339092243780703e-06, + "logits/chosen": -0.38230833411216736, + "logits/rejected": -0.4395827054977417, + "logps/chosen": -54.66385269165039, + "logps/rejected": -83.50839233398438, + "loss": 0.7229, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205812454223633, + "rewards/margins": 4.534878253936768, + "rewards/rejected": -1.329065203666687, + "step": 4685 + }, + { + "epoch": 1.17, + "grad_norm": 5.192705154418945, + "learning_rate": 1.1322484726580396e-06, + "logits/chosen": -0.3241656422615051, + "logits/rejected": -0.3629741668701172, + "logps/chosen": -62.04679489135742, + "logps/rejected": -83.25936126708984, + "loss": 0.8607, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.828266143798828, + "rewards/margins": 3.4556503295898438, + "rewards/rejected": -0.6273844242095947, + "step": 4686 + }, + { + "epoch": 1.17, + "grad_norm": 4.824768543243408, + "learning_rate": 1.1305887827192652e-06, + "logits/chosen": -0.4644582271575928, + "logits/rejected": -0.5205553770065308, + "logps/chosen": -50.350608825683594, + "logps/rejected": -73.25078582763672, + "loss": 0.7547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.942586898803711, + "rewards/margins": 3.7680182456970215, + "rewards/rejected": -0.8254313468933105, + "step": 4687 + }, + { + "epoch": 1.17, + "grad_norm": 3.9978740215301514, + "learning_rate": 1.1289301550173643e-06, + "logits/chosen": -0.30055010318756104, + "logits/rejected": -0.3952384293079376, + "logps/chosen": -47.05885696411133, + "logps/rejected": -78.61534881591797, + "loss": 0.614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0168139934539795, + "rewards/margins": 4.988770484924316, + "rewards/rejected": -1.9719557762145996, + "step": 4688 + }, + { + "epoch": 1.17, + "grad_norm": 4.226231098175049, + "learning_rate": 1.1272725900076653e-06, + "logits/chosen": -0.3543447256088257, + "logits/rejected": -0.4658023715019226, + "logps/chosen": -58.30118179321289, + "logps/rejected": -103.05291748046875, + "loss": 0.7432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7572097778320312, + "rewards/margins": 4.954404830932617, + "rewards/rejected": -2.197195053100586, + "step": 4689 + }, + { + "epoch": 1.17, + "grad_norm": 3.622074604034424, + "learning_rate": 1.1256160881452083e-06, + "logits/chosen": -0.37139981985092163, + "logits/rejected": -0.43746036291122437, + "logps/chosen": -50.513710021972656, + "logps/rejected": -74.43955993652344, + "loss": 0.6437, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9863290786743164, + "rewards/margins": 4.59554386138916, + "rewards/rejected": -1.6092151403427124, + "step": 4690 + }, + { + "epoch": 1.17, + "grad_norm": 3.4432437419891357, + "learning_rate": 1.1239606498847382e-06, + "logits/chosen": -0.3394644260406494, + "logits/rejected": -0.46792837977409363, + "logps/chosen": -61.160064697265625, + "logps/rejected": -99.57929992675781, + "loss": 0.5876, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.018183708190918, + "rewards/margins": 5.850248336791992, + "rewards/rejected": -2.832064390182495, + "step": 4691 + }, + { + "epoch": 1.17, + "grad_norm": 16.128740310668945, + "learning_rate": 1.122306275680708e-06, + "logits/chosen": -0.374063640832901, + "logits/rejected": -0.516806423664093, + "logps/chosen": -60.65092468261719, + "logps/rejected": -82.52952575683594, + "loss": 0.685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.818098545074463, + "rewards/margins": 5.601718425750732, + "rewards/rejected": -2.783620595932007, + "step": 4692 + }, + { + "epoch": 1.17, + "grad_norm": 5.905430793762207, + "learning_rate": 1.1206529659872788e-06, + "logits/chosen": -0.37978997826576233, + "logits/rejected": -0.4022984206676483, + "logps/chosen": -48.32473373413086, + "logps/rejected": -99.3099365234375, + "loss": 0.6887, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.087005376815796, + "rewards/margins": 4.973646640777588, + "rewards/rejected": -1.8866413831710815, + "step": 4693 + }, + { + "epoch": 1.17, + "grad_norm": 8.218137741088867, + "learning_rate": 1.1190007212583192e-06, + "logits/chosen": -0.3432236611843109, + "logits/rejected": -0.3522890508174896, + "logps/chosen": -69.10021209716797, + "logps/rejected": -110.15657043457031, + "loss": 0.9016, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.735511064529419, + "rewards/margins": 4.410811424255371, + "rewards/rejected": -1.6753003597259521, + "step": 4694 + }, + { + "epoch": 1.17, + "grad_norm": 11.954920768737793, + "learning_rate": 1.1173495419474094e-06, + "logits/chosen": -0.30110764503479004, + "logits/rejected": -0.46468642354011536, + "logps/chosen": -61.11760330200195, + "logps/rejected": -77.09841918945312, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.765312433242798, + "rewards/margins": 5.337275505065918, + "rewards/rejected": -2.571962356567383, + "step": 4695 + }, + { + "epoch": 1.17, + "grad_norm": 11.136476516723633, + "learning_rate": 1.1156994285078287e-06, + "logits/chosen": -0.37014931440353394, + "logits/rejected": -0.44920486211776733, + "logps/chosen": -56.765663146972656, + "logps/rejected": -83.8776626586914, + "loss": 0.8241, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.897580146789551, + "rewards/margins": 4.418067932128906, + "rewards/rejected": -1.5204871892929077, + "step": 4696 + }, + { + "epoch": 1.17, + "grad_norm": 4.503375053405762, + "learning_rate": 1.1140503813925728e-06, + "logits/chosen": -0.33294913172721863, + "logits/rejected": -0.42501285672187805, + "logps/chosen": -60.36052322387695, + "logps/rejected": -93.93685150146484, + "loss": 0.6984, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7953543663024902, + "rewards/margins": 4.754815101623535, + "rewards/rejected": -1.9594602584838867, + "step": 4697 + }, + { + "epoch": 1.18, + "grad_norm": 4.207986831665039, + "learning_rate": 1.1124024010543393e-06, + "logits/chosen": -0.39164939522743225, + "logits/rejected": -0.44152116775512695, + "logps/chosen": -52.154998779296875, + "logps/rejected": -106.29669189453125, + "loss": 0.7391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.974879503250122, + "rewards/margins": 5.500053405761719, + "rewards/rejected": -2.5251739025115967, + "step": 4698 + }, + { + "epoch": 1.18, + "grad_norm": 13.647074699401855, + "learning_rate": 1.1107554879455346e-06, + "logits/chosen": -0.32103878259658813, + "logits/rejected": -0.4058161973953247, + "logps/chosen": -60.43930435180664, + "logps/rejected": -84.44988250732422, + "loss": 0.8575, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8611533641815186, + "rewards/margins": 4.02655553817749, + "rewards/rejected": -1.1654022932052612, + "step": 4699 + }, + { + "epoch": 1.18, + "grad_norm": 3.987151622772217, + "learning_rate": 1.1091096425182719e-06, + "logits/chosen": -0.31069597601890564, + "logits/rejected": -0.4601704776287079, + "logps/chosen": -57.37870788574219, + "logps/rejected": -82.96195220947266, + "loss": 0.6563, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1012864112854004, + "rewards/margins": 5.994009971618652, + "rewards/rejected": -2.892723560333252, + "step": 4700 + }, + { + "epoch": 1.18, + "grad_norm": 5.378357887268066, + "learning_rate": 1.1074648652243692e-06, + "logits/chosen": -0.24685539305210114, + "logits/rejected": -0.3608555793762207, + "logps/chosen": -79.68154907226562, + "logps/rejected": -82.37740325927734, + "loss": 0.8115, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.909583330154419, + "rewards/margins": 3.8333358764648438, + "rewards/rejected": -0.9237526059150696, + "step": 4701 + }, + { + "epoch": 1.18, + "grad_norm": 3.3695051670074463, + "learning_rate": 1.105821156515357e-06, + "logits/chosen": -0.4253925681114197, + "logits/rejected": -0.497491717338562, + "logps/chosen": -46.59107971191406, + "logps/rejected": -99.4087142944336, + "loss": 0.6803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.829516887664795, + "rewards/margins": 6.036004543304443, + "rewards/rejected": -3.2064871788024902, + "step": 4702 + }, + { + "epoch": 1.18, + "grad_norm": 3.076547384262085, + "learning_rate": 1.1041785168424667e-06, + "logits/chosen": -0.41582435369491577, + "logits/rejected": -0.4963434040546417, + "logps/chosen": -47.523468017578125, + "logps/rejected": -79.84110260009766, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2178170680999756, + "rewards/margins": 5.7386956214904785, + "rewards/rejected": -2.520879030227661, + "step": 4703 + }, + { + "epoch": 1.18, + "grad_norm": 4.771617412567139, + "learning_rate": 1.1025369466566394e-06, + "logits/chosen": -0.3345516324043274, + "logits/rejected": -0.4639633595943451, + "logps/chosen": -71.14987182617188, + "logps/rejected": -86.33279418945312, + "loss": 0.7452, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.040403366088867, + "rewards/margins": 5.467470169067383, + "rewards/rejected": -2.4270668029785156, + "step": 4704 + }, + { + "epoch": 1.18, + "grad_norm": 3.3672068119049072, + "learning_rate": 1.1008964464085203e-06, + "logits/chosen": -0.37304991483688354, + "logits/rejected": -0.5102252960205078, + "logps/chosen": -52.999019622802734, + "logps/rejected": -83.86506652832031, + "loss": 0.6013, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9244372844696045, + "rewards/margins": 5.024481296539307, + "rewards/rejected": -2.100043773651123, + "step": 4705 + }, + { + "epoch": 1.18, + "grad_norm": 5.276784420013428, + "learning_rate": 1.0992570165484618e-06, + "logits/chosen": -0.3605310916900635, + "logits/rejected": -0.4419606328010559, + "logps/chosen": -52.57094955444336, + "logps/rejected": -80.15949249267578, + "loss": 0.7146, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8457565307617188, + "rewards/margins": 4.955009460449219, + "rewards/rejected": -2.109252452850342, + "step": 4706 + }, + { + "epoch": 1.18, + "grad_norm": 10.129308700561523, + "learning_rate": 1.0976186575265264e-06, + "logits/chosen": -0.3037460446357727, + "logits/rejected": -0.38482728600502014, + "logps/chosen": -59.53241729736328, + "logps/rejected": -94.54354858398438, + "loss": 0.7292, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83236026763916, + "rewards/margins": 4.01033878326416, + "rewards/rejected": -1.177978515625, + "step": 4707 + }, + { + "epoch": 1.18, + "grad_norm": 7.240886688232422, + "learning_rate": 1.0959813697924743e-06, + "logits/chosen": -0.35090869665145874, + "logits/rejected": -0.40825849771499634, + "logps/chosen": -48.270294189453125, + "logps/rejected": -91.71615600585938, + "loss": 0.8118, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0674796104431152, + "rewards/margins": 4.403047561645508, + "rewards/rejected": -1.3355680704116821, + "step": 4708 + }, + { + "epoch": 1.18, + "grad_norm": 5.12740421295166, + "learning_rate": 1.0943451537957771e-06, + "logits/chosen": -0.31826138496398926, + "logits/rejected": -0.4672302305698395, + "logps/chosen": -58.525184631347656, + "logps/rejected": -87.53441619873047, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.561710834503174, + "rewards/margins": 4.942723274230957, + "rewards/rejected": -2.381012439727783, + "step": 4709 + }, + { + "epoch": 1.18, + "grad_norm": 7.744482517242432, + "learning_rate": 1.0927100099856142e-06, + "logits/chosen": -0.3283389210700989, + "logits/rejected": -0.39164841175079346, + "logps/chosen": -55.832740783691406, + "logps/rejected": -88.46949768066406, + "loss": 0.6823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7055463790893555, + "rewards/margins": 4.512448310852051, + "rewards/rejected": -1.8069021701812744, + "step": 4710 + }, + { + "epoch": 1.18, + "grad_norm": 3.142042875289917, + "learning_rate": 1.091075938810866e-06, + "logits/chosen": -0.4048765003681183, + "logits/rejected": -0.5581040382385254, + "logps/chosen": -50.73493194580078, + "logps/rejected": -80.44213104248047, + "loss": 0.6725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.90297794342041, + "rewards/margins": 5.853023529052734, + "rewards/rejected": -2.950045585632324, + "step": 4711 + }, + { + "epoch": 1.18, + "grad_norm": 7.678492546081543, + "learning_rate": 1.0894429407201207e-06, + "logits/chosen": -0.38058024644851685, + "logits/rejected": -0.42332449555397034, + "logps/chosen": -49.67064666748047, + "logps/rejected": -95.20101928710938, + "loss": 0.732, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.16922926902771, + "rewards/margins": 4.133722305297852, + "rewards/rejected": -0.9644931554794312, + "step": 4712 + }, + { + "epoch": 1.18, + "grad_norm": 4.6882219314575195, + "learning_rate": 1.0878110161616712e-06, + "logits/chosen": -0.403873085975647, + "logits/rejected": -0.5260862708091736, + "logps/chosen": -47.30960464477539, + "logps/rejected": -81.44705963134766, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.207982063293457, + "rewards/margins": 6.194356918334961, + "rewards/rejected": -2.986375331878662, + "step": 4713 + }, + { + "epoch": 1.18, + "grad_norm": 4.16732120513916, + "learning_rate": 1.0861801655835147e-06, + "logits/chosen": -0.30318206548690796, + "logits/rejected": -0.45565515756607056, + "logps/chosen": -67.6169204711914, + "logps/rejected": -81.61328125, + "loss": 0.7613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.792959213256836, + "rewards/margins": 4.253097057342529, + "rewards/rejected": -1.4601376056671143, + "step": 4714 + }, + { + "epoch": 1.18, + "grad_norm": 6.072813510894775, + "learning_rate": 1.084550389433358e-06, + "logits/chosen": -0.3866773545742035, + "logits/rejected": -0.4749290347099304, + "logps/chosen": -74.78103637695312, + "logps/rejected": -86.2367172241211, + "loss": 0.7235, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8295178413391113, + "rewards/margins": 4.830099105834961, + "rewards/rejected": -2.0005812644958496, + "step": 4715 + }, + { + "epoch": 1.18, + "grad_norm": 7.064413070678711, + "learning_rate": 1.082921688158608e-06, + "logits/chosen": -0.37252792716026306, + "logits/rejected": -0.42646628618240356, + "logps/chosen": -51.7943000793457, + "logps/rejected": -101.02445983886719, + "loss": 0.6575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8967435359954834, + "rewards/margins": 5.769118309020996, + "rewards/rejected": -2.8723745346069336, + "step": 4716 + }, + { + "epoch": 1.18, + "grad_norm": 3.809452772140503, + "learning_rate": 1.0812940622063783e-06, + "logits/chosen": -0.28039252758026123, + "logits/rejected": -0.4435274600982666, + "logps/chosen": -62.37351989746094, + "logps/rejected": -72.21598052978516, + "loss": 0.6269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8998422622680664, + "rewards/margins": 6.117610454559326, + "rewards/rejected": -3.2177677154541016, + "step": 4717 + }, + { + "epoch": 1.18, + "grad_norm": 10.469706535339355, + "learning_rate": 1.079667512023488e-06, + "logits/chosen": -0.33575183153152466, + "logits/rejected": -0.4350091218948364, + "logps/chosen": -55.99928283691406, + "logps/rejected": -83.77049255371094, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2522664070129395, + "rewards/margins": 5.116243839263916, + "rewards/rejected": -1.8639777898788452, + "step": 4718 + }, + { + "epoch": 1.18, + "grad_norm": 5.140777587890625, + "learning_rate": 1.0780420380564593e-06, + "logits/chosen": -0.39378821849823, + "logits/rejected": -0.4490151107311249, + "logps/chosen": -41.88814926147461, + "logps/rejected": -75.05506134033203, + "loss": 0.6281, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0202691555023193, + "rewards/margins": 4.031585693359375, + "rewards/rejected": -1.0113165378570557, + "step": 4719 + }, + { + "epoch": 1.18, + "grad_norm": 3.9866559505462646, + "learning_rate": 1.0764176407515203e-06, + "logits/chosen": -0.31196755170822144, + "logits/rejected": -0.3488543629646301, + "logps/chosen": -53.85740661621094, + "logps/rejected": -86.97344207763672, + "loss": 0.7249, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9727048873901367, + "rewards/margins": 4.413532257080078, + "rewards/rejected": -1.4408272504806519, + "step": 4720 + }, + { + "epoch": 1.18, + "grad_norm": 4.531817436218262, + "learning_rate": 1.0747943205546018e-06, + "logits/chosen": -0.39713945984840393, + "logits/rejected": -0.4273430407047272, + "logps/chosen": -46.30767822265625, + "logps/rejected": -93.47179412841797, + "loss": 0.7363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991985559463501, + "rewards/margins": 5.025958061218262, + "rewards/rejected": -2.033972978591919, + "step": 4721 + }, + { + "epoch": 1.18, + "grad_norm": 6.180261135101318, + "learning_rate": 1.073172077911343e-06, + "logits/chosen": -0.369964063167572, + "logits/rejected": -0.4429609775543213, + "logps/chosen": -58.49334716796875, + "logps/rejected": -87.21348571777344, + "loss": 0.9191, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6554317474365234, + "rewards/margins": 3.918018102645874, + "rewards/rejected": -1.2625864744186401, + "step": 4722 + }, + { + "epoch": 1.18, + "grad_norm": 4.438068389892578, + "learning_rate": 1.0715509132670832e-06, + "logits/chosen": -0.3090779781341553, + "logits/rejected": -0.4250885248184204, + "logps/chosen": -54.76052474975586, + "logps/rejected": -94.56285095214844, + "loss": 0.6344, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.655618667602539, + "rewards/margins": 5.573803424835205, + "rewards/rejected": -2.918185234069824, + "step": 4723 + }, + { + "epoch": 1.18, + "grad_norm": 5.7255024909973145, + "learning_rate": 1.0699308270668667e-06, + "logits/chosen": -0.48238998651504517, + "logits/rejected": -0.6063588261604309, + "logps/chosen": -59.27016067504883, + "logps/rejected": -81.221923828125, + "loss": 0.6954, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.912738084793091, + "rewards/margins": 5.939375877380371, + "rewards/rejected": -3.0266385078430176, + "step": 4724 + }, + { + "epoch": 1.18, + "grad_norm": 10.83761215209961, + "learning_rate": 1.0683118197554426e-06, + "logits/chosen": -0.3534616529941559, + "logits/rejected": -0.45889586210250854, + "logps/chosen": -56.63988494873047, + "logps/rejected": -75.89473724365234, + "loss": 0.7825, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.983645439147949, + "rewards/margins": 5.353428840637207, + "rewards/rejected": -2.369783401489258, + "step": 4725 + }, + { + "epoch": 1.18, + "grad_norm": 7.065704345703125, + "learning_rate": 1.0666938917772618e-06, + "logits/chosen": -0.3486177623271942, + "logits/rejected": -0.45527559518814087, + "logps/chosen": -63.64344787597656, + "logps/rejected": -81.0064697265625, + "loss": 0.7749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8599109649658203, + "rewards/margins": 4.50046443939209, + "rewards/rejected": -1.640553593635559, + "step": 4726 + }, + { + "epoch": 1.18, + "grad_norm": 6.307783603668213, + "learning_rate": 1.0650770435764828e-06, + "logits/chosen": -0.3349698781967163, + "logits/rejected": -0.4355529248714447, + "logps/chosen": -79.90155792236328, + "logps/rejected": -83.8203125, + "loss": 0.836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.719963788986206, + "rewards/margins": 4.700287342071533, + "rewards/rejected": -1.9803242683410645, + "step": 4727 + }, + { + "epoch": 1.18, + "grad_norm": 4.427134990692139, + "learning_rate": 1.0634612755969665e-06, + "logits/chosen": -0.3365894854068756, + "logits/rejected": -0.4007875621318817, + "logps/chosen": -54.96767044067383, + "logps/rejected": -92.5499038696289, + "loss": 0.6401, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8023061752319336, + "rewards/margins": 4.862748622894287, + "rewards/rejected": -2.0604424476623535, + "step": 4728 + }, + { + "epoch": 1.18, + "grad_norm": 4.805983066558838, + "learning_rate": 1.0618465882822703e-06, + "logits/chosen": -0.32139790058135986, + "logits/rejected": -0.4294244647026062, + "logps/chosen": -72.7268295288086, + "logps/rejected": -104.76791381835938, + "loss": 0.7536, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9814140796661377, + "rewards/margins": 5.579297065734863, + "rewards/rejected": -2.5978827476501465, + "step": 4729 + }, + { + "epoch": 1.18, + "grad_norm": 6.4657440185546875, + "learning_rate": 1.0602329820756657e-06, + "logits/chosen": -0.30690914392471313, + "logits/rejected": -0.44679030776023865, + "logps/chosen": -55.419761657714844, + "logps/rejected": -87.96470642089844, + "loss": 0.727, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.79779314994812, + "rewards/margins": 5.305209636688232, + "rewards/rejected": -2.5074167251586914, + "step": 4730 + }, + { + "epoch": 1.18, + "grad_norm": 5.104645252227783, + "learning_rate": 1.0586204574201208e-06, + "logits/chosen": -0.2885778844356537, + "logits/rejected": -0.3376966118812561, + "logps/chosen": -65.23179626464844, + "logps/rejected": -102.26399230957031, + "loss": 0.8263, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.016150712966919, + "rewards/margins": 4.7338995933532715, + "rewards/rejected": -1.717748761177063, + "step": 4731 + }, + { + "epoch": 1.18, + "grad_norm": 3.5755069255828857, + "learning_rate": 1.0570090147583089e-06, + "logits/chosen": -0.370002418756485, + "logits/rejected": -0.47382330894470215, + "logps/chosen": -49.75621032714844, + "logps/rejected": -76.08808135986328, + "loss": 0.6113, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.131617307662964, + "rewards/margins": 5.780401229858398, + "rewards/rejected": -2.648784637451172, + "step": 4732 + }, + { + "epoch": 1.18, + "grad_norm": 3.10201358795166, + "learning_rate": 1.0553986545326055e-06, + "logits/chosen": -0.4025358557701111, + "logits/rejected": -0.5396594405174255, + "logps/chosen": -59.651283264160156, + "logps/rejected": -75.55753326416016, + "loss": 0.7237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.978187084197998, + "rewards/margins": 4.632160663604736, + "rewards/rejected": -1.6539734601974487, + "step": 4733 + }, + { + "epoch": 1.18, + "grad_norm": 4.580054759979248, + "learning_rate": 1.0537893771850882e-06, + "logits/chosen": -0.33505189418792725, + "logits/rejected": -0.41463902592658997, + "logps/chosen": -51.735782623291016, + "logps/rejected": -94.28531646728516, + "loss": 0.679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.62654447555542, + "rewards/margins": 5.398171424865723, + "rewards/rejected": -2.7716269493103027, + "step": 4734 + }, + { + "epoch": 1.18, + "grad_norm": 5.4667816162109375, + "learning_rate": 1.0521811831575402e-06, + "logits/chosen": -0.41060346364974976, + "logits/rejected": -0.4245292544364929, + "logps/chosen": -49.877357482910156, + "logps/rejected": -96.990966796875, + "loss": 0.7405, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.880958318710327, + "rewards/margins": 4.601173400878906, + "rewards/rejected": -1.7202153205871582, + "step": 4735 + }, + { + "epoch": 1.18, + "grad_norm": 2.815373420715332, + "learning_rate": 1.050574072891445e-06, + "logits/chosen": -0.39150357246398926, + "logits/rejected": -0.43866685032844543, + "logps/chosen": -51.189918518066406, + "logps/rejected": -95.76161193847656, + "loss": 0.6238, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.971364974975586, + "rewards/margins": 5.273081302642822, + "rewards/rejected": -2.301715850830078, + "step": 4736 + }, + { + "epoch": 1.18, + "grad_norm": 4.969573974609375, + "learning_rate": 1.0489680468279884e-06, + "logits/chosen": -0.2976246178150177, + "logits/rejected": -0.36088740825653076, + "logps/chosen": -53.72455978393555, + "logps/rejected": -104.75860595703125, + "loss": 0.6862, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.950829029083252, + "rewards/margins": 4.323486328125, + "rewards/rejected": -1.3726574182510376, + "step": 4737 + }, + { + "epoch": 1.19, + "grad_norm": 3.7942044734954834, + "learning_rate": 1.0473631054080596e-06, + "logits/chosen": -0.3172764778137207, + "logits/rejected": -0.40523561835289, + "logps/chosen": -45.81488800048828, + "logps/rejected": -91.9027328491211, + "loss": 0.5548, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8727662563323975, + "rewards/margins": 5.511585235595703, + "rewards/rejected": -2.6388187408447266, + "step": 4738 + }, + { + "epoch": 1.19, + "grad_norm": 4.189241886138916, + "learning_rate": 1.045759249072248e-06, + "logits/chosen": -0.33441591262817383, + "logits/rejected": -0.45531603693962097, + "logps/chosen": -58.007686614990234, + "logps/rejected": -76.88223266601562, + "loss": 0.7159, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9249866008758545, + "rewards/margins": 5.252266883850098, + "rewards/rejected": -2.327279567718506, + "step": 4739 + }, + { + "epoch": 1.19, + "grad_norm": 10.398056030273438, + "learning_rate": 1.044156478260851e-06, + "logits/chosen": -0.33525609970092773, + "logits/rejected": -0.3991037607192993, + "logps/chosen": -52.426727294921875, + "logps/rejected": -78.81082153320312, + "loss": 0.748, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6947531700134277, + "rewards/margins": 3.7965009212493896, + "rewards/rejected": -1.1017476320266724, + "step": 4740 + }, + { + "epoch": 1.19, + "grad_norm": 7.7361040115356445, + "learning_rate": 1.0425547934138586e-06, + "logits/chosen": -0.32220420241355896, + "logits/rejected": -0.419183611869812, + "logps/chosen": -56.676204681396484, + "logps/rejected": -76.25495910644531, + "loss": 0.8318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6949732303619385, + "rewards/margins": 4.9164581298828125, + "rewards/rejected": -2.221485137939453, + "step": 4741 + }, + { + "epoch": 1.19, + "grad_norm": 8.66921615600586, + "learning_rate": 1.0409541949709718e-06, + "logits/chosen": -0.4121628403663635, + "logits/rejected": -0.4766054153442383, + "logps/chosen": -57.679298400878906, + "logps/rejected": -77.28584289550781, + "loss": 1.0573, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5922975540161133, + "rewards/margins": 3.8246712684631348, + "rewards/rejected": -1.2323737144470215, + "step": 4742 + }, + { + "epoch": 1.19, + "grad_norm": 2.9094343185424805, + "learning_rate": 1.0393546833715878e-06, + "logits/chosen": -0.34554946422576904, + "logits/rejected": -0.4449497163295746, + "logps/chosen": -54.29387664794922, + "logps/rejected": -84.90631866455078, + "loss": 0.608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.279374122619629, + "rewards/margins": 5.240431785583496, + "rewards/rejected": -1.9610567092895508, + "step": 4743 + }, + { + "epoch": 1.19, + "grad_norm": 4.766932010650635, + "learning_rate": 1.0377562590548067e-06, + "logits/chosen": -0.33486202359199524, + "logits/rejected": -0.449779748916626, + "logps/chosen": -47.63597106933594, + "logps/rejected": -91.07962036132812, + "loss": 0.6513, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7078280448913574, + "rewards/margins": 5.750265121459961, + "rewards/rejected": -3.0424373149871826, + "step": 4744 + }, + { + "epoch": 1.19, + "grad_norm": 4.596703052520752, + "learning_rate": 1.0361589224594321e-06, + "logits/chosen": -0.33176249265670776, + "logits/rejected": -0.4776536226272583, + "logps/chosen": -63.88866424560547, + "logps/rejected": -74.05430603027344, + "loss": 0.6442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9783754348754883, + "rewards/margins": 5.180400848388672, + "rewards/rejected": -2.202025890350342, + "step": 4745 + }, + { + "epoch": 1.19, + "grad_norm": 4.99848747253418, + "learning_rate": 1.034562674023964e-06, + "logits/chosen": -0.3923978805541992, + "logits/rejected": -0.44820836186408997, + "logps/chosen": -49.6859245300293, + "logps/rejected": -90.80790710449219, + "loss": 0.7882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7784743309020996, + "rewards/margins": 4.668942451477051, + "rewards/rejected": -1.8904680013656616, + "step": 4746 + }, + { + "epoch": 1.19, + "grad_norm": 4.555561065673828, + "learning_rate": 1.0329675141866109e-06, + "logits/chosen": -0.38059747219085693, + "logits/rejected": -0.47677505016326904, + "logps/chosen": -60.08332824707031, + "logps/rejected": -84.90213012695312, + "loss": 0.7505, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1716439723968506, + "rewards/margins": 5.094346046447754, + "rewards/rejected": -1.9227020740509033, + "step": 4747 + }, + { + "epoch": 1.19, + "grad_norm": 4.38608455657959, + "learning_rate": 1.031373443385278e-06, + "logits/chosen": -0.3907194435596466, + "logits/rejected": -0.4354219138622284, + "logps/chosen": -47.666107177734375, + "logps/rejected": -86.83292388916016, + "loss": 0.7374, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8367865085601807, + "rewards/margins": 5.570563793182373, + "rewards/rejected": -2.7337775230407715, + "step": 4748 + }, + { + "epoch": 1.19, + "grad_norm": 5.142448425292969, + "learning_rate": 1.0297804620575685e-06, + "logits/chosen": -0.24233388900756836, + "logits/rejected": -0.342422217130661, + "logps/chosen": -63.24666976928711, + "logps/rejected": -77.20689392089844, + "loss": 0.7154, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0115435123443604, + "rewards/margins": 4.142712593078613, + "rewards/rejected": -1.1311695575714111, + "step": 4749 + }, + { + "epoch": 1.19, + "grad_norm": 13.246335983276367, + "learning_rate": 1.0281885706407946e-06, + "logits/chosen": -0.385611355304718, + "logits/rejected": -0.45743483304977417, + "logps/chosen": -52.30619812011719, + "logps/rejected": -80.27039337158203, + "loss": 0.7784, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.894075870513916, + "rewards/margins": 4.104762077331543, + "rewards/rejected": -1.210686445236206, + "step": 4750 + }, + { + "epoch": 1.19, + "grad_norm": 7.17722225189209, + "learning_rate": 1.0265977695719614e-06, + "logits/chosen": -0.40261417627334595, + "logits/rejected": -0.4642411172389984, + "logps/chosen": -50.313446044921875, + "logps/rejected": -94.22993469238281, + "loss": 0.6954, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4302093982696533, + "rewards/margins": 5.490762233734131, + "rewards/rejected": -2.060553550720215, + "step": 4751 + }, + { + "epoch": 1.19, + "grad_norm": 4.7773756980896, + "learning_rate": 1.025008059287782e-06, + "logits/chosen": -0.2305636703968048, + "logits/rejected": -0.4266952872276306, + "logps/chosen": -78.29180145263672, + "logps/rejected": -79.85055541992188, + "loss": 0.8025, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9337778091430664, + "rewards/margins": 5.44558048248291, + "rewards/rejected": -2.5118021965026855, + "step": 4752 + }, + { + "epoch": 1.19, + "grad_norm": 6.450586318969727, + "learning_rate": 1.0234194402246628e-06, + "logits/chosen": -0.38308635354042053, + "logits/rejected": -0.4525734782218933, + "logps/chosen": -67.91976165771484, + "logps/rejected": -91.6673812866211, + "loss": 0.7984, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7508437633514404, + "rewards/margins": 4.819178104400635, + "rewards/rejected": -2.0683345794677734, + "step": 4753 + }, + { + "epoch": 1.19, + "grad_norm": 4.133342266082764, + "learning_rate": 1.0218319128187137e-06, + "logits/chosen": -0.4303304851055145, + "logits/rejected": -0.47459086775779724, + "logps/chosen": -43.50742721557617, + "logps/rejected": -87.43720245361328, + "loss": 0.7254, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9336137771606445, + "rewards/margins": 4.4786057472229, + "rewards/rejected": -1.5449917316436768, + "step": 4754 + }, + { + "epoch": 1.19, + "grad_norm": 4.885046005249023, + "learning_rate": 1.0202454775057484e-06, + "logits/chosen": -0.3291639983654022, + "logits/rejected": -0.39544180035591125, + "logps/chosen": -50.47379684448242, + "logps/rejected": -82.30874633789062, + "loss": 0.8207, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0873756408691406, + "rewards/margins": 4.237125873565674, + "rewards/rejected": -1.1497503519058228, + "step": 4755 + }, + { + "epoch": 1.19, + "grad_norm": 5.920772552490234, + "learning_rate": 1.0186601347212756e-06, + "logits/chosen": -0.31760266423225403, + "logits/rejected": -0.38393229246139526, + "logps/chosen": -58.48884582519531, + "logps/rejected": -86.9910659790039, + "loss": 0.6743, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0456857681274414, + "rewards/margins": 5.447437286376953, + "rewards/rejected": -2.401751756668091, + "step": 4756 + }, + { + "epoch": 1.19, + "grad_norm": 4.159995079040527, + "learning_rate": 1.0170758849005059e-06, + "logits/chosen": -0.3394484519958496, + "logits/rejected": -0.461005300283432, + "logps/chosen": -57.457603454589844, + "logps/rejected": -80.29861450195312, + "loss": 0.7373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8561160564422607, + "rewards/margins": 5.060927867889404, + "rewards/rejected": -2.2048120498657227, + "step": 4757 + }, + { + "epoch": 1.19, + "grad_norm": 3.0822484493255615, + "learning_rate": 1.0154927284783512e-06, + "logits/chosen": -0.3587989807128906, + "logits/rejected": -0.5250735878944397, + "logps/chosen": -52.99822998046875, + "logps/rejected": -76.06210327148438, + "loss": 0.5744, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9765889644622803, + "rewards/margins": 6.134585857391357, + "rewards/rejected": -3.157996892929077, + "step": 4758 + }, + { + "epoch": 1.19, + "grad_norm": 5.608465194702148, + "learning_rate": 1.0139106658894193e-06, + "logits/chosen": -0.35001447796821594, + "logits/rejected": -0.4338744282722473, + "logps/chosen": -53.99559020996094, + "logps/rejected": -82.6239013671875, + "loss": 0.7064, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8175792694091797, + "rewards/margins": 4.215913772583008, + "rewards/rejected": -1.3983345031738281, + "step": 4759 + }, + { + "epoch": 1.19, + "grad_norm": 6.285269260406494, + "learning_rate": 1.0123296975680263e-06, + "logits/chosen": -0.27495554089546204, + "logits/rejected": -0.38951414823532104, + "logps/chosen": -62.9023551940918, + "logps/rejected": -79.61775970458984, + "loss": 0.891, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.672772169113159, + "rewards/margins": 3.899156093597412, + "rewards/rejected": -1.226383924484253, + "step": 4760 + }, + { + "epoch": 1.19, + "grad_norm": 4.511825084686279, + "learning_rate": 1.010749823948175e-06, + "logits/chosen": -0.36437177658081055, + "logits/rejected": -0.40316441655158997, + "logps/chosen": -62.4696159362793, + "logps/rejected": -91.91434478759766, + "loss": 0.734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9387552738189697, + "rewards/margins": 3.871601104736328, + "rewards/rejected": -0.9328461289405823, + "step": 4761 + }, + { + "epoch": 1.19, + "grad_norm": 4.320715427398682, + "learning_rate": 1.00917104546358e-06, + "logits/chosen": -0.3715416193008423, + "logits/rejected": -0.47778698801994324, + "logps/chosen": -55.108402252197266, + "logps/rejected": -76.71505737304688, + "loss": 0.7371, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.006274700164795, + "rewards/margins": 5.33168363571167, + "rewards/rejected": -2.325408935546875, + "step": 4762 + }, + { + "epoch": 1.19, + "grad_norm": 5.456073760986328, + "learning_rate": 1.007593362547648e-06, + "logits/chosen": -0.29026833176612854, + "logits/rejected": -0.4043606221675873, + "logps/chosen": -54.3665657043457, + "logps/rejected": -75.18882751464844, + "loss": 0.8093, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.858255386352539, + "rewards/margins": 3.8763184547424316, + "rewards/rejected": -1.0180630683898926, + "step": 4763 + }, + { + "epoch": 1.19, + "grad_norm": 5.592083930969238, + "learning_rate": 1.0060167756334876e-06, + "logits/chosen": -0.3670419752597809, + "logits/rejected": -0.4705957770347595, + "logps/chosen": -52.10041427612305, + "logps/rejected": -88.96857452392578, + "loss": 0.7277, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7011303901672363, + "rewards/margins": 5.199749946594238, + "rewards/rejected": -2.498619556427002, + "step": 4764 + }, + { + "epoch": 1.19, + "grad_norm": 3.6594974994659424, + "learning_rate": 1.0044412851539055e-06, + "logits/chosen": -0.3886815011501312, + "logits/rejected": -0.4604513347148895, + "logps/chosen": -57.04186248779297, + "logps/rejected": -88.22603607177734, + "loss": 0.7273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946220636367798, + "rewards/margins": 4.928158760070801, + "rewards/rejected": -1.9819382429122925, + "step": 4765 + }, + { + "epoch": 1.19, + "grad_norm": 3.2379605770111084, + "learning_rate": 1.0028668915414063e-06, + "logits/chosen": -0.4010451138019562, + "logits/rejected": -0.4990450143814087, + "logps/chosen": -60.68146514892578, + "logps/rejected": -93.40115356445312, + "loss": 0.6475, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8925182819366455, + "rewards/margins": 5.645620346069336, + "rewards/rejected": -2.7531023025512695, + "step": 4766 + }, + { + "epoch": 1.19, + "grad_norm": 13.401482582092285, + "learning_rate": 1.0012935952281987e-06, + "logits/chosen": -0.3403266966342926, + "logits/rejected": -0.39450228214263916, + "logps/chosen": -70.7832260131836, + "logps/rejected": -95.10980224609375, + "loss": 0.7725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9217331409454346, + "rewards/margins": 4.499814510345459, + "rewards/rejected": -1.5780816078186035, + "step": 4767 + }, + { + "epoch": 1.19, + "grad_norm": 3.0356478691101074, + "learning_rate": 9.997213966461843e-07, + "logits/chosen": -0.406076043844223, + "logits/rejected": -0.480523943901062, + "logps/chosen": -46.9989128112793, + "logps/rejected": -79.547607421875, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9815855026245117, + "rewards/margins": 5.699668884277344, + "rewards/rejected": -2.718083620071411, + "step": 4768 + }, + { + "epoch": 1.19, + "grad_norm": 3.2698445320129395, + "learning_rate": 9.981502962269656e-07, + "logits/chosen": -0.4009108543395996, + "logits/rejected": -0.5161519646644592, + "logps/chosen": -65.06085968017578, + "logps/rejected": -75.27776336669922, + "loss": 0.7188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1252613067626953, + "rewards/margins": 5.098587989807129, + "rewards/rejected": -1.9733268022537231, + "step": 4769 + }, + { + "epoch": 1.19, + "grad_norm": 7.15806245803833, + "learning_rate": 9.965802944018439e-07, + "logits/chosen": -0.2534710168838501, + "logits/rejected": -0.365844190120697, + "logps/chosen": -61.60691833496094, + "logps/rejected": -79.50453186035156, + "loss": 0.7859, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8385109901428223, + "rewards/margins": 4.348171710968018, + "rewards/rejected": -1.5096606016159058, + "step": 4770 + }, + { + "epoch": 1.19, + "grad_norm": 10.08475112915039, + "learning_rate": 9.950113916018166e-07, + "logits/chosen": -0.32223573327064514, + "logits/rejected": -0.36382997035980225, + "logps/chosen": -55.05537796020508, + "logps/rejected": -92.1442642211914, + "loss": 0.8539, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5449342727661133, + "rewards/margins": 4.476445198059082, + "rewards/rejected": -1.9315106868743896, + "step": 4771 + }, + { + "epoch": 1.19, + "grad_norm": 5.351061820983887, + "learning_rate": 9.934435882575849e-07, + "logits/chosen": -0.4118814468383789, + "logits/rejected": -0.4584020674228668, + "logps/chosen": -72.80689239501953, + "logps/rejected": -92.93324279785156, + "loss": 0.8344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9086480140686035, + "rewards/margins": 5.981627464294434, + "rewards/rejected": -3.07297945022583, + "step": 4772 + }, + { + "epoch": 1.19, + "grad_norm": 5.182188510894775, + "learning_rate": 9.918768847995436e-07, + "logits/chosen": -0.2762150168418884, + "logits/rejected": -0.45551612973213196, + "logps/chosen": -78.51976013183594, + "logps/rejected": -85.59192657470703, + "loss": 0.6265, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7631611824035645, + "rewards/margins": 5.536826133728027, + "rewards/rejected": -2.773664712905884, + "step": 4773 + }, + { + "epoch": 1.19, + "grad_norm": 6.876108646392822, + "learning_rate": 9.903112816577836e-07, + "logits/chosen": -0.4148689806461334, + "logits/rejected": -0.4960119426250458, + "logps/chosen": -45.2318115234375, + "logps/rejected": -92.47293853759766, + "loss": 0.9943, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.471484422683716, + "rewards/margins": 3.7471468448638916, + "rewards/rejected": -1.2756624221801758, + "step": 4774 + }, + { + "epoch": 1.19, + "grad_norm": 4.080647945404053, + "learning_rate": 9.88746779262101e-07, + "logits/chosen": -0.3785160779953003, + "logits/rejected": -0.47770828008651733, + "logps/chosen": -54.585731506347656, + "logps/rejected": -82.08782958984375, + "loss": 0.716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0463693141937256, + "rewards/margins": 4.867844581604004, + "rewards/rejected": -1.821475625038147, + "step": 4775 + }, + { + "epoch": 1.19, + "grad_norm": 5.2734527587890625, + "learning_rate": 9.871833780419827e-07, + "logits/chosen": -0.24938055872917175, + "logits/rejected": -0.34603023529052734, + "logps/chosen": -60.97193145751953, + "logps/rejected": -86.95771789550781, + "loss": 0.6653, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.617528200149536, + "rewards/margins": 4.563166618347168, + "rewards/rejected": -1.9456384181976318, + "step": 4776 + }, + { + "epoch": 1.19, + "grad_norm": 3.2572991847991943, + "learning_rate": 9.856210784266168e-07, + "logits/chosen": -0.38332152366638184, + "logits/rejected": -0.46647682785987854, + "logps/chosen": -50.3654670715332, + "logps/rejected": -90.90713500976562, + "loss": 0.6576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.052842617034912, + "rewards/margins": 5.18070125579834, + "rewards/rejected": -2.1278586387634277, + "step": 4777 + }, + { + "epoch": 1.2, + "grad_norm": 4.141324043273926, + "learning_rate": 9.840598808448887e-07, + "logits/chosen": -0.3563712239265442, + "logits/rejected": -0.4278959035873413, + "logps/chosen": -53.86360549926758, + "logps/rejected": -96.60648345947266, + "loss": 0.7308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.739037036895752, + "rewards/margins": 4.7312750816345215, + "rewards/rejected": -1.99223792552948, + "step": 4778 + }, + { + "epoch": 1.2, + "grad_norm": 3.6751911640167236, + "learning_rate": 9.824997857253792e-07, + "logits/chosen": -0.3599221706390381, + "logits/rejected": -0.3949414789676666, + "logps/chosen": -56.861263275146484, + "logps/rejected": -89.7708969116211, + "loss": 0.7328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3661460876464844, + "rewards/margins": 4.999032497406006, + "rewards/rejected": -1.6328867673873901, + "step": 4779 + }, + { + "epoch": 1.2, + "grad_norm": 4.869001388549805, + "learning_rate": 9.809407934963705e-07, + "logits/chosen": -0.33075642585754395, + "logits/rejected": -0.43835869431495667, + "logps/chosen": -51.261138916015625, + "logps/rejected": -75.35331726074219, + "loss": 0.7018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.659877061843872, + "rewards/margins": 4.731218338012695, + "rewards/rejected": -2.071341037750244, + "step": 4780 + }, + { + "epoch": 1.2, + "grad_norm": 3.7580597400665283, + "learning_rate": 9.793829045858388e-07, + "logits/chosen": -0.36590737104415894, + "logits/rejected": -0.46403801441192627, + "logps/chosen": -59.343597412109375, + "logps/rejected": -86.46903991699219, + "loss": 0.7092, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.754574775695801, + "rewards/margins": 5.030976295471191, + "rewards/rejected": -2.276401996612549, + "step": 4781 + }, + { + "epoch": 1.2, + "grad_norm": 7.430352210998535, + "learning_rate": 9.778261194214573e-07, + "logits/chosen": -0.2945142984390259, + "logits/rejected": -0.392637699842453, + "logps/chosen": -58.566871643066406, + "logps/rejected": -89.72474670410156, + "loss": 0.7332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.17415189743042, + "rewards/margins": 5.817047595977783, + "rewards/rejected": -2.6428959369659424, + "step": 4782 + }, + { + "epoch": 1.2, + "grad_norm": 3.2183523178100586, + "learning_rate": 9.762704384305983e-07, + "logits/chosen": -0.39648640155792236, + "logits/rejected": -0.4792468845844269, + "logps/chosen": -43.58146286010742, + "logps/rejected": -86.78919219970703, + "loss": 0.5898, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.030055522918701, + "rewards/margins": 6.041211128234863, + "rewards/rejected": -3.011154890060425, + "step": 4783 + }, + { + "epoch": 1.2, + "grad_norm": 5.924630165100098, + "learning_rate": 9.747158620403274e-07, + "logits/chosen": -0.41503697633743286, + "logits/rejected": -0.42640283703804016, + "logps/chosen": -53.843257904052734, + "logps/rejected": -112.9142074584961, + "loss": 0.8178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.14074444770813, + "rewards/margins": 5.430551052093506, + "rewards/rejected": -2.289806365966797, + "step": 4784 + }, + { + "epoch": 1.2, + "grad_norm": 10.470219612121582, + "learning_rate": 9.73162390677414e-07, + "logits/chosen": -0.32077258825302124, + "logits/rejected": -0.399934858083725, + "logps/chosen": -59.81149673461914, + "logps/rejected": -89.46200561523438, + "loss": 0.6829, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0932931900024414, + "rewards/margins": 4.977488994598389, + "rewards/rejected": -1.88419508934021, + "step": 4785 + }, + { + "epoch": 1.2, + "grad_norm": 4.566687107086182, + "learning_rate": 9.716100247683136e-07, + "logits/chosen": -0.3883213400840759, + "logits/rejected": -0.46854621171951294, + "logps/chosen": -53.243621826171875, + "logps/rejected": -99.94395446777344, + "loss": 0.6434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7986719608306885, + "rewards/margins": 5.321450710296631, + "rewards/rejected": -2.5227787494659424, + "step": 4786 + }, + { + "epoch": 1.2, + "grad_norm": 4.504637718200684, + "learning_rate": 9.70058764739189e-07, + "logits/chosen": -0.45479825139045715, + "logits/rejected": -0.5761973261833191, + "logps/chosen": -59.062965393066406, + "logps/rejected": -77.78506469726562, + "loss": 0.7222, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.831371307373047, + "rewards/margins": 4.72053861618042, + "rewards/rejected": -1.8891671895980835, + "step": 4787 + }, + { + "epoch": 1.2, + "grad_norm": 2.5199477672576904, + "learning_rate": 9.685086110158926e-07, + "logits/chosen": -0.36026790738105774, + "logits/rejected": -0.4677300453186035, + "logps/chosen": -53.53704833984375, + "logps/rejected": -93.1536865234375, + "loss": 0.5703, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0920939445495605, + "rewards/margins": 6.218775272369385, + "rewards/rejected": -3.126681327819824, + "step": 4788 + }, + { + "epoch": 1.2, + "grad_norm": 16.798654556274414, + "learning_rate": 9.669595640239754e-07, + "logits/chosen": -0.2716916501522064, + "logits/rejected": -0.37212374806404114, + "logps/chosen": -70.42719268798828, + "logps/rejected": -96.667724609375, + "loss": 0.9441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.243217706680298, + "rewards/margins": 3.726867198944092, + "rewards/rejected": -1.483649492263794, + "step": 4789 + }, + { + "epoch": 1.2, + "grad_norm": 2.3940393924713135, + "learning_rate": 9.654116241886835e-07, + "logits/chosen": -0.4158371388912201, + "logits/rejected": -0.5249840617179871, + "logps/chosen": -48.924766540527344, + "logps/rejected": -73.52825927734375, + "loss": 0.6562, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7357640266418457, + "rewards/margins": 5.376996994018555, + "rewards/rejected": -2.641233205795288, + "step": 4790 + }, + { + "epoch": 1.2, + "grad_norm": 3.016906976699829, + "learning_rate": 9.638647919349593e-07, + "logits/chosen": -0.3548493981361389, + "logits/rejected": -0.4356422424316406, + "logps/chosen": -47.252532958984375, + "logps/rejected": -77.44152069091797, + "loss": 0.6794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9005420207977295, + "rewards/margins": 4.952919006347656, + "rewards/rejected": -2.0523769855499268, + "step": 4791 + }, + { + "epoch": 1.2, + "grad_norm": 3.4065420627593994, + "learning_rate": 9.623190676874438e-07, + "logits/chosen": -0.3069342374801636, + "logits/rejected": -0.400010883808136, + "logps/chosen": -57.357872009277344, + "logps/rejected": -77.55503845214844, + "loss": 0.6985, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068180799484253, + "rewards/margins": 4.301411151885986, + "rewards/rejected": -1.2332301139831543, + "step": 4792 + }, + { + "epoch": 1.2, + "grad_norm": 5.847533226013184, + "learning_rate": 9.60774451870472e-07, + "logits/chosen": -0.35024264454841614, + "logits/rejected": -0.5090200901031494, + "logps/chosen": -59.85797882080078, + "logps/rejected": -84.42961883544922, + "loss": 0.7391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.779975175857544, + "rewards/margins": 5.156994819641113, + "rewards/rejected": -2.3770194053649902, + "step": 4793 + }, + { + "epoch": 1.2, + "grad_norm": 9.286443710327148, + "learning_rate": 9.592309449080706e-07, + "logits/chosen": -0.3514048457145691, + "logits/rejected": -0.42452386021614075, + "logps/chosen": -47.46320343017578, + "logps/rejected": -94.52576446533203, + "loss": 0.7237, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.945559501647949, + "rewards/margins": 5.199123382568359, + "rewards/rejected": -2.25356388092041, + "step": 4794 + }, + { + "epoch": 1.2, + "grad_norm": 6.232520580291748, + "learning_rate": 9.576885472239682e-07, + "logits/chosen": -0.270033597946167, + "logits/rejected": -0.416333943605423, + "logps/chosen": -62.71052551269531, + "logps/rejected": -92.03656005859375, + "loss": 0.7618, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.727752685546875, + "rewards/margins": 5.599201202392578, + "rewards/rejected": -2.871448278427124, + "step": 4795 + }, + { + "epoch": 1.2, + "grad_norm": 3.1108129024505615, + "learning_rate": 9.561472592415849e-07, + "logits/chosen": -0.4057082533836365, + "logits/rejected": -0.500449001789093, + "logps/chosen": -54.57341003417969, + "logps/rejected": -72.69038391113281, + "loss": 0.6837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1954286098480225, + "rewards/margins": 4.996986389160156, + "rewards/rejected": -1.8015578985214233, + "step": 4796 + }, + { + "epoch": 1.2, + "grad_norm": 6.235086441040039, + "learning_rate": 9.546070813840408e-07, + "logits/chosen": -0.35288006067276, + "logits/rejected": -0.42480334639549255, + "logps/chosen": -58.85103225708008, + "logps/rejected": -85.18080139160156, + "loss": 0.8079, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.930074453353882, + "rewards/margins": 4.143095970153809, + "rewards/rejected": -1.213021993637085, + "step": 4797 + }, + { + "epoch": 1.2, + "grad_norm": 3.888051748275757, + "learning_rate": 9.530680140741444e-07, + "logits/chosen": -0.30778270959854126, + "logits/rejected": -0.4206469655036926, + "logps/chosen": -55.06885528564453, + "logps/rejected": -81.07752227783203, + "loss": 0.6355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.697892189025879, + "rewards/margins": 4.662456035614014, + "rewards/rejected": -1.9645636081695557, + "step": 4798 + }, + { + "epoch": 1.2, + "grad_norm": 2.333148956298828, + "learning_rate": 9.515300577344028e-07, + "logits/chosen": -0.36894094944000244, + "logits/rejected": -0.4932136833667755, + "logps/chosen": -60.80487060546875, + "logps/rejected": -78.34395599365234, + "loss": 0.6552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0285654067993164, + "rewards/margins": 6.032591819763184, + "rewards/rejected": -3.004026412963867, + "step": 4799 + }, + { + "epoch": 1.2, + "grad_norm": 13.384299278259277, + "learning_rate": 9.499932127870209e-07, + "logits/chosen": -0.46866726875305176, + "logits/rejected": -0.5350856184959412, + "logps/chosen": -48.65909957885742, + "logps/rejected": -83.50355529785156, + "loss": 0.8365, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9529201984405518, + "rewards/margins": 3.796395778656006, + "rewards/rejected": -0.8434758186340332, + "step": 4800 + }, + { + "epoch": 1.2, + "grad_norm": 11.263513565063477, + "learning_rate": 9.484574796538942e-07, + "logits/chosen": -0.3061214089393616, + "logits/rejected": -0.37790459394454956, + "logps/chosen": -53.39315414428711, + "logps/rejected": -68.241943359375, + "loss": 0.9141, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.792006731033325, + "rewards/margins": 4.574426651000977, + "rewards/rejected": -1.7824195623397827, + "step": 4801 + }, + { + "epoch": 1.2, + "grad_norm": 7.524353981018066, + "learning_rate": 9.469228587566148e-07, + "logits/chosen": -0.4030039310455322, + "logits/rejected": -0.44065892696380615, + "logps/chosen": -52.48500061035156, + "logps/rejected": -102.65948486328125, + "loss": 0.7901, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.176928758621216, + "rewards/margins": 5.189968109130859, + "rewards/rejected": -2.0130391120910645, + "step": 4802 + }, + { + "epoch": 1.2, + "grad_norm": 14.69091510772705, + "learning_rate": 9.453893505164691e-07, + "logits/chosen": -0.2785363495349884, + "logits/rejected": -0.32527562975883484, + "logps/chosen": -60.75062561035156, + "logps/rejected": -95.94287872314453, + "loss": 0.938, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.602245807647705, + "rewards/margins": 3.707221746444702, + "rewards/rejected": -1.1049761772155762, + "step": 4803 + }, + { + "epoch": 1.2, + "grad_norm": 4.3863325119018555, + "learning_rate": 9.438569553544369e-07, + "logits/chosen": -0.4008403718471527, + "logits/rejected": -0.4757119417190552, + "logps/chosen": -52.82813262939453, + "logps/rejected": -85.587158203125, + "loss": 0.7499, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9647505283355713, + "rewards/margins": 5.044064998626709, + "rewards/rejected": -2.0793142318725586, + "step": 4804 + }, + { + "epoch": 1.2, + "grad_norm": 3.4393582344055176, + "learning_rate": 9.423256736911973e-07, + "logits/chosen": -0.3711640536785126, + "logits/rejected": -0.4228748083114624, + "logps/chosen": -50.8209114074707, + "logps/rejected": -105.92848205566406, + "loss": 0.6423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0765552520751953, + "rewards/margins": 6.212726593017578, + "rewards/rejected": -3.1361711025238037, + "step": 4805 + }, + { + "epoch": 1.2, + "grad_norm": 11.38330078125, + "learning_rate": 9.407955059471158e-07, + "logits/chosen": -0.4081002473831177, + "logits/rejected": -0.47195082902908325, + "logps/chosen": -63.25690841674805, + "logps/rejected": -84.57601928710938, + "loss": 1.0277, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.766355514526367, + "rewards/margins": 3.8242788314819336, + "rewards/rejected": -1.0579233169555664, + "step": 4806 + }, + { + "epoch": 1.2, + "grad_norm": 2.834622621536255, + "learning_rate": 9.392664525422601e-07, + "logits/chosen": -0.40742921829223633, + "logits/rejected": -0.5707555413246155, + "logps/chosen": -63.2158203125, + "logps/rejected": -78.08331298828125, + "loss": 0.6643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0449795722961426, + "rewards/margins": 5.77414083480835, + "rewards/rejected": -2.729161500930786, + "step": 4807 + }, + { + "epoch": 1.2, + "grad_norm": 3.9491758346557617, + "learning_rate": 9.377385138963868e-07, + "logits/chosen": -0.3526822328567505, + "logits/rejected": -0.42609506845474243, + "logps/chosen": -57.08352279663086, + "logps/rejected": -97.06558227539062, + "loss": 0.6436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.936161518096924, + "rewards/margins": 4.806978702545166, + "rewards/rejected": -1.8708174228668213, + "step": 4808 + }, + { + "epoch": 1.2, + "grad_norm": 5.3184685707092285, + "learning_rate": 9.362116904289482e-07, + "logits/chosen": -0.2915392518043518, + "logits/rejected": -0.40575307607650757, + "logps/chosen": -54.11442184448242, + "logps/rejected": -68.93402862548828, + "loss": 0.699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.70393705368042, + "rewards/margins": 4.093408584594727, + "rewards/rejected": -1.3894717693328857, + "step": 4809 + }, + { + "epoch": 1.2, + "grad_norm": 9.993535995483398, + "learning_rate": 9.346859825590898e-07, + "logits/chosen": -0.3902320861816406, + "logits/rejected": -0.5225306749343872, + "logps/chosen": -64.24418640136719, + "logps/rejected": -83.22837829589844, + "loss": 0.8116, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.759613275527954, + "rewards/margins": 4.579309940338135, + "rewards/rejected": -1.819696307182312, + "step": 4810 + }, + { + "epoch": 1.2, + "grad_norm": 3.524470567703247, + "learning_rate": 9.331613907056508e-07, + "logits/chosen": -0.35958391427993774, + "logits/rejected": -0.46332770586013794, + "logps/chosen": -51.08097457885742, + "logps/rejected": -99.19776153564453, + "loss": 0.6504, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8347318172454834, + "rewards/margins": 6.424323558807373, + "rewards/rejected": -3.589592456817627, + "step": 4811 + }, + { + "epoch": 1.2, + "grad_norm": 5.2738142013549805, + "learning_rate": 9.316379152871668e-07, + "logits/chosen": -0.2938454747200012, + "logits/rejected": -0.36981114745140076, + "logps/chosen": -60.50694274902344, + "logps/rejected": -91.63976287841797, + "loss": 0.7246, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.761235475540161, + "rewards/margins": 4.699270248413086, + "rewards/rejected": -1.938035488128662, + "step": 4812 + }, + { + "epoch": 1.2, + "grad_norm": 8.363417625427246, + "learning_rate": 9.301155567218634e-07, + "logits/chosen": -0.358467698097229, + "logits/rejected": -0.4815801680088043, + "logps/chosen": -55.18443298339844, + "logps/rejected": -83.9902114868164, + "loss": 0.7117, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.778587818145752, + "rewards/margins": 4.290576457977295, + "rewards/rejected": -1.511988639831543, + "step": 4813 + }, + { + "epoch": 1.2, + "grad_norm": 3.596463918685913, + "learning_rate": 9.285943154276605e-07, + "logits/chosen": -0.3291453421115875, + "logits/rejected": -0.3828825354576111, + "logps/chosen": -45.18889617919922, + "logps/rejected": -78.3353271484375, + "loss": 0.6194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2740514278411865, + "rewards/margins": 4.894371509552002, + "rewards/rejected": -1.6203192472457886, + "step": 4814 + }, + { + "epoch": 1.2, + "grad_norm": 4.972869873046875, + "learning_rate": 9.270741918221726e-07, + "logits/chosen": -0.3590236008167267, + "logits/rejected": -0.47367072105407715, + "logps/chosen": -49.55155944824219, + "logps/rejected": -72.6955795288086, + "loss": 0.6917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9673521518707275, + "rewards/margins": 5.159139633178711, + "rewards/rejected": -2.1917872428894043, + "step": 4815 + }, + { + "epoch": 1.2, + "grad_norm": 6.746760845184326, + "learning_rate": 9.255551863227041e-07, + "logits/chosen": -0.3229261338710785, + "logits/rejected": -0.3718889355659485, + "logps/chosen": -55.004058837890625, + "logps/rejected": -95.22076416015625, + "loss": 0.7009, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0852253437042236, + "rewards/margins": 5.08066463470459, + "rewards/rejected": -1.9954396486282349, + "step": 4816 + }, + { + "epoch": 1.2, + "grad_norm": 4.953654766082764, + "learning_rate": 9.240372993462593e-07, + "logits/chosen": -0.421203076839447, + "logits/rejected": -0.45869916677474976, + "logps/chosen": -43.47699737548828, + "logps/rejected": -83.57521057128906, + "loss": 0.7605, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0335328578948975, + "rewards/margins": 5.01777458190918, + "rewards/rejected": -1.9842413663864136, + "step": 4817 + }, + { + "epoch": 1.21, + "grad_norm": 9.047263145446777, + "learning_rate": 9.22520531309527e-07, + "logits/chosen": -0.4967503547668457, + "logits/rejected": -0.5039766430854797, + "logps/chosen": -54.84180450439453, + "logps/rejected": -94.24713897705078, + "loss": 0.7572, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8967909812927246, + "rewards/margins": 4.72932767868042, + "rewards/rejected": -1.8325364589691162, + "step": 4818 + }, + { + "epoch": 1.21, + "grad_norm": 5.321915626525879, + "learning_rate": 9.21004882628892e-07, + "logits/chosen": -0.382083922624588, + "logits/rejected": -0.4497152268886566, + "logps/chosen": -55.842124938964844, + "logps/rejected": -86.4189224243164, + "loss": 0.6681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9619736671447754, + "rewards/margins": 5.523426532745361, + "rewards/rejected": -2.561452865600586, + "step": 4819 + }, + { + "epoch": 1.21, + "grad_norm": 5.255090713500977, + "learning_rate": 9.194903537204363e-07, + "logits/chosen": -0.36171185970306396, + "logits/rejected": -0.4486769437789917, + "logps/chosen": -60.66533660888672, + "logps/rejected": -85.47354125976562, + "loss": 0.6948, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8494296073913574, + "rewards/margins": 4.875202178955078, + "rewards/rejected": -2.0257725715637207, + "step": 4820 + }, + { + "epoch": 1.21, + "grad_norm": 2.7201685905456543, + "learning_rate": 9.179769449999293e-07, + "logits/chosen": -0.40324097871780396, + "logits/rejected": -0.46314799785614014, + "logps/chosen": -61.107540130615234, + "logps/rejected": -86.62406921386719, + "loss": 0.6689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1221425533294678, + "rewards/margins": 4.8346943855285645, + "rewards/rejected": -1.7125517129898071, + "step": 4821 + }, + { + "epoch": 1.21, + "grad_norm": 3.149416208267212, + "learning_rate": 9.164646568828334e-07, + "logits/chosen": -0.3732413947582245, + "logits/rejected": -0.49498793482780457, + "logps/chosen": -55.579593658447266, + "logps/rejected": -82.65345764160156, + "loss": 0.6384, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7144086360931396, + "rewards/margins": 5.395998954772949, + "rewards/rejected": -2.6815905570983887, + "step": 4822 + }, + { + "epoch": 1.21, + "grad_norm": 7.682659149169922, + "learning_rate": 9.149534897843048e-07, + "logits/chosen": -0.3614053726196289, + "logits/rejected": -0.40207594633102417, + "logps/chosen": -64.357666015625, + "logps/rejected": -87.40602111816406, + "loss": 0.8688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808366060256958, + "rewards/margins": 4.16171407699585, + "rewards/rejected": -1.3533477783203125, + "step": 4823 + }, + { + "epoch": 1.21, + "grad_norm": 4.033755779266357, + "learning_rate": 9.134434441191908e-07, + "logits/chosen": -0.2813010513782501, + "logits/rejected": -0.312303751707077, + "logps/chosen": -57.63351058959961, + "logps/rejected": -84.79515838623047, + "loss": 0.7778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.952878713607788, + "rewards/margins": 4.331934452056885, + "rewards/rejected": -1.379055380821228, + "step": 4824 + }, + { + "epoch": 1.21, + "grad_norm": 5.819075107574463, + "learning_rate": 9.119345203020336e-07, + "logits/chosen": -0.3817562758922577, + "logits/rejected": -0.4548957347869873, + "logps/chosen": -51.328609466552734, + "logps/rejected": -88.35814666748047, + "loss": 0.6593, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.784060001373291, + "rewards/margins": 4.695431232452393, + "rewards/rejected": -1.9113714694976807, + "step": 4825 + }, + { + "epoch": 1.21, + "grad_norm": 10.222461700439453, + "learning_rate": 9.104267187470639e-07, + "logits/chosen": -0.45280519127845764, + "logits/rejected": -0.5339657664299011, + "logps/chosen": -56.30491256713867, + "logps/rejected": -83.91336822509766, + "loss": 0.8092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8571417331695557, + "rewards/margins": 4.853919982910156, + "rewards/rejected": -1.9967775344848633, + "step": 4826 + }, + { + "epoch": 1.21, + "grad_norm": 4.358348369598389, + "learning_rate": 9.089200398682058e-07, + "logits/chosen": -0.32520365715026855, + "logits/rejected": -0.41642922163009644, + "logps/chosen": -59.102901458740234, + "logps/rejected": -98.74176788330078, + "loss": 0.6841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8643903732299805, + "rewards/margins": 5.0645551681518555, + "rewards/rejected": -2.200164794921875, + "step": 4827 + }, + { + "epoch": 1.21, + "grad_norm": 8.445852279663086, + "learning_rate": 9.074144840790755e-07, + "logits/chosen": -0.33302995562553406, + "logits/rejected": -0.4636504650115967, + "logps/chosen": -57.34048843383789, + "logps/rejected": -93.19792175292969, + "loss": 0.7618, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7543797492980957, + "rewards/margins": 5.36841344833374, + "rewards/rejected": -2.6140332221984863, + "step": 4828 + }, + { + "epoch": 1.21, + "grad_norm": 5.6844658851623535, + "learning_rate": 9.059100517929803e-07, + "logits/chosen": -0.43655216693878174, + "logits/rejected": -0.46195778250694275, + "logps/chosen": -77.0311508178711, + "logps/rejected": -91.43627166748047, + "loss": 0.7517, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07403564453125, + "rewards/margins": 4.93555212020874, + "rewards/rejected": -1.8615158796310425, + "step": 4829 + }, + { + "epoch": 1.21, + "grad_norm": 3.6979870796203613, + "learning_rate": 9.044067434229198e-07, + "logits/chosen": -0.284646213054657, + "logits/rejected": -0.4180276393890381, + "logps/chosen": -51.653221130371094, + "logps/rejected": -78.72282409667969, + "loss": 0.6217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9116311073303223, + "rewards/margins": 5.408257961273193, + "rewards/rejected": -2.496626853942871, + "step": 4830 + }, + { + "epoch": 1.21, + "grad_norm": 10.580588340759277, + "learning_rate": 9.029045593815822e-07, + "logits/chosen": -0.37485557794570923, + "logits/rejected": -0.5055933594703674, + "logps/chosen": -58.06237030029297, + "logps/rejected": -80.18897247314453, + "loss": 0.6911, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.059520959854126, + "rewards/margins": 5.425738334655762, + "rewards/rejected": -2.3662166595458984, + "step": 4831 + }, + { + "epoch": 1.21, + "grad_norm": 4.208247184753418, + "learning_rate": 9.014035000813531e-07, + "logits/chosen": -0.30907192826271057, + "logits/rejected": -0.44022136926651, + "logps/chosen": -57.7717170715332, + "logps/rejected": -85.90798950195312, + "loss": 0.5772, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.122016429901123, + "rewards/margins": 5.6985979080200195, + "rewards/rejected": -2.576582193374634, + "step": 4832 + }, + { + "epoch": 1.21, + "grad_norm": 4.3890299797058105, + "learning_rate": 8.999035659343036e-07, + "logits/chosen": -0.3478669226169586, + "logits/rejected": -0.4619761109352112, + "logps/chosen": -50.13951110839844, + "logps/rejected": -88.6666030883789, + "loss": 0.6327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9814200401306152, + "rewards/margins": 5.458499431610107, + "rewards/rejected": -2.477079391479492, + "step": 4833 + }, + { + "epoch": 1.21, + "grad_norm": 15.462915420532227, + "learning_rate": 8.984047573521987e-07, + "logits/chosen": -0.30604979395866394, + "logits/rejected": -0.411414235830307, + "logps/chosen": -63.90806198120117, + "logps/rejected": -82.69412231445312, + "loss": 0.7302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9184298515319824, + "rewards/margins": 4.188961029052734, + "rewards/rejected": -1.270531177520752, + "step": 4834 + }, + { + "epoch": 1.21, + "grad_norm": 11.917035102844238, + "learning_rate": 8.969070747464931e-07, + "logits/chosen": -0.3060113191604614, + "logits/rejected": -0.41940969228744507, + "logps/chosen": -53.68455505371094, + "logps/rejected": -83.98515319824219, + "loss": 0.6106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.790152072906494, + "rewards/margins": 5.415779113769531, + "rewards/rejected": -2.6256275177001953, + "step": 4835 + }, + { + "epoch": 1.21, + "grad_norm": 2.264392375946045, + "learning_rate": 8.954105185283324e-07, + "logits/chosen": -0.32457900047302246, + "logits/rejected": -0.4890441298484802, + "logps/chosen": -57.828338623046875, + "logps/rejected": -83.210205078125, + "loss": 0.5988, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.993290901184082, + "rewards/margins": 6.033697605133057, + "rewards/rejected": -3.0404062271118164, + "step": 4836 + }, + { + "epoch": 1.21, + "grad_norm": 3.5090177059173584, + "learning_rate": 8.939150891085551e-07, + "logits/chosen": -0.33273106813430786, + "logits/rejected": -0.4172416031360626, + "logps/chosen": -59.21131134033203, + "logps/rejected": -93.90140533447266, + "loss": 0.7471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.022498369216919, + "rewards/margins": 5.535213947296143, + "rewards/rejected": -2.512716054916382, + "step": 4837 + }, + { + "epoch": 1.21, + "grad_norm": 2.901543140411377, + "learning_rate": 8.924207868976898e-07, + "logits/chosen": -0.41809606552124023, + "logits/rejected": -0.5103573799133301, + "logps/chosen": -52.9349365234375, + "logps/rejected": -87.87251281738281, + "loss": 0.6578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.75422739982605, + "rewards/margins": 6.112978458404541, + "rewards/rejected": -3.3587512969970703, + "step": 4838 + }, + { + "epoch": 1.21, + "grad_norm": 6.209221363067627, + "learning_rate": 8.909276123059501e-07, + "logits/chosen": -0.39174994826316833, + "logits/rejected": -0.4699225127696991, + "logps/chosen": -73.38557434082031, + "logps/rejected": -80.50404357910156, + "loss": 0.941, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.4474470615386963, + "rewards/margins": 3.886371374130249, + "rewards/rejected": -1.4389246702194214, + "step": 4839 + }, + { + "epoch": 1.21, + "grad_norm": 3.509638786315918, + "learning_rate": 8.894355657432497e-07, + "logits/chosen": -0.3296041786670685, + "logits/rejected": -0.4486435055732727, + "logps/chosen": -76.28843688964844, + "logps/rejected": -85.55998229980469, + "loss": 0.7485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.833944082260132, + "rewards/margins": 5.56097412109375, + "rewards/rejected": -2.7270305156707764, + "step": 4840 + }, + { + "epoch": 1.21, + "grad_norm": 3.7381229400634766, + "learning_rate": 8.87944647619185e-07, + "logits/chosen": -0.44672873616218567, + "logits/rejected": -0.5385943055152893, + "logps/chosen": -43.34370422363281, + "logps/rejected": -66.45511627197266, + "loss": 0.6231, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.008655071258545, + "rewards/margins": 4.891780853271484, + "rewards/rejected": -1.8831257820129395, + "step": 4841 + }, + { + "epoch": 1.21, + "grad_norm": 4.007391452789307, + "learning_rate": 8.864548583430454e-07, + "logits/chosen": -0.30735495686531067, + "logits/rejected": -0.3869374394416809, + "logps/chosen": -59.20951843261719, + "logps/rejected": -93.61196899414062, + "loss": 0.7646, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0430850982666016, + "rewards/margins": 4.37838077545166, + "rewards/rejected": -1.3352960348129272, + "step": 4842 + }, + { + "epoch": 1.21, + "grad_norm": 4.115537643432617, + "learning_rate": 8.849661983238106e-07, + "logits/chosen": -0.29112574458122253, + "logits/rejected": -0.3495287001132965, + "logps/chosen": -55.71979522705078, + "logps/rejected": -93.17720794677734, + "loss": 0.724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0004539489746094, + "rewards/margins": 4.835834980010986, + "rewards/rejected": -1.8353811502456665, + "step": 4843 + }, + { + "epoch": 1.21, + "grad_norm": 5.24827766418457, + "learning_rate": 8.83478667970149e-07, + "logits/chosen": -0.4260396957397461, + "logits/rejected": -0.45671266317367554, + "logps/chosen": -51.42606735229492, + "logps/rejected": -94.32675170898438, + "loss": 0.6672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9677467346191406, + "rewards/margins": 4.941007614135742, + "rewards/rejected": -1.9732614755630493, + "step": 4844 + }, + { + "epoch": 1.21, + "grad_norm": 8.122112274169922, + "learning_rate": 8.819922676904213e-07, + "logits/chosen": -0.35412654280662537, + "logits/rejected": -0.418106347322464, + "logps/chosen": -54.82117462158203, + "logps/rejected": -98.35626983642578, + "loss": 0.6965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9990625381469727, + "rewards/margins": 5.502275466918945, + "rewards/rejected": -2.5032124519348145, + "step": 4845 + }, + { + "epoch": 1.21, + "grad_norm": 22.7883358001709, + "learning_rate": 8.805069978926761e-07, + "logits/chosen": -0.26016366481781006, + "logits/rejected": -0.292052298784256, + "logps/chosen": -66.82151794433594, + "logps/rejected": -80.52558898925781, + "loss": 1.0075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3702080249786377, + "rewards/margins": 2.6722192764282227, + "rewards/rejected": -0.3020111322402954, + "step": 4846 + }, + { + "epoch": 1.21, + "grad_norm": 3.527860641479492, + "learning_rate": 8.790228589846511e-07, + "logits/chosen": -0.41010424494743347, + "logits/rejected": -0.48406487703323364, + "logps/chosen": -52.88237762451172, + "logps/rejected": -95.72262573242188, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.853749990463257, + "rewards/margins": 5.601078987121582, + "rewards/rejected": -2.7473292350769043, + "step": 4847 + }, + { + "epoch": 1.21, + "grad_norm": 3.913539409637451, + "learning_rate": 8.775398513737754e-07, + "logits/chosen": -0.3922974169254303, + "logits/rejected": -0.4532592296600342, + "logps/chosen": -49.16926574707031, + "logps/rejected": -73.24797058105469, + "loss": 0.7147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.995537281036377, + "rewards/margins": 4.462061882019043, + "rewards/rejected": -1.4665241241455078, + "step": 4848 + }, + { + "epoch": 1.21, + "grad_norm": 3.0039494037628174, + "learning_rate": 8.76057975467165e-07, + "logits/chosen": -0.2763304114341736, + "logits/rejected": -0.36338505148887634, + "logps/chosen": -55.830421447753906, + "logps/rejected": -87.35711669921875, + "loss": 0.6609, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.10233736038208, + "rewards/margins": 5.193453788757324, + "rewards/rejected": -2.0911166667938232, + "step": 4849 + }, + { + "epoch": 1.21, + "grad_norm": 3.8832881450653076, + "learning_rate": 8.745772316716306e-07, + "logits/chosen": -0.2885170876979828, + "logits/rejected": -0.42543596029281616, + "logps/chosen": -67.18101501464844, + "logps/rejected": -93.11332702636719, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9555089473724365, + "rewards/margins": 5.4871134757995605, + "rewards/rejected": -2.531604766845703, + "step": 4850 + }, + { + "epoch": 1.21, + "grad_norm": 4.217642307281494, + "learning_rate": 8.730976203936625e-07, + "logits/chosen": -0.271867036819458, + "logits/rejected": -0.413983553647995, + "logps/chosen": -67.52645111083984, + "logps/rejected": -76.84136199951172, + "loss": 0.714, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1316323280334473, + "rewards/margins": 5.12492561340332, + "rewards/rejected": -1.9932935237884521, + "step": 4851 + }, + { + "epoch": 1.21, + "grad_norm": 2.118595838546753, + "learning_rate": 8.71619142039451e-07, + "logits/chosen": -0.2867150902748108, + "logits/rejected": -0.41666266322135925, + "logps/chosen": -56.98673629760742, + "logps/rejected": -96.43109893798828, + "loss": 0.6073, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2828094959259033, + "rewards/margins": 6.050839900970459, + "rewards/rejected": -2.7680306434631348, + "step": 4852 + }, + { + "epoch": 1.21, + "grad_norm": 3.351951837539673, + "learning_rate": 8.701417970148684e-07, + "logits/chosen": -0.26281461119651794, + "logits/rejected": -0.3391945958137512, + "logps/chosen": -61.791221618652344, + "logps/rejected": -89.11679077148438, + "loss": 0.6741, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.277510166168213, + "rewards/margins": 5.4076032638549805, + "rewards/rejected": -2.130093574523926, + "step": 4853 + }, + { + "epoch": 1.21, + "grad_norm": 14.917731285095215, + "learning_rate": 8.686655857254777e-07, + "logits/chosen": -0.31879112124443054, + "logits/rejected": -0.45462873578071594, + "logps/chosen": -66.68133544921875, + "logps/rejected": -95.84934997558594, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9801650047302246, + "rewards/margins": 5.947856426239014, + "rewards/rejected": -2.967691421508789, + "step": 4854 + }, + { + "epoch": 1.21, + "grad_norm": 4.192465782165527, + "learning_rate": 8.671905085765309e-07, + "logits/chosen": -0.392983078956604, + "logits/rejected": -0.49215370416641235, + "logps/chosen": -47.766014099121094, + "logps/rejected": -84.15020751953125, + "loss": 0.7262, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1752965450286865, + "rewards/margins": 5.153963088989258, + "rewards/rejected": -1.9786665439605713, + "step": 4855 + }, + { + "epoch": 1.21, + "grad_norm": 7.01469087600708, + "learning_rate": 8.657165659729671e-07, + "logits/chosen": -0.2819109857082367, + "logits/rejected": -0.3571178913116455, + "logps/chosen": -54.327369689941406, + "logps/rejected": -88.88067626953125, + "loss": 0.7452, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9336605072021484, + "rewards/margins": 4.5280303955078125, + "rewards/rejected": -1.5943700075149536, + "step": 4856 + }, + { + "epoch": 1.21, + "grad_norm": 4.277795791625977, + "learning_rate": 8.64243758319418e-07, + "logits/chosen": -0.3766697645187378, + "logits/rejected": -0.4700391888618469, + "logps/chosen": -58.27867889404297, + "logps/rejected": -91.73023986816406, + "loss": 0.6877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9057157039642334, + "rewards/margins": 4.977311611175537, + "rewards/rejected": -2.071596145629883, + "step": 4857 + }, + { + "epoch": 1.22, + "grad_norm": 3.5614919662475586, + "learning_rate": 8.62772086020201e-07, + "logits/chosen": -0.3519705533981323, + "logits/rejected": -0.46142110228538513, + "logps/chosen": -69.3395004272461, + "logps/rejected": -99.5298080444336, + "loss": 0.6274, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.076416492462158, + "rewards/margins": 5.544513702392578, + "rewards/rejected": -2.46809720993042, + "step": 4858 + }, + { + "epoch": 1.22, + "grad_norm": 3.057163715362549, + "learning_rate": 8.613015494793186e-07, + "logits/chosen": -0.3846958577632904, + "logits/rejected": -0.48129159212112427, + "logps/chosen": -58.786651611328125, + "logps/rejected": -94.765380859375, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.721738338470459, + "rewards/margins": 5.703973770141602, + "rewards/rejected": -2.9822354316711426, + "step": 4859 + }, + { + "epoch": 1.22, + "grad_norm": 8.637255668640137, + "learning_rate": 8.59832149100468e-07, + "logits/chosen": -0.3874293863773346, + "logits/rejected": -0.49449482560157776, + "logps/chosen": -57.67229461669922, + "logps/rejected": -75.67730712890625, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8553004264831543, + "rewards/margins": 4.606657028198242, + "rewards/rejected": -1.751357078552246, + "step": 4860 + }, + { + "epoch": 1.22, + "grad_norm": 4.426898002624512, + "learning_rate": 8.583638852870291e-07, + "logits/chosen": -0.31052589416503906, + "logits/rejected": -0.4114028215408325, + "logps/chosen": -62.178802490234375, + "logps/rejected": -86.64962768554688, + "loss": 0.7724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.547811508178711, + "rewards/margins": 4.391626358032227, + "rewards/rejected": -1.8438149690628052, + "step": 4861 + }, + { + "epoch": 1.22, + "grad_norm": 4.352406978607178, + "learning_rate": 8.568967584420756e-07, + "logits/chosen": -0.3388221561908722, + "logits/rejected": -0.38188615441322327, + "logps/chosen": -53.57805633544922, + "logps/rejected": -93.87361907958984, + "loss": 0.7065, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.039360284805298, + "rewards/margins": 4.799602031707764, + "rewards/rejected": -1.760241985321045, + "step": 4862 + }, + { + "epoch": 1.22, + "grad_norm": 3.6627302169799805, + "learning_rate": 8.554307689683616e-07, + "logits/chosen": -0.43737003207206726, + "logits/rejected": -0.45687368512153625, + "logps/chosen": -48.6038932800293, + "logps/rejected": -80.19158935546875, + "loss": 0.7746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.093924045562744, + "rewards/margins": 4.652227401733398, + "rewards/rejected": -1.5583034753799438, + "step": 4863 + }, + { + "epoch": 1.22, + "grad_norm": 6.87533712387085, + "learning_rate": 8.539659172683329e-07, + "logits/chosen": -0.31717565655708313, + "logits/rejected": -0.45006412267684937, + "logps/chosen": -53.387081146240234, + "logps/rejected": -91.35103607177734, + "loss": 0.7019, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0317232608795166, + "rewards/margins": 5.400510311126709, + "rewards/rejected": -2.3687872886657715, + "step": 4864 + }, + { + "epoch": 1.22, + "grad_norm": 1.979535460472107, + "learning_rate": 8.525022037441261e-07, + "logits/chosen": -0.2269735038280487, + "logits/rejected": -0.33712270855903625, + "logps/chosen": -64.29532623291016, + "logps/rejected": -107.21430969238281, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.927868366241455, + "rewards/margins": 5.741218090057373, + "rewards/rejected": -2.813349723815918, + "step": 4865 + }, + { + "epoch": 1.22, + "grad_norm": 5.446110725402832, + "learning_rate": 8.510396287975597e-07, + "logits/chosen": -0.3845984637737274, + "logits/rejected": -0.5281434655189514, + "logps/chosen": -49.604774475097656, + "logps/rejected": -76.01493835449219, + "loss": 0.5644, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.907111883163452, + "rewards/margins": 5.165369033813477, + "rewards/rejected": -2.2582569122314453, + "step": 4866 + }, + { + "epoch": 1.22, + "grad_norm": 4.6537628173828125, + "learning_rate": 8.495781928301428e-07, + "logits/chosen": -0.29433995485305786, + "logits/rejected": -0.4220157563686371, + "logps/chosen": -61.770111083984375, + "logps/rejected": -86.07132720947266, + "loss": 0.6717, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8247694969177246, + "rewards/margins": 5.265018939971924, + "rewards/rejected": -2.440248966217041, + "step": 4867 + }, + { + "epoch": 1.22, + "grad_norm": 3.923711061477661, + "learning_rate": 8.481178962430713e-07, + "logits/chosen": -0.441214382648468, + "logits/rejected": -0.5543587803840637, + "logps/chosen": -57.59020233154297, + "logps/rejected": -79.17591857910156, + "loss": 0.6604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.849188804626465, + "rewards/margins": 5.809539318084717, + "rewards/rejected": -2.960350513458252, + "step": 4868 + }, + { + "epoch": 1.22, + "grad_norm": 4.458555221557617, + "learning_rate": 8.466587394372267e-07, + "logits/chosen": -0.3448476493358612, + "logits/rejected": -0.3963755667209625, + "logps/chosen": -53.88988494873047, + "logps/rejected": -85.78140258789062, + "loss": 0.6131, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8720197677612305, + "rewards/margins": 5.122148513793945, + "rewards/rejected": -2.2501282691955566, + "step": 4869 + }, + { + "epoch": 1.22, + "grad_norm": 10.593063354492188, + "learning_rate": 8.45200722813182e-07, + "logits/chosen": -0.4128185510635376, + "logits/rejected": -0.48015308380126953, + "logps/chosen": -48.87668228149414, + "logps/rejected": -81.47303009033203, + "loss": 0.7611, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.464785575866699, + "rewards/margins": 4.813918113708496, + "rewards/rejected": -2.3491320610046387, + "step": 4870 + }, + { + "epoch": 1.22, + "grad_norm": 4.741306781768799, + "learning_rate": 8.437438467711906e-07, + "logits/chosen": -0.3394647538661957, + "logits/rejected": -0.3992782533168793, + "logps/chosen": -51.900596618652344, + "logps/rejected": -92.14310455322266, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9632763862609863, + "rewards/margins": 5.166130065917969, + "rewards/rejected": -2.202853202819824, + "step": 4871 + }, + { + "epoch": 1.22, + "grad_norm": 6.697758197784424, + "learning_rate": 8.422881117111987e-07, + "logits/chosen": -0.32903963327407837, + "logits/rejected": -0.45677992701530457, + "logps/chosen": -63.617591857910156, + "logps/rejected": -79.81684112548828, + "loss": 0.8742, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5165798664093018, + "rewards/margins": 3.9560277462005615, + "rewards/rejected": -1.4394481182098389, + "step": 4872 + }, + { + "epoch": 1.22, + "grad_norm": 5.205666542053223, + "learning_rate": 8.408335180328359e-07, + "logits/chosen": -0.2746598422527313, + "logits/rejected": -0.40504905581474304, + "logps/chosen": -59.90102767944336, + "logps/rejected": -78.34146118164062, + "loss": 0.6788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1070663928985596, + "rewards/margins": 5.258520603179932, + "rewards/rejected": -2.151454210281372, + "step": 4873 + }, + { + "epoch": 1.22, + "grad_norm": 10.860849380493164, + "learning_rate": 8.393800661354223e-07, + "logits/chosen": -0.38556748628616333, + "logits/rejected": -0.5223446488380432, + "logps/chosen": -61.33869934082031, + "logps/rejected": -92.09639739990234, + "loss": 0.7366, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.444568395614624, + "rewards/margins": 5.183337211608887, + "rewards/rejected": -2.7387688159942627, + "step": 4874 + }, + { + "epoch": 1.22, + "grad_norm": 4.085781574249268, + "learning_rate": 8.379277564179583e-07, + "logits/chosen": -0.4199920892715454, + "logits/rejected": -0.49472904205322266, + "logps/chosen": -54.40740203857422, + "logps/rejected": -85.46009826660156, + "loss": 0.6723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8878157138824463, + "rewards/margins": 5.314751148223877, + "rewards/rejected": -2.4269351959228516, + "step": 4875 + }, + { + "epoch": 1.22, + "grad_norm": 3.768214702606201, + "learning_rate": 8.364765892791349e-07, + "logits/chosen": -0.4508868455886841, + "logits/rejected": -0.5165546536445618, + "logps/chosen": -59.24028015136719, + "logps/rejected": -69.51881408691406, + "loss": 0.776, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9884603023529053, + "rewards/margins": 3.9336421489715576, + "rewards/rejected": -0.945182204246521, + "step": 4876 + }, + { + "epoch": 1.22, + "grad_norm": 8.011970520019531, + "learning_rate": 8.3502656511733e-07, + "logits/chosen": -0.31974267959594727, + "logits/rejected": -0.4170630872249603, + "logps/chosen": -72.80224609375, + "logps/rejected": -87.8707046508789, + "loss": 0.9325, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6517369747161865, + "rewards/margins": 4.910866737365723, + "rewards/rejected": -2.2591300010681152, + "step": 4877 + }, + { + "epoch": 1.22, + "grad_norm": 4.50639009475708, + "learning_rate": 8.335776843306065e-07, + "logits/chosen": -0.40705549716949463, + "logits/rejected": -0.5250528454780579, + "logps/chosen": -49.16438293457031, + "logps/rejected": -82.72196960449219, + "loss": 0.6746, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.792228937149048, + "rewards/margins": 6.08906364440918, + "rewards/rejected": -3.296834945678711, + "step": 4878 + }, + { + "epoch": 1.22, + "grad_norm": 3.6786916255950928, + "learning_rate": 8.321299473167127e-07, + "logits/chosen": -0.3953985273838043, + "logits/rejected": -0.48868653178215027, + "logps/chosen": -63.03630447387695, + "logps/rejected": -75.94153594970703, + "loss": 0.7496, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.173490285873413, + "rewards/margins": 4.921499252319336, + "rewards/rejected": -1.7480087280273438, + "step": 4879 + }, + { + "epoch": 1.22, + "grad_norm": 9.63241958618164, + "learning_rate": 8.306833544730841e-07, + "logits/chosen": -0.3151109218597412, + "logits/rejected": -0.42937979102134705, + "logps/chosen": -54.133094787597656, + "logps/rejected": -81.21144104003906, + "loss": 0.7273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8414835929870605, + "rewards/margins": 5.273849010467529, + "rewards/rejected": -2.4323647022247314, + "step": 4880 + }, + { + "epoch": 1.22, + "grad_norm": 2.337383270263672, + "learning_rate": 8.292379061968403e-07, + "logits/chosen": -0.34367963671684265, + "logits/rejected": -0.43872737884521484, + "logps/chosen": -58.533329010009766, + "logps/rejected": -120.37841796875, + "loss": 0.6837, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.049506187438965, + "rewards/margins": 6.46100378036499, + "rewards/rejected": -3.4114973545074463, + "step": 4881 + }, + { + "epoch": 1.22, + "grad_norm": 5.336540222167969, + "learning_rate": 8.2779360288479e-07, + "logits/chosen": -0.4272211194038391, + "logits/rejected": -0.5393533706665039, + "logps/chosen": -58.492034912109375, + "logps/rejected": -87.55184173583984, + "loss": 0.7293, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8534646034240723, + "rewards/margins": 5.551031112670898, + "rewards/rejected": -2.697566270828247, + "step": 4882 + }, + { + "epoch": 1.22, + "grad_norm": 6.5563530921936035, + "learning_rate": 8.263504449334253e-07, + "logits/chosen": -0.330545037984848, + "logits/rejected": -0.4278803765773773, + "logps/chosen": -54.584556579589844, + "logps/rejected": -81.72293853759766, + "loss": 0.6104, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.915891647338867, + "rewards/margins": 4.9770355224609375, + "rewards/rejected": -2.061143636703491, + "step": 4883 + }, + { + "epoch": 1.22, + "grad_norm": 1.7213518619537354, + "learning_rate": 8.249084327389234e-07, + "logits/chosen": -0.33112016320228577, + "logits/rejected": -0.5101572871208191, + "logps/chosen": -63.05306625366211, + "logps/rejected": -77.25865173339844, + "loss": 0.6097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.021519660949707, + "rewards/margins": 6.420886039733887, + "rewards/rejected": -3.3993663787841797, + "step": 4884 + }, + { + "epoch": 1.22, + "grad_norm": 9.137378692626953, + "learning_rate": 8.234675666971482e-07, + "logits/chosen": -0.38998404145240784, + "logits/rejected": -0.4478764832019806, + "logps/chosen": -55.648677825927734, + "logps/rejected": -94.74687194824219, + "loss": 0.7213, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.922860622406006, + "rewards/margins": 5.24176549911499, + "rewards/rejected": -2.3189048767089844, + "step": 4885 + }, + { + "epoch": 1.22, + "grad_norm": 2.9160399436950684, + "learning_rate": 8.220278472036486e-07, + "logits/chosen": -0.3543386459350586, + "logits/rejected": -0.4075429141521454, + "logps/chosen": -53.113983154296875, + "logps/rejected": -87.03025817871094, + "loss": 0.6939, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.059493064880371, + "rewards/margins": 4.836147308349609, + "rewards/rejected": -1.7766544818878174, + "step": 4886 + }, + { + "epoch": 1.22, + "grad_norm": 4.4127092361450195, + "learning_rate": 8.205892746536582e-07, + "logits/chosen": -0.2760743498802185, + "logits/rejected": -0.36080753803253174, + "logps/chosen": -74.09967041015625, + "logps/rejected": -93.74375915527344, + "loss": 0.8686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4220430850982666, + "rewards/margins": 5.090000629425049, + "rewards/rejected": -2.6679577827453613, + "step": 4887 + }, + { + "epoch": 1.22, + "grad_norm": 8.056108474731445, + "learning_rate": 8.191518494420953e-07, + "logits/chosen": -0.3183404207229614, + "logits/rejected": -0.3449714779853821, + "logps/chosen": -57.507415771484375, + "logps/rejected": -87.72676086425781, + "loss": 0.8355, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.65407657623291, + "rewards/margins": 3.971092700958252, + "rewards/rejected": -1.3170160055160522, + "step": 4888 + }, + { + "epoch": 1.22, + "grad_norm": 5.608641624450684, + "learning_rate": 8.177155719635665e-07, + "logits/chosen": -0.3877996802330017, + "logits/rejected": -0.47701770067214966, + "logps/chosen": -51.5910530090332, + "logps/rejected": -107.12588500976562, + "loss": 0.6605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.855455160140991, + "rewards/margins": 5.875887393951416, + "rewards/rejected": -3.0204319953918457, + "step": 4889 + }, + { + "epoch": 1.22, + "grad_norm": 8.195984840393066, + "learning_rate": 8.162804426123599e-07, + "logits/chosen": -0.36259961128234863, + "logits/rejected": -0.452118456363678, + "logps/chosen": -67.89546966552734, + "logps/rejected": -82.55384063720703, + "loss": 0.7854, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6925439834594727, + "rewards/margins": 3.9273908138275146, + "rewards/rejected": -1.234847068786621, + "step": 4890 + }, + { + "epoch": 1.22, + "grad_norm": 8.273615837097168, + "learning_rate": 8.148464617824492e-07, + "logits/chosen": -0.4830706715583801, + "logits/rejected": -0.5136078000068665, + "logps/chosen": -41.36042404174805, + "logps/rejected": -87.86566925048828, + "loss": 0.6543, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.826127290725708, + "rewards/margins": 5.139939785003662, + "rewards/rejected": -2.3138129711151123, + "step": 4891 + }, + { + "epoch": 1.22, + "grad_norm": 1.7341004610061646, + "learning_rate": 8.134136298674933e-07, + "logits/chosen": -0.4640738368034363, + "logits/rejected": -0.5171165466308594, + "logps/chosen": -50.51628875732422, + "logps/rejected": -97.30380249023438, + "loss": 0.6395, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.022737741470337, + "rewards/margins": 5.713030815124512, + "rewards/rejected": -2.6902928352355957, + "step": 4892 + }, + { + "epoch": 1.22, + "grad_norm": 4.764755725860596, + "learning_rate": 8.119819472608342e-07, + "logits/chosen": -0.3467845618724823, + "logits/rejected": -0.4435248374938965, + "logps/chosen": -67.28548431396484, + "logps/rejected": -78.9148178100586, + "loss": 0.7059, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.698775053024292, + "rewards/margins": 4.851888656616211, + "rewards/rejected": -2.15311336517334, + "step": 4893 + }, + { + "epoch": 1.22, + "grad_norm": 5.512452125549316, + "learning_rate": 8.105514143555021e-07, + "logits/chosen": -0.28728631138801575, + "logits/rejected": -0.36833876371383667, + "logps/chosen": -60.4071044921875, + "logps/rejected": -86.13035583496094, + "loss": 0.8512, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.87329363822937, + "rewards/margins": 4.826383590698242, + "rewards/rejected": -1.9530901908874512, + "step": 4894 + }, + { + "epoch": 1.22, + "grad_norm": 2.631594181060791, + "learning_rate": 8.091220315442089e-07, + "logits/chosen": -0.28341835737228394, + "logits/rejected": -0.4014569818973541, + "logps/chosen": -52.58259582519531, + "logps/rejected": -81.20641326904297, + "loss": 0.6121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.938849925994873, + "rewards/margins": 5.882493495941162, + "rewards/rejected": -2.9436442852020264, + "step": 4895 + }, + { + "epoch": 1.22, + "grad_norm": 5.77585506439209, + "learning_rate": 8.076937992193478e-07, + "logits/chosen": -0.3520798981189728, + "logits/rejected": -0.5040287375450134, + "logps/chosen": -68.88786315917969, + "logps/rejected": -89.8954086303711, + "loss": 0.7997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.558347702026367, + "rewards/margins": 5.861580848693848, + "rewards/rejected": -3.3032326698303223, + "step": 4896 + }, + { + "epoch": 1.22, + "grad_norm": 5.504257678985596, + "learning_rate": 8.062667177730032e-07, + "logits/chosen": -0.4245286285877228, + "logits/rejected": -0.4657928943634033, + "logps/chosen": -66.24011993408203, + "logps/rejected": -85.84513854980469, + "loss": 0.9378, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.965109348297119, + "rewards/margins": 4.326028823852539, + "rewards/rejected": -1.36091947555542, + "step": 4897 + }, + { + "epoch": 1.23, + "grad_norm": 2.7595877647399902, + "learning_rate": 8.048407875969377e-07, + "logits/chosen": -0.3693969249725342, + "logits/rejected": -0.44589561223983765, + "logps/chosen": -54.48854064941406, + "logps/rejected": -106.20603942871094, + "loss": 0.6356, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.965555191040039, + "rewards/margins": 5.290869235992432, + "rewards/rejected": -2.3253142833709717, + "step": 4898 + }, + { + "epoch": 1.23, + "grad_norm": 4.798136234283447, + "learning_rate": 8.034160090826004e-07, + "logits/chosen": -0.36068809032440186, + "logits/rejected": -0.4373406767845154, + "logps/chosen": -52.6552734375, + "logps/rejected": -75.70857238769531, + "loss": 0.8318, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.644702911376953, + "rewards/margins": 3.47231125831604, + "rewards/rejected": -0.8276081681251526, + "step": 4899 + }, + { + "epoch": 1.23, + "grad_norm": 6.271637439727783, + "learning_rate": 8.01992382621124e-07, + "logits/chosen": -0.4084846079349518, + "logits/rejected": -0.4713602066040039, + "logps/chosen": -56.174346923828125, + "logps/rejected": -82.30321502685547, + "loss": 0.8182, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.132664918899536, + "rewards/margins": 4.700507640838623, + "rewards/rejected": -1.5678423643112183, + "step": 4900 + }, + { + "epoch": 1.23, + "grad_norm": 7.380187034606934, + "learning_rate": 8.005699086033236e-07, + "logits/chosen": -0.40041717886924744, + "logits/rejected": -0.485754132270813, + "logps/chosen": -56.03296661376953, + "logps/rejected": -87.13446044921875, + "loss": 0.8385, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8660452365875244, + "rewards/margins": 4.105876922607422, + "rewards/rejected": -1.2398314476013184, + "step": 4901 + }, + { + "epoch": 1.23, + "grad_norm": 9.056760787963867, + "learning_rate": 7.991485874197002e-07, + "logits/chosen": -0.399580180644989, + "logits/rejected": -0.49218910932540894, + "logps/chosen": -50.91246795654297, + "logps/rejected": -74.5511703491211, + "loss": 0.823, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.925049066543579, + "rewards/margins": 4.072917938232422, + "rewards/rejected": -1.1478689908981323, + "step": 4902 + }, + { + "epoch": 1.23, + "grad_norm": 9.564799308776855, + "learning_rate": 7.977284194604373e-07, + "logits/chosen": -0.3234928250312805, + "logits/rejected": -0.38441693782806396, + "logps/chosen": -59.726070404052734, + "logps/rejected": -88.13477325439453, + "loss": 0.6952, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0634348392486572, + "rewards/margins": 5.448005199432373, + "rewards/rejected": -2.3845701217651367, + "step": 4903 + }, + { + "epoch": 1.23, + "grad_norm": 11.692390441894531, + "learning_rate": 7.963094051154014e-07, + "logits/chosen": -0.3958677351474762, + "logits/rejected": -0.49385058879852295, + "logps/chosen": -61.42140579223633, + "logps/rejected": -94.13890075683594, + "loss": 0.7244, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9809141159057617, + "rewards/margins": 5.149871826171875, + "rewards/rejected": -2.1689577102661133, + "step": 4904 + }, + { + "epoch": 1.23, + "grad_norm": 5.226720809936523, + "learning_rate": 7.948915447741418e-07, + "logits/chosen": -0.33920034766197205, + "logits/rejected": -0.40183398127555847, + "logps/chosen": -91.34029388427734, + "logps/rejected": -91.72798156738281, + "loss": 0.6861, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1528546810150146, + "rewards/margins": 5.260767936706543, + "rewards/rejected": -2.1079137325286865, + "step": 4905 + }, + { + "epoch": 1.23, + "grad_norm": 4.754930019378662, + "learning_rate": 7.934748388258911e-07, + "logits/chosen": -0.2679111957550049, + "logits/rejected": -0.33945903182029724, + "logps/chosen": -62.314308166503906, + "logps/rejected": -93.6380615234375, + "loss": 0.7932, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8869898319244385, + "rewards/margins": 4.171627044677734, + "rewards/rejected": -1.2846375703811646, + "step": 4906 + }, + { + "epoch": 1.23, + "grad_norm": 5.9913129806518555, + "learning_rate": 7.92059287659569e-07, + "logits/chosen": -0.33244505524635315, + "logits/rejected": -0.34974220395088196, + "logps/chosen": -46.50579071044922, + "logps/rejected": -97.77003479003906, + "loss": 0.7511, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.833587408065796, + "rewards/margins": 4.335476398468018, + "rewards/rejected": -1.5018892288208008, + "step": 4907 + }, + { + "epoch": 1.23, + "grad_norm": 6.237154483795166, + "learning_rate": 7.906448916637705e-07, + "logits/chosen": -0.31835857033729553, + "logits/rejected": -0.40261608362197876, + "logps/chosen": -53.485618591308594, + "logps/rejected": -82.04742431640625, + "loss": 0.7797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.038691997528076, + "rewards/margins": 4.865850448608398, + "rewards/rejected": -1.827157974243164, + "step": 4908 + }, + { + "epoch": 1.23, + "grad_norm": 6.087014675140381, + "learning_rate": 7.892316512267811e-07, + "logits/chosen": -0.40792036056518555, + "logits/rejected": -0.5136415362358093, + "logps/chosen": -46.855224609375, + "logps/rejected": -75.31815338134766, + "loss": 0.758, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9295482635498047, + "rewards/margins": 4.859554767608643, + "rewards/rejected": -1.9300062656402588, + "step": 4909 + }, + { + "epoch": 1.23, + "grad_norm": 7.078450679779053, + "learning_rate": 7.878195667365645e-07, + "logits/chosen": -0.29988086223602295, + "logits/rejected": -0.40300706028938293, + "logps/chosen": -63.31837844848633, + "logps/rejected": -99.93341827392578, + "loss": 0.7199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7893712520599365, + "rewards/margins": 5.2179036140441895, + "rewards/rejected": -2.428532361984253, + "step": 4910 + }, + { + "epoch": 1.23, + "grad_norm": 2.5877604484558105, + "learning_rate": 7.86408638580769e-07, + "logits/chosen": -0.4695603549480438, + "logits/rejected": -0.551368236541748, + "logps/chosen": -42.74177551269531, + "logps/rejected": -82.066650390625, + "loss": 0.6176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.952366590499878, + "rewards/margins": 5.586361885070801, + "rewards/rejected": -2.6339950561523438, + "step": 4911 + }, + { + "epoch": 1.23, + "grad_norm": 6.72102165222168, + "learning_rate": 7.84998867146724e-07, + "logits/chosen": -0.3738383650779724, + "logits/rejected": -0.4673706293106079, + "logps/chosen": -63.53947448730469, + "logps/rejected": -96.57389831542969, + "loss": 0.9505, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8648874759674072, + "rewards/margins": 3.9645817279815674, + "rewards/rejected": -1.0996941328048706, + "step": 4912 + }, + { + "epoch": 1.23, + "grad_norm": 10.178897857666016, + "learning_rate": 7.835902528214412e-07, + "logits/chosen": -0.36109301447868347, + "logits/rejected": -0.41025006771087646, + "logps/chosen": -64.46340942382812, + "logps/rejected": -105.4323959350586, + "loss": 0.835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9267330169677734, + "rewards/margins": 5.1749420166015625, + "rewards/rejected": -2.248208522796631, + "step": 4913 + }, + { + "epoch": 1.23, + "grad_norm": 5.199062824249268, + "learning_rate": 7.821827959916184e-07, + "logits/chosen": -0.3358894884586334, + "logits/rejected": -0.4277133345603943, + "logps/chosen": -55.12744140625, + "logps/rejected": -81.7882308959961, + "loss": 0.7525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1418561935424805, + "rewards/margins": 3.8782711029052734, + "rewards/rejected": -0.7364147901535034, + "step": 4914 + }, + { + "epoch": 1.23, + "grad_norm": 6.721561431884766, + "learning_rate": 7.807764970436327e-07, + "logits/chosen": -0.4394294023513794, + "logits/rejected": -0.4885108172893524, + "logps/chosen": -54.792747497558594, + "logps/rejected": -79.33172607421875, + "loss": 0.8235, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8848865032196045, + "rewards/margins": 3.7704386711120605, + "rewards/rejected": -0.8855524063110352, + "step": 4915 + }, + { + "epoch": 1.23, + "grad_norm": 6.121024131774902, + "learning_rate": 7.793713563635396e-07, + "logits/chosen": -0.42526260018348694, + "logits/rejected": -0.5114609599113464, + "logps/chosen": -48.23875045776367, + "logps/rejected": -78.2505111694336, + "loss": 0.7447, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8792853355407715, + "rewards/margins": 4.65123176574707, + "rewards/rejected": -1.7719464302062988, + "step": 4916 + }, + { + "epoch": 1.23, + "grad_norm": 5.23358678817749, + "learning_rate": 7.779673743370841e-07, + "logits/chosen": -0.37479090690612793, + "logits/rejected": -0.4248802661895752, + "logps/chosen": -57.992042541503906, + "logps/rejected": -95.25848388671875, + "loss": 0.7472, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7610392570495605, + "rewards/margins": 4.6222639083862305, + "rewards/rejected": -1.8612240552902222, + "step": 4917 + }, + { + "epoch": 1.23, + "grad_norm": 9.468157768249512, + "learning_rate": 7.765645513496878e-07, + "logits/chosen": -0.33901476860046387, + "logits/rejected": -0.4915616512298584, + "logps/chosen": -55.9640998840332, + "logps/rejected": -78.9226303100586, + "loss": 0.7064, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.034440755844116, + "rewards/margins": 5.415946960449219, + "rewards/rejected": -2.3815057277679443, + "step": 4918 + }, + { + "epoch": 1.23, + "grad_norm": 5.36592435836792, + "learning_rate": 7.751628877864592e-07, + "logits/chosen": -0.2582520842552185, + "logits/rejected": -0.3266565203666687, + "logps/chosen": -60.875362396240234, + "logps/rejected": -93.53885650634766, + "loss": 0.6921, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.823298454284668, + "rewards/margins": 4.6572160720825195, + "rewards/rejected": -1.8339177370071411, + "step": 4919 + }, + { + "epoch": 1.23, + "grad_norm": 5.683043479919434, + "learning_rate": 7.737623840321811e-07, + "logits/chosen": -0.44641217589378357, + "logits/rejected": -0.561476469039917, + "logps/chosen": -54.29723358154297, + "logps/rejected": -90.31097412109375, + "loss": 0.6705, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.052219867706299, + "rewards/margins": 6.072201728820801, + "rewards/rejected": -3.019981861114502, + "step": 4920 + }, + { + "epoch": 1.23, + "grad_norm": 3.7167584896087646, + "learning_rate": 7.723630404713228e-07, + "logits/chosen": -0.4918960928916931, + "logits/rejected": -0.4927959442138672, + "logps/chosen": -49.85078430175781, + "logps/rejected": -82.0478286743164, + "loss": 0.7134, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065361261367798, + "rewards/margins": 4.778345584869385, + "rewards/rejected": -1.7129844427108765, + "step": 4921 + }, + { + "epoch": 1.23, + "grad_norm": 7.2595062255859375, + "learning_rate": 7.709648574880362e-07, + "logits/chosen": -0.373720645904541, + "logits/rejected": -0.4444071650505066, + "logps/chosen": -52.3841552734375, + "logps/rejected": -89.57510375976562, + "loss": 0.7438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.273070812225342, + "rewards/margins": 4.816519737243652, + "rewards/rejected": -1.5434490442276, + "step": 4922 + }, + { + "epoch": 1.23, + "grad_norm": 6.6000237464904785, + "learning_rate": 7.695678354661513e-07, + "logits/chosen": -0.30445006489753723, + "logits/rejected": -0.4176553785800934, + "logps/chosen": -60.35340118408203, + "logps/rejected": -82.50615692138672, + "loss": 0.782, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.035968542098999, + "rewards/margins": 4.791778564453125, + "rewards/rejected": -1.7558101415634155, + "step": 4923 + }, + { + "epoch": 1.23, + "grad_norm": 3.7919607162475586, + "learning_rate": 7.681719747891813e-07, + "logits/chosen": -0.312466561794281, + "logits/rejected": -0.3950420618057251, + "logps/chosen": -56.776031494140625, + "logps/rejected": -85.61256408691406, + "loss": 0.6974, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8539440631866455, + "rewards/margins": 4.456502914428711, + "rewards/rejected": -1.6025590896606445, + "step": 4924 + }, + { + "epoch": 1.23, + "grad_norm": 7.619760990142822, + "learning_rate": 7.667772758403197e-07, + "logits/chosen": -0.3388806879520416, + "logits/rejected": -0.3930029273033142, + "logps/chosen": -57.36748123168945, + "logps/rejected": -90.10323333740234, + "loss": 0.6615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.229935646057129, + "rewards/margins": 5.376462936401367, + "rewards/rejected": -2.146527051925659, + "step": 4925 + }, + { + "epoch": 1.23, + "grad_norm": 4.356178283691406, + "learning_rate": 7.65383739002441e-07, + "logits/chosen": -0.2877158224582672, + "logits/rejected": -0.397879958152771, + "logps/chosen": -56.97958755493164, + "logps/rejected": -96.52294158935547, + "loss": 0.6747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9032063484191895, + "rewards/margins": 5.17386531829834, + "rewards/rejected": -2.2706589698791504, + "step": 4926 + }, + { + "epoch": 1.23, + "grad_norm": 4.676921367645264, + "learning_rate": 7.639913646581038e-07, + "logits/chosen": -0.43123048543930054, + "logits/rejected": -0.5194811820983887, + "logps/chosen": -50.26219940185547, + "logps/rejected": -86.17399597167969, + "loss": 0.6527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5768024921417236, + "rewards/margins": 5.483604431152344, + "rewards/rejected": -2.90680193901062, + "step": 4927 + }, + { + "epoch": 1.23, + "grad_norm": 5.122593402862549, + "learning_rate": 7.6260015318954e-07, + "logits/chosen": -0.33590981364250183, + "logits/rejected": -0.382992684841156, + "logps/chosen": -68.90946960449219, + "logps/rejected": -101.84862518310547, + "loss": 0.8757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7850022315979004, + "rewards/margins": 4.39959192276001, + "rewards/rejected": -1.6145899295806885, + "step": 4928 + }, + { + "epoch": 1.23, + "grad_norm": 5.08538818359375, + "learning_rate": 7.612101049786719e-07, + "logits/chosen": -0.3492402136325836, + "logits/rejected": -0.4114762544631958, + "logps/chosen": -67.47238159179688, + "logps/rejected": -90.79212951660156, + "loss": 0.7615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1626405715942383, + "rewards/margins": 4.599556922912598, + "rewards/rejected": -1.4369162321090698, + "step": 4929 + }, + { + "epoch": 1.23, + "grad_norm": 6.975496292114258, + "learning_rate": 7.598212204070954e-07, + "logits/chosen": -0.35002437233924866, + "logits/rejected": -0.42274659872055054, + "logps/chosen": -75.63817596435547, + "logps/rejected": -91.62081909179688, + "loss": 0.8698, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.622746706008911, + "rewards/margins": 4.852917671203613, + "rewards/rejected": -2.230170965194702, + "step": 4930 + }, + { + "epoch": 1.23, + "grad_norm": 8.33823013305664, + "learning_rate": 7.584334998560899e-07, + "logits/chosen": -0.2831324338912964, + "logits/rejected": -0.3417009711265564, + "logps/chosen": -63.25426483154297, + "logps/rejected": -83.74684143066406, + "loss": 0.8877, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.668013572692871, + "rewards/margins": 3.633496046066284, + "rewards/rejected": -0.9654824733734131, + "step": 4931 + }, + { + "epoch": 1.23, + "grad_norm": 3.775902271270752, + "learning_rate": 7.570469437066147e-07, + "logits/chosen": -0.2918890118598938, + "logits/rejected": -0.429141104221344, + "logps/chosen": -58.990020751953125, + "logps/rejected": -91.21516418457031, + "loss": 0.6837, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9594104290008545, + "rewards/margins": 5.302377700805664, + "rewards/rejected": -2.3429672718048096, + "step": 4932 + }, + { + "epoch": 1.23, + "grad_norm": 5.590803623199463, + "learning_rate": 7.556615523393074e-07, + "logits/chosen": -0.3701924681663513, + "logits/rejected": -0.49371376633644104, + "logps/chosen": -52.237674713134766, + "logps/rejected": -69.66443634033203, + "loss": 0.7093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.126106023788452, + "rewards/margins": 5.1650285720825195, + "rewards/rejected": -2.0389232635498047, + "step": 4933 + }, + { + "epoch": 1.23, + "grad_norm": 6.17254638671875, + "learning_rate": 7.542773261344915e-07, + "logits/chosen": -0.26285019516944885, + "logits/rejected": -0.3545824885368347, + "logps/chosen": -63.1248779296875, + "logps/rejected": -84.89488983154297, + "loss": 0.794, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9504141807556152, + "rewards/margins": 4.738622665405273, + "rewards/rejected": -1.7882087230682373, + "step": 4934 + }, + { + "epoch": 1.23, + "grad_norm": 4.159739017486572, + "learning_rate": 7.528942654721644e-07, + "logits/chosen": -0.38861194252967834, + "logits/rejected": -0.45143023133277893, + "logps/chosen": -57.04329299926758, + "logps/rejected": -94.75170135498047, + "loss": 0.6994, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.19791316986084, + "rewards/margins": 4.685606002807617, + "rewards/rejected": -1.4876930713653564, + "step": 4935 + }, + { + "epoch": 1.23, + "grad_norm": 4.947624206542969, + "learning_rate": 7.515123707320071e-07, + "logits/chosen": -0.4223855435848236, + "logits/rejected": -0.48538681864738464, + "logps/chosen": -57.76067352294922, + "logps/rejected": -92.52983093261719, + "loss": 0.6629, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7733993530273438, + "rewards/margins": 5.646059989929199, + "rewards/rejected": -2.8726611137390137, + "step": 4936 + }, + { + "epoch": 1.24, + "grad_norm": 4.192343711853027, + "learning_rate": 7.501316422933791e-07, + "logits/chosen": -0.28252917528152466, + "logits/rejected": -0.4007853865623474, + "logps/chosen": -57.069053649902344, + "logps/rejected": -87.61038208007812, + "loss": 0.6757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.913517951965332, + "rewards/margins": 5.974433422088623, + "rewards/rejected": -3.060915946960449, + "step": 4937 + }, + { + "epoch": 1.24, + "grad_norm": 4.85222053527832, + "learning_rate": 7.487520805353187e-07, + "logits/chosen": -0.30320319533348083, + "logits/rejected": -0.3465367555618286, + "logps/chosen": -58.56967544555664, + "logps/rejected": -82.76751708984375, + "loss": 0.7077, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9570021629333496, + "rewards/margins": 4.415647506713867, + "rewards/rejected": -1.4586458206176758, + "step": 4938 + }, + { + "epoch": 1.24, + "grad_norm": 3.617758274078369, + "learning_rate": 7.473736858365499e-07, + "logits/chosen": -0.36246445775032043, + "logits/rejected": -0.45986106991767883, + "logps/chosen": -53.37641143798828, + "logps/rejected": -97.56036376953125, + "loss": 0.6158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9471075534820557, + "rewards/margins": 5.591306686401367, + "rewards/rejected": -2.6441988945007324, + "step": 4939 + }, + { + "epoch": 1.24, + "grad_norm": 4.054317951202393, + "learning_rate": 7.45996458575467e-07, + "logits/chosen": -0.3854195475578308, + "logits/rejected": -0.4530370831489563, + "logps/chosen": -55.04427719116211, + "logps/rejected": -85.02521514892578, + "loss": 0.7179, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.065328598022461, + "rewards/margins": 3.8677868843078613, + "rewards/rejected": -0.8024584650993347, + "step": 4940 + }, + { + "epoch": 1.24, + "grad_norm": 21.430620193481445, + "learning_rate": 7.446203991301498e-07, + "logits/chosen": -0.2673364579677582, + "logits/rejected": -0.34230107069015503, + "logps/chosen": -56.00239944458008, + "logps/rejected": -92.43881225585938, + "loss": 0.8025, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0474441051483154, + "rewards/margins": 4.554421901702881, + "rewards/rejected": -1.506977915763855, + "step": 4941 + }, + { + "epoch": 1.24, + "grad_norm": 6.098270416259766, + "learning_rate": 7.432455078783573e-07, + "logits/chosen": -0.29609110951423645, + "logits/rejected": -0.39998650550842285, + "logps/chosen": -71.12815856933594, + "logps/rejected": -88.64328002929688, + "loss": 0.7809, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8244104385375977, + "rewards/margins": 4.347977638244629, + "rewards/rejected": -1.523567795753479, + "step": 4942 + }, + { + "epoch": 1.24, + "grad_norm": 3.7943031787872314, + "learning_rate": 7.418717851975271e-07, + "logits/chosen": -0.3842169940471649, + "logits/rejected": -0.5050356984138489, + "logps/chosen": -50.954036712646484, + "logps/rejected": -83.35710144042969, + "loss": 0.5996, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.802464485168457, + "rewards/margins": 5.845233917236328, + "rewards/rejected": -3.042769432067871, + "step": 4943 + }, + { + "epoch": 1.24, + "grad_norm": 4.803915500640869, + "learning_rate": 7.404992314647746e-07, + "logits/chosen": -0.3364432752132416, + "logits/rejected": -0.4657607674598694, + "logps/chosen": -57.01628112792969, + "logps/rejected": -96.06434631347656, + "loss": 0.7241, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.096038818359375, + "rewards/margins": 4.421235084533691, + "rewards/rejected": -1.3251962661743164, + "step": 4944 + }, + { + "epoch": 1.24, + "grad_norm": 12.630573272705078, + "learning_rate": 7.391278470568958e-07, + "logits/chosen": -0.3943523168563843, + "logits/rejected": -0.4550129473209381, + "logps/chosen": -57.44236755371094, + "logps/rejected": -105.93949127197266, + "loss": 0.9599, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6123147010803223, + "rewards/margins": 4.99390983581543, + "rewards/rejected": -2.3815948963165283, + "step": 4945 + }, + { + "epoch": 1.24, + "grad_norm": 4.406254291534424, + "learning_rate": 7.377576323503644e-07, + "logits/chosen": -0.30908918380737305, + "logits/rejected": -0.40847089886665344, + "logps/chosen": -56.27539825439453, + "logps/rejected": -81.33999633789062, + "loss": 0.708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7905380725860596, + "rewards/margins": 5.4156975746154785, + "rewards/rejected": -2.6251587867736816, + "step": 4946 + }, + { + "epoch": 1.24, + "grad_norm": 7.071721076965332, + "learning_rate": 7.363885877213362e-07, + "logits/chosen": -0.3679048717021942, + "logits/rejected": -0.4235961437225342, + "logps/chosen": -57.88129425048828, + "logps/rejected": -91.52532196044922, + "loss": 0.8949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6684834957122803, + "rewards/margins": 5.742921829223633, + "rewards/rejected": -3.0744385719299316, + "step": 4947 + }, + { + "epoch": 1.24, + "grad_norm": 3.475855588912964, + "learning_rate": 7.350207135456416e-07, + "logits/chosen": -0.3037346601486206, + "logits/rejected": -0.4054316580295563, + "logps/chosen": -56.72918701171875, + "logps/rejected": -84.52591705322266, + "loss": 0.6667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0775551795959473, + "rewards/margins": 5.1541056632995605, + "rewards/rejected": -2.0765504837036133, + "step": 4948 + }, + { + "epoch": 1.24, + "grad_norm": 8.180700302124023, + "learning_rate": 7.336540101987927e-07, + "logits/chosen": -0.3431461453437805, + "logits/rejected": -0.40236902236938477, + "logps/chosen": -66.60462188720703, + "logps/rejected": -82.36235809326172, + "loss": 0.8144, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1607108116149902, + "rewards/margins": 4.644336700439453, + "rewards/rejected": -1.4836257696151733, + "step": 4949 + }, + { + "epoch": 1.24, + "grad_norm": 3.5155930519104004, + "learning_rate": 7.322884780559781e-07, + "logits/chosen": -0.3872363269329071, + "logits/rejected": -0.4569663107395172, + "logps/chosen": -57.560367584228516, + "logps/rejected": -99.77729797363281, + "loss": 0.7262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9323017597198486, + "rewards/margins": 5.23517370223999, + "rewards/rejected": -2.3028712272644043, + "step": 4950 + }, + { + "epoch": 1.24, + "grad_norm": 4.263932704925537, + "learning_rate": 7.309241174920667e-07, + "logits/chosen": -0.29586097598075867, + "logits/rejected": -0.41791844367980957, + "logps/chosen": -61.81262969970703, + "logps/rejected": -91.4091567993164, + "loss": 0.7596, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8719513416290283, + "rewards/margins": 4.45014762878418, + "rewards/rejected": -1.57819664478302, + "step": 4951 + }, + { + "epoch": 1.24, + "grad_norm": 5.132444381713867, + "learning_rate": 7.295609288816041e-07, + "logits/chosen": -0.3254411518573761, + "logits/rejected": -0.4103766083717346, + "logps/chosen": -62.897186279296875, + "logps/rejected": -88.2491455078125, + "loss": 0.7341, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.846980333328247, + "rewards/margins": 5.864672660827637, + "rewards/rejected": -3.017692804336548, + "step": 4952 + }, + { + "epoch": 1.24, + "grad_norm": 5.538461208343506, + "learning_rate": 7.281989125988137e-07, + "logits/chosen": -0.2943655252456665, + "logits/rejected": -0.4425237476825714, + "logps/chosen": -65.54989624023438, + "logps/rejected": -74.32513427734375, + "loss": 0.7308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6564619541168213, + "rewards/margins": 4.543798446655273, + "rewards/rejected": -1.887336254119873, + "step": 4953 + }, + { + "epoch": 1.24, + "grad_norm": 6.460576057434082, + "learning_rate": 7.268380690176019e-07, + "logits/chosen": -0.3690667152404785, + "logits/rejected": -0.40850749611854553, + "logps/chosen": -55.19246292114258, + "logps/rejected": -82.13195037841797, + "loss": 0.6768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.701974868774414, + "rewards/margins": 4.394700050354004, + "rewards/rejected": -1.6927249431610107, + "step": 4954 + }, + { + "epoch": 1.24, + "grad_norm": 4.8003716468811035, + "learning_rate": 7.254783985115471e-07, + "logits/chosen": -0.31159707903862, + "logits/rejected": -0.39560288190841675, + "logps/chosen": -77.19317626953125, + "logps/rejected": -98.3002700805664, + "loss": 0.7448, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.697559118270874, + "rewards/margins": 3.921956777572632, + "rewards/rejected": -1.2243980169296265, + "step": 4955 + }, + { + "epoch": 1.24, + "grad_norm": 5.295395851135254, + "learning_rate": 7.241199014539097e-07, + "logits/chosen": -0.36605075001716614, + "logits/rejected": -0.44179970026016235, + "logps/chosen": -48.4507942199707, + "logps/rejected": -94.57730102539062, + "loss": 0.6857, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.744884490966797, + "rewards/margins": 5.815183162689209, + "rewards/rejected": -3.0702991485595703, + "step": 4956 + }, + { + "epoch": 1.24, + "grad_norm": 4.612471103668213, + "learning_rate": 7.227625782176246e-07, + "logits/chosen": -0.3543790876865387, + "logits/rejected": -0.4534015655517578, + "logps/chosen": -64.41688537597656, + "logps/rejected": -92.11026000976562, + "loss": 0.7367, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1902647018432617, + "rewards/margins": 6.364338397979736, + "rewards/rejected": -3.1740736961364746, + "step": 4957 + }, + { + "epoch": 1.24, + "grad_norm": 5.654007911682129, + "learning_rate": 7.214064291753065e-07, + "logits/chosen": -0.33583977818489075, + "logits/rejected": -0.5251294374465942, + "logps/chosen": -72.158447265625, + "logps/rejected": -63.268253326416016, + "loss": 0.7578, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.733855962753296, + "rewards/margins": 4.898085594177246, + "rewards/rejected": -2.164229393005371, + "step": 4958 + }, + { + "epoch": 1.24, + "grad_norm": 6.418930530548096, + "learning_rate": 7.200514546992498e-07, + "logits/chosen": -0.3898134231567383, + "logits/rejected": -0.4250168204307556, + "logps/chosen": -56.91336441040039, + "logps/rejected": -84.2503890991211, + "loss": 0.8924, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6745073795318604, + "rewards/margins": 4.759316444396973, + "rewards/rejected": -2.0848090648651123, + "step": 4959 + }, + { + "epoch": 1.24, + "grad_norm": 4.216362953186035, + "learning_rate": 7.186976551614233e-07, + "logits/chosen": -0.37905946373939514, + "logits/rejected": -0.4319609999656677, + "logps/chosen": -41.67107009887695, + "logps/rejected": -82.63202667236328, + "loss": 0.6865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9546515941619873, + "rewards/margins": 4.6537184715271, + "rewards/rejected": -1.6990671157836914, + "step": 4960 + }, + { + "epoch": 1.24, + "grad_norm": 3.912937879562378, + "learning_rate": 7.173450309334718e-07, + "logits/chosen": -0.35849571228027344, + "logits/rejected": -0.4100848436355591, + "logps/chosen": -53.591529846191406, + "logps/rejected": -101.05642700195312, + "loss": 0.6921, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8999006748199463, + "rewards/margins": 5.585921287536621, + "rewards/rejected": -2.686020612716675, + "step": 4961 + }, + { + "epoch": 1.24, + "grad_norm": 16.44464874267578, + "learning_rate": 7.159935823867231e-07, + "logits/chosen": -0.3522340953350067, + "logits/rejected": -0.3800281584262848, + "logps/chosen": -51.552555084228516, + "logps/rejected": -108.41326904296875, + "loss": 0.9291, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8625648021698, + "rewards/margins": 4.609879016876221, + "rewards/rejected": -1.7473139762878418, + "step": 4962 + }, + { + "epoch": 1.24, + "grad_norm": 7.542725086212158, + "learning_rate": 7.146433098921774e-07, + "logits/chosen": -0.40058955550193787, + "logits/rejected": -0.4955672025680542, + "logps/chosen": -71.84138488769531, + "logps/rejected": -80.28053283691406, + "loss": 0.9581, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.692485809326172, + "rewards/margins": 3.7296130657196045, + "rewards/rejected": -1.0371270179748535, + "step": 4963 + }, + { + "epoch": 1.24, + "grad_norm": 17.925601959228516, + "learning_rate": 7.132942138205135e-07, + "logits/chosen": -0.2681577503681183, + "logits/rejected": -0.3501579463481903, + "logps/chosen": -62.55731964111328, + "logps/rejected": -91.19300079345703, + "loss": 0.6814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.827503204345703, + "rewards/margins": 5.205826759338379, + "rewards/rejected": -2.3783233165740967, + "step": 4964 + }, + { + "epoch": 1.24, + "grad_norm": 2.5378830432891846, + "learning_rate": 7.11946294542088e-07, + "logits/chosen": -0.3537898659706116, + "logits/rejected": -0.40547606348991394, + "logps/chosen": -47.71994400024414, + "logps/rejected": -90.7987060546875, + "loss": 0.599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.07326602935791, + "rewards/margins": 5.821005821228027, + "rewards/rejected": -2.747739791870117, + "step": 4965 + }, + { + "epoch": 1.24, + "grad_norm": 5.212196350097656, + "learning_rate": 7.105995524269321e-07, + "logits/chosen": -0.4508308470249176, + "logits/rejected": -0.5059165954589844, + "logps/chosen": -49.105445861816406, + "logps/rejected": -110.24944305419922, + "loss": 0.7961, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8337225914001465, + "rewards/margins": 5.84699821472168, + "rewards/rejected": -3.013275384902954, + "step": 4966 + }, + { + "epoch": 1.24, + "grad_norm": 5.502585411071777, + "learning_rate": 7.092539878447585e-07, + "logits/chosen": -0.3473583459854126, + "logits/rejected": -0.4838657081127167, + "logps/chosen": -53.025882720947266, + "logps/rejected": -78.35690307617188, + "loss": 0.6594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.002246856689453, + "rewards/margins": 5.32560920715332, + "rewards/rejected": -2.323362350463867, + "step": 4967 + }, + { + "epoch": 1.24, + "grad_norm": 3.4123361110687256, + "learning_rate": 7.079096011649522e-07, + "logits/chosen": -0.2812701165676117, + "logits/rejected": -0.35589584708213806, + "logps/chosen": -60.71432113647461, + "logps/rejected": -89.82793426513672, + "loss": 0.687, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8457906246185303, + "rewards/margins": 4.663538932800293, + "rewards/rejected": -1.8177480697631836, + "step": 4968 + }, + { + "epoch": 1.24, + "grad_norm": 2.6349685192108154, + "learning_rate": 7.065663927565764e-07, + "logits/chosen": -0.31663385033607483, + "logits/rejected": -0.38361236453056335, + "logps/chosen": -54.13437271118164, + "logps/rejected": -97.72254180908203, + "loss": 0.6296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8096156120300293, + "rewards/margins": 4.796280860900879, + "rewards/rejected": -1.98666512966156, + "step": 4969 + }, + { + "epoch": 1.24, + "grad_norm": 32.92653274536133, + "learning_rate": 7.05224362988371e-07, + "logits/chosen": -0.30746355652809143, + "logits/rejected": -0.4112406075000763, + "logps/chosen": -73.28221130371094, + "logps/rejected": -86.29999542236328, + "loss": 0.8495, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.88169527053833, + "rewards/margins": 5.237942695617676, + "rewards/rejected": -2.356247901916504, + "step": 4970 + }, + { + "epoch": 1.24, + "grad_norm": 6.73980712890625, + "learning_rate": 7.038835122287502e-07, + "logits/chosen": -0.2614167332649231, + "logits/rejected": -0.362814337015152, + "logps/chosen": -73.086669921875, + "logps/rejected": -86.1890869140625, + "loss": 0.8489, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.856361150741577, + "rewards/margins": 4.9074387550354, + "rewards/rejected": -2.051077365875244, + "step": 4971 + }, + { + "epoch": 1.24, + "grad_norm": 6.20622444152832, + "learning_rate": 7.025438408458107e-07, + "logits/chosen": -0.33290380239486694, + "logits/rejected": -0.3926728367805481, + "logps/chosen": -50.33551788330078, + "logps/rejected": -94.99798583984375, + "loss": 0.7764, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8244519233703613, + "rewards/margins": 4.892047882080078, + "rewards/rejected": -2.0675957202911377, + "step": 4972 + }, + { + "epoch": 1.24, + "grad_norm": 10.303938865661621, + "learning_rate": 7.012053492073168e-07, + "logits/chosen": -0.377179890871048, + "logits/rejected": -0.47090572118759155, + "logps/chosen": -55.45832061767578, + "logps/rejected": -88.90795135498047, + "loss": 0.7282, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0505738258361816, + "rewards/margins": 4.71195125579834, + "rewards/rejected": -1.6613776683807373, + "step": 4973 + }, + { + "epoch": 1.24, + "grad_norm": 3.238407611846924, + "learning_rate": 6.99868037680716e-07, + "logits/chosen": -0.33206769824028015, + "logits/rejected": -0.43356841802597046, + "logps/chosen": -59.97528076171875, + "logps/rejected": -94.78230285644531, + "loss": 0.6463, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9493725299835205, + "rewards/margins": 5.313066005706787, + "rewards/rejected": -2.3636934757232666, + "step": 4974 + }, + { + "epoch": 1.24, + "grad_norm": 7.458353042602539, + "learning_rate": 6.985319066331281e-07, + "logits/chosen": -0.3417273461818695, + "logits/rejected": -0.402384489774704, + "logps/chosen": -54.296302795410156, + "logps/rejected": -75.2991943359375, + "loss": 0.7519, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7909586429595947, + "rewards/margins": 4.397823810577393, + "rewards/rejected": -1.6068651676177979, + "step": 4975 + }, + { + "epoch": 1.24, + "grad_norm": 4.3704352378845215, + "learning_rate": 6.971969564313507e-07, + "logits/chosen": -0.2677641808986664, + "logits/rejected": -0.46160703897476196, + "logps/chosen": -75.10079193115234, + "logps/rejected": -82.73933410644531, + "loss": 0.7236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2529213428497314, + "rewards/margins": 5.6778364181518555, + "rewards/rejected": -2.424914836883545, + "step": 4976 + }, + { + "epoch": 1.25, + "grad_norm": 5.914693832397461, + "learning_rate": 6.958631874418553e-07, + "logits/chosen": -0.3287397623062134, + "logits/rejected": -0.42030492424964905, + "logps/chosen": -59.967586517333984, + "logps/rejected": -84.65777587890625, + "loss": 0.8121, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.549813747406006, + "rewards/margins": 4.148853302001953, + "rewards/rejected": -1.5990397930145264, + "step": 4977 + }, + { + "epoch": 1.25, + "grad_norm": 3.362022638320923, + "learning_rate": 6.945306000307905e-07, + "logits/chosen": -0.36540600657463074, + "logits/rejected": -0.49777209758758545, + "logps/chosen": -46.629615783691406, + "logps/rejected": -78.32085418701172, + "loss": 0.5578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.121673107147217, + "rewards/margins": 6.044565200805664, + "rewards/rejected": -2.9228920936584473, + "step": 4978 + }, + { + "epoch": 1.25, + "grad_norm": 3.622457504272461, + "learning_rate": 6.931991945639816e-07, + "logits/chosen": -0.3102889358997345, + "logits/rejected": -0.40872570872306824, + "logps/chosen": -50.53916931152344, + "logps/rejected": -87.08539581298828, + "loss": 0.6694, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5927915573120117, + "rewards/margins": 5.22853946685791, + "rewards/rejected": -2.6357483863830566, + "step": 4979 + }, + { + "epoch": 1.25, + "grad_norm": 5.825109958648682, + "learning_rate": 6.918689714069282e-07, + "logits/chosen": -0.2720790505409241, + "logits/rejected": -0.42132633924484253, + "logps/chosen": -51.90584182739258, + "logps/rejected": -79.4161148071289, + "loss": 0.646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.984098196029663, + "rewards/margins": 4.703857898712158, + "rewards/rejected": -1.7197595834732056, + "step": 4980 + }, + { + "epoch": 1.25, + "grad_norm": 4.622334957122803, + "learning_rate": 6.905399309248018e-07, + "logits/chosen": -0.3298172056674957, + "logits/rejected": -0.4072779417037964, + "logps/chosen": -53.608177185058594, + "logps/rejected": -77.75116729736328, + "loss": 0.7632, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.766939401626587, + "rewards/margins": 4.226436614990234, + "rewards/rejected": -1.4594972133636475, + "step": 4981 + }, + { + "epoch": 1.25, + "grad_norm": 4.349878787994385, + "learning_rate": 6.892120734824565e-07, + "logits/chosen": -0.4011056125164032, + "logits/rejected": -0.5325763821601868, + "logps/chosen": -58.6993408203125, + "logps/rejected": -72.78451538085938, + "loss": 0.7217, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6532058715820312, + "rewards/margins": 5.2542901039123535, + "rewards/rejected": -2.601083993911743, + "step": 4982 + }, + { + "epoch": 1.25, + "grad_norm": 4.722715854644775, + "learning_rate": 6.878853994444157e-07, + "logits/chosen": -0.31193292140960693, + "logits/rejected": -0.42486894130706787, + "logps/chosen": -57.28960037231445, + "logps/rejected": -90.00324249267578, + "loss": 0.7007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3264904022216797, + "rewards/margins": 6.422441005706787, + "rewards/rejected": -3.0959510803222656, + "step": 4983 + }, + { + "epoch": 1.25, + "grad_norm": 3.0475399494171143, + "learning_rate": 6.865599091748826e-07, + "logits/chosen": -0.32915323972702026, + "logits/rejected": -0.4927150309085846, + "logps/chosen": -62.48447799682617, + "logps/rejected": -77.59041595458984, + "loss": 0.6377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8376693725585938, + "rewards/margins": 5.749580383300781, + "rewards/rejected": -2.9119110107421875, + "step": 4984 + }, + { + "epoch": 1.25, + "grad_norm": 8.241169929504395, + "learning_rate": 6.852356030377306e-07, + "logits/chosen": -0.2882739305496216, + "logits/rejected": -0.43354880809783936, + "logps/chosen": -70.85746002197266, + "logps/rejected": -84.73954772949219, + "loss": 0.8391, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9517571926116943, + "rewards/margins": 4.371241092681885, + "rewards/rejected": -1.4194839000701904, + "step": 4985 + }, + { + "epoch": 1.25, + "grad_norm": 5.380764007568359, + "learning_rate": 6.839124813965092e-07, + "logits/chosen": -0.3555614650249481, + "logits/rejected": -0.4689951539039612, + "logps/chosen": -64.84549713134766, + "logps/rejected": -83.04905700683594, + "loss": 0.827, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.563106060028076, + "rewards/margins": 4.388635635375977, + "rewards/rejected": -1.8255293369293213, + "step": 4986 + }, + { + "epoch": 1.25, + "grad_norm": 3.6167385578155518, + "learning_rate": 6.825905446144465e-07, + "logits/chosen": -0.3294788897037506, + "logits/rejected": -0.4268811047077179, + "logps/chosen": -61.88372802734375, + "logps/rejected": -80.35592651367188, + "loss": 0.6736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8568711280822754, + "rewards/margins": 4.940086841583252, + "rewards/rejected": -2.0832157135009766, + "step": 4987 + }, + { + "epoch": 1.25, + "grad_norm": 3.667569398880005, + "learning_rate": 6.81269793054442e-07, + "logits/chosen": -0.35393595695495605, + "logits/rejected": -0.4134099781513214, + "logps/chosen": -52.862857818603516, + "logps/rejected": -83.0364761352539, + "loss": 0.7766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.081411123275757, + "rewards/margins": 4.401498794555664, + "rewards/rejected": -1.3200875520706177, + "step": 4988 + }, + { + "epoch": 1.25, + "grad_norm": 4.065760612487793, + "learning_rate": 6.799502270790698e-07, + "logits/chosen": -0.3511921465396881, + "logits/rejected": -0.42599987983703613, + "logps/chosen": -60.808631896972656, + "logps/rejected": -84.07813262939453, + "loss": 0.669, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8948259353637695, + "rewards/margins": 4.781454086303711, + "rewards/rejected": -1.88662850856781, + "step": 4989 + }, + { + "epoch": 1.25, + "grad_norm": 5.118017673492432, + "learning_rate": 6.786318470505798e-07, + "logits/chosen": -0.3321068584918976, + "logits/rejected": -0.427626371383667, + "logps/chosen": -60.1295280456543, + "logps/rejected": -91.37016296386719, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.204442024230957, + "rewards/margins": 5.654922962188721, + "rewards/rejected": -2.4504806995391846, + "step": 4990 + }, + { + "epoch": 1.25, + "grad_norm": 4.8337836265563965, + "learning_rate": 6.773146533308944e-07, + "logits/chosen": -0.4344768226146698, + "logits/rejected": -0.5935701727867126, + "logps/chosen": -62.53098678588867, + "logps/rejected": -74.02247619628906, + "loss": 0.7575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.137324571609497, + "rewards/margins": 5.384730339050293, + "rewards/rejected": -2.247405529022217, + "step": 4991 + }, + { + "epoch": 1.25, + "grad_norm": 5.1125078201293945, + "learning_rate": 6.759986462816142e-07, + "logits/chosen": -0.3010546565055847, + "logits/rejected": -0.4117238521575928, + "logps/chosen": -65.1014175415039, + "logps/rejected": -91.6495590209961, + "loss": 0.6683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9116780757904053, + "rewards/margins": 4.870306015014648, + "rewards/rejected": -1.958627700805664, + "step": 4992 + }, + { + "epoch": 1.25, + "grad_norm": 3.5833065509796143, + "learning_rate": 6.746838262640099e-07, + "logits/chosen": -0.39740172028541565, + "logits/rejected": -0.49789780378341675, + "logps/chosen": -51.098960876464844, + "logps/rejected": -86.7164535522461, + "loss": 0.5298, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8905651569366455, + "rewards/margins": 5.312102317810059, + "rewards/rejected": -2.421536922454834, + "step": 4993 + }, + { + "epoch": 1.25, + "grad_norm": 5.6627516746521, + "learning_rate": 6.733701936390285e-07, + "logits/chosen": -0.4401869475841522, + "logits/rejected": -0.5342891216278076, + "logps/chosen": -51.73973846435547, + "logps/rejected": -80.3255844116211, + "loss": 0.6576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.596925735473633, + "rewards/margins": 5.383052825927734, + "rewards/rejected": -2.7861270904541016, + "step": 4994 + }, + { + "epoch": 1.25, + "grad_norm": 9.758014678955078, + "learning_rate": 6.720577487672902e-07, + "logits/chosen": -0.3046584129333496, + "logits/rejected": -0.3982895016670227, + "logps/chosen": -72.22738647460938, + "logps/rejected": -87.86309814453125, + "loss": 0.8502, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.583980083465576, + "rewards/margins": 4.923805236816406, + "rewards/rejected": -2.339824676513672, + "step": 4995 + }, + { + "epoch": 1.25, + "grad_norm": 3.949617624282837, + "learning_rate": 6.707464920090895e-07, + "logits/chosen": -0.3687264919281006, + "logits/rejected": -0.42009180784225464, + "logps/chosen": -61.86167907714844, + "logps/rejected": -104.1026611328125, + "loss": 0.7093, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.330042839050293, + "rewards/margins": 5.45631217956543, + "rewards/rejected": -2.126269817352295, + "step": 4996 + }, + { + "epoch": 1.25, + "grad_norm": 4.815743923187256, + "learning_rate": 6.694364237243944e-07, + "logits/chosen": -0.3363017737865448, + "logits/rejected": -0.4261927604675293, + "logps/chosen": -59.12192153930664, + "logps/rejected": -89.69646453857422, + "loss": 0.7349, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9816763401031494, + "rewards/margins": 4.760000705718994, + "rewards/rejected": -1.7783242464065552, + "step": 4997 + }, + { + "epoch": 1.25, + "grad_norm": 4.374403953552246, + "learning_rate": 6.681275442728463e-07, + "logits/chosen": -0.33332616090774536, + "logits/rejected": -0.39950302243232727, + "logps/chosen": -54.91917419433594, + "logps/rejected": -82.70387268066406, + "loss": 0.6666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.996633529663086, + "rewards/margins": 4.789303779602051, + "rewards/rejected": -1.792669653892517, + "step": 4998 + }, + { + "epoch": 1.25, + "grad_norm": 6.772393226623535, + "learning_rate": 6.668198540137627e-07, + "logits/chosen": -0.3035588264465332, + "logits/rejected": -0.3594915568828583, + "logps/chosen": -62.80577850341797, + "logps/rejected": -100.33055114746094, + "loss": 0.8084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6135928630828857, + "rewards/margins": 4.157870769500732, + "rewards/rejected": -1.5442781448364258, + "step": 4999 + }, + { + "epoch": 1.25, + "grad_norm": 4.331944942474365, + "learning_rate": 6.65513353306132e-07, + "logits/chosen": -0.30837491154670715, + "logits/rejected": -0.4104422330856323, + "logps/chosen": -54.530487060546875, + "logps/rejected": -75.891357421875, + "loss": 0.6414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.00669527053833, + "rewards/margins": 4.35977840423584, + "rewards/rejected": -1.3530831336975098, + "step": 5000 + }, + { + "epoch": 1.25, + "grad_norm": 5.542034149169922, + "learning_rate": 6.642080425086167e-07, + "logits/chosen": -0.4067681133747101, + "logits/rejected": -0.5077582597732544, + "logps/chosen": -56.868629455566406, + "logps/rejected": -79.82231140136719, + "loss": 0.7838, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.864182472229004, + "rewards/margins": 4.491036415100098, + "rewards/rejected": -1.6268540620803833, + "step": 5001 + }, + { + "epoch": 1.25, + "grad_norm": 4.75421667098999, + "learning_rate": 6.629039219795525e-07, + "logits/chosen": -0.28694427013397217, + "logits/rejected": -0.33482837677001953, + "logps/chosen": -48.570220947265625, + "logps/rejected": -85.46369934082031, + "loss": 0.7621, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8429388999938965, + "rewards/margins": 4.30167293548584, + "rewards/rejected": -1.4587340354919434, + "step": 5002 + }, + { + "epoch": 1.25, + "grad_norm": 7.891231536865234, + "learning_rate": 6.616009920769478e-07, + "logits/chosen": -0.29629531502723694, + "logits/rejected": -0.3950576186180115, + "logps/chosen": -51.911216735839844, + "logps/rejected": -84.36463928222656, + "loss": 0.7919, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1480307579040527, + "rewards/margins": 4.58519983291626, + "rewards/rejected": -1.437169075012207, + "step": 5003 + }, + { + "epoch": 1.25, + "grad_norm": 5.398181915283203, + "learning_rate": 6.602992531584873e-07, + "logits/chosen": -0.29272788763046265, + "logits/rejected": -0.42019975185394287, + "logps/chosen": -57.509117126464844, + "logps/rejected": -70.00574493408203, + "loss": 0.7747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.753288507461548, + "rewards/margins": 4.965493202209473, + "rewards/rejected": -2.212204694747925, + "step": 5004 + }, + { + "epoch": 1.25, + "grad_norm": 7.20405912399292, + "learning_rate": 6.589987055815262e-07, + "logits/chosen": -0.34266898036003113, + "logits/rejected": -0.3768349885940552, + "logps/chosen": -54.87078857421875, + "logps/rejected": -103.02114868164062, + "loss": 0.8191, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6924171447753906, + "rewards/margins": 4.821371555328369, + "rewards/rejected": -2.128955125808716, + "step": 5005 + }, + { + "epoch": 1.25, + "grad_norm": 4.696100234985352, + "learning_rate": 6.576993497030893e-07, + "logits/chosen": -0.457611083984375, + "logits/rejected": -0.5297752618789673, + "logps/chosen": -59.85616683959961, + "logps/rejected": -97.0528335571289, + "loss": 0.776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1320176124572754, + "rewards/margins": 5.8573126792907715, + "rewards/rejected": -2.725295066833496, + "step": 5006 + }, + { + "epoch": 1.25, + "grad_norm": 4.632326602935791, + "learning_rate": 6.564011858798819e-07, + "logits/chosen": -0.32520806789398193, + "logits/rejected": -0.4085749387741089, + "logps/chosen": -55.10109329223633, + "logps/rejected": -95.40760040283203, + "loss": 0.6997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.772705078125, + "rewards/margins": 4.675258636474609, + "rewards/rejected": -1.9025530815124512, + "step": 5007 + }, + { + "epoch": 1.25, + "grad_norm": 5.691409587860107, + "learning_rate": 6.551042144682763e-07, + "logits/chosen": -0.37710314989089966, + "logits/rejected": -0.4711277186870575, + "logps/chosen": -58.76845932006836, + "logps/rejected": -78.79105377197266, + "loss": 0.7745, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7484629154205322, + "rewards/margins": 3.993262767791748, + "rewards/rejected": -1.244800090789795, + "step": 5008 + }, + { + "epoch": 1.25, + "grad_norm": 4.555638790130615, + "learning_rate": 6.538084358243191e-07, + "logits/chosen": -0.296366810798645, + "logits/rejected": -0.3765318691730499, + "logps/chosen": -60.312110900878906, + "logps/rejected": -84.04874420166016, + "loss": 0.8009, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.194139242172241, + "rewards/margins": 4.649298191070557, + "rewards/rejected": -1.4551589488983154, + "step": 5009 + }, + { + "epoch": 1.25, + "grad_norm": 5.2899322509765625, + "learning_rate": 6.525138503037299e-07, + "logits/chosen": -0.3991650342941284, + "logits/rejected": -0.45178675651550293, + "logps/chosen": -61.37958526611328, + "logps/rejected": -95.34092712402344, + "loss": 0.8243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1247618198394775, + "rewards/margins": 4.283570766448975, + "rewards/rejected": -1.158808946609497, + "step": 5010 + }, + { + "epoch": 1.25, + "grad_norm": 18.18722152709961, + "learning_rate": 6.512204582618992e-07, + "logits/chosen": -0.43100565671920776, + "logits/rejected": -0.49302345514297485, + "logps/chosen": -57.201629638671875, + "logps/rejected": -90.40769958496094, + "loss": 0.8143, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.195537805557251, + "rewards/margins": 4.652167797088623, + "rewards/rejected": -1.456629753112793, + "step": 5011 + }, + { + "epoch": 1.25, + "grad_norm": 9.06751537322998, + "learning_rate": 6.49928260053893e-07, + "logits/chosen": -0.38680630922317505, + "logits/rejected": -0.5012653470039368, + "logps/chosen": -56.81362533569336, + "logps/rejected": -82.45897674560547, + "loss": 0.7591, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7890493869781494, + "rewards/margins": 4.058714389801025, + "rewards/rejected": -1.2696651220321655, + "step": 5012 + }, + { + "epoch": 1.25, + "grad_norm": 3.6486268043518066, + "learning_rate": 6.486372560344461e-07, + "logits/chosen": -0.29333391785621643, + "logits/rejected": -0.3742395043373108, + "logps/chosen": -52.8286247253418, + "logps/rejected": -76.10983276367188, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1997179985046387, + "rewards/margins": 4.5630974769592285, + "rewards/rejected": -1.3633798360824585, + "step": 5013 + }, + { + "epoch": 1.25, + "grad_norm": 5.8895344734191895, + "learning_rate": 6.473474465579682e-07, + "logits/chosen": -0.37443259358406067, + "logits/rejected": -0.4825395941734314, + "logps/chosen": -50.5241813659668, + "logps/rejected": -84.80299377441406, + "loss": 0.652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.916627883911133, + "rewards/margins": 5.533239841461182, + "rewards/rejected": -2.6166117191314697, + "step": 5014 + }, + { + "epoch": 1.25, + "grad_norm": 5.950118541717529, + "learning_rate": 6.460588319785382e-07, + "logits/chosen": -0.3698105216026306, + "logits/rejected": -0.430641770362854, + "logps/chosen": -60.96868133544922, + "logps/rejected": -101.39014434814453, + "loss": 0.7069, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.949063777923584, + "rewards/margins": 5.0912251472473145, + "rewards/rejected": -2.1421616077423096, + "step": 5015 + }, + { + "epoch": 1.25, + "grad_norm": 5.399962425231934, + "learning_rate": 6.447714126499088e-07, + "logits/chosen": -0.39055678248405457, + "logits/rejected": -0.45363864302635193, + "logps/chosen": -48.38978958129883, + "logps/rejected": -93.70736694335938, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7050082683563232, + "rewards/margins": 5.28531551361084, + "rewards/rejected": -2.5803072452545166, + "step": 5016 + }, + { + "epoch": 1.26, + "grad_norm": 4.671093463897705, + "learning_rate": 6.434851889255073e-07, + "logits/chosen": -0.34445279836654663, + "logits/rejected": -0.46739545464515686, + "logps/chosen": -56.92506790161133, + "logps/rejected": -94.81155395507812, + "loss": 0.7172, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.203716993331909, + "rewards/margins": 5.543102264404297, + "rewards/rejected": -2.339385747909546, + "step": 5017 + }, + { + "epoch": 1.26, + "grad_norm": 5.73805046081543, + "learning_rate": 6.422001611584256e-07, + "logits/chosen": -0.38151854276657104, + "logits/rejected": -0.4687798023223877, + "logps/chosen": -53.044254302978516, + "logps/rejected": -89.02680969238281, + "loss": 0.8208, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8577539920806885, + "rewards/margins": 4.7462263107299805, + "rewards/rejected": -1.8884713649749756, + "step": 5018 + }, + { + "epoch": 1.26, + "grad_norm": 6.638334274291992, + "learning_rate": 6.409163297014343e-07, + "logits/chosen": -0.2661527693271637, + "logits/rejected": -0.32599833607673645, + "logps/chosen": -63.623661041259766, + "logps/rejected": -77.62353515625, + "loss": 0.9222, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.89585280418396, + "rewards/margins": 3.6320855617523193, + "rewards/rejected": -0.7362326383590698, + "step": 5019 + }, + { + "epoch": 1.26, + "grad_norm": 8.710283279418945, + "learning_rate": 6.39633694906972e-07, + "logits/chosen": -0.3252466022968292, + "logits/rejected": -0.4300704300403595, + "logps/chosen": -48.13121032714844, + "logps/rejected": -84.27656555175781, + "loss": 0.7054, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.957782030105591, + "rewards/margins": 5.538046360015869, + "rewards/rejected": -2.5802643299102783, + "step": 5020 + }, + { + "epoch": 1.26, + "grad_norm": 4.507818222045898, + "learning_rate": 6.383522571271499e-07, + "logits/chosen": -0.359719842672348, + "logits/rejected": -0.4857322573661804, + "logps/chosen": -59.23988342285156, + "logps/rejected": -73.92082977294922, + "loss": 0.7153, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8645973205566406, + "rewards/margins": 5.498823165893555, + "rewards/rejected": -2.634225845336914, + "step": 5021 + }, + { + "epoch": 1.26, + "grad_norm": 3.390119791030884, + "learning_rate": 6.370720167137501e-07, + "logits/chosen": -0.28968945145606995, + "logits/rejected": -0.3938135504722595, + "logps/chosen": -61.40809631347656, + "logps/rejected": -81.94793701171875, + "loss": 0.6407, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9640097618103027, + "rewards/margins": 5.160361289978027, + "rewards/rejected": -2.1963512897491455, + "step": 5022 + }, + { + "epoch": 1.26, + "grad_norm": 4.49121618270874, + "learning_rate": 6.357929740182255e-07, + "logits/chosen": -0.24636054039001465, + "logits/rejected": -0.3524125814437866, + "logps/chosen": -55.53756332397461, + "logps/rejected": -79.60039520263672, + "loss": 0.7009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.906709909439087, + "rewards/margins": 4.8438310623168945, + "rewards/rejected": -1.9371206760406494, + "step": 5023 + }, + { + "epoch": 1.26, + "grad_norm": 5.346976280212402, + "learning_rate": 6.345151293917023e-07, + "logits/chosen": -0.32176145911216736, + "logits/rejected": -0.404400110244751, + "logps/chosen": -53.49566650390625, + "logps/rejected": -86.45880889892578, + "loss": 0.7942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9625144004821777, + "rewards/margins": 4.556802749633789, + "rewards/rejected": -1.5942883491516113, + "step": 5024 + }, + { + "epoch": 1.26, + "grad_norm": 2.375321626663208, + "learning_rate": 6.332384831849775e-07, + "logits/chosen": -0.3470432758331299, + "logits/rejected": -0.45243847370147705, + "logps/chosen": -52.044708251953125, + "logps/rejected": -81.41442108154297, + "loss": 0.6362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.070756673812866, + "rewards/margins": 5.083835601806641, + "rewards/rejected": -2.0130789279937744, + "step": 5025 + }, + { + "epoch": 1.26, + "grad_norm": 8.953675270080566, + "learning_rate": 6.319630357485135e-07, + "logits/chosen": -0.26924529671669006, + "logits/rejected": -0.3601841330528259, + "logps/chosen": -52.26640701293945, + "logps/rejected": -80.3956527709961, + "loss": 0.7487, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0263566970825195, + "rewards/margins": 5.029109477996826, + "rewards/rejected": -2.0027525424957275, + "step": 5026 + }, + { + "epoch": 1.26, + "grad_norm": 2.6609983444213867, + "learning_rate": 6.306887874324524e-07, + "logits/chosen": -0.37928956747055054, + "logits/rejected": -0.452345609664917, + "logps/chosen": -52.93705368041992, + "logps/rejected": -87.77178955078125, + "loss": 0.6676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.259099006652832, + "rewards/margins": 5.30280876159668, + "rewards/rejected": -2.0437097549438477, + "step": 5027 + }, + { + "epoch": 1.26, + "grad_norm": 4.883389949798584, + "learning_rate": 6.294157385866007e-07, + "logits/chosen": -0.37181854248046875, + "logits/rejected": -0.3926776349544525, + "logps/chosen": -49.448585510253906, + "logps/rejected": -90.73182678222656, + "loss": 0.7684, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8857004642486572, + "rewards/margins": 4.646703243255615, + "rewards/rejected": -1.761002779006958, + "step": 5028 + }, + { + "epoch": 1.26, + "grad_norm": 5.477398872375488, + "learning_rate": 6.281438895604402e-07, + "logits/chosen": -0.416748970746994, + "logits/rejected": -0.4562535881996155, + "logps/chosen": -55.15022277832031, + "logps/rejected": -75.51927947998047, + "loss": 0.7971, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9713611602783203, + "rewards/margins": 3.8889524936676025, + "rewards/rejected": -0.9175913333892822, + "step": 5029 + }, + { + "epoch": 1.26, + "grad_norm": 3.9307916164398193, + "learning_rate": 6.26873240703118e-07, + "logits/chosen": -0.36484968662261963, + "logits/rejected": -0.4026399254798889, + "logps/chosen": -64.911865234375, + "logps/rejected": -97.14796447753906, + "loss": 0.7186, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8655307292938232, + "rewards/margins": 4.635808944702148, + "rewards/rejected": -1.770277976989746, + "step": 5030 + }, + { + "epoch": 1.26, + "grad_norm": 5.208473205566406, + "learning_rate": 6.256037923634556e-07, + "logits/chosen": -0.3799314498901367, + "logits/rejected": -0.46769946813583374, + "logps/chosen": -60.683841705322266, + "logps/rejected": -81.97884368896484, + "loss": 0.8148, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.916795015335083, + "rewards/margins": 5.1612324714660645, + "rewards/rejected": -2.2444372177124023, + "step": 5031 + }, + { + "epoch": 1.26, + "grad_norm": 10.299721717834473, + "learning_rate": 6.243355448899447e-07, + "logits/chosen": -0.33948686718940735, + "logits/rejected": -0.4795444905757904, + "logps/chosen": -51.751792907714844, + "logps/rejected": -86.34500122070312, + "loss": 0.661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9738619327545166, + "rewards/margins": 5.062419414520264, + "rewards/rejected": -2.088557004928589, + "step": 5032 + }, + { + "epoch": 1.26, + "grad_norm": 10.597428321838379, + "learning_rate": 6.230684986307461e-07, + "logits/chosen": -0.265360563993454, + "logits/rejected": -0.3215123414993286, + "logps/chosen": -48.94967269897461, + "logps/rejected": -84.91658020019531, + "loss": 0.6849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.899254322052002, + "rewards/margins": 3.9740867614746094, + "rewards/rejected": -1.074832797050476, + "step": 5033 + }, + { + "epoch": 1.26, + "grad_norm": 4.47482442855835, + "learning_rate": 6.218026539336913e-07, + "logits/chosen": -0.3694147765636444, + "logits/rejected": -0.47010698914527893, + "logps/chosen": -60.001686096191406, + "logps/rejected": -79.78205871582031, + "loss": 0.715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9817042350769043, + "rewards/margins": 4.7093682289123535, + "rewards/rejected": -1.727663516998291, + "step": 5034 + }, + { + "epoch": 1.26, + "grad_norm": 2.896055221557617, + "learning_rate": 6.205380111462822e-07, + "logits/chosen": -0.33750075101852417, + "logits/rejected": -0.5043847560882568, + "logps/chosen": -59.490966796875, + "logps/rejected": -73.9374008178711, + "loss": 0.6942, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9265058040618896, + "rewards/margins": 6.253302097320557, + "rewards/rejected": -3.326796054840088, + "step": 5035 + }, + { + "epoch": 1.26, + "grad_norm": 4.407924175262451, + "learning_rate": 6.192745706156894e-07, + "logits/chosen": -0.31558820605278015, + "logits/rejected": -0.4557688236236572, + "logps/chosen": -58.670928955078125, + "logps/rejected": -79.72647094726562, + "loss": 0.7687, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9180119037628174, + "rewards/margins": 5.313647270202637, + "rewards/rejected": -2.3956353664398193, + "step": 5036 + }, + { + "epoch": 1.26, + "grad_norm": 6.0728607177734375, + "learning_rate": 6.180123326887577e-07, + "logits/chosen": -0.35612255334854126, + "logits/rejected": -0.45354315638542175, + "logps/chosen": -51.01459503173828, + "logps/rejected": -79.46955871582031, + "loss": 0.7769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.679422378540039, + "rewards/margins": 4.598300457000732, + "rewards/rejected": -1.9188777208328247, + "step": 5037 + }, + { + "epoch": 1.26, + "grad_norm": 4.886066913604736, + "learning_rate": 6.167512977119944e-07, + "logits/chosen": -0.4221750497817993, + "logits/rejected": -0.38813623785972595, + "logps/chosen": -46.41075134277344, + "logps/rejected": -93.35200500488281, + "loss": 0.683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.41023850440979, + "rewards/margins": 4.321689605712891, + "rewards/rejected": -0.911451518535614, + "step": 5038 + }, + { + "epoch": 1.26, + "grad_norm": 7.155238628387451, + "learning_rate": 6.154914660315836e-07, + "logits/chosen": -0.3393867611885071, + "logits/rejected": -0.3914972245693207, + "logps/chosen": -48.38357925415039, + "logps/rejected": -95.64417266845703, + "loss": 0.6371, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0508103370666504, + "rewards/margins": 5.271235466003418, + "rewards/rejected": -2.2204251289367676, + "step": 5039 + }, + { + "epoch": 1.26, + "grad_norm": 3.6631839275360107, + "learning_rate": 6.14232837993376e-07, + "logits/chosen": -0.3390202820301056, + "logits/rejected": -0.4801386892795563, + "logps/chosen": -53.64425277709961, + "logps/rejected": -80.053955078125, + "loss": 0.5809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7456836700439453, + "rewards/margins": 5.491568088531494, + "rewards/rejected": -2.7458841800689697, + "step": 5040 + }, + { + "epoch": 1.26, + "grad_norm": 8.262116432189941, + "learning_rate": 6.129754139428917e-07, + "logits/chosen": -0.40150555968284607, + "logits/rejected": -0.457491397857666, + "logps/chosen": -52.86040496826172, + "logps/rejected": -96.01597595214844, + "loss": 0.7932, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5098185539245605, + "rewards/margins": 5.578320503234863, + "rewards/rejected": -3.068502187728882, + "step": 5041 + }, + { + "epoch": 1.26, + "grad_norm": 13.557367324829102, + "learning_rate": 6.117191942253204e-07, + "logits/chosen": -0.3508067727088928, + "logits/rejected": -0.47498875856399536, + "logps/chosen": -53.32542419433594, + "logps/rejected": -79.56451416015625, + "loss": 0.6331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015477180480957, + "rewards/margins": 5.334875106811523, + "rewards/rejected": -2.319397449493408, + "step": 5042 + }, + { + "epoch": 1.26, + "grad_norm": 4.2465691566467285, + "learning_rate": 6.104641791855215e-07, + "logits/chosen": -0.319815993309021, + "logits/rejected": -0.45037898421287537, + "logps/chosen": -52.391971588134766, + "logps/rejected": -77.91764831542969, + "loss": 0.6001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.892937421798706, + "rewards/margins": 5.351971626281738, + "rewards/rejected": -2.459033966064453, + "step": 5043 + }, + { + "epoch": 1.26, + "grad_norm": 4.32685661315918, + "learning_rate": 6.092103691680246e-07, + "logits/chosen": -0.2802046835422516, + "logits/rejected": -0.3432466387748718, + "logps/chosen": -59.900489807128906, + "logps/rejected": -82.16191864013672, + "loss": 0.7199, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9025344848632812, + "rewards/margins": 4.210496425628662, + "rewards/rejected": -1.3079617023468018, + "step": 5044 + }, + { + "epoch": 1.26, + "grad_norm": 6.573760032653809, + "learning_rate": 6.079577645170276e-07, + "logits/chosen": -0.31415724754333496, + "logits/rejected": -0.43849262595176697, + "logps/chosen": -55.83065414428711, + "logps/rejected": -93.76647186279297, + "loss": 0.6614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.141266345977783, + "rewards/margins": 6.112637996673584, + "rewards/rejected": -2.9713711738586426, + "step": 5045 + }, + { + "epoch": 1.26, + "grad_norm": 3.9637272357940674, + "learning_rate": 6.067063655763967e-07, + "logits/chosen": -0.3476096987724304, + "logits/rejected": -0.3991944491863251, + "logps/chosen": -51.240318298339844, + "logps/rejected": -102.22116088867188, + "loss": 0.6794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9019782543182373, + "rewards/margins": 6.013271808624268, + "rewards/rejected": -3.111293077468872, + "step": 5046 + }, + { + "epoch": 1.26, + "grad_norm": 6.083270072937012, + "learning_rate": 6.054561726896685e-07, + "logits/chosen": -0.3463602662086487, + "logits/rejected": -0.47803381085395813, + "logps/chosen": -62.1671142578125, + "logps/rejected": -83.82415771484375, + "loss": 0.6906, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7095634937286377, + "rewards/margins": 5.024096965789795, + "rewards/rejected": -2.3145334720611572, + "step": 5047 + }, + { + "epoch": 1.26, + "grad_norm": 4.392425060272217, + "learning_rate": 6.042071862000465e-07, + "logits/chosen": -0.4516215920448303, + "logits/rejected": -0.5146715044975281, + "logps/chosen": -49.99497985839844, + "logps/rejected": -81.00011444091797, + "loss": 0.8075, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0160973072052, + "rewards/margins": 5.818692684173584, + "rewards/rejected": -2.8025951385498047, + "step": 5048 + }, + { + "epoch": 1.26, + "grad_norm": 3.4341514110565186, + "learning_rate": 6.029594064504075e-07, + "logits/chosen": -0.38631880283355713, + "logits/rejected": -0.48345035314559937, + "logps/chosen": -42.11742401123047, + "logps/rejected": -75.48495483398438, + "loss": 0.5823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9084062576293945, + "rewards/margins": 5.023703575134277, + "rewards/rejected": -2.115297555923462, + "step": 5049 + }, + { + "epoch": 1.26, + "grad_norm": 3.8337254524230957, + "learning_rate": 6.017128337832911e-07, + "logits/chosen": -0.3562333583831787, + "logits/rejected": -0.46665823459625244, + "logps/chosen": -51.97848129272461, + "logps/rejected": -70.91888427734375, + "loss": 0.7568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.869810104370117, + "rewards/margins": 4.288354873657227, + "rewards/rejected": -1.4185450077056885, + "step": 5050 + }, + { + "epoch": 1.26, + "grad_norm": 8.75208854675293, + "learning_rate": 6.004674685409084e-07, + "logits/chosen": -0.4052727520465851, + "logits/rejected": -0.4877505898475647, + "logps/chosen": -43.789283752441406, + "logps/rejected": -79.53013610839844, + "loss": 0.7244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9434287548065186, + "rewards/margins": 5.083029747009277, + "rewards/rejected": -2.1396007537841797, + "step": 5051 + }, + { + "epoch": 1.26, + "grad_norm": 3.123077630996704, + "learning_rate": 5.992233110651413e-07, + "logits/chosen": -0.36829614639282227, + "logits/rejected": -0.4493529200553894, + "logps/chosen": -38.82472229003906, + "logps/rejected": -86.71551513671875, + "loss": 0.5303, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2873144149780273, + "rewards/margins": 5.777446746826172, + "rewards/rejected": -2.4901318550109863, + "step": 5052 + }, + { + "epoch": 1.26, + "grad_norm": 8.091230392456055, + "learning_rate": 5.979803616975366e-07, + "logits/chosen": -0.3743496537208557, + "logits/rejected": -0.46118059754371643, + "logps/chosen": -56.67694091796875, + "logps/rejected": -71.35123443603516, + "loss": 0.7179, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.638303518295288, + "rewards/margins": 3.9881887435913086, + "rewards/rejected": -1.3498852252960205, + "step": 5053 + }, + { + "epoch": 1.26, + "grad_norm": 4.473758697509766, + "learning_rate": 5.967386207793102e-07, + "logits/chosen": -0.3903195858001709, + "logits/rejected": -0.4816265404224396, + "logps/chosen": -61.30620574951172, + "logps/rejected": -86.70655822753906, + "loss": 0.7926, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.792367696762085, + "rewards/margins": 4.880633354187012, + "rewards/rejected": -2.0882654190063477, + "step": 5054 + }, + { + "epoch": 1.26, + "grad_norm": 9.048711776733398, + "learning_rate": 5.954980886513478e-07, + "logits/chosen": -0.3174307346343994, + "logits/rejected": -0.38857680559158325, + "logps/chosen": -57.462547302246094, + "logps/rejected": -87.07029724121094, + "loss": 0.6928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.014495849609375, + "rewards/margins": 4.852958679199219, + "rewards/rejected": -1.8384629487991333, + "step": 5055 + }, + { + "epoch": 1.26, + "grad_norm": 4.200993061065674, + "learning_rate": 5.942587656542004e-07, + "logits/chosen": -0.4313682019710541, + "logits/rejected": -0.5234543085098267, + "logps/chosen": -65.35710144042969, + "logps/rejected": -90.14046478271484, + "loss": 0.7361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977159023284912, + "rewards/margins": 5.362776756286621, + "rewards/rejected": -2.385617733001709, + "step": 5056 + }, + { + "epoch": 1.27, + "grad_norm": 5.0786237716674805, + "learning_rate": 5.930206521280912e-07, + "logits/chosen": -0.4789079427719116, + "logits/rejected": -0.5304513573646545, + "logps/chosen": -49.11302947998047, + "logps/rejected": -93.97874450683594, + "loss": 0.775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.029318332672119, + "rewards/margins": 4.489468097686768, + "rewards/rejected": -1.4601497650146484, + "step": 5057 + }, + { + "epoch": 1.27, + "grad_norm": 4.639688014984131, + "learning_rate": 5.917837484129086e-07, + "logits/chosen": -0.4102543890476227, + "logits/rejected": -0.48268216848373413, + "logps/chosen": -47.41756057739258, + "logps/rejected": -85.55921936035156, + "loss": 0.6613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7611756324768066, + "rewards/margins": 4.934525489807129, + "rewards/rejected": -2.173349618911743, + "step": 5058 + }, + { + "epoch": 1.27, + "grad_norm": 3.375760793685913, + "learning_rate": 5.905480548482085e-07, + "logits/chosen": -0.4338419735431671, + "logits/rejected": -0.555712878704071, + "logps/chosen": -50.52969741821289, + "logps/rejected": -68.61406707763672, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0421817302703857, + "rewards/margins": 5.373241901397705, + "rewards/rejected": -2.331059694290161, + "step": 5059 + }, + { + "epoch": 1.27, + "grad_norm": 5.497535705566406, + "learning_rate": 5.893135717732157e-07, + "logits/chosen": -0.43395066261291504, + "logits/rejected": -0.5129657983779907, + "logps/chosen": -57.27079391479492, + "logps/rejected": -98.6710433959961, + "loss": 0.753, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7681028842926025, + "rewards/margins": 5.228554725646973, + "rewards/rejected": -2.460451364517212, + "step": 5060 + }, + { + "epoch": 1.27, + "grad_norm": 4.429150581359863, + "learning_rate": 5.880802995268226e-07, + "logits/chosen": -0.36186671257019043, + "logits/rejected": -0.39179015159606934, + "logps/chosen": -48.40216827392578, + "logps/rejected": -90.25677490234375, + "loss": 0.7353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1178669929504395, + "rewards/margins": 4.858551979064941, + "rewards/rejected": -1.7406851053237915, + "step": 5061 + }, + { + "epoch": 1.27, + "grad_norm": 6.476648807525635, + "learning_rate": 5.868482384475887e-07, + "logits/chosen": -0.39700397849082947, + "logits/rejected": -0.445121169090271, + "logps/chosen": -57.42683029174805, + "logps/rejected": -66.46891021728516, + "loss": 0.9311, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.867766857147217, + "rewards/margins": 3.8415019512176514, + "rewards/rejected": -0.9737352132797241, + "step": 5062 + }, + { + "epoch": 1.27, + "grad_norm": 6.904998779296875, + "learning_rate": 5.85617388873741e-07, + "logits/chosen": -0.36988383531570435, + "logits/rejected": -0.463989794254303, + "logps/chosen": -58.03963088989258, + "logps/rejected": -79.63005065917969, + "loss": 0.9047, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.160893201828003, + "rewards/margins": 4.189446449279785, + "rewards/rejected": -1.0285530090332031, + "step": 5063 + }, + { + "epoch": 1.27, + "grad_norm": 4.864954471588135, + "learning_rate": 5.843877511431761e-07, + "logits/chosen": -0.37359750270843506, + "logits/rejected": -0.45910072326660156, + "logps/chosen": -56.88393020629883, + "logps/rejected": -96.9781494140625, + "loss": 0.6888, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9669911861419678, + "rewards/margins": 5.2714738845825195, + "rewards/rejected": -2.3044824600219727, + "step": 5064 + }, + { + "epoch": 1.27, + "grad_norm": 9.873608589172363, + "learning_rate": 5.831593255934548e-07, + "logits/chosen": -0.3459264636039734, + "logits/rejected": -0.45327886939048767, + "logps/chosen": -52.96380615234375, + "logps/rejected": -74.9490966796875, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9630627632141113, + "rewards/margins": 4.539118766784668, + "rewards/rejected": -1.5760560035705566, + "step": 5065 + }, + { + "epoch": 1.27, + "grad_norm": 3.7990665435791016, + "learning_rate": 5.819321125618066e-07, + "logits/chosen": -0.35156485438346863, + "logits/rejected": -0.46308889985084534, + "logps/chosen": -62.98789596557617, + "logps/rejected": -73.83564758300781, + "loss": 0.7967, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1250810623168945, + "rewards/margins": 4.905129432678223, + "rewards/rejected": -1.7800480127334595, + "step": 5066 + }, + { + "epoch": 1.27, + "grad_norm": 10.164665222167969, + "learning_rate": 5.807061123851276e-07, + "logits/chosen": -0.4020864963531494, + "logits/rejected": -0.47768235206604004, + "logps/chosen": -57.76136779785156, + "logps/rejected": -81.71930694580078, + "loss": 0.7988, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8511059284210205, + "rewards/margins": 4.410549163818359, + "rewards/rejected": -1.559443712234497, + "step": 5067 + }, + { + "epoch": 1.27, + "grad_norm": 8.62235164642334, + "learning_rate": 5.79481325399981e-07, + "logits/chosen": -0.4075813293457031, + "logits/rejected": -0.48769471049308777, + "logps/chosen": -56.067848205566406, + "logps/rejected": -79.33470153808594, + "loss": 0.8115, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6949379444122314, + "rewards/margins": 4.573419570922852, + "rewards/rejected": -1.8784818649291992, + "step": 5068 + }, + { + "epoch": 1.27, + "grad_norm": 22.059049606323242, + "learning_rate": 5.78257751942598e-07, + "logits/chosen": -0.37966108322143555, + "logits/rejected": -0.522458016872406, + "logps/chosen": -59.35048294067383, + "logps/rejected": -81.47169494628906, + "loss": 0.6453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7629876136779785, + "rewards/margins": 5.490415096282959, + "rewards/rejected": -2.7274274826049805, + "step": 5069 + }, + { + "epoch": 1.27, + "grad_norm": 20.37394142150879, + "learning_rate": 5.770353923488775e-07, + "logits/chosen": -0.31388968229293823, + "logits/rejected": -0.4648394286632538, + "logps/chosen": -53.55060958862305, + "logps/rejected": -76.21957397460938, + "loss": 0.7206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6109859943389893, + "rewards/margins": 5.597860336303711, + "rewards/rejected": -2.9868743419647217, + "step": 5070 + }, + { + "epoch": 1.27, + "grad_norm": 3.5941379070281982, + "learning_rate": 5.758142469543798e-07, + "logits/chosen": -0.4052177667617798, + "logits/rejected": -0.5217747092247009, + "logps/chosen": -56.33282470703125, + "logps/rejected": -83.89185333251953, + "loss": 0.6693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094611883163452, + "rewards/margins": 5.007189750671387, + "rewards/rejected": -1.9125781059265137, + "step": 5071 + }, + { + "epoch": 1.27, + "grad_norm": 10.039190292358398, + "learning_rate": 5.745943160943385e-07, + "logits/chosen": -0.3772275149822235, + "logits/rejected": -0.4258567690849304, + "logps/chosen": -69.27896881103516, + "logps/rejected": -74.90434265136719, + "loss": 0.9014, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7878220081329346, + "rewards/margins": 3.8869168758392334, + "rewards/rejected": -1.0990947484970093, + "step": 5072 + }, + { + "epoch": 1.27, + "grad_norm": 7.245729923248291, + "learning_rate": 5.7337560010365e-07, + "logits/chosen": -0.31902116537094116, + "logits/rejected": -0.43069082498550415, + "logps/chosen": -53.75050354003906, + "logps/rejected": -90.83433532714844, + "loss": 0.6648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8527889251708984, + "rewards/margins": 5.702209949493408, + "rewards/rejected": -2.8494210243225098, + "step": 5073 + }, + { + "epoch": 1.27, + "grad_norm": 5.5814290046691895, + "learning_rate": 5.721580993168791e-07, + "logits/chosen": -0.3708914816379547, + "logits/rejected": -0.5541551113128662, + "logps/chosen": -52.98991394042969, + "logps/rejected": -70.04963684082031, + "loss": 0.708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.791799783706665, + "rewards/margins": 5.63808536529541, + "rewards/rejected": -2.846285104751587, + "step": 5074 + }, + { + "epoch": 1.27, + "grad_norm": 5.735111236572266, + "learning_rate": 5.709418140682543e-07, + "logits/chosen": -0.4643818736076355, + "logits/rejected": -0.5706632137298584, + "logps/chosen": -53.93257522583008, + "logps/rejected": -83.12796020507812, + "loss": 0.841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7631616592407227, + "rewards/margins": 4.485880374908447, + "rewards/rejected": -1.7227187156677246, + "step": 5075 + }, + { + "epoch": 1.27, + "grad_norm": 12.339287757873535, + "learning_rate": 5.697267446916727e-07, + "logits/chosen": -0.3959370255470276, + "logits/rejected": -0.42889493703842163, + "logps/chosen": -62.28923416137695, + "logps/rejected": -92.76139831542969, + "loss": 0.776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.676379680633545, + "rewards/margins": 3.9925851821899414, + "rewards/rejected": -1.3162055015563965, + "step": 5076 + }, + { + "epoch": 1.27, + "grad_norm": 8.244569778442383, + "learning_rate": 5.685128915206977e-07, + "logits/chosen": -0.48397839069366455, + "logits/rejected": -0.6071943640708923, + "logps/chosen": -56.364654541015625, + "logps/rejected": -85.8891372680664, + "loss": 0.7057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0726089477539062, + "rewards/margins": 5.148382186889648, + "rewards/rejected": -2.075773239135742, + "step": 5077 + }, + { + "epoch": 1.27, + "grad_norm": 10.31885051727295, + "learning_rate": 5.673002548885587e-07, + "logits/chosen": -0.32140642404556274, + "logits/rejected": -0.41160181164741516, + "logps/chosen": -55.90325164794922, + "logps/rejected": -81.02996063232422, + "loss": 0.8047, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.787574052810669, + "rewards/margins": 4.896690368652344, + "rewards/rejected": -2.1091156005859375, + "step": 5078 + }, + { + "epoch": 1.27, + "grad_norm": 5.688926696777344, + "learning_rate": 5.660888351281496e-07, + "logits/chosen": -0.37554189562797546, + "logits/rejected": -0.4624122679233551, + "logps/chosen": -58.422828674316406, + "logps/rejected": -83.95136260986328, + "loss": 0.6763, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1116514205932617, + "rewards/margins": 5.405741214752197, + "rewards/rejected": -2.2940895557403564, + "step": 5079 + }, + { + "epoch": 1.27, + "grad_norm": 4.194715976715088, + "learning_rate": 5.648786325720313e-07, + "logits/chosen": -0.36523082852363586, + "logits/rejected": -0.477385938167572, + "logps/chosen": -60.9093017578125, + "logps/rejected": -77.66790771484375, + "loss": 0.7196, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8498756885528564, + "rewards/margins": 4.375047206878662, + "rewards/rejected": -1.5251716375350952, + "step": 5080 + }, + { + "epoch": 1.27, + "grad_norm": 5.045563220977783, + "learning_rate": 5.6366964755243e-07, + "logits/chosen": -0.4241042137145996, + "logits/rejected": -0.5182990431785583, + "logps/chosen": -62.29484176635742, + "logps/rejected": -67.1938247680664, + "loss": 0.8392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7175469398498535, + "rewards/margins": 4.015626430511475, + "rewards/rejected": -1.2980797290802002, + "step": 5081 + }, + { + "epoch": 1.27, + "grad_norm": 23.804651260375977, + "learning_rate": 5.624618804012405e-07, + "logits/chosen": -0.3151504397392273, + "logits/rejected": -0.4312942624092102, + "logps/chosen": -53.881690979003906, + "logps/rejected": -91.60206604003906, + "loss": 0.7434, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8409807682037354, + "rewards/margins": 5.525393486022949, + "rewards/rejected": -2.684412956237793, + "step": 5082 + }, + { + "epoch": 1.27, + "grad_norm": 2.454514503479004, + "learning_rate": 5.61255331450018e-07, + "logits/chosen": -0.3263508975505829, + "logits/rejected": -0.4269222319126129, + "logps/chosen": -50.39286804199219, + "logps/rejected": -90.30255126953125, + "loss": 0.5854, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.001668930053711, + "rewards/margins": 6.083361625671387, + "rewards/rejected": -3.0816924571990967, + "step": 5083 + }, + { + "epoch": 1.27, + "grad_norm": 3.083810329437256, + "learning_rate": 5.600500010299881e-07, + "logits/chosen": -0.3246400058269501, + "logits/rejected": -0.4037761688232422, + "logps/chosen": -48.07768249511719, + "logps/rejected": -79.66940307617188, + "loss": 0.6116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07136869430542, + "rewards/margins": 4.87390661239624, + "rewards/rejected": -1.8025381565093994, + "step": 5084 + }, + { + "epoch": 1.27, + "grad_norm": 7.780176639556885, + "learning_rate": 5.588458894720395e-07, + "logits/chosen": -0.3604282736778259, + "logits/rejected": -0.44191449880599976, + "logps/chosen": -52.45473861694336, + "logps/rejected": -82.75953674316406, + "loss": 0.7982, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8978514671325684, + "rewards/margins": 4.558319568634033, + "rewards/rejected": -1.6604678630828857, + "step": 5085 + }, + { + "epoch": 1.27, + "grad_norm": 4.282344341278076, + "learning_rate": 5.576429971067271e-07, + "logits/chosen": -0.3849268853664398, + "logits/rejected": -0.3912121057510376, + "logps/chosen": -54.55949020385742, + "logps/rejected": -99.11004638671875, + "loss": 0.7927, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.919316291809082, + "rewards/margins": 3.922628402709961, + "rewards/rejected": -1.003312110900879, + "step": 5086 + }, + { + "epoch": 1.27, + "grad_norm": 4.682913780212402, + "learning_rate": 5.564413242642702e-07, + "logits/chosen": -0.40169763565063477, + "logits/rejected": -0.49782511591911316, + "logps/chosen": -56.19945526123047, + "logps/rejected": -91.32360076904297, + "loss": 0.7531, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.732342481613159, + "rewards/margins": 5.029379844665527, + "rewards/rejected": -2.2970376014709473, + "step": 5087 + }, + { + "epoch": 1.27, + "grad_norm": 7.741748809814453, + "learning_rate": 5.552408712745533e-07, + "logits/chosen": -0.3912980556488037, + "logits/rejected": -0.4774187207221985, + "logps/chosen": -53.376956939697266, + "logps/rejected": -89.53372192382812, + "loss": 0.7927, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9102039337158203, + "rewards/margins": 5.47155237197876, + "rewards/rejected": -2.5613484382629395, + "step": 5088 + }, + { + "epoch": 1.27, + "grad_norm": 3.809401035308838, + "learning_rate": 5.540416384671283e-07, + "logits/chosen": -0.29950010776519775, + "logits/rejected": -0.39036524295806885, + "logps/chosen": -58.387168884277344, + "logps/rejected": -84.9977035522461, + "loss": 0.8343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.885233163833618, + "rewards/margins": 4.345493793487549, + "rewards/rejected": -1.4602605104446411, + "step": 5089 + }, + { + "epoch": 1.27, + "grad_norm": 8.145463943481445, + "learning_rate": 5.528436261712111e-07, + "logits/chosen": -0.369964599609375, + "logits/rejected": -0.44448840618133545, + "logps/chosen": -58.65337371826172, + "logps/rejected": -79.53216552734375, + "loss": 0.8083, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.728132724761963, + "rewards/margins": 4.426493167877197, + "rewards/rejected": -1.698360562324524, + "step": 5090 + }, + { + "epoch": 1.27, + "grad_norm": 5.335058212280273, + "learning_rate": 5.516468347156784e-07, + "logits/chosen": -0.24639129638671875, + "logits/rejected": -0.31202083826065063, + "logps/chosen": -54.61237335205078, + "logps/rejected": -88.41064453125, + "loss": 0.7402, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8563172817230225, + "rewards/margins": 4.47926664352417, + "rewards/rejected": -1.6229496002197266, + "step": 5091 + }, + { + "epoch": 1.27, + "grad_norm": 5.439029693603516, + "learning_rate": 5.504512644290788e-07, + "logits/chosen": -0.3238256573677063, + "logits/rejected": -0.3361476957798004, + "logps/chosen": -56.501487731933594, + "logps/rejected": -90.80278015136719, + "loss": 0.7867, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.949634075164795, + "rewards/margins": 4.22900390625, + "rewards/rejected": -1.2793700695037842, + "step": 5092 + }, + { + "epoch": 1.27, + "grad_norm": 8.518814086914062, + "learning_rate": 5.492569156396193e-07, + "logits/chosen": -0.3935639560222626, + "logits/rejected": -0.4717371463775635, + "logps/chosen": -55.74757385253906, + "logps/rejected": -74.29042053222656, + "loss": 1.0122, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0130813121795654, + "rewards/margins": 3.9923930168151855, + "rewards/rejected": -0.9793120622634888, + "step": 5093 + }, + { + "epoch": 1.27, + "grad_norm": 3.2863264083862305, + "learning_rate": 5.480637886751782e-07, + "logits/chosen": -0.3664272129535675, + "logits/rejected": -0.45819684863090515, + "logps/chosen": -54.392066955566406, + "logps/rejected": -87.15633392333984, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8934271335601807, + "rewards/margins": 4.597889423370361, + "rewards/rejected": -1.7044624090194702, + "step": 5094 + }, + { + "epoch": 1.27, + "grad_norm": 3.381821393966675, + "learning_rate": 5.468718838632914e-07, + "logits/chosen": -0.3550797700881958, + "logits/rejected": -0.48071610927581787, + "logps/chosen": -51.8260612487793, + "logps/rejected": -77.52118682861328, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7961151599884033, + "rewards/margins": 5.368925094604492, + "rewards/rejected": -2.572810173034668, + "step": 5095 + }, + { + "epoch": 1.27, + "grad_norm": 5.793088912963867, + "learning_rate": 5.456812015311624e-07, + "logits/chosen": -0.3379212021827698, + "logits/rejected": -0.43988561630249023, + "logps/chosen": -63.599178314208984, + "logps/rejected": -91.18819427490234, + "loss": 0.7108, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7548885345458984, + "rewards/margins": 5.099179267883301, + "rewards/rejected": -2.3442909717559814, + "step": 5096 + }, + { + "epoch": 1.28, + "grad_norm": 2.7006959915161133, + "learning_rate": 5.444917420056611e-07, + "logits/chosen": -0.33815068006515503, + "logits/rejected": -0.5226700305938721, + "logps/chosen": -60.653907775878906, + "logps/rejected": -90.94990539550781, + "loss": 0.6338, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7945921421051025, + "rewards/margins": 5.878945350646973, + "rewards/rejected": -3.084352970123291, + "step": 5097 + }, + { + "epoch": 1.28, + "grad_norm": 4.90519380569458, + "learning_rate": 5.43303505613319e-07, + "logits/chosen": -0.3969128131866455, + "logits/rejected": -0.464351087808609, + "logps/chosen": -56.77323532104492, + "logps/rejected": -98.92127227783203, + "loss": 0.7477, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.135427474975586, + "rewards/margins": 4.934382915496826, + "rewards/rejected": -1.798954963684082, + "step": 5098 + }, + { + "epoch": 1.28, + "grad_norm": 4.083364486694336, + "learning_rate": 5.421164926803324e-07, + "logits/chosen": -0.39998412132263184, + "logits/rejected": -0.5279462337493896, + "logps/chosen": -62.235992431640625, + "logps/rejected": -80.70521545410156, + "loss": 0.7014, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.822422504425049, + "rewards/margins": 4.948807716369629, + "rewards/rejected": -2.126385450363159, + "step": 5099 + }, + { + "epoch": 1.28, + "grad_norm": 4.882907390594482, + "learning_rate": 5.409307035325618e-07, + "logits/chosen": -0.3278733491897583, + "logits/rejected": -0.4298689663410187, + "logps/chosen": -53.86081314086914, + "logps/rejected": -82.66795349121094, + "loss": 0.7497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.077901840209961, + "rewards/margins": 4.901467323303223, + "rewards/rejected": -1.8235658407211304, + "step": 5100 + }, + { + "epoch": 1.28, + "grad_norm": 8.731693267822266, + "learning_rate": 5.397461384955316e-07, + "logits/chosen": -0.32014361023902893, + "logits/rejected": -0.41299423575401306, + "logps/chosen": -54.10306167602539, + "logps/rejected": -79.8639907836914, + "loss": 0.7521, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.755044937133789, + "rewards/margins": 4.866469383239746, + "rewards/rejected": -2.111424446105957, + "step": 5101 + }, + { + "epoch": 1.28, + "grad_norm": 5.835718154907227, + "learning_rate": 5.385627978944319e-07, + "logits/chosen": -0.40339910984039307, + "logits/rejected": -0.48242250084877014, + "logps/chosen": -55.20638656616211, + "logps/rejected": -94.14494323730469, + "loss": 0.7997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9635798931121826, + "rewards/margins": 4.4414472579956055, + "rewards/rejected": -1.4778672456741333, + "step": 5102 + }, + { + "epoch": 1.28, + "grad_norm": 5.367650032043457, + "learning_rate": 5.373806820541144e-07, + "logits/chosen": -0.25238725543022156, + "logits/rejected": -0.3159097731113434, + "logps/chosen": -46.55402755737305, + "logps/rejected": -96.35132598876953, + "loss": 0.7465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8571760654449463, + "rewards/margins": 5.271459579467773, + "rewards/rejected": -2.4142837524414062, + "step": 5103 + }, + { + "epoch": 1.28, + "grad_norm": 8.20213508605957, + "learning_rate": 5.361997912990957e-07, + "logits/chosen": -0.33100220561027527, + "logits/rejected": -0.4348837435245514, + "logps/chosen": -55.8446044921875, + "logps/rejected": -96.06170654296875, + "loss": 0.6409, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.821467161178589, + "rewards/margins": 5.753818511962891, + "rewards/rejected": -2.93235182762146, + "step": 5104 + }, + { + "epoch": 1.28, + "grad_norm": 6.02516508102417, + "learning_rate": 5.350201259535554e-07, + "logits/chosen": -0.34182560443878174, + "logits/rejected": -0.4411433935165405, + "logps/chosen": -69.35999298095703, + "logps/rejected": -86.73670959472656, + "loss": 0.7944, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1574501991271973, + "rewards/margins": 5.540595054626465, + "rewards/rejected": -2.3831446170806885, + "step": 5105 + }, + { + "epoch": 1.28, + "grad_norm": 5.692318916320801, + "learning_rate": 5.33841686341337e-07, + "logits/chosen": -0.2674539387226105, + "logits/rejected": -0.3700369894504547, + "logps/chosen": -70.36924743652344, + "logps/rejected": -88.08895111083984, + "loss": 0.6818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8485183715820312, + "rewards/margins": 5.2022294998168945, + "rewards/rejected": -2.3537111282348633, + "step": 5106 + }, + { + "epoch": 1.28, + "grad_norm": 4.716892242431641, + "learning_rate": 5.326644727859481e-07, + "logits/chosen": -0.330598920583725, + "logits/rejected": -0.4524077773094177, + "logps/chosen": -48.804786682128906, + "logps/rejected": -71.20634460449219, + "loss": 0.7601, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6083247661590576, + "rewards/margins": 4.106110572814941, + "rewards/rejected": -1.4977853298187256, + "step": 5107 + }, + { + "epoch": 1.28, + "grad_norm": 4.585634231567383, + "learning_rate": 5.314884856105573e-07, + "logits/chosen": -0.3140942454338074, + "logits/rejected": -0.4573155343532562, + "logps/chosen": -54.844688415527344, + "logps/rejected": -71.41065216064453, + "loss": 0.6223, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799983263015747, + "rewards/margins": 5.567503452301025, + "rewards/rejected": -2.7675209045410156, + "step": 5108 + }, + { + "epoch": 1.28, + "grad_norm": 6.606816291809082, + "learning_rate": 5.303137251380019e-07, + "logits/chosen": -0.4188433289527893, + "logits/rejected": -0.49794918298721313, + "logps/chosen": -67.63922119140625, + "logps/rejected": -75.58551025390625, + "loss": 0.9472, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.740553855895996, + "rewards/margins": 4.244777679443359, + "rewards/rejected": -1.5042238235473633, + "step": 5109 + }, + { + "epoch": 1.28, + "grad_norm": 3.7398555278778076, + "learning_rate": 5.291401916907768e-07, + "logits/chosen": -0.45530229806900024, + "logits/rejected": -0.48376572132110596, + "logps/chosen": -70.10659790039062, + "logps/rejected": -86.8259048461914, + "loss": 0.774, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.189110279083252, + "rewards/margins": 4.791748046875, + "rewards/rejected": -1.6026380062103271, + "step": 5110 + }, + { + "epoch": 1.28, + "grad_norm": 4.549030303955078, + "learning_rate": 5.279678855910425e-07, + "logits/chosen": -0.3493797779083252, + "logits/rejected": -0.445477157831192, + "logps/chosen": -64.51219940185547, + "logps/rejected": -84.23220825195312, + "loss": 0.8309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8101587295532227, + "rewards/margins": 5.345706939697266, + "rewards/rejected": -2.535548448562622, + "step": 5111 + }, + { + "epoch": 1.28, + "grad_norm": 5.910440921783447, + "learning_rate": 5.267968071606222e-07, + "logits/chosen": -0.3259269893169403, + "logits/rejected": -0.47671279311180115, + "logps/chosen": -67.22196197509766, + "logps/rejected": -76.79330444335938, + "loss": 0.6908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.74629545211792, + "rewards/margins": 4.942385673522949, + "rewards/rejected": -2.1960904598236084, + "step": 5112 + }, + { + "epoch": 1.28, + "grad_norm": 5.887313365936279, + "learning_rate": 5.25626956721002e-07, + "logits/chosen": -0.2682199478149414, + "logits/rejected": -0.35998207330703735, + "logps/chosen": -60.78691482543945, + "logps/rejected": -88.82408905029297, + "loss": 0.9155, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7695138454437256, + "rewards/margins": 4.153080940246582, + "rewards/rejected": -1.3835666179656982, + "step": 5113 + }, + { + "epoch": 1.28, + "grad_norm": 2.22367262840271, + "learning_rate": 5.244583345933324e-07, + "logits/chosen": -0.39684975147247314, + "logits/rejected": -0.5565106868743896, + "logps/chosen": -46.655548095703125, + "logps/rejected": -79.40902709960938, + "loss": 0.5444, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1769683361053467, + "rewards/margins": 6.597306251525879, + "rewards/rejected": -3.4203379154205322, + "step": 5114 + }, + { + "epoch": 1.28, + "grad_norm": 14.772944450378418, + "learning_rate": 5.232909410984249e-07, + "logits/chosen": -0.29288414120674133, + "logits/rejected": -0.4521718919277191, + "logps/chosen": -67.98051452636719, + "logps/rejected": -87.1889419555664, + "loss": 0.7553, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.907337188720703, + "rewards/margins": 6.161992073059082, + "rewards/rejected": -3.2546546459198, + "step": 5115 + }, + { + "epoch": 1.28, + "grad_norm": 12.21035099029541, + "learning_rate": 5.221247765567528e-07, + "logits/chosen": -0.4048043191432953, + "logits/rejected": -0.5043655037879944, + "logps/chosen": -61.32582092285156, + "logps/rejected": -85.36968994140625, + "loss": 0.7055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9128246307373047, + "rewards/margins": 5.0625739097595215, + "rewards/rejected": -2.149749517440796, + "step": 5116 + }, + { + "epoch": 1.28, + "grad_norm": 2.934696674346924, + "learning_rate": 5.209598412884548e-07, + "logits/chosen": -0.32289382815361023, + "logits/rejected": -0.4501822292804718, + "logps/chosen": -50.041236877441406, + "logps/rejected": -80.31548309326172, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0973329544067383, + "rewards/margins": 4.642632961273193, + "rewards/rejected": -1.545299768447876, + "step": 5117 + }, + { + "epoch": 1.28, + "grad_norm": 17.649723052978516, + "learning_rate": 5.197961356133308e-07, + "logits/chosen": -0.37977802753448486, + "logits/rejected": -0.43806374073028564, + "logps/chosen": -60.35443115234375, + "logps/rejected": -83.2947006225586, + "loss": 0.8049, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7485718727111816, + "rewards/margins": 4.3326287269592285, + "rewards/rejected": -1.5840568542480469, + "step": 5118 + }, + { + "epoch": 1.28, + "grad_norm": 12.982826232910156, + "learning_rate": 5.186336598508424e-07, + "logits/chosen": -0.3273145854473114, + "logits/rejected": -0.4271136522293091, + "logps/chosen": -52.80705261230469, + "logps/rejected": -80.5450210571289, + "loss": 0.654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.856055498123169, + "rewards/margins": 4.475525379180908, + "rewards/rejected": -1.6194701194763184, + "step": 5119 + }, + { + "epoch": 1.28, + "grad_norm": 4.360102653503418, + "learning_rate": 5.174724143201148e-07, + "logits/chosen": -0.2994976043701172, + "logits/rejected": -0.44819381833076477, + "logps/chosen": -62.76222610473633, + "logps/rejected": -78.79627990722656, + "loss": 0.7299, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7532172203063965, + "rewards/margins": 5.361710548400879, + "rewards/rejected": -2.6084933280944824, + "step": 5120 + }, + { + "epoch": 1.28, + "grad_norm": 7.250366687774658, + "learning_rate": 5.163123993399338e-07, + "logits/chosen": -0.3050612807273865, + "logits/rejected": -0.37907278537750244, + "logps/chosen": -60.44881820678711, + "logps/rejected": -86.91007232666016, + "loss": 0.7593, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.807082414627075, + "rewards/margins": 4.703319549560547, + "rewards/rejected": -1.8962371349334717, + "step": 5121 + }, + { + "epoch": 1.28, + "grad_norm": 4.813229560852051, + "learning_rate": 5.151536152287506e-07, + "logits/chosen": -0.3348495364189148, + "logits/rejected": -0.4697803258895874, + "logps/chosen": -69.03250122070312, + "logps/rejected": -83.1904296875, + "loss": 0.727, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.829881191253662, + "rewards/margins": 4.606681823730469, + "rewards/rejected": -1.776800513267517, + "step": 5122 + }, + { + "epoch": 1.28, + "grad_norm": 4.226556301116943, + "learning_rate": 5.139960623046753e-07, + "logits/chosen": -0.3006872236728668, + "logits/rejected": -0.41743162274360657, + "logps/chosen": -57.50868606567383, + "logps/rejected": -83.84373474121094, + "loss": 0.6749, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.982638359069824, + "rewards/margins": 5.769371032714844, + "rewards/rejected": -2.7867331504821777, + "step": 5123 + }, + { + "epoch": 1.28, + "grad_norm": 3.8134491443634033, + "learning_rate": 5.128397408854813e-07, + "logits/chosen": -0.304951936006546, + "logits/rejected": -0.3875003755092621, + "logps/chosen": -55.94932174682617, + "logps/rejected": -85.42457580566406, + "loss": 0.6886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7159152030944824, + "rewards/margins": 4.420577049255371, + "rewards/rejected": -1.7046624422073364, + "step": 5124 + }, + { + "epoch": 1.28, + "grad_norm": 5.301548480987549, + "learning_rate": 5.116846512886036e-07, + "logits/chosen": -0.37072157859802246, + "logits/rejected": -0.4659609794616699, + "logps/chosen": -56.71929168701172, + "logps/rejected": -113.33574676513672, + "loss": 0.6458, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.671762704849243, + "rewards/margins": 6.28984260559082, + "rewards/rejected": -3.618079662322998, + "step": 5125 + }, + { + "epoch": 1.28, + "grad_norm": 9.662677764892578, + "learning_rate": 5.105307938311382e-07, + "logits/chosen": -0.2652084529399872, + "logits/rejected": -0.4467097520828247, + "logps/chosen": -58.92246627807617, + "logps/rejected": -77.79146575927734, + "loss": 0.7004, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.101576566696167, + "rewards/margins": 5.093051433563232, + "rewards/rejected": -1.991474986076355, + "step": 5126 + }, + { + "epoch": 1.28, + "grad_norm": 5.3227949142456055, + "learning_rate": 5.093781688298471e-07, + "logits/chosen": -0.39607954025268555, + "logits/rejected": -0.5043860673904419, + "logps/chosen": -54.26152801513672, + "logps/rejected": -77.1933822631836, + "loss": 0.6894, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.857215642929077, + "rewards/margins": 4.393181324005127, + "rewards/rejected": -1.5359654426574707, + "step": 5127 + }, + { + "epoch": 1.28, + "grad_norm": 4.577944278717041, + "learning_rate": 5.082267766011467e-07, + "logits/chosen": -0.37552696466445923, + "logits/rejected": -0.48823654651641846, + "logps/chosen": -57.783599853515625, + "logps/rejected": -77.88356018066406, + "loss": 0.6826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9829328060150146, + "rewards/margins": 4.811173915863037, + "rewards/rejected": -1.828240990638733, + "step": 5128 + }, + { + "epoch": 1.28, + "grad_norm": 4.661914825439453, + "learning_rate": 5.07076617461122e-07, + "logits/chosen": -0.31966036558151245, + "logits/rejected": -0.42083561420440674, + "logps/chosen": -64.82957458496094, + "logps/rejected": -86.65679168701172, + "loss": 0.6868, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.693128824234009, + "rewards/margins": 4.192193031311035, + "rewards/rejected": -1.4990642070770264, + "step": 5129 + }, + { + "epoch": 1.28, + "grad_norm": 2.500201940536499, + "learning_rate": 5.059276917255151e-07, + "logits/chosen": -0.31581956148147583, + "logits/rejected": -0.4216792583465576, + "logps/chosen": -50.99125289916992, + "logps/rejected": -77.05878448486328, + "loss": 0.6015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0434410572052, + "rewards/margins": 5.748713493347168, + "rewards/rejected": -2.7052724361419678, + "step": 5130 + }, + { + "epoch": 1.28, + "grad_norm": 6.555820465087891, + "learning_rate": 5.047799997097319e-07, + "logits/chosen": -0.386175274848938, + "logits/rejected": -0.4244804382324219, + "logps/chosen": -53.96437454223633, + "logps/rejected": -102.49214172363281, + "loss": 0.7242, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.166285276412964, + "rewards/margins": 5.401301860809326, + "rewards/rejected": -2.2350165843963623, + "step": 5131 + }, + { + "epoch": 1.28, + "grad_norm": 5.734583854675293, + "learning_rate": 5.036335417288374e-07, + "logits/chosen": -0.33660584688186646, + "logits/rejected": -0.4257814586162567, + "logps/chosen": -62.16843032836914, + "logps/rejected": -77.36907958984375, + "loss": 0.7495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0188698768615723, + "rewards/margins": 4.6006646156311035, + "rewards/rejected": -1.5817952156066895, + "step": 5132 + }, + { + "epoch": 1.28, + "grad_norm": 3.54853892326355, + "learning_rate": 5.024883180975587e-07, + "logits/chosen": -0.3849106729030609, + "logits/rejected": -0.4659768044948578, + "logps/chosen": -60.4799690246582, + "logps/rejected": -84.63463592529297, + "loss": 0.7485, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.752094268798828, + "rewards/margins": 4.7483062744140625, + "rewards/rejected": -1.996212124824524, + "step": 5133 + }, + { + "epoch": 1.28, + "grad_norm": 5.376000881195068, + "learning_rate": 5.013443291302866e-07, + "logits/chosen": -0.3150385916233063, + "logits/rejected": -0.40450629591941833, + "logps/chosen": -74.31749725341797, + "logps/rejected": -106.1139144897461, + "loss": 0.7686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.020432949066162, + "rewards/margins": 5.210084438323975, + "rewards/rejected": -2.1896517276763916, + "step": 5134 + }, + { + "epoch": 1.28, + "grad_norm": 4.0578107833862305, + "learning_rate": 5.002015751410705e-07, + "logits/chosen": -0.3246261775493622, + "logits/rejected": -0.46014899015426636, + "logps/chosen": -68.36634063720703, + "logps/rejected": -83.97394561767578, + "loss": 0.6557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095752000808716, + "rewards/margins": 4.592886447906494, + "rewards/rejected": -1.4971344470977783, + "step": 5135 + }, + { + "epoch": 1.28, + "grad_norm": 2.6665189266204834, + "learning_rate": 4.990600564436177e-07, + "logits/chosen": -0.36720961332321167, + "logits/rejected": -0.45020169019699097, + "logps/chosen": -54.064361572265625, + "logps/rejected": -77.54900360107422, + "loss": 0.6391, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.151768684387207, + "rewards/margins": 5.025059700012207, + "rewards/rejected": -1.8732905387878418, + "step": 5136 + }, + { + "epoch": 1.29, + "grad_norm": 6.844671249389648, + "learning_rate": 4.979197733513031e-07, + "logits/chosen": -0.36916646361351013, + "logits/rejected": -0.4205648899078369, + "logps/chosen": -52.343936920166016, + "logps/rejected": -86.01612854003906, + "loss": 0.7816, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.997487783432007, + "rewards/margins": 3.532846689224243, + "rewards/rejected": -0.5353592038154602, + "step": 5137 + }, + { + "epoch": 1.29, + "grad_norm": 4.956679821014404, + "learning_rate": 4.967807261771567e-07, + "logits/chosen": -0.3377852737903595, + "logits/rejected": -0.48170778155326843, + "logps/chosen": -60.23624038696289, + "logps/rejected": -68.71319580078125, + "loss": 0.7775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0031144618988037, + "rewards/margins": 5.128610610961914, + "rewards/rejected": -2.1254961490631104, + "step": 5138 + }, + { + "epoch": 1.29, + "grad_norm": 5.156439781188965, + "learning_rate": 4.956429152338754e-07, + "logits/chosen": -0.30705204606056213, + "logits/rejected": -0.4001641869544983, + "logps/chosen": -62.984336853027344, + "logps/rejected": -79.89143371582031, + "loss": 0.8068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.838109254837036, + "rewards/margins": 4.584456920623779, + "rewards/rejected": -1.7463475465774536, + "step": 5139 + }, + { + "epoch": 1.29, + "grad_norm": 15.921899795532227, + "learning_rate": 4.945063408338085e-07, + "logits/chosen": -0.4647994637489319, + "logits/rejected": -0.5456256866455078, + "logps/chosen": -54.75693130493164, + "logps/rejected": -95.28911590576172, + "loss": 0.8288, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983830451965332, + "rewards/margins": 5.277461528778076, + "rewards/rejected": -2.293630599975586, + "step": 5140 + }, + { + "epoch": 1.29, + "grad_norm": 4.895979404449463, + "learning_rate": 4.933710032889716e-07, + "logits/chosen": -0.3633232116699219, + "logits/rejected": -0.5105503797531128, + "logps/chosen": -57.590606689453125, + "logps/rejected": -73.88944244384766, + "loss": 0.676, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8958771228790283, + "rewards/margins": 4.548366546630859, + "rewards/rejected": -1.6524896621704102, + "step": 5141 + }, + { + "epoch": 1.29, + "grad_norm": 4.107389450073242, + "learning_rate": 4.922369029110407e-07, + "logits/chosen": -0.34842199087142944, + "logits/rejected": -0.4489109218120575, + "logps/chosen": -52.531036376953125, + "logps/rejected": -95.54600524902344, + "loss": 0.6799, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991346836090088, + "rewards/margins": 5.118487358093262, + "rewards/rejected": -2.1271398067474365, + "step": 5142 + }, + { + "epoch": 1.29, + "grad_norm": 3.922394037246704, + "learning_rate": 4.911040400113493e-07, + "logits/chosen": -0.359676331281662, + "logits/rejected": -0.5032122135162354, + "logps/chosen": -63.1777229309082, + "logps/rejected": -69.74857330322266, + "loss": 0.7572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.732372283935547, + "rewards/margins": 4.2069525718688965, + "rewards/rejected": -1.47458016872406, + "step": 5143 + }, + { + "epoch": 1.29, + "grad_norm": 5.946495532989502, + "learning_rate": 4.899724149008933e-07, + "logits/chosen": -0.3690073788166046, + "logits/rejected": -0.43613946437835693, + "logps/chosen": -58.481109619140625, + "logps/rejected": -91.76790618896484, + "loss": 0.7836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.761451482772827, + "rewards/margins": 3.9866795539855957, + "rewards/rejected": -1.225227952003479, + "step": 5144 + }, + { + "epoch": 1.29, + "grad_norm": 5.141554355621338, + "learning_rate": 4.888420278903283e-07, + "logits/chosen": -0.415924608707428, + "logits/rejected": -0.5322145223617554, + "logps/chosen": -48.68255615234375, + "logps/rejected": -86.09877014160156, + "loss": 0.7368, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8188347816467285, + "rewards/margins": 5.672369480133057, + "rewards/rejected": -2.853534698486328, + "step": 5145 + }, + { + "epoch": 1.29, + "grad_norm": 10.998686790466309, + "learning_rate": 4.877128792899688e-07, + "logits/chosen": -0.3068518340587616, + "logits/rejected": -0.3786168694496155, + "logps/chosen": -55.84593963623047, + "logps/rejected": -94.64134216308594, + "loss": 0.8511, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.420370578765869, + "rewards/margins": 5.373406410217285, + "rewards/rejected": -2.953035831451416, + "step": 5146 + }, + { + "epoch": 1.29, + "grad_norm": 3.595175266265869, + "learning_rate": 4.865849694097935e-07, + "logits/chosen": -0.3187118172645569, + "logits/rejected": -0.40344831347465515, + "logps/chosen": -59.952919006347656, + "logps/rejected": -95.3114242553711, + "loss": 0.6829, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.71886944770813, + "rewards/margins": 4.910573959350586, + "rewards/rejected": -2.191704750061035, + "step": 5147 + }, + { + "epoch": 1.29, + "grad_norm": 8.243171691894531, + "learning_rate": 4.854582985594336e-07, + "logits/chosen": -0.39031243324279785, + "logits/rejected": -0.43341493606567383, + "logps/chosen": -57.4112548828125, + "logps/rejected": -91.24244689941406, + "loss": 0.9268, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8967413902282715, + "rewards/margins": 3.378352642059326, + "rewards/rejected": -0.4816112220287323, + "step": 5148 + }, + { + "epoch": 1.29, + "grad_norm": 7.993181228637695, + "learning_rate": 4.843328670481878e-07, + "logits/chosen": -0.3255048990249634, + "logits/rejected": -0.44953715801239014, + "logps/chosen": -61.88037109375, + "logps/rejected": -89.27510070800781, + "loss": 0.7059, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7937800884246826, + "rewards/margins": 5.350288391113281, + "rewards/rejected": -2.5565080642700195, + "step": 5149 + }, + { + "epoch": 1.29, + "grad_norm": 3.8725597858428955, + "learning_rate": 4.832086751850106e-07, + "logits/chosen": -0.37473776936531067, + "logits/rejected": -0.3990907371044159, + "logps/chosen": -50.536094665527344, + "logps/rejected": -89.6852035522461, + "loss": 0.6773, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9518299102783203, + "rewards/margins": 4.176886558532715, + "rewards/rejected": -1.2250568866729736, + "step": 5150 + }, + { + "epoch": 1.29, + "grad_norm": 11.41518783569336, + "learning_rate": 4.82085723278517e-07, + "logits/chosen": -0.37651196122169495, + "logits/rejected": -0.5029892325401306, + "logps/chosen": -53.4362907409668, + "logps/rejected": -90.85091400146484, + "loss": 0.7457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.840108633041382, + "rewards/margins": 5.027673721313477, + "rewards/rejected": -2.1875648498535156, + "step": 5151 + }, + { + "epoch": 1.29, + "grad_norm": 4.968575954437256, + "learning_rate": 4.809640116369807e-07, + "logits/chosen": -0.37194472551345825, + "logits/rejected": -0.46649792790412903, + "logps/chosen": -57.779327392578125, + "logps/rejected": -79.48872375488281, + "loss": 0.757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7701570987701416, + "rewards/margins": 4.003472805023193, + "rewards/rejected": -1.2333158254623413, + "step": 5152 + }, + { + "epoch": 1.29, + "grad_norm": 2.952219009399414, + "learning_rate": 4.79843540568336e-07, + "logits/chosen": -0.36823785305023193, + "logits/rejected": -0.4752124845981598, + "logps/chosen": -49.21541213989258, + "logps/rejected": -81.1750717163086, + "loss": 0.6572, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.154261827468872, + "rewards/margins": 5.715460777282715, + "rewards/rejected": -2.5611989498138428, + "step": 5153 + }, + { + "epoch": 1.29, + "grad_norm": 8.561806678771973, + "learning_rate": 4.787243103801769e-07, + "logits/chosen": -0.3209904730319977, + "logits/rejected": -0.4351322650909424, + "logps/chosen": -56.60993194580078, + "logps/rejected": -67.29108428955078, + "loss": 0.9663, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7402055263519287, + "rewards/margins": 3.2719600200653076, + "rewards/rejected": -0.531754732131958, + "step": 5154 + }, + { + "epoch": 1.29, + "grad_norm": 6.199527263641357, + "learning_rate": 4.776063213797566e-07, + "logits/chosen": -0.3472440838813782, + "logits/rejected": -0.45279157161712646, + "logps/chosen": -54.69487762451172, + "logps/rejected": -85.3879623413086, + "loss": 0.6292, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9465253353118896, + "rewards/margins": 5.388904571533203, + "rewards/rejected": -2.4423794746398926, + "step": 5155 + }, + { + "epoch": 1.29, + "grad_norm": 6.885123252868652, + "learning_rate": 4.7648957387398663e-07, + "logits/chosen": -0.3744575083255768, + "logits/rejected": -0.4431535005569458, + "logps/chosen": -73.75039672851562, + "logps/rejected": -101.91575622558594, + "loss": 0.7917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.937680244445801, + "rewards/margins": 5.177633285522461, + "rewards/rejected": -2.2399532794952393, + "step": 5156 + }, + { + "epoch": 1.29, + "grad_norm": 6.1608567237854, + "learning_rate": 4.7537406816943797e-07, + "logits/chosen": -0.42335718870162964, + "logits/rejected": -0.45744627714157104, + "logps/chosen": -53.93259811401367, + "logps/rejected": -94.10285186767578, + "loss": 0.7527, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.819746971130371, + "rewards/margins": 4.534064292907715, + "rewards/rejected": -1.714317798614502, + "step": 5157 + }, + { + "epoch": 1.29, + "grad_norm": 3.3462109565734863, + "learning_rate": 4.742598045723407e-07, + "logits/chosen": -0.3615768253803253, + "logits/rejected": -0.5193865299224854, + "logps/chosen": -47.696044921875, + "logps/rejected": -78.6171646118164, + "loss": 0.6231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3942785263061523, + "rewards/margins": 5.709155082702637, + "rewards/rejected": -2.3148763179779053, + "step": 5158 + }, + { + "epoch": 1.29, + "grad_norm": 5.824495792388916, + "learning_rate": 4.7314678338858664e-07, + "logits/chosen": -0.37595808506011963, + "logits/rejected": -0.47204118967056274, + "logps/chosen": -59.988037109375, + "logps/rejected": -85.96504211425781, + "loss": 0.6364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.096095323562622, + "rewards/margins": 5.330062389373779, + "rewards/rejected": -2.2339670658111572, + "step": 5159 + }, + { + "epoch": 1.29, + "grad_norm": 2.7806613445281982, + "learning_rate": 4.72035004923721e-07, + "logits/chosen": -0.35969528555870056, + "logits/rejected": -0.40890389680862427, + "logps/chosen": -48.54369354248047, + "logps/rejected": -97.01173400878906, + "loss": 0.5698, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0180249214172363, + "rewards/margins": 4.6431097984313965, + "rewards/rejected": -1.625084638595581, + "step": 5160 + }, + { + "epoch": 1.29, + "grad_norm": 2.210972785949707, + "learning_rate": 4.7092446948295177e-07, + "logits/chosen": -0.338714599609375, + "logits/rejected": -0.4484131634235382, + "logps/chosen": -50.42098617553711, + "logps/rejected": -97.50213623046875, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.221264123916626, + "rewards/margins": 5.4077911376953125, + "rewards/rejected": -2.1865265369415283, + "step": 5161 + }, + { + "epoch": 1.29, + "grad_norm": 4.9437408447265625, + "learning_rate": 4.6981517737114626e-07, + "logits/chosen": -0.29761889576911926, + "logits/rejected": -0.39244675636291504, + "logps/chosen": -59.992393493652344, + "logps/rejected": -73.90572357177734, + "loss": 0.7501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.946458578109741, + "rewards/margins": 3.9992706775665283, + "rewards/rejected": -1.052811861038208, + "step": 5162 + }, + { + "epoch": 1.29, + "grad_norm": 6.628934383392334, + "learning_rate": 4.6870712889282797e-07, + "logits/chosen": -0.35716357827186584, + "logits/rejected": -0.42801225185394287, + "logps/chosen": -52.519432067871094, + "logps/rejected": -89.6954345703125, + "loss": 0.7153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7831132411956787, + "rewards/margins": 5.407515525817871, + "rewards/rejected": -2.624401807785034, + "step": 5163 + }, + { + "epoch": 1.29, + "grad_norm": 21.54500961303711, + "learning_rate": 4.6760032435218094e-07, + "logits/chosen": -0.3357308804988861, + "logits/rejected": -0.3832096457481384, + "logps/chosen": -61.300201416015625, + "logps/rejected": -95.56982421875, + "loss": 0.8561, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7135987281799316, + "rewards/margins": 5.385103225708008, + "rewards/rejected": -2.671504497528076, + "step": 5164 + }, + { + "epoch": 1.29, + "grad_norm": 5.894099235534668, + "learning_rate": 4.6649476405304593e-07, + "logits/chosen": -0.30398115515708923, + "logits/rejected": -0.41939595341682434, + "logps/chosen": -67.26586151123047, + "logps/rejected": -89.66065216064453, + "loss": 0.7794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9091978073120117, + "rewards/margins": 4.502931594848633, + "rewards/rejected": -1.593733787536621, + "step": 5165 + }, + { + "epoch": 1.29, + "grad_norm": 4.42645263671875, + "learning_rate": 4.653904482989224e-07, + "logits/chosen": -0.32408636808395386, + "logits/rejected": -0.41099411249160767, + "logps/chosen": -54.90045928955078, + "logps/rejected": -83.44954681396484, + "loss": 0.6809, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.765561819076538, + "rewards/margins": 4.417758941650391, + "rewards/rejected": -1.6521966457366943, + "step": 5166 + }, + { + "epoch": 1.29, + "grad_norm": 4.872594833374023, + "learning_rate": 4.6428737739297125e-07, + "logits/chosen": -0.33067673444747925, + "logits/rejected": -0.472009539604187, + "logps/chosen": -64.4461898803711, + "logps/rejected": -84.79407501220703, + "loss": 0.7618, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9036026000976562, + "rewards/margins": 4.909573554992676, + "rewards/rejected": -2.0059709548950195, + "step": 5167 + }, + { + "epoch": 1.29, + "grad_norm": 2.9002790451049805, + "learning_rate": 4.6318555163800794e-07, + "logits/chosen": -0.37114328145980835, + "logits/rejected": -0.4728548228740692, + "logps/chosen": -55.571964263916016, + "logps/rejected": -69.21648406982422, + "loss": 0.6471, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.966571569442749, + "rewards/margins": 4.939007759094238, + "rewards/rejected": -1.9724369049072266, + "step": 5168 + }, + { + "epoch": 1.29, + "grad_norm": 4.735360622406006, + "learning_rate": 4.6208497133650777e-07, + "logits/chosen": -0.37212270498275757, + "logits/rejected": -0.40374240279197693, + "logps/chosen": -56.2816047668457, + "logps/rejected": -90.5425033569336, + "loss": 0.7271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.112450361251831, + "rewards/margins": 5.0593037605285645, + "rewards/rejected": -1.9468536376953125, + "step": 5169 + }, + { + "epoch": 1.29, + "grad_norm": 6.619636058807373, + "learning_rate": 4.609856367906029e-07, + "logits/chosen": -0.3556964099407196, + "logits/rejected": -0.4513574242591858, + "logps/chosen": -64.90925598144531, + "logps/rejected": -97.39898681640625, + "loss": 0.8454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.595517873764038, + "rewards/margins": 4.646531105041504, + "rewards/rejected": -2.0510129928588867, + "step": 5170 + }, + { + "epoch": 1.29, + "grad_norm": 4.58717155456543, + "learning_rate": 4.598875483020848e-07, + "logits/chosen": -0.3973188102245331, + "logits/rejected": -0.47070616483688354, + "logps/chosen": -50.77970504760742, + "logps/rejected": -69.98809814453125, + "loss": 0.7546, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.841686964035034, + "rewards/margins": 4.19487190246582, + "rewards/rejected": -1.3531855344772339, + "step": 5171 + }, + { + "epoch": 1.29, + "grad_norm": 6.644770622253418, + "learning_rate": 4.587907061724034e-07, + "logits/chosen": -0.3088829219341278, + "logits/rejected": -0.4187655746936798, + "logps/chosen": -55.150306701660156, + "logps/rejected": -94.43892669677734, + "loss": 0.6479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.976933479309082, + "rewards/margins": 5.051620960235596, + "rewards/rejected": -2.0746872425079346, + "step": 5172 + }, + { + "epoch": 1.29, + "grad_norm": 4.346111297607422, + "learning_rate": 4.576951107026628e-07, + "logits/chosen": -0.283194363117218, + "logits/rejected": -0.37600454688072205, + "logps/chosen": -57.46229553222656, + "logps/rejected": -93.10525512695312, + "loss": 0.7085, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7963600158691406, + "rewards/margins": 5.320036888122559, + "rewards/rejected": -2.523676872253418, + "step": 5173 + }, + { + "epoch": 1.29, + "grad_norm": 4.879588603973389, + "learning_rate": 4.5660076219363137e-07, + "logits/chosen": -0.3337506055831909, + "logits/rejected": -0.43552395701408386, + "logps/chosen": -58.78973388671875, + "logps/rejected": -83.92615509033203, + "loss": 0.7985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.738579750061035, + "rewards/margins": 4.354875087738037, + "rewards/rejected": -1.6162959337234497, + "step": 5174 + }, + { + "epoch": 1.29, + "grad_norm": 12.255188941955566, + "learning_rate": 4.555076609457287e-07, + "logits/chosen": -0.39958786964416504, + "logits/rejected": -0.49884021282196045, + "logps/chosen": -54.68630599975586, + "logps/rejected": -73.4307861328125, + "loss": 0.8064, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983468532562256, + "rewards/margins": 4.4552321434021, + "rewards/rejected": -1.471764087677002, + "step": 5175 + }, + { + "epoch": 1.29, + "grad_norm": 2.681035280227661, + "learning_rate": 4.54415807259036e-07, + "logits/chosen": -0.3800699710845947, + "logits/rejected": -0.5245198011398315, + "logps/chosen": -53.71977233886719, + "logps/rejected": -81.52656555175781, + "loss": 0.6594, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1158978939056396, + "rewards/margins": 5.413066864013672, + "rewards/rejected": -2.2971689701080322, + "step": 5176 + }, + { + "epoch": 1.3, + "grad_norm": 2.46193265914917, + "learning_rate": 4.533252014332901e-07, + "logits/chosen": -0.36480510234832764, + "logits/rejected": -0.48349347710609436, + "logps/chosen": -54.52122497558594, + "logps/rejected": -95.22364807128906, + "loss": 0.6482, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7990024089813232, + "rewards/margins": 5.723880290985107, + "rewards/rejected": -2.9248781204223633, + "step": 5177 + }, + { + "epoch": 1.3, + "grad_norm": 3.6645848751068115, + "learning_rate": 4.5223584376788533e-07, + "logits/chosen": -0.3286789059638977, + "logits/rejected": -0.4370470643043518, + "logps/chosen": -60.47550964355469, + "logps/rejected": -86.49343872070312, + "loss": 0.7062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2818973064422607, + "rewards/margins": 5.921657562255859, + "rewards/rejected": -2.639760732650757, + "step": 5178 + }, + { + "epoch": 1.3, + "grad_norm": 8.673826217651367, + "learning_rate": 4.5114773456187475e-07, + "logits/chosen": -0.39328712224960327, + "logits/rejected": -0.4656897485256195, + "logps/chosen": -47.64912796020508, + "logps/rejected": -72.35424041748047, + "loss": 0.6366, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9225523471832275, + "rewards/margins": 4.738393306732178, + "rewards/rejected": -1.815840482711792, + "step": 5179 + }, + { + "epoch": 1.3, + "grad_norm": 3.220928430557251, + "learning_rate": 4.500608741139689e-07, + "logits/chosen": -0.3749008774757385, + "logits/rejected": -0.5092122554779053, + "logps/chosen": -75.22891235351562, + "logps/rejected": -76.59764099121094, + "loss": 0.7638, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7123656272888184, + "rewards/margins": 4.487480640411377, + "rewards/rejected": -1.775114893913269, + "step": 5180 + }, + { + "epoch": 1.3, + "grad_norm": 4.924554824829102, + "learning_rate": 4.4897526272253076e-07, + "logits/chosen": -0.34198763966560364, + "logits/rejected": -0.4174019992351532, + "logps/chosen": -61.02668380737305, + "logps/rejected": -92.69864654541016, + "loss": 0.8104, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1722609996795654, + "rewards/margins": 5.310681343078613, + "rewards/rejected": -2.138420343399048, + "step": 5181 + }, + { + "epoch": 1.3, + "grad_norm": 5.5151166915893555, + "learning_rate": 4.478909006855875e-07, + "logits/chosen": -0.3289092779159546, + "logits/rejected": -0.430946409702301, + "logps/chosen": -61.98086166381836, + "logps/rejected": -82.34905242919922, + "loss": 0.7453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6121158599853516, + "rewards/margins": 4.659772872924805, + "rewards/rejected": -2.047657012939453, + "step": 5182 + }, + { + "epoch": 1.3, + "grad_norm": 12.074629783630371, + "learning_rate": 4.468077883008187e-07, + "logits/chosen": -0.3745967149734497, + "logits/rejected": -0.42083147168159485, + "logps/chosen": -59.449527740478516, + "logps/rejected": -90.32701873779297, + "loss": 1.0275, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4245495796203613, + "rewards/margins": 4.383798122406006, + "rewards/rejected": -1.9592483043670654, + "step": 5183 + }, + { + "epoch": 1.3, + "grad_norm": 5.770316123962402, + "learning_rate": 4.4572592586556274e-07, + "logits/chosen": -0.43068280816078186, + "logits/rejected": -0.4429382085800171, + "logps/chosen": -58.340171813964844, + "logps/rejected": -98.89604949951172, + "loss": 0.7421, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.738436222076416, + "rewards/margins": 4.9491448402404785, + "rewards/rejected": -2.2107081413269043, + "step": 5184 + }, + { + "epoch": 1.3, + "grad_norm": 5.384241580963135, + "learning_rate": 4.446453136768131e-07, + "logits/chosen": -0.2473621368408203, + "logits/rejected": -0.43505609035491943, + "logps/chosen": -57.693565368652344, + "logps/rejected": -73.49742889404297, + "loss": 0.7334, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7220566272735596, + "rewards/margins": 4.910558700561523, + "rewards/rejected": -2.188502550125122, + "step": 5185 + }, + { + "epoch": 1.3, + "grad_norm": 5.187749862670898, + "learning_rate": 4.4356595203122076e-07, + "logits/chosen": -0.3156393766403198, + "logits/rejected": -0.411286860704422, + "logps/chosen": -57.97101974487305, + "logps/rejected": -93.39859008789062, + "loss": 0.8569, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.926426410675049, + "rewards/margins": 4.43742036819458, + "rewards/rejected": -1.5109941959381104, + "step": 5186 + }, + { + "epoch": 1.3, + "grad_norm": 12.450610160827637, + "learning_rate": 4.4248784122509545e-07, + "logits/chosen": -0.29863837361335754, + "logits/rejected": -0.43424665927886963, + "logps/chosen": -63.430458068847656, + "logps/rejected": -88.44650268554688, + "loss": 0.8095, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8996291160583496, + "rewards/margins": 5.139307975769043, + "rewards/rejected": -2.2396788597106934, + "step": 5187 + }, + { + "epoch": 1.3, + "grad_norm": 3.434192419052124, + "learning_rate": 4.41410981554401e-07, + "logits/chosen": -0.4658363461494446, + "logits/rejected": -0.5941886901855469, + "logps/chosen": -52.600711822509766, + "logps/rejected": -82.82470703125, + "loss": 0.6882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0225086212158203, + "rewards/margins": 6.106478691101074, + "rewards/rejected": -3.083970069885254, + "step": 5188 + }, + { + "epoch": 1.3, + "grad_norm": 9.889058113098145, + "learning_rate": 4.4033537331475917e-07, + "logits/chosen": -0.4295777976512909, + "logits/rejected": -0.4914953410625458, + "logps/chosen": -44.56886291503906, + "logps/rejected": -70.13888549804688, + "loss": 0.8168, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8261125087738037, + "rewards/margins": 3.962585926055908, + "rewards/rejected": -1.136473536491394, + "step": 5189 + }, + { + "epoch": 1.3, + "grad_norm": 5.124030590057373, + "learning_rate": 4.392610168014466e-07, + "logits/chosen": -0.40506017208099365, + "logits/rejected": -0.4971420168876648, + "logps/chosen": -54.921478271484375, + "logps/rejected": -79.58016204833984, + "loss": 0.7232, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0456020832061768, + "rewards/margins": 4.729104995727539, + "rewards/rejected": -1.6835026741027832, + "step": 5190 + }, + { + "epoch": 1.3, + "grad_norm": 3.7194693088531494, + "learning_rate": 4.381879123093974e-07, + "logits/chosen": -0.3698936700820923, + "logits/rejected": -0.42849719524383545, + "logps/chosen": -57.39990234375, + "logps/rejected": -95.90251159667969, + "loss": 0.69, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.802608013153076, + "rewards/margins": 4.910703659057617, + "rewards/rejected": -2.1080949306488037, + "step": 5191 + }, + { + "epoch": 1.3, + "grad_norm": 4.476774215698242, + "learning_rate": 4.371160601332042e-07, + "logits/chosen": -0.2980949282646179, + "logits/rejected": -0.3623463809490204, + "logps/chosen": -49.194583892822266, + "logps/rejected": -78.21430206298828, + "loss": 0.7427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9529409408569336, + "rewards/margins": 4.665950775146484, + "rewards/rejected": -1.7130101919174194, + "step": 5192 + }, + { + "epoch": 1.3, + "grad_norm": 3.158493995666504, + "learning_rate": 4.3604546056710993e-07, + "logits/chosen": -0.32109907269477844, + "logits/rejected": -0.43454429507255554, + "logps/chosen": -63.814205169677734, + "logps/rejected": -76.89812469482422, + "loss": 0.7006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.863834857940674, + "rewards/margins": 5.154051303863525, + "rewards/rejected": -2.2902162075042725, + "step": 5193 + }, + { + "epoch": 1.3, + "grad_norm": 5.936334609985352, + "learning_rate": 4.3497611390502016e-07, + "logits/chosen": -0.31606003642082214, + "logits/rejected": -0.5115468502044678, + "logps/chosen": -61.08644104003906, + "logps/rejected": -71.54385375976562, + "loss": 0.6863, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7793540954589844, + "rewards/margins": 4.930274963378906, + "rewards/rejected": -2.1509206295013428, + "step": 5194 + }, + { + "epoch": 1.3, + "grad_norm": 6.13568639755249, + "learning_rate": 4.339080204404922e-07, + "logits/chosen": -0.4062807261943817, + "logits/rejected": -0.5032932758331299, + "logps/chosen": -58.185752868652344, + "logps/rejected": -75.4962158203125, + "loss": 0.7738, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.911259174346924, + "rewards/margins": 4.224946022033691, + "rewards/rejected": -1.3136868476867676, + "step": 5195 + }, + { + "epoch": 1.3, + "grad_norm": 5.3123650550842285, + "learning_rate": 4.328411804667415e-07, + "logits/chosen": -0.32223764061927795, + "logits/rejected": -0.4189532399177551, + "logps/chosen": -56.58891677856445, + "logps/rejected": -96.33393859863281, + "loss": 0.6733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.853961944580078, + "rewards/margins": 5.3864264488220215, + "rewards/rejected": -2.532464027404785, + "step": 5196 + }, + { + "epoch": 1.3, + "grad_norm": 2.297811985015869, + "learning_rate": 4.317755942766383e-07, + "logits/chosen": -0.3348880410194397, + "logits/rejected": -0.44883015751838684, + "logps/chosen": -44.54985809326172, + "logps/rejected": -77.17011260986328, + "loss": 0.5335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1468472480773926, + "rewards/margins": 5.546535968780518, + "rewards/rejected": -2.399688482284546, + "step": 5197 + }, + { + "epoch": 1.3, + "grad_norm": 23.251811981201172, + "learning_rate": 4.307112621627074e-07, + "logits/chosen": -0.3746119737625122, + "logits/rejected": -0.40681198239326477, + "logps/chosen": -54.075164794921875, + "logps/rejected": -82.04579162597656, + "loss": 0.9955, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7672290802001953, + "rewards/margins": 3.47249174118042, + "rewards/rejected": -0.7052628993988037, + "step": 5198 + }, + { + "epoch": 1.3, + "grad_norm": 4.331066131591797, + "learning_rate": 4.296481844171341e-07, + "logits/chosen": -0.43955671787261963, + "logits/rejected": -0.5827271938323975, + "logps/chosen": -46.65997314453125, + "logps/rejected": -64.10758972167969, + "loss": 0.7008, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1059675216674805, + "rewards/margins": 4.7043232917785645, + "rewards/rejected": -1.598355770111084, + "step": 5199 + }, + { + "epoch": 1.3, + "grad_norm": 3.042634963989258, + "learning_rate": 4.2858636133175537e-07, + "logits/chosen": -0.34533268213272095, + "logits/rejected": -0.4427185356616974, + "logps/chosen": -62.698570251464844, + "logps/rejected": -92.07278442382812, + "loss": 0.7419, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6842598915100098, + "rewards/margins": 5.353034496307373, + "rewards/rejected": -2.6687746047973633, + "step": 5200 + }, + { + "epoch": 1.3, + "grad_norm": 8.980162620544434, + "learning_rate": 4.27525793198062e-07, + "logits/chosen": -0.3905559182167053, + "logits/rejected": -0.451576292514801, + "logps/chosen": -59.85918426513672, + "logps/rejected": -104.42730712890625, + "loss": 0.8734, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.208556890487671, + "rewards/margins": 4.862493991851807, + "rewards/rejected": -1.653937578201294, + "step": 5201 + }, + { + "epoch": 1.3, + "grad_norm": 3.8201045989990234, + "learning_rate": 4.2646648030720495e-07, + "logits/chosen": -0.38446950912475586, + "logits/rejected": -0.457282155752182, + "logps/chosen": -44.3509407043457, + "logps/rejected": -81.02969360351562, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0949079990386963, + "rewards/margins": 5.0539655685424805, + "rewards/rejected": -1.9590575695037842, + "step": 5202 + }, + { + "epoch": 1.3, + "grad_norm": 12.574541091918945, + "learning_rate": 4.254084229499877e-07, + "logits/chosen": -0.38557201623916626, + "logits/rejected": -0.41415074467658997, + "logps/chosen": -57.3442268371582, + "logps/rejected": -76.54014587402344, + "loss": 0.8923, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.648089647293091, + "rewards/margins": 4.161616802215576, + "rewards/rejected": -1.513527274131775, + "step": 5203 + }, + { + "epoch": 1.3, + "grad_norm": 13.8049955368042, + "learning_rate": 4.2435162141687194e-07, + "logits/chosen": -0.30245310068130493, + "logits/rejected": -0.5022337436676025, + "logps/chosen": -77.3443603515625, + "logps/rejected": -71.62132263183594, + "loss": 0.7254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.586646556854248, + "rewards/margins": 4.93902587890625, + "rewards/rejected": -2.352379560470581, + "step": 5204 + }, + { + "epoch": 1.3, + "grad_norm": 4.587278842926025, + "learning_rate": 4.2329607599796987e-07, + "logits/chosen": -0.38353103399276733, + "logits/rejected": -0.42272964119911194, + "logps/chosen": -56.281795501708984, + "logps/rejected": -99.09542846679688, + "loss": 0.7606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.843921184539795, + "rewards/margins": 5.502141952514648, + "rewards/rejected": -2.6582207679748535, + "step": 5205 + }, + { + "epoch": 1.3, + "grad_norm": 2.6891767978668213, + "learning_rate": 4.2224178698305195e-07, + "logits/chosen": -0.3762741684913635, + "logits/rejected": -0.5095371603965759, + "logps/chosen": -56.35457229614258, + "logps/rejected": -94.23041534423828, + "loss": 0.6796, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.08154034614563, + "rewards/margins": 6.069523334503174, + "rewards/rejected": -2.987982988357544, + "step": 5206 + }, + { + "epoch": 1.3, + "grad_norm": 6.463436126708984, + "learning_rate": 4.2118875466154396e-07, + "logits/chosen": -0.37662139534950256, + "logits/rejected": -0.418851375579834, + "logps/chosen": -58.51736068725586, + "logps/rejected": -86.74507141113281, + "loss": 0.7926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.04521107673645, + "rewards/margins": 4.685354709625244, + "rewards/rejected": -1.640143632888794, + "step": 5207 + }, + { + "epoch": 1.3, + "grad_norm": 4.599698543548584, + "learning_rate": 4.2013697932252627e-07, + "logits/chosen": -0.35622942447662354, + "logits/rejected": -0.43316903710365295, + "logps/chosen": -56.257415771484375, + "logps/rejected": -94.75201416015625, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.071188449859619, + "rewards/margins": 6.056014537811279, + "rewards/rejected": -2.98482608795166, + "step": 5208 + }, + { + "epoch": 1.3, + "grad_norm": 6.026369571685791, + "learning_rate": 4.190864612547335e-07, + "logits/chosen": -0.3406515121459961, + "logits/rejected": -0.5028219223022461, + "logps/chosen": -54.45392608642578, + "logps/rejected": -99.2183837890625, + "loss": 0.7251, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.006877899169922, + "rewards/margins": 5.763725280761719, + "rewards/rejected": -2.756847620010376, + "step": 5209 + }, + { + "epoch": 1.3, + "grad_norm": 4.841226100921631, + "learning_rate": 4.1803720074655494e-07, + "logits/chosen": -0.34213364124298096, + "logits/rejected": -0.37860795855522156, + "logps/chosen": -53.30601119995117, + "logps/rejected": -91.78023529052734, + "loss": 0.7617, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8125643730163574, + "rewards/margins": 4.8956618309021, + "rewards/rejected": -2.083097219467163, + "step": 5210 + }, + { + "epoch": 1.3, + "grad_norm": 3.306292772293091, + "learning_rate": 4.1698919808603523e-07, + "logits/chosen": -0.3039013147354126, + "logits/rejected": -0.396610826253891, + "logps/chosen": -58.457305908203125, + "logps/rejected": -105.22960662841797, + "loss": 0.6546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9596915245056152, + "rewards/margins": 5.494095325469971, + "rewards/rejected": -2.5344040393829346, + "step": 5211 + }, + { + "epoch": 1.3, + "grad_norm": 3.8655099868774414, + "learning_rate": 4.159424535608747e-07, + "logits/chosen": -0.33157476782798767, + "logits/rejected": -0.4628967046737671, + "logps/chosen": -60.92060089111328, + "logps/rejected": -76.5010986328125, + "loss": 0.7377, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1616477966308594, + "rewards/margins": 4.7170000076293945, + "rewards/rejected": -1.5553525686264038, + "step": 5212 + }, + { + "epoch": 1.3, + "grad_norm": 3.8364217281341553, + "learning_rate": 4.148969674584269e-07, + "logits/chosen": -0.30608198046684265, + "logits/rejected": -0.42760777473449707, + "logps/chosen": -59.80623245239258, + "logps/rejected": -71.59796142578125, + "loss": 0.7231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8642969131469727, + "rewards/margins": 4.329334735870361, + "rewards/rejected": -1.4650377035140991, + "step": 5213 + }, + { + "epoch": 1.3, + "grad_norm": 8.300365447998047, + "learning_rate": 4.1385274006570054e-07, + "logits/chosen": -0.4417794644832611, + "logits/rejected": -0.573759913444519, + "logps/chosen": -54.5150032043457, + "logps/rejected": -75.39938354492188, + "loss": 0.8212, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9962592124938965, + "rewards/margins": 4.437445163726807, + "rewards/rejected": -1.441185712814331, + "step": 5214 + }, + { + "epoch": 1.3, + "grad_norm": 6.952488899230957, + "learning_rate": 4.1280977166935854e-07, + "logits/chosen": -0.3631806969642639, + "logits/rejected": -0.46747907996177673, + "logps/chosen": -48.577491760253906, + "logps/rejected": -80.3349838256836, + "loss": 0.5531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8380792140960693, + "rewards/margins": 4.939609527587891, + "rewards/rejected": -2.1015305519104004, + "step": 5215 + }, + { + "epoch": 1.3, + "grad_norm": 15.947500228881836, + "learning_rate": 4.1176806255571746e-07, + "logits/chosen": -0.3778195083141327, + "logits/rejected": -0.4987550377845764, + "logps/chosen": -68.45378112792969, + "logps/rejected": -85.58729553222656, + "loss": 0.7717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0394887924194336, + "rewards/margins": 5.00706672668457, + "rewards/rejected": -1.9675781726837158, + "step": 5216 + }, + { + "epoch": 1.31, + "grad_norm": 6.597357273101807, + "learning_rate": 4.107276130107496e-07, + "logits/chosen": -0.34225767850875854, + "logits/rejected": -0.4398287832736969, + "logps/chosen": -56.27756118774414, + "logps/rejected": -91.56396484375, + "loss": 0.7629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.91262149810791, + "rewards/margins": 5.599243640899658, + "rewards/rejected": -2.686622142791748, + "step": 5217 + }, + { + "epoch": 1.31, + "grad_norm": 4.717623710632324, + "learning_rate": 4.0968842332007984e-07, + "logits/chosen": -0.3746466338634491, + "logits/rejected": -0.4629001319408417, + "logps/chosen": -59.034515380859375, + "logps/rejected": -81.98822784423828, + "loss": 0.7685, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6938953399658203, + "rewards/margins": 4.902419090270996, + "rewards/rejected": -2.2085232734680176, + "step": 5218 + }, + { + "epoch": 1.31, + "grad_norm": 4.68779993057251, + "learning_rate": 4.086504937689895e-07, + "logits/chosen": -0.31539708375930786, + "logits/rejected": -0.4148271679878235, + "logps/chosen": -62.54974365234375, + "logps/rejected": -83.56291198730469, + "loss": 0.7219, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8573403358459473, + "rewards/margins": 4.232567310333252, + "rewards/rejected": -1.375226616859436, + "step": 5219 + }, + { + "epoch": 1.31, + "grad_norm": 3.6005308628082275, + "learning_rate": 4.0761382464241275e-07, + "logits/chosen": -0.2968486249446869, + "logits/rejected": -0.3564659357070923, + "logps/chosen": -45.402122497558594, + "logps/rejected": -93.55593872070312, + "loss": 0.6474, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9833123683929443, + "rewards/margins": 5.81239652633667, + "rewards/rejected": -2.8290843963623047, + "step": 5220 + }, + { + "epoch": 1.31, + "grad_norm": 2.9603116512298584, + "learning_rate": 4.0657841622493643e-07, + "logits/chosen": -0.41222745180130005, + "logits/rejected": -0.5665096640586853, + "logps/chosen": -61.69013977050781, + "logps/rejected": -69.79983520507812, + "loss": 0.6591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.033203601837158, + "rewards/margins": 5.4055256843566895, + "rewards/rejected": -2.3723220825195312, + "step": 5221 + }, + { + "epoch": 1.31, + "grad_norm": 6.426162242889404, + "learning_rate": 4.055442688008032e-07, + "logits/chosen": -0.3512982726097107, + "logits/rejected": -0.4043463468551636, + "logps/chosen": -52.58577346801758, + "logps/rejected": -86.73668670654297, + "loss": 0.7475, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8014638423919678, + "rewards/margins": 4.5344038009643555, + "rewards/rejected": -1.7329400777816772, + "step": 5222 + }, + { + "epoch": 1.31, + "grad_norm": 9.754456520080566, + "learning_rate": 4.045113826539082e-07, + "logits/chosen": -0.396435409784317, + "logits/rejected": -0.41426923871040344, + "logps/chosen": -49.440425872802734, + "logps/rejected": -95.6117172241211, + "loss": 0.7671, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9359402656555176, + "rewards/margins": 4.362539768218994, + "rewards/rejected": -1.4265997409820557, + "step": 5223 + }, + { + "epoch": 1.31, + "grad_norm": 3.535921335220337, + "learning_rate": 4.0347975806780184e-07, + "logits/chosen": -0.30524492263793945, + "logits/rejected": -0.40940526127815247, + "logps/chosen": -64.26605224609375, + "logps/rejected": -94.19195556640625, + "loss": 0.7164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0607786178588867, + "rewards/margins": 5.908050537109375, + "rewards/rejected": -2.8472726345062256, + "step": 5224 + }, + { + "epoch": 1.31, + "grad_norm": 7.5090460777282715, + "learning_rate": 4.0244939532568807e-07, + "logits/chosen": -0.35755372047424316, + "logits/rejected": -0.46330034732818604, + "logps/chosen": -52.836463928222656, + "logps/rejected": -78.71015930175781, + "loss": 0.6672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0542538166046143, + "rewards/margins": 4.79815673828125, + "rewards/rejected": -1.7439031600952148, + "step": 5225 + }, + { + "epoch": 1.31, + "grad_norm": 15.323094367980957, + "learning_rate": 4.014202947104212e-07, + "logits/chosen": -0.3694700002670288, + "logits/rejected": -0.4256367087364197, + "logps/chosen": -59.664649963378906, + "logps/rejected": -101.3740005493164, + "loss": 0.9385, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5360987186431885, + "rewards/margins": 4.0776166915893555, + "rewards/rejected": -1.541517972946167, + "step": 5226 + }, + { + "epoch": 1.31, + "grad_norm": 5.405868053436279, + "learning_rate": 4.0039245650451306e-07, + "logits/chosen": -0.3395746350288391, + "logits/rejected": -0.4092748761177063, + "logps/chosen": -60.2608528137207, + "logps/rejected": -81.10528564453125, + "loss": 0.8988, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9954631328582764, + "rewards/margins": 4.371311187744141, + "rewards/rejected": -1.375847578048706, + "step": 5227 + }, + { + "epoch": 1.31, + "grad_norm": 10.305110931396484, + "learning_rate": 3.99365880990128e-07, + "logits/chosen": -0.4024268686771393, + "logits/rejected": -0.4535086452960968, + "logps/chosen": -54.10574722290039, + "logps/rejected": -85.609375, + "loss": 0.8039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.029355049133301, + "rewards/margins": 4.267267227172852, + "rewards/rejected": -1.2379122972488403, + "step": 5228 + }, + { + "epoch": 1.31, + "grad_norm": 5.8181538581848145, + "learning_rate": 3.983405684490821e-07, + "logits/chosen": -0.3033166229724884, + "logits/rejected": -0.3772619664669037, + "logps/chosen": -54.1561279296875, + "logps/rejected": -102.46314239501953, + "loss": 0.6787, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0998690128326416, + "rewards/margins": 5.604735374450684, + "rewards/rejected": -2.504866600036621, + "step": 5229 + }, + { + "epoch": 1.31, + "grad_norm": 4.892991065979004, + "learning_rate": 3.973165191628464e-07, + "logits/chosen": -0.3799785375595093, + "logits/rejected": -0.5163256525993347, + "logps/chosen": -63.075321197509766, + "logps/rejected": -75.12516784667969, + "loss": 0.7978, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.935103178024292, + "rewards/margins": 4.399666786193848, + "rewards/rejected": -1.4645633697509766, + "step": 5230 + }, + { + "epoch": 1.31, + "grad_norm": 11.287858009338379, + "learning_rate": 3.9629373341254327e-07, + "logits/chosen": -0.31079867482185364, + "logits/rejected": -0.41099125146865845, + "logps/chosen": -57.161373138427734, + "logps/rejected": -90.33611297607422, + "loss": 0.736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7565677165985107, + "rewards/margins": 5.137905120849609, + "rewards/rejected": -2.3813371658325195, + "step": 5231 + }, + { + "epoch": 1.31, + "grad_norm": 7.2161478996276855, + "learning_rate": 3.9527221147895144e-07, + "logits/chosen": -0.30037492513656616, + "logits/rejected": -0.36793601512908936, + "logps/chosen": -69.7461166381836, + "logps/rejected": -96.2912368774414, + "loss": 0.8022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.806368350982666, + "rewards/margins": 4.071661472320557, + "rewards/rejected": -1.2652928829193115, + "step": 5232 + }, + { + "epoch": 1.31, + "grad_norm": 14.223090171813965, + "learning_rate": 3.9425195364249934e-07, + "logits/chosen": -0.4380255341529846, + "logits/rejected": -0.6218512654304504, + "logps/chosen": -67.56594848632812, + "logps/rejected": -58.50046920776367, + "loss": 0.8931, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4922797679901123, + "rewards/margins": 4.6896538734436035, + "rewards/rejected": -2.1973745822906494, + "step": 5233 + }, + { + "epoch": 1.31, + "grad_norm": 8.521505355834961, + "learning_rate": 3.9323296018327074e-07, + "logits/chosen": -0.27931350469589233, + "logits/rejected": -0.44238054752349854, + "logps/chosen": -72.90642547607422, + "logps/rejected": -87.55924224853516, + "loss": 0.7444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.651423692703247, + "rewards/margins": 4.334057807922363, + "rewards/rejected": -1.6826344728469849, + "step": 5234 + }, + { + "epoch": 1.31, + "grad_norm": 3.868940830230713, + "learning_rate": 3.922152313810007e-07, + "logits/chosen": -0.39558184146881104, + "logits/rejected": -0.5264207720756531, + "logps/chosen": -66.02889251708984, + "logps/rejected": -80.58932495117188, + "loss": 0.6742, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8272523880004883, + "rewards/margins": 4.86826753616333, + "rewards/rejected": -2.041015148162842, + "step": 5235 + }, + { + "epoch": 1.31, + "grad_norm": 10.228776931762695, + "learning_rate": 3.911987675150763e-07, + "logits/chosen": -0.23539167642593384, + "logits/rejected": -0.4028695821762085, + "logps/chosen": -69.35841369628906, + "logps/rejected": -83.76815032958984, + "loss": 0.7942, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8912112712860107, + "rewards/margins": 4.88638973236084, + "rewards/rejected": -1.99517822265625, + "step": 5236 + }, + { + "epoch": 1.31, + "grad_norm": 3.4101462364196777, + "learning_rate": 3.901835688645428e-07, + "logits/chosen": -0.3007211685180664, + "logits/rejected": -0.3269321322441101, + "logps/chosen": -55.641719818115234, + "logps/rejected": -102.46842193603516, + "loss": 0.719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9801371097564697, + "rewards/margins": 4.9389448165893555, + "rewards/rejected": -1.9588075876235962, + "step": 5237 + }, + { + "epoch": 1.31, + "grad_norm": 4.8302741050720215, + "learning_rate": 3.891696357080893e-07, + "logits/chosen": -0.3742777109146118, + "logits/rejected": -0.506611704826355, + "logps/chosen": -58.79235076904297, + "logps/rejected": -89.97065734863281, + "loss": 0.7075, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.678969383239746, + "rewards/margins": 5.582304954528809, + "rewards/rejected": -2.9033358097076416, + "step": 5238 + }, + { + "epoch": 1.31, + "grad_norm": 3.8422505855560303, + "learning_rate": 3.8815696832406603e-07, + "logits/chosen": -0.3428501784801483, + "logits/rejected": -0.4436560869216919, + "logps/chosen": -59.67051696777344, + "logps/rejected": -80.7734375, + "loss": 0.6327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8936080932617188, + "rewards/margins": 5.527818202972412, + "rewards/rejected": -2.6342101097106934, + "step": 5239 + }, + { + "epoch": 1.31, + "grad_norm": 20.886703491210938, + "learning_rate": 3.8714556699047054e-07, + "logits/chosen": -0.39108312129974365, + "logits/rejected": -0.43009912967681885, + "logps/chosen": -56.336326599121094, + "logps/rejected": -96.44766235351562, + "loss": 0.9487, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6313793659210205, + "rewards/margins": 4.3071112632751465, + "rewards/rejected": -1.6757323741912842, + "step": 5240 + }, + { + "epoch": 1.31, + "grad_norm": 4.5620269775390625, + "learning_rate": 3.861354319849542e-07, + "logits/chosen": -0.34259334206581116, + "logits/rejected": -0.43854647874832153, + "logps/chosen": -56.417057037353516, + "logps/rejected": -88.8888168334961, + "loss": 0.6608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9570398330688477, + "rewards/margins": 5.77225923538208, + "rewards/rejected": -2.8152191638946533, + "step": 5241 + }, + { + "epoch": 1.31, + "grad_norm": 16.252912521362305, + "learning_rate": 3.851265635848211e-07, + "logits/chosen": -0.3490510582923889, + "logits/rejected": -0.4414435923099518, + "logps/chosen": -67.88842010498047, + "logps/rejected": -84.66964721679688, + "loss": 0.8142, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7033944129943848, + "rewards/margins": 3.992176055908203, + "rewards/rejected": -1.2887816429138184, + "step": 5242 + }, + { + "epoch": 1.31, + "grad_norm": 6.750441551208496, + "learning_rate": 3.841189620670266e-07, + "logits/chosen": -0.5009369850158691, + "logits/rejected": -0.6463921070098877, + "logps/chosen": -53.346927642822266, + "logps/rejected": -75.57894897460938, + "loss": 0.7, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.748887777328491, + "rewards/margins": 5.2687530517578125, + "rewards/rejected": -2.5198659896850586, + "step": 5243 + }, + { + "epoch": 1.31, + "grad_norm": 8.581085205078125, + "learning_rate": 3.831126277081809e-07, + "logits/chosen": -0.35371604561805725, + "logits/rejected": -0.4500398635864258, + "logps/chosen": -64.03557586669922, + "logps/rejected": -87.0447006225586, + "loss": 0.925, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.973536491394043, + "rewards/margins": 3.955465316772461, + "rewards/rejected": -0.9819293022155762, + "step": 5244 + }, + { + "epoch": 1.31, + "grad_norm": 5.25813627243042, + "learning_rate": 3.8210756078454427e-07, + "logits/chosen": -0.35755521059036255, + "logits/rejected": -0.4326355457305908, + "logps/chosen": -58.38335418701172, + "logps/rejected": -110.45812225341797, + "loss": 0.6624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.014021396636963, + "rewards/margins": 5.715585708618164, + "rewards/rejected": -2.701563835144043, + "step": 5245 + }, + { + "epoch": 1.31, + "grad_norm": 4.253631114959717, + "learning_rate": 3.81103761572027e-07, + "logits/chosen": -0.38142141699790955, + "logits/rejected": -0.48483073711395264, + "logps/chosen": -62.40107727050781, + "logps/rejected": -102.16603088378906, + "loss": 0.7164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.613457679748535, + "rewards/margins": 5.677006721496582, + "rewards/rejected": -3.0635488033294678, + "step": 5246 + }, + { + "epoch": 1.31, + "grad_norm": 4.109501361846924, + "learning_rate": 3.8010123034619537e-07, + "logits/chosen": -0.36588290333747864, + "logits/rejected": -0.44064900279045105, + "logps/chosen": -54.02594757080078, + "logps/rejected": -83.5638427734375, + "loss": 0.7352, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8990416526794434, + "rewards/margins": 5.321717739105225, + "rewards/rejected": -2.4226760864257812, + "step": 5247 + }, + { + "epoch": 1.31, + "grad_norm": 7.597881317138672, + "learning_rate": 3.7909996738226537e-07, + "logits/chosen": -0.33007246255874634, + "logits/rejected": -0.41706231236457825, + "logps/chosen": -71.13329315185547, + "logps/rejected": -84.80377197265625, + "loss": 0.7727, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8492207527160645, + "rewards/margins": 4.991860389709473, + "rewards/rejected": -2.142639636993408, + "step": 5248 + }, + { + "epoch": 1.31, + "grad_norm": 3.7421867847442627, + "learning_rate": 3.780999729551066e-07, + "logits/chosen": -0.3469063937664032, + "logits/rejected": -0.44117558002471924, + "logps/chosen": -52.437721252441406, + "logps/rejected": -87.40011596679688, + "loss": 0.6962, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.921574831008911, + "rewards/margins": 5.540874004364014, + "rewards/rejected": -2.6192994117736816, + "step": 5249 + }, + { + "epoch": 1.31, + "grad_norm": 3.6608805656433105, + "learning_rate": 3.771012473392377e-07, + "logits/chosen": -0.3980947434902191, + "logits/rejected": -0.4986658990383148, + "logps/chosen": -67.8906021118164, + "logps/rejected": -85.87708282470703, + "loss": 0.7349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2136707305908203, + "rewards/margins": 5.619729042053223, + "rewards/rejected": -2.4060580730438232, + "step": 5250 + }, + { + "epoch": 1.31, + "grad_norm": 11.70967960357666, + "learning_rate": 3.761037908088294e-07, + "logits/chosen": -0.38836801052093506, + "logits/rejected": -0.4439101219177246, + "logps/chosen": -64.5132827758789, + "logps/rejected": -85.63041687011719, + "loss": 1.0409, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.721639394760132, + "rewards/margins": 4.321754455566406, + "rewards/rejected": -1.6001152992248535, + "step": 5251 + }, + { + "epoch": 1.31, + "grad_norm": 17.415542602539062, + "learning_rate": 3.7510760363770714e-07, + "logits/chosen": -0.3424440920352936, + "logits/rejected": -0.4422614276409149, + "logps/chosen": -53.29828643798828, + "logps/rejected": -78.74385070800781, + "loss": 0.7588, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.995817184448242, + "rewards/margins": 4.377350330352783, + "rewards/rejected": -1.381532907485962, + "step": 5252 + }, + { + "epoch": 1.31, + "grad_norm": 5.861170291900635, + "learning_rate": 3.741126860993455e-07, + "logits/chosen": -0.40635350346565247, + "logits/rejected": -0.5122736096382141, + "logps/chosen": -54.34919738769531, + "logps/rejected": -78.2340087890625, + "loss": 0.9109, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.04874324798584, + "rewards/margins": 4.408823013305664, + "rewards/rejected": -1.360079288482666, + "step": 5253 + }, + { + "epoch": 1.31, + "grad_norm": 5.626989841461182, + "learning_rate": 3.731190384668698e-07, + "logits/chosen": -0.4007967412471771, + "logits/rejected": -0.4667261838912964, + "logps/chosen": -57.799766540527344, + "logps/rejected": -92.69070434570312, + "loss": 0.7328, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4958178997039795, + "rewards/margins": 4.934237480163574, + "rewards/rejected": -2.438420295715332, + "step": 5254 + }, + { + "epoch": 1.31, + "grad_norm": 3.4270882606506348, + "learning_rate": 3.721266610130586e-07, + "logits/chosen": -0.36807578802108765, + "logits/rejected": -0.4638158679008484, + "logps/chosen": -53.191951751708984, + "logps/rejected": -95.62151336669922, + "loss": 0.6801, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.924281120300293, + "rewards/margins": 5.301112174987793, + "rewards/rejected": -2.376831293106079, + "step": 5255 + }, + { + "epoch": 1.31, + "grad_norm": 4.2833147048950195, + "learning_rate": 3.711355540103395e-07, + "logits/chosen": -0.37803196907043457, + "logits/rejected": -0.4129301905632019, + "logps/chosen": -44.81541442871094, + "logps/rejected": -92.81829833984375, + "loss": 0.7066, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9280593395233154, + "rewards/margins": 4.717875957489014, + "rewards/rejected": -1.7898166179656982, + "step": 5256 + }, + { + "epoch": 1.32, + "grad_norm": 4.678494453430176, + "learning_rate": 3.7014571773079545e-07, + "logits/chosen": -0.3362889885902405, + "logits/rejected": -0.4001181125640869, + "logps/chosen": -59.875186920166016, + "logps/rejected": -89.01364135742188, + "loss": 0.7132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.903338670730591, + "rewards/margins": 4.560127258300781, + "rewards/rejected": -1.6567881107330322, + "step": 5257 + }, + { + "epoch": 1.32, + "grad_norm": 5.7327351570129395, + "learning_rate": 3.691571524461546e-07, + "logits/chosen": -0.3070163130760193, + "logits/rejected": -0.3695647418498993, + "logps/chosen": -59.34672927856445, + "logps/rejected": -102.20519256591797, + "loss": 0.6673, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1746745109558105, + "rewards/margins": 5.755228042602539, + "rewards/rejected": -2.5805535316467285, + "step": 5258 + }, + { + "epoch": 1.32, + "grad_norm": 3.5944108963012695, + "learning_rate": 3.6816985842780205e-07, + "logits/chosen": -0.36538222432136536, + "logits/rejected": -0.40847283601760864, + "logps/chosen": -52.39326095581055, + "logps/rejected": -97.89358520507812, + "loss": 0.6952, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.981140613555908, + "rewards/margins": 4.712658882141113, + "rewards/rejected": -1.7315185070037842, + "step": 5259 + }, + { + "epoch": 1.32, + "grad_norm": 7.408573150634766, + "learning_rate": 3.671838359467705e-07, + "logits/chosen": -0.3500012755393982, + "logits/rejected": -0.45863479375839233, + "logps/chosen": -55.821659088134766, + "logps/rejected": -79.84780883789062, + "loss": 0.8123, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2001149654388428, + "rewards/margins": 5.396300792694092, + "rewards/rejected": -2.196186065673828, + "step": 5260 + }, + { + "epoch": 1.32, + "grad_norm": 4.947404861450195, + "learning_rate": 3.6619908527374405e-07, + "logits/chosen": -0.3261539340019226, + "logits/rejected": -0.42436954379081726, + "logps/chosen": -56.186580657958984, + "logps/rejected": -82.99849700927734, + "loss": 0.6464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0492589473724365, + "rewards/margins": 4.817203998565674, + "rewards/rejected": -1.7679452896118164, + "step": 5261 + }, + { + "epoch": 1.32, + "grad_norm": 3.573056221008301, + "learning_rate": 3.652156066790585e-07, + "logits/chosen": -0.22831600904464722, + "logits/rejected": -0.3764968514442444, + "logps/chosen": -63.42235565185547, + "logps/rejected": -99.32608795166016, + "loss": 0.6262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9557297229766846, + "rewards/margins": 5.922143459320068, + "rewards/rejected": -2.966413736343384, + "step": 5262 + }, + { + "epoch": 1.32, + "grad_norm": 3.9264438152313232, + "learning_rate": 3.642334004326986e-07, + "logits/chosen": -0.3313380181789398, + "logits/rejected": -0.41503581404685974, + "logps/chosen": -58.42625045776367, + "logps/rejected": -89.24897003173828, + "loss": 0.7115, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9830024242401123, + "rewards/margins": 5.178239822387695, + "rewards/rejected": -2.1952364444732666, + "step": 5263 + }, + { + "epoch": 1.32, + "grad_norm": 3.5978660583496094, + "learning_rate": 3.63252466804303e-07, + "logits/chosen": -0.35329973697662354, + "logits/rejected": -0.430149108171463, + "logps/chosen": -48.039466857910156, + "logps/rejected": -85.9045639038086, + "loss": 0.6998, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9059290885925293, + "rewards/margins": 5.307258129119873, + "rewards/rejected": -2.4013290405273438, + "step": 5264 + }, + { + "epoch": 1.32, + "grad_norm": 5.780211448669434, + "learning_rate": 3.6227280606315863e-07, + "logits/chosen": -0.3470652103424072, + "logits/rejected": -0.4268108606338501, + "logps/chosen": -46.58967971801758, + "logps/rejected": -87.13585662841797, + "loss": 0.6025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056640148162842, + "rewards/margins": 5.126066207885742, + "rewards/rejected": -2.0694260597229004, + "step": 5265 + }, + { + "epoch": 1.32, + "grad_norm": 5.451955795288086, + "learning_rate": 3.61294418478203e-07, + "logits/chosen": -0.31562405824661255, + "logits/rejected": -0.375078022480011, + "logps/chosen": -56.64033508300781, + "logps/rejected": -106.09175872802734, + "loss": 0.7546, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0430445671081543, + "rewards/margins": 5.6995391845703125, + "rewards/rejected": -2.656494617462158, + "step": 5266 + }, + { + "epoch": 1.32, + "grad_norm": 7.49474573135376, + "learning_rate": 3.6031730431802524e-07, + "logits/chosen": -0.3900834918022156, + "logits/rejected": -0.46138617396354675, + "logps/chosen": -53.091209411621094, + "logps/rejected": -96.88298034667969, + "loss": 0.7183, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.809941530227661, + "rewards/margins": 4.880366802215576, + "rewards/rejected": -2.0704257488250732, + "step": 5267 + }, + { + "epoch": 1.32, + "grad_norm": 4.867849826812744, + "learning_rate": 3.59341463850863e-07, + "logits/chosen": -0.3090896010398865, + "logits/rejected": -0.4512637257575989, + "logps/chosen": -64.052001953125, + "logps/rejected": -98.09806823730469, + "loss": 0.7067, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.211869239807129, + "rewards/margins": 4.98689079284668, + "rewards/rejected": -1.7750217914581299, + "step": 5268 + }, + { + "epoch": 1.32, + "grad_norm": 29.300819396972656, + "learning_rate": 3.583668973446075e-07, + "logits/chosen": -0.3816843032836914, + "logits/rejected": -0.4127872884273529, + "logps/chosen": -60.70208740234375, + "logps/rejected": -94.51690673828125, + "loss": 0.8759, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.814143180847168, + "rewards/margins": 4.576955318450928, + "rewards/rejected": -1.7628114223480225, + "step": 5269 + }, + { + "epoch": 1.32, + "grad_norm": 10.043094635009766, + "learning_rate": 3.5739360506679697e-07, + "logits/chosen": -0.332440584897995, + "logits/rejected": -0.41933003067970276, + "logps/chosen": -51.79243850708008, + "logps/rejected": -79.33377075195312, + "loss": 0.8474, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9185776710510254, + "rewards/margins": 4.590173244476318, + "rewards/rejected": -1.6715952157974243, + "step": 5270 + }, + { + "epoch": 1.32, + "grad_norm": 4.562108039855957, + "learning_rate": 3.564215872846205e-07, + "logits/chosen": -0.3215651214122772, + "logits/rejected": -0.42751529812812805, + "logps/chosen": -51.29413604736328, + "logps/rejected": -84.59989166259766, + "loss": 0.6686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2525813579559326, + "rewards/margins": 5.324163436889648, + "rewards/rejected": -2.0715818405151367, + "step": 5271 + }, + { + "epoch": 1.32, + "grad_norm": 3.5018489360809326, + "learning_rate": 3.5545084426491963e-07, + "logits/chosen": -0.31461742520332336, + "logits/rejected": -0.37096112966537476, + "logps/chosen": -65.55500793457031, + "logps/rejected": -98.4858627319336, + "loss": 0.7108, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.812319278717041, + "rewards/margins": 4.512340545654297, + "rewards/rejected": -1.7000210285186768, + "step": 5272 + }, + { + "epoch": 1.32, + "grad_norm": 4.0041656494140625, + "learning_rate": 3.5448137627418334e-07, + "logits/chosen": -0.40519434213638306, + "logits/rejected": -0.4918314814567566, + "logps/chosen": -59.8376350402832, + "logps/rejected": -84.82315826416016, + "loss": 0.6859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.000218391418457, + "rewards/margins": 5.667074203491211, + "rewards/rejected": -2.666856050491333, + "step": 5273 + }, + { + "epoch": 1.32, + "grad_norm": 2.367155075073242, + "learning_rate": 3.5351318357855156e-07, + "logits/chosen": -0.35067692399024963, + "logits/rejected": -0.4755967855453491, + "logps/chosen": -55.35693359375, + "logps/rejected": -85.67579650878906, + "loss": 0.6004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1242949962615967, + "rewards/margins": 5.708836555480957, + "rewards/rejected": -2.5845417976379395, + "step": 5274 + }, + { + "epoch": 1.32, + "grad_norm": 4.841548919677734, + "learning_rate": 3.5254626644381453e-07, + "logits/chosen": -0.32719603180885315, + "logits/rejected": -0.4897042214870453, + "logps/chosen": -57.36432647705078, + "logps/rejected": -70.54129791259766, + "loss": 0.7184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.863347053527832, + "rewards/margins": 5.385513782501221, + "rewards/rejected": -2.5221667289733887, + "step": 5275 + }, + { + "epoch": 1.32, + "grad_norm": 4.248513698577881, + "learning_rate": 3.5158062513541036e-07, + "logits/chosen": -0.34986764192581177, + "logits/rejected": -0.4514496624469757, + "logps/chosen": -61.758182525634766, + "logps/rejected": -84.42430114746094, + "loss": 0.6898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8670718669891357, + "rewards/margins": 5.123329162597656, + "rewards/rejected": -2.2562572956085205, + "step": 5276 + }, + { + "epoch": 1.32, + "grad_norm": 5.174954414367676, + "learning_rate": 3.50616259918431e-07, + "logits/chosen": -0.34484046697616577, + "logits/rejected": -0.4902290999889374, + "logps/chosen": -61.287017822265625, + "logps/rejected": -94.00637817382812, + "loss": 0.7838, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.800522804260254, + "rewards/margins": 6.531641006469727, + "rewards/rejected": -3.7311177253723145, + "step": 5277 + }, + { + "epoch": 1.32, + "grad_norm": 4.411242961883545, + "learning_rate": 3.496531710576134e-07, + "logits/chosen": -0.31074216961860657, + "logits/rejected": -0.3851063549518585, + "logps/chosen": -62.51557159423828, + "logps/rejected": -86.53215026855469, + "loss": 0.7123, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7506332397460938, + "rewards/margins": 4.867318153381348, + "rewards/rejected": -2.1166844367980957, + "step": 5278 + }, + { + "epoch": 1.32, + "grad_norm": 3.359773635864258, + "learning_rate": 3.4869135881734686e-07, + "logits/chosen": -0.39682769775390625, + "logits/rejected": -0.4634508788585663, + "logps/chosen": -45.46944046020508, + "logps/rejected": -85.70122528076172, + "loss": 0.6156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8741068840026855, + "rewards/margins": 4.955135822296143, + "rewards/rejected": -2.081028699874878, + "step": 5279 + }, + { + "epoch": 1.32, + "grad_norm": 4.336695671081543, + "learning_rate": 3.4773082346167e-07, + "logits/chosen": -0.4438643753528595, + "logits/rejected": -0.46202564239501953, + "logps/chosen": -51.68608093261719, + "logps/rejected": -77.96453857421875, + "loss": 0.739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2859654426574707, + "rewards/margins": 4.300680160522461, + "rewards/rejected": -1.0147149562835693, + "step": 5280 + }, + { + "epoch": 1.32, + "grad_norm": 5.1248321533203125, + "learning_rate": 3.467715652542702e-07, + "logits/chosen": -0.3400481641292572, + "logits/rejected": -0.3948858082294464, + "logps/chosen": -63.318878173828125, + "logps/rejected": -90.27566528320312, + "loss": 0.7481, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.134089231491089, + "rewards/margins": 4.941233158111572, + "rewards/rejected": -1.8071441650390625, + "step": 5281 + }, + { + "epoch": 1.32, + "grad_norm": 6.055891990661621, + "learning_rate": 3.4581358445848425e-07, + "logits/chosen": -0.35311922430992126, + "logits/rejected": -0.41712841391563416, + "logps/chosen": -57.904136657714844, + "logps/rejected": -67.29483032226562, + "loss": 0.9393, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.987933874130249, + "rewards/margins": 3.500983238220215, + "rewards/rejected": -0.5130495429039001, + "step": 5282 + }, + { + "epoch": 1.32, + "grad_norm": 3.7320072650909424, + "learning_rate": 3.448568813372982e-07, + "logits/chosen": -0.28041568398475647, + "logits/rejected": -0.38774237036705017, + "logps/chosen": -60.9518928527832, + "logps/rejected": -87.97246551513672, + "loss": 0.735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8901684284210205, + "rewards/margins": 5.178225517272949, + "rewards/rejected": -2.288057327270508, + "step": 5283 + }, + { + "epoch": 1.32, + "grad_norm": 4.996039390563965, + "learning_rate": 3.439014561533488e-07, + "logits/chosen": -0.38051027059555054, + "logits/rejected": -0.41884753108024597, + "logps/chosen": -56.73184585571289, + "logps/rejected": -85.46794891357422, + "loss": 0.8187, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8892838954925537, + "rewards/margins": 4.315542221069336, + "rewards/rejected": -1.4262584447860718, + "step": 5284 + }, + { + "epoch": 1.32, + "grad_norm": 6.490894317626953, + "learning_rate": 3.429473091689206e-07, + "logits/chosen": -0.31980299949645996, + "logits/rejected": -0.385337233543396, + "logps/chosen": -64.22869873046875, + "logps/rejected": -87.15779113769531, + "loss": 0.8972, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6834805011749268, + "rewards/margins": 4.7462053298950195, + "rewards/rejected": -2.062725067138672, + "step": 5285 + }, + { + "epoch": 1.32, + "grad_norm": 4.983534812927246, + "learning_rate": 3.4199444064594687e-07, + "logits/chosen": -0.35285472869873047, + "logits/rejected": -0.45245373249053955, + "logps/chosen": -54.95820617675781, + "logps/rejected": -83.43550872802734, + "loss": 0.7157, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.00797438621521, + "rewards/margins": 5.056547164916992, + "rewards/rejected": -2.048572540283203, + "step": 5286 + }, + { + "epoch": 1.32, + "grad_norm": 6.626809120178223, + "learning_rate": 3.4104285084601175e-07, + "logits/chosen": -0.2657395899295807, + "logits/rejected": -0.404354065656662, + "logps/chosen": -55.670501708984375, + "logps/rejected": -71.80107116699219, + "loss": 0.7093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0974433422088623, + "rewards/margins": 4.335023403167725, + "rewards/rejected": -1.2375801801681519, + "step": 5287 + }, + { + "epoch": 1.32, + "grad_norm": 5.730590343475342, + "learning_rate": 3.400925400303451e-07, + "logits/chosen": -0.45678046345710754, + "logits/rejected": -0.5130042433738708, + "logps/chosen": -56.742889404296875, + "logps/rejected": -89.839599609375, + "loss": 0.7702, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7577805519104004, + "rewards/margins": 4.667331695556641, + "rewards/rejected": -1.9095511436462402, + "step": 5288 + }, + { + "epoch": 1.32, + "grad_norm": 14.357054710388184, + "learning_rate": 3.391435084598299e-07, + "logits/chosen": -0.3274085819721222, + "logits/rejected": -0.41817688941955566, + "logps/chosen": -57.727561950683594, + "logps/rejected": -91.30670166015625, + "loss": 0.8609, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.177718162536621, + "rewards/margins": 3.6409621238708496, + "rewards/rejected": -0.46324384212493896, + "step": 5289 + }, + { + "epoch": 1.32, + "grad_norm": 5.883482456207275, + "learning_rate": 3.3819575639499614e-07, + "logits/chosen": -0.3700764775276184, + "logits/rejected": -0.458321750164032, + "logps/chosen": -61.8364372253418, + "logps/rejected": -83.61666870117188, + "loss": 0.8309, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8319430351257324, + "rewards/margins": 4.549915790557861, + "rewards/rejected": -1.717972755432129, + "step": 5290 + }, + { + "epoch": 1.32, + "grad_norm": 6.833747863769531, + "learning_rate": 3.37249284096019e-07, + "logits/chosen": -0.371881365776062, + "logits/rejected": -0.48980769515037537, + "logps/chosen": -56.70720672607422, + "logps/rejected": -83.54032897949219, + "loss": 0.7335, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6805737018585205, + "rewards/margins": 4.6500420570373535, + "rewards/rejected": -1.9694687128067017, + "step": 5291 + }, + { + "epoch": 1.32, + "grad_norm": 5.646861553192139, + "learning_rate": 3.3630409182272896e-07, + "logits/chosen": -0.363977313041687, + "logits/rejected": -0.4892071783542633, + "logps/chosen": -61.514015197753906, + "logps/rejected": -76.67229461669922, + "loss": 0.694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7697057723999023, + "rewards/margins": 4.673466205596924, + "rewards/rejected": -1.9037598371505737, + "step": 5292 + }, + { + "epoch": 1.32, + "grad_norm": 3.239269256591797, + "learning_rate": 3.35360179834599e-07, + "logits/chosen": -0.36493444442749023, + "logits/rejected": -0.4771176874637604, + "logps/chosen": -49.256378173828125, + "logps/rejected": -79.58277893066406, + "loss": 0.664, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7964401245117188, + "rewards/margins": 5.589699745178223, + "rewards/rejected": -2.7932591438293457, + "step": 5293 + }, + { + "epoch": 1.32, + "grad_norm": 4.967309951782227, + "learning_rate": 3.3441754839075636e-07, + "logits/chosen": -0.34190043807029724, + "logits/rejected": -0.39615267515182495, + "logps/chosen": -49.93583297729492, + "logps/rejected": -101.5771484375, + "loss": 0.6881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0617847442626953, + "rewards/margins": 5.84274435043335, + "rewards/rejected": -2.7809596061706543, + "step": 5294 + }, + { + "epoch": 1.32, + "grad_norm": 5.452544689178467, + "learning_rate": 3.334761977499712e-07, + "logits/chosen": -0.35686567425727844, + "logits/rejected": -0.44504302740097046, + "logps/chosen": -55.277626037597656, + "logps/rejected": -87.40100860595703, + "loss": 0.8334, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.04463267326355, + "rewards/margins": 4.798810005187988, + "rewards/rejected": -1.7541773319244385, + "step": 5295 + }, + { + "epoch": 1.32, + "grad_norm": 6.349627494812012, + "learning_rate": 3.325361281706646e-07, + "logits/chosen": -0.34937530755996704, + "logits/rejected": -0.4603131413459778, + "logps/chosen": -59.26456832885742, + "logps/rejected": -98.51939392089844, + "loss": 0.8125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.668360710144043, + "rewards/margins": 4.380923748016357, + "rewards/rejected": -1.7125632762908936, + "step": 5296 + }, + { + "epoch": 1.33, + "grad_norm": 15.860404968261719, + "learning_rate": 3.3159733991090793e-07, + "logits/chosen": -0.3241450786590576, + "logits/rejected": -0.38098442554473877, + "logps/chosen": -53.71173095703125, + "logps/rejected": -96.72024536132812, + "loss": 0.7589, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7790451049804688, + "rewards/margins": 4.3482513427734375, + "rewards/rejected": -1.5692059993743896, + "step": 5297 + }, + { + "epoch": 1.33, + "grad_norm": 5.583418846130371, + "learning_rate": 3.306598332284172e-07, + "logits/chosen": -0.3874722421169281, + "logits/rejected": -0.43427854776382446, + "logps/chosen": -51.367279052734375, + "logps/rejected": -77.13179016113281, + "loss": 0.7464, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8670237064361572, + "rewards/margins": 4.159483432769775, + "rewards/rejected": -1.292459487915039, + "step": 5298 + }, + { + "epoch": 1.33, + "grad_norm": 6.53861141204834, + "learning_rate": 3.2972360838055886e-07, + "logits/chosen": -0.36169734597206116, + "logits/rejected": -0.4259744882583618, + "logps/chosen": -51.63809585571289, + "logps/rejected": -87.59349060058594, + "loss": 0.659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.762249231338501, + "rewards/margins": 4.783046722412109, + "rewards/rejected": -2.0207972526550293, + "step": 5299 + }, + { + "epoch": 1.33, + "grad_norm": 5.905545711517334, + "learning_rate": 3.2878866562434666e-07, + "logits/chosen": -0.4134572148323059, + "logits/rejected": -0.4335409104824066, + "logps/chosen": -45.55392837524414, + "logps/rejected": -93.0001449584961, + "loss": 0.6559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.008760452270508, + "rewards/margins": 5.098129749298096, + "rewards/rejected": -2.0893688201904297, + "step": 5300 + }, + { + "epoch": 1.33, + "grad_norm": 5.3764967918396, + "learning_rate": 3.2785500521644144e-07, + "logits/chosen": -0.3778533339500427, + "logits/rejected": -0.47649380564689636, + "logps/chosen": -54.71149444580078, + "logps/rejected": -77.78784942626953, + "loss": 0.8118, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.759253740310669, + "rewards/margins": 5.035546779632568, + "rewards/rejected": -2.2762935161590576, + "step": 5301 + }, + { + "epoch": 1.33, + "grad_norm": 7.730609893798828, + "learning_rate": 3.269226274131565e-07, + "logits/chosen": -0.31813615560531616, + "logits/rejected": -0.49935388565063477, + "logps/chosen": -73.06397247314453, + "logps/rejected": -87.92692565917969, + "loss": 0.6613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.899167060852051, + "rewards/margins": 5.525005340576172, + "rewards/rejected": -2.625838041305542, + "step": 5302 + }, + { + "epoch": 1.33, + "grad_norm": 8.910872459411621, + "learning_rate": 3.259915324704449e-07, + "logits/chosen": -0.34176233410835266, + "logits/rejected": -0.3922199606895447, + "logps/chosen": -61.805240631103516, + "logps/rejected": -83.87501525878906, + "loss": 0.8425, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.392773151397705, + "rewards/margins": 4.1035637855529785, + "rewards/rejected": -1.710790753364563, + "step": 5303 + }, + { + "epoch": 1.33, + "grad_norm": 3.2569327354431152, + "learning_rate": 3.2506172064391605e-07, + "logits/chosen": -0.3699200749397278, + "logits/rejected": -0.5165043473243713, + "logps/chosen": -56.019996643066406, + "logps/rejected": -82.953369140625, + "loss": 0.6082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9679646492004395, + "rewards/margins": 5.882018089294434, + "rewards/rejected": -2.914053201675415, + "step": 5304 + }, + { + "epoch": 1.33, + "grad_norm": 2.6024651527404785, + "learning_rate": 3.2413319218882245e-07, + "logits/chosen": -0.27566009759902954, + "logits/rejected": -0.3677266836166382, + "logps/chosen": -51.615760803222656, + "logps/rejected": -93.86758422851562, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0316638946533203, + "rewards/margins": 6.099665641784668, + "rewards/rejected": -3.068002223968506, + "step": 5305 + }, + { + "epoch": 1.33, + "grad_norm": 16.87981414794922, + "learning_rate": 3.232059473600641e-07, + "logits/chosen": -0.3526465892791748, + "logits/rejected": -0.4605124592781067, + "logps/chosen": -53.96773147583008, + "logps/rejected": -79.04927825927734, + "loss": 0.7285, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.201200008392334, + "rewards/margins": 5.060522556304932, + "rewards/rejected": -1.859323263168335, + "step": 5306 + }, + { + "epoch": 1.33, + "grad_norm": 15.34907054901123, + "learning_rate": 3.2227998641219117e-07, + "logits/chosen": -0.36710819602012634, + "logits/rejected": -0.4234026074409485, + "logps/chosen": -51.268802642822266, + "logps/rejected": -83.8583984375, + "loss": 0.7838, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0535244941711426, + "rewards/margins": 4.595191478729248, + "rewards/rejected": -1.5416667461395264, + "step": 5307 + }, + { + "epoch": 1.33, + "grad_norm": 3.993459939956665, + "learning_rate": 3.2135530959939767e-07, + "logits/chosen": -0.25625503063201904, + "logits/rejected": -0.39571893215179443, + "logps/chosen": -55.305458068847656, + "logps/rejected": -78.05867004394531, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0983455181121826, + "rewards/margins": 4.932924270629883, + "rewards/rejected": -1.8345783948898315, + "step": 5308 + }, + { + "epoch": 1.33, + "grad_norm": 2.9367482662200928, + "learning_rate": 3.2043191717553046e-07, + "logits/chosen": -0.3266315162181854, + "logits/rejected": -0.4613150954246521, + "logps/chosen": -48.135032653808594, + "logps/rejected": -76.0976333618164, + "loss": 0.5671, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.406536102294922, + "rewards/margins": 6.168262481689453, + "rewards/rejected": -2.761726140975952, + "step": 5309 + }, + { + "epoch": 1.33, + "grad_norm": 4.61293888092041, + "learning_rate": 3.195098093940785e-07, + "logits/chosen": -0.30623751878738403, + "logits/rejected": -0.4516443610191345, + "logps/chosen": -72.35852813720703, + "logps/rejected": -88.31546783447266, + "loss": 0.7747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.712089776992798, + "rewards/margins": 5.441349983215332, + "rewards/rejected": -2.729260206222534, + "step": 5310 + }, + { + "epoch": 1.33, + "grad_norm": 4.56569766998291, + "learning_rate": 3.1858898650818093e-07, + "logits/chosen": -0.3592323362827301, + "logits/rejected": -0.4527584910392761, + "logps/chosen": -71.02694702148438, + "logps/rejected": -85.71888732910156, + "loss": 0.789, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.739258050918579, + "rewards/margins": 4.808186054229736, + "rewards/rejected": -2.0689280033111572, + "step": 5311 + }, + { + "epoch": 1.33, + "grad_norm": 5.495945453643799, + "learning_rate": 3.1766944877062387e-07, + "logits/chosen": -0.449016809463501, + "logits/rejected": -0.5481501817703247, + "logps/chosen": -61.561004638671875, + "logps/rejected": -100.95729064941406, + "loss": 0.7682, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.812892436981201, + "rewards/margins": 5.515789985656738, + "rewards/rejected": -2.702897787094116, + "step": 5312 + }, + { + "epoch": 1.33, + "grad_norm": 3.9368858337402344, + "learning_rate": 3.1675119643383876e-07, + "logits/chosen": -0.3492213487625122, + "logits/rejected": -0.4284110963344574, + "logps/chosen": -52.347694396972656, + "logps/rejected": -98.7802505493164, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.814465284347534, + "rewards/margins": 5.23591947555542, + "rewards/rejected": -2.421454429626465, + "step": 5313 + }, + { + "epoch": 1.33, + "grad_norm": 11.375810623168945, + "learning_rate": 3.1583422974990784e-07, + "logits/chosen": -0.35675573348999023, + "logits/rejected": -0.48067033290863037, + "logps/chosen": -52.80690383911133, + "logps/rejected": -91.90174865722656, + "loss": 0.6615, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7931430339813232, + "rewards/margins": 5.27170467376709, + "rewards/rejected": -2.4785614013671875, + "step": 5314 + }, + { + "epoch": 1.33, + "grad_norm": 9.168972969055176, + "learning_rate": 3.1491854897055697e-07, + "logits/chosen": -0.34236159920692444, + "logits/rejected": -0.4697977900505066, + "logps/chosen": -64.05723571777344, + "logps/rejected": -78.79743957519531, + "loss": 0.7239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.805830955505371, + "rewards/margins": 4.578751087188721, + "rewards/rejected": -1.7729201316833496, + "step": 5315 + }, + { + "epoch": 1.33, + "grad_norm": 5.357020854949951, + "learning_rate": 3.1400415434716e-07, + "logits/chosen": -0.2583949565887451, + "logits/rejected": -0.3554877042770386, + "logps/chosen": -68.74140930175781, + "logps/rejected": -99.53125, + "loss": 0.7055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.875051975250244, + "rewards/margins": 4.687603950500488, + "rewards/rejected": -1.812551736831665, + "step": 5316 + }, + { + "epoch": 1.33, + "grad_norm": 4.158890724182129, + "learning_rate": 3.1309104613073847e-07, + "logits/chosen": -0.29637211561203003, + "logits/rejected": -0.38311564922332764, + "logps/chosen": -54.195762634277344, + "logps/rejected": -87.30015563964844, + "loss": 0.7111, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983861207962036, + "rewards/margins": 5.07145881652832, + "rewards/rejected": -2.087597131729126, + "step": 5317 + }, + { + "epoch": 1.33, + "grad_norm": 2.8425376415252686, + "learning_rate": 3.121792245719607e-07, + "logits/chosen": -0.38774943351745605, + "logits/rejected": -0.4703049063682556, + "logps/chosen": -45.61847686767578, + "logps/rejected": -66.43257141113281, + "loss": 0.6587, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.830465316772461, + "rewards/margins": 4.935036659240723, + "rewards/rejected": -2.10457181930542, + "step": 5318 + }, + { + "epoch": 1.33, + "grad_norm": 19.832618713378906, + "learning_rate": 3.112686899211409e-07, + "logits/chosen": -0.40105777978897095, + "logits/rejected": -0.523679792881012, + "logps/chosen": -54.525691986083984, + "logps/rejected": -74.37419128417969, + "loss": 0.7503, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.972033977508545, + "rewards/margins": 4.797426700592041, + "rewards/rejected": -1.8253930807113647, + "step": 5319 + }, + { + "epoch": 1.33, + "grad_norm": 3.595080852508545, + "learning_rate": 3.1035944242824077e-07, + "logits/chosen": -0.3444792628288269, + "logits/rejected": -0.5012714266777039, + "logps/chosen": -61.96660614013672, + "logps/rejected": -79.24415588378906, + "loss": 0.6751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763723850250244, + "rewards/margins": 5.5907440185546875, + "rewards/rejected": -2.827019691467285, + "step": 5320 + }, + { + "epoch": 1.33, + "grad_norm": 2.680966377258301, + "learning_rate": 3.094514823428674e-07, + "logits/chosen": -0.3353930413722992, + "logits/rejected": -0.4521635174751282, + "logps/chosen": -68.70328521728516, + "logps/rejected": -86.63584899902344, + "loss": 0.6721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0351457595825195, + "rewards/margins": 5.520437240600586, + "rewards/rejected": -2.4852914810180664, + "step": 5321 + }, + { + "epoch": 1.33, + "grad_norm": 5.632129192352295, + "learning_rate": 3.0854480991427684e-07, + "logits/chosen": -0.3149828314781189, + "logits/rejected": -0.35239848494529724, + "logps/chosen": -54.93354034423828, + "logps/rejected": -94.62997436523438, + "loss": 0.7225, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.686467170715332, + "rewards/margins": 4.616053581237793, + "rewards/rejected": -1.9295865297317505, + "step": 5322 + }, + { + "epoch": 1.33, + "grad_norm": 8.539871215820312, + "learning_rate": 3.076394253913695e-07, + "logits/chosen": -0.33426642417907715, + "logits/rejected": -0.3793657720088959, + "logps/chosen": -50.3422737121582, + "logps/rejected": -78.3782958984375, + "loss": 0.9341, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9516215324401855, + "rewards/margins": 2.9693143367767334, + "rewards/rejected": -0.017692819237709045, + "step": 5323 + }, + { + "epoch": 1.33, + "grad_norm": 4.138472080230713, + "learning_rate": 3.0673532902269323e-07, + "logits/chosen": -0.3604726195335388, + "logits/rejected": -0.46200770139694214, + "logps/chosen": -49.630165100097656, + "logps/rejected": -83.8025894165039, + "loss": 0.5709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7195606231689453, + "rewards/margins": 5.780387878417969, + "rewards/rejected": -3.0608277320861816, + "step": 5324 + }, + { + "epoch": 1.33, + "grad_norm": 7.507138729095459, + "learning_rate": 3.058325210564417e-07, + "logits/chosen": -0.44016143679618835, + "logits/rejected": -0.507685661315918, + "logps/chosen": -50.20568084716797, + "logps/rejected": -97.26988983154297, + "loss": 0.7583, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.866614818572998, + "rewards/margins": 5.243001937866211, + "rewards/rejected": -2.376386880874634, + "step": 5325 + }, + { + "epoch": 1.33, + "grad_norm": 5.4412922859191895, + "learning_rate": 3.0493100174045497e-07, + "logits/chosen": -0.29923194646835327, + "logits/rejected": -0.3985137939453125, + "logps/chosen": -66.07137298583984, + "logps/rejected": -89.22213745117188, + "loss": 0.765, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.569267749786377, + "rewards/margins": 4.774831771850586, + "rewards/rejected": -2.20556378364563, + "step": 5326 + }, + { + "epoch": 1.33, + "grad_norm": 5.0909199714660645, + "learning_rate": 3.040307713222201e-07, + "logits/chosen": -0.41878682374954224, + "logits/rejected": -0.4621633291244507, + "logps/chosen": -58.22863006591797, + "logps/rejected": -82.75077056884766, + "loss": 0.8284, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.940709352493286, + "rewards/margins": 4.382077217102051, + "rewards/rejected": -1.441367506980896, + "step": 5327 + }, + { + "epoch": 1.33, + "grad_norm": 5.565030574798584, + "learning_rate": 3.0313183004886894e-07, + "logits/chosen": -0.3730660676956177, + "logits/rejected": -0.4564056694507599, + "logps/chosen": -48.81333923339844, + "logps/rejected": -72.20789337158203, + "loss": 0.673, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0903618335723877, + "rewards/margins": 4.802890300750732, + "rewards/rejected": -1.7125284671783447, + "step": 5328 + }, + { + "epoch": 1.33, + "grad_norm": 3.5028350353240967, + "learning_rate": 3.0223417816718116e-07, + "logits/chosen": -0.26988720893859863, + "logits/rejected": -0.39339378476142883, + "logps/chosen": -47.731868743896484, + "logps/rejected": -67.4781265258789, + "loss": 0.6027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0384342670440674, + "rewards/margins": 4.465627670288086, + "rewards/rejected": -1.4271931648254395, + "step": 5329 + }, + { + "epoch": 1.33, + "grad_norm": 10.749311447143555, + "learning_rate": 3.013378159235819e-07, + "logits/chosen": -0.3623669445514679, + "logits/rejected": -0.45948326587677, + "logps/chosen": -52.7054328918457, + "logps/rejected": -84.76689147949219, + "loss": 0.793, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2573468685150146, + "rewards/margins": 4.835452079772949, + "rewards/rejected": -1.578105092048645, + "step": 5330 + }, + { + "epoch": 1.33, + "grad_norm": 4.266933917999268, + "learning_rate": 3.0044274356414105e-07, + "logits/chosen": -0.23157396912574768, + "logits/rejected": -0.4304625689983368, + "logps/chosen": -66.20073699951172, + "logps/rejected": -73.34048461914062, + "loss": 0.6934, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.036421060562134, + "rewards/margins": 4.541256904602051, + "rewards/rejected": -1.5048357248306274, + "step": 5331 + }, + { + "epoch": 1.33, + "grad_norm": 3.2403554916381836, + "learning_rate": 2.9954896133457536e-07, + "logits/chosen": -0.385398805141449, + "logits/rejected": -0.4064137637615204, + "logps/chosen": -54.67234802246094, + "logps/rejected": -101.49848937988281, + "loss": 0.6015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.95597243309021, + "rewards/margins": 5.364612579345703, + "rewards/rejected": -2.4086403846740723, + "step": 5332 + }, + { + "epoch": 1.33, + "grad_norm": 10.935784339904785, + "learning_rate": 2.9865646948024683e-07, + "logits/chosen": -0.3050939440727234, + "logits/rejected": -0.4040316045284271, + "logps/chosen": -64.48423767089844, + "logps/rejected": -90.08183288574219, + "loss": 0.7394, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8323750495910645, + "rewards/margins": 5.6132402420043945, + "rewards/rejected": -2.780864953994751, + "step": 5333 + }, + { + "epoch": 1.33, + "grad_norm": 6.777706146240234, + "learning_rate": 2.977652682461651e-07, + "logits/chosen": -0.33892080187797546, + "logits/rejected": -0.38685476779937744, + "logps/chosen": -58.85477066040039, + "logps/rejected": -84.8293685913086, + "loss": 0.7422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.814791679382324, + "rewards/margins": 4.253089904785156, + "rewards/rejected": -1.438298225402832, + "step": 5334 + }, + { + "epoch": 1.33, + "grad_norm": 4.604219913482666, + "learning_rate": 2.968753578769845e-07, + "logits/chosen": -0.321698397397995, + "logits/rejected": -0.4476991593837738, + "logps/chosen": -63.88595199584961, + "logps/rejected": -79.8121566772461, + "loss": 0.7267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0936295986175537, + "rewards/margins": 4.956349849700928, + "rewards/rejected": -1.8627197742462158, + "step": 5335 + }, + { + "epoch": 1.33, + "grad_norm": 5.016782283782959, + "learning_rate": 2.959867386170018e-07, + "logits/chosen": -0.36117613315582275, + "logits/rejected": -0.4738283157348633, + "logps/chosen": -64.88195037841797, + "logps/rejected": -87.73150634765625, + "loss": 0.7535, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8110733032226562, + "rewards/margins": 4.495946407318115, + "rewards/rejected": -1.6848726272583008, + "step": 5336 + }, + { + "epoch": 1.34, + "grad_norm": 4.325939655303955, + "learning_rate": 2.9509941071016457e-07, + "logits/chosen": -0.35971125960350037, + "logits/rejected": -0.4389512538909912, + "logps/chosen": -52.613914489746094, + "logps/rejected": -80.32879638671875, + "loss": 0.7785, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1769237518310547, + "rewards/margins": 4.957010746002197, + "rewards/rejected": -1.7800878286361694, + "step": 5337 + }, + { + "epoch": 1.34, + "grad_norm": 5.873085021972656, + "learning_rate": 2.9421337440006303e-07, + "logits/chosen": -0.4195997714996338, + "logits/rejected": -0.5704619288444519, + "logps/chosen": -60.99217987060547, + "logps/rejected": -72.0854721069336, + "loss": 0.7862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.693885087966919, + "rewards/margins": 5.335390567779541, + "rewards/rejected": -2.6415059566497803, + "step": 5338 + }, + { + "epoch": 1.34, + "grad_norm": 7.676647663116455, + "learning_rate": 2.9332862992993317e-07, + "logits/chosen": -0.34847304224967957, + "logits/rejected": -0.4808713495731354, + "logps/chosen": -50.952362060546875, + "logps/rejected": -90.48286437988281, + "loss": 0.6925, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.20015549659729, + "rewards/margins": 5.622509956359863, + "rewards/rejected": -2.422353744506836, + "step": 5339 + }, + { + "epoch": 1.34, + "grad_norm": 3.4447715282440186, + "learning_rate": 2.9244517754265624e-07, + "logits/chosen": -0.27091094851493835, + "logits/rejected": -0.46109679341316223, + "logps/chosen": -62.885406494140625, + "logps/rejected": -74.90924072265625, + "loss": 0.6453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9472410678863525, + "rewards/margins": 5.009006023406982, + "rewards/rejected": -2.0617644786834717, + "step": 5340 + }, + { + "epoch": 1.34, + "grad_norm": 14.043435096740723, + "learning_rate": 2.915630174807588e-07, + "logits/chosen": -0.41943231225013733, + "logits/rejected": -0.44879448413848877, + "logps/chosen": -51.43054962158203, + "logps/rejected": -90.95289611816406, + "loss": 0.9606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7497706413269043, + "rewards/margins": 4.678615570068359, + "rewards/rejected": -1.9288450479507446, + "step": 5341 + }, + { + "epoch": 1.34, + "grad_norm": 9.80080509185791, + "learning_rate": 2.906821499864132e-07, + "logits/chosen": -0.35593968629837036, + "logits/rejected": -0.41868841648101807, + "logps/chosen": -48.529090881347656, + "logps/rejected": -84.27596282958984, + "loss": 0.7848, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.72131085395813, + "rewards/margins": 4.742423057556152, + "rewards/rejected": -2.0211119651794434, + "step": 5342 + }, + { + "epoch": 1.34, + "grad_norm": 11.274579048156738, + "learning_rate": 2.898025753014366e-07, + "logits/chosen": -0.3303784728050232, + "logits/rejected": -0.37859296798706055, + "logps/chosen": -60.00217056274414, + "logps/rejected": -104.9088363647461, + "loss": 0.8752, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9589319229125977, + "rewards/margins": 4.716650009155273, + "rewards/rejected": -1.7577179670333862, + "step": 5343 + }, + { + "epoch": 1.34, + "grad_norm": 8.80390739440918, + "learning_rate": 2.889242936672915e-07, + "logits/chosen": -0.3488083481788635, + "logits/rejected": -0.38431084156036377, + "logps/chosen": -48.964683532714844, + "logps/rejected": -86.28316497802734, + "loss": 0.7234, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.998408317565918, + "rewards/margins": 4.944437026977539, + "rewards/rejected": -1.9460289478302002, + "step": 5344 + }, + { + "epoch": 1.34, + "grad_norm": 6.174777030944824, + "learning_rate": 2.880473053250843e-07, + "logits/chosen": -0.2704322338104248, + "logits/rejected": -0.41180315613746643, + "logps/chosen": -58.58861541748047, + "logps/rejected": -98.21092224121094, + "loss": 0.7134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5499727725982666, + "rewards/margins": 4.799795627593994, + "rewards/rejected": -2.2498230934143066, + "step": 5345 + }, + { + "epoch": 1.34, + "grad_norm": 7.573368072509766, + "learning_rate": 2.8717161051556753e-07, + "logits/chosen": -0.3628407418727875, + "logits/rejected": -0.4397665560245514, + "logps/chosen": -52.057212829589844, + "logps/rejected": -87.28295135498047, + "loss": 0.7844, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.768558979034424, + "rewards/margins": 5.156447887420654, + "rewards/rejected": -2.3878886699676514, + "step": 5346 + }, + { + "epoch": 1.34, + "grad_norm": 3.184216260910034, + "learning_rate": 2.8629720947913944e-07, + "logits/chosen": -0.3782739043235779, + "logits/rejected": -0.5025357604026794, + "logps/chosen": -52.53133010864258, + "logps/rejected": -62.529701232910156, + "loss": 0.7411, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9946887493133545, + "rewards/margins": 4.67650842666626, + "rewards/rejected": -1.6818195581436157, + "step": 5347 + }, + { + "epoch": 1.34, + "grad_norm": 5.3622612953186035, + "learning_rate": 2.8542410245583965e-07, + "logits/chosen": -0.3903678059577942, + "logits/rejected": -0.44281479716300964, + "logps/chosen": -47.612510681152344, + "logps/rejected": -77.26266479492188, + "loss": 0.7393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9994747638702393, + "rewards/margins": 3.6447184085845947, + "rewards/rejected": -0.6452431678771973, + "step": 5348 + }, + { + "epoch": 1.34, + "grad_norm": 6.257781028747559, + "learning_rate": 2.8455228968535753e-07, + "logits/chosen": -0.4073858857154846, + "logits/rejected": -0.433337539434433, + "logps/chosen": -54.657493591308594, + "logps/rejected": -93.96697998046875, + "loss": 0.8376, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9367177486419678, + "rewards/margins": 3.5028862953186035, + "rewards/rejected": -0.5661682486534119, + "step": 5349 + }, + { + "epoch": 1.34, + "grad_norm": 4.348357200622559, + "learning_rate": 2.836817714070228e-07, + "logits/chosen": -0.4263918399810791, + "logits/rejected": -0.5124253034591675, + "logps/chosen": -52.394134521484375, + "logps/rejected": -87.88343048095703, + "loss": 0.7079, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7689528465270996, + "rewards/margins": 4.922506332397461, + "rewards/rejected": -2.1535534858703613, + "step": 5350 + }, + { + "epoch": 1.34, + "grad_norm": 5.887733459472656, + "learning_rate": 2.828125478598115e-07, + "logits/chosen": -0.37936195731163025, + "logits/rejected": -0.40483352541923523, + "logps/chosen": -52.733978271484375, + "logps/rejected": -95.23578643798828, + "loss": 0.6465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.797562837600708, + "rewards/margins": 5.291518211364746, + "rewards/rejected": -2.493955135345459, + "step": 5351 + }, + { + "epoch": 1.34, + "grad_norm": 2.7298731803894043, + "learning_rate": 2.819446192823455e-07, + "logits/chosen": -0.3546076714992523, + "logits/rejected": -0.43725934624671936, + "logps/chosen": -66.94479370117188, + "logps/rejected": -97.2375717163086, + "loss": 0.6338, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9666073322296143, + "rewards/margins": 4.6107988357543945, + "rewards/rejected": -1.644191861152649, + "step": 5352 + }, + { + "epoch": 1.34, + "grad_norm": 6.126176834106445, + "learning_rate": 2.8107798591288814e-07, + "logits/chosen": -0.3606373071670532, + "logits/rejected": -0.5078292489051819, + "logps/chosen": -61.10593032836914, + "logps/rejected": -84.45702362060547, + "loss": 0.7866, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8861238956451416, + "rewards/margins": 4.710386276245117, + "rewards/rejected": -1.824262261390686, + "step": 5353 + }, + { + "epoch": 1.34, + "grad_norm": 15.548324584960938, + "learning_rate": 2.802126479893502e-07, + "logits/chosen": -0.36982887983322144, + "logits/rejected": -0.4435041844844818, + "logps/chosen": -58.43559265136719, + "logps/rejected": -77.649169921875, + "loss": 0.9514, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4067254066467285, + "rewards/margins": 4.126676082611084, + "rewards/rejected": -1.7199510335922241, + "step": 5354 + }, + { + "epoch": 1.34, + "grad_norm": 6.706528186798096, + "learning_rate": 2.7934860574928666e-07, + "logits/chosen": -0.34642329812049866, + "logits/rejected": -0.429757297039032, + "logps/chosen": -57.25123977661133, + "logps/rejected": -82.55467987060547, + "loss": 0.7683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7261157035827637, + "rewards/margins": 4.570671081542969, + "rewards/rejected": -1.844555377960205, + "step": 5355 + }, + { + "epoch": 1.34, + "grad_norm": 4.014347553253174, + "learning_rate": 2.784858594298928e-07, + "logits/chosen": -0.3770313560962677, + "logits/rejected": -0.3913889527320862, + "logps/chosen": -43.49815368652344, + "logps/rejected": -96.45616912841797, + "loss": 0.7632, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.949082851409912, + "rewards/margins": 4.6717424392700195, + "rewards/rejected": -1.7226598262786865, + "step": 5356 + }, + { + "epoch": 1.34, + "grad_norm": 5.624971866607666, + "learning_rate": 2.776244092680136e-07, + "logits/chosen": -0.35601794719696045, + "logits/rejected": -0.46101462841033936, + "logps/chosen": -45.746002197265625, + "logps/rejected": -65.58777618408203, + "loss": 0.6919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9094691276550293, + "rewards/margins": 4.057132244110107, + "rewards/rejected": -1.1476634740829468, + "step": 5357 + }, + { + "epoch": 1.34, + "grad_norm": 9.239665985107422, + "learning_rate": 2.7676425550013375e-07, + "logits/chosen": -0.3565234839916229, + "logits/rejected": -0.4297797679901123, + "logps/chosen": -70.38825225830078, + "logps/rejected": -84.03947448730469, + "loss": 0.7962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.147104263305664, + "rewards/margins": 4.013498306274414, + "rewards/rejected": -0.8663942813873291, + "step": 5358 + }, + { + "epoch": 1.34, + "grad_norm": 3.150080680847168, + "learning_rate": 2.759053983623872e-07, + "logits/chosen": -0.29110440611839294, + "logits/rejected": -0.4083898067474365, + "logps/chosen": -58.37560272216797, + "logps/rejected": -78.20486450195312, + "loss": 0.6622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.079724073410034, + "rewards/margins": 4.81301212310791, + "rewards/rejected": -1.733288049697876, + "step": 5359 + }, + { + "epoch": 1.34, + "grad_norm": 5.58649206161499, + "learning_rate": 2.7504783809054524e-07, + "logits/chosen": -0.3564615547657013, + "logits/rejected": -0.4887664318084717, + "logps/chosen": -56.503421783447266, + "logps/rejected": -76.56459045410156, + "loss": 0.6857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8590660095214844, + "rewards/margins": 5.216247081756592, + "rewards/rejected": -2.3571808338165283, + "step": 5360 + }, + { + "epoch": 1.34, + "grad_norm": 14.231647491455078, + "learning_rate": 2.74191574920028e-07, + "logits/chosen": -0.4052996039390564, + "logits/rejected": -0.4705483317375183, + "logps/chosen": -57.59083557128906, + "logps/rejected": -75.51689910888672, + "loss": 0.9589, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.700329065322876, + "rewards/margins": 4.422906875610352, + "rewards/rejected": -1.722577691078186, + "step": 5361 + }, + { + "epoch": 1.34, + "grad_norm": 12.598462104797363, + "learning_rate": 2.733366090858991e-07, + "logits/chosen": -0.34447839856147766, + "logits/rejected": -0.4453641474246979, + "logps/chosen": -54.630027770996094, + "logps/rejected": -77.2381591796875, + "loss": 0.822, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7854392528533936, + "rewards/margins": 4.627562522888184, + "rewards/rejected": -1.84212327003479, + "step": 5362 + }, + { + "epoch": 1.34, + "grad_norm": 5.244242191314697, + "learning_rate": 2.7248294082286407e-07, + "logits/chosen": -0.4086325466632843, + "logits/rejected": -0.4669455885887146, + "logps/chosen": -52.629371643066406, + "logps/rejected": -80.0615463256836, + "loss": 0.7224, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.983363389968872, + "rewards/margins": 4.378404140472412, + "rewards/rejected": -1.3950406312942505, + "step": 5363 + }, + { + "epoch": 1.34, + "grad_norm": 3.8276383876800537, + "learning_rate": 2.7163057036527386e-07, + "logits/chosen": -0.32032880187034607, + "logits/rejected": -0.4169163703918457, + "logps/chosen": -65.00946044921875, + "logps/rejected": -83.08451080322266, + "loss": 0.7317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8653602600097656, + "rewards/margins": 4.326421737670898, + "rewards/rejected": -1.4610613584518433, + "step": 5364 + }, + { + "epoch": 1.34, + "grad_norm": 3.811983585357666, + "learning_rate": 2.7077949794712177e-07, + "logits/chosen": -0.3976321518421173, + "logits/rejected": -0.4313628077507019, + "logps/chosen": -61.80938720703125, + "logps/rejected": -92.90986633300781, + "loss": 0.6725, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.177267551422119, + "rewards/margins": 5.080270290374756, + "rewards/rejected": -1.9030025005340576, + "step": 5365 + }, + { + "epoch": 1.34, + "grad_norm": 4.594098091125488, + "learning_rate": 2.6992972380204595e-07, + "logits/chosen": -0.31649094820022583, + "logits/rejected": -0.36904677748680115, + "logps/chosen": -57.17810821533203, + "logps/rejected": -87.32177734375, + "loss": 0.6741, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0442917346954346, + "rewards/margins": 4.447704792022705, + "rewards/rejected": -1.4034130573272705, + "step": 5366 + }, + { + "epoch": 1.34, + "grad_norm": 6.307675361633301, + "learning_rate": 2.690812481633287e-07, + "logits/chosen": -0.46872779726982117, + "logits/rejected": -0.50361567735672, + "logps/chosen": -56.83001708984375, + "logps/rejected": -77.41584014892578, + "loss": 0.8435, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2021961212158203, + "rewards/margins": 4.683684349060059, + "rewards/rejected": -1.4814882278442383, + "step": 5367 + }, + { + "epoch": 1.34, + "grad_norm": 4.752852439880371, + "learning_rate": 2.682340712638931e-07, + "logits/chosen": -0.4620594382286072, + "logits/rejected": -0.5163114070892334, + "logps/chosen": -59.34441375732422, + "logps/rejected": -96.71908569335938, + "loss": 0.7141, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0751771926879883, + "rewards/margins": 5.088741302490234, + "rewards/rejected": -2.013564109802246, + "step": 5368 + }, + { + "epoch": 1.34, + "grad_norm": 9.409059524536133, + "learning_rate": 2.6738819333630874e-07, + "logits/chosen": -0.40977057814598083, + "logits/rejected": -0.5018169283866882, + "logps/chosen": -50.672325134277344, + "logps/rejected": -84.37749481201172, + "loss": 0.7857, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.870540142059326, + "rewards/margins": 4.767220497131348, + "rewards/rejected": -1.8966798782348633, + "step": 5369 + }, + { + "epoch": 1.34, + "grad_norm": 5.303481578826904, + "learning_rate": 2.66543614612787e-07, + "logits/chosen": -0.37448903918266296, + "logits/rejected": -0.4294532835483551, + "logps/chosen": -59.46746826171875, + "logps/rejected": -83.05462646484375, + "loss": 0.6784, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.15042781829834, + "rewards/margins": 4.589517593383789, + "rewards/rejected": -1.4390898942947388, + "step": 5370 + }, + { + "epoch": 1.34, + "grad_norm": 5.526054382324219, + "learning_rate": 2.6570033532518304e-07, + "logits/chosen": -0.42742088437080383, + "logits/rejected": -0.4473084807395935, + "logps/chosen": -75.74959564208984, + "logps/rejected": -80.2499008178711, + "loss": 0.8004, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6164963245391846, + "rewards/margins": 4.352813720703125, + "rewards/rejected": -1.73631751537323, + "step": 5371 + }, + { + "epoch": 1.34, + "grad_norm": 3.895566701889038, + "learning_rate": 2.6485835570499496e-07, + "logits/chosen": -0.3336798846721649, + "logits/rejected": -0.4123976528644562, + "logps/chosen": -57.279781341552734, + "logps/rejected": -87.6768569946289, + "loss": 0.663, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7825372219085693, + "rewards/margins": 4.479866027832031, + "rewards/rejected": -1.697328805923462, + "step": 5372 + }, + { + "epoch": 1.34, + "grad_norm": 6.212927341461182, + "learning_rate": 2.6401767598336405e-07, + "logits/chosen": -0.32815253734588623, + "logits/rejected": -0.33372223377227783, + "logps/chosen": -58.65509796142578, + "logps/rejected": -101.92878723144531, + "loss": 0.7343, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1472530364990234, + "rewards/margins": 4.3245649337768555, + "rewards/rejected": -1.1773123741149902, + "step": 5373 + }, + { + "epoch": 1.34, + "grad_norm": 6.092564105987549, + "learning_rate": 2.631782963910762e-07, + "logits/chosen": -0.5110657215118408, + "logits/rejected": -0.6471933126449585, + "logps/chosen": -46.56058120727539, + "logps/rejected": -69.63799285888672, + "loss": 0.7116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9543607234954834, + "rewards/margins": 5.510167598724365, + "rewards/rejected": -2.5558066368103027, + "step": 5374 + }, + { + "epoch": 1.34, + "grad_norm": 4.826374053955078, + "learning_rate": 2.6234021715855874e-07, + "logits/chosen": -0.45060160756111145, + "logits/rejected": -0.563230574131012, + "logps/chosen": -51.51648712158203, + "logps/rejected": -90.73500061035156, + "loss": 0.5867, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.206049919128418, + "rewards/margins": 5.477150917053223, + "rewards/rejected": -2.2711009979248047, + "step": 5375 + }, + { + "epoch": 1.34, + "grad_norm": 10.956436157226562, + "learning_rate": 2.615034385158821e-07, + "logits/chosen": -0.36400091648101807, + "logits/rejected": -0.4442535638809204, + "logps/chosen": -51.274497985839844, + "logps/rejected": -79.54133605957031, + "loss": 0.7471, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.640028715133667, + "rewards/margins": 5.111306667327881, + "rewards/rejected": -2.4712772369384766, + "step": 5376 + }, + { + "epoch": 1.35, + "grad_norm": 6.456450462341309, + "learning_rate": 2.606679606927603e-07, + "logits/chosen": -0.2736145555973053, + "logits/rejected": -0.4042181372642517, + "logps/chosen": -77.92859649658203, + "logps/rejected": -89.57170867919922, + "loss": 0.8892, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4306445121765137, + "rewards/margins": 4.343216419219971, + "rewards/rejected": -1.9125720262527466, + "step": 5377 + }, + { + "epoch": 1.35, + "grad_norm": 7.278690814971924, + "learning_rate": 2.5983378391854887e-07, + "logits/chosen": -0.3861241638660431, + "logits/rejected": -0.4640433192253113, + "logps/chosen": -61.842628479003906, + "logps/rejected": -86.35086059570312, + "loss": 0.8014, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5848934650421143, + "rewards/margins": 4.638044834136963, + "rewards/rejected": -2.0531511306762695, + "step": 5378 + }, + { + "epoch": 1.35, + "grad_norm": 4.2486748695373535, + "learning_rate": 2.5900090842225057e-07, + "logits/chosen": -0.3757009506225586, + "logits/rejected": -0.485571950674057, + "logps/chosen": -62.076995849609375, + "logps/rejected": -88.63605499267578, + "loss": 0.6807, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9304797649383545, + "rewards/margins": 4.956001281738281, + "rewards/rejected": -2.025521755218506, + "step": 5379 + }, + { + "epoch": 1.35, + "grad_norm": 13.544671058654785, + "learning_rate": 2.581693344325048e-07, + "logits/chosen": -0.2877761125564575, + "logits/rejected": -0.30550867319107056, + "logps/chosen": -59.72235870361328, + "logps/rejected": -99.33414459228516, + "loss": 0.7496, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.895359754562378, + "rewards/margins": 4.65707540512085, + "rewards/rejected": -1.7617161273956299, + "step": 5380 + }, + { + "epoch": 1.35, + "grad_norm": 4.333946704864502, + "learning_rate": 2.5733906217759617e-07, + "logits/chosen": -0.2743059992790222, + "logits/rejected": -0.36768126487731934, + "logps/chosen": -62.59282684326172, + "logps/rejected": -79.8749771118164, + "loss": 0.7489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0712966918945312, + "rewards/margins": 4.8616228103637695, + "rewards/rejected": -1.7903252840042114, + "step": 5381 + }, + { + "epoch": 1.35, + "grad_norm": 3.9666168689727783, + "learning_rate": 2.565100918854546e-07, + "logits/chosen": -0.27831411361694336, + "logits/rejected": -0.4068938195705414, + "logps/chosen": -61.54859924316406, + "logps/rejected": -73.07762145996094, + "loss": 0.6335, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.843799352645874, + "rewards/margins": 5.033999919891357, + "rewards/rejected": -2.190200090408325, + "step": 5382 + }, + { + "epoch": 1.35, + "grad_norm": 5.3848981857299805, + "learning_rate": 2.556824237836492e-07, + "logits/chosen": -0.3405539393424988, + "logits/rejected": -0.36996009945869446, + "logps/chosen": -52.087337493896484, + "logps/rejected": -94.69951629638672, + "loss": 0.785, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7439775466918945, + "rewards/margins": 4.434529781341553, + "rewards/rejected": -1.6905521154403687, + "step": 5383 + }, + { + "epoch": 1.35, + "grad_norm": 4.232504367828369, + "learning_rate": 2.5485605809939253e-07, + "logits/chosen": -0.2561584711074829, + "logits/rejected": -0.39562684297561646, + "logps/chosen": -57.27724075317383, + "logps/rejected": -86.51686096191406, + "loss": 0.6495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.137784481048584, + "rewards/margins": 5.741124153137207, + "rewards/rejected": -2.603339910507202, + "step": 5384 + }, + { + "epoch": 1.35, + "grad_norm": 3.6446335315704346, + "learning_rate": 2.5403099505954045e-07, + "logits/chosen": -0.2918126881122589, + "logits/rejected": -0.39062920212745667, + "logps/chosen": -53.17090606689453, + "logps/rejected": -81.07234954833984, + "loss": 0.678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934990644454956, + "rewards/margins": 5.184635162353516, + "rewards/rejected": -2.2496447563171387, + "step": 5385 + }, + { + "epoch": 1.35, + "grad_norm": 2.7620162963867188, + "learning_rate": 2.5320723489058897e-07, + "logits/chosen": -0.3812396228313446, + "logits/rejected": -0.5097497701644897, + "logps/chosen": -55.21268844604492, + "logps/rejected": -87.91641235351562, + "loss": 0.5838, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8157944679260254, + "rewards/margins": 5.620054721832275, + "rewards/rejected": -2.804260015487671, + "step": 5386 + }, + { + "epoch": 1.35, + "grad_norm": 3.7282752990722656, + "learning_rate": 2.5238477781868054e-07, + "logits/chosen": -0.34804195165634155, + "logits/rejected": -0.3819957375526428, + "logps/chosen": -49.57215881347656, + "logps/rejected": -105.56194305419922, + "loss": 0.5302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7202320098876953, + "rewards/margins": 4.992413520812988, + "rewards/rejected": -2.272181987762451, + "step": 5387 + }, + { + "epoch": 1.35, + "grad_norm": 5.076578617095947, + "learning_rate": 2.5156362406959564e-07, + "logits/chosen": -0.2883521020412445, + "logits/rejected": -0.3382505476474762, + "logps/chosen": -49.885868072509766, + "logps/rejected": -88.9350814819336, + "loss": 0.8168, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.034956693649292, + "rewards/margins": 4.356960296630859, + "rewards/rejected": -1.3220036029815674, + "step": 5388 + }, + { + "epoch": 1.35, + "grad_norm": 4.148743152618408, + "learning_rate": 2.5074377386875903e-07, + "logits/chosen": -0.30835670232772827, + "logits/rejected": -0.39359796047210693, + "logps/chosen": -66.56492614746094, + "logps/rejected": -73.43460083007812, + "loss": 0.7807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2497527599334717, + "rewards/margins": 3.465296983718872, + "rewards/rejected": -0.21554425358772278, + "step": 5389 + }, + { + "epoch": 1.35, + "grad_norm": 7.104308128356934, + "learning_rate": 2.499252274412378e-07, + "logits/chosen": -0.30619341135025024, + "logits/rejected": -0.44793081283569336, + "logps/chosen": -57.7559814453125, + "logps/rejected": -77.13028717041016, + "loss": 0.7639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0368406772613525, + "rewards/margins": 5.310732841491699, + "rewards/rejected": -2.273892402648926, + "step": 5390 + }, + { + "epoch": 1.35, + "grad_norm": 7.894891262054443, + "learning_rate": 2.491079850117395e-07, + "logits/chosen": -0.3726297616958618, + "logits/rejected": -0.4370025396347046, + "logps/chosen": -58.47210693359375, + "logps/rejected": -91.7786636352539, + "loss": 0.8228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7334415912628174, + "rewards/margins": 4.555399417877197, + "rewards/rejected": -1.8219579458236694, + "step": 5391 + }, + { + "epoch": 1.35, + "grad_norm": 14.376192092895508, + "learning_rate": 2.4829204680461626e-07, + "logits/chosen": -0.4113398492336273, + "logits/rejected": -0.4402717053890228, + "logps/chosen": -51.9801139831543, + "logps/rejected": -96.74044036865234, + "loss": 0.8377, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8071327209472656, + "rewards/margins": 4.856191158294678, + "rewards/rejected": -2.049058437347412, + "step": 5392 + }, + { + "epoch": 1.35, + "grad_norm": 3.434173822402954, + "learning_rate": 2.4747741304385897e-07, + "logits/chosen": -0.3701111674308777, + "logits/rejected": -0.47145479917526245, + "logps/chosen": -56.767730712890625, + "logps/rejected": -82.42195129394531, + "loss": 0.7209, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.984649896621704, + "rewards/margins": 5.257449626922607, + "rewards/rejected": -2.2727999687194824, + "step": 5393 + }, + { + "epoch": 1.35, + "grad_norm": 4.673034191131592, + "learning_rate": 2.466640839531037e-07, + "logits/chosen": -0.39913445711135864, + "logits/rejected": -0.47529664635658264, + "logps/chosen": -61.500526428222656, + "logps/rejected": -97.4927978515625, + "loss": 0.7001, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015685796737671, + "rewards/margins": 5.077736854553223, + "rewards/rejected": -2.0620510578155518, + "step": 5394 + }, + { + "epoch": 1.35, + "grad_norm": 5.374673843383789, + "learning_rate": 2.4585205975562634e-07, + "logits/chosen": -0.36892688274383545, + "logits/rejected": -0.48340263962745667, + "logps/chosen": -50.567012786865234, + "logps/rejected": -73.70122528076172, + "loss": 0.6738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.071115255355835, + "rewards/margins": 4.301596641540527, + "rewards/rejected": -1.230481743812561, + "step": 5395 + }, + { + "epoch": 1.35, + "grad_norm": 6.4388322830200195, + "learning_rate": 2.450413406743446e-07, + "logits/chosen": -0.3106427788734436, + "logits/rejected": -0.32933658361434937, + "logps/chosen": -67.3427505493164, + "logps/rejected": -89.35553741455078, + "loss": 0.9386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7300894260406494, + "rewards/margins": 3.852219581604004, + "rewards/rejected": -1.1221301555633545, + "step": 5396 + }, + { + "epoch": 1.35, + "grad_norm": 4.411916255950928, + "learning_rate": 2.442319269318194e-07, + "logits/chosen": -0.3627622723579407, + "logits/rejected": -0.43870240449905396, + "logps/chosen": -63.08224868774414, + "logps/rejected": -84.67034912109375, + "loss": 0.87, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.809159278869629, + "rewards/margins": 4.1114397048950195, + "rewards/rejected": -1.3022809028625488, + "step": 5397 + }, + { + "epoch": 1.35, + "grad_norm": 4.225466728210449, + "learning_rate": 2.4342381875025025e-07, + "logits/chosen": -0.37113243341445923, + "logits/rejected": -0.45575425028800964, + "logps/chosen": -53.122352600097656, + "logps/rejected": -86.85746002197266, + "loss": 0.6588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.839066505432129, + "rewards/margins": 5.931288719177246, + "rewards/rejected": -3.092222213745117, + "step": 5398 + }, + { + "epoch": 1.35, + "grad_norm": 14.73202133178711, + "learning_rate": 2.4261701635148295e-07, + "logits/chosen": -0.3259935975074768, + "logits/rejected": -0.4383104741573334, + "logps/chosen": -61.51591491699219, + "logps/rejected": -83.20598602294922, + "loss": 1.0348, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.2652928829193115, + "rewards/margins": 3.4229514598846436, + "rewards/rejected": -1.157658576965332, + "step": 5399 + }, + { + "epoch": 1.35, + "grad_norm": 8.010743141174316, + "learning_rate": 2.4181151995700146e-07, + "logits/chosen": -0.3288193941116333, + "logits/rejected": -0.4499492645263672, + "logps/chosen": -54.2385368347168, + "logps/rejected": -96.43061065673828, + "loss": 0.7006, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.808201551437378, + "rewards/margins": 5.6651153564453125, + "rewards/rejected": -2.8569140434265137, + "step": 5400 + }, + { + "epoch": 1.35, + "grad_norm": 7.080630302429199, + "learning_rate": 2.4100732978793007e-07, + "logits/chosen": -0.2879900634288788, + "logits/rejected": -0.4369601011276245, + "logps/chosen": -67.6163101196289, + "logps/rejected": -83.34337615966797, + "loss": 0.9417, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5683207511901855, + "rewards/margins": 4.374482154846191, + "rewards/rejected": -1.8061611652374268, + "step": 5401 + }, + { + "epoch": 1.35, + "grad_norm": 6.628976345062256, + "learning_rate": 2.4020444606503765e-07, + "logits/chosen": -0.3254576325416565, + "logits/rejected": -0.4519486725330353, + "logps/chosen": -63.32029724121094, + "logps/rejected": -87.65116119384766, + "loss": 0.6577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7794618606567383, + "rewards/margins": 5.370588302612305, + "rewards/rejected": -2.5911264419555664, + "step": 5402 + }, + { + "epoch": 1.35, + "grad_norm": 4.619405746459961, + "learning_rate": 2.39402869008733e-07, + "logits/chosen": -0.37861883640289307, + "logits/rejected": -0.4503982365131378, + "logps/chosen": -54.57597732543945, + "logps/rejected": -86.94253540039062, + "loss": 0.7625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0370495319366455, + "rewards/margins": 5.229946136474609, + "rewards/rejected": -2.1928963661193848, + "step": 5403 + }, + { + "epoch": 1.35, + "grad_norm": 9.030653953552246, + "learning_rate": 2.386025988390678e-07, + "logits/chosen": -0.3224056363105774, + "logits/rejected": -0.4366028904914856, + "logps/chosen": -56.11296463012695, + "logps/rejected": -81.93951416015625, + "loss": 0.8248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9003400802612305, + "rewards/margins": 4.419139862060547, + "rewards/rejected": -1.518799901008606, + "step": 5404 + }, + { + "epoch": 1.35, + "grad_norm": 7.785350799560547, + "learning_rate": 2.3780363577573128e-07, + "logits/chosen": -0.3886290490627289, + "logits/rejected": -0.4580574035644531, + "logps/chosen": -50.36513137817383, + "logps/rejected": -86.48719787597656, + "loss": 0.7257, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.890162944793701, + "rewards/margins": 5.063133716583252, + "rewards/rejected": -2.1729702949523926, + "step": 5405 + }, + { + "epoch": 1.35, + "grad_norm": 11.767690658569336, + "learning_rate": 2.3700598003805587e-07, + "logits/chosen": -0.3570472300052643, + "logits/rejected": -0.408832848072052, + "logps/chosen": -54.61532974243164, + "logps/rejected": -91.74658966064453, + "loss": 0.7473, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6699776649475098, + "rewards/margins": 4.68966007232666, + "rewards/rejected": -2.0196824073791504, + "step": 5406 + }, + { + "epoch": 1.35, + "grad_norm": 7.284280300140381, + "learning_rate": 2.3620963184501744e-07, + "logits/chosen": -0.3971337676048279, + "logits/rejected": -0.4268106520175934, + "logps/chosen": -54.6817741394043, + "logps/rejected": -86.56571960449219, + "loss": 0.6821, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8321046829223633, + "rewards/margins": 4.8262715339660645, + "rewards/rejected": -1.9941668510437012, + "step": 5407 + }, + { + "epoch": 1.35, + "grad_norm": 3.519916296005249, + "learning_rate": 2.3541459141522894e-07, + "logits/chosen": -0.2697039842605591, + "logits/rejected": -0.42354655265808105, + "logps/chosen": -63.92286682128906, + "logps/rejected": -87.2629165649414, + "loss": 0.6433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9511916637420654, + "rewards/margins": 5.767764091491699, + "rewards/rejected": -2.816572666168213, + "step": 5408 + }, + { + "epoch": 1.35, + "grad_norm": 5.209696292877197, + "learning_rate": 2.346208589669474e-07, + "logits/chosen": -0.327814519405365, + "logits/rejected": -0.3959435820579529, + "logps/chosen": -60.50205993652344, + "logps/rejected": -91.01861572265625, + "loss": 0.7422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7561423778533936, + "rewards/margins": 4.143383979797363, + "rewards/rejected": -1.3872414827346802, + "step": 5409 + }, + { + "epoch": 1.35, + "grad_norm": 5.30841064453125, + "learning_rate": 2.338284347180686e-07, + "logits/chosen": -0.2962343692779541, + "logits/rejected": -0.3998875021934509, + "logps/chosen": -58.68117904663086, + "logps/rejected": -91.26913452148438, + "loss": 0.659, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.981370449066162, + "rewards/margins": 5.073409557342529, + "rewards/rejected": -2.092038631439209, + "step": 5410 + }, + { + "epoch": 1.35, + "grad_norm": 5.347233295440674, + "learning_rate": 2.3303731888612957e-07, + "logits/chosen": -0.3583758771419525, + "logits/rejected": -0.47235018014907837, + "logps/chosen": -56.52606964111328, + "logps/rejected": -81.9051284790039, + "loss": 0.7723, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0983827114105225, + "rewards/margins": 4.369838237762451, + "rewards/rejected": -1.2714552879333496, + "step": 5411 + }, + { + "epoch": 1.35, + "grad_norm": 10.840666770935059, + "learning_rate": 2.322475116883105e-07, + "logits/chosen": -0.27704179286956787, + "logits/rejected": -0.4034275412559509, + "logps/chosen": -68.82498168945312, + "logps/rejected": -81.07892608642578, + "loss": 0.9126, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.851783275604248, + "rewards/margins": 4.999489784240723, + "rewards/rejected": -2.147706985473633, + "step": 5412 + }, + { + "epoch": 1.35, + "grad_norm": 6.113429546356201, + "learning_rate": 2.314590133414285e-07, + "logits/chosen": -0.39976009726524353, + "logits/rejected": -0.4395201802253723, + "logps/chosen": -54.642494201660156, + "logps/rejected": -92.1446304321289, + "loss": 0.7529, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.126163959503174, + "rewards/margins": 4.906635284423828, + "rewards/rejected": -1.7804718017578125, + "step": 5413 + }, + { + "epoch": 1.35, + "grad_norm": 3.776596784591675, + "learning_rate": 2.3067182406194488e-07, + "logits/chosen": -0.3840981721878052, + "logits/rejected": -0.461951345205307, + "logps/chosen": -62.873451232910156, + "logps/rejected": -100.17597198486328, + "loss": 0.7379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032766342163086, + "rewards/margins": 5.381839752197266, + "rewards/rejected": -2.349073648452759, + "step": 5414 + }, + { + "epoch": 1.35, + "grad_norm": 4.786296367645264, + "learning_rate": 2.2988594406595955e-07, + "logits/chosen": -0.30365636944770813, + "logits/rejected": -0.3980827331542969, + "logps/chosen": -62.33412551879883, + "logps/rejected": -78.18290710449219, + "loss": 0.7274, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6658358573913574, + "rewards/margins": 4.587376117706299, + "rewards/rejected": -1.9215400218963623, + "step": 5415 + }, + { + "epoch": 1.35, + "grad_norm": 14.991918563842773, + "learning_rate": 2.2910137356921325e-07, + "logits/chosen": -0.42633551359176636, + "logits/rejected": -0.47341957688331604, + "logps/chosen": -54.07868576049805, + "logps/rejected": -104.47966003417969, + "loss": 0.6554, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.777432680130005, + "rewards/margins": 5.80158805847168, + "rewards/rejected": -3.0241551399230957, + "step": 5416 + }, + { + "epoch": 1.36, + "grad_norm": 5.46477746963501, + "learning_rate": 2.2831811278708816e-07, + "logits/chosen": -0.39205536246299744, + "logits/rejected": -0.540738582611084, + "logps/chosen": -55.21080780029297, + "logps/rejected": -70.1500015258789, + "loss": 0.7052, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.79882550239563, + "rewards/margins": 5.85247278213501, + "rewards/rejected": -3.0536465644836426, + "step": 5417 + }, + { + "epoch": 1.36, + "grad_norm": 10.34885311126709, + "learning_rate": 2.2753616193460505e-07, + "logits/chosen": -0.36625349521636963, + "logits/rejected": -0.4802866578102112, + "logps/chosen": -57.85074234008789, + "logps/rejected": -79.69014739990234, + "loss": 0.7604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9372363090515137, + "rewards/margins": 4.333900451660156, + "rewards/rejected": -1.3966641426086426, + "step": 5418 + }, + { + "epoch": 1.36, + "grad_norm": 4.685762882232666, + "learning_rate": 2.267555212264283e-07, + "logits/chosen": -0.3258480131626129, + "logits/rejected": -0.3406115174293518, + "logps/chosen": -63.875484466552734, + "logps/rejected": -100.61629486083984, + "loss": 0.7336, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2360641956329346, + "rewards/margins": 5.488361358642578, + "rewards/rejected": -2.2522969245910645, + "step": 5419 + }, + { + "epoch": 1.36, + "grad_norm": 3.319737672805786, + "learning_rate": 2.2597619087685874e-07, + "logits/chosen": -0.3178156614303589, + "logits/rejected": -0.4091941714286804, + "logps/chosen": -51.024070739746094, + "logps/rejected": -84.98143768310547, + "loss": 0.6696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1494805812835693, + "rewards/margins": 5.3080902099609375, + "rewards/rejected": -2.1586103439331055, + "step": 5420 + }, + { + "epoch": 1.36, + "grad_norm": 4.421566486358643, + "learning_rate": 2.251981710998402e-07, + "logits/chosen": -0.3475356101989746, + "logits/rejected": -0.46363261342048645, + "logps/chosen": -49.58607482910156, + "logps/rejected": -96.25071716308594, + "loss": 0.6367, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057673454284668, + "rewards/margins": 5.958060264587402, + "rewards/rejected": -2.9003865718841553, + "step": 5421 + }, + { + "epoch": 1.36, + "grad_norm": 3.329728364944458, + "learning_rate": 2.244214621089563e-07, + "logits/chosen": -0.3316904306411743, + "logits/rejected": -0.4370591938495636, + "logps/chosen": -56.003292083740234, + "logps/rejected": -80.70026397705078, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.013409376144409, + "rewards/margins": 5.262190818786621, + "rewards/rejected": -2.24878191947937, + "step": 5422 + }, + { + "epoch": 1.36, + "grad_norm": 4.018600940704346, + "learning_rate": 2.2364606411742874e-07, + "logits/chosen": -0.28580570220947266, + "logits/rejected": -0.36567699909210205, + "logps/chosen": -55.55710220336914, + "logps/rejected": -86.04841613769531, + "loss": 0.6679, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8855857849121094, + "rewards/margins": 5.16951847076416, + "rewards/rejected": -2.28393292427063, + "step": 5423 + }, + { + "epoch": 1.36, + "grad_norm": 4.684723377227783, + "learning_rate": 2.2287197733812383e-07, + "logits/chosen": -0.4333938956260681, + "logits/rejected": -0.47887560725212097, + "logps/chosen": -53.94074630737305, + "logps/rejected": -100.29468536376953, + "loss": 0.6891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0598764419555664, + "rewards/margins": 5.123288154602051, + "rewards/rejected": -2.0634121894836426, + "step": 5424 + }, + { + "epoch": 1.36, + "grad_norm": 6.7028303146362305, + "learning_rate": 2.2209920198354218e-07, + "logits/chosen": -0.3567613959312439, + "logits/rejected": -0.503617525100708, + "logps/chosen": -57.059425354003906, + "logps/rejected": -99.19960021972656, + "loss": 0.8187, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7456352710723877, + "rewards/margins": 6.037944793701172, + "rewards/rejected": -3.2923099994659424, + "step": 5425 + }, + { + "epoch": 1.36, + "grad_norm": 7.055622577667236, + "learning_rate": 2.2132773826582854e-07, + "logits/chosen": -0.35242295265197754, + "logits/rejected": -0.39429962635040283, + "logps/chosen": -56.97565460205078, + "logps/rejected": -93.48085021972656, + "loss": 0.7058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9324703216552734, + "rewards/margins": 4.737921237945557, + "rewards/rejected": -1.805450439453125, + "step": 5426 + }, + { + "epoch": 1.36, + "grad_norm": 4.043269634246826, + "learning_rate": 2.2055758639676684e-07, + "logits/chosen": -0.3058733344078064, + "logits/rejected": -0.3688933253288269, + "logps/chosen": -59.15842056274414, + "logps/rejected": -85.80191802978516, + "loss": 0.6535, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.911025285720825, + "rewards/margins": 5.212387561798096, + "rewards/rejected": -2.3013625144958496, + "step": 5427 + }, + { + "epoch": 1.36, + "grad_norm": 2.387237310409546, + "learning_rate": 2.197887465877796e-07, + "logits/chosen": -0.342793732881546, + "logits/rejected": -0.4555397927761078, + "logps/chosen": -55.7686653137207, + "logps/rejected": -99.35944366455078, + "loss": 0.6127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1027748584747314, + "rewards/margins": 6.392632961273193, + "rewards/rejected": -3.289858102798462, + "step": 5428 + }, + { + "epoch": 1.36, + "grad_norm": 4.253584861755371, + "learning_rate": 2.1902121904993078e-07, + "logits/chosen": -0.3663404881954193, + "logits/rejected": -0.48545145988464355, + "logps/chosen": -67.98696899414062, + "logps/rejected": -87.81092071533203, + "loss": 0.7639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7375168800354004, + "rewards/margins": 4.5943922996521, + "rewards/rejected": -1.8568754196166992, + "step": 5429 + }, + { + "epoch": 1.36, + "grad_norm": 3.8979899883270264, + "learning_rate": 2.182550039939224e-07, + "logits/chosen": -0.23194602131843567, + "logits/rejected": -0.35228487849235535, + "logps/chosen": -62.56235122680664, + "logps/rejected": -92.05509185791016, + "loss": 0.6713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7546396255493164, + "rewards/margins": 5.0733442306518555, + "rewards/rejected": -2.3187050819396973, + "step": 5430 + }, + { + "epoch": 1.36, + "grad_norm": 4.435664653778076, + "learning_rate": 2.1749010163009676e-07, + "logits/chosen": -0.36920905113220215, + "logits/rejected": -0.43027403950691223, + "logps/chosen": -50.99134826660156, + "logps/rejected": -87.40248107910156, + "loss": 0.7371, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9527063369750977, + "rewards/margins": 4.980039596557617, + "rewards/rejected": -2.0273330211639404, + "step": 5431 + }, + { + "epoch": 1.36, + "grad_norm": 4.555320739746094, + "learning_rate": 2.1672651216843698e-07, + "logits/chosen": -0.36122503876686096, + "logits/rejected": -0.3532503843307495, + "logps/chosen": -51.10702896118164, + "logps/rejected": -107.79716491699219, + "loss": 0.6838, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.999016284942627, + "rewards/margins": 5.26432991027832, + "rewards/rejected": -2.2653136253356934, + "step": 5432 + }, + { + "epoch": 1.36, + "grad_norm": 6.482509136199951, + "learning_rate": 2.1596423581856485e-07, + "logits/chosen": -0.34303149580955505, + "logits/rejected": -0.44442570209503174, + "logps/chosen": -55.20688247680664, + "logps/rejected": -95.07896423339844, + "loss": 0.6946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0588672161102295, + "rewards/margins": 5.740522861480713, + "rewards/rejected": -2.6816554069519043, + "step": 5433 + }, + { + "epoch": 1.36, + "grad_norm": 14.338342666625977, + "learning_rate": 2.1520327278974129e-07, + "logits/chosen": -0.26657429337501526, + "logits/rejected": -0.3318634331226349, + "logps/chosen": -66.0844497680664, + "logps/rejected": -86.77381134033203, + "loss": 0.8276, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.613581418991089, + "rewards/margins": 3.5360195636749268, + "rewards/rejected": -0.922438383102417, + "step": 5434 + }, + { + "epoch": 1.36, + "grad_norm": 8.634095191955566, + "learning_rate": 2.1444362329086698e-07, + "logits/chosen": -0.3717360496520996, + "logits/rejected": -0.4193895161151886, + "logps/chosen": -65.0916748046875, + "logps/rejected": -93.59503936767578, + "loss": 0.8301, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8332910537719727, + "rewards/margins": 5.010231971740723, + "rewards/rejected": -2.176940679550171, + "step": 5435 + }, + { + "epoch": 1.36, + "grad_norm": 10.439507484436035, + "learning_rate": 2.1368528753048234e-07, + "logits/chosen": -0.34183469414711, + "logits/rejected": -0.43036577105522156, + "logps/chosen": -52.84630584716797, + "logps/rejected": -93.4333724975586, + "loss": 0.7547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.131117820739746, + "rewards/margins": 5.188060760498047, + "rewards/rejected": -2.056942939758301, + "step": 5436 + }, + { + "epoch": 1.36, + "grad_norm": 5.70832633972168, + "learning_rate": 2.1292826571676695e-07, + "logits/chosen": -0.31589922308921814, + "logits/rejected": -0.3634037971496582, + "logps/chosen": -52.62833023071289, + "logps/rejected": -86.92474365234375, + "loss": 0.7722, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.142275333404541, + "rewards/margins": 5.050359725952148, + "rewards/rejected": -1.9080848693847656, + "step": 5437 + }, + { + "epoch": 1.36, + "grad_norm": 7.291353225708008, + "learning_rate": 2.12172558057539e-07, + "logits/chosen": -0.36843031644821167, + "logits/rejected": -0.41103607416152954, + "logps/chosen": -60.564231872558594, + "logps/rejected": -91.02699279785156, + "loss": 0.7665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4523210525512695, + "rewards/margins": 4.7357563972473145, + "rewards/rejected": -2.283435821533203, + "step": 5438 + }, + { + "epoch": 1.36, + "grad_norm": 7.117429256439209, + "learning_rate": 2.11418164760257e-07, + "logits/chosen": -0.3863881230354309, + "logits/rejected": -0.5267620086669922, + "logps/chosen": -51.585609436035156, + "logps/rejected": -88.9297866821289, + "loss": 0.7626, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7654805183410645, + "rewards/margins": 5.056893348693848, + "rewards/rejected": -2.2914133071899414, + "step": 5439 + }, + { + "epoch": 1.36, + "grad_norm": 3.400665521621704, + "learning_rate": 2.1066508603201862e-07, + "logits/chosen": -0.3449215888977051, + "logits/rejected": -0.3846391439437866, + "logps/chosen": -52.870548248291016, + "logps/rejected": -88.51850128173828, + "loss": 0.6733, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139835834503174, + "rewards/margins": 5.361787796020508, + "rewards/rejected": -2.221951484680176, + "step": 5440 + }, + { + "epoch": 1.36, + "grad_norm": 4.151383399963379, + "learning_rate": 2.0991332207955962e-07, + "logits/chosen": -0.3381080627441406, + "logits/rejected": -0.408742219209671, + "logps/chosen": -51.191612243652344, + "logps/rejected": -89.46131134033203, + "loss": 0.6136, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.963940382003784, + "rewards/margins": 4.905020236968994, + "rewards/rejected": -1.9410796165466309, + "step": 5441 + }, + { + "epoch": 1.36, + "grad_norm": 4.044466972351074, + "learning_rate": 2.09162873109256e-07, + "logits/chosen": -0.3647000193595886, + "logits/rejected": -0.5656012296676636, + "logps/chosen": -56.921348571777344, + "logps/rejected": -63.88274002075195, + "loss": 0.7008, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7649166584014893, + "rewards/margins": 5.329051971435547, + "rewards/rejected": -2.5641355514526367, + "step": 5442 + }, + { + "epoch": 1.36, + "grad_norm": 5.021167755126953, + "learning_rate": 2.0841373932712127e-07, + "logits/chosen": -0.28119102120399475, + "logits/rejected": -0.4214802384376526, + "logps/chosen": -61.68363952636719, + "logps/rejected": -73.42886352539062, + "loss": 0.7377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.580941677093506, + "rewards/margins": 5.411042213439941, + "rewards/rejected": -2.8301002979278564, + "step": 5443 + }, + { + "epoch": 1.36, + "grad_norm": 3.4529049396514893, + "learning_rate": 2.0766592093880934e-07, + "logits/chosen": -0.3600536584854126, + "logits/rejected": -0.5034897327423096, + "logps/chosen": -57.939579010009766, + "logps/rejected": -84.42140197753906, + "loss": 0.6272, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.922999620437622, + "rewards/margins": 5.132784843444824, + "rewards/rejected": -2.209785223007202, + "step": 5444 + }, + { + "epoch": 1.36, + "grad_norm": 8.288409233093262, + "learning_rate": 2.069194181496137e-07, + "logits/chosen": -0.3464982509613037, + "logits/rejected": -0.4005279839038849, + "logps/chosen": -65.39842224121094, + "logps/rejected": -108.81533813476562, + "loss": 0.9127, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.583007574081421, + "rewards/margins": 5.02069616317749, + "rewards/rejected": -2.4376883506774902, + "step": 5445 + }, + { + "epoch": 1.36, + "grad_norm": 3.9999518394470215, + "learning_rate": 2.0617423116446266e-07, + "logits/chosen": -0.35547101497650146, + "logits/rejected": -0.4415983259677887, + "logps/chosen": -62.7763671875, + "logps/rejected": -98.7994613647461, + "loss": 0.6516, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0379371643066406, + "rewards/margins": 5.54119348526001, + "rewards/rejected": -2.503255844116211, + "step": 5446 + }, + { + "epoch": 1.36, + "grad_norm": 7.477618217468262, + "learning_rate": 2.0543036018792873e-07, + "logits/chosen": -0.34683504700660706, + "logits/rejected": -0.3963431119918823, + "logps/chosen": -47.55909729003906, + "logps/rejected": -107.31388092041016, + "loss": 0.6641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.916780710220337, + "rewards/margins": 5.397711753845215, + "rewards/rejected": -2.480931043624878, + "step": 5447 + }, + { + "epoch": 1.36, + "grad_norm": 5.014648914337158, + "learning_rate": 2.046878054242196e-07, + "logits/chosen": -0.3468095362186432, + "logits/rejected": -0.46903425455093384, + "logps/chosen": -70.29747772216797, + "logps/rejected": -87.08771514892578, + "loss": 0.7227, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.94287371635437, + "rewards/margins": 5.340916633605957, + "rewards/rejected": -2.398043155670166, + "step": 5448 + }, + { + "epoch": 1.36, + "grad_norm": 2.719531774520874, + "learning_rate": 2.039465670771823e-07, + "logits/chosen": -0.32126885652542114, + "logits/rejected": -0.38375288248062134, + "logps/chosen": -60.48597717285156, + "logps/rejected": -96.2014389038086, + "loss": 0.6461, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9131569862365723, + "rewards/margins": 5.16243314743042, + "rewards/rejected": -2.2492761611938477, + "step": 5449 + }, + { + "epoch": 1.36, + "grad_norm": 3.601484537124634, + "learning_rate": 2.0320664535030287e-07, + "logits/chosen": -0.37955066561698914, + "logits/rejected": -0.47707879543304443, + "logps/chosen": -52.676876068115234, + "logps/rejected": -75.1375732421875, + "loss": 0.6626, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.716785192489624, + "rewards/margins": 4.862007141113281, + "rewards/rejected": -2.1452221870422363, + "step": 5450 + }, + { + "epoch": 1.36, + "grad_norm": 2.7407822608947754, + "learning_rate": 2.0246804044670553e-07, + "logits/chosen": -0.3160906732082367, + "logits/rejected": -0.3636862337589264, + "logps/chosen": -54.55377960205078, + "logps/rejected": -105.21469116210938, + "loss": 0.6222, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.862837553024292, + "rewards/margins": 4.703850746154785, + "rewards/rejected": -1.8410135507583618, + "step": 5451 + }, + { + "epoch": 1.36, + "grad_norm": 3.7538444995880127, + "learning_rate": 2.017307525691542e-07, + "logits/chosen": -0.405611515045166, + "logits/rejected": -0.42369410395622253, + "logps/chosen": -50.810279846191406, + "logps/rejected": -91.64985656738281, + "loss": 0.6353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0713419914245605, + "rewards/margins": 5.513197898864746, + "rewards/rejected": -2.4418559074401855, + "step": 5452 + }, + { + "epoch": 1.36, + "grad_norm": 6.322419166564941, + "learning_rate": 2.009947819200503e-07, + "logits/chosen": -0.4283181130886078, + "logits/rejected": -0.4597916901111603, + "logps/chosen": -46.5794677734375, + "logps/rejected": -84.14925384521484, + "loss": 0.8208, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8156545162200928, + "rewards/margins": 4.317934989929199, + "rewards/rejected": -1.5022802352905273, + "step": 5453 + }, + { + "epoch": 1.36, + "grad_norm": 9.386185646057129, + "learning_rate": 2.0026012870143273e-07, + "logits/chosen": -0.41968801617622375, + "logits/rejected": -0.5112310647964478, + "logps/chosen": -47.1270751953125, + "logps/rejected": -87.7657470703125, + "loss": 0.6818, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8360421657562256, + "rewards/margins": 4.760317325592041, + "rewards/rejected": -1.9242753982543945, + "step": 5454 + }, + { + "epoch": 1.36, + "grad_norm": 8.204751014709473, + "learning_rate": 1.995267931149797e-07, + "logits/chosen": -0.387236624956131, + "logits/rejected": -0.5116301774978638, + "logps/chosen": -62.46582794189453, + "logps/rejected": -84.08386993408203, + "loss": 0.6526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.959855318069458, + "rewards/margins": 5.438842296600342, + "rewards/rejected": -2.478986978530884, + "step": 5455 + }, + { + "epoch": 1.36, + "grad_norm": 5.986453533172607, + "learning_rate": 1.9879477536200786e-07, + "logits/chosen": -0.27895548939704895, + "logits/rejected": -0.347917765378952, + "logps/chosen": -64.89104461669922, + "logps/rejected": -86.22431945800781, + "loss": 0.7648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9623007774353027, + "rewards/margins": 4.390935897827148, + "rewards/rejected": -1.4286351203918457, + "step": 5456 + }, + { + "epoch": 1.37, + "grad_norm": 5.65924072265625, + "learning_rate": 1.980640756434732e-07, + "logits/chosen": -0.3527013957500458, + "logits/rejected": -0.47449830174446106, + "logps/chosen": -45.81023406982422, + "logps/rejected": -85.4596939086914, + "loss": 0.6877, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7492315769195557, + "rewards/margins": 5.377241611480713, + "rewards/rejected": -2.628009796142578, + "step": 5457 + }, + { + "epoch": 1.37, + "grad_norm": 5.280632019042969, + "learning_rate": 1.9733469415996576e-07, + "logits/chosen": -0.38349196314811707, + "logits/rejected": -0.37424394488334656, + "logps/chosen": -56.23662567138672, + "logps/rejected": -108.81211853027344, + "loss": 0.7701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9443929195404053, + "rewards/margins": 4.901557445526123, + "rewards/rejected": -1.9571647644042969, + "step": 5458 + }, + { + "epoch": 1.37, + "grad_norm": 11.389349937438965, + "learning_rate": 1.966066311117193e-07, + "logits/chosen": -0.35756346583366394, + "logits/rejected": -0.4484614133834839, + "logps/chosen": -61.57511901855469, + "logps/rejected": -121.93116760253906, + "loss": 0.7813, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.986607313156128, + "rewards/margins": 4.835123062133789, + "rewards/rejected": -1.8485156297683716, + "step": 5459 + }, + { + "epoch": 1.37, + "grad_norm": 3.118124008178711, + "learning_rate": 1.9587988669860113e-07, + "logits/chosen": -0.3518592417240143, + "logits/rejected": -0.4509758949279785, + "logps/chosen": -59.771663665771484, + "logps/rejected": -90.6333999633789, + "loss": 0.6537, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0982987880706787, + "rewards/margins": 5.291308403015137, + "rewards/rejected": -2.193009853363037, + "step": 5460 + }, + { + "epoch": 1.37, + "grad_norm": 5.528961181640625, + "learning_rate": 1.9515446112011894e-07, + "logits/chosen": -0.40257588028907776, + "logits/rejected": -0.44568488001823425, + "logps/chosen": -50.64595031738281, + "logps/rejected": -72.3938217163086, + "loss": 0.7688, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9128305912017822, + "rewards/margins": 3.4562671184539795, + "rewards/rejected": -0.5434365272521973, + "step": 5461 + }, + { + "epoch": 1.37, + "grad_norm": 2.0024983882904053, + "learning_rate": 1.9443035457541782e-07, + "logits/chosen": -0.36848950386047363, + "logits/rejected": -0.46735680103302, + "logps/chosen": -50.69063186645508, + "logps/rejected": -85.74730682373047, + "loss": 0.5955, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0457746982574463, + "rewards/margins": 5.802830696105957, + "rewards/rejected": -2.7570559978485107, + "step": 5462 + }, + { + "epoch": 1.37, + "grad_norm": 4.718235969543457, + "learning_rate": 1.9370756726327933e-07, + "logits/chosen": -0.3921828269958496, + "logits/rejected": -0.4307917356491089, + "logps/chosen": -54.2483024597168, + "logps/rejected": -86.6759033203125, + "loss": 0.8024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.986788749694824, + "rewards/margins": 4.393097400665283, + "rewards/rejected": -1.4063084125518799, + "step": 5463 + }, + { + "epoch": 1.37, + "grad_norm": 3.9386978149414062, + "learning_rate": 1.9298609938212641e-07, + "logits/chosen": -0.2979637086391449, + "logits/rejected": -0.3739698827266693, + "logps/chosen": -50.25705337524414, + "logps/rejected": -100.4971923828125, + "loss": 0.6062, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0736236572265625, + "rewards/margins": 4.982144832611084, + "rewards/rejected": -1.9085208177566528, + "step": 5464 + }, + { + "epoch": 1.37, + "grad_norm": 4.895791053771973, + "learning_rate": 1.9226595113001624e-07, + "logits/chosen": -0.3130401372909546, + "logits/rejected": -0.3874894380569458, + "logps/chosen": -50.725154876708984, + "logps/rejected": -95.95048522949219, + "loss": 0.6537, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8714325428009033, + "rewards/margins": 4.991580009460449, + "rewards/rejected": -2.120147943496704, + "step": 5465 + }, + { + "epoch": 1.37, + "grad_norm": 3.318598985671997, + "learning_rate": 1.915471227046445e-07, + "logits/chosen": -0.4150930643081665, + "logits/rejected": -0.4950430393218994, + "logps/chosen": -48.05177688598633, + "logps/rejected": -94.84304809570312, + "loss": 0.6277, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9899322986602783, + "rewards/margins": 5.25858211517334, + "rewards/rejected": -2.2686498165130615, + "step": 5466 + }, + { + "epoch": 1.37, + "grad_norm": 7.010021209716797, + "learning_rate": 1.908296143033467e-07, + "logits/chosen": -0.36947447061538696, + "logits/rejected": -0.5007598400115967, + "logps/chosen": -46.416561126708984, + "logps/rejected": -69.19683074951172, + "loss": 0.6961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.650801181793213, + "rewards/margins": 4.501087188720703, + "rewards/rejected": -1.8502861261367798, + "step": 5467 + }, + { + "epoch": 1.37, + "grad_norm": 5.829311370849609, + "learning_rate": 1.9011342612309248e-07, + "logits/chosen": -0.27176401019096375, + "logits/rejected": -0.3453199863433838, + "logps/chosen": -58.47492980957031, + "logps/rejected": -92.14933776855469, + "loss": 0.8716, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0680534839630127, + "rewards/margins": 4.089432239532471, + "rewards/rejected": -1.021378993988037, + "step": 5468 + }, + { + "epoch": 1.37, + "grad_norm": 7.248525619506836, + "learning_rate": 1.8939855836049293e-07, + "logits/chosen": -0.286079466342926, + "logits/rejected": -0.4141596555709839, + "logps/chosen": -56.75224304199219, + "logps/rejected": -69.19854736328125, + "loss": 0.8001, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6942427158355713, + "rewards/margins": 3.812507152557373, + "rewards/rejected": -1.11826491355896, + "step": 5469 + }, + { + "epoch": 1.37, + "grad_norm": 11.454638481140137, + "learning_rate": 1.8868501121179327e-07, + "logits/chosen": -0.30052366852760315, + "logits/rejected": -0.4142597019672394, + "logps/chosen": -56.19341278076172, + "logps/rejected": -89.55437469482422, + "loss": 0.6273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.948808193206787, + "rewards/margins": 5.247665882110596, + "rewards/rejected": -2.2988576889038086, + "step": 5470 + }, + { + "epoch": 1.37, + "grad_norm": 2.5525362491607666, + "learning_rate": 1.8797278487287796e-07, + "logits/chosen": -0.4034848213195801, + "logits/rejected": -0.5123243927955627, + "logps/chosen": -55.35200119018555, + "logps/rejected": -100.37222290039062, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946613073348999, + "rewards/margins": 5.754638671875, + "rewards/rejected": -2.8080248832702637, + "step": 5471 + }, + { + "epoch": 1.37, + "grad_norm": 10.247779846191406, + "learning_rate": 1.8726187953926833e-07, + "logits/chosen": -0.35885611176490784, + "logits/rejected": -0.4739391803741455, + "logps/chosen": -55.96293640136719, + "logps/rejected": -81.72598266601562, + "loss": 0.9462, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.874933958053589, + "rewards/margins": 4.46694278717041, + "rewards/rejected": -1.5920085906982422, + "step": 5472 + }, + { + "epoch": 1.37, + "grad_norm": 3.6739401817321777, + "learning_rate": 1.8655229540612384e-07, + "logits/chosen": -0.3526410758495331, + "logits/rejected": -0.49423569440841675, + "logps/chosen": -54.16154479980469, + "logps/rejected": -83.31498718261719, + "loss": 0.6296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0559234619140625, + "rewards/margins": 5.148001670837402, + "rewards/rejected": -2.092078447341919, + "step": 5473 + }, + { + "epoch": 1.37, + "grad_norm": 3.685885429382324, + "learning_rate": 1.8584403266824035e-07, + "logits/chosen": -0.4232044816017151, + "logits/rejected": -0.5334791541099548, + "logps/chosen": -52.527923583984375, + "logps/rejected": -71.36457061767578, + "loss": 0.6936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.859462022781372, + "rewards/margins": 4.763096332550049, + "rewards/rejected": -1.9036343097686768, + "step": 5474 + }, + { + "epoch": 1.37, + "grad_norm": 3.6910786628723145, + "learning_rate": 1.8513709152005122e-07, + "logits/chosen": -0.29234156012535095, + "logits/rejected": -0.4159396290779114, + "logps/chosen": -75.1304931640625, + "logps/rejected": -93.11521911621094, + "loss": 0.7223, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934905767440796, + "rewards/margins": 4.421838760375977, + "rewards/rejected": -1.4869328737258911, + "step": 5475 + }, + { + "epoch": 1.37, + "grad_norm": 5.518930912017822, + "learning_rate": 1.8443147215562617e-07, + "logits/chosen": -0.32725128531455994, + "logits/rejected": -0.42704519629478455, + "logps/chosen": -58.32929611206055, + "logps/rejected": -80.89741516113281, + "loss": 0.687, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8236947059631348, + "rewards/margins": 5.468596935272217, + "rewards/rejected": -2.644901990890503, + "step": 5476 + }, + { + "epoch": 1.37, + "grad_norm": 5.65593957901001, + "learning_rate": 1.837271747686753e-07, + "logits/chosen": -0.2594398558139801, + "logits/rejected": -0.401321142911911, + "logps/chosen": -59.839996337890625, + "logps/rejected": -83.78679656982422, + "loss": 0.658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9627957344055176, + "rewards/margins": 5.460927963256836, + "rewards/rejected": -2.4981324672698975, + "step": 5477 + }, + { + "epoch": 1.37, + "grad_norm": 4.4596333503723145, + "learning_rate": 1.8302419955254057e-07, + "logits/chosen": -0.402245432138443, + "logits/rejected": -0.40214550495147705, + "logps/chosen": -50.95564651489258, + "logps/rejected": -98.57341003417969, + "loss": 0.6788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2847702503204346, + "rewards/margins": 4.54441499710083, + "rewards/rejected": -1.2596445083618164, + "step": 5478 + }, + { + "epoch": 1.37, + "grad_norm": 3.3260159492492676, + "learning_rate": 1.8232254670020543e-07, + "logits/chosen": -0.36766183376312256, + "logits/rejected": -0.48017582297325134, + "logps/chosen": -61.44689178466797, + "logps/rejected": -72.02804565429688, + "loss": 0.6555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.006040096282959, + "rewards/margins": 5.055945873260498, + "rewards/rejected": -2.049906015396118, + "step": 5479 + }, + { + "epoch": 1.37, + "grad_norm": 2.73008131980896, + "learning_rate": 1.8162221640428912e-07, + "logits/chosen": -0.4064183235168457, + "logits/rejected": -0.49503618478775024, + "logps/chosen": -43.495262145996094, + "logps/rejected": -94.05245208740234, + "loss": 0.6063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.259753704071045, + "rewards/margins": 7.07350492477417, + "rewards/rejected": -3.813751220703125, + "step": 5480 + }, + { + "epoch": 1.37, + "grad_norm": 3.50858211517334, + "learning_rate": 1.8092320885704673e-07, + "logits/chosen": -0.4184646010398865, + "logits/rejected": -0.42050760984420776, + "logps/chosen": -44.92094421386719, + "logps/rejected": -77.47406005859375, + "loss": 0.668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.967797040939331, + "rewards/margins": 4.507070541381836, + "rewards/rejected": -1.5392735004425049, + "step": 5481 + }, + { + "epoch": 1.37, + "grad_norm": 3.1108450889587402, + "learning_rate": 1.8022552425037032e-07, + "logits/chosen": -0.3824059069156647, + "logits/rejected": -0.4729230999946594, + "logps/chosen": -58.714332580566406, + "logps/rejected": -89.50101470947266, + "loss": 0.7072, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.699927568435669, + "rewards/margins": 4.889347076416016, + "rewards/rejected": -2.1894190311431885, + "step": 5482 + }, + { + "epoch": 1.37, + "grad_norm": 8.868059158325195, + "learning_rate": 1.7952916277579e-07, + "logits/chosen": -0.3285615146160126, + "logits/rejected": -0.4496382474899292, + "logps/chosen": -61.839942932128906, + "logps/rejected": -73.38414001464844, + "loss": 0.7346, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.573486566543579, + "rewards/margins": 4.20090389251709, + "rewards/rejected": -1.6274173259735107, + "step": 5483 + }, + { + "epoch": 1.37, + "grad_norm": 6.379412651062012, + "learning_rate": 1.7883412462447292e-07, + "logits/chosen": -0.3575576841831207, + "logits/rejected": -0.38725730776786804, + "logps/chosen": -50.21441650390625, + "logps/rejected": -99.2619400024414, + "loss": 0.714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7886884212493896, + "rewards/margins": 5.114396572113037, + "rewards/rejected": -2.3257083892822266, + "step": 5484 + }, + { + "epoch": 1.37, + "grad_norm": 4.03902006149292, + "learning_rate": 1.7814040998722027e-07, + "logits/chosen": -0.408051460981369, + "logits/rejected": -0.4549853801727295, + "logps/chosen": -48.73728942871094, + "logps/rejected": -84.39620971679688, + "loss": 0.6391, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0153932571411133, + "rewards/margins": 5.1999101638793945, + "rewards/rejected": -2.1845171451568604, + "step": 5485 + }, + { + "epoch": 1.37, + "grad_norm": 5.936025142669678, + "learning_rate": 1.7744801905447362e-07, + "logits/chosen": -0.2921045422554016, + "logits/rejected": -0.37650182843208313, + "logps/chosen": -71.90608978271484, + "logps/rejected": -107.36949157714844, + "loss": 0.8204, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.826878786087036, + "rewards/margins": 4.635946750640869, + "rewards/rejected": -1.809067726135254, + "step": 5486 + }, + { + "epoch": 1.37, + "grad_norm": 7.163334846496582, + "learning_rate": 1.767569520163076e-07, + "logits/chosen": -0.34005796909332275, + "logits/rejected": -0.49017563462257385, + "logps/chosen": -60.7913932800293, + "logps/rejected": -67.00989532470703, + "loss": 0.7612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.915837049484253, + "rewards/margins": 4.725706100463867, + "rewards/rejected": -1.8098691701889038, + "step": 5487 + }, + { + "epoch": 1.37, + "grad_norm": 9.502790451049805, + "learning_rate": 1.760672090624349e-07, + "logits/chosen": -0.40073031187057495, + "logits/rejected": -0.4638361930847168, + "logps/chosen": -54.28371047973633, + "logps/rejected": -83.37635803222656, + "loss": 0.7311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.797996759414673, + "rewards/margins": 4.540325164794922, + "rewards/rejected": -1.7423285245895386, + "step": 5488 + }, + { + "epoch": 1.37, + "grad_norm": 11.160771369934082, + "learning_rate": 1.753787903822074e-07, + "logits/chosen": -0.31195056438446045, + "logits/rejected": -0.38249289989471436, + "logps/chosen": -68.89405059814453, + "logps/rejected": -107.94064331054688, + "loss": 0.7183, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5065770149230957, + "rewards/margins": 4.286952018737793, + "rewards/rejected": -0.7803748250007629, + "step": 5489 + }, + { + "epoch": 1.37, + "grad_norm": 11.22718334197998, + "learning_rate": 1.746916961646078e-07, + "logits/chosen": -0.342881977558136, + "logits/rejected": -0.3984268307685852, + "logps/chosen": -52.62560272216797, + "logps/rejected": -88.03407287597656, + "loss": 0.8057, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.709944725036621, + "rewards/margins": 3.6649880409240723, + "rewards/rejected": -0.9550430774688721, + "step": 5490 + }, + { + "epoch": 1.37, + "grad_norm": 8.91180419921875, + "learning_rate": 1.740059265982591e-07, + "logits/chosen": -0.3252195715904236, + "logits/rejected": -0.3410134017467499, + "logps/chosen": -63.32427215576172, + "logps/rejected": -97.81488037109375, + "loss": 1.0436, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8275856971740723, + "rewards/margins": 3.8043060302734375, + "rewards/rejected": -0.9767205715179443, + "step": 5491 + }, + { + "epoch": 1.37, + "grad_norm": 13.555784225463867, + "learning_rate": 1.733214818714213e-07, + "logits/chosen": -0.2985765039920807, + "logits/rejected": -0.3609873652458191, + "logps/chosen": -55.10993576049805, + "logps/rejected": -88.06678009033203, + "loss": 0.8469, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9390711784362793, + "rewards/margins": 3.857149839401245, + "rewards/rejected": -0.9180783629417419, + "step": 5492 + }, + { + "epoch": 1.37, + "grad_norm": 2.672271966934204, + "learning_rate": 1.7263836217198793e-07, + "logits/chosen": -0.42948317527770996, + "logits/rejected": -0.5159836411476135, + "logps/chosen": -58.66533660888672, + "logps/rejected": -98.15868377685547, + "loss": 0.7216, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1262333393096924, + "rewards/margins": 5.94555139541626, + "rewards/rejected": -2.8193182945251465, + "step": 5493 + }, + { + "epoch": 1.37, + "grad_norm": 3.1450355052948, + "learning_rate": 1.7195656768749014e-07, + "logits/chosen": -0.3682267665863037, + "logits/rejected": -0.44713401794433594, + "logps/chosen": -72.30631256103516, + "logps/rejected": -88.42665100097656, + "loss": 0.754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.093491315841675, + "rewards/margins": 5.451213836669922, + "rewards/rejected": -2.357722759246826, + "step": 5494 + }, + { + "epoch": 1.37, + "grad_norm": 3.171417236328125, + "learning_rate": 1.7127609860509597e-07, + "logits/chosen": -0.4342286288738251, + "logits/rejected": -0.503417432308197, + "logps/chosen": -53.90336608886719, + "logps/rejected": -69.46623992919922, + "loss": 0.7183, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1364905834198, + "rewards/margins": 4.161732196807861, + "rewards/rejected": -1.0252419710159302, + "step": 5495 + }, + { + "epoch": 1.37, + "grad_norm": 4.076683044433594, + "learning_rate": 1.7059695511160712e-07, + "logits/chosen": -0.335554838180542, + "logits/rejected": -0.43902528285980225, + "logps/chosen": -50.608367919921875, + "logps/rejected": -91.7290267944336, + "loss": 0.6274, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.156252145767212, + "rewards/margins": 5.599352836608887, + "rewards/rejected": -2.443101167678833, + "step": 5496 + }, + { + "epoch": 1.38, + "grad_norm": 4.54359769821167, + "learning_rate": 1.6991913739346554e-07, + "logits/chosen": -0.3248080313205719, + "logits/rejected": -0.4632381498813629, + "logps/chosen": -66.16340637207031, + "logps/rejected": -80.4909439086914, + "loss": 0.739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0516257286071777, + "rewards/margins": 5.00526237487793, + "rewards/rejected": -1.9536365270614624, + "step": 5497 + }, + { + "epoch": 1.38, + "grad_norm": 3.5591773986816406, + "learning_rate": 1.6924264563674575e-07, + "logits/chosen": -0.35859251022338867, + "logits/rejected": -0.43895673751831055, + "logps/chosen": -53.39189147949219, + "logps/rejected": -84.56387329101562, + "loss": 0.6721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.018376111984253, + "rewards/margins": 5.399598598480225, + "rewards/rejected": -2.3812220096588135, + "step": 5498 + }, + { + "epoch": 1.38, + "grad_norm": 4.892897129058838, + "learning_rate": 1.6856748002715863e-07, + "logits/chosen": -0.34412911534309387, + "logits/rejected": -0.41615521907806396, + "logps/chosen": -49.66505432128906, + "logps/rejected": -76.48963928222656, + "loss": 0.7378, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.894279956817627, + "rewards/margins": 4.262999534606934, + "rewards/rejected": -1.3687196969985962, + "step": 5499 + }, + { + "epoch": 1.38, + "grad_norm": 12.549712181091309, + "learning_rate": 1.678936407500531e-07, + "logits/chosen": -0.36053743958473206, + "logits/rejected": -0.4723602831363678, + "logps/chosen": -55.92934799194336, + "logps/rejected": -74.57243347167969, + "loss": 0.8615, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7111620903015137, + "rewards/margins": 4.5138349533081055, + "rewards/rejected": -1.802673101425171, + "step": 5500 + }, + { + "epoch": 1.38, + "grad_norm": 6.092696189880371, + "learning_rate": 1.6722112799041124e-07, + "logits/chosen": -0.41670408844947815, + "logits/rejected": -0.45707279443740845, + "logps/chosen": -43.09138488769531, + "logps/rejected": -79.85013580322266, + "loss": 0.7769, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.959719181060791, + "rewards/margins": 3.8681812286376953, + "rewards/rejected": -0.9084621667861938, + "step": 5501 + }, + { + "epoch": 1.38, + "grad_norm": 3.5569169521331787, + "learning_rate": 1.6654994193285312e-07, + "logits/chosen": -0.38783350586891174, + "logits/rejected": -0.5844363570213318, + "logps/chosen": -57.92696762084961, + "logps/rejected": -69.84294128417969, + "loss": 0.6752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0111541748046875, + "rewards/margins": 5.302587032318115, + "rewards/rejected": -2.2914326190948486, + "step": 5502 + }, + { + "epoch": 1.38, + "grad_norm": 6.530730247497559, + "learning_rate": 1.6588008276163303e-07, + "logits/chosen": -0.3612532913684845, + "logits/rejected": -0.4910144805908203, + "logps/chosen": -56.79072570800781, + "logps/rejected": -79.68424224853516, + "loss": 0.7728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.888967990875244, + "rewards/margins": 5.502648830413818, + "rewards/rejected": -2.613680601119995, + "step": 5503 + }, + { + "epoch": 1.38, + "grad_norm": 12.405558586120605, + "learning_rate": 1.6521155066064276e-07, + "logits/chosen": -0.30904096364974976, + "logits/rejected": -0.35428956151008606, + "logps/chosen": -59.166847229003906, + "logps/rejected": -84.61262512207031, + "loss": 0.7944, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6375322341918945, + "rewards/margins": 4.215686321258545, + "rewards/rejected": -1.5781540870666504, + "step": 5504 + }, + { + "epoch": 1.38, + "grad_norm": 2.144345760345459, + "learning_rate": 1.6454434581340828e-07, + "logits/chosen": -0.3302910327911377, + "logits/rejected": -0.3837769627571106, + "logps/chosen": -53.01056671142578, + "logps/rejected": -96.13475799560547, + "loss": 0.6289, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.041942596435547, + "rewards/margins": 5.1219587326049805, + "rewards/rejected": -2.0800161361694336, + "step": 5505 + }, + { + "epoch": 1.38, + "grad_norm": 5.42427921295166, + "learning_rate": 1.6387846840309195e-07, + "logits/chosen": -0.37427136301994324, + "logits/rejected": -0.473945289850235, + "logps/chosen": -53.0309944152832, + "logps/rejected": -75.10493469238281, + "loss": 0.7917, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.102764129638672, + "rewards/margins": 4.312255859375, + "rewards/rejected": -1.2094917297363281, + "step": 5506 + }, + { + "epoch": 1.38, + "grad_norm": 10.194412231445312, + "learning_rate": 1.6321391861249147e-07, + "logits/chosen": -0.390615850687027, + "logits/rejected": -0.4936121702194214, + "logps/chosen": -52.11838150024414, + "logps/rejected": -88.89440155029297, + "loss": 0.8393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6495296955108643, + "rewards/margins": 4.992708206176758, + "rewards/rejected": -2.3431782722473145, + "step": 5507 + }, + { + "epoch": 1.38, + "grad_norm": 13.609254837036133, + "learning_rate": 1.6255069662403865e-07, + "logits/chosen": -0.24951675534248352, + "logits/rejected": -0.4476064443588257, + "logps/chosen": -73.54303741455078, + "logps/rejected": -70.02664184570312, + "loss": 0.731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8696506023406982, + "rewards/margins": 5.494901180267334, + "rewards/rejected": -2.6252503395080566, + "step": 5508 + }, + { + "epoch": 1.38, + "grad_norm": 6.119132995605469, + "learning_rate": 1.6188880261980455e-07, + "logits/chosen": -0.30664312839508057, + "logits/rejected": -0.36317336559295654, + "logps/chosen": -57.9876823425293, + "logps/rejected": -77.27625274658203, + "loss": 0.8397, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1843955516815186, + "rewards/margins": 3.867663860321045, + "rewards/rejected": -0.6832687854766846, + "step": 5509 + }, + { + "epoch": 1.38, + "grad_norm": 3.2231717109680176, + "learning_rate": 1.6122823678149325e-07, + "logits/chosen": -0.382169634103775, + "logits/rejected": -0.4788026511669159, + "logps/chosen": -57.53178024291992, + "logps/rejected": -95.0033950805664, + "loss": 0.6229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0242342948913574, + "rewards/margins": 5.621466636657715, + "rewards/rejected": -2.597233295440674, + "step": 5510 + }, + { + "epoch": 1.38, + "grad_norm": 5.258654594421387, + "learning_rate": 1.605689992904419e-07, + "logits/chosen": -0.35404181480407715, + "logits/rejected": -0.4678778052330017, + "logps/chosen": -61.22235870361328, + "logps/rejected": -74.00218200683594, + "loss": 0.6922, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.842115879058838, + "rewards/margins": 4.7163286209106445, + "rewards/rejected": -1.8742132186889648, + "step": 5511 + }, + { + "epoch": 1.38, + "grad_norm": 4.3710246086120605, + "learning_rate": 1.599110903276274e-07, + "logits/chosen": -0.3223179876804352, + "logits/rejected": -0.41149717569351196, + "logps/chosen": -65.92947387695312, + "logps/rejected": -87.3962173461914, + "loss": 0.8144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9140703678131104, + "rewards/margins": 4.968326091766357, + "rewards/rejected": -2.054255485534668, + "step": 5512 + }, + { + "epoch": 1.38, + "grad_norm": 5.300741672515869, + "learning_rate": 1.5925451007365923e-07, + "logits/chosen": -0.4016638398170471, + "logits/rejected": -0.510326087474823, + "logps/chosen": -58.27086639404297, + "logps/rejected": -80.81743621826172, + "loss": 0.7328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.903150796890259, + "rewards/margins": 4.787977695465088, + "rewards/rejected": -1.884826898574829, + "step": 5513 + }, + { + "epoch": 1.38, + "grad_norm": 7.149805068969727, + "learning_rate": 1.5859925870878424e-07, + "logits/chosen": -0.33713001012802124, + "logits/rejected": -0.43005454540252686, + "logps/chosen": -57.91730499267578, + "logps/rejected": -92.9071044921875, + "loss": 0.7524, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8894829750061035, + "rewards/margins": 4.849185466766357, + "rewards/rejected": -1.959702491760254, + "step": 5514 + }, + { + "epoch": 1.38, + "grad_norm": 3.6912832260131836, + "learning_rate": 1.579453364128808e-07, + "logits/chosen": -0.3883708119392395, + "logits/rejected": -0.5015619397163391, + "logps/chosen": -57.17472457885742, + "logps/rejected": -80.036865234375, + "loss": 0.6333, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1223061084747314, + "rewards/margins": 5.011106014251709, + "rewards/rejected": -1.8888001441955566, + "step": 5515 + }, + { + "epoch": 1.38, + "grad_norm": 4.298310279846191, + "learning_rate": 1.5729274336546584e-07, + "logits/chosen": -0.3408200144767761, + "logits/rejected": -0.40025410056114197, + "logps/chosen": -47.252384185791016, + "logps/rejected": -80.05610656738281, + "loss": 0.749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8861324787139893, + "rewards/margins": 4.91579008102417, + "rewards/rejected": -2.0296573638916016, + "step": 5516 + }, + { + "epoch": 1.38, + "grad_norm": 4.41680383682251, + "learning_rate": 1.5664147974569056e-07, + "logits/chosen": -0.4199201464653015, + "logits/rejected": -0.495617538690567, + "logps/chosen": -54.701683044433594, + "logps/rejected": -80.43231964111328, + "loss": 0.7061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6121654510498047, + "rewards/margins": 5.063042640686035, + "rewards/rejected": -2.4508774280548096, + "step": 5517 + }, + { + "epoch": 1.38, + "grad_norm": 3.4956562519073486, + "learning_rate": 1.5599154573234077e-07, + "logits/chosen": -0.37133359909057617, + "logits/rejected": -0.4850139319896698, + "logps/chosen": -53.493045806884766, + "logps/rejected": -77.2416000366211, + "loss": 0.67, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9545798301696777, + "rewards/margins": 5.270975112915039, + "rewards/rejected": -2.3163950443267822, + "step": 5518 + }, + { + "epoch": 1.38, + "grad_norm": 6.272191524505615, + "learning_rate": 1.5534294150383655e-07, + "logits/chosen": -0.38193684816360474, + "logits/rejected": -0.4837648570537567, + "logps/chosen": -57.63145446777344, + "logps/rejected": -78.29246520996094, + "loss": 0.753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.666109561920166, + "rewards/margins": 4.6576056480407715, + "rewards/rejected": -1.991495966911316, + "step": 5519 + }, + { + "epoch": 1.38, + "grad_norm": 3.7547905445098877, + "learning_rate": 1.5469566723823493e-07, + "logits/chosen": -0.2998599708080292, + "logits/rejected": -0.3990474343299866, + "logps/chosen": -53.34865188598633, + "logps/rejected": -101.4347152709961, + "loss": 0.6151, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0370264053344727, + "rewards/margins": 6.255039691925049, + "rewards/rejected": -3.2180135250091553, + "step": 5520 + }, + { + "epoch": 1.38, + "grad_norm": 7.871203422546387, + "learning_rate": 1.5404972311322486e-07, + "logits/chosen": -0.3812353014945984, + "logits/rejected": -0.4351370632648468, + "logps/chosen": -53.40824508666992, + "logps/rejected": -87.15557861328125, + "loss": 0.6535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.197784900665283, + "rewards/margins": 5.574237823486328, + "rewards/rejected": -2.376452922821045, + "step": 5521 + }, + { + "epoch": 1.38, + "grad_norm": 3.365363597869873, + "learning_rate": 1.5340510930613506e-07, + "logits/chosen": -0.34721603989601135, + "logits/rejected": -0.46803343296051025, + "logps/chosen": -52.97496032714844, + "logps/rejected": -87.22240447998047, + "loss": 0.6309, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0740137100219727, + "rewards/margins": 5.399677753448486, + "rewards/rejected": -2.3256642818450928, + "step": 5522 + }, + { + "epoch": 1.38, + "grad_norm": 4.733239650726318, + "learning_rate": 1.527618259939223e-07, + "logits/chosen": -0.25121769309043884, + "logits/rejected": -0.442446768283844, + "logps/chosen": -54.7626953125, + "logps/rejected": -78.81805419921875, + "loss": 0.683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.832951068878174, + "rewards/margins": 5.673477649688721, + "rewards/rejected": -2.8405258655548096, + "step": 5523 + }, + { + "epoch": 1.38, + "grad_norm": 6.256283283233643, + "learning_rate": 1.5211987335318424e-07, + "logits/chosen": -0.3051474094390869, + "logits/rejected": -0.4464333653450012, + "logps/chosen": -65.85954284667969, + "logps/rejected": -66.27193450927734, + "loss": 0.854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.895622968673706, + "rewards/margins": 4.1730194091796875, + "rewards/rejected": -1.2773971557617188, + "step": 5524 + }, + { + "epoch": 1.38, + "grad_norm": 8.999588012695312, + "learning_rate": 1.5147925156014986e-07, + "logits/chosen": -0.3058004677295685, + "logits/rejected": -0.3673551380634308, + "logps/chosen": -63.707794189453125, + "logps/rejected": -81.58395385742188, + "loss": 0.7875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.006687641143799, + "rewards/margins": 4.357659339904785, + "rewards/rejected": -1.3509716987609863, + "step": 5525 + }, + { + "epoch": 1.38, + "grad_norm": 5.020021438598633, + "learning_rate": 1.508399607906841e-07, + "logits/chosen": -0.3456276059150696, + "logits/rejected": -0.4626268744468689, + "logps/chosen": -54.85874938964844, + "logps/rejected": -75.77960205078125, + "loss": 0.6861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9968466758728027, + "rewards/margins": 5.272762298583984, + "rewards/rejected": -2.2759156227111816, + "step": 5526 + }, + { + "epoch": 1.38, + "grad_norm": 3.393266201019287, + "learning_rate": 1.5020200122028538e-07, + "logits/chosen": -0.3766927421092987, + "logits/rejected": -0.5345139503479004, + "logps/chosen": -62.51048278808594, + "logps/rejected": -81.37014770507812, + "loss": 0.6934, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.206397533416748, + "rewards/margins": 5.446533679962158, + "rewards/rejected": -2.2401366233825684, + "step": 5527 + }, + { + "epoch": 1.38, + "grad_norm": 7.9314117431640625, + "learning_rate": 1.4956537302408703e-07, + "logits/chosen": -0.3355065882205963, + "logits/rejected": -0.44129398465156555, + "logps/chosen": -58.51856231689453, + "logps/rejected": -80.90668487548828, + "loss": 0.7853, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.788684129714966, + "rewards/margins": 4.04642391204834, + "rewards/rejected": -1.257739543914795, + "step": 5528 + }, + { + "epoch": 1.38, + "grad_norm": 6.34537410736084, + "learning_rate": 1.4893007637685863e-07, + "logits/chosen": -0.3679220974445343, + "logits/rejected": -0.4367830455303192, + "logps/chosen": -55.106224060058594, + "logps/rejected": -79.53617858886719, + "loss": 0.6912, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0563035011291504, + "rewards/margins": 4.411587238311768, + "rewards/rejected": -1.3552838563919067, + "step": 5529 + }, + { + "epoch": 1.38, + "grad_norm": 5.691547393798828, + "learning_rate": 1.4829611145300238e-07, + "logits/chosen": -0.32873964309692383, + "logits/rejected": -0.4120681583881378, + "logps/chosen": -63.190650939941406, + "logps/rejected": -104.58477783203125, + "loss": 0.8005, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.640307903289795, + "rewards/margins": 5.154511451721191, + "rewards/rejected": -2.5142037868499756, + "step": 5530 + }, + { + "epoch": 1.38, + "grad_norm": 7.027170658111572, + "learning_rate": 1.476634784265546e-07, + "logits/chosen": -0.33833077549934387, + "logits/rejected": -0.3695167899131775, + "logps/chosen": -67.5399169921875, + "logps/rejected": -87.11643981933594, + "loss": 0.8718, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7239153385162354, + "rewards/margins": 3.876774549484253, + "rewards/rejected": -1.1528594493865967, + "step": 5531 + }, + { + "epoch": 1.38, + "grad_norm": 6.888582229614258, + "learning_rate": 1.4703217747118748e-07, + "logits/chosen": -0.3730050325393677, + "logits/rejected": -0.4751608371734619, + "logps/chosen": -66.18558502197266, + "logps/rejected": -88.87138366699219, + "loss": 0.773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.872868776321411, + "rewards/margins": 5.172763347625732, + "rewards/rejected": -2.299894332885742, + "step": 5532 + }, + { + "epoch": 1.38, + "grad_norm": 5.688588619232178, + "learning_rate": 1.4640220876020572e-07, + "logits/chosen": -0.2800522446632385, + "logits/rejected": -0.3349974751472473, + "logps/chosen": -59.51701354980469, + "logps/rejected": -110.77705383300781, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1569015979766846, + "rewards/margins": 4.886274337768555, + "rewards/rejected": -1.7293728590011597, + "step": 5533 + }, + { + "epoch": 1.38, + "grad_norm": 4.369758129119873, + "learning_rate": 1.45773572466551e-07, + "logits/chosen": -0.3667362332344055, + "logits/rejected": -0.4424248933792114, + "logps/chosen": -51.69545364379883, + "logps/rejected": -81.16778564453125, + "loss": 0.8224, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8275809288024902, + "rewards/margins": 4.206396579742432, + "rewards/rejected": -1.3788152933120728, + "step": 5534 + }, + { + "epoch": 1.38, + "grad_norm": 3.3586323261260986, + "learning_rate": 1.4514626876279636e-07, + "logits/chosen": -0.3439326584339142, + "logits/rejected": -0.47015300393104553, + "logps/chosen": -60.256805419921875, + "logps/rejected": -90.76979064941406, + "loss": 0.6622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9345078468322754, + "rewards/margins": 5.317020416259766, + "rewards/rejected": -2.3825125694274902, + "step": 5535 + }, + { + "epoch": 1.38, + "grad_norm": 11.021044731140137, + "learning_rate": 1.4452029782114962e-07, + "logits/chosen": -0.28927838802337646, + "logits/rejected": -0.45920440554618835, + "logps/chosen": -73.85697174072266, + "logps/rejected": -86.02767944335938, + "loss": 0.747, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.537155866622925, + "rewards/margins": 4.565341949462891, + "rewards/rejected": -2.028186321258545, + "step": 5536 + }, + { + "epoch": 1.39, + "grad_norm": 6.486151218414307, + "learning_rate": 1.4389565981345443e-07, + "logits/chosen": -0.32044729590415955, + "logits/rejected": -0.4433131814002991, + "logps/chosen": -61.701637268066406, + "logps/rejected": -94.36437225341797, + "loss": 0.783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.714329481124878, + "rewards/margins": 5.895626544952393, + "rewards/rejected": -3.1812973022460938, + "step": 5537 + }, + { + "epoch": 1.39, + "grad_norm": 6.2699503898620605, + "learning_rate": 1.4327235491118751e-07, + "logits/chosen": -0.31583547592163086, + "logits/rejected": -0.43585604429244995, + "logps/chosen": -63.55596160888672, + "logps/rejected": -86.090576171875, + "loss": 0.8033, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.25420880317688, + "rewards/margins": 5.11680793762207, + "rewards/rejected": -1.8625990152359009, + "step": 5538 + }, + { + "epoch": 1.39, + "grad_norm": 5.2313618659973145, + "learning_rate": 1.4265038328545922e-07, + "logits/chosen": -0.402421236038208, + "logits/rejected": -0.4694213569164276, + "logps/chosen": -54.65418243408203, + "logps/rejected": -105.96219635009766, + "loss": 0.6719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.982423782348633, + "rewards/margins": 5.6908955574035645, + "rewards/rejected": -2.7084720134735107, + "step": 5539 + }, + { + "epoch": 1.39, + "grad_norm": 5.126657485961914, + "learning_rate": 1.4202974510701352e-07, + "logits/chosen": -0.3833528757095337, + "logits/rejected": -0.46444183588027954, + "logps/chosen": -47.460411071777344, + "logps/rejected": -71.20315551757812, + "loss": 0.8029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.826951026916504, + "rewards/margins": 4.2317728996276855, + "rewards/rejected": -1.4048222303390503, + "step": 5540 + }, + { + "epoch": 1.39, + "grad_norm": 11.645868301391602, + "learning_rate": 1.4141044054622966e-07, + "logits/chosen": -0.3314434289932251, + "logits/rejected": -0.36873123049736023, + "logps/chosen": -47.842445373535156, + "logps/rejected": -75.99052429199219, + "loss": 0.8467, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.88792085647583, + "rewards/margins": 4.614871025085449, + "rewards/rejected": -1.7269495725631714, + "step": 5541 + }, + { + "epoch": 1.39, + "grad_norm": 4.392060279846191, + "learning_rate": 1.4079246977312e-07, + "logits/chosen": -0.28106310963630676, + "logits/rejected": -0.4469861090183258, + "logps/chosen": -54.17384338378906, + "logps/rejected": -84.4419937133789, + "loss": 0.6332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9480748176574707, + "rewards/margins": 5.898952960968018, + "rewards/rejected": -2.950878143310547, + "step": 5542 + }, + { + "epoch": 1.39, + "grad_norm": 6.898133277893066, + "learning_rate": 1.401758329573305e-07, + "logits/chosen": -0.37471938133239746, + "logits/rejected": -0.4639493227005005, + "logps/chosen": -52.884727478027344, + "logps/rejected": -83.20750427246094, + "loss": 0.6932, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9944028854370117, + "rewards/margins": 4.6028971672058105, + "rewards/rejected": -1.6084941625595093, + "step": 5543 + }, + { + "epoch": 1.39, + "grad_norm": 5.3007330894470215, + "learning_rate": 1.3956053026814242e-07, + "logits/chosen": -0.3540341854095459, + "logits/rejected": -0.451394259929657, + "logps/chosen": -57.09303283691406, + "logps/rejected": -75.61083221435547, + "loss": 0.7143, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0928826332092285, + "rewards/margins": 4.26740026473999, + "rewards/rejected": -1.1745173931121826, + "step": 5544 + }, + { + "epoch": 1.39, + "grad_norm": 3.8565351963043213, + "learning_rate": 1.3894656187446843e-07, + "logits/chosen": -0.35290658473968506, + "logits/rejected": -0.5008606910705566, + "logps/chosen": -64.0369873046875, + "logps/rejected": -94.12602996826172, + "loss": 0.6906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9453163146972656, + "rewards/margins": 5.560054302215576, + "rewards/rejected": -2.6147379875183105, + "step": 5545 + }, + { + "epoch": 1.39, + "grad_norm": 7.439845085144043, + "learning_rate": 1.3833392794485644e-07, + "logits/chosen": -0.3731020987033844, + "logits/rejected": -0.42717477679252625, + "logps/chosen": -59.07965850830078, + "logps/rejected": -83.13580322265625, + "loss": 0.7693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.063148260116577, + "rewards/margins": 4.165898323059082, + "rewards/rejected": -1.1027500629425049, + "step": 5546 + }, + { + "epoch": 1.39, + "grad_norm": 3.310574531555176, + "learning_rate": 1.3772262864748754e-07, + "logits/chosen": -0.5197457671165466, + "logits/rejected": -0.6665461659431458, + "logps/chosen": -76.83209228515625, + "logps/rejected": -68.77503967285156, + "loss": 0.7635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.739920139312744, + "rewards/margins": 4.838360786437988, + "rewards/rejected": -2.098440170288086, + "step": 5547 + }, + { + "epoch": 1.39, + "grad_norm": 4.115659713745117, + "learning_rate": 1.371126641501763e-07, + "logits/chosen": -0.4093368947505951, + "logits/rejected": -0.5804280042648315, + "logps/chosen": -64.36900329589844, + "logps/rejected": -74.39232635498047, + "loss": 0.6896, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.004485845565796, + "rewards/margins": 5.379439353942871, + "rewards/rejected": -2.3749537467956543, + "step": 5548 + }, + { + "epoch": 1.39, + "grad_norm": 2.5204639434814453, + "learning_rate": 1.365040346203722e-07, + "logits/chosen": -0.39081281423568726, + "logits/rejected": -0.4206887185573578, + "logps/chosen": -48.55158996582031, + "logps/rejected": -103.09590148925781, + "loss": 0.6355, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2380661964416504, + "rewards/margins": 6.237135887145996, + "rewards/rejected": -2.9990696907043457, + "step": 5549 + }, + { + "epoch": 1.39, + "grad_norm": 6.9450201988220215, + "learning_rate": 1.3589674022515652e-07, + "logits/chosen": -0.2716802656650543, + "logits/rejected": -0.3657275140285492, + "logps/chosen": -64.06139373779297, + "logps/rejected": -75.4256820678711, + "loss": 0.7741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8623921871185303, + "rewards/margins": 4.076506614685059, + "rewards/rejected": -1.2141145467758179, + "step": 5550 + }, + { + "epoch": 1.39, + "grad_norm": 3.84389066696167, + "learning_rate": 1.3529078113124483e-07, + "logits/chosen": -0.2208538055419922, + "logits/rejected": -0.2999947965145111, + "logps/chosen": -56.10894775390625, + "logps/rejected": -79.53907775878906, + "loss": 0.6511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.290830612182617, + "rewards/margins": 5.093113899230957, + "rewards/rejected": -1.802283525466919, + "step": 5551 + }, + { + "epoch": 1.39, + "grad_norm": 6.613393783569336, + "learning_rate": 1.346861575049857e-07, + "logits/chosen": -0.332504540681839, + "logits/rejected": -0.4311542809009552, + "logps/chosen": -53.2960090637207, + "logps/rejected": -78.64262390136719, + "loss": 0.7626, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8035192489624023, + "rewards/margins": 4.765573978424072, + "rewards/rejected": -1.96205472946167, + "step": 5552 + }, + { + "epoch": 1.39, + "grad_norm": 7.039034843444824, + "learning_rate": 1.3408286951236083e-07, + "logits/chosen": -0.371599942445755, + "logits/rejected": -0.4607986807823181, + "logps/chosen": -62.96703338623047, + "logps/rejected": -83.6594009399414, + "loss": 0.7814, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.506528377532959, + "rewards/margins": 4.1200127601623535, + "rewards/rejected": -1.613484263420105, + "step": 5553 + }, + { + "epoch": 1.39, + "grad_norm": 4.965272903442383, + "learning_rate": 1.3348091731898715e-07, + "logits/chosen": -0.46670055389404297, + "logits/rejected": -0.5589510202407837, + "logps/chosen": -52.22751235961914, + "logps/rejected": -85.90744018554688, + "loss": 0.8181, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.122073173522949, + "rewards/margins": 4.623685359954834, + "rewards/rejected": -1.501611351966858, + "step": 5554 + }, + { + "epoch": 1.39, + "grad_norm": 1.7177108526229858, + "learning_rate": 1.3288030109011363e-07, + "logits/chosen": -0.345090389251709, + "logits/rejected": -0.4219204783439636, + "logps/chosen": -71.7518310546875, + "logps/rejected": -99.98235321044922, + "loss": 0.592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.196610450744629, + "rewards/margins": 5.940243721008301, + "rewards/rejected": -2.743633508682251, + "step": 5555 + }, + { + "epoch": 1.39, + "grad_norm": 3.3752081394195557, + "learning_rate": 1.3228102099062055e-07, + "logits/chosen": -0.38247570395469666, + "logits/rejected": -0.4493676722049713, + "logps/chosen": -61.81597900390625, + "logps/rejected": -108.53722381591797, + "loss": 0.6331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1037936210632324, + "rewards/margins": 5.92711877822876, + "rewards/rejected": -2.8233258724212646, + "step": 5556 + }, + { + "epoch": 1.39, + "grad_norm": 4.033884048461914, + "learning_rate": 1.316830771850247e-07, + "logits/chosen": -0.3179568648338318, + "logits/rejected": -0.3677613139152527, + "logps/chosen": -49.08842468261719, + "logps/rejected": -85.2767333984375, + "loss": 0.6598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0031964778900146, + "rewards/margins": 4.639743328094482, + "rewards/rejected": -1.6365466117858887, + "step": 5557 + }, + { + "epoch": 1.39, + "grad_norm": 3.7065048217773438, + "learning_rate": 1.3108646983747365e-07, + "logits/chosen": -0.34789928793907166, + "logits/rejected": -0.45920801162719727, + "logps/chosen": -58.933876037597656, + "logps/rejected": -93.03569030761719, + "loss": 0.6453, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9568188190460205, + "rewards/margins": 6.022480487823486, + "rewards/rejected": -3.065662145614624, + "step": 5558 + }, + { + "epoch": 1.39, + "grad_norm": 7.718165397644043, + "learning_rate": 1.3049119911175023e-07, + "logits/chosen": -0.37525466084480286, + "logits/rejected": -0.507867157459259, + "logps/chosen": -49.34808349609375, + "logps/rejected": -71.33533477783203, + "loss": 0.7053, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6562442779541016, + "rewards/margins": 5.555871963500977, + "rewards/rejected": -2.899627923965454, + "step": 5559 + }, + { + "epoch": 1.39, + "grad_norm": 3.017569065093994, + "learning_rate": 1.298972651712682e-07, + "logits/chosen": -0.33570754528045654, + "logits/rejected": -0.45276176929473877, + "logps/chosen": -51.600990295410156, + "logps/rejected": -100.8330307006836, + "loss": 0.5818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8323402404785156, + "rewards/margins": 6.365802764892578, + "rewards/rejected": -3.5334625244140625, + "step": 5560 + }, + { + "epoch": 1.39, + "grad_norm": 3.729761838912964, + "learning_rate": 1.2930466817907493e-07, + "logits/chosen": -0.34851181507110596, + "logits/rejected": -0.47057458758354187, + "logps/chosen": -55.8831901550293, + "logps/rejected": -83.77667236328125, + "loss": 0.6565, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0154168605804443, + "rewards/margins": 5.714842319488525, + "rewards/rejected": -2.699425458908081, + "step": 5561 + }, + { + "epoch": 1.39, + "grad_norm": 5.626158237457275, + "learning_rate": 1.2871340829785196e-07, + "logits/chosen": -0.41244393587112427, + "logits/rejected": -0.4754811227321625, + "logps/chosen": -47.43727493286133, + "logps/rejected": -68.7303466796875, + "loss": 0.7061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.980666160583496, + "rewards/margins": 4.5201826095581055, + "rewards/rejected": -1.5395164489746094, + "step": 5562 + }, + { + "epoch": 1.39, + "grad_norm": 7.646661758422852, + "learning_rate": 1.2812348568991217e-07, + "logits/chosen": -0.3244037926197052, + "logits/rejected": -0.4285508692264557, + "logps/chosen": -46.52533721923828, + "logps/rejected": -79.28907775878906, + "loss": 0.757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8136587142944336, + "rewards/margins": 4.613506317138672, + "rewards/rejected": -1.7998480796813965, + "step": 5563 + }, + { + "epoch": 1.39, + "grad_norm": 4.158573150634766, + "learning_rate": 1.2753490051720273e-07, + "logits/chosen": -0.4148327112197876, + "logits/rejected": -0.4798140525817871, + "logps/chosen": -51.40011215209961, + "logps/rejected": -89.69351959228516, + "loss": 0.791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7819442749023438, + "rewards/margins": 4.500702857971191, + "rewards/rejected": -1.718758225440979, + "step": 5564 + }, + { + "epoch": 1.39, + "grad_norm": 3.8610918521881104, + "learning_rate": 1.2694765294130273e-07, + "logits/chosen": -0.41782593727111816, + "logits/rejected": -0.5076521039009094, + "logps/chosen": -52.98543167114258, + "logps/rejected": -68.16650390625, + "loss": 0.8113, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9432973861694336, + "rewards/margins": 4.404657363891602, + "rewards/rejected": -1.461360216140747, + "step": 5565 + }, + { + "epoch": 1.39, + "grad_norm": 6.322873592376709, + "learning_rate": 1.263617431234232e-07, + "logits/chosen": -0.38526833057403564, + "logits/rejected": -0.5016353726387024, + "logps/chosen": -54.63665008544922, + "logps/rejected": -84.35250854492188, + "loss": 0.7122, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8887805938720703, + "rewards/margins": 5.018950939178467, + "rewards/rejected": -2.1301703453063965, + "step": 5566 + }, + { + "epoch": 1.39, + "grad_norm": 5.244025230407715, + "learning_rate": 1.2577717122441102e-07, + "logits/chosen": -0.39848318696022034, + "logits/rejected": -0.5216829776763916, + "logps/chosen": -51.225608825683594, + "logps/rejected": -70.81670379638672, + "loss": 0.6152, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.868112087249756, + "rewards/margins": 4.568056583404541, + "rewards/rejected": -1.699944257736206, + "step": 5567 + }, + { + "epoch": 1.39, + "grad_norm": 3.7969696521759033, + "learning_rate": 1.251939374047423e-07, + "logits/chosen": -0.4082941710948944, + "logits/rejected": -0.5510745048522949, + "logps/chosen": -55.864219665527344, + "logps/rejected": -78.89151000976562, + "loss": 0.5914, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0388145446777344, + "rewards/margins": 6.3307108879089355, + "rewards/rejected": -3.291896343231201, + "step": 5568 + }, + { + "epoch": 1.39, + "grad_norm": 6.240478992462158, + "learning_rate": 1.246120418245278e-07, + "logits/chosen": -0.3844974637031555, + "logits/rejected": -0.42179229855537415, + "logps/chosen": -54.244693756103516, + "logps/rejected": -95.16293334960938, + "loss": 0.806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7549660205841064, + "rewards/margins": 4.72498083114624, + "rewards/rejected": -1.9700149297714233, + "step": 5569 + }, + { + "epoch": 1.39, + "grad_norm": 8.44167709350586, + "learning_rate": 1.240314846435109e-07, + "logits/chosen": -0.29300186038017273, + "logits/rejected": -0.38765743374824524, + "logps/chosen": -60.65678024291992, + "logps/rejected": -78.6565170288086, + "loss": 0.8186, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5125958919525146, + "rewards/margins": 3.6747894287109375, + "rewards/rejected": -1.1621936559677124, + "step": 5570 + }, + { + "epoch": 1.39, + "grad_norm": 3.6053054332733154, + "learning_rate": 1.234522660210663e-07, + "logits/chosen": -0.4155835509300232, + "logits/rejected": -0.46497076749801636, + "logps/chosen": -47.842002868652344, + "logps/rejected": -91.83573150634766, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6747491359710693, + "rewards/margins": 4.952538013458252, + "rewards/rejected": -2.2777886390686035, + "step": 5571 + }, + { + "epoch": 1.39, + "grad_norm": 4.647337436676025, + "learning_rate": 1.2287438611620185e-07, + "logits/chosen": -0.3258465826511383, + "logits/rejected": -0.3021804988384247, + "logps/chosen": -47.37470245361328, + "logps/rejected": -84.50495147705078, + "loss": 0.6619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.131385564804077, + "rewards/margins": 3.7141947746276855, + "rewards/rejected": -0.5828090906143188, + "step": 5572 + }, + { + "epoch": 1.39, + "grad_norm": 3.9238193035125732, + "learning_rate": 1.2229784508755837e-07, + "logits/chosen": -0.3677960932254791, + "logits/rejected": -0.4392065405845642, + "logps/chosen": -53.2797966003418, + "logps/rejected": -84.9198226928711, + "loss": 0.712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.949192523956299, + "rewards/margins": 5.099151611328125, + "rewards/rejected": -2.1499593257904053, + "step": 5573 + }, + { + "epoch": 1.39, + "grad_norm": 7.10485315322876, + "learning_rate": 1.2172264309340985e-07, + "logits/chosen": -0.2864479422569275, + "logits/rejected": -0.38460293412208557, + "logps/chosen": -53.02827835083008, + "logps/rejected": -74.95919036865234, + "loss": 0.8084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8227715492248535, + "rewards/margins": 4.96424674987793, + "rewards/rejected": -2.1414756774902344, + "step": 5574 + }, + { + "epoch": 1.39, + "grad_norm": 4.154869556427002, + "learning_rate": 1.2114878029166054e-07, + "logits/chosen": -0.4177674353122711, + "logits/rejected": -0.47544944286346436, + "logps/chosen": -48.977745056152344, + "logps/rejected": -79.13453674316406, + "loss": 0.7772, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.932516574859619, + "rewards/margins": 4.481020927429199, + "rewards/rejected": -1.54850435256958, + "step": 5575 + }, + { + "epoch": 1.39, + "grad_norm": 7.869829177856445, + "learning_rate": 1.2057625683984776e-07, + "logits/chosen": -0.34082841873168945, + "logits/rejected": -0.42265021800994873, + "logps/chosen": -58.1521110534668, + "logps/rejected": -90.9290542602539, + "loss": 0.7437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0524215698242188, + "rewards/margins": 4.3596601486206055, + "rewards/rejected": -1.3072384595870972, + "step": 5576 + }, + { + "epoch": 1.4, + "grad_norm": 11.042563438415527, + "learning_rate": 1.200050728951424e-07, + "logits/chosen": -0.3780171871185303, + "logits/rejected": -0.47857439517974854, + "logps/chosen": -57.33415222167969, + "logps/rejected": -86.23432922363281, + "loss": 0.684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0528852939605713, + "rewards/margins": 5.6538405418396, + "rewards/rejected": -2.6009554862976074, + "step": 5577 + }, + { + "epoch": 1.4, + "grad_norm": 5.092554092407227, + "learning_rate": 1.1943522861434576e-07, + "logits/chosen": -0.27479737997055054, + "logits/rejected": -0.31845933198928833, + "logps/chosen": -52.774864196777344, + "logps/rejected": -87.83888244628906, + "loss": 0.7471, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0749337673187256, + "rewards/margins": 4.290194511413574, + "rewards/rejected": -1.21526038646698, + "step": 5578 + }, + { + "epoch": 1.4, + "grad_norm": 6.109897136688232, + "learning_rate": 1.1886672415389488e-07, + "logits/chosen": -0.21803972125053406, + "logits/rejected": -0.30538204312324524, + "logps/chosen": -64.73262786865234, + "logps/rejected": -82.58944702148438, + "loss": 0.8055, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5771114826202393, + "rewards/margins": 4.234585762023926, + "rewards/rejected": -1.6574742794036865, + "step": 5579 + }, + { + "epoch": 1.4, + "grad_norm": 6.9706854820251465, + "learning_rate": 1.1829955966985385e-07, + "logits/chosen": -0.3864225149154663, + "logits/rejected": -0.4697836935520172, + "logps/chosen": -59.57524490356445, + "logps/rejected": -93.58476257324219, + "loss": 0.7065, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.941239356994629, + "rewards/margins": 4.7419281005859375, + "rewards/rejected": -1.8006887435913086, + "step": 5580 + }, + { + "epoch": 1.4, + "grad_norm": 15.351337432861328, + "learning_rate": 1.1773373531792198e-07, + "logits/chosen": -0.3349497318267822, + "logits/rejected": -0.43967902660369873, + "logps/chosen": -62.104454040527344, + "logps/rejected": -84.6253433227539, + "loss": 0.7951, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.714205741882324, + "rewards/margins": 4.870151996612549, + "rewards/rejected": -2.1559464931488037, + "step": 5581 + }, + { + "epoch": 1.4, + "grad_norm": 7.467724800109863, + "learning_rate": 1.1716925125343115e-07, + "logits/chosen": -0.3858647346496582, + "logits/rejected": -0.40288808941841125, + "logps/chosen": -58.77421569824219, + "logps/rejected": -89.8550796508789, + "loss": 0.7502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.067110300064087, + "rewards/margins": 4.910597801208496, + "rewards/rejected": -1.84348726272583, + "step": 5582 + }, + { + "epoch": 1.4, + "grad_norm": 3.983741044998169, + "learning_rate": 1.1660610763134405e-07, + "logits/chosen": -0.3852522075176239, + "logits/rejected": -0.4365282952785492, + "logps/chosen": -65.3155517578125, + "logps/rejected": -97.26763916015625, + "loss": 0.7642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.08846378326416, + "rewards/margins": 5.358249664306641, + "rewards/rejected": -2.2697863578796387, + "step": 5583 + }, + { + "epoch": 1.4, + "grad_norm": 3.5046706199645996, + "learning_rate": 1.1604430460625593e-07, + "logits/chosen": -0.4019702970981598, + "logits/rejected": -0.5200439691543579, + "logps/chosen": -60.204227447509766, + "logps/rejected": -92.51678466796875, + "loss": 0.7429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2674643993377686, + "rewards/margins": 5.955260276794434, + "rewards/rejected": -2.687796115875244, + "step": 5584 + }, + { + "epoch": 1.4, + "grad_norm": 6.496713161468506, + "learning_rate": 1.154838423323934e-07, + "logits/chosen": -0.3897445797920227, + "logits/rejected": -0.4333522915840149, + "logps/chosen": -58.4136962890625, + "logps/rejected": -102.5431900024414, + "loss": 0.8193, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7515029907226562, + "rewards/margins": 4.116397857666016, + "rewards/rejected": -1.3648948669433594, + "step": 5585 + }, + { + "epoch": 1.4, + "grad_norm": 3.6216609477996826, + "learning_rate": 1.1492472096361562e-07, + "logits/chosen": -0.3004467785358429, + "logits/rejected": -0.4245733916759491, + "logps/chosen": -56.525169372558594, + "logps/rejected": -80.21977996826172, + "loss": 0.6659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9924676418304443, + "rewards/margins": 4.777771949768066, + "rewards/rejected": -1.785304307937622, + "step": 5586 + }, + { + "epoch": 1.4, + "grad_norm": 5.963567733764648, + "learning_rate": 1.143669406534137e-07, + "logits/chosen": -0.3937416076660156, + "logits/rejected": -0.4812087416648865, + "logps/chosen": -51.657386779785156, + "logps/rejected": -106.0732650756836, + "loss": 0.6649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.755258083343506, + "rewards/margins": 5.801947593688965, + "rewards/rejected": -3.046689510345459, + "step": 5587 + }, + { + "epoch": 1.4, + "grad_norm": 4.9407782554626465, + "learning_rate": 1.1381050155491013e-07, + "logits/chosen": -0.38588839769363403, + "logits/rejected": -0.43667927384376526, + "logps/chosen": -49.49853515625, + "logps/rejected": -80.00652313232422, + "loss": 0.69, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9569311141967773, + "rewards/margins": 4.474409580230713, + "rewards/rejected": -1.517478108406067, + "step": 5588 + }, + { + "epoch": 1.4, + "grad_norm": 2.8918344974517822, + "learning_rate": 1.1325540382085943e-07, + "logits/chosen": -0.3866266906261444, + "logits/rejected": -0.485591858625412, + "logps/chosen": -48.56584548950195, + "logps/rejected": -84.45390319824219, + "loss": 0.6618, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9375624656677246, + "rewards/margins": 5.475946426391602, + "rewards/rejected": -2.538383960723877, + "step": 5589 + }, + { + "epoch": 1.4, + "grad_norm": 4.500362873077393, + "learning_rate": 1.1270164760364854e-07, + "logits/chosen": -0.38080498576164246, + "logits/rejected": -0.4100058078765869, + "logps/chosen": -55.709537506103516, + "logps/rejected": -85.3497085571289, + "loss": 0.8034, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0668630599975586, + "rewards/margins": 4.558913707733154, + "rewards/rejected": -1.4920508861541748, + "step": 5590 + }, + { + "epoch": 1.4, + "grad_norm": 8.173677444458008, + "learning_rate": 1.1214923305529424e-07, + "logits/chosen": -0.30524516105651855, + "logits/rejected": -0.4046686887741089, + "logps/chosen": -59.42848587036133, + "logps/rejected": -97.6371078491211, + "loss": 0.6833, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3234100341796875, + "rewards/margins": 5.765296936035156, + "rewards/rejected": -2.441887140274048, + "step": 5591 + }, + { + "epoch": 1.4, + "grad_norm": 5.814084529876709, + "learning_rate": 1.1159816032744742e-07, + "logits/chosen": -0.3069726824760437, + "logits/rejected": -0.44717901945114136, + "logps/chosen": -69.51419830322266, + "logps/rejected": -88.31810760498047, + "loss": 0.8038, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7022621631622314, + "rewards/margins": 4.801642894744873, + "rewards/rejected": -2.0993807315826416, + "step": 5592 + }, + { + "epoch": 1.4, + "grad_norm": 6.377146244049072, + "learning_rate": 1.1104842957138873e-07, + "logits/chosen": -0.31781959533691406, + "logits/rejected": -0.39943450689315796, + "logps/chosen": -60.1321907043457, + "logps/rejected": -87.04261779785156, + "loss": 0.7296, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7107837200164795, + "rewards/margins": 4.6772661209106445, + "rewards/rejected": -1.966482400894165, + "step": 5593 + }, + { + "epoch": 1.4, + "grad_norm": 17.74050521850586, + "learning_rate": 1.1050004093803135e-07, + "logits/chosen": -0.30179861187934875, + "logits/rejected": -0.44536280632019043, + "logps/chosen": -68.90878295898438, + "logps/rejected": -103.5970458984375, + "loss": 0.9142, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.502751588821411, + "rewards/margins": 5.390352725982666, + "rewards/rejected": -2.887601375579834, + "step": 5594 + }, + { + "epoch": 1.4, + "grad_norm": 7.149720191955566, + "learning_rate": 1.0995299457792041e-07, + "logits/chosen": -0.34144553542137146, + "logits/rejected": -0.4110350012779236, + "logps/chosen": -53.6892204284668, + "logps/rejected": -78.33619689941406, + "loss": 0.9125, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.856930732727051, + "rewards/margins": 4.27305269241333, + "rewards/rejected": -1.4161220788955688, + "step": 5595 + }, + { + "epoch": 1.4, + "grad_norm": 3.467217206954956, + "learning_rate": 1.094072906412308e-07, + "logits/chosen": -0.38362231850624084, + "logits/rejected": -0.45362162590026855, + "logps/chosen": -59.2325439453125, + "logps/rejected": -93.70710754394531, + "loss": 0.7, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.220576286315918, + "rewards/margins": 5.189022064208984, + "rewards/rejected": -1.9684453010559082, + "step": 5596 + }, + { + "epoch": 1.4, + "grad_norm": 7.217944145202637, + "learning_rate": 1.0886292927777098e-07, + "logits/chosen": -0.44501328468322754, + "logits/rejected": -0.5579149723052979, + "logps/chosen": -56.15587615966797, + "logps/rejected": -87.02589416503906, + "loss": 0.8065, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7863802909851074, + "rewards/margins": 5.219267845153809, + "rewards/rejected": -2.432887077331543, + "step": 5597 + }, + { + "epoch": 1.4, + "grad_norm": 4.96474552154541, + "learning_rate": 1.0831991063697866e-07, + "logits/chosen": -0.4750388264656067, + "logits/rejected": -0.504021167755127, + "logps/chosen": -81.36042022705078, + "logps/rejected": -105.57506561279297, + "loss": 0.7223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.86757755279541, + "rewards/margins": 6.034345626831055, + "rewards/rejected": -3.166768789291382, + "step": 5598 + }, + { + "epoch": 1.4, + "grad_norm": 5.020252704620361, + "learning_rate": 1.0777823486792626e-07, + "logits/chosen": -0.30239030718803406, + "logits/rejected": -0.39661815762519836, + "logps/chosen": -55.49433898925781, + "logps/rejected": -94.945556640625, + "loss": 0.7141, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.007702350616455, + "rewards/margins": 4.726339817047119, + "rewards/rejected": -1.7186367511749268, + "step": 5599 + }, + { + "epoch": 1.4, + "grad_norm": 4.799492359161377, + "learning_rate": 1.0723790211931318e-07, + "logits/chosen": -0.41065406799316406, + "logits/rejected": -0.4070783853530884, + "logps/chosen": -47.0942268371582, + "logps/rejected": -88.938720703125, + "loss": 0.7737, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.857945442199707, + "rewards/margins": 4.025778770446777, + "rewards/rejected": -1.1678334474563599, + "step": 5600 + }, + { + "epoch": 1.4, + "grad_norm": 4.059805393218994, + "learning_rate": 1.066989125394735e-07, + "logits/chosen": -0.3114109933376312, + "logits/rejected": -0.4655874967575073, + "logps/chosen": -58.763790130615234, + "logps/rejected": -92.1328353881836, + "loss": 0.651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.813746929168701, + "rewards/margins": 4.910217761993408, + "rewards/rejected": -2.096470355987549, + "step": 5601 + }, + { + "epoch": 1.4, + "grad_norm": 5.509166240692139, + "learning_rate": 1.0616126627637169e-07, + "logits/chosen": -0.4241635799407959, + "logits/rejected": -0.5690478682518005, + "logps/chosen": -58.50018310546875, + "logps/rejected": -80.9332046508789, + "loss": 0.7747, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7791121006011963, + "rewards/margins": 5.460019111633301, + "rewards/rejected": -2.680907726287842, + "step": 5602 + }, + { + "epoch": 1.4, + "grad_norm": 5.236794948577881, + "learning_rate": 1.0562496347760354e-07, + "logits/chosen": -0.34408605098724365, + "logits/rejected": -0.4094252288341522, + "logps/chosen": -48.0628662109375, + "logps/rejected": -73.49954223632812, + "loss": 0.6717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9448037147521973, + "rewards/margins": 4.426246166229248, + "rewards/rejected": -1.4814425706863403, + "step": 5603 + }, + { + "epoch": 1.4, + "grad_norm": 9.472262382507324, + "learning_rate": 1.0509000429039462e-07, + "logits/chosen": -0.283892959356308, + "logits/rejected": -0.3616544008255005, + "logps/chosen": -67.78260803222656, + "logps/rejected": -83.94142150878906, + "loss": 1.103, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.562284469604492, + "rewards/margins": 4.520857334136963, + "rewards/rejected": -1.9585729837417603, + "step": 5604 + }, + { + "epoch": 1.4, + "grad_norm": 7.276174068450928, + "learning_rate": 1.0455638886160357e-07, + "logits/chosen": -0.33860042691230774, + "logits/rejected": -0.43100282549858093, + "logps/chosen": -51.803916931152344, + "logps/rejected": -80.54556274414062, + "loss": 0.7153, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.981743812561035, + "rewards/margins": 4.717406272888184, + "rewards/rejected": -1.7356631755828857, + "step": 5605 + }, + { + "epoch": 1.4, + "grad_norm": 2.7022860050201416, + "learning_rate": 1.0402411733771822e-07, + "logits/chosen": -0.3681512773036957, + "logits/rejected": -0.4368270933628082, + "logps/chosen": -54.60955810546875, + "logps/rejected": -94.02980041503906, + "loss": 0.6562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.866255521774292, + "rewards/margins": 5.377926826477051, + "rewards/rejected": -2.5116710662841797, + "step": 5606 + }, + { + "epoch": 1.4, + "grad_norm": 18.357675552368164, + "learning_rate": 1.0349318986486057e-07, + "logits/chosen": -0.3289407789707184, + "logits/rejected": -0.47267699241638184, + "logps/chosen": -55.01994323730469, + "logps/rejected": -87.73501586914062, + "loss": 0.5984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9338877201080322, + "rewards/margins": 5.786720275878906, + "rewards/rejected": -2.852832794189453, + "step": 5607 + }, + { + "epoch": 1.4, + "grad_norm": 3.0378568172454834, + "learning_rate": 1.0296360658878013e-07, + "logits/chosen": -0.4311257600784302, + "logits/rejected": -0.5016289353370667, + "logps/chosen": -51.433326721191406, + "logps/rejected": -90.00444030761719, + "loss": 0.6621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9908640384674072, + "rewards/margins": 4.966458320617676, + "rewards/rejected": -1.9755946397781372, + "step": 5608 + }, + { + "epoch": 1.4, + "grad_norm": 7.433017253875732, + "learning_rate": 1.0243536765485951e-07, + "logits/chosen": -0.3790479004383087, + "logits/rejected": -0.4857708513736725, + "logps/chosen": -59.21818161010742, + "logps/rejected": -72.673828125, + "loss": 0.6929, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1569712162017822, + "rewards/margins": 5.1480021476745605, + "rewards/rejected": -1.9910308122634888, + "step": 5609 + }, + { + "epoch": 1.4, + "grad_norm": 3.958683967590332, + "learning_rate": 1.0190847320811158e-07, + "logits/chosen": -0.3623228073120117, + "logits/rejected": -0.36455434560775757, + "logps/chosen": -56.71265411376953, + "logps/rejected": -109.77657318115234, + "loss": 0.7132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.817197322845459, + "rewards/margins": 5.295562744140625, + "rewards/rejected": -2.478365898132324, + "step": 5610 + }, + { + "epoch": 1.4, + "grad_norm": 8.153430938720703, + "learning_rate": 1.0138292339318012e-07, + "logits/chosen": -0.31417417526245117, + "logits/rejected": -0.3770841360092163, + "logps/chosen": -46.54603958129883, + "logps/rejected": -101.51387023925781, + "loss": 0.6412, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.20259165763855, + "rewards/margins": 5.593597412109375, + "rewards/rejected": -2.3910059928894043, + "step": 5611 + }, + { + "epoch": 1.4, + "grad_norm": 6.526782035827637, + "learning_rate": 1.0085871835434025e-07, + "logits/chosen": -0.3102855682373047, + "logits/rejected": -0.38346484303474426, + "logps/chosen": -54.20327377319336, + "logps/rejected": -97.9581527709961, + "loss": 0.6879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808288335800171, + "rewards/margins": 4.6210198402404785, + "rewards/rejected": -1.8127318620681763, + "step": 5612 + }, + { + "epoch": 1.4, + "grad_norm": 4.696977615356445, + "learning_rate": 1.0033585823549686e-07, + "logits/chosen": -0.4068138897418976, + "logits/rejected": -0.5068550705909729, + "logps/chosen": -62.04815673828125, + "logps/rejected": -87.48759460449219, + "loss": 0.734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.923306941986084, + "rewards/margins": 5.429497241973877, + "rewards/rejected": -2.506190538406372, + "step": 5613 + }, + { + "epoch": 1.4, + "grad_norm": 6.225522041320801, + "learning_rate": 9.981434318018734e-08, + "logits/chosen": -0.33527693152427673, + "logits/rejected": -0.3435729146003723, + "logps/chosen": -57.444313049316406, + "logps/rejected": -99.55107879638672, + "loss": 0.7519, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9265758991241455, + "rewards/margins": 4.381799697875977, + "rewards/rejected": -1.4552240371704102, + "step": 5614 + }, + { + "epoch": 1.4, + "grad_norm": 4.129058837890625, + "learning_rate": 9.929417333157777e-08, + "logits/chosen": -0.39068761467933655, + "logits/rejected": -0.479326456785202, + "logps/chosen": -61.05487060546875, + "logps/rejected": -88.47599029541016, + "loss": 0.6829, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.827634572982788, + "rewards/margins": 5.2837324142456055, + "rewards/rejected": -2.45609712600708, + "step": 5615 + }, + { + "epoch": 1.4, + "grad_norm": 3.187523603439331, + "learning_rate": 9.87753488324672e-08, + "logits/chosen": -0.35017433762550354, + "logits/rejected": -0.45183107256889343, + "logps/chosen": -53.250858306884766, + "logps/rejected": -84.77711486816406, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7396626472473145, + "rewards/margins": 5.260011196136475, + "rewards/rejected": -2.52034854888916, + "step": 5616 + }, + { + "epoch": 1.41, + "grad_norm": 7.187763214111328, + "learning_rate": 9.825786982528284e-08, + "logits/chosen": -0.3022352457046509, + "logits/rejected": -0.42266061902046204, + "logps/chosen": -55.936832427978516, + "logps/rejected": -79.79692840576172, + "loss": 0.735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0702195167541504, + "rewards/margins": 5.649787902832031, + "rewards/rejected": -2.57956862449646, + "step": 5617 + }, + { + "epoch": 1.41, + "grad_norm": 12.965044021606445, + "learning_rate": 9.77417364520844e-08, + "logits/chosen": -0.2706931531429291, + "logits/rejected": -0.410075306892395, + "logps/chosen": -61.73174285888672, + "logps/rejected": -86.97453308105469, + "loss": 0.6851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6268744468688965, + "rewards/margins": 5.065456867218018, + "rewards/rejected": -2.4385828971862793, + "step": 5618 + }, + { + "epoch": 1.41, + "grad_norm": 2.0132081508636475, + "learning_rate": 9.722694885456185e-08, + "logits/chosen": -0.2476121038198471, + "logits/rejected": -0.3695448935031891, + "logps/chosen": -60.293460845947266, + "logps/rejected": -88.7628402709961, + "loss": 0.6335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0018177032470703, + "rewards/margins": 5.880703449249268, + "rewards/rejected": -2.8788862228393555, + "step": 5619 + }, + { + "epoch": 1.41, + "grad_norm": 6.571269512176514, + "learning_rate": 9.671350717403605e-08, + "logits/chosen": -0.3664393126964569, + "logits/rejected": -0.44968730211257935, + "logps/chosen": -53.9419059753418, + "logps/rejected": -78.8321533203125, + "loss": 0.6936, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8698136806488037, + "rewards/margins": 4.814353942871094, + "rewards/rejected": -1.9445397853851318, + "step": 5620 + }, + { + "epoch": 1.41, + "grad_norm": 10.646882057189941, + "learning_rate": 9.62014115514559e-08, + "logits/chosen": -0.37736746668815613, + "logits/rejected": -0.522676944732666, + "logps/chosen": -63.32901382446289, + "logps/rejected": -80.8805160522461, + "loss": 0.726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.691256046295166, + "rewards/margins": 4.314977645874023, + "rewards/rejected": -1.6237213611602783, + "step": 5621 + }, + { + "epoch": 1.41, + "grad_norm": 4.657884120941162, + "learning_rate": 9.569066212740453e-08, + "logits/chosen": -0.3929070234298706, + "logits/rejected": -0.5095423460006714, + "logps/chosen": -61.7213249206543, + "logps/rejected": -94.01460266113281, + "loss": 0.7535, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0187368392944336, + "rewards/margins": 5.636966228485107, + "rewards/rejected": -2.618229627609253, + "step": 5622 + }, + { + "epoch": 1.41, + "grad_norm": 3.9238572120666504, + "learning_rate": 9.518125904209307e-08, + "logits/chosen": -0.29118889570236206, + "logits/rejected": -0.4055634140968323, + "logps/chosen": -49.90821075439453, + "logps/rejected": -84.72468566894531, + "loss": 0.6023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.882342576980591, + "rewards/margins": 4.752960205078125, + "rewards/rejected": -1.8706179857254028, + "step": 5623 + }, + { + "epoch": 1.41, + "grad_norm": 5.4946746826171875, + "learning_rate": 9.46732024353636e-08, + "logits/chosen": -0.32745662331581116, + "logits/rejected": -0.4445440471172333, + "logps/chosen": -65.23977661132812, + "logps/rejected": -92.63227844238281, + "loss": 0.7047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.070011854171753, + "rewards/margins": 4.692769527435303, + "rewards/rejected": -1.622757911682129, + "step": 5624 + }, + { + "epoch": 1.41, + "grad_norm": 2.292241096496582, + "learning_rate": 9.41664924466884e-08, + "logits/chosen": -0.26826393604278564, + "logits/rejected": -0.30714523792266846, + "logps/chosen": -68.5290298461914, + "logps/rejected": -95.69171142578125, + "loss": 0.6696, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.770503520965576, + "rewards/margins": 5.08917236328125, + "rewards/rejected": -2.318669319152832, + "step": 5625 + }, + { + "epoch": 1.41, + "grad_norm": 4.022078990936279, + "learning_rate": 9.366112921517124e-08, + "logits/chosen": -0.33341458439826965, + "logits/rejected": -0.428308367729187, + "logps/chosen": -49.920082092285156, + "logps/rejected": -70.12541198730469, + "loss": 0.6899, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8898696899414062, + "rewards/margins": 4.816653251647949, + "rewards/rejected": -1.9267836809158325, + "step": 5626 + }, + { + "epoch": 1.41, + "grad_norm": 4.960089206695557, + "learning_rate": 9.315711287954387e-08, + "logits/chosen": -0.25461116433143616, + "logits/rejected": -0.426859050989151, + "logps/chosen": -64.1513671875, + "logps/rejected": -77.37528991699219, + "loss": 0.7392, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.844430685043335, + "rewards/margins": 4.709308624267578, + "rewards/rejected": -1.8648778200149536, + "step": 5627 + }, + { + "epoch": 1.41, + "grad_norm": 4.499983787536621, + "learning_rate": 9.265444357817066e-08, + "logits/chosen": -0.38465416431427, + "logits/rejected": -0.4684484601020813, + "logps/chosen": -52.72764205932617, + "logps/rejected": -81.38737487792969, + "loss": 0.6932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.940183162689209, + "rewards/margins": 4.899177551269531, + "rewards/rejected": -1.9589946269989014, + "step": 5628 + }, + { + "epoch": 1.41, + "grad_norm": 7.792544841766357, + "learning_rate": 9.215312144904509e-08, + "logits/chosen": -0.37159574031829834, + "logits/rejected": -0.4771915078163147, + "logps/chosen": -46.30004119873047, + "logps/rejected": -72.55816650390625, + "loss": 0.6657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8097124099731445, + "rewards/margins": 5.179606914520264, + "rewards/rejected": -2.36989426612854, + "step": 5629 + }, + { + "epoch": 1.41, + "grad_norm": 4.889907360076904, + "learning_rate": 9.165314662978986e-08, + "logits/chosen": -0.40106001496315, + "logits/rejected": -0.5014258027076721, + "logps/chosen": -58.593021392822266, + "logps/rejected": -75.39522552490234, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.152352809906006, + "rewards/margins": 5.356324195861816, + "rewards/rejected": -2.2039713859558105, + "step": 5630 + }, + { + "epoch": 1.41, + "grad_norm": 10.352517127990723, + "learning_rate": 9.115451925766017e-08, + "logits/chosen": -0.3342224955558777, + "logits/rejected": -0.48224031925201416, + "logps/chosen": -62.43229675292969, + "logps/rejected": -80.73796081542969, + "loss": 0.9307, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.672694206237793, + "rewards/margins": 4.733973979949951, + "rewards/rejected": -2.061279535293579, + "step": 5631 + }, + { + "epoch": 1.41, + "grad_norm": 4.23386287689209, + "learning_rate": 9.065723946953986e-08, + "logits/chosen": -0.33055660128593445, + "logits/rejected": -0.4940093755722046, + "logps/chosen": -63.843971252441406, + "logps/rejected": -80.3789291381836, + "loss": 0.5787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.965491771697998, + "rewards/margins": 5.622580051422119, + "rewards/rejected": -2.6570885181427, + "step": 5632 + }, + { + "epoch": 1.41, + "grad_norm": 10.401312828063965, + "learning_rate": 9.016130740194196e-08, + "logits/chosen": -0.3361123204231262, + "logits/rejected": -0.47732973098754883, + "logps/chosen": -61.39247512817383, + "logps/rejected": -75.42057800292969, + "loss": 0.9305, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6885604858398438, + "rewards/margins": 4.03232479095459, + "rewards/rejected": -1.3437644243240356, + "step": 5633 + }, + { + "epoch": 1.41, + "grad_norm": 3.2580745220184326, + "learning_rate": 8.966672319101144e-08, + "logits/chosen": -0.37479764223098755, + "logits/rejected": -0.44475993514060974, + "logps/chosen": -52.86212921142578, + "logps/rejected": -77.12797546386719, + "loss": 0.6638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.859225034713745, + "rewards/margins": 5.077010631561279, + "rewards/rejected": -2.217785596847534, + "step": 5634 + }, + { + "epoch": 1.41, + "grad_norm": 3.0489330291748047, + "learning_rate": 8.917348697252137e-08, + "logits/chosen": -0.404602974653244, + "logits/rejected": -0.4866569936275482, + "logps/chosen": -58.58955764770508, + "logps/rejected": -77.80513763427734, + "loss": 0.6544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0409979820251465, + "rewards/margins": 4.8852105140686035, + "rewards/rejected": -1.844212532043457, + "step": 5635 + }, + { + "epoch": 1.41, + "grad_norm": 3.261265277862549, + "learning_rate": 8.868159888187733e-08, + "logits/chosen": -0.2790389358997345, + "logits/rejected": -0.35498982667922974, + "logps/chosen": -56.091087341308594, + "logps/rejected": -91.6585693359375, + "loss": 0.6187, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6757936477661133, + "rewards/margins": 5.268322467803955, + "rewards/rejected": -2.5925285816192627, + "step": 5636 + }, + { + "epoch": 1.41, + "grad_norm": 3.4515538215637207, + "learning_rate": 8.819105905411241e-08, + "logits/chosen": -0.3156815767288208, + "logits/rejected": -0.42409783601760864, + "logps/chosen": -56.736900329589844, + "logps/rejected": -84.36624908447266, + "loss": 0.6915, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8696634769439697, + "rewards/margins": 4.389010906219482, + "rewards/rejected": -1.5193480253219604, + "step": 5637 + }, + { + "epoch": 1.41, + "grad_norm": 7.863254070281982, + "learning_rate": 8.770186762388943e-08, + "logits/chosen": -0.39842888712882996, + "logits/rejected": -0.47805312275886536, + "logps/chosen": -58.86298370361328, + "logps/rejected": -87.33646392822266, + "loss": 0.877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.441602945327759, + "rewards/margins": 4.352713584899902, + "rewards/rejected": -1.911110758781433, + "step": 5638 + }, + { + "epoch": 1.41, + "grad_norm": 4.001060485839844, + "learning_rate": 8.721402472550322e-08, + "logits/chosen": -0.3446536660194397, + "logits/rejected": -0.4776597321033478, + "logps/chosen": -52.28797912597656, + "logps/rejected": -87.90412902832031, + "loss": 0.6497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.088070869445801, + "rewards/margins": 5.289848327636719, + "rewards/rejected": -2.201777935028076, + "step": 5639 + }, + { + "epoch": 1.41, + "grad_norm": 10.174484252929688, + "learning_rate": 8.672753049287664e-08, + "logits/chosen": -0.34405237436294556, + "logits/rejected": -0.42131009697914124, + "logps/chosen": -50.42075729370117, + "logps/rejected": -91.25373840332031, + "loss": 0.793, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.762462615966797, + "rewards/margins": 5.081853866577148, + "rewards/rejected": -2.3193914890289307, + "step": 5640 + }, + { + "epoch": 1.41, + "grad_norm": 3.6879637241363525, + "learning_rate": 8.624238505956283e-08, + "logits/chosen": -0.4013715982437134, + "logits/rejected": -0.43032917380332947, + "logps/chosen": -52.938575744628906, + "logps/rejected": -110.5262680053711, + "loss": 0.6568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.940408706665039, + "rewards/margins": 5.971682071685791, + "rewards/rejected": -3.031273365020752, + "step": 5641 + }, + { + "epoch": 1.41, + "grad_norm": 4.146818161010742, + "learning_rate": 8.57585885587453e-08, + "logits/chosen": -0.31900930404663086, + "logits/rejected": -0.4897453188896179, + "logps/chosen": -54.653656005859375, + "logps/rejected": -72.97562408447266, + "loss": 0.5845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1048498153686523, + "rewards/margins": 6.027943134307861, + "rewards/rejected": -2.923092842102051, + "step": 5642 + }, + { + "epoch": 1.41, + "grad_norm": 6.344629287719727, + "learning_rate": 8.527614112323556e-08, + "logits/chosen": -0.422343373298645, + "logits/rejected": -0.4380071461200714, + "logps/chosen": -55.39558029174805, + "logps/rejected": -101.63735961914062, + "loss": 0.9077, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.228222370147705, + "rewards/margins": 4.492877960205078, + "rewards/rejected": -1.2646552324295044, + "step": 5643 + }, + { + "epoch": 1.41, + "grad_norm": 2.4748172760009766, + "learning_rate": 8.479504288547769e-08, + "logits/chosen": -0.3736719489097595, + "logits/rejected": -0.43949609994888306, + "logps/chosen": -58.24015808105469, + "logps/rejected": -104.07913208007812, + "loss": 0.6432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9423046112060547, + "rewards/margins": 6.120936870574951, + "rewards/rejected": -3.1786322593688965, + "step": 5644 + }, + { + "epoch": 1.41, + "grad_norm": 7.154048442840576, + "learning_rate": 8.431529397754102e-08, + "logits/chosen": -0.3391595184803009, + "logits/rejected": -0.4567604660987854, + "logps/chosen": -55.12112808227539, + "logps/rejected": -77.93058013916016, + "loss": 0.7472, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6961193084716797, + "rewards/margins": 4.250277996063232, + "rewards/rejected": -1.554159164428711, + "step": 5645 + }, + { + "epoch": 1.41, + "grad_norm": 13.437335014343262, + "learning_rate": 8.383689453112853e-08, + "logits/chosen": -0.3153305649757385, + "logits/rejected": -0.38098540902137756, + "logps/chosen": -49.305721282958984, + "logps/rejected": -72.11618041992188, + "loss": 0.7627, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.892284870147705, + "rewards/margins": 4.564523696899414, + "rewards/rejected": -1.672239065170288, + "step": 5646 + }, + { + "epoch": 1.41, + "grad_norm": 8.47459888458252, + "learning_rate": 8.33598446775713e-08, + "logits/chosen": -0.3272125720977783, + "logits/rejected": -0.44870856404304504, + "logps/chosen": -60.96951675415039, + "logps/rejected": -71.92140197753906, + "loss": 0.7926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6367783546447754, + "rewards/margins": 4.548785209655762, + "rewards/rejected": -1.9120066165924072, + "step": 5647 + }, + { + "epoch": 1.41, + "grad_norm": 7.7938232421875, + "learning_rate": 8.288414454782901e-08, + "logits/chosen": -0.393585741519928, + "logits/rejected": -0.47178277373313904, + "logps/chosen": -59.97061538696289, + "logps/rejected": -85.15001678466797, + "loss": 0.7225, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8685286045074463, + "rewards/margins": 4.72744607925415, + "rewards/rejected": -1.8589173555374146, + "step": 5648 + }, + { + "epoch": 1.41, + "grad_norm": 9.544554710388184, + "learning_rate": 8.240979427249274e-08, + "logits/chosen": -0.32148560881614685, + "logits/rejected": -0.4252285957336426, + "logps/chosen": -59.661983489990234, + "logps/rejected": -81.53875732421875, + "loss": 0.8869, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.771108865737915, + "rewards/margins": 3.931551456451416, + "rewards/rejected": -1.1604423522949219, + "step": 5649 + }, + { + "epoch": 1.41, + "grad_norm": 8.405726432800293, + "learning_rate": 8.193679398178e-08, + "logits/chosen": -0.42208778858184814, + "logits/rejected": -0.509110689163208, + "logps/chosen": -56.99192810058594, + "logps/rejected": -96.42184448242188, + "loss": 0.7681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.930299758911133, + "rewards/margins": 4.7365922927856445, + "rewards/rejected": -1.8062928915023804, + "step": 5650 + }, + { + "epoch": 1.41, + "grad_norm": 5.689237117767334, + "learning_rate": 8.146514380554194e-08, + "logits/chosen": -0.34534746408462524, + "logits/rejected": -0.46463310718536377, + "logps/chosen": -50.85946273803711, + "logps/rejected": -80.249755859375, + "loss": 0.6372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.861907482147217, + "rewards/margins": 4.741562843322754, + "rewards/rejected": -1.879655361175537, + "step": 5651 + }, + { + "epoch": 1.41, + "grad_norm": 7.421765327453613, + "learning_rate": 8.099484387325496e-08, + "logits/chosen": -0.47429323196411133, + "logits/rejected": -0.5529816150665283, + "logps/chosen": -42.97422409057617, + "logps/rejected": -86.90482330322266, + "loss": 0.5973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1338229179382324, + "rewards/margins": 5.585694789886475, + "rewards/rejected": -2.4518721103668213, + "step": 5652 + }, + { + "epoch": 1.41, + "grad_norm": 3.668731451034546, + "learning_rate": 8.05258943140269e-08, + "logits/chosen": -0.37100085616111755, + "logits/rejected": -0.4111435115337372, + "logps/chosen": -64.18374633789062, + "logps/rejected": -84.14776611328125, + "loss": 0.7402, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2797863483428955, + "rewards/margins": 3.789391040802002, + "rewards/rejected": -0.5096048712730408, + "step": 5653 + }, + { + "epoch": 1.41, + "grad_norm": 5.007746696472168, + "learning_rate": 8.005829525659537e-08, + "logits/chosen": -0.3648250997066498, + "logits/rejected": -0.47786086797714233, + "logps/chosen": -66.07418060302734, + "logps/rejected": -79.79435729980469, + "loss": 0.7385, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.687319755554199, + "rewards/margins": 4.750039100646973, + "rewards/rejected": -2.0627193450927734, + "step": 5654 + }, + { + "epoch": 1.41, + "grad_norm": 4.418515205383301, + "learning_rate": 7.959204682932486e-08, + "logits/chosen": -0.3462412655353546, + "logits/rejected": -0.46312642097473145, + "logps/chosen": -62.522674560546875, + "logps/rejected": -80.0855484008789, + "loss": 0.7493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7453885078430176, + "rewards/margins": 5.1755452156066895, + "rewards/rejected": -2.430156946182251, + "step": 5655 + }, + { + "epoch": 1.41, + "grad_norm": 4.187135696411133, + "learning_rate": 7.912714916021247e-08, + "logits/chosen": -0.38231033086776733, + "logits/rejected": -0.5070117712020874, + "logps/chosen": -56.263710021972656, + "logps/rejected": -85.10920715332031, + "loss": 0.6961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0277175903320312, + "rewards/margins": 5.830973148345947, + "rewards/rejected": -2.803255319595337, + "step": 5656 + }, + { + "epoch": 1.42, + "grad_norm": 3.4165005683898926, + "learning_rate": 7.866360237688108e-08, + "logits/chosen": -0.26752710342407227, + "logits/rejected": -0.39614635705947876, + "logps/chosen": -56.66912841796875, + "logps/rejected": -79.8315658569336, + "loss": 0.6104, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9619672298431396, + "rewards/margins": 5.376236915588379, + "rewards/rejected": -2.4142699241638184, + "step": 5657 + }, + { + "epoch": 1.42, + "grad_norm": 5.98808479309082, + "learning_rate": 7.820140660658503e-08, + "logits/chosen": -0.3248891234397888, + "logits/rejected": -0.4299936294555664, + "logps/chosen": -60.87861251831055, + "logps/rejected": -76.60798645019531, + "loss": 0.7859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6512398719787598, + "rewards/margins": 4.102720260620117, + "rewards/rejected": -1.4514803886413574, + "step": 5658 + }, + { + "epoch": 1.42, + "grad_norm": 4.547351837158203, + "learning_rate": 7.774056197620672e-08, + "logits/chosen": -0.2574312388896942, + "logits/rejected": -0.43259042501449585, + "logps/chosen": -61.251609802246094, + "logps/rejected": -74.6175765991211, + "loss": 0.7049, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.759275197982788, + "rewards/margins": 5.767309188842773, + "rewards/rejected": -3.0080339908599854, + "step": 5659 + }, + { + "epoch": 1.42, + "grad_norm": 6.908977508544922, + "learning_rate": 7.728106861225881e-08, + "logits/chosen": -0.41366100311279297, + "logits/rejected": -0.5183698534965515, + "logps/chosen": -59.400177001953125, + "logps/rejected": -83.85323333740234, + "loss": 0.6942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.035259246826172, + "rewards/margins": 3.998060703277588, + "rewards/rejected": -0.9628013372421265, + "step": 5660 + }, + { + "epoch": 1.42, + "grad_norm": 5.363404750823975, + "learning_rate": 7.682292664088098e-08, + "logits/chosen": -0.36403846740722656, + "logits/rejected": -0.47836822271347046, + "logps/chosen": -57.44705581665039, + "logps/rejected": -89.79481506347656, + "loss": 0.7384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5864596366882324, + "rewards/margins": 5.203800678253174, + "rewards/rejected": -2.617340564727783, + "step": 5661 + }, + { + "epoch": 1.42, + "grad_norm": 33.08535385131836, + "learning_rate": 7.636613618784372e-08, + "logits/chosen": -0.3246687948703766, + "logits/rejected": -0.3646181523799896, + "logps/chosen": -55.686546325683594, + "logps/rejected": -88.28884887695312, + "loss": 0.863, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9422717094421387, + "rewards/margins": 3.938621997833252, + "rewards/rejected": -0.996350109577179, + "step": 5662 + }, + { + "epoch": 1.42, + "grad_norm": 5.098918914794922, + "learning_rate": 7.591069737854506e-08, + "logits/chosen": -0.44872114062309265, + "logits/rejected": -0.5559256076812744, + "logps/chosen": -54.26932907104492, + "logps/rejected": -84.43900299072266, + "loss": 0.7442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.654402017593384, + "rewards/margins": 4.633785724639893, + "rewards/rejected": -1.9793838262557983, + "step": 5663 + }, + { + "epoch": 1.42, + "grad_norm": 4.375621318817139, + "learning_rate": 7.545661033801388e-08, + "logits/chosen": -0.39492321014404297, + "logits/rejected": -0.5134243369102478, + "logps/chosen": -49.53500747680664, + "logps/rejected": -77.31920623779297, + "loss": 0.6089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2388267517089844, + "rewards/margins": 4.940318584442139, + "rewards/rejected": -1.7014923095703125, + "step": 5664 + }, + { + "epoch": 1.42, + "grad_norm": 3.2576541900634766, + "learning_rate": 7.500387519090657e-08, + "logits/chosen": -0.45259445905685425, + "logits/rejected": -0.45603489875793457, + "logps/chosen": -53.678985595703125, + "logps/rejected": -90.58745574951172, + "loss": 0.7238, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2884020805358887, + "rewards/margins": 4.825379848480225, + "rewards/rejected": -1.536977767944336, + "step": 5665 + }, + { + "epoch": 1.42, + "grad_norm": 9.203594207763672, + "learning_rate": 7.455249206150872e-08, + "logits/chosen": -0.36242300271987915, + "logits/rejected": -0.4511702358722687, + "logps/chosen": -55.90275955200195, + "logps/rejected": -85.68302154541016, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.217830181121826, + "rewards/margins": 4.874104022979736, + "rewards/rejected": -1.6562738418579102, + "step": 5666 + }, + { + "epoch": 1.42, + "grad_norm": 8.526593208312988, + "learning_rate": 7.410246107373454e-08, + "logits/chosen": -0.28789013624191284, + "logits/rejected": -0.30309152603149414, + "logps/chosen": -54.315006256103516, + "logps/rejected": -94.05192565917969, + "loss": 0.6498, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026183843612671, + "rewards/margins": 4.481640815734863, + "rewards/rejected": -1.4554579257965088, + "step": 5667 + }, + { + "epoch": 1.42, + "grad_norm": 21.314504623413086, + "learning_rate": 7.365378235112686e-08, + "logits/chosen": -0.2545955777168274, + "logits/rejected": -0.34650522470474243, + "logps/chosen": -63.94914245605469, + "logps/rejected": -98.33750915527344, + "loss": 0.9556, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.567005157470703, + "rewards/margins": 4.742464065551758, + "rewards/rejected": -2.1754586696624756, + "step": 5668 + }, + { + "epoch": 1.42, + "grad_norm": 10.020856857299805, + "learning_rate": 7.320645601685827e-08, + "logits/chosen": -0.35381609201431274, + "logits/rejected": -0.43907639384269714, + "logps/chosen": -54.458335876464844, + "logps/rejected": -103.01701354980469, + "loss": 0.8657, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.717417001724243, + "rewards/margins": 4.303981781005859, + "rewards/rejected": -1.586564540863037, + "step": 5669 + }, + { + "epoch": 1.42, + "grad_norm": 4.375401496887207, + "learning_rate": 7.276048219372889e-08, + "logits/chosen": -0.4057765305042267, + "logits/rejected": -0.5300526022911072, + "logps/chosen": -60.86931610107422, + "logps/rejected": -78.0039291381836, + "loss": 0.6929, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9212613105773926, + "rewards/margins": 5.114231109619141, + "rewards/rejected": -2.192969560623169, + "step": 5670 + }, + { + "epoch": 1.42, + "grad_norm": 3.9182510375976562, + "learning_rate": 7.231586100416909e-08, + "logits/chosen": -0.3174668550491333, + "logits/rejected": -0.41819244623184204, + "logps/chosen": -58.478485107421875, + "logps/rejected": -83.1334457397461, + "loss": 0.6886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9423112869262695, + "rewards/margins": 5.313530445098877, + "rewards/rejected": -2.3712189197540283, + "step": 5671 + }, + { + "epoch": 1.42, + "grad_norm": 20.149694442749023, + "learning_rate": 7.187259257023682e-08, + "logits/chosen": -0.3313506245613098, + "logits/rejected": -0.38183891773223877, + "logps/chosen": -59.38713836669922, + "logps/rejected": -110.21458435058594, + "loss": 0.6227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6722447872161865, + "rewards/margins": 5.330541610717773, + "rewards/rejected": -2.6582961082458496, + "step": 5672 + }, + { + "epoch": 1.42, + "grad_norm": 10.669791221618652, + "learning_rate": 7.143067701361806e-08, + "logits/chosen": -0.3272157311439514, + "logits/rejected": -0.4176409840583801, + "logps/chosen": -57.973350524902344, + "logps/rejected": -88.00718688964844, + "loss": 0.8728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.742596387863159, + "rewards/margins": 4.732645034790039, + "rewards/rejected": -1.990048885345459, + "step": 5673 + }, + { + "epoch": 1.42, + "grad_norm": 7.279135227203369, + "learning_rate": 7.099011445562853e-08, + "logits/chosen": -0.3139144778251648, + "logits/rejected": -0.3883954882621765, + "logps/chosen": -65.09803009033203, + "logps/rejected": -82.05206298828125, + "loss": 0.8914, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.594670295715332, + "rewards/margins": 3.955138683319092, + "rewards/rejected": -1.3604683876037598, + "step": 5674 + }, + { + "epoch": 1.42, + "grad_norm": 6.510092258453369, + "learning_rate": 7.055090501721207e-08, + "logits/chosen": -0.31575194001197815, + "logits/rejected": -0.46925634145736694, + "logps/chosen": -68.29090118408203, + "logps/rejected": -83.68519592285156, + "loss": 0.8466, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.850517988204956, + "rewards/margins": 4.8905439376831055, + "rewards/rejected": -2.0400261878967285, + "step": 5675 + }, + { + "epoch": 1.42, + "grad_norm": 17.414587020874023, + "learning_rate": 7.011304881894166e-08, + "logits/chosen": -0.3905068039894104, + "logits/rejected": -0.4631601572036743, + "logps/chosen": -60.53756332397461, + "logps/rejected": -98.87771606445312, + "loss": 0.6897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.929948329925537, + "rewards/margins": 5.509648323059082, + "rewards/rejected": -2.579699993133545, + "step": 5676 + }, + { + "epoch": 1.42, + "grad_norm": 3.7765023708343506, + "learning_rate": 6.967654598101837e-08, + "logits/chosen": -0.36770570278167725, + "logits/rejected": -0.48397544026374817, + "logps/chosen": -55.01332092285156, + "logps/rejected": -73.20906066894531, + "loss": 0.6512, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176182508468628, + "rewards/margins": 5.154951095581055, + "rewards/rejected": -1.9787683486938477, + "step": 5677 + }, + { + "epoch": 1.42, + "grad_norm": 2.1619200706481934, + "learning_rate": 6.924139662327023e-08, + "logits/chosen": -0.32520419359207153, + "logits/rejected": -0.4960138201713562, + "logps/chosen": -48.290252685546875, + "logps/rejected": -81.25067138671875, + "loss": 0.5174, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9747931957244873, + "rewards/margins": 6.5195441246032715, + "rewards/rejected": -3.5447521209716797, + "step": 5678 + }, + { + "epoch": 1.42, + "grad_norm": 4.803589820861816, + "learning_rate": 6.88076008651567e-08, + "logits/chosen": -0.3500143587589264, + "logits/rejected": -0.4301241636276245, + "logps/chosen": -62.79112243652344, + "logps/rejected": -73.43899536132812, + "loss": 0.7926, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.046304702758789, + "rewards/margins": 4.091795921325684, + "rewards/rejected": -1.0454914569854736, + "step": 5679 + }, + { + "epoch": 1.42, + "grad_norm": 3.9608705043792725, + "learning_rate": 6.837515882576307e-08, + "logits/chosen": -0.32520154118537903, + "logits/rejected": -0.3334271013736725, + "logps/chosen": -57.27241897583008, + "logps/rejected": -90.29591369628906, + "loss": 0.6827, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1003305912017822, + "rewards/margins": 4.862565994262695, + "rewards/rejected": -1.762235403060913, + "step": 5680 + }, + { + "epoch": 1.42, + "grad_norm": 5.805207252502441, + "learning_rate": 6.794407062380492e-08, + "logits/chosen": -0.3877183496952057, + "logits/rejected": -0.43614351749420166, + "logps/chosen": -56.515621185302734, + "logps/rejected": -93.427978515625, + "loss": 0.7374, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.936734199523926, + "rewards/margins": 5.1686177253723145, + "rewards/rejected": -2.2318835258483887, + "step": 5681 + }, + { + "epoch": 1.42, + "grad_norm": 3.2119698524475098, + "learning_rate": 6.751433637762427e-08, + "logits/chosen": -0.35155290365219116, + "logits/rejected": -0.4029439091682434, + "logps/chosen": -56.85734176635742, + "logps/rejected": -99.79338836669922, + "loss": 0.6944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8201310634613037, + "rewards/margins": 5.472508907318115, + "rewards/rejected": -2.6523778438568115, + "step": 5682 + }, + { + "epoch": 1.42, + "grad_norm": 4.560556411743164, + "learning_rate": 6.708595620519287e-08, + "logits/chosen": -0.36916422843933105, + "logits/rejected": -0.4757273197174072, + "logps/chosen": -56.412986755371094, + "logps/rejected": -108.76742553710938, + "loss": 0.6403, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9503583908081055, + "rewards/margins": 6.236871242523193, + "rewards/rejected": -3.286513090133667, + "step": 5683 + }, + { + "epoch": 1.42, + "grad_norm": 5.264684677124023, + "learning_rate": 6.665893022411107e-08, + "logits/chosen": -0.3484887480735779, + "logits/rejected": -0.4403734803199768, + "logps/chosen": -51.97615051269531, + "logps/rejected": -74.50436401367188, + "loss": 0.7266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8576712608337402, + "rewards/margins": 4.603507995605469, + "rewards/rejected": -1.7458364963531494, + "step": 5684 + }, + { + "epoch": 1.42, + "grad_norm": 3.5920214653015137, + "learning_rate": 6.623325855160567e-08, + "logits/chosen": -0.35266703367233276, + "logits/rejected": -0.47480249404907227, + "logps/chosen": -52.074398040771484, + "logps/rejected": -87.59912109375, + "loss": 0.6678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8706142902374268, + "rewards/margins": 5.428411483764648, + "rewards/rejected": -2.557797431945801, + "step": 5685 + }, + { + "epoch": 1.42, + "grad_norm": 5.205046653747559, + "learning_rate": 6.580894130453263e-08, + "logits/chosen": -0.34091314673423767, + "logits/rejected": -0.441924124956131, + "logps/chosen": -58.570396423339844, + "logps/rejected": -72.75560760498047, + "loss": 0.667, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0862250328063965, + "rewards/margins": 5.209873199462891, + "rewards/rejected": -2.123648166656494, + "step": 5686 + }, + { + "epoch": 1.42, + "grad_norm": 5.2730584144592285, + "learning_rate": 6.538597859937712e-08, + "logits/chosen": -0.37601688504219055, + "logits/rejected": -0.4491281807422638, + "logps/chosen": -51.850006103515625, + "logps/rejected": -85.79110717773438, + "loss": 0.7073, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0815353393554688, + "rewards/margins": 4.651791572570801, + "rewards/rejected": -1.5702561140060425, + "step": 5687 + }, + { + "epoch": 1.42, + "grad_norm": 3.8783586025238037, + "learning_rate": 6.496437055225069e-08, + "logits/chosen": -0.387305349111557, + "logits/rejected": -0.5138580799102783, + "logps/chosen": -47.920326232910156, + "logps/rejected": -87.5848388671875, + "loss": 0.6245, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8325929641723633, + "rewards/margins": 5.568364143371582, + "rewards/rejected": -2.7357709407806396, + "step": 5688 + }, + { + "epoch": 1.42, + "grad_norm": 3.829458236694336, + "learning_rate": 6.45441172788952e-08, + "logits/chosen": -0.3852064609527588, + "logits/rejected": -0.48821067810058594, + "logps/chosen": -53.07222366333008, + "logps/rejected": -94.20698547363281, + "loss": 0.5874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085932731628418, + "rewards/margins": 4.86843204498291, + "rewards/rejected": -1.7824993133544922, + "step": 5689 + }, + { + "epoch": 1.42, + "grad_norm": 4.680186748504639, + "learning_rate": 6.412521889467726e-08, + "logits/chosen": -0.4036487340927124, + "logits/rejected": -0.5396555066108704, + "logps/chosen": -50.70001983642578, + "logps/rejected": -83.32240295410156, + "loss": 0.6507, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8041765689849854, + "rewards/margins": 5.371001243591309, + "rewards/rejected": -2.566824197769165, + "step": 5690 + }, + { + "epoch": 1.42, + "grad_norm": 1.8833199739456177, + "learning_rate": 6.370767551459489e-08, + "logits/chosen": -0.36072030663490295, + "logits/rejected": -0.5193553566932678, + "logps/chosen": -60.084102630615234, + "logps/rejected": -94.56472778320312, + "loss": 0.6183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9386141300201416, + "rewards/margins": 5.726996421813965, + "rewards/rejected": -2.788382053375244, + "step": 5691 + }, + { + "epoch": 1.42, + "grad_norm": 4.616584300994873, + "learning_rate": 6.329148725327194e-08, + "logits/chosen": -0.410624623298645, + "logits/rejected": -0.44739916920661926, + "logps/chosen": -47.658966064453125, + "logps/rejected": -87.69491577148438, + "loss": 0.8271, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9655988216400146, + "rewards/margins": 4.150290489196777, + "rewards/rejected": -1.184691309928894, + "step": 5692 + }, + { + "epoch": 1.42, + "grad_norm": 5.319228649139404, + "learning_rate": 6.287665422496203e-08, + "logits/chosen": -0.3298290967941284, + "logits/rejected": -0.40157562494277954, + "logps/chosen": -65.67755126953125, + "logps/rejected": -96.85665893554688, + "loss": 0.7629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9240641593933105, + "rewards/margins": 5.707137584686279, + "rewards/rejected": -2.783073663711548, + "step": 5693 + }, + { + "epoch": 1.42, + "grad_norm": 4.154080867767334, + "learning_rate": 6.246317654354517e-08, + "logits/chosen": -0.3752853572368622, + "logits/rejected": -0.47672173380851746, + "logps/chosen": -45.11084747314453, + "logps/rejected": -85.64964294433594, + "loss": 0.644, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9166414737701416, + "rewards/margins": 5.449256896972656, + "rewards/rejected": -2.5326156616210938, + "step": 5694 + }, + { + "epoch": 1.42, + "grad_norm": 3.8343396186828613, + "learning_rate": 6.205105432252945e-08, + "logits/chosen": -0.38117271661758423, + "logits/rejected": -0.5024189949035645, + "logps/chosen": -54.15275192260742, + "logps/rejected": -77.79881286621094, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037527322769165, + "rewards/margins": 5.586944103240967, + "rewards/rejected": -2.5494163036346436, + "step": 5695 + }, + { + "epoch": 1.42, + "grad_norm": 8.541919708251953, + "learning_rate": 6.164028767505214e-08, + "logits/chosen": -0.33974602818489075, + "logits/rejected": -0.4677685499191284, + "logps/chosen": -59.611934661865234, + "logps/rejected": -87.11961364746094, + "loss": 0.7495, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.920318365097046, + "rewards/margins": 4.515097141265869, + "rewards/rejected": -1.5947786569595337, + "step": 5696 + }, + { + "epoch": 1.43, + "grad_norm": 2.6166272163391113, + "learning_rate": 6.123087671387806e-08, + "logits/chosen": -0.2799685001373291, + "logits/rejected": -0.446090966463089, + "logps/chosen": -58.44888687133789, + "logps/rejected": -78.24925994873047, + "loss": 0.6206, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161602020263672, + "rewards/margins": 5.77239465713501, + "rewards/rejected": -2.610792875289917, + "step": 5697 + }, + { + "epoch": 1.43, + "grad_norm": 7.80789852142334, + "learning_rate": 6.082282155139729e-08, + "logits/chosen": -0.31758320331573486, + "logits/rejected": -0.40457281470298767, + "logps/chosen": -66.07121276855469, + "logps/rejected": -106.15436553955078, + "loss": 0.7519, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1210129261016846, + "rewards/margins": 5.614582061767578, + "rewards/rejected": -2.4935686588287354, + "step": 5698 + }, + { + "epoch": 1.43, + "grad_norm": 2.4238123893737793, + "learning_rate": 6.041612229963189e-08, + "logits/chosen": -0.4797091484069824, + "logits/rejected": -0.5916867852210999, + "logps/chosen": -45.810646057128906, + "logps/rejected": -75.28400421142578, + "loss": 0.5816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.269624948501587, + "rewards/margins": 5.821602821350098, + "rewards/rejected": -2.5519771575927734, + "step": 5699 + }, + { + "epoch": 1.43, + "grad_norm": 3.57676362991333, + "learning_rate": 6.001077907022812e-08, + "logits/chosen": -0.32871803641319275, + "logits/rejected": -0.43663546442985535, + "logps/chosen": -53.83795166015625, + "logps/rejected": -98.01435852050781, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0965261459350586, + "rewards/margins": 5.931483268737793, + "rewards/rejected": -2.834956645965576, + "step": 5700 + }, + { + "epoch": 1.43, + "grad_norm": 2.032329559326172, + "learning_rate": 5.960679197446307e-08, + "logits/chosen": -0.29436492919921875, + "logits/rejected": -0.44332432746887207, + "logps/chosen": -59.822181701660156, + "logps/rejected": -98.29236602783203, + "loss": 0.6201, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.179008960723877, + "rewards/margins": 5.962604999542236, + "rewards/rejected": -2.7835965156555176, + "step": 5701 + }, + { + "epoch": 1.43, + "grad_norm": 5.2266130447387695, + "learning_rate": 5.9204161123238013e-08, + "logits/chosen": -0.25866153836250305, + "logits/rejected": -0.34396037459373474, + "logps/chosen": -56.475093841552734, + "logps/rejected": -91.78900146484375, + "loss": 0.7164, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9523794651031494, + "rewards/margins": 4.959858417510986, + "rewards/rejected": -2.007479190826416, + "step": 5702 + }, + { + "epoch": 1.43, + "grad_norm": 4.07082462310791, + "learning_rate": 5.8802886627084554e-08, + "logits/chosen": -0.34394997358322144, + "logits/rejected": -0.4199334681034088, + "logps/chosen": -56.01526641845703, + "logps/rejected": -98.78339385986328, + "loss": 0.6662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9139490127563477, + "rewards/margins": 4.42147970199585, + "rewards/rejected": -1.5075304508209229, + "step": 5703 + }, + { + "epoch": 1.43, + "grad_norm": 6.103016376495361, + "learning_rate": 5.840296859616179e-08, + "logits/chosen": -0.40348562598228455, + "logits/rejected": -0.4384261965751648, + "logps/chosen": -42.357574462890625, + "logps/rejected": -82.74380493164062, + "loss": 0.7231, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9788451194763184, + "rewards/margins": 4.753368854522705, + "rewards/rejected": -1.7745239734649658, + "step": 5704 + }, + { + "epoch": 1.43, + "grad_norm": 6.688997745513916, + "learning_rate": 5.800440714025524e-08, + "logits/chosen": -0.39795270562171936, + "logits/rejected": -0.493241548538208, + "logps/chosen": -54.72563934326172, + "logps/rejected": -89.12849426269531, + "loss": 0.6999, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1027376651763916, + "rewards/margins": 4.790626525878906, + "rewards/rejected": -1.6878893375396729, + "step": 5705 + }, + { + "epoch": 1.43, + "grad_norm": 5.458181858062744, + "learning_rate": 5.760720236877903e-08, + "logits/chosen": -0.33085694909095764, + "logits/rejected": -0.36310985684394836, + "logps/chosen": -68.25994873046875, + "logps/rejected": -119.362060546875, + "loss": 0.7688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9081900119781494, + "rewards/margins": 4.705752849578857, + "rewards/rejected": -1.7975634336471558, + "step": 5706 + }, + { + "epoch": 1.43, + "grad_norm": 6.6079630851745605, + "learning_rate": 5.72113543907743e-08, + "logits/chosen": -0.42222273349761963, + "logits/rejected": -0.46128425002098083, + "logps/chosen": -47.180816650390625, + "logps/rejected": -83.96807861328125, + "loss": 0.7613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0214405059814453, + "rewards/margins": 4.988330841064453, + "rewards/rejected": -1.9668903350830078, + "step": 5707 + }, + { + "epoch": 1.43, + "grad_norm": 2.8305060863494873, + "learning_rate": 5.68168633149091e-08, + "logits/chosen": -0.3247279226779938, + "logits/rejected": -0.4674481749534607, + "logps/chosen": -48.62948989868164, + "logps/rejected": -77.67280578613281, + "loss": 0.5792, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.916621208190918, + "rewards/margins": 4.988052845001221, + "rewards/rejected": -2.071432113647461, + "step": 5708 + }, + { + "epoch": 1.43, + "grad_norm": 20.71991729736328, + "learning_rate": 5.642372924948125e-08, + "logits/chosen": -0.3867381513118744, + "logits/rejected": -0.5055761337280273, + "logps/chosen": -58.89775466918945, + "logps/rejected": -92.99174499511719, + "loss": 0.6936, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.017599582672119, + "rewards/margins": 5.666105270385742, + "rewards/rejected": -2.648505449295044, + "step": 5709 + }, + { + "epoch": 1.43, + "grad_norm": 4.733273029327393, + "learning_rate": 5.6031952302412765e-08, + "logits/chosen": -0.3725529611110687, + "logits/rejected": -0.5069110989570618, + "logps/chosen": -50.861610412597656, + "logps/rejected": -62.494384765625, + "loss": 0.6502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.975851535797119, + "rewards/margins": 5.004712104797363, + "rewards/rejected": -2.028860569000244, + "step": 5710 + }, + { + "epoch": 1.43, + "grad_norm": 3.736774206161499, + "learning_rate": 5.564153258125649e-08, + "logits/chosen": -0.2942197322845459, + "logits/rejected": -0.47122472524642944, + "logps/chosen": -74.12261199951172, + "logps/rejected": -79.69551086425781, + "loss": 0.6985, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9975695610046387, + "rewards/margins": 5.218664646148682, + "rewards/rejected": -2.2210946083068848, + "step": 5711 + }, + { + "epoch": 1.43, + "grad_norm": 12.712648391723633, + "learning_rate": 5.525247019319002e-08, + "logits/chosen": -0.2646521031856537, + "logits/rejected": -0.4007885754108429, + "logps/chosen": -59.38983154296875, + "logps/rejected": -76.21311950683594, + "loss": 0.7768, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7190754413604736, + "rewards/margins": 4.9475626945495605, + "rewards/rejected": -2.228487491607666, + "step": 5712 + }, + { + "epoch": 1.43, + "grad_norm": 15.945393562316895, + "learning_rate": 5.48647652450196e-08, + "logits/chosen": -0.28340139985084534, + "logits/rejected": -0.4174141585826874, + "logps/chosen": -69.12517547607422, + "logps/rejected": -84.963134765625, + "loss": 0.7336, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.728639602661133, + "rewards/margins": 4.852436542510986, + "rewards/rejected": -2.1237969398498535, + "step": 5713 + }, + { + "epoch": 1.43, + "grad_norm": 3.371236801147461, + "learning_rate": 5.447841784317898e-08, + "logits/chosen": -0.3709857165813446, + "logits/rejected": -0.5181481242179871, + "logps/chosen": -54.771766662597656, + "logps/rejected": -60.90841293334961, + "loss": 0.6967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1836109161376953, + "rewards/margins": 5.1839189529418945, + "rewards/rejected": -2.000307559967041, + "step": 5714 + }, + { + "epoch": 1.43, + "grad_norm": 2.839252233505249, + "learning_rate": 5.409342809372831e-08, + "logits/chosen": -0.34460559487342834, + "logits/rejected": -0.4327980875968933, + "logps/chosen": -54.813926696777344, + "logps/rejected": -101.15760803222656, + "loss": 0.6421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9660897254943848, + "rewards/margins": 6.1396026611328125, + "rewards/rejected": -3.1735129356384277, + "step": 5715 + }, + { + "epoch": 1.43, + "grad_norm": 5.680253505706787, + "learning_rate": 5.3709796102356385e-08, + "logits/chosen": -0.4582952857017517, + "logits/rejected": -0.5380138158798218, + "logps/chosen": -57.245643615722656, + "logps/rejected": -96.43515014648438, + "loss": 0.8003, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.827669620513916, + "rewards/margins": 4.5404767990112305, + "rewards/rejected": -1.712807536125183, + "step": 5716 + }, + { + "epoch": 1.43, + "grad_norm": 4.720489025115967, + "learning_rate": 5.332752197437729e-08, + "logits/chosen": -0.3776220679283142, + "logits/rejected": -0.4908699691295624, + "logps/chosen": -59.676849365234375, + "logps/rejected": -79.14738464355469, + "loss": 0.7576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.908583641052246, + "rewards/margins": 4.6644439697265625, + "rewards/rejected": -1.7558599710464478, + "step": 5717 + }, + { + "epoch": 1.43, + "grad_norm": 7.164077281951904, + "learning_rate": 5.2946605814734874e-08, + "logits/chosen": -0.4203367233276367, + "logits/rejected": -0.4902609586715698, + "logps/chosen": -54.18901062011719, + "logps/rejected": -81.53611755371094, + "loss": 0.7742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8491859436035156, + "rewards/margins": 4.8718671798706055, + "rewards/rejected": -2.0226807594299316, + "step": 5718 + }, + { + "epoch": 1.43, + "grad_norm": 12.834089279174805, + "learning_rate": 5.25670477279977e-08, + "logits/chosen": -0.284361869096756, + "logits/rejected": -0.35904163122177124, + "logps/chosen": -62.923240661621094, + "logps/rejected": -92.99737548828125, + "loss": 0.7937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.664299249649048, + "rewards/margins": 4.288406848907471, + "rewards/rejected": -1.6241075992584229, + "step": 5719 + }, + { + "epoch": 1.43, + "grad_norm": 4.783960819244385, + "learning_rate": 5.218884781836297e-08, + "logits/chosen": -0.3376377522945404, + "logits/rejected": -0.45817112922668457, + "logps/chosen": -60.74608612060547, + "logps/rejected": -79.2538070678711, + "loss": 0.6924, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.885852813720703, + "rewards/margins": 4.945980548858643, + "rewards/rejected": -2.0601282119750977, + "step": 5720 + }, + { + "epoch": 1.43, + "grad_norm": 4.942054271697998, + "learning_rate": 5.181200618965543e-08, + "logits/chosen": -0.31129494309425354, + "logits/rejected": -0.39772820472717285, + "logps/chosen": -62.859832763671875, + "logps/rejected": -95.47496032714844, + "loss": 0.7445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7836523056030273, + "rewards/margins": 4.885064125061035, + "rewards/rejected": -2.101411819458008, + "step": 5721 + }, + { + "epoch": 1.43, + "grad_norm": 4.4055891036987305, + "learning_rate": 5.143652294532564e-08, + "logits/chosen": -0.40007466077804565, + "logits/rejected": -0.5117818713188171, + "logps/chosen": -49.99437713623047, + "logps/rejected": -77.07400512695312, + "loss": 0.755, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8943018913269043, + "rewards/margins": 4.871850490570068, + "rewards/rejected": -1.9775484800338745, + "step": 5722 + }, + { + "epoch": 1.43, + "grad_norm": 23.797733306884766, + "learning_rate": 5.106239818845116e-08, + "logits/chosen": -0.26998817920684814, + "logits/rejected": -0.41820353269577026, + "logps/chosen": -64.88932800292969, + "logps/rejected": -78.6033935546875, + "loss": 0.915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5515575408935547, + "rewards/margins": 4.136541843414307, + "rewards/rejected": -1.584984302520752, + "step": 5723 + }, + { + "epoch": 1.43, + "grad_norm": 4.951985836029053, + "learning_rate": 5.068963202173871e-08, + "logits/chosen": -0.3613821566104889, + "logits/rejected": -0.438920795917511, + "logps/chosen": -59.67181396484375, + "logps/rejected": -91.92887115478516, + "loss": 0.6488, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.461958408355713, + "rewards/margins": 4.82435417175293, + "rewards/rejected": -2.362396001815796, + "step": 5724 + }, + { + "epoch": 1.43, + "grad_norm": 10.44216251373291, + "learning_rate": 5.031822454751978e-08, + "logits/chosen": -0.3547385632991791, + "logits/rejected": -0.4197031259536743, + "logps/chosen": -53.894447326660156, + "logps/rejected": -87.78584289550781, + "loss": 0.7476, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8930864334106445, + "rewards/margins": 4.501494884490967, + "rewards/rejected": -1.6084082126617432, + "step": 5725 + }, + { + "epoch": 1.43, + "grad_norm": 6.371641159057617, + "learning_rate": 4.994817586775391e-08, + "logits/chosen": -0.4506584107875824, + "logits/rejected": -0.5362010598182678, + "logps/chosen": -59.929752349853516, + "logps/rejected": -76.87703704833984, + "loss": 0.8548, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.782383441925049, + "rewards/margins": 4.32023286819458, + "rewards/rejected": -1.5378490686416626, + "step": 5726 + }, + { + "epoch": 1.43, + "grad_norm": 5.203680038452148, + "learning_rate": 4.957948608402707e-08, + "logits/chosen": -0.29228121042251587, + "logits/rejected": -0.44601714611053467, + "logps/chosen": -70.0547103881836, + "logps/rejected": -90.89412689208984, + "loss": 0.7857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.081916093826294, + "rewards/margins": 5.025140762329102, + "rewards/rejected": -1.9432247877120972, + "step": 5727 + }, + { + "epoch": 1.43, + "grad_norm": 12.8055419921875, + "learning_rate": 4.92121552975533e-08, + "logits/chosen": -0.36777910590171814, + "logits/rejected": -0.4762367606163025, + "logps/chosen": -51.82334518432617, + "logps/rejected": -76.26203918457031, + "loss": 0.7667, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.714963674545288, + "rewards/margins": 4.855326175689697, + "rewards/rejected": -2.1403627395629883, + "step": 5728 + }, + { + "epoch": 1.43, + "grad_norm": 4.291929721832275, + "learning_rate": 4.884618360917248e-08, + "logits/chosen": -0.2958002984523773, + "logits/rejected": -0.442544162273407, + "logps/chosen": -59.27735900878906, + "logps/rejected": -75.22331237792969, + "loss": 0.6602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0048882961273193, + "rewards/margins": 4.897909641265869, + "rewards/rejected": -1.8930214643478394, + "step": 5729 + }, + { + "epoch": 1.43, + "grad_norm": 3.7595298290252686, + "learning_rate": 4.848157111935148e-08, + "logits/chosen": -0.3911563754081726, + "logits/rejected": -0.43872272968292236, + "logps/chosen": -50.80234909057617, + "logps/rejected": -88.85346221923828, + "loss": 0.7934, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9107072353363037, + "rewards/margins": 5.054134845733643, + "rewards/rejected": -2.1434273719787598, + "step": 5730 + }, + { + "epoch": 1.43, + "grad_norm": 5.898118495941162, + "learning_rate": 4.811831792818522e-08, + "logits/chosen": -0.33882343769073486, + "logits/rejected": -0.47341787815093994, + "logps/chosen": -57.90729522705078, + "logps/rejected": -87.83692932128906, + "loss": 0.686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0098178386688232, + "rewards/margins": 5.466641426086426, + "rewards/rejected": -2.4568238258361816, + "step": 5731 + }, + { + "epoch": 1.43, + "grad_norm": 10.206062316894531, + "learning_rate": 4.775642413539339e-08, + "logits/chosen": -0.3389425277709961, + "logits/rejected": -0.4735466539859772, + "logps/chosen": -60.982994079589844, + "logps/rejected": -89.36407470703125, + "loss": 0.7952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.821382522583008, + "rewards/margins": 4.63414192199707, + "rewards/rejected": -1.8127585649490356, + "step": 5732 + }, + { + "epoch": 1.43, + "grad_norm": 6.694800853729248, + "learning_rate": 4.739588984032373e-08, + "logits/chosen": -0.37591972947120667, + "logits/rejected": -0.4330199956893921, + "logps/chosen": -43.09959411621094, + "logps/rejected": -79.97518920898438, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.321051597595215, + "rewards/margins": 4.770534515380859, + "rewards/rejected": -1.4494831562042236, + "step": 5733 + }, + { + "epoch": 1.43, + "grad_norm": 17.94811248779297, + "learning_rate": 4.703671514195207e-08, + "logits/chosen": -0.360424667596817, + "logits/rejected": -0.4157390892505646, + "logps/chosen": -55.20177459716797, + "logps/rejected": -95.06582641601562, + "loss": 0.7461, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.007826566696167, + "rewards/margins": 5.11583948135376, + "rewards/rejected": -2.108013153076172, + "step": 5734 + }, + { + "epoch": 1.43, + "grad_norm": 3.1397061347961426, + "learning_rate": 4.6678900138877324e-08, + "logits/chosen": -0.3258952498435974, + "logits/rejected": -0.3959430456161499, + "logps/chosen": -62.206851959228516, + "logps/rejected": -83.50325012207031, + "loss": 0.8184, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0458502769470215, + "rewards/margins": 4.957375526428223, + "rewards/rejected": -1.911525011062622, + "step": 5735 + }, + { + "epoch": 1.43, + "grad_norm": 4.714226245880127, + "learning_rate": 4.63224449293298e-08, + "logits/chosen": -0.4764576554298401, + "logits/rejected": -0.6099754571914673, + "logps/chosen": -57.355873107910156, + "logps/rejected": -62.82902526855469, + "loss": 0.7448, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0763497352600098, + "rewards/margins": 5.338192939758301, + "rewards/rejected": -2.26184344291687, + "step": 5736 + }, + { + "epoch": 1.44, + "grad_norm": 3.4963419437408447, + "learning_rate": 4.596734961116289e-08, + "logits/chosen": -0.363559365272522, + "logits/rejected": -0.43366414308547974, + "logps/chosen": -46.53131103515625, + "logps/rejected": -93.96168518066406, + "loss": 0.5948, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1145167350769043, + "rewards/margins": 5.202179908752441, + "rewards/rejected": -2.087662935256958, + "step": 5737 + }, + { + "epoch": 1.44, + "grad_norm": 3.6990139484405518, + "learning_rate": 4.561361428185751e-08, + "logits/chosen": -0.3510667085647583, + "logits/rejected": -0.41882437467575073, + "logps/chosen": -62.967437744140625, + "logps/rejected": -91.67448425292969, + "loss": 0.7495, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.157053232192993, + "rewards/margins": 4.803851127624512, + "rewards/rejected": -1.6467981338500977, + "step": 5738 + }, + { + "epoch": 1.44, + "grad_norm": 5.569303512573242, + "learning_rate": 4.526123903852264e-08, + "logits/chosen": -0.37596505880355835, + "logits/rejected": -0.48646289110183716, + "logps/chosen": -61.12643814086914, + "logps/rejected": -76.0237808227539, + "loss": 0.8451, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.922175884246826, + "rewards/margins": 4.638556003570557, + "rewards/rejected": -1.7163807153701782, + "step": 5739 + }, + { + "epoch": 1.44, + "grad_norm": 8.376932144165039, + "learning_rate": 4.4910223977892574e-08, + "logits/chosen": -0.39105239510536194, + "logits/rejected": -0.47040683031082153, + "logps/chosen": -48.3699836730957, + "logps/rejected": -76.8803482055664, + "loss": 0.7619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9688799381256104, + "rewards/margins": 4.832960605621338, + "rewards/rejected": -1.864080786705017, + "step": 5740 + }, + { + "epoch": 1.44, + "grad_norm": 5.300906658172607, + "learning_rate": 4.456056919632856e-08, + "logits/chosen": -0.3555113971233368, + "logits/rejected": -0.5012816786766052, + "logps/chosen": -75.55514526367188, + "logps/rejected": -77.16645812988281, + "loss": 0.7229, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.667588233947754, + "rewards/margins": 5.311507225036621, + "rewards/rejected": -2.643918991088867, + "step": 5741 + }, + { + "epoch": 1.44, + "grad_norm": 10.930706977844238, + "learning_rate": 4.421227478981827e-08, + "logits/chosen": -0.2751641571521759, + "logits/rejected": -0.43118709325790405, + "logps/chosen": -65.08621215820312, + "logps/rejected": -74.57356262207031, + "loss": 0.9165, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9094018936157227, + "rewards/margins": 4.347292423248291, + "rewards/rejected": -1.4378907680511475, + "step": 5742 + }, + { + "epoch": 1.44, + "grad_norm": 5.466285228729248, + "learning_rate": 4.38653408539752e-08, + "logits/chosen": -0.38084763288497925, + "logits/rejected": -0.5004175305366516, + "logps/chosen": -50.80552291870117, + "logps/rejected": -83.38359832763672, + "loss": 0.7545, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.535961627960205, + "rewards/margins": 4.743826389312744, + "rewards/rejected": -2.207864761352539, + "step": 5743 + }, + { + "epoch": 1.44, + "grad_norm": 3.949157953262329, + "learning_rate": 4.351976748404208e-08, + "logits/chosen": -0.36551111936569214, + "logits/rejected": -0.3495381772518158, + "logps/chosen": -64.27611541748047, + "logps/rejected": -96.3094253540039, + "loss": 0.751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.969773530960083, + "rewards/margins": 4.222161293029785, + "rewards/rejected": -1.2523880004882812, + "step": 5744 + }, + { + "epoch": 1.44, + "grad_norm": 5.346224784851074, + "learning_rate": 4.317555477488411e-08, + "logits/chosen": -0.36309814453125, + "logits/rejected": -0.43142566084861755, + "logps/chosen": -83.9687728881836, + "logps/rejected": -99.62675476074219, + "loss": 0.9334, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5923874378204346, + "rewards/margins": 4.369577407836914, + "rewards/rejected": -1.7771897315979004, + "step": 5745 + }, + { + "epoch": 1.44, + "grad_norm": 6.221680164337158, + "learning_rate": 4.2832702820997386e-08, + "logits/chosen": -0.3187868297100067, + "logits/rejected": -0.4368494153022766, + "logps/chosen": -53.27153778076172, + "logps/rejected": -85.92794036865234, + "loss": 0.7695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1381444931030273, + "rewards/margins": 6.023283004760742, + "rewards/rejected": -2.8851382732391357, + "step": 5746 + }, + { + "epoch": 1.44, + "grad_norm": 6.226232051849365, + "learning_rate": 4.2491211716499946e-08, + "logits/chosen": -0.301321417093277, + "logits/rejected": -0.4513103663921356, + "logps/chosen": -63.25956726074219, + "logps/rejected": -69.55911254882812, + "loss": 0.6766, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1792235374450684, + "rewards/margins": 4.740995407104492, + "rewards/rejected": -1.5617713928222656, + "step": 5747 + }, + { + "epoch": 1.44, + "grad_norm": 6.629573822021484, + "learning_rate": 4.215108155514014e-08, + "logits/chosen": -0.36244988441467285, + "logits/rejected": -0.4836212396621704, + "logps/chosen": -62.538185119628906, + "logps/rejected": -86.50044250488281, + "loss": 0.7445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.956357002258301, + "rewards/margins": 5.574919700622559, + "rewards/rejected": -2.6185622215270996, + "step": 5748 + }, + { + "epoch": 1.44, + "grad_norm": 6.971039772033691, + "learning_rate": 4.1812312430289936e-08, + "logits/chosen": -0.3386489748954773, + "logits/rejected": -0.46731412410736084, + "logps/chosen": -62.515533447265625, + "logps/rejected": -64.33773803710938, + "loss": 0.8076, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.004122734069824, + "rewards/margins": 4.413979530334473, + "rewards/rejected": -1.4098572731018066, + "step": 5749 + }, + { + "epoch": 1.44, + "grad_norm": 6.785486698150635, + "learning_rate": 4.1474904434949395e-08, + "logits/chosen": -0.2864508032798767, + "logits/rejected": -0.39733532071113586, + "logps/chosen": -63.02768325805664, + "logps/rejected": -86.99603271484375, + "loss": 0.6973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.082857370376587, + "rewards/margins": 5.447870254516602, + "rewards/rejected": -2.3650126457214355, + "step": 5750 + }, + { + "epoch": 1.44, + "grad_norm": 11.749669075012207, + "learning_rate": 4.1138857661744417e-08, + "logits/chosen": -0.29185330867767334, + "logits/rejected": -0.40779268741607666, + "logps/chosen": -64.1985855102539, + "logps/rejected": -84.31269073486328, + "loss": 0.8539, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8025879859924316, + "rewards/margins": 4.375051975250244, + "rewards/rejected": -1.5724637508392334, + "step": 5751 + }, + { + "epoch": 1.44, + "grad_norm": 2.9515395164489746, + "learning_rate": 4.080417220292676e-08, + "logits/chosen": -0.39074158668518066, + "logits/rejected": -0.4809887409210205, + "logps/chosen": -54.55857849121094, + "logps/rejected": -85.21563720703125, + "loss": 0.6527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.211625099182129, + "rewards/margins": 5.382286071777344, + "rewards/rejected": -2.170661449432373, + "step": 5752 + }, + { + "epoch": 1.44, + "grad_norm": 4.26984167098999, + "learning_rate": 4.0470848150374054e-08, + "logits/chosen": -0.4051750898361206, + "logits/rejected": -0.4818832278251648, + "logps/chosen": -44.638267517089844, + "logps/rejected": -87.19190216064453, + "loss": 0.5556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9672107696533203, + "rewards/margins": 5.101850509643555, + "rewards/rejected": -2.1346395015716553, + "step": 5753 + }, + { + "epoch": 1.44, + "grad_norm": 7.347752094268799, + "learning_rate": 4.013888559559253e-08, + "logits/chosen": -0.45536279678344727, + "logits/rejected": -0.5177932977676392, + "logps/chosen": -53.242828369140625, + "logps/rejected": -92.09229278564453, + "loss": 0.8325, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7865962982177734, + "rewards/margins": 5.443049907684326, + "rewards/rejected": -2.656454086303711, + "step": 5754 + }, + { + "epoch": 1.44, + "grad_norm": 8.572094917297363, + "learning_rate": 3.980828462971209e-08, + "logits/chosen": -0.3813139498233795, + "logits/rejected": -0.46235156059265137, + "logps/chosen": -55.38393020629883, + "logps/rejected": -97.21302032470703, + "loss": 0.7323, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0758345127105713, + "rewards/margins": 5.431332111358643, + "rewards/rejected": -2.3554975986480713, + "step": 5755 + }, + { + "epoch": 1.44, + "grad_norm": 4.371494770050049, + "learning_rate": 3.9479045343489565e-08, + "logits/chosen": -0.33469611406326294, + "logits/rejected": -0.45531362295150757, + "logps/chosen": -60.77823257446289, + "logps/rejected": -86.86778259277344, + "loss": 0.6331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.973634719848633, + "rewards/margins": 4.957480430603027, + "rewards/rejected": -1.9838459491729736, + "step": 5756 + }, + { + "epoch": 1.44, + "grad_norm": 7.138363838195801, + "learning_rate": 3.915116782730932e-08, + "logits/chosen": -0.38395199179649353, + "logits/rejected": -0.44076818227767944, + "logps/chosen": -54.541839599609375, + "logps/rejected": -79.38716125488281, + "loss": 0.8859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7634787559509277, + "rewards/margins": 4.896642208099365, + "rewards/rejected": -2.1331634521484375, + "step": 5757 + }, + { + "epoch": 1.44, + "grad_norm": 5.763890743255615, + "learning_rate": 3.882465217117992e-08, + "logits/chosen": -0.3159935474395752, + "logits/rejected": -0.37902915477752686, + "logps/chosen": -57.07166290283203, + "logps/rejected": -83.05200958251953, + "loss": 0.6986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8426260948181152, + "rewards/margins": 4.956499099731445, + "rewards/rejected": -2.11387300491333, + "step": 5758 + }, + { + "epoch": 1.44, + "grad_norm": 4.302670001983643, + "learning_rate": 3.849949846473744e-08, + "logits/chosen": -0.4063574969768524, + "logits/rejected": -0.49564626812934875, + "logps/chosen": -48.85062026977539, + "logps/rejected": -76.55979919433594, + "loss": 0.6564, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0170702934265137, + "rewards/margins": 4.678652286529541, + "rewards/rejected": -1.6615817546844482, + "step": 5759 + }, + { + "epoch": 1.44, + "grad_norm": 6.485288619995117, + "learning_rate": 3.817570679724269e-08, + "logits/chosen": -0.3272322714328766, + "logits/rejected": -0.4031140208244324, + "logps/chosen": -53.614585876464844, + "logps/rejected": -100.53016662597656, + "loss": 0.7492, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.582726001739502, + "rewards/margins": 4.735210418701172, + "rewards/rejected": -2.152484893798828, + "step": 5760 + }, + { + "epoch": 1.44, + "grad_norm": 3.8961551189422607, + "learning_rate": 3.7853277257584586e-08, + "logits/chosen": -0.45646458864212036, + "logits/rejected": -0.518886387348175, + "logps/chosen": -42.1256103515625, + "logps/rejected": -94.00372314453125, + "loss": 0.6177, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7739200592041016, + "rewards/margins": 6.120547771453857, + "rewards/rejected": -3.3466269969940186, + "step": 5761 + }, + { + "epoch": 1.44, + "grad_norm": 10.01818561553955, + "learning_rate": 3.753220993427675e-08, + "logits/chosen": -0.3110567629337311, + "logits/rejected": -0.3991805911064148, + "logps/chosen": -60.34464645385742, + "logps/rejected": -92.82130432128906, + "loss": 0.7308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6024491786956787, + "rewards/margins": 4.957708358764648, + "rewards/rejected": -2.3552589416503906, + "step": 5762 + }, + { + "epoch": 1.44, + "grad_norm": 6.656800746917725, + "learning_rate": 3.721250491545869e-08, + "logits/chosen": -0.2539956569671631, + "logits/rejected": -0.30528974533081055, + "logps/chosen": -86.05754852294922, + "logps/rejected": -113.55718994140625, + "loss": 0.8739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.582406759262085, + "rewards/margins": 4.624651908874512, + "rewards/rejected": -2.0422449111938477, + "step": 5763 + }, + { + "epoch": 1.44, + "grad_norm": 4.261110305786133, + "learning_rate": 3.6894162288896864e-08, + "logits/chosen": -0.3565840423107147, + "logits/rejected": -0.47343045473098755, + "logps/chosen": -48.983707427978516, + "logps/rejected": -77.84744262695312, + "loss": 0.5958, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.304244041442871, + "rewards/margins": 5.600615501403809, + "rewards/rejected": -2.2963716983795166, + "step": 5764 + }, + { + "epoch": 1.44, + "grad_norm": 3.5877890586853027, + "learning_rate": 3.657718214198247e-08, + "logits/chosen": -0.380501925945282, + "logits/rejected": -0.46325260400772095, + "logps/chosen": -45.780242919921875, + "logps/rejected": -82.85266876220703, + "loss": 0.7002, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9309463500976562, + "rewards/margins": 4.389060020446777, + "rewards/rejected": -1.458113193511963, + "step": 5765 + }, + { + "epoch": 1.44, + "grad_norm": 4.887473106384277, + "learning_rate": 3.626156456173424e-08, + "logits/chosen": -0.479062557220459, + "logits/rejected": -0.4686407744884491, + "logps/chosen": -47.71457290649414, + "logps/rejected": -89.09799194335938, + "loss": 0.7289, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8992650508880615, + "rewards/margins": 4.759191989898682, + "rewards/rejected": -1.8599269390106201, + "step": 5766 + }, + { + "epoch": 1.44, + "grad_norm": 4.8481340408325195, + "learning_rate": 3.594730963479509e-08, + "logits/chosen": -0.27493906021118164, + "logits/rejected": -0.35672903060913086, + "logps/chosen": -72.53898620605469, + "logps/rejected": -83.25823211669922, + "loss": 0.8459, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.952793836593628, + "rewards/margins": 4.057190418243408, + "rewards/rejected": -1.104396104812622, + "step": 5767 + }, + { + "epoch": 1.44, + "grad_norm": 5.175452709197998, + "learning_rate": 3.56344174474349e-08, + "logits/chosen": -0.25757503509521484, + "logits/rejected": -0.42585861682891846, + "logps/chosen": -67.57239532470703, + "logps/rejected": -81.7236557006836, + "loss": 0.7161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8949384689331055, + "rewards/margins": 4.860165596008301, + "rewards/rejected": -1.9652267694473267, + "step": 5768 + }, + { + "epoch": 1.44, + "grad_norm": 3.3606505393981934, + "learning_rate": 3.532288808555051e-08, + "logits/chosen": -0.35793694853782654, + "logits/rejected": -0.43087252974510193, + "logps/chosen": -55.80194854736328, + "logps/rejected": -95.90274047851562, + "loss": 0.6685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1660196781158447, + "rewards/margins": 5.012548923492432, + "rewards/rejected": -1.8465293645858765, + "step": 5769 + }, + { + "epoch": 1.44, + "grad_norm": 6.660044193267822, + "learning_rate": 3.50127216346624e-08, + "logits/chosen": -0.31858283281326294, + "logits/rejected": -0.35326337814331055, + "logps/chosen": -50.577919006347656, + "logps/rejected": -80.30708312988281, + "loss": 0.8745, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6977124214172363, + "rewards/margins": 3.197455406188965, + "rewards/rejected": -0.49974262714385986, + "step": 5770 + }, + { + "epoch": 1.44, + "grad_norm": 4.344196319580078, + "learning_rate": 3.470391817991803e-08, + "logits/chosen": -0.3384239673614502, + "logits/rejected": -0.47617512941360474, + "logps/chosen": -58.20652770996094, + "logps/rejected": -86.83612060546875, + "loss": 0.6795, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9631192684173584, + "rewards/margins": 5.406644821166992, + "rewards/rejected": -2.4435248374938965, + "step": 5771 + }, + { + "epoch": 1.44, + "grad_norm": 5.656194686889648, + "learning_rate": 3.439647780609068e-08, + "logits/chosen": -0.35739412903785706, + "logits/rejected": -0.4134019613265991, + "logps/chosen": -59.93704605102539, + "logps/rejected": -90.6772689819336, + "loss": 0.8064, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6731765270233154, + "rewards/margins": 4.433533668518066, + "rewards/rejected": -1.7603569030761719, + "step": 5772 + }, + { + "epoch": 1.44, + "grad_norm": 3.7227189540863037, + "learning_rate": 3.409040059757951e-08, + "logits/chosen": -0.3674769401550293, + "logits/rejected": -0.4477428197860718, + "logps/chosen": -43.419586181640625, + "logps/rejected": -83.79034423828125, + "loss": 0.6418, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015320301055908, + "rewards/margins": 5.333407402038574, + "rewards/rejected": -2.318087339401245, + "step": 5773 + }, + { + "epoch": 1.44, + "grad_norm": 5.412029266357422, + "learning_rate": 3.378568663840898e-08, + "logits/chosen": -0.4303610026836395, + "logits/rejected": -0.5099406242370605, + "logps/chosen": -54.93172836303711, + "logps/rejected": -93.36708068847656, + "loss": 0.6998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1007614135742188, + "rewards/margins": 5.541825771331787, + "rewards/rejected": -2.4410645961761475, + "step": 5774 + }, + { + "epoch": 1.44, + "grad_norm": 7.092519283294678, + "learning_rate": 3.34823360122305e-08, + "logits/chosen": -0.3416740894317627, + "logits/rejected": -0.3540310859680176, + "logps/chosen": -52.94163513183594, + "logps/rejected": -98.95467376708984, + "loss": 0.7092, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.760300874710083, + "rewards/margins": 5.287343502044678, + "rewards/rejected": -2.527042865753174, + "step": 5775 + }, + { + "epoch": 1.44, + "grad_norm": 7.713984489440918, + "learning_rate": 3.318034880231968e-08, + "logits/chosen": -0.3235284686088562, + "logits/rejected": -0.4314053952693939, + "logps/chosen": -59.9442138671875, + "logps/rejected": -79.09903717041016, + "loss": 0.7306, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8876793384552, + "rewards/margins": 4.736337661743164, + "rewards/rejected": -1.8486586809158325, + "step": 5776 + }, + { + "epoch": 1.45, + "grad_norm": 6.486109733581543, + "learning_rate": 3.287972509157855e-08, + "logits/chosen": -0.30935004353523254, + "logits/rejected": -0.4367622435092926, + "logps/chosen": -58.65583419799805, + "logps/rejected": -79.8340072631836, + "loss": 0.7634, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9409079551696777, + "rewards/margins": 4.971883296966553, + "rewards/rejected": -2.030975580215454, + "step": 5777 + }, + { + "epoch": 1.45, + "grad_norm": 3.9399638175964355, + "learning_rate": 3.258046496253442e-08, + "logits/chosen": -0.3438356816768646, + "logits/rejected": -0.45421677827835083, + "logps/chosen": -63.51205062866211, + "logps/rejected": -90.09186553955078, + "loss": 0.8126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.934577703475952, + "rewards/margins": 5.729483604431152, + "rewards/rejected": -2.7949061393737793, + "step": 5778 + }, + { + "epoch": 1.45, + "grad_norm": 5.151797294616699, + "learning_rate": 3.228256849734157e-08, + "logits/chosen": -0.32379472255706787, + "logits/rejected": -0.40797409415245056, + "logps/chosen": -57.48289489746094, + "logps/rejected": -86.83910369873047, + "loss": 0.7254, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0051512718200684, + "rewards/margins": 4.5583109855651855, + "rewards/rejected": -1.5531597137451172, + "step": 5779 + }, + { + "epoch": 1.45, + "grad_norm": 3.55996036529541, + "learning_rate": 3.198603577777848e-08, + "logits/chosen": -0.345736026763916, + "logits/rejected": -0.4515230655670166, + "logps/chosen": -46.74171447753906, + "logps/rejected": -72.75502014160156, + "loss": 0.6852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0903875827789307, + "rewards/margins": 4.516973495483398, + "rewards/rejected": -1.4265857934951782, + "step": 5780 + }, + { + "epoch": 1.45, + "grad_norm": 4.054001331329346, + "learning_rate": 3.169086688524947e-08, + "logits/chosen": -0.31455254554748535, + "logits/rejected": -0.3718203008174896, + "logps/chosen": -55.45554733276367, + "logps/rejected": -81.16766357421875, + "loss": 0.734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7457213401794434, + "rewards/margins": 4.24471378326416, + "rewards/rejected": -1.4989922046661377, + "step": 5781 + }, + { + "epoch": 1.45, + "grad_norm": 5.8602190017700195, + "learning_rate": 3.139706190078584e-08, + "logits/chosen": -0.37609174847602844, + "logits/rejected": -0.4276628792285919, + "logps/chosen": -45.10556411743164, + "logps/rejected": -101.32524871826172, + "loss": 0.6813, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.052664041519165, + "rewards/margins": 5.0186543464660645, + "rewards/rejected": -1.9659901857376099, + "step": 5782 + }, + { + "epoch": 1.45, + "grad_norm": 10.788213729858398, + "learning_rate": 3.110462090504251e-08, + "logits/chosen": -0.4527757167816162, + "logits/rejected": -0.535022497177124, + "logps/chosen": -57.88581848144531, + "logps/rejected": -89.62566375732422, + "loss": 0.7121, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7830517292022705, + "rewards/margins": 4.886929988861084, + "rewards/rejected": -2.1038780212402344, + "step": 5783 + }, + { + "epoch": 1.45, + "grad_norm": 3.8595499992370605, + "learning_rate": 3.081354397830083e-08, + "logits/chosen": -0.3568214178085327, + "logits/rejected": -0.4479580521583557, + "logps/chosen": -69.62196350097656, + "logps/rejected": -91.15733337402344, + "loss": 0.7852, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9292755126953125, + "rewards/margins": 5.138714790344238, + "rewards/rejected": -2.209439516067505, + "step": 5784 + }, + { + "epoch": 1.45, + "grad_norm": 3.987344741821289, + "learning_rate": 3.052383120046798e-08, + "logits/chosen": -0.4110204875469208, + "logits/rejected": -0.48923131823539734, + "logps/chosen": -51.001739501953125, + "logps/rejected": -88.37405395507812, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.903616428375244, + "rewards/margins": 4.980162143707275, + "rewards/rejected": -2.0765457153320312, + "step": 5785 + }, + { + "epoch": 1.45, + "grad_norm": 5.602870941162109, + "learning_rate": 3.02354826510759e-08, + "logits/chosen": -0.29646334052085876, + "logits/rejected": -0.43094027042388916, + "logps/chosen": -53.80379867553711, + "logps/rejected": -81.94905090332031, + "loss": 0.7914, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8630685806274414, + "rewards/margins": 4.570691108703613, + "rewards/rejected": -1.707622766494751, + "step": 5786 + }, + { + "epoch": 1.45, + "grad_norm": 4.131869316101074, + "learning_rate": 2.994849840928404e-08, + "logits/chosen": -0.4155464768409729, + "logits/rejected": -0.4591815173625946, + "logps/chosen": -55.76474380493164, + "logps/rejected": -101.78703308105469, + "loss": 0.8366, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.948712110519409, + "rewards/margins": 4.949759483337402, + "rewards/rejected": -2.001047134399414, + "step": 5787 + }, + { + "epoch": 1.45, + "grad_norm": 5.2259392738342285, + "learning_rate": 2.9662878553873286e-08, + "logits/chosen": -0.35336732864379883, + "logits/rejected": -0.4417160451412201, + "logps/chosen": -61.910606384277344, + "logps/rejected": -86.25861358642578, + "loss": 0.7746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6463265419006348, + "rewards/margins": 4.214488506317139, + "rewards/rejected": -1.568161964416504, + "step": 5788 + }, + { + "epoch": 1.45, + "grad_norm": 8.838412284851074, + "learning_rate": 2.9378623163254794e-08, + "logits/chosen": -0.36915498971939087, + "logits/rejected": -0.39820149540901184, + "logps/chosen": -45.811737060546875, + "logps/rejected": -91.06681823730469, + "loss": 0.7803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9812521934509277, + "rewards/margins": 5.160173416137695, + "rewards/rejected": -2.1789214611053467, + "step": 5789 + }, + { + "epoch": 1.45, + "grad_norm": 3.28182053565979, + "learning_rate": 2.9095732315461144e-08, + "logits/chosen": -0.3203587234020233, + "logits/rejected": -0.4513641893863678, + "logps/chosen": -47.47985076904297, + "logps/rejected": -80.88892364501953, + "loss": 0.6416, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.988211154937744, + "rewards/margins": 4.867183208465576, + "rewards/rejected": -1.878972053527832, + "step": 5790 + }, + { + "epoch": 1.45, + "grad_norm": 4.0124664306640625, + "learning_rate": 2.8814206088152997e-08, + "logits/chosen": -0.4064088761806488, + "logits/rejected": -0.47866091132164, + "logps/chosen": -49.441383361816406, + "logps/rejected": -79.80165100097656, + "loss": 0.7344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6819441318511963, + "rewards/margins": 3.9809579849243164, + "rewards/rejected": -1.2990138530731201, + "step": 5791 + }, + { + "epoch": 1.45, + "grad_norm": 4.931164264678955, + "learning_rate": 2.853404455861519e-08, + "logits/chosen": -0.3145354688167572, + "logits/rejected": -0.35600700974464417, + "logps/chosen": -49.939674377441406, + "logps/rejected": -77.98004150390625, + "loss": 0.7397, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0298237800598145, + "rewards/margins": 4.055691719055176, + "rewards/rejected": -1.0258684158325195, + "step": 5792 + }, + { + "epoch": 1.45, + "grad_norm": 4.326612949371338, + "learning_rate": 2.8255247803757322e-08, + "logits/chosen": -0.3407982587814331, + "logits/rejected": -0.47357484698295593, + "logps/chosen": -49.465126037597656, + "logps/rejected": -81.24419403076172, + "loss": 0.6868, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6882569789886475, + "rewards/margins": 5.292307376861572, + "rewards/rejected": -2.604050397872925, + "step": 5793 + }, + { + "epoch": 1.45, + "grad_norm": 5.3405022621154785, + "learning_rate": 2.7977815900115944e-08, + "logits/chosen": -0.37967678904533386, + "logits/rejected": -0.49332210421562195, + "logps/chosen": -59.507633209228516, + "logps/rejected": -96.9879150390625, + "loss": 0.729, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7867534160614014, + "rewards/margins": 5.817050933837891, + "rewards/rejected": -3.030296802520752, + "step": 5794 + }, + { + "epoch": 1.45, + "grad_norm": 6.137275695800781, + "learning_rate": 2.7701748923851802e-08, + "logits/chosen": -0.2911478281021118, + "logits/rejected": -0.3744281530380249, + "logps/chosen": -54.9490966796875, + "logps/rejected": -82.24602508544922, + "loss": 0.7892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6582159996032715, + "rewards/margins": 4.236838340759277, + "rewards/rejected": -1.5786223411560059, + "step": 5795 + }, + { + "epoch": 1.45, + "grad_norm": 4.163485527038574, + "learning_rate": 2.742704695075149e-08, + "logits/chosen": -0.3487912714481354, + "logits/rejected": -0.42678549885749817, + "logps/chosen": -65.17041015625, + "logps/rejected": -98.8416748046875, + "loss": 0.7918, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8690803050994873, + "rewards/margins": 5.3874640464782715, + "rewards/rejected": -2.518383264541626, + "step": 5796 + }, + { + "epoch": 1.45, + "grad_norm": 5.648227691650391, + "learning_rate": 2.7153710056226358e-08, + "logits/chosen": -0.33889976143836975, + "logits/rejected": -0.45995110273361206, + "logps/chosen": -57.63587188720703, + "logps/rejected": -86.85873413085938, + "loss": 0.7497, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.680554151535034, + "rewards/margins": 4.9112324714660645, + "rewards/rejected": -2.230677843093872, + "step": 5797 + }, + { + "epoch": 1.45, + "grad_norm": 4.571342468261719, + "learning_rate": 2.6881738315313598e-08, + "logits/chosen": -0.3614122271537781, + "logits/rejected": -0.4226253628730774, + "logps/chosen": -63.401268005371094, + "logps/rejected": -106.28271484375, + "loss": 0.783, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0574045181274414, + "rewards/margins": 5.813284873962402, + "rewards/rejected": -2.755880355834961, + "step": 5798 + }, + { + "epoch": 1.45, + "grad_norm": 8.728556632995605, + "learning_rate": 2.661113180267516e-08, + "logits/chosen": -0.3639570474624634, + "logits/rejected": -0.47047343850135803, + "logps/chosen": -50.084442138671875, + "logps/rejected": -81.68657684326172, + "loss": 0.6924, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.008394241333008, + "rewards/margins": 5.652212142944336, + "rewards/rejected": -2.6438181400299072, + "step": 5799 + }, + { + "epoch": 1.45, + "grad_norm": 3.0830495357513428, + "learning_rate": 2.634189059259773e-08, + "logits/chosen": -0.3962775766849518, + "logits/rejected": -0.47630539536476135, + "logps/chosen": -68.01642608642578, + "logps/rejected": -88.74032592773438, + "loss": 0.7048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4374232292175293, + "rewards/margins": 6.26287317276001, + "rewards/rejected": -2.8254497051239014, + "step": 5800 + }, + { + "epoch": 1.45, + "grad_norm": 3.8205199241638184, + "learning_rate": 2.607401475899496e-08, + "logits/chosen": -0.3528122007846832, + "logits/rejected": -0.4580686688423157, + "logps/chosen": -64.23872375488281, + "logps/rejected": -89.81282043457031, + "loss": 0.7443, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.096890449523926, + "rewards/margins": 5.6280951499938965, + "rewards/rejected": -2.5312039852142334, + "step": 5801 + }, + { + "epoch": 1.45, + "grad_norm": 13.563192367553711, + "learning_rate": 2.580750437540358e-08, + "logits/chosen": -0.36276909708976746, + "logits/rejected": -0.438498854637146, + "logps/chosen": -60.54645538330078, + "logps/rejected": -78.89877319335938, + "loss": 0.8581, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.69275164604187, + "rewards/margins": 4.243940830230713, + "rewards/rejected": -1.5511893033981323, + "step": 5802 + }, + { + "epoch": 1.45, + "grad_norm": 6.070453643798828, + "learning_rate": 2.554235951498729e-08, + "logits/chosen": -0.3822646737098694, + "logits/rejected": -0.4756435751914978, + "logps/chosen": -56.97062301635742, + "logps/rejected": -87.12589263916016, + "loss": 0.7467, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.685163974761963, + "rewards/margins": 5.283006191253662, + "rewards/rejected": -2.597841501235962, + "step": 5803 + }, + { + "epoch": 1.45, + "grad_norm": 9.51156234741211, + "learning_rate": 2.5278580250533426e-08, + "logits/chosen": -0.3958622217178345, + "logits/rejected": -0.44255712628364563, + "logps/chosen": -56.766563415527344, + "logps/rejected": -76.0407943725586, + "loss": 0.8122, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9735474586486816, + "rewards/margins": 3.967228889465332, + "rewards/rejected": -0.9936810731887817, + "step": 5804 + }, + { + "epoch": 1.45, + "grad_norm": 6.632789134979248, + "learning_rate": 2.5016166654455166e-08, + "logits/chosen": -0.3774532377719879, + "logits/rejected": -0.40418219566345215, + "logps/chosen": -63.74342346191406, + "logps/rejected": -96.45934295654297, + "loss": 0.9493, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7748842239379883, + "rewards/margins": 3.711613178253174, + "rewards/rejected": -0.9367291331291199, + "step": 5805 + }, + { + "epoch": 1.45, + "grad_norm": 4.669487953186035, + "learning_rate": 2.4755118798791e-08, + "logits/chosen": -0.3506947457790375, + "logits/rejected": -0.4746888279914856, + "logps/chosen": -57.44594192504883, + "logps/rejected": -78.44169616699219, + "loss": 0.7253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.856903076171875, + "rewards/margins": 4.787376880645752, + "rewards/rejected": -1.930473804473877, + "step": 5806 + }, + { + "epoch": 1.45, + "grad_norm": 6.269207954406738, + "learning_rate": 2.4495436755204162e-08, + "logits/chosen": -0.3212439715862274, + "logits/rejected": -0.4455333650112152, + "logps/chosen": -54.13225173950195, + "logps/rejected": -95.35384368896484, + "loss": 0.6201, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.951817512512207, + "rewards/margins": 5.291173458099365, + "rewards/rejected": -2.339355707168579, + "step": 5807 + }, + { + "epoch": 1.45, + "grad_norm": 4.879141330718994, + "learning_rate": 2.4237120594982623e-08, + "logits/chosen": -0.3194701373577118, + "logits/rejected": -0.48494642972946167, + "logps/chosen": -61.191104888916016, + "logps/rejected": -86.17036437988281, + "loss": 0.6778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8848984241485596, + "rewards/margins": 5.36899471282959, + "rewards/rejected": -2.4840965270996094, + "step": 5808 + }, + { + "epoch": 1.45, + "grad_norm": 2.653238296508789, + "learning_rate": 2.3980170389040213e-08, + "logits/chosen": -0.41210514307022095, + "logits/rejected": -0.47674399614334106, + "logps/chosen": -53.31328582763672, + "logps/rejected": -68.916015625, + "loss": 0.7071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8892924785614014, + "rewards/margins": 4.961771011352539, + "rewards/rejected": -2.0724785327911377, + "step": 5809 + }, + { + "epoch": 1.45, + "grad_norm": 4.303835868835449, + "learning_rate": 2.37245862079144e-08, + "logits/chosen": -0.2877212464809418, + "logits/rejected": -0.38938331604003906, + "logps/chosen": -52.91484451293945, + "logps/rejected": -80.60993194580078, + "loss": 0.6946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9647884368896484, + "rewards/margins": 3.9287168979644775, + "rewards/rejected": -0.9639281034469604, + "step": 5810 + }, + { + "epoch": 1.45, + "grad_norm": 6.22410774230957, + "learning_rate": 2.347036812177017e-08, + "logits/chosen": -0.3222348392009735, + "logits/rejected": -0.3899660110473633, + "logps/chosen": -57.87616729736328, + "logps/rejected": -96.00878143310547, + "loss": 0.7643, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0428717136383057, + "rewards/margins": 5.203283786773682, + "rewards/rejected": -2.160411834716797, + "step": 5811 + }, + { + "epoch": 1.45, + "grad_norm": 4.707569599151611, + "learning_rate": 2.3217516200394474e-08, + "logits/chosen": -0.33475005626678467, + "logits/rejected": -0.4435619115829468, + "logps/chosen": -55.79948425292969, + "logps/rejected": -78.7112045288086, + "loss": 0.7595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.976074457168579, + "rewards/margins": 4.754464626312256, + "rewards/rejected": -1.7783904075622559, + "step": 5812 + }, + { + "epoch": 1.45, + "grad_norm": 9.217704772949219, + "learning_rate": 2.2966030513200678e-08, + "logits/chosen": -0.29923364520072937, + "logits/rejected": -0.3765675127506256, + "logps/chosen": -61.71627426147461, + "logps/rejected": -81.9775619506836, + "loss": 0.9008, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.564859628677368, + "rewards/margins": 3.157961130142212, + "rewards/rejected": -0.5931016206741333, + "step": 5813 + }, + { + "epoch": 1.45, + "grad_norm": 6.797170639038086, + "learning_rate": 2.2715911129227996e-08, + "logits/chosen": -0.3438183665275574, + "logits/rejected": -0.49755606055259705, + "logps/chosen": -53.7965087890625, + "logps/rejected": -75.280517578125, + "loss": 0.6631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.938650369644165, + "rewards/margins": 5.792259216308594, + "rewards/rejected": -2.8536088466644287, + "step": 5814 + }, + { + "epoch": 1.45, + "grad_norm": 4.624109745025635, + "learning_rate": 2.2467158117138733e-08, + "logits/chosen": -0.432302862405777, + "logits/rejected": -0.5032861828804016, + "logps/chosen": -47.35771179199219, + "logps/rejected": -84.24873352050781, + "loss": 0.6545, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0894296169281006, + "rewards/margins": 5.5158371925354, + "rewards/rejected": -2.4264068603515625, + "step": 5815 + }, + { + "epoch": 1.45, + "grad_norm": 4.055586338043213, + "learning_rate": 2.221977154522159e-08, + "logits/chosen": -0.3175526261329651, + "logits/rejected": -0.43505150079727173, + "logps/chosen": -64.16421508789062, + "logps/rejected": -87.13114166259766, + "loss": 0.6995, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.744267463684082, + "rewards/margins": 4.476497650146484, + "rewards/rejected": -1.7322301864624023, + "step": 5816 + }, + { + "epoch": 1.46, + "grad_norm": 4.603535175323486, + "learning_rate": 2.1973751481388917e-08, + "logits/chosen": -0.32522255182266235, + "logits/rejected": -0.44288185238838196, + "logps/chosen": -65.10228729248047, + "logps/rejected": -81.4014663696289, + "loss": 0.7903, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9384610652923584, + "rewards/margins": 5.079591274261475, + "rewards/rejected": -2.141129970550537, + "step": 5817 + }, + { + "epoch": 1.46, + "grad_norm": 13.16512680053711, + "learning_rate": 2.1729097993178904e-08, + "logits/chosen": -0.4550337791442871, + "logits/rejected": -0.4670380651950836, + "logps/chosen": -53.1187744140625, + "logps/rejected": -96.25247192382812, + "loss": 0.8314, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.080315113067627, + "rewards/margins": 5.164710998535156, + "rewards/rejected": -2.0843961238861084, + "step": 5818 + }, + { + "epoch": 1.46, + "grad_norm": 4.861238479614258, + "learning_rate": 2.1485811147753943e-08, + "logits/chosen": -0.3242093622684479, + "logits/rejected": -0.42626461386680603, + "logps/chosen": -71.792236328125, + "logps/rejected": -88.22344207763672, + "loss": 0.9358, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7780447006225586, + "rewards/margins": 3.8352839946746826, + "rewards/rejected": -1.0572391748428345, + "step": 5819 + }, + { + "epoch": 1.46, + "grad_norm": 5.151688098907471, + "learning_rate": 2.124389101190172e-08, + "logits/chosen": -0.39707666635513306, + "logits/rejected": -0.43841859698295593, + "logps/chosen": -94.80516052246094, + "logps/rejected": -80.376953125, + "loss": 0.8203, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7203967571258545, + "rewards/margins": 4.332886695861816, + "rewards/rejected": -1.612489938735962, + "step": 5820 + }, + { + "epoch": 1.46, + "grad_norm": 4.045431137084961, + "learning_rate": 2.100333765203466e-08, + "logits/chosen": -0.29652613401412964, + "logits/rejected": -0.41729936003685, + "logps/chosen": -51.00730514526367, + "logps/rejected": -93.22205352783203, + "loss": 0.6024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.925555467605591, + "rewards/margins": 6.475739002227783, + "rewards/rejected": -3.550183057785034, + "step": 5821 + }, + { + "epoch": 1.46, + "grad_norm": 9.374018669128418, + "learning_rate": 2.0764151134189946e-08, + "logits/chosen": -0.34225767850875854, + "logits/rejected": -0.44605907797813416, + "logps/chosen": -58.2037353515625, + "logps/rejected": -87.45204162597656, + "loss": 0.689, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.735229969024658, + "rewards/margins": 5.116323947906494, + "rewards/rejected": -2.381094217300415, + "step": 5822 + }, + { + "epoch": 1.46, + "grad_norm": 3.0609278678894043, + "learning_rate": 2.0526331524028385e-08, + "logits/chosen": -0.39692577719688416, + "logits/rejected": -0.4539974629878998, + "logps/chosen": -41.711578369140625, + "logps/rejected": -85.90511322021484, + "loss": 0.6068, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1034274101257324, + "rewards/margins": 5.147089958190918, + "rewards/rejected": -2.0436630249023438, + "step": 5823 + }, + { + "epoch": 1.46, + "grad_norm": 9.28796672821045, + "learning_rate": 2.028987888683831e-08, + "logits/chosen": -0.313148558139801, + "logits/rejected": -0.4157235324382782, + "logps/chosen": -52.727108001708984, + "logps/rejected": -88.65521240234375, + "loss": 0.7495, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7660062313079834, + "rewards/margins": 4.8031229972839355, + "rewards/rejected": -2.0371170043945312, + "step": 5824 + }, + { + "epoch": 1.46, + "grad_norm": 4.198295593261719, + "learning_rate": 2.005479328752946e-08, + "logits/chosen": -0.4182901680469513, + "logits/rejected": -0.4145076870918274, + "logps/chosen": -53.093135833740234, + "logps/rejected": -104.87568664550781, + "loss": 0.6755, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.208287000656128, + "rewards/margins": 4.641560077667236, + "rewards/rejected": -1.4332730770111084, + "step": 5825 + }, + { + "epoch": 1.46, + "grad_norm": 4.2106828689575195, + "learning_rate": 1.982107479063855e-08, + "logits/chosen": -0.38616350293159485, + "logits/rejected": -0.47576355934143066, + "logps/chosen": -56.726112365722656, + "logps/rejected": -82.02326202392578, + "loss": 0.7119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7727785110473633, + "rewards/margins": 5.917764186859131, + "rewards/rejected": -3.1449856758117676, + "step": 5826 + }, + { + "epoch": 1.46, + "grad_norm": 10.179666519165039, + "learning_rate": 1.9588723460327032e-08, + "logits/chosen": -0.4577200412750244, + "logits/rejected": -0.4536226987838745, + "logps/chosen": -52.641029357910156, + "logps/rejected": -111.30166625976562, + "loss": 0.7482, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.955582618713379, + "rewards/margins": 5.567737579345703, + "rewards/rejected": -2.6121551990509033, + "step": 5827 + }, + { + "epoch": 1.46, + "grad_norm": 3.515012264251709, + "learning_rate": 1.935773936037888e-08, + "logits/chosen": -0.24035175144672394, + "logits/rejected": -0.36736026406288147, + "logps/chosen": -59.89942169189453, + "logps/rejected": -71.84376525878906, + "loss": 0.6616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0777339935302734, + "rewards/margins": 4.881056308746338, + "rewards/rejected": -1.803322434425354, + "step": 5828 + }, + { + "epoch": 1.46, + "grad_norm": 6.090829849243164, + "learning_rate": 1.9128122554205597e-08, + "logits/chosen": -0.3744339346885681, + "logits/rejected": -0.4287925958633423, + "logps/chosen": -50.55896759033203, + "logps/rejected": -99.29072570800781, + "loss": 0.6882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7880961894989014, + "rewards/margins": 5.736020565032959, + "rewards/rejected": -2.9479246139526367, + "step": 5829 + }, + { + "epoch": 1.46, + "grad_norm": 4.698453426361084, + "learning_rate": 1.8899873104841205e-08, + "logits/chosen": -0.42470502853393555, + "logits/rejected": -0.5394649505615234, + "logps/chosen": -53.412086486816406, + "logps/rejected": -81.2540512084961, + "loss": 0.7316, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8461482524871826, + "rewards/margins": 5.131930351257324, + "rewards/rejected": -2.2857823371887207, + "step": 5830 + }, + { + "epoch": 1.46, + "grad_norm": 5.6267008781433105, + "learning_rate": 1.867299107494558e-08, + "logits/chosen": -0.4504512548446655, + "logits/rejected": -0.5467085242271423, + "logps/chosen": -59.15687561035156, + "logps/rejected": -96.89277648925781, + "loss": 0.7849, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.820915460586548, + "rewards/margins": 5.399442672729492, + "rewards/rejected": -2.5785274505615234, + "step": 5831 + }, + { + "epoch": 1.46, + "grad_norm": 10.183341026306152, + "learning_rate": 1.8447476526802787e-08, + "logits/chosen": -0.30956459045410156, + "logits/rejected": -0.4385920763015747, + "logps/chosen": -66.55183410644531, + "logps/rejected": -80.0489501953125, + "loss": 0.8356, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2502613067626953, + "rewards/margins": 4.409620761871338, + "rewards/rejected": -2.159359931945801, + "step": 5832 + }, + { + "epoch": 1.46, + "grad_norm": 4.871215343475342, + "learning_rate": 1.8223329522319978e-08, + "logits/chosen": -0.384920597076416, + "logits/rejected": -0.512880802154541, + "logps/chosen": -55.236480712890625, + "logps/rejected": -91.00765991210938, + "loss": 0.6075, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.004586696624756, + "rewards/margins": 5.48621940612793, + "rewards/rejected": -2.481633186340332, + "step": 5833 + }, + { + "epoch": 1.46, + "grad_norm": 4.628407955169678, + "learning_rate": 1.800055012303237e-08, + "logits/chosen": -0.38937535881996155, + "logits/rejected": -0.4854840040206909, + "logps/chosen": -57.05320739746094, + "logps/rejected": -76.15254211425781, + "loss": 0.7305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8065707683563232, + "rewards/margins": 5.244353294372559, + "rewards/rejected": -2.4377822875976562, + "step": 5834 + }, + { + "epoch": 1.46, + "grad_norm": 4.003294944763184, + "learning_rate": 1.7779138390096594e-08, + "logits/chosen": -0.3535418212413788, + "logits/rejected": -0.45862478017807007, + "logps/chosen": -46.10149383544922, + "logps/rejected": -72.71460723876953, + "loss": 0.6314, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077221632003784, + "rewards/margins": 4.622257232666016, + "rewards/rejected": -1.545035719871521, + "step": 5835 + }, + { + "epoch": 1.46, + "grad_norm": 3.2665562629699707, + "learning_rate": 1.755909438429515e-08, + "logits/chosen": -0.3188210725784302, + "logits/rejected": -0.4304265081882477, + "logps/chosen": -50.73890686035156, + "logps/rejected": -73.16496276855469, + "loss": 0.6649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0435590744018555, + "rewards/margins": 4.35167121887207, + "rewards/rejected": -1.308112382888794, + "step": 5836 + }, + { + "epoch": 1.46, + "grad_norm": 3.9709391593933105, + "learning_rate": 1.7340418166035265e-08, + "logits/chosen": -0.32001227140426636, + "logits/rejected": -0.4211401045322418, + "logps/chosen": -60.82233428955078, + "logps/rejected": -90.69779205322266, + "loss": 0.7591, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8275513648986816, + "rewards/margins": 5.482373237609863, + "rewards/rejected": -2.6548218727111816, + "step": 5837 + }, + { + "epoch": 1.46, + "grad_norm": 4.685657501220703, + "learning_rate": 1.712310979534726e-08, + "logits/chosen": -0.29598894715309143, + "logits/rejected": -0.4378989040851593, + "logps/chosen": -59.621742248535156, + "logps/rejected": -89.68634796142578, + "loss": 0.6681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5820953845977783, + "rewards/margins": 6.014218330383301, + "rewards/rejected": -3.432122230529785, + "step": 5838 + }, + { + "epoch": 1.46, + "grad_norm": 9.679299354553223, + "learning_rate": 1.6907169331887852e-08, + "logits/chosen": -0.4303423762321472, + "logits/rejected": -0.4959544837474823, + "logps/chosen": -56.0992431640625, + "logps/rejected": -104.83649444580078, + "loss": 0.7612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7884414196014404, + "rewards/margins": 6.043956756591797, + "rewards/rejected": -3.2555150985717773, + "step": 5839 + }, + { + "epoch": 1.46, + "grad_norm": 2.6849265098571777, + "learning_rate": 1.6692596834937402e-08, + "logits/chosen": -0.3872167468070984, + "logits/rejected": -0.501215398311615, + "logps/chosen": -48.541778564453125, + "logps/rejected": -92.08560180664062, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.263981342315674, + "rewards/margins": 5.410048484802246, + "rewards/rejected": -2.1460673809051514, + "step": 5840 + }, + { + "epoch": 1.46, + "grad_norm": 4.209184169769287, + "learning_rate": 1.6479392363399906e-08, + "logits/chosen": -0.3736894130706787, + "logits/rejected": -0.42415082454681396, + "logps/chosen": -58.331356048583984, + "logps/rejected": -86.99609375, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8217735290527344, + "rewards/margins": 5.105132579803467, + "rewards/rejected": -2.283358573913574, + "step": 5841 + }, + { + "epoch": 1.46, + "grad_norm": 4.944249629974365, + "learning_rate": 1.6267555975805203e-08, + "logits/chosen": -0.303242564201355, + "logits/rejected": -0.4179361164569855, + "logps/chosen": -70.05818176269531, + "logps/rejected": -86.4051742553711, + "loss": 0.6964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8644213676452637, + "rewards/margins": 4.7874674797058105, + "rewards/rejected": -1.9230461120605469, + "step": 5842 + }, + { + "epoch": 1.46, + "grad_norm": 4.325172424316406, + "learning_rate": 1.605708773030734e-08, + "logits/chosen": -0.3913050889968872, + "logits/rejected": -0.49114489555358887, + "logps/chosen": -50.268943786621094, + "logps/rejected": -82.35252380371094, + "loss": 0.7109, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0044009685516357, + "rewards/margins": 4.624452114105225, + "rewards/rejected": -1.620051383972168, + "step": 5843 + }, + { + "epoch": 1.46, + "grad_norm": 5.082590579986572, + "learning_rate": 1.584798768468343e-08, + "logits/chosen": -0.4559749662876129, + "logits/rejected": -0.548700213432312, + "logps/chosen": -59.408721923828125, + "logps/rejected": -84.45356750488281, + "loss": 0.7609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8360371589660645, + "rewards/margins": 4.66619873046875, + "rewards/rejected": -1.8301615715026855, + "step": 5844 + }, + { + "epoch": 1.46, + "grad_norm": 4.501884937286377, + "learning_rate": 1.5640255896336444e-08, + "logits/chosen": -0.3177744150161743, + "logits/rejected": -0.40819793939590454, + "logps/chosen": -70.18901062011719, + "logps/rejected": -101.23243713378906, + "loss": 0.7268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6826093196868896, + "rewards/margins": 5.299755096435547, + "rewards/rejected": -2.6171460151672363, + "step": 5845 + }, + { + "epoch": 1.46, + "grad_norm": 5.189078330993652, + "learning_rate": 1.543389242229354e-08, + "logits/chosen": -0.26975446939468384, + "logits/rejected": -0.4116407036781311, + "logps/chosen": -64.08279418945312, + "logps/rejected": -79.43096923828125, + "loss": 0.7148, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1626219749450684, + "rewards/margins": 4.086067199707031, + "rewards/rejected": -0.9234452247619629, + "step": 5846 + }, + { + "epoch": 1.46, + "grad_norm": 6.441802024841309, + "learning_rate": 1.5228897319205515e-08, + "logits/chosen": -0.3442068099975586, + "logits/rejected": -0.4584480822086334, + "logps/chosen": -46.4510383605957, + "logps/rejected": -88.70430755615234, + "loss": 0.7144, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.789344310760498, + "rewards/margins": 4.631232738494873, + "rewards/rejected": -1.8418885469436646, + "step": 5847 + }, + { + "epoch": 1.46, + "grad_norm": 12.148303985595703, + "learning_rate": 1.502527064334791e-08, + "logits/chosen": -0.3366229236125946, + "logits/rejected": -0.4477871060371399, + "logps/chosen": -54.71227264404297, + "logps/rejected": -82.18798828125, + "loss": 0.8164, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6615936756134033, + "rewards/margins": 5.262019634246826, + "rewards/rejected": -2.600425958633423, + "step": 5848 + }, + { + "epoch": 1.46, + "grad_norm": 5.454691410064697, + "learning_rate": 1.4823012450620455e-08, + "logits/chosen": -0.3508989214897156, + "logits/rejected": -0.41823139786720276, + "logps/chosen": -55.57051467895508, + "logps/rejected": -85.88934326171875, + "loss": 0.694, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.809727191925049, + "rewards/margins": 5.051026821136475, + "rewards/rejected": -2.241299629211426, + "step": 5849 + }, + { + "epoch": 1.46, + "grad_norm": 4.200035572052002, + "learning_rate": 1.4622122796548177e-08, + "logits/chosen": -0.3054150640964508, + "logits/rejected": -0.44245782494544983, + "logps/chosen": -70.85118865966797, + "logps/rejected": -74.64622497558594, + "loss": 0.7465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.066706418991089, + "rewards/margins": 4.699807167053223, + "rewards/rejected": -1.6331006288528442, + "step": 5850 + }, + { + "epoch": 1.46, + "grad_norm": 3.3663904666900635, + "learning_rate": 1.4422601736279185e-08, + "logits/chosen": -0.32015395164489746, + "logits/rejected": -0.49177026748657227, + "logps/chosen": -76.55020904541016, + "logps/rejected": -67.05734252929688, + "loss": 0.7197, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1381208896636963, + "rewards/margins": 5.25788688659668, + "rewards/rejected": -2.1197662353515625, + "step": 5851 + }, + { + "epoch": 1.46, + "grad_norm": 3.632220983505249, + "learning_rate": 1.422444932458633e-08, + "logits/chosen": -0.319108784198761, + "logits/rejected": -0.3940885066986084, + "logps/chosen": -51.485877990722656, + "logps/rejected": -101.05899810791016, + "loss": 0.6891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8620553016662598, + "rewards/margins": 5.734035491943359, + "rewards/rejected": -2.8719797134399414, + "step": 5852 + }, + { + "epoch": 1.46, + "grad_norm": 4.8421430587768555, + "learning_rate": 1.4027665615866104e-08, + "logits/chosen": -0.37873053550720215, + "logits/rejected": -0.4664407968521118, + "logps/chosen": -59.0488395690918, + "logps/rejected": -84.56314849853516, + "loss": 0.7628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.878262758255005, + "rewards/margins": 4.056201934814453, + "rewards/rejected": -1.1779389381408691, + "step": 5853 + }, + { + "epoch": 1.46, + "grad_norm": 4.954601287841797, + "learning_rate": 1.3832250664140845e-08, + "logits/chosen": -0.4005243480205536, + "logits/rejected": -0.4580328166484833, + "logps/chosen": -48.8207893371582, + "logps/rejected": -82.72547912597656, + "loss": 0.6705, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8885765075683594, + "rewards/margins": 4.703121662139893, + "rewards/rejected": -1.8145455121994019, + "step": 5854 + }, + { + "epoch": 1.46, + "grad_norm": 4.9887895584106445, + "learning_rate": 1.3638204523055976e-08, + "logits/chosen": -0.3054482340812683, + "logits/rejected": -0.37880200147628784, + "logps/chosen": -60.87887191772461, + "logps/rejected": -89.3640365600586, + "loss": 0.7158, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.053638458251953, + "rewards/margins": 4.314328193664551, + "rewards/rejected": -1.2606903314590454, + "step": 5855 + }, + { + "epoch": 1.46, + "grad_norm": 4.444472312927246, + "learning_rate": 1.3445527245881108e-08, + "logits/chosen": -0.3858717083930969, + "logits/rejected": -0.44458436965942383, + "logps/chosen": -53.94788360595703, + "logps/rejected": -95.29224395751953, + "loss": 0.6716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0044569969177246, + "rewards/margins": 4.7131218910217285, + "rewards/rejected": -1.708664894104004, + "step": 5856 + }, + { + "epoch": 1.47, + "grad_norm": 4.985713481903076, + "learning_rate": 1.325421888551004e-08, + "logits/chosen": -0.32496148347854614, + "logits/rejected": -0.4400300681591034, + "logps/chosen": -55.198692321777344, + "logps/rejected": -86.23506164550781, + "loss": 0.649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7491655349731445, + "rewards/margins": 5.148418426513672, + "rewards/rejected": -2.3992533683776855, + "step": 5857 + }, + { + "epoch": 1.47, + "grad_norm": 2.6287195682525635, + "learning_rate": 1.3064279494461874e-08, + "logits/chosen": -0.39002594351768494, + "logits/rejected": -0.49398180842399597, + "logps/chosen": -49.964969635009766, + "logps/rejected": -86.38570404052734, + "loss": 0.615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.813693046569824, + "rewards/margins": 6.053973197937012, + "rewards/rejected": -3.2402806282043457, + "step": 5858 + }, + { + "epoch": 1.47, + "grad_norm": 5.697807312011719, + "learning_rate": 1.2875709124878233e-08, + "logits/chosen": -0.37594884634017944, + "logits/rejected": -0.4327046573162079, + "logps/chosen": -53.926151275634766, + "logps/rejected": -92.16964721679688, + "loss": 0.6485, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0585312843322754, + "rewards/margins": 4.884489059448242, + "rewards/rejected": -1.8259577751159668, + "step": 5859 + }, + { + "epoch": 1.47, + "grad_norm": 5.197024345397949, + "learning_rate": 1.2688507828526042e-08, + "logits/chosen": -0.31766632199287415, + "logits/rejected": -0.4421757757663727, + "logps/chosen": -63.711212158203125, + "logps/rejected": -75.666015625, + "loss": 0.7243, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.059072494506836, + "rewards/margins": 4.892836093902588, + "rewards/rejected": -1.8337637186050415, + "step": 5860 + }, + { + "epoch": 1.47, + "grad_norm": 3.4031810760498047, + "learning_rate": 1.2502675656796415e-08, + "logits/chosen": -0.4425150156021118, + "logits/rejected": -0.5508161783218384, + "logps/chosen": -56.39125061035156, + "logps/rejected": -76.54784393310547, + "loss": 0.7229, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8390228748321533, + "rewards/margins": 5.5859270095825195, + "rewards/rejected": -2.7469046115875244, + "step": 5861 + }, + { + "epoch": 1.47, + "grad_norm": 6.047496795654297, + "learning_rate": 1.2318212660704098e-08, + "logits/chosen": -0.2757432758808136, + "logits/rejected": -0.3513334095478058, + "logps/chosen": -61.59285354614258, + "logps/rejected": -90.59310913085938, + "loss": 0.7958, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.93021559715271, + "rewards/margins": 4.765512466430664, + "rewards/rejected": -1.8352971076965332, + "step": 5862 + }, + { + "epoch": 1.47, + "grad_norm": 4.1711907386779785, + "learning_rate": 1.2135118890887476e-08, + "logits/chosen": -0.2595548629760742, + "logits/rejected": -0.3994215726852417, + "logps/chosen": -60.30078887939453, + "logps/rejected": -87.15286254882812, + "loss": 0.707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0203566551208496, + "rewards/margins": 4.974025726318359, + "rewards/rejected": -1.9536688327789307, + "step": 5863 + }, + { + "epoch": 1.47, + "grad_norm": 5.47882080078125, + "learning_rate": 1.1953394397610784e-08, + "logits/chosen": -0.3149247467517853, + "logits/rejected": -0.3554086983203888, + "logps/chosen": -61.55743408203125, + "logps/rejected": -92.9549789428711, + "loss": 0.7018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.968602418899536, + "rewards/margins": 4.243173599243164, + "rewards/rejected": -1.2745709419250488, + "step": 5864 + }, + { + "epoch": 1.47, + "grad_norm": 4.445024013519287, + "learning_rate": 1.1773039230760786e-08, + "logits/chosen": -0.389143168926239, + "logits/rejected": -0.49604398012161255, + "logps/chosen": -54.518470764160156, + "logps/rejected": -79.56263732910156, + "loss": 0.6834, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.880277156829834, + "rewards/margins": 4.674895286560059, + "rewards/rejected": -1.7946181297302246, + "step": 5865 + }, + { + "epoch": 1.47, + "grad_norm": 3.5669617652893066, + "learning_rate": 1.1594053439848984e-08, + "logits/chosen": -0.28341108560562134, + "logits/rejected": -0.4328932762145996, + "logps/chosen": -66.25517272949219, + "logps/rejected": -90.82302856445312, + "loss": 0.6059, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8754496574401855, + "rewards/margins": 5.9942522048950195, + "rewards/rejected": -3.118803024291992, + "step": 5866 + }, + { + "epoch": 1.47, + "grad_norm": 7.571694374084473, + "learning_rate": 1.1416437074011077e-08, + "logits/chosen": -0.29178735613822937, + "logits/rejected": -0.3705218434333801, + "logps/chosen": -50.20318603515625, + "logps/rejected": -103.03358459472656, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8646695613861084, + "rewards/margins": 5.63167667388916, + "rewards/rejected": -2.7670068740844727, + "step": 5867 + }, + { + "epoch": 1.47, + "grad_norm": 6.633346080780029, + "learning_rate": 1.1240190182005838e-08, + "logits/chosen": -0.3922584056854248, + "logits/rejected": -0.46781623363494873, + "logps/chosen": -49.849830627441406, + "logps/rejected": -79.96847534179688, + "loss": 0.7018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0081348419189453, + "rewards/margins": 4.807427883148193, + "rewards/rejected": -1.7992929220199585, + "step": 5868 + }, + { + "epoch": 1.47, + "grad_norm": 4.322881698608398, + "learning_rate": 1.1065312812217898e-08, + "logits/chosen": -0.4198160171508789, + "logits/rejected": -0.48483914136886597, + "logps/chosen": -49.27705383300781, + "logps/rejected": -87.06672668457031, + "loss": 0.838, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8962011337280273, + "rewards/margins": 4.696672439575195, + "rewards/rejected": -1.8004707098007202, + "step": 5869 + }, + { + "epoch": 1.47, + "grad_norm": 3.74277400970459, + "learning_rate": 1.0891805012653855e-08, + "logits/chosen": -0.3807520568370819, + "logits/rejected": -0.4462701678276062, + "logps/chosen": -55.21102523803711, + "logps/rejected": -89.13178253173828, + "loss": 0.6143, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.227813959121704, + "rewards/margins": 5.023193359375, + "rewards/rejected": -1.7953792810440063, + "step": 5870 + }, + { + "epoch": 1.47, + "grad_norm": 6.020105838775635, + "learning_rate": 1.0719666830946162e-08, + "logits/chosen": -0.32129350304603577, + "logits/rejected": -0.41861796379089355, + "logps/chosen": -48.32909393310547, + "logps/rejected": -96.64241790771484, + "loss": 0.6877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.953239679336548, + "rewards/margins": 4.841864585876465, + "rewards/rejected": -1.8886244297027588, + "step": 5871 + }, + { + "epoch": 1.47, + "grad_norm": 4.368934154510498, + "learning_rate": 1.0548898314349798e-08, + "logits/chosen": -0.3254931569099426, + "logits/rejected": -0.45015043020248413, + "logps/chosen": -61.01417541503906, + "logps/rejected": -81.05467987060547, + "loss": 0.6823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8130204677581787, + "rewards/margins": 4.770576477050781, + "rewards/rejected": -1.9575556516647339, + "step": 5872 + }, + { + "epoch": 1.47, + "grad_norm": 3.616100549697876, + "learning_rate": 1.0379499509744484e-08, + "logits/chosen": -0.3027300238609314, + "logits/rejected": -0.48037800192832947, + "logps/chosen": -62.30427551269531, + "logps/rejected": -66.55650329589844, + "loss": 0.6898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.873427391052246, + "rewards/margins": 4.352752685546875, + "rewards/rejected": -1.4793254137039185, + "step": 5873 + }, + { + "epoch": 1.47, + "grad_norm": 7.274590492248535, + "learning_rate": 1.0211470463634688e-08, + "logits/chosen": -0.4297548532485962, + "logits/rejected": -0.49504613876342773, + "logps/chosen": -52.50322341918945, + "logps/rejected": -88.92265319824219, + "loss": 0.7913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0295047760009766, + "rewards/margins": 4.766928672790527, + "rewards/rejected": -1.7374238967895508, + "step": 5874 + }, + { + "epoch": 1.47, + "grad_norm": 9.247994422912598, + "learning_rate": 1.0044811222146844e-08, + "logits/chosen": -0.3764399290084839, + "logits/rejected": -0.43814125657081604, + "logps/chosen": -48.009788513183594, + "logps/rejected": -93.55001068115234, + "loss": 0.721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1988306045532227, + "rewards/margins": 5.632784366607666, + "rewards/rejected": -2.4339537620544434, + "step": 5875 + }, + { + "epoch": 1.47, + "grad_norm": 6.3473076820373535, + "learning_rate": 9.879521831033801e-09, + "logits/chosen": -0.37827953696250916, + "logits/rejected": -0.4262676537036896, + "logps/chosen": -54.377464294433594, + "logps/rejected": -80.40182495117188, + "loss": 0.958, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4967410564422607, + "rewards/margins": 3.8788082599639893, + "rewards/rejected": -1.3820674419403076, + "step": 5876 + }, + { + "epoch": 1.47, + "grad_norm": 4.992039203643799, + "learning_rate": 9.715602335669816e-09, + "logits/chosen": -0.44944697618484497, + "logits/rejected": -0.5095622539520264, + "logps/chosen": -53.054813385009766, + "logps/rejected": -82.36426544189453, + "loss": 0.7186, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.097215175628662, + "rewards/margins": 4.7170186042785645, + "rewards/rejected": -1.6198034286499023, + "step": 5877 + }, + { + "epoch": 1.47, + "grad_norm": 9.622038841247559, + "learning_rate": 9.55305278105445e-09, + "logits/chosen": -0.3806796073913574, + "logits/rejected": -0.44908735156059265, + "logps/chosen": -57.28654479980469, + "logps/rejected": -81.71284484863281, + "loss": 0.8233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7660281658172607, + "rewards/margins": 4.697032451629639, + "rewards/rejected": -1.9310040473937988, + "step": 5878 + }, + { + "epoch": 1.47, + "grad_norm": 10.937460899353027, + "learning_rate": 9.391873211812008e-09, + "logits/chosen": -0.3641509711742401, + "logits/rejected": -0.4858384430408478, + "logps/chosen": -61.468421936035156, + "logps/rejected": -102.75839233398438, + "loss": 0.6595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2964398860931396, + "rewards/margins": 5.974129676818848, + "rewards/rejected": -2.67768931388855, + "step": 5879 + }, + { + "epoch": 1.47, + "grad_norm": 6.730123043060303, + "learning_rate": 9.232063672188762e-09, + "logits/chosen": -0.2998983561992645, + "logits/rejected": -0.4051147699356079, + "logps/chosen": -52.691349029541016, + "logps/rejected": -78.22677612304688, + "loss": 0.7496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9637997150421143, + "rewards/margins": 5.263636589050293, + "rewards/rejected": -2.299837350845337, + "step": 5880 + }, + { + "epoch": 1.47, + "grad_norm": 5.258438587188721, + "learning_rate": 9.073624206056841e-09, + "logits/chosen": -0.4224323332309723, + "logits/rejected": -0.5329568982124329, + "logps/chosen": -71.41300201416016, + "logps/rejected": -97.75022888183594, + "loss": 0.7904, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.936352014541626, + "rewards/margins": 5.366326808929443, + "rewards/rejected": -2.4299745559692383, + "step": 5881 + }, + { + "epoch": 1.47, + "grad_norm": 5.994753837585449, + "learning_rate": 8.916554856910342e-09, + "logits/chosen": -0.4321245551109314, + "logits/rejected": -0.541632890701294, + "logps/chosen": -53.28403091430664, + "logps/rejected": -75.47784423828125, + "loss": 0.6554, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.816875696182251, + "rewards/margins": 4.452335357666016, + "rewards/rejected": -1.635459542274475, + "step": 5882 + }, + { + "epoch": 1.47, + "grad_norm": 6.904731273651123, + "learning_rate": 8.760855667868662e-09, + "logits/chosen": -0.37925317883491516, + "logits/rejected": -0.5023125410079956, + "logps/chosen": -62.69303894042969, + "logps/rejected": -80.15769958496094, + "loss": 0.7599, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.838393449783325, + "rewards/margins": 5.370203018188477, + "rewards/rejected": -2.5318098068237305, + "step": 5883 + }, + { + "epoch": 1.47, + "grad_norm": 6.347916603088379, + "learning_rate": 8.606526681674831e-09, + "logits/chosen": -0.24462422728538513, + "logits/rejected": -0.35364294052124023, + "logps/chosen": -60.63679885864258, + "logps/rejected": -100.2619400024414, + "loss": 0.6937, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9844160079956055, + "rewards/margins": 4.8945112228393555, + "rewards/rejected": -1.9100956916809082, + "step": 5884 + }, + { + "epoch": 1.47, + "grad_norm": 31.259410858154297, + "learning_rate": 8.453567940694962e-09, + "logits/chosen": -0.32421764731407166, + "logits/rejected": -0.3892972469329834, + "logps/chosen": -49.35628890991211, + "logps/rejected": -100.18610382080078, + "loss": 0.6833, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.113659381866455, + "rewards/margins": 5.446979522705078, + "rewards/rejected": -2.3333206176757812, + "step": 5885 + }, + { + "epoch": 1.47, + "grad_norm": 4.104823589324951, + "learning_rate": 8.30197948691991e-09, + "logits/chosen": -0.33428311347961426, + "logits/rejected": -0.40837469696998596, + "logps/chosen": -48.24765396118164, + "logps/rejected": -88.840576171875, + "loss": 0.6836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8459110260009766, + "rewards/margins": 5.247035980224609, + "rewards/rejected": -2.40112566947937, + "step": 5886 + }, + { + "epoch": 1.47, + "grad_norm": 4.419381618499756, + "learning_rate": 8.151761361964162e-09, + "logits/chosen": -0.3560953736305237, + "logits/rejected": -0.5051684379577637, + "logps/chosen": -54.534542083740234, + "logps/rejected": -66.97394561767578, + "loss": 0.7451, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.779839038848877, + "rewards/margins": 4.279728889465332, + "rewards/rejected": -1.499889850616455, + "step": 5887 + }, + { + "epoch": 1.47, + "grad_norm": 3.5215446949005127, + "learning_rate": 8.002913607064732e-09, + "logits/chosen": -0.30051878094673157, + "logits/rejected": -0.39968156814575195, + "logps/chosen": -54.009803771972656, + "logps/rejected": -83.33084869384766, + "loss": 0.6434, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.045009136199951, + "rewards/margins": 5.363265037536621, + "rewards/rejected": -2.318255662918091, + "step": 5888 + }, + { + "epoch": 1.47, + "grad_norm": 5.906283378601074, + "learning_rate": 7.855436263085048e-09, + "logits/chosen": -0.3018229305744171, + "logits/rejected": -0.41462790966033936, + "logps/chosen": -56.57467269897461, + "logps/rejected": -77.02088165283203, + "loss": 0.7405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8159689903259277, + "rewards/margins": 4.518002510070801, + "rewards/rejected": -1.7020341157913208, + "step": 5889 + }, + { + "epoch": 1.47, + "grad_norm": 3.3140480518341064, + "learning_rate": 7.709329370509389e-09, + "logits/chosen": -0.2770817279815674, + "logits/rejected": -0.3954048156738281, + "logps/chosen": -45.93952560424805, + "logps/rejected": -93.40925598144531, + "loss": 0.5398, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.825533390045166, + "rewards/margins": 5.629495143890381, + "rewards/rejected": -2.803961753845215, + "step": 5890 + }, + { + "epoch": 1.47, + "grad_norm": 3.7180051803588867, + "learning_rate": 7.564592969448448e-09, + "logits/chosen": -0.32714638113975525, + "logits/rejected": -0.4015721082687378, + "logps/chosen": -54.33086395263672, + "logps/rejected": -82.39295959472656, + "loss": 0.6836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.270542621612549, + "rewards/margins": 5.520009517669678, + "rewards/rejected": -2.249466896057129, + "step": 5891 + }, + { + "epoch": 1.47, + "grad_norm": 3.975602149963379, + "learning_rate": 7.421227099634887e-09, + "logits/chosen": -0.36639267206192017, + "logits/rejected": -0.4476211369037628, + "logps/chosen": -52.27701187133789, + "logps/rejected": -79.10144805908203, + "loss": 0.6106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0011091232299805, + "rewards/margins": 5.0567216873168945, + "rewards/rejected": -2.0556130409240723, + "step": 5892 + }, + { + "epoch": 1.47, + "grad_norm": 5.351224422454834, + "learning_rate": 7.279231800425557e-09, + "logits/chosen": -0.3154926896095276, + "logits/rejected": -0.48761582374572754, + "logps/chosen": -59.15587615966797, + "logps/rejected": -89.77965545654297, + "loss": 0.7749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.826450824737549, + "rewards/margins": 5.598313331604004, + "rewards/rejected": -2.771862506866455, + "step": 5893 + }, + { + "epoch": 1.47, + "grad_norm": 4.136151313781738, + "learning_rate": 7.138607110802054e-09, + "logits/chosen": -0.2703675329685211, + "logits/rejected": -0.368998259305954, + "logps/chosen": -61.523773193359375, + "logps/rejected": -93.246826171875, + "loss": 0.6754, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6304564476013184, + "rewards/margins": 4.720914840698242, + "rewards/rejected": -2.0904581546783447, + "step": 5894 + }, + { + "epoch": 1.47, + "grad_norm": 4.58488655090332, + "learning_rate": 6.999353069367942e-09, + "logits/chosen": -0.22105063498020172, + "logits/rejected": -0.36595475673675537, + "logps/chosen": -68.1672592163086, + "logps/rejected": -82.9559326171875, + "loss": 0.8059, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.034829616546631, + "rewards/margins": 4.336143493652344, + "rewards/rejected": -1.3013135194778442, + "step": 5895 + }, + { + "epoch": 1.47, + "grad_norm": 5.013737201690674, + "learning_rate": 6.861469714352087e-09, + "logits/chosen": -0.2615363299846649, + "logits/rejected": -0.41432130336761475, + "logps/chosen": -59.542415618896484, + "logps/rejected": -69.90805053710938, + "loss": 0.7165, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.853618860244751, + "rewards/margins": 3.8728175163269043, + "rewards/rejected": -1.0191984176635742, + "step": 5896 + }, + { + "epoch": 1.48, + "grad_norm": 2.443678140640259, + "learning_rate": 6.724957083605876e-09, + "logits/chosen": -0.36945924162864685, + "logits/rejected": -0.5079700946807861, + "logps/chosen": -54.308685302734375, + "logps/rejected": -91.19770050048828, + "loss": 0.5528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.329366683959961, + "rewards/margins": 6.3238420486450195, + "rewards/rejected": -2.9944753646850586, + "step": 5897 + }, + { + "epoch": 1.48, + "grad_norm": 4.876245975494385, + "learning_rate": 6.589815214605999e-09, + "logits/chosen": -0.3036097288131714, + "logits/rejected": -0.4531630575656891, + "logps/chosen": -69.38922119140625, + "logps/rejected": -75.5429458618164, + "loss": 0.7565, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8509368896484375, + "rewards/margins": 4.616775035858154, + "rewards/rejected": -1.765838384628296, + "step": 5898 + }, + { + "epoch": 1.48, + "grad_norm": 5.717004776000977, + "learning_rate": 6.456044144450557e-09, + "logits/chosen": -0.31659722328186035, + "logits/rejected": -0.397611141204834, + "logps/chosen": -51.2796745300293, + "logps/rejected": -100.9407958984375, + "loss": 0.661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.739128589630127, + "rewards/margins": 5.184293746948242, + "rewards/rejected": -2.4451658725738525, + "step": 5899 + }, + { + "epoch": 1.48, + "grad_norm": 3.2928738594055176, + "learning_rate": 6.323643909863508e-09, + "logits/chosen": -0.3289245367050171, + "logits/rejected": -0.4473157227039337, + "logps/chosen": -51.36875915527344, + "logps/rejected": -85.26979064941406, + "loss": 0.5737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1333296298980713, + "rewards/margins": 4.9551849365234375, + "rewards/rejected": -1.8218560218811035, + "step": 5900 + }, + { + "epoch": 1.48, + "grad_norm": 3.8687479496002197, + "learning_rate": 7.998690193577585e-06, + "logits/chosen": -0.3758062422275543, + "logits/rejected": -0.47493594884872437, + "logps/chosen": -52.12572479248047, + "logps/rejected": -93.59327697753906, + "loss": 0.722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7923269271850586, + "rewards/margins": 5.285756587982178, + "rewards/rejected": -2.493429183959961, + "step": 5901 + }, + { + "epoch": 1.48, + "grad_norm": 3.080702304840088, + "learning_rate": 7.998061212035361e-06, + "logits/chosen": -0.3345315158367157, + "logits/rejected": -0.48736244440078735, + "logps/chosen": -51.60511779785156, + "logps/rejected": -76.47123718261719, + "loss": 0.6509, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.060199499130249, + "rewards/margins": 6.096527099609375, + "rewards/rejected": -3.036327362060547, + "step": 5902 + }, + { + "epoch": 1.48, + "grad_norm": 3.3722214698791504, + "learning_rate": 7.997432156407857e-06, + "logits/chosen": -0.38887256383895874, + "logits/rejected": -0.4708590507507324, + "logps/chosen": -51.078155517578125, + "logps/rejected": -92.68329620361328, + "loss": 0.6152, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7972476482391357, + "rewards/margins": 5.323158264160156, + "rewards/rejected": -2.5259103775024414, + "step": 5903 + }, + { + "epoch": 1.48, + "grad_norm": 4.00412654876709, + "learning_rate": 7.996803026710614e-06, + "logits/chosen": -0.3151257634162903, + "logits/rejected": -0.47706520557403564, + "logps/chosen": -68.54580688476562, + "logps/rejected": -79.09954833984375, + "loss": 0.716, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0311546325683594, + "rewards/margins": 4.781259536743164, + "rewards/rejected": -1.7501047849655151, + "step": 5904 + }, + { + "epoch": 1.48, + "grad_norm": 3.83955454826355, + "learning_rate": 7.996173822959184e-06, + "logits/chosen": -0.4031180739402771, + "logits/rejected": -0.5051500201225281, + "logps/chosen": -54.88538360595703, + "logps/rejected": -91.65736389160156, + "loss": 0.6884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.762068510055542, + "rewards/margins": 5.444610118865967, + "rewards/rejected": -2.682541847229004, + "step": 5905 + }, + { + "epoch": 1.48, + "grad_norm": 7.171896457672119, + "learning_rate": 7.99554454516911e-06, + "logits/chosen": -0.30617403984069824, + "logits/rejected": -0.4326861798763275, + "logps/chosen": -58.358272552490234, + "logps/rejected": -73.70970916748047, + "loss": 0.7775, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.681239604949951, + "rewards/margins": 4.455676078796387, + "rewards/rejected": -1.7744364738464355, + "step": 5906 + }, + { + "epoch": 1.48, + "grad_norm": 10.276805877685547, + "learning_rate": 7.994915193355943e-06, + "logits/chosen": -0.40378081798553467, + "logits/rejected": -0.5085052847862244, + "logps/chosen": -60.416969299316406, + "logps/rejected": -100.5326156616211, + "loss": 0.6327, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.994819164276123, + "rewards/margins": 6.643016338348389, + "rewards/rejected": -3.648198127746582, + "step": 5907 + }, + { + "epoch": 1.48, + "grad_norm": 8.868040084838867, + "learning_rate": 7.994285767535237e-06, + "logits/chosen": -0.44608789682388306, + "logits/rejected": -0.5497296452522278, + "logps/chosen": -46.9465446472168, + "logps/rejected": -73.5243148803711, + "loss": 0.6972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841688871383667, + "rewards/margins": 4.934420108795166, + "rewards/rejected": -2.092731475830078, + "step": 5908 + }, + { + "epoch": 1.48, + "grad_norm": 9.200271606445312, + "learning_rate": 7.993656267722545e-06, + "logits/chosen": -0.3437961935997009, + "logits/rejected": -0.4062996208667755, + "logps/chosen": -48.33264923095703, + "logps/rejected": -81.96125030517578, + "loss": 0.8689, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.699007272720337, + "rewards/margins": 4.675300598144531, + "rewards/rejected": -1.9762933254241943, + "step": 5909 + }, + { + "epoch": 1.48, + "grad_norm": 3.967893600463867, + "learning_rate": 7.993026693933422e-06, + "logits/chosen": -0.47135600447654724, + "logits/rejected": -0.5822432041168213, + "logps/chosen": -58.94032669067383, + "logps/rejected": -79.61692810058594, + "loss": 0.782, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2043895721435547, + "rewards/margins": 5.2399396896362305, + "rewards/rejected": -2.0355498790740967, + "step": 5910 + }, + { + "epoch": 1.48, + "grad_norm": 4.755280017852783, + "learning_rate": 7.992397046183426e-06, + "logits/chosen": -0.2999333441257477, + "logits/rejected": -0.4698140025138855, + "logps/chosen": -54.73855209350586, + "logps/rejected": -73.35453796386719, + "loss": 0.6735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.030822992324829, + "rewards/margins": 5.64648962020874, + "rewards/rejected": -2.615666389465332, + "step": 5911 + }, + { + "epoch": 1.48, + "grad_norm": 5.830135822296143, + "learning_rate": 7.991767324488117e-06, + "logits/chosen": -0.37012603878974915, + "logits/rejected": -0.4154188334941864, + "logps/chosen": -57.82373809814453, + "logps/rejected": -88.6245346069336, + "loss": 0.8136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.82330322265625, + "rewards/margins": 3.9951486587524414, + "rewards/rejected": -1.1718454360961914, + "step": 5912 + }, + { + "epoch": 1.48, + "grad_norm": 5.045469760894775, + "learning_rate": 7.991137528863054e-06, + "logits/chosen": -0.3822733759880066, + "logits/rejected": -0.47621986269950867, + "logps/chosen": -56.124977111816406, + "logps/rejected": -88.85971069335938, + "loss": 0.7571, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2091097831726074, + "rewards/margins": 5.940081596374512, + "rewards/rejected": -2.730971336364746, + "step": 5913 + }, + { + "epoch": 1.48, + "grad_norm": 18.00789451599121, + "learning_rate": 7.9905076593238e-06, + "logits/chosen": -0.32623913884162903, + "logits/rejected": -0.45532169938087463, + "logps/chosen": -62.300662994384766, + "logps/rejected": -95.44241333007812, + "loss": 0.7705, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7664952278137207, + "rewards/margins": 4.850804805755615, + "rewards/rejected": -2.0843100547790527, + "step": 5914 + }, + { + "epoch": 1.48, + "grad_norm": 5.918293476104736, + "learning_rate": 7.989877715885925e-06, + "logits/chosen": -0.4165160655975342, + "logits/rejected": -0.4980055093765259, + "logps/chosen": -60.0213508605957, + "logps/rejected": -88.92036437988281, + "loss": 0.7951, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.980010747909546, + "rewards/margins": 3.8128063678741455, + "rewards/rejected": -0.8327956199645996, + "step": 5915 + }, + { + "epoch": 1.48, + "grad_norm": 7.565027713775635, + "learning_rate": 7.989247698564988e-06, + "logits/chosen": -0.3506697416305542, + "logits/rejected": -0.4391748905181885, + "logps/chosen": -54.94096755981445, + "logps/rejected": -80.32740783691406, + "loss": 0.6845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0029428005218506, + "rewards/margins": 4.909365653991699, + "rewards/rejected": -1.9064230918884277, + "step": 5916 + }, + { + "epoch": 1.48, + "grad_norm": 8.842719078063965, + "learning_rate": 7.988617607376562e-06, + "logits/chosen": -0.44740432500839233, + "logits/rejected": -0.5041555166244507, + "logps/chosen": -44.58610916137695, + "logps/rejected": -87.51439666748047, + "loss": 0.7057, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7775144577026367, + "rewards/margins": 5.611015319824219, + "rewards/rejected": -2.8335015773773193, + "step": 5917 + }, + { + "epoch": 1.48, + "grad_norm": 8.44186019897461, + "learning_rate": 7.987987442336216e-06, + "logits/chosen": -0.27651044726371765, + "logits/rejected": -0.3275105953216553, + "logps/chosen": -59.933807373046875, + "logps/rejected": -95.39749908447266, + "loss": 0.6435, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9121410846710205, + "rewards/margins": 4.5250725746154785, + "rewards/rejected": -1.6129308938980103, + "step": 5918 + }, + { + "epoch": 1.48, + "grad_norm": 6.651026248931885, + "learning_rate": 7.987357203459523e-06, + "logits/chosen": -0.36633917689323425, + "logits/rejected": -0.43212202191352844, + "logps/chosen": -53.011749267578125, + "logps/rejected": -85.31120300292969, + "loss": 0.7822, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8403098583221436, + "rewards/margins": 4.23409366607666, + "rewards/rejected": -1.3937839269638062, + "step": 5919 + }, + { + "epoch": 1.48, + "grad_norm": 16.724790573120117, + "learning_rate": 7.986726890762054e-06, + "logits/chosen": -0.4210948944091797, + "logits/rejected": -0.568780243396759, + "logps/chosen": -66.81941986083984, + "logps/rejected": -78.03470611572266, + "loss": 0.7405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8728878498077393, + "rewards/margins": 5.161901473999023, + "rewards/rejected": -2.289013624191284, + "step": 5920 + }, + { + "epoch": 1.48, + "grad_norm": 11.089082717895508, + "learning_rate": 7.986096504259388e-06, + "logits/chosen": -0.3691394031047821, + "logits/rejected": -0.4806324243545532, + "logps/chosen": -56.2414436340332, + "logps/rejected": -76.787841796875, + "loss": 0.917, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.002197504043579, + "rewards/margins": 4.7396697998046875, + "rewards/rejected": -1.737472414970398, + "step": 5921 + }, + { + "epoch": 1.48, + "grad_norm": 7.604861736297607, + "learning_rate": 7.9854660439671e-06, + "logits/chosen": -0.3146962523460388, + "logits/rejected": -0.49121129512786865, + "logps/chosen": -65.91645050048828, + "logps/rejected": -86.21318817138672, + "loss": 0.7094, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.80029034614563, + "rewards/margins": 5.110759735107422, + "rewards/rejected": -2.310469388961792, + "step": 5922 + }, + { + "epoch": 1.48, + "grad_norm": 7.707157611846924, + "learning_rate": 7.984835509900772e-06, + "logits/chosen": -0.3076273798942566, + "logits/rejected": -0.4281148314476013, + "logps/chosen": -52.91075897216797, + "logps/rejected": -83.56246948242188, + "loss": 0.6857, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5908172130584717, + "rewards/margins": 5.248741149902344, + "rewards/rejected": -2.657923698425293, + "step": 5923 + }, + { + "epoch": 1.48, + "grad_norm": 6.5288848876953125, + "learning_rate": 7.984204902075984e-06, + "logits/chosen": -0.441226989030838, + "logits/rejected": -0.5427994728088379, + "logps/chosen": -68.66683959960938, + "logps/rejected": -78.27046966552734, + "loss": 0.878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5837817192077637, + "rewards/margins": 4.357880592346191, + "rewards/rejected": -1.7740987539291382, + "step": 5924 + }, + { + "epoch": 1.48, + "grad_norm": 4.9997076988220215, + "learning_rate": 7.983574220508317e-06, + "logits/chosen": -0.40939003229141235, + "logits/rejected": -0.49458855390548706, + "logps/chosen": -50.449832916259766, + "logps/rejected": -88.70203399658203, + "loss": 0.7586, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2554004192352295, + "rewards/margins": 4.935812473297119, + "rewards/rejected": -1.6804120540618896, + "step": 5925 + }, + { + "epoch": 1.48, + "grad_norm": 8.67282485961914, + "learning_rate": 7.98294346521336e-06, + "logits/chosen": -0.3727792501449585, + "logits/rejected": -0.40844017267227173, + "logps/chosen": -65.34505462646484, + "logps/rejected": -114.47997283935547, + "loss": 0.7422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8562850952148438, + "rewards/margins": 5.0278825759887695, + "rewards/rejected": -2.1715970039367676, + "step": 5926 + }, + { + "epoch": 1.48, + "grad_norm": 9.160481452941895, + "learning_rate": 7.982312636206695e-06, + "logits/chosen": -0.4091310501098633, + "logits/rejected": -0.48554348945617676, + "logps/chosen": -54.12757110595703, + "logps/rejected": -98.08699798583984, + "loss": 0.7119, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.853865385055542, + "rewards/margins": 5.667278289794922, + "rewards/rejected": -2.813412666320801, + "step": 5927 + }, + { + "epoch": 1.48, + "grad_norm": 4.829545974731445, + "learning_rate": 7.981681733503912e-06, + "logits/chosen": -0.2458030879497528, + "logits/rejected": -0.3839825689792633, + "logps/chosen": -64.76783752441406, + "logps/rejected": -94.25517272949219, + "loss": 0.6499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7672111988067627, + "rewards/margins": 5.748993873596191, + "rewards/rejected": -2.9817824363708496, + "step": 5928 + }, + { + "epoch": 1.48, + "grad_norm": 5.401940822601318, + "learning_rate": 7.981050757120605e-06, + "logits/chosen": -0.3708561062812805, + "logits/rejected": -0.46433985233306885, + "logps/chosen": -59.977996826171875, + "logps/rejected": -86.02094268798828, + "loss": 0.8129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7620716094970703, + "rewards/margins": 4.684624671936035, + "rewards/rejected": -1.9225528240203857, + "step": 5929 + }, + { + "epoch": 1.48, + "grad_norm": 4.4481635093688965, + "learning_rate": 7.98041970707236e-06, + "logits/chosen": -0.3433767259120941, + "logits/rejected": -0.4691696763038635, + "logps/chosen": -65.33120727539062, + "logps/rejected": -81.89151763916016, + "loss": 0.6782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0302512645721436, + "rewards/margins": 4.76945161819458, + "rewards/rejected": -1.7392005920410156, + "step": 5930 + }, + { + "epoch": 1.48, + "grad_norm": 8.627151489257812, + "learning_rate": 7.979788583374777e-06, + "logits/chosen": -0.33534950017929077, + "logits/rejected": -0.46756771206855774, + "logps/chosen": -64.86566162109375, + "logps/rejected": -83.1530532836914, + "loss": 0.7112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7489476203918457, + "rewards/margins": 5.724364280700684, + "rewards/rejected": -2.975417137145996, + "step": 5931 + }, + { + "epoch": 1.48, + "grad_norm": 4.424736976623535, + "learning_rate": 7.979157386043445e-06, + "logits/chosen": -0.31823495030403137, + "logits/rejected": -0.3827988803386688, + "logps/chosen": -54.45465087890625, + "logps/rejected": -83.46237182617188, + "loss": 0.7351, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.125981330871582, + "rewards/margins": 4.741517066955566, + "rewards/rejected": -1.6155357360839844, + "step": 5932 + }, + { + "epoch": 1.48, + "grad_norm": 8.834753036499023, + "learning_rate": 7.978526115093967e-06, + "logits/chosen": -0.3334863781929016, + "logits/rejected": -0.49882441759109497, + "logps/chosen": -52.74552917480469, + "logps/rejected": -60.01462936401367, + "loss": 0.6901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.686380386352539, + "rewards/margins": 4.729767322540283, + "rewards/rejected": -2.0433874130249023, + "step": 5933 + }, + { + "epoch": 1.48, + "grad_norm": 3.0234313011169434, + "learning_rate": 7.97789477054194e-06, + "logits/chosen": -0.300083190202713, + "logits/rejected": -0.4104916751384735, + "logps/chosen": -62.76261901855469, + "logps/rejected": -90.76741790771484, + "loss": 0.612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000430107116699, + "rewards/margins": 6.014307022094727, + "rewards/rejected": -3.0138769149780273, + "step": 5934 + }, + { + "epoch": 1.48, + "grad_norm": 3.6475090980529785, + "learning_rate": 7.977263352402968e-06, + "logits/chosen": -0.2993741035461426, + "logits/rejected": -0.42458024621009827, + "logps/chosen": -59.80215835571289, + "logps/rejected": -70.37603759765625, + "loss": 0.6651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0897698402404785, + "rewards/margins": 4.887746334075928, + "rewards/rejected": -1.7979764938354492, + "step": 5935 + }, + { + "epoch": 1.49, + "grad_norm": 4.792705535888672, + "learning_rate": 7.97663186069265e-06, + "logits/chosen": -0.42869216203689575, + "logits/rejected": -0.5192348957061768, + "logps/chosen": -53.25447082519531, + "logps/rejected": -68.52799224853516, + "loss": 0.7687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1639904975891113, + "rewards/margins": 4.85798454284668, + "rewards/rejected": -1.6939946413040161, + "step": 5936 + }, + { + "epoch": 1.49, + "grad_norm": 4.348864555358887, + "learning_rate": 7.976000295426594e-06, + "logits/chosen": -0.34555014967918396, + "logits/rejected": -0.42114904522895813, + "logps/chosen": -49.719120025634766, + "logps/rejected": -74.90171813964844, + "loss": 0.7381, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.797245740890503, + "rewards/margins": 4.333774566650391, + "rewards/rejected": -1.5365290641784668, + "step": 5937 + }, + { + "epoch": 1.49, + "grad_norm": 3.5274746417999268, + "learning_rate": 7.975368656620404e-06, + "logits/chosen": -0.2812482714653015, + "logits/rejected": -0.3594551682472229, + "logps/chosen": -51.327396392822266, + "logps/rejected": -75.43240356445312, + "loss": 0.6293, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8915231227874756, + "rewards/margins": 4.283387660980225, + "rewards/rejected": -1.391864538192749, + "step": 5938 + }, + { + "epoch": 1.49, + "grad_norm": 3.44181489944458, + "learning_rate": 7.974736944289689e-06, + "logits/chosen": -0.37799280881881714, + "logits/rejected": -0.4035286009311676, + "logps/chosen": -41.20419692993164, + "logps/rejected": -82.68895721435547, + "loss": 0.633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1243770122528076, + "rewards/margins": 5.562975883483887, + "rewards/rejected": -2.4385986328125, + "step": 5939 + }, + { + "epoch": 1.49, + "grad_norm": 3.6567304134368896, + "learning_rate": 7.974105158450062e-06, + "logits/chosen": -0.32769307494163513, + "logits/rejected": -0.4409632086753845, + "logps/chosen": -51.41424560546875, + "logps/rejected": -81.27408599853516, + "loss": 0.6427, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.832792282104492, + "rewards/margins": 4.898946762084961, + "rewards/rejected": -2.0661544799804688, + "step": 5940 + }, + { + "epoch": 1.49, + "grad_norm": 4.8930439949035645, + "learning_rate": 7.973473299117131e-06, + "logits/chosen": -0.359270840883255, + "logits/rejected": -0.49055689573287964, + "logps/chosen": -55.72378158569336, + "logps/rejected": -72.28162384033203, + "loss": 0.7528, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8603737354278564, + "rewards/margins": 5.427937984466553, + "rewards/rejected": -2.567564010620117, + "step": 5941 + }, + { + "epoch": 1.49, + "grad_norm": 3.470358371734619, + "learning_rate": 7.972841366306515e-06, + "logits/chosen": -0.28135251998901367, + "logits/rejected": -0.4189729690551758, + "logps/chosen": -51.589420318603516, + "logps/rejected": -82.70728302001953, + "loss": 0.6544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0309906005859375, + "rewards/margins": 5.360935211181641, + "rewards/rejected": -2.329944372177124, + "step": 5942 + }, + { + "epoch": 1.49, + "grad_norm": 5.9138922691345215, + "learning_rate": 7.972209360033822e-06, + "logits/chosen": -0.33492499589920044, + "logits/rejected": -0.41440755128860474, + "logps/chosen": -57.76726150512695, + "logps/rejected": -75.27662658691406, + "loss": 0.9217, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8475637435913086, + "rewards/margins": 3.9227828979492188, + "rewards/rejected": -1.0752187967300415, + "step": 5943 + }, + { + "epoch": 1.49, + "grad_norm": 3.0952627658843994, + "learning_rate": 7.971577280314678e-06, + "logits/chosen": -0.3491719961166382, + "logits/rejected": -0.47114086151123047, + "logps/chosen": -55.73488235473633, + "logps/rejected": -85.791015625, + "loss": 0.6454, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.973175048828125, + "rewards/margins": 4.895394802093506, + "rewards/rejected": -1.9222192764282227, + "step": 5944 + }, + { + "epoch": 1.49, + "grad_norm": 3.499394178390503, + "learning_rate": 7.970945127164699e-06, + "logits/chosen": -0.3438592553138733, + "logits/rejected": -0.48459017276763916, + "logps/chosen": -59.16221618652344, + "logps/rejected": -83.99180603027344, + "loss": 0.6909, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7300477027893066, + "rewards/margins": 5.804970741271973, + "rewards/rejected": -3.074922561645508, + "step": 5945 + }, + { + "epoch": 1.49, + "grad_norm": 3.6975934505462646, + "learning_rate": 7.970312900599504e-06, + "logits/chosen": -0.3444109559059143, + "logits/rejected": -0.43713250756263733, + "logps/chosen": -64.61238098144531, + "logps/rejected": -97.36074829101562, + "loss": 0.7079, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.727311849594116, + "rewards/margins": 4.051901340484619, + "rewards/rejected": -1.324589490890503, + "step": 5946 + }, + { + "epoch": 1.49, + "grad_norm": 3.282870054244995, + "learning_rate": 7.96968060063472e-06, + "logits/chosen": -0.3299466371536255, + "logits/rejected": -0.34976980090141296, + "logps/chosen": -43.93703842163086, + "logps/rejected": -84.72923278808594, + "loss": 0.6619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9667611122131348, + "rewards/margins": 4.245294094085693, + "rewards/rejected": -1.2785332202911377, + "step": 5947 + }, + { + "epoch": 1.49, + "grad_norm": 4.273837089538574, + "learning_rate": 7.969048227285968e-06, + "logits/chosen": -0.29370036721229553, + "logits/rejected": -0.4246704876422882, + "logps/chosen": -64.9328842163086, + "logps/rejected": -86.19102478027344, + "loss": 0.6553, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.797882556915283, + "rewards/margins": 5.656189918518066, + "rewards/rejected": -2.858307361602783, + "step": 5948 + }, + { + "epoch": 1.49, + "grad_norm": 4.518583297729492, + "learning_rate": 7.968415780568877e-06, + "logits/chosen": -0.32186123728752136, + "logits/rejected": -0.4405874013900757, + "logps/chosen": -47.29051208496094, + "logps/rejected": -73.46723175048828, + "loss": 0.7147, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.914456605911255, + "rewards/margins": 5.180028915405273, + "rewards/rejected": -2.2655725479125977, + "step": 5949 + }, + { + "epoch": 1.49, + "grad_norm": 13.686545372009277, + "learning_rate": 7.967783260499073e-06, + "logits/chosen": -0.3337825536727905, + "logits/rejected": -0.41391992568969727, + "logps/chosen": -58.67926025390625, + "logps/rejected": -72.8009262084961, + "loss": 0.7564, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8026232719421387, + "rewards/margins": 3.71159291267395, + "rewards/rejected": -0.9089697599411011, + "step": 5950 + }, + { + "epoch": 1.49, + "grad_norm": 15.150198936462402, + "learning_rate": 7.967150667092189e-06, + "logits/chosen": -0.3553914725780487, + "logits/rejected": -0.46899309754371643, + "logps/chosen": -57.761600494384766, + "logps/rejected": -89.72359466552734, + "loss": 0.693, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8505895137786865, + "rewards/margins": 5.337757110595703, + "rewards/rejected": -2.4871673583984375, + "step": 5951 + }, + { + "epoch": 1.49, + "grad_norm": 6.133206367492676, + "learning_rate": 7.966518000363857e-06, + "logits/chosen": -0.3494924008846283, + "logits/rejected": -0.4536694884300232, + "logps/chosen": -62.07737731933594, + "logps/rejected": -84.28145599365234, + "loss": 0.7378, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9840750694274902, + "rewards/margins": 4.696415424346924, + "rewards/rejected": -1.7123403549194336, + "step": 5952 + }, + { + "epoch": 1.49, + "grad_norm": 6.199219703674316, + "learning_rate": 7.96588526032971e-06, + "logits/chosen": -0.33306699991226196, + "logits/rejected": -0.42015933990478516, + "logps/chosen": -61.82832336425781, + "logps/rejected": -92.04838562011719, + "loss": 0.7867, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.810394763946533, + "rewards/margins": 5.045522689819336, + "rewards/rejected": -2.235128164291382, + "step": 5953 + }, + { + "epoch": 1.49, + "grad_norm": 5.5818328857421875, + "learning_rate": 7.965252447005384e-06, + "logits/chosen": -0.4026154577732086, + "logits/rejected": -0.4918292164802551, + "logps/chosen": -45.02458572387695, + "logps/rejected": -77.65274810791016, + "loss": 0.6861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1295382976531982, + "rewards/margins": 4.509281158447266, + "rewards/rejected": -1.3797423839569092, + "step": 5954 + }, + { + "epoch": 1.49, + "grad_norm": 5.520894527435303, + "learning_rate": 7.964619560406514e-06, + "logits/chosen": -0.4323396682739258, + "logits/rejected": -0.5657583475112915, + "logps/chosen": -52.85791015625, + "logps/rejected": -89.79084777832031, + "loss": 0.626, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0122737884521484, + "rewards/margins": 5.745563983917236, + "rewards/rejected": -2.733289957046509, + "step": 5955 + }, + { + "epoch": 1.49, + "grad_norm": 2.3145995140075684, + "learning_rate": 7.96398660054874e-06, + "logits/chosen": -0.40041884779930115, + "logits/rejected": -0.5086532235145569, + "logps/chosen": -62.880279541015625, + "logps/rejected": -85.97833251953125, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093532085418701, + "rewards/margins": 5.975607872009277, + "rewards/rejected": -2.8820760250091553, + "step": 5956 + }, + { + "epoch": 1.49, + "grad_norm": 5.13137674331665, + "learning_rate": 7.963353567447707e-06, + "logits/chosen": -0.33080214262008667, + "logits/rejected": -0.4021596908569336, + "logps/chosen": -49.858482360839844, + "logps/rejected": -86.47021484375, + "loss": 0.6536, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.733668804168701, + "rewards/margins": 4.92690372467041, + "rewards/rejected": -2.1932342052459717, + "step": 5957 + }, + { + "epoch": 1.49, + "grad_norm": 3.5769150257110596, + "learning_rate": 7.962720461119055e-06, + "logits/chosen": -0.45110511779785156, + "logits/rejected": -0.5942341685295105, + "logps/chosen": -53.80421447753906, + "logps/rejected": -78.20630645751953, + "loss": 0.6051, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0928573608398438, + "rewards/margins": 5.642215251922607, + "rewards/rejected": -2.5493574142456055, + "step": 5958 + }, + { + "epoch": 1.49, + "grad_norm": 16.165861129760742, + "learning_rate": 7.96208728157843e-06, + "logits/chosen": -0.3602376878261566, + "logits/rejected": -0.3910447955131531, + "logps/chosen": -56.16018295288086, + "logps/rejected": -85.45100402832031, + "loss": 0.7499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.492945909500122, + "rewards/margins": 4.52514123916626, + "rewards/rejected": -2.032195806503296, + "step": 5959 + }, + { + "epoch": 1.49, + "grad_norm": 2.958413600921631, + "learning_rate": 7.961454028841478e-06, + "logits/chosen": -0.4001637101173401, + "logits/rejected": -0.5086966753005981, + "logps/chosen": -58.30014419555664, + "logps/rejected": -104.51005554199219, + "loss": 0.5945, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8866841793060303, + "rewards/margins": 6.51473331451416, + "rewards/rejected": -3.62804913520813, + "step": 5960 + }, + { + "epoch": 1.49, + "grad_norm": 7.1988630294799805, + "learning_rate": 7.960820702923845e-06, + "logits/chosen": -0.2940911054611206, + "logits/rejected": -0.3864823579788208, + "logps/chosen": -65.58222961425781, + "logps/rejected": -87.38461303710938, + "loss": 0.9001, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7898619174957275, + "rewards/margins": 4.362596035003662, + "rewards/rejected": -1.5727342367172241, + "step": 5961 + }, + { + "epoch": 1.49, + "grad_norm": 13.89419174194336, + "learning_rate": 7.960187303841184e-06, + "logits/chosen": -0.3452627956867218, + "logits/rejected": -0.42684924602508545, + "logps/chosen": -55.78847885131836, + "logps/rejected": -72.37993621826172, + "loss": 0.9287, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5607857704162598, + "rewards/margins": 3.886665105819702, + "rewards/rejected": -1.3258790969848633, + "step": 5962 + }, + { + "epoch": 1.49, + "grad_norm": 10.759988784790039, + "learning_rate": 7.959553831609145e-06, + "logits/chosen": -0.2692418694496155, + "logits/rejected": -0.3809448480606079, + "logps/chosen": -55.37983703613281, + "logps/rejected": -86.87553405761719, + "loss": 0.6784, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.546226978302002, + "rewards/margins": 5.161532878875732, + "rewards/rejected": -2.6153059005737305, + "step": 5963 + }, + { + "epoch": 1.49, + "grad_norm": 3.689497947692871, + "learning_rate": 7.958920286243384e-06, + "logits/chosen": -0.3782459497451782, + "logits/rejected": -0.44197726249694824, + "logps/chosen": -51.59901809692383, + "logps/rejected": -90.94502258300781, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0094220638275146, + "rewards/margins": 4.682800769805908, + "rewards/rejected": -1.6733789443969727, + "step": 5964 + }, + { + "epoch": 1.49, + "grad_norm": 11.278412818908691, + "learning_rate": 7.958286667759555e-06, + "logits/chosen": -0.3713075816631317, + "logits/rejected": -0.4867233633995056, + "logps/chosen": -58.638877868652344, + "logps/rejected": -70.75277709960938, + "loss": 0.9935, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.710284471511841, + "rewards/margins": 4.405849456787109, + "rewards/rejected": -1.6955653429031372, + "step": 5965 + }, + { + "epoch": 1.49, + "grad_norm": 3.484834671020508, + "learning_rate": 7.957652976173317e-06, + "logits/chosen": -0.39820605516433716, + "logits/rejected": -0.5086606740951538, + "logps/chosen": -57.119571685791016, + "logps/rejected": -87.36692810058594, + "loss": 0.7359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.923882484436035, + "rewards/margins": 6.227869987487793, + "rewards/rejected": -3.3039870262145996, + "step": 5966 + }, + { + "epoch": 1.49, + "grad_norm": 7.785430431365967, + "learning_rate": 7.957019211500328e-06, + "logits/chosen": -0.34768879413604736, + "logits/rejected": -0.35813072323799133, + "logps/chosen": -59.544334411621094, + "logps/rejected": -91.45226287841797, + "loss": 0.9126, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7454850673675537, + "rewards/margins": 3.377547264099121, + "rewards/rejected": -0.6320623159408569, + "step": 5967 + }, + { + "epoch": 1.49, + "grad_norm": 22.71733856201172, + "learning_rate": 7.956385373756249e-06, + "logits/chosen": -0.38574695587158203, + "logits/rejected": -0.4109080135822296, + "logps/chosen": -68.9693374633789, + "logps/rejected": -94.26969909667969, + "loss": 1.0677, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5237996578216553, + "rewards/margins": 3.490502119064331, + "rewards/rejected": -0.9667026996612549, + "step": 5968 + }, + { + "epoch": 1.49, + "grad_norm": 6.138763427734375, + "learning_rate": 7.955751462956742e-06, + "logits/chosen": -0.39316776394844055, + "logits/rejected": -0.4498431384563446, + "logps/chosen": -59.3831672668457, + "logps/rejected": -101.06787109375, + "loss": 0.7319, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.071662664413452, + "rewards/margins": 5.522426605224609, + "rewards/rejected": -2.4507639408111572, + "step": 5969 + }, + { + "epoch": 1.49, + "grad_norm": 7.484187126159668, + "learning_rate": 7.955117479117474e-06, + "logits/chosen": -0.3140009939670563, + "logits/rejected": -0.40048015117645264, + "logps/chosen": -50.39988327026367, + "logps/rejected": -76.92821502685547, + "loss": 0.7089, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.810819387435913, + "rewards/margins": 4.126566410064697, + "rewards/rejected": -1.3157472610473633, + "step": 5970 + }, + { + "epoch": 1.49, + "grad_norm": 6.005807399749756, + "learning_rate": 7.954483422254108e-06, + "logits/chosen": -0.3436344563961029, + "logits/rejected": -0.40661779046058655, + "logps/chosen": -60.012596130371094, + "logps/rejected": -91.20661926269531, + "loss": 0.6851, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0659656524658203, + "rewards/margins": 4.770675182342529, + "rewards/rejected": -1.704709529876709, + "step": 5971 + }, + { + "epoch": 1.49, + "grad_norm": 4.100116729736328, + "learning_rate": 7.953849292382315e-06, + "logits/chosen": -0.27572593092918396, + "logits/rejected": -0.33324524760246277, + "logps/chosen": -67.8907470703125, + "logps/rejected": -107.1229019165039, + "loss": 0.7022, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0064799785614014, + "rewards/margins": 4.319500923156738, + "rewards/rejected": -1.3130208253860474, + "step": 5972 + }, + { + "epoch": 1.49, + "grad_norm": 5.319023609161377, + "learning_rate": 7.953215089517764e-06, + "logits/chosen": -0.4111151695251465, + "logits/rejected": -0.4981035888195038, + "logps/chosen": -54.18137741088867, + "logps/rejected": -89.40481567382812, + "loss": 0.6895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.874317169189453, + "rewards/margins": 4.981051445007324, + "rewards/rejected": -2.10673451423645, + "step": 5973 + }, + { + "epoch": 1.49, + "grad_norm": 6.617611408233643, + "learning_rate": 7.952580813676127e-06, + "logits/chosen": -0.41452494263648987, + "logits/rejected": -0.4297706186771393, + "logps/chosen": -46.817955017089844, + "logps/rejected": -76.78752899169922, + "loss": 0.7805, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.053179979324341, + "rewards/margins": 3.9183881282806396, + "rewards/rejected": -0.865207850933075, + "step": 5974 + }, + { + "epoch": 1.49, + "grad_norm": 4.781133651733398, + "learning_rate": 7.951946464873078e-06, + "logits/chosen": -0.31876224279403687, + "logits/rejected": -0.4382142126560211, + "logps/chosen": -61.36988067626953, + "logps/rejected": -99.95830535888672, + "loss": 0.6787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9652483463287354, + "rewards/margins": 5.190755844116211, + "rewards/rejected": -2.2255072593688965, + "step": 5975 + }, + { + "epoch": 1.5, + "grad_norm": 7.213755130767822, + "learning_rate": 7.951312043124291e-06, + "logits/chosen": -0.34944817423820496, + "logits/rejected": -0.5021181702613831, + "logps/chosen": -63.194007873535156, + "logps/rejected": -86.15763092041016, + "loss": 0.7453, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.52740478515625, + "rewards/margins": 4.910983085632324, + "rewards/rejected": -2.383578062057495, + "step": 5976 + }, + { + "epoch": 1.5, + "grad_norm": 6.2430901527404785, + "learning_rate": 7.950677548445443e-06, + "logits/chosen": -0.40667852759361267, + "logits/rejected": -0.4693232774734497, + "logps/chosen": -53.85447692871094, + "logps/rejected": -94.96450805664062, + "loss": 0.9081, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4266111850738525, + "rewards/margins": 5.185159683227539, + "rewards/rejected": -2.7585482597351074, + "step": 5977 + }, + { + "epoch": 1.5, + "grad_norm": 3.866506576538086, + "learning_rate": 7.950042980852217e-06, + "logits/chosen": -0.42265334725379944, + "logits/rejected": -0.45486247539520264, + "logps/chosen": -49.552947998046875, + "logps/rejected": -118.9307861328125, + "loss": 0.6605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8630611896514893, + "rewards/margins": 7.530992031097412, + "rewards/rejected": -4.66793155670166, + "step": 5978 + }, + { + "epoch": 1.5, + "grad_norm": 6.015756130218506, + "learning_rate": 7.94940834036029e-06, + "logits/chosen": -0.3879767656326294, + "logits/rejected": -0.50054931640625, + "logps/chosen": -54.736236572265625, + "logps/rejected": -99.33480072021484, + "loss": 0.6882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.938786745071411, + "rewards/margins": 6.004642009735107, + "rewards/rejected": -3.0658555030822754, + "step": 5979 + }, + { + "epoch": 1.5, + "grad_norm": 4.75679874420166, + "learning_rate": 7.948773626985347e-06, + "logits/chosen": -0.33099237084388733, + "logits/rejected": -0.45817938446998596, + "logps/chosen": -51.09162902832031, + "logps/rejected": -82.7761459350586, + "loss": 0.6377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7960052490234375, + "rewards/margins": 5.444061279296875, + "rewards/rejected": -2.6480557918548584, + "step": 5980 + }, + { + "epoch": 1.5, + "grad_norm": 5.580620765686035, + "learning_rate": 7.948138840743069e-06, + "logits/chosen": -0.3287707567214966, + "logits/rejected": -0.4068695306777954, + "logps/chosen": -62.78788757324219, + "logps/rejected": -94.6383056640625, + "loss": 0.8489, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.659695625305176, + "rewards/margins": 4.5094685554504395, + "rewards/rejected": -1.8497730493545532, + "step": 5981 + }, + { + "epoch": 1.5, + "grad_norm": 2.2892706394195557, + "learning_rate": 7.947503981649145e-06, + "logits/chosen": -0.4291538596153259, + "logits/rejected": -0.45086121559143066, + "logps/chosen": -45.95655822753906, + "logps/rejected": -108.52982330322266, + "loss": 0.6058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0184404850006104, + "rewards/margins": 6.075380325317383, + "rewards/rejected": -3.0569393634796143, + "step": 5982 + }, + { + "epoch": 1.5, + "grad_norm": 5.921186923980713, + "learning_rate": 7.946869049719264e-06, + "logits/chosen": -0.2761605679988861, + "logits/rejected": -0.4566492438316345, + "logps/chosen": -66.15164184570312, + "logps/rejected": -88.29570007324219, + "loss": 0.7268, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5408356189727783, + "rewards/margins": 5.119425296783447, + "rewards/rejected": -2.578589677810669, + "step": 5983 + }, + { + "epoch": 1.5, + "grad_norm": 8.334294319152832, + "learning_rate": 7.946234044969113e-06, + "logits/chosen": -0.2979758083820343, + "logits/rejected": -0.37973088026046753, + "logps/chosen": -70.78396606445312, + "logps/rejected": -86.18885040283203, + "loss": 0.765, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6334266662597656, + "rewards/margins": 4.362896919250488, + "rewards/rejected": -1.729470133781433, + "step": 5984 + }, + { + "epoch": 1.5, + "grad_norm": 7.964480876922607, + "learning_rate": 7.945598967414386e-06, + "logits/chosen": -0.3102136552333832, + "logits/rejected": -0.4013870656490326, + "logps/chosen": -52.67808151245117, + "logps/rejected": -98.81211853027344, + "loss": 0.8932, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.058229923248291, + "rewards/margins": 5.168349742889404, + "rewards/rejected": -2.110119342803955, + "step": 5985 + }, + { + "epoch": 1.5, + "grad_norm": 7.2028069496154785, + "learning_rate": 7.944963817070774e-06, + "logits/chosen": -0.30036523938179016, + "logits/rejected": -0.4039991796016693, + "logps/chosen": -63.19871520996094, + "logps/rejected": -89.2060775756836, + "loss": 0.7213, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.931046962738037, + "rewards/margins": 5.11884069442749, + "rewards/rejected": -2.187793731689453, + "step": 5986 + }, + { + "epoch": 1.5, + "grad_norm": 5.493373870849609, + "learning_rate": 7.944328593953976e-06, + "logits/chosen": -0.3219703435897827, + "logits/rejected": -0.427761048078537, + "logps/chosen": -73.96829986572266, + "logps/rejected": -101.33782196044922, + "loss": 0.8193, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7491109371185303, + "rewards/margins": 4.6910552978515625, + "rewards/rejected": -1.9419440031051636, + "step": 5987 + }, + { + "epoch": 1.5, + "grad_norm": 21.23678207397461, + "learning_rate": 7.943693298079684e-06, + "logits/chosen": -0.3474362790584564, + "logits/rejected": -0.46037375926971436, + "logps/chosen": -60.01490783691406, + "logps/rejected": -87.26155853271484, + "loss": 0.6973, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.774385452270508, + "rewards/margins": 5.577430725097656, + "rewards/rejected": -2.803044319152832, + "step": 5988 + }, + { + "epoch": 1.5, + "grad_norm": 5.145100116729736, + "learning_rate": 7.943057929463603e-06, + "logits/chosen": -0.4024941325187683, + "logits/rejected": -0.5567472577095032, + "logps/chosen": -53.4987678527832, + "logps/rejected": -69.43275451660156, + "loss": 0.6528, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9593398571014404, + "rewards/margins": 5.4341020584106445, + "rewards/rejected": -2.474762201309204, + "step": 5989 + }, + { + "epoch": 1.5, + "grad_norm": 4.478973865509033, + "learning_rate": 7.942422488121427e-06, + "logits/chosen": -0.36108818650245667, + "logits/rejected": -0.44637706875801086, + "logps/chosen": -46.96427536010742, + "logps/rejected": -105.42692565917969, + "loss": 0.6557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1776843070983887, + "rewards/margins": 6.323998928070068, + "rewards/rejected": -3.146314859390259, + "step": 5990 + }, + { + "epoch": 1.5, + "grad_norm": 4.678829193115234, + "learning_rate": 7.941786974068862e-06, + "logits/chosen": -0.39841127395629883, + "logits/rejected": -0.48616376519203186, + "logps/chosen": -66.01590728759766, + "logps/rejected": -85.4914321899414, + "loss": 0.7266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9528892040252686, + "rewards/margins": 5.705601215362549, + "rewards/rejected": -2.7527120113372803, + "step": 5991 + }, + { + "epoch": 1.5, + "grad_norm": 7.870584487915039, + "learning_rate": 7.941151387321613e-06, + "logits/chosen": -0.25786373019218445, + "logits/rejected": -0.4300742745399475, + "logps/chosen": -53.981876373291016, + "logps/rejected": -68.74954986572266, + "loss": 0.5624, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1367344856262207, + "rewards/margins": 4.998347282409668, + "rewards/rejected": -1.861613154411316, + "step": 5992 + }, + { + "epoch": 1.5, + "grad_norm": 14.168497085571289, + "learning_rate": 7.940515727895384e-06, + "logits/chosen": -0.49166566133499146, + "logits/rejected": -0.6038431525230408, + "logps/chosen": -64.09681701660156, + "logps/rejected": -83.11117553710938, + "loss": 0.6682, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4946558475494385, + "rewards/margins": 5.024941921234131, + "rewards/rejected": -2.5302865505218506, + "step": 5993 + }, + { + "epoch": 1.5, + "grad_norm": 10.531521797180176, + "learning_rate": 7.939879995805884e-06, + "logits/chosen": -0.3677158057689667, + "logits/rejected": -0.4684777855873108, + "logps/chosen": -62.35777282714844, + "logps/rejected": -69.38491821289062, + "loss": 0.8051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5653271675109863, + "rewards/margins": 3.8891918659210205, + "rewards/rejected": -1.3238646984100342, + "step": 5994 + }, + { + "epoch": 1.5, + "grad_norm": 6.436274528503418, + "learning_rate": 7.939244191068824e-06, + "logits/chosen": -0.30730095505714417, + "logits/rejected": -0.43885141611099243, + "logps/chosen": -61.63959503173828, + "logps/rejected": -80.0757064819336, + "loss": 0.6721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9305543899536133, + "rewards/margins": 5.2347612380981445, + "rewards/rejected": -2.304205894470215, + "step": 5995 + }, + { + "epoch": 1.5, + "grad_norm": 7.1100592613220215, + "learning_rate": 7.938608313699911e-06, + "logits/chosen": -0.4034035801887512, + "logits/rejected": -0.5057764053344727, + "logps/chosen": -44.08234786987305, + "logps/rejected": -79.4994888305664, + "loss": 0.6847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.838582754135132, + "rewards/margins": 4.877520561218262, + "rewards/rejected": -2.03893780708313, + "step": 5996 + }, + { + "epoch": 1.5, + "grad_norm": 7.482929706573486, + "learning_rate": 7.937972363714863e-06, + "logits/chosen": -0.3472480773925781, + "logits/rejected": -0.4573667347431183, + "logps/chosen": -68.84078979492188, + "logps/rejected": -75.30622863769531, + "loss": 0.7833, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6667068004608154, + "rewards/margins": 4.593809127807617, + "rewards/rejected": -1.9271025657653809, + "step": 5997 + }, + { + "epoch": 1.5, + "grad_norm": 12.495792388916016, + "learning_rate": 7.937336341129392e-06, + "logits/chosen": -0.38119927048683167, + "logits/rejected": -0.437789648771286, + "logps/chosen": -55.161808013916016, + "logps/rejected": -92.81010437011719, + "loss": 0.7206, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.940521001815796, + "rewards/margins": 5.246447563171387, + "rewards/rejected": -2.30592679977417, + "step": 5998 + }, + { + "epoch": 1.5, + "grad_norm": 3.8840417861938477, + "learning_rate": 7.936700245959215e-06, + "logits/chosen": -0.307830274105072, + "logits/rejected": -0.48693645000457764, + "logps/chosen": -57.107601165771484, + "logps/rejected": -88.0245361328125, + "loss": 0.6754, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0052666664123535, + "rewards/margins": 6.838803768157959, + "rewards/rejected": -3.8335368633270264, + "step": 5999 + }, + { + "epoch": 1.5, + "grad_norm": 5.455062389373779, + "learning_rate": 7.936064078220052e-06, + "logits/chosen": -0.3343837857246399, + "logits/rejected": -0.4346253275871277, + "logps/chosen": -72.95404052734375, + "logps/rejected": -87.95198059082031, + "loss": 0.8055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.139875650405884, + "rewards/margins": 4.930935382843018, + "rewards/rejected": -1.7910596132278442, + "step": 6000 + }, + { + "epoch": 1.5, + "grad_norm": 7.829893112182617, + "learning_rate": 7.935427837927623e-06, + "logits/chosen": -0.35535964369773865, + "logits/rejected": -0.45938870310783386, + "logps/chosen": -60.20981979370117, + "logps/rejected": -77.70794677734375, + "loss": 0.7803, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.004791498184204, + "rewards/margins": 5.089328289031982, + "rewards/rejected": -2.0845367908477783, + "step": 6001 + }, + { + "epoch": 1.5, + "grad_norm": 6.31870698928833, + "learning_rate": 7.93479152509765e-06, + "logits/chosen": -0.32535579800605774, + "logits/rejected": -0.40872180461883545, + "logps/chosen": -50.954002380371094, + "logps/rejected": -80.02264404296875, + "loss": 0.7826, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.960871696472168, + "rewards/margins": 3.8952300548553467, + "rewards/rejected": -0.9343581199645996, + "step": 6002 + }, + { + "epoch": 1.5, + "grad_norm": 3.6201865673065186, + "learning_rate": 7.934155139745857e-06, + "logits/chosen": -0.33797869086265564, + "logits/rejected": -0.44053566455841064, + "logps/chosen": -66.98153686523438, + "logps/rejected": -88.2325210571289, + "loss": 0.6435, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.876707077026367, + "rewards/margins": 5.400680065155029, + "rewards/rejected": -2.523972988128662, + "step": 6003 + }, + { + "epoch": 1.5, + "grad_norm": 6.142951965332031, + "learning_rate": 7.933518681887969e-06, + "logits/chosen": -0.36390599608421326, + "logits/rejected": -0.41720348596572876, + "logps/chosen": -53.08688735961914, + "logps/rejected": -82.0972671508789, + "loss": 0.7913, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8108813762664795, + "rewards/margins": 3.919557571411133, + "rewards/rejected": -1.1086763143539429, + "step": 6004 + }, + { + "epoch": 1.5, + "grad_norm": 14.809098243713379, + "learning_rate": 7.932882151539714e-06, + "logits/chosen": -0.33681443333625793, + "logits/rejected": -0.5089970231056213, + "logps/chosen": -75.65440368652344, + "logps/rejected": -76.68882751464844, + "loss": 0.8543, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7970614433288574, + "rewards/margins": 4.937533378601074, + "rewards/rejected": -2.1404716968536377, + "step": 6005 + }, + { + "epoch": 1.5, + "grad_norm": 5.577622413635254, + "learning_rate": 7.932245548716822e-06, + "logits/chosen": -0.36436212062835693, + "logits/rejected": -0.4243307113647461, + "logps/chosen": -52.36092758178711, + "logps/rejected": -96.78312683105469, + "loss": 0.7228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9773974418640137, + "rewards/margins": 5.272013187408447, + "rewards/rejected": -2.2946159839630127, + "step": 6006 + }, + { + "epoch": 1.5, + "grad_norm": 3.8140881061553955, + "learning_rate": 7.93160887343502e-06, + "logits/chosen": -0.44510915875434875, + "logits/rejected": -0.5090377330780029, + "logps/chosen": -41.879920959472656, + "logps/rejected": -89.74012756347656, + "loss": 0.6155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.986875295639038, + "rewards/margins": 5.644200801849365, + "rewards/rejected": -2.657325506210327, + "step": 6007 + }, + { + "epoch": 1.5, + "grad_norm": 4.335719585418701, + "learning_rate": 7.93097212571005e-06, + "logits/chosen": -0.29569047689437866, + "logits/rejected": -0.4340752065181732, + "logps/chosen": -64.55281066894531, + "logps/rejected": -81.49048614501953, + "loss": 0.6941, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2101829051971436, + "rewards/margins": 5.390151500701904, + "rewards/rejected": -2.17996883392334, + "step": 6008 + }, + { + "epoch": 1.5, + "grad_norm": 4.559959411621094, + "learning_rate": 7.930335305557639e-06, + "logits/chosen": -0.32085680961608887, + "logits/rejected": -0.37909311056137085, + "logps/chosen": -59.05637741088867, + "logps/rejected": -83.64964294433594, + "loss": 0.7355, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6312782764434814, + "rewards/margins": 4.07720947265625, + "rewards/rejected": -1.4459316730499268, + "step": 6009 + }, + { + "epoch": 1.5, + "grad_norm": 6.129347324371338, + "learning_rate": 7.929698412993525e-06, + "logits/chosen": -0.36959749460220337, + "logits/rejected": -0.4799916744232178, + "logps/chosen": -64.0936279296875, + "logps/rejected": -85.05729675292969, + "loss": 0.7728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.103673219680786, + "rewards/margins": 5.235733985900879, + "rewards/rejected": -2.13206148147583, + "step": 6010 + }, + { + "epoch": 1.5, + "grad_norm": 25.878368377685547, + "learning_rate": 7.929061448033448e-06, + "logits/chosen": -0.37753403186798096, + "logits/rejected": -0.440439909696579, + "logps/chosen": -59.77452087402344, + "logps/rejected": -99.37728118896484, + "loss": 0.8436, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.632826805114746, + "rewards/margins": 4.5523176193237305, + "rewards/rejected": -1.919490933418274, + "step": 6011 + }, + { + "epoch": 1.5, + "grad_norm": 8.675600051879883, + "learning_rate": 7.928424410693148e-06, + "logits/chosen": -0.42565980553627014, + "logits/rejected": -0.42780470848083496, + "logps/chosen": -55.74591827392578, + "logps/rejected": -85.77391815185547, + "loss": 0.8638, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2588093280792236, + "rewards/margins": 4.220285415649414, + "rewards/rejected": -0.9614761471748352, + "step": 6012 + }, + { + "epoch": 1.5, + "grad_norm": 8.276834487915039, + "learning_rate": 7.927787300988364e-06, + "logits/chosen": -0.41535425186157227, + "logits/rejected": -0.5034579038619995, + "logps/chosen": -57.694698333740234, + "logps/rejected": -71.00978088378906, + "loss": 0.8212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0043389797210693, + "rewards/margins": 3.565351963043213, + "rewards/rejected": -0.5610131025314331, + "step": 6013 + }, + { + "epoch": 1.5, + "grad_norm": 4.9047369956970215, + "learning_rate": 7.927150118934843e-06, + "logits/chosen": -0.3194461762905121, + "logits/rejected": -0.3388315439224243, + "logps/chosen": -65.55952453613281, + "logps/rejected": -98.6658706665039, + "loss": 0.8149, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9392623901367188, + "rewards/margins": 4.254890441894531, + "rewards/rejected": -1.3156275749206543, + "step": 6014 + }, + { + "epoch": 1.5, + "grad_norm": 6.115703582763672, + "learning_rate": 7.926512864548329e-06, + "logits/chosen": -0.3600527048110962, + "logits/rejected": -0.49820661544799805, + "logps/chosen": -55.912841796875, + "logps/rejected": -73.88329315185547, + "loss": 0.6964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6944198608398438, + "rewards/margins": 4.1197686195373535, + "rewards/rejected": -1.4253489971160889, + "step": 6015 + }, + { + "epoch": 1.51, + "grad_norm": 7.618560791015625, + "learning_rate": 7.925875537844568e-06, + "logits/chosen": -0.26252537965774536, + "logits/rejected": -0.358564555644989, + "logps/chosen": -65.03571319580078, + "logps/rejected": -90.55962371826172, + "loss": 0.7888, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7353153228759766, + "rewards/margins": 4.381772994995117, + "rewards/rejected": -1.6464576721191406, + "step": 6016 + }, + { + "epoch": 1.51, + "grad_norm": 8.944145202636719, + "learning_rate": 7.925238138839312e-06, + "logits/chosen": -0.34759193658828735, + "logits/rejected": -0.4833700656890869, + "logps/chosen": -65.95951843261719, + "logps/rejected": -83.08464050292969, + "loss": 0.7821, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.971468925476074, + "rewards/margins": 5.028573036193848, + "rewards/rejected": -2.0571041107177734, + "step": 6017 + }, + { + "epoch": 1.51, + "grad_norm": 6.782735347747803, + "learning_rate": 7.92460066754831e-06, + "logits/chosen": -0.37197375297546387, + "logits/rejected": -0.459171324968338, + "logps/chosen": -53.35853576660156, + "logps/rejected": -75.23460388183594, + "loss": 0.726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8311562538146973, + "rewards/margins": 4.8946967124938965, + "rewards/rejected": -2.063540458679199, + "step": 6018 + }, + { + "epoch": 1.51, + "grad_norm": 3.2796804904937744, + "learning_rate": 7.923963123987313e-06, + "logits/chosen": -0.4310469925403595, + "logits/rejected": -0.5050365328788757, + "logps/chosen": -41.107017517089844, + "logps/rejected": -70.90901184082031, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7698259353637695, + "rewards/margins": 4.851195335388184, + "rewards/rejected": -2.0813684463500977, + "step": 6019 + }, + { + "epoch": 1.51, + "grad_norm": 6.194514274597168, + "learning_rate": 7.92332550817208e-06, + "logits/chosen": -0.3561113178730011, + "logits/rejected": -0.43078887462615967, + "logps/chosen": -66.13761138916016, + "logps/rejected": -88.82909393310547, + "loss": 0.71, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0173635482788086, + "rewards/margins": 4.664687633514404, + "rewards/rejected": -1.6473242044448853, + "step": 6020 + }, + { + "epoch": 1.51, + "grad_norm": 11.699719429016113, + "learning_rate": 7.922687820118363e-06, + "logits/chosen": -0.3517270088195801, + "logits/rejected": -0.43097537755966187, + "logps/chosen": -57.96190643310547, + "logps/rejected": -82.02449035644531, + "loss": 0.6822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7427499294281006, + "rewards/margins": 4.799098968505859, + "rewards/rejected": -2.056349039077759, + "step": 6021 + }, + { + "epoch": 1.51, + "grad_norm": 3.281877040863037, + "learning_rate": 7.92205005984192e-06, + "logits/chosen": -0.341439813375473, + "logits/rejected": -0.48743578791618347, + "logps/chosen": -60.7421760559082, + "logps/rejected": -94.6081771850586, + "loss": 0.5976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.875849485397339, + "rewards/margins": 5.849007606506348, + "rewards/rejected": -2.9731576442718506, + "step": 6022 + }, + { + "epoch": 1.51, + "grad_norm": 9.1334867477417, + "learning_rate": 7.921412227358513e-06, + "logits/chosen": -0.3439941704273224, + "logits/rejected": -0.3998143970966339, + "logps/chosen": -51.306053161621094, + "logps/rejected": -86.76457214355469, + "loss": 0.5823, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0697433948516846, + "rewards/margins": 4.284125328063965, + "rewards/rejected": -1.2143819332122803, + "step": 6023 + }, + { + "epoch": 1.51, + "grad_norm": 4.3785905838012695, + "learning_rate": 7.920774322683902e-06, + "logits/chosen": -0.29602140188217163, + "logits/rejected": -0.37312012910842896, + "logps/chosen": -63.333736419677734, + "logps/rejected": -88.7856674194336, + "loss": 0.756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.895935297012329, + "rewards/margins": 4.207092761993408, + "rewards/rejected": -1.3111575841903687, + "step": 6024 + }, + { + "epoch": 1.51, + "grad_norm": 3.952855110168457, + "learning_rate": 7.920136345833851e-06, + "logits/chosen": -0.353899747133255, + "logits/rejected": -0.4488750100135803, + "logps/chosen": -59.91505813598633, + "logps/rejected": -83.65804290771484, + "loss": 0.7526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.858538866043091, + "rewards/margins": 5.204869747161865, + "rewards/rejected": -2.3463311195373535, + "step": 6025 + }, + { + "epoch": 1.51, + "grad_norm": 4.959348201751709, + "learning_rate": 7.919498296824127e-06, + "logits/chosen": -0.3046959936618805, + "logits/rejected": -0.41565877199172974, + "logps/chosen": -56.27259826660156, + "logps/rejected": -87.51817321777344, + "loss": 0.7542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6738243103027344, + "rewards/margins": 4.857026100158691, + "rewards/rejected": -2.183201789855957, + "step": 6026 + }, + { + "epoch": 1.51, + "grad_norm": 12.29484748840332, + "learning_rate": 7.918860175670493e-06, + "logits/chosen": -0.3158356547355652, + "logits/rejected": -0.37182021141052246, + "logps/chosen": -57.947669982910156, + "logps/rejected": -84.87002563476562, + "loss": 0.7416, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7533419132232666, + "rewards/margins": 4.448728561401367, + "rewards/rejected": -1.695386290550232, + "step": 6027 + }, + { + "epoch": 1.51, + "grad_norm": 6.7857160568237305, + "learning_rate": 7.918221982388718e-06, + "logits/chosen": -0.3361622989177704, + "logits/rejected": -0.4298500716686249, + "logps/chosen": -56.732723236083984, + "logps/rejected": -96.38524627685547, + "loss": 0.6603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7295196056365967, + "rewards/margins": 5.660238742828369, + "rewards/rejected": -2.9307193756103516, + "step": 6028 + }, + { + "epoch": 1.51, + "grad_norm": 6.466341018676758, + "learning_rate": 7.917583716994578e-06, + "logits/chosen": -0.3055264353752136, + "logits/rejected": -0.37608370184898376, + "logps/chosen": -71.4158706665039, + "logps/rejected": -83.62943267822266, + "loss": 0.798, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.914604663848877, + "rewards/margins": 4.588142395019531, + "rewards/rejected": -1.6735374927520752, + "step": 6029 + }, + { + "epoch": 1.51, + "grad_norm": 2.2894809246063232, + "learning_rate": 7.916945379503837e-06, + "logits/chosen": -0.37349069118499756, + "logits/rejected": -0.4759491980075836, + "logps/chosen": -50.766056060791016, + "logps/rejected": -82.86131286621094, + "loss": 0.6092, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2151787281036377, + "rewards/margins": 5.528356075286865, + "rewards/rejected": -2.3131771087646484, + "step": 6030 + }, + { + "epoch": 1.51, + "grad_norm": 2.683474063873291, + "learning_rate": 7.916306969932275e-06, + "logits/chosen": -0.33382800221443176, + "logits/rejected": -0.4267694056034088, + "logps/chosen": -57.22345733642578, + "logps/rejected": -89.41993713378906, + "loss": 0.5909, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0580809116363525, + "rewards/margins": 5.313621997833252, + "rewards/rejected": -2.2555410861968994, + "step": 6031 + }, + { + "epoch": 1.51, + "grad_norm": 5.090240955352783, + "learning_rate": 7.915668488295665e-06, + "logits/chosen": -0.3964858651161194, + "logits/rejected": -0.49268150329589844, + "logps/chosen": -54.10725784301758, + "logps/rejected": -81.07295227050781, + "loss": 0.6977, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9138917922973633, + "rewards/margins": 5.200370788574219, + "rewards/rejected": -2.2864794731140137, + "step": 6032 + }, + { + "epoch": 1.51, + "grad_norm": 3.433047294616699, + "learning_rate": 7.915029934609787e-06, + "logits/chosen": -0.3739737868309021, + "logits/rejected": -0.49681130051612854, + "logps/chosen": -56.814292907714844, + "logps/rejected": -92.38545227050781, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9452338218688965, + "rewards/margins": 5.541363716125488, + "rewards/rejected": -2.59613037109375, + "step": 6033 + }, + { + "epoch": 1.51, + "grad_norm": 2.8351359367370605, + "learning_rate": 7.914391308890414e-06, + "logits/chosen": -0.4192069172859192, + "logits/rejected": -0.549527645111084, + "logps/chosen": -50.7679443359375, + "logps/rejected": -84.31005859375, + "loss": 0.6735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.171794891357422, + "rewards/margins": 5.706902027130127, + "rewards/rejected": -2.535106658935547, + "step": 6034 + }, + { + "epoch": 1.51, + "grad_norm": 12.356879234313965, + "learning_rate": 7.913752611153337e-06, + "logits/chosen": -0.3571486473083496, + "logits/rejected": -0.47755172848701477, + "logps/chosen": -62.58634567260742, + "logps/rejected": -77.55369567871094, + "loss": 0.7664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7411816120147705, + "rewards/margins": 4.388579368591309, + "rewards/rejected": -1.6473973989486694, + "step": 6035 + }, + { + "epoch": 1.51, + "grad_norm": 4.24580192565918, + "learning_rate": 7.913113841414333e-06, + "logits/chosen": -0.37340405583381653, + "logits/rejected": -0.4300925135612488, + "logps/chosen": -58.87377166748047, + "logps/rejected": -101.04124450683594, + "loss": 0.7082, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.920184850692749, + "rewards/margins": 4.496978282928467, + "rewards/rejected": -1.5767933130264282, + "step": 6036 + }, + { + "epoch": 1.51, + "grad_norm": 5.259593963623047, + "learning_rate": 7.912474999689185e-06, + "logits/chosen": -0.28026071190834045, + "logits/rejected": -0.41641104221343994, + "logps/chosen": -60.00269317626953, + "logps/rejected": -84.14476013183594, + "loss": 0.6768, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8595356941223145, + "rewards/margins": 5.424270153045654, + "rewards/rejected": -2.564735174179077, + "step": 6037 + }, + { + "epoch": 1.51, + "grad_norm": 7.290791034698486, + "learning_rate": 7.911836085993685e-06, + "logits/chosen": -0.3873322606086731, + "logits/rejected": -0.48813527822494507, + "logps/chosen": -67.56688690185547, + "logps/rejected": -89.3793716430664, + "loss": 0.8424, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9176220893859863, + "rewards/margins": 4.351393699645996, + "rewards/rejected": -1.4337713718414307, + "step": 6038 + }, + { + "epoch": 1.51, + "grad_norm": 5.273187160491943, + "learning_rate": 7.911197100343616e-06, + "logits/chosen": -0.4126760959625244, + "logits/rejected": -0.4888048470020294, + "logps/chosen": -47.658451080322266, + "logps/rejected": -104.88722229003906, + "loss": 0.6431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.301048994064331, + "rewards/margins": 5.763579845428467, + "rewards/rejected": -2.462531089782715, + "step": 6039 + }, + { + "epoch": 1.51, + "grad_norm": 3.490165948867798, + "learning_rate": 7.910558042754771e-06, + "logits/chosen": -0.3823596239089966, + "logits/rejected": -0.499507337808609, + "logps/chosen": -56.22515869140625, + "logps/rejected": -85.57395935058594, + "loss": 0.6784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.833573341369629, + "rewards/margins": 5.214605331420898, + "rewards/rejected": -2.3810322284698486, + "step": 6040 + }, + { + "epoch": 1.51, + "grad_norm": 10.443175315856934, + "learning_rate": 7.90991891324294e-06, + "logits/chosen": -0.37127405405044556, + "logits/rejected": -0.37572404742240906, + "logps/chosen": -51.32375717163086, + "logps/rejected": -93.80321502685547, + "loss": 0.7903, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.828798770904541, + "rewards/margins": 4.463037490844727, + "rewards/rejected": -1.634238839149475, + "step": 6041 + }, + { + "epoch": 1.51, + "grad_norm": 5.674060344696045, + "learning_rate": 7.909279711823919e-06, + "logits/chosen": -0.4207596480846405, + "logits/rejected": -0.47503143548965454, + "logps/chosen": -60.36859893798828, + "logps/rejected": -93.85155487060547, + "loss": 0.7559, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.773818254470825, + "rewards/margins": 5.549213409423828, + "rewards/rejected": -2.775395154953003, + "step": 6042 + }, + { + "epoch": 1.51, + "grad_norm": 3.062678575515747, + "learning_rate": 7.908640438513499e-06, + "logits/chosen": -0.3272187113761902, + "logits/rejected": -0.44628575444221497, + "logps/chosen": -58.81883239746094, + "logps/rejected": -87.1598892211914, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2426469326019287, + "rewards/margins": 5.708823204040527, + "rewards/rejected": -2.4661755561828613, + "step": 6043 + }, + { + "epoch": 1.51, + "grad_norm": 4.055457592010498, + "learning_rate": 7.908001093327485e-06, + "logits/chosen": -0.3427828550338745, + "logits/rejected": -0.4462456703186035, + "logps/chosen": -55.16346740722656, + "logps/rejected": -98.87796783447266, + "loss": 0.6652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9058752059936523, + "rewards/margins": 6.189945697784424, + "rewards/rejected": -3.284069776535034, + "step": 6044 + }, + { + "epoch": 1.51, + "grad_norm": 10.284499168395996, + "learning_rate": 7.907361676281667e-06, + "logits/chosen": -0.4151920974254608, + "logits/rejected": -0.5321916937828064, + "logps/chosen": -55.51641845703125, + "logps/rejected": -75.04934692382812, + "loss": 0.7438, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.863126754760742, + "rewards/margins": 4.507974624633789, + "rewards/rejected": -1.644848108291626, + "step": 6045 + }, + { + "epoch": 1.51, + "grad_norm": 5.967092990875244, + "learning_rate": 7.90672218739185e-06, + "logits/chosen": -0.3526698350906372, + "logits/rejected": -0.46121230721473694, + "logps/chosen": -54.686073303222656, + "logps/rejected": -83.0813980102539, + "loss": 0.7226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.602922201156616, + "rewards/margins": 4.739367485046387, + "rewards/rejected": -2.1364452838897705, + "step": 6046 + }, + { + "epoch": 1.51, + "grad_norm": 2.7488791942596436, + "learning_rate": 7.906082626673837e-06, + "logits/chosen": -0.3273589313030243, + "logits/rejected": -0.4462573230266571, + "logps/chosen": -51.57568359375, + "logps/rejected": -83.93062591552734, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1079068183898926, + "rewards/margins": 4.875721454620361, + "rewards/rejected": -1.7678142786026, + "step": 6047 + }, + { + "epoch": 1.51, + "grad_norm": 6.719808578491211, + "learning_rate": 7.905442994143431e-06, + "logits/chosen": -0.3131519556045532, + "logits/rejected": -0.40705621242523193, + "logps/chosen": -70.43810272216797, + "logps/rejected": -97.20842742919922, + "loss": 0.7851, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.806715965270996, + "rewards/margins": 5.539975166320801, + "rewards/rejected": -2.7332587242126465, + "step": 6048 + }, + { + "epoch": 1.51, + "grad_norm": 29.789447784423828, + "learning_rate": 7.904803289816439e-06, + "logits/chosen": -0.4092088043689728, + "logits/rejected": -0.4773609936237335, + "logps/chosen": -59.10995101928711, + "logps/rejected": -83.25647735595703, + "loss": 0.9152, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7674400806427, + "rewards/margins": 4.306394577026367, + "rewards/rejected": -1.538954734802246, + "step": 6049 + }, + { + "epoch": 1.51, + "grad_norm": 22.431575775146484, + "learning_rate": 7.904163513708666e-06, + "logits/chosen": -0.3907592296600342, + "logits/rejected": -0.42893487215042114, + "logps/chosen": -52.11210632324219, + "logps/rejected": -85.6130142211914, + "loss": 0.7304, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.823861598968506, + "rewards/margins": 4.123175621032715, + "rewards/rejected": -1.2993141412734985, + "step": 6050 + }, + { + "epoch": 1.51, + "grad_norm": 15.0757474899292, + "learning_rate": 7.903523665835925e-06, + "logits/chosen": -0.37124398350715637, + "logits/rejected": -0.4460116922855377, + "logps/chosen": -57.106201171875, + "logps/rejected": -96.33352661132812, + "loss": 0.8631, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6843957901000977, + "rewards/margins": 4.998868942260742, + "rewards/rejected": -2.3144731521606445, + "step": 6051 + }, + { + "epoch": 1.51, + "grad_norm": 4.281836986541748, + "learning_rate": 7.902883746214024e-06, + "logits/chosen": -0.44474175572395325, + "logits/rejected": -0.515485405921936, + "logps/chosen": -55.58683395385742, + "logps/rejected": -89.12809753417969, + "loss": 0.6744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.778721570968628, + "rewards/margins": 5.170011520385742, + "rewards/rejected": -2.3912899494171143, + "step": 6052 + }, + { + "epoch": 1.51, + "grad_norm": 7.41178035736084, + "learning_rate": 7.902243754858781e-06, + "logits/chosen": -0.36575597524642944, + "logits/rejected": -0.5090591907501221, + "logps/chosen": -62.989810943603516, + "logps/rejected": -92.96882629394531, + "loss": 0.6807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7857468128204346, + "rewards/margins": 5.76236629486084, + "rewards/rejected": -2.976619243621826, + "step": 6053 + }, + { + "epoch": 1.51, + "grad_norm": 4.638927459716797, + "learning_rate": 7.901603691786007e-06, + "logits/chosen": -0.4552079737186432, + "logits/rejected": -0.5238950848579407, + "logps/chosen": -63.779296875, + "logps/rejected": -91.93791198730469, + "loss": 0.8506, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.958749771118164, + "rewards/margins": 5.170534133911133, + "rewards/rejected": -2.2117838859558105, + "step": 6054 + }, + { + "epoch": 1.51, + "grad_norm": 7.63701868057251, + "learning_rate": 7.900963557011519e-06, + "logits/chosen": -0.34835734963417053, + "logits/rejected": -0.4353681802749634, + "logps/chosen": -52.98377227783203, + "logps/rejected": -89.23088836669922, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.934251308441162, + "rewards/margins": 4.647594451904297, + "rewards/rejected": -1.7133426666259766, + "step": 6055 + }, + { + "epoch": 1.52, + "grad_norm": 4.361390113830566, + "learning_rate": 7.900323350551135e-06, + "logits/chosen": -0.3196619153022766, + "logits/rejected": -0.3889235854148865, + "logps/chosen": -54.22203826904297, + "logps/rejected": -94.76368713378906, + "loss": 0.679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.980126142501831, + "rewards/margins": 4.859096050262451, + "rewards/rejected": -1.8789701461791992, + "step": 6056 + }, + { + "epoch": 1.52, + "grad_norm": 3.347785472869873, + "learning_rate": 7.899683072420677e-06, + "logits/chosen": -0.3327910304069519, + "logits/rejected": -0.44931113719940186, + "logps/chosen": -66.3427734375, + "logps/rejected": -90.72969055175781, + "loss": 0.7004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934929370880127, + "rewards/margins": 5.584397315979004, + "rewards/rejected": -2.649467945098877, + "step": 6057 + }, + { + "epoch": 1.52, + "grad_norm": 7.019649982452393, + "learning_rate": 7.899042722635966e-06, + "logits/chosen": -0.3762895166873932, + "logits/rejected": -0.4916762113571167, + "logps/chosen": -57.044010162353516, + "logps/rejected": -85.90428161621094, + "loss": 0.7947, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.718006134033203, + "rewards/margins": 5.247483253479004, + "rewards/rejected": -2.5294768810272217, + "step": 6058 + }, + { + "epoch": 1.52, + "grad_norm": 6.603013038635254, + "learning_rate": 7.898402301212825e-06, + "logits/chosen": -0.3070659339427948, + "logits/rejected": -0.44966915249824524, + "logps/chosen": -74.97344970703125, + "logps/rejected": -92.01461791992188, + "loss": 0.8501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.780724048614502, + "rewards/margins": 5.373173713684082, + "rewards/rejected": -2.59244966506958, + "step": 6059 + }, + { + "epoch": 1.52, + "grad_norm": 13.415468215942383, + "learning_rate": 7.897761808167082e-06, + "logits/chosen": -0.4386771023273468, + "logits/rejected": -0.4840063452720642, + "logps/chosen": -48.348976135253906, + "logps/rejected": -79.5401840209961, + "loss": 0.829, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.709728956222534, + "rewards/margins": 3.779963731765747, + "rewards/rejected": -1.0702346563339233, + "step": 6060 + }, + { + "epoch": 1.52, + "grad_norm": 10.231919288635254, + "learning_rate": 7.897121243514561e-06, + "logits/chosen": -0.44607895612716675, + "logits/rejected": -0.5598374009132385, + "logps/chosen": -53.5177116394043, + "logps/rejected": -73.07917022705078, + "loss": 0.7361, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.847212553024292, + "rewards/margins": 4.263497829437256, + "rewards/rejected": -1.4162849187850952, + "step": 6061 + }, + { + "epoch": 1.52, + "grad_norm": 4.380921840667725, + "learning_rate": 7.896480607271093e-06, + "logits/chosen": -0.42944541573524475, + "logits/rejected": -0.5111230611801147, + "logps/chosen": -47.19557189941406, + "logps/rejected": -81.7225341796875, + "loss": 0.778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8743302822113037, + "rewards/margins": 4.77200984954834, + "rewards/rejected": -1.897679328918457, + "step": 6062 + }, + { + "epoch": 1.52, + "grad_norm": 7.8558197021484375, + "learning_rate": 7.89583989945251e-06, + "logits/chosen": -0.3525148630142212, + "logits/rejected": -0.45168328285217285, + "logps/chosen": -73.46512603759766, + "logps/rejected": -88.83059692382812, + "loss": 0.7983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3747148513793945, + "rewards/margins": 4.726334571838379, + "rewards/rejected": -2.3516199588775635, + "step": 6063 + }, + { + "epoch": 1.52, + "grad_norm": 4.502555847167969, + "learning_rate": 7.895199120074641e-06, + "logits/chosen": -0.3601301312446594, + "logits/rejected": -0.44680482149124146, + "logps/chosen": -63.59003829956055, + "logps/rejected": -87.80241394042969, + "loss": 0.8107, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.113274097442627, + "rewards/margins": 4.987193584442139, + "rewards/rejected": -1.8739193677902222, + "step": 6064 + }, + { + "epoch": 1.52, + "grad_norm": 8.278709411621094, + "learning_rate": 7.894558269153322e-06, + "logits/chosen": -0.35438740253448486, + "logits/rejected": -0.44912365078926086, + "logps/chosen": -65.55547332763672, + "logps/rejected": -73.10858154296875, + "loss": 0.9522, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6672675609588623, + "rewards/margins": 3.5491745471954346, + "rewards/rejected": -0.8819071650505066, + "step": 6065 + }, + { + "epoch": 1.52, + "grad_norm": 5.322830677032471, + "learning_rate": 7.89391734670439e-06, + "logits/chosen": -0.4192408323287964, + "logits/rejected": -0.47192996740341187, + "logps/chosen": -46.073265075683594, + "logps/rejected": -91.64347839355469, + "loss": 0.6537, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.959345579147339, + "rewards/margins": 4.820651531219482, + "rewards/rejected": -1.861305594444275, + "step": 6066 + }, + { + "epoch": 1.52, + "grad_norm": 4.622106075286865, + "learning_rate": 7.893276352743681e-06, + "logits/chosen": -0.3501184284687042, + "logits/rejected": -0.4274623692035675, + "logps/chosen": -50.25543975830078, + "logps/rejected": -88.45042419433594, + "loss": 0.7604, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.115344762802124, + "rewards/margins": 4.842493057250977, + "rewards/rejected": -1.7271490097045898, + "step": 6067 + }, + { + "epoch": 1.52, + "grad_norm": 3.529686689376831, + "learning_rate": 7.892635287287039e-06, + "logits/chosen": -0.35920992493629456, + "logits/rejected": -0.5403610467910767, + "logps/chosen": -66.25885772705078, + "logps/rejected": -89.92561340332031, + "loss": 0.7256, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.96809458732605, + "rewards/margins": 6.272310256958008, + "rewards/rejected": -3.304216146469116, + "step": 6068 + }, + { + "epoch": 1.52, + "grad_norm": 3.362473249435425, + "learning_rate": 7.8919941503503e-06, + "logits/chosen": -0.39780914783477783, + "logits/rejected": -0.46505752205848694, + "logps/chosen": -57.661102294921875, + "logps/rejected": -92.80714416503906, + "loss": 0.6627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.801771640777588, + "rewards/margins": 5.030542850494385, + "rewards/rejected": -2.228771209716797, + "step": 6069 + }, + { + "epoch": 1.52, + "grad_norm": 5.406474590301514, + "learning_rate": 7.89135294194931e-06, + "logits/chosen": -0.35695740580558777, + "logits/rejected": -0.4441863000392914, + "logps/chosen": -60.88298797607422, + "logps/rejected": -81.50163269042969, + "loss": 0.7619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6436665058135986, + "rewards/margins": 4.7291340827941895, + "rewards/rejected": -2.08546781539917, + "step": 6070 + }, + { + "epoch": 1.52, + "grad_norm": 6.744685173034668, + "learning_rate": 7.890711662099912e-06, + "logits/chosen": -0.21224918961524963, + "logits/rejected": -0.34993183612823486, + "logps/chosen": -54.99170684814453, + "logps/rejected": -88.65167236328125, + "loss": 0.737, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8758206367492676, + "rewards/margins": 5.059366226196289, + "rewards/rejected": -2.1835453510284424, + "step": 6071 + }, + { + "epoch": 1.52, + "grad_norm": 5.824431419372559, + "learning_rate": 7.890070310817956e-06, + "logits/chosen": -0.33573830127716064, + "logits/rejected": -0.47914138436317444, + "logps/chosen": -64.26778411865234, + "logps/rejected": -97.24160766601562, + "loss": 0.6507, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6903319358825684, + "rewards/margins": 4.999704837799072, + "rewards/rejected": -2.309372663497925, + "step": 6072 + }, + { + "epoch": 1.52, + "grad_norm": 9.877640724182129, + "learning_rate": 7.889428888119288e-06, + "logits/chosen": -0.33708077669143677, + "logits/rejected": -0.39395058155059814, + "logps/chosen": -68.585693359375, + "logps/rejected": -99.27163696289062, + "loss": 0.7797, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6724448204040527, + "rewards/margins": 4.603806495666504, + "rewards/rejected": -1.9313615560531616, + "step": 6073 + }, + { + "epoch": 1.52, + "grad_norm": 6.761311054229736, + "learning_rate": 7.88878739401976e-06, + "logits/chosen": -0.42510250210762024, + "logits/rejected": -0.5254225730895996, + "logps/chosen": -60.194190979003906, + "logps/rejected": -73.1280517578125, + "loss": 0.7379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.411360263824463, + "rewards/margins": 3.9416356086730957, + "rewards/rejected": -1.5302751064300537, + "step": 6074 + }, + { + "epoch": 1.52, + "grad_norm": 13.338915824890137, + "learning_rate": 7.888145828535221e-06, + "logits/chosen": -0.38388532400131226, + "logits/rejected": -0.4788777232170105, + "logps/chosen": -62.22092056274414, + "logps/rejected": -94.10005187988281, + "loss": 0.758, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7142465114593506, + "rewards/margins": 5.073300361633301, + "rewards/rejected": -2.3590543270111084, + "step": 6075 + }, + { + "epoch": 1.52, + "grad_norm": 16.220603942871094, + "learning_rate": 7.887504191681528e-06, + "logits/chosen": -0.4641735255718231, + "logits/rejected": -0.557571530342102, + "logps/chosen": -54.10034942626953, + "logps/rejected": -96.42643737792969, + "loss": 0.6455, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.87015438079834, + "rewards/margins": 6.29773473739624, + "rewards/rejected": -3.4275803565979004, + "step": 6076 + }, + { + "epoch": 1.52, + "grad_norm": 24.506534576416016, + "learning_rate": 7.886862483474534e-06, + "logits/chosen": -0.31038498878479004, + "logits/rejected": -0.4212700426578522, + "logps/chosen": -50.367156982421875, + "logps/rejected": -83.20848083496094, + "loss": 0.6738, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0044848918914795, + "rewards/margins": 4.739256858825684, + "rewards/rejected": -1.7347724437713623, + "step": 6077 + }, + { + "epoch": 1.52, + "grad_norm": 4.229964256286621, + "learning_rate": 7.886220703930099e-06, + "logits/chosen": -0.3780110478401184, + "logits/rejected": -0.46535733342170715, + "logps/chosen": -48.592979431152344, + "logps/rejected": -78.83357238769531, + "loss": 0.6788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8881635665893555, + "rewards/margins": 4.595214366912842, + "rewards/rejected": -1.7070506811141968, + "step": 6078 + }, + { + "epoch": 1.52, + "grad_norm": 6.149529457092285, + "learning_rate": 7.88557885306408e-06, + "logits/chosen": -0.46020346879959106, + "logits/rejected": -0.5468431711196899, + "logps/chosen": -53.86738586425781, + "logps/rejected": -85.13812255859375, + "loss": 0.6936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7409160137176514, + "rewards/margins": 5.112521171569824, + "rewards/rejected": -2.3716046810150146, + "step": 6079 + }, + { + "epoch": 1.52, + "grad_norm": 3.587540864944458, + "learning_rate": 7.884936930892338e-06, + "logits/chosen": -0.3293210566043854, + "logits/rejected": -0.43024778366088867, + "logps/chosen": -53.849159240722656, + "logps/rejected": -93.177490234375, + "loss": 0.6655, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.944945812225342, + "rewards/margins": 5.402950286865234, + "rewards/rejected": -2.4580042362213135, + "step": 6080 + }, + { + "epoch": 1.52, + "grad_norm": 4.2337212562561035, + "learning_rate": 7.884294937430735e-06, + "logits/chosen": -0.3996829390525818, + "logits/rejected": -0.42419520020484924, + "logps/chosen": -48.063499450683594, + "logps/rejected": -88.92021179199219, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.978087902069092, + "rewards/margins": 5.318309307098389, + "rewards/rejected": -2.3402209281921387, + "step": 6081 + }, + { + "epoch": 1.52, + "grad_norm": 4.464850902557373, + "learning_rate": 7.883652872695136e-06, + "logits/chosen": -0.31938284635543823, + "logits/rejected": -0.4598648250102997, + "logps/chosen": -57.864654541015625, + "logps/rejected": -75.00537872314453, + "loss": 0.7003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.348773717880249, + "rewards/margins": 4.734638214111328, + "rewards/rejected": -1.3858643770217896, + "step": 6082 + }, + { + "epoch": 1.52, + "grad_norm": 3.0100691318511963, + "learning_rate": 7.883010736701408e-06, + "logits/chosen": -0.41271546483039856, + "logits/rejected": -0.4593049883842468, + "logps/chosen": -51.115028381347656, + "logps/rejected": -98.26221466064453, + "loss": 0.7077, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.053245782852173, + "rewards/margins": 5.255753517150879, + "rewards/rejected": -2.2025084495544434, + "step": 6083 + }, + { + "epoch": 1.52, + "grad_norm": 4.211059093475342, + "learning_rate": 7.882368529465418e-06, + "logits/chosen": -0.27709612250328064, + "logits/rejected": -0.36066675186157227, + "logps/chosen": -61.319637298583984, + "logps/rejected": -83.12445068359375, + "loss": 0.7096, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0823564529418945, + "rewards/margins": 4.216864109039307, + "rewards/rejected": -1.134507656097412, + "step": 6084 + }, + { + "epoch": 1.52, + "grad_norm": 4.0145769119262695, + "learning_rate": 7.881726251003037e-06, + "logits/chosen": -0.3983282446861267, + "logits/rejected": -0.4244755506515503, + "logps/chosen": -52.107025146484375, + "logps/rejected": -95.256591796875, + "loss": 0.6847, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.822664499282837, + "rewards/margins": 4.674851417541504, + "rewards/rejected": -1.852186918258667, + "step": 6085 + }, + { + "epoch": 1.52, + "grad_norm": 12.524125099182129, + "learning_rate": 7.881083901330136e-06, + "logits/chosen": -0.35977399349212646, + "logits/rejected": -0.42820295691490173, + "logps/chosen": -59.58666229248047, + "logps/rejected": -86.1085433959961, + "loss": 0.7843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7129383087158203, + "rewards/margins": 4.250270843505859, + "rewards/rejected": -1.537332534790039, + "step": 6086 + }, + { + "epoch": 1.52, + "grad_norm": 4.472093105316162, + "learning_rate": 7.880441480462584e-06, + "logits/chosen": -0.3397075831890106, + "logits/rejected": -0.45767319202423096, + "logps/chosen": -66.820556640625, + "logps/rejected": -86.50623321533203, + "loss": 0.7617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.189182758331299, + "rewards/margins": 4.787786960601807, + "rewards/rejected": -1.5986042022705078, + "step": 6087 + }, + { + "epoch": 1.52, + "grad_norm": 3.108258008956909, + "learning_rate": 7.879798988416261e-06, + "logits/chosen": -0.3230000138282776, + "logits/rejected": -0.42188090085983276, + "logps/chosen": -53.76335144042969, + "logps/rejected": -86.91195678710938, + "loss": 0.698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0222997665405273, + "rewards/margins": 4.787688732147217, + "rewards/rejected": -1.765389323234558, + "step": 6088 + }, + { + "epoch": 1.52, + "grad_norm": 6.921387672424316, + "learning_rate": 7.879156425207042e-06, + "logits/chosen": -0.421228289604187, + "logits/rejected": -0.4973434507846832, + "logps/chosen": -43.25541687011719, + "logps/rejected": -86.68299865722656, + "loss": 0.6613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.921396493911743, + "rewards/margins": 5.600117206573486, + "rewards/rejected": -2.678720712661743, + "step": 6089 + }, + { + "epoch": 1.52, + "grad_norm": 7.970170021057129, + "learning_rate": 7.878513790850805e-06, + "logits/chosen": -0.3509517014026642, + "logits/rejected": -0.4719621241092682, + "logps/chosen": -66.12980651855469, + "logps/rejected": -71.448974609375, + "loss": 0.8817, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.688333034515381, + "rewards/margins": 4.50601863861084, + "rewards/rejected": -1.8176859617233276, + "step": 6090 + }, + { + "epoch": 1.52, + "grad_norm": 2.6546730995178223, + "learning_rate": 7.877871085363432e-06, + "logits/chosen": -0.41836974024772644, + "logits/rejected": -0.5135893225669861, + "logps/chosen": -63.79197692871094, + "logps/rejected": -77.12124633789062, + "loss": 0.6406, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.874906301498413, + "rewards/margins": 4.860288619995117, + "rewards/rejected": -1.9853819608688354, + "step": 6091 + }, + { + "epoch": 1.52, + "grad_norm": 7.458629608154297, + "learning_rate": 7.877228308760802e-06, + "logits/chosen": -0.2964629828929901, + "logits/rejected": -0.389077365398407, + "logps/chosen": -59.895606994628906, + "logps/rejected": -89.8766860961914, + "loss": 0.67, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6864893436431885, + "rewards/margins": 4.451531887054443, + "rewards/rejected": -1.765042781829834, + "step": 6092 + }, + { + "epoch": 1.52, + "grad_norm": 10.5518217086792, + "learning_rate": 7.876585461058799e-06, + "logits/chosen": -0.4171282947063446, + "logits/rejected": -0.5261204242706299, + "logps/chosen": -54.654781341552734, + "logps/rejected": -83.14912414550781, + "loss": 0.6681, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.133108139038086, + "rewards/margins": 5.730435371398926, + "rewards/rejected": -2.5973265171051025, + "step": 6093 + }, + { + "epoch": 1.52, + "grad_norm": 4.777942657470703, + "learning_rate": 7.875942542273312e-06, + "logits/chosen": -0.2749886214733124, + "logits/rejected": -0.36023688316345215, + "logps/chosen": -57.243804931640625, + "logps/rejected": -89.26415252685547, + "loss": 0.7071, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4820852279663086, + "rewards/margins": 4.852534770965576, + "rewards/rejected": -1.3704496622085571, + "step": 6094 + }, + { + "epoch": 1.52, + "grad_norm": 5.783993721008301, + "learning_rate": 7.875299552420224e-06, + "logits/chosen": -0.3291723132133484, + "logits/rejected": -0.47134929895401, + "logps/chosen": -69.61233520507812, + "logps/rejected": -84.22445678710938, + "loss": 0.7951, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7420601844787598, + "rewards/margins": 4.857170104980469, + "rewards/rejected": -2.115110397338867, + "step": 6095 + }, + { + "epoch": 1.53, + "grad_norm": 6.607781410217285, + "learning_rate": 7.874656491515427e-06, + "logits/chosen": -0.48558610677719116, + "logits/rejected": -0.5823012590408325, + "logps/chosen": -55.1158447265625, + "logps/rejected": -85.5984878540039, + "loss": 0.7743, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.824573278427124, + "rewards/margins": 5.126404762268066, + "rewards/rejected": -2.3018312454223633, + "step": 6096 + }, + { + "epoch": 1.53, + "grad_norm": 4.11317777633667, + "learning_rate": 7.874013359574809e-06, + "logits/chosen": -0.3868769407272339, + "logits/rejected": -0.4517045021057129, + "logps/chosen": -54.831329345703125, + "logps/rejected": -89.32056427001953, + "loss": 0.7732, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8108606338500977, + "rewards/margins": 5.272219657897949, + "rewards/rejected": -2.4613585472106934, + "step": 6097 + }, + { + "epoch": 1.53, + "grad_norm": 5.373519420623779, + "learning_rate": 7.873370156614265e-06, + "logits/chosen": -0.3105248212814331, + "logits/rejected": -0.43261948227882385, + "logps/chosen": -56.415916442871094, + "logps/rejected": -77.81539154052734, + "loss": 0.6438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9178433418273926, + "rewards/margins": 4.897228240966797, + "rewards/rejected": -1.9793843030929565, + "step": 6098 + }, + { + "epoch": 1.53, + "grad_norm": 6.4722676277160645, + "learning_rate": 7.872726882649688e-06, + "logits/chosen": -0.3906928598880768, + "logits/rejected": -0.4683176279067993, + "logps/chosen": -42.36351013183594, + "logps/rejected": -85.40077209472656, + "loss": 0.7598, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0525567531585693, + "rewards/margins": 4.642333984375, + "rewards/rejected": -1.5897774696350098, + "step": 6099 + }, + { + "epoch": 1.53, + "grad_norm": 4.95768928527832, + "learning_rate": 7.872083537696975e-06, + "logits/chosen": -0.35911548137664795, + "logits/rejected": -0.5124202370643616, + "logps/chosen": -69.6037368774414, + "logps/rejected": -76.25924682617188, + "loss": 0.7688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.453789710998535, + "rewards/margins": 4.596756458282471, + "rewards/rejected": -2.1429672241210938, + "step": 6100 + }, + { + "epoch": 1.53, + "grad_norm": 5.157167434692383, + "learning_rate": 7.871440121772021e-06, + "logits/chosen": -0.32279008626937866, + "logits/rejected": -0.464192271232605, + "logps/chosen": -65.95279693603516, + "logps/rejected": -82.07029724121094, + "loss": 0.7262, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0696449279785156, + "rewards/margins": 4.655206203460693, + "rewards/rejected": -1.5855613946914673, + "step": 6101 + }, + { + "epoch": 1.53, + "grad_norm": 4.824919700622559, + "learning_rate": 7.87079663489073e-06, + "logits/chosen": -0.35567739605903625, + "logits/rejected": -0.4356030821800232, + "logps/chosen": -58.11528015136719, + "logps/rejected": -92.57202911376953, + "loss": 0.7071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.69612455368042, + "rewards/margins": 5.009703159332275, + "rewards/rejected": -2.3135788440704346, + "step": 6102 + }, + { + "epoch": 1.53, + "grad_norm": 4.074745178222656, + "learning_rate": 7.870153077068998e-06, + "logits/chosen": -0.3483002781867981, + "logits/rejected": -0.46874362230300903, + "logps/chosen": -54.00045394897461, + "logps/rejected": -90.68970489501953, + "loss": 0.636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6292688846588135, + "rewards/margins": 5.09586238861084, + "rewards/rejected": -2.4665932655334473, + "step": 6103 + }, + { + "epoch": 1.53, + "grad_norm": 2.785916566848755, + "learning_rate": 7.869509448322732e-06, + "logits/chosen": -0.3062174916267395, + "logits/rejected": -0.3545767366886139, + "logps/chosen": -63.75974655151367, + "logps/rejected": -87.60042572021484, + "loss": 0.7123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1147453784942627, + "rewards/margins": 5.015567302703857, + "rewards/rejected": -1.9008219242095947, + "step": 6104 + }, + { + "epoch": 1.53, + "grad_norm": 17.739770889282227, + "learning_rate": 7.868865748667836e-06, + "logits/chosen": -0.37226688861846924, + "logits/rejected": -0.5017633438110352, + "logps/chosen": -60.86260986328125, + "logps/rejected": -81.79515075683594, + "loss": 0.7468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.464157819747925, + "rewards/margins": 5.558579444885254, + "rewards/rejected": -3.09442138671875, + "step": 6105 + }, + { + "epoch": 1.53, + "grad_norm": 5.488092422485352, + "learning_rate": 7.868221978120215e-06, + "logits/chosen": -0.4513852596282959, + "logits/rejected": -0.480905681848526, + "logps/chosen": -46.66543197631836, + "logps/rejected": -105.82585144042969, + "loss": 0.7502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8981258869171143, + "rewards/margins": 5.585603713989258, + "rewards/rejected": -2.6874778270721436, + "step": 6106 + }, + { + "epoch": 1.53, + "grad_norm": 4.990548610687256, + "learning_rate": 7.867578136695777e-06, + "logits/chosen": -0.3077930510044098, + "logits/rejected": -0.4712543189525604, + "logps/chosen": -54.557762145996094, + "logps/rejected": -77.40137481689453, + "loss": 0.6215, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0520739555358887, + "rewards/margins": 5.748713493347168, + "rewards/rejected": -2.6966395378112793, + "step": 6107 + }, + { + "epoch": 1.53, + "grad_norm": 10.475726127624512, + "learning_rate": 7.866934224410435e-06, + "logits/chosen": -0.3324671685695648, + "logits/rejected": -0.38909029960632324, + "logps/chosen": -66.19149780273438, + "logps/rejected": -83.30418395996094, + "loss": 0.746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8743550777435303, + "rewards/margins": 4.566643714904785, + "rewards/rejected": -1.6922881603240967, + "step": 6108 + }, + { + "epoch": 1.53, + "grad_norm": 7.495207786560059, + "learning_rate": 7.866290241280097e-06, + "logits/chosen": -0.3622388243675232, + "logits/rejected": -0.4669976830482483, + "logps/chosen": -58.091949462890625, + "logps/rejected": -91.94949340820312, + "loss": 0.6358, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9047694206237793, + "rewards/margins": 6.20011568069458, + "rewards/rejected": -3.295346260070801, + "step": 6109 + }, + { + "epoch": 1.53, + "grad_norm": 6.4508843421936035, + "learning_rate": 7.86564618732068e-06, + "logits/chosen": -0.401444673538208, + "logits/rejected": -0.4999030828475952, + "logps/chosen": -59.758419036865234, + "logps/rejected": -94.52403259277344, + "loss": 0.792, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7463667392730713, + "rewards/margins": 5.805449962615967, + "rewards/rejected": -3.0590832233428955, + "step": 6110 + }, + { + "epoch": 1.53, + "grad_norm": 18.7758731842041, + "learning_rate": 7.865002062548098e-06, + "logits/chosen": -0.41845300793647766, + "logits/rejected": -0.5207908153533936, + "logps/chosen": -51.289329528808594, + "logps/rejected": -89.9617919921875, + "loss": 0.6844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8921689987182617, + "rewards/margins": 5.552793025970459, + "rewards/rejected": -2.6606242656707764, + "step": 6111 + }, + { + "epoch": 1.53, + "grad_norm": 6.25104284286499, + "learning_rate": 7.864357866978267e-06, + "logits/chosen": -0.37345588207244873, + "logits/rejected": -0.514121413230896, + "logps/chosen": -61.49292755126953, + "logps/rejected": -102.93603515625, + "loss": 0.7486, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7849507331848145, + "rewards/margins": 5.164667129516602, + "rewards/rejected": -2.379716396331787, + "step": 6112 + }, + { + "epoch": 1.53, + "grad_norm": 17.157705307006836, + "learning_rate": 7.863713600627105e-06, + "logits/chosen": -0.38896891474723816, + "logits/rejected": -0.4330679178237915, + "logps/chosen": -50.63739776611328, + "logps/rejected": -93.50068664550781, + "loss": 0.7664, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.785008192062378, + "rewards/margins": 5.329341888427734, + "rewards/rejected": -2.5443344116210938, + "step": 6113 + }, + { + "epoch": 1.53, + "grad_norm": 1.884893774986267, + "learning_rate": 7.863069263510537e-06, + "logits/chosen": -0.30559611320495605, + "logits/rejected": -0.4767321050167084, + "logps/chosen": -61.225643157958984, + "logps/rejected": -87.20143127441406, + "loss": 0.5643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9656810760498047, + "rewards/margins": 6.0452752113342285, + "rewards/rejected": -3.0795938968658447, + "step": 6114 + }, + { + "epoch": 1.53, + "grad_norm": 6.382323741912842, + "learning_rate": 7.862424855644479e-06, + "logits/chosen": -0.21891537308692932, + "logits/rejected": -0.3314272165298462, + "logps/chosen": -59.17702865600586, + "logps/rejected": -86.01885986328125, + "loss": 0.703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8832905292510986, + "rewards/margins": 4.8215436935424805, + "rewards/rejected": -1.9382535219192505, + "step": 6115 + }, + { + "epoch": 1.53, + "grad_norm": 17.408357620239258, + "learning_rate": 7.861780377044862e-06, + "logits/chosen": -0.34363946318626404, + "logits/rejected": -0.48044154047966003, + "logps/chosen": -65.69107818603516, + "logps/rejected": -76.59550476074219, + "loss": 1.0521, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3327724933624268, + "rewards/margins": 4.652282238006592, + "rewards/rejected": -2.319509506225586, + "step": 6116 + }, + { + "epoch": 1.53, + "grad_norm": 6.694767951965332, + "learning_rate": 7.861135827727603e-06, + "logits/chosen": -0.388708233833313, + "logits/rejected": -0.4886740446090698, + "logps/chosen": -58.36858367919922, + "logps/rejected": -84.43930053710938, + "loss": 0.8034, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7019336223602295, + "rewards/margins": 5.060275077819824, + "rewards/rejected": -2.3583412170410156, + "step": 6117 + }, + { + "epoch": 1.53, + "grad_norm": 8.646409034729004, + "learning_rate": 7.860491207708637e-06, + "logits/chosen": -0.3746155798435211, + "logits/rejected": -0.4690846800804138, + "logps/chosen": -56.91105651855469, + "logps/rejected": -82.89028930664062, + "loss": 0.877, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6752331256866455, + "rewards/margins": 4.525077819824219, + "rewards/rejected": -1.8498449325561523, + "step": 6118 + }, + { + "epoch": 1.53, + "grad_norm": 13.071584701538086, + "learning_rate": 7.859846517003893e-06, + "logits/chosen": -0.3875048756599426, + "logits/rejected": -0.4194268584251404, + "logps/chosen": -63.061580657958984, + "logps/rejected": -91.99615478515625, + "loss": 1.061, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.680953025817871, + "rewards/margins": 4.072813510894775, + "rewards/rejected": -1.3918601274490356, + "step": 6119 + }, + { + "epoch": 1.53, + "grad_norm": 3.4814951419830322, + "learning_rate": 7.859201755629297e-06, + "logits/chosen": -0.36576586961746216, + "logits/rejected": -0.38729506731033325, + "logps/chosen": -57.391380310058594, + "logps/rejected": -112.98280334472656, + "loss": 0.6233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.980854034423828, + "rewards/margins": 6.469583988189697, + "rewards/rejected": -3.48872971534729, + "step": 6120 + }, + { + "epoch": 1.53, + "grad_norm": 5.450709819793701, + "learning_rate": 7.858556923600785e-06, + "logits/chosen": -0.3835966885089874, + "logits/rejected": -0.4927787482738495, + "logps/chosen": -55.564910888671875, + "logps/rejected": -80.08047485351562, + "loss": 0.8181, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.927966833114624, + "rewards/margins": 3.6213057041168213, + "rewards/rejected": -0.6933388113975525, + "step": 6121 + }, + { + "epoch": 1.53, + "grad_norm": 5.026520729064941, + "learning_rate": 7.857912020934291e-06, + "logits/chosen": -0.31307244300842285, + "logits/rejected": -0.3739418685436249, + "logps/chosen": -61.97380828857422, + "logps/rejected": -93.39873504638672, + "loss": 0.7913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.550398349761963, + "rewards/margins": 4.450113773345947, + "rewards/rejected": -1.8997154235839844, + "step": 6122 + }, + { + "epoch": 1.53, + "grad_norm": 7.023351192474365, + "learning_rate": 7.85726704764575e-06, + "logits/chosen": -0.4565882086753845, + "logits/rejected": -0.5546426177024841, + "logps/chosen": -53.523780822753906, + "logps/rejected": -70.86530303955078, + "loss": 0.9077, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.6476101875305176, + "rewards/margins": 4.117887496948242, + "rewards/rejected": -1.470277190208435, + "step": 6123 + }, + { + "epoch": 1.53, + "grad_norm": 4.83855676651001, + "learning_rate": 7.856622003751105e-06, + "logits/chosen": -0.3245546221733093, + "logits/rejected": -0.45899540185928345, + "logps/chosen": -55.06184387207031, + "logps/rejected": -89.70899200439453, + "loss": 0.6681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.88037109375, + "rewards/margins": 5.4126296043396, + "rewards/rejected": -2.5322580337524414, + "step": 6124 + }, + { + "epoch": 1.53, + "grad_norm": 7.49248743057251, + "learning_rate": 7.855976889266288e-06, + "logits/chosen": -0.37524574995040894, + "logits/rejected": -0.4431617259979248, + "logps/chosen": -59.807579040527344, + "logps/rejected": -79.79969024658203, + "loss": 0.821, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.972938299179077, + "rewards/margins": 4.372219085693359, + "rewards/rejected": -1.3992801904678345, + "step": 6125 + }, + { + "epoch": 1.53, + "grad_norm": 3.412017583847046, + "learning_rate": 7.855331704207245e-06, + "logits/chosen": -0.38150960206985474, + "logits/rejected": -0.48971182107925415, + "logps/chosen": -49.864013671875, + "logps/rejected": -88.07527923583984, + "loss": 0.7183, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9154481887817383, + "rewards/margins": 5.172083854675293, + "rewards/rejected": -2.2566354274749756, + "step": 6126 + }, + { + "epoch": 1.53, + "grad_norm": 5.625334739685059, + "learning_rate": 7.854686448589919e-06, + "logits/chosen": -0.32899558544158936, + "logits/rejected": -0.4703929126262665, + "logps/chosen": -57.44980239868164, + "logps/rejected": -65.66088104248047, + "loss": 0.6614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0588061809539795, + "rewards/margins": 4.246520042419434, + "rewards/rejected": -1.1877140998840332, + "step": 6127 + }, + { + "epoch": 1.53, + "grad_norm": 6.701476097106934, + "learning_rate": 7.854041122430254e-06, + "logits/chosen": -0.3768199384212494, + "logits/rejected": -0.4213382601737976, + "logps/chosen": -48.39678955078125, + "logps/rejected": -91.24238586425781, + "loss": 0.8223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83807635307312, + "rewards/margins": 4.45108699798584, + "rewards/rejected": -1.6130104064941406, + "step": 6128 + }, + { + "epoch": 1.53, + "grad_norm": 4.218358039855957, + "learning_rate": 7.853395725744199e-06, + "logits/chosen": -0.3613643944263458, + "logits/rejected": -0.43639278411865234, + "logps/chosen": -56.06998062133789, + "logps/rejected": -90.85574340820312, + "loss": 0.7705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.828439712524414, + "rewards/margins": 4.794116973876953, + "rewards/rejected": -1.9656777381896973, + "step": 6129 + }, + { + "epoch": 1.53, + "grad_norm": 3.076014280319214, + "learning_rate": 7.852750258547698e-06, + "logits/chosen": -0.35200557112693787, + "logits/rejected": -0.5128360986709595, + "logps/chosen": -67.2129135131836, + "logps/rejected": -73.94241333007812, + "loss": 0.715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2385072708129883, + "rewards/margins": 5.425123691558838, + "rewards/rejected": -2.1866164207458496, + "step": 6130 + }, + { + "epoch": 1.53, + "grad_norm": 5.844913005828857, + "learning_rate": 7.852104720856705e-06, + "logits/chosen": -0.3178894519805908, + "logits/rejected": -0.4225008189678192, + "logps/chosen": -60.49203109741211, + "logps/rejected": -75.0684814453125, + "loss": 0.8741, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.03962779045105, + "rewards/margins": 3.948471784591675, + "rewards/rejected": -0.9088439345359802, + "step": 6131 + }, + { + "epoch": 1.53, + "grad_norm": 8.610074996948242, + "learning_rate": 7.85145911268717e-06, + "logits/chosen": -0.33126693964004517, + "logits/rejected": -0.45106860995292664, + "logps/chosen": -56.47220230102539, + "logps/rejected": -75.09587097167969, + "loss": 0.7641, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9178152084350586, + "rewards/margins": 4.7895917892456055, + "rewards/rejected": -1.8717762231826782, + "step": 6132 + }, + { + "epoch": 1.53, + "grad_norm": 16.618202209472656, + "learning_rate": 7.85081343405505e-06, + "logits/chosen": -0.29158198833465576, + "logits/rejected": -0.3271896243095398, + "logps/chosen": -54.7791633605957, + "logps/rejected": -93.09587097167969, + "loss": 0.8777, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.901106357574463, + "rewards/margins": 4.429840564727783, + "rewards/rejected": -1.5287340879440308, + "step": 6133 + }, + { + "epoch": 1.53, + "grad_norm": 5.990093231201172, + "learning_rate": 7.850167684976294e-06, + "logits/chosen": -0.31685954332351685, + "logits/rejected": -0.49154964089393616, + "logps/chosen": -65.21612548828125, + "logps/rejected": -86.48341369628906, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.602902412414551, + "rewards/margins": 4.576199531555176, + "rewards/rejected": -1.9732967615127563, + "step": 6134 + }, + { + "epoch": 1.53, + "grad_norm": 12.583422660827637, + "learning_rate": 7.849521865466865e-06, + "logits/chosen": -0.2958727777004242, + "logits/rejected": -0.3644145429134369, + "logps/chosen": -74.92301177978516, + "logps/rejected": -95.82354736328125, + "loss": 0.8545, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.507638692855835, + "rewards/margins": 4.406575679779053, + "rewards/rejected": -1.8989372253417969, + "step": 6135 + }, + { + "epoch": 1.54, + "grad_norm": 6.7584943771362305, + "learning_rate": 7.84887597554272e-06, + "logits/chosen": -0.36800384521484375, + "logits/rejected": -0.4511064291000366, + "logps/chosen": -48.040138244628906, + "logps/rejected": -92.16287994384766, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0679969787597656, + "rewards/margins": 5.823073863983154, + "rewards/rejected": -2.7550768852233887, + "step": 6136 + }, + { + "epoch": 1.54, + "grad_norm": 4.827887535095215, + "learning_rate": 7.84823001521982e-06, + "logits/chosen": -0.28782224655151367, + "logits/rejected": -0.41360679268836975, + "logps/chosen": -56.5118293762207, + "logps/rejected": -85.67939758300781, + "loss": 0.7053, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9373135566711426, + "rewards/margins": 5.123994827270508, + "rewards/rejected": -2.1866812705993652, + "step": 6137 + }, + { + "epoch": 1.54, + "grad_norm": 7.319167137145996, + "learning_rate": 7.847583984514126e-06, + "logits/chosen": -0.384451687335968, + "logits/rejected": -0.4962143003940582, + "logps/chosen": -53.85836410522461, + "logps/rejected": -80.94473266601562, + "loss": 0.9302, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6647133827209473, + "rewards/margins": 3.8554649353027344, + "rewards/rejected": -1.1907514333724976, + "step": 6138 + }, + { + "epoch": 1.54, + "grad_norm": 13.640556335449219, + "learning_rate": 7.8469378834416e-06, + "logits/chosen": -0.29420921206474304, + "logits/rejected": -0.3359716534614563, + "logps/chosen": -56.47673797607422, + "logps/rejected": -81.75907897949219, + "loss": 0.8864, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.000399589538574, + "rewards/margins": 3.249837875366211, + "rewards/rejected": -0.24943819642066956, + "step": 6139 + }, + { + "epoch": 1.54, + "grad_norm": 11.438994407653809, + "learning_rate": 7.846291712018216e-06, + "logits/chosen": -0.3268307149410248, + "logits/rejected": -0.4406527280807495, + "logps/chosen": -68.07070922851562, + "logps/rejected": -78.86821746826172, + "loss": 0.8174, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8133275508880615, + "rewards/margins": 3.621678113937378, + "rewards/rejected": -0.8083506226539612, + "step": 6140 + }, + { + "epoch": 1.54, + "grad_norm": 6.251874923706055, + "learning_rate": 7.845645470259932e-06, + "logits/chosen": -0.39925262331962585, + "logits/rejected": -0.5023026466369629, + "logps/chosen": -56.186363220214844, + "logps/rejected": -78.06973266601562, + "loss": 0.7312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6947085857391357, + "rewards/margins": 4.761040210723877, + "rewards/rejected": -2.0663318634033203, + "step": 6141 + }, + { + "epoch": 1.54, + "grad_norm": 5.0779337882995605, + "learning_rate": 7.844999158182724e-06, + "logits/chosen": -0.3287610411643982, + "logits/rejected": -0.3974062502384186, + "logps/chosen": -55.63825225830078, + "logps/rejected": -93.2948989868164, + "loss": 0.7014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.124638080596924, + "rewards/margins": 4.987851142883301, + "rewards/rejected": -1.863213300704956, + "step": 6142 + }, + { + "epoch": 1.54, + "grad_norm": 4.648329257965088, + "learning_rate": 7.844352775802561e-06, + "logits/chosen": -0.3701082170009613, + "logits/rejected": -0.47520115971565247, + "logps/chosen": -65.27293395996094, + "logps/rejected": -80.44515991210938, + "loss": 0.7913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8525688648223877, + "rewards/margins": 4.621016025543213, + "rewards/rejected": -1.768446922302246, + "step": 6143 + }, + { + "epoch": 1.54, + "grad_norm": 7.194095611572266, + "learning_rate": 7.843706323135414e-06, + "logits/chosen": -0.33923283219337463, + "logits/rejected": -0.3966212868690491, + "logps/chosen": -58.77073287963867, + "logps/rejected": -98.68869018554688, + "loss": 0.6437, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8844871520996094, + "rewards/margins": 4.828768730163574, + "rewards/rejected": -1.9442811012268066, + "step": 6144 + }, + { + "epoch": 1.54, + "grad_norm": 2.4357800483703613, + "learning_rate": 7.84305980019726e-06, + "logits/chosen": -0.3960450291633606, + "logits/rejected": -0.4875405430793762, + "logps/chosen": -50.64630126953125, + "logps/rejected": -102.75418090820312, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8111660480499268, + "rewards/margins": 6.605772018432617, + "rewards/rejected": -3.7946059703826904, + "step": 6145 + }, + { + "epoch": 1.54, + "grad_norm": 5.3267130851745605, + "learning_rate": 7.842413207004073e-06, + "logits/chosen": -0.30914735794067383, + "logits/rejected": -0.40058982372283936, + "logps/chosen": -67.7134017944336, + "logps/rejected": -95.50527954101562, + "loss": 0.8247, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0158133506774902, + "rewards/margins": 5.128148078918457, + "rewards/rejected": -2.112334728240967, + "step": 6146 + }, + { + "epoch": 1.54, + "grad_norm": 8.267742156982422, + "learning_rate": 7.841766543571836e-06, + "logits/chosen": -0.32031190395355225, + "logits/rejected": -0.394440621137619, + "logps/chosen": -60.48394775390625, + "logps/rejected": -106.81961059570312, + "loss": 0.8354, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.986968755722046, + "rewards/margins": 5.264865875244141, + "rewards/rejected": -2.2778968811035156, + "step": 6147 + }, + { + "epoch": 1.54, + "grad_norm": 3.9006099700927734, + "learning_rate": 7.841119809916521e-06, + "logits/chosen": -0.4080677628517151, + "logits/rejected": -0.5137076377868652, + "logps/chosen": -53.099464416503906, + "logps/rejected": -72.5823745727539, + "loss": 0.7098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.158846855163574, + "rewards/margins": 4.742241382598877, + "rewards/rejected": -1.5833947658538818, + "step": 6148 + }, + { + "epoch": 1.54, + "grad_norm": 3.940150022506714, + "learning_rate": 7.840473006054117e-06, + "logits/chosen": -0.42266935110092163, + "logits/rejected": -0.49351930618286133, + "logps/chosen": -50.47832489013672, + "logps/rejected": -84.0057144165039, + "loss": 0.6544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0097546577453613, + "rewards/margins": 4.349703311920166, + "rewards/rejected": -1.3399484157562256, + "step": 6149 + }, + { + "epoch": 1.54, + "grad_norm": 6.5381035804748535, + "learning_rate": 7.839826132000602e-06, + "logits/chosen": -0.2784443795681, + "logits/rejected": -0.3803878128528595, + "logps/chosen": -60.06916427612305, + "logps/rejected": -100.57759857177734, + "loss": 0.7282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8842296600341797, + "rewards/margins": 5.088580131530762, + "rewards/rejected": -2.204350233078003, + "step": 6150 + }, + { + "epoch": 1.54, + "grad_norm": 1.8899744749069214, + "learning_rate": 7.839179187771962e-06, + "logits/chosen": -0.33850568532943726, + "logits/rejected": -0.4009262025356293, + "logps/chosen": -47.95280838012695, + "logps/rejected": -87.30928039550781, + "loss": 0.5636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.108433961868286, + "rewards/margins": 5.512961387634277, + "rewards/rejected": -2.404527425765991, + "step": 6151 + }, + { + "epoch": 1.54, + "grad_norm": 6.33713960647583, + "learning_rate": 7.838532173384184e-06, + "logits/chosen": -0.25059521198272705, + "logits/rejected": -0.37559592723846436, + "logps/chosen": -63.64004135131836, + "logps/rejected": -95.12825012207031, + "loss": 0.7391, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1452505588531494, + "rewards/margins": 5.1900954246521, + "rewards/rejected": -2.04484486579895, + "step": 6152 + }, + { + "epoch": 1.54, + "grad_norm": 3.9063162803649902, + "learning_rate": 7.837885088853259e-06, + "logits/chosen": -0.338118314743042, + "logits/rejected": -0.42659294605255127, + "logps/chosen": -53.240806579589844, + "logps/rejected": -99.20633697509766, + "loss": 0.7293, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.897481679916382, + "rewards/margins": 5.131948471069336, + "rewards/rejected": -2.234466314315796, + "step": 6153 + }, + { + "epoch": 1.54, + "grad_norm": 5.759744167327881, + "learning_rate": 7.837237934195173e-06, + "logits/chosen": -0.36159268021583557, + "logits/rejected": -0.5222936272621155, + "logps/chosen": -60.154991149902344, + "logps/rejected": -76.20681762695312, + "loss": 0.6765, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.787038803100586, + "rewards/margins": 5.226065635681152, + "rewards/rejected": -2.4390265941619873, + "step": 6154 + }, + { + "epoch": 1.54, + "grad_norm": 4.99656867980957, + "learning_rate": 7.83659070942592e-06, + "logits/chosen": -0.32748761773109436, + "logits/rejected": -0.48092955350875854, + "logps/chosen": -48.96601867675781, + "logps/rejected": -85.72212219238281, + "loss": 0.6712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.75236177444458, + "rewards/margins": 6.027368545532227, + "rewards/rejected": -3.2750065326690674, + "step": 6155 + }, + { + "epoch": 1.54, + "grad_norm": 10.200090408325195, + "learning_rate": 7.835943414561494e-06, + "logits/chosen": -0.3644559681415558, + "logits/rejected": -0.4705066680908203, + "logps/chosen": -51.285606384277344, + "logps/rejected": -84.46098327636719, + "loss": 0.6212, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140064239501953, + "rewards/margins": 5.616307258605957, + "rewards/rejected": -2.476242780685425, + "step": 6156 + }, + { + "epoch": 1.54, + "grad_norm": 2.764495611190796, + "learning_rate": 7.835296049617889e-06, + "logits/chosen": -0.351341187953949, + "logits/rejected": -0.40023505687713623, + "logps/chosen": -56.05187225341797, + "logps/rejected": -85.08787536621094, + "loss": 0.6909, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9451029300689697, + "rewards/margins": 4.96307373046875, + "rewards/rejected": -2.0179708003997803, + "step": 6157 + }, + { + "epoch": 1.54, + "grad_norm": 5.302546501159668, + "learning_rate": 7.834648614611104e-06, + "logits/chosen": -0.3230848014354706, + "logits/rejected": -0.44603097438812256, + "logps/chosen": -53.814208984375, + "logps/rejected": -86.32262420654297, + "loss": 0.6176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8036022186279297, + "rewards/margins": 5.22031831741333, + "rewards/rejected": -2.4167163372039795, + "step": 6158 + }, + { + "epoch": 1.54, + "grad_norm": 5.613552093505859, + "learning_rate": 7.834001109557136e-06, + "logits/chosen": -0.31428205966949463, + "logits/rejected": -0.41907745599746704, + "logps/chosen": -48.55113220214844, + "logps/rejected": -83.7096939086914, + "loss": 0.7241, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.245753765106201, + "rewards/margins": 5.136979103088379, + "rewards/rejected": -1.891225814819336, + "step": 6159 + }, + { + "epoch": 1.54, + "grad_norm": 8.815881729125977, + "learning_rate": 7.833353534471988e-06, + "logits/chosen": -0.35030150413513184, + "logits/rejected": -0.38622474670410156, + "logps/chosen": -58.17127227783203, + "logps/rejected": -114.44530487060547, + "loss": 0.6931, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5939459800720215, + "rewards/margins": 5.858978271484375, + "rewards/rejected": -3.2650327682495117, + "step": 6160 + }, + { + "epoch": 1.54, + "grad_norm": 7.603052616119385, + "learning_rate": 7.83270588937166e-06, + "logits/chosen": -0.3854129910469055, + "logits/rejected": -0.5067237019538879, + "logps/chosen": -52.95048904418945, + "logps/rejected": -77.63578033447266, + "loss": 0.826, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0004043579101562, + "rewards/margins": 4.556690216064453, + "rewards/rejected": -1.5562862157821655, + "step": 6161 + }, + { + "epoch": 1.54, + "grad_norm": 4.393235683441162, + "learning_rate": 7.832058174272154e-06, + "logits/chosen": -0.2937905490398407, + "logits/rejected": -0.4304620027542114, + "logps/chosen": -62.77922821044922, + "logps/rejected": -83.15538024902344, + "loss": 0.7839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9617676734924316, + "rewards/margins": 4.668259620666504, + "rewards/rejected": -1.706492304801941, + "step": 6162 + }, + { + "epoch": 1.54, + "grad_norm": 4.568307399749756, + "learning_rate": 7.831410389189479e-06, + "logits/chosen": -0.3321148753166199, + "logits/rejected": -0.47719627618789673, + "logps/chosen": -66.4659652709961, + "logps/rejected": -83.18173217773438, + "loss": 0.6445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182208299636841, + "rewards/margins": 4.720229148864746, + "rewards/rejected": -1.5380209684371948, + "step": 6163 + }, + { + "epoch": 1.54, + "grad_norm": 4.85837984085083, + "learning_rate": 7.830762534139641e-06, + "logits/chosen": -0.32742974162101746, + "logits/rejected": -0.46459195017814636, + "logps/chosen": -45.94270706176758, + "logps/rejected": -84.30422973632812, + "loss": 0.6919, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1040587425231934, + "rewards/margins": 5.13179349899292, + "rewards/rejected": -2.0277352333068848, + "step": 6164 + }, + { + "epoch": 1.54, + "grad_norm": 22.89358901977539, + "learning_rate": 7.830114609138653e-06, + "logits/chosen": -0.3545404374599457, + "logits/rejected": -0.4329858720302582, + "logps/chosen": -54.95587158203125, + "logps/rejected": -74.57284545898438, + "loss": 0.8368, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7007195949554443, + "rewards/margins": 4.184955596923828, + "rewards/rejected": -1.4842355251312256, + "step": 6165 + }, + { + "epoch": 1.54, + "grad_norm": 4.204402446746826, + "learning_rate": 7.829466614202522e-06, + "logits/chosen": -0.4194555878639221, + "logits/rejected": -0.4416593313217163, + "logps/chosen": -57.326175689697266, + "logps/rejected": -91.60539245605469, + "loss": 0.744, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7735941410064697, + "rewards/margins": 4.21235990524292, + "rewards/rejected": -1.4387657642364502, + "step": 6166 + }, + { + "epoch": 1.54, + "grad_norm": 2.34131121635437, + "learning_rate": 7.82881854934726e-06, + "logits/chosen": -0.3762817680835724, + "logits/rejected": -0.49059221148490906, + "logps/chosen": -50.3489990234375, + "logps/rejected": -87.2442626953125, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0199942588806152, + "rewards/margins": 5.461187839508057, + "rewards/rejected": -2.441192865371704, + "step": 6167 + }, + { + "epoch": 1.54, + "grad_norm": 8.46534538269043, + "learning_rate": 7.828170414588882e-06, + "logits/chosen": -0.3192533254623413, + "logits/rejected": -0.3996411859989166, + "logps/chosen": -59.558868408203125, + "logps/rejected": -98.72821044921875, + "loss": 0.7147, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9364755153656006, + "rewards/margins": 5.607214450836182, + "rewards/rejected": -2.670738935470581, + "step": 6168 + }, + { + "epoch": 1.54, + "grad_norm": 3.2796518802642822, + "learning_rate": 7.827522209943405e-06, + "logits/chosen": -0.3186625838279724, + "logits/rejected": -0.36451107263565063, + "logps/chosen": -65.84927368164062, + "logps/rejected": -91.50489044189453, + "loss": 0.7315, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7108964920043945, + "rewards/margins": 5.17107629776001, + "rewards/rejected": -2.4601802825927734, + "step": 6169 + }, + { + "epoch": 1.54, + "grad_norm": 12.739076614379883, + "learning_rate": 7.82687393542685e-06, + "logits/chosen": -0.3801215589046478, + "logits/rejected": -0.523681104183197, + "logps/chosen": -50.53651428222656, + "logps/rejected": -69.53649139404297, + "loss": 0.7215, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9188878536224365, + "rewards/margins": 5.135810375213623, + "rewards/rejected": -2.2169222831726074, + "step": 6170 + }, + { + "epoch": 1.54, + "grad_norm": 6.82122278213501, + "learning_rate": 7.82622559105523e-06, + "logits/chosen": -0.3923760652542114, + "logits/rejected": -0.4093596935272217, + "logps/chosen": -51.556365966796875, + "logps/rejected": -98.39730834960938, + "loss": 0.7136, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7107410430908203, + "rewards/margins": 4.066287994384766, + "rewards/rejected": -1.3555469512939453, + "step": 6171 + }, + { + "epoch": 1.54, + "grad_norm": 6.439887523651123, + "learning_rate": 7.82557717684457e-06, + "logits/chosen": -0.3136868178844452, + "logits/rejected": -0.43215233087539673, + "logps/chosen": -61.2452392578125, + "logps/rejected": -76.65646362304688, + "loss": 0.8101, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7523441314697266, + "rewards/margins": 4.124702453613281, + "rewards/rejected": -1.3723583221435547, + "step": 6172 + }, + { + "epoch": 1.54, + "grad_norm": 2.900128126144409, + "learning_rate": 7.824928692810894e-06, + "logits/chosen": -0.34131065011024475, + "logits/rejected": -0.38834673166275024, + "logps/chosen": -47.86150360107422, + "logps/rejected": -84.24311828613281, + "loss": 0.6342, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0346333980560303, + "rewards/margins": 4.612885475158691, + "rewards/rejected": -1.5782523155212402, + "step": 6173 + }, + { + "epoch": 1.54, + "grad_norm": 9.673315048217773, + "learning_rate": 7.824280138970223e-06, + "logits/chosen": -0.3552575409412384, + "logits/rejected": -0.4643332064151764, + "logps/chosen": -58.44288635253906, + "logps/rejected": -83.14314270019531, + "loss": 0.8202, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8988869190216064, + "rewards/margins": 4.655726909637451, + "rewards/rejected": -1.7568401098251343, + "step": 6174 + }, + { + "epoch": 1.54, + "grad_norm": 5.429406642913818, + "learning_rate": 7.823631515338588e-06, + "logits/chosen": -0.2810341715812683, + "logits/rejected": -0.36682283878326416, + "logps/chosen": -56.58533477783203, + "logps/rejected": -83.85039520263672, + "loss": 0.7721, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0369579792022705, + "rewards/margins": 3.5819005966186523, + "rewards/rejected": -0.5449427962303162, + "step": 6175 + }, + { + "epoch": 1.55, + "grad_norm": 4.576462268829346, + "learning_rate": 7.822982821932014e-06, + "logits/chosen": -0.3768942058086395, + "logits/rejected": -0.4511708617210388, + "logps/chosen": -58.001991271972656, + "logps/rejected": -76.74710083007812, + "loss": 0.8352, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.065389394760132, + "rewards/margins": 4.55098295211792, + "rewards/rejected": -1.485593557357788, + "step": 6176 + }, + { + "epoch": 1.55, + "grad_norm": 5.6939873695373535, + "learning_rate": 7.822334058766532e-06, + "logits/chosen": -0.3925279676914215, + "logits/rejected": -0.4651142656803131, + "logps/chosen": -48.19599914550781, + "logps/rejected": -89.7248306274414, + "loss": 0.677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0041556358337402, + "rewards/margins": 5.632298469543457, + "rewards/rejected": -2.628142833709717, + "step": 6177 + }, + { + "epoch": 1.55, + "grad_norm": 2.835235357284546, + "learning_rate": 7.821685225858172e-06, + "logits/chosen": -0.4257987141609192, + "logits/rejected": -0.5302516222000122, + "logps/chosen": -63.24805450439453, + "logps/rejected": -94.27012634277344, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9311132431030273, + "rewards/margins": 5.00121545791626, + "rewards/rejected": -2.0701024532318115, + "step": 6178 + }, + { + "epoch": 1.55, + "grad_norm": 11.978416442871094, + "learning_rate": 7.82103632322297e-06, + "logits/chosen": -0.38579803705215454, + "logits/rejected": -0.3848147988319397, + "logps/chosen": -45.60002136230469, + "logps/rejected": -98.62806701660156, + "loss": 0.6616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.005985736846924, + "rewards/margins": 5.062527179718018, + "rewards/rejected": -2.0565414428710938, + "step": 6179 + }, + { + "epoch": 1.55, + "grad_norm": 4.748369216918945, + "learning_rate": 7.820387350876961e-06, + "logits/chosen": -0.3792818784713745, + "logits/rejected": -0.4507969915866852, + "logps/chosen": -57.06364059448242, + "logps/rejected": -93.9207763671875, + "loss": 0.8092, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5789783000946045, + "rewards/margins": 4.200252056121826, + "rewards/rejected": -1.6212739944458008, + "step": 6180 + }, + { + "epoch": 1.55, + "grad_norm": 12.208943367004395, + "learning_rate": 7.81973830883618e-06, + "logits/chosen": -0.38638150691986084, + "logits/rejected": -0.5222498178482056, + "logps/chosen": -61.240753173828125, + "logps/rejected": -81.73482513427734, + "loss": 0.8751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.655632257461548, + "rewards/margins": 4.294283866882324, + "rewards/rejected": -1.6386513710021973, + "step": 6181 + }, + { + "epoch": 1.55, + "grad_norm": 8.55981159210205, + "learning_rate": 7.819089197116663e-06, + "logits/chosen": -0.3863077461719513, + "logits/rejected": -0.45953673124313354, + "logps/chosen": -47.93607711791992, + "logps/rejected": -87.1211929321289, + "loss": 0.7309, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9701366424560547, + "rewards/margins": 5.180819988250732, + "rewards/rejected": -2.2106826305389404, + "step": 6182 + }, + { + "epoch": 1.55, + "grad_norm": 3.577773094177246, + "learning_rate": 7.81844001573446e-06, + "logits/chosen": -0.4188598692417145, + "logits/rejected": -0.47334879636764526, + "logps/chosen": -45.454193115234375, + "logps/rejected": -83.66885375976562, + "loss": 0.6907, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.916497230529785, + "rewards/margins": 4.578868865966797, + "rewards/rejected": -1.6623716354370117, + "step": 6183 + }, + { + "epoch": 1.55, + "grad_norm": 7.268511772155762, + "learning_rate": 7.8177907647056e-06, + "logits/chosen": -0.4100942015647888, + "logits/rejected": -0.45556432008743286, + "logps/chosen": -65.39510345458984, + "logps/rejected": -85.0425796508789, + "loss": 0.8446, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.765892267227173, + "rewards/margins": 3.794611930847168, + "rewards/rejected": -1.0287197828292847, + "step": 6184 + }, + { + "epoch": 1.55, + "grad_norm": 3.5211691856384277, + "learning_rate": 7.817141444046135e-06, + "logits/chosen": -0.38858452439308167, + "logits/rejected": -0.4832417666912079, + "logps/chosen": -66.06562042236328, + "logps/rejected": -95.2762222290039, + "loss": 0.7324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0094985961914062, + "rewards/margins": 5.457663059234619, + "rewards/rejected": -2.448164701461792, + "step": 6185 + }, + { + "epoch": 1.55, + "grad_norm": 6.137972354888916, + "learning_rate": 7.81649205377211e-06, + "logits/chosen": -0.38537710905075073, + "logits/rejected": -0.4805462956428528, + "logps/chosen": -56.3211669921875, + "logps/rejected": -87.20150756835938, + "loss": 0.732, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9637527465820312, + "rewards/margins": 5.3364362716674805, + "rewards/rejected": -2.37268328666687, + "step": 6186 + }, + { + "epoch": 1.55, + "grad_norm": 3.448854923248291, + "learning_rate": 7.815842593899569e-06, + "logits/chosen": -0.40109091997146606, + "logits/rejected": -0.4913998246192932, + "logps/chosen": -51.03759002685547, + "logps/rejected": -81.53694152832031, + "loss": 0.6668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7876856327056885, + "rewards/margins": 4.456490516662598, + "rewards/rejected": -1.6688042879104614, + "step": 6187 + }, + { + "epoch": 1.55, + "grad_norm": 7.164760589599609, + "learning_rate": 7.815193064444564e-06, + "logits/chosen": -0.41793105006217957, + "logits/rejected": -0.5410587787628174, + "logps/chosen": -54.358642578125, + "logps/rejected": -72.1142578125, + "loss": 0.8728, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7575626373291016, + "rewards/margins": 3.8958241939544678, + "rewards/rejected": -1.138261318206787, + "step": 6188 + }, + { + "epoch": 1.55, + "grad_norm": 4.5488667488098145, + "learning_rate": 7.814543465423143e-06, + "logits/chosen": -0.4118700623512268, + "logits/rejected": -0.4616626799106598, + "logps/chosen": -58.790470123291016, + "logps/rejected": -84.69966125488281, + "loss": 0.679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1604397296905518, + "rewards/margins": 4.339358806610107, + "rewards/rejected": -1.1789191961288452, + "step": 6189 + }, + { + "epoch": 1.55, + "grad_norm": 3.5209500789642334, + "learning_rate": 7.81389379685136e-06, + "logits/chosen": -0.43279826641082764, + "logits/rejected": -0.4793151021003723, + "logps/chosen": -49.51572799682617, + "logps/rejected": -88.07882690429688, + "loss": 0.6886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9944522380828857, + "rewards/margins": 4.429533004760742, + "rewards/rejected": -1.4350807666778564, + "step": 6190 + }, + { + "epoch": 1.55, + "grad_norm": 2.664280652999878, + "learning_rate": 7.813244058745266e-06, + "logits/chosen": -0.3113492429256439, + "logits/rejected": -0.4230431318283081, + "logps/chosen": -50.74856185913086, + "logps/rejected": -97.037353515625, + "loss": 0.6803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1109137535095215, + "rewards/margins": 5.734738349914551, + "rewards/rejected": -2.62382435798645, + "step": 6191 + }, + { + "epoch": 1.55, + "grad_norm": 5.151182651519775, + "learning_rate": 7.81259425112092e-06, + "logits/chosen": -0.32123681902885437, + "logits/rejected": -0.442609965801239, + "logps/chosen": -58.6724853515625, + "logps/rejected": -72.7745590209961, + "loss": 0.714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.09704327583313, + "rewards/margins": 5.008780002593994, + "rewards/rejected": -1.9117364883422852, + "step": 6192 + }, + { + "epoch": 1.55, + "grad_norm": 3.3559799194335938, + "learning_rate": 7.81194437399438e-06, + "logits/chosen": -0.3385765850543976, + "logits/rejected": -0.4381738305091858, + "logps/chosen": -49.01136779785156, + "logps/rejected": -77.71515655517578, + "loss": 0.617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0793111324310303, + "rewards/margins": 4.61337423324585, + "rewards/rejected": -1.5340631008148193, + "step": 6193 + }, + { + "epoch": 1.55, + "grad_norm": 7.462185382843018, + "learning_rate": 7.811294427381703e-06, + "logits/chosen": -0.31225937604904175, + "logits/rejected": -0.31461501121520996, + "logps/chosen": -55.0640983581543, + "logps/rejected": -99.96102905273438, + "loss": 0.9326, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8487765789031982, + "rewards/margins": 4.094009876251221, + "rewards/rejected": -1.2452332973480225, + "step": 6194 + }, + { + "epoch": 1.55, + "grad_norm": 3.9917213916778564, + "learning_rate": 7.810644411298951e-06, + "logits/chosen": -0.4540881812572479, + "logits/rejected": -0.502302348613739, + "logps/chosen": -43.938453674316406, + "logps/rejected": -87.31632995605469, + "loss": 0.6746, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.043550968170166, + "rewards/margins": 5.136163711547852, + "rewards/rejected": -2.0926122665405273, + "step": 6195 + }, + { + "epoch": 1.55, + "grad_norm": 6.125376224517822, + "learning_rate": 7.809994325762185e-06, + "logits/chosen": -0.33190134167671204, + "logits/rejected": -0.43135198950767517, + "logps/chosen": -59.20867156982422, + "logps/rejected": -89.0723648071289, + "loss": 0.7318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9165425300598145, + "rewards/margins": 4.298498153686523, + "rewards/rejected": -1.3819557428359985, + "step": 6196 + }, + { + "epoch": 1.55, + "grad_norm": 3.912132501602173, + "learning_rate": 7.80934417078747e-06, + "logits/chosen": -0.38354453444480896, + "logits/rejected": -0.4290643632411957, + "logps/chosen": -51.830909729003906, + "logps/rejected": -94.27748107910156, + "loss": 0.6666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.751643657684326, + "rewards/margins": 4.512933731079102, + "rewards/rejected": -1.7612900733947754, + "step": 6197 + }, + { + "epoch": 1.55, + "grad_norm": 5.112947940826416, + "learning_rate": 7.808693946390874e-06, + "logits/chosen": -0.3243461549282074, + "logits/rejected": -0.41056811809539795, + "logps/chosen": -63.48564910888672, + "logps/rejected": -78.27606201171875, + "loss": 0.9095, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9337079524993896, + "rewards/margins": 4.376276969909668, + "rewards/rejected": -1.4425687789916992, + "step": 6198 + }, + { + "epoch": 1.55, + "grad_norm": 9.909358978271484, + "learning_rate": 7.808043652588462e-06, + "logits/chosen": -0.36594054102897644, + "logits/rejected": -0.45718103647232056, + "logps/chosen": -62.46398162841797, + "logps/rejected": -89.49964904785156, + "loss": 0.8809, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.649285316467285, + "rewards/margins": 4.8806843757629395, + "rewards/rejected": -2.2313990592956543, + "step": 6199 + }, + { + "epoch": 1.55, + "grad_norm": 2.720062255859375, + "learning_rate": 7.807393289396303e-06, + "logits/chosen": -0.3406205475330353, + "logits/rejected": -0.4329557716846466, + "logps/chosen": -71.16189575195312, + "logps/rejected": -87.82685089111328, + "loss": 0.691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3498337268829346, + "rewards/margins": 5.488908290863037, + "rewards/rejected": -2.1390750408172607, + "step": 6200 + }, + { + "epoch": 1.55, + "grad_norm": 5.248273849487305, + "learning_rate": 7.806742856830472e-06, + "logits/chosen": -0.33870095014572144, + "logits/rejected": -0.4415130317211151, + "logps/chosen": -53.190067291259766, + "logps/rejected": -73.47293853759766, + "loss": 0.6576, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8911828994750977, + "rewards/margins": 3.9713850021362305, + "rewards/rejected": -1.0802021026611328, + "step": 6201 + }, + { + "epoch": 1.55, + "grad_norm": 4.0611677169799805, + "learning_rate": 7.80609235490704e-06, + "logits/chosen": -0.4082369804382324, + "logits/rejected": -0.48625969886779785, + "logps/chosen": -52.34214782714844, + "logps/rejected": -73.24968719482422, + "loss": 0.728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0367374420166016, + "rewards/margins": 5.414571762084961, + "rewards/rejected": -2.3778343200683594, + "step": 6202 + }, + { + "epoch": 1.55, + "grad_norm": 12.485715866088867, + "learning_rate": 7.80544178364208e-06, + "logits/chosen": -0.3348337411880493, + "logits/rejected": -0.46282345056533813, + "logps/chosen": -47.181793212890625, + "logps/rejected": -80.73501586914062, + "loss": 0.6734, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8979692459106445, + "rewards/margins": 5.582475662231445, + "rewards/rejected": -2.684506893157959, + "step": 6203 + }, + { + "epoch": 1.55, + "grad_norm": 3.764239549636841, + "learning_rate": 7.80479114305167e-06, + "logits/chosen": -0.36717209219932556, + "logits/rejected": -0.4209398925304413, + "logps/chosen": -71.24076843261719, + "logps/rejected": -93.53841400146484, + "loss": 0.7409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1090049743652344, + "rewards/margins": 4.773868560791016, + "rewards/rejected": -1.6648638248443604, + "step": 6204 + }, + { + "epoch": 1.55, + "grad_norm": 9.455116271972656, + "learning_rate": 7.804140433151885e-06, + "logits/chosen": -0.40342503786087036, + "logits/rejected": -0.5284298658370972, + "logps/chosen": -53.38288497924805, + "logps/rejected": -108.83439636230469, + "loss": 0.8182, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9497969150543213, + "rewards/margins": 4.940056324005127, + "rewards/rejected": -1.9902594089508057, + "step": 6205 + }, + { + "epoch": 1.55, + "grad_norm": 6.806223392486572, + "learning_rate": 7.803489653958811e-06, + "logits/chosen": -0.24592038989067078, + "logits/rejected": -0.31870120763778687, + "logps/chosen": -61.022621154785156, + "logps/rejected": -98.03335571289062, + "loss": 0.7461, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6528167724609375, + "rewards/margins": 4.463900089263916, + "rewards/rejected": -1.8110833168029785, + "step": 6206 + }, + { + "epoch": 1.55, + "grad_norm": 4.30100679397583, + "learning_rate": 7.802838805488524e-06, + "logits/chosen": -0.3838689625263214, + "logits/rejected": -0.4181554913520813, + "logps/chosen": -51.54434585571289, + "logps/rejected": -95.94172668457031, + "loss": 0.8102, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.867783308029175, + "rewards/margins": 5.068658828735352, + "rewards/rejected": -2.200875759124756, + "step": 6207 + }, + { + "epoch": 1.55, + "grad_norm": 4.46539831161499, + "learning_rate": 7.80218788775711e-06, + "logits/chosen": -0.3180059790611267, + "logits/rejected": -0.3879939913749695, + "logps/chosen": -55.32195281982422, + "logps/rejected": -100.22227478027344, + "loss": 0.6733, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6975109577178955, + "rewards/margins": 5.6633100509643555, + "rewards/rejected": -2.965799570083618, + "step": 6208 + }, + { + "epoch": 1.55, + "grad_norm": 2.4080145359039307, + "learning_rate": 7.801536900780652e-06, + "logits/chosen": -0.3569123446941376, + "logits/rejected": -0.4942604899406433, + "logps/chosen": -60.844058990478516, + "logps/rejected": -90.16470336914062, + "loss": 0.7143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5826950073242188, + "rewards/margins": 5.731802463531494, + "rewards/rejected": -3.149106979370117, + "step": 6209 + }, + { + "epoch": 1.55, + "grad_norm": 9.075390815734863, + "learning_rate": 7.800885844575236e-06, + "logits/chosen": -0.3720298409461975, + "logits/rejected": -0.45574551820755005, + "logps/chosen": -63.19010925292969, + "logps/rejected": -81.9920883178711, + "loss": 0.8563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1752443313598633, + "rewards/margins": 4.542224884033203, + "rewards/rejected": -1.3669803142547607, + "step": 6210 + }, + { + "epoch": 1.55, + "grad_norm": 8.215951919555664, + "learning_rate": 7.800234719156954e-06, + "logits/chosen": -0.36265602707862854, + "logits/rejected": -0.4581974744796753, + "logps/chosen": -63.7252197265625, + "logps/rejected": -77.54759979248047, + "loss": 1.0011, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.3764419555664062, + "rewards/margins": 4.208713531494141, + "rewards/rejected": -1.8322713375091553, + "step": 6211 + }, + { + "epoch": 1.55, + "grad_norm": 7.64077615737915, + "learning_rate": 7.79958352454189e-06, + "logits/chosen": -0.4010769724845886, + "logits/rejected": -0.4347246289253235, + "logps/chosen": -65.65514373779297, + "logps/rejected": -84.50420379638672, + "loss": 0.8655, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7918801307678223, + "rewards/margins": 3.403310537338257, + "rewards/rejected": -0.6114303469657898, + "step": 6212 + }, + { + "epoch": 1.55, + "grad_norm": 8.07315444946289, + "learning_rate": 7.798932260746145e-06, + "logits/chosen": -0.4965912103652954, + "logits/rejected": -0.6063347458839417, + "logps/chosen": -53.705509185791016, + "logps/rejected": -92.12579345703125, + "loss": 0.8369, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5077807903289795, + "rewards/margins": 5.267446517944336, + "rewards/rejected": -2.7596654891967773, + "step": 6213 + }, + { + "epoch": 1.55, + "grad_norm": 6.9350905418396, + "learning_rate": 7.798280927785802e-06, + "logits/chosen": -0.36909428238868713, + "logits/rejected": -0.46066951751708984, + "logps/chosen": -56.96483612060547, + "logps/rejected": -81.4586181640625, + "loss": 0.843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.753133773803711, + "rewards/margins": 5.042300224304199, + "rewards/rejected": -2.2891664505004883, + "step": 6214 + }, + { + "epoch": 1.55, + "grad_norm": 14.726960182189941, + "learning_rate": 7.797629525676964e-06, + "logits/chosen": -0.38188672065734863, + "logits/rejected": -0.4124392867088318, + "logps/chosen": -61.91346740722656, + "logps/rejected": -91.55946350097656, + "loss": 0.8723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6966426372528076, + "rewards/margins": 4.4110260009765625, + "rewards/rejected": -1.7143830060958862, + "step": 6215 + }, + { + "epoch": 1.56, + "grad_norm": 10.607916831970215, + "learning_rate": 7.796978054435722e-06, + "logits/chosen": -0.403756320476532, + "logits/rejected": -0.4658127427101135, + "logps/chosen": -52.31633377075195, + "logps/rejected": -90.76229858398438, + "loss": 0.8104, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.857412338256836, + "rewards/margins": 4.371856689453125, + "rewards/rejected": -1.514445185661316, + "step": 6216 + }, + { + "epoch": 1.56, + "grad_norm": 2.3966846466064453, + "learning_rate": 7.79632651407818e-06, + "logits/chosen": -0.3254181742668152, + "logits/rejected": -0.45174193382263184, + "logps/chosen": -57.45127487182617, + "logps/rejected": -70.25845336914062, + "loss": 0.5777, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.196794033050537, + "rewards/margins": 5.381991386413574, + "rewards/rejected": -2.185197353363037, + "step": 6217 + }, + { + "epoch": 1.56, + "grad_norm": 14.19495677947998, + "learning_rate": 7.795674904620433e-06, + "logits/chosen": -0.3582446575164795, + "logits/rejected": -0.4675155580043793, + "logps/chosen": -70.16919708251953, + "logps/rejected": -101.09764099121094, + "loss": 0.8743, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7604241371154785, + "rewards/margins": 5.627713680267334, + "rewards/rejected": -2.8672897815704346, + "step": 6218 + }, + { + "epoch": 1.56, + "grad_norm": 4.514584064483643, + "learning_rate": 7.795023226078588e-06, + "logits/chosen": -0.39780953526496887, + "logits/rejected": -0.5021911263465881, + "logps/chosen": -53.921939849853516, + "logps/rejected": -78.49502563476562, + "loss": 0.6591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6572608947753906, + "rewards/margins": 5.1375274658203125, + "rewards/rejected": -2.480266809463501, + "step": 6219 + }, + { + "epoch": 1.56, + "grad_norm": 7.42427396774292, + "learning_rate": 7.794371478468746e-06, + "logits/chosen": -0.35061001777648926, + "logits/rejected": -0.396847665309906, + "logps/chosen": -57.900390625, + "logps/rejected": -102.82112121582031, + "loss": 0.7032, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4998984336853027, + "rewards/margins": 4.59971809387207, + "rewards/rejected": -2.0998191833496094, + "step": 6220 + }, + { + "epoch": 1.56, + "grad_norm": 9.731841087341309, + "learning_rate": 7.79371966180701e-06, + "logits/chosen": -0.35136690735816956, + "logits/rejected": -0.41235432028770447, + "logps/chosen": -49.311546325683594, + "logps/rejected": -86.8099594116211, + "loss": 0.6784, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2273192405700684, + "rewards/margins": 4.579092502593994, + "rewards/rejected": -1.3517731428146362, + "step": 6221 + }, + { + "epoch": 1.56, + "grad_norm": 6.429859161376953, + "learning_rate": 7.79306777610949e-06, + "logits/chosen": -0.3884572386741638, + "logits/rejected": -0.5064211487770081, + "logps/chosen": -49.233924865722656, + "logps/rejected": -76.20600128173828, + "loss": 0.7971, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8670194149017334, + "rewards/margins": 4.538009166717529, + "rewards/rejected": -1.6709895133972168, + "step": 6222 + }, + { + "epoch": 1.56, + "grad_norm": 7.355351448059082, + "learning_rate": 7.792415821392296e-06, + "logits/chosen": -0.3868095874786377, + "logits/rejected": -0.45567893981933594, + "logps/chosen": -50.19217300415039, + "logps/rejected": -82.1827392578125, + "loss": 0.7524, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0495355129241943, + "rewards/margins": 4.71138334274292, + "rewards/rejected": -1.6618478298187256, + "step": 6223 + }, + { + "epoch": 1.56, + "grad_norm": 4.734988212585449, + "learning_rate": 7.791763797671537e-06, + "logits/chosen": -0.3630829453468323, + "logits/rejected": -0.48981672525405884, + "logps/chosen": -51.31892013549805, + "logps/rejected": -73.01448822021484, + "loss": 0.6944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1940078735351562, + "rewards/margins": 5.044732570648193, + "rewards/rejected": -1.8507239818572998, + "step": 6224 + }, + { + "epoch": 1.56, + "grad_norm": 6.723554611206055, + "learning_rate": 7.791111704963324e-06, + "logits/chosen": -0.30234917998313904, + "logits/rejected": -0.3450779914855957, + "logps/chosen": -47.598731994628906, + "logps/rejected": -86.85877990722656, + "loss": 0.6656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.690795421600342, + "rewards/margins": 4.626469612121582, + "rewards/rejected": -1.9356745481491089, + "step": 6225 + }, + { + "epoch": 1.56, + "grad_norm": 4.708726406097412, + "learning_rate": 7.790459543283772e-06, + "logits/chosen": -0.3803999125957489, + "logits/rejected": -0.4980897903442383, + "logps/chosen": -52.02012634277344, + "logps/rejected": -79.64266967773438, + "loss": 0.6616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.813120126724243, + "rewards/margins": 5.08012580871582, + "rewards/rejected": -2.2670059204101562, + "step": 6226 + }, + { + "epoch": 1.56, + "grad_norm": 11.872828483581543, + "learning_rate": 7.789807312648997e-06, + "logits/chosen": -0.37236472964286804, + "logits/rejected": -0.42847391963005066, + "logps/chosen": -45.935874938964844, + "logps/rejected": -72.46011352539062, + "loss": 0.7517, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8267247676849365, + "rewards/margins": 3.8635454177856445, + "rewards/rejected": -1.0368210077285767, + "step": 6227 + }, + { + "epoch": 1.56, + "grad_norm": 15.557353019714355, + "learning_rate": 7.789155013075116e-06, + "logits/chosen": -0.2958401143550873, + "logits/rejected": -0.3422214388847351, + "logps/chosen": -55.200313568115234, + "logps/rejected": -91.02777862548828, + "loss": 0.7146, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9407496452331543, + "rewards/margins": 4.937520503997803, + "rewards/rejected": -1.996771216392517, + "step": 6228 + }, + { + "epoch": 1.56, + "grad_norm": 6.286358833312988, + "learning_rate": 7.788502644578248e-06, + "logits/chosen": -0.3264778256416321, + "logits/rejected": -0.43102404475212097, + "logps/chosen": -62.353050231933594, + "logps/rejected": -88.54051208496094, + "loss": 0.7702, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.100236654281616, + "rewards/margins": 4.462060451507568, + "rewards/rejected": -1.3618241548538208, + "step": 6229 + }, + { + "epoch": 1.56, + "grad_norm": 9.084465026855469, + "learning_rate": 7.787850207174512e-06, + "logits/chosen": -0.2847629487514496, + "logits/rejected": -0.43342432379722595, + "logps/chosen": -65.12824249267578, + "logps/rejected": -73.80193328857422, + "loss": 0.8353, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7495508193969727, + "rewards/margins": 4.5467047691345215, + "rewards/rejected": -1.797154426574707, + "step": 6230 + }, + { + "epoch": 1.56, + "grad_norm": 5.351614952087402, + "learning_rate": 7.787197700880035e-06, + "logits/chosen": -0.3177819848060608, + "logits/rejected": -0.4359593093395233, + "logps/chosen": -71.91078186035156, + "logps/rejected": -86.0816421508789, + "loss": 0.8357, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.665290594100952, + "rewards/margins": 4.0853142738342285, + "rewards/rejected": -1.4200233221054077, + "step": 6231 + }, + { + "epoch": 1.56, + "grad_norm": 7.78914213180542, + "learning_rate": 7.786545125710935e-06, + "logits/chosen": -0.3430097699165344, + "logits/rejected": -0.43966156244277954, + "logps/chosen": -61.1438102722168, + "logps/rejected": -82.13025665283203, + "loss": 0.7357, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9387407302856445, + "rewards/margins": 4.807811260223389, + "rewards/rejected": -1.8690698146820068, + "step": 6232 + }, + { + "epoch": 1.56, + "grad_norm": 2.4453284740448, + "learning_rate": 7.785892481683345e-06, + "logits/chosen": -0.3590536117553711, + "logits/rejected": -0.5245684385299683, + "logps/chosen": -65.14783477783203, + "logps/rejected": -85.36650085449219, + "loss": 0.7304, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.856858730316162, + "rewards/margins": 6.3046441078186035, + "rewards/rejected": -3.4477853775024414, + "step": 6233 + }, + { + "epoch": 1.56, + "grad_norm": 5.040201663970947, + "learning_rate": 7.785239768813386e-06, + "logits/chosen": -0.4188976287841797, + "logits/rejected": -0.4437616467475891, + "logps/chosen": -56.127262115478516, + "logps/rejected": -84.54521179199219, + "loss": 0.838, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.072831869125366, + "rewards/margins": 4.083318710327148, + "rewards/rejected": -1.0104870796203613, + "step": 6234 + }, + { + "epoch": 1.56, + "grad_norm": 4.026856422424316, + "learning_rate": 7.784586987117189e-06, + "logits/chosen": -0.3717154264450073, + "logits/rejected": -0.49643322825431824, + "logps/chosen": -54.580322265625, + "logps/rejected": -78.71379089355469, + "loss": 0.7846, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.864910364151001, + "rewards/margins": 5.315349102020264, + "rewards/rejected": -2.450439453125, + "step": 6235 + }, + { + "epoch": 1.56, + "grad_norm": 4.6707587242126465, + "learning_rate": 7.78393413661089e-06, + "logits/chosen": -0.337935209274292, + "logits/rejected": -0.3760297894477844, + "logps/chosen": -55.591670989990234, + "logps/rejected": -90.17373657226562, + "loss": 0.77, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.916698932647705, + "rewards/margins": 4.351108551025391, + "rewards/rejected": -1.4344091415405273, + "step": 6236 + }, + { + "epoch": 1.56, + "grad_norm": 3.992119550704956, + "learning_rate": 7.783281217310613e-06, + "logits/chosen": -0.32802537083625793, + "logits/rejected": -0.43865492939949036, + "logps/chosen": -56.36832809448242, + "logps/rejected": -77.80634307861328, + "loss": 0.7079, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.926726818084717, + "rewards/margins": 4.922328472137451, + "rewards/rejected": -1.9956018924713135, + "step": 6237 + }, + { + "epoch": 1.56, + "grad_norm": 3.6374144554138184, + "learning_rate": 7.782628229232498e-06, + "logits/chosen": -0.37964001297950745, + "logits/rejected": -0.4979580044746399, + "logps/chosen": -66.86873626708984, + "logps/rejected": -80.91695404052734, + "loss": 0.7052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8975791931152344, + "rewards/margins": 5.438129425048828, + "rewards/rejected": -2.540550470352173, + "step": 6238 + }, + { + "epoch": 1.56, + "grad_norm": 3.287004232406616, + "learning_rate": 7.78197517239268e-06, + "logits/chosen": -0.41650012135505676, + "logits/rejected": -0.46796339750289917, + "logps/chosen": -56.9046630859375, + "logps/rejected": -87.15719604492188, + "loss": 0.7305, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.210155963897705, + "rewards/margins": 4.90366268157959, + "rewards/rejected": -1.6935068368911743, + "step": 6239 + }, + { + "epoch": 1.56, + "grad_norm": 5.070339202880859, + "learning_rate": 7.781322046807296e-06, + "logits/chosen": -0.32544365525245667, + "logits/rejected": -0.4510100483894348, + "logps/chosen": -59.30097961425781, + "logps/rejected": -87.336181640625, + "loss": 0.7404, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6976587772369385, + "rewards/margins": 4.503663063049316, + "rewards/rejected": -1.8060041666030884, + "step": 6240 + }, + { + "epoch": 1.56, + "grad_norm": 2.855255603790283, + "learning_rate": 7.780668852492485e-06, + "logits/chosen": -0.40108585357666016, + "logits/rejected": -0.48854127526283264, + "logps/chosen": -47.654380798339844, + "logps/rejected": -78.6045150756836, + "loss": 0.544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0461602210998535, + "rewards/margins": 4.831171035766602, + "rewards/rejected": -1.7850110530853271, + "step": 6241 + }, + { + "epoch": 1.56, + "grad_norm": 6.4233880043029785, + "learning_rate": 7.78001558946439e-06, + "logits/chosen": -0.3653537631034851, + "logits/rejected": -0.46603405475616455, + "logps/chosen": -65.62213897705078, + "logps/rejected": -86.02088928222656, + "loss": 0.7942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1659679412841797, + "rewards/margins": 4.363088607788086, + "rewards/rejected": -1.1971209049224854, + "step": 6242 + }, + { + "epoch": 1.56, + "grad_norm": 3.250657796859741, + "learning_rate": 7.779362257739154e-06, + "logits/chosen": -0.30262675881385803, + "logits/rejected": -0.37605252861976624, + "logps/chosen": -60.90547561645508, + "logps/rejected": -103.9949951171875, + "loss": 0.7111, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.921731948852539, + "rewards/margins": 5.390045642852783, + "rewards/rejected": -2.4683141708374023, + "step": 6243 + }, + { + "epoch": 1.56, + "grad_norm": 6.651280879974365, + "learning_rate": 7.77870885733292e-06, + "logits/chosen": -0.34645959734916687, + "logits/rejected": -0.46718913316726685, + "logps/chosen": -58.18233871459961, + "logps/rejected": -80.52808380126953, + "loss": 0.7461, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.972048044204712, + "rewards/margins": 4.952794075012207, + "rewards/rejected": -1.9807461500167847, + "step": 6244 + }, + { + "epoch": 1.56, + "grad_norm": 1.811323642730713, + "learning_rate": 7.778055388261832e-06, + "logits/chosen": -0.28481411933898926, + "logits/rejected": -0.43961101770401, + "logps/chosen": -57.95730209350586, + "logps/rejected": -97.02142333984375, + "loss": 0.5499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3238766193389893, + "rewards/margins": 5.98103666305542, + "rewards/rejected": -2.6571602821350098, + "step": 6245 + }, + { + "epoch": 1.56, + "grad_norm": 6.428532123565674, + "learning_rate": 7.777401850542043e-06, + "logits/chosen": -0.2907855212688446, + "logits/rejected": -0.429158091545105, + "logps/chosen": -62.109046936035156, + "logps/rejected": -84.02259063720703, + "loss": 0.6915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9193649291992188, + "rewards/margins": 5.313863754272461, + "rewards/rejected": -2.394498586654663, + "step": 6246 + }, + { + "epoch": 1.56, + "grad_norm": 4.621100902557373, + "learning_rate": 7.7767482441897e-06, + "logits/chosen": -0.36073046922683716, + "logits/rejected": -0.5593734979629517, + "logps/chosen": -69.46795654296875, + "logps/rejected": -85.15360260009766, + "loss": 0.7876, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9433205127716064, + "rewards/margins": 6.068002223968506, + "rewards/rejected": -3.1246824264526367, + "step": 6247 + }, + { + "epoch": 1.56, + "grad_norm": 6.787613868713379, + "learning_rate": 7.776094569220954e-06, + "logits/chosen": -0.2510443925857544, + "logits/rejected": -0.3290938138961792, + "logps/chosen": -65.44303894042969, + "logps/rejected": -87.662353515625, + "loss": 0.7537, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.237077236175537, + "rewards/margins": 4.2546162605285645, + "rewards/rejected": -1.0175390243530273, + "step": 6248 + }, + { + "epoch": 1.56, + "grad_norm": 4.328476905822754, + "learning_rate": 7.775440825651958e-06, + "logits/chosen": -0.31421688199043274, + "logits/rejected": -0.3949531614780426, + "logps/chosen": -67.0600814819336, + "logps/rejected": -92.41307830810547, + "loss": 0.7601, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0824477672576904, + "rewards/margins": 4.7670979499816895, + "rewards/rejected": -1.6846507787704468, + "step": 6249 + }, + { + "epoch": 1.56, + "grad_norm": 3.871443033218384, + "learning_rate": 7.774787013498868e-06, + "logits/chosen": -0.37801089882850647, + "logits/rejected": -0.49420252442359924, + "logps/chosen": -57.005897521972656, + "logps/rejected": -67.7252197265625, + "loss": 0.8267, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.194307327270508, + "rewards/margins": 4.819622993469238, + "rewards/rejected": -1.6253149509429932, + "step": 6250 + }, + { + "epoch": 1.56, + "grad_norm": 2.855320692062378, + "learning_rate": 7.774133132777836e-06, + "logits/chosen": -0.28267034888267517, + "logits/rejected": -0.4638369679450989, + "logps/chosen": -60.85023880004883, + "logps/rejected": -85.36602783203125, + "loss": 0.6117, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.35556697845459, + "rewards/margins": 5.517468452453613, + "rewards/rejected": -2.1619012355804443, + "step": 6251 + }, + { + "epoch": 1.56, + "grad_norm": 4.037677764892578, + "learning_rate": 7.773479183505028e-06, + "logits/chosen": -0.3195240795612335, + "logits/rejected": -0.44204968214035034, + "logps/chosen": -52.41653060913086, + "logps/rejected": -75.93183898925781, + "loss": 0.6205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.125170946121216, + "rewards/margins": 6.048535346984863, + "rewards/rejected": -2.9233639240264893, + "step": 6252 + }, + { + "epoch": 1.56, + "grad_norm": 5.513517379760742, + "learning_rate": 7.772825165696598e-06, + "logits/chosen": -0.3784026503562927, + "logits/rejected": -0.48124510049819946, + "logps/chosen": -55.78146743774414, + "logps/rejected": -71.40314483642578, + "loss": 0.8186, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8094749450683594, + "rewards/margins": 4.164043426513672, + "rewards/rejected": -1.354568600654602, + "step": 6253 + }, + { + "epoch": 1.56, + "grad_norm": 3.2156097888946533, + "learning_rate": 7.772171079368709e-06, + "logits/chosen": -0.383714497089386, + "logits/rejected": -0.488691121339798, + "logps/chosen": -62.955955505371094, + "logps/rejected": -79.0022201538086, + "loss": 0.7324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.109118938446045, + "rewards/margins": 5.123483657836914, + "rewards/rejected": -2.014364719390869, + "step": 6254 + }, + { + "epoch": 1.56, + "grad_norm": 4.194713115692139, + "learning_rate": 7.771516924537523e-06, + "logits/chosen": -0.2625764012336731, + "logits/rejected": -0.41796910762786865, + "logps/chosen": -74.66624450683594, + "logps/rejected": -70.12708282470703, + "loss": 0.7463, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9124560356140137, + "rewards/margins": 4.491029262542725, + "rewards/rejected": -1.578573226928711, + "step": 6255 + }, + { + "epoch": 1.57, + "grad_norm": 6.929274559020996, + "learning_rate": 7.770862701219207e-06, + "logits/chosen": -0.3174944818019867, + "logits/rejected": -0.4369848966598511, + "logps/chosen": -62.51082992553711, + "logps/rejected": -85.95207214355469, + "loss": 0.6594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0419914722442627, + "rewards/margins": 5.791652679443359, + "rewards/rejected": -2.7496612071990967, + "step": 6256 + }, + { + "epoch": 1.57, + "grad_norm": 7.347066402435303, + "learning_rate": 7.770208409429925e-06, + "logits/chosen": -0.3877699375152588, + "logits/rejected": -0.4657527506351471, + "logps/chosen": -51.944583892822266, + "logps/rejected": -84.49925231933594, + "loss": 0.6921, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7204713821411133, + "rewards/margins": 4.375954627990723, + "rewards/rejected": -1.655483365058899, + "step": 6257 + }, + { + "epoch": 1.57, + "grad_norm": 14.95108413696289, + "learning_rate": 7.769554049185848e-06, + "logits/chosen": -0.3534609079360962, + "logits/rejected": -0.3714272677898407, + "logps/chosen": -48.13129806518555, + "logps/rejected": -95.68943786621094, + "loss": 0.8, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.944711446762085, + "rewards/margins": 4.734613418579102, + "rewards/rejected": -1.789901852607727, + "step": 6258 + }, + { + "epoch": 1.57, + "grad_norm": 3.1411614418029785, + "learning_rate": 7.768899620503145e-06, + "logits/chosen": -0.27267178893089294, + "logits/rejected": -0.4087488651275635, + "logps/chosen": -61.473350524902344, + "logps/rejected": -72.1795425415039, + "loss": 0.6761, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.221158027648926, + "rewards/margins": 5.265951156616211, + "rewards/rejected": -2.0447933673858643, + "step": 6259 + }, + { + "epoch": 1.57, + "grad_norm": 3.0007524490356445, + "learning_rate": 7.768245123397987e-06, + "logits/chosen": -0.38493773341178894, + "logits/rejected": -0.453739732503891, + "logps/chosen": -47.04557418823242, + "logps/rejected": -90.54299926757812, + "loss": 0.6058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9760282039642334, + "rewards/margins": 5.040772914886475, + "rewards/rejected": -2.0647451877593994, + "step": 6260 + }, + { + "epoch": 1.57, + "grad_norm": 3.48909854888916, + "learning_rate": 7.767590557886546e-06, + "logits/chosen": -0.3982689678668976, + "logits/rejected": -0.45068755745887756, + "logps/chosen": -66.50413513183594, + "logps/rejected": -82.84844970703125, + "loss": 0.7343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.101891040802002, + "rewards/margins": 3.568035364151001, + "rewards/rejected": -0.4661444425582886, + "step": 6261 + }, + { + "epoch": 1.57, + "grad_norm": 9.69159984588623, + "learning_rate": 7.766935923985e-06, + "logits/chosen": -0.3530934751033783, + "logits/rejected": -0.5015802383422852, + "logps/chosen": -50.49934387207031, + "logps/rejected": -80.83273315429688, + "loss": 0.6984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0819993019104004, + "rewards/margins": 5.359745025634766, + "rewards/rejected": -2.277745008468628, + "step": 6262 + }, + { + "epoch": 1.57, + "grad_norm": 3.4513635635375977, + "learning_rate": 7.766281221709524e-06, + "logits/chosen": -0.3334070146083832, + "logits/rejected": -0.44736579060554504, + "logps/chosen": -60.35874938964844, + "logps/rejected": -78.29110717773438, + "loss": 0.6621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2081425189971924, + "rewards/margins": 5.282165050506592, + "rewards/rejected": -2.074021816253662, + "step": 6263 + }, + { + "epoch": 1.57, + "grad_norm": 3.3815886974334717, + "learning_rate": 7.765626451076295e-06, + "logits/chosen": -0.3153051733970642, + "logits/rejected": -0.4478021562099457, + "logps/chosen": -48.46751403808594, + "logps/rejected": -84.33039855957031, + "loss": 0.6178, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9512524604797363, + "rewards/margins": 5.761054992675781, + "rewards/rejected": -2.8098018169403076, + "step": 6264 + }, + { + "epoch": 1.57, + "grad_norm": 6.452925682067871, + "learning_rate": 7.764971612101497e-06, + "logits/chosen": -0.315706729888916, + "logits/rejected": -0.434032678604126, + "logps/chosen": -48.777069091796875, + "logps/rejected": -79.11531829833984, + "loss": 0.7085, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.030917167663574, + "rewards/margins": 4.907105445861816, + "rewards/rejected": -1.8761881589889526, + "step": 6265 + }, + { + "epoch": 1.57, + "grad_norm": 3.4558632373809814, + "learning_rate": 7.76431670480131e-06, + "logits/chosen": -0.4473714232444763, + "logits/rejected": -0.5468761920928955, + "logps/chosen": -41.96935272216797, + "logps/rejected": -96.48627471923828, + "loss": 0.6194, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0752532482147217, + "rewards/margins": 5.779141426086426, + "rewards/rejected": -2.703888416290283, + "step": 6266 + }, + { + "epoch": 1.57, + "grad_norm": 6.083073139190674, + "learning_rate": 7.763661729191916e-06, + "logits/chosen": -0.3470320701599121, + "logits/rejected": -0.3765600919723511, + "logps/chosen": -57.07863998413086, + "logps/rejected": -109.24665832519531, + "loss": 0.7337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.044290542602539, + "rewards/margins": 5.199601173400879, + "rewards/rejected": -2.15531063079834, + "step": 6267 + }, + { + "epoch": 1.57, + "grad_norm": 6.993319511413574, + "learning_rate": 7.7630066852895e-06, + "logits/chosen": -0.3551751673221588, + "logits/rejected": -0.49321264028549194, + "logps/chosen": -58.13749313354492, + "logps/rejected": -70.89249420166016, + "loss": 0.7832, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.91161847114563, + "rewards/margins": 4.229267597198486, + "rewards/rejected": -1.317649006843567, + "step": 6268 + }, + { + "epoch": 1.57, + "grad_norm": 16.183015823364258, + "learning_rate": 7.762351573110252e-06, + "logits/chosen": -0.3792099952697754, + "logits/rejected": -0.4837636947631836, + "logps/chosen": -52.002197265625, + "logps/rejected": -83.91991424560547, + "loss": 0.7722, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9442129135131836, + "rewards/margins": 4.982031345367432, + "rewards/rejected": -2.03781795501709, + "step": 6269 + }, + { + "epoch": 1.57, + "grad_norm": 7.969704627990723, + "learning_rate": 7.761696392670357e-06, + "logits/chosen": -0.33475959300994873, + "logits/rejected": -0.4073784351348877, + "logps/chosen": -55.43504333496094, + "logps/rejected": -85.3145751953125, + "loss": 0.7129, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9901156425476074, + "rewards/margins": 3.5071444511413574, + "rewards/rejected": -0.5170285701751709, + "step": 6270 + }, + { + "epoch": 1.57, + "grad_norm": 11.302234649658203, + "learning_rate": 7.761041143986005e-06, + "logits/chosen": -0.25838086009025574, + "logits/rejected": -0.3820967376232147, + "logps/chosen": -59.49571990966797, + "logps/rejected": -82.91793060302734, + "loss": 0.6945, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.983410120010376, + "rewards/margins": 4.869921684265137, + "rewards/rejected": -1.8865113258361816, + "step": 6271 + }, + { + "epoch": 1.57, + "grad_norm": 12.933039665222168, + "learning_rate": 7.760385827073393e-06, + "logits/chosen": -0.3828408122062683, + "logits/rejected": -0.4401262402534485, + "logps/chosen": -54.06745147705078, + "logps/rejected": -78.92507934570312, + "loss": 0.7774, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.653614044189453, + "rewards/margins": 4.019987106323242, + "rewards/rejected": -1.36637282371521, + "step": 6272 + }, + { + "epoch": 1.57, + "grad_norm": 5.531392574310303, + "learning_rate": 7.75973044194871e-06, + "logits/chosen": -0.3927030563354492, + "logits/rejected": -0.42130523920059204, + "logps/chosen": -58.971317291259766, + "logps/rejected": -94.29331970214844, + "loss": 0.721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8523149490356445, + "rewards/margins": 4.395207405090332, + "rewards/rejected": -1.5428919792175293, + "step": 6273 + }, + { + "epoch": 1.57, + "grad_norm": 6.482242584228516, + "learning_rate": 7.759074988628152e-06, + "logits/chosen": -0.3036803901195526, + "logits/rejected": -0.3690635859966278, + "logps/chosen": -63.80364990234375, + "logps/rejected": -84.799072265625, + "loss": 0.8369, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8609163761138916, + "rewards/margins": 4.638619422912598, + "rewards/rejected": -1.7777023315429688, + "step": 6274 + }, + { + "epoch": 1.57, + "grad_norm": 3.957326650619507, + "learning_rate": 7.758419467127915e-06, + "logits/chosen": -0.3881974220275879, + "logits/rejected": -0.5870946049690247, + "logps/chosen": -49.11555480957031, + "logps/rejected": -74.52227783203125, + "loss": 0.6087, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0102884769439697, + "rewards/margins": 6.281400680541992, + "rewards/rejected": -3.2711117267608643, + "step": 6275 + }, + { + "epoch": 1.57, + "grad_norm": 4.556542873382568, + "learning_rate": 7.757763877464201e-06, + "logits/chosen": -0.3661280572414398, + "logits/rejected": -0.5173336863517761, + "logps/chosen": -62.696109771728516, + "logps/rejected": -81.2383041381836, + "loss": 0.7044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8508288860321045, + "rewards/margins": 5.919366359710693, + "rewards/rejected": -3.0685369968414307, + "step": 6276 + }, + { + "epoch": 1.57, + "grad_norm": 5.358494758605957, + "learning_rate": 7.757108219653208e-06, + "logits/chosen": -0.2726077139377594, + "logits/rejected": -0.38935303688049316, + "logps/chosen": -55.857704162597656, + "logps/rejected": -105.48426818847656, + "loss": 0.7084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.200120687484741, + "rewards/margins": 4.705690383911133, + "rewards/rejected": -1.5055699348449707, + "step": 6277 + }, + { + "epoch": 1.57, + "grad_norm": 3.601670742034912, + "learning_rate": 7.756452493711137e-06, + "logits/chosen": -0.3718431890010834, + "logits/rejected": -0.4207697808742523, + "logps/chosen": -64.749267578125, + "logps/rejected": -95.01812744140625, + "loss": 0.7694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9575815200805664, + "rewards/margins": 4.047957420349121, + "rewards/rejected": -1.0903761386871338, + "step": 6278 + }, + { + "epoch": 1.57, + "grad_norm": 3.0369691848754883, + "learning_rate": 7.755796699654195e-06, + "logits/chosen": -0.3777603507041931, + "logits/rejected": -0.426411896944046, + "logps/chosen": -47.8852424621582, + "logps/rejected": -90.49927520751953, + "loss": 0.7288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.074427604675293, + "rewards/margins": 4.551604747772217, + "rewards/rejected": -1.4771769046783447, + "step": 6279 + }, + { + "epoch": 1.57, + "grad_norm": 5.242918968200684, + "learning_rate": 7.755140837498584e-06, + "logits/chosen": -0.2985687851905823, + "logits/rejected": -0.44295936822891235, + "logps/chosen": -61.97234344482422, + "logps/rejected": -79.70500183105469, + "loss": 0.6902, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.042884111404419, + "rewards/margins": 5.473410606384277, + "rewards/rejected": -2.4305262565612793, + "step": 6280 + }, + { + "epoch": 1.57, + "grad_norm": 2.7391929626464844, + "learning_rate": 7.754484907260513e-06, + "logits/chosen": -0.2685391902923584, + "logits/rejected": -0.47780218720436096, + "logps/chosen": -77.25331115722656, + "logps/rejected": -70.76750183105469, + "loss": 0.71, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8982505798339844, + "rewards/margins": 5.560378074645996, + "rewards/rejected": -2.6621272563934326, + "step": 6281 + }, + { + "epoch": 1.57, + "grad_norm": 5.963965892791748, + "learning_rate": 7.75382890895619e-06, + "logits/chosen": -0.37578922510147095, + "logits/rejected": -0.4307956099510193, + "logps/chosen": -57.37141036987305, + "logps/rejected": -94.9173355102539, + "loss": 0.8028, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.945347547531128, + "rewards/margins": 4.422420978546143, + "rewards/rejected": -1.4770734310150146, + "step": 6282 + }, + { + "epoch": 1.57, + "grad_norm": 5.498225212097168, + "learning_rate": 7.753172842601825e-06, + "logits/chosen": -0.3610623776912689, + "logits/rejected": -0.4906957149505615, + "logps/chosen": -57.639801025390625, + "logps/rejected": -76.53256225585938, + "loss": 0.8093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0725202560424805, + "rewards/margins": 3.858583927154541, + "rewards/rejected": -0.786063551902771, + "step": 6283 + }, + { + "epoch": 1.57, + "grad_norm": 6.788854122161865, + "learning_rate": 7.752516708213632e-06, + "logits/chosen": -0.33858174085617065, + "logits/rejected": -0.32138025760650635, + "logps/chosen": -59.48043441772461, + "logps/rejected": -99.12361145019531, + "loss": 0.8137, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1527085304260254, + "rewards/margins": 4.0848612785339355, + "rewards/rejected": -0.9321523904800415, + "step": 6284 + }, + { + "epoch": 1.57, + "grad_norm": 10.924221992492676, + "learning_rate": 7.751860505807822e-06, + "logits/chosen": -0.38484352827072144, + "logits/rejected": -0.4364904463291168, + "logps/chosen": -41.589111328125, + "logps/rejected": -85.45008850097656, + "loss": 0.7745, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0334279537200928, + "rewards/margins": 4.096309661865234, + "rewards/rejected": -1.0628814697265625, + "step": 6285 + }, + { + "epoch": 1.57, + "grad_norm": 4.593951225280762, + "learning_rate": 7.751204235400614e-06, + "logits/chosen": -0.32781779766082764, + "logits/rejected": -0.40607860684394836, + "logps/chosen": -56.7913932800293, + "logps/rejected": -82.80834197998047, + "loss": 0.7007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.111745834350586, + "rewards/margins": 4.833658218383789, + "rewards/rejected": -1.721911907196045, + "step": 6286 + }, + { + "epoch": 1.57, + "grad_norm": 6.307913303375244, + "learning_rate": 7.75054789700822e-06, + "logits/chosen": -0.2530023157596588, + "logits/rejected": -0.3594359755516052, + "logps/chosen": -64.6097412109375, + "logps/rejected": -85.99537658691406, + "loss": 0.7044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9950101375579834, + "rewards/margins": 4.722121238708496, + "rewards/rejected": -1.7271109819412231, + "step": 6287 + }, + { + "epoch": 1.57, + "grad_norm": 16.886442184448242, + "learning_rate": 7.749891490646864e-06, + "logits/chosen": -0.3803512752056122, + "logits/rejected": -0.496541291475296, + "logps/chosen": -56.38689041137695, + "logps/rejected": -79.0681381225586, + "loss": 0.8092, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9819774627685547, + "rewards/margins": 4.606244087219238, + "rewards/rejected": -1.6242663860321045, + "step": 6288 + }, + { + "epoch": 1.57, + "grad_norm": 4.612987041473389, + "learning_rate": 7.749235016332763e-06, + "logits/chosen": -0.3749160170555115, + "logits/rejected": -0.48214495182037354, + "logps/chosen": -46.57282638549805, + "logps/rejected": -76.11903381347656, + "loss": 0.6987, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9164650440216064, + "rewards/margins": 5.221561431884766, + "rewards/rejected": -2.3050966262817383, + "step": 6289 + }, + { + "epoch": 1.57, + "grad_norm": 3.9139246940612793, + "learning_rate": 7.748578474082142e-06, + "logits/chosen": -0.3027680516242981, + "logits/rejected": -0.4042016267776489, + "logps/chosen": -53.53247833251953, + "logps/rejected": -89.816162109375, + "loss": 0.7293, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.054234504699707, + "rewards/margins": 5.119507312774658, + "rewards/rejected": -2.0652730464935303, + "step": 6290 + }, + { + "epoch": 1.57, + "grad_norm": 4.295011520385742, + "learning_rate": 7.74792186391122e-06, + "logits/chosen": -0.3654269576072693, + "logits/rejected": -0.4350125193595886, + "logps/chosen": -53.92394256591797, + "logps/rejected": -71.43387603759766, + "loss": 0.7981, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.721757411956787, + "rewards/margins": 3.9115307331085205, + "rewards/rejected": -1.1897730827331543, + "step": 6291 + }, + { + "epoch": 1.57, + "grad_norm": 4.744585037231445, + "learning_rate": 7.74726518583623e-06, + "logits/chosen": -0.2879413962364197, + "logits/rejected": -0.38263848423957825, + "logps/chosen": -55.18878173828125, + "logps/rejected": -89.07211303710938, + "loss": 0.6788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0184197425842285, + "rewards/margins": 4.818332672119141, + "rewards/rejected": -1.7999132871627808, + "step": 6292 + }, + { + "epoch": 1.57, + "grad_norm": 1.6620126962661743, + "learning_rate": 7.746608439873395e-06, + "logits/chosen": -0.3402191698551178, + "logits/rejected": -0.49946916103363037, + "logps/chosen": -52.70722961425781, + "logps/rejected": -91.08677673339844, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115884780883789, + "rewards/margins": 5.914305686950684, + "rewards/rejected": -2.7984211444854736, + "step": 6293 + }, + { + "epoch": 1.57, + "grad_norm": 5.195455074310303, + "learning_rate": 7.74595162603894e-06, + "logits/chosen": -0.3827926218509674, + "logits/rejected": -0.4445372521877289, + "logps/chosen": -49.61650085449219, + "logps/rejected": -96.28206634521484, + "loss": 0.7491, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8296751976013184, + "rewards/margins": 4.607871055603027, + "rewards/rejected": -1.778195858001709, + "step": 6294 + }, + { + "epoch": 1.57, + "grad_norm": 3.5394093990325928, + "learning_rate": 7.745294744349105e-06, + "logits/chosen": -0.3206748366355896, + "logits/rejected": -0.4308243691921234, + "logps/chosen": -52.09873580932617, + "logps/rejected": -87.37725067138672, + "loss": 0.6734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1039867401123047, + "rewards/margins": 5.952610015869141, + "rewards/rejected": -2.848623037338257, + "step": 6295 + }, + { + "epoch": 1.58, + "grad_norm": 6.6160478591918945, + "learning_rate": 7.744637794820113e-06, + "logits/chosen": -0.39322495460510254, + "logits/rejected": -0.5039277076721191, + "logps/chosen": -52.07963562011719, + "logps/rejected": -77.20233917236328, + "loss": 0.792, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8747339248657227, + "rewards/margins": 5.120887279510498, + "rewards/rejected": -2.2461538314819336, + "step": 6296 + }, + { + "epoch": 1.58, + "grad_norm": 29.92538070678711, + "learning_rate": 7.743980777468202e-06, + "logits/chosen": -0.3607480525970459, + "logits/rejected": -0.4293091595172882, + "logps/chosen": -47.49660873413086, + "logps/rejected": -87.8835220336914, + "loss": 0.7283, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2756121158599854, + "rewards/margins": 4.413330078125, + "rewards/rejected": -1.1377174854278564, + "step": 6297 + }, + { + "epoch": 1.58, + "grad_norm": 7.6207661628723145, + "learning_rate": 7.74332369230961e-06, + "logits/chosen": -0.32078394293785095, + "logits/rejected": -0.38129788637161255, + "logps/chosen": -53.2324333190918, + "logps/rejected": -80.45773315429688, + "loss": 0.794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.786050796508789, + "rewards/margins": 4.155881881713867, + "rewards/rejected": -1.3698309659957886, + "step": 6298 + }, + { + "epoch": 1.58, + "grad_norm": 7.353314399719238, + "learning_rate": 7.742666539360569e-06, + "logits/chosen": -0.3742191195487976, + "logits/rejected": -0.5138331055641174, + "logps/chosen": -49.64087677001953, + "logps/rejected": -88.01945495605469, + "loss": 0.6324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7965095043182373, + "rewards/margins": 5.363564491271973, + "rewards/rejected": -2.5670547485351562, + "step": 6299 + }, + { + "epoch": 1.58, + "grad_norm": 3.2708499431610107, + "learning_rate": 7.742009318637323e-06, + "logits/chosen": -0.3677785396575928, + "logits/rejected": -0.4560157358646393, + "logps/chosen": -50.25766372680664, + "logps/rejected": -90.62464904785156, + "loss": 0.6564, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.970367908477783, + "rewards/margins": 5.7142558097839355, + "rewards/rejected": -2.7438876628875732, + "step": 6300 + }, + { + "epoch": 1.58, + "grad_norm": 4.478875637054443, + "learning_rate": 7.741352030156108e-06, + "logits/chosen": -0.3587677478790283, + "logits/rejected": -0.4310028553009033, + "logps/chosen": -53.156211853027344, + "logps/rejected": -82.99020385742188, + "loss": 0.7564, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9033961296081543, + "rewards/margins": 4.777408123016357, + "rewards/rejected": -1.874011754989624, + "step": 6301 + }, + { + "epoch": 1.58, + "grad_norm": 3.9964540004730225, + "learning_rate": 7.74069467393317e-06, + "logits/chosen": -0.3499690890312195, + "logits/rejected": -0.41784414649009705, + "logps/chosen": -68.39077758789062, + "logps/rejected": -89.90663146972656, + "loss": 0.8241, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.76332688331604, + "rewards/margins": 4.254371643066406, + "rewards/rejected": -1.4910449981689453, + "step": 6302 + }, + { + "epoch": 1.58, + "grad_norm": 3.798994541168213, + "learning_rate": 7.740037249984749e-06, + "logits/chosen": -0.2844771146774292, + "logits/rejected": -0.3030974268913269, + "logps/chosen": -50.783599853515625, + "logps/rejected": -83.20748901367188, + "loss": 0.6863, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3581035137176514, + "rewards/margins": 3.7141637802124023, + "rewards/rejected": -0.3560603857040405, + "step": 6303 + }, + { + "epoch": 1.58, + "grad_norm": 5.9662065505981445, + "learning_rate": 7.739379758327095e-06, + "logits/chosen": -0.29356691241264343, + "logits/rejected": -0.41246497631073, + "logps/chosen": -62.1683349609375, + "logps/rejected": -84.06015014648438, + "loss": 0.7603, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8489434719085693, + "rewards/margins": 5.052881717681885, + "rewards/rejected": -2.2039380073547363, + "step": 6304 + }, + { + "epoch": 1.58, + "grad_norm": 10.511528968811035, + "learning_rate": 7.738722198976453e-06, + "logits/chosen": -0.37905192375183105, + "logits/rejected": -0.47247329354286194, + "logps/chosen": -53.6134033203125, + "logps/rejected": -99.27357482910156, + "loss": 0.7565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9618637561798096, + "rewards/margins": 4.262204170227051, + "rewards/rejected": -1.300340175628662, + "step": 6305 + }, + { + "epoch": 1.58, + "grad_norm": 4.026598930358887, + "learning_rate": 7.738064571949073e-06, + "logits/chosen": -0.38216328620910645, + "logits/rejected": -0.47947704792022705, + "logps/chosen": -51.889869689941406, + "logps/rejected": -80.41167449951172, + "loss": 0.7284, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.083855152130127, + "rewards/margins": 5.2240190505981445, + "rewards/rejected": -2.1401638984680176, + "step": 6306 + }, + { + "epoch": 1.58, + "grad_norm": 2.534132480621338, + "learning_rate": 7.737406877261205e-06, + "logits/chosen": -0.37987056374549866, + "logits/rejected": -0.4790034294128418, + "logps/chosen": -59.55237579345703, + "logps/rejected": -94.58379364013672, + "loss": 0.616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.378288984298706, + "rewards/margins": 5.80107307434082, + "rewards/rejected": -2.422783613204956, + "step": 6307 + }, + { + "epoch": 1.58, + "grad_norm": 2.971933603286743, + "learning_rate": 7.7367491149291e-06, + "logits/chosen": -0.3445770740509033, + "logits/rejected": -0.4016929864883423, + "logps/chosen": -60.75413513183594, + "logps/rejected": -93.76082611083984, + "loss": 0.6801, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0552449226379395, + "rewards/margins": 5.058229446411133, + "rewards/rejected": -2.0029845237731934, + "step": 6308 + }, + { + "epoch": 1.58, + "grad_norm": 10.037843704223633, + "learning_rate": 7.736091284969016e-06, + "logits/chosen": -0.39554452896118164, + "logits/rejected": -0.5028043389320374, + "logps/chosen": -58.405826568603516, + "logps/rejected": -67.40342712402344, + "loss": 0.79, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8675854206085205, + "rewards/margins": 3.8963229656219482, + "rewards/rejected": -1.0287375450134277, + "step": 6309 + }, + { + "epoch": 1.58, + "grad_norm": 4.1868767738342285, + "learning_rate": 7.735433387397204e-06, + "logits/chosen": -0.4411466717720032, + "logits/rejected": -0.5139883160591125, + "logps/chosen": -45.204620361328125, + "logps/rejected": -92.37881469726562, + "loss": 0.7155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.210841178894043, + "rewards/margins": 5.09279727935791, + "rewards/rejected": -1.8819559812545776, + "step": 6310 + }, + { + "epoch": 1.58, + "grad_norm": 15.65140438079834, + "learning_rate": 7.734775422229925e-06, + "logits/chosen": -0.36063212156295776, + "logits/rejected": -0.46600592136383057, + "logps/chosen": -56.1636962890625, + "logps/rejected": -96.538818359375, + "loss": 0.7409, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1658706665039062, + "rewards/margins": 5.3876261711120605, + "rewards/rejected": -2.221755266189575, + "step": 6311 + }, + { + "epoch": 1.58, + "grad_norm": 4.600931644439697, + "learning_rate": 7.734117389483434e-06, + "logits/chosen": -0.35036924481391907, + "logits/rejected": -0.465481698513031, + "logps/chosen": -61.05356216430664, + "logps/rejected": -81.40068054199219, + "loss": 0.668, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7513275146484375, + "rewards/margins": 5.0672502517700195, + "rewards/rejected": -2.315922260284424, + "step": 6312 + }, + { + "epoch": 1.58, + "grad_norm": 3.9594359397888184, + "learning_rate": 7.733459289173996e-06, + "logits/chosen": -0.3075985014438629, + "logits/rejected": -0.34288376569747925, + "logps/chosen": -57.600643157958984, + "logps/rejected": -85.13668823242188, + "loss": 0.7105, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1705079078674316, + "rewards/margins": 4.553432464599609, + "rewards/rejected": -1.3829249143600464, + "step": 6313 + }, + { + "epoch": 1.58, + "grad_norm": 5.331576824188232, + "learning_rate": 7.73280112131787e-06, + "logits/chosen": -0.2799622118473053, + "logits/rejected": -0.43498390913009644, + "logps/chosen": -70.77938842773438, + "logps/rejected": -80.52130889892578, + "loss": 0.7441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.769493341445923, + "rewards/margins": 5.508556365966797, + "rewards/rejected": -2.739062786102295, + "step": 6314 + }, + { + "epoch": 1.58, + "grad_norm": 7.521226406097412, + "learning_rate": 7.732142885931324e-06, + "logits/chosen": -0.4049893915653229, + "logits/rejected": -0.5168671607971191, + "logps/chosen": -59.35698699951172, + "logps/rejected": -86.53980255126953, + "loss": 0.7682, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.906370162963867, + "rewards/margins": 5.219885349273682, + "rewards/rejected": -2.3135147094726562, + "step": 6315 + }, + { + "epoch": 1.58, + "grad_norm": 5.736645698547363, + "learning_rate": 7.731484583030621e-06, + "logits/chosen": -0.3198194205760956, + "logits/rejected": -0.4277636408805847, + "logps/chosen": -58.485042572021484, + "logps/rejected": -86.52586364746094, + "loss": 0.7098, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.914796829223633, + "rewards/margins": 4.814501762390137, + "rewards/rejected": -1.899704933166504, + "step": 6316 + }, + { + "epoch": 1.58, + "grad_norm": 3.8623619079589844, + "learning_rate": 7.730826212632028e-06, + "logits/chosen": -0.4410657286643982, + "logits/rejected": -0.5708088874816895, + "logps/chosen": -44.175811767578125, + "logps/rejected": -88.00579833984375, + "loss": 0.5635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0268235206604004, + "rewards/margins": 6.49392557144165, + "rewards/rejected": -3.467101812362671, + "step": 6317 + }, + { + "epoch": 1.58, + "grad_norm": 3.21751070022583, + "learning_rate": 7.730167774751813e-06, + "logits/chosen": -0.42972803115844727, + "logits/rejected": -0.5732912421226501, + "logps/chosen": -64.64427185058594, + "logps/rejected": -73.55888366699219, + "loss": 0.7087, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.538240432739258, + "rewards/margins": 5.056528568267822, + "rewards/rejected": -2.5182878971099854, + "step": 6318 + }, + { + "epoch": 1.58, + "grad_norm": 3.285139560699463, + "learning_rate": 7.72950926940625e-06, + "logits/chosen": -0.4089662432670593, + "logits/rejected": -0.5322189331054688, + "logps/chosen": -49.68669128417969, + "logps/rejected": -80.48989868164062, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.890493392944336, + "rewards/margins": 5.67080020904541, + "rewards/rejected": -2.780306577682495, + "step": 6319 + }, + { + "epoch": 1.58, + "grad_norm": 4.263320446014404, + "learning_rate": 7.72885069661161e-06, + "logits/chosen": -0.33046332001686096, + "logits/rejected": -0.363657683134079, + "logps/chosen": -56.851348876953125, + "logps/rejected": -89.8049087524414, + "loss": 0.7083, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.751607894897461, + "rewards/margins": 4.2029805183410645, + "rewards/rejected": -1.451372504234314, + "step": 6320 + }, + { + "epoch": 1.58, + "grad_norm": 2.4135758876800537, + "learning_rate": 7.728192056384166e-06, + "logits/chosen": -0.3561161160469055, + "logits/rejected": -0.44654572010040283, + "logps/chosen": -50.632476806640625, + "logps/rejected": -94.93885040283203, + "loss": 0.6042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.91960072517395, + "rewards/margins": 6.2748517990112305, + "rewards/rejected": -3.3552515506744385, + "step": 6321 + }, + { + "epoch": 1.58, + "grad_norm": 3.9538142681121826, + "learning_rate": 7.727533348740195e-06, + "logits/chosen": -0.3257915675640106, + "logits/rejected": -0.438604474067688, + "logps/chosen": -53.7602424621582, + "logps/rejected": -82.09114837646484, + "loss": 0.7083, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.083897590637207, + "rewards/margins": 5.27173376083374, + "rewards/rejected": -2.1878364086151123, + "step": 6322 + }, + { + "epoch": 1.58, + "grad_norm": 7.286410331726074, + "learning_rate": 7.726874573695972e-06, + "logits/chosen": -0.4528713822364807, + "logits/rejected": -0.5458012819290161, + "logps/chosen": -59.14128875732422, + "logps/rejected": -84.6438980102539, + "loss": 0.7939, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9664242267608643, + "rewards/margins": 5.155847072601318, + "rewards/rejected": -2.189423084259033, + "step": 6323 + }, + { + "epoch": 1.58, + "grad_norm": 5.924346923828125, + "learning_rate": 7.72621573126778e-06, + "logits/chosen": -0.38152623176574707, + "logits/rejected": -0.3914187550544739, + "logps/chosen": -60.36326217651367, + "logps/rejected": -102.13544464111328, + "loss": 0.7474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1212215423583984, + "rewards/margins": 4.60158109664917, + "rewards/rejected": -1.480359435081482, + "step": 6324 + }, + { + "epoch": 1.58, + "grad_norm": 12.479934692382812, + "learning_rate": 7.725556821471897e-06, + "logits/chosen": -0.34565168619155884, + "logits/rejected": -0.4415595531463623, + "logps/chosen": -58.11812973022461, + "logps/rejected": -83.87554931640625, + "loss": 0.82, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.77712345123291, + "rewards/margins": 4.296731948852539, + "rewards/rejected": -1.5196086168289185, + "step": 6325 + }, + { + "epoch": 1.58, + "grad_norm": 13.93596363067627, + "learning_rate": 7.724897844324606e-06, + "logits/chosen": -0.3920896649360657, + "logits/rejected": -0.4512084126472473, + "logps/chosen": -52.673091888427734, + "logps/rejected": -84.5502700805664, + "loss": 0.7369, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8635363578796387, + "rewards/margins": 4.648735523223877, + "rewards/rejected": -1.7851994037628174, + "step": 6326 + }, + { + "epoch": 1.58, + "grad_norm": 3.5129058361053467, + "learning_rate": 7.724238799842192e-06, + "logits/chosen": -0.3545016348361969, + "logits/rejected": -0.413957804441452, + "logps/chosen": -48.22865676879883, + "logps/rejected": -85.92584228515625, + "loss": 0.5726, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9641075134277344, + "rewards/margins": 5.083550930023193, + "rewards/rejected": -2.119443416595459, + "step": 6327 + }, + { + "epoch": 1.58, + "grad_norm": 4.091187953948975, + "learning_rate": 7.723579688040937e-06, + "logits/chosen": -0.421477347612381, + "logits/rejected": -0.4313187897205353, + "logps/chosen": -52.78806686401367, + "logps/rejected": -84.979248046875, + "loss": 0.7757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.271819591522217, + "rewards/margins": 4.1959228515625, + "rewards/rejected": -0.9241030812263489, + "step": 6328 + }, + { + "epoch": 1.58, + "grad_norm": 7.946497440338135, + "learning_rate": 7.722920508937132e-06, + "logits/chosen": -0.32546934485435486, + "logits/rejected": -0.4221034646034241, + "logps/chosen": -65.62702178955078, + "logps/rejected": -94.27979278564453, + "loss": 0.7795, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7936971187591553, + "rewards/margins": 5.177524566650391, + "rewards/rejected": -2.383826971054077, + "step": 6329 + }, + { + "epoch": 1.58, + "grad_norm": 8.001672744750977, + "learning_rate": 7.722261262547066e-06, + "logits/chosen": -0.3426901698112488, + "logits/rejected": -0.4776502549648285, + "logps/chosen": -63.99165725708008, + "logps/rejected": -85.73145294189453, + "loss": 0.7623, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7712841033935547, + "rewards/margins": 4.690977573394775, + "rewards/rejected": -1.9196938276290894, + "step": 6330 + }, + { + "epoch": 1.58, + "grad_norm": 4.012503623962402, + "learning_rate": 7.721601948887028e-06, + "logits/chosen": -0.37680572271347046, + "logits/rejected": -0.4665822982788086, + "logps/chosen": -56.742828369140625, + "logps/rejected": -86.77928924560547, + "loss": 0.7437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.144002914428711, + "rewards/margins": 5.155335426330566, + "rewards/rejected": -2.0113332271575928, + "step": 6331 + }, + { + "epoch": 1.58, + "grad_norm": 6.148758411407471, + "learning_rate": 7.72094256797331e-06, + "logits/chosen": -0.33656325936317444, + "logits/rejected": -0.4184684753417969, + "logps/chosen": -56.97608184814453, + "logps/rejected": -108.10810852050781, + "loss": 0.6234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.705402135848999, + "rewards/margins": 5.294825077056885, + "rewards/rejected": -2.589423179626465, + "step": 6332 + }, + { + "epoch": 1.58, + "grad_norm": 3.851125717163086, + "learning_rate": 7.720283119822209e-06, + "logits/chosen": -0.32009202241897583, + "logits/rejected": -0.4370166063308716, + "logps/chosen": -60.14056396484375, + "logps/rejected": -83.66588592529297, + "loss": 0.6588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249566078186035, + "rewards/margins": 5.35086727142334, + "rewards/rejected": -2.1013011932373047, + "step": 6333 + }, + { + "epoch": 1.58, + "grad_norm": 10.126383781433105, + "learning_rate": 7.719623604450019e-06, + "logits/chosen": -0.32636359333992004, + "logits/rejected": -0.4200662076473236, + "logps/chosen": -60.41895294189453, + "logps/rejected": -116.8200912475586, + "loss": 0.7764, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.875472068786621, + "rewards/margins": 6.256946086883545, + "rewards/rejected": -3.381474494934082, + "step": 6334 + }, + { + "epoch": 1.58, + "grad_norm": 7.532055854797363, + "learning_rate": 7.718964021873035e-06, + "logits/chosen": -0.3334498703479767, + "logits/rejected": -0.41395753622055054, + "logps/chosen": -64.73007202148438, + "logps/rejected": -91.30606079101562, + "loss": 0.7338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7478792667388916, + "rewards/margins": 4.651391983032227, + "rewards/rejected": -1.9035128355026245, + "step": 6335 + }, + { + "epoch": 1.59, + "grad_norm": 8.91258430480957, + "learning_rate": 7.718304372107558e-06, + "logits/chosen": -0.4368777275085449, + "logits/rejected": -0.5187346935272217, + "logps/chosen": -66.7214584350586, + "logps/rejected": -87.32084655761719, + "loss": 0.8691, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.790576934814453, + "rewards/margins": 4.68109655380249, + "rewards/rejected": -1.890519618988037, + "step": 6336 + }, + { + "epoch": 1.59, + "grad_norm": 7.219736099243164, + "learning_rate": 7.717644655169889e-06, + "logits/chosen": -0.3249683082103729, + "logits/rejected": -0.42521634697914124, + "logps/chosen": -61.02330017089844, + "logps/rejected": -85.6846923828125, + "loss": 0.7326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.91622257232666, + "rewards/margins": 5.295672416687012, + "rewards/rejected": -2.3794498443603516, + "step": 6337 + }, + { + "epoch": 1.59, + "grad_norm": 18.993331909179688, + "learning_rate": 7.716984871076332e-06, + "logits/chosen": -0.34789806604385376, + "logits/rejected": -0.44310262799263, + "logps/chosen": -61.897274017333984, + "logps/rejected": -92.9560317993164, + "loss": 0.8387, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6849184036254883, + "rewards/margins": 5.093411445617676, + "rewards/rejected": -2.4084935188293457, + "step": 6338 + }, + { + "epoch": 1.59, + "grad_norm": 8.416218757629395, + "learning_rate": 7.716325019843186e-06, + "logits/chosen": -0.4171035885810852, + "logits/rejected": -0.49895966053009033, + "logps/chosen": -63.31598663330078, + "logps/rejected": -97.1290283203125, + "loss": 0.6736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8842387199401855, + "rewards/margins": 5.613135814666748, + "rewards/rejected": -2.7288970947265625, + "step": 6339 + }, + { + "epoch": 1.59, + "grad_norm": 5.328458786010742, + "learning_rate": 7.715665101486762e-06, + "logits/chosen": -0.3668832778930664, + "logits/rejected": -0.443447470664978, + "logps/chosen": -62.077152252197266, + "logps/rejected": -79.94566345214844, + "loss": 0.845, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.7895233631134033, + "rewards/margins": 4.025123596191406, + "rewards/rejected": -1.2355999946594238, + "step": 6340 + }, + { + "epoch": 1.59, + "grad_norm": 5.302030086517334, + "learning_rate": 7.715005116023365e-06, + "logits/chosen": -0.4084186255931854, + "logits/rejected": -0.4649006724357605, + "logps/chosen": -60.147281646728516, + "logps/rejected": -89.15066528320312, + "loss": 0.705, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.951831102371216, + "rewards/margins": 5.408624172210693, + "rewards/rejected": -2.4567933082580566, + "step": 6341 + }, + { + "epoch": 1.59, + "grad_norm": 5.182696342468262, + "learning_rate": 7.7143450634693e-06, + "logits/chosen": -0.34121277928352356, + "logits/rejected": -0.43057578802108765, + "logps/chosen": -54.18623352050781, + "logps/rejected": -82.21577453613281, + "loss": 0.6542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.903660297393799, + "rewards/margins": 4.790146827697754, + "rewards/rejected": -1.8864867687225342, + "step": 6342 + }, + { + "epoch": 1.59, + "grad_norm": 5.03875732421875, + "learning_rate": 7.713684943840885e-06, + "logits/chosen": -0.46165597438812256, + "logits/rejected": -0.521172046661377, + "logps/chosen": -72.4707260131836, + "logps/rejected": -84.58024597167969, + "loss": 0.8853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.906252145767212, + "rewards/margins": 5.4482741355896, + "rewards/rejected": -2.542022466659546, + "step": 6343 + }, + { + "epoch": 1.59, + "grad_norm": 2.4564175605773926, + "learning_rate": 7.713024757154426e-06, + "logits/chosen": -0.30206459760665894, + "logits/rejected": -0.4096885025501251, + "logps/chosen": -53.89638900756836, + "logps/rejected": -101.6480484008789, + "loss": 0.6214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9064064025878906, + "rewards/margins": 6.155433177947998, + "rewards/rejected": -3.2490267753601074, + "step": 6344 + }, + { + "epoch": 1.59, + "grad_norm": 6.032115936279297, + "learning_rate": 7.712364503426242e-06, + "logits/chosen": -0.3712090849876404, + "logits/rejected": -0.44642937183380127, + "logps/chosen": -56.565147399902344, + "logps/rejected": -83.30217742919922, + "loss": 0.8181, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7848496437072754, + "rewards/margins": 5.094567775726318, + "rewards/rejected": -2.309718132019043, + "step": 6345 + }, + { + "epoch": 1.59, + "grad_norm": 5.003164768218994, + "learning_rate": 7.711704182672645e-06, + "logits/chosen": -0.3870793282985687, + "logits/rejected": -0.4533117711544037, + "logps/chosen": -56.967952728271484, + "logps/rejected": -92.14827728271484, + "loss": 0.681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8862640857696533, + "rewards/margins": 4.442775726318359, + "rewards/rejected": -1.5565115213394165, + "step": 6346 + }, + { + "epoch": 1.59, + "grad_norm": 5.97713041305542, + "learning_rate": 7.711043794909954e-06, + "logits/chosen": -0.36501359939575195, + "logits/rejected": -0.47743499279022217, + "logps/chosen": -50.92992401123047, + "logps/rejected": -86.04617309570312, + "loss": 0.7333, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8086817264556885, + "rewards/margins": 5.2875518798828125, + "rewards/rejected": -2.478870391845703, + "step": 6347 + }, + { + "epoch": 1.59, + "grad_norm": 3.162867546081543, + "learning_rate": 7.710383340154486e-06, + "logits/chosen": -0.4020707309246063, + "logits/rejected": -0.49735724925994873, + "logps/chosen": -51.90544128417969, + "logps/rejected": -79.29481506347656, + "loss": 0.6288, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.736056089401245, + "rewards/margins": 4.811450958251953, + "rewards/rejected": -2.075394630432129, + "step": 6348 + }, + { + "epoch": 1.59, + "grad_norm": 6.479492664337158, + "learning_rate": 7.709722818422563e-06, + "logits/chosen": -0.32921433448791504, + "logits/rejected": -0.42058536410331726, + "logps/chosen": -51.054656982421875, + "logps/rejected": -83.70979309082031, + "loss": 0.697, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7673401832580566, + "rewards/margins": 5.4248504638671875, + "rewards/rejected": -2.65751051902771, + "step": 6349 + }, + { + "epoch": 1.59, + "grad_norm": 3.28519606590271, + "learning_rate": 7.709062229730509e-06, + "logits/chosen": -0.34178897738456726, + "logits/rejected": -0.3815775513648987, + "logps/chosen": -45.634361267089844, + "logps/rejected": -96.46121978759766, + "loss": 0.6425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.905585527420044, + "rewards/margins": 4.892120838165283, + "rewards/rejected": -1.986534833908081, + "step": 6350 + }, + { + "epoch": 1.59, + "grad_norm": 6.41204309463501, + "learning_rate": 7.708401574094642e-06, + "logits/chosen": -0.35902631282806396, + "logits/rejected": -0.47691839933395386, + "logps/chosen": -52.694793701171875, + "logps/rejected": -86.45624542236328, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2067487239837646, + "rewards/margins": 5.5806965827941895, + "rewards/rejected": -2.373947858810425, + "step": 6351 + }, + { + "epoch": 1.59, + "grad_norm": 8.42216682434082, + "learning_rate": 7.707740851531295e-06, + "logits/chosen": -0.4138517677783966, + "logits/rejected": -0.47681403160095215, + "logps/chosen": -67.6042709350586, + "logps/rejected": -91.40166473388672, + "loss": 0.81, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7356436252593994, + "rewards/margins": 4.312984466552734, + "rewards/rejected": -1.5773407220840454, + "step": 6352 + }, + { + "epoch": 1.59, + "grad_norm": 2.5524590015411377, + "learning_rate": 7.70708006205679e-06, + "logits/chosen": -0.4213850200176239, + "logits/rejected": -0.5115705728530884, + "logps/chosen": -50.465728759765625, + "logps/rejected": -96.92843627929688, + "loss": 0.6881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972149610519409, + "rewards/margins": 6.949294567108154, + "rewards/rejected": -3.977144718170166, + "step": 6353 + }, + { + "epoch": 1.59, + "grad_norm": 4.6055989265441895, + "learning_rate": 7.706419205687457e-06, + "logits/chosen": -0.3830357491970062, + "logits/rejected": -0.4862871766090393, + "logps/chosen": -50.74851989746094, + "logps/rejected": -84.0052719116211, + "loss": 0.679, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168064832687378, + "rewards/margins": 5.883976459503174, + "rewards/rejected": -2.715911626815796, + "step": 6354 + }, + { + "epoch": 1.59, + "grad_norm": 3.499211311340332, + "learning_rate": 7.705758282439626e-06, + "logits/chosen": -0.3637736141681671, + "logits/rejected": -0.4935949146747589, + "logps/chosen": -64.78771209716797, + "logps/rejected": -91.42159271240234, + "loss": 0.6747, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9831583499908447, + "rewards/margins": 5.604142665863037, + "rewards/rejected": -2.6209843158721924, + "step": 6355 + }, + { + "epoch": 1.59, + "grad_norm": 5.315962314605713, + "learning_rate": 7.705097292329632e-06, + "logits/chosen": -0.44307589530944824, + "logits/rejected": -0.5142638683319092, + "logps/chosen": -42.690067291259766, + "logps/rejected": -93.33781433105469, + "loss": 0.6027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.198885440826416, + "rewards/margins": 6.855778217315674, + "rewards/rejected": -3.6568922996520996, + "step": 6356 + }, + { + "epoch": 1.59, + "grad_norm": 3.198619842529297, + "learning_rate": 7.704436235373807e-06, + "logits/chosen": -0.3949580788612366, + "logits/rejected": -0.47071966528892517, + "logps/chosen": -53.47201156616211, + "logps/rejected": -81.55023193359375, + "loss": 0.7463, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.374084711074829, + "rewards/margins": 5.017749786376953, + "rewards/rejected": -1.643664836883545, + "step": 6357 + }, + { + "epoch": 1.59, + "grad_norm": 7.2964935302734375, + "learning_rate": 7.703775111588484e-06, + "logits/chosen": -0.3710208833217621, + "logits/rejected": -0.4115319848060608, + "logps/chosen": -57.59099197387695, + "logps/rejected": -81.67328643798828, + "loss": 0.8847, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8289241790771484, + "rewards/margins": 4.027413845062256, + "rewards/rejected": -1.1984894275665283, + "step": 6358 + }, + { + "epoch": 1.59, + "grad_norm": 3.635830879211426, + "learning_rate": 7.703113920990002e-06, + "logits/chosen": -0.3906334936618805, + "logits/rejected": -0.4825008511543274, + "logps/chosen": -43.026981353759766, + "logps/rejected": -85.30586242675781, + "loss": 0.5839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1010870933532715, + "rewards/margins": 5.724350929260254, + "rewards/rejected": -2.6232638359069824, + "step": 6359 + }, + { + "epoch": 1.59, + "grad_norm": 4.144906997680664, + "learning_rate": 7.702452663594701e-06, + "logits/chosen": -0.3841497004032135, + "logits/rejected": -0.3852924704551697, + "logps/chosen": -50.09218978881836, + "logps/rejected": -106.07672882080078, + "loss": 0.6073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9467246532440186, + "rewards/margins": 5.429407119750977, + "rewards/rejected": -2.482682704925537, + "step": 6360 + }, + { + "epoch": 1.59, + "grad_norm": 5.448407173156738, + "learning_rate": 7.701791339418921e-06, + "logits/chosen": -0.32590293884277344, + "logits/rejected": -0.43747812509536743, + "logps/chosen": -55.9543571472168, + "logps/rejected": -77.89643859863281, + "loss": 0.7054, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.967102527618408, + "rewards/margins": 5.175650596618652, + "rewards/rejected": -2.208547830581665, + "step": 6361 + }, + { + "epoch": 1.59, + "grad_norm": 4.69959020614624, + "learning_rate": 7.701129948479001e-06, + "logits/chosen": -0.3608403503894806, + "logits/rejected": -0.413740873336792, + "logps/chosen": -56.83689498901367, + "logps/rejected": -116.24586486816406, + "loss": 0.7387, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8021368980407715, + "rewards/margins": 5.168238162994385, + "rewards/rejected": -2.3661017417907715, + "step": 6362 + }, + { + "epoch": 1.59, + "grad_norm": 5.419337272644043, + "learning_rate": 7.700468490791287e-06, + "logits/chosen": -0.3755471110343933, + "logits/rejected": -0.48905548453330994, + "logps/chosen": -54.87245559692383, + "logps/rejected": -81.44462585449219, + "loss": 0.7605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8882884979248047, + "rewards/margins": 4.334343433380127, + "rewards/rejected": -1.4460541009902954, + "step": 6363 + }, + { + "epoch": 1.59, + "grad_norm": 3.83467960357666, + "learning_rate": 7.699806966372126e-06, + "logits/chosen": -0.3625965118408203, + "logits/rejected": -0.4757583737373352, + "logps/chosen": -54.36113357543945, + "logps/rejected": -84.6478500366211, + "loss": 0.6326, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.809004545211792, + "rewards/margins": 5.091808795928955, + "rewards/rejected": -2.282804012298584, + "step": 6364 + }, + { + "epoch": 1.59, + "grad_norm": 5.921377182006836, + "learning_rate": 7.699145375237863e-06, + "logits/chosen": -0.31374311447143555, + "logits/rejected": -0.4035792648792267, + "logps/chosen": -64.27445983886719, + "logps/rejected": -84.5063705444336, + "loss": 0.6858, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1458489894866943, + "rewards/margins": 5.44553279876709, + "rewards/rejected": -2.2996838092803955, + "step": 6365 + }, + { + "epoch": 1.59, + "grad_norm": 4.157569885253906, + "learning_rate": 7.698483717404846e-06, + "logits/chosen": -0.3494187891483307, + "logits/rejected": -0.436503529548645, + "logps/chosen": -51.94758605957031, + "logps/rejected": -83.48174285888672, + "loss": 0.6528, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0923540592193604, + "rewards/margins": 5.530369281768799, + "rewards/rejected": -2.438014507293701, + "step": 6366 + }, + { + "epoch": 1.59, + "grad_norm": 7.165525436401367, + "learning_rate": 7.697821992889426e-06, + "logits/chosen": -0.4337906837463379, + "logits/rejected": -0.4775039553642273, + "logps/chosen": -52.38630676269531, + "logps/rejected": -100.90723419189453, + "loss": 0.806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7294223308563232, + "rewards/margins": 5.428057670593262, + "rewards/rejected": -2.6986348628997803, + "step": 6367 + }, + { + "epoch": 1.59, + "grad_norm": 8.394867897033691, + "learning_rate": 7.697160201707956e-06, + "logits/chosen": -0.350572407245636, + "logits/rejected": -0.45923516154289246, + "logps/chosen": -62.09211349487305, + "logps/rejected": -99.91268920898438, + "loss": 0.8217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6285400390625, + "rewards/margins": 4.5690083503723145, + "rewards/rejected": -1.9404685497283936, + "step": 6368 + }, + { + "epoch": 1.59, + "grad_norm": 4.414074897766113, + "learning_rate": 7.696498343876787e-06, + "logits/chosen": -0.39696213603019714, + "logits/rejected": -0.48345014452934265, + "logps/chosen": -49.11638259887695, + "logps/rejected": -73.13095092773438, + "loss": 0.611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0937306880950928, + "rewards/margins": 5.387696743011475, + "rewards/rejected": -2.2939658164978027, + "step": 6369 + }, + { + "epoch": 1.59, + "grad_norm": 3.9587619304656982, + "learning_rate": 7.695836419412277e-06, + "logits/chosen": -0.4286864399909973, + "logits/rejected": -0.4350527822971344, + "logps/chosen": -53.69879913330078, + "logps/rejected": -103.62525177001953, + "loss": 0.7796, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.866133689880371, + "rewards/margins": 5.65052604675293, + "rewards/rejected": -2.7843925952911377, + "step": 6370 + }, + { + "epoch": 1.59, + "grad_norm": 4.873988628387451, + "learning_rate": 7.695174428330783e-06, + "logits/chosen": -0.45053261518478394, + "logits/rejected": -0.5099471211433411, + "logps/chosen": -48.10790252685547, + "logps/rejected": -81.62714385986328, + "loss": 0.6268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7931759357452393, + "rewards/margins": 4.634322166442871, + "rewards/rejected": -1.8411465883255005, + "step": 6371 + }, + { + "epoch": 1.59, + "grad_norm": 3.8915886878967285, + "learning_rate": 7.69451237064866e-06, + "logits/chosen": -0.2913684844970703, + "logits/rejected": -0.4480942487716675, + "logps/chosen": -65.19017028808594, + "logps/rejected": -92.1474609375, + "loss": 0.6754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7224931716918945, + "rewards/margins": 6.2753801345825195, + "rewards/rejected": -3.552887201309204, + "step": 6372 + }, + { + "epoch": 1.59, + "grad_norm": 3.958008050918579, + "learning_rate": 7.693850246382271e-06, + "logits/chosen": -0.3643589913845062, + "logits/rejected": -0.43845170736312866, + "logps/chosen": -74.45247650146484, + "logps/rejected": -86.26569366455078, + "loss": 0.7154, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3160130977630615, + "rewards/margins": 4.6397857666015625, + "rewards/rejected": -1.3237723112106323, + "step": 6373 + }, + { + "epoch": 1.59, + "grad_norm": 4.664658069610596, + "learning_rate": 7.693188055547977e-06, + "logits/chosen": -0.3396717607975006, + "logits/rejected": -0.4257071018218994, + "logps/chosen": -56.193817138671875, + "logps/rejected": -92.5999984741211, + "loss": 0.6827, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.702569007873535, + "rewards/margins": 4.771997451782227, + "rewards/rejected": -2.0694284439086914, + "step": 6374 + }, + { + "epoch": 1.59, + "grad_norm": 6.6591973304748535, + "learning_rate": 7.692525798162141e-06, + "logits/chosen": -0.40405622124671936, + "logits/rejected": -0.48519766330718994, + "logps/chosen": -47.41941452026367, + "logps/rejected": -83.371826171875, + "loss": 0.6436, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0129952430725098, + "rewards/margins": 4.621490955352783, + "rewards/rejected": -1.6084957122802734, + "step": 6375 + }, + { + "epoch": 1.6, + "grad_norm": 5.820013523101807, + "learning_rate": 7.69186347424113e-06, + "logits/chosen": -0.374454140663147, + "logits/rejected": -0.4586845934391022, + "logps/chosen": -60.30900192260742, + "logps/rejected": -88.2043228149414, + "loss": 0.6959, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8667800426483154, + "rewards/margins": 4.613114356994629, + "rewards/rejected": -1.7463339567184448, + "step": 6376 + }, + { + "epoch": 1.6, + "grad_norm": 7.605408668518066, + "learning_rate": 7.69120108380131e-06, + "logits/chosen": -0.36710625886917114, + "logits/rejected": -0.49369320273399353, + "logps/chosen": -65.57183074951172, + "logps/rejected": -73.35073852539062, + "loss": 0.9173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8327603340148926, + "rewards/margins": 4.923778533935547, + "rewards/rejected": -2.0910181999206543, + "step": 6377 + }, + { + "epoch": 1.6, + "grad_norm": 3.2722926139831543, + "learning_rate": 7.690538626859046e-06, + "logits/chosen": -0.34437185525894165, + "logits/rejected": -0.5566995739936829, + "logps/chosen": -57.951698303222656, + "logps/rejected": -76.87615203857422, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7949624061584473, + "rewards/margins": 6.7269392013549805, + "rewards/rejected": -3.931976795196533, + "step": 6378 + }, + { + "epoch": 1.6, + "grad_norm": 3.758802652359009, + "learning_rate": 7.68987610343071e-06, + "logits/chosen": -0.4065023958683014, + "logits/rejected": -0.49179625511169434, + "logps/chosen": -53.18242263793945, + "logps/rejected": -93.76837921142578, + "loss": 0.6175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9444353580474854, + "rewards/margins": 6.656901836395264, + "rewards/rejected": -3.7124664783477783, + "step": 6379 + }, + { + "epoch": 1.6, + "grad_norm": 5.149055004119873, + "learning_rate": 7.689213513532679e-06, + "logits/chosen": -0.3993593156337738, + "logits/rejected": -0.5276779532432556, + "logps/chosen": -55.68421173095703, + "logps/rejected": -88.9674072265625, + "loss": 0.6026, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1513290405273438, + "rewards/margins": 6.3630523681640625, + "rewards/rejected": -3.2117226123809814, + "step": 6380 + }, + { + "epoch": 1.6, + "grad_norm": 17.795007705688477, + "learning_rate": 7.688550857181318e-06, + "logits/chosen": -0.3967166543006897, + "logits/rejected": -0.39838963747024536, + "logps/chosen": -51.867122650146484, + "logps/rejected": -81.28016662597656, + "loss": 0.6797, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9748151302337646, + "rewards/margins": 4.5289201736450195, + "rewards/rejected": -1.5541050434112549, + "step": 6381 + }, + { + "epoch": 1.6, + "grad_norm": 8.065421104431152, + "learning_rate": 7.687888134393007e-06, + "logits/chosen": -0.35531213879585266, + "logits/rejected": -0.4546896517276764, + "logps/chosen": -63.941932678222656, + "logps/rejected": -98.0488052368164, + "loss": 0.742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5936906337738037, + "rewards/margins": 5.983267784118652, + "rewards/rejected": -3.3895773887634277, + "step": 6382 + }, + { + "epoch": 1.6, + "grad_norm": 7.207706451416016, + "learning_rate": 7.687225345184122e-06, + "logits/chosen": -0.3398524820804596, + "logits/rejected": -0.40823525190353394, + "logps/chosen": -84.48955535888672, + "logps/rejected": -83.18865966796875, + "loss": 0.8303, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7242228984832764, + "rewards/margins": 4.122584342956543, + "rewards/rejected": -1.3983608484268188, + "step": 6383 + }, + { + "epoch": 1.6, + "grad_norm": 5.234302043914795, + "learning_rate": 7.686562489571038e-06, + "logits/chosen": -0.34275126457214355, + "logits/rejected": -0.466461181640625, + "logps/chosen": -60.76719665527344, + "logps/rejected": -91.95059204101562, + "loss": 0.6697, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9337854385375977, + "rewards/margins": 6.136569023132324, + "rewards/rejected": -3.2027831077575684, + "step": 6384 + }, + { + "epoch": 1.6, + "grad_norm": 4.935095310211182, + "learning_rate": 7.68589956757014e-06, + "logits/chosen": -0.3776369094848633, + "logits/rejected": -0.45723146200180054, + "logps/chosen": -57.59944534301758, + "logps/rejected": -100.8875961303711, + "loss": 0.7402, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0444514751434326, + "rewards/margins": 5.951580047607422, + "rewards/rejected": -2.9071288108825684, + "step": 6385 + }, + { + "epoch": 1.6, + "grad_norm": 9.191817283630371, + "learning_rate": 7.685236579197806e-06, + "logits/chosen": -0.3628331422805786, + "logits/rejected": -0.41375625133514404, + "logps/chosen": -62.65583801269531, + "logps/rejected": -104.4281234741211, + "loss": 0.8283, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.907802104949951, + "rewards/margins": 5.675090789794922, + "rewards/rejected": -2.7672882080078125, + "step": 6386 + }, + { + "epoch": 1.6, + "grad_norm": 3.383268117904663, + "learning_rate": 7.68457352447042e-06, + "logits/chosen": -0.3037239611148834, + "logits/rejected": -0.4283078610897064, + "logps/chosen": -61.93279266357422, + "logps/rejected": -100.48149108886719, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.116241216659546, + "rewards/margins": 6.176342010498047, + "rewards/rejected": -3.060100793838501, + "step": 6387 + }, + { + "epoch": 1.6, + "grad_norm": 12.078412055969238, + "learning_rate": 7.683910403404366e-06, + "logits/chosen": -0.3136271834373474, + "logits/rejected": -0.39095285534858704, + "logps/chosen": -57.04288101196289, + "logps/rejected": -96.84821319580078, + "loss": 0.7203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.819664716720581, + "rewards/margins": 4.62691068649292, + "rewards/rejected": -1.807246446609497, + "step": 6388 + }, + { + "epoch": 1.6, + "grad_norm": 5.022266864776611, + "learning_rate": 7.683247216016032e-06, + "logits/chosen": -0.35496148467063904, + "logits/rejected": -0.4018247127532959, + "logps/chosen": -53.49156951904297, + "logps/rejected": -108.48251342773438, + "loss": 0.7273, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.833024024963379, + "rewards/margins": 4.9820170402526855, + "rewards/rejected": -2.1489930152893066, + "step": 6389 + }, + { + "epoch": 1.6, + "grad_norm": 8.19788932800293, + "learning_rate": 7.682583962321804e-06, + "logits/chosen": -0.3471457064151764, + "logits/rejected": -0.4096916913986206, + "logps/chosen": -52.48076629638672, + "logps/rejected": -88.37093353271484, + "loss": 0.7062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8638644218444824, + "rewards/margins": 5.562912940979004, + "rewards/rejected": -2.6990489959716797, + "step": 6390 + }, + { + "epoch": 1.6, + "grad_norm": 2.4919815063476562, + "learning_rate": 7.681920642338074e-06, + "logits/chosen": -0.31589826941490173, + "logits/rejected": -0.4325079321861267, + "logps/chosen": -60.176429748535156, + "logps/rejected": -85.5066146850586, + "loss": 0.6368, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8685812950134277, + "rewards/margins": 5.2833356857299805, + "rewards/rejected": -2.4147543907165527, + "step": 6391 + }, + { + "epoch": 1.6, + "grad_norm": 4.140427112579346, + "learning_rate": 7.681257256081231e-06, + "logits/chosen": -0.37023383378982544, + "logits/rejected": -0.5376865863800049, + "logps/chosen": -54.65187072753906, + "logps/rejected": -80.64076232910156, + "loss": 0.5986, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9908390045166016, + "rewards/margins": 5.676555156707764, + "rewards/rejected": -2.685715913772583, + "step": 6392 + }, + { + "epoch": 1.6, + "grad_norm": 6.553416728973389, + "learning_rate": 7.680593803567672e-06, + "logits/chosen": -0.36839938163757324, + "logits/rejected": -0.4207070767879486, + "logps/chosen": -59.74119567871094, + "logps/rejected": -79.1806411743164, + "loss": 0.8292, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8753762245178223, + "rewards/margins": 5.133354187011719, + "rewards/rejected": -2.2579782009124756, + "step": 6393 + }, + { + "epoch": 1.6, + "grad_norm": 13.948710441589355, + "learning_rate": 7.679930284813788e-06, + "logits/chosen": -0.4004247188568115, + "logits/rejected": -0.5770542621612549, + "logps/chosen": -68.55589294433594, + "logps/rejected": -73.2655029296875, + "loss": 0.7857, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.711118698120117, + "rewards/margins": 4.4477620124816895, + "rewards/rejected": -1.7366429567337036, + "step": 6394 + }, + { + "epoch": 1.6, + "grad_norm": 7.750741481781006, + "learning_rate": 7.679266699835975e-06, + "logits/chosen": -0.3581394553184509, + "logits/rejected": -0.44805067777633667, + "logps/chosen": -64.21241760253906, + "logps/rejected": -77.79414367675781, + "loss": 0.7787, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6705751419067383, + "rewards/margins": 3.9000160694122314, + "rewards/rejected": -1.2294408082962036, + "step": 6395 + }, + { + "epoch": 1.6, + "grad_norm": 14.86229419708252, + "learning_rate": 7.678603048650633e-06, + "logits/chosen": -0.39064186811447144, + "logits/rejected": -0.49753162264823914, + "logps/chosen": -60.93006134033203, + "logps/rejected": -77.77410888671875, + "loss": 0.7954, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.241081953048706, + "rewards/margins": 4.55962610244751, + "rewards/rejected": -2.318544387817383, + "step": 6396 + }, + { + "epoch": 1.6, + "grad_norm": 4.7387800216674805, + "learning_rate": 7.67793933127416e-06, + "logits/chosen": -0.40924617648124695, + "logits/rejected": -0.5025646686553955, + "logps/chosen": -56.160186767578125, + "logps/rejected": -103.697998046875, + "loss": 0.612, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8608148097991943, + "rewards/margins": 5.672795295715332, + "rewards/rejected": -2.8119800090789795, + "step": 6397 + }, + { + "epoch": 1.6, + "grad_norm": 8.342845916748047, + "learning_rate": 7.677275547722956e-06, + "logits/chosen": -0.47910594940185547, + "logits/rejected": -0.5451474189758301, + "logps/chosen": -48.66294479370117, + "logps/rejected": -86.97021484375, + "loss": 0.6814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8099024295806885, + "rewards/margins": 5.496717929840088, + "rewards/rejected": -2.6868159770965576, + "step": 6398 + }, + { + "epoch": 1.6, + "grad_norm": 7.362323760986328, + "learning_rate": 7.676611698013428e-06, + "logits/chosen": -0.4388120770454407, + "logits/rejected": -0.5032822489738464, + "logps/chosen": -45.168678283691406, + "logps/rejected": -87.68093872070312, + "loss": 0.7263, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8636255264282227, + "rewards/margins": 5.548879623413086, + "rewards/rejected": -2.6852540969848633, + "step": 6399 + }, + { + "epoch": 1.6, + "grad_norm": 4.580934524536133, + "learning_rate": 7.67594778216198e-06, + "logits/chosen": -0.3430962562561035, + "logits/rejected": -0.45802581310272217, + "logps/chosen": -67.85208129882812, + "logps/rejected": -83.12105560302734, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.006309986114502, + "rewards/margins": 4.6555867195129395, + "rewards/rejected": -1.6492764949798584, + "step": 6400 + }, + { + "epoch": 1.6, + "grad_norm": 15.06729507446289, + "learning_rate": 7.675283800185011e-06, + "logits/chosen": -0.42179426550865173, + "logits/rejected": -0.4858540892601013, + "logps/chosen": -51.288387298583984, + "logps/rejected": -87.19854736328125, + "loss": 0.8036, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8171277046203613, + "rewards/margins": 5.426275730133057, + "rewards/rejected": -2.609147310256958, + "step": 6401 + }, + { + "epoch": 1.6, + "grad_norm": 7.275613307952881, + "learning_rate": 7.674619752098938e-06, + "logits/chosen": -0.3422169089317322, + "logits/rejected": -0.4877270460128784, + "logps/chosen": -58.31784439086914, + "logps/rejected": -67.41600036621094, + "loss": 0.988, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9523489475250244, + "rewards/margins": 4.098609447479248, + "rewards/rejected": -1.1462606191635132, + "step": 6402 + }, + { + "epoch": 1.6, + "grad_norm": 7.457286834716797, + "learning_rate": 7.673955637920164e-06, + "logits/chosen": -0.2966259717941284, + "logits/rejected": -0.37647849321365356, + "logps/chosen": -72.6309814453125, + "logps/rejected": -86.48402404785156, + "loss": 0.8979, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.099179267883301, + "rewards/margins": 4.1209282875061035, + "rewards/rejected": -1.0217490196228027, + "step": 6403 + }, + { + "epoch": 1.6, + "grad_norm": 28.286413192749023, + "learning_rate": 7.673291457665102e-06, + "logits/chosen": -0.3460463881492615, + "logits/rejected": -0.45103925466537476, + "logps/chosen": -58.20769500732422, + "logps/rejected": -104.13359069824219, + "loss": 0.689, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7405951023101807, + "rewards/margins": 5.610202312469482, + "rewards/rejected": -2.8696069717407227, + "step": 6404 + }, + { + "epoch": 1.6, + "grad_norm": 3.38901686668396, + "learning_rate": 7.672627211350164e-06, + "logits/chosen": -0.4605156183242798, + "logits/rejected": -0.4611384868621826, + "logps/chosen": -48.534080505371094, + "logps/rejected": -110.81855010986328, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1666603088378906, + "rewards/margins": 5.663283824920654, + "rewards/rejected": -2.4966237545013428, + "step": 6405 + }, + { + "epoch": 1.6, + "grad_norm": 7.325011253356934, + "learning_rate": 7.671962898991767e-06, + "logits/chosen": -0.37027907371520996, + "logits/rejected": -0.4767731726169586, + "logps/chosen": -65.87933349609375, + "logps/rejected": -89.13433837890625, + "loss": 0.7435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0121097564697266, + "rewards/margins": 4.752452850341797, + "rewards/rejected": -1.7403432130813599, + "step": 6406 + }, + { + "epoch": 1.6, + "grad_norm": 4.993715763092041, + "learning_rate": 7.671298520606323e-06, + "logits/chosen": -0.44324374198913574, + "logits/rejected": -0.5544767379760742, + "logps/chosen": -51.875518798828125, + "logps/rejected": -75.63310241699219, + "loss": 0.6394, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.052304267883301, + "rewards/margins": 5.035979270935059, + "rewards/rejected": -1.9836747646331787, + "step": 6407 + }, + { + "epoch": 1.6, + "grad_norm": 4.6895751953125, + "learning_rate": 7.670634076210253e-06, + "logits/chosen": -0.32588011026382446, + "logits/rejected": -0.45230555534362793, + "logps/chosen": -66.47252655029297, + "logps/rejected": -87.21687316894531, + "loss": 0.7953, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.961280107498169, + "rewards/margins": 4.972413063049316, + "rewards/rejected": -2.0111331939697266, + "step": 6408 + }, + { + "epoch": 1.6, + "grad_norm": 22.77835464477539, + "learning_rate": 7.669969565819972e-06, + "logits/chosen": -0.4109085500240326, + "logits/rejected": -0.4778216481208801, + "logps/chosen": -60.450439453125, + "logps/rejected": -75.21578216552734, + "loss": 0.9162, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8058953285217285, + "rewards/margins": 3.6950552463531494, + "rewards/rejected": -0.8891595602035522, + "step": 6409 + }, + { + "epoch": 1.6, + "grad_norm": 14.193031311035156, + "learning_rate": 7.669304989451904e-06, + "logits/chosen": -0.40683865547180176, + "logits/rejected": -0.5348718166351318, + "logps/chosen": -54.7022705078125, + "logps/rejected": -80.1144027709961, + "loss": 0.8397, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.663214683532715, + "rewards/margins": 5.7293701171875, + "rewards/rejected": -3.066154956817627, + "step": 6410 + }, + { + "epoch": 1.6, + "grad_norm": 8.694796562194824, + "learning_rate": 7.668640347122471e-06, + "logits/chosen": -0.35484811663627625, + "logits/rejected": -0.44332700967788696, + "logps/chosen": -63.06687545776367, + "logps/rejected": -96.59423065185547, + "loss": 0.7841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5569355487823486, + "rewards/margins": 4.831361770629883, + "rewards/rejected": -2.274425983428955, + "step": 6411 + }, + { + "epoch": 1.6, + "grad_norm": 6.079732894897461, + "learning_rate": 7.667975638848095e-06, + "logits/chosen": -0.433210551738739, + "logits/rejected": -0.5271702408790588, + "logps/chosen": -61.89031982421875, + "logps/rejected": -88.05473327636719, + "loss": 0.7933, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9038963317871094, + "rewards/margins": 3.7945749759674072, + "rewards/rejected": -0.8906786441802979, + "step": 6412 + }, + { + "epoch": 1.6, + "grad_norm": 7.871239185333252, + "learning_rate": 7.667310864645206e-06, + "logits/chosen": -0.40553581714630127, + "logits/rejected": -0.49555152654647827, + "logps/chosen": -59.659568786621094, + "logps/rejected": -81.08538818359375, + "loss": 0.8326, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.2998435497283936, + "rewards/margins": 4.016228675842285, + "rewards/rejected": -1.7163851261138916, + "step": 6413 + }, + { + "epoch": 1.6, + "grad_norm": 6.933229923248291, + "learning_rate": 7.666646024530226e-06, + "logits/chosen": -0.4017274081707001, + "logits/rejected": -0.5445393919944763, + "logps/chosen": -55.04804229736328, + "logps/rejected": -80.79345703125, + "loss": 0.8431, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7620301246643066, + "rewards/margins": 4.853828430175781, + "rewards/rejected": -2.0917978286743164, + "step": 6414 + }, + { + "epoch": 1.6, + "grad_norm": 5.747468948364258, + "learning_rate": 7.665981118519588e-06, + "logits/chosen": -0.36148154735565186, + "logits/rejected": -0.43031245470046997, + "logps/chosen": -49.50859832763672, + "logps/rejected": -102.16214752197266, + "loss": 0.7078, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.070952892303467, + "rewards/margins": 5.589705467224121, + "rewards/rejected": -2.5187528133392334, + "step": 6415 + }, + { + "epoch": 1.61, + "grad_norm": 9.840201377868652, + "learning_rate": 7.66531614662972e-06, + "logits/chosen": -0.31616824865341187, + "logits/rejected": -0.4688372015953064, + "logps/chosen": -63.71797180175781, + "logps/rejected": -72.12416076660156, + "loss": 0.75, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.762305498123169, + "rewards/margins": 4.501193046569824, + "rewards/rejected": -1.7388877868652344, + "step": 6416 + }, + { + "epoch": 1.61, + "grad_norm": 4.795942783355713, + "learning_rate": 7.664651108877057e-06, + "logits/chosen": -0.4369845986366272, + "logits/rejected": -0.48554033041000366, + "logps/chosen": -57.48167037963867, + "logps/rejected": -92.47731018066406, + "loss": 0.6493, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0675911903381348, + "rewards/margins": 4.2498459815979, + "rewards/rejected": -1.1822547912597656, + "step": 6417 + }, + { + "epoch": 1.61, + "grad_norm": 4.0263543128967285, + "learning_rate": 7.663986005278028e-06, + "logits/chosen": -0.4556124806404114, + "logits/rejected": -0.5079483389854431, + "logps/chosen": -44.351402282714844, + "logps/rejected": -73.53804016113281, + "loss": 0.7362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2732763290405273, + "rewards/margins": 4.49924898147583, + "rewards/rejected": -1.225972294807434, + "step": 6418 + }, + { + "epoch": 1.61, + "grad_norm": 4.703521251678467, + "learning_rate": 7.663320835849073e-06, + "logits/chosen": -0.3829480707645416, + "logits/rejected": -0.44124650955200195, + "logps/chosen": -56.39752960205078, + "logps/rejected": -84.22899627685547, + "loss": 0.7465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1354525089263916, + "rewards/margins": 4.50705099105835, + "rewards/rejected": -1.371598720550537, + "step": 6419 + }, + { + "epoch": 1.61, + "grad_norm": 4.142202854156494, + "learning_rate": 7.662655600606627e-06, + "logits/chosen": -0.40745922923088074, + "logits/rejected": -0.4956428110599518, + "logps/chosen": -58.991825103759766, + "logps/rejected": -101.59994506835938, + "loss": 0.7023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0893895626068115, + "rewards/margins": 5.30111026763916, + "rewards/rejected": -2.2117202281951904, + "step": 6420 + }, + { + "epoch": 1.61, + "grad_norm": 7.410837173461914, + "learning_rate": 7.661990299567129e-06, + "logits/chosen": -0.32193467020988464, + "logits/rejected": -0.4320119619369507, + "logps/chosen": -59.89098358154297, + "logps/rejected": -82.22390747070312, + "loss": 0.8432, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8881828784942627, + "rewards/margins": 4.287364959716797, + "rewards/rejected": -1.3991820812225342, + "step": 6421 + }, + { + "epoch": 1.61, + "grad_norm": 3.879979133605957, + "learning_rate": 7.661324932747018e-06, + "logits/chosen": -0.3635543882846832, + "logits/rejected": -0.40444105863571167, + "logps/chosen": -50.15230178833008, + "logps/rejected": -104.2082290649414, + "loss": 0.6202, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.868854522705078, + "rewards/margins": 4.820263862609863, + "rewards/rejected": -1.951409101486206, + "step": 6422 + }, + { + "epoch": 1.61, + "grad_norm": 5.6844682693481445, + "learning_rate": 7.660659500162739e-06, + "logits/chosen": -0.49571314454078674, + "logits/rejected": -0.5927209854125977, + "logps/chosen": -55.367183685302734, + "logps/rejected": -88.7906723022461, + "loss": 0.7342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.745793581008911, + "rewards/margins": 5.5763020515441895, + "rewards/rejected": -2.8305087089538574, + "step": 6423 + }, + { + "epoch": 1.61, + "grad_norm": 6.919070243835449, + "learning_rate": 7.659994001830731e-06, + "logits/chosen": -0.3766004741191864, + "logits/rejected": -0.4525149464607239, + "logps/chosen": -56.283966064453125, + "logps/rejected": -91.40775299072266, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965050220489502, + "rewards/margins": 5.157931327819824, + "rewards/rejected": -2.192880630493164, + "step": 6424 + }, + { + "epoch": 1.61, + "grad_norm": 5.303057670593262, + "learning_rate": 7.659328437767445e-06, + "logits/chosen": -0.3459303379058838, + "logits/rejected": -0.4524334669113159, + "logps/chosen": -62.88810729980469, + "logps/rejected": -69.48743438720703, + "loss": 0.7712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0767486095428467, + "rewards/margins": 4.733798980712891, + "rewards/rejected": -1.6570502519607544, + "step": 6425 + }, + { + "epoch": 1.61, + "grad_norm": 20.961639404296875, + "learning_rate": 7.658662807989324e-06, + "logits/chosen": -0.3467436134815216, + "logits/rejected": -0.46609023213386536, + "logps/chosen": -64.86799621582031, + "logps/rejected": -88.51216125488281, + "loss": 1.0054, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.594404458999634, + "rewards/margins": 4.537245273590088, + "rewards/rejected": -1.9428414106369019, + "step": 6426 + }, + { + "epoch": 1.61, + "grad_norm": 3.620296001434326, + "learning_rate": 7.657997112512817e-06, + "logits/chosen": -0.35117799043655396, + "logits/rejected": -0.4390277862548828, + "logps/chosen": -50.95698547363281, + "logps/rejected": -90.85999298095703, + "loss": 0.6301, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8687338829040527, + "rewards/margins": 5.045278072357178, + "rewards/rejected": -2.176543712615967, + "step": 6427 + }, + { + "epoch": 1.61, + "grad_norm": 4.320342063903809, + "learning_rate": 7.657331351354374e-06, + "logits/chosen": -0.3237816095352173, + "logits/rejected": -0.4166680574417114, + "logps/chosen": -59.308563232421875, + "logps/rejected": -98.00910949707031, + "loss": 0.7206, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020380735397339, + "rewards/margins": 5.921546459197998, + "rewards/rejected": -2.90116548538208, + "step": 6428 + }, + { + "epoch": 1.61, + "grad_norm": 4.146974086761475, + "learning_rate": 7.656665524530445e-06, + "logits/chosen": -0.34614282846450806, + "logits/rejected": -0.47756239771842957, + "logps/chosen": -62.389102935791016, + "logps/rejected": -87.3168716430664, + "loss": 0.7563, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9298806190490723, + "rewards/margins": 6.317390441894531, + "rewards/rejected": -3.387509822845459, + "step": 6429 + }, + { + "epoch": 1.61, + "grad_norm": 13.386131286621094, + "learning_rate": 7.655999632057488e-06, + "logits/chosen": -0.32147833704948425, + "logits/rejected": -0.3803795278072357, + "logps/chosen": -56.567447662353516, + "logps/rejected": -93.79564666748047, + "loss": 0.9043, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6581132411956787, + "rewards/margins": 3.5336432456970215, + "rewards/rejected": -0.8755303621292114, + "step": 6430 + }, + { + "epoch": 1.61, + "grad_norm": 3.241933584213257, + "learning_rate": 7.655333673951954e-06, + "logits/chosen": -0.3661184310913086, + "logits/rejected": -0.4662718176841736, + "logps/chosen": -56.793758392333984, + "logps/rejected": -82.36463928222656, + "loss": 0.7078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0013935565948486, + "rewards/margins": 4.595920085906982, + "rewards/rejected": -1.5945264101028442, + "step": 6431 + }, + { + "epoch": 1.61, + "grad_norm": 3.903838634490967, + "learning_rate": 7.654667650230301e-06, + "logits/chosen": -0.34890490770339966, + "logits/rejected": -0.41715019941329956, + "logps/chosen": -59.11518096923828, + "logps/rejected": -88.21479034423828, + "loss": 0.7231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8647031784057617, + "rewards/margins": 4.359917163848877, + "rewards/rejected": -1.4952137470245361, + "step": 6432 + }, + { + "epoch": 1.61, + "grad_norm": 6.52512788772583, + "learning_rate": 7.654001560908986e-06, + "logits/chosen": -0.31636083126068115, + "logits/rejected": -0.4205249845981598, + "logps/chosen": -64.57610321044922, + "logps/rejected": -68.86672973632812, + "loss": 0.8864, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4725403785705566, + "rewards/margins": 3.4089195728302, + "rewards/rejected": -0.9363793134689331, + "step": 6433 + }, + { + "epoch": 1.61, + "grad_norm": 5.708089828491211, + "learning_rate": 7.65333540600447e-06, + "logits/chosen": -0.3809237480163574, + "logits/rejected": -0.42419809103012085, + "logps/chosen": -60.7941780090332, + "logps/rejected": -85.67190551757812, + "loss": 0.8524, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1843314170837402, + "rewards/margins": 4.084837436676025, + "rewards/rejected": -0.900506317615509, + "step": 6434 + }, + { + "epoch": 1.61, + "grad_norm": 6.555428981781006, + "learning_rate": 7.652669185533213e-06, + "logits/chosen": -0.346108078956604, + "logits/rejected": -0.43973425030708313, + "logps/chosen": -63.10134506225586, + "logps/rejected": -80.53667449951172, + "loss": 0.7802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.818126916885376, + "rewards/margins": 4.456317901611328, + "rewards/rejected": -1.6381908655166626, + "step": 6435 + }, + { + "epoch": 1.61, + "grad_norm": 5.685418605804443, + "learning_rate": 7.65200289951168e-06, + "logits/chosen": -0.336165189743042, + "logits/rejected": -0.44247958064079285, + "logps/chosen": -68.540283203125, + "logps/rejected": -85.0001220703125, + "loss": 0.715, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.939739227294922, + "rewards/margins": 4.819094657897949, + "rewards/rejected": -1.8793553113937378, + "step": 6436 + }, + { + "epoch": 1.61, + "grad_norm": 3.532792091369629, + "learning_rate": 7.651336547956335e-06, + "logits/chosen": -0.2641242742538452, + "logits/rejected": -0.3433500826358795, + "logps/chosen": -47.43667984008789, + "logps/rejected": -93.73306274414062, + "loss": 0.5829, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.838449001312256, + "rewards/margins": 6.117085933685303, + "rewards/rejected": -3.278636932373047, + "step": 6437 + }, + { + "epoch": 1.61, + "grad_norm": 2.658721923828125, + "learning_rate": 7.650670130883642e-06, + "logits/chosen": -0.28512266278266907, + "logits/rejected": -0.4535708427429199, + "logps/chosen": -55.2496452331543, + "logps/rejected": -67.55121612548828, + "loss": 0.6422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2071027755737305, + "rewards/margins": 5.188673496246338, + "rewards/rejected": -1.9815703630447388, + "step": 6438 + }, + { + "epoch": 1.61, + "grad_norm": 7.335899829864502, + "learning_rate": 7.650003648310071e-06, + "logits/chosen": -0.4017230272293091, + "logits/rejected": -0.5236205458641052, + "logps/chosen": -48.57965850830078, + "logps/rejected": -87.33049774169922, + "loss": 0.6857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.847074031829834, + "rewards/margins": 6.117802619934082, + "rewards/rejected": -3.270728826522827, + "step": 6439 + }, + { + "epoch": 1.61, + "grad_norm": 6.850273609161377, + "learning_rate": 7.649337100252091e-06, + "logits/chosen": -0.379364937543869, + "logits/rejected": -0.45921963453292847, + "logps/chosen": -69.88627624511719, + "logps/rejected": -92.36299896240234, + "loss": 0.9218, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9256844520568848, + "rewards/margins": 4.495599746704102, + "rewards/rejected": -1.569915771484375, + "step": 6440 + }, + { + "epoch": 1.61, + "grad_norm": 4.056615352630615, + "learning_rate": 7.648670486726175e-06, + "logits/chosen": -0.23347297310829163, + "logits/rejected": -0.357479989528656, + "logps/chosen": -54.960479736328125, + "logps/rejected": -90.74568176269531, + "loss": 0.6558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4949796199798584, + "rewards/margins": 5.105759143829346, + "rewards/rejected": -2.6107797622680664, + "step": 6441 + }, + { + "epoch": 1.61, + "grad_norm": 5.0477681159973145, + "learning_rate": 7.648003807748793e-06, + "logits/chosen": -0.39487138390541077, + "logits/rejected": -0.40021950006484985, + "logps/chosen": -47.98457717895508, + "logps/rejected": -80.20123291015625, + "loss": 0.7497, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.308499813079834, + "rewards/margins": 4.239558219909668, + "rewards/rejected": -0.9310580492019653, + "step": 6442 + }, + { + "epoch": 1.61, + "grad_norm": 5.447628498077393, + "learning_rate": 7.647337063336421e-06, + "logits/chosen": -0.4380498230457306, + "logits/rejected": -0.5019134283065796, + "logps/chosen": -63.10670471191406, + "logps/rejected": -81.60823059082031, + "loss": 0.7603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.174694776535034, + "rewards/margins": 4.60781717300415, + "rewards/rejected": -1.4331223964691162, + "step": 6443 + }, + { + "epoch": 1.61, + "grad_norm": 6.076379299163818, + "learning_rate": 7.646670253505533e-06, + "logits/chosen": -0.3266953229904175, + "logits/rejected": -0.41679856181144714, + "logps/chosen": -64.14892578125, + "logps/rejected": -86.7267074584961, + "loss": 0.8094, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8141653537750244, + "rewards/margins": 4.761407852172852, + "rewards/rejected": -1.9472424983978271, + "step": 6444 + }, + { + "epoch": 1.61, + "grad_norm": 3.5311832427978516, + "learning_rate": 7.646003378272608e-06, + "logits/chosen": -0.31253716349601746, + "logits/rejected": -0.3707050383090973, + "logps/chosen": -51.16259765625, + "logps/rejected": -95.80426788330078, + "loss": 0.6829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1881649494171143, + "rewards/margins": 5.501885414123535, + "rewards/rejected": -2.313720703125, + "step": 6445 + }, + { + "epoch": 1.61, + "grad_norm": 4.987557888031006, + "learning_rate": 7.645336437654127e-06, + "logits/chosen": -0.3518914580345154, + "logits/rejected": -0.38078802824020386, + "logps/chosen": -50.02079772949219, + "logps/rejected": -82.27723693847656, + "loss": 0.7571, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.817610502243042, + "rewards/margins": 3.966197967529297, + "rewards/rejected": -1.1485874652862549, + "step": 6446 + }, + { + "epoch": 1.61, + "grad_norm": 7.11175012588501, + "learning_rate": 7.644669431666567e-06, + "logits/chosen": -0.4227599501609802, + "logits/rejected": -0.4869009554386139, + "logps/chosen": -53.47315979003906, + "logps/rejected": -89.62203979492188, + "loss": 0.7154, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8287606239318848, + "rewards/margins": 4.633777141571045, + "rewards/rejected": -1.8050165176391602, + "step": 6447 + }, + { + "epoch": 1.61, + "grad_norm": 5.1620354652404785, + "learning_rate": 7.644002360326415e-06, + "logits/chosen": -0.38868868350982666, + "logits/rejected": -0.48824137449264526, + "logps/chosen": -50.742923736572266, + "logps/rejected": -86.8052978515625, + "loss": 0.6913, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0690786838531494, + "rewards/margins": 5.765390872955322, + "rewards/rejected": -2.696312427520752, + "step": 6448 + }, + { + "epoch": 1.61, + "grad_norm": 4.83005952835083, + "learning_rate": 7.64333522365015e-06, + "logits/chosen": -0.4042622745037079, + "logits/rejected": -0.46865618228912354, + "logps/chosen": -43.618919372558594, + "logps/rejected": -85.41889190673828, + "loss": 0.5841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1877505779266357, + "rewards/margins": 5.298151016235352, + "rewards/rejected": -2.1104001998901367, + "step": 6449 + }, + { + "epoch": 1.61, + "grad_norm": 4.8712639808654785, + "learning_rate": 7.64266802165426e-06, + "logits/chosen": -0.3902629017829895, + "logits/rejected": -0.5010559558868408, + "logps/chosen": -52.98214340209961, + "logps/rejected": -90.55465698242188, + "loss": 0.5941, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0059728622436523, + "rewards/margins": 5.455735206604004, + "rewards/rejected": -2.4497621059417725, + "step": 6450 + }, + { + "epoch": 1.61, + "grad_norm": 3.4465038776397705, + "learning_rate": 7.642000754355233e-06, + "logits/chosen": -0.28944092988967896, + "logits/rejected": -0.37596505880355835, + "logps/chosen": -71.1473388671875, + "logps/rejected": -91.70761108398438, + "loss": 0.6753, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0441133975982666, + "rewards/margins": 5.958774089813232, + "rewards/rejected": -2.914660930633545, + "step": 6451 + }, + { + "epoch": 1.61, + "grad_norm": 10.570459365844727, + "learning_rate": 7.641333421769557e-06, + "logits/chosen": -0.4142378568649292, + "logits/rejected": -0.4458944797515869, + "logps/chosen": -62.18310546875, + "logps/rejected": -110.58368682861328, + "loss": 0.8222, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7690091133117676, + "rewards/margins": 5.521130084991455, + "rewards/rejected": -2.7521209716796875, + "step": 6452 + }, + { + "epoch": 1.61, + "grad_norm": 7.04084587097168, + "learning_rate": 7.640666023913722e-06, + "logits/chosen": -0.36850225925445557, + "logits/rejected": -0.46258315443992615, + "logps/chosen": -57.0435676574707, + "logps/rejected": -98.7174072265625, + "loss": 0.5941, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7941415309906006, + "rewards/margins": 6.630260467529297, + "rewards/rejected": -3.836118221282959, + "step": 6453 + }, + { + "epoch": 1.61, + "grad_norm": 6.132977485656738, + "learning_rate": 7.639998560804223e-06, + "logits/chosen": -0.36637285351753235, + "logits/rejected": -0.4754323363304138, + "logps/chosen": -56.8471794128418, + "logps/rejected": -88.98277282714844, + "loss": 0.6396, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.042957067489624, + "rewards/margins": 5.226612091064453, + "rewards/rejected": -2.183655261993408, + "step": 6454 + }, + { + "epoch": 1.61, + "grad_norm": 5.2656683921813965, + "learning_rate": 7.639331032457548e-06, + "logits/chosen": -0.3926204442977905, + "logits/rejected": -0.4489452540874481, + "logps/chosen": -50.96299362182617, + "logps/rejected": -82.71792602539062, + "loss": 0.7447, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.957319736480713, + "rewards/margins": 4.469970226287842, + "rewards/rejected": -1.5126500129699707, + "step": 6455 + }, + { + "epoch": 1.62, + "grad_norm": 12.163277626037598, + "learning_rate": 7.638663438890199e-06, + "logits/chosen": -0.2915598154067993, + "logits/rejected": -0.4258235991001129, + "logps/chosen": -71.15071105957031, + "logps/rejected": -80.9594955444336, + "loss": 0.6599, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.676626682281494, + "rewards/margins": 5.6162543296813965, + "rewards/rejected": -2.9396274089813232, + "step": 6456 + }, + { + "epoch": 1.62, + "grad_norm": 7.032510280609131, + "learning_rate": 7.63799578011867e-06, + "logits/chosen": -0.46836552023887634, + "logits/rejected": -0.5302287340164185, + "logps/chosen": -61.09834671020508, + "logps/rejected": -83.03765106201172, + "loss": 0.9342, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3399031162261963, + "rewards/margins": 4.1647162437438965, + "rewards/rejected": -1.8248130083084106, + "step": 6457 + }, + { + "epoch": 1.62, + "grad_norm": 9.114221572875977, + "learning_rate": 7.637328056159457e-06, + "logits/chosen": -0.3875926733016968, + "logits/rejected": -0.42329156398773193, + "logps/chosen": -69.28543090820312, + "logps/rejected": -100.50880432128906, + "loss": 0.791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.406160831451416, + "rewards/margins": 5.348563194274902, + "rewards/rejected": -2.9424023628234863, + "step": 6458 + }, + { + "epoch": 1.62, + "grad_norm": 3.798198699951172, + "learning_rate": 7.636660267029064e-06, + "logits/chosen": -0.38506919145584106, + "logits/rejected": -0.3980638384819031, + "logps/chosen": -53.13539123535156, + "logps/rejected": -122.18624877929688, + "loss": 0.5956, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8891243934631348, + "rewards/margins": 6.97759485244751, + "rewards/rejected": -4.088470458984375, + "step": 6459 + }, + { + "epoch": 1.62, + "grad_norm": 5.814796447753906, + "learning_rate": 7.63599241274399e-06, + "logits/chosen": -0.3896898627281189, + "logits/rejected": -0.5142129063606262, + "logps/chosen": -55.08021926879883, + "logps/rejected": -72.02134704589844, + "loss": 0.7014, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7300832271575928, + "rewards/margins": 4.568840026855469, + "rewards/rejected": -1.8387563228607178, + "step": 6460 + }, + { + "epoch": 1.62, + "grad_norm": 7.716158866882324, + "learning_rate": 7.635324493320742e-06, + "logits/chosen": -0.35654568672180176, + "logits/rejected": -0.43250542879104614, + "logps/chosen": -60.066688537597656, + "logps/rejected": -87.09483337402344, + "loss": 0.7773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8565104007720947, + "rewards/margins": 4.622403144836426, + "rewards/rejected": -1.7658926248550415, + "step": 6461 + }, + { + "epoch": 1.62, + "grad_norm": 13.763989448547363, + "learning_rate": 7.634656508775821e-06, + "logits/chosen": -0.4236120581626892, + "logits/rejected": -0.5527095198631287, + "logps/chosen": -76.66944122314453, + "logps/rejected": -86.26752471923828, + "loss": 0.969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.264125108718872, + "rewards/margins": 4.823123931884766, + "rewards/rejected": -2.5589990615844727, + "step": 6462 + }, + { + "epoch": 1.62, + "grad_norm": 5.13244104385376, + "learning_rate": 7.633988459125736e-06, + "logits/chosen": -0.2968655526638031, + "logits/rejected": -0.38186758756637573, + "logps/chosen": -64.93344116210938, + "logps/rejected": -90.6744384765625, + "loss": 0.7406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7013912200927734, + "rewards/margins": 5.196143627166748, + "rewards/rejected": -2.4947524070739746, + "step": 6463 + }, + { + "epoch": 1.62, + "grad_norm": 8.873579978942871, + "learning_rate": 7.633320344386993e-06, + "logits/chosen": -0.35346531867980957, + "logits/rejected": -0.44002819061279297, + "logps/chosen": -53.55387878417969, + "logps/rejected": -85.35661315917969, + "loss": 0.7684, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6905717849731445, + "rewards/margins": 5.180278778076172, + "rewards/rejected": -2.4897074699401855, + "step": 6464 + }, + { + "epoch": 1.62, + "grad_norm": 24.635608673095703, + "learning_rate": 7.632652164576106e-06, + "logits/chosen": -0.4217327833175659, + "logits/rejected": -0.5019500255584717, + "logps/chosen": -55.299015045166016, + "logps/rejected": -79.43016815185547, + "loss": 0.8275, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.797457695007324, + "rewards/margins": 4.703378677368164, + "rewards/rejected": -1.9059207439422607, + "step": 6465 + }, + { + "epoch": 1.62, + "grad_norm": 5.252135276794434, + "learning_rate": 7.631983919709583e-06, + "logits/chosen": -0.30647358298301697, + "logits/rejected": -0.4020428955554962, + "logps/chosen": -53.0916748046875, + "logps/rejected": -75.09603881835938, + "loss": 0.7478, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8696699142456055, + "rewards/margins": 4.330080986022949, + "rewards/rejected": -1.4604111909866333, + "step": 6466 + }, + { + "epoch": 1.62, + "grad_norm": 9.929800033569336, + "learning_rate": 7.631315609803935e-06, + "logits/chosen": -0.34116750955581665, + "logits/rejected": -0.4513224959373474, + "logps/chosen": -58.883697509765625, + "logps/rejected": -92.9479751586914, + "loss": 0.7457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9141457080841064, + "rewards/margins": 5.26010274887085, + "rewards/rejected": -2.3459572792053223, + "step": 6467 + }, + { + "epoch": 1.62, + "grad_norm": 6.076980113983154, + "learning_rate": 7.630647234875683e-06, + "logits/chosen": -0.363717257976532, + "logits/rejected": -0.5105732679367065, + "logps/chosen": -45.829376220703125, + "logps/rejected": -66.5126953125, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972919464111328, + "rewards/margins": 5.284102439880371, + "rewards/rejected": -2.3111824989318848, + "step": 6468 + }, + { + "epoch": 1.62, + "grad_norm": 4.772974014282227, + "learning_rate": 7.629978794941339e-06, + "logits/chosen": -0.3406917154788971, + "logits/rejected": -0.4777720868587494, + "logps/chosen": -66.17739868164062, + "logps/rejected": -78.33418273925781, + "loss": 0.7588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.683373212814331, + "rewards/margins": 5.379998683929443, + "rewards/rejected": -2.6966254711151123, + "step": 6469 + }, + { + "epoch": 1.62, + "grad_norm": 4.0058112144470215, + "learning_rate": 7.629310290017421e-06, + "logits/chosen": -0.3423197567462921, + "logits/rejected": -0.4754917025566101, + "logps/chosen": -60.67706298828125, + "logps/rejected": -83.32968139648438, + "loss": 0.6789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2737786769866943, + "rewards/margins": 5.105499744415283, + "rewards/rejected": -1.831721305847168, + "step": 6470 + }, + { + "epoch": 1.62, + "grad_norm": 4.109630584716797, + "learning_rate": 7.628641720120449e-06, + "logits/chosen": -0.35183969140052795, + "logits/rejected": -0.4826170802116394, + "logps/chosen": -57.140167236328125, + "logps/rejected": -85.47862243652344, + "loss": 0.6524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017174005508423, + "rewards/margins": 5.860748291015625, + "rewards/rejected": -2.8435750007629395, + "step": 6471 + }, + { + "epoch": 1.62, + "grad_norm": 5.6509013175964355, + "learning_rate": 7.627973085266944e-06, + "logits/chosen": -0.3421870470046997, + "logits/rejected": -0.421139657497406, + "logps/chosen": -53.723175048828125, + "logps/rejected": -95.09063720703125, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0864129066467285, + "rewards/margins": 5.427494049072266, + "rewards/rejected": -2.3410816192626953, + "step": 6472 + }, + { + "epoch": 1.62, + "grad_norm": 5.3317670822143555, + "learning_rate": 7.62730438547343e-06, + "logits/chosen": -0.37617021799087524, + "logits/rejected": -0.49752599000930786, + "logps/chosen": -55.23875045776367, + "logps/rejected": -98.53189849853516, + "loss": 0.64, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9965686798095703, + "rewards/margins": 6.002747535705566, + "rewards/rejected": -3.006178379058838, + "step": 6473 + }, + { + "epoch": 1.62, + "grad_norm": 2.5267319679260254, + "learning_rate": 7.626635620756428e-06, + "logits/chosen": -0.4159269630908966, + "logits/rejected": -0.5300979614257812, + "logps/chosen": -54.29881286621094, + "logps/rejected": -90.37022399902344, + "loss": 0.6191, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9236934185028076, + "rewards/margins": 5.885005474090576, + "rewards/rejected": -2.9613118171691895, + "step": 6474 + }, + { + "epoch": 1.62, + "grad_norm": 8.756211280822754, + "learning_rate": 7.625966791132469e-06, + "logits/chosen": -0.38857126235961914, + "logits/rejected": -0.4733888506889343, + "logps/chosen": -49.47043228149414, + "logps/rejected": -83.1806411743164, + "loss": 0.6583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8810644149780273, + "rewards/margins": 4.457827091217041, + "rewards/rejected": -1.576763391494751, + "step": 6475 + }, + { + "epoch": 1.62, + "grad_norm": 8.988201141357422, + "learning_rate": 7.625297896618075e-06, + "logits/chosen": -0.37862712144851685, + "logits/rejected": -0.5313290357589722, + "logps/chosen": -63.62797546386719, + "logps/rejected": -82.79074096679688, + "loss": 0.6868, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7290313243865967, + "rewards/margins": 5.021819114685059, + "rewards/rejected": -2.29278826713562, + "step": 6476 + }, + { + "epoch": 1.62, + "grad_norm": 3.3210971355438232, + "learning_rate": 7.624628937229779e-06, + "logits/chosen": -0.3979860246181488, + "logits/rejected": -0.4454454481601715, + "logps/chosen": -62.95365905761719, + "logps/rejected": -100.970947265625, + "loss": 0.7407, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.955190658569336, + "rewards/margins": 4.489627838134766, + "rewards/rejected": -1.5344372987747192, + "step": 6477 + }, + { + "epoch": 1.62, + "grad_norm": 9.01357364654541, + "learning_rate": 7.623959912984109e-06, + "logits/chosen": -0.3944709897041321, + "logits/rejected": -0.4885154366493225, + "logps/chosen": -54.61259460449219, + "logps/rejected": -84.77885437011719, + "loss": 0.6746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.701094150543213, + "rewards/margins": 5.939616680145264, + "rewards/rejected": -3.23852276802063, + "step": 6478 + }, + { + "epoch": 1.62, + "grad_norm": 4.3893280029296875, + "learning_rate": 7.6232908238976e-06, + "logits/chosen": -0.29685115814208984, + "logits/rejected": -0.35966938734054565, + "logps/chosen": -62.27619934082031, + "logps/rejected": -76.67930603027344, + "loss": 0.7257, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7945117950439453, + "rewards/margins": 4.753880500793457, + "rewards/rejected": -1.9593689441680908, + "step": 6479 + }, + { + "epoch": 1.62, + "grad_norm": 3.677042007446289, + "learning_rate": 7.622621669986783e-06, + "logits/chosen": -0.36787182092666626, + "logits/rejected": -0.4676511585712433, + "logps/chosen": -49.144046783447266, + "logps/rejected": -99.02239227294922, + "loss": 0.6201, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6497015953063965, + "rewards/margins": 5.85545539855957, + "rewards/rejected": -3.205754280090332, + "step": 6480 + }, + { + "epoch": 1.62, + "grad_norm": 3.5980753898620605, + "learning_rate": 7.621952451268196e-06, + "logits/chosen": -0.47086241841316223, + "logits/rejected": -0.47648876905441284, + "logps/chosen": -48.253395080566406, + "logps/rejected": -96.56332397460938, + "loss": 0.7648, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2319257259368896, + "rewards/margins": 4.84773588180542, + "rewards/rejected": -1.6158100366592407, + "step": 6481 + }, + { + "epoch": 1.62, + "grad_norm": 7.681658744812012, + "learning_rate": 7.6212831677583755e-06, + "logits/chosen": -0.34740006923675537, + "logits/rejected": -0.4049336314201355, + "logps/chosen": -57.85435104370117, + "logps/rejected": -84.64730834960938, + "loss": 0.7421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.969043016433716, + "rewards/margins": 4.283024787902832, + "rewards/rejected": -1.3139817714691162, + "step": 6482 + }, + { + "epoch": 1.62, + "grad_norm": 9.891153335571289, + "learning_rate": 7.620613819473859e-06, + "logits/chosen": -0.4196677803993225, + "logits/rejected": -0.48519280552864075, + "logps/chosen": -53.139400482177734, + "logps/rejected": -93.71199035644531, + "loss": 0.7308, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4776244163513184, + "rewards/margins": 5.249154567718506, + "rewards/rejected": -2.7715299129486084, + "step": 6483 + }, + { + "epoch": 1.62, + "grad_norm": 4.858042240142822, + "learning_rate": 7.619944406431188e-06, + "logits/chosen": -0.3190848231315613, + "logits/rejected": -0.3700721263885498, + "logps/chosen": -51.583683013916016, + "logps/rejected": -93.4216079711914, + "loss": 0.6479, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.302250862121582, + "rewards/margins": 4.365317344665527, + "rewards/rejected": -1.0630661249160767, + "step": 6484 + }, + { + "epoch": 1.62, + "grad_norm": 8.101731300354004, + "learning_rate": 7.6192749286469025e-06, + "logits/chosen": -0.4238702654838562, + "logits/rejected": -0.5328041911125183, + "logps/chosen": -60.01978302001953, + "logps/rejected": -97.90960693359375, + "loss": 0.8576, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.824924945831299, + "rewards/margins": 4.914877891540527, + "rewards/rejected": -2.0899531841278076, + "step": 6485 + }, + { + "epoch": 1.62, + "grad_norm": 4.424287796020508, + "learning_rate": 7.61860538613755e-06, + "logits/chosen": -0.3902047872543335, + "logits/rejected": -0.4971524775028229, + "logps/chosen": -57.34038162231445, + "logps/rejected": -72.21600341796875, + "loss": 0.7147, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.846207618713379, + "rewards/margins": 4.476593971252441, + "rewards/rejected": -1.6303859949111938, + "step": 6486 + }, + { + "epoch": 1.62, + "grad_norm": 7.312984466552734, + "learning_rate": 7.617935778919671e-06, + "logits/chosen": -0.2985779941082001, + "logits/rejected": -0.44319552183151245, + "logps/chosen": -72.28692626953125, + "logps/rejected": -79.46967315673828, + "loss": 0.7845, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6614909172058105, + "rewards/margins": 4.209969997406006, + "rewards/rejected": -1.5484790802001953, + "step": 6487 + }, + { + "epoch": 1.62, + "grad_norm": 6.908598899841309, + "learning_rate": 7.617266107009815e-06, + "logits/chosen": -0.3723752498626709, + "logits/rejected": -0.46365755796432495, + "logps/chosen": -68.26447296142578, + "logps/rejected": -89.76889038085938, + "loss": 0.7535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9486546516418457, + "rewards/margins": 5.039831161499023, + "rewards/rejected": -2.091176748275757, + "step": 6488 + }, + { + "epoch": 1.62, + "grad_norm": 3.13460373878479, + "learning_rate": 7.616596370424531e-06, + "logits/chosen": -0.3522476255893707, + "logits/rejected": -0.4452403783798218, + "logps/chosen": -47.93922424316406, + "logps/rejected": -89.1578369140625, + "loss": 0.5404, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.788553476333618, + "rewards/margins": 6.107278823852539, + "rewards/rejected": -3.318725347518921, + "step": 6489 + }, + { + "epoch": 1.62, + "grad_norm": 6.841859817504883, + "learning_rate": 7.615926569180365e-06, + "logits/chosen": -0.336497962474823, + "logits/rejected": -0.44136011600494385, + "logps/chosen": -63.04954528808594, + "logps/rejected": -79.87323760986328, + "loss": 0.9209, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9808812141418457, + "rewards/margins": 3.6881444454193115, + "rewards/rejected": -0.7072631120681763, + "step": 6490 + }, + { + "epoch": 1.62, + "grad_norm": 5.1265153884887695, + "learning_rate": 7.615256703293874e-06, + "logits/chosen": -0.32236236333847046, + "logits/rejected": -0.46239927411079407, + "logps/chosen": -46.26082992553711, + "logps/rejected": -78.3022689819336, + "loss": 0.6106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.567680835723877, + "rewards/margins": 5.058077812194824, + "rewards/rejected": -2.4903972148895264, + "step": 6491 + }, + { + "epoch": 1.62, + "grad_norm": 17.281219482421875, + "learning_rate": 7.614586772781607e-06, + "logits/chosen": -0.35244515538215637, + "logits/rejected": -0.45579439401626587, + "logps/chosen": -69.39427947998047, + "logps/rejected": -93.53287506103516, + "loss": 0.9714, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9880571365356445, + "rewards/margins": 5.540452480316162, + "rewards/rejected": -2.5523951053619385, + "step": 6492 + }, + { + "epoch": 1.62, + "grad_norm": 7.446410179138184, + "learning_rate": 7.6139167776601184e-06, + "logits/chosen": -0.4037013351917267, + "logits/rejected": -0.5105066895484924, + "logps/chosen": -57.47711181640625, + "logps/rejected": -90.00138854980469, + "loss": 0.7827, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6462156772613525, + "rewards/margins": 5.094542980194092, + "rewards/rejected": -2.4483273029327393, + "step": 6493 + }, + { + "epoch": 1.62, + "grad_norm": 4.532723426818848, + "learning_rate": 7.613246717945968e-06, + "logits/chosen": -0.38823139667510986, + "logits/rejected": -0.4166882038116455, + "logps/chosen": -60.090110778808594, + "logps/rejected": -128.7118377685547, + "loss": 0.7262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037687301635742, + "rewards/margins": 5.547135353088379, + "rewards/rejected": -2.5094475746154785, + "step": 6494 + }, + { + "epoch": 1.62, + "grad_norm": 4.259260654449463, + "learning_rate": 7.612576593655709e-06, + "logits/chosen": -0.44372880458831787, + "logits/rejected": -0.5000141859054565, + "logps/chosen": -55.52104187011719, + "logps/rejected": -82.65428161621094, + "loss": 0.713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0569374561309814, + "rewards/margins": 4.023129463195801, + "rewards/rejected": -0.9661920070648193, + "step": 6495 + }, + { + "epoch": 1.63, + "grad_norm": 5.842869281768799, + "learning_rate": 7.611906404805905e-06, + "logits/chosen": -0.3673962950706482, + "logits/rejected": -0.3747958242893219, + "logps/chosen": -60.180198669433594, + "logps/rejected": -87.23567199707031, + "loss": 0.804, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.055457592010498, + "rewards/margins": 3.7109103202819824, + "rewards/rejected": -0.6554528474807739, + "step": 6496 + }, + { + "epoch": 1.63, + "grad_norm": 5.661207675933838, + "learning_rate": 7.611236151413117e-06, + "logits/chosen": -0.35045233368873596, + "logits/rejected": -0.42731383442878723, + "logps/chosen": -52.94060134887695, + "logps/rejected": -80.9370346069336, + "loss": 0.9094, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8308839797973633, + "rewards/margins": 4.081325531005859, + "rewards/rejected": -1.2504411935806274, + "step": 6497 + }, + { + "epoch": 1.63, + "grad_norm": 4.161563873291016, + "learning_rate": 7.610565833493906e-06, + "logits/chosen": -0.2466762214899063, + "logits/rejected": -0.3843914866447449, + "logps/chosen": -64.06018829345703, + "logps/rejected": -81.75994110107422, + "loss": 0.7261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.859907388687134, + "rewards/margins": 4.989603519439697, + "rewards/rejected": -2.1296958923339844, + "step": 6498 + }, + { + "epoch": 1.63, + "grad_norm": 6.097327709197998, + "learning_rate": 7.609895451064835e-06, + "logits/chosen": -0.39242053031921387, + "logits/rejected": -0.4645117521286011, + "logps/chosen": -47.5445556640625, + "logps/rejected": -87.19010925292969, + "loss": 0.7495, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8697733879089355, + "rewards/margins": 4.758707046508789, + "rewards/rejected": -1.8889340162277222, + "step": 6499 + }, + { + "epoch": 1.63, + "grad_norm": 4.4563751220703125, + "learning_rate": 7.609225004142472e-06, + "logits/chosen": -0.44677719473838806, + "logits/rejected": -0.5052074790000916, + "logps/chosen": -45.92699432373047, + "logps/rejected": -86.09424591064453, + "loss": 0.6291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2972562313079834, + "rewards/margins": 5.917740821838379, + "rewards/rejected": -2.6204841136932373, + "step": 6500 + }, + { + "epoch": 1.63, + "grad_norm": 4.315890789031982, + "learning_rate": 7.608554492743383e-06, + "logits/chosen": -0.2916030287742615, + "logits/rejected": -0.37923717498779297, + "logps/chosen": -51.821189880371094, + "logps/rejected": -94.76484680175781, + "loss": 0.6949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6790683269500732, + "rewards/margins": 4.787371635437012, + "rewards/rejected": -2.1083033084869385, + "step": 6501 + }, + { + "epoch": 1.63, + "grad_norm": 4.1439738273620605, + "learning_rate": 7.60788391688414e-06, + "logits/chosen": -0.3327386975288391, + "logits/rejected": -0.40594589710235596, + "logps/chosen": -49.52582550048828, + "logps/rejected": -78.58921813964844, + "loss": 0.7072, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9917960166931152, + "rewards/margins": 4.555553436279297, + "rewards/rejected": -1.5637578964233398, + "step": 6502 + }, + { + "epoch": 1.63, + "grad_norm": 10.45291519165039, + "learning_rate": 7.607213276581309e-06, + "logits/chosen": -0.4088989794254303, + "logits/rejected": -0.532132625579834, + "logps/chosen": -60.04825210571289, + "logps/rejected": -80.35189819335938, + "loss": 0.8332, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6423633098602295, + "rewards/margins": 3.985215187072754, + "rewards/rejected": -1.3428517580032349, + "step": 6503 + }, + { + "epoch": 1.63, + "grad_norm": 4.785553932189941, + "learning_rate": 7.606542571851467e-06, + "logits/chosen": -0.3640221357345581, + "logits/rejected": -0.48318415880203247, + "logps/chosen": -58.91621780395508, + "logps/rejected": -95.77633666992188, + "loss": 0.7792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0021889209747314, + "rewards/margins": 5.250556468963623, + "rewards/rejected": -2.2483677864074707, + "step": 6504 + }, + { + "epoch": 1.63, + "grad_norm": 13.670186996459961, + "learning_rate": 7.605871802711184e-06, + "logits/chosen": -0.33311182260513306, + "logits/rejected": -0.39641502499580383, + "logps/chosen": -58.11284255981445, + "logps/rejected": -91.84708404541016, + "loss": 0.8429, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.76674747467041, + "rewards/margins": 4.1484527587890625, + "rewards/rejected": -1.3817048072814941, + "step": 6505 + }, + { + "epoch": 1.63, + "grad_norm": 7.113485813140869, + "learning_rate": 7.605200969177038e-06, + "logits/chosen": -0.4710425138473511, + "logits/rejected": -0.4888475239276886, + "logps/chosen": -74.77017211914062, + "logps/rejected": -80.77465057373047, + "loss": 0.6956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8625223636627197, + "rewards/margins": 4.739389419555664, + "rewards/rejected": -1.8768675327301025, + "step": 6506 + }, + { + "epoch": 1.63, + "grad_norm": 3.4364309310913086, + "learning_rate": 7.604530071265604e-06, + "logits/chosen": -0.3363778591156006, + "logits/rejected": -0.408536434173584, + "logps/chosen": -64.00389099121094, + "logps/rejected": -98.80392456054688, + "loss": 0.6658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.189396858215332, + "rewards/margins": 5.736108303070068, + "rewards/rejected": -2.5467112064361572, + "step": 6507 + }, + { + "epoch": 1.63, + "grad_norm": 3.4168968200683594, + "learning_rate": 7.603859108993463e-06, + "logits/chosen": -0.5098954439163208, + "logits/rejected": -0.5932793617248535, + "logps/chosen": -56.35015869140625, + "logps/rejected": -81.46681213378906, + "loss": 0.6902, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7513957023620605, + "rewards/margins": 4.623415946960449, + "rewards/rejected": -1.8720206022262573, + "step": 6508 + }, + { + "epoch": 1.63, + "grad_norm": 10.978316307067871, + "learning_rate": 7.603188082377193e-06, + "logits/chosen": -0.2988518178462982, + "logits/rejected": -0.3851281702518463, + "logps/chosen": -63.24959182739258, + "logps/rejected": -76.42098236083984, + "loss": 0.8355, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0029847621917725, + "rewards/margins": 4.014780521392822, + "rewards/rejected": -1.011795997619629, + "step": 6509 + }, + { + "epoch": 1.63, + "grad_norm": 3.653395175933838, + "learning_rate": 7.602516991433376e-06, + "logits/chosen": -0.4264126121997833, + "logits/rejected": -0.5061203241348267, + "logps/chosen": -49.66889953613281, + "logps/rejected": -69.85059356689453, + "loss": 0.645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.192674398422241, + "rewards/margins": 4.660710334777832, + "rewards/rejected": -1.4680356979370117, + "step": 6510 + }, + { + "epoch": 1.63, + "grad_norm": 3.895024061203003, + "learning_rate": 7.601845836178598e-06, + "logits/chosen": -0.4344353675842285, + "logits/rejected": -0.5453207492828369, + "logps/chosen": -56.03804016113281, + "logps/rejected": -93.23038482666016, + "loss": 0.7068, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8166162967681885, + "rewards/margins": 5.325838088989258, + "rewards/rejected": -2.5092220306396484, + "step": 6511 + }, + { + "epoch": 1.63, + "grad_norm": 11.876618385314941, + "learning_rate": 7.601174616629441e-06, + "logits/chosen": -0.3736887276172638, + "logits/rejected": -0.4776555895805359, + "logps/chosen": -56.726715087890625, + "logps/rejected": -98.91275787353516, + "loss": 0.7855, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8696517944335938, + "rewards/margins": 5.5026373863220215, + "rewards/rejected": -2.6329855918884277, + "step": 6512 + }, + { + "epoch": 1.63, + "grad_norm": 5.157229423522949, + "learning_rate": 7.600503332802491e-06, + "logits/chosen": -0.38301995396614075, + "logits/rejected": -0.4416530132293701, + "logps/chosen": -49.336395263671875, + "logps/rejected": -79.95548248291016, + "loss": 0.7592, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8789620399475098, + "rewards/margins": 4.713099956512451, + "rewards/rejected": -1.8341375589370728, + "step": 6513 + }, + { + "epoch": 1.63, + "grad_norm": 5.696130752563477, + "learning_rate": 7.599831984714341e-06, + "logits/chosen": -0.38429075479507446, + "logits/rejected": -0.47610369324684143, + "logps/chosen": -61.597591400146484, + "logps/rejected": -93.42766571044922, + "loss": 0.6982, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.769420623779297, + "rewards/margins": 5.303530693054199, + "rewards/rejected": -2.5341100692749023, + "step": 6514 + }, + { + "epoch": 1.63, + "grad_norm": 6.700191020965576, + "learning_rate": 7.5991605723815745e-06, + "logits/chosen": -0.364401638507843, + "logits/rejected": -0.42207372188568115, + "logps/chosen": -55.06660461425781, + "logps/rejected": -86.26214599609375, + "loss": 0.781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.910818099975586, + "rewards/margins": 4.562736511230469, + "rewards/rejected": -1.6519184112548828, + "step": 6515 + }, + { + "epoch": 1.63, + "grad_norm": 8.696730613708496, + "learning_rate": 7.598489095820786e-06, + "logits/chosen": -0.4135933816432953, + "logits/rejected": -0.5013113021850586, + "logps/chosen": -53.90184020996094, + "logps/rejected": -88.34767150878906, + "loss": 0.7044, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.773651599884033, + "rewards/margins": 5.168663501739502, + "rewards/rejected": -2.3950119018554688, + "step": 6516 + }, + { + "epoch": 1.63, + "grad_norm": 3.868513584136963, + "learning_rate": 7.597817555048568e-06, + "logits/chosen": -0.31847214698791504, + "logits/rejected": -0.3888971507549286, + "logps/chosen": -55.96818161010742, + "logps/rejected": -103.55325317382812, + "loss": 0.7372, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0810446739196777, + "rewards/margins": 5.682368278503418, + "rewards/rejected": -2.6013238430023193, + "step": 6517 + }, + { + "epoch": 1.63, + "grad_norm": 4.90602445602417, + "learning_rate": 7.597145950081515e-06, + "logits/chosen": -0.4583808481693268, + "logits/rejected": -0.5119205713272095, + "logps/chosen": -49.070953369140625, + "logps/rejected": -70.26712799072266, + "loss": 0.7278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0642261505126953, + "rewards/margins": 5.094362735748291, + "rewards/rejected": -2.0301363468170166, + "step": 6518 + }, + { + "epoch": 1.63, + "grad_norm": 3.3781111240386963, + "learning_rate": 7.596474280936224e-06, + "logits/chosen": -0.3453764021396637, + "logits/rejected": -0.4214695692062378, + "logps/chosen": -54.363128662109375, + "logps/rejected": -104.75004577636719, + "loss": 0.6336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6249842643737793, + "rewards/margins": 5.844347953796387, + "rewards/rejected": -3.2193636894226074, + "step": 6519 + }, + { + "epoch": 1.63, + "grad_norm": 5.181014060974121, + "learning_rate": 7.5958025476292915e-06, + "logits/chosen": -0.39006307721138, + "logits/rejected": -0.5184931755065918, + "logps/chosen": -56.83529281616211, + "logps/rejected": -73.02156066894531, + "loss": 0.6918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.868891477584839, + "rewards/margins": 4.134256362915039, + "rewards/rejected": -1.2653647661209106, + "step": 6520 + }, + { + "epoch": 1.63, + "grad_norm": 3.6017343997955322, + "learning_rate": 7.5951307501773165e-06, + "logits/chosen": -0.3920218348503113, + "logits/rejected": -0.45543503761291504, + "logps/chosen": -45.493194580078125, + "logps/rejected": -91.17986297607422, + "loss": 0.64, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0393290519714355, + "rewards/margins": 5.638612747192383, + "rewards/rejected": -2.5992839336395264, + "step": 6521 + }, + { + "epoch": 1.63, + "grad_norm": 4.314445495605469, + "learning_rate": 7.5944588885969006e-06, + "logits/chosen": -0.2837272584438324, + "logits/rejected": -0.374600350856781, + "logps/chosen": -51.967689514160156, + "logps/rejected": -89.84883117675781, + "loss": 0.6892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.957632541656494, + "rewards/margins": 5.221430778503418, + "rewards/rejected": -2.263798475265503, + "step": 6522 + }, + { + "epoch": 1.63, + "grad_norm": 13.370406150817871, + "learning_rate": 7.593786962904648e-06, + "logits/chosen": -0.43120115995407104, + "logits/rejected": -0.5195122957229614, + "logps/chosen": -55.76866912841797, + "logps/rejected": -90.767333984375, + "loss": 0.7138, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7458856105804443, + "rewards/margins": 4.897597312927246, + "rewards/rejected": -2.1517114639282227, + "step": 6523 + }, + { + "epoch": 1.63, + "grad_norm": 5.028060436248779, + "learning_rate": 7.5931149731171576e-06, + "logits/chosen": -0.3580603301525116, + "logits/rejected": -0.37399381399154663, + "logps/chosen": -53.085487365722656, + "logps/rejected": -93.85606384277344, + "loss": 0.7602, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.227451801300049, + "rewards/margins": 4.9019269943237305, + "rewards/rejected": -1.674475073814392, + "step": 6524 + }, + { + "epoch": 1.63, + "grad_norm": 6.77595853805542, + "learning_rate": 7.592442919251039e-06, + "logits/chosen": -0.3683180809020996, + "logits/rejected": -0.4559071362018585, + "logps/chosen": -57.348304748535156, + "logps/rejected": -95.15652465820312, + "loss": 0.7405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6185693740844727, + "rewards/margins": 5.6204705238342285, + "rewards/rejected": -3.001901388168335, + "step": 6525 + }, + { + "epoch": 1.63, + "grad_norm": 5.729674339294434, + "learning_rate": 7.591770801322899e-06, + "logits/chosen": -0.38158851861953735, + "logits/rejected": -0.48118856549263, + "logps/chosen": -50.37179183959961, + "logps/rejected": -83.57215881347656, + "loss": 0.8366, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7589619159698486, + "rewards/margins": 4.4460577964782715, + "rewards/rejected": -1.6870958805084229, + "step": 6526 + }, + { + "epoch": 1.63, + "grad_norm": 5.571664333343506, + "learning_rate": 7.591098619349345e-06, + "logits/chosen": -0.3829752504825592, + "logits/rejected": -0.47929102182388306, + "logps/chosen": -57.921714782714844, + "logps/rejected": -82.2203369140625, + "loss": 0.6515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9715993404388428, + "rewards/margins": 5.392253875732422, + "rewards/rejected": -2.420654535293579, + "step": 6527 + }, + { + "epoch": 1.63, + "grad_norm": 3.82245135307312, + "learning_rate": 7.590426373346989e-06, + "logits/chosen": -0.3757680356502533, + "logits/rejected": -0.4691246747970581, + "logps/chosen": -45.339839935302734, + "logps/rejected": -65.36305236816406, + "loss": 0.6916, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8032681941986084, + "rewards/margins": 5.0452375411987305, + "rewards/rejected": -2.241969108581543, + "step": 6528 + }, + { + "epoch": 1.63, + "grad_norm": 18.144058227539062, + "learning_rate": 7.5897540633324414e-06, + "logits/chosen": -0.39200466871261597, + "logits/rejected": -0.47533348202705383, + "logps/chosen": -59.903507232666016, + "logps/rejected": -94.18824768066406, + "loss": 0.7981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.609457015991211, + "rewards/margins": 5.522296905517578, + "rewards/rejected": -2.9128401279449463, + "step": 6529 + }, + { + "epoch": 1.63, + "grad_norm": 2.2052180767059326, + "learning_rate": 7.589081689322317e-06, + "logits/chosen": -0.37940242886543274, + "logits/rejected": -0.5200644731521606, + "logps/chosen": -55.8432502746582, + "logps/rejected": -83.24790954589844, + "loss": 0.6056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7672533988952637, + "rewards/margins": 5.51615571975708, + "rewards/rejected": -2.7489020824432373, + "step": 6530 + }, + { + "epoch": 1.63, + "grad_norm": 5.36233377456665, + "learning_rate": 7.5884092513332285e-06, + "logits/chosen": -0.3543906509876251, + "logits/rejected": -0.4713485836982727, + "logps/chosen": -59.953147888183594, + "logps/rejected": -88.11736297607422, + "loss": 0.6941, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.774925470352173, + "rewards/margins": 4.562779426574707, + "rewards/rejected": -1.7878540754318237, + "step": 6531 + }, + { + "epoch": 1.63, + "grad_norm": 22.237638473510742, + "learning_rate": 7.5877367493817954e-06, + "logits/chosen": -0.36291414499282837, + "logits/rejected": -0.46334415674209595, + "logps/chosen": -70.7832260131836, + "logps/rejected": -104.31865692138672, + "loss": 0.8939, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9692940711975098, + "rewards/margins": 5.853365421295166, + "rewards/rejected": -2.8840713500976562, + "step": 6532 + }, + { + "epoch": 1.63, + "grad_norm": 4.800382137298584, + "learning_rate": 7.587064183484636e-06, + "logits/chosen": -0.4067500829696655, + "logits/rejected": -0.4977962076663971, + "logps/chosen": -51.80526351928711, + "logps/rejected": -77.5973129272461, + "loss": 0.7395, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.695525646209717, + "rewards/margins": 4.23850154876709, + "rewards/rejected": -1.5429761409759521, + "step": 6533 + }, + { + "epoch": 1.63, + "grad_norm": 6.917243480682373, + "learning_rate": 7.5863915536583674e-06, + "logits/chosen": -0.3989843726158142, + "logits/rejected": -0.4791787564754486, + "logps/chosen": -66.56002044677734, + "logps/rejected": -83.40037536621094, + "loss": 0.6993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103675603866577, + "rewards/margins": 4.7005534172058105, + "rewards/rejected": -1.5968776941299438, + "step": 6534 + }, + { + "epoch": 1.63, + "grad_norm": 5.997959613800049, + "learning_rate": 7.585718859919613e-06, + "logits/chosen": -0.40158507227897644, + "logits/rejected": -0.44810813665390015, + "logps/chosen": -46.9544677734375, + "logps/rejected": -98.70110321044922, + "loss": 0.6787, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.984558582305908, + "rewards/margins": 5.033944606781006, + "rewards/rejected": -2.0493860244750977, + "step": 6535 + }, + { + "epoch": 1.64, + "grad_norm": 8.059657096862793, + "learning_rate": 7.585046102284994e-06, + "logits/chosen": -0.43847739696502686, + "logits/rejected": -0.5261868238449097, + "logps/chosen": -58.79281234741211, + "logps/rejected": -84.0010757446289, + "loss": 0.8537, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.938481569290161, + "rewards/margins": 5.330147743225098, + "rewards/rejected": -2.3916661739349365, + "step": 6536 + }, + { + "epoch": 1.64, + "grad_norm": 3.810608148574829, + "learning_rate": 7.584373280771138e-06, + "logits/chosen": -0.2300703525543213, + "logits/rejected": -0.3949055075645447, + "logps/chosen": -71.49565124511719, + "logps/rejected": -98.73713684082031, + "loss": 0.7209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8238260746002197, + "rewards/margins": 5.685842514038086, + "rewards/rejected": -2.862016439437866, + "step": 6537 + }, + { + "epoch": 1.64, + "grad_norm": 3.7566418647766113, + "learning_rate": 7.583700395394669e-06, + "logits/chosen": -0.4350263774394989, + "logits/rejected": -0.46810805797576904, + "logps/chosen": -48.93880081176758, + "logps/rejected": -101.5969009399414, + "loss": 0.7008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2224502563476562, + "rewards/margins": 5.148138046264648, + "rewards/rejected": -1.92568838596344, + "step": 6538 + }, + { + "epoch": 1.64, + "grad_norm": 3.340934991836548, + "learning_rate": 7.583027446172215e-06, + "logits/chosen": -0.35957789421081543, + "logits/rejected": -0.453168660402298, + "logps/chosen": -48.33656692504883, + "logps/rejected": -83.75592041015625, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8766372203826904, + "rewards/margins": 4.7680182456970215, + "rewards/rejected": -1.891380786895752, + "step": 6539 + }, + { + "epoch": 1.64, + "grad_norm": 13.48351764678955, + "learning_rate": 7.582354433120405e-06, + "logits/chosen": -0.33340489864349365, + "logits/rejected": -0.4660610258579254, + "logps/chosen": -54.00373840332031, + "logps/rejected": -87.84215545654297, + "loss": 0.7025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.794985771179199, + "rewards/margins": 5.414341926574707, + "rewards/rejected": -2.619356155395508, + "step": 6540 + }, + { + "epoch": 1.64, + "grad_norm": 5.328919410705566, + "learning_rate": 7.58168135625587e-06, + "logits/chosen": -0.36111152172088623, + "logits/rejected": -0.4069828391075134, + "logps/chosen": -58.170936584472656, + "logps/rejected": -85.24292755126953, + "loss": 0.7133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991063117980957, + "rewards/margins": 4.482710361480713, + "rewards/rejected": -1.491647481918335, + "step": 6541 + }, + { + "epoch": 1.64, + "grad_norm": 7.0364861488342285, + "learning_rate": 7.581008215595243e-06, + "logits/chosen": -0.31849631667137146, + "logits/rejected": -0.38882988691329956, + "logps/chosen": -55.977962493896484, + "logps/rejected": -85.00933074951172, + "loss": 0.6774, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.098918914794922, + "rewards/margins": 4.91383171081543, + "rewards/rejected": -1.814913272857666, + "step": 6542 + }, + { + "epoch": 1.64, + "grad_norm": 9.161252975463867, + "learning_rate": 7.580335011155159e-06, + "logits/chosen": -0.40888282656669617, + "logits/rejected": -0.484747052192688, + "logps/chosen": -48.63362121582031, + "logps/rejected": -79.98880004882812, + "loss": 0.7075, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1233763694763184, + "rewards/margins": 4.211141586303711, + "rewards/rejected": -1.0877652168273926, + "step": 6543 + }, + { + "epoch": 1.64, + "grad_norm": 7.8599629402160645, + "learning_rate": 7.579661742952249e-06, + "logits/chosen": -0.42099493741989136, + "logits/rejected": -0.5191177129745483, + "logps/chosen": -54.55740737915039, + "logps/rejected": -83.58494567871094, + "loss": 0.8947, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.587367296218872, + "rewards/margins": 4.694784164428711, + "rewards/rejected": -2.107416868209839, + "step": 6544 + }, + { + "epoch": 1.64, + "grad_norm": 5.601072788238525, + "learning_rate": 7.578988411003156e-06, + "logits/chosen": -0.41666218638420105, + "logits/rejected": -0.539389431476593, + "logps/chosen": -49.568023681640625, + "logps/rejected": -85.57890319824219, + "loss": 0.6841, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.87996768951416, + "rewards/margins": 5.51600456237793, + "rewards/rejected": -2.6360368728637695, + "step": 6545 + }, + { + "epoch": 1.64, + "grad_norm": 10.054923057556152, + "learning_rate": 7.578315015324517e-06, + "logits/chosen": -0.31890541315078735, + "logits/rejected": -0.43269479274749756, + "logps/chosen": -73.24588775634766, + "logps/rejected": -84.41582489013672, + "loss": 0.878, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.212676525115967, + "rewards/margins": 4.195555686950684, + "rewards/rejected": -1.9828791618347168, + "step": 6546 + }, + { + "epoch": 1.64, + "grad_norm": 5.058782577514648, + "learning_rate": 7.57764155593297e-06, + "logits/chosen": -0.3890518844127655, + "logits/rejected": -0.4478679597377777, + "logps/chosen": -53.21561813354492, + "logps/rejected": -98.86453247070312, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.033229112625122, + "rewards/margins": 5.3772969245910645, + "rewards/rejected": -2.344067335128784, + "step": 6547 + }, + { + "epoch": 1.64, + "grad_norm": 3.4742469787597656, + "learning_rate": 7.57696803284516e-06, + "logits/chosen": -0.3568345606327057, + "logits/rejected": -0.4021627902984619, + "logps/chosen": -52.93674087524414, + "logps/rejected": -97.23436737060547, + "loss": 0.6361, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0654029846191406, + "rewards/margins": 5.349165916442871, + "rewards/rejected": -2.2837624549865723, + "step": 6548 + }, + { + "epoch": 1.64, + "grad_norm": 13.848586082458496, + "learning_rate": 7.5762944460777265e-06, + "logits/chosen": -0.3515864312648773, + "logits/rejected": -0.4404456615447998, + "logps/chosen": -61.32112121582031, + "logps/rejected": -96.81246948242188, + "loss": 0.795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.702605724334717, + "rewards/margins": 5.264909267425537, + "rewards/rejected": -2.562303066253662, + "step": 6549 + }, + { + "epoch": 1.64, + "grad_norm": 5.610424995422363, + "learning_rate": 7.575620795647318e-06, + "logits/chosen": -0.40940606594085693, + "logits/rejected": -0.39383015036582947, + "logps/chosen": -54.36283874511719, + "logps/rejected": -113.95060729980469, + "loss": 0.7776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.992616653442383, + "rewards/margins": 4.515788555145264, + "rewards/rejected": -1.5231720209121704, + "step": 6550 + }, + { + "epoch": 1.64, + "grad_norm": 6.9426164627075195, + "learning_rate": 7.574947081570581e-06, + "logits/chosen": -0.3888348340988159, + "logits/rejected": -0.48826950788497925, + "logps/chosen": -68.74922180175781, + "logps/rejected": -89.88618469238281, + "loss": 0.7699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.787836790084839, + "rewards/margins": 4.8438005447387695, + "rewards/rejected": -2.0559639930725098, + "step": 6551 + }, + { + "epoch": 1.64, + "grad_norm": 1.9947266578674316, + "learning_rate": 7.574273303864162e-06, + "logits/chosen": -0.3603028953075409, + "logits/rejected": -0.4874110817909241, + "logps/chosen": -63.76163864135742, + "logps/rejected": -90.15673828125, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2476391792297363, + "rewards/margins": 6.011499404907227, + "rewards/rejected": -2.7638604640960693, + "step": 6552 + }, + { + "epoch": 1.64, + "grad_norm": 7.064940929412842, + "learning_rate": 7.5735994625447105e-06, + "logits/chosen": -0.38569334149360657, + "logits/rejected": -0.4866504669189453, + "logps/chosen": -45.14360046386719, + "logps/rejected": -95.94387817382812, + "loss": 0.6065, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4059665203094482, + "rewards/margins": 6.5357232093811035, + "rewards/rejected": -3.1297566890716553, + "step": 6553 + }, + { + "epoch": 1.64, + "grad_norm": 4.527826309204102, + "learning_rate": 7.572925557628881e-06, + "logits/chosen": -0.407877653837204, + "logits/rejected": -0.4111114740371704, + "logps/chosen": -47.3649787902832, + "logps/rejected": -111.30054473876953, + "loss": 0.6725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876559257507324, + "rewards/margins": 5.700150966644287, + "rewards/rejected": -2.823591709136963, + "step": 6554 + }, + { + "epoch": 1.64, + "grad_norm": 5.109597682952881, + "learning_rate": 7.572251589133324e-06, + "logits/chosen": -0.37006253004074097, + "logits/rejected": -0.5112398266792297, + "logps/chosen": -59.73258972167969, + "logps/rejected": -80.87354278564453, + "loss": 0.6693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7600464820861816, + "rewards/margins": 5.800346374511719, + "rewards/rejected": -3.0402991771698, + "step": 6555 + }, + { + "epoch": 1.64, + "grad_norm": 11.607636451721191, + "learning_rate": 7.571577557074692e-06, + "logits/chosen": -0.37727439403533936, + "logits/rejected": -0.5218654274940491, + "logps/chosen": -50.25202560424805, + "logps/rejected": -95.51679229736328, + "loss": 0.6198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1518921852111816, + "rewards/margins": 6.4814043045043945, + "rewards/rejected": -3.3295111656188965, + "step": 6556 + }, + { + "epoch": 1.64, + "grad_norm": 9.286842346191406, + "learning_rate": 7.5709034614696454e-06, + "logits/chosen": -0.4178236126899719, + "logits/rejected": -0.5109764337539673, + "logps/chosen": -68.05099487304688, + "logps/rejected": -92.51823425292969, + "loss": 0.6783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.664689302444458, + "rewards/margins": 5.249972343444824, + "rewards/rejected": -2.5852832794189453, + "step": 6557 + }, + { + "epoch": 1.64, + "grad_norm": 2.43975567817688, + "learning_rate": 7.5702293023348395e-06, + "logits/chosen": -0.2787947356700897, + "logits/rejected": -0.3595793843269348, + "logps/chosen": -56.22751235961914, + "logps/rejected": -101.27702331542969, + "loss": 0.5716, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0555319786071777, + "rewards/margins": 6.180835723876953, + "rewards/rejected": -3.125303030014038, + "step": 6558 + }, + { + "epoch": 1.64, + "grad_norm": 22.57889175415039, + "learning_rate": 7.569555079686933e-06, + "logits/chosen": -0.3558887243270874, + "logits/rejected": -0.447738915681839, + "logps/chosen": -65.83678436279297, + "logps/rejected": -78.79357147216797, + "loss": 0.7833, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4841015338897705, + "rewards/margins": 4.386518478393555, + "rewards/rejected": -1.9024169445037842, + "step": 6559 + }, + { + "epoch": 1.64, + "grad_norm": 9.924118041992188, + "learning_rate": 7.568880793542588e-06, + "logits/chosen": -0.3017004132270813, + "logits/rejected": -0.405207097530365, + "logps/chosen": -58.85806655883789, + "logps/rejected": -91.54959869384766, + "loss": 0.6919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7648496627807617, + "rewards/margins": 5.596203327178955, + "rewards/rejected": -2.8313541412353516, + "step": 6560 + }, + { + "epoch": 1.64, + "grad_norm": 5.02763032913208, + "learning_rate": 7.568206443918465e-06, + "logits/chosen": -0.3596709966659546, + "logits/rejected": -0.44578292965888977, + "logps/chosen": -67.48358154296875, + "logps/rejected": -97.94309997558594, + "loss": 0.7077, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7194695472717285, + "rewards/margins": 5.323825359344482, + "rewards/rejected": -2.604356050491333, + "step": 6561 + }, + { + "epoch": 1.64, + "grad_norm": 20.944427490234375, + "learning_rate": 7.56753203083123e-06, + "logits/chosen": -0.38947588205337524, + "logits/rejected": -0.4591471254825592, + "logps/chosen": -58.330284118652344, + "logps/rejected": -83.65343475341797, + "loss": 0.7862, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9327945709228516, + "rewards/margins": 4.988156318664551, + "rewards/rejected": -2.0553622245788574, + "step": 6562 + }, + { + "epoch": 1.64, + "grad_norm": 7.144587993621826, + "learning_rate": 7.566857554297549e-06, + "logits/chosen": -0.3398478031158447, + "logits/rejected": -0.4572090208530426, + "logps/chosen": -68.59705352783203, + "logps/rejected": -83.43385314941406, + "loss": 0.9009, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9199419021606445, + "rewards/margins": 4.305426597595215, + "rewards/rejected": -1.3854843378067017, + "step": 6563 + }, + { + "epoch": 1.64, + "grad_norm": 4.483684062957764, + "learning_rate": 7.566183014334085e-06, + "logits/chosen": -0.3028848171234131, + "logits/rejected": -0.3973238468170166, + "logps/chosen": -64.1868896484375, + "logps/rejected": -82.46507263183594, + "loss": 0.7613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.967681407928467, + "rewards/margins": 4.451230049133301, + "rewards/rejected": -1.4835482835769653, + "step": 6564 + }, + { + "epoch": 1.64, + "grad_norm": 3.1982498168945312, + "learning_rate": 7.565508410957512e-06, + "logits/chosen": -0.31523633003234863, + "logits/rejected": -0.4119091033935547, + "logps/chosen": -43.471351623535156, + "logps/rejected": -84.8643569946289, + "loss": 0.571, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9155313968658447, + "rewards/margins": 5.503485679626465, + "rewards/rejected": -2.587953567504883, + "step": 6565 + }, + { + "epoch": 1.64, + "grad_norm": 6.691787242889404, + "learning_rate": 7.564833744184497e-06, + "logits/chosen": -0.41914093494415283, + "logits/rejected": -0.44000184535980225, + "logps/chosen": -49.15684509277344, + "logps/rejected": -109.82383728027344, + "loss": 0.611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.14005708694458, + "rewards/margins": 5.389228343963623, + "rewards/rejected": -2.249171257019043, + "step": 6566 + }, + { + "epoch": 1.64, + "grad_norm": 5.157948970794678, + "learning_rate": 7.564159014031709e-06, + "logits/chosen": -0.3779226839542389, + "logits/rejected": -0.45432430505752563, + "logps/chosen": -55.04789352416992, + "logps/rejected": -85.22958374023438, + "loss": 0.6116, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.943070650100708, + "rewards/margins": 5.046387672424316, + "rewards/rejected": -2.1033170223236084, + "step": 6567 + }, + { + "epoch": 1.64, + "grad_norm": 11.931198120117188, + "learning_rate": 7.5634842205158266e-06, + "logits/chosen": -0.4122489392757416, + "logits/rejected": -0.5097619891166687, + "logps/chosen": -57.94232940673828, + "logps/rejected": -102.42868041992188, + "loss": 0.7353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2744574546813965, + "rewards/margins": 6.386913299560547, + "rewards/rejected": -3.112455129623413, + "step": 6568 + }, + { + "epoch": 1.64, + "grad_norm": 12.76911735534668, + "learning_rate": 7.562809363653522e-06, + "logits/chosen": -0.30475273728370667, + "logits/rejected": -0.4153156876564026, + "logps/chosen": -57.35865783691406, + "logps/rejected": -76.77333068847656, + "loss": 0.6217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7668886184692383, + "rewards/margins": 4.989648342132568, + "rewards/rejected": -2.222759246826172, + "step": 6569 + }, + { + "epoch": 1.64, + "grad_norm": 8.225489616394043, + "learning_rate": 7.562134443461473e-06, + "logits/chosen": -0.4200257360935211, + "logits/rejected": -0.4562695622444153, + "logps/chosen": -57.0521240234375, + "logps/rejected": -84.28181457519531, + "loss": 0.8001, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4823992252349854, + "rewards/margins": 4.547698020935059, + "rewards/rejected": -2.0652987957000732, + "step": 6570 + }, + { + "epoch": 1.64, + "grad_norm": 13.43838119506836, + "learning_rate": 7.561459459956355e-06, + "logits/chosen": -0.379352331161499, + "logits/rejected": -0.4931456446647644, + "logps/chosen": -55.228763580322266, + "logps/rejected": -73.80301666259766, + "loss": 0.6472, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858416795730591, + "rewards/margins": 4.652217864990234, + "rewards/rejected": -1.7938013076782227, + "step": 6571 + }, + { + "epoch": 1.64, + "grad_norm": 8.799369812011719, + "learning_rate": 7.560784413154851e-06, + "logits/chosen": -0.3496781587600708, + "logits/rejected": -0.44064244627952576, + "logps/chosen": -56.75811004638672, + "logps/rejected": -98.78549194335938, + "loss": 0.8421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.771550416946411, + "rewards/margins": 5.954925060272217, + "rewards/rejected": -3.183375358581543, + "step": 6572 + }, + { + "epoch": 1.64, + "grad_norm": 4.042404651641846, + "learning_rate": 7.56010930307364e-06, + "logits/chosen": -0.36540499329566956, + "logits/rejected": -0.46390897035598755, + "logps/chosen": -59.150543212890625, + "logps/rejected": -82.81905364990234, + "loss": 0.6311, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.732985019683838, + "rewards/margins": 4.365381717681885, + "rewards/rejected": -1.6323968172073364, + "step": 6573 + }, + { + "epoch": 1.64, + "grad_norm": 5.841647148132324, + "learning_rate": 7.559434129729404e-06, + "logits/chosen": -0.3910561501979828, + "logits/rejected": -0.5385880470275879, + "logps/chosen": -67.25093841552734, + "logps/rejected": -101.90674591064453, + "loss": 0.7205, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.674269199371338, + "rewards/margins": 5.60783576965332, + "rewards/rejected": -2.9335665702819824, + "step": 6574 + }, + { + "epoch": 1.64, + "grad_norm": 8.261382102966309, + "learning_rate": 7.55875889313883e-06, + "logits/chosen": -0.4192110598087311, + "logits/rejected": -0.5350117683410645, + "logps/chosen": -50.642459869384766, + "logps/rejected": -95.6402816772461, + "loss": 0.7239, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.755091905593872, + "rewards/margins": 6.055706024169922, + "rewards/rejected": -3.300614356994629, + "step": 6575 + }, + { + "epoch": 1.65, + "grad_norm": 5.209261417388916, + "learning_rate": 7.558083593318599e-06, + "logits/chosen": -0.45983612537384033, + "logits/rejected": -0.5044986605644226, + "logps/chosen": -62.10602569580078, + "logps/rejected": -90.01179504394531, + "loss": 0.7954, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0453598499298096, + "rewards/margins": 4.296834468841553, + "rewards/rejected": -1.2514740228652954, + "step": 6576 + }, + { + "epoch": 1.65, + "grad_norm": 6.134618759155273, + "learning_rate": 7.5574082302854055e-06, + "logits/chosen": -0.30311164259910583, + "logits/rejected": -0.3634628355503082, + "logps/chosen": -59.948951721191406, + "logps/rejected": -112.0555191040039, + "loss": 0.7572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763056993484497, + "rewards/margins": 5.30233907699585, + "rewards/rejected": -2.5392820835113525, + "step": 6577 + }, + { + "epoch": 1.65, + "grad_norm": 6.444342613220215, + "learning_rate": 7.556732804055933e-06, + "logits/chosen": -0.39255213737487793, + "logits/rejected": -0.42417803406715393, + "logps/chosen": -58.9583740234375, + "logps/rejected": -88.63484954833984, + "loss": 0.7366, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8479788303375244, + "rewards/margins": 3.983295202255249, + "rewards/rejected": -1.1353167295455933, + "step": 6578 + }, + { + "epoch": 1.65, + "grad_norm": 5.915912628173828, + "learning_rate": 7.556057314646872e-06, + "logits/chosen": -0.3824966549873352, + "logits/rejected": -0.4415174126625061, + "logps/chosen": -60.6925163269043, + "logps/rejected": -95.26349639892578, + "loss": 0.7765, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.851861000061035, + "rewards/margins": 4.712347507476807, + "rewards/rejected": -1.8604861497879028, + "step": 6579 + }, + { + "epoch": 1.65, + "grad_norm": 12.12109661102295, + "learning_rate": 7.555381762074918e-06, + "logits/chosen": -0.33532842993736267, + "logits/rejected": -0.4879445731639862, + "logps/chosen": -72.63041687011719, + "logps/rejected": -86.86038208007812, + "loss": 0.7307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9512128829956055, + "rewards/margins": 6.3387041091918945, + "rewards/rejected": -3.38749098777771, + "step": 6580 + }, + { + "epoch": 1.65, + "grad_norm": 2.2670295238494873, + "learning_rate": 7.554706146356761e-06, + "logits/chosen": -0.4020768404006958, + "logits/rejected": -0.4323262572288513, + "logps/chosen": -50.63512420654297, + "logps/rejected": -93.06986236572266, + "loss": 0.578, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1392018795013428, + "rewards/margins": 4.663572311401367, + "rewards/rejected": -1.5243703126907349, + "step": 6581 + }, + { + "epoch": 1.65, + "grad_norm": 4.460199356079102, + "learning_rate": 7.5540304675090995e-06, + "logits/chosen": -0.2936476469039917, + "logits/rejected": -0.38220182061195374, + "logps/chosen": -55.885162353515625, + "logps/rejected": -83.58866882324219, + "loss": 0.7245, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9725215435028076, + "rewards/margins": 4.203495979309082, + "rewards/rejected": -1.2309743165969849, + "step": 6582 + }, + { + "epoch": 1.65, + "grad_norm": 5.989243030548096, + "learning_rate": 7.553354725548627e-06, + "logits/chosen": -0.4085768163204193, + "logits/rejected": -0.4177651107311249, + "logps/chosen": -57.795753479003906, + "logps/rejected": -96.08048248291016, + "loss": 0.8012, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.793461561203003, + "rewards/margins": 4.827298164367676, + "rewards/rejected": -2.0338361263275146, + "step": 6583 + }, + { + "epoch": 1.65, + "grad_norm": 4.065709590911865, + "learning_rate": 7.5526789204920434e-06, + "logits/chosen": -0.4100920855998993, + "logits/rejected": -0.47240978479385376, + "logps/chosen": -53.708072662353516, + "logps/rejected": -80.89157104492188, + "loss": 0.7286, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9588687419891357, + "rewards/margins": 4.67298698425293, + "rewards/rejected": -1.7141177654266357, + "step": 6584 + }, + { + "epoch": 1.65, + "grad_norm": 3.3929882049560547, + "learning_rate": 7.55200305235605e-06, + "logits/chosen": -0.3489646911621094, + "logits/rejected": -0.4400281310081482, + "logps/chosen": -54.66437530517578, + "logps/rejected": -91.28150177001953, + "loss": 0.6715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.151489496231079, + "rewards/margins": 5.315570831298828, + "rewards/rejected": -2.1640820503234863, + "step": 6585 + }, + { + "epoch": 1.65, + "grad_norm": 6.092407703399658, + "learning_rate": 7.551327121157346e-06, + "logits/chosen": -0.3229750990867615, + "logits/rejected": -0.3869458734989166, + "logps/chosen": -67.06388854980469, + "logps/rejected": -89.35542297363281, + "loss": 0.7816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7101407051086426, + "rewards/margins": 3.477105140686035, + "rewards/rejected": -0.7669646143913269, + "step": 6586 + }, + { + "epoch": 1.65, + "grad_norm": 2.913654088973999, + "learning_rate": 7.5506511269126335e-06, + "logits/chosen": -0.3611070215702057, + "logits/rejected": -0.5096807479858398, + "logps/chosen": -43.58979034423828, + "logps/rejected": -80.66107940673828, + "loss": 0.5928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.716521978378296, + "rewards/margins": 5.9703497886657715, + "rewards/rejected": -3.2538278102874756, + "step": 6587 + }, + { + "epoch": 1.65, + "grad_norm": 3.416459083557129, + "learning_rate": 7.549975069638621e-06, + "logits/chosen": -0.37508848309516907, + "logits/rejected": -0.4549849331378937, + "logps/chosen": -60.647220611572266, + "logps/rejected": -88.56158447265625, + "loss": 0.7348, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7340996265411377, + "rewards/margins": 4.755230903625488, + "rewards/rejected": -2.021131753921509, + "step": 6588 + }, + { + "epoch": 1.65, + "grad_norm": 4.7645368576049805, + "learning_rate": 7.549298949352012e-06, + "logits/chosen": -0.3770584762096405, + "logits/rejected": -0.41065680980682373, + "logps/chosen": -55.92138671875, + "logps/rejected": -105.18426513671875, + "loss": 0.7203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.847838878631592, + "rewards/margins": 5.240434169769287, + "rewards/rejected": -2.3925952911376953, + "step": 6589 + }, + { + "epoch": 1.65, + "grad_norm": 6.62656831741333, + "learning_rate": 7.548622766069513e-06, + "logits/chosen": -0.4072319269180298, + "logits/rejected": -0.5405077934265137, + "logps/chosen": -64.08238220214844, + "logps/rejected": -97.04566955566406, + "loss": 0.7166, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.02280855178833, + "rewards/margins": 6.213614463806152, + "rewards/rejected": -3.190805435180664, + "step": 6590 + }, + { + "epoch": 1.65, + "grad_norm": 20.298357009887695, + "learning_rate": 7.547946519807836e-06, + "logits/chosen": -0.3767024278640747, + "logits/rejected": -0.4281908869743347, + "logps/chosen": -54.95601272583008, + "logps/rejected": -91.91190338134766, + "loss": 0.848, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.714423656463623, + "rewards/margins": 4.724649906158447, + "rewards/rejected": -2.010226249694824, + "step": 6591 + }, + { + "epoch": 1.65, + "grad_norm": 3.302070379257202, + "learning_rate": 7.547270210583689e-06, + "logits/chosen": -0.4455793499946594, + "logits/rejected": -0.48638829588890076, + "logps/chosen": -53.07086181640625, + "logps/rejected": -104.54908752441406, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.98260760307312, + "rewards/margins": 4.701628684997559, + "rewards/rejected": -1.7190213203430176, + "step": 6592 + }, + { + "epoch": 1.65, + "grad_norm": 5.90315580368042, + "learning_rate": 7.546593838413787e-06, + "logits/chosen": -0.36182063817977905, + "logits/rejected": -0.4574989080429077, + "logps/chosen": -58.7584114074707, + "logps/rejected": -85.65760040283203, + "loss": 0.7884, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.859534978866577, + "rewards/margins": 4.460720062255859, + "rewards/rejected": -1.6011849641799927, + "step": 6593 + }, + { + "epoch": 1.65, + "grad_norm": 5.121554851531982, + "learning_rate": 7.545917403314842e-06, + "logits/chosen": -0.3736340403556824, + "logits/rejected": -0.40521883964538574, + "logps/chosen": -62.99836349487305, + "logps/rejected": -92.3149642944336, + "loss": 0.8364, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.804264545440674, + "rewards/margins": 4.471963405609131, + "rewards/rejected": -1.6676989793777466, + "step": 6594 + }, + { + "epoch": 1.65, + "grad_norm": 4.906953811645508, + "learning_rate": 7.5452409053035705e-06, + "logits/chosen": -0.4277620017528534, + "logits/rejected": -0.4825845956802368, + "logps/chosen": -58.27748107910156, + "logps/rejected": -87.62571716308594, + "loss": 0.7859, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.791959762573242, + "rewards/margins": 5.480844974517822, + "rewards/rejected": -2.6888856887817383, + "step": 6595 + }, + { + "epoch": 1.65, + "grad_norm": 4.796780586242676, + "learning_rate": 7.5445643443966875e-06, + "logits/chosen": -0.41655969619750977, + "logits/rejected": -0.5551006197929382, + "logps/chosen": -61.761985778808594, + "logps/rejected": -83.25942993164062, + "loss": 0.7714, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8582980632781982, + "rewards/margins": 4.815826416015625, + "rewards/rejected": -1.9575281143188477, + "step": 6596 + }, + { + "epoch": 1.65, + "grad_norm": 4.051344394683838, + "learning_rate": 7.543887720610914e-06, + "logits/chosen": -0.3866429626941681, + "logits/rejected": -0.5404329299926758, + "logps/chosen": -66.35751342773438, + "logps/rejected": -73.86454772949219, + "loss": 0.7346, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8133184909820557, + "rewards/margins": 5.45425271987915, + "rewards/rejected": -2.6409339904785156, + "step": 6597 + }, + { + "epoch": 1.65, + "grad_norm": 7.847044944763184, + "learning_rate": 7.5432110339629695e-06, + "logits/chosen": -0.3734605610370636, + "logits/rejected": -0.44482558965682983, + "logps/chosen": -56.1500129699707, + "logps/rejected": -84.632568359375, + "loss": 0.7492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8718655109405518, + "rewards/margins": 4.606718063354492, + "rewards/rejected": -1.7348525524139404, + "step": 6598 + }, + { + "epoch": 1.65, + "grad_norm": 7.032290935516357, + "learning_rate": 7.5425342844695735e-06, + "logits/chosen": -0.3377716839313507, + "logits/rejected": -0.42679476737976074, + "logps/chosen": -64.96833801269531, + "logps/rejected": -92.69718933105469, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087908983230591, + "rewards/margins": 6.32138729095459, + "rewards/rejected": -3.2334787845611572, + "step": 6599 + }, + { + "epoch": 1.65, + "grad_norm": 3.4302854537963867, + "learning_rate": 7.541857472147454e-06, + "logits/chosen": -0.40782877802848816, + "logits/rejected": -0.5164531469345093, + "logps/chosen": -52.01094436645508, + "logps/rejected": -86.72063446044922, + "loss": 0.6148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6877188682556152, + "rewards/margins": 5.306097984313965, + "rewards/rejected": -2.6183791160583496, + "step": 6600 + }, + { + "epoch": 1.65, + "grad_norm": 6.007295608520508, + "learning_rate": 7.541180597013331e-06, + "logits/chosen": -0.4486096501350403, + "logits/rejected": -0.5427255630493164, + "logps/chosen": -55.193702697753906, + "logps/rejected": -90.07231903076172, + "loss": 0.7478, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.64094877243042, + "rewards/margins": 5.478897571563721, + "rewards/rejected": -2.837948799133301, + "step": 6601 + }, + { + "epoch": 1.65, + "grad_norm": 3.387453556060791, + "learning_rate": 7.540503659083933e-06, + "logits/chosen": -0.3887363076210022, + "logits/rejected": -0.5042162537574768, + "logps/chosen": -47.96505355834961, + "logps/rejected": -86.39379119873047, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9816174507141113, + "rewards/margins": 5.853010177612305, + "rewards/rejected": -2.8713924884796143, + "step": 6602 + }, + { + "epoch": 1.65, + "grad_norm": 4.583486080169678, + "learning_rate": 7.539826658375986e-06, + "logits/chosen": -0.3629319667816162, + "logits/rejected": -0.378550261259079, + "logps/chosen": -60.15226745605469, + "logps/rejected": -116.18595123291016, + "loss": 0.6718, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0321104526519775, + "rewards/margins": 5.19943904876709, + "rewards/rejected": -2.1673290729522705, + "step": 6603 + }, + { + "epoch": 1.65, + "grad_norm": 5.916999816894531, + "learning_rate": 7.539149594906222e-06, + "logits/chosen": -0.4142843186855316, + "logits/rejected": -0.471153199672699, + "logps/chosen": -54.08542251586914, + "logps/rejected": -98.19097900390625, + "loss": 0.6706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.506669282913208, + "rewards/margins": 4.444338798522949, + "rewards/rejected": -1.937669277191162, + "step": 6604 + }, + { + "epoch": 1.65, + "grad_norm": 8.804450035095215, + "learning_rate": 7.5384724686913716e-06, + "logits/chosen": -0.36961662769317627, + "logits/rejected": -0.5059117078781128, + "logps/chosen": -71.5321273803711, + "logps/rejected": -82.36643981933594, + "loss": 0.8203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.275289535522461, + "rewards/margins": 4.987514019012451, + "rewards/rejected": -1.7122242450714111, + "step": 6605 + }, + { + "epoch": 1.65, + "grad_norm": 5.624621868133545, + "learning_rate": 7.537795279748166e-06, + "logits/chosen": -0.4538285434246063, + "logits/rejected": -0.5573785305023193, + "logps/chosen": -53.143795013427734, + "logps/rejected": -81.9393539428711, + "loss": 0.7773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9220151901245117, + "rewards/margins": 5.365610599517822, + "rewards/rejected": -2.4435949325561523, + "step": 6606 + }, + { + "epoch": 1.65, + "grad_norm": 4.573606967926025, + "learning_rate": 7.537118028093338e-06, + "logits/chosen": -0.4115709662437439, + "logits/rejected": -0.4512343406677246, + "logps/chosen": -60.917510986328125, + "logps/rejected": -92.973876953125, + "loss": 0.6828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6903076171875, + "rewards/margins": 4.276149272918701, + "rewards/rejected": -1.5858409404754639, + "step": 6607 + }, + { + "epoch": 1.65, + "grad_norm": 5.895534038543701, + "learning_rate": 7.5364407137436265e-06, + "logits/chosen": -0.3565637767314911, + "logits/rejected": -0.4943971037864685, + "logps/chosen": -78.79301452636719, + "logps/rejected": -76.97988891601562, + "loss": 0.937, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.802443027496338, + "rewards/margins": 4.106625556945801, + "rewards/rejected": -1.304182767868042, + "step": 6608 + }, + { + "epoch": 1.65, + "grad_norm": 15.21973705291748, + "learning_rate": 7.5357633367157665e-06, + "logits/chosen": -0.4102858901023865, + "logits/rejected": -0.4358505308628082, + "logps/chosen": -61.569278717041016, + "logps/rejected": -98.07388305664062, + "loss": 0.7994, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4760351181030273, + "rewards/margins": 4.8880205154418945, + "rewards/rejected": -2.411985397338867, + "step": 6609 + }, + { + "epoch": 1.65, + "grad_norm": 12.672094345092773, + "learning_rate": 7.535085897026497e-06, + "logits/chosen": -0.4195455312728882, + "logits/rejected": -0.4714840054512024, + "logps/chosen": -56.35944366455078, + "logps/rejected": -74.02847290039062, + "loss": 0.8317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4633970260620117, + "rewards/margins": 4.27376127243042, + "rewards/rejected": -1.8103642463684082, + "step": 6610 + }, + { + "epoch": 1.65, + "grad_norm": 6.502291202545166, + "learning_rate": 7.5344083946925595e-06, + "logits/chosen": -0.3652026057243347, + "logits/rejected": -0.5201596617698669, + "logps/chosen": -51.38995361328125, + "logps/rejected": -90.37310791015625, + "loss": 0.6206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6890435218811035, + "rewards/margins": 6.5715179443359375, + "rewards/rejected": -3.882474660873413, + "step": 6611 + }, + { + "epoch": 1.65, + "grad_norm": 8.250455856323242, + "learning_rate": 7.533730829730694e-06, + "logits/chosen": -0.35946351289749146, + "logits/rejected": -0.45858606696128845, + "logps/chosen": -52.254268646240234, + "logps/rejected": -73.50906372070312, + "loss": 0.8267, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7646262645721436, + "rewards/margins": 3.929163932800293, + "rewards/rejected": -1.1645370721817017, + "step": 6612 + }, + { + "epoch": 1.65, + "grad_norm": 10.521665573120117, + "learning_rate": 7.533053202157645e-06, + "logits/chosen": -0.3767102360725403, + "logits/rejected": -0.42916905879974365, + "logps/chosen": -60.10406494140625, + "logps/rejected": -96.69652557373047, + "loss": 0.8311, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.568974018096924, + "rewards/margins": 3.9870688915252686, + "rewards/rejected": -1.4180943965911865, + "step": 6613 + }, + { + "epoch": 1.65, + "grad_norm": 3.970571756362915, + "learning_rate": 7.532375511990158e-06, + "logits/chosen": -0.3272440731525421, + "logits/rejected": -0.44988390803337097, + "logps/chosen": -59.250579833984375, + "logps/rejected": -98.07142639160156, + "loss": 0.636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2851226329803467, + "rewards/margins": 6.2059407234191895, + "rewards/rejected": -2.9208180904388428, + "step": 6614 + }, + { + "epoch": 1.65, + "grad_norm": 4.925250053405762, + "learning_rate": 7.531697759244978e-06, + "logits/chosen": -0.42284679412841797, + "logits/rejected": -0.48704227805137634, + "logps/chosen": -58.90509033203125, + "logps/rejected": -83.56782531738281, + "loss": 0.7646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7946135997772217, + "rewards/margins": 4.020875453948975, + "rewards/rejected": -1.2262623310089111, + "step": 6615 + }, + { + "epoch": 1.66, + "grad_norm": 3.464761257171631, + "learning_rate": 7.5310199439388534e-06, + "logits/chosen": -0.35277244448661804, + "logits/rejected": -0.42009812593460083, + "logps/chosen": -58.722206115722656, + "logps/rejected": -81.85931396484375, + "loss": 0.7047, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0323755741119385, + "rewards/margins": 5.640953063964844, + "rewards/rejected": -2.6085779666900635, + "step": 6616 + }, + { + "epoch": 1.66, + "grad_norm": 8.913434028625488, + "learning_rate": 7.530342066088533e-06, + "logits/chosen": -0.441562682390213, + "logits/rejected": -0.567988932132721, + "logps/chosen": -60.63165283203125, + "logps/rejected": -80.92465209960938, + "loss": 0.7087, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9663357734680176, + "rewards/margins": 5.695986270904541, + "rewards/rejected": -2.7296500205993652, + "step": 6617 + }, + { + "epoch": 1.66, + "grad_norm": 8.52471923828125, + "learning_rate": 7.529664125710769e-06, + "logits/chosen": -0.38089460134506226, + "logits/rejected": -0.47313934564590454, + "logps/chosen": -55.136878967285156, + "logps/rejected": -83.0323486328125, + "loss": 0.76, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.754556179046631, + "rewards/margins": 4.431610584259033, + "rewards/rejected": -1.6770542860031128, + "step": 6618 + }, + { + "epoch": 1.66, + "grad_norm": 5.063058376312256, + "learning_rate": 7.5289861228223145e-06, + "logits/chosen": -0.39311501383781433, + "logits/rejected": -0.5198186635971069, + "logps/chosen": -69.45378112792969, + "logps/rejected": -72.89857482910156, + "loss": 0.8353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7657971382141113, + "rewards/margins": 4.985881805419922, + "rewards/rejected": -2.2200846672058105, + "step": 6619 + }, + { + "epoch": 1.66, + "grad_norm": 5.335673809051514, + "learning_rate": 7.528308057439924e-06, + "logits/chosen": -0.3951249420642853, + "logits/rejected": -0.5068134069442749, + "logps/chosen": -67.55574798583984, + "logps/rejected": -75.33954620361328, + "loss": 0.8505, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.837214231491089, + "rewards/margins": 3.5135743618011475, + "rewards/rejected": -0.6763598322868347, + "step": 6620 + }, + { + "epoch": 1.66, + "grad_norm": 3.630324363708496, + "learning_rate": 7.52762992958035e-06, + "logits/chosen": -0.30645257234573364, + "logits/rejected": -0.4394618272781372, + "logps/chosen": -64.76507568359375, + "logps/rejected": -94.65312957763672, + "loss": 0.6752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9553351402282715, + "rewards/margins": 6.603106498718262, + "rewards/rejected": -3.6477718353271484, + "step": 6621 + }, + { + "epoch": 1.66, + "grad_norm": 6.703302383422852, + "learning_rate": 7.526951739260353e-06, + "logits/chosen": -0.5141401290893555, + "logits/rejected": -0.6587104201316833, + "logps/chosen": -47.45232391357422, + "logps/rejected": -73.39277648925781, + "loss": 0.604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9097795486450195, + "rewards/margins": 5.644443988800049, + "rewards/rejected": -2.7346644401550293, + "step": 6622 + }, + { + "epoch": 1.66, + "grad_norm": 6.5068793296813965, + "learning_rate": 7.526273486496692e-06, + "logits/chosen": -0.37851688265800476, + "logits/rejected": -0.4408063292503357, + "logps/chosen": -48.320465087890625, + "logps/rejected": -81.57402801513672, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.846735954284668, + "rewards/margins": 5.023250579833984, + "rewards/rejected": -2.176514148712158, + "step": 6623 + }, + { + "epoch": 1.66, + "grad_norm": 3.6259748935699463, + "learning_rate": 7.525595171306124e-06, + "logits/chosen": -0.36645328998565674, + "logits/rejected": -0.4921293556690216, + "logps/chosen": -52.24960708618164, + "logps/rejected": -76.00315856933594, + "loss": 0.6632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.995548725128174, + "rewards/margins": 5.168622016906738, + "rewards/rejected": -2.1730728149414062, + "step": 6624 + }, + { + "epoch": 1.66, + "grad_norm": 5.247336387634277, + "learning_rate": 7.524916793705412e-06, + "logits/chosen": -0.4736584722995758, + "logits/rejected": -0.5427708029747009, + "logps/chosen": -53.28437423706055, + "logps/rejected": -96.52420806884766, + "loss": 0.7919, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.145444631576538, + "rewards/margins": 5.227158069610596, + "rewards/rejected": -2.0817134380340576, + "step": 6625 + }, + { + "epoch": 1.66, + "grad_norm": 6.110379695892334, + "learning_rate": 7.524238353711323e-06, + "logits/chosen": -0.391742467880249, + "logits/rejected": -0.5269231200218201, + "logps/chosen": -66.77996063232422, + "logps/rejected": -73.35417175292969, + "loss": 0.8255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.885119915008545, + "rewards/margins": 5.107125759124756, + "rewards/rejected": -2.222005844116211, + "step": 6626 + }, + { + "epoch": 1.66, + "grad_norm": 11.841947555541992, + "learning_rate": 7.523559851340619e-06, + "logits/chosen": -0.2816823720932007, + "logits/rejected": -0.3581325113773346, + "logps/chosen": -59.08198928833008, + "logps/rejected": -94.59736633300781, + "loss": 0.7442, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9834325313568115, + "rewards/margins": 4.877509117126465, + "rewards/rejected": -1.8940762281417847, + "step": 6627 + }, + { + "epoch": 1.66, + "grad_norm": 4.516798496246338, + "learning_rate": 7.522881286610064e-06, + "logits/chosen": -0.27710309624671936, + "logits/rejected": -0.41829097270965576, + "logps/chosen": -60.75189971923828, + "logps/rejected": -91.68061828613281, + "loss": 0.7098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9775545597076416, + "rewards/margins": 5.6690897941589355, + "rewards/rejected": -2.691534996032715, + "step": 6628 + }, + { + "epoch": 1.66, + "grad_norm": 4.433670520782471, + "learning_rate": 7.522202659536433e-06, + "logits/chosen": -0.3130255341529846, + "logits/rejected": -0.4765358865261078, + "logps/chosen": -58.61211013793945, + "logps/rejected": -72.3741683959961, + "loss": 0.7733, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8413965702056885, + "rewards/margins": 5.409618377685547, + "rewards/rejected": -2.5682215690612793, + "step": 6629 + }, + { + "epoch": 1.66, + "grad_norm": 2.8963260650634766, + "learning_rate": 7.5215239701364894e-06, + "logits/chosen": -0.3092150390148163, + "logits/rejected": -0.50811368227005, + "logps/chosen": -56.239261627197266, + "logps/rejected": -66.63999938964844, + "loss": 0.6791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1819825172424316, + "rewards/margins": 5.783170223236084, + "rewards/rejected": -2.6011874675750732, + "step": 6630 + }, + { + "epoch": 1.66, + "grad_norm": 3.4443509578704834, + "learning_rate": 7.520845218427007e-06, + "logits/chosen": -0.3863981068134308, + "logits/rejected": -0.46326005458831787, + "logps/chosen": -54.7924690246582, + "logps/rejected": -82.3961181640625, + "loss": 0.6969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9323551654815674, + "rewards/margins": 5.452070236206055, + "rewards/rejected": -2.519714832305908, + "step": 6631 + }, + { + "epoch": 1.66, + "grad_norm": 5.404458522796631, + "learning_rate": 7.520166404424758e-06, + "logits/chosen": -0.4083821773529053, + "logits/rejected": -0.46068498492240906, + "logps/chosen": -49.70868682861328, + "logps/rejected": -90.26769256591797, + "loss": 0.6488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2988884449005127, + "rewards/margins": 4.500054359436035, + "rewards/rejected": -1.2011662721633911, + "step": 6632 + }, + { + "epoch": 1.66, + "grad_norm": 4.326682090759277, + "learning_rate": 7.519487528146517e-06, + "logits/chosen": -0.30613070726394653, + "logits/rejected": -0.40684089064598083, + "logps/chosen": -58.31559753417969, + "logps/rejected": -80.98561096191406, + "loss": 0.6824, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.975581407546997, + "rewards/margins": 4.242523193359375, + "rewards/rejected": -1.266941785812378, + "step": 6633 + }, + { + "epoch": 1.66, + "grad_norm": 3.9713387489318848, + "learning_rate": 7.518808589609059e-06, + "logits/chosen": -0.4293779730796814, + "logits/rejected": -0.49040669202804565, + "logps/chosen": -63.526336669921875, + "logps/rejected": -96.26628112792969, + "loss": 0.7835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.762632369995117, + "rewards/margins": 4.539505958557129, + "rewards/rejected": -1.776873230934143, + "step": 6634 + }, + { + "epoch": 1.66, + "grad_norm": 5.818007946014404, + "learning_rate": 7.518129588829161e-06, + "logits/chosen": -0.34609463810920715, + "logits/rejected": -0.39547207951545715, + "logps/chosen": -47.78095245361328, + "logps/rejected": -88.17481231689453, + "loss": 0.7553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.943840503692627, + "rewards/margins": 4.860458850860596, + "rewards/rejected": -1.9166178703308105, + "step": 6635 + }, + { + "epoch": 1.66, + "grad_norm": 2.093625545501709, + "learning_rate": 7.517450525823602e-06, + "logits/chosen": -0.3296201825141907, + "logits/rejected": -0.3948843479156494, + "logps/chosen": -50.40132141113281, + "logps/rejected": -95.74063110351562, + "loss": 0.6134, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1905722618103027, + "rewards/margins": 6.023292541503906, + "rewards/rejected": -2.8327207565307617, + "step": 6636 + }, + { + "epoch": 1.66, + "grad_norm": 5.072338581085205, + "learning_rate": 7.516771400609164e-06, + "logits/chosen": -0.43428996205329895, + "logits/rejected": -0.5154039859771729, + "logps/chosen": -50.55857849121094, + "logps/rejected": -85.43194580078125, + "loss": 0.7302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7948594093322754, + "rewards/margins": 4.551892280578613, + "rewards/rejected": -1.7570327520370483, + "step": 6637 + }, + { + "epoch": 1.66, + "grad_norm": 4.093358039855957, + "learning_rate": 7.5160922132026284e-06, + "logits/chosen": -0.37962639331817627, + "logits/rejected": -0.4923396706581116, + "logps/chosen": -51.03655242919922, + "logps/rejected": -74.640869140625, + "loss": 0.7143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.964404821395874, + "rewards/margins": 4.75880241394043, + "rewards/rejected": -1.7943978309631348, + "step": 6638 + }, + { + "epoch": 1.66, + "grad_norm": 4.3854804039001465, + "learning_rate": 7.515412963620776e-06, + "logits/chosen": -0.35946646332740784, + "logits/rejected": -0.4751637876033783, + "logps/chosen": -52.58028793334961, + "logps/rejected": -88.43550109863281, + "loss": 0.7354, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8082706928253174, + "rewards/margins": 5.490124225616455, + "rewards/rejected": -2.6818535327911377, + "step": 6639 + }, + { + "epoch": 1.66, + "grad_norm": 3.480191469192505, + "learning_rate": 7.514733651880395e-06, + "logits/chosen": -0.45466750860214233, + "logits/rejected": -0.5295366048812866, + "logps/chosen": -44.9454231262207, + "logps/rejected": -75.28816223144531, + "loss": 0.694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0130958557128906, + "rewards/margins": 4.426334381103516, + "rewards/rejected": -1.4132381677627563, + "step": 6640 + }, + { + "epoch": 1.66, + "grad_norm": 4.9601664543151855, + "learning_rate": 7.5140542779982705e-06, + "logits/chosen": -0.3973279297351837, + "logits/rejected": -0.5002013444900513, + "logps/chosen": -54.12091064453125, + "logps/rejected": -75.00970458984375, + "loss": 0.7395, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7804481983184814, + "rewards/margins": 4.45378303527832, + "rewards/rejected": -1.6733345985412598, + "step": 6641 + }, + { + "epoch": 1.66, + "grad_norm": 14.934675216674805, + "learning_rate": 7.51337484199119e-06, + "logits/chosen": -0.4078609347343445, + "logits/rejected": -0.4948185384273529, + "logps/chosen": -57.886619567871094, + "logps/rejected": -81.00686645507812, + "loss": 0.7422, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9523091316223145, + "rewards/margins": 5.172281265258789, + "rewards/rejected": -2.2199723720550537, + "step": 6642 + }, + { + "epoch": 1.66, + "grad_norm": 3.7128899097442627, + "learning_rate": 7.512695343875945e-06, + "logits/chosen": -0.39441877603530884, + "logits/rejected": -0.5025861263275146, + "logps/chosen": -58.530235290527344, + "logps/rejected": -82.47907257080078, + "loss": 0.7225, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1455609798431396, + "rewards/margins": 4.377631187438965, + "rewards/rejected": -1.2320700883865356, + "step": 6643 + }, + { + "epoch": 1.66, + "grad_norm": 4.301883220672607, + "learning_rate": 7.512015783669324e-06, + "logits/chosen": -0.36046838760375977, + "logits/rejected": -0.4034501910209656, + "logps/chosen": -47.23704528808594, + "logps/rejected": -98.01085662841797, + "loss": 0.592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.860016107559204, + "rewards/margins": 5.546061038970947, + "rewards/rejected": -2.6860451698303223, + "step": 6644 + }, + { + "epoch": 1.66, + "grad_norm": 5.856071472167969, + "learning_rate": 7.511336161388123e-06, + "logits/chosen": -0.3730660378932953, + "logits/rejected": -0.4585765302181244, + "logps/chosen": -75.43665313720703, + "logps/rejected": -93.07229614257812, + "loss": 0.8955, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6818363666534424, + "rewards/margins": 4.988054275512695, + "rewards/rejected": -2.306217908859253, + "step": 6645 + }, + { + "epoch": 1.66, + "grad_norm": 3.177140951156616, + "learning_rate": 7.510656477049134e-06, + "logits/chosen": -0.4179116189479828, + "logits/rejected": -0.5649396777153015, + "logps/chosen": -59.24104690551758, + "logps/rejected": -79.24435424804688, + "loss": 0.6258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.867069721221924, + "rewards/margins": 5.5435075759887695, + "rewards/rejected": -2.6764371395111084, + "step": 6646 + }, + { + "epoch": 1.66, + "grad_norm": 20.047060012817383, + "learning_rate": 7.50997673066915e-06, + "logits/chosen": -0.41212281584739685, + "logits/rejected": -0.47582173347473145, + "logps/chosen": -64.65579223632812, + "logps/rejected": -71.9888916015625, + "loss": 0.9375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7182486057281494, + "rewards/margins": 4.318193435668945, + "rewards/rejected": -1.5999445915222168, + "step": 6647 + }, + { + "epoch": 1.66, + "grad_norm": 3.516231060028076, + "learning_rate": 7.509296922264975e-06, + "logits/chosen": -0.34392037987709045, + "logits/rejected": -0.36537015438079834, + "logps/chosen": -47.77876281738281, + "logps/rejected": -97.99073791503906, + "loss": 0.6059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0570313930511475, + "rewards/margins": 5.03248929977417, + "rewards/rejected": -1.975458025932312, + "step": 6648 + }, + { + "epoch": 1.66, + "grad_norm": 8.085297584533691, + "learning_rate": 7.5086170518534015e-06, + "logits/chosen": -0.41853067278862, + "logits/rejected": -0.5368511080741882, + "logps/chosen": -51.57832336425781, + "logps/rejected": -105.26516723632812, + "loss": 0.718, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0491929054260254, + "rewards/margins": 6.626729488372803, + "rewards/rejected": -3.5775370597839355, + "step": 6649 + }, + { + "epoch": 1.66, + "grad_norm": 12.193159103393555, + "learning_rate": 7.507937119451234e-06, + "logits/chosen": -0.4143536686897278, + "logits/rejected": -0.5443090200424194, + "logps/chosen": -61.31427001953125, + "logps/rejected": -80.13378143310547, + "loss": 0.7281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0197012424468994, + "rewards/margins": 4.822088718414307, + "rewards/rejected": -1.8023871183395386, + "step": 6650 + }, + { + "epoch": 1.66, + "grad_norm": 3.5691497325897217, + "learning_rate": 7.507257125075271e-06, + "logits/chosen": -0.4446873068809509, + "logits/rejected": -0.4831494092941284, + "logps/chosen": -50.90979766845703, + "logps/rejected": -81.73832702636719, + "loss": 0.6852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8284637928009033, + "rewards/margins": 4.914862155914307, + "rewards/rejected": -2.086398124694824, + "step": 6651 + }, + { + "epoch": 1.66, + "grad_norm": 8.268722534179688, + "learning_rate": 7.50657706874232e-06, + "logits/chosen": -0.3931032419204712, + "logits/rejected": -0.4932211935520172, + "logps/chosen": -59.65684509277344, + "logps/rejected": -100.80237579345703, + "loss": 0.7384, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.956897020339966, + "rewards/margins": 6.031012535095215, + "rewards/rejected": -3.074115514755249, + "step": 6652 + }, + { + "epoch": 1.66, + "grad_norm": 12.327259063720703, + "learning_rate": 7.5058969504691814e-06, + "logits/chosen": -0.3917818069458008, + "logits/rejected": -0.3915511965751648, + "logps/chosen": -55.60696792602539, + "logps/rejected": -93.40936279296875, + "loss": 0.736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7275261878967285, + "rewards/margins": 4.4988603591918945, + "rewards/rejected": -1.7713342905044556, + "step": 6653 + }, + { + "epoch": 1.66, + "grad_norm": 20.60093116760254, + "learning_rate": 7.505216770272665e-06, + "logits/chosen": -0.3584495484828949, + "logits/rejected": -0.47863250970840454, + "logps/chosen": -60.915164947509766, + "logps/rejected": -92.45071411132812, + "loss": 0.7079, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.762786388397217, + "rewards/margins": 5.516628265380859, + "rewards/rejected": -2.7538418769836426, + "step": 6654 + }, + { + "epoch": 1.66, + "grad_norm": 13.596396446228027, + "learning_rate": 7.50453652816958e-06, + "logits/chosen": -0.3933514654636383, + "logits/rejected": -0.49998584389686584, + "logps/chosen": -58.73808288574219, + "logps/rejected": -90.60734558105469, + "loss": 0.7556, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7252180576324463, + "rewards/margins": 5.261302471160889, + "rewards/rejected": -2.5360841751098633, + "step": 6655 + }, + { + "epoch": 1.67, + "grad_norm": 7.1155924797058105, + "learning_rate": 7.503856224176727e-06, + "logits/chosen": -0.3761308491230011, + "logits/rejected": -0.4322989583015442, + "logps/chosen": -63.671226501464844, + "logps/rejected": -86.18211364746094, + "loss": 0.8563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6895647048950195, + "rewards/margins": 4.4142327308654785, + "rewards/rejected": -1.7246683835983276, + "step": 6656 + }, + { + "epoch": 1.67, + "grad_norm": 3.770137310028076, + "learning_rate": 7.50317585831093e-06, + "logits/chosen": -0.41732385754585266, + "logits/rejected": -0.49174022674560547, + "logps/chosen": -62.88144302368164, + "logps/rejected": -95.66120910644531, + "loss": 0.7605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9578256607055664, + "rewards/margins": 5.649764060974121, + "rewards/rejected": -2.6919379234313965, + "step": 6657 + }, + { + "epoch": 1.67, + "grad_norm": 5.824466228485107, + "learning_rate": 7.502495430588991e-06, + "logits/chosen": -0.3585677444934845, + "logits/rejected": -0.4571152329444885, + "logps/chosen": -64.55579376220703, + "logps/rejected": -87.18727111816406, + "loss": 0.8055, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1902084350585938, + "rewards/margins": 5.331580638885498, + "rewards/rejected": -2.1413724422454834, + "step": 6658 + }, + { + "epoch": 1.67, + "grad_norm": 5.429007053375244, + "learning_rate": 7.501814941027728e-06, + "logits/chosen": -0.3977602422237396, + "logits/rejected": -0.43628767132759094, + "logps/chosen": -54.31087875366211, + "logps/rejected": -91.191650390625, + "loss": 0.7193, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1414856910705566, + "rewards/margins": 4.607990741729736, + "rewards/rejected": -1.466505527496338, + "step": 6659 + }, + { + "epoch": 1.67, + "grad_norm": 6.3934760093688965, + "learning_rate": 7.501134389643958e-06, + "logits/chosen": -0.4047791063785553, + "logits/rejected": -0.4976569712162018, + "logps/chosen": -43.294960021972656, + "logps/rejected": -72.5412826538086, + "loss": 0.5968, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.35127329826355, + "rewards/margins": 6.031166076660156, + "rewards/rejected": -2.6798930168151855, + "step": 6660 + }, + { + "epoch": 1.67, + "grad_norm": 6.362386703491211, + "learning_rate": 7.500453776454498e-06, + "logits/chosen": -0.37451961636543274, + "logits/rejected": -0.4591871500015259, + "logps/chosen": -57.62352752685547, + "logps/rejected": -93.83423614501953, + "loss": 0.6427, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.232058525085449, + "rewards/margins": 5.737922191619873, + "rewards/rejected": -2.505864143371582, + "step": 6661 + }, + { + "epoch": 1.67, + "grad_norm": 4.55642032623291, + "learning_rate": 7.499773101476164e-06, + "logits/chosen": -0.3819632828235626, + "logits/rejected": -0.4673832356929779, + "logps/chosen": -49.27709197998047, + "logps/rejected": -86.04907989501953, + "loss": 0.6366, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1621689796447754, + "rewards/margins": 5.090298652648926, + "rewards/rejected": -1.92812979221344, + "step": 6662 + }, + { + "epoch": 1.67, + "grad_norm": 3.750680685043335, + "learning_rate": 7.499092364725778e-06, + "logits/chosen": -0.3762511610984802, + "logits/rejected": -0.45062917470932007, + "logps/chosen": -50.79290771484375, + "logps/rejected": -90.29568481445312, + "loss": 0.6265, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.381425142288208, + "rewards/margins": 5.5532546043396, + "rewards/rejected": -2.1718292236328125, + "step": 6663 + }, + { + "epoch": 1.67, + "grad_norm": 11.724185943603516, + "learning_rate": 7.498411566220163e-06, + "logits/chosen": -0.37501177191734314, + "logits/rejected": -0.4469972252845764, + "logps/chosen": -55.43610382080078, + "logps/rejected": -83.305419921875, + "loss": 0.822, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.826256036758423, + "rewards/margins": 4.552537441253662, + "rewards/rejected": -1.7262815237045288, + "step": 6664 + }, + { + "epoch": 1.67, + "grad_norm": 6.802711486816406, + "learning_rate": 7.497730705976139e-06, + "logits/chosen": -0.3965933322906494, + "logits/rejected": -0.43009549379348755, + "logps/chosen": -59.56511688232422, + "logps/rejected": -97.75454711914062, + "loss": 0.8095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.54740047454834, + "rewards/margins": 4.603793144226074, + "rewards/rejected": -2.0563931465148926, + "step": 6665 + }, + { + "epoch": 1.67, + "grad_norm": 7.364982604980469, + "learning_rate": 7.497049784010534e-06, + "logits/chosen": -0.34599539637565613, + "logits/rejected": -0.3408385217189789, + "logps/chosen": -54.610809326171875, + "logps/rejected": -102.8402099609375, + "loss": 0.6008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.990513801574707, + "rewards/margins": 4.733537197113037, + "rewards/rejected": -1.743023157119751, + "step": 6666 + }, + { + "epoch": 1.67, + "grad_norm": 13.497769355773926, + "learning_rate": 7.496368800340171e-06, + "logits/chosen": -0.47837334871292114, + "logits/rejected": -0.4998142719268799, + "logps/chosen": -55.72581100463867, + "logps/rejected": -100.11471557617188, + "loss": 0.986, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.751755714416504, + "rewards/margins": 5.087775230407715, + "rewards/rejected": -2.33601975440979, + "step": 6667 + }, + { + "epoch": 1.67, + "grad_norm": 5.178656101226807, + "learning_rate": 7.495687754981881e-06, + "logits/chosen": -0.4348258376121521, + "logits/rejected": -0.48729148507118225, + "logps/chosen": -49.94952392578125, + "logps/rejected": -112.89826202392578, + "loss": 0.6728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.773571252822876, + "rewards/margins": 5.903700828552246, + "rewards/rejected": -3.130129814147949, + "step": 6668 + }, + { + "epoch": 1.67, + "grad_norm": 4.130282402038574, + "learning_rate": 7.495006647952492e-06, + "logits/chosen": -0.4333837330341339, + "logits/rejected": -0.5105006098747253, + "logps/chosen": -51.90802764892578, + "logps/rejected": -97.71976470947266, + "loss": 0.6302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7096667289733887, + "rewards/margins": 4.7048115730285645, + "rewards/rejected": -1.9951448440551758, + "step": 6669 + }, + { + "epoch": 1.67, + "grad_norm": 3.6239852905273438, + "learning_rate": 7.494325479268833e-06, + "logits/chosen": -0.43555766344070435, + "logits/rejected": -0.4867215156555176, + "logps/chosen": -53.708152770996094, + "logps/rejected": -93.75407409667969, + "loss": 0.6706, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.015388011932373, + "rewards/margins": 6.456911563873291, + "rewards/rejected": -3.441523313522339, + "step": 6670 + }, + { + "epoch": 1.67, + "grad_norm": 4.790605068206787, + "learning_rate": 7.493644248947741e-06, + "logits/chosen": -0.3998715579509735, + "logits/rejected": -0.4725600481033325, + "logps/chosen": -85.72693634033203, + "logps/rejected": -76.304931640625, + "loss": 0.6702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.901198387145996, + "rewards/margins": 5.1581807136535645, + "rewards/rejected": -2.2569823265075684, + "step": 6671 + }, + { + "epoch": 1.67, + "grad_norm": 4.846486568450928, + "learning_rate": 7.492962957006047e-06, + "logits/chosen": -0.26471206545829773, + "logits/rejected": -0.3462646007537842, + "logps/chosen": -61.638938903808594, + "logps/rejected": -81.64556121826172, + "loss": 0.8488, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8201889991760254, + "rewards/margins": 5.146305084228516, + "rewards/rejected": -2.3261170387268066, + "step": 6672 + }, + { + "epoch": 1.67, + "grad_norm": 5.09996223449707, + "learning_rate": 7.492281603460585e-06, + "logits/chosen": -0.3865690231323242, + "logits/rejected": -0.5084656476974487, + "logps/chosen": -59.568016052246094, + "logps/rejected": -105.98536682128906, + "loss": 0.6022, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.803884983062744, + "rewards/margins": 6.538071632385254, + "rewards/rejected": -3.7341866493225098, + "step": 6673 + }, + { + "epoch": 1.67, + "grad_norm": 5.393928527832031, + "learning_rate": 7.491600188328194e-06, + "logits/chosen": -0.4142231047153473, + "logits/rejected": -0.4096422493457794, + "logps/chosen": -54.57426071166992, + "logps/rejected": -112.94339752197266, + "loss": 0.7655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.500253438949585, + "rewards/margins": 5.2361741065979, + "rewards/rejected": -2.7359206676483154, + "step": 6674 + }, + { + "epoch": 1.67, + "grad_norm": 5.86639404296875, + "learning_rate": 7.490918711625715e-06, + "logits/chosen": -0.2890367805957794, + "logits/rejected": -0.40047597885131836, + "logps/chosen": -71.79484558105469, + "logps/rejected": -89.39583587646484, + "loss": 0.6845, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.840927839279175, + "rewards/margins": 5.2651519775390625, + "rewards/rejected": -2.4242238998413086, + "step": 6675 + }, + { + "epoch": 1.67, + "grad_norm": 7.417694568634033, + "learning_rate": 7.4902371733699805e-06, + "logits/chosen": -0.536095917224884, + "logits/rejected": -0.608860194683075, + "logps/chosen": -46.964805603027344, + "logps/rejected": -94.43620300292969, + "loss": 0.5998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.966951370239258, + "rewards/margins": 6.495203018188477, + "rewards/rejected": -3.5282514095306396, + "step": 6676 + }, + { + "epoch": 1.67, + "grad_norm": 3.651048183441162, + "learning_rate": 7.48955557357784e-06, + "logits/chosen": -0.37461918592453003, + "logits/rejected": -0.4799949824810028, + "logps/chosen": -60.326271057128906, + "logps/rejected": -74.2596206665039, + "loss": 0.6935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7871994972229004, + "rewards/margins": 4.316127300262451, + "rewards/rejected": -1.5289279222488403, + "step": 6677 + }, + { + "epoch": 1.67, + "grad_norm": 8.395633697509766, + "learning_rate": 7.4888739122661325e-06, + "logits/chosen": -0.41827529668807983, + "logits/rejected": -0.5279641151428223, + "logps/chosen": -44.8487548828125, + "logps/rejected": -84.19463348388672, + "loss": 0.5458, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8213655948638916, + "rewards/margins": 6.279644012451172, + "rewards/rejected": -3.458278179168701, + "step": 6678 + }, + { + "epoch": 1.67, + "grad_norm": 6.89540433883667, + "learning_rate": 7.488192189451703e-06, + "logits/chosen": -0.40676140785217285, + "logits/rejected": -0.45656639337539673, + "logps/chosen": -60.003875732421875, + "logps/rejected": -83.79681396484375, + "loss": 0.7477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7841432094573975, + "rewards/margins": 5.068508625030518, + "rewards/rejected": -2.284365177154541, + "step": 6679 + }, + { + "epoch": 1.67, + "grad_norm": 4.597929954528809, + "learning_rate": 7.487510405151399e-06, + "logits/chosen": -0.39536863565444946, + "logits/rejected": -0.5071895122528076, + "logps/chosen": -66.50504302978516, + "logps/rejected": -82.2592544555664, + "loss": 0.7248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.722778797149658, + "rewards/margins": 5.234105587005615, + "rewards/rejected": -2.511326789855957, + "step": 6680 + }, + { + "epoch": 1.67, + "grad_norm": 9.264411926269531, + "learning_rate": 7.486828559382065e-06, + "logits/chosen": -0.48523247241973877, + "logits/rejected": -0.523095428943634, + "logps/chosen": -57.43411636352539, + "logps/rejected": -92.60948944091797, + "loss": 0.7682, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.563044309616089, + "rewards/margins": 4.487249851226807, + "rewards/rejected": -1.9242054224014282, + "step": 6681 + }, + { + "epoch": 1.67, + "grad_norm": 6.576061248779297, + "learning_rate": 7.486146652160554e-06, + "logits/chosen": -0.3720993399620056, + "logits/rejected": -0.47521698474884033, + "logps/chosen": -68.21858215332031, + "logps/rejected": -93.66207122802734, + "loss": 0.8576, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.623276948928833, + "rewards/margins": 5.3978071212768555, + "rewards/rejected": -2.7745299339294434, + "step": 6682 + }, + { + "epoch": 1.67, + "grad_norm": 2.404104471206665, + "learning_rate": 7.485464683503714e-06, + "logits/chosen": -0.36043959856033325, + "logits/rejected": -0.4971340298652649, + "logps/chosen": -56.21367263793945, + "logps/rejected": -97.81224060058594, + "loss": 0.5794, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.043346881866455, + "rewards/margins": 7.14375638961792, + "rewards/rejected": -4.100409507751465, + "step": 6683 + }, + { + "epoch": 1.67, + "grad_norm": 4.031485080718994, + "learning_rate": 7.4847826534284e-06, + "logits/chosen": -0.4427887499332428, + "logits/rejected": -0.5345988273620605, + "logps/chosen": -53.071842193603516, + "logps/rejected": -85.92491149902344, + "loss": 0.7358, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9879798889160156, + "rewards/margins": 5.195660591125488, + "rewards/rejected": -2.207681179046631, + "step": 6684 + }, + { + "epoch": 1.67, + "grad_norm": 9.465729713439941, + "learning_rate": 7.484100561951459e-06, + "logits/chosen": -0.4362257719039917, + "logits/rejected": -0.4084518551826477, + "logps/chosen": -60.28065872192383, + "logps/rejected": -105.9795913696289, + "loss": 0.8452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6335415840148926, + "rewards/margins": 3.5454506874084473, + "rewards/rejected": -0.9119093418121338, + "step": 6685 + }, + { + "epoch": 1.67, + "grad_norm": 6.353804588317871, + "learning_rate": 7.483418409089755e-06, + "logits/chosen": -0.33235234022140503, + "logits/rejected": -0.42158809304237366, + "logps/chosen": -64.0519027709961, + "logps/rejected": -83.7025146484375, + "loss": 0.7149, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.898395538330078, + "rewards/margins": 4.895741939544678, + "rewards/rejected": -1.9973466396331787, + "step": 6686 + }, + { + "epoch": 1.67, + "grad_norm": 5.216326713562012, + "learning_rate": 7.482736194860139e-06, + "logits/chosen": -0.3541921079158783, + "logits/rejected": -0.40394189953804016, + "logps/chosen": -58.82952880859375, + "logps/rejected": -87.75428009033203, + "loss": 0.8364, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.829251527786255, + "rewards/margins": 4.131509780883789, + "rewards/rejected": -1.3022584915161133, + "step": 6687 + }, + { + "epoch": 1.67, + "grad_norm": 4.953685760498047, + "learning_rate": 7.482053919279473e-06, + "logits/chosen": -0.45883166790008545, + "logits/rejected": -0.4602949023246765, + "logps/chosen": -52.82254409790039, + "logps/rejected": -89.65216064453125, + "loss": 0.7931, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.853623867034912, + "rewards/margins": 4.383828163146973, + "rewards/rejected": -1.5302042961120605, + "step": 6688 + }, + { + "epoch": 1.67, + "grad_norm": 3.2667062282562256, + "learning_rate": 7.481371582364613e-06, + "logits/chosen": -0.4042799174785614, + "logits/rejected": -0.5390001535415649, + "logps/chosen": -65.94269561767578, + "logps/rejected": -79.85231018066406, + "loss": 0.691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9938180446624756, + "rewards/margins": 5.489439487457275, + "rewards/rejected": -2.4956214427948, + "step": 6689 + }, + { + "epoch": 1.67, + "grad_norm": 7.124931812286377, + "learning_rate": 7.48068918413242e-06, + "logits/chosen": -0.4331514835357666, + "logits/rejected": -0.4970631003379822, + "logps/chosen": -64.63363647460938, + "logps/rejected": -82.59033203125, + "loss": 0.8655, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1846184730529785, + "rewards/margins": 4.354855537414551, + "rewards/rejected": -1.1702373027801514, + "step": 6690 + }, + { + "epoch": 1.67, + "grad_norm": 4.596276760101318, + "learning_rate": 7.480006724599762e-06, + "logits/chosen": -0.38609418272972107, + "logits/rejected": -0.4563869833946228, + "logps/chosen": -50.12387466430664, + "logps/rejected": -80.10228729248047, + "loss": 0.728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.80076265335083, + "rewards/margins": 4.5855607986450195, + "rewards/rejected": -1.7847979068756104, + "step": 6691 + }, + { + "epoch": 1.67, + "grad_norm": 2.7730965614318848, + "learning_rate": 7.479324203783498e-06, + "logits/chosen": -0.34077754616737366, + "logits/rejected": -0.49802643060684204, + "logps/chosen": -54.4720344543457, + "logps/rejected": -89.71851348876953, + "loss": 0.5656, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860363483428955, + "rewards/margins": 6.129034042358398, + "rewards/rejected": -3.2686707973480225, + "step": 6692 + }, + { + "epoch": 1.67, + "grad_norm": 9.159106254577637, + "learning_rate": 7.478641621700496e-06, + "logits/chosen": -0.4404393434524536, + "logits/rejected": -0.4214571416378021, + "logps/chosen": -54.95729064941406, + "logps/rejected": -97.21730041503906, + "loss": 0.8438, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9209938049316406, + "rewards/margins": 3.6322925090789795, + "rewards/rejected": -0.7112985253334045, + "step": 6693 + }, + { + "epoch": 1.67, + "grad_norm": 18.361146926879883, + "learning_rate": 7.477958978367624e-06, + "logits/chosen": -0.33137667179107666, + "logits/rejected": -0.3595125079154968, + "logps/chosen": -64.01275634765625, + "logps/rejected": -97.05590057373047, + "loss": 0.8188, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.711613893508911, + "rewards/margins": 4.297296047210693, + "rewards/rejected": -1.5856819152832031, + "step": 6694 + }, + { + "epoch": 1.67, + "grad_norm": 6.443527698516846, + "learning_rate": 7.47727627380175e-06, + "logits/chosen": -0.3041577637195587, + "logits/rejected": -0.45579519867897034, + "logps/chosen": -69.51972961425781, + "logps/rejected": -99.90707397460938, + "loss": 0.673, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.10505747795105, + "rewards/margins": 6.0578179359436035, + "rewards/rejected": -2.952760696411133, + "step": 6695 + }, + { + "epoch": 1.68, + "grad_norm": 6.200024604797363, + "learning_rate": 7.476593508019744e-06, + "logits/chosen": -0.3291955292224884, + "logits/rejected": -0.4293404519557953, + "logps/chosen": -56.7217903137207, + "logps/rejected": -109.194091796875, + "loss": 0.6669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994391918182373, + "rewards/margins": 7.046256065368652, + "rewards/rejected": -4.051864147186279, + "step": 6696 + }, + { + "epoch": 1.68, + "grad_norm": 4.264641761779785, + "learning_rate": 7.475910681038477e-06, + "logits/chosen": -0.409869521856308, + "logits/rejected": -0.494127094745636, + "logps/chosen": -49.839149475097656, + "logps/rejected": -87.14502716064453, + "loss": 0.6248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8350908756256104, + "rewards/margins": 5.276065826416016, + "rewards/rejected": -2.440974712371826, + "step": 6697 + }, + { + "epoch": 1.68, + "grad_norm": 6.387762546539307, + "learning_rate": 7.475227792874826e-06, + "logits/chosen": -0.35505396127700806, + "logits/rejected": -0.44139760732650757, + "logps/chosen": -65.9186782836914, + "logps/rejected": -104.86763763427734, + "loss": 0.7281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6916096210479736, + "rewards/margins": 4.943129062652588, + "rewards/rejected": -2.2515196800231934, + "step": 6698 + }, + { + "epoch": 1.68, + "grad_norm": 7.552352428436279, + "learning_rate": 7.474544843545661e-06, + "logits/chosen": -0.43398231267929077, + "logits/rejected": -0.525926947593689, + "logps/chosen": -53.471946716308594, + "logps/rejected": -94.18109130859375, + "loss": 0.7773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2385447025299072, + "rewards/margins": 5.574713230133057, + "rewards/rejected": -2.336167812347412, + "step": 6699 + }, + { + "epoch": 1.68, + "grad_norm": 6.425453186035156, + "learning_rate": 7.473861833067862e-06, + "logits/chosen": -0.3227335214614868, + "logits/rejected": -0.41952744126319885, + "logps/chosen": -77.968505859375, + "logps/rejected": -97.58296966552734, + "loss": 0.76, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.779031276702881, + "rewards/margins": 5.233487606048584, + "rewards/rejected": -2.454456329345703, + "step": 6700 + }, + { + "epoch": 1.68, + "grad_norm": 6.017228603363037, + "learning_rate": 7.473178761458306e-06, + "logits/chosen": -0.42582547664642334, + "logits/rejected": -0.5627548098564148, + "logps/chosen": -55.49212646484375, + "logps/rejected": -71.11817932128906, + "loss": 0.7418, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9123921394348145, + "rewards/margins": 5.114208221435547, + "rewards/rejected": -2.2018158435821533, + "step": 6701 + }, + { + "epoch": 1.68, + "grad_norm": 5.343839168548584, + "learning_rate": 7.472495628733871e-06, + "logits/chosen": -0.3712810277938843, + "logits/rejected": -0.46962469816207886, + "logps/chosen": -54.90603256225586, + "logps/rejected": -90.89984893798828, + "loss": 0.6083, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6326651573181152, + "rewards/margins": 6.127014636993408, + "rewards/rejected": -3.4943490028381348, + "step": 6702 + }, + { + "epoch": 1.68, + "grad_norm": 7.466826438903809, + "learning_rate": 7.471812434911441e-06, + "logits/chosen": -0.4251663386821747, + "logits/rejected": -0.4998118579387665, + "logps/chosen": -51.23268127441406, + "logps/rejected": -112.12308502197266, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.008058547973633, + "rewards/margins": 5.816571235656738, + "rewards/rejected": -2.8085126876831055, + "step": 6703 + }, + { + "epoch": 1.68, + "grad_norm": 6.5709428787231445, + "learning_rate": 7.471129180007896e-06, + "logits/chosen": -0.37218013405799866, + "logits/rejected": -0.4500841796398163, + "logps/chosen": -49.37016296386719, + "logps/rejected": -82.08950805664062, + "loss": 0.8488, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9559519290924072, + "rewards/margins": 5.134854316711426, + "rewards/rejected": -2.1789021492004395, + "step": 6704 + }, + { + "epoch": 1.68, + "grad_norm": 16.54269027709961, + "learning_rate": 7.47044586404012e-06, + "logits/chosen": -0.39594653248786926, + "logits/rejected": -0.4737033545970917, + "logps/chosen": -59.54766082763672, + "logps/rejected": -75.97969055175781, + "loss": 0.8825, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8339316844940186, + "rewards/margins": 4.6095476150512695, + "rewards/rejected": -1.7756155729293823, + "step": 6705 + }, + { + "epoch": 1.68, + "grad_norm": 6.5977277755737305, + "learning_rate": 7.4697624870249985e-06, + "logits/chosen": -0.4074711799621582, + "logits/rejected": -0.45496106147766113, + "logps/chosen": -48.53630828857422, + "logps/rejected": -92.20295715332031, + "loss": 0.7825, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9668824672698975, + "rewards/margins": 4.694267749786377, + "rewards/rejected": -1.72738516330719, + "step": 6706 + }, + { + "epoch": 1.68, + "grad_norm": 6.012142658233643, + "learning_rate": 7.4690790489794195e-06, + "logits/chosen": -0.35164663195610046, + "logits/rejected": -0.4052916169166565, + "logps/chosen": -52.35114288330078, + "logps/rejected": -91.95917510986328, + "loss": 0.6628, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7791748046875, + "rewards/margins": 4.655212879180908, + "rewards/rejected": -1.8760383129119873, + "step": 6707 + }, + { + "epoch": 1.68, + "grad_norm": 10.931998252868652, + "learning_rate": 7.468395549920271e-06, + "logits/chosen": -0.27608251571655273, + "logits/rejected": -0.4035520851612091, + "logps/chosen": -72.64466857910156, + "logps/rejected": -89.70073699951172, + "loss": 0.8139, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.459904193878174, + "rewards/margins": 4.291044235229492, + "rewards/rejected": -1.8311399221420288, + "step": 6708 + }, + { + "epoch": 1.68, + "grad_norm": 7.188682556152344, + "learning_rate": 7.4677119898644436e-06, + "logits/chosen": -0.3284572958946228, + "logits/rejected": -0.4648206830024719, + "logps/chosen": -54.111637115478516, + "logps/rejected": -83.75940704345703, + "loss": 0.7544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.842031717300415, + "rewards/margins": 5.388607501983643, + "rewards/rejected": -2.5465760231018066, + "step": 6709 + }, + { + "epoch": 1.68, + "grad_norm": 3.6169135570526123, + "learning_rate": 7.467028368828828e-06, + "logits/chosen": -0.37303826212882996, + "logits/rejected": -0.4725921154022217, + "logps/chosen": -54.09381103515625, + "logps/rejected": -88.20588684082031, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9269120693206787, + "rewards/margins": 5.48300313949585, + "rewards/rejected": -2.556090831756592, + "step": 6710 + }, + { + "epoch": 1.68, + "grad_norm": 4.3615593910217285, + "learning_rate": 7.466344686830317e-06, + "logits/chosen": -0.3507443964481354, + "logits/rejected": -0.45047056674957275, + "logps/chosen": -57.331817626953125, + "logps/rejected": -95.72516632080078, + "loss": 0.6121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9371330738067627, + "rewards/margins": 6.463882923126221, + "rewards/rejected": -3.526749610900879, + "step": 6711 + }, + { + "epoch": 1.68, + "grad_norm": 4.957427501678467, + "learning_rate": 7.4656609438858066e-06, + "logits/chosen": -0.36372873187065125, + "logits/rejected": -0.3975028693675995, + "logps/chosen": -53.47884750366211, + "logps/rejected": -87.25727844238281, + "loss": 0.6983, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9965827465057373, + "rewards/margins": 5.06575870513916, + "rewards/rejected": -2.0691757202148438, + "step": 6712 + }, + { + "epoch": 1.68, + "grad_norm": 4.076523303985596, + "learning_rate": 7.464977140012188e-06, + "logits/chosen": -0.35313987731933594, + "logits/rejected": -0.44927743077278137, + "logps/chosen": -63.2376708984375, + "logps/rejected": -81.47102355957031, + "loss": 0.7521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0939905643463135, + "rewards/margins": 4.876219749450684, + "rewards/rejected": -1.782228708267212, + "step": 6713 + }, + { + "epoch": 1.68, + "grad_norm": 3.0252110958099365, + "learning_rate": 7.464293275226366e-06, + "logits/chosen": -0.40324637293815613, + "logits/rejected": -0.4497952163219452, + "logps/chosen": -53.47024917602539, + "logps/rejected": -89.7518539428711, + "loss": 0.6536, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.13358736038208, + "rewards/margins": 5.736898422241211, + "rewards/rejected": -2.603311777114868, + "step": 6714 + }, + { + "epoch": 1.68, + "grad_norm": 17.725128173828125, + "learning_rate": 7.463609349545236e-06, + "logits/chosen": -0.3873911201953888, + "logits/rejected": -0.4904002845287323, + "logps/chosen": -62.95331573486328, + "logps/rejected": -81.17654418945312, + "loss": 0.9062, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.507207155227661, + "rewards/margins": 4.881614685058594, + "rewards/rejected": -2.3744075298309326, + "step": 6715 + }, + { + "epoch": 1.68, + "grad_norm": 10.538033485412598, + "learning_rate": 7.462925362985697e-06, + "logits/chosen": -0.36699771881103516, + "logits/rejected": -0.47139987349510193, + "logps/chosen": -57.926631927490234, + "logps/rejected": -91.10389709472656, + "loss": 0.7401, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7716116905212402, + "rewards/margins": 5.212932586669922, + "rewards/rejected": -2.4413211345672607, + "step": 6716 + }, + { + "epoch": 1.68, + "grad_norm": 9.75704288482666, + "learning_rate": 7.462241315564654e-06, + "logits/chosen": -0.38416603207588196, + "logits/rejected": -0.4742065668106079, + "logps/chosen": -69.88294982910156, + "logps/rejected": -80.14490509033203, + "loss": 0.9142, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1238768100738525, + "rewards/margins": 4.401177406311035, + "rewards/rejected": -1.2773009538650513, + "step": 6717 + }, + { + "epoch": 1.68, + "grad_norm": 3.7458860874176025, + "learning_rate": 7.461557207299009e-06, + "logits/chosen": -0.4057043194770813, + "logits/rejected": -0.5483529567718506, + "logps/chosen": -54.17533874511719, + "logps/rejected": -74.86026763916016, + "loss": 0.6602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2909035682678223, + "rewards/margins": 5.455870628356934, + "rewards/rejected": -2.1649672985076904, + "step": 6718 + }, + { + "epoch": 1.68, + "grad_norm": 10.63126277923584, + "learning_rate": 7.4608730382056645e-06, + "logits/chosen": -0.41118890047073364, + "logits/rejected": -0.5465574264526367, + "logps/chosen": -54.192901611328125, + "logps/rejected": -74.37944030761719, + "loss": 0.7427, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.19476318359375, + "rewards/margins": 5.267670631408691, + "rewards/rejected": -2.0729072093963623, + "step": 6719 + }, + { + "epoch": 1.68, + "grad_norm": 3.4896767139434814, + "learning_rate": 7.460188808301532e-06, + "logits/chosen": -0.3350061774253845, + "logits/rejected": -0.4427337646484375, + "logps/chosen": -64.46517944335938, + "logps/rejected": -95.03362274169922, + "loss": 0.7972, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4883601665496826, + "rewards/margins": 5.346308708190918, + "rewards/rejected": -2.8579492568969727, + "step": 6720 + }, + { + "epoch": 1.68, + "grad_norm": 6.946188926696777, + "learning_rate": 7.459504517603517e-06, + "logits/chosen": -0.37449467182159424, + "logits/rejected": -0.39692622423171997, + "logps/chosen": -60.74002456665039, + "logps/rejected": -94.65641784667969, + "loss": 0.8331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8631744384765625, + "rewards/margins": 3.3581056594848633, + "rewards/rejected": -0.49493151903152466, + "step": 6721 + }, + { + "epoch": 1.68, + "grad_norm": 8.88908863067627, + "learning_rate": 7.458820166128529e-06, + "logits/chosen": -0.4435079097747803, + "logits/rejected": -0.5360845923423767, + "logps/chosen": -68.99746704101562, + "logps/rejected": -78.96223449707031, + "loss": 1.0377, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0052456855773926, + "rewards/margins": 4.252543926239014, + "rewards/rejected": -1.2472983598709106, + "step": 6722 + }, + { + "epoch": 1.68, + "grad_norm": 4.436005115509033, + "learning_rate": 7.458135753893481e-06, + "logits/chosen": -0.3993939757347107, + "logits/rejected": -0.4849415421485901, + "logps/chosen": -57.955875396728516, + "logps/rejected": -88.51276397705078, + "loss": 0.7516, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.811166763305664, + "rewards/margins": 5.216233253479004, + "rewards/rejected": -2.405066728591919, + "step": 6723 + }, + { + "epoch": 1.68, + "grad_norm": 7.149897575378418, + "learning_rate": 7.457451280915282e-06, + "logits/chosen": -0.33538228273391724, + "logits/rejected": -0.4398300349712372, + "logps/chosen": -56.386173248291016, + "logps/rejected": -82.96073913574219, + "loss": 0.6044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9172732830047607, + "rewards/margins": 4.874255180358887, + "rewards/rejected": -1.956981897354126, + "step": 6724 + }, + { + "epoch": 1.68, + "grad_norm": 6.682638168334961, + "learning_rate": 7.456766747210849e-06, + "logits/chosen": -0.4016273021697998, + "logits/rejected": -0.4782235622406006, + "logps/chosen": -53.12920379638672, + "logps/rejected": -73.484130859375, + "loss": 0.8173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7974488735198975, + "rewards/margins": 3.791313409805298, + "rewards/rejected": -0.9938645958900452, + "step": 6725 + }, + { + "epoch": 1.68, + "grad_norm": 9.869904518127441, + "learning_rate": 7.456082152797095e-06, + "logits/chosen": -0.4287629723548889, + "logits/rejected": -0.5256935358047485, + "logps/chosen": -55.77493667602539, + "logps/rejected": -80.52494049072266, + "loss": 0.7282, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9289252758026123, + "rewards/margins": 5.472570419311523, + "rewards/rejected": -2.5436453819274902, + "step": 6726 + }, + { + "epoch": 1.68, + "grad_norm": 9.792364120483398, + "learning_rate": 7.455397497690941e-06, + "logits/chosen": -0.25079822540283203, + "logits/rejected": -0.35558417439460754, + "logps/chosen": -70.30805969238281, + "logps/rejected": -82.00816345214844, + "loss": 0.8281, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.726062059402466, + "rewards/margins": 4.88356351852417, + "rewards/rejected": -2.157501697540283, + "step": 6727 + }, + { + "epoch": 1.68, + "grad_norm": 4.1330413818359375, + "learning_rate": 7.454712781909301e-06, + "logits/chosen": -0.3635796308517456, + "logits/rejected": -0.42095932364463806, + "logps/chosen": -59.95073318481445, + "logps/rejected": -85.59748077392578, + "loss": 0.6945, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1371567249298096, + "rewards/margins": 4.508375644683838, + "rewards/rejected": -1.3712188005447388, + "step": 6728 + }, + { + "epoch": 1.68, + "grad_norm": 12.376540184020996, + "learning_rate": 7.454028005469099e-06, + "logits/chosen": -0.38695624470710754, + "logits/rejected": -0.5184451341629028, + "logps/chosen": -58.39932632446289, + "logps/rejected": -74.74253845214844, + "loss": 0.8832, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5441110134124756, + "rewards/margins": 4.908720016479492, + "rewards/rejected": -2.3646082878112793, + "step": 6729 + }, + { + "epoch": 1.68, + "grad_norm": 4.742470741271973, + "learning_rate": 7.4533431683872535e-06, + "logits/chosen": -0.48597949743270874, + "logits/rejected": -0.5267417430877686, + "logps/chosen": -49.370361328125, + "logps/rejected": -111.15929412841797, + "loss": 0.7024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7595341205596924, + "rewards/margins": 4.994866847991943, + "rewards/rejected": -2.235332727432251, + "step": 6730 + }, + { + "epoch": 1.68, + "grad_norm": 5.143451690673828, + "learning_rate": 7.452658270680689e-06, + "logits/chosen": -0.40905702114105225, + "logits/rejected": -0.4906764030456543, + "logps/chosen": -48.49952697753906, + "logps/rejected": -80.29342651367188, + "loss": 0.7451, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1053526401519775, + "rewards/margins": 4.933892250061035, + "rewards/rejected": -1.8285400867462158, + "step": 6731 + }, + { + "epoch": 1.68, + "grad_norm": 5.113144874572754, + "learning_rate": 7.4519733123663305e-06, + "logits/chosen": -0.34797102212905884, + "logits/rejected": -0.42098021507263184, + "logps/chosen": -58.25755310058594, + "logps/rejected": -100.15229797363281, + "loss": 0.6644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.712667465209961, + "rewards/margins": 5.119787693023682, + "rewards/rejected": -2.4071197509765625, + "step": 6732 + }, + { + "epoch": 1.68, + "grad_norm": 8.476936340332031, + "learning_rate": 7.4512882934611035e-06, + "logits/chosen": -0.435586541891098, + "logits/rejected": -0.5206435918807983, + "logps/chosen": -55.69340896606445, + "logps/rejected": -78.2237548828125, + "loss": 0.8636, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7685017585754395, + "rewards/margins": 3.996427059173584, + "rewards/rejected": -1.2279253005981445, + "step": 6733 + }, + { + "epoch": 1.68, + "grad_norm": 3.8566040992736816, + "learning_rate": 7.450603213981934e-06, + "logits/chosen": -0.4629301428794861, + "logits/rejected": -0.5810666084289551, + "logps/chosen": -59.965606689453125, + "logps/rejected": -78.74138641357422, + "loss": 0.6292, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3187875747680664, + "rewards/margins": 5.561923027038574, + "rewards/rejected": -2.243136405944824, + "step": 6734 + }, + { + "epoch": 1.68, + "grad_norm": 9.532872200012207, + "learning_rate": 7.449918073945753e-06, + "logits/chosen": -0.38272279500961304, + "logits/rejected": -0.5022110342979431, + "logps/chosen": -50.07893371582031, + "logps/rejected": -76.3822250366211, + "loss": 0.8148, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.616194725036621, + "rewards/margins": 5.154962062835693, + "rewards/rejected": -2.5387675762176514, + "step": 6735 + }, + { + "epoch": 1.69, + "grad_norm": 4.791905403137207, + "learning_rate": 7.449232873369492e-06, + "logits/chosen": -0.33414870500564575, + "logits/rejected": -0.44200074672698975, + "logps/chosen": -56.18367385864258, + "logps/rejected": -95.00702667236328, + "loss": 0.6455, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.030684232711792, + "rewards/margins": 5.253772735595703, + "rewards/rejected": -2.223088026046753, + "step": 6736 + }, + { + "epoch": 1.69, + "grad_norm": 7.807804584503174, + "learning_rate": 7.448547612270081e-06, + "logits/chosen": -0.3491913676261902, + "logits/rejected": -0.440036803483963, + "logps/chosen": -67.3557357788086, + "logps/rejected": -92.43354034423828, + "loss": 0.7584, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6755294799804688, + "rewards/margins": 4.6589674949646, + "rewards/rejected": -1.9834381341934204, + "step": 6737 + }, + { + "epoch": 1.69, + "grad_norm": 7.9714674949646, + "learning_rate": 7.447862290664454e-06, + "logits/chosen": -0.2894405424594879, + "logits/rejected": -0.4099911153316498, + "logps/chosen": -61.16448211669922, + "logps/rejected": -86.3096694946289, + "loss": 0.6914, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9679923057556152, + "rewards/margins": 5.141617298126221, + "rewards/rejected": -2.1736252307891846, + "step": 6738 + }, + { + "epoch": 1.69, + "grad_norm": 9.098299026489258, + "learning_rate": 7.447176908569544e-06, + "logits/chosen": -0.312826544046402, + "logits/rejected": -0.3758232891559601, + "logps/chosen": -57.72420883178711, + "logps/rejected": -101.48497772216797, + "loss": 0.7422, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5552914142608643, + "rewards/margins": 4.381628036499023, + "rewards/rejected": -1.8263368606567383, + "step": 6739 + }, + { + "epoch": 1.69, + "grad_norm": 12.31001091003418, + "learning_rate": 7.446491466002293e-06, + "logits/chosen": -0.4392932057380676, + "logits/rejected": -0.58650141954422, + "logps/chosen": -55.001487731933594, + "logps/rejected": -72.9568099975586, + "loss": 0.718, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.291555404663086, + "rewards/margins": 5.902098178863525, + "rewards/rejected": -2.6105430126190186, + "step": 6740 + }, + { + "epoch": 1.69, + "grad_norm": 5.056684494018555, + "learning_rate": 7.4458059629796344e-06, + "logits/chosen": -0.36533451080322266, + "logits/rejected": -0.43287983536720276, + "logps/chosen": -62.77971649169922, + "logps/rejected": -93.83147430419922, + "loss": 0.7451, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.949835777282715, + "rewards/margins": 5.286416530609131, + "rewards/rejected": -2.336580991744995, + "step": 6741 + }, + { + "epoch": 1.69, + "grad_norm": 3.8584327697753906, + "learning_rate": 7.445120399518507e-06, + "logits/chosen": -0.3516477942466736, + "logits/rejected": -0.46179184317588806, + "logps/chosen": -56.77564239501953, + "logps/rejected": -95.92586517333984, + "loss": 0.6708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8655731678009033, + "rewards/margins": 6.313121795654297, + "rewards/rejected": -3.4475481510162354, + "step": 6742 + }, + { + "epoch": 1.69, + "grad_norm": 6.567792892456055, + "learning_rate": 7.444434775635856e-06, + "logits/chosen": -0.35092684626579285, + "logits/rejected": -0.4521441161632538, + "logps/chosen": -61.374515533447266, + "logps/rejected": -85.6483383178711, + "loss": 0.7438, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.239690065383911, + "rewards/margins": 4.860907554626465, + "rewards/rejected": -1.6212172508239746, + "step": 6743 + }, + { + "epoch": 1.69, + "grad_norm": 4.615376949310303, + "learning_rate": 7.443749091348621e-06, + "logits/chosen": -0.3726467192173004, + "logits/rejected": -0.45337027311325073, + "logps/chosen": -47.65631866455078, + "logps/rejected": -84.40955352783203, + "loss": 0.7217, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.900691032409668, + "rewards/margins": 4.339599609375, + "rewards/rejected": -1.438908576965332, + "step": 6744 + }, + { + "epoch": 1.69, + "grad_norm": 10.663745880126953, + "learning_rate": 7.443063346673747e-06, + "logits/chosen": -0.3844863772392273, + "logits/rejected": -0.4734957814216614, + "logps/chosen": -77.71687316894531, + "logps/rejected": -101.56793212890625, + "loss": 0.9044, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9078996181488037, + "rewards/margins": 5.529509544372559, + "rewards/rejected": -2.621609687805176, + "step": 6745 + }, + { + "epoch": 1.69, + "grad_norm": 3.9315483570098877, + "learning_rate": 7.44237754162818e-06, + "logits/chosen": -0.3475085496902466, + "logits/rejected": -0.5221925973892212, + "logps/chosen": -58.05055236816406, + "logps/rejected": -95.27313232421875, + "loss": 0.6317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8623805046081543, + "rewards/margins": 6.319488525390625, + "rewards/rejected": -3.457108736038208, + "step": 6746 + }, + { + "epoch": 1.69, + "grad_norm": 9.333081245422363, + "learning_rate": 7.441691676228865e-06, + "logits/chosen": -0.39177432656288147, + "logits/rejected": -0.46375763416290283, + "logps/chosen": -56.514854431152344, + "logps/rejected": -85.98162078857422, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.191998243331909, + "rewards/margins": 4.987611293792725, + "rewards/rejected": -1.795613169670105, + "step": 6747 + }, + { + "epoch": 1.69, + "grad_norm": 6.4583659172058105, + "learning_rate": 7.4410057504927516e-06, + "logits/chosen": -0.3119683563709259, + "logits/rejected": -0.39722535014152527, + "logps/chosen": -59.887550354003906, + "logps/rejected": -95.47654724121094, + "loss": 0.6522, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.700049877166748, + "rewards/margins": 5.240285396575928, + "rewards/rejected": -2.5402350425720215, + "step": 6748 + }, + { + "epoch": 1.69, + "grad_norm": 4.7907938957214355, + "learning_rate": 7.440319764436789e-06, + "logits/chosen": -0.43845802545547485, + "logits/rejected": -0.5332560539245605, + "logps/chosen": -57.27053451538086, + "logps/rejected": -74.25728607177734, + "loss": 0.7978, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078153610229492, + "rewards/margins": 4.581982612609863, + "rewards/rejected": -1.5038292407989502, + "step": 6749 + }, + { + "epoch": 1.69, + "grad_norm": 22.85671043395996, + "learning_rate": 7.43963371807793e-06, + "logits/chosen": -0.31797724962234497, + "logits/rejected": -0.4453832507133484, + "logps/chosen": -59.584144592285156, + "logps/rejected": -87.49826049804688, + "loss": 0.8313, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8298892974853516, + "rewards/margins": 5.642147541046143, + "rewards/rejected": -2.812258005142212, + "step": 6750 + }, + { + "epoch": 1.69, + "grad_norm": 4.309192180633545, + "learning_rate": 7.438947611433127e-06, + "logits/chosen": -0.36108270287513733, + "logits/rejected": -0.40235957503318787, + "logps/chosen": -55.5703125, + "logps/rejected": -92.14346313476562, + "loss": 0.7115, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.230282783508301, + "rewards/margins": 4.656369209289551, + "rewards/rejected": -1.426086664199829, + "step": 6751 + }, + { + "epoch": 1.69, + "grad_norm": 3.092548131942749, + "learning_rate": 7.438261444519336e-06, + "logits/chosen": -0.42496052384376526, + "logits/rejected": -0.48350879549980164, + "logps/chosen": -45.36090850830078, + "logps/rejected": -81.83739471435547, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.028461217880249, + "rewards/margins": 5.810637474060059, + "rewards/rejected": -2.7821767330169678, + "step": 6752 + }, + { + "epoch": 1.69, + "grad_norm": 8.461847305297852, + "learning_rate": 7.437575217353509e-06, + "logits/chosen": -0.37130656838417053, + "logits/rejected": -0.5009101033210754, + "logps/chosen": -69.92231750488281, + "logps/rejected": -82.22831726074219, + "loss": 0.8778, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.747788429260254, + "rewards/margins": 4.737265586853027, + "rewards/rejected": -1.9894771575927734, + "step": 6753 + }, + { + "epoch": 1.69, + "grad_norm": 4.5164008140563965, + "learning_rate": 7.436888929952606e-06, + "logits/chosen": -0.41246774792671204, + "logits/rejected": -0.4806619882583618, + "logps/chosen": -46.11137390136719, + "logps/rejected": -82.24422454833984, + "loss": 0.647, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.066776752471924, + "rewards/margins": 4.778060436248779, + "rewards/rejected": -1.7112840414047241, + "step": 6754 + }, + { + "epoch": 1.69, + "grad_norm": 4.809209823608398, + "learning_rate": 7.436202582333587e-06, + "logits/chosen": -0.33978724479675293, + "logits/rejected": -0.5159956216812134, + "logps/chosen": -58.387367248535156, + "logps/rejected": -79.38174438476562, + "loss": 0.6228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8867807388305664, + "rewards/margins": 5.731931686401367, + "rewards/rejected": -2.8451507091522217, + "step": 6755 + }, + { + "epoch": 1.69, + "grad_norm": 6.11173152923584, + "learning_rate": 7.4355161745134085e-06, + "logits/chosen": -0.318345308303833, + "logits/rejected": -0.3409886956214905, + "logps/chosen": -49.16484832763672, + "logps/rejected": -94.27466583251953, + "loss": 0.6687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8791229724884033, + "rewards/margins": 5.172281265258789, + "rewards/rejected": -2.2931582927703857, + "step": 6756 + }, + { + "epoch": 1.69, + "grad_norm": 4.477237701416016, + "learning_rate": 7.434829706509037e-06, + "logits/chosen": -0.36171483993530273, + "logits/rejected": -0.509919285774231, + "logps/chosen": -59.684051513671875, + "logps/rejected": -71.8502426147461, + "loss": 0.726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.331332206726074, + "rewards/margins": 5.017852783203125, + "rewards/rejected": -2.68652081489563, + "step": 6757 + }, + { + "epoch": 1.69, + "grad_norm": 7.462162971496582, + "learning_rate": 7.434143178337433e-06, + "logits/chosen": -0.3520530164241791, + "logits/rejected": -0.4337692856788635, + "logps/chosen": -59.12803649902344, + "logps/rejected": -73.89877319335938, + "loss": 0.7149, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0801613330841064, + "rewards/margins": 5.317261695861816, + "rewards/rejected": -2.237100124359131, + "step": 6758 + }, + { + "epoch": 1.69, + "grad_norm": 4.405538082122803, + "learning_rate": 7.433456590015562e-06, + "logits/chosen": -0.33167311549186707, + "logits/rejected": -0.4237535893917084, + "logps/chosen": -62.029052734375, + "logps/rejected": -83.00079345703125, + "loss": 0.6509, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9651761054992676, + "rewards/margins": 4.986939907073975, + "rewards/rejected": -2.021763563156128, + "step": 6759 + }, + { + "epoch": 1.69, + "grad_norm": 12.96367073059082, + "learning_rate": 7.43276994156039e-06, + "logits/chosen": -0.27231428027153015, + "logits/rejected": -0.3981103301048279, + "logps/chosen": -63.79996871948242, + "logps/rejected": -93.68413543701172, + "loss": 0.7499, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.77138090133667, + "rewards/margins": 5.269515037536621, + "rewards/rejected": -2.498133420944214, + "step": 6760 + }, + { + "epoch": 1.69, + "grad_norm": 4.189119815826416, + "learning_rate": 7.432083232988884e-06, + "logits/chosen": -0.3199949264526367, + "logits/rejected": -0.4269064664840698, + "logps/chosen": -55.26594161987305, + "logps/rejected": -76.69412231445312, + "loss": 0.6992, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9976465702056885, + "rewards/margins": 4.738107204437256, + "rewards/rejected": -1.7404606342315674, + "step": 6761 + }, + { + "epoch": 1.69, + "grad_norm": 3.579563856124878, + "learning_rate": 7.431396464318017e-06, + "logits/chosen": -0.4753642976284027, + "logits/rejected": -0.6059404015541077, + "logps/chosen": -54.18501281738281, + "logps/rejected": -85.13990783691406, + "loss": 0.6572, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0422441959381104, + "rewards/margins": 5.308521270751953, + "rewards/rejected": -2.2662770748138428, + "step": 6762 + }, + { + "epoch": 1.69, + "grad_norm": 8.446781158447266, + "learning_rate": 7.430709635564756e-06, + "logits/chosen": -0.37237927317619324, + "logits/rejected": -0.5156342387199402, + "logps/chosen": -59.194976806640625, + "logps/rejected": -89.87046813964844, + "loss": 0.6258, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.791635036468506, + "rewards/margins": 6.05117130279541, + "rewards/rejected": -3.2595362663269043, + "step": 6763 + }, + { + "epoch": 1.69, + "grad_norm": 6.181078910827637, + "learning_rate": 7.430022746746075e-06, + "logits/chosen": -0.4482816159725189, + "logits/rejected": -0.5075487494468689, + "logps/chosen": -63.12303161621094, + "logps/rejected": -87.72713470458984, + "loss": 0.7177, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.090188503265381, + "rewards/margins": 5.296394348144531, + "rewards/rejected": -2.2062056064605713, + "step": 6764 + }, + { + "epoch": 1.69, + "grad_norm": 16.060100555419922, + "learning_rate": 7.429335797878945e-06, + "logits/chosen": -0.3857678174972534, + "logits/rejected": -0.478493869304657, + "logps/chosen": -56.882232666015625, + "logps/rejected": -77.80809783935547, + "loss": 0.6783, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8763060569763184, + "rewards/margins": 4.929267406463623, + "rewards/rejected": -2.0529608726501465, + "step": 6765 + }, + { + "epoch": 1.69, + "grad_norm": 5.887742519378662, + "learning_rate": 7.428648788980346e-06, + "logits/chosen": -0.31421273946762085, + "logits/rejected": -0.3977728486061096, + "logps/chosen": -57.64413070678711, + "logps/rejected": -85.3583755493164, + "loss": 0.7002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2616896629333496, + "rewards/margins": 4.976576805114746, + "rewards/rejected": -1.714887022972107, + "step": 6766 + }, + { + "epoch": 1.69, + "grad_norm": 5.662224292755127, + "learning_rate": 7.427961720067252e-06, + "logits/chosen": -0.38949310779571533, + "logits/rejected": -0.45136892795562744, + "logps/chosen": -72.14657592773438, + "logps/rejected": -98.8955078125, + "loss": 0.7108, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934427261352539, + "rewards/margins": 4.34095573425293, + "rewards/rejected": -1.4065284729003906, + "step": 6767 + }, + { + "epoch": 1.69, + "grad_norm": 7.892405033111572, + "learning_rate": 7.427274591156639e-06, + "logits/chosen": -0.3996915817260742, + "logits/rejected": -0.44009271264076233, + "logps/chosen": -64.12339782714844, + "logps/rejected": -96.59464263916016, + "loss": 0.8226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.781096935272217, + "rewards/margins": 5.421350002288818, + "rewards/rejected": -2.6402533054351807, + "step": 6768 + }, + { + "epoch": 1.69, + "grad_norm": 6.303704738616943, + "learning_rate": 7.426587402265492e-06, + "logits/chosen": -0.41113007068634033, + "logits/rejected": -0.5194768905639648, + "logps/chosen": -63.79697799682617, + "logps/rejected": -73.77802276611328, + "loss": 0.7625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3818137645721436, + "rewards/margins": 4.871648788452148, + "rewards/rejected": -2.489834785461426, + "step": 6769 + }, + { + "epoch": 1.69, + "grad_norm": 12.388697624206543, + "learning_rate": 7.425900153410788e-06, + "logits/chosen": -0.3690961003303528, + "logits/rejected": -0.4658624529838562, + "logps/chosen": -66.58084106445312, + "logps/rejected": -92.9742660522461, + "loss": 0.8979, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4825947284698486, + "rewards/margins": 5.067169666290283, + "rewards/rejected": -2.5845746994018555, + "step": 6770 + }, + { + "epoch": 1.69, + "grad_norm": 3.135159730911255, + "learning_rate": 7.425212844609511e-06, + "logits/chosen": -0.38046449422836304, + "logits/rejected": -0.5521252751350403, + "logps/chosen": -66.33168029785156, + "logps/rejected": -91.35787200927734, + "loss": 0.6706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.852088212966919, + "rewards/margins": 5.808096885681152, + "rewards/rejected": -2.9560086727142334, + "step": 6771 + }, + { + "epoch": 1.69, + "grad_norm": 6.827880859375, + "learning_rate": 7.424525475878644e-06, + "logits/chosen": -0.4186621606349945, + "logits/rejected": -0.5369816422462463, + "logps/chosen": -58.582706451416016, + "logps/rejected": -74.52928924560547, + "loss": 0.8133, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.099062442779541, + "rewards/margins": 4.851191520690918, + "rewards/rejected": -1.7521288394927979, + "step": 6772 + }, + { + "epoch": 1.69, + "grad_norm": 5.082951545715332, + "learning_rate": 7.423838047235175e-06, + "logits/chosen": -0.3496931195259094, + "logits/rejected": -0.42592769861221313, + "logps/chosen": -66.37574005126953, + "logps/rejected": -105.60502624511719, + "loss": 0.6585, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6665687561035156, + "rewards/margins": 5.611240386962891, + "rewards/rejected": -2.944671154022217, + "step": 6773 + }, + { + "epoch": 1.69, + "grad_norm": 4.030328273773193, + "learning_rate": 7.423150558696089e-06, + "logits/chosen": -0.40145421028137207, + "logits/rejected": -0.5033873319625854, + "logps/chosen": -60.05509567260742, + "logps/rejected": -96.384765625, + "loss": 0.6522, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.024827480316162, + "rewards/margins": 5.426871299743652, + "rewards/rejected": -2.402043581008911, + "step": 6774 + }, + { + "epoch": 1.69, + "grad_norm": 5.259329795837402, + "learning_rate": 7.4224630102783765e-06, + "logits/chosen": -0.33135053515434265, + "logits/rejected": -0.4813177287578583, + "logps/chosen": -53.138694763183594, + "logps/rejected": -90.63973999023438, + "loss": 0.5937, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0679855346679688, + "rewards/margins": 6.70927095413208, + "rewards/rejected": -3.6412854194641113, + "step": 6775 + }, + { + "epoch": 1.7, + "grad_norm": 5.848319053649902, + "learning_rate": 7.421775401999025e-06, + "logits/chosen": -0.38828638195991516, + "logits/rejected": -0.4900192320346832, + "logps/chosen": -56.98554229736328, + "logps/rejected": -88.54073333740234, + "loss": 0.7466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.767024517059326, + "rewards/margins": 5.339608192443848, + "rewards/rejected": -2.5725841522216797, + "step": 6776 + }, + { + "epoch": 1.7, + "grad_norm": 3.4528563022613525, + "learning_rate": 7.4210877338750285e-06, + "logits/chosen": -0.44705256819725037, + "logits/rejected": -0.5742932558059692, + "logps/chosen": -52.72845458984375, + "logps/rejected": -73.82063293457031, + "loss": 0.5786, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9181437492370605, + "rewards/margins": 4.956423759460449, + "rewards/rejected": -2.0382797718048096, + "step": 6777 + }, + { + "epoch": 1.7, + "grad_norm": 6.913441181182861, + "learning_rate": 7.4204000059233795e-06, + "logits/chosen": -0.39459264278411865, + "logits/rejected": -0.516285240650177, + "logps/chosen": -51.553550720214844, + "logps/rejected": -81.77732849121094, + "loss": 0.6348, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8907201290130615, + "rewards/margins": 5.517215728759766, + "rewards/rejected": -2.626495361328125, + "step": 6778 + }, + { + "epoch": 1.7, + "grad_norm": 3.4412052631378174, + "learning_rate": 7.41971221816107e-06, + "logits/chosen": -0.3812394142150879, + "logits/rejected": -0.5184752941131592, + "logps/chosen": -54.80851364135742, + "logps/rejected": -92.2032699584961, + "loss": 0.6423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8946609497070312, + "rewards/margins": 5.933210372924805, + "rewards/rejected": -3.0385489463806152, + "step": 6779 + }, + { + "epoch": 1.7, + "grad_norm": 6.531808853149414, + "learning_rate": 7.419024370605099e-06, + "logits/chosen": -0.41148442029953003, + "logits/rejected": -0.5338755249977112, + "logps/chosen": -51.82059097290039, + "logps/rejected": -79.83094024658203, + "loss": 0.6573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7802367210388184, + "rewards/margins": 5.29595947265625, + "rewards/rejected": -2.5157222747802734, + "step": 6780 + }, + { + "epoch": 1.7, + "grad_norm": 8.701109886169434, + "learning_rate": 7.418336463272462e-06, + "logits/chosen": -0.43047845363616943, + "logits/rejected": -0.5055236220359802, + "logps/chosen": -56.35918426513672, + "logps/rejected": -83.97682189941406, + "loss": 0.6719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9574384689331055, + "rewards/margins": 5.327286720275879, + "rewards/rejected": -2.3698480129241943, + "step": 6781 + }, + { + "epoch": 1.7, + "grad_norm": 8.102336883544922, + "learning_rate": 7.4176484961801585e-06, + "logits/chosen": -0.3212049603462219, + "logits/rejected": -0.40965819358825684, + "logps/chosen": -64.72623443603516, + "logps/rejected": -96.2099380493164, + "loss": 0.7139, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9527740478515625, + "rewards/margins": 4.674476623535156, + "rewards/rejected": -1.721703052520752, + "step": 6782 + }, + { + "epoch": 1.7, + "grad_norm": 9.558028221130371, + "learning_rate": 7.4169604693451904e-06, + "logits/chosen": -0.3565921187400818, + "logits/rejected": -0.4305657744407654, + "logps/chosen": -58.099647521972656, + "logps/rejected": -95.77213287353516, + "loss": 0.6969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6010780334472656, + "rewards/margins": 4.754687309265137, + "rewards/rejected": -2.153609037399292, + "step": 6783 + }, + { + "epoch": 1.7, + "grad_norm": 15.633546829223633, + "learning_rate": 7.416272382784559e-06, + "logits/chosen": -0.3294239938259125, + "logits/rejected": -0.4090067744255066, + "logps/chosen": -55.4676399230957, + "logps/rejected": -98.83980560302734, + "loss": 0.8185, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8394811153411865, + "rewards/margins": 4.764893531799316, + "rewards/rejected": -1.9254121780395508, + "step": 6784 + }, + { + "epoch": 1.7, + "grad_norm": 4.184093952178955, + "learning_rate": 7.415584236515264e-06, + "logits/chosen": -0.4552485942840576, + "logits/rejected": -0.5216382741928101, + "logps/chosen": -54.9370002746582, + "logps/rejected": -101.41763305664062, + "loss": 0.7915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8359575271606445, + "rewards/margins": 5.186950206756592, + "rewards/rejected": -2.3509926795959473, + "step": 6785 + }, + { + "epoch": 1.7, + "grad_norm": 5.264104843139648, + "learning_rate": 7.414896030554316e-06, + "logits/chosen": -0.3324657678604126, + "logits/rejected": -0.4341091215610504, + "logps/chosen": -61.61003494262695, + "logps/rejected": -102.89338684082031, + "loss": 0.8097, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7463247776031494, + "rewards/margins": 4.751946449279785, + "rewards/rejected": -2.0056216716766357, + "step": 6786 + }, + { + "epoch": 1.7, + "grad_norm": 9.91193675994873, + "learning_rate": 7.414207764918718e-06, + "logits/chosen": -0.4449361264705658, + "logits/rejected": -0.545275866985321, + "logps/chosen": -58.859718322753906, + "logps/rejected": -81.01104736328125, + "loss": 0.6658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.848620653152466, + "rewards/margins": 4.972456932067871, + "rewards/rejected": -2.1238365173339844, + "step": 6787 + }, + { + "epoch": 1.7, + "grad_norm": 10.692911148071289, + "learning_rate": 7.413519439625476e-06, + "logits/chosen": -0.4757068157196045, + "logits/rejected": -0.5088531970977783, + "logps/chosen": -50.26336669921875, + "logps/rejected": -99.43824005126953, + "loss": 0.6959, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.747885227203369, + "rewards/margins": 4.893415451049805, + "rewards/rejected": -2.145530939102173, + "step": 6788 + }, + { + "epoch": 1.7, + "grad_norm": 3.663700580596924, + "learning_rate": 7.412831054691605e-06, + "logits/chosen": -0.4211992621421814, + "logits/rejected": -0.5293537974357605, + "logps/chosen": -55.43296432495117, + "logps/rejected": -81.99403381347656, + "loss": 0.7407, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.845285654067993, + "rewards/margins": 5.610273838043213, + "rewards/rejected": -2.7649879455566406, + "step": 6789 + }, + { + "epoch": 1.7, + "grad_norm": 4.358345031738281, + "learning_rate": 7.41214261013411e-06, + "logits/chosen": -0.34706491231918335, + "logits/rejected": -0.43181419372558594, + "logps/chosen": -63.19415283203125, + "logps/rejected": -82.4269790649414, + "loss": 0.6907, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8414371013641357, + "rewards/margins": 5.206058502197266, + "rewards/rejected": -2.36462140083313, + "step": 6790 + }, + { + "epoch": 1.7, + "grad_norm": 6.443000316619873, + "learning_rate": 7.411454105970005e-06, + "logits/chosen": -0.39464497566223145, + "logits/rejected": -0.4333619475364685, + "logps/chosen": -57.8045539855957, + "logps/rejected": -101.0088119506836, + "loss": 0.7538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1060850620269775, + "rewards/margins": 5.919782638549805, + "rewards/rejected": -2.813697338104248, + "step": 6791 + }, + { + "epoch": 1.7, + "grad_norm": 4.21628999710083, + "learning_rate": 7.410765542216305e-06, + "logits/chosen": -0.3733675181865692, + "logits/rejected": -0.5009157657623291, + "logps/chosen": -60.152976989746094, + "logps/rejected": -83.94922637939453, + "loss": 0.8046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.882659435272217, + "rewards/margins": 4.945525169372559, + "rewards/rejected": -2.062865734100342, + "step": 6792 + }, + { + "epoch": 1.7, + "grad_norm": 9.938586235046387, + "learning_rate": 7.410076918890025e-06, + "logits/chosen": -0.37080255150794983, + "logits/rejected": -0.4388715624809265, + "logps/chosen": -68.5651626586914, + "logps/rejected": -100.89242553710938, + "loss": 0.7183, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7766199111938477, + "rewards/margins": 5.341209411621094, + "rewards/rejected": -2.564589738845825, + "step": 6793 + }, + { + "epoch": 1.7, + "grad_norm": 3.118788480758667, + "learning_rate": 7.40938823600818e-06, + "logits/chosen": -0.4346059560775757, + "logits/rejected": -0.559829831123352, + "logps/chosen": -53.408145904541016, + "logps/rejected": -84.73922729492188, + "loss": 0.6902, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.708678960800171, + "rewards/margins": 5.447798252105713, + "rewards/rejected": -2.739119529724121, + "step": 6794 + }, + { + "epoch": 1.7, + "grad_norm": 10.088452339172363, + "learning_rate": 7.408699493587789e-06, + "logits/chosen": -0.39887937903404236, + "logits/rejected": -0.41124001145362854, + "logps/chosen": -80.32723236083984, + "logps/rejected": -99.67849731445312, + "loss": 0.9497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.596950054168701, + "rewards/margins": 4.075901031494141, + "rewards/rejected": -1.478951096534729, + "step": 6795 + }, + { + "epoch": 1.7, + "grad_norm": 4.13864278793335, + "learning_rate": 7.40801069164587e-06, + "logits/chosen": -0.44105327129364014, + "logits/rejected": -0.48417699337005615, + "logps/chosen": -47.91741943359375, + "logps/rejected": -91.69442749023438, + "loss": 0.6241, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0866100788116455, + "rewards/margins": 5.439348220825195, + "rewards/rejected": -2.3527379035949707, + "step": 6796 + }, + { + "epoch": 1.7, + "grad_norm": 3.0453031063079834, + "learning_rate": 7.4073218301994475e-06, + "logits/chosen": -0.39352717995643616, + "logits/rejected": -0.42108672857284546, + "logps/chosen": -41.733009338378906, + "logps/rejected": -92.32315063476562, + "loss": 0.6751, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2533106803894043, + "rewards/margins": 6.196077823638916, + "rewards/rejected": -2.9427671432495117, + "step": 6797 + }, + { + "epoch": 1.7, + "grad_norm": 9.016118049621582, + "learning_rate": 7.406632909265543e-06, + "logits/chosen": -0.4128820300102234, + "logits/rejected": -0.5191361904144287, + "logps/chosen": -62.36629867553711, + "logps/rejected": -78.03984069824219, + "loss": 0.7839, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0913376808166504, + "rewards/margins": 4.5569915771484375, + "rewards/rejected": -1.4656544923782349, + "step": 6798 + }, + { + "epoch": 1.7, + "grad_norm": 6.2876410484313965, + "learning_rate": 7.405943928861178e-06, + "logits/chosen": -0.3965955972671509, + "logits/rejected": -0.5111212730407715, + "logps/chosen": -56.898380279541016, + "logps/rejected": -89.30014038085938, + "loss": 0.7359, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7658791542053223, + "rewards/margins": 5.121529579162598, + "rewards/rejected": -2.3556501865386963, + "step": 6799 + }, + { + "epoch": 1.7, + "grad_norm": 3.4166972637176514, + "learning_rate": 7.405254889003379e-06, + "logits/chosen": -0.39211615920066833, + "logits/rejected": -0.4768886864185333, + "logps/chosen": -61.50496292114258, + "logps/rejected": -97.82144927978516, + "loss": 0.6886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.806253433227539, + "rewards/margins": 4.447315692901611, + "rewards/rejected": -1.6410621404647827, + "step": 6800 + }, + { + "epoch": 1.7, + "grad_norm": 5.57070779800415, + "learning_rate": 7.404565789709174e-06, + "logits/chosen": -0.4511551260948181, + "logits/rejected": -0.5225693583488464, + "logps/chosen": -52.237220764160156, + "logps/rejected": -88.70411682128906, + "loss": 0.7927, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7150909900665283, + "rewards/margins": 4.481246471405029, + "rewards/rejected": -1.7661558389663696, + "step": 6801 + }, + { + "epoch": 1.7, + "grad_norm": 5.477015972137451, + "learning_rate": 7.40387663099559e-06, + "logits/chosen": -0.3708398938179016, + "logits/rejected": -0.48279738426208496, + "logps/chosen": -55.8436279296875, + "logps/rejected": -92.0848617553711, + "loss": 0.6843, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1909103393554688, + "rewards/margins": 5.183490753173828, + "rewards/rejected": -1.9925800561904907, + "step": 6802 + }, + { + "epoch": 1.7, + "grad_norm": 4.481578826904297, + "learning_rate": 7.403187412879659e-06, + "logits/chosen": -0.304922878742218, + "logits/rejected": -0.42519962787628174, + "logps/chosen": -60.25090026855469, + "logps/rejected": -86.66184997558594, + "loss": 0.7485, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8828108310699463, + "rewards/margins": 5.456053733825684, + "rewards/rejected": -2.573242664337158, + "step": 6803 + }, + { + "epoch": 1.7, + "grad_norm": 4.273592948913574, + "learning_rate": 7.40249813537841e-06, + "logits/chosen": -0.4398856461048126, + "logits/rejected": -0.5953615307807922, + "logps/chosen": -55.95225143432617, + "logps/rejected": -71.59056854248047, + "loss": 0.6728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.139911651611328, + "rewards/margins": 5.7282538414001465, + "rewards/rejected": -2.5883421897888184, + "step": 6804 + }, + { + "epoch": 1.7, + "grad_norm": 15.125044822692871, + "learning_rate": 7.401808798508876e-06, + "logits/chosen": -0.3910675644874573, + "logits/rejected": -0.48178645968437195, + "logps/chosen": -66.47273254394531, + "logps/rejected": -94.9032974243164, + "loss": 0.8468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8844656944274902, + "rewards/margins": 4.64138126373291, + "rewards/rejected": -1.75691556930542, + "step": 6805 + }, + { + "epoch": 1.7, + "grad_norm": 7.261002063751221, + "learning_rate": 7.401119402288093e-06, + "logits/chosen": -0.3646199405193329, + "logits/rejected": -0.4336010217666626, + "logps/chosen": -60.88975143432617, + "logps/rejected": -94.08919525146484, + "loss": 0.6683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.770270586013794, + "rewards/margins": 5.001738548278809, + "rewards/rejected": -2.2314677238464355, + "step": 6806 + }, + { + "epoch": 1.7, + "grad_norm": 12.211593627929688, + "learning_rate": 7.400429946733096e-06, + "logits/chosen": -0.39382150769233704, + "logits/rejected": -0.49187371134757996, + "logps/chosen": -59.71229934692383, + "logps/rejected": -85.58741760253906, + "loss": 0.7781, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5435285568237305, + "rewards/margins": 3.8906161785125732, + "rewards/rejected": -1.3470872640609741, + "step": 6807 + }, + { + "epoch": 1.7, + "grad_norm": 4.838829040527344, + "learning_rate": 7.39974043186092e-06, + "logits/chosen": -0.4577896296977997, + "logits/rejected": -0.5114991664886475, + "logps/chosen": -43.937068939208984, + "logps/rejected": -85.72915649414062, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2100934982299805, + "rewards/margins": 4.778608798980713, + "rewards/rejected": -1.5685153007507324, + "step": 6808 + }, + { + "epoch": 1.7, + "grad_norm": 3.3470876216888428, + "learning_rate": 7.399050857688606e-06, + "logits/chosen": -0.3615567982196808, + "logits/rejected": -0.3984660506248474, + "logps/chosen": -46.40407943725586, + "logps/rejected": -89.73503875732422, + "loss": 0.6843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1024093627929688, + "rewards/margins": 4.439371109008789, + "rewards/rejected": -1.3369618654251099, + "step": 6809 + }, + { + "epoch": 1.7, + "grad_norm": 4.6461052894592285, + "learning_rate": 7.398361224233195e-06, + "logits/chosen": -0.4148464798927307, + "logits/rejected": -0.514590859413147, + "logps/chosen": -54.72642135620117, + "logps/rejected": -67.03752136230469, + "loss": 0.7824, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.846264362335205, + "rewards/margins": 3.824915885925293, + "rewards/rejected": -0.978651762008667, + "step": 6810 + }, + { + "epoch": 1.7, + "grad_norm": 7.410365581512451, + "learning_rate": 7.397671531511725e-06, + "logits/chosen": -0.37798169255256653, + "logits/rejected": -0.47512689232826233, + "logps/chosen": -60.734214782714844, + "logps/rejected": -79.98336791992188, + "loss": 0.7866, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0260515213012695, + "rewards/margins": 4.95285701751709, + "rewards/rejected": -1.9268049001693726, + "step": 6811 + }, + { + "epoch": 1.7, + "grad_norm": 5.872611045837402, + "learning_rate": 7.396981779541244e-06, + "logits/chosen": -0.42516571283340454, + "logits/rejected": -0.49471062421798706, + "logps/chosen": -54.702857971191406, + "logps/rejected": -101.47288513183594, + "loss": 0.858, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8449602127075195, + "rewards/margins": 4.385573863983154, + "rewards/rejected": -1.540614128112793, + "step": 6812 + }, + { + "epoch": 1.7, + "grad_norm": 3.9925575256347656, + "learning_rate": 7.3962919683387915e-06, + "logits/chosen": -0.3746073246002197, + "logits/rejected": -0.49026358127593994, + "logps/chosen": -53.31452941894531, + "logps/rejected": -73.28278350830078, + "loss": 0.7087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.702266216278076, + "rewards/margins": 5.296126842498779, + "rewards/rejected": -2.593860626220703, + "step": 6813 + }, + { + "epoch": 1.7, + "grad_norm": 3.10872220993042, + "learning_rate": 7.395602097921417e-06, + "logits/chosen": -0.4155864417552948, + "logits/rejected": -0.5099020600318909, + "logps/chosen": -57.152679443359375, + "logps/rejected": -78.8230972290039, + "loss": 0.6572, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1986875534057617, + "rewards/margins": 4.7776594161987305, + "rewards/rejected": -1.5789719820022583, + "step": 6814 + }, + { + "epoch": 1.7, + "grad_norm": 7.483682155609131, + "learning_rate": 7.394912168306165e-06, + "logits/chosen": -0.39029747247695923, + "logits/rejected": -0.491531103849411, + "logps/chosen": -61.1577033996582, + "logps/rejected": -78.35867309570312, + "loss": 0.8029, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8701910972595215, + "rewards/margins": 4.66937255859375, + "rewards/rejected": -1.7991814613342285, + "step": 6815 + }, + { + "epoch": 1.71, + "grad_norm": 7.612740516662598, + "learning_rate": 7.394222179510086e-06, + "logits/chosen": -0.3753542900085449, + "logits/rejected": -0.4719122052192688, + "logps/chosen": -55.74281692504883, + "logps/rejected": -70.2447509765625, + "loss": 0.7041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.927253484725952, + "rewards/margins": 4.219975471496582, + "rewards/rejected": -1.2927218675613403, + "step": 6816 + }, + { + "epoch": 1.71, + "grad_norm": 2.627737283706665, + "learning_rate": 7.393532131550232e-06, + "logits/chosen": -0.40122121572494507, + "logits/rejected": -0.48215556144714355, + "logps/chosen": -47.36162185668945, + "logps/rejected": -91.61610412597656, + "loss": 0.6692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2219231128692627, + "rewards/margins": 5.92678689956665, + "rewards/rejected": -2.7048633098602295, + "step": 6817 + }, + { + "epoch": 1.71, + "grad_norm": 5.109856605529785, + "learning_rate": 7.392842024443653e-06, + "logits/chosen": -0.44031283259391785, + "logits/rejected": -0.5236508846282959, + "logps/chosen": -64.96099853515625, + "logps/rejected": -89.77995300292969, + "loss": 0.7049, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1217782497406006, + "rewards/margins": 4.735822677612305, + "rewards/rejected": -1.6140440702438354, + "step": 6818 + }, + { + "epoch": 1.71, + "grad_norm": 4.464439392089844, + "learning_rate": 7.392151858207402e-06, + "logits/chosen": -0.40509265661239624, + "logits/rejected": -0.5097724199295044, + "logps/chosen": -61.33391571044922, + "logps/rejected": -83.78523254394531, + "loss": 0.6909, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.730560541152954, + "rewards/margins": 5.169675827026367, + "rewards/rejected": -2.4391160011291504, + "step": 6819 + }, + { + "epoch": 1.71, + "grad_norm": 8.919297218322754, + "learning_rate": 7.391461632858533e-06, + "logits/chosen": -0.4442361891269684, + "logits/rejected": -0.4981359839439392, + "logps/chosen": -62.95551300048828, + "logps/rejected": -115.616943359375, + "loss": 0.8155, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8793630599975586, + "rewards/margins": 5.74346923828125, + "rewards/rejected": -2.8641064167022705, + "step": 6820 + }, + { + "epoch": 1.71, + "grad_norm": 4.89841890335083, + "learning_rate": 7.390771348414105e-06, + "logits/chosen": -0.40250733494758606, + "logits/rejected": -0.5005571842193604, + "logps/chosen": -66.40251159667969, + "logps/rejected": -84.32723236083984, + "loss": 0.737, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.985196113586426, + "rewards/margins": 5.042572021484375, + "rewards/rejected": -2.0573766231536865, + "step": 6821 + }, + { + "epoch": 1.71, + "grad_norm": 7.264869213104248, + "learning_rate": 7.390081004891172e-06, + "logits/chosen": -0.4424912631511688, + "logits/rejected": -0.5337198376655579, + "logps/chosen": -53.502960205078125, + "logps/rejected": -84.45248413085938, + "loss": 0.6773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6957437992095947, + "rewards/margins": 4.6520676612854, + "rewards/rejected": -1.9563242197036743, + "step": 6822 + }, + { + "epoch": 1.71, + "grad_norm": 3.3992345333099365, + "learning_rate": 7.389390602306797e-06, + "logits/chosen": -0.3986849784851074, + "logits/rejected": -0.4953530430793762, + "logps/chosen": -40.19236373901367, + "logps/rejected": -88.25564575195312, + "loss": 0.6202, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.011183738708496, + "rewards/margins": 5.753289699554443, + "rewards/rejected": -2.742105722427368, + "step": 6823 + }, + { + "epoch": 1.71, + "grad_norm": 5.167976379394531, + "learning_rate": 7.388700140678038e-06, + "logits/chosen": -0.4572344720363617, + "logits/rejected": -0.549346923828125, + "logps/chosen": -60.181617736816406, + "logps/rejected": -95.98458099365234, + "loss": 0.7956, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.853675127029419, + "rewards/margins": 5.528461456298828, + "rewards/rejected": -2.6747865676879883, + "step": 6824 + }, + { + "epoch": 1.71, + "grad_norm": 5.9097185134887695, + "learning_rate": 7.3880096200219585e-06, + "logits/chosen": -0.4157329499721527, + "logits/rejected": -0.5860423445701599, + "logps/chosen": -59.39565658569336, + "logps/rejected": -72.82904052734375, + "loss": 0.6683, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0500309467315674, + "rewards/margins": 5.614890098571777, + "rewards/rejected": -2.564859390258789, + "step": 6825 + }, + { + "epoch": 1.71, + "grad_norm": 4.5644850730896, + "learning_rate": 7.387319040355621e-06, + "logits/chosen": -0.4289141595363617, + "logits/rejected": -0.5338075160980225, + "logps/chosen": -59.237911224365234, + "logps/rejected": -97.15880584716797, + "loss": 0.7283, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.000422239303589, + "rewards/margins": 5.4556989669799805, + "rewards/rejected": -2.4552769660949707, + "step": 6826 + }, + { + "epoch": 1.71, + "grad_norm": 11.123929023742676, + "learning_rate": 7.3866284016960906e-06, + "logits/chosen": -0.42973288893699646, + "logits/rejected": -0.5301668047904968, + "logps/chosen": -65.96367645263672, + "logps/rejected": -95.06641387939453, + "loss": 0.8987, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9755773544311523, + "rewards/margins": 4.68984842300415, + "rewards/rejected": -1.7142709493637085, + "step": 6827 + }, + { + "epoch": 1.71, + "grad_norm": 4.080007553100586, + "learning_rate": 7.385937704060434e-06, + "logits/chosen": -0.4015243947505951, + "logits/rejected": -0.4970010817050934, + "logps/chosen": -52.538936614990234, + "logps/rejected": -83.08749389648438, + "loss": 0.6336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8013460636138916, + "rewards/margins": 5.883398056030273, + "rewards/rejected": -3.0820517539978027, + "step": 6828 + }, + { + "epoch": 1.71, + "grad_norm": 4.530759334564209, + "learning_rate": 7.385246947465718e-06, + "logits/chosen": -0.38308772444725037, + "logits/rejected": -0.5252388119697571, + "logps/chosen": -62.65626525878906, + "logps/rejected": -76.08198547363281, + "loss": 0.7464, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.3885557651519775, + "rewards/margins": 4.359272003173828, + "rewards/rejected": -1.9707162380218506, + "step": 6829 + }, + { + "epoch": 1.71, + "grad_norm": 2.8854587078094482, + "learning_rate": 7.3845561319290145e-06, + "logits/chosen": -0.4351545572280884, + "logits/rejected": -0.5000297427177429, + "logps/chosen": -49.860198974609375, + "logps/rejected": -101.14509582519531, + "loss": 0.5689, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.80952525138855, + "rewards/margins": 5.617886543273926, + "rewards/rejected": -2.808361291885376, + "step": 6830 + }, + { + "epoch": 1.71, + "grad_norm": 6.691858768463135, + "learning_rate": 7.3838652574673905e-06, + "logits/chosen": -0.4367159307003021, + "logits/rejected": -0.5665704607963562, + "logps/chosen": -61.011451721191406, + "logps/rejected": -92.44493103027344, + "loss": 0.6203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8443853855133057, + "rewards/margins": 5.856568813323975, + "rewards/rejected": -3.012183427810669, + "step": 6831 + }, + { + "epoch": 1.71, + "grad_norm": 8.220724105834961, + "learning_rate": 7.3831743240979224e-06, + "logits/chosen": -0.3720220923423767, + "logits/rejected": -0.4120168387889862, + "logps/chosen": -51.96257019042969, + "logps/rejected": -94.73951721191406, + "loss": 0.726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5838570594787598, + "rewards/margins": 4.623673439025879, + "rewards/rejected": -2.0398166179656982, + "step": 6832 + }, + { + "epoch": 1.71, + "grad_norm": 5.9593424797058105, + "learning_rate": 7.382483331837681e-06, + "logits/chosen": -0.4352460503578186, + "logits/rejected": -0.4837765097618103, + "logps/chosen": -48.99423599243164, + "logps/rejected": -88.75041198730469, + "loss": 0.6624, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1223981380462646, + "rewards/margins": 5.026569843292236, + "rewards/rejected": -1.9041717052459717, + "step": 6833 + }, + { + "epoch": 1.71, + "grad_norm": 14.59277629852295, + "learning_rate": 7.381792280703742e-06, + "logits/chosen": -0.4684256911277771, + "logits/rejected": -0.5679031014442444, + "logps/chosen": -57.594444274902344, + "logps/rejected": -74.13441467285156, + "loss": 0.8211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8091094493865967, + "rewards/margins": 4.920180320739746, + "rewards/rejected": -2.1110711097717285, + "step": 6834 + }, + { + "epoch": 1.71, + "grad_norm": 14.138487815856934, + "learning_rate": 7.381101170713184e-06, + "logits/chosen": -0.4717687964439392, + "logits/rejected": -0.5555849075317383, + "logps/chosen": -58.70629119873047, + "logps/rejected": -102.60127258300781, + "loss": 0.737, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.844162940979004, + "rewards/margins": 5.945613861083984, + "rewards/rejected": -3.1014504432678223, + "step": 6835 + }, + { + "epoch": 1.71, + "grad_norm": 5.723841667175293, + "learning_rate": 7.3804100018830825e-06, + "logits/chosen": -0.49561211466789246, + "logits/rejected": -0.5730828642845154, + "logps/chosen": -55.78250503540039, + "logps/rejected": -75.43876647949219, + "loss": 0.8098, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7889339923858643, + "rewards/margins": 4.500621795654297, + "rewards/rejected": -1.7116878032684326, + "step": 6836 + }, + { + "epoch": 1.71, + "grad_norm": 5.6713175773620605, + "learning_rate": 7.379718774230518e-06, + "logits/chosen": -0.4115176200866699, + "logits/rejected": -0.5071719884872437, + "logps/chosen": -55.079307556152344, + "logps/rejected": -93.02406311035156, + "loss": 0.591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1061220169067383, + "rewards/margins": 6.412992000579834, + "rewards/rejected": -3.3068697452545166, + "step": 6837 + }, + { + "epoch": 1.71, + "grad_norm": 3.4188296794891357, + "learning_rate": 7.37902748777257e-06, + "logits/chosen": -0.3596750795841217, + "logits/rejected": -0.3898109197616577, + "logps/chosen": -59.626380920410156, + "logps/rejected": -110.51896667480469, + "loss": 0.6585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9511947631835938, + "rewards/margins": 5.599152565002441, + "rewards/rejected": -2.6479580402374268, + "step": 6838 + }, + { + "epoch": 1.71, + "grad_norm": 6.259734630584717, + "learning_rate": 7.378336142526324e-06, + "logits/chosen": -0.36101484298706055, + "logits/rejected": -0.5212110280990601, + "logps/chosen": -59.555931091308594, + "logps/rejected": -90.48008728027344, + "loss": 0.6989, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9384984970092773, + "rewards/margins": 6.309118270874023, + "rewards/rejected": -3.3706185817718506, + "step": 6839 + }, + { + "epoch": 1.71, + "grad_norm": 3.9803075790405273, + "learning_rate": 7.3776447385088616e-06, + "logits/chosen": -0.4108855724334717, + "logits/rejected": -0.4681275486946106, + "logps/chosen": -56.665462493896484, + "logps/rejected": -100.07624053955078, + "loss": 0.781, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8672523498535156, + "rewards/margins": 5.728357791900635, + "rewards/rejected": -2.861104965209961, + "step": 6840 + }, + { + "epoch": 1.71, + "grad_norm": 9.962239265441895, + "learning_rate": 7.376953275737268e-06, + "logits/chosen": -0.4423512816429138, + "logits/rejected": -0.5320258736610413, + "logps/chosen": -54.218772888183594, + "logps/rejected": -84.421875, + "loss": 0.7521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2376534938812256, + "rewards/margins": 4.986824989318848, + "rewards/rejected": -1.749171495437622, + "step": 6841 + }, + { + "epoch": 1.71, + "grad_norm": 7.738485336303711, + "learning_rate": 7.376261754228633e-06, + "logits/chosen": -0.45568522810935974, + "logits/rejected": -0.5095488429069519, + "logps/chosen": -60.29471206665039, + "logps/rejected": -91.64035034179688, + "loss": 0.7095, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8896143436431885, + "rewards/margins": 5.331633567810059, + "rewards/rejected": -2.442018747329712, + "step": 6842 + }, + { + "epoch": 1.71, + "grad_norm": 8.965293884277344, + "learning_rate": 7.37557017400004e-06, + "logits/chosen": -0.3579488694667816, + "logits/rejected": -0.4009150266647339, + "logps/chosen": -60.179115295410156, + "logps/rejected": -77.74288940429688, + "loss": 0.7446, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.124074697494507, + "rewards/margins": 4.482918739318848, + "rewards/rejected": -1.3588441610336304, + "step": 6843 + }, + { + "epoch": 1.71, + "grad_norm": 9.291696548461914, + "learning_rate": 7.3748785350685835e-06, + "logits/chosen": -0.4224562644958496, + "logits/rejected": -0.5350030064582825, + "logps/chosen": -64.07308197021484, + "logps/rejected": -78.28353118896484, + "loss": 0.7533, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2345643043518066, + "rewards/margins": 5.330826759338379, + "rewards/rejected": -2.096261501312256, + "step": 6844 + }, + { + "epoch": 1.71, + "grad_norm": 4.367468357086182, + "learning_rate": 7.37418683745135e-06, + "logits/chosen": -0.33107563853263855, + "logits/rejected": -0.3842047452926636, + "logps/chosen": -51.978660583496094, + "logps/rejected": -105.62765502929688, + "loss": 0.7313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0079433917999268, + "rewards/margins": 4.646440505981445, + "rewards/rejected": -1.638496994972229, + "step": 6845 + }, + { + "epoch": 1.71, + "grad_norm": 5.291072368621826, + "learning_rate": 7.373495081165436e-06, + "logits/chosen": -0.3657759428024292, + "logits/rejected": -0.4303669035434723, + "logps/chosen": -63.728973388671875, + "logps/rejected": -86.51897430419922, + "loss": 0.8064, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8114519119262695, + "rewards/margins": 3.732445001602173, + "rewards/rejected": -0.9209930896759033, + "step": 6846 + }, + { + "epoch": 1.71, + "grad_norm": 4.168886661529541, + "learning_rate": 7.372803266227934e-06, + "logits/chosen": -0.3961312770843506, + "logits/rejected": -0.5257100462913513, + "logps/chosen": -63.98576354980469, + "logps/rejected": -88.74419403076172, + "loss": 0.7279, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6798224449157715, + "rewards/margins": 5.823563098907471, + "rewards/rejected": -3.1437411308288574, + "step": 6847 + }, + { + "epoch": 1.71, + "grad_norm": 18.340322494506836, + "learning_rate": 7.372111392655939e-06, + "logits/chosen": -0.38150694966316223, + "logits/rejected": -0.5072463750839233, + "logps/chosen": -72.52001953125, + "logps/rejected": -85.0915756225586, + "loss": 0.786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.791402816772461, + "rewards/margins": 4.809087753295898, + "rewards/rejected": -2.0176846981048584, + "step": 6848 + }, + { + "epoch": 1.71, + "grad_norm": 4.513457775115967, + "learning_rate": 7.3714194604665476e-06, + "logits/chosen": -0.3726902902126312, + "logits/rejected": -0.48000362515449524, + "logps/chosen": -66.81640625, + "logps/rejected": -82.1683349609375, + "loss": 0.8079, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.091580867767334, + "rewards/margins": 4.816141128540039, + "rewards/rejected": -1.7245597839355469, + "step": 6849 + }, + { + "epoch": 1.71, + "grad_norm": 7.671626091003418, + "learning_rate": 7.3707274696768615e-06, + "logits/chosen": -0.38404420018196106, + "logits/rejected": -0.4982474148273468, + "logps/chosen": -61.10645294189453, + "logps/rejected": -73.50045776367188, + "loss": 0.7938, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89670991897583, + "rewards/margins": 3.967672348022461, + "rewards/rejected": -1.0709625482559204, + "step": 6850 + }, + { + "epoch": 1.71, + "grad_norm": 3.8281469345092773, + "learning_rate": 7.370035420303975e-06, + "logits/chosen": -0.4046642780303955, + "logits/rejected": -0.5019470453262329, + "logps/chosen": -53.91229248046875, + "logps/rejected": -90.79780578613281, + "loss": 0.6175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1155269145965576, + "rewards/margins": 5.725840091705322, + "rewards/rejected": -2.6103134155273438, + "step": 6851 + }, + { + "epoch": 1.71, + "grad_norm": 3.187483310699463, + "learning_rate": 7.369343312364994e-06, + "logits/chosen": -0.4414516091346741, + "logits/rejected": -0.5431746244430542, + "logps/chosen": -51.688575744628906, + "logps/rejected": -85.38264465332031, + "loss": 0.5745, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.223888397216797, + "rewards/margins": 5.083737373352051, + "rewards/rejected": -1.859848976135254, + "step": 6852 + }, + { + "epoch": 1.71, + "grad_norm": 7.431768894195557, + "learning_rate": 7.368651145877021e-06, + "logits/chosen": -0.40498244762420654, + "logits/rejected": -0.4946206212043762, + "logps/chosen": -62.61907958984375, + "logps/rejected": -83.78932189941406, + "loss": 0.7712, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8916988372802734, + "rewards/margins": 4.599737644195557, + "rewards/rejected": -1.7080388069152832, + "step": 6853 + }, + { + "epoch": 1.71, + "grad_norm": 4.95415735244751, + "learning_rate": 7.3679589208571555e-06, + "logits/chosen": -0.3610589802265167, + "logits/rejected": -0.45989686250686646, + "logps/chosen": -58.53053283691406, + "logps/rejected": -91.07959747314453, + "loss": 0.7494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.240689754486084, + "rewards/margins": 4.788183212280273, + "rewards/rejected": -1.5474931001663208, + "step": 6854 + }, + { + "epoch": 1.71, + "grad_norm": 10.601099967956543, + "learning_rate": 7.367266637322508e-06, + "logits/chosen": -0.4882177412509918, + "logits/rejected": -0.5393324494361877, + "logps/chosen": -47.10167694091797, + "logps/rejected": -80.99473571777344, + "loss": 0.7423, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.160038709640503, + "rewards/margins": 4.377757549285889, + "rewards/rejected": -1.2177186012268066, + "step": 6855 + }, + { + "epoch": 1.72, + "grad_norm": 6.119211673736572, + "learning_rate": 7.3665742952901845e-06, + "logits/chosen": -0.43981361389160156, + "logits/rejected": -0.561469316482544, + "logps/chosen": -62.966129302978516, + "logps/rejected": -69.2680892944336, + "loss": 0.8324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6264188289642334, + "rewards/margins": 4.115916728973389, + "rewards/rejected": -1.4894977807998657, + "step": 6856 + }, + { + "epoch": 1.72, + "grad_norm": 3.6931726932525635, + "learning_rate": 7.3658818947772915e-06, + "logits/chosen": -0.36268216371536255, + "logits/rejected": -0.43827733397483826, + "logps/chosen": -52.06001281738281, + "logps/rejected": -86.6313705444336, + "loss": 0.6631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9402663707733154, + "rewards/margins": 4.394923210144043, + "rewards/rejected": -1.4546570777893066, + "step": 6857 + }, + { + "epoch": 1.72, + "grad_norm": 4.481281757354736, + "learning_rate": 7.365189435800941e-06, + "logits/chosen": -0.4693986177444458, + "logits/rejected": -0.5736244916915894, + "logps/chosen": -49.263423919677734, + "logps/rejected": -85.6162109375, + "loss": 0.7091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.171292543411255, + "rewards/margins": 5.066904544830322, + "rewards/rejected": -1.8956117630004883, + "step": 6858 + }, + { + "epoch": 1.72, + "grad_norm": 5.667557716369629, + "learning_rate": 7.364496918378245e-06, + "logits/chosen": -0.3280385136604309, + "logits/rejected": -0.34364867210388184, + "logps/chosen": -61.53239440917969, + "logps/rejected": -102.81744384765625, + "loss": 0.7937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.612848997116089, + "rewards/margins": 5.224515914916992, + "rewards/rejected": -2.6116671562194824, + "step": 6859 + }, + { + "epoch": 1.72, + "grad_norm": 3.7310287952423096, + "learning_rate": 7.363804342526315e-06, + "logits/chosen": -0.37263086438179016, + "logits/rejected": -0.492654412984848, + "logps/chosen": -46.97045135498047, + "logps/rejected": -75.62897491455078, + "loss": 0.6696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.667724847793579, + "rewards/margins": 4.466100692749023, + "rewards/rejected": -1.7983757257461548, + "step": 6860 + }, + { + "epoch": 1.72, + "grad_norm": 7.337577819824219, + "learning_rate": 7.363111708262264e-06, + "logits/chosen": -0.38310250639915466, + "logits/rejected": -0.4571199417114258, + "logps/chosen": -54.672603607177734, + "logps/rejected": -96.05510711669922, + "loss": 0.7908, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.001896619796753, + "rewards/margins": 4.873074054718018, + "rewards/rejected": -1.871177077293396, + "step": 6861 + }, + { + "epoch": 1.72, + "grad_norm": 2.5236737728118896, + "learning_rate": 7.36241901560321e-06, + "logits/chosen": -0.4796800911426544, + "logits/rejected": -0.552234411239624, + "logps/chosen": -46.99510955810547, + "logps/rejected": -86.66189575195312, + "loss": 0.6146, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.116987466812134, + "rewards/margins": 5.308453559875488, + "rewards/rejected": -2.1914656162261963, + "step": 6862 + }, + { + "epoch": 1.72, + "grad_norm": 3.316953659057617, + "learning_rate": 7.361726264566269e-06, + "logits/chosen": -0.3884584307670593, + "logits/rejected": -0.46070462465286255, + "logps/chosen": -59.18086242675781, + "logps/rejected": -88.63236999511719, + "loss": 0.6333, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0850613117218018, + "rewards/margins": 4.9699015617370605, + "rewards/rejected": -1.8848403692245483, + "step": 6863 + }, + { + "epoch": 1.72, + "grad_norm": 3.9350738525390625, + "learning_rate": 7.3610334551685615e-06, + "logits/chosen": -0.5489246249198914, + "logits/rejected": -0.6390088200569153, + "logps/chosen": -54.962703704833984, + "logps/rejected": -93.49140930175781, + "loss": 0.6289, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.954010248184204, + "rewards/margins": 6.272652626037598, + "rewards/rejected": -3.3186426162719727, + "step": 6864 + }, + { + "epoch": 1.72, + "grad_norm": 9.381218910217285, + "learning_rate": 7.360340587427202e-06, + "logits/chosen": -0.3817926049232483, + "logits/rejected": -0.4477238357067108, + "logps/chosen": -51.47441864013672, + "logps/rejected": -88.11126708984375, + "loss": 0.7201, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5135891437530518, + "rewards/margins": 4.533814907073975, + "rewards/rejected": -2.020226001739502, + "step": 6865 + }, + { + "epoch": 1.72, + "grad_norm": 6.553889751434326, + "learning_rate": 7.3596476613593205e-06, + "logits/chosen": -0.3431081473827362, + "logits/rejected": -0.4233911335468292, + "logps/chosen": -57.360321044921875, + "logps/rejected": -101.56448364257812, + "loss": 0.6716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9373326301574707, + "rewards/margins": 5.507772445678711, + "rewards/rejected": -2.5704402923583984, + "step": 6866 + }, + { + "epoch": 1.72, + "grad_norm": 3.5701355934143066, + "learning_rate": 7.358954676982033e-06, + "logits/chosen": -0.45344677567481995, + "logits/rejected": -0.5371102094650269, + "logps/chosen": -53.887229919433594, + "logps/rejected": -85.9305419921875, + "loss": 0.6611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8708322048187256, + "rewards/margins": 5.929462909698486, + "rewards/rejected": -3.05863094329834, + "step": 6867 + }, + { + "epoch": 1.72, + "grad_norm": 2.8108723163604736, + "learning_rate": 7.358261634312467e-06, + "logits/chosen": -0.4119678735733032, + "logits/rejected": -0.46686384081840515, + "logps/chosen": -45.80894088745117, + "logps/rejected": -93.70018768310547, + "loss": 0.5567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0197954177856445, + "rewards/margins": 6.032607555389404, + "rewards/rejected": -3.0128116607666016, + "step": 6868 + }, + { + "epoch": 1.72, + "grad_norm": 5.497833728790283, + "learning_rate": 7.357568533367749e-06, + "logits/chosen": -0.43011873960494995, + "logits/rejected": -0.5445812940597534, + "logps/chosen": -48.35976791381836, + "logps/rejected": -76.81602478027344, + "loss": 0.7008, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8685052394866943, + "rewards/margins": 5.429390907287598, + "rewards/rejected": -2.5608859062194824, + "step": 6869 + }, + { + "epoch": 1.72, + "grad_norm": 15.788409233093262, + "learning_rate": 7.356875374165003e-06, + "logits/chosen": -0.398143470287323, + "logits/rejected": -0.4334329664707184, + "logps/chosen": -54.78218078613281, + "logps/rejected": -79.10132598876953, + "loss": 0.8357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0243496894836426, + "rewards/margins": 4.321253299713135, + "rewards/rejected": -1.2969037294387817, + "step": 6870 + }, + { + "epoch": 1.72, + "grad_norm": 3.799839735031128, + "learning_rate": 7.356182156721361e-06, + "logits/chosen": -0.4938527047634125, + "logits/rejected": -0.5246255397796631, + "logps/chosen": -57.877174377441406, + "logps/rejected": -85.40930938720703, + "loss": 0.7148, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.994797468185425, + "rewards/margins": 5.454281330108643, + "rewards/rejected": -2.459484100341797, + "step": 6871 + }, + { + "epoch": 1.72, + "grad_norm": 3.641942024230957, + "learning_rate": 7.355488881053951e-06, + "logits/chosen": -0.4124874174594879, + "logits/rejected": -0.5017839670181274, + "logps/chosen": -61.85795593261719, + "logps/rejected": -97.13394165039062, + "loss": 0.642, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.908334255218506, + "rewards/margins": 6.2808837890625, + "rewards/rejected": -3.3725497722625732, + "step": 6872 + }, + { + "epoch": 1.72, + "grad_norm": 5.846322536468506, + "learning_rate": 7.354795547179905e-06, + "logits/chosen": -0.4670564532279968, + "logits/rejected": -0.5460218191146851, + "logps/chosen": -53.916358947753906, + "logps/rejected": -87.63035583496094, + "loss": 0.6895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0526123046875, + "rewards/margins": 5.203456401824951, + "rewards/rejected": -2.150844097137451, + "step": 6873 + }, + { + "epoch": 1.72, + "grad_norm": 6.875414848327637, + "learning_rate": 7.354102155116356e-06, + "logits/chosen": -0.3877408504486084, + "logits/rejected": -0.4745166599750519, + "logps/chosen": -58.215938568115234, + "logps/rejected": -87.07637023925781, + "loss": 0.7597, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9098095893859863, + "rewards/margins": 5.134326457977295, + "rewards/rejected": -2.2245168685913086, + "step": 6874 + }, + { + "epoch": 1.72, + "grad_norm": 13.202160835266113, + "learning_rate": 7.35340870488044e-06, + "logits/chosen": -0.4430854320526123, + "logits/rejected": -0.5424299836158752, + "logps/chosen": -62.118995666503906, + "logps/rejected": -79.02071380615234, + "loss": 0.8736, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.606025218963623, + "rewards/margins": 4.591655731201172, + "rewards/rejected": -1.9856303930282593, + "step": 6875 + }, + { + "epoch": 1.72, + "grad_norm": 6.337812900543213, + "learning_rate": 7.352715196489291e-06, + "logits/chosen": -0.42618322372436523, + "logits/rejected": -0.48980098962783813, + "logps/chosen": -52.76103210449219, + "logps/rejected": -87.3370132446289, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0092716217041016, + "rewards/margins": 5.180337905883789, + "rewards/rejected": -2.1710667610168457, + "step": 6876 + }, + { + "epoch": 1.72, + "grad_norm": 5.709996700286865, + "learning_rate": 7.352021629960046e-06, + "logits/chosen": -0.426782488822937, + "logits/rejected": -0.5380261540412903, + "logps/chosen": -59.13274002075195, + "logps/rejected": -77.78356170654297, + "loss": 0.8333, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8363797664642334, + "rewards/margins": 4.641909599304199, + "rewards/rejected": -1.8055298328399658, + "step": 6877 + }, + { + "epoch": 1.72, + "grad_norm": 8.33606243133545, + "learning_rate": 7.351328005309848e-06, + "logits/chosen": -0.49125197529792786, + "logits/rejected": -0.5198782086372375, + "logps/chosen": -42.089195251464844, + "logps/rejected": -84.01602172851562, + "loss": 0.6256, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.913944721221924, + "rewards/margins": 4.341157913208008, + "rewards/rejected": -1.427213191986084, + "step": 6878 + }, + { + "epoch": 1.72, + "grad_norm": 5.608462333679199, + "learning_rate": 7.350634322555832e-06, + "logits/chosen": -0.3873552083969116, + "logits/rejected": -0.43009936809539795, + "logps/chosen": -54.18467712402344, + "logps/rejected": -94.71625518798828, + "loss": 0.7776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9525301456451416, + "rewards/margins": 4.449865341186523, + "rewards/rejected": -1.4973350763320923, + "step": 6879 + }, + { + "epoch": 1.72, + "grad_norm": 4.631584644317627, + "learning_rate": 7.349940581715142e-06, + "logits/chosen": -0.3769254982471466, + "logits/rejected": -0.5116356611251831, + "logps/chosen": -53.1099853515625, + "logps/rejected": -77.72523498535156, + "loss": 0.6823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.844015121459961, + "rewards/margins": 5.247986316680908, + "rewards/rejected": -2.403970956802368, + "step": 6880 + }, + { + "epoch": 1.72, + "grad_norm": 3.7685141563415527, + "learning_rate": 7.34924678280492e-06, + "logits/chosen": -0.40475329756736755, + "logits/rejected": -0.48305800557136536, + "logps/chosen": -54.16078567504883, + "logps/rejected": -96.76668548583984, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.92625093460083, + "rewards/margins": 5.0787553787231445, + "rewards/rejected": -2.1525046825408936, + "step": 6881 + }, + { + "epoch": 1.72, + "grad_norm": 5.574592590332031, + "learning_rate": 7.348552925842311e-06, + "logits/chosen": -0.49353575706481934, + "logits/rejected": -0.5280468463897705, + "logps/chosen": -46.046417236328125, + "logps/rejected": -80.86946105957031, + "loss": 0.7583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0905511379241943, + "rewards/margins": 4.559977054595947, + "rewards/rejected": -1.4694260358810425, + "step": 6882 + }, + { + "epoch": 1.72, + "grad_norm": 7.426657199859619, + "learning_rate": 7.347859010844461e-06, + "logits/chosen": -0.36209672689437866, + "logits/rejected": -0.4649474322795868, + "logps/chosen": -67.01827239990234, + "logps/rejected": -100.6202163696289, + "loss": 0.8549, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.74520206451416, + "rewards/margins": 4.987567901611328, + "rewards/rejected": -2.242366075515747, + "step": 6883 + }, + { + "epoch": 1.72, + "grad_norm": 11.524641036987305, + "learning_rate": 7.34716503782852e-06, + "logits/chosen": -0.3483601212501526, + "logits/rejected": -0.4615979492664337, + "logps/chosen": -65.12187194824219, + "logps/rejected": -113.4576644897461, + "loss": 0.6861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8628125190734863, + "rewards/margins": 5.355247497558594, + "rewards/rejected": -2.4924352169036865, + "step": 6884 + }, + { + "epoch": 1.72, + "grad_norm": 5.029329776763916, + "learning_rate": 7.346471006811633e-06, + "logits/chosen": -0.49182218313217163, + "logits/rejected": -0.5216023325920105, + "logps/chosen": -49.46680450439453, + "logps/rejected": -107.71298217773438, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.908538579940796, + "rewards/margins": 5.285000324249268, + "rewards/rejected": -2.37646222114563, + "step": 6885 + }, + { + "epoch": 1.72, + "grad_norm": 4.533161640167236, + "learning_rate": 7.34577691781095e-06, + "logits/chosen": -0.3722801208496094, + "logits/rejected": -0.4863150119781494, + "logps/chosen": -51.84078598022461, + "logps/rejected": -80.7548828125, + "loss": 0.6105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.023155689239502, + "rewards/margins": 5.7024760246276855, + "rewards/rejected": -2.679320812225342, + "step": 6886 + }, + { + "epoch": 1.72, + "grad_norm": 19.4726505279541, + "learning_rate": 7.345082770843626e-06, + "logits/chosen": -0.47492989897727966, + "logits/rejected": -0.5405328273773193, + "logps/chosen": -57.357025146484375, + "logps/rejected": -96.67699432373047, + "loss": 0.7773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8394522666931152, + "rewards/margins": 4.784571170806885, + "rewards/rejected": -1.94511878490448, + "step": 6887 + }, + { + "epoch": 1.72, + "grad_norm": 4.5529327392578125, + "learning_rate": 7.344388565926812e-06, + "logits/chosen": -0.6042413115501404, + "logits/rejected": -0.6481950283050537, + "logps/chosen": -51.072505950927734, + "logps/rejected": -79.704833984375, + "loss": 0.6771, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.092944622039795, + "rewards/margins": 4.654085159301758, + "rewards/rejected": -1.561140537261963, + "step": 6888 + }, + { + "epoch": 1.72, + "grad_norm": 6.016846656799316, + "learning_rate": 7.343694303077664e-06, + "logits/chosen": -0.47444963455200195, + "logits/rejected": -0.5420969128608704, + "logps/chosen": -57.0401725769043, + "logps/rejected": -88.10427856445312, + "loss": 0.7598, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9799914360046387, + "rewards/margins": 4.889215469360352, + "rewards/rejected": -1.909224271774292, + "step": 6889 + }, + { + "epoch": 1.72, + "grad_norm": 5.299438953399658, + "learning_rate": 7.342999982313334e-06, + "logits/chosen": -0.32525065541267395, + "logits/rejected": -0.46911701560020447, + "logps/chosen": -58.929534912109375, + "logps/rejected": -96.43478393554688, + "loss": 0.6093, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.887214422225952, + "rewards/margins": 6.3411407470703125, + "rewards/rejected": -3.4539263248443604, + "step": 6890 + }, + { + "epoch": 1.72, + "grad_norm": 3.6663756370544434, + "learning_rate": 7.3423056036509856e-06, + "logits/chosen": -0.43628743290901184, + "logits/rejected": -0.5499152541160583, + "logps/chosen": -49.36347961425781, + "logps/rejected": -92.21322631835938, + "loss": 0.7429, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.183823585510254, + "rewards/margins": 5.959894180297852, + "rewards/rejected": -2.776071071624756, + "step": 6891 + }, + { + "epoch": 1.72, + "grad_norm": 4.830106258392334, + "learning_rate": 7.341611167107772e-06, + "logits/chosen": -0.4549202620983124, + "logits/rejected": -0.4943265914916992, + "logps/chosen": -63.07667541503906, + "logps/rejected": -115.51971435546875, + "loss": 0.7244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1621999740600586, + "rewards/margins": 6.063581943511963, + "rewards/rejected": -2.9013819694519043, + "step": 6892 + }, + { + "epoch": 1.72, + "grad_norm": 4.180750370025635, + "learning_rate": 7.340916672700858e-06, + "logits/chosen": -0.431627482175827, + "logits/rejected": -0.5533502697944641, + "logps/chosen": -59.13515090942383, + "logps/rejected": -95.10982513427734, + "loss": 0.6345, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0754170417785645, + "rewards/margins": 5.641790866851807, + "rewards/rejected": -2.566373586654663, + "step": 6893 + }, + { + "epoch": 1.72, + "grad_norm": 10.497847557067871, + "learning_rate": 7.340222120447401e-06, + "logits/chosen": -0.42050668597221375, + "logits/rejected": -0.47398141026496887, + "logps/chosen": -56.436275482177734, + "logps/rejected": -97.54025268554688, + "loss": 0.7131, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6442692279815674, + "rewards/margins": 5.639772891998291, + "rewards/rejected": -2.9955036640167236, + "step": 6894 + }, + { + "epoch": 1.72, + "grad_norm": 17.35502052307129, + "learning_rate": 7.339527510364567e-06, + "logits/chosen": -0.3741125464439392, + "logits/rejected": -0.4935736060142517, + "logps/chosen": -51.78069305419922, + "logps/rejected": -77.61492156982422, + "loss": 0.6591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6799371242523193, + "rewards/margins": 5.363369941711426, + "rewards/rejected": -2.683433771133423, + "step": 6895 + }, + { + "epoch": 1.73, + "grad_norm": 5.552949905395508, + "learning_rate": 7.338832842469521e-06, + "logits/chosen": -0.47922950983047485, + "logits/rejected": -0.5559525489807129, + "logps/chosen": -50.490901947021484, + "logps/rejected": -79.45140838623047, + "loss": 0.761, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.164710521697998, + "rewards/margins": 5.160961151123047, + "rewards/rejected": -1.9962506294250488, + "step": 6896 + }, + { + "epoch": 1.73, + "grad_norm": 15.413956642150879, + "learning_rate": 7.338138116779425e-06, + "logits/chosen": -0.3936201333999634, + "logits/rejected": -0.5013064742088318, + "logps/chosen": -44.52660369873047, + "logps/rejected": -89.51317596435547, + "loss": 0.5856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.879483222961426, + "rewards/margins": 5.9100751876831055, + "rewards/rejected": -3.030592441558838, + "step": 6897 + }, + { + "epoch": 1.73, + "grad_norm": 5.096175670623779, + "learning_rate": 7.337443333311451e-06, + "logits/chosen": -0.39902275800704956, + "logits/rejected": -0.483279824256897, + "logps/chosen": -57.98117446899414, + "logps/rejected": -85.3392562866211, + "loss": 0.6968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6386916637420654, + "rewards/margins": 4.604288578033447, + "rewards/rejected": -1.9655966758728027, + "step": 6898 + }, + { + "epoch": 1.73, + "grad_norm": 6.484798908233643, + "learning_rate": 7.336748492082766e-06, + "logits/chosen": -0.44775789976119995, + "logits/rejected": -0.5466386079788208, + "logps/chosen": -58.424930572509766, + "logps/rejected": -88.67330932617188, + "loss": 0.615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.980483293533325, + "rewards/margins": 5.460418701171875, + "rewards/rejected": -2.47993540763855, + "step": 6899 + }, + { + "epoch": 1.73, + "grad_norm": 3.153599500656128, + "learning_rate": 7.33605359311054e-06, + "logits/chosen": -0.42811402678489685, + "logits/rejected": -0.5190491676330566, + "logps/chosen": -66.68619537353516, + "logps/rejected": -90.71134948730469, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8829801082611084, + "rewards/margins": 5.624905586242676, + "rewards/rejected": -2.7419252395629883, + "step": 6900 + }, + { + "epoch": 1.73, + "grad_norm": 3.0029242038726807, + "learning_rate": 7.335358636411946e-06, + "logits/chosen": -0.42121225595474243, + "logits/rejected": -0.6108445525169373, + "logps/chosen": -46.84047317504883, + "logps/rejected": -76.28380584716797, + "loss": 0.5412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880920886993408, + "rewards/margins": 6.768832206726074, + "rewards/rejected": -3.887911319732666, + "step": 6901 + }, + { + "epoch": 1.73, + "grad_norm": 3.0990922451019287, + "learning_rate": 7.3346636220041545e-06, + "logits/chosen": -0.4782174825668335, + "logits/rejected": -0.5479449033737183, + "logps/chosen": -59.28215789794922, + "logps/rejected": -90.08465576171875, + "loss": 0.7063, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055007219314575, + "rewards/margins": 5.501223564147949, + "rewards/rejected": -2.4462156295776367, + "step": 6902 + }, + { + "epoch": 1.73, + "grad_norm": 8.504263877868652, + "learning_rate": 7.333968549904342e-06, + "logits/chosen": -0.3843705654144287, + "logits/rejected": -0.4003397226333618, + "logps/chosen": -59.64889907836914, + "logps/rejected": -104.30610656738281, + "loss": 0.7837, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.731874942779541, + "rewards/margins": 4.820211887359619, + "rewards/rejected": -2.0883374214172363, + "step": 6903 + }, + { + "epoch": 1.73, + "grad_norm": 4.619012832641602, + "learning_rate": 7.333273420129685e-06, + "logits/chosen": -0.41787031292915344, + "logits/rejected": -0.5072600841522217, + "logps/chosen": -61.76459884643555, + "logps/rejected": -95.53591918945312, + "loss": 0.6478, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.069593906402588, + "rewards/margins": 5.280532360076904, + "rewards/rejected": -2.2109384536743164, + "step": 6904 + }, + { + "epoch": 1.73, + "grad_norm": 7.093814849853516, + "learning_rate": 7.332578232697359e-06, + "logits/chosen": -0.4250367283821106, + "logits/rejected": -0.49678003787994385, + "logps/chosen": -59.81117248535156, + "logps/rejected": -80.9303207397461, + "loss": 0.7782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0517520904541016, + "rewards/margins": 4.205655097961426, + "rewards/rejected": -1.1539028882980347, + "step": 6905 + }, + { + "epoch": 1.73, + "grad_norm": 11.733205795288086, + "learning_rate": 7.331882987624545e-06, + "logits/chosen": -0.4450785517692566, + "logits/rejected": -0.5158177614212036, + "logps/chosen": -53.56970977783203, + "logps/rejected": -89.14603424072266, + "loss": 0.8416, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6853671073913574, + "rewards/margins": 4.153913974761963, + "rewards/rejected": -1.4685466289520264, + "step": 6906 + }, + { + "epoch": 1.73, + "grad_norm": 10.544726371765137, + "learning_rate": 7.331187684928421e-06, + "logits/chosen": -0.3944370746612549, + "logits/rejected": -0.4731680750846863, + "logps/chosen": -54.99093246459961, + "logps/rejected": -86.21673583984375, + "loss": 0.8724, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.793153762817383, + "rewards/margins": 4.345492839813232, + "rewards/rejected": -1.552339792251587, + "step": 6907 + }, + { + "epoch": 1.73, + "grad_norm": 6.489032745361328, + "learning_rate": 7.330492324626171e-06, + "logits/chosen": -0.3724052906036377, + "logits/rejected": -0.49925005435943604, + "logps/chosen": -74.73771667480469, + "logps/rejected": -89.96171569824219, + "loss": 0.9209, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.616349935531616, + "rewards/margins": 5.016815185546875, + "rewards/rejected": -2.4004650115966797, + "step": 6908 + }, + { + "epoch": 1.73, + "grad_norm": 4.932799816131592, + "learning_rate": 7.329796906734975e-06, + "logits/chosen": -0.4308358430862427, + "logits/rejected": -0.5088558793067932, + "logps/chosen": -56.26586151123047, + "logps/rejected": -98.61064147949219, + "loss": 0.6982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.920917510986328, + "rewards/margins": 5.467154026031494, + "rewards/rejected": -2.546236515045166, + "step": 6909 + }, + { + "epoch": 1.73, + "grad_norm": 15.64197063446045, + "learning_rate": 7.329101431272021e-06, + "logits/chosen": -0.4277050793170929, + "logits/rejected": -0.5324346423149109, + "logps/chosen": -62.25590515136719, + "logps/rejected": -98.91622924804688, + "loss": 0.674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.958724021911621, + "rewards/margins": 6.796853065490723, + "rewards/rejected": -3.8381288051605225, + "step": 6910 + }, + { + "epoch": 1.73, + "grad_norm": 6.551890850067139, + "learning_rate": 7.328405898254491e-06, + "logits/chosen": -0.4472978115081787, + "logits/rejected": -0.5134972929954529, + "logps/chosen": -44.47291946411133, + "logps/rejected": -88.04078674316406, + "loss": 0.7217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.979224443435669, + "rewards/margins": 5.497888088226318, + "rewards/rejected": -2.5186638832092285, + "step": 6911 + }, + { + "epoch": 1.73, + "grad_norm": 4.2487616539001465, + "learning_rate": 7.327710307699577e-06, + "logits/chosen": -0.43030112981796265, + "logits/rejected": -0.5475069284439087, + "logps/chosen": -62.784934997558594, + "logps/rejected": -90.15050506591797, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9234397411346436, + "rewards/margins": 6.311492919921875, + "rewards/rejected": -3.3880531787872314, + "step": 6912 + }, + { + "epoch": 1.73, + "grad_norm": 10.808212280273438, + "learning_rate": 7.327014659624465e-06, + "logits/chosen": -0.45121297240257263, + "logits/rejected": -0.4702802002429962, + "logps/chosen": -59.34504318237305, + "logps/rejected": -118.57084655761719, + "loss": 0.8375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5661747455596924, + "rewards/margins": 6.150727272033691, + "rewards/rejected": -3.5845515727996826, + "step": 6913 + }, + { + "epoch": 1.73, + "grad_norm": 5.170524597167969, + "learning_rate": 7.326318954046345e-06, + "logits/chosen": -0.385644793510437, + "logits/rejected": -0.4356975257396698, + "logps/chosen": -70.67151641845703, + "logps/rejected": -99.84798431396484, + "loss": 0.8222, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4066450595855713, + "rewards/margins": 4.196570873260498, + "rewards/rejected": -1.7899258136749268, + "step": 6914 + }, + { + "epoch": 1.73, + "grad_norm": 4.348519802093506, + "learning_rate": 7.325623190982409e-06, + "logits/chosen": -0.34815260767936707, + "logits/rejected": -0.4341971278190613, + "logps/chosen": -57.489871978759766, + "logps/rejected": -92.32759857177734, + "loss": 0.6041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6913797855377197, + "rewards/margins": 4.97334098815918, + "rewards/rejected": -2.281961679458618, + "step": 6915 + }, + { + "epoch": 1.73, + "grad_norm": 5.646335601806641, + "learning_rate": 7.3249273704498515e-06, + "logits/chosen": -0.42357534170150757, + "logits/rejected": -0.47868019342422485, + "logps/chosen": -49.539241790771484, + "logps/rejected": -77.51587677001953, + "loss": 0.7458, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.176398277282715, + "rewards/margins": 4.848532676696777, + "rewards/rejected": -1.672134518623352, + "step": 6916 + }, + { + "epoch": 1.73, + "grad_norm": 12.25735092163086, + "learning_rate": 7.324231492465865e-06, + "logits/chosen": -0.5287136435508728, + "logits/rejected": -0.6357549428939819, + "logps/chosen": -55.16046142578125, + "logps/rejected": -83.83172607421875, + "loss": 0.795, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5099382400512695, + "rewards/margins": 4.292182445526123, + "rewards/rejected": -1.7822437286376953, + "step": 6917 + }, + { + "epoch": 1.73, + "grad_norm": 4.949331760406494, + "learning_rate": 7.323535557047646e-06, + "logits/chosen": -0.359555184841156, + "logits/rejected": -0.4606242775917053, + "logps/chosen": -56.613929748535156, + "logps/rejected": -88.76748657226562, + "loss": 0.6948, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.551802635192871, + "rewards/margins": 5.565398216247559, + "rewards/rejected": -3.0135951042175293, + "step": 6918 + }, + { + "epoch": 1.73, + "grad_norm": 4.58804988861084, + "learning_rate": 7.322839564212391e-06, + "logits/chosen": -0.4397016167640686, + "logits/rejected": -0.5493979454040527, + "logps/chosen": -63.92302322387695, + "logps/rejected": -74.37549591064453, + "loss": 0.7829, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7028310298919678, + "rewards/margins": 5.0218825340271, + "rewards/rejected": -2.3190512657165527, + "step": 6919 + }, + { + "epoch": 1.73, + "grad_norm": 17.203388214111328, + "learning_rate": 7.322143513977302e-06, + "logits/chosen": -0.3364669978618622, + "logits/rejected": -0.44563376903533936, + "logps/chosen": -67.83806610107422, + "logps/rejected": -90.74563598632812, + "loss": 0.8224, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3918333053588867, + "rewards/margins": 4.653215408325195, + "rewards/rejected": -2.2613823413848877, + "step": 6920 + }, + { + "epoch": 1.73, + "grad_norm": 4.4521870613098145, + "learning_rate": 7.321447406359577e-06, + "logits/chosen": -0.4499102234840393, + "logits/rejected": -0.5232670307159424, + "logps/chosen": -51.84770202636719, + "logps/rejected": -84.27201843261719, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1128618717193604, + "rewards/margins": 6.220396995544434, + "rewards/rejected": -3.107534170150757, + "step": 6921 + }, + { + "epoch": 1.73, + "grad_norm": 12.683765411376953, + "learning_rate": 7.320751241376416e-06, + "logits/chosen": -0.4172547161579132, + "logits/rejected": -0.4911404848098755, + "logps/chosen": -59.30580139160156, + "logps/rejected": -84.7226333618164, + "loss": 0.8733, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.756612539291382, + "rewards/margins": 4.9026641845703125, + "rewards/rejected": -2.1460514068603516, + "step": 6922 + }, + { + "epoch": 1.73, + "grad_norm": 4.531083106994629, + "learning_rate": 7.320055019045022e-06, + "logits/chosen": -0.4612760841846466, + "logits/rejected": -0.5161730647087097, + "logps/chosen": -51.59166717529297, + "logps/rejected": -98.77263641357422, + "loss": 0.711, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1196868419647217, + "rewards/margins": 5.649918079376221, + "rewards/rejected": -2.53023099899292, + "step": 6923 + }, + { + "epoch": 1.73, + "grad_norm": 5.473223686218262, + "learning_rate": 7.319358739382602e-06, + "logits/chosen": -0.3611512780189514, + "logits/rejected": -0.46808940172195435, + "logps/chosen": -68.25528717041016, + "logps/rejected": -87.36625671386719, + "loss": 0.7751, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.020423173904419, + "rewards/margins": 5.121345520019531, + "rewards/rejected": -2.1009228229522705, + "step": 6924 + }, + { + "epoch": 1.73, + "grad_norm": 3.0998432636260986, + "learning_rate": 7.318662402406361e-06, + "logits/chosen": -0.47434544563293457, + "logits/rejected": -0.5431625843048096, + "logps/chosen": -57.893768310546875, + "logps/rejected": -101.87997436523438, + "loss": 0.7008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9839720726013184, + "rewards/margins": 5.9965105056762695, + "rewards/rejected": -3.012538433074951, + "step": 6925 + }, + { + "epoch": 1.73, + "grad_norm": 15.839980125427246, + "learning_rate": 7.317966008133507e-06, + "logits/chosen": -0.4018380343914032, + "logits/rejected": -0.5665701627731323, + "logps/chosen": -54.02845764160156, + "logps/rejected": -84.31651306152344, + "loss": 0.6592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.614887237548828, + "rewards/margins": 5.284327030181885, + "rewards/rejected": -2.6694395542144775, + "step": 6926 + }, + { + "epoch": 1.73, + "grad_norm": 5.141571044921875, + "learning_rate": 7.317269556581245e-06, + "logits/chosen": -0.4401104152202606, + "logits/rejected": -0.46771880984306335, + "logps/chosen": -54.184814453125, + "logps/rejected": -106.58480834960938, + "loss": 0.6674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.624051809310913, + "rewards/margins": 5.744133949279785, + "rewards/rejected": -3.120082139968872, + "step": 6927 + }, + { + "epoch": 1.73, + "grad_norm": 4.597200393676758, + "learning_rate": 7.316573047766788e-06, + "logits/chosen": -0.42237240076065063, + "logits/rejected": -0.4964011311531067, + "logps/chosen": -48.076866149902344, + "logps/rejected": -87.72574615478516, + "loss": 0.7312, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.913235664367676, + "rewards/margins": 4.938243865966797, + "rewards/rejected": -2.025007486343384, + "step": 6928 + }, + { + "epoch": 1.73, + "grad_norm": 6.856733322143555, + "learning_rate": 7.315876481707349e-06, + "logits/chosen": -0.4190607964992523, + "logits/rejected": -0.52208411693573, + "logps/chosen": -61.68035888671875, + "logps/rejected": -86.05149841308594, + "loss": 0.7472, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.536473035812378, + "rewards/margins": 4.465853691101074, + "rewards/rejected": -1.9293811321258545, + "step": 6929 + }, + { + "epoch": 1.73, + "grad_norm": 6.637033939361572, + "learning_rate": 7.315179858420138e-06, + "logits/chosen": -0.43615055084228516, + "logits/rejected": -0.44631072878837585, + "logps/chosen": -53.44929122924805, + "logps/rejected": -95.21932983398438, + "loss": 0.7901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.03358793258667, + "rewards/margins": 4.342517375946045, + "rewards/rejected": -1.308929443359375, + "step": 6930 + }, + { + "epoch": 1.73, + "grad_norm": 4.230074405670166, + "learning_rate": 7.314483177922368e-06, + "logits/chosen": -0.4292590022087097, + "logits/rejected": -0.47972214221954346, + "logps/chosen": -50.21799087524414, + "logps/rejected": -95.33446502685547, + "loss": 0.6487, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056122064590454, + "rewards/margins": 5.799197196960449, + "rewards/rejected": -2.743075370788574, + "step": 6931 + }, + { + "epoch": 1.73, + "grad_norm": 6.750776290893555, + "learning_rate": 7.313786440231259e-06, + "logits/chosen": -0.4351695477962494, + "logits/rejected": -0.46858787536621094, + "logps/chosen": -47.76780319213867, + "logps/rejected": -88.36927795410156, + "loss": 0.7206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0419466495513916, + "rewards/margins": 4.61816930770874, + "rewards/rejected": -1.5762226581573486, + "step": 6932 + }, + { + "epoch": 1.73, + "grad_norm": 4.295741558074951, + "learning_rate": 7.313089645364025e-06, + "logits/chosen": -0.3423747420310974, + "logits/rejected": -0.4377143681049347, + "logps/chosen": -58.92030715942383, + "logps/rejected": -89.25786590576172, + "loss": 0.6695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.250551223754883, + "rewards/margins": 5.252533435821533, + "rewards/rejected": -2.0019822120666504, + "step": 6933 + }, + { + "epoch": 1.73, + "grad_norm": 7.476727485656738, + "learning_rate": 7.312392793337886e-06, + "logits/chosen": -0.4036315381526947, + "logits/rejected": -0.5048860907554626, + "logps/chosen": -59.95719909667969, + "logps/rejected": -83.19146728515625, + "loss": 0.7181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.889950752258301, + "rewards/margins": 5.6124348640441895, + "rewards/rejected": -2.7224838733673096, + "step": 6934 + }, + { + "epoch": 1.73, + "grad_norm": 3.3450613021850586, + "learning_rate": 7.311695884170063e-06, + "logits/chosen": -0.5198527574539185, + "logits/rejected": -0.5537558197975159, + "logps/chosen": -52.95816421508789, + "logps/rejected": -88.04950714111328, + "loss": 0.7143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1982693672180176, + "rewards/margins": 5.070678234100342, + "rewards/rejected": -1.8724087476730347, + "step": 6935 + }, + { + "epoch": 1.74, + "grad_norm": 3.8283908367156982, + "learning_rate": 7.310998917877774e-06, + "logits/chosen": -0.41846269369125366, + "logits/rejected": -0.4848167598247528, + "logps/chosen": -49.93560028076172, + "logps/rejected": -83.87179565429688, + "loss": 0.6965, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1794869899749756, + "rewards/margins": 4.8265061378479, + "rewards/rejected": -1.6470191478729248, + "step": 6936 + }, + { + "epoch": 1.74, + "grad_norm": 4.0870208740234375, + "learning_rate": 7.310301894478244e-06, + "logits/chosen": -0.3274347484111786, + "logits/rejected": -0.4714318513870239, + "logps/chosen": -52.86561965942383, + "logps/rejected": -66.62769317626953, + "loss": 0.6226, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1812055110931396, + "rewards/margins": 5.560193061828613, + "rewards/rejected": -2.3789877891540527, + "step": 6937 + }, + { + "epoch": 1.74, + "grad_norm": 6.542215347290039, + "learning_rate": 7.3096048139886965e-06, + "logits/chosen": -0.3307206928730011, + "logits/rejected": -0.35552406311035156, + "logps/chosen": -63.7579231262207, + "logps/rejected": -87.75555419921875, + "loss": 0.7456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.080366611480713, + "rewards/margins": 3.780776023864746, + "rewards/rejected": -0.7004092931747437, + "step": 6938 + }, + { + "epoch": 1.74, + "grad_norm": 5.023072719573975, + "learning_rate": 7.308907676426358e-06, + "logits/chosen": -0.39291098713874817, + "logits/rejected": -0.48277339339256287, + "logps/chosen": -43.30666732788086, + "logps/rejected": -98.03460693359375, + "loss": 0.6101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1425392627716064, + "rewards/margins": 5.6060895919799805, + "rewards/rejected": -2.463550329208374, + "step": 6939 + }, + { + "epoch": 1.74, + "grad_norm": 8.590771675109863, + "learning_rate": 7.308210481808454e-06, + "logits/chosen": -0.47188135981559753, + "logits/rejected": -0.5915558338165283, + "logps/chosen": -64.87303924560547, + "logps/rejected": -78.79426574707031, + "loss": 0.9117, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2417659759521484, + "rewards/margins": 5.386591911315918, + "rewards/rejected": -2.1448261737823486, + "step": 6940 + }, + { + "epoch": 1.74, + "grad_norm": 5.236539840698242, + "learning_rate": 7.307513230152216e-06, + "logits/chosen": -0.3660632371902466, + "logits/rejected": -0.4331192374229431, + "logps/chosen": -49.97526550292969, + "logps/rejected": -87.566650390625, + "loss": 0.7616, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.879636287689209, + "rewards/margins": 4.84588623046875, + "rewards/rejected": -1.966249942779541, + "step": 6941 + }, + { + "epoch": 1.74, + "grad_norm": 6.193575859069824, + "learning_rate": 7.30681592147487e-06, + "logits/chosen": -0.42112311720848083, + "logits/rejected": -0.5238946080207825, + "logps/chosen": -51.62360382080078, + "logps/rejected": -95.13398742675781, + "loss": 0.7453, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.824544906616211, + "rewards/margins": 5.887074947357178, + "rewards/rejected": -3.0625298023223877, + "step": 6942 + }, + { + "epoch": 1.74, + "grad_norm": 4.945182800292969, + "learning_rate": 7.306118555793649e-06, + "logits/chosen": -0.37351852655410767, + "logits/rejected": -0.48148244619369507, + "logps/chosen": -64.46630096435547, + "logps/rejected": -106.43512725830078, + "loss": 0.7355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.960714340209961, + "rewards/margins": 5.761069297790527, + "rewards/rejected": -2.8003554344177246, + "step": 6943 + }, + { + "epoch": 1.74, + "grad_norm": 5.899263381958008, + "learning_rate": 7.305421133125786e-06, + "logits/chosen": -0.33123910427093506, + "logits/rejected": -0.3826064169406891, + "logps/chosen": -68.21148681640625, + "logps/rejected": -101.30674743652344, + "loss": 0.8612, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.611940860748291, + "rewards/margins": 4.163644790649414, + "rewards/rejected": -1.5517038106918335, + "step": 6944 + }, + { + "epoch": 1.74, + "grad_norm": 4.678888320922852, + "learning_rate": 7.304723653488514e-06, + "logits/chosen": -0.4119233787059784, + "logits/rejected": -0.49131131172180176, + "logps/chosen": -58.029293060302734, + "logps/rejected": -84.82441711425781, + "loss": 0.7456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8795793056488037, + "rewards/margins": 4.98573112487793, + "rewards/rejected": -2.106152057647705, + "step": 6945 + }, + { + "epoch": 1.74, + "grad_norm": 3.293975591659546, + "learning_rate": 7.30402611689907e-06, + "logits/chosen": -0.42888307571411133, + "logits/rejected": -0.48841583728790283, + "logps/chosen": -44.69577407836914, + "logps/rejected": -90.03020477294922, + "loss": 0.6265, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9794297218322754, + "rewards/margins": 5.340264797210693, + "rewards/rejected": -2.360835075378418, + "step": 6946 + }, + { + "epoch": 1.74, + "grad_norm": 2.9199295043945312, + "learning_rate": 7.3033285233746895e-06, + "logits/chosen": -0.45735594630241394, + "logits/rejected": -0.5879457592964172, + "logps/chosen": -51.10964584350586, + "logps/rejected": -78.39921569824219, + "loss": 0.686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.944732189178467, + "rewards/margins": 5.857880592346191, + "rewards/rejected": -2.9131484031677246, + "step": 6947 + }, + { + "epoch": 1.74, + "grad_norm": 3.5174601078033447, + "learning_rate": 7.302630872932612e-06, + "logits/chosen": -0.3950236439704895, + "logits/rejected": -0.4880461096763611, + "logps/chosen": -62.68649673461914, + "logps/rejected": -85.33845520019531, + "loss": 0.7862, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8594717979431152, + "rewards/margins": 5.253669261932373, + "rewards/rejected": -2.394197463989258, + "step": 6948 + }, + { + "epoch": 1.74, + "grad_norm": 5.010216236114502, + "learning_rate": 7.301933165590075e-06, + "logits/chosen": -0.38567715883255005, + "logits/rejected": -0.48636767268180847, + "logps/chosen": -59.86677932739258, + "logps/rejected": -78.73060607910156, + "loss": 0.8312, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6993823051452637, + "rewards/margins": 5.011870384216309, + "rewards/rejected": -2.312488079071045, + "step": 6949 + }, + { + "epoch": 1.74, + "grad_norm": 14.343379020690918, + "learning_rate": 7.301235401364323e-06, + "logits/chosen": -0.41649407148361206, + "logits/rejected": -0.5254921913146973, + "logps/chosen": -53.2887077331543, + "logps/rejected": -72.09197998046875, + "loss": 0.7931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.029911994934082, + "rewards/margins": 4.179885387420654, + "rewards/rejected": -1.1499731540679932, + "step": 6950 + }, + { + "epoch": 1.74, + "grad_norm": 3.855860948562622, + "learning_rate": 7.300537580272594e-06, + "logits/chosen": -0.3919406533241272, + "logits/rejected": -0.5437541007995605, + "logps/chosen": -49.94198226928711, + "logps/rejected": -67.38003540039062, + "loss": 0.7041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.002492666244507, + "rewards/margins": 5.07615852355957, + "rewards/rejected": -2.0736656188964844, + "step": 6951 + }, + { + "epoch": 1.74, + "grad_norm": 5.06165075302124, + "learning_rate": 7.299839702332136e-06, + "logits/chosen": -0.3925475478172302, + "logits/rejected": -0.40854644775390625, + "logps/chosen": -58.098052978515625, + "logps/rejected": -95.20463562011719, + "loss": 0.7648, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.359468936920166, + "rewards/margins": 4.756819725036621, + "rewards/rejected": -1.3973511457443237, + "step": 6952 + }, + { + "epoch": 1.74, + "grad_norm": 6.2387495040893555, + "learning_rate": 7.299141767560194e-06, + "logits/chosen": -0.39606234431266785, + "logits/rejected": -0.4726984202861786, + "logps/chosen": -68.09075164794922, + "logps/rejected": -87.6683578491211, + "loss": 0.8056, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.991136312484741, + "rewards/margins": 4.556943893432617, + "rewards/rejected": -1.5658073425292969, + "step": 6953 + }, + { + "epoch": 1.74, + "grad_norm": 4.919422626495361, + "learning_rate": 7.29844377597401e-06, + "logits/chosen": -0.461007297039032, + "logits/rejected": -0.5713139772415161, + "logps/chosen": -48.64578628540039, + "logps/rejected": -98.7468490600586, + "loss": 0.591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7886040210723877, + "rewards/margins": 5.259599685668945, + "rewards/rejected": -2.470996141433716, + "step": 6954 + }, + { + "epoch": 1.74, + "grad_norm": 5.934861660003662, + "learning_rate": 7.297745727590839e-06, + "logits/chosen": -0.3982894718647003, + "logits/rejected": -0.42496243119239807, + "logps/chosen": -51.6776237487793, + "logps/rejected": -106.33735656738281, + "loss": 0.6972, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0971782207489014, + "rewards/margins": 4.702645301818848, + "rewards/rejected": -1.6054673194885254, + "step": 6955 + }, + { + "epoch": 1.74, + "grad_norm": 12.88185977935791, + "learning_rate": 7.297047622427925e-06, + "logits/chosen": -0.44377729296684265, + "logits/rejected": -0.5584288835525513, + "logps/chosen": -59.143104553222656, + "logps/rejected": -85.62240600585938, + "loss": 0.7969, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4648256301879883, + "rewards/margins": 5.138071537017822, + "rewards/rejected": -2.673246145248413, + "step": 6956 + }, + { + "epoch": 1.74, + "grad_norm": 2.728504180908203, + "learning_rate": 7.296349460502521e-06, + "logits/chosen": -0.49892935156822205, + "logits/rejected": -0.5513668656349182, + "logps/chosen": -52.906734466552734, + "logps/rejected": -100.92729949951172, + "loss": 0.6975, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.245492696762085, + "rewards/margins": 5.196115016937256, + "rewards/rejected": -1.950622320175171, + "step": 6957 + }, + { + "epoch": 1.74, + "grad_norm": 5.586594104766846, + "learning_rate": 7.29565124183188e-06, + "logits/chosen": -0.38910502195358276, + "logits/rejected": -0.46264252066612244, + "logps/chosen": -56.10266876220703, + "logps/rejected": -100.88375091552734, + "loss": 0.7068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8625166416168213, + "rewards/margins": 5.872150897979736, + "rewards/rejected": -3.009634017944336, + "step": 6958 + }, + { + "epoch": 1.74, + "grad_norm": 2.4976511001586914, + "learning_rate": 7.294952966433256e-06, + "logits/chosen": -0.343083918094635, + "logits/rejected": -0.42527613043785095, + "logps/chosen": -53.39239501953125, + "logps/rejected": -113.47606658935547, + "loss": 0.6392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9157369136810303, + "rewards/margins": 6.955102920532227, + "rewards/rejected": -4.039365768432617, + "step": 6959 + }, + { + "epoch": 1.74, + "grad_norm": 3.177504539489746, + "learning_rate": 7.294254634323902e-06, + "logits/chosen": -0.3824266791343689, + "logits/rejected": -0.45586562156677246, + "logps/chosen": -67.25886535644531, + "logps/rejected": -93.83867645263672, + "loss": 0.6821, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2946202754974365, + "rewards/margins": 5.289574146270752, + "rewards/rejected": -1.9949535131454468, + "step": 6960 + }, + { + "epoch": 1.74, + "grad_norm": 3.993844747543335, + "learning_rate": 7.293556245521076e-06, + "logits/chosen": -0.3843051493167877, + "logits/rejected": -0.4713115692138672, + "logps/chosen": -62.88267517089844, + "logps/rejected": -82.97341918945312, + "loss": 0.7206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0842089653015137, + "rewards/margins": 4.818654537200928, + "rewards/rejected": -1.7344450950622559, + "step": 6961 + }, + { + "epoch": 1.74, + "grad_norm": 11.329833030700684, + "learning_rate": 7.292857800042036e-06, + "logits/chosen": -0.3405534625053406, + "logits/rejected": -0.45469632744789124, + "logps/chosen": -56.55475997924805, + "logps/rejected": -85.18377685546875, + "loss": 0.8651, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6640303134918213, + "rewards/margins": 4.800173282623291, + "rewards/rejected": -2.136143445968628, + "step": 6962 + }, + { + "epoch": 1.74, + "grad_norm": 4.702679634094238, + "learning_rate": 7.2921592979040386e-06, + "logits/chosen": -0.4908444881439209, + "logits/rejected": -0.5609406232833862, + "logps/chosen": -48.40730285644531, + "logps/rejected": -90.80274963378906, + "loss": 0.6232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.009410858154297, + "rewards/margins": 6.203320026397705, + "rewards/rejected": -3.1939094066619873, + "step": 6963 + }, + { + "epoch": 1.74, + "grad_norm": 4.317227363586426, + "learning_rate": 7.29146073912435e-06, + "logits/chosen": -0.4208981692790985, + "logits/rejected": -0.4510353207588196, + "logps/chosen": -48.34330749511719, + "logps/rejected": -102.46450805664062, + "loss": 0.6405, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0140769481658936, + "rewards/margins": 5.802849292755127, + "rewards/rejected": -2.788771867752075, + "step": 6964 + }, + { + "epoch": 1.74, + "grad_norm": 19.3106632232666, + "learning_rate": 7.2907621237202275e-06, + "logits/chosen": -0.43079042434692383, + "logits/rejected": -0.5128510594367981, + "logps/chosen": -62.28554153442383, + "logps/rejected": -103.58214569091797, + "loss": 0.9397, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7113687992095947, + "rewards/margins": 5.7476701736450195, + "rewards/rejected": -3.036301612854004, + "step": 6965 + }, + { + "epoch": 1.74, + "grad_norm": 12.54145336151123, + "learning_rate": 7.290063451708937e-06, + "logits/chosen": -0.36517661809921265, + "logits/rejected": -0.4536924958229065, + "logps/chosen": -58.583824157714844, + "logps/rejected": -90.23380279541016, + "loss": 0.7073, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1668715476989746, + "rewards/margins": 5.128180980682373, + "rewards/rejected": -1.9613094329833984, + "step": 6966 + }, + { + "epoch": 1.74, + "grad_norm": 6.701369762420654, + "learning_rate": 7.2893647231077435e-06, + "logits/chosen": -0.46366390585899353, + "logits/rejected": -0.5542942881584167, + "logps/chosen": -63.60308837890625, + "logps/rejected": -95.72036743164062, + "loss": 0.7317, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.899832010269165, + "rewards/margins": 6.066800594329834, + "rewards/rejected": -3.16696834564209, + "step": 6967 + }, + { + "epoch": 1.74, + "grad_norm": 5.634949684143066, + "learning_rate": 7.288665937933912e-06, + "logits/chosen": -0.3938273787498474, + "logits/rejected": -0.48479655385017395, + "logps/chosen": -57.058502197265625, + "logps/rejected": -90.00433349609375, + "loss": 0.7116, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8928112983703613, + "rewards/margins": 5.062436580657959, + "rewards/rejected": -2.1696252822875977, + "step": 6968 + }, + { + "epoch": 1.74, + "grad_norm": 5.133382797241211, + "learning_rate": 7.28796709620471e-06, + "logits/chosen": -0.41677480936050415, + "logits/rejected": -0.5287628769874573, + "logps/chosen": -56.87986373901367, + "logps/rejected": -90.1524887084961, + "loss": 0.6754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.832085371017456, + "rewards/margins": 4.819274425506592, + "rewards/rejected": -1.9871889352798462, + "step": 6969 + }, + { + "epoch": 1.74, + "grad_norm": 7.808239459991455, + "learning_rate": 7.287268197937408e-06, + "logits/chosen": -0.3774312138557434, + "logits/rejected": -0.4450124204158783, + "logps/chosen": -60.33512878417969, + "logps/rejected": -100.5347900390625, + "loss": 0.7589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.807607412338257, + "rewards/margins": 5.792966365814209, + "rewards/rejected": -2.985358715057373, + "step": 6970 + }, + { + "epoch": 1.74, + "grad_norm": 12.737140655517578, + "learning_rate": 7.286569243149276e-06, + "logits/chosen": -0.4302210807800293, + "logits/rejected": -0.5499374866485596, + "logps/chosen": -65.19889831542969, + "logps/rejected": -78.20599365234375, + "loss": 0.7993, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7050018310546875, + "rewards/margins": 4.438421726226807, + "rewards/rejected": -1.7334202527999878, + "step": 6971 + }, + { + "epoch": 1.74, + "grad_norm": 6.501263618469238, + "learning_rate": 7.285870231857585e-06, + "logits/chosen": -0.3534862697124481, + "logits/rejected": -0.41820091009140015, + "logps/chosen": -58.40913391113281, + "logps/rejected": -107.08666229248047, + "loss": 0.7268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5666658878326416, + "rewards/margins": 4.601034164428711, + "rewards/rejected": -2.034367561340332, + "step": 6972 + }, + { + "epoch": 1.74, + "grad_norm": 6.310340881347656, + "learning_rate": 7.285171164079612e-06, + "logits/chosen": -0.534771740436554, + "logits/rejected": -0.6215662360191345, + "logps/chosen": -50.855220794677734, + "logps/rejected": -71.84105682373047, + "loss": 0.7804, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.811688184738159, + "rewards/margins": 5.298300743103027, + "rewards/rejected": -2.486612558364868, + "step": 6973 + }, + { + "epoch": 1.74, + "grad_norm": 8.306891441345215, + "learning_rate": 7.284472039832626e-06, + "logits/chosen": -0.4764111042022705, + "logits/rejected": -0.5880941152572632, + "logps/chosen": -53.675193786621094, + "logps/rejected": -86.98286437988281, + "loss": 0.7853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5957298278808594, + "rewards/margins": 5.793436050415039, + "rewards/rejected": -3.1977062225341797, + "step": 6974 + }, + { + "epoch": 1.74, + "grad_norm": 16.337514877319336, + "learning_rate": 7.283772859133908e-06, + "logits/chosen": -0.5224677324295044, + "logits/rejected": -0.6077225804328918, + "logps/chosen": -61.6370964050293, + "logps/rejected": -84.03978729248047, + "loss": 0.8217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8914754390716553, + "rewards/margins": 5.054134845733643, + "rewards/rejected": -2.1626596450805664, + "step": 6975 + }, + { + "epoch": 1.75, + "grad_norm": 4.165499210357666, + "learning_rate": 7.2830736220007335e-06, + "logits/chosen": -0.4051414430141449, + "logits/rejected": -0.519351601600647, + "logps/chosen": -48.851707458496094, + "logps/rejected": -99.3230209350586, + "loss": 0.5769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8385894298553467, + "rewards/margins": 6.12059211730957, + "rewards/rejected": -3.282003164291382, + "step": 6976 + }, + { + "epoch": 1.75, + "grad_norm": 15.11871337890625, + "learning_rate": 7.282374328450379e-06, + "logits/chosen": -0.335840106010437, + "logits/rejected": -0.41872072219848633, + "logps/chosen": -69.24667358398438, + "logps/rejected": -108.223388671875, + "loss": 0.8162, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.763597011566162, + "rewards/margins": 4.841325759887695, + "rewards/rejected": -2.0777289867401123, + "step": 6977 + }, + { + "epoch": 1.75, + "grad_norm": 6.373713493347168, + "learning_rate": 7.2816749785001295e-06, + "logits/chosen": -0.43076014518737793, + "logits/rejected": -0.5143694281578064, + "logps/chosen": -69.14183044433594, + "logps/rejected": -99.51546478271484, + "loss": 0.8864, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6527388095855713, + "rewards/margins": 5.3028669357299805, + "rewards/rejected": -2.65012788772583, + "step": 6978 + }, + { + "epoch": 1.75, + "grad_norm": 6.090499401092529, + "learning_rate": 7.280975572167264e-06, + "logits/chosen": -0.44570085406303406, + "logits/rejected": -0.5039404034614563, + "logps/chosen": -52.37430953979492, + "logps/rejected": -95.91450500488281, + "loss": 0.7123, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6264379024505615, + "rewards/margins": 4.784595966339111, + "rewards/rejected": -2.158158302307129, + "step": 6979 + }, + { + "epoch": 1.75, + "grad_norm": 4.264438629150391, + "learning_rate": 7.280276109469064e-06, + "logits/chosen": -0.4733789563179016, + "logits/rejected": -0.6064575910568237, + "logps/chosen": -57.45530700683594, + "logps/rejected": -80.83999633789062, + "loss": 0.7192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9559743404388428, + "rewards/margins": 5.681103229522705, + "rewards/rejected": -2.7251293659210205, + "step": 6980 + }, + { + "epoch": 1.75, + "grad_norm": 5.991645812988281, + "learning_rate": 7.279576590422819e-06, + "logits/chosen": -0.39171716570854187, + "logits/rejected": -0.4861621558666229, + "logps/chosen": -51.64952087402344, + "logps/rejected": -67.35224914550781, + "loss": 0.7701, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7822842597961426, + "rewards/margins": 4.015735149383545, + "rewards/rejected": -1.2334507703781128, + "step": 6981 + }, + { + "epoch": 1.75, + "grad_norm": 22.74058723449707, + "learning_rate": 7.278877015045811e-06, + "logits/chosen": -0.41534891724586487, + "logits/rejected": -0.47970300912857056, + "logps/chosen": -58.01546859741211, + "logps/rejected": -114.26768493652344, + "loss": 0.8603, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.998945713043213, + "rewards/margins": 7.023776054382324, + "rewards/rejected": -4.024829864501953, + "step": 6982 + }, + { + "epoch": 1.75, + "grad_norm": 5.330928802490234, + "learning_rate": 7.278177383355327e-06, + "logits/chosen": -0.3799545168876648, + "logits/rejected": -0.5188139081001282, + "logps/chosen": -72.09352111816406, + "logps/rejected": -93.69835662841797, + "loss": 0.8519, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6658153533935547, + "rewards/margins": 5.911777019500732, + "rewards/rejected": -3.245961904525757, + "step": 6983 + }, + { + "epoch": 1.75, + "grad_norm": 5.1546525955200195, + "learning_rate": 7.277477695368657e-06, + "logits/chosen": -0.4202825725078583, + "logits/rejected": -0.3885997533798218, + "logps/chosen": -56.79659652709961, + "logps/rejected": -99.16439056396484, + "loss": 0.7696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.951997995376587, + "rewards/margins": 3.349782943725586, + "rewards/rejected": -0.397784948348999, + "step": 6984 + }, + { + "epoch": 1.75, + "grad_norm": 8.5410795211792, + "learning_rate": 7.27677795110309e-06, + "logits/chosen": -0.39848947525024414, + "logits/rejected": -0.5017006397247314, + "logps/chosen": -63.988216400146484, + "logps/rejected": -88.83667755126953, + "loss": 0.882, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.772693634033203, + "rewards/margins": 5.086132526397705, + "rewards/rejected": -2.3134384155273438, + "step": 6985 + }, + { + "epoch": 1.75, + "grad_norm": 3.130823850631714, + "learning_rate": 7.276078150575918e-06, + "logits/chosen": -0.5032135844230652, + "logits/rejected": -0.5136294364929199, + "logps/chosen": -52.83229446411133, + "logps/rejected": -102.77525329589844, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0451178550720215, + "rewards/margins": 5.636643409729004, + "rewards/rejected": -2.5915257930755615, + "step": 6986 + }, + { + "epoch": 1.75, + "grad_norm": 6.8343424797058105, + "learning_rate": 7.275378293804436e-06, + "logits/chosen": -0.33652523159980774, + "logits/rejected": -0.4345654249191284, + "logps/chosen": -62.01813888549805, + "logps/rejected": -102.91255187988281, + "loss": 0.8335, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9352030754089355, + "rewards/margins": 5.20534610748291, + "rewards/rejected": -2.2701432704925537, + "step": 6987 + }, + { + "epoch": 1.75, + "grad_norm": 6.0972113609313965, + "learning_rate": 7.274678380805935e-06, + "logits/chosen": -0.41033250093460083, + "logits/rejected": -0.49429836869239807, + "logps/chosen": -82.31681060791016, + "logps/rejected": -74.5628890991211, + "loss": 0.8771, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9293012619018555, + "rewards/margins": 4.592978000640869, + "rewards/rejected": -1.6636770963668823, + "step": 6988 + }, + { + "epoch": 1.75, + "grad_norm": 6.81273078918457, + "learning_rate": 7.273978411597712e-06, + "logits/chosen": -0.43900778889656067, + "logits/rejected": -0.5077826976776123, + "logps/chosen": -59.659461975097656, + "logps/rejected": -97.34375762939453, + "loss": 0.7738, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.890716314315796, + "rewards/margins": 4.61852502822876, + "rewards/rejected": -1.7278084754943848, + "step": 6989 + }, + { + "epoch": 1.75, + "grad_norm": 5.938998222351074, + "learning_rate": 7.273278386197063e-06, + "logits/chosen": -0.5209883451461792, + "logits/rejected": -0.6035395264625549, + "logps/chosen": -49.089385986328125, + "logps/rejected": -83.1996841430664, + "loss": 0.7035, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8572511672973633, + "rewards/margins": 5.289673805236816, + "rewards/rejected": -2.432422399520874, + "step": 6990 + }, + { + "epoch": 1.75, + "grad_norm": 11.612525939941406, + "learning_rate": 7.272578304621287e-06, + "logits/chosen": -0.3620361089706421, + "logits/rejected": -0.4833582639694214, + "logps/chosen": -61.99352264404297, + "logps/rejected": -82.37326049804688, + "loss": 0.682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8415286540985107, + "rewards/margins": 4.9293389320373535, + "rewards/rejected": -2.0878102779388428, + "step": 6991 + }, + { + "epoch": 1.75, + "grad_norm": 6.385595321655273, + "learning_rate": 7.271878166887686e-06, + "logits/chosen": -0.3694995045661926, + "logits/rejected": -0.45205432176589966, + "logps/chosen": -47.882408142089844, + "logps/rejected": -85.68606567382812, + "loss": 0.5846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.154467821121216, + "rewards/margins": 5.344307899475098, + "rewards/rejected": -2.189840078353882, + "step": 6992 + }, + { + "epoch": 1.75, + "grad_norm": 7.008300304412842, + "learning_rate": 7.271177973013557e-06, + "logits/chosen": -0.3815712332725525, + "logits/rejected": -0.46528804302215576, + "logps/chosen": -67.0849609375, + "logps/rejected": -91.24043273925781, + "loss": 0.8244, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6591944694519043, + "rewards/margins": 4.8041229248046875, + "rewards/rejected": -2.144928455352783, + "step": 6993 + }, + { + "epoch": 1.75, + "grad_norm": 3.6659436225891113, + "learning_rate": 7.270477723016205e-06, + "logits/chosen": -0.4508809447288513, + "logits/rejected": -0.555553138256073, + "logps/chosen": -54.65760803222656, + "logps/rejected": -94.9185791015625, + "loss": 0.6965, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1891911029815674, + "rewards/margins": 5.784473896026611, + "rewards/rejected": -2.595282554626465, + "step": 6994 + }, + { + "epoch": 1.75, + "grad_norm": 4.607898235321045, + "learning_rate": 7.2697774169129335e-06, + "logits/chosen": -0.40258175134658813, + "logits/rejected": -0.5168331861495972, + "logps/chosen": -52.33982849121094, + "logps/rejected": -66.66035461425781, + "loss": 0.6939, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.07922625541687, + "rewards/margins": 5.038820743560791, + "rewards/rejected": -1.9595943689346313, + "step": 6995 + }, + { + "epoch": 1.75, + "grad_norm": 5.046191215515137, + "learning_rate": 7.269077054721047e-06, + "logits/chosen": -0.4165842533111572, + "logits/rejected": -0.4909752905368805, + "logps/chosen": -52.94425582885742, + "logps/rejected": -80.2314453125, + "loss": 0.7931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8337512016296387, + "rewards/margins": 5.201400279998779, + "rewards/rejected": -2.3676490783691406, + "step": 6996 + }, + { + "epoch": 1.75, + "grad_norm": 3.834355592727661, + "learning_rate": 7.268376636457854e-06, + "logits/chosen": -0.37995585799217224, + "logits/rejected": -0.46990376710891724, + "logps/chosen": -57.602264404296875, + "logps/rejected": -81.51770782470703, + "loss": 0.6398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9182159900665283, + "rewards/margins": 4.5993452072143555, + "rewards/rejected": -1.6811292171478271, + "step": 6997 + }, + { + "epoch": 1.75, + "grad_norm": 7.460075855255127, + "learning_rate": 7.267676162140662e-06, + "logits/chosen": -0.41229891777038574, + "logits/rejected": -0.4493004083633423, + "logps/chosen": -61.810325622558594, + "logps/rejected": -94.59484100341797, + "loss": 0.8946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.809209108352661, + "rewards/margins": 4.488654136657715, + "rewards/rejected": -1.6794450283050537, + "step": 6998 + }, + { + "epoch": 1.75, + "grad_norm": 4.6904449462890625, + "learning_rate": 7.266975631786779e-06, + "logits/chosen": -0.3950845003128052, + "logits/rejected": -0.5082176923751831, + "logps/chosen": -49.17845916748047, + "logps/rejected": -82.00696563720703, + "loss": 0.791, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.994689464569092, + "rewards/margins": 4.427758693695068, + "rewards/rejected": -1.4330694675445557, + "step": 6999 + }, + { + "epoch": 1.75, + "grad_norm": 3.5058131217956543, + "learning_rate": 7.266275045413517e-06, + "logits/chosen": -0.49839913845062256, + "logits/rejected": -0.5957516431808472, + "logps/chosen": -53.5555419921875, + "logps/rejected": -86.19108581542969, + "loss": 0.6553, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.697789192199707, + "rewards/margins": 5.298080921173096, + "rewards/rejected": -2.6002917289733887, + "step": 7000 + }, + { + "epoch": 1.75, + "grad_norm": 7.2757649421691895, + "learning_rate": 7.265574403038189e-06, + "logits/chosen": -0.46262264251708984, + "logits/rejected": -0.565030574798584, + "logps/chosen": -54.38676071166992, + "logps/rejected": -91.77107238769531, + "loss": 0.7373, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9392261505126953, + "rewards/margins": 5.587489604949951, + "rewards/rejected": -2.6482632160186768, + "step": 7001 + }, + { + "epoch": 1.75, + "grad_norm": 9.19705867767334, + "learning_rate": 7.264873704678107e-06, + "logits/chosen": -0.4230955243110657, + "logits/rejected": -0.5133436322212219, + "logps/chosen": -54.75910568237305, + "logps/rejected": -84.88179016113281, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9752795696258545, + "rewards/margins": 5.106071949005127, + "rewards/rejected": -2.1307926177978516, + "step": 7002 + }, + { + "epoch": 1.75, + "grad_norm": 7.677887916564941, + "learning_rate": 7.264172950350586e-06, + "logits/chosen": -0.4212714731693268, + "logits/rejected": -0.49838411808013916, + "logps/chosen": -46.33831024169922, + "logps/rejected": -85.00439453125, + "loss": 0.6731, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0890541076660156, + "rewards/margins": 5.13861083984375, + "rewards/rejected": -2.0495567321777344, + "step": 7003 + }, + { + "epoch": 1.75, + "grad_norm": 4.490819454193115, + "learning_rate": 7.263472140072945e-06, + "logits/chosen": -0.3424608111381531, + "logits/rejected": -0.37291958928108215, + "logps/chosen": -57.038002014160156, + "logps/rejected": -106.57684326171875, + "loss": 0.6625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.816580295562744, + "rewards/margins": 6.013538837432861, + "rewards/rejected": -3.196958541870117, + "step": 7004 + }, + { + "epoch": 1.75, + "grad_norm": 3.299359083175659, + "learning_rate": 7.2627712738624985e-06, + "logits/chosen": -0.48559942841529846, + "logits/rejected": -0.5364293456077576, + "logps/chosen": -57.50507354736328, + "logps/rejected": -94.16229248046875, + "loss": 0.7211, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0252957344055176, + "rewards/margins": 5.374921798706055, + "rewards/rejected": -2.349625825881958, + "step": 7005 + }, + { + "epoch": 1.75, + "grad_norm": 9.809962272644043, + "learning_rate": 7.262070351736566e-06, + "logits/chosen": -0.43574759364128113, + "logits/rejected": -0.540617823600769, + "logps/chosen": -48.81597900390625, + "logps/rejected": -74.46963500976562, + "loss": 0.6862, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.977555274963379, + "rewards/margins": 5.356651782989502, + "rewards/rejected": -2.379096269607544, + "step": 7006 + }, + { + "epoch": 1.75, + "grad_norm": 8.248181343078613, + "learning_rate": 7.261369373712473e-06, + "logits/chosen": -0.45180627703666687, + "logits/rejected": -0.5694021582603455, + "logps/chosen": -64.94886779785156, + "logps/rejected": -72.59927368164062, + "loss": 0.7612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8571670055389404, + "rewards/margins": 4.912362098693848, + "rewards/rejected": -2.0551950931549072, + "step": 7007 + }, + { + "epoch": 1.75, + "grad_norm": 9.434757232666016, + "learning_rate": 7.260668339807534e-06, + "logits/chosen": -0.43118131160736084, + "logits/rejected": -0.4782854914665222, + "logps/chosen": -66.78193664550781, + "logps/rejected": -96.44930267333984, + "loss": 0.795, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8577306270599365, + "rewards/margins": 5.6208086013793945, + "rewards/rejected": -2.763077735900879, + "step": 7008 + }, + { + "epoch": 1.75, + "grad_norm": 32.254093170166016, + "learning_rate": 7.259967250039076e-06, + "logits/chosen": -0.39566659927368164, + "logits/rejected": -0.4292346239089966, + "logps/chosen": -59.32423400878906, + "logps/rejected": -90.52339172363281, + "loss": 1.0113, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.0382766723632812, + "rewards/margins": 3.7607474327087402, + "rewards/rejected": -0.7224701642990112, + "step": 7009 + }, + { + "epoch": 1.75, + "grad_norm": 5.554424285888672, + "learning_rate": 7.259266104424425e-06, + "logits/chosen": -0.4061730206012726, + "logits/rejected": -0.5014637112617493, + "logps/chosen": -50.59303665161133, + "logps/rejected": -83.22610473632812, + "loss": 0.5981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.957186698913574, + "rewards/margins": 5.546144008636475, + "rewards/rejected": -2.588956832885742, + "step": 7010 + }, + { + "epoch": 1.75, + "grad_norm": 3.665449857711792, + "learning_rate": 7.258564902980905e-06, + "logits/chosen": -0.41841036081314087, + "logits/rejected": -0.47792762517929077, + "logps/chosen": -59.49454879760742, + "logps/rejected": -82.84053039550781, + "loss": 0.7258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1764626502990723, + "rewards/margins": 5.700938701629639, + "rewards/rejected": -2.524475574493408, + "step": 7011 + }, + { + "epoch": 1.75, + "grad_norm": 5.131557941436768, + "learning_rate": 7.257863645725843e-06, + "logits/chosen": -0.3749871551990509, + "logits/rejected": -0.4364039897918701, + "logps/chosen": -56.285377502441406, + "logps/rejected": -93.82322692871094, + "loss": 0.751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1468350887298584, + "rewards/margins": 4.431824207305908, + "rewards/rejected": -1.2849892377853394, + "step": 7012 + }, + { + "epoch": 1.75, + "grad_norm": 3.40179705619812, + "learning_rate": 7.257162332676568e-06, + "logits/chosen": -0.3529566526412964, + "logits/rejected": -0.4779554605484009, + "logps/chosen": -60.32630157470703, + "logps/rejected": -95.29341888427734, + "loss": 0.684, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7717905044555664, + "rewards/margins": 4.683955669403076, + "rewards/rejected": -1.9121650457382202, + "step": 7013 + }, + { + "epoch": 1.75, + "grad_norm": 6.233535289764404, + "learning_rate": 7.2564609638504125e-06, + "logits/chosen": -0.4171885550022125, + "logits/rejected": -0.4608256220817566, + "logps/chosen": -52.09590148925781, + "logps/rejected": -84.04349517822266, + "loss": 0.9113, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5992677211761475, + "rewards/margins": 3.809633255004883, + "rewards/rejected": -1.2103654146194458, + "step": 7014 + }, + { + "epoch": 1.75, + "grad_norm": 5.602084636688232, + "learning_rate": 7.255759539264706e-06, + "logits/chosen": -0.4248467981815338, + "logits/rejected": -0.49725398421287537, + "logps/chosen": -62.32362747192383, + "logps/rejected": -90.76019287109375, + "loss": 0.853, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7707390785217285, + "rewards/margins": 4.347222805023193, + "rewards/rejected": -1.5764836072921753, + "step": 7015 + }, + { + "epoch": 1.76, + "grad_norm": 6.965537071228027, + "learning_rate": 7.255058058936783e-06, + "logits/chosen": -0.45265811681747437, + "logits/rejected": -0.5350918769836426, + "logps/chosen": -56.07197189331055, + "logps/rejected": -81.82009887695312, + "loss": 0.8947, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.098295211791992, + "rewards/margins": 5.294368743896484, + "rewards/rejected": -2.1960737705230713, + "step": 7016 + }, + { + "epoch": 1.76, + "grad_norm": 3.1068224906921387, + "learning_rate": 7.254356522883976e-06, + "logits/chosen": -0.48518475890159607, + "logits/rejected": -0.5511257648468018, + "logps/chosen": -51.94605255126953, + "logps/rejected": -100.74788665771484, + "loss": 0.641, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.166867971420288, + "rewards/margins": 5.653286933898926, + "rewards/rejected": -2.4864187240600586, + "step": 7017 + }, + { + "epoch": 1.76, + "grad_norm": 4.444538116455078, + "learning_rate": 7.253654931123622e-06, + "logits/chosen": -0.406101256608963, + "logits/rejected": -0.46791255474090576, + "logps/chosen": -48.960941314697266, + "logps/rejected": -79.03561401367188, + "loss": 0.6803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.850114345550537, + "rewards/margins": 4.159799098968506, + "rewards/rejected": -1.3096849918365479, + "step": 7018 + }, + { + "epoch": 1.76, + "grad_norm": 3.244755506515503, + "learning_rate": 7.2529532836730565e-06, + "logits/chosen": -0.35832419991493225, + "logits/rejected": -0.4650344252586365, + "logps/chosen": -58.424720764160156, + "logps/rejected": -83.04257202148438, + "loss": 0.5724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0709266662597656, + "rewards/margins": 5.324738025665283, + "rewards/rejected": -2.253811836242676, + "step": 7019 + }, + { + "epoch": 1.76, + "grad_norm": 3.5162391662597656, + "learning_rate": 7.252251580549619e-06, + "logits/chosen": -0.4828721880912781, + "logits/rejected": -0.5835031270980835, + "logps/chosen": -48.22075271606445, + "logps/rejected": -82.82858276367188, + "loss": 0.6451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0654001235961914, + "rewards/margins": 5.777402400970459, + "rewards/rejected": -2.7120022773742676, + "step": 7020 + }, + { + "epoch": 1.76, + "grad_norm": 2.892113447189331, + "learning_rate": 7.25154982177065e-06, + "logits/chosen": -0.4715784192085266, + "logits/rejected": -0.5950756072998047, + "logps/chosen": -59.56791305541992, + "logps/rejected": -101.72537231445312, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.859297752380371, + "rewards/margins": 6.654330253601074, + "rewards/rejected": -3.7950332164764404, + "step": 7021 + }, + { + "epoch": 1.76, + "grad_norm": 7.393449306488037, + "learning_rate": 7.25084800735349e-06, + "logits/chosen": -0.32250311970710754, + "logits/rejected": -0.48708802461624146, + "logps/chosen": -70.53094482421875, + "logps/rejected": -77.24473571777344, + "loss": 0.7708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.917980909347534, + "rewards/margins": 5.278632164001465, + "rewards/rejected": -2.3606514930725098, + "step": 7022 + }, + { + "epoch": 1.76, + "grad_norm": 12.961132049560547, + "learning_rate": 7.250146137315481e-06, + "logits/chosen": -0.36399272084236145, + "logits/rejected": -0.4372824430465698, + "logps/chosen": -53.19032669067383, + "logps/rejected": -103.49562072753906, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.800591468811035, + "rewards/margins": 5.914275169372559, + "rewards/rejected": -3.1136837005615234, + "step": 7023 + }, + { + "epoch": 1.76, + "grad_norm": 5.336715221405029, + "learning_rate": 7.24944421167397e-06, + "logits/chosen": -0.4844309985637665, + "logits/rejected": -0.5806013941764832, + "logps/chosen": -45.8580436706543, + "logps/rejected": -79.39009094238281, + "loss": 0.6606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8708441257476807, + "rewards/margins": 4.657855987548828, + "rewards/rejected": -1.7870121002197266, + "step": 7024 + }, + { + "epoch": 1.76, + "grad_norm": 8.52254867553711, + "learning_rate": 7.248742230446297e-06, + "logits/chosen": -0.4485982656478882, + "logits/rejected": -0.48441851139068604, + "logps/chosen": -53.85626983642578, + "logps/rejected": -92.02229309082031, + "loss": 0.731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.393650770187378, + "rewards/margins": 4.217764854431152, + "rewards/rejected": -0.8241140842437744, + "step": 7025 + }, + { + "epoch": 1.76, + "grad_norm": 5.81986141204834, + "learning_rate": 7.248040193649813e-06, + "logits/chosen": -0.3997417688369751, + "logits/rejected": -0.5248607397079468, + "logps/chosen": -56.84272384643555, + "logps/rejected": -89.16522979736328, + "loss": 0.6597, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0109939575195312, + "rewards/margins": 6.378061771392822, + "rewards/rejected": -3.3670685291290283, + "step": 7026 + }, + { + "epoch": 1.76, + "grad_norm": 6.170675277709961, + "learning_rate": 7.247338101301864e-06, + "logits/chosen": -0.45224082469940186, + "logits/rejected": -0.5588991641998291, + "logps/chosen": -56.63233947753906, + "logps/rejected": -84.41494750976562, + "loss": 0.7254, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8759515285491943, + "rewards/margins": 5.235317230224609, + "rewards/rejected": -2.359365463256836, + "step": 7027 + }, + { + "epoch": 1.76, + "grad_norm": 5.833561420440674, + "learning_rate": 7.2466359534198e-06, + "logits/chosen": -0.5022332668304443, + "logits/rejected": -0.5371310710906982, + "logps/chosen": -59.259666442871094, + "logps/rejected": -100.11473083496094, + "loss": 0.7236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.648730516433716, + "rewards/margins": 5.330448150634766, + "rewards/rejected": -2.68171763420105, + "step": 7028 + }, + { + "epoch": 1.76, + "grad_norm": 6.474180221557617, + "learning_rate": 7.245933750020973e-06, + "logits/chosen": -0.42722275853157043, + "logits/rejected": -0.5437420606613159, + "logps/chosen": -62.33120346069336, + "logps/rejected": -79.14944458007812, + "loss": 0.7604, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.932516098022461, + "rewards/margins": 5.943498134613037, + "rewards/rejected": -3.0109822750091553, + "step": 7029 + }, + { + "epoch": 1.76, + "grad_norm": 5.945548057556152, + "learning_rate": 7.245231491122733e-06, + "logits/chosen": -0.44141092896461487, + "logits/rejected": -0.4209604263305664, + "logps/chosen": -51.8194465637207, + "logps/rejected": -107.37164306640625, + "loss": 0.6584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1717443466186523, + "rewards/margins": 5.484145641326904, + "rewards/rejected": -2.3124008178710938, + "step": 7030 + }, + { + "epoch": 1.76, + "grad_norm": 8.505882263183594, + "learning_rate": 7.244529176742435e-06, + "logits/chosen": -0.40187156200408936, + "logits/rejected": -0.47603508830070496, + "logps/chosen": -56.46887969970703, + "logps/rejected": -90.5465087890625, + "loss": 0.6773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1054904460906982, + "rewards/margins": 5.55678129196167, + "rewards/rejected": -2.451291084289551, + "step": 7031 + }, + { + "epoch": 1.76, + "grad_norm": 5.936328411102295, + "learning_rate": 7.243826806897434e-06, + "logits/chosen": -0.4398975074291229, + "logits/rejected": -0.4693877100944519, + "logps/chosen": -54.92354202270508, + "logps/rejected": -94.88236999511719, + "loss": 0.791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9579780101776123, + "rewards/margins": 4.5449419021606445, + "rewards/rejected": -1.5869635343551636, + "step": 7032 + }, + { + "epoch": 1.76, + "grad_norm": 4.059300899505615, + "learning_rate": 7.243124381605087e-06, + "logits/chosen": -0.4697747826576233, + "logits/rejected": -0.5857342481613159, + "logps/chosen": -56.83745193481445, + "logps/rejected": -83.52894592285156, + "loss": 0.6621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0307445526123047, + "rewards/margins": 5.211110591888428, + "rewards/rejected": -2.180366039276123, + "step": 7033 + }, + { + "epoch": 1.76, + "grad_norm": 18.10167121887207, + "learning_rate": 7.242421900882748e-06, + "logits/chosen": -0.4533960223197937, + "logits/rejected": -0.5457442998886108, + "logps/chosen": -51.269805908203125, + "logps/rejected": -122.1720962524414, + "loss": 0.6734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.791425943374634, + "rewards/margins": 7.376039505004883, + "rewards/rejected": -4.58461332321167, + "step": 7034 + }, + { + "epoch": 1.76, + "grad_norm": 6.156564235687256, + "learning_rate": 7.241719364747781e-06, + "logits/chosen": -0.4694269299507141, + "logits/rejected": -0.5469663143157959, + "logps/chosen": -65.9290771484375, + "logps/rejected": -85.26980590820312, + "loss": 0.8473, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.634140729904175, + "rewards/margins": 4.531349182128906, + "rewards/rejected": -1.8972084522247314, + "step": 7035 + }, + { + "epoch": 1.76, + "grad_norm": 4.043631553649902, + "learning_rate": 7.241016773217544e-06, + "logits/chosen": -0.3574713468551636, + "logits/rejected": -0.45425087213516235, + "logps/chosen": -69.28903198242188, + "logps/rejected": -105.41629791259766, + "loss": 0.6626, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8128504753112793, + "rewards/margins": 5.293709754943848, + "rewards/rejected": -2.4808595180511475, + "step": 7036 + }, + { + "epoch": 1.76, + "grad_norm": 3.5046472549438477, + "learning_rate": 7.240314126309398e-06, + "logits/chosen": -0.4363303482532501, + "logits/rejected": -0.49863892793655396, + "logps/chosen": -57.0477294921875, + "logps/rejected": -95.89171600341797, + "loss": 0.6088, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.833226203918457, + "rewards/margins": 5.443948745727539, + "rewards/rejected": -2.610722303390503, + "step": 7037 + }, + { + "epoch": 1.76, + "grad_norm": 7.491105079650879, + "learning_rate": 7.239611424040707e-06, + "logits/chosen": -0.3981698751449585, + "logits/rejected": -0.5163748860359192, + "logps/chosen": -59.65431594848633, + "logps/rejected": -94.66395568847656, + "loss": 0.6798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8304927349090576, + "rewards/margins": 5.83032751083374, + "rewards/rejected": -2.9998345375061035, + "step": 7038 + }, + { + "epoch": 1.76, + "grad_norm": 10.34807014465332, + "learning_rate": 7.238908666428837e-06, + "logits/chosen": -0.5453779101371765, + "logits/rejected": -0.6235840320587158, + "logps/chosen": -67.511474609375, + "logps/rejected": -102.27632904052734, + "loss": 0.8399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.180644989013672, + "rewards/margins": 6.148890495300293, + "rewards/rejected": -2.968245506286621, + "step": 7039 + }, + { + "epoch": 1.76, + "grad_norm": 18.969839096069336, + "learning_rate": 7.238205853491149e-06, + "logits/chosen": -0.37089255452156067, + "logits/rejected": -0.4256734549999237, + "logps/chosen": -56.6524772644043, + "logps/rejected": -88.62409973144531, + "loss": 1.1263, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7076525688171387, + "rewards/margins": 3.5573301315307617, + "rewards/rejected": -0.8496776819229126, + "step": 7040 + }, + { + "epoch": 1.76, + "grad_norm": 7.649484157562256, + "learning_rate": 7.237502985245018e-06, + "logits/chosen": -0.426133394241333, + "logits/rejected": -0.5451055765151978, + "logps/chosen": -62.28546142578125, + "logps/rejected": -86.22052001953125, + "loss": 0.8037, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9006052017211914, + "rewards/margins": 5.608397960662842, + "rewards/rejected": -2.7077927589416504, + "step": 7041 + }, + { + "epoch": 1.76, + "grad_norm": 11.165205001831055, + "learning_rate": 7.236800061707806e-06, + "logits/chosen": -0.49322858452796936, + "logits/rejected": -0.578438401222229, + "logps/chosen": -51.834434509277344, + "logps/rejected": -93.15770721435547, + "loss": 0.7062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.830667734146118, + "rewards/margins": 5.730502128601074, + "rewards/rejected": -2.899834156036377, + "step": 7042 + }, + { + "epoch": 1.76, + "grad_norm": 7.7846479415893555, + "learning_rate": 7.2360970828968845e-06, + "logits/chosen": -0.4495360255241394, + "logits/rejected": -0.5537338256835938, + "logps/chosen": -55.49754333496094, + "logps/rejected": -90.91409301757812, + "loss": 0.6293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.042041778564453, + "rewards/margins": 5.959632396697998, + "rewards/rejected": -2.917590379714966, + "step": 7043 + }, + { + "epoch": 1.76, + "grad_norm": 3.010749101638794, + "learning_rate": 7.235394048829627e-06, + "logits/chosen": -0.3795807659626007, + "logits/rejected": -0.49000415205955505, + "logps/chosen": -56.72433090209961, + "logps/rejected": -83.92797088623047, + "loss": 0.6972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7210702896118164, + "rewards/margins": 5.1011810302734375, + "rewards/rejected": -2.3801112174987793, + "step": 7044 + }, + { + "epoch": 1.76, + "grad_norm": 4.488847255706787, + "learning_rate": 7.234690959523405e-06, + "logits/chosen": -0.4485664963722229, + "logits/rejected": -0.5584481954574585, + "logps/chosen": -47.0526123046875, + "logps/rejected": -79.92298889160156, + "loss": 0.6204, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0078587532043457, + "rewards/margins": 5.4087042808532715, + "rewards/rejected": -2.400846004486084, + "step": 7045 + }, + { + "epoch": 1.76, + "grad_norm": 3.7234816551208496, + "learning_rate": 7.233987814995592e-06, + "logits/chosen": -0.4618702530860901, + "logits/rejected": -0.514923095703125, + "logps/chosen": -55.2659797668457, + "logps/rejected": -105.65726470947266, + "loss": 0.6549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2589995861053467, + "rewards/margins": 5.245234966278076, + "rewards/rejected": -1.9862357378005981, + "step": 7046 + }, + { + "epoch": 1.76, + "grad_norm": 3.5021305084228516, + "learning_rate": 7.233284615263564e-06, + "logits/chosen": -0.5267176032066345, + "logits/rejected": -0.6349626779556274, + "logps/chosen": -49.44986343383789, + "logps/rejected": -97.9170913696289, + "loss": 0.6406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.004364013671875, + "rewards/margins": 6.273618221282959, + "rewards/rejected": -3.269254446029663, + "step": 7047 + }, + { + "epoch": 1.76, + "grad_norm": 2.6222243309020996, + "learning_rate": 7.2325813603446985e-06, + "logits/chosen": -0.4792279303073883, + "logits/rejected": -0.6275454163551331, + "logps/chosen": -57.30693435668945, + "logps/rejected": -81.88240814208984, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9200682640075684, + "rewards/margins": 5.996790885925293, + "rewards/rejected": -3.0767223834991455, + "step": 7048 + }, + { + "epoch": 1.76, + "grad_norm": 4.8234686851501465, + "learning_rate": 7.231878050256371e-06, + "logits/chosen": -0.42855948209762573, + "logits/rejected": -0.4613857865333557, + "logps/chosen": -67.92671203613281, + "logps/rejected": -113.50550842285156, + "loss": 0.7408, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8355510234832764, + "rewards/margins": 5.035515308380127, + "rewards/rejected": -2.1999642848968506, + "step": 7049 + }, + { + "epoch": 1.76, + "grad_norm": 5.828568935394287, + "learning_rate": 7.231174685015965e-06, + "logits/chosen": -0.4257236123085022, + "logits/rejected": -0.4682295322418213, + "logps/chosen": -55.310970306396484, + "logps/rejected": -100.40763854980469, + "loss": 0.716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8891563415527344, + "rewards/margins": 5.598033905029297, + "rewards/rejected": -2.7088780403137207, + "step": 7050 + }, + { + "epoch": 1.76, + "grad_norm": 14.66893196105957, + "learning_rate": 7.230471264640857e-06, + "logits/chosen": -0.48295098543167114, + "logits/rejected": -0.572595477104187, + "logps/chosen": -61.29148864746094, + "logps/rejected": -92.57203674316406, + "loss": 0.8047, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.747445583343506, + "rewards/margins": 4.626864433288574, + "rewards/rejected": -1.8794190883636475, + "step": 7051 + }, + { + "epoch": 1.76, + "grad_norm": 6.563952445983887, + "learning_rate": 7.229767789148434e-06, + "logits/chosen": -0.4127802848815918, + "logits/rejected": -0.49114474654197693, + "logps/chosen": -56.299495697021484, + "logps/rejected": -86.94434356689453, + "loss": 0.712, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.018998861312866, + "rewards/margins": 5.371180534362793, + "rewards/rejected": -2.3521809577941895, + "step": 7052 + }, + { + "epoch": 1.76, + "grad_norm": 3.5498104095458984, + "learning_rate": 7.229064258556077e-06, + "logits/chosen": -0.33074092864990234, + "logits/rejected": -0.40693122148513794, + "logps/chosen": -52.757118225097656, + "logps/rejected": -90.94001770019531, + "loss": 0.6498, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0809409618377686, + "rewards/margins": 4.277230262756348, + "rewards/rejected": -1.1962894201278687, + "step": 7053 + }, + { + "epoch": 1.76, + "grad_norm": 3.943847894668579, + "learning_rate": 7.22836067288117e-06, + "logits/chosen": -0.44997119903564453, + "logits/rejected": -0.5593067407608032, + "logps/chosen": -45.47263717651367, + "logps/rejected": -84.95523071289062, + "loss": 0.621, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046481132507324, + "rewards/margins": 6.682024955749512, + "rewards/rejected": -3.6355438232421875, + "step": 7054 + }, + { + "epoch": 1.76, + "grad_norm": 4.985090255737305, + "learning_rate": 7.227657032141101e-06, + "logits/chosen": -0.35259658098220825, + "logits/rejected": -0.4304504692554474, + "logps/chosen": -59.85979461669922, + "logps/rejected": -88.46920776367188, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9996719360351562, + "rewards/margins": 5.377030372619629, + "rewards/rejected": -2.3773581981658936, + "step": 7055 + }, + { + "epoch": 1.77, + "grad_norm": 12.108915328979492, + "learning_rate": 7.226953336353257e-06, + "logits/chosen": -0.42209869623184204, + "logits/rejected": -0.42324769496917725, + "logps/chosen": -53.403018951416016, + "logps/rejected": -87.80043029785156, + "loss": 0.8062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7349278926849365, + "rewards/margins": 3.8413138389587402, + "rewards/rejected": -1.1063859462738037, + "step": 7056 + }, + { + "epoch": 1.77, + "grad_norm": 5.572956085205078, + "learning_rate": 7.226249585535028e-06, + "logits/chosen": -0.4897410571575165, + "logits/rejected": -0.5560590028762817, + "logps/chosen": -53.04895782470703, + "logps/rejected": -98.67955780029297, + "loss": 0.7124, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.805145740509033, + "rewards/margins": 5.690751075744629, + "rewards/rejected": -2.885605573654175, + "step": 7057 + }, + { + "epoch": 1.77, + "grad_norm": 8.498973846435547, + "learning_rate": 7.225545779703805e-06, + "logits/chosen": -0.44856831431388855, + "logits/rejected": -0.5426003932952881, + "logps/chosen": -52.52739715576172, + "logps/rejected": -80.61783599853516, + "loss": 0.8117, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.050182580947876, + "rewards/margins": 4.940027236938477, + "rewards/rejected": -1.889844536781311, + "step": 7058 + }, + { + "epoch": 1.77, + "grad_norm": 7.565258026123047, + "learning_rate": 7.224841918876976e-06, + "logits/chosen": -0.5117689371109009, + "logits/rejected": -0.5388730764389038, + "logps/chosen": -50.10354232788086, + "logps/rejected": -98.12376403808594, + "loss": 0.7762, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2023072242736816, + "rewards/margins": 5.179978370666504, + "rewards/rejected": -1.9776711463928223, + "step": 7059 + }, + { + "epoch": 1.77, + "grad_norm": 2.9272167682647705, + "learning_rate": 7.2241380030719376e-06, + "logits/chosen": -0.42456814646720886, + "logits/rejected": -0.5592480301856995, + "logps/chosen": -51.189476013183594, + "logps/rejected": -89.69287872314453, + "loss": 0.5742, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3804123401641846, + "rewards/margins": 6.229170799255371, + "rewards/rejected": -2.848759412765503, + "step": 7060 + }, + { + "epoch": 1.77, + "grad_norm": 3.5053136348724365, + "learning_rate": 7.223434032306084e-06, + "logits/chosen": -0.49989598989486694, + "logits/rejected": -0.6191065907478333, + "logps/chosen": -50.17376708984375, + "logps/rejected": -92.94258880615234, + "loss": 0.6127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874640703201294, + "rewards/margins": 6.048366069793701, + "rewards/rejected": -3.173725128173828, + "step": 7061 + }, + { + "epoch": 1.77, + "grad_norm": 8.670453071594238, + "learning_rate": 7.222730006596811e-06, + "logits/chosen": -0.41817766427993774, + "logits/rejected": -0.45445898175239563, + "logps/chosen": -59.313514709472656, + "logps/rejected": -88.9369888305664, + "loss": 0.8331, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.026169538497925, + "rewards/margins": 3.7326865196228027, + "rewards/rejected": -0.7065169811248779, + "step": 7062 + }, + { + "epoch": 1.77, + "grad_norm": 7.926827430725098, + "learning_rate": 7.222025925961513e-06, + "logits/chosen": -0.4176119267940521, + "logits/rejected": -0.49383777379989624, + "logps/chosen": -58.19935989379883, + "logps/rejected": -95.75459289550781, + "loss": 0.715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.816167116165161, + "rewards/margins": 5.370584011077881, + "rewards/rejected": -2.554417133331299, + "step": 7063 + }, + { + "epoch": 1.77, + "grad_norm": 4.46509313583374, + "learning_rate": 7.2213217904175934e-06, + "logits/chosen": -0.4167037904262543, + "logits/rejected": -0.46690675616264343, + "logps/chosen": -47.184356689453125, + "logps/rejected": -89.62251281738281, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8974733352661133, + "rewards/margins": 4.900555610656738, + "rewards/rejected": -2.003081798553467, + "step": 7064 + }, + { + "epoch": 1.77, + "grad_norm": 9.80537223815918, + "learning_rate": 7.220617599982449e-06, + "logits/chosen": -0.395845890045166, + "logits/rejected": -0.3783109486103058, + "logps/chosen": -51.18688201904297, + "logps/rejected": -95.39735412597656, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.009594440460205, + "rewards/margins": 4.848039627075195, + "rewards/rejected": -1.8384456634521484, + "step": 7065 + }, + { + "epoch": 1.77, + "grad_norm": 4.284572601318359, + "learning_rate": 7.219913354673481e-06, + "logits/chosen": -0.44923731684684753, + "logits/rejected": -0.5034897327423096, + "logps/chosen": -57.4870719909668, + "logps/rejected": -82.99595642089844, + "loss": 0.7353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8904294967651367, + "rewards/margins": 4.189934730529785, + "rewards/rejected": -1.2995057106018066, + "step": 7066 + }, + { + "epoch": 1.77, + "grad_norm": 5.074726581573486, + "learning_rate": 7.219209054508094e-06, + "logits/chosen": -0.3764250576496124, + "logits/rejected": -0.4238954484462738, + "logps/chosen": -49.227962493896484, + "logps/rejected": -83.05977630615234, + "loss": 0.7324, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8680455684661865, + "rewards/margins": 4.492044925689697, + "rewards/rejected": -1.6239991188049316, + "step": 7067 + }, + { + "epoch": 1.77, + "grad_norm": 6.074094295501709, + "learning_rate": 7.21850469950369e-06, + "logits/chosen": -0.394826740026474, + "logits/rejected": -0.5062762498855591, + "logps/chosen": -67.57640838623047, + "logps/rejected": -92.63935089111328, + "loss": 0.7805, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.133021831512451, + "rewards/margins": 5.788383960723877, + "rewards/rejected": -2.655362606048584, + "step": 7068 + }, + { + "epoch": 1.77, + "grad_norm": 5.569338798522949, + "learning_rate": 7.217800289677674e-06, + "logits/chosen": -0.44992130994796753, + "logits/rejected": -0.5356232523918152, + "logps/chosen": -63.726810455322266, + "logps/rejected": -105.96844482421875, + "loss": 0.8257, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.756649971008301, + "rewards/margins": 5.221482276916504, + "rewards/rejected": -2.464832305908203, + "step": 7069 + }, + { + "epoch": 1.77, + "grad_norm": 5.669541835784912, + "learning_rate": 7.217095825047455e-06, + "logits/chosen": -0.4249439537525177, + "logits/rejected": -0.520296573638916, + "logps/chosen": -57.41355514526367, + "logps/rejected": -80.5779037475586, + "loss": 0.9103, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8700692653656006, + "rewards/margins": 4.439888000488281, + "rewards/rejected": -1.5698187351226807, + "step": 7070 + }, + { + "epoch": 1.77, + "grad_norm": 2.8483240604400635, + "learning_rate": 7.216391305630442e-06, + "logits/chosen": -0.358076274394989, + "logits/rejected": -0.5170539617538452, + "logps/chosen": -63.338932037353516, + "logps/rejected": -90.94905853271484, + "loss": 0.6044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8261821269989014, + "rewards/margins": 6.563667297363281, + "rewards/rejected": -3.737485408782959, + "step": 7071 + }, + { + "epoch": 1.77, + "grad_norm": 12.156638145446777, + "learning_rate": 7.21568673144404e-06, + "logits/chosen": -0.4444565773010254, + "logits/rejected": -0.5025782585144043, + "logps/chosen": -54.14605712890625, + "logps/rejected": -87.16234588623047, + "loss": 0.8079, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.786330223083496, + "rewards/margins": 4.546696186065674, + "rewards/rejected": -1.7603660821914673, + "step": 7072 + }, + { + "epoch": 1.77, + "grad_norm": 6.3557353019714355, + "learning_rate": 7.2149821025056635e-06, + "logits/chosen": -0.4055826663970947, + "logits/rejected": -0.3762046694755554, + "logps/chosen": -49.32778549194336, + "logps/rejected": -88.91212463378906, + "loss": 0.727, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108717918395996, + "rewards/margins": 4.214159965515137, + "rewards/rejected": -1.1054424047470093, + "step": 7073 + }, + { + "epoch": 1.77, + "grad_norm": 7.852197170257568, + "learning_rate": 7.214277418832721e-06, + "logits/chosen": -0.4640974998474121, + "logits/rejected": -0.564591109752655, + "logps/chosen": -60.170692443847656, + "logps/rejected": -77.30830383300781, + "loss": 0.8346, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9038681983947754, + "rewards/margins": 4.371044635772705, + "rewards/rejected": -1.4671757221221924, + "step": 7074 + }, + { + "epoch": 1.77, + "grad_norm": 5.018550395965576, + "learning_rate": 7.21357268044263e-06, + "logits/chosen": -0.4258066415786743, + "logits/rejected": -0.48473384976387024, + "logps/chosen": -53.38010787963867, + "logps/rejected": -89.16314697265625, + "loss": 0.7396, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.700253486633301, + "rewards/margins": 4.553073406219482, + "rewards/rejected": -1.8528199195861816, + "step": 7075 + }, + { + "epoch": 1.77, + "grad_norm": 3.156386613845825, + "learning_rate": 7.212867887352804e-06, + "logits/chosen": -0.42088577151298523, + "logits/rejected": -0.5079262256622314, + "logps/chosen": -54.12995147705078, + "logps/rejected": -103.35371398925781, + "loss": 0.6574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0237298011779785, + "rewards/margins": 6.7527174949646, + "rewards/rejected": -3.728987693786621, + "step": 7076 + }, + { + "epoch": 1.77, + "grad_norm": 3.656622886657715, + "learning_rate": 7.212163039580658e-06, + "logits/chosen": -0.348039448261261, + "logits/rejected": -0.39263415336608887, + "logps/chosen": -64.2587890625, + "logps/rejected": -95.71885681152344, + "loss": 0.8104, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.064432382583618, + "rewards/margins": 4.9265456199646, + "rewards/rejected": -1.8621138334274292, + "step": 7077 + }, + { + "epoch": 1.77, + "grad_norm": 3.9913887977600098, + "learning_rate": 7.21145813714361e-06, + "logits/chosen": -0.5078935623168945, + "logits/rejected": -0.5737920999526978, + "logps/chosen": -46.447792053222656, + "logps/rejected": -80.05253601074219, + "loss": 0.6797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8377082347869873, + "rewards/margins": 5.096042633056641, + "rewards/rejected": -2.258334159851074, + "step": 7078 + }, + { + "epoch": 1.77, + "grad_norm": 4.129585266113281, + "learning_rate": 7.21075318005908e-06, + "logits/chosen": -0.44413089752197266, + "logits/rejected": -0.5286306142807007, + "logps/chosen": -55.32251739501953, + "logps/rejected": -77.01266479492188, + "loss": 0.8228, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.918492078781128, + "rewards/margins": 4.923732280731201, + "rewards/rejected": -2.005239963531494, + "step": 7079 + }, + { + "epoch": 1.77, + "grad_norm": 4.304149150848389, + "learning_rate": 7.2100481683444864e-06, + "logits/chosen": -0.5098050832748413, + "logits/rejected": -0.5782732367515564, + "logps/chosen": -49.98033905029297, + "logps/rejected": -80.74990844726562, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.025315999984741, + "rewards/margins": 5.238265037536621, + "rewards/rejected": -2.21294903755188, + "step": 7080 + }, + { + "epoch": 1.77, + "grad_norm": 8.120352745056152, + "learning_rate": 7.209343102017252e-06, + "logits/chosen": -0.3933129608631134, + "logits/rejected": -0.47343116998672485, + "logps/chosen": -62.033599853515625, + "logps/rejected": -98.64618682861328, + "loss": 0.7142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8437702655792236, + "rewards/margins": 5.3987836837768555, + "rewards/rejected": -2.555013418197632, + "step": 7081 + }, + { + "epoch": 1.77, + "grad_norm": 5.457543849945068, + "learning_rate": 7.208637981094801e-06, + "logits/chosen": -0.45988935232162476, + "logits/rejected": -0.4969947040081024, + "logps/chosen": -53.97308349609375, + "logps/rejected": -90.66151428222656, + "loss": 0.7464, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9958620071411133, + "rewards/margins": 5.398841857910156, + "rewards/rejected": -2.4029791355133057, + "step": 7082 + }, + { + "epoch": 1.77, + "grad_norm": 6.644118309020996, + "learning_rate": 7.207932805594555e-06, + "logits/chosen": -0.41054004430770874, + "logits/rejected": -0.46648725867271423, + "logps/chosen": -56.80200958251953, + "logps/rejected": -93.18856048583984, + "loss": 0.8387, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6225080490112305, + "rewards/margins": 4.967055797576904, + "rewards/rejected": -2.3445475101470947, + "step": 7083 + }, + { + "epoch": 1.77, + "grad_norm": 26.509288787841797, + "learning_rate": 7.207227575533942e-06, + "logits/chosen": -0.4771893322467804, + "logits/rejected": -0.5869925618171692, + "logps/chosen": -54.642608642578125, + "logps/rejected": -77.05706787109375, + "loss": 0.8347, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.746032953262329, + "rewards/margins": 4.8905792236328125, + "rewards/rejected": -2.1445462703704834, + "step": 7084 + }, + { + "epoch": 1.77, + "grad_norm": 5.291846752166748, + "learning_rate": 7.2065222909303875e-06, + "logits/chosen": -0.4399176836013794, + "logits/rejected": -0.5037971138954163, + "logps/chosen": -48.538047790527344, + "logps/rejected": -91.69175720214844, + "loss": 0.7462, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.002098560333252, + "rewards/margins": 4.975981712341309, + "rewards/rejected": -1.9738831520080566, + "step": 7085 + }, + { + "epoch": 1.77, + "grad_norm": 6.4199090003967285, + "learning_rate": 7.2058169518013185e-06, + "logits/chosen": -0.40248045325279236, + "logits/rejected": -0.5106019377708435, + "logps/chosen": -67.20055389404297, + "logps/rejected": -92.07198333740234, + "loss": 0.7826, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.724917411804199, + "rewards/margins": 5.13620138168335, + "rewards/rejected": -2.4112839698791504, + "step": 7086 + }, + { + "epoch": 1.77, + "grad_norm": 7.255518436431885, + "learning_rate": 7.20511155816417e-06, + "logits/chosen": -0.4877418875694275, + "logits/rejected": -0.552970826625824, + "logps/chosen": -46.39531707763672, + "logps/rejected": -88.65249633789062, + "loss": 0.6442, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6462786197662354, + "rewards/margins": 5.344002723693848, + "rewards/rejected": -2.6977243423461914, + "step": 7087 + }, + { + "epoch": 1.77, + "grad_norm": 2.8493857383728027, + "learning_rate": 7.204406110036366e-06, + "logits/chosen": -0.3689786493778229, + "logits/rejected": -0.45292627811431885, + "logps/chosen": -52.47709274291992, + "logps/rejected": -84.68070983886719, + "loss": 0.646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2181851863861084, + "rewards/margins": 4.977375507354736, + "rewards/rejected": -1.759190559387207, + "step": 7088 + }, + { + "epoch": 1.77, + "grad_norm": 5.803257465362549, + "learning_rate": 7.203700607435345e-06, + "logits/chosen": -0.40971797704696655, + "logits/rejected": -0.5512715578079224, + "logps/chosen": -65.16582489013672, + "logps/rejected": -87.3157730102539, + "loss": 0.8109, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.480234146118164, + "rewards/margins": 5.753889560699463, + "rewards/rejected": -3.273655891418457, + "step": 7089 + }, + { + "epoch": 1.77, + "grad_norm": 7.263607025146484, + "learning_rate": 7.2029950503785364e-06, + "logits/chosen": -0.4432571530342102, + "logits/rejected": -0.5245199203491211, + "logps/chosen": -61.37223434448242, + "logps/rejected": -80.68766784667969, + "loss": 0.7434, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7824134826660156, + "rewards/margins": 4.1577324867248535, + "rewards/rejected": -1.375319242477417, + "step": 7090 + }, + { + "epoch": 1.77, + "grad_norm": 6.767544746398926, + "learning_rate": 7.202289438883376e-06, + "logits/chosen": -0.42342907190322876, + "logits/rejected": -0.5099321007728577, + "logps/chosen": -60.29057312011719, + "logps/rejected": -81.3083267211914, + "loss": 0.8431, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0353987216949463, + "rewards/margins": 4.050344467163086, + "rewards/rejected": -1.0149458646774292, + "step": 7091 + }, + { + "epoch": 1.77, + "grad_norm": 6.585180282592773, + "learning_rate": 7.201583772967303e-06, + "logits/chosen": -0.3893541395664215, + "logits/rejected": -0.4577586352825165, + "logps/chosen": -54.55397033691406, + "logps/rejected": -101.28938293457031, + "loss": 0.6473, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.895677089691162, + "rewards/margins": 5.214712142944336, + "rewards/rejected": -2.319035291671753, + "step": 7092 + }, + { + "epoch": 1.77, + "grad_norm": 12.550490379333496, + "learning_rate": 7.200878052647753e-06, + "logits/chosen": -0.46387553215026855, + "logits/rejected": -0.5273285508155823, + "logps/chosen": -61.341209411621094, + "logps/rejected": -105.2955093383789, + "loss": 0.8767, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6567180156707764, + "rewards/margins": 5.349749565124512, + "rewards/rejected": -2.693031072616577, + "step": 7093 + }, + { + "epoch": 1.77, + "grad_norm": 9.728899002075195, + "learning_rate": 7.2001722779421635e-06, + "logits/chosen": -0.41185134649276733, + "logits/rejected": -0.488088995218277, + "logps/chosen": -54.61439895629883, + "logps/rejected": -92.53662872314453, + "loss": 0.6465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.783470630645752, + "rewards/margins": 5.168573379516602, + "rewards/rejected": -2.385103225708008, + "step": 7094 + }, + { + "epoch": 1.77, + "grad_norm": 5.3554887771606445, + "learning_rate": 7.199466448867978e-06, + "logits/chosen": -0.3001156747341156, + "logits/rejected": -0.4147821068763733, + "logps/chosen": -58.040863037109375, + "logps/rejected": -94.90984344482422, + "loss": 0.6181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.710054397583008, + "rewards/margins": 6.422116756439209, + "rewards/rejected": -3.712062120437622, + "step": 7095 + }, + { + "epoch": 1.78, + "grad_norm": 10.68541431427002, + "learning_rate": 7.198760565442639e-06, + "logits/chosen": -0.4176684617996216, + "logits/rejected": -0.4820476174354553, + "logps/chosen": -64.52566528320312, + "logps/rejected": -91.27752685546875, + "loss": 0.945, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7669732570648193, + "rewards/margins": 4.759345054626465, + "rewards/rejected": -1.9923720359802246, + "step": 7096 + }, + { + "epoch": 1.78, + "grad_norm": 3.1501569747924805, + "learning_rate": 7.198054627683585e-06, + "logits/chosen": -0.369987428188324, + "logits/rejected": -0.43620988726615906, + "logps/chosen": -67.05125427246094, + "logps/rejected": -105.91523742675781, + "loss": 0.7135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0481605529785156, + "rewards/margins": 5.666055202484131, + "rewards/rejected": -2.6178948879241943, + "step": 7097 + }, + { + "epoch": 1.78, + "grad_norm": 3.1060705184936523, + "learning_rate": 7.1973486356082646e-06, + "logits/chosen": -0.4047277271747589, + "logits/rejected": -0.48920440673828125, + "logps/chosen": -59.03961181640625, + "logps/rejected": -101.36112976074219, + "loss": 0.6456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.905813217163086, + "rewards/margins": 5.279684066772461, + "rewards/rejected": -2.373870849609375, + "step": 7098 + }, + { + "epoch": 1.78, + "grad_norm": 2.088778257369995, + "learning_rate": 7.196642589234122e-06, + "logits/chosen": -0.3561795651912689, + "logits/rejected": -0.47394588589668274, + "logps/chosen": -50.915077209472656, + "logps/rejected": -78.26570129394531, + "loss": 0.5835, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.144956588745117, + "rewards/margins": 6.6195807456970215, + "rewards/rejected": -3.4746241569519043, + "step": 7099 + }, + { + "epoch": 1.78, + "grad_norm": 5.847982883453369, + "learning_rate": 7.195936488578603e-06, + "logits/chosen": -0.4123799204826355, + "logits/rejected": -0.4769696593284607, + "logps/chosen": -62.52842330932617, + "logps/rejected": -88.61725616455078, + "loss": 0.7761, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.841029167175293, + "rewards/margins": 4.992146968841553, + "rewards/rejected": -2.151118040084839, + "step": 7100 + }, + { + "epoch": 1.78, + "grad_norm": 4.924811363220215, + "learning_rate": 7.19523033365916e-06, + "logits/chosen": -0.4574507474899292, + "logits/rejected": -0.5528700947761536, + "logps/chosen": -68.44070434570312, + "logps/rejected": -67.88195037841797, + "loss": 0.8916, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0076887607574463, + "rewards/margins": 3.768125057220459, + "rewards/rejected": -0.7604361772537231, + "step": 7101 + }, + { + "epoch": 1.78, + "grad_norm": 4.768820762634277, + "learning_rate": 7.194524124493239e-06, + "logits/chosen": -0.4518693685531616, + "logits/rejected": -0.5617218613624573, + "logps/chosen": -53.62623596191406, + "logps/rejected": -95.95498657226562, + "loss": 0.7295, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.068573236465454, + "rewards/margins": 5.464200496673584, + "rewards/rejected": -2.395627498626709, + "step": 7102 + }, + { + "epoch": 1.78, + "grad_norm": 3.2630245685577393, + "learning_rate": 7.193817861098293e-06, + "logits/chosen": -0.4106283187866211, + "logits/rejected": -0.5218953490257263, + "logps/chosen": -52.434242248535156, + "logps/rejected": -82.16382598876953, + "loss": 0.6622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9156699180603027, + "rewards/margins": 5.110032558441162, + "rewards/rejected": -2.1943624019622803, + "step": 7103 + }, + { + "epoch": 1.78, + "grad_norm": 3.4265661239624023, + "learning_rate": 7.193111543491775e-06, + "logits/chosen": -0.40702396631240845, + "logits/rejected": -0.5605455040931702, + "logps/chosen": -48.630672454833984, + "logps/rejected": -69.7423095703125, + "loss": 0.58, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9286210536956787, + "rewards/margins": 5.297334671020508, + "rewards/rejected": -2.36871337890625, + "step": 7104 + }, + { + "epoch": 1.78, + "grad_norm": 4.706140041351318, + "learning_rate": 7.192405171691138e-06, + "logits/chosen": -0.4098236560821533, + "logits/rejected": -0.4304596483707428, + "logps/chosen": -44.44058609008789, + "logps/rejected": -95.00160217285156, + "loss": 0.6796, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0609917640686035, + "rewards/margins": 5.070389747619629, + "rewards/rejected": -2.0093979835510254, + "step": 7105 + }, + { + "epoch": 1.78, + "grad_norm": 5.212247848510742, + "learning_rate": 7.1916987457138354e-06, + "logits/chosen": -0.395392507314682, + "logits/rejected": -0.48244521021842957, + "logps/chosen": -56.874847412109375, + "logps/rejected": -75.81460571289062, + "loss": 0.6865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.037191867828369, + "rewards/margins": 4.876667022705078, + "rewards/rejected": -1.839475393295288, + "step": 7106 + }, + { + "epoch": 1.78, + "grad_norm": 5.456539154052734, + "learning_rate": 7.190992265577328e-06, + "logits/chosen": -0.3892195522785187, + "logits/rejected": -0.4749408960342407, + "logps/chosen": -71.01702880859375, + "logps/rejected": -86.21031188964844, + "loss": 0.7972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9867022037506104, + "rewards/margins": 4.489380836486816, + "rewards/rejected": -1.5026788711547852, + "step": 7107 + }, + { + "epoch": 1.78, + "grad_norm": 13.090361595153809, + "learning_rate": 7.190285731299072e-06, + "logits/chosen": -0.3891180455684662, + "logits/rejected": -0.492810994386673, + "logps/chosen": -49.441131591796875, + "logps/rejected": -75.46371459960938, + "loss": 0.8961, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.708798885345459, + "rewards/margins": 4.584232807159424, + "rewards/rejected": -1.8754335641860962, + "step": 7108 + }, + { + "epoch": 1.78, + "grad_norm": 3.1125190258026123, + "learning_rate": 7.189579142896522e-06, + "logits/chosen": -0.377172589302063, + "logits/rejected": -0.5030113458633423, + "logps/chosen": -64.08415222167969, + "logps/rejected": -79.76524353027344, + "loss": 0.6708, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9749343395233154, + "rewards/margins": 5.280363082885742, + "rewards/rejected": -2.3054282665252686, + "step": 7109 + }, + { + "epoch": 1.78, + "grad_norm": 5.834168910980225, + "learning_rate": 7.188872500387145e-06, + "logits/chosen": -0.4053381383419037, + "logits/rejected": -0.44546398520469666, + "logps/chosen": -61.94375228881836, + "logps/rejected": -93.4363784790039, + "loss": 0.8386, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.067389726638794, + "rewards/margins": 4.428606033325195, + "rewards/rejected": -1.3612161874771118, + "step": 7110 + }, + { + "epoch": 1.78, + "grad_norm": 5.867652416229248, + "learning_rate": 7.1881658037884e-06, + "logits/chosen": -0.44189873337745667, + "logits/rejected": -0.4887661635875702, + "logps/chosen": -51.576847076416016, + "logps/rejected": -75.94522094726562, + "loss": 0.7526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085866928100586, + "rewards/margins": 5.168807506561279, + "rewards/rejected": -2.082940101623535, + "step": 7111 + }, + { + "epoch": 1.78, + "grad_norm": 5.13467264175415, + "learning_rate": 7.18745905311775e-06, + "logits/chosen": -0.4029425382614136, + "logits/rejected": -0.5110697746276855, + "logps/chosen": -65.12737274169922, + "logps/rejected": -84.20307159423828, + "loss": 0.7903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.798785448074341, + "rewards/margins": 4.7657551765441895, + "rewards/rejected": -1.9669698476791382, + "step": 7112 + }, + { + "epoch": 1.78, + "grad_norm": 59.69529724121094, + "learning_rate": 7.186752248392661e-06, + "logits/chosen": -0.40259456634521484, + "logits/rejected": -0.5374258756637573, + "logps/chosen": -52.52265548706055, + "logps/rejected": -75.65219116210938, + "loss": 0.7783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.952829122543335, + "rewards/margins": 4.315462112426758, + "rewards/rejected": -1.3626331090927124, + "step": 7113 + }, + { + "epoch": 1.78, + "grad_norm": 3.9521384239196777, + "learning_rate": 7.186045389630598e-06, + "logits/chosen": -0.39435866475105286, + "logits/rejected": -0.4565322995185852, + "logps/chosen": -60.07586669921875, + "logps/rejected": -93.27070617675781, + "loss": 0.7082, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1018550395965576, + "rewards/margins": 5.207347393035889, + "rewards/rejected": -2.105492115020752, + "step": 7114 + }, + { + "epoch": 1.78, + "grad_norm": 3.4048845767974854, + "learning_rate": 7.1853384768490276e-06, + "logits/chosen": -0.49917930364608765, + "logits/rejected": -0.5833204388618469, + "logps/chosen": -60.73011779785156, + "logps/rejected": -81.12367248535156, + "loss": 0.6746, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1337218284606934, + "rewards/margins": 5.5583391189575195, + "rewards/rejected": -2.4246175289154053, + "step": 7115 + }, + { + "epoch": 1.78, + "grad_norm": 4.2150678634643555, + "learning_rate": 7.184631510065419e-06, + "logits/chosen": -0.3554178476333618, + "logits/rejected": -0.4746853709220886, + "logps/chosen": -51.25322723388672, + "logps/rejected": -77.6470718383789, + "loss": 0.6507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0675599575042725, + "rewards/margins": 4.9575419425964355, + "rewards/rejected": -1.8899821043014526, + "step": 7116 + }, + { + "epoch": 1.78, + "grad_norm": 4.487256050109863, + "learning_rate": 7.183924489297243e-06, + "logits/chosen": -0.5072322487831116, + "logits/rejected": -0.5951348543167114, + "logps/chosen": -41.845741271972656, + "logps/rejected": -84.90829467773438, + "loss": 0.6247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.119676351547241, + "rewards/margins": 6.253144264221191, + "rewards/rejected": -3.13346791267395, + "step": 7117 + }, + { + "epoch": 1.78, + "grad_norm": 4.069310665130615, + "learning_rate": 7.183217414561968e-06, + "logits/chosen": -0.3754054605960846, + "logits/rejected": -0.5152991414070129, + "logps/chosen": -46.14329147338867, + "logps/rejected": -82.17294311523438, + "loss": 0.5824, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118710517883301, + "rewards/margins": 6.172604560852051, + "rewards/rejected": -3.053894519805908, + "step": 7118 + }, + { + "epoch": 1.78, + "grad_norm": 6.926213264465332, + "learning_rate": 7.1825102858770715e-06, + "logits/chosen": -0.5256649255752563, + "logits/rejected": -0.6141672730445862, + "logps/chosen": -48.221256256103516, + "logps/rejected": -86.23417663574219, + "loss": 0.6283, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1241376399993896, + "rewards/margins": 5.658067226409912, + "rewards/rejected": -2.5339295864105225, + "step": 7119 + }, + { + "epoch": 1.78, + "grad_norm": 14.162788391113281, + "learning_rate": 7.181803103260021e-06, + "logits/chosen": -0.47612589597702026, + "logits/rejected": -0.5133199095726013, + "logps/chosen": -50.844669342041016, + "logps/rejected": -91.77037811279297, + "loss": 0.806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9058775901794434, + "rewards/margins": 4.540811538696289, + "rewards/rejected": -1.6349337100982666, + "step": 7120 + }, + { + "epoch": 1.78, + "grad_norm": 4.390358924865723, + "learning_rate": 7.181095866728297e-06, + "logits/chosen": -0.4965251088142395, + "logits/rejected": -0.5434131622314453, + "logps/chosen": -43.652076721191406, + "logps/rejected": -91.59327697753906, + "loss": 0.6745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7703301906585693, + "rewards/margins": 4.994781017303467, + "rewards/rejected": -2.2244510650634766, + "step": 7121 + }, + { + "epoch": 1.78, + "grad_norm": 10.167878150939941, + "learning_rate": 7.1803885762993744e-06, + "logits/chosen": -0.4657900333404541, + "logits/rejected": -0.5664756894111633, + "logps/chosen": -58.668922424316406, + "logps/rejected": -83.4656753540039, + "loss": 0.7301, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0003268718719482, + "rewards/margins": 4.508500099182129, + "rewards/rejected": -1.5081729888916016, + "step": 7122 + }, + { + "epoch": 1.78, + "grad_norm": 3.752920389175415, + "learning_rate": 7.17968123199073e-06, + "logits/chosen": -0.534149169921875, + "logits/rejected": -0.5588617324829102, + "logps/chosen": -48.28972625732422, + "logps/rejected": -86.15190124511719, + "loss": 0.6761, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0754928588867188, + "rewards/margins": 4.853339195251465, + "rewards/rejected": -1.7778468132019043, + "step": 7123 + }, + { + "epoch": 1.78, + "grad_norm": 3.5242574214935303, + "learning_rate": 7.178973833819845e-06, + "logits/chosen": -0.47109612822532654, + "logits/rejected": -0.5439661741256714, + "logps/chosen": -48.21693801879883, + "logps/rejected": -91.45883178710938, + "loss": 0.645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8922743797302246, + "rewards/margins": 4.814934253692627, + "rewards/rejected": -1.9226596355438232, + "step": 7124 + }, + { + "epoch": 1.78, + "grad_norm": 4.965609073638916, + "learning_rate": 7.178266381804199e-06, + "logits/chosen": -0.5099122524261475, + "logits/rejected": -0.5150492191314697, + "logps/chosen": -48.242488861083984, + "logps/rejected": -101.72391510009766, + "loss": 0.7374, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8562099933624268, + "rewards/margins": 5.4129638671875, + "rewards/rejected": -2.5567538738250732, + "step": 7125 + }, + { + "epoch": 1.78, + "grad_norm": 4.679303169250488, + "learning_rate": 7.177558875961273e-06, + "logits/chosen": -0.4060775637626648, + "logits/rejected": -0.5471997857093811, + "logps/chosen": -52.54484939575195, + "logps/rejected": -75.39962005615234, + "loss": 0.6713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1016271114349365, + "rewards/margins": 6.2491583824157715, + "rewards/rejected": -3.147531509399414, + "step": 7126 + }, + { + "epoch": 1.78, + "grad_norm": 5.403733730316162, + "learning_rate": 7.176851316308551e-06, + "logits/chosen": -0.3311517536640167, + "logits/rejected": -0.385123610496521, + "logps/chosen": -63.8809700012207, + "logps/rejected": -96.94302368164062, + "loss": 0.7967, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5996901988983154, + "rewards/margins": 4.670241355895996, + "rewards/rejected": -2.0705509185791016, + "step": 7127 + }, + { + "epoch": 1.78, + "grad_norm": 2.8859896659851074, + "learning_rate": 7.176143702863518e-06, + "logits/chosen": -0.3823975920677185, + "logits/rejected": -0.5036353468894958, + "logps/chosen": -57.31340026855469, + "logps/rejected": -84.17206573486328, + "loss": 0.5918, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0416507720947266, + "rewards/margins": 6.3637871742248535, + "rewards/rejected": -3.322136878967285, + "step": 7128 + }, + { + "epoch": 1.78, + "grad_norm": 5.864265441894531, + "learning_rate": 7.175436035643658e-06, + "logits/chosen": -0.38610678911209106, + "logits/rejected": -0.4487606883049011, + "logps/chosen": -60.703773498535156, + "logps/rejected": -89.87516784667969, + "loss": 0.8107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.590151071548462, + "rewards/margins": 5.072723388671875, + "rewards/rejected": -2.482571840286255, + "step": 7129 + }, + { + "epoch": 1.78, + "grad_norm": 13.6729097366333, + "learning_rate": 7.174728314666462e-06, + "logits/chosen": -0.44093990325927734, + "logits/rejected": -0.5105335712432861, + "logps/chosen": -51.82624816894531, + "logps/rejected": -82.9747314453125, + "loss": 0.7584, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8294849395751953, + "rewards/margins": 5.268589496612549, + "rewards/rejected": -2.4391047954559326, + "step": 7130 + }, + { + "epoch": 1.78, + "grad_norm": 4.082967758178711, + "learning_rate": 7.174020539949414e-06, + "logits/chosen": -0.46449148654937744, + "logits/rejected": -0.5751699209213257, + "logps/chosen": -51.14332580566406, + "logps/rejected": -79.11087036132812, + "loss": 0.6662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9548556804656982, + "rewards/margins": 6.097084999084473, + "rewards/rejected": -3.1422293186187744, + "step": 7131 + }, + { + "epoch": 1.78, + "grad_norm": 16.960163116455078, + "learning_rate": 7.1733127115100075e-06, + "logits/chosen": -0.4855862855911255, + "logits/rejected": -0.5754167437553406, + "logps/chosen": -56.190330505371094, + "logps/rejected": -90.996337890625, + "loss": 0.8227, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.8439242839813232, + "rewards/margins": 4.345028877258301, + "rewards/rejected": -1.5011050701141357, + "step": 7132 + }, + { + "epoch": 1.78, + "grad_norm": 11.14088249206543, + "learning_rate": 7.172604829365732e-06, + "logits/chosen": -0.4852069020271301, + "logits/rejected": -0.5069282650947571, + "logps/chosen": -54.21856689453125, + "logps/rejected": -82.6029052734375, + "loss": 0.758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9455838203430176, + "rewards/margins": 4.905580043792725, + "rewards/rejected": -1.9599965810775757, + "step": 7133 + }, + { + "epoch": 1.78, + "grad_norm": 6.9597296714782715, + "learning_rate": 7.17189689353408e-06, + "logits/chosen": -0.39440637826919556, + "logits/rejected": -0.46996888518333435, + "logps/chosen": -56.78209686279297, + "logps/rejected": -90.25798034667969, + "loss": 0.7538, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8248143196105957, + "rewards/margins": 4.699326992034912, + "rewards/rejected": -1.8745124340057373, + "step": 7134 + }, + { + "epoch": 1.78, + "grad_norm": 3.254448652267456, + "learning_rate": 7.171188904032547e-06, + "logits/chosen": -0.4186142086982727, + "logits/rejected": -0.4839813709259033, + "logps/chosen": -78.56266784667969, + "logps/rejected": -98.49024200439453, + "loss": 0.7398, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121997117996216, + "rewards/margins": 6.1628642082214355, + "rewards/rejected": -3.0408666133880615, + "step": 7135 + }, + { + "epoch": 1.79, + "grad_norm": 3.154721260070801, + "learning_rate": 7.170480860878626e-06, + "logits/chosen": -0.47525250911712646, + "logits/rejected": -0.5448727011680603, + "logps/chosen": -45.947200775146484, + "logps/rejected": -93.83817291259766, + "loss": 0.5619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1323752403259277, + "rewards/margins": 6.075775146484375, + "rewards/rejected": -2.943399667739868, + "step": 7136 + }, + { + "epoch": 1.79, + "grad_norm": 6.355579376220703, + "learning_rate": 7.169772764089814e-06, + "logits/chosen": -0.37012431025505066, + "logits/rejected": -0.4506136178970337, + "logps/chosen": -57.134376525878906, + "logps/rejected": -88.3887939453125, + "loss": 0.8489, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9859588146209717, + "rewards/margins": 4.854805946350098, + "rewards/rejected": -1.868847370147705, + "step": 7137 + }, + { + "epoch": 1.79, + "grad_norm": 5.004922389984131, + "learning_rate": 7.169064613683607e-06, + "logits/chosen": -0.4251185953617096, + "logits/rejected": -0.49711596965789795, + "logps/chosen": -46.40812301635742, + "logps/rejected": -91.26404571533203, + "loss": 0.6914, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7975058555603027, + "rewards/margins": 4.7437615394592285, + "rewards/rejected": -1.9462554454803467, + "step": 7138 + }, + { + "epoch": 1.79, + "grad_norm": 6.998305320739746, + "learning_rate": 7.168356409677511e-06, + "logits/chosen": -0.45753270387649536, + "logits/rejected": -0.5449170470237732, + "logps/chosen": -54.14329528808594, + "logps/rejected": -77.84905242919922, + "loss": 0.7098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3455891609191895, + "rewards/margins": 4.949244499206543, + "rewards/rejected": -1.6036550998687744, + "step": 7139 + }, + { + "epoch": 1.79, + "grad_norm": 5.9049072265625, + "learning_rate": 7.167648152089017e-06, + "logits/chosen": -0.4108178913593292, + "logits/rejected": -0.5062837600708008, + "logps/chosen": -76.88587951660156, + "logps/rejected": -92.98930358886719, + "loss": 0.8477, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9298019409179688, + "rewards/margins": 5.3422040939331055, + "rewards/rejected": -2.412402629852295, + "step": 7140 + }, + { + "epoch": 1.79, + "grad_norm": 10.527324676513672, + "learning_rate": 7.166939840935634e-06, + "logits/chosen": -0.42623892426490784, + "logits/rejected": -0.4928581118583679, + "logps/chosen": -56.795867919921875, + "logps/rejected": -88.7444839477539, + "loss": 0.7127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9172656536102295, + "rewards/margins": 4.9275312423706055, + "rewards/rejected": -2.010265350341797, + "step": 7141 + }, + { + "epoch": 1.79, + "grad_norm": 16.72453498840332, + "learning_rate": 7.166231476234864e-06, + "logits/chosen": -0.507265567779541, + "logits/rejected": -0.6262658834457397, + "logps/chosen": -77.07209014892578, + "logps/rejected": -82.5494613647461, + "loss": 0.9898, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6483664512634277, + "rewards/margins": 4.251255512237549, + "rewards/rejected": -1.6028892993927002, + "step": 7142 + }, + { + "epoch": 1.79, + "grad_norm": 5.0642290115356445, + "learning_rate": 7.165523058004207e-06, + "logits/chosen": -0.39737969636917114, + "logits/rejected": -0.5159924626350403, + "logps/chosen": -47.35426712036133, + "logps/rejected": -76.26106262207031, + "loss": 0.7017, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7809925079345703, + "rewards/margins": 5.258056640625, + "rewards/rejected": -2.4770638942718506, + "step": 7143 + }, + { + "epoch": 1.79, + "grad_norm": 4.932483673095703, + "learning_rate": 7.164814586261174e-06, + "logits/chosen": -0.41489285230636597, + "logits/rejected": -0.48711055517196655, + "logps/chosen": -59.38949203491211, + "logps/rejected": -95.62494659423828, + "loss": 0.6898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.760754108428955, + "rewards/margins": 5.658170700073242, + "rewards/rejected": -2.897416591644287, + "step": 7144 + }, + { + "epoch": 1.79, + "grad_norm": 4.881072998046875, + "learning_rate": 7.164106061023269e-06, + "logits/chosen": -0.4937540590763092, + "logits/rejected": -0.616032063961029, + "logps/chosen": -56.15492248535156, + "logps/rejected": -75.83975219726562, + "loss": 0.6422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8597559928894043, + "rewards/margins": 5.364757537841797, + "rewards/rejected": -2.5050013065338135, + "step": 7145 + }, + { + "epoch": 1.79, + "grad_norm": 30.407737731933594, + "learning_rate": 7.163397482308001e-06, + "logits/chosen": -0.5293455123901367, + "logits/rejected": -0.5923253297805786, + "logps/chosen": -47.666595458984375, + "logps/rejected": -83.9840087890625, + "loss": 0.6751, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.994636058807373, + "rewards/margins": 4.761829376220703, + "rewards/rejected": -1.7671931982040405, + "step": 7146 + }, + { + "epoch": 1.79, + "grad_norm": 9.500926971435547, + "learning_rate": 7.162688850132882e-06, + "logits/chosen": -0.3790084719657898, + "logits/rejected": -0.4921492040157318, + "logps/chosen": -52.08831024169922, + "logps/rejected": -80.62112426757812, + "loss": 0.7327, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4095773696899414, + "rewards/margins": 4.81233024597168, + "rewards/rejected": -2.4027531147003174, + "step": 7147 + }, + { + "epoch": 1.79, + "grad_norm": 3.260671377182007, + "learning_rate": 7.161980164515419e-06, + "logits/chosen": -0.5070686936378479, + "logits/rejected": -0.6288779377937317, + "logps/chosen": -58.01132583618164, + "logps/rejected": -92.66529083251953, + "loss": 0.6166, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8849449157714844, + "rewards/margins": 6.818822383880615, + "rewards/rejected": -3.9338772296905518, + "step": 7148 + }, + { + "epoch": 1.79, + "grad_norm": 7.644093990325928, + "learning_rate": 7.161271425473128e-06, + "logits/chosen": -0.45773279666900635, + "logits/rejected": -0.5403063297271729, + "logps/chosen": -63.741233825683594, + "logps/rejected": -87.33324432373047, + "loss": 0.7666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.260390281677246, + "rewards/margins": 5.392277717590332, + "rewards/rejected": -2.131887674331665, + "step": 7149 + }, + { + "epoch": 1.79, + "grad_norm": 11.457356452941895, + "learning_rate": 7.160562633023521e-06, + "logits/chosen": -0.45900511741638184, + "logits/rejected": -0.49604564905166626, + "logps/chosen": -56.55876159667969, + "logps/rejected": -92.58120727539062, + "loss": 0.6993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.99829363822937, + "rewards/margins": 5.91500186920166, + "rewards/rejected": -2.916707992553711, + "step": 7150 + }, + { + "epoch": 1.79, + "grad_norm": 3.09071946144104, + "learning_rate": 7.159853787184114e-06, + "logits/chosen": -0.46838149428367615, + "logits/rejected": -0.5742359757423401, + "logps/chosen": -66.13507080078125, + "logps/rejected": -94.30996704101562, + "loss": 0.627, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.178711414337158, + "rewards/margins": 6.369051933288574, + "rewards/rejected": -3.190340280532837, + "step": 7151 + }, + { + "epoch": 1.79, + "grad_norm": 5.690864562988281, + "learning_rate": 7.15914488797242e-06, + "logits/chosen": -0.44162583351135254, + "logits/rejected": -0.5623160004615784, + "logps/chosen": -47.60602951049805, + "logps/rejected": -99.93852996826172, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1134352684020996, + "rewards/margins": 7.952883720397949, + "rewards/rejected": -4.83944845199585, + "step": 7152 + }, + { + "epoch": 1.79, + "grad_norm": 7.078457355499268, + "learning_rate": 7.158435935405962e-06, + "logits/chosen": -0.42383161187171936, + "logits/rejected": -0.531362771987915, + "logps/chosen": -73.39593505859375, + "logps/rejected": -102.87431335449219, + "loss": 0.8684, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.701209545135498, + "rewards/margins": 5.505725860595703, + "rewards/rejected": -2.804516315460205, + "step": 7153 + }, + { + "epoch": 1.79, + "grad_norm": 6.161340713500977, + "learning_rate": 7.157726929502255e-06, + "logits/chosen": -0.4840167164802551, + "logits/rejected": -0.5736923217773438, + "logps/chosen": -54.42340850830078, + "logps/rejected": -83.78936004638672, + "loss": 0.7209, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.999140977859497, + "rewards/margins": 4.884222507476807, + "rewards/rejected": -1.8850815296173096, + "step": 7154 + }, + { + "epoch": 1.79, + "grad_norm": 6.228610038757324, + "learning_rate": 7.15701787027882e-06, + "logits/chosen": -0.5109850764274597, + "logits/rejected": -0.5568909645080566, + "logps/chosen": -43.6921501159668, + "logps/rejected": -86.93595886230469, + "loss": 0.692, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.262094497680664, + "rewards/margins": 4.526879787445068, + "rewards/rejected": -1.2647850513458252, + "step": 7155 + }, + { + "epoch": 1.79, + "grad_norm": 3.0729291439056396, + "learning_rate": 7.156308757753181e-06, + "logits/chosen": -0.4605976343154907, + "logits/rejected": -0.5250839591026306, + "logps/chosen": -41.783687591552734, + "logps/rejected": -88.04532623291016, + "loss": 0.6156, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.995720386505127, + "rewards/margins": 5.913345813751221, + "rewards/rejected": -2.917625904083252, + "step": 7156 + }, + { + "epoch": 1.79, + "grad_norm": 6.135883331298828, + "learning_rate": 7.1555995919428566e-06, + "logits/chosen": -0.41055038571357727, + "logits/rejected": -0.4609062671661377, + "logps/chosen": -57.93495559692383, + "logps/rejected": -98.95609283447266, + "loss": 0.7126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9899425506591797, + "rewards/margins": 5.474654197692871, + "rewards/rejected": -2.4847121238708496, + "step": 7157 + }, + { + "epoch": 1.79, + "grad_norm": 15.848883628845215, + "learning_rate": 7.154890372865375e-06, + "logits/chosen": -0.5042226314544678, + "logits/rejected": -0.6288354396820068, + "logps/chosen": -64.3664779663086, + "logps/rejected": -80.19770812988281, + "loss": 0.8575, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.734976053237915, + "rewards/margins": 5.054965972900391, + "rewards/rejected": -2.3199892044067383, + "step": 7158 + }, + { + "epoch": 1.79, + "grad_norm": 5.2221903800964355, + "learning_rate": 7.15418110053826e-06, + "logits/chosen": -0.4025510847568512, + "logits/rejected": -0.446448415517807, + "logps/chosen": -52.74354553222656, + "logps/rejected": -114.02693939208984, + "loss": 0.6284, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1181204319000244, + "rewards/margins": 6.501117706298828, + "rewards/rejected": -3.382997512817383, + "step": 7159 + }, + { + "epoch": 1.79, + "grad_norm": 3.3377466201782227, + "learning_rate": 7.1534717749790385e-06, + "logits/chosen": -0.4682004153728485, + "logits/rejected": -0.5263462662696838, + "logps/chosen": -59.76293182373047, + "logps/rejected": -98.42333984375, + "loss": 0.6524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1538050174713135, + "rewards/margins": 5.643537998199463, + "rewards/rejected": -2.4897332191467285, + "step": 7160 + }, + { + "epoch": 1.79, + "grad_norm": 3.195584774017334, + "learning_rate": 7.1527623962052385e-06, + "logits/chosen": -0.3361009359359741, + "logits/rejected": -0.4338254928588867, + "logps/chosen": -60.747901916503906, + "logps/rejected": -96.84598541259766, + "loss": 0.6554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.928223133087158, + "rewards/margins": 5.445773124694824, + "rewards/rejected": -2.517550230026245, + "step": 7161 + }, + { + "epoch": 1.79, + "grad_norm": 7.20702600479126, + "learning_rate": 7.152052964234391e-06, + "logits/chosen": -0.4450221061706543, + "logits/rejected": -0.5233094096183777, + "logps/chosen": -66.81350708007812, + "logps/rejected": -106.56961822509766, + "loss": 0.6258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078209161758423, + "rewards/margins": 6.7394256591796875, + "rewards/rejected": -3.6612162590026855, + "step": 7162 + }, + { + "epoch": 1.79, + "grad_norm": 8.823481559753418, + "learning_rate": 7.151343479084025e-06, + "logits/chosen": -0.4371015131473541, + "logits/rejected": -0.5317218899726868, + "logps/chosen": -49.94209671020508, + "logps/rejected": -84.72615814208984, + "loss": 0.8042, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.863619565963745, + "rewards/margins": 4.316135883331299, + "rewards/rejected": -1.4525160789489746, + "step": 7163 + }, + { + "epoch": 1.79, + "grad_norm": 22.880144119262695, + "learning_rate": 7.150633940771674e-06, + "logits/chosen": -0.4073351323604584, + "logits/rejected": -0.5063894391059875, + "logps/chosen": -47.02799987792969, + "logps/rejected": -87.0678939819336, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.099201202392578, + "rewards/margins": 5.702971458435059, + "rewards/rejected": -2.6037702560424805, + "step": 7164 + }, + { + "epoch": 1.79, + "grad_norm": 4.881847381591797, + "learning_rate": 7.149924349314871e-06, + "logits/chosen": -0.41693979501724243, + "logits/rejected": -0.4956202507019043, + "logps/chosen": -52.42664337158203, + "logps/rejected": -93.45183563232422, + "loss": 0.6717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5822839736938477, + "rewards/margins": 5.287364482879639, + "rewards/rejected": -2.705080986022949, + "step": 7165 + }, + { + "epoch": 1.79, + "grad_norm": 10.062862396240234, + "learning_rate": 7.149214704731149e-06, + "logits/chosen": -0.500557005405426, + "logits/rejected": -0.5583805441856384, + "logps/chosen": -52.13582229614258, + "logps/rejected": -102.88432312011719, + "loss": 0.7297, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.951676845550537, + "rewards/margins": 4.039188861846924, + "rewards/rejected": -1.087512493133545, + "step": 7166 + }, + { + "epoch": 1.79, + "grad_norm": 4.513080596923828, + "learning_rate": 7.1485050070380465e-06, + "logits/chosen": -0.4247609078884125, + "logits/rejected": -0.4374479651451111, + "logps/chosen": -56.758331298828125, + "logps/rejected": -88.11654663085938, + "loss": 0.8391, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8941726684570312, + "rewards/margins": 4.106196403503418, + "rewards/rejected": -1.2120238542556763, + "step": 7167 + }, + { + "epoch": 1.79, + "grad_norm": 6.43076229095459, + "learning_rate": 7.1477952562531e-06, + "logits/chosen": -0.48623257875442505, + "logits/rejected": -0.5432336330413818, + "logps/chosen": -51.64394760131836, + "logps/rejected": -99.66136169433594, + "loss": 0.6726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3237874507904053, + "rewards/margins": 5.692592144012451, + "rewards/rejected": -2.368804454803467, + "step": 7168 + }, + { + "epoch": 1.79, + "grad_norm": 6.350039005279541, + "learning_rate": 7.147085452393848e-06, + "logits/chosen": -0.4568648338317871, + "logits/rejected": -0.5712779760360718, + "logps/chosen": -66.50096893310547, + "logps/rejected": -86.56314086914062, + "loss": 0.7538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.324920654296875, + "rewards/margins": 5.440624713897705, + "rewards/rejected": -2.115703821182251, + "step": 7169 + }, + { + "epoch": 1.79, + "grad_norm": 14.010571479797363, + "learning_rate": 7.146375595477831e-06, + "logits/chosen": -0.4282195568084717, + "logits/rejected": -0.5000244379043579, + "logps/chosen": -63.989742279052734, + "logps/rejected": -91.76963806152344, + "loss": 0.7772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5581555366516113, + "rewards/margins": 4.8220109939575195, + "rewards/rejected": -2.2638556957244873, + "step": 7170 + }, + { + "epoch": 1.79, + "grad_norm": 11.500152587890625, + "learning_rate": 7.145665685522591e-06, + "logits/chosen": -0.44792288541793823, + "logits/rejected": -0.5463383197784424, + "logps/chosen": -52.01915740966797, + "logps/rejected": -75.87239074707031, + "loss": 0.9077, + "rewards/accuracies": 0.71875, + "rewards/chosen": 2.880403518676758, + "rewards/margins": 3.9951186180114746, + "rewards/rejected": -1.1147152185440063, + "step": 7171 + }, + { + "epoch": 1.79, + "grad_norm": 4.127769470214844, + "learning_rate": 7.144955722545668e-06, + "logits/chosen": -0.5190850496292114, + "logits/rejected": -0.5776671171188354, + "logps/chosen": -57.268497467041016, + "logps/rejected": -89.0082778930664, + "loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8850061893463135, + "rewards/margins": 5.173639297485352, + "rewards/rejected": -2.288633108139038, + "step": 7172 + }, + { + "epoch": 1.79, + "grad_norm": 11.59152889251709, + "learning_rate": 7.144245706564609e-06, + "logits/chosen": -0.44495993852615356, + "logits/rejected": -0.5089685916900635, + "logps/chosen": -77.38253021240234, + "logps/rejected": -92.59944152832031, + "loss": 0.9488, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.549111843109131, + "rewards/margins": 4.937536716461182, + "rewards/rejected": -2.38842511177063, + "step": 7173 + }, + { + "epoch": 1.79, + "grad_norm": 4.789380073547363, + "learning_rate": 7.143535637596958e-06, + "logits/chosen": -0.4097268581390381, + "logits/rejected": -0.49652984738349915, + "logps/chosen": -61.20637893676758, + "logps/rejected": -94.7746810913086, + "loss": 0.6942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2129786014556885, + "rewards/margins": 5.703638076782227, + "rewards/rejected": -2.490659236907959, + "step": 7174 + }, + { + "epoch": 1.79, + "grad_norm": 5.409296989440918, + "learning_rate": 7.142825515660259e-06, + "logits/chosen": -0.4446048438549042, + "logits/rejected": -0.5038384795188904, + "logps/chosen": -50.04026412963867, + "logps/rejected": -101.77718353271484, + "loss": 0.6974, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.924248218536377, + "rewards/margins": 5.887875080108643, + "rewards/rejected": -2.9636266231536865, + "step": 7175 + }, + { + "epoch": 1.8, + "grad_norm": 9.225458145141602, + "learning_rate": 7.1421153407720645e-06, + "logits/chosen": -0.46337243914604187, + "logits/rejected": -0.5354828834533691, + "logps/chosen": -52.69547653198242, + "logps/rejected": -91.97254180908203, + "loss": 0.9082, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5973613262176514, + "rewards/margins": 5.19000244140625, + "rewards/rejected": -2.592641592025757, + "step": 7176 + }, + { + "epoch": 1.8, + "grad_norm": 5.048110485076904, + "learning_rate": 7.141405112949921e-06, + "logits/chosen": -0.4297744333744049, + "logits/rejected": -0.5534491539001465, + "logps/chosen": -56.732269287109375, + "logps/rejected": -88.19963836669922, + "loss": 0.7428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0070300102233887, + "rewards/margins": 5.4355034828186035, + "rewards/rejected": -2.4284729957580566, + "step": 7177 + }, + { + "epoch": 1.8, + "grad_norm": 6.413766384124756, + "learning_rate": 7.140694832211379e-06, + "logits/chosen": -0.37403401732444763, + "logits/rejected": -0.497453510761261, + "logps/chosen": -61.93135452270508, + "logps/rejected": -89.34968566894531, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7173776626586914, + "rewards/margins": 6.046812534332275, + "rewards/rejected": -3.329435348510742, + "step": 7178 + }, + { + "epoch": 1.8, + "grad_norm": 15.615723609924316, + "learning_rate": 7.139984498573991e-06, + "logits/chosen": -0.43074148893356323, + "logits/rejected": -0.5157701969146729, + "logps/chosen": -55.28562545776367, + "logps/rejected": -79.99407958984375, + "loss": 0.7292, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.116905689239502, + "rewards/margins": 5.161548137664795, + "rewards/rejected": -2.044642448425293, + "step": 7179 + }, + { + "epoch": 1.8, + "grad_norm": 4.612171173095703, + "learning_rate": 7.139274112055311e-06, + "logits/chosen": -0.3978976607322693, + "logits/rejected": -0.41732776165008545, + "logps/chosen": -59.048526763916016, + "logps/rejected": -93.69871520996094, + "loss": 0.8214, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1186611652374268, + "rewards/margins": 4.689748764038086, + "rewards/rejected": -1.57108736038208, + "step": 7180 + }, + { + "epoch": 1.8, + "grad_norm": 3.061021089553833, + "learning_rate": 7.1385636726728915e-06, + "logits/chosen": -0.4359402060508728, + "logits/rejected": -0.5320062637329102, + "logps/chosen": -59.69475555419922, + "logps/rejected": -96.28640747070312, + "loss": 0.6174, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9573042392730713, + "rewards/margins": 6.249024868011475, + "rewards/rejected": -3.291720390319824, + "step": 7181 + }, + { + "epoch": 1.8, + "grad_norm": 9.220772743225098, + "learning_rate": 7.137853180444287e-06, + "logits/chosen": -0.4470597803592682, + "logits/rejected": -0.5656286478042603, + "logps/chosen": -58.83304214477539, + "logps/rejected": -83.47273254394531, + "loss": 0.7305, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.742842197418213, + "rewards/margins": 5.066653251647949, + "rewards/rejected": -2.3238112926483154, + "step": 7182 + }, + { + "epoch": 1.8, + "grad_norm": 3.6234426498413086, + "learning_rate": 7.137142635387059e-06, + "logits/chosen": -0.37480607628822327, + "logits/rejected": -0.517013430595398, + "logps/chosen": -60.28135299682617, + "logps/rejected": -72.08930206298828, + "loss": 0.7596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.180407762527466, + "rewards/margins": 5.2626495361328125, + "rewards/rejected": -2.082242250442505, + "step": 7183 + }, + { + "epoch": 1.8, + "grad_norm": 4.187170505523682, + "learning_rate": 7.136432037518762e-06, + "logits/chosen": -0.36199474334716797, + "logits/rejected": -0.46338173747062683, + "logps/chosen": -66.80770874023438, + "logps/rejected": -97.32642364501953, + "loss": 0.7206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.662905216217041, + "rewards/margins": 5.970931053161621, + "rewards/rejected": -3.30802583694458, + "step": 7184 + }, + { + "epoch": 1.8, + "grad_norm": 8.030553817749023, + "learning_rate": 7.135721386856959e-06, + "logits/chosen": -0.37794455885887146, + "logits/rejected": -0.4632784426212311, + "logps/chosen": -66.92998504638672, + "logps/rejected": -95.62899017333984, + "loss": 0.8543, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8947248458862305, + "rewards/margins": 4.955100059509277, + "rewards/rejected": -2.060375452041626, + "step": 7185 + }, + { + "epoch": 1.8, + "grad_norm": 6.066946983337402, + "learning_rate": 7.135010683419207e-06, + "logits/chosen": -0.30933186411857605, + "logits/rejected": -0.3941444158554077, + "logps/chosen": -59.943275451660156, + "logps/rejected": -97.61344146728516, + "loss": 0.6865, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9331345558166504, + "rewards/margins": 4.435834884643555, + "rewards/rejected": -1.5027003288269043, + "step": 7186 + }, + { + "epoch": 1.8, + "grad_norm": 9.040699005126953, + "learning_rate": 7.134299927223069e-06, + "logits/chosen": -0.538921594619751, + "logits/rejected": -0.6546223759651184, + "logps/chosen": -54.06129837036133, + "logps/rejected": -83.16941833496094, + "loss": 0.649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9360320568084717, + "rewards/margins": 6.053024768829346, + "rewards/rejected": -3.116992950439453, + "step": 7187 + }, + { + "epoch": 1.8, + "grad_norm": 4.285770416259766, + "learning_rate": 7.133589118286112e-06, + "logits/chosen": -0.3898463547229767, + "logits/rejected": -0.47820228338241577, + "logps/chosen": -51.9272575378418, + "logps/rejected": -100.08973693847656, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7782227993011475, + "rewards/margins": 6.007668495178223, + "rewards/rejected": -3.2294464111328125, + "step": 7188 + }, + { + "epoch": 1.8, + "grad_norm": 5.8254475593566895, + "learning_rate": 7.132878256625897e-06, + "logits/chosen": -0.4116314649581909, + "logits/rejected": -0.5152739882469177, + "logps/chosen": -65.69620513916016, + "logps/rejected": -88.44099426269531, + "loss": 0.7541, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.848238945007324, + "rewards/margins": 4.557891368865967, + "rewards/rejected": -1.7096529006958008, + "step": 7189 + }, + { + "epoch": 1.8, + "grad_norm": 5.494701862335205, + "learning_rate": 7.132167342259994e-06, + "logits/chosen": -0.397928923368454, + "logits/rejected": -0.5015139579772949, + "logps/chosen": -65.28123474121094, + "logps/rejected": -91.34896850585938, + "loss": 0.7701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.012514591217041, + "rewards/margins": 5.463579177856445, + "rewards/rejected": -2.4510650634765625, + "step": 7190 + }, + { + "epoch": 1.8, + "grad_norm": 3.960970401763916, + "learning_rate": 7.1314563752059655e-06, + "logits/chosen": -0.41884171962738037, + "logits/rejected": -0.4529760777950287, + "logps/chosen": -47.05439758300781, + "logps/rejected": -105.11046600341797, + "loss": 0.696, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1680960655212402, + "rewards/margins": 5.220254421234131, + "rewards/rejected": -2.0521583557128906, + "step": 7191 + }, + { + "epoch": 1.8, + "grad_norm": 4.805107593536377, + "learning_rate": 7.130745355481383e-06, + "logits/chosen": -0.4829866290092468, + "logits/rejected": -0.4974063038825989, + "logps/chosen": -50.28941345214844, + "logps/rejected": -96.97355651855469, + "loss": 0.6813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9648733139038086, + "rewards/margins": 5.242270469665527, + "rewards/rejected": -2.2773964405059814, + "step": 7192 + }, + { + "epoch": 1.8, + "grad_norm": 4.926972389221191, + "learning_rate": 7.130034283103817e-06, + "logits/chosen": -0.3963695764541626, + "logits/rejected": -0.46461766958236694, + "logps/chosen": -61.13423156738281, + "logps/rejected": -112.99649810791016, + "loss": 0.7045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0706684589385986, + "rewards/margins": 4.736102104187012, + "rewards/rejected": -1.665433645248413, + "step": 7193 + }, + { + "epoch": 1.8, + "grad_norm": 9.26460075378418, + "learning_rate": 7.129323158090839e-06, + "logits/chosen": -0.4222506880760193, + "logits/rejected": -0.5505039691925049, + "logps/chosen": -57.711997985839844, + "logps/rejected": -72.38220977783203, + "loss": 0.734, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.010389804840088, + "rewards/margins": 4.884459495544434, + "rewards/rejected": -1.8740696907043457, + "step": 7194 + }, + { + "epoch": 1.8, + "grad_norm": 9.565247535705566, + "learning_rate": 7.12861198046002e-06, + "logits/chosen": -0.4662419855594635, + "logits/rejected": -0.5161048769950867, + "logps/chosen": -52.01538848876953, + "logps/rejected": -102.7133560180664, + "loss": 0.6827, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.915724515914917, + "rewards/margins": 5.6089982986450195, + "rewards/rejected": -2.6932733058929443, + "step": 7195 + }, + { + "epoch": 1.8, + "grad_norm": 3.0017619132995605, + "learning_rate": 7.127900750228936e-06, + "logits/chosen": -0.44065067172050476, + "logits/rejected": -0.5411345958709717, + "logps/chosen": -52.67145538330078, + "logps/rejected": -84.47988891601562, + "loss": 0.6383, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.896185874938965, + "rewards/margins": 6.208731174468994, + "rewards/rejected": -3.312544822692871, + "step": 7196 + }, + { + "epoch": 1.8, + "grad_norm": 5.867989540100098, + "learning_rate": 7.12718946741516e-06, + "logits/chosen": -0.46627184748649597, + "logits/rejected": -0.5425124168395996, + "logps/chosen": -48.44308853149414, + "logps/rejected": -81.62797546386719, + "loss": 0.767, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0972986221313477, + "rewards/margins": 4.7680792808532715, + "rewards/rejected": -1.6707801818847656, + "step": 7197 + }, + { + "epoch": 1.8, + "grad_norm": 7.865344524383545, + "learning_rate": 7.12647813203627e-06, + "logits/chosen": -0.4000779390335083, + "logits/rejected": -0.5181772112846375, + "logps/chosen": -64.86357116699219, + "logps/rejected": -86.83808135986328, + "loss": 0.7567, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.685891628265381, + "rewards/margins": 5.407588481903076, + "rewards/rejected": -2.7216968536376953, + "step": 7198 + }, + { + "epoch": 1.8, + "grad_norm": 5.684353351593018, + "learning_rate": 7.125766744109843e-06, + "logits/chosen": -0.5026188492774963, + "logits/rejected": -0.5386672616004944, + "logps/chosen": -57.24946212768555, + "logps/rejected": -94.86702728271484, + "loss": 0.8137, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0750691890716553, + "rewards/margins": 4.467033863067627, + "rewards/rejected": -1.3919647932052612, + "step": 7199 + }, + { + "epoch": 1.8, + "grad_norm": 4.34307336807251, + "learning_rate": 7.12505530365346e-06, + "logits/chosen": -0.5383908748626709, + "logits/rejected": -0.5606116652488708, + "logps/chosen": -50.76116180419922, + "logps/rejected": -85.96405792236328, + "loss": 0.7719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.039473533630371, + "rewards/margins": 4.497788429260254, + "rewards/rejected": -1.4583147764205933, + "step": 7200 + }, + { + "epoch": 1.8, + "grad_norm": 3.626110792160034, + "learning_rate": 7.1243438106847e-06, + "logits/chosen": -0.41939496994018555, + "logits/rejected": -0.4734465777873993, + "logps/chosen": -54.303672790527344, + "logps/rejected": -105.98782348632812, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8886396884918213, + "rewards/margins": 5.224703311920166, + "rewards/rejected": -2.3360636234283447, + "step": 7201 + }, + { + "epoch": 1.8, + "grad_norm": 4.287194728851318, + "learning_rate": 7.123632265221143e-06, + "logits/chosen": -0.5375639200210571, + "logits/rejected": -0.5856589078903198, + "logps/chosen": -52.955284118652344, + "logps/rejected": -80.19126892089844, + "loss": 0.7876, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1216368675231934, + "rewards/margins": 4.323433876037598, + "rewards/rejected": -1.2017970085144043, + "step": 7202 + }, + { + "epoch": 1.8, + "grad_norm": 3.901733636856079, + "learning_rate": 7.122920667280375e-06, + "logits/chosen": -0.4809038043022156, + "logits/rejected": -0.5625932216644287, + "logps/chosen": -56.80189514160156, + "logps/rejected": -101.03433990478516, + "loss": 0.6777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.853437900543213, + "rewards/margins": 6.008970260620117, + "rewards/rejected": -3.1555323600769043, + "step": 7203 + }, + { + "epoch": 1.8, + "grad_norm": 4.707281589508057, + "learning_rate": 7.12220901687998e-06, + "logits/chosen": -0.4189188778400421, + "logits/rejected": -0.5154672861099243, + "logps/chosen": -55.836158752441406, + "logps/rejected": -77.33332061767578, + "loss": 0.6979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.926206588745117, + "rewards/margins": 5.242010116577148, + "rewards/rejected": -2.3158037662506104, + "step": 7204 + }, + { + "epoch": 1.8, + "grad_norm": 3.5706124305725098, + "learning_rate": 7.121497314037542e-06, + "logits/chosen": -0.38288620114326477, + "logits/rejected": -0.45548149943351746, + "logps/chosen": -53.07115173339844, + "logps/rejected": -96.81134033203125, + "loss": 0.656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9203743934631348, + "rewards/margins": 5.59010124206543, + "rewards/rejected": -2.6697275638580322, + "step": 7205 + }, + { + "epoch": 1.8, + "grad_norm": 3.8076236248016357, + "learning_rate": 7.120785558770648e-06, + "logits/chosen": -0.3981679081916809, + "logits/rejected": -0.42154961824417114, + "logps/chosen": -48.738990783691406, + "logps/rejected": -84.70826721191406, + "loss": 0.734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.279139757156372, + "rewards/margins": 3.8749284744262695, + "rewards/rejected": -0.5957885384559631, + "step": 7206 + }, + { + "epoch": 1.8, + "grad_norm": 4.647182464599609, + "learning_rate": 7.1200737510968895e-06, + "logits/chosen": -0.47536009550094604, + "logits/rejected": -0.5419249534606934, + "logps/chosen": -51.063682556152344, + "logps/rejected": -95.26044464111328, + "loss": 0.6981, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8735129833221436, + "rewards/margins": 6.480445861816406, + "rewards/rejected": -3.606933116912842, + "step": 7207 + }, + { + "epoch": 1.8, + "grad_norm": 8.83315372467041, + "learning_rate": 7.119361891033853e-06, + "logits/chosen": -0.40638497471809387, + "logits/rejected": -0.4694283604621887, + "logps/chosen": -67.03067779541016, + "logps/rejected": -111.38729095458984, + "loss": 0.7197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.915184259414673, + "rewards/margins": 5.325832366943359, + "rewards/rejected": -2.4106481075286865, + "step": 7208 + }, + { + "epoch": 1.8, + "grad_norm": 6.802195072174072, + "learning_rate": 7.118649978599128e-06, + "logits/chosen": -0.42504173517227173, + "logits/rejected": -0.5399368405342102, + "logps/chosen": -49.541080474853516, + "logps/rejected": -85.50289916992188, + "loss": 0.7103, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.068697452545166, + "rewards/margins": 5.781048774719238, + "rewards/rejected": -2.7123515605926514, + "step": 7209 + }, + { + "epoch": 1.8, + "grad_norm": 5.550110816955566, + "learning_rate": 7.1179380138103105e-06, + "logits/chosen": -0.4690816402435303, + "logits/rejected": -0.5119735598564148, + "logps/chosen": -51.59076690673828, + "logps/rejected": -94.19962310791016, + "loss": 0.712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.119654655456543, + "rewards/margins": 4.842550277709961, + "rewards/rejected": -1.722895622253418, + "step": 7210 + }, + { + "epoch": 1.8, + "grad_norm": 3.210202217102051, + "learning_rate": 7.117225996684991e-06, + "logits/chosen": -0.49143269658088684, + "logits/rejected": -0.5473688840866089, + "logps/chosen": -51.18476867675781, + "logps/rejected": -93.41388702392578, + "loss": 0.6309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5173041820526123, + "rewards/margins": 6.015328884124756, + "rewards/rejected": -2.4980247020721436, + "step": 7211 + }, + { + "epoch": 1.8, + "grad_norm": 159.7191925048828, + "learning_rate": 7.116513927240765e-06, + "logits/chosen": -0.47037699818611145, + "logits/rejected": -0.5695518255233765, + "logps/chosen": -53.97193145751953, + "logps/rejected": -80.3924789428711, + "loss": 0.6982, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.626636505126953, + "rewards/margins": 4.393791198730469, + "rewards/rejected": -1.7671550512313843, + "step": 7212 + }, + { + "epoch": 1.8, + "grad_norm": 7.5180745124816895, + "learning_rate": 7.115801805495229e-06, + "logits/chosen": -0.3983386754989624, + "logits/rejected": -0.48450762033462524, + "logps/chosen": -67.31665802001953, + "logps/rejected": -92.34388732910156, + "loss": 0.7728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.507249593734741, + "rewards/margins": 5.148894309997559, + "rewards/rejected": -2.6416454315185547, + "step": 7213 + }, + { + "epoch": 1.8, + "grad_norm": 4.785204887390137, + "learning_rate": 7.115089631465981e-06, + "logits/chosen": -0.47344136238098145, + "logits/rejected": -0.489211767911911, + "logps/chosen": -51.774288177490234, + "logps/rejected": -111.74522399902344, + "loss": 0.7135, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.173417806625366, + "rewards/margins": 5.356533527374268, + "rewards/rejected": -2.1831159591674805, + "step": 7214 + }, + { + "epoch": 1.8, + "grad_norm": 5.417936325073242, + "learning_rate": 7.114377405170617e-06, + "logits/chosen": -0.4675273895263672, + "logits/rejected": -0.5412111282348633, + "logps/chosen": -56.011924743652344, + "logps/rejected": -77.3856430053711, + "loss": 0.7683, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2950804233551025, + "rewards/margins": 4.881093502044678, + "rewards/rejected": -1.5860133171081543, + "step": 7215 + }, + { + "epoch": 1.81, + "grad_norm": 3.2326674461364746, + "learning_rate": 7.113665126626738e-06, + "logits/chosen": -0.44453808665275574, + "logits/rejected": -0.4519824683666229, + "logps/chosen": -48.36488342285156, + "logps/rejected": -97.38339233398438, + "loss": 0.6404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.989694356918335, + "rewards/margins": 4.329930305480957, + "rewards/rejected": -1.340235948562622, + "step": 7216 + }, + { + "epoch": 1.81, + "grad_norm": 3.4690728187561035, + "learning_rate": 7.112952795851949e-06, + "logits/chosen": -0.3908441364765167, + "logits/rejected": -0.5144767761230469, + "logps/chosen": -53.150081634521484, + "logps/rejected": -76.56564331054688, + "loss": 0.6499, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0556795597076416, + "rewards/margins": 5.396501541137695, + "rewards/rejected": -2.340822696685791, + "step": 7217 + }, + { + "epoch": 1.81, + "grad_norm": 9.403913497924805, + "learning_rate": 7.112240412863845e-06, + "logits/chosen": -0.4877678453922272, + "logits/rejected": -0.5587642192840576, + "logps/chosen": -59.197444915771484, + "logps/rejected": -85.15060424804688, + "loss": 0.671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7076735496520996, + "rewards/margins": 5.535980701446533, + "rewards/rejected": -2.8283069133758545, + "step": 7218 + }, + { + "epoch": 1.81, + "grad_norm": 7.471242427825928, + "learning_rate": 7.111527977680035e-06, + "logits/chosen": -0.366807758808136, + "logits/rejected": -0.4583442807197571, + "logps/chosen": -61.499114990234375, + "logps/rejected": -93.00423431396484, + "loss": 0.7462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8014156818389893, + "rewards/margins": 5.013758182525635, + "rewards/rejected": -2.2123427391052246, + "step": 7219 + }, + { + "epoch": 1.81, + "grad_norm": 3.1377813816070557, + "learning_rate": 7.1108154903181225e-06, + "logits/chosen": -0.3273896276950836, + "logits/rejected": -0.39738690853118896, + "logps/chosen": -58.86952209472656, + "logps/rejected": -81.43354034423828, + "loss": 0.6888, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.057191848754883, + "rewards/margins": 4.8221235275268555, + "rewards/rejected": -1.7649316787719727, + "step": 7220 + }, + { + "epoch": 1.81, + "grad_norm": 6.914497375488281, + "learning_rate": 7.110102950795714e-06, + "logits/chosen": -0.39757636189460754, + "logits/rejected": -0.5183234214782715, + "logps/chosen": -49.864105224609375, + "logps/rejected": -79.20640563964844, + "loss": 0.6677, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.896134376525879, + "rewards/margins": 5.247767925262451, + "rewards/rejected": -2.3516335487365723, + "step": 7221 + }, + { + "epoch": 1.81, + "grad_norm": 3.8863532543182373, + "learning_rate": 7.109390359130418e-06, + "logits/chosen": -0.4036944508552551, + "logits/rejected": -0.4657333791255951, + "logps/chosen": -63.99542236328125, + "logps/rejected": -84.54635620117188, + "loss": 0.7275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.852048397064209, + "rewards/margins": 4.14813756942749, + "rewards/rejected": -1.2960891723632812, + "step": 7222 + }, + { + "epoch": 1.81, + "grad_norm": 3.8888211250305176, + "learning_rate": 7.108677715339842e-06, + "logits/chosen": -0.40370428562164307, + "logits/rejected": -0.47737863659858704, + "logps/chosen": -59.99873733520508, + "logps/rejected": -88.73860931396484, + "loss": 0.7149, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.060931921005249, + "rewards/margins": 5.027832984924316, + "rewards/rejected": -1.9669005870819092, + "step": 7223 + }, + { + "epoch": 1.81, + "grad_norm": 16.723241806030273, + "learning_rate": 7.107965019441596e-06, + "logits/chosen": -0.40800487995147705, + "logits/rejected": -0.44496822357177734, + "logps/chosen": -53.91960906982422, + "logps/rejected": -91.95440673828125, + "loss": 0.7717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9054641723632812, + "rewards/margins": 4.958523273468018, + "rewards/rejected": -2.0530591011047363, + "step": 7224 + }, + { + "epoch": 1.81, + "grad_norm": 3.178558349609375, + "learning_rate": 7.107252271453293e-06, + "logits/chosen": -0.39170870184898376, + "logits/rejected": -0.4528037905693054, + "logps/chosen": -54.471614837646484, + "logps/rejected": -97.53791809082031, + "loss": 0.6477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017752170562744, + "rewards/margins": 5.605582237243652, + "rewards/rejected": -2.587829351425171, + "step": 7225 + }, + { + "epoch": 1.81, + "grad_norm": 3.369666337966919, + "learning_rate": 7.106539471392545e-06, + "logits/chosen": -0.36374402046203613, + "logits/rejected": -0.46245068311691284, + "logps/chosen": -55.540550231933594, + "logps/rejected": -94.8450698852539, + "loss": 0.6306, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.080770492553711, + "rewards/margins": 5.675384998321533, + "rewards/rejected": -2.5946145057678223, + "step": 7226 + }, + { + "epoch": 1.81, + "grad_norm": 8.099961280822754, + "learning_rate": 7.105826619276966e-06, + "logits/chosen": -0.4634838104248047, + "logits/rejected": -0.5161047577857971, + "logps/chosen": -44.86283874511719, + "logps/rejected": -83.3318099975586, + "loss": 0.725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9762723445892334, + "rewards/margins": 4.8535075187683105, + "rewards/rejected": -1.877234935760498, + "step": 7227 + }, + { + "epoch": 1.81, + "grad_norm": 6.321718692779541, + "learning_rate": 7.105113715124172e-06, + "logits/chosen": -0.5435208678245544, + "logits/rejected": -0.6338664889335632, + "logps/chosen": -54.78998565673828, + "logps/rejected": -85.19749450683594, + "loss": 0.6416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9910025596618652, + "rewards/margins": 5.151034832000732, + "rewards/rejected": -2.160032033920288, + "step": 7228 + }, + { + "epoch": 1.81, + "grad_norm": 5.03678560256958, + "learning_rate": 7.104400758951779e-06, + "logits/chosen": -0.5523990392684937, + "logits/rejected": -0.675911545753479, + "logps/chosen": -54.54383850097656, + "logps/rejected": -83.58523559570312, + "loss": 0.7565, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3940858840942383, + "rewards/margins": 6.445950508117676, + "rewards/rejected": -3.0518641471862793, + "step": 7229 + }, + { + "epoch": 1.81, + "grad_norm": 6.085567474365234, + "learning_rate": 7.1036877507774035e-06, + "logits/chosen": -0.43035709857940674, + "logits/rejected": -0.47806739807128906, + "logps/chosen": -70.82308197021484, + "logps/rejected": -103.21735382080078, + "loss": 0.7951, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.915310859680176, + "rewards/margins": 5.7627458572387695, + "rewards/rejected": -2.8474347591400146, + "step": 7230 + }, + { + "epoch": 1.81, + "grad_norm": 4.295339584350586, + "learning_rate": 7.102974690618669e-06, + "logits/chosen": -0.456216037273407, + "logits/rejected": -0.5071958303451538, + "logps/chosen": -46.27427673339844, + "logps/rejected": -88.81794738769531, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1434571743011475, + "rewards/margins": 5.300475120544434, + "rewards/rejected": -2.1570181846618652, + "step": 7231 + }, + { + "epoch": 1.81, + "grad_norm": 5.89251708984375, + "learning_rate": 7.102261578493189e-06, + "logits/chosen": -0.50078284740448, + "logits/rejected": -0.5434260368347168, + "logps/chosen": -57.01884078979492, + "logps/rejected": -92.92497253417969, + "loss": 0.6754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9995994567871094, + "rewards/margins": 5.334507465362549, + "rewards/rejected": -2.3349080085754395, + "step": 7232 + }, + { + "epoch": 1.81, + "grad_norm": 7.952117443084717, + "learning_rate": 7.101548414418593e-06, + "logits/chosen": -0.41313934326171875, + "logits/rejected": -0.43676191568374634, + "logps/chosen": -51.2515869140625, + "logps/rejected": -87.83670806884766, + "loss": 0.6861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046361207962036, + "rewards/margins": 3.673710823059082, + "rewards/rejected": -0.6273495554924011, + "step": 7233 + }, + { + "epoch": 1.81, + "grad_norm": 6.829769134521484, + "learning_rate": 7.100835198412498e-06, + "logits/chosen": -0.509907603263855, + "logits/rejected": -0.5501475930213928, + "logps/chosen": -54.94301223754883, + "logps/rejected": -101.1313247680664, + "loss": 0.8988, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.926723003387451, + "rewards/margins": 5.141072750091553, + "rewards/rejected": -2.2143502235412598, + "step": 7234 + }, + { + "epoch": 1.81, + "grad_norm": 5.523735046386719, + "learning_rate": 7.100121930492532e-06, + "logits/chosen": -0.37936264276504517, + "logits/rejected": -0.45013463497161865, + "logps/chosen": -58.940860748291016, + "logps/rejected": -91.28642272949219, + "loss": 0.8019, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8034274578094482, + "rewards/margins": 4.859465599060059, + "rewards/rejected": -2.056037425994873, + "step": 7235 + }, + { + "epoch": 1.81, + "grad_norm": 4.154051303863525, + "learning_rate": 7.099408610676318e-06, + "logits/chosen": -0.4530666470527649, + "logits/rejected": -0.5257925987243652, + "logps/chosen": -61.60753631591797, + "logps/rejected": -90.75628662109375, + "loss": 0.7008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.048096179962158, + "rewards/margins": 4.923219203948975, + "rewards/rejected": -1.875123381614685, + "step": 7236 + }, + { + "epoch": 1.81, + "grad_norm": 5.490337371826172, + "learning_rate": 7.098695238981485e-06, + "logits/chosen": -0.47517913579940796, + "logits/rejected": -0.5372617244720459, + "logps/chosen": -53.26958465576172, + "logps/rejected": -101.84706115722656, + "loss": 0.714, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.856006622314453, + "rewards/margins": 6.235307693481445, + "rewards/rejected": -3.379300355911255, + "step": 7237 + }, + { + "epoch": 1.81, + "grad_norm": 3.9465034008026123, + "learning_rate": 7.097981815425661e-06, + "logits/chosen": -0.45833098888397217, + "logits/rejected": -0.5782250761985779, + "logps/chosen": -59.31654357910156, + "logps/rejected": -77.39900207519531, + "loss": 0.5952, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4753570556640625, + "rewards/margins": 5.887556552886963, + "rewards/rejected": -2.4121997356414795, + "step": 7238 + }, + { + "epoch": 1.81, + "grad_norm": 4.098174095153809, + "learning_rate": 7.097268340026474e-06, + "logits/chosen": -0.3735138177871704, + "logits/rejected": -0.46207350492477417, + "logps/chosen": -72.71050262451172, + "logps/rejected": -78.24313354492188, + "loss": 0.7493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6713614463806152, + "rewards/margins": 4.9622039794921875, + "rewards/rejected": -2.2908425331115723, + "step": 7239 + }, + { + "epoch": 1.81, + "grad_norm": 12.306843757629395, + "learning_rate": 7.096554812801557e-06, + "logits/chosen": -0.423630952835083, + "logits/rejected": -0.4815434217453003, + "logps/chosen": -61.71612548828125, + "logps/rejected": -87.70951843261719, + "loss": 0.8853, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7758805751800537, + "rewards/margins": 4.416855335235596, + "rewards/rejected": -1.6409753561019897, + "step": 7240 + }, + { + "epoch": 1.81, + "grad_norm": 4.766644477844238, + "learning_rate": 7.095841233768538e-06, + "logits/chosen": -0.47081947326660156, + "logits/rejected": -0.536736249923706, + "logps/chosen": -50.579612731933594, + "logps/rejected": -82.8717041015625, + "loss": 0.8103, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.729620933532715, + "rewards/margins": 4.563667297363281, + "rewards/rejected": -1.8340462446212769, + "step": 7241 + }, + { + "epoch": 1.81, + "grad_norm": 7.281798839569092, + "learning_rate": 7.095127602945056e-06, + "logits/chosen": -0.4676212966442108, + "logits/rejected": -0.5897806882858276, + "logps/chosen": -61.213783264160156, + "logps/rejected": -88.34504699707031, + "loss": 0.7538, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0557942390441895, + "rewards/margins": 5.704983234405518, + "rewards/rejected": -2.649188756942749, + "step": 7242 + }, + { + "epoch": 1.81, + "grad_norm": 8.458245277404785, + "learning_rate": 7.0944139203487394e-06, + "logits/chosen": -0.4468865394592285, + "logits/rejected": -0.5640042424201965, + "logps/chosen": -53.42194366455078, + "logps/rejected": -81.6504135131836, + "loss": 0.7886, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5924267768859863, + "rewards/margins": 4.4239630699157715, + "rewards/rejected": -1.8315364122390747, + "step": 7243 + }, + { + "epoch": 1.81, + "grad_norm": 4.688750267028809, + "learning_rate": 7.093700185997228e-06, + "logits/chosen": -0.4439598321914673, + "logits/rejected": -0.5711093544960022, + "logps/chosen": -57.39497375488281, + "logps/rejected": -61.15089797973633, + "loss": 0.7524, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6305603981018066, + "rewards/margins": 4.420263767242432, + "rewards/rejected": -1.789703369140625, + "step": 7244 + }, + { + "epoch": 1.81, + "grad_norm": 4.702823638916016, + "learning_rate": 7.092986399908158e-06, + "logits/chosen": -0.392325758934021, + "logits/rejected": -0.543213963508606, + "logps/chosen": -67.01591491699219, + "logps/rejected": -84.8608627319336, + "loss": 0.7234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799253225326538, + "rewards/margins": 5.947054862976074, + "rewards/rejected": -3.1478018760681152, + "step": 7245 + }, + { + "epoch": 1.81, + "grad_norm": 5.034111499786377, + "learning_rate": 7.09227256209917e-06, + "logits/chosen": -0.3620924651622772, + "logits/rejected": -0.45656871795654297, + "logps/chosen": -71.51740264892578, + "logps/rejected": -105.9608154296875, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.938422203063965, + "rewards/margins": 5.875370025634766, + "rewards/rejected": -2.9369475841522217, + "step": 7246 + }, + { + "epoch": 1.81, + "grad_norm": 13.445842742919922, + "learning_rate": 7.091558672587899e-06, + "logits/chosen": -0.5250385403633118, + "logits/rejected": -0.5886974334716797, + "logps/chosen": -52.571895599365234, + "logps/rejected": -80.49006652832031, + "loss": 0.7565, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8976216316223145, + "rewards/margins": 5.452213764190674, + "rewards/rejected": -2.5545921325683594, + "step": 7247 + }, + { + "epoch": 1.81, + "grad_norm": 5.054033279418945, + "learning_rate": 7.09084473139199e-06, + "logits/chosen": -0.412212610244751, + "logits/rejected": -0.5081139802932739, + "logps/chosen": -49.82360076904297, + "logps/rejected": -105.76464080810547, + "loss": 0.6218, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7630624771118164, + "rewards/margins": 6.322558879852295, + "rewards/rejected": -3.5594964027404785, + "step": 7248 + }, + { + "epoch": 1.81, + "grad_norm": 3.198936939239502, + "learning_rate": 7.090130738529082e-06, + "logits/chosen": -0.3840368986129761, + "logits/rejected": -0.5227016806602478, + "logps/chosen": -50.909934997558594, + "logps/rejected": -76.26327514648438, + "loss": 0.6244, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065746784210205, + "rewards/margins": 5.65027379989624, + "rewards/rejected": -2.584527015686035, + "step": 7249 + }, + { + "epoch": 1.81, + "grad_norm": 5.570296764373779, + "learning_rate": 7.089416694016822e-06, + "logits/chosen": -0.5230560898780823, + "logits/rejected": -0.6344776153564453, + "logps/chosen": -59.6630973815918, + "logps/rejected": -85.66918182373047, + "loss": 0.7153, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6487507820129395, + "rewards/margins": 5.012163162231445, + "rewards/rejected": -2.363412857055664, + "step": 7250 + }, + { + "epoch": 1.81, + "grad_norm": 3.407283067703247, + "learning_rate": 7.088702597872853e-06, + "logits/chosen": -0.48383721709251404, + "logits/rejected": -0.5241101980209351, + "logps/chosen": -43.71108627319336, + "logps/rejected": -114.2287826538086, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093141555786133, + "rewards/margins": 6.7827348709106445, + "rewards/rejected": -3.6895933151245117, + "step": 7251 + }, + { + "epoch": 1.81, + "grad_norm": 3.649081230163574, + "learning_rate": 7.08798845011482e-06, + "logits/chosen": -0.39165452122688293, + "logits/rejected": -0.5107237696647644, + "logps/chosen": -58.314544677734375, + "logps/rejected": -91.92847442626953, + "loss": 0.7149, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9654321670532227, + "rewards/margins": 5.049123764038086, + "rewards/rejected": -2.0836923122406006, + "step": 7252 + }, + { + "epoch": 1.81, + "grad_norm": 15.307869911193848, + "learning_rate": 7.087274250760372e-06, + "logits/chosen": -0.4277143180370331, + "logits/rejected": -0.5276227593421936, + "logps/chosen": -47.04533767700195, + "logps/rejected": -83.14058685302734, + "loss": 0.7558, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7071077823638916, + "rewards/margins": 5.697831153869629, + "rewards/rejected": -2.990723133087158, + "step": 7253 + }, + { + "epoch": 1.81, + "grad_norm": 6.550344944000244, + "learning_rate": 7.086559999827158e-06, + "logits/chosen": -0.5823689103126526, + "logits/rejected": -0.6470267176628113, + "logps/chosen": -60.897396087646484, + "logps/rejected": -94.85953521728516, + "loss": 0.8022, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.964311361312866, + "rewards/margins": 5.472495079040527, + "rewards/rejected": -2.508183479309082, + "step": 7254 + }, + { + "epoch": 1.81, + "grad_norm": 5.2511067390441895, + "learning_rate": 7.085845697332825e-06, + "logits/chosen": -0.420491099357605, + "logits/rejected": -0.4630134105682373, + "logps/chosen": -46.81938171386719, + "logps/rejected": -85.87677764892578, + "loss": 0.6751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.303055763244629, + "rewards/margins": 4.585624694824219, + "rewards/rejected": -1.2825688123703003, + "step": 7255 + }, + { + "epoch": 1.82, + "grad_norm": 3.9397294521331787, + "learning_rate": 7.085131343295028e-06, + "logits/chosen": -0.3728596568107605, + "logits/rejected": -0.4429419934749603, + "logps/chosen": -53.17962646484375, + "logps/rejected": -95.20394134521484, + "loss": 0.6701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1141419410705566, + "rewards/margins": 4.841639995574951, + "rewards/rejected": -1.7274978160858154, + "step": 7256 + }, + { + "epoch": 1.82, + "grad_norm": 4.2039947509765625, + "learning_rate": 7.084416937731417e-06, + "logits/chosen": -0.41746950149536133, + "logits/rejected": -0.4764670133590698, + "logps/chosen": -46.93682861328125, + "logps/rejected": -92.5927963256836, + "loss": 0.6647, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3078932762145996, + "rewards/margins": 5.712540626525879, + "rewards/rejected": -2.4046475887298584, + "step": 7257 + }, + { + "epoch": 1.82, + "grad_norm": 5.445155620574951, + "learning_rate": 7.083702480659647e-06, + "logits/chosen": -0.4479376971721649, + "logits/rejected": -0.5481955409049988, + "logps/chosen": -49.9542350769043, + "logps/rejected": -86.46231842041016, + "loss": 0.7034, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8609516620635986, + "rewards/margins": 5.350306034088135, + "rewards/rejected": -2.489354372024536, + "step": 7258 + }, + { + "epoch": 1.82, + "grad_norm": 13.274955749511719, + "learning_rate": 7.082987972097373e-06, + "logits/chosen": -0.32921886444091797, + "logits/rejected": -0.4397042989730835, + "logps/chosen": -66.67389678955078, + "logps/rejected": -88.78324890136719, + "loss": 0.7344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7020764350891113, + "rewards/margins": 4.602478981018066, + "rewards/rejected": -1.9004020690917969, + "step": 7259 + }, + { + "epoch": 1.82, + "grad_norm": 3.874969720840454, + "learning_rate": 7.08227341206225e-06, + "logits/chosen": -0.4875330328941345, + "logits/rejected": -0.5800280570983887, + "logps/chosen": -54.896812438964844, + "logps/rejected": -87.57611846923828, + "loss": 0.6881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8410162925720215, + "rewards/margins": 4.993434906005859, + "rewards/rejected": -2.1524181365966797, + "step": 7260 + }, + { + "epoch": 1.82, + "grad_norm": 5.937485694885254, + "learning_rate": 7.081558800571935e-06, + "logits/chosen": -0.38192451000213623, + "logits/rejected": -0.47877734899520874, + "logps/chosen": -55.08018493652344, + "logps/rejected": -80.4195785522461, + "loss": 0.7017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1119630336761475, + "rewards/margins": 4.729093074798584, + "rewards/rejected": -1.6171296834945679, + "step": 7261 + }, + { + "epoch": 1.82, + "grad_norm": 6.036211967468262, + "learning_rate": 7.080844137644089e-06, + "logits/chosen": -0.3564695417881012, + "logits/rejected": -0.3964970111846924, + "logps/chosen": -54.83622360229492, + "logps/rejected": -88.91226196289062, + "loss": 0.7386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0685174465179443, + "rewards/margins": 4.3870038986206055, + "rewards/rejected": -1.3184866905212402, + "step": 7262 + }, + { + "epoch": 1.82, + "grad_norm": 6.531008243560791, + "learning_rate": 7.080129423296372e-06, + "logits/chosen": -0.42147132754325867, + "logits/rejected": -0.4898989498615265, + "logps/chosen": -61.46847915649414, + "logps/rejected": -84.50209045410156, + "loss": 0.6472, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.905688524246216, + "rewards/margins": 4.65106725692749, + "rewards/rejected": -1.7453782558441162, + "step": 7263 + }, + { + "epoch": 1.82, + "grad_norm": 3.9311461448669434, + "learning_rate": 7.079414657546442e-06, + "logits/chosen": -0.43539857864379883, + "logits/rejected": -0.4720594882965088, + "logps/chosen": -50.695526123046875, + "logps/rejected": -95.50514221191406, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2055952548980713, + "rewards/margins": 6.194543361663818, + "rewards/rejected": -2.988947868347168, + "step": 7264 + }, + { + "epoch": 1.82, + "grad_norm": 5.458929538726807, + "learning_rate": 7.078699840411965e-06, + "logits/chosen": -0.47638726234436035, + "logits/rejected": -0.5208439230918884, + "logps/chosen": -46.586063385009766, + "logps/rejected": -90.09264373779297, + "loss": 0.6872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.04203724861145, + "rewards/margins": 5.026033401489258, + "rewards/rejected": -1.9839956760406494, + "step": 7265 + }, + { + "epoch": 1.82, + "grad_norm": 4.093868732452393, + "learning_rate": 7.077984971910605e-06, + "logits/chosen": -0.3472943603992462, + "logits/rejected": -0.44001108407974243, + "logps/chosen": -55.7792854309082, + "logps/rejected": -96.82652282714844, + "loss": 0.6704, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.913249969482422, + "rewards/margins": 4.460108757019043, + "rewards/rejected": -1.546858549118042, + "step": 7266 + }, + { + "epoch": 1.82, + "grad_norm": 6.0008544921875, + "learning_rate": 7.077270052060026e-06, + "logits/chosen": -0.38052064180374146, + "logits/rejected": -0.430968701839447, + "logps/chosen": -62.507781982421875, + "logps/rejected": -82.06903839111328, + "loss": 0.8109, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.976355791091919, + "rewards/margins": 4.48888635635376, + "rewards/rejected": -1.5125303268432617, + "step": 7267 + }, + { + "epoch": 1.82, + "grad_norm": 7.43782377243042, + "learning_rate": 7.076555080877893e-06, + "logits/chosen": -0.4919544458389282, + "logits/rejected": -0.5463367700576782, + "logps/chosen": -50.922828674316406, + "logps/rejected": -89.103515625, + "loss": 0.6277, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.783735990524292, + "rewards/margins": 5.407184600830078, + "rewards/rejected": -2.623448610305786, + "step": 7268 + }, + { + "epoch": 1.82, + "grad_norm": 6.016452312469482, + "learning_rate": 7.075840058381875e-06, + "logits/chosen": -0.4534067213535309, + "logits/rejected": -0.5635141134262085, + "logps/chosen": -50.607208251953125, + "logps/rejected": -83.09413146972656, + "loss": 0.6677, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.868216037750244, + "rewards/margins": 6.197319030761719, + "rewards/rejected": -3.3291029930114746, + "step": 7269 + }, + { + "epoch": 1.82, + "grad_norm": 7.163858413696289, + "learning_rate": 7.075124984589643e-06, + "logits/chosen": -0.3538614511489868, + "logits/rejected": -0.44954392313957214, + "logps/chosen": -59.094337463378906, + "logps/rejected": -82.28388214111328, + "loss": 0.7168, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1968770027160645, + "rewards/margins": 5.141502857208252, + "rewards/rejected": -1.9446253776550293, + "step": 7270 + }, + { + "epoch": 1.82, + "grad_norm": 5.0233917236328125, + "learning_rate": 7.074409859518866e-06, + "logits/chosen": -0.44901296496391296, + "logits/rejected": -0.5575444102287292, + "logps/chosen": -59.85144805908203, + "logps/rejected": -74.72460174560547, + "loss": 0.796, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8793795108795166, + "rewards/margins": 4.650177001953125, + "rewards/rejected": -1.7707977294921875, + "step": 7271 + }, + { + "epoch": 1.82, + "grad_norm": 7.015655040740967, + "learning_rate": 7.073694683187213e-06, + "logits/chosen": -0.40153980255126953, + "logits/rejected": -0.46385690569877625, + "logps/chosen": -54.288490295410156, + "logps/rejected": -90.4684066772461, + "loss": 0.7955, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.933417558670044, + "rewards/margins": 3.5638973712921143, + "rewards/rejected": -0.6304799914360046, + "step": 7272 + }, + { + "epoch": 1.82, + "grad_norm": 4.064391136169434, + "learning_rate": 7.0729794556123595e-06, + "logits/chosen": -0.4135323464870453, + "logits/rejected": -0.5644878149032593, + "logps/chosen": -52.8487548828125, + "logps/rejected": -77.58853912353516, + "loss": 0.6471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6906332969665527, + "rewards/margins": 5.429488182067871, + "rewards/rejected": -2.73885440826416, + "step": 7273 + }, + { + "epoch": 1.82, + "grad_norm": 6.537255764007568, + "learning_rate": 7.072264176811979e-06, + "logits/chosen": -0.4278772473335266, + "logits/rejected": -0.5484156608581543, + "logps/chosen": -50.743072509765625, + "logps/rejected": -94.21862030029297, + "loss": 0.6923, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7811832427978516, + "rewards/margins": 5.994256973266602, + "rewards/rejected": -3.21307373046875, + "step": 7274 + }, + { + "epoch": 1.82, + "grad_norm": 5.208417892456055, + "learning_rate": 7.071548846803745e-06, + "logits/chosen": -0.4524359703063965, + "logits/rejected": -0.523553192615509, + "logps/chosen": -50.63089370727539, + "logps/rejected": -87.59022521972656, + "loss": 0.6762, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.155181884765625, + "rewards/margins": 4.239408493041992, + "rewards/rejected": -1.0842262506484985, + "step": 7275 + }, + { + "epoch": 1.82, + "grad_norm": 5.204524040222168, + "learning_rate": 7.070833465605338e-06, + "logits/chosen": -0.5356999039649963, + "logits/rejected": -0.5781429409980774, + "logps/chosen": -45.41130065917969, + "logps/rejected": -91.26109313964844, + "loss": 0.7287, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9047019481658936, + "rewards/margins": 4.887450218200684, + "rewards/rejected": -1.9827485084533691, + "step": 7276 + }, + { + "epoch": 1.82, + "grad_norm": 6.027705669403076, + "learning_rate": 7.070118033234432e-06, + "logits/chosen": -0.4120272696018219, + "logits/rejected": -0.4720383882522583, + "logps/chosen": -53.285518646240234, + "logps/rejected": -84.94223022460938, + "loss": 0.7415, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7938830852508545, + "rewards/margins": 4.485535144805908, + "rewards/rejected": -1.691651701927185, + "step": 7277 + }, + { + "epoch": 1.82, + "grad_norm": 4.56439733505249, + "learning_rate": 7.0694025497087085e-06, + "logits/chosen": -0.4741186499595642, + "logits/rejected": -0.5424491167068481, + "logps/chosen": -60.34421157836914, + "logps/rejected": -91.35295867919922, + "loss": 0.7526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.507404088973999, + "rewards/margins": 5.35305643081665, + "rewards/rejected": -1.8456521034240723, + "step": 7278 + }, + { + "epoch": 1.82, + "grad_norm": 4.9626312255859375, + "learning_rate": 7.0686870150458454e-06, + "logits/chosen": -0.4258997440338135, + "logits/rejected": -0.4689663350582123, + "logps/chosen": -56.68673324584961, + "logps/rejected": -90.24629211425781, + "loss": 0.8602, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0322937965393066, + "rewards/margins": 4.761148452758789, + "rewards/rejected": -1.7288544178009033, + "step": 7279 + }, + { + "epoch": 1.82, + "grad_norm": 3.7684473991394043, + "learning_rate": 7.067971429263527e-06, + "logits/chosen": -0.48477768898010254, + "logits/rejected": -0.5868645906448364, + "logps/chosen": -49.535274505615234, + "logps/rejected": -83.03601837158203, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.972745418548584, + "rewards/margins": 5.923229217529297, + "rewards/rejected": -2.950483798980713, + "step": 7280 + }, + { + "epoch": 1.82, + "grad_norm": 9.592924118041992, + "learning_rate": 7.067255792379435e-06, + "logits/chosen": -0.5225779414176941, + "logits/rejected": -0.586582362651825, + "logps/chosen": -60.34095764160156, + "logps/rejected": -86.66500091552734, + "loss": 0.8888, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.85774827003479, + "rewards/margins": 4.571726322174072, + "rewards/rejected": -1.7139780521392822, + "step": 7281 + }, + { + "epoch": 1.82, + "grad_norm": 6.075364112854004, + "learning_rate": 7.066540104411252e-06, + "logits/chosen": -0.4246146082878113, + "logits/rejected": -0.4362453818321228, + "logps/chosen": -67.7744140625, + "logps/rejected": -99.51182556152344, + "loss": 0.722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9433705806732178, + "rewards/margins": 4.762822151184082, + "rewards/rejected": -1.819451093673706, + "step": 7282 + }, + { + "epoch": 1.82, + "grad_norm": 4.427699089050293, + "learning_rate": 7.065824365376667e-06, + "logits/chosen": -0.5020565390586853, + "logits/rejected": -0.596104621887207, + "logps/chosen": -49.58855438232422, + "logps/rejected": -96.92558288574219, + "loss": 0.6358, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9060542583465576, + "rewards/margins": 6.5758867263793945, + "rewards/rejected": -3.669832468032837, + "step": 7283 + }, + { + "epoch": 1.82, + "grad_norm": 5.903057098388672, + "learning_rate": 7.065108575293362e-06, + "logits/chosen": -0.4490893483161926, + "logits/rejected": -0.5216846466064453, + "logps/chosen": -55.893829345703125, + "logps/rejected": -94.39817810058594, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8675503730773926, + "rewards/margins": 4.587065696716309, + "rewards/rejected": -1.719515085220337, + "step": 7284 + }, + { + "epoch": 1.82, + "grad_norm": 3.474257707595825, + "learning_rate": 7.064392734179031e-06, + "logits/chosen": -0.43906170129776, + "logits/rejected": -0.509422779083252, + "logps/chosen": -56.45412063598633, + "logps/rejected": -102.12586212158203, + "loss": 0.6333, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.09547758102417, + "rewards/margins": 6.628283500671387, + "rewards/rejected": -3.532806158065796, + "step": 7285 + }, + { + "epoch": 1.82, + "grad_norm": 5.262363910675049, + "learning_rate": 7.063676842051357e-06, + "logits/chosen": -0.458497017621994, + "logits/rejected": -0.5937140583992004, + "logps/chosen": -52.02777862548828, + "logps/rejected": -79.13328552246094, + "loss": 0.6326, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.000413179397583, + "rewards/margins": 5.874788284301758, + "rewards/rejected": -2.874375104904175, + "step": 7286 + }, + { + "epoch": 1.82, + "grad_norm": 5.5741496086120605, + "learning_rate": 7.062960898928034e-06, + "logits/chosen": -0.44198843836784363, + "logits/rejected": -0.5241276025772095, + "logps/chosen": -61.758331298828125, + "logps/rejected": -93.65133666992188, + "loss": 0.7443, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.545012950897217, + "rewards/margins": 5.398914337158203, + "rewards/rejected": -2.8539016246795654, + "step": 7287 + }, + { + "epoch": 1.82, + "grad_norm": 6.377352237701416, + "learning_rate": 7.0622449048267535e-06, + "logits/chosen": -0.48989158868789673, + "logits/rejected": -0.5737087726593018, + "logps/chosen": -55.7642707824707, + "logps/rejected": -79.13690185546875, + "loss": 0.7686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.01432466506958, + "rewards/margins": 5.179367542266846, + "rewards/rejected": -2.1650424003601074, + "step": 7288 + }, + { + "epoch": 1.82, + "grad_norm": 5.598771572113037, + "learning_rate": 7.061528859765208e-06, + "logits/chosen": -0.4893251061439514, + "logits/rejected": -0.52323979139328, + "logps/chosen": -48.839290618896484, + "logps/rejected": -102.64081573486328, + "loss": 0.7317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.817110300064087, + "rewards/margins": 5.217350482940674, + "rewards/rejected": -2.400240182876587, + "step": 7289 + }, + { + "epoch": 1.82, + "grad_norm": 3.059600353240967, + "learning_rate": 7.060812763761091e-06, + "logits/chosen": -0.42110124230384827, + "logits/rejected": -0.4918786585330963, + "logps/chosen": -45.093143463134766, + "logps/rejected": -91.81217193603516, + "loss": 0.6082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.393134832382202, + "rewards/margins": 5.5818400382995605, + "rewards/rejected": -2.1887056827545166, + "step": 7290 + }, + { + "epoch": 1.82, + "grad_norm": 7.215651035308838, + "learning_rate": 7.060096616832098e-06, + "logits/chosen": -0.45082420110702515, + "logits/rejected": -0.5688942670822144, + "logps/chosen": -54.686607360839844, + "logps/rejected": -80.23657989501953, + "loss": 0.7129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8009097576141357, + "rewards/margins": 5.475927352905273, + "rewards/rejected": -2.675017833709717, + "step": 7291 + }, + { + "epoch": 1.82, + "grad_norm": 8.106879234313965, + "learning_rate": 7.0593804189959265e-06, + "logits/chosen": -0.42652228474617004, + "logits/rejected": -0.531093955039978, + "logps/chosen": -61.77766799926758, + "logps/rejected": -83.43930053710938, + "loss": 0.7776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6662049293518066, + "rewards/margins": 5.341887474060059, + "rewards/rejected": -2.6756820678710938, + "step": 7292 + }, + { + "epoch": 1.82, + "grad_norm": 7.826772212982178, + "learning_rate": 7.058664170270275e-06, + "logits/chosen": -0.44926226139068604, + "logits/rejected": -0.5278881192207336, + "logps/chosen": -64.58059692382812, + "logps/rejected": -113.80233764648438, + "loss": 0.7243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3552536964416504, + "rewards/margins": 6.5228424072265625, + "rewards/rejected": -4.167588710784912, + "step": 7293 + }, + { + "epoch": 1.82, + "grad_norm": 12.270325660705566, + "learning_rate": 7.057947870672843e-06, + "logits/chosen": -0.427804559469223, + "logits/rejected": -0.535929262638092, + "logps/chosen": -58.17070770263672, + "logps/rejected": -83.9737548828125, + "loss": 0.7028, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9496586322784424, + "rewards/margins": 4.843716144561768, + "rewards/rejected": -1.8940575122833252, + "step": 7294 + }, + { + "epoch": 1.83, + "grad_norm": 15.519786834716797, + "learning_rate": 7.05723152022133e-06, + "logits/chosen": -0.4900549650192261, + "logits/rejected": -0.58233642578125, + "logps/chosen": -52.53550338745117, + "logps/rejected": -87.94737243652344, + "loss": 0.9122, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6095266342163086, + "rewards/margins": 4.876157760620117, + "rewards/rejected": -2.2666306495666504, + "step": 7295 + }, + { + "epoch": 1.83, + "grad_norm": 5.893584251403809, + "learning_rate": 7.056515118933437e-06, + "logits/chosen": -0.4168429970741272, + "logits/rejected": -0.5612143874168396, + "logps/chosen": -70.44429016113281, + "logps/rejected": -81.43062591552734, + "loss": 0.8772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.540762186050415, + "rewards/margins": 5.211850166320801, + "rewards/rejected": -2.6710877418518066, + "step": 7296 + }, + { + "epoch": 1.83, + "grad_norm": 6.095090866088867, + "learning_rate": 7.055798666826869e-06, + "logits/chosen": -0.4979058504104614, + "logits/rejected": -0.6014183163642883, + "logps/chosen": -57.875240325927734, + "logps/rejected": -74.7987060546875, + "loss": 0.7807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.877182960510254, + "rewards/margins": 4.914645671844482, + "rewards/rejected": -2.0374631881713867, + "step": 7297 + }, + { + "epoch": 1.83, + "grad_norm": 4.213382244110107, + "learning_rate": 7.0550821639193266e-06, + "logits/chosen": -0.4480415880680084, + "logits/rejected": -0.4949542284011841, + "logps/chosen": -53.798622131347656, + "logps/rejected": -82.43925476074219, + "loss": 0.6676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1145665645599365, + "rewards/margins": 4.855332851409912, + "rewards/rejected": -1.740766167640686, + "step": 7298 + }, + { + "epoch": 1.83, + "grad_norm": 9.327091217041016, + "learning_rate": 7.054365610228522e-06, + "logits/chosen": -0.46044597029685974, + "logits/rejected": -0.5379601716995239, + "logps/chosen": -59.27825927734375, + "logps/rejected": -89.39550018310547, + "loss": 0.6479, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1061086654663086, + "rewards/margins": 5.8788161277771, + "rewards/rejected": -2.772706985473633, + "step": 7299 + }, + { + "epoch": 1.83, + "grad_norm": 7.090847492218018, + "learning_rate": 7.0536490057721556e-06, + "logits/chosen": -0.42069751024246216, + "logits/rejected": -0.5054593682289124, + "logps/chosen": -65.27287292480469, + "logps/rejected": -91.82205200195312, + "loss": 0.6703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0172932147979736, + "rewards/margins": 6.354323863983154, + "rewards/rejected": -3.3370308876037598, + "step": 7300 + }, + { + "epoch": 1.83, + "grad_norm": 3.1582837104797363, + "learning_rate": 7.052932350567938e-06, + "logits/chosen": -0.3777081072330475, + "logits/rejected": -0.5392758250236511, + "logps/chosen": -80.71766662597656, + "logps/rejected": -94.38568115234375, + "loss": 0.671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9679007530212402, + "rewards/margins": 5.900879383087158, + "rewards/rejected": -2.9329781532287598, + "step": 7301 + }, + { + "epoch": 1.83, + "grad_norm": 6.091170310974121, + "learning_rate": 7.0522156446335785e-06, + "logits/chosen": -0.5019586682319641, + "logits/rejected": -0.5498195886611938, + "logps/chosen": -53.409385681152344, + "logps/rejected": -102.25567626953125, + "loss": 0.8623, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8719708919525146, + "rewards/margins": 3.867980718612671, + "rewards/rejected": -0.9960101842880249, + "step": 7302 + }, + { + "epoch": 1.83, + "grad_norm": 4.375210762023926, + "learning_rate": 7.051498887986789e-06, + "logits/chosen": -0.4647432863712311, + "logits/rejected": -0.5797874331474304, + "logps/chosen": -49.91822814941406, + "logps/rejected": -81.77084350585938, + "loss": 0.7139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8860466480255127, + "rewards/margins": 5.174001693725586, + "rewards/rejected": -2.2879550457000732, + "step": 7303 + }, + { + "epoch": 1.83, + "grad_norm": 8.366036415100098, + "learning_rate": 7.050782080645279e-06, + "logits/chosen": -0.37166982889175415, + "logits/rejected": -0.5073398351669312, + "logps/chosen": -78.48802947998047, + "logps/rejected": -68.5110092163086, + "loss": 0.8144, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9517405033111572, + "rewards/margins": 5.021191596984863, + "rewards/rejected": -2.069451332092285, + "step": 7304 + }, + { + "epoch": 1.83, + "grad_norm": 5.609448432922363, + "learning_rate": 7.050065222626762e-06, + "logits/chosen": -0.4491352438926697, + "logits/rejected": -0.5604704022407532, + "logps/chosen": -62.387901306152344, + "logps/rejected": -80.41606903076172, + "loss": 0.7319, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6937880516052246, + "rewards/margins": 4.674191951751709, + "rewards/rejected": -1.980404257774353, + "step": 7305 + }, + { + "epoch": 1.83, + "grad_norm": 7.441821098327637, + "learning_rate": 7.049348313948954e-06, + "logits/chosen": -0.4886831045150757, + "logits/rejected": -0.615710437297821, + "logps/chosen": -55.5126953125, + "logps/rejected": -76.011962890625, + "loss": 0.6457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8926401138305664, + "rewards/margins": 5.686423301696777, + "rewards/rejected": -2.793783664703369, + "step": 7306 + }, + { + "epoch": 1.83, + "grad_norm": 3.828425168991089, + "learning_rate": 7.048631354629567e-06, + "logits/chosen": -0.4514878988265991, + "logits/rejected": -0.48371243476867676, + "logps/chosen": -55.30973815917969, + "logps/rejected": -94.79998779296875, + "loss": 0.7078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9183106422424316, + "rewards/margins": 4.523594856262207, + "rewards/rejected": -1.6052837371826172, + "step": 7307 + }, + { + "epoch": 1.83, + "grad_norm": 8.116890907287598, + "learning_rate": 7.047914344686322e-06, + "logits/chosen": -0.4092479348182678, + "logits/rejected": -0.543186366558075, + "logps/chosen": -72.84518432617188, + "logps/rejected": -76.47370147705078, + "loss": 0.8324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0862724781036377, + "rewards/margins": 5.240464687347412, + "rewards/rejected": -2.1541924476623535, + "step": 7308 + }, + { + "epoch": 1.83, + "grad_norm": 4.357203006744385, + "learning_rate": 7.0471972841369355e-06, + "logits/chosen": -0.3494566082954407, + "logits/rejected": -0.42136630415916443, + "logps/chosen": -74.12926483154297, + "logps/rejected": -94.3223648071289, + "loss": 0.7756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0610735416412354, + "rewards/margins": 3.9478375911712646, + "rewards/rejected": -0.8867642283439636, + "step": 7309 + }, + { + "epoch": 1.83, + "grad_norm": 3.9283690452575684, + "learning_rate": 7.046480172999126e-06, + "logits/chosen": -0.4988258183002472, + "logits/rejected": -0.5516293048858643, + "logps/chosen": -52.50548553466797, + "logps/rejected": -86.14096069335938, + "loss": 0.6608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9937570095062256, + "rewards/margins": 5.236032485961914, + "rewards/rejected": -2.242274761199951, + "step": 7310 + }, + { + "epoch": 1.83, + "grad_norm": 6.181027889251709, + "learning_rate": 7.045763011290614e-06, + "logits/chosen": -0.41021427512168884, + "logits/rejected": -0.5245960354804993, + "logps/chosen": -52.084590911865234, + "logps/rejected": -85.44206237792969, + "loss": 0.6444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.544332981109619, + "rewards/margins": 5.454166412353516, + "rewards/rejected": -2.9098339080810547, + "step": 7311 + }, + { + "epoch": 1.83, + "grad_norm": 4.026144027709961, + "learning_rate": 7.045045799029124e-06, + "logits/chosen": -0.40439820289611816, + "logits/rejected": -0.47692346572875977, + "logps/chosen": -75.08772277832031, + "logps/rejected": -100.35920715332031, + "loss": 0.6861, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7999489307403564, + "rewards/margins": 5.049410343170166, + "rewards/rejected": -2.2494611740112305, + "step": 7312 + }, + { + "epoch": 1.83, + "grad_norm": 4.927104949951172, + "learning_rate": 7.044328536232376e-06, + "logits/chosen": -0.4072023928165436, + "logits/rejected": -0.49237412214279175, + "logps/chosen": -54.61076354980469, + "logps/rejected": -97.64543914794922, + "loss": 0.6835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8173153400421143, + "rewards/margins": 5.544877529144287, + "rewards/rejected": -2.727562427520752, + "step": 7313 + }, + { + "epoch": 1.83, + "grad_norm": 6.838738441467285, + "learning_rate": 7.043611222918094e-06, + "logits/chosen": -0.4572199285030365, + "logits/rejected": -0.491655558347702, + "logps/chosen": -45.45256805419922, + "logps/rejected": -80.24595642089844, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.814237117767334, + "rewards/margins": 4.590385437011719, + "rewards/rejected": -1.7761483192443848, + "step": 7314 + }, + { + "epoch": 1.83, + "grad_norm": 4.924750804901123, + "learning_rate": 7.042893859104008e-06, + "logits/chosen": -0.3609454035758972, + "logits/rejected": -0.4680674076080322, + "logps/chosen": -49.97630310058594, + "logps/rejected": -86.52783203125, + "loss": 0.6653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.284834384918213, + "rewards/margins": 5.631664752960205, + "rewards/rejected": -2.3468306064605713, + "step": 7315 + }, + { + "epoch": 1.83, + "grad_norm": 4.986392974853516, + "learning_rate": 7.042176444807839e-06, + "logits/chosen": -0.46849197149276733, + "logits/rejected": -0.5106679201126099, + "logps/chosen": -57.52423858642578, + "logps/rejected": -87.88284301757812, + "loss": 0.83, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.139728546142578, + "rewards/margins": 4.784322738647461, + "rewards/rejected": -1.644594669342041, + "step": 7316 + }, + { + "epoch": 1.83, + "grad_norm": 22.160350799560547, + "learning_rate": 7.04145898004732e-06, + "logits/chosen": -0.3432895243167877, + "logits/rejected": -0.4468427896499634, + "logps/chosen": -61.094032287597656, + "logps/rejected": -77.49751281738281, + "loss": 0.7298, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1610333919525146, + "rewards/margins": 5.045215129852295, + "rewards/rejected": -1.8841818571090698, + "step": 7317 + }, + { + "epoch": 1.83, + "grad_norm": 8.001615524291992, + "learning_rate": 7.040741464840176e-06, + "logits/chosen": -0.3907124996185303, + "logits/rejected": -0.5173867344856262, + "logps/chosen": -58.04997253417969, + "logps/rejected": -85.50090789794922, + "loss": 0.6573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7145872116088867, + "rewards/margins": 5.371455192565918, + "rewards/rejected": -2.6568682193756104, + "step": 7318 + }, + { + "epoch": 1.83, + "grad_norm": 4.64061164855957, + "learning_rate": 7.040023899204142e-06, + "logits/chosen": -0.4412067234516144, + "logits/rejected": -0.5070022940635681, + "logps/chosen": -57.336124420166016, + "logps/rejected": -92.50469970703125, + "loss": 0.7369, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.077683687210083, + "rewards/margins": 4.344778060913086, + "rewards/rejected": -1.2670944929122925, + "step": 7319 + }, + { + "epoch": 1.83, + "grad_norm": 3.721742868423462, + "learning_rate": 7.039306283156946e-06, + "logits/chosen": -0.4642695486545563, + "logits/rejected": -0.5215820670127869, + "logps/chosen": -48.30996322631836, + "logps/rejected": -86.49261474609375, + "loss": 0.729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0831403732299805, + "rewards/margins": 4.779207706451416, + "rewards/rejected": -1.6960670948028564, + "step": 7320 + }, + { + "epoch": 1.83, + "grad_norm": 18.033458709716797, + "learning_rate": 7.038588616716323e-06, + "logits/chosen": -0.4025735557079315, + "logits/rejected": -0.4624614715576172, + "logps/chosen": -57.613059997558594, + "logps/rejected": -91.19393157958984, + "loss": 0.8584, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.665508270263672, + "rewards/margins": 4.768617630004883, + "rewards/rejected": -2.103109359741211, + "step": 7321 + }, + { + "epoch": 1.83, + "grad_norm": 8.038058280944824, + "learning_rate": 7.037870899900008e-06, + "logits/chosen": -0.4054946005344391, + "logits/rejected": -0.46981343626976013, + "logps/chosen": -46.1441650390625, + "logps/rejected": -85.89185333251953, + "loss": 0.6915, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1912763118743896, + "rewards/margins": 4.868886470794678, + "rewards/rejected": -1.677610158920288, + "step": 7322 + }, + { + "epoch": 1.83, + "grad_norm": 7.415050029754639, + "learning_rate": 7.037153132725734e-06, + "logits/chosen": -0.4196900725364685, + "logits/rejected": -0.4738362729549408, + "logps/chosen": -63.78994369506836, + "logps/rejected": -89.83297729492188, + "loss": 0.7379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089582920074463, + "rewards/margins": 3.6568546295166016, + "rewards/rejected": -0.5672719478607178, + "step": 7323 + }, + { + "epoch": 1.83, + "grad_norm": 4.547715187072754, + "learning_rate": 7.036435315211241e-06, + "logits/chosen": -0.39226457476615906, + "logits/rejected": -0.5214399099349976, + "logps/chosen": -60.95779037475586, + "logps/rejected": -86.10263061523438, + "loss": 0.7617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8524088859558105, + "rewards/margins": 4.713967323303223, + "rewards/rejected": -1.861558437347412, + "step": 7324 + }, + { + "epoch": 1.83, + "grad_norm": 3.1700494289398193, + "learning_rate": 7.035717447374264e-06, + "logits/chosen": -0.5465400218963623, + "logits/rejected": -0.641614556312561, + "logps/chosen": -44.82701110839844, + "logps/rejected": -78.77262878417969, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1404762268066406, + "rewards/margins": 5.907527923583984, + "rewards/rejected": -2.7670516967773438, + "step": 7325 + }, + { + "epoch": 1.83, + "grad_norm": 5.713059425354004, + "learning_rate": 7.034999529232544e-06, + "logits/chosen": -0.4335898160934448, + "logits/rejected": -0.5337299704551697, + "logps/chosen": -69.14117431640625, + "logps/rejected": -82.68865966796875, + "loss": 0.7985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.888174533843994, + "rewards/margins": 5.104798316955566, + "rewards/rejected": -2.216623544692993, + "step": 7326 + }, + { + "epoch": 1.83, + "grad_norm": 4.392982006072998, + "learning_rate": 7.03428156080382e-06, + "logits/chosen": -0.46739572286605835, + "logits/rejected": -0.5339198708534241, + "logps/chosen": -48.21377944946289, + "logps/rejected": -80.63219451904297, + "loss": 0.703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.141324281692505, + "rewards/margins": 5.330691814422607, + "rewards/rejected": -2.1893677711486816, + "step": 7327 + }, + { + "epoch": 1.83, + "grad_norm": 6.172473430633545, + "learning_rate": 7.033563542105837e-06, + "logits/chosen": -0.44290363788604736, + "logits/rejected": -0.5060016512870789, + "logps/chosen": -59.64202880859375, + "logps/rejected": -98.73531341552734, + "loss": 0.746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.979605197906494, + "rewards/margins": 5.676804065704346, + "rewards/rejected": -2.6971991062164307, + "step": 7328 + }, + { + "epoch": 1.83, + "grad_norm": 9.961727142333984, + "learning_rate": 7.032845473156334e-06, + "logits/chosen": -0.49515458941459656, + "logits/rejected": -0.6046175360679626, + "logps/chosen": -51.03001022338867, + "logps/rejected": -76.94365692138672, + "loss": 0.9287, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6944291591644287, + "rewards/margins": 4.539633274078369, + "rewards/rejected": -1.8452045917510986, + "step": 7329 + }, + { + "epoch": 1.83, + "grad_norm": 11.772396087646484, + "learning_rate": 7.0321273539730575e-06, + "logits/chosen": -0.3949620723724365, + "logits/rejected": -0.43883636593818665, + "logps/chosen": -47.8588752746582, + "logps/rejected": -99.42228698730469, + "loss": 0.6789, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.738938808441162, + "rewards/margins": 5.760262966156006, + "rewards/rejected": -3.0213239192962646, + "step": 7330 + }, + { + "epoch": 1.83, + "grad_norm": 13.30791187286377, + "learning_rate": 7.031409184573754e-06, + "logits/chosen": -0.5039554834365845, + "logits/rejected": -0.5522441864013672, + "logps/chosen": -54.62436294555664, + "logps/rejected": -104.6395263671875, + "loss": 0.7802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9591355323791504, + "rewards/margins": 6.403092861175537, + "rewards/rejected": -3.4439573287963867, + "step": 7331 + }, + { + "epoch": 1.83, + "grad_norm": 4.102656364440918, + "learning_rate": 7.0306909649761675e-06, + "logits/chosen": -0.5059716701507568, + "logits/rejected": -0.6036930680274963, + "logps/chosen": -61.37177276611328, + "logps/rejected": -85.96001434326172, + "loss": 0.6908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8588948249816895, + "rewards/margins": 5.667325496673584, + "rewards/rejected": -2.8084311485290527, + "step": 7332 + }, + { + "epoch": 1.83, + "grad_norm": 17.057571411132812, + "learning_rate": 7.0299726951980475e-06, + "logits/chosen": -0.4479536712169647, + "logits/rejected": -0.5110231041908264, + "logps/chosen": -53.885398864746094, + "logps/rejected": -96.55597686767578, + "loss": 0.9091, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.690408945083618, + "rewards/margins": 5.148910045623779, + "rewards/rejected": -2.4585013389587402, + "step": 7333 + }, + { + "epoch": 1.83, + "grad_norm": 6.0841450691223145, + "learning_rate": 7.029254375257143e-06, + "logits/chosen": -0.48771363496780396, + "logits/rejected": -0.5616632103919983, + "logps/chosen": -45.612281799316406, + "logps/rejected": -96.94517517089844, + "loss": 0.5813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.071017026901245, + "rewards/margins": 6.160538673400879, + "rewards/rejected": -3.089522123336792, + "step": 7334 + }, + { + "epoch": 1.84, + "grad_norm": 5.538244724273682, + "learning_rate": 7.028536005171205e-06, + "logits/chosen": -0.415311723947525, + "logits/rejected": -0.5759496092796326, + "logps/chosen": -60.29716491699219, + "logps/rejected": -80.55885314941406, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1873397827148438, + "rewards/margins": 5.4977216720581055, + "rewards/rejected": -2.3103818893432617, + "step": 7335 + }, + { + "epoch": 1.84, + "grad_norm": 3.961693286895752, + "learning_rate": 7.027817584957983e-06, + "logits/chosen": -0.5238560438156128, + "logits/rejected": -0.5939098000526428, + "logps/chosen": -48.105201721191406, + "logps/rejected": -89.5521011352539, + "loss": 0.6888, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.827106237411499, + "rewards/margins": 5.315158367156982, + "rewards/rejected": -2.4880523681640625, + "step": 7336 + }, + { + "epoch": 1.84, + "grad_norm": 4.6587114334106445, + "learning_rate": 7.027099114635234e-06, + "logits/chosen": -0.4516668915748596, + "logits/rejected": -0.5733919143676758, + "logps/chosen": -49.76762008666992, + "logps/rejected": -86.35480499267578, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.823634147644043, + "rewards/margins": 5.864317893981934, + "rewards/rejected": -3.0406837463378906, + "step": 7337 + }, + { + "epoch": 1.84, + "grad_norm": 5.275689601898193, + "learning_rate": 7.026380594220709e-06, + "logits/chosen": -0.37200531363487244, + "logits/rejected": -0.49779728055000305, + "logps/chosen": -63.65399932861328, + "logps/rejected": -98.1915512084961, + "loss": 0.6641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9230337142944336, + "rewards/margins": 6.402211666107178, + "rewards/rejected": -3.479177951812744, + "step": 7338 + }, + { + "epoch": 1.84, + "grad_norm": 4.152076244354248, + "learning_rate": 7.025662023732164e-06, + "logits/chosen": -0.3594023585319519, + "logits/rejected": -0.4321381449699402, + "logps/chosen": -61.59406280517578, + "logps/rejected": -97.38025665283203, + "loss": 0.632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8253419399261475, + "rewards/margins": 5.89960241317749, + "rewards/rejected": -3.0742604732513428, + "step": 7339 + }, + { + "epoch": 1.84, + "grad_norm": 2.904776096343994, + "learning_rate": 7.024943403187356e-06, + "logits/chosen": -0.36488375067710876, + "logits/rejected": -0.41832512617111206, + "logps/chosen": -58.427574157714844, + "logps/rejected": -108.97997283935547, + "loss": 0.5941, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2933473587036133, + "rewards/margins": 6.186368942260742, + "rewards/rejected": -2.893021583557129, + "step": 7340 + }, + { + "epoch": 1.84, + "grad_norm": 5.860049724578857, + "learning_rate": 7.024224732604041e-06, + "logits/chosen": -0.35277819633483887, + "logits/rejected": -0.4027484357357025, + "logps/chosen": -64.30690002441406, + "logps/rejected": -103.23126220703125, + "loss": 0.749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8346331119537354, + "rewards/margins": 4.232768535614014, + "rewards/rejected": -1.3981351852416992, + "step": 7341 + }, + { + "epoch": 1.84, + "grad_norm": 10.165338516235352, + "learning_rate": 7.023506011999982e-06, + "logits/chosen": -0.39343199133872986, + "logits/rejected": -0.4678684175014496, + "logps/chosen": -70.42591094970703, + "logps/rejected": -102.78803253173828, + "loss": 0.7359, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8890955448150635, + "rewards/margins": 5.530371189117432, + "rewards/rejected": -2.641275405883789, + "step": 7342 + }, + { + "epoch": 1.84, + "grad_norm": 6.300418376922607, + "learning_rate": 7.022787241392937e-06, + "logits/chosen": -0.4214557707309723, + "logits/rejected": -0.528685986995697, + "logps/chosen": -59.98240661621094, + "logps/rejected": -91.11847686767578, + "loss": 0.6926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7374367713928223, + "rewards/margins": 4.79032564163208, + "rewards/rejected": -2.052888870239258, + "step": 7343 + }, + { + "epoch": 1.84, + "grad_norm": 4.173788070678711, + "learning_rate": 7.022068420800666e-06, + "logits/chosen": -0.49546581506729126, + "logits/rejected": -0.5461266040802002, + "logps/chosen": -49.58537292480469, + "logps/rejected": -99.19979858398438, + "loss": 0.5989, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9913835525512695, + "rewards/margins": 5.747180461883545, + "rewards/rejected": -2.7557973861694336, + "step": 7344 + }, + { + "epoch": 1.84, + "grad_norm": 6.319101333618164, + "learning_rate": 7.021349550240935e-06, + "logits/chosen": -0.44560545682907104, + "logits/rejected": -0.5589998364448547, + "logps/chosen": -65.7885971069336, + "logps/rejected": -77.4442138671875, + "loss": 0.7134, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0978140830993652, + "rewards/margins": 5.375283241271973, + "rewards/rejected": -2.2774689197540283, + "step": 7345 + }, + { + "epoch": 1.84, + "grad_norm": 4.470737934112549, + "learning_rate": 7.020630629731506e-06, + "logits/chosen": -0.4451724588871002, + "logits/rejected": -0.5329266786575317, + "logps/chosen": -62.22880935668945, + "logps/rejected": -91.36279296875, + "loss": 0.7155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0706944465637207, + "rewards/margins": 5.953769207000732, + "rewards/rejected": -2.883074998855591, + "step": 7346 + }, + { + "epoch": 1.84, + "grad_norm": 6.941730976104736, + "learning_rate": 7.019911659290145e-06, + "logits/chosen": -0.43176478147506714, + "logits/rejected": -0.4789898991584778, + "logps/chosen": -53.04836654663086, + "logps/rejected": -94.41035461425781, + "loss": 0.6583, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1024112701416016, + "rewards/margins": 5.229499816894531, + "rewards/rejected": -2.1270885467529297, + "step": 7347 + }, + { + "epoch": 1.84, + "grad_norm": 3.7115817070007324, + "learning_rate": 7.019192638934617e-06, + "logits/chosen": -0.3763808012008667, + "logits/rejected": -0.5236715078353882, + "logps/chosen": -61.317039489746094, + "logps/rejected": -85.5133056640625, + "loss": 0.6907, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2470333576202393, + "rewards/margins": 5.790780544281006, + "rewards/rejected": -2.5437469482421875, + "step": 7348 + }, + { + "epoch": 1.84, + "grad_norm": 4.601743221282959, + "learning_rate": 7.0184735686826955e-06, + "logits/chosen": -0.42273861169815063, + "logits/rejected": -0.5352851152420044, + "logps/chosen": -62.679847717285156, + "logps/rejected": -85.41863250732422, + "loss": 0.7358, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.892522096633911, + "rewards/margins": 5.378818511962891, + "rewards/rejected": -2.4862966537475586, + "step": 7349 + }, + { + "epoch": 1.84, + "grad_norm": 3.850512981414795, + "learning_rate": 7.017754448552141e-06, + "logits/chosen": -0.4489019513130188, + "logits/rejected": -0.5867765545845032, + "logps/chosen": -52.244876861572266, + "logps/rejected": -79.75257873535156, + "loss": 0.5915, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0353853702545166, + "rewards/margins": 5.07833194732666, + "rewards/rejected": -2.0429468154907227, + "step": 7350 + }, + { + "epoch": 1.84, + "grad_norm": 10.52002239227295, + "learning_rate": 7.01703527856073e-06, + "logits/chosen": -0.4481198787689209, + "logits/rejected": -0.5870604515075684, + "logps/chosen": -58.556156158447266, + "logps/rejected": -75.14344024658203, + "loss": 0.6695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0562551021575928, + "rewards/margins": 6.425573825836182, + "rewards/rejected": -3.369318723678589, + "step": 7351 + }, + { + "epoch": 1.84, + "grad_norm": 3.6050589084625244, + "learning_rate": 7.0163160587262305e-06, + "logits/chosen": -0.45143914222717285, + "logits/rejected": -0.583702802658081, + "logps/chosen": -51.396995544433594, + "logps/rejected": -76.43698120117188, + "loss": 0.6187, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934609889984131, + "rewards/margins": 6.293428897857666, + "rewards/rejected": -3.3588197231292725, + "step": 7352 + }, + { + "epoch": 1.84, + "grad_norm": 9.065550804138184, + "learning_rate": 7.015596789066417e-06, + "logits/chosen": -0.36232900619506836, + "logits/rejected": -0.43118441104888916, + "logps/chosen": -60.0257682800293, + "logps/rejected": -78.22108459472656, + "loss": 0.8243, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.644338369369507, + "rewards/margins": 4.6945295333862305, + "rewards/rejected": -2.050191879272461, + "step": 7353 + }, + { + "epoch": 1.84, + "grad_norm": 7.066791534423828, + "learning_rate": 7.014877469599064e-06, + "logits/chosen": -0.40794527530670166, + "logits/rejected": -0.49836277961730957, + "logps/chosen": -52.98912048339844, + "logps/rejected": -94.52149963378906, + "loss": 0.7073, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9519944190979004, + "rewards/margins": 6.028163433074951, + "rewards/rejected": -3.076169490814209, + "step": 7354 + }, + { + "epoch": 1.84, + "grad_norm": 6.5262556076049805, + "learning_rate": 7.014158100341944e-06, + "logits/chosen": -0.47201108932495117, + "logits/rejected": -0.5260538458824158, + "logps/chosen": -58.99688720703125, + "logps/rejected": -114.09164428710938, + "loss": 0.6661, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.901901960372925, + "rewards/margins": 6.667551517486572, + "rewards/rejected": -3.7656497955322266, + "step": 7355 + }, + { + "epoch": 1.84, + "grad_norm": 7.391214370727539, + "learning_rate": 7.013438681312835e-06, + "logits/chosen": -0.49430614709854126, + "logits/rejected": -0.5969029664993286, + "logps/chosen": -55.26071548461914, + "logps/rejected": -69.03978729248047, + "loss": 0.6759, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.238762617111206, + "rewards/margins": 5.195910453796387, + "rewards/rejected": -1.9571480751037598, + "step": 7356 + }, + { + "epoch": 1.84, + "grad_norm": 7.91289758682251, + "learning_rate": 7.0127192125295155e-06, + "logits/chosen": -0.3695703446865082, + "logits/rejected": -0.4512777328491211, + "logps/chosen": -60.00074005126953, + "logps/rejected": -104.40447998046875, + "loss": 0.6549, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8737223148345947, + "rewards/margins": 6.198545455932617, + "rewards/rejected": -3.3248231410980225, + "step": 7357 + }, + { + "epoch": 1.84, + "grad_norm": 7.171540260314941, + "learning_rate": 7.011999694009764e-06, + "logits/chosen": -0.47307640314102173, + "logits/rejected": -0.508434534072876, + "logps/chosen": -52.669837951660156, + "logps/rejected": -106.1363525390625, + "loss": 0.635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.336768865585327, + "rewards/margins": 6.903161525726318, + "rewards/rejected": -3.566392660140991, + "step": 7358 + }, + { + "epoch": 1.84, + "grad_norm": 5.525657653808594, + "learning_rate": 7.011280125771358e-06, + "logits/chosen": -0.5163337588310242, + "logits/rejected": -0.5695191621780396, + "logps/chosen": -53.64973449707031, + "logps/rejected": -102.53154754638672, + "loss": 0.7441, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9825637340545654, + "rewards/margins": 5.985202789306641, + "rewards/rejected": -3.002638816833496, + "step": 7359 + }, + { + "epoch": 1.84, + "grad_norm": 8.987439155578613, + "learning_rate": 7.010560507832082e-06, + "logits/chosen": -0.5105984210968018, + "logits/rejected": -0.5829123854637146, + "logps/chosen": -54.19622802734375, + "logps/rejected": -79.50584411621094, + "loss": 0.7122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.69210147857666, + "rewards/margins": 5.022271633148193, + "rewards/rejected": -2.3301703929901123, + "step": 7360 + }, + { + "epoch": 1.84, + "grad_norm": 5.916048049926758, + "learning_rate": 7.009840840209717e-06, + "logits/chosen": -0.44416481256484985, + "logits/rejected": -0.5500279664993286, + "logps/chosen": -48.72511291503906, + "logps/rejected": -75.55856323242188, + "loss": 0.6211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1348536014556885, + "rewards/margins": 5.165012359619141, + "rewards/rejected": -2.030158519744873, + "step": 7361 + }, + { + "epoch": 1.84, + "grad_norm": 4.73694372177124, + "learning_rate": 7.009121122922049e-06, + "logits/chosen": -0.5058908462524414, + "logits/rejected": -0.610649585723877, + "logps/chosen": -63.97056579589844, + "logps/rejected": -79.86583709716797, + "loss": 0.7787, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8353629112243652, + "rewards/margins": 6.042141437530518, + "rewards/rejected": -3.2067785263061523, + "step": 7362 + }, + { + "epoch": 1.84, + "grad_norm": 7.894010543823242, + "learning_rate": 7.00840135598686e-06, + "logits/chosen": -0.4451640248298645, + "logits/rejected": -0.4901121258735657, + "logps/chosen": -51.06929016113281, + "logps/rejected": -88.12126159667969, + "loss": 0.6671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7514753341674805, + "rewards/margins": 4.8790717124938965, + "rewards/rejected": -2.127596616744995, + "step": 7363 + }, + { + "epoch": 1.84, + "grad_norm": 15.656177520751953, + "learning_rate": 7.007681539421936e-06, + "logits/chosen": -0.47808992862701416, + "logits/rejected": -0.5496840476989746, + "logps/chosen": -53.904014587402344, + "logps/rejected": -84.84027099609375, + "loss": 0.7743, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5610313415527344, + "rewards/margins": 4.830745697021484, + "rewards/rejected": -2.269714593887329, + "step": 7364 + }, + { + "epoch": 1.84, + "grad_norm": 9.038298606872559, + "learning_rate": 7.006961673245066e-06, + "logits/chosen": -0.49481362104415894, + "logits/rejected": -0.5869135856628418, + "logps/chosen": -53.09749221801758, + "logps/rejected": -76.48624420166016, + "loss": 0.7438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.853741407394409, + "rewards/margins": 4.949801921844482, + "rewards/rejected": -2.0960607528686523, + "step": 7365 + }, + { + "epoch": 1.84, + "grad_norm": 6.507637023925781, + "learning_rate": 7.006241757474039e-06, + "logits/chosen": -0.44309401512145996, + "logits/rejected": -0.5605268478393555, + "logps/chosen": -59.13286590576172, + "logps/rejected": -80.23101043701172, + "loss": 0.7767, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6893210411071777, + "rewards/margins": 4.757242202758789, + "rewards/rejected": -2.0679211616516113, + "step": 7366 + }, + { + "epoch": 1.84, + "grad_norm": 3.538193941116333, + "learning_rate": 7.005521792126644e-06, + "logits/chosen": -0.5033861994743347, + "logits/rejected": -0.619615375995636, + "logps/chosen": -50.30028533935547, + "logps/rejected": -76.98120880126953, + "loss": 0.6455, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2076807022094727, + "rewards/margins": 5.471190452575684, + "rewards/rejected": -2.263509750366211, + "step": 7367 + }, + { + "epoch": 1.84, + "grad_norm": 15.809566497802734, + "learning_rate": 7.004801777220672e-06, + "logits/chosen": -0.4399137794971466, + "logits/rejected": -0.5393044948577881, + "logps/chosen": -52.8669319152832, + "logps/rejected": -78.57891082763672, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.632986545562744, + "rewards/margins": 4.939950466156006, + "rewards/rejected": -2.306964159011841, + "step": 7368 + }, + { + "epoch": 1.84, + "grad_norm": 8.33103084564209, + "learning_rate": 7.004081712773917e-06, + "logits/chosen": -0.43020540475845337, + "logits/rejected": -0.48259103298187256, + "logps/chosen": -56.183502197265625, + "logps/rejected": -90.65237426757812, + "loss": 0.8551, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.671579360961914, + "rewards/margins": 4.2497453689575195, + "rewards/rejected": -1.5781664848327637, + "step": 7369 + }, + { + "epoch": 1.84, + "grad_norm": 7.98838472366333, + "learning_rate": 7.00336159880417e-06, + "logits/chosen": -0.4270358085632324, + "logits/rejected": -0.4944154620170593, + "logps/chosen": -55.2221794128418, + "logps/rejected": -89.35276794433594, + "loss": 0.7649, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1055524349212646, + "rewards/margins": 5.127694129943848, + "rewards/rejected": -2.022141695022583, + "step": 7370 + }, + { + "epoch": 1.84, + "grad_norm": 11.823673248291016, + "learning_rate": 7.0026414353292275e-06, + "logits/chosen": -0.4393530786037445, + "logits/rejected": -0.5108585953712463, + "logps/chosen": -47.968589782714844, + "logps/rejected": -99.39208221435547, + "loss": 0.6936, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.999221086502075, + "rewards/margins": 5.04383659362793, + "rewards/rejected": -2.0446157455444336, + "step": 7371 + }, + { + "epoch": 1.84, + "grad_norm": 4.675987720489502, + "learning_rate": 7.001921222366884e-06, + "logits/chosen": -0.45424145460128784, + "logits/rejected": -0.5453811883926392, + "logps/chosen": -58.146759033203125, + "logps/rejected": -93.24769592285156, + "loss": 0.6375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7778048515319824, + "rewards/margins": 5.836175918579102, + "rewards/rejected": -3.0583701133728027, + "step": 7372 + }, + { + "epoch": 1.84, + "grad_norm": 4.046223163604736, + "learning_rate": 7.001200959934939e-06, + "logits/chosen": -0.4347667098045349, + "logits/rejected": -0.5099834203720093, + "logps/chosen": -59.10364532470703, + "logps/rejected": -89.20026397705078, + "loss": 0.6392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2172422409057617, + "rewards/margins": 5.488234996795654, + "rewards/rejected": -2.2709925174713135, + "step": 7373 + }, + { + "epoch": 1.84, + "grad_norm": 4.168663501739502, + "learning_rate": 7.00048064805119e-06, + "logits/chosen": -0.33370018005371094, + "logits/rejected": -0.4394151568412781, + "logps/chosen": -57.67158126831055, + "logps/rejected": -83.52439880371094, + "loss": 0.6543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.05704927444458, + "rewards/margins": 4.9723029136657715, + "rewards/rejected": -1.9152535200119019, + "step": 7374 + }, + { + "epoch": 1.85, + "grad_norm": 7.032194137573242, + "learning_rate": 6.999760286733434e-06, + "logits/chosen": -0.4565606415271759, + "logits/rejected": -0.49084144830703735, + "logps/chosen": -70.37203979492188, + "logps/rejected": -79.2010269165039, + "loss": 0.8454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0331668853759766, + "rewards/margins": 3.925591468811035, + "rewards/rejected": -0.892424464225769, + "step": 7375 + }, + { + "epoch": 1.85, + "grad_norm": 4.833025932312012, + "learning_rate": 6.999039875999477e-06, + "logits/chosen": -0.39441633224487305, + "logits/rejected": -0.4644932746887207, + "logps/chosen": -56.28947067260742, + "logps/rejected": -98.11763000488281, + "loss": 0.6701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1072685718536377, + "rewards/margins": 5.211183547973633, + "rewards/rejected": -2.103914976119995, + "step": 7376 + }, + { + "epoch": 1.85, + "grad_norm": 11.779303550720215, + "learning_rate": 6.998319415867116e-06, + "logits/chosen": -0.5177962779998779, + "logits/rejected": -0.6033332347869873, + "logps/chosen": -58.82634735107422, + "logps/rejected": -102.14446258544922, + "loss": 0.7552, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8139634132385254, + "rewards/margins": 4.905928611755371, + "rewards/rejected": -2.0919649600982666, + "step": 7377 + }, + { + "epoch": 1.85, + "grad_norm": 7.646100044250488, + "learning_rate": 6.9975989063541595e-06, + "logits/chosen": -0.42062869668006897, + "logits/rejected": -0.549332857131958, + "logps/chosen": -54.131290435791016, + "logps/rejected": -79.71463012695312, + "loss": 0.7852, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9184775352478027, + "rewards/margins": 4.976916313171387, + "rewards/rejected": -2.058438777923584, + "step": 7378 + }, + { + "epoch": 1.85, + "grad_norm": 5.107508659362793, + "learning_rate": 6.996878347478409e-06, + "logits/chosen": -0.4070838689804077, + "logits/rejected": -0.4492557942867279, + "logps/chosen": -59.335601806640625, + "logps/rejected": -105.14542388916016, + "loss": 0.8084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.135735511779785, + "rewards/margins": 5.622196674346924, + "rewards/rejected": -2.4864611625671387, + "step": 7379 + }, + { + "epoch": 1.85, + "grad_norm": 2.742543935775757, + "learning_rate": 6.996157739257668e-06, + "logits/chosen": -0.45366060733795166, + "logits/rejected": -0.5881142616271973, + "logps/chosen": -54.3204460144043, + "logps/rejected": -104.62161254882812, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0022871494293213, + "rewards/margins": 7.542194366455078, + "rewards/rejected": -4.539907932281494, + "step": 7380 + }, + { + "epoch": 1.85, + "grad_norm": 8.391860008239746, + "learning_rate": 6.995437081709747e-06, + "logits/chosen": -0.45634353160858154, + "logits/rejected": -0.5173612236976624, + "logps/chosen": -57.281063079833984, + "logps/rejected": -103.470458984375, + "loss": 0.825, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7521448135375977, + "rewards/margins": 5.292189598083496, + "rewards/rejected": -2.5400445461273193, + "step": 7381 + }, + { + "epoch": 1.85, + "grad_norm": 5.567390441894531, + "learning_rate": 6.9947163748524536e-06, + "logits/chosen": -0.43345433473587036, + "logits/rejected": -0.4662156403064728, + "logps/chosen": -53.94257354736328, + "logps/rejected": -92.89124298095703, + "loss": 0.703, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.137167453765869, + "rewards/margins": 6.320026397705078, + "rewards/rejected": -3.182859420776367, + "step": 7382 + }, + { + "epoch": 1.85, + "grad_norm": 4.467025279998779, + "learning_rate": 6.993995618703598e-06, + "logits/chosen": -0.4632917046546936, + "logits/rejected": -0.5123604536056519, + "logps/chosen": -47.104164123535156, + "logps/rejected": -85.65328979492188, + "loss": 0.6425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0104193687438965, + "rewards/margins": 5.6370697021484375, + "rewards/rejected": -2.626650810241699, + "step": 7383 + }, + { + "epoch": 1.85, + "grad_norm": 4.593806743621826, + "learning_rate": 6.993274813280988e-06, + "logits/chosen": -0.465487003326416, + "logits/rejected": -0.5492050647735596, + "logps/chosen": -48.645172119140625, + "logps/rejected": -77.13667297363281, + "loss": 0.6161, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0153627395629883, + "rewards/margins": 5.618838310241699, + "rewards/rejected": -2.603475332260132, + "step": 7384 + }, + { + "epoch": 1.85, + "grad_norm": 8.13681697845459, + "learning_rate": 6.992553958602439e-06, + "logits/chosen": -0.39482957124710083, + "logits/rejected": -0.4873936176300049, + "logps/chosen": -66.66265869140625, + "logps/rejected": -85.12874603271484, + "loss": 0.8405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.82499098777771, + "rewards/margins": 4.304714679718018, + "rewards/rejected": -1.4797238111495972, + "step": 7385 + }, + { + "epoch": 1.85, + "grad_norm": 2.982374429702759, + "learning_rate": 6.991833054685762e-06, + "logits/chosen": -0.45134609937667847, + "logits/rejected": -0.4716244637966156, + "logps/chosen": -51.02119064331055, + "logps/rejected": -76.3041000366211, + "loss": 0.7419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3756961822509766, + "rewards/margins": 4.1728997230529785, + "rewards/rejected": -0.797203779220581, + "step": 7386 + }, + { + "epoch": 1.85, + "grad_norm": 8.030182838439941, + "learning_rate": 6.991112101548769e-06, + "logits/chosen": -0.365653932094574, + "logits/rejected": -0.4340875744819641, + "logps/chosen": -54.66111755371094, + "logps/rejected": -83.26973724365234, + "loss": 0.8182, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.076880931854248, + "rewards/margins": 4.805393695831299, + "rewards/rejected": -1.7285130023956299, + "step": 7387 + }, + { + "epoch": 1.85, + "grad_norm": 6.728757381439209, + "learning_rate": 6.9903910992092815e-06, + "logits/chosen": -0.4033639132976532, + "logits/rejected": -0.550639271736145, + "logps/chosen": -53.591712951660156, + "logps/rejected": -72.26368713378906, + "loss": 0.6571, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85115909576416, + "rewards/margins": 4.853686809539795, + "rewards/rejected": -2.0025277137756348, + "step": 7388 + }, + { + "epoch": 1.85, + "grad_norm": 11.293537139892578, + "learning_rate": 6.989670047685111e-06, + "logits/chosen": -0.46158739924430847, + "logits/rejected": -0.5170884132385254, + "logps/chosen": -58.703800201416016, + "logps/rejected": -90.75707244873047, + "loss": 0.831, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.89608097076416, + "rewards/margins": 4.605755805969238, + "rewards/rejected": -1.7096749544143677, + "step": 7389 + }, + { + "epoch": 1.85, + "grad_norm": 4.681809902191162, + "learning_rate": 6.988948946994078e-06, + "logits/chosen": -0.45182299613952637, + "logits/rejected": -0.5142557621002197, + "logps/chosen": -50.17304992675781, + "logps/rejected": -84.41239929199219, + "loss": 0.7166, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.761829137802124, + "rewards/margins": 4.532663822174072, + "rewards/rejected": -1.7708345651626587, + "step": 7390 + }, + { + "epoch": 1.85, + "grad_norm": 3.7750375270843506, + "learning_rate": 6.988227797154e-06, + "logits/chosen": -0.4485587179660797, + "logits/rejected": -0.5555088520050049, + "logps/chosen": -58.717552185058594, + "logps/rejected": -83.60296630859375, + "loss": 0.6493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0694849491119385, + "rewards/margins": 5.021077632904053, + "rewards/rejected": -1.9515931606292725, + "step": 7391 + }, + { + "epoch": 1.85, + "grad_norm": 5.018608570098877, + "learning_rate": 6.987506598182701e-06, + "logits/chosen": -0.37016046047210693, + "logits/rejected": -0.4197719395160675, + "logps/chosen": -58.996307373046875, + "logps/rejected": -83.54507446289062, + "loss": 0.7632, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9181056022644043, + "rewards/margins": 4.49721622467041, + "rewards/rejected": -1.5791101455688477, + "step": 7392 + }, + { + "epoch": 1.85, + "grad_norm": 7.134305477142334, + "learning_rate": 6.986785350097997e-06, + "logits/chosen": -0.5724393725395203, + "logits/rejected": -0.6509937047958374, + "logps/chosen": -57.609901428222656, + "logps/rejected": -83.65311431884766, + "loss": 0.7731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8566994667053223, + "rewards/margins": 5.085533142089844, + "rewards/rejected": -2.2288341522216797, + "step": 7393 + }, + { + "epoch": 1.85, + "grad_norm": 17.239376068115234, + "learning_rate": 6.986064052917715e-06, + "logits/chosen": -0.3935653269290924, + "logits/rejected": -0.4511033296585083, + "logps/chosen": -70.8868637084961, + "logps/rejected": -97.81611633300781, + "loss": 0.713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0571627616882324, + "rewards/margins": 4.728888988494873, + "rewards/rejected": -1.6717256307601929, + "step": 7394 + }, + { + "epoch": 1.85, + "grad_norm": 8.075413703918457, + "learning_rate": 6.9853427066596764e-06, + "logits/chosen": -0.5338361263275146, + "logits/rejected": -0.5702195167541504, + "logps/chosen": -54.67572784423828, + "logps/rejected": -99.47673034667969, + "loss": 0.7336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.136209011077881, + "rewards/margins": 5.572036266326904, + "rewards/rejected": -2.4358274936676025, + "step": 7395 + }, + { + "epoch": 1.85, + "grad_norm": 7.519216537475586, + "learning_rate": 6.984621311341709e-06, + "logits/chosen": -0.4261088967323303, + "logits/rejected": -0.4940127432346344, + "logps/chosen": -47.93806457519531, + "logps/rejected": -80.18099212646484, + "loss": 0.6899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912363290786743, + "rewards/margins": 4.925899505615234, + "rewards/rejected": -2.0135366916656494, + "step": 7396 + }, + { + "epoch": 1.85, + "grad_norm": 2.4001123905181885, + "learning_rate": 6.9838998669816395e-06, + "logits/chosen": -0.4464208483695984, + "logits/rejected": -0.5262752771377563, + "logps/chosen": -58.41347122192383, + "logps/rejected": -89.31676483154297, + "loss": 0.6177, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0704345703125, + "rewards/margins": 5.655709266662598, + "rewards/rejected": -2.5852739810943604, + "step": 7397 + }, + { + "epoch": 1.85, + "grad_norm": 4.5155863761901855, + "learning_rate": 6.983178373597293e-06, + "logits/chosen": -0.4927660822868347, + "logits/rejected": -0.540667712688446, + "logps/chosen": -55.29728698730469, + "logps/rejected": -92.56011199951172, + "loss": 0.8715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0148234367370605, + "rewards/margins": 5.331023216247559, + "rewards/rejected": -2.316199541091919, + "step": 7398 + }, + { + "epoch": 1.85, + "grad_norm": 2.8820481300354004, + "learning_rate": 6.982456831206499e-06, + "logits/chosen": -0.5727954506874084, + "logits/rejected": -0.6351184844970703, + "logps/chosen": -53.04307556152344, + "logps/rejected": -88.84033203125, + "loss": 0.7342, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1542208194732666, + "rewards/margins": 5.8276286125183105, + "rewards/rejected": -2.673407554626465, + "step": 7399 + }, + { + "epoch": 1.85, + "grad_norm": 2.958008050918579, + "learning_rate": 6.981735239827088e-06, + "logits/chosen": -0.4341128468513489, + "logits/rejected": -0.550603449344635, + "logps/chosen": -53.56126022338867, + "logps/rejected": -81.48719787597656, + "loss": 0.5885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9566690921783447, + "rewards/margins": 5.122902870178223, + "rewards/rejected": -2.166233539581299, + "step": 7400 + }, + { + "epoch": 1.85, + "grad_norm": 10.288934707641602, + "learning_rate": 6.981013599476891e-06, + "logits/chosen": -0.43318578600883484, + "logits/rejected": -0.5260489583015442, + "logps/chosen": -55.16680145263672, + "logps/rejected": -78.26678466796875, + "loss": 0.7243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.742628335952759, + "rewards/margins": 4.853957653045654, + "rewards/rejected": -2.1113295555114746, + "step": 7401 + }, + { + "epoch": 1.85, + "grad_norm": 3.720322370529175, + "learning_rate": 6.980291910173741e-06, + "logits/chosen": -0.47751864790916443, + "logits/rejected": -0.5892447233200073, + "logps/chosen": -52.429203033447266, + "logps/rejected": -72.70020294189453, + "loss": 0.7178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.790541887283325, + "rewards/margins": 5.509090423583984, + "rewards/rejected": -2.718548536300659, + "step": 7402 + }, + { + "epoch": 1.85, + "grad_norm": 5.998753070831299, + "learning_rate": 6.979570171935473e-06, + "logits/chosen": -0.3799540102481842, + "logits/rejected": -0.5074257850646973, + "logps/chosen": -69.132080078125, + "logps/rejected": -81.96440887451172, + "loss": 0.8986, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5249695777893066, + "rewards/margins": 5.146636962890625, + "rewards/rejected": -2.6216678619384766, + "step": 7403 + }, + { + "epoch": 1.85, + "grad_norm": 8.864068984985352, + "learning_rate": 6.978848384779919e-06, + "logits/chosen": -0.42651766538619995, + "logits/rejected": -0.5071260929107666, + "logps/chosen": -62.54212951660156, + "logps/rejected": -86.23484802246094, + "loss": 0.7962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.549562931060791, + "rewards/margins": 5.009027004241943, + "rewards/rejected": -2.4594640731811523, + "step": 7404 + }, + { + "epoch": 1.85, + "grad_norm": 4.392930030822754, + "learning_rate": 6.978126548724917e-06, + "logits/chosen": -0.38130155205726624, + "logits/rejected": -0.4376414716243744, + "logps/chosen": -58.682334899902344, + "logps/rejected": -92.54773712158203, + "loss": 0.6544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.010251998901367, + "rewards/margins": 5.245107650756836, + "rewards/rejected": -2.234856367111206, + "step": 7405 + }, + { + "epoch": 1.85, + "grad_norm": 3.8526086807250977, + "learning_rate": 6.977404663788306e-06, + "logits/chosen": -0.5233428478240967, + "logits/rejected": -0.5645899176597595, + "logps/chosen": -56.73316955566406, + "logps/rejected": -111.19795989990234, + "loss": 0.6708, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.873690128326416, + "rewards/margins": 6.148943901062012, + "rewards/rejected": -3.275254011154175, + "step": 7406 + }, + { + "epoch": 1.85, + "grad_norm": 2.44820237159729, + "learning_rate": 6.976682729987918e-06, + "logits/chosen": -0.4796704053878784, + "logits/rejected": -0.5934340953826904, + "logps/chosen": -52.87874984741211, + "logps/rejected": -82.47915649414062, + "loss": 0.5622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3214759826660156, + "rewards/margins": 5.888403415679932, + "rewards/rejected": -2.566927194595337, + "step": 7407 + }, + { + "epoch": 1.85, + "grad_norm": 6.798237323760986, + "learning_rate": 6.975960747341601e-06, + "logits/chosen": -0.3974915146827698, + "logits/rejected": -0.4463909864425659, + "logps/chosen": -57.26308822631836, + "logps/rejected": -100.32524871826172, + "loss": 0.8332, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7581565380096436, + "rewards/margins": 5.169849395751953, + "rewards/rejected": -2.4116926193237305, + "step": 7408 + }, + { + "epoch": 1.85, + "grad_norm": 4.308061599731445, + "learning_rate": 6.975238715867191e-06, + "logits/chosen": -0.530316174030304, + "logits/rejected": -0.6197296977043152, + "logps/chosen": -53.242942810058594, + "logps/rejected": -97.66612243652344, + "loss": 0.7387, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.983869791030884, + "rewards/margins": 5.882811069488525, + "rewards/rejected": -2.8989410400390625, + "step": 7409 + }, + { + "epoch": 1.85, + "grad_norm": 9.9511079788208, + "learning_rate": 6.974516635582531e-06, + "logits/chosen": -0.42839759588241577, + "logits/rejected": -0.48889315128326416, + "logps/chosen": -55.259552001953125, + "logps/rejected": -80.51947784423828, + "loss": 0.7445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5173826217651367, + "rewards/margins": 4.523037433624268, + "rewards/rejected": -2.005655288696289, + "step": 7410 + }, + { + "epoch": 1.85, + "grad_norm": 13.456866264343262, + "learning_rate": 6.973794506505465e-06, + "logits/chosen": -0.46353358030319214, + "logits/rejected": -0.487626314163208, + "logps/chosen": -54.53050994873047, + "logps/rejected": -108.96739959716797, + "loss": 0.9017, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8941564559936523, + "rewards/margins": 4.915439128875732, + "rewards/rejected": -2.02128267288208, + "step": 7411 + }, + { + "epoch": 1.85, + "grad_norm": 5.704521656036377, + "learning_rate": 6.973072328653838e-06, + "logits/chosen": -0.5102620124816895, + "logits/rejected": -0.5400651693344116, + "logps/chosen": -48.16461181640625, + "logps/rejected": -82.1374740600586, + "loss": 0.7023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1301560401916504, + "rewards/margins": 4.502668857574463, + "rewards/rejected": -1.3725128173828125, + "step": 7412 + }, + { + "epoch": 1.85, + "grad_norm": 4.853858470916748, + "learning_rate": 6.972350102045494e-06, + "logits/chosen": -0.5258476734161377, + "logits/rejected": -0.5811260342597961, + "logps/chosen": -53.79143142700195, + "logps/rejected": -88.6522216796875, + "loss": 0.8002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.687580108642578, + "rewards/margins": 5.570077896118164, + "rewards/rejected": -2.8824973106384277, + "step": 7413 + }, + { + "epoch": 1.85, + "grad_norm": 6.84225606918335, + "learning_rate": 6.9716278266982815e-06, + "logits/chosen": -0.41205376386642456, + "logits/rejected": -0.39485397934913635, + "logps/chosen": -50.37199783325195, + "logps/rejected": -110.06096649169922, + "loss": 0.8252, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9999067783355713, + "rewards/margins": 5.716221332550049, + "rewards/rejected": -2.7163145542144775, + "step": 7414 + }, + { + "epoch": 1.86, + "grad_norm": 3.479698896408081, + "learning_rate": 6.97090550263005e-06, + "logits/chosen": -0.4609156847000122, + "logits/rejected": -0.5762748718261719, + "logps/chosen": -60.31499481201172, + "logps/rejected": -100.29187774658203, + "loss": 0.6573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934792995452881, + "rewards/margins": 6.892202854156494, + "rewards/rejected": -3.957409620285034, + "step": 7415 + }, + { + "epoch": 1.86, + "grad_norm": 4.778103828430176, + "learning_rate": 6.970183129858644e-06, + "logits/chosen": -0.5131846070289612, + "logits/rejected": -0.6272413730621338, + "logps/chosen": -50.742774963378906, + "logps/rejected": -96.01400756835938, + "loss": 0.6205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9418647289276123, + "rewards/margins": 6.713364601135254, + "rewards/rejected": -3.771500587463379, + "step": 7416 + }, + { + "epoch": 1.86, + "grad_norm": 14.589990615844727, + "learning_rate": 6.96946070840192e-06, + "logits/chosen": -0.4614155888557434, + "logits/rejected": -0.5374361872673035, + "logps/chosen": -54.52168273925781, + "logps/rejected": -96.54689025878906, + "loss": 0.7465, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6335558891296387, + "rewards/margins": 5.724812984466553, + "rewards/rejected": -3.091257333755493, + "step": 7417 + }, + { + "epoch": 1.86, + "grad_norm": 10.778901100158691, + "learning_rate": 6.968738238277726e-06, + "logits/chosen": -0.41288965940475464, + "logits/rejected": -0.5008987188339233, + "logps/chosen": -58.566375732421875, + "logps/rejected": -112.59231567382812, + "loss": 0.8262, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5800533294677734, + "rewards/margins": 5.557800769805908, + "rewards/rejected": -2.977747917175293, + "step": 7418 + }, + { + "epoch": 1.86, + "grad_norm": 5.186546802520752, + "learning_rate": 6.968015719503915e-06, + "logits/chosen": -0.47260791063308716, + "logits/rejected": -0.5553908348083496, + "logps/chosen": -57.42668151855469, + "logps/rejected": -85.43073272705078, + "loss": 0.7205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1597683429718018, + "rewards/margins": 5.038161754608154, + "rewards/rejected": -1.8783936500549316, + "step": 7419 + }, + { + "epoch": 1.86, + "grad_norm": 3.8845598697662354, + "learning_rate": 6.967293152098345e-06, + "logits/chosen": -0.4322051405906677, + "logits/rejected": -0.4905596077442169, + "logps/chosen": -48.88698196411133, + "logps/rejected": -81.28875732421875, + "loss": 0.6543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0897774696350098, + "rewards/margins": 4.818321228027344, + "rewards/rejected": -1.7285432815551758, + "step": 7420 + }, + { + "epoch": 1.86, + "grad_norm": 8.119281768798828, + "learning_rate": 6.966570536078865e-06, + "logits/chosen": -0.47506603598594666, + "logits/rejected": -0.5770461559295654, + "logps/chosen": -60.997047424316406, + "logps/rejected": -88.8310317993164, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799745798110962, + "rewards/margins": 4.918684005737305, + "rewards/rejected": -2.1189379692077637, + "step": 7421 + }, + { + "epoch": 1.86, + "grad_norm": 3.6786880493164062, + "learning_rate": 6.965847871463338e-06, + "logits/chosen": -0.4592319130897522, + "logits/rejected": -0.5627142190933228, + "logps/chosen": -55.56499481201172, + "logps/rejected": -78.75756072998047, + "loss": 0.6504, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9269747734069824, + "rewards/margins": 5.848260402679443, + "rewards/rejected": -2.92128586769104, + "step": 7422 + }, + { + "epoch": 1.86, + "grad_norm": 4.471549987792969, + "learning_rate": 6.965125158269619e-06, + "logits/chosen": -0.4831109941005707, + "logits/rejected": -0.5474998950958252, + "logps/chosen": -51.06363296508789, + "logps/rejected": -88.64447021484375, + "loss": 0.7275, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.075190305709839, + "rewards/margins": 5.235669136047363, + "rewards/rejected": -2.1604785919189453, + "step": 7423 + }, + { + "epoch": 1.86, + "grad_norm": 14.00110912322998, + "learning_rate": 6.964402396515566e-06, + "logits/chosen": -0.4316054582595825, + "logits/rejected": -0.49999648332595825, + "logps/chosen": -56.551063537597656, + "logps/rejected": -79.57952117919922, + "loss": 0.884, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5692954063415527, + "rewards/margins": 4.6464362144470215, + "rewards/rejected": -2.077141046524048, + "step": 7424 + }, + { + "epoch": 1.86, + "grad_norm": 15.643043518066406, + "learning_rate": 6.963679586219041e-06, + "logits/chosen": -0.43971848487854004, + "logits/rejected": -0.5018128156661987, + "logps/chosen": -58.070281982421875, + "logps/rejected": -85.07469177246094, + "loss": 0.819, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.774388313293457, + "rewards/margins": 4.229689598083496, + "rewards/rejected": -1.4553011655807495, + "step": 7425 + }, + { + "epoch": 1.86, + "grad_norm": 3.9776763916015625, + "learning_rate": 6.962956727397906e-06, + "logits/chosen": -0.4928929805755615, + "logits/rejected": -0.565623939037323, + "logps/chosen": -56.55469512939453, + "logps/rejected": -101.97382354736328, + "loss": 0.7248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1254913806915283, + "rewards/margins": 6.404725074768066, + "rewards/rejected": -3.2792341709136963, + "step": 7426 + }, + { + "epoch": 1.86, + "grad_norm": 4.0064697265625, + "learning_rate": 6.9622338200700194e-06, + "logits/chosen": -0.44636958837509155, + "logits/rejected": -0.4619872570037842, + "logps/chosen": -42.61948776245117, + "logps/rejected": -74.64893341064453, + "loss": 0.6314, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1003081798553467, + "rewards/margins": 4.27714204788208, + "rewards/rejected": -1.1768337488174438, + "step": 7427 + }, + { + "epoch": 1.86, + "grad_norm": 5.937234878540039, + "learning_rate": 6.961510864253252e-06, + "logits/chosen": -0.46075424551963806, + "logits/rejected": -0.541591227054596, + "logps/chosen": -49.8984375, + "logps/rejected": -84.62692260742188, + "loss": 0.7261, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1045382022857666, + "rewards/margins": 4.753305435180664, + "rewards/rejected": -1.6487669944763184, + "step": 7428 + }, + { + "epoch": 1.86, + "grad_norm": 3.1698474884033203, + "learning_rate": 6.960787859965462e-06, + "logits/chosen": -0.40821778774261475, + "logits/rejected": -0.5018426775932312, + "logps/chosen": -54.85320281982422, + "logps/rejected": -94.40889739990234, + "loss": 0.5351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1621947288513184, + "rewards/margins": 6.052044868469238, + "rewards/rejected": -2.88985013961792, + "step": 7429 + }, + { + "epoch": 1.86, + "grad_norm": 3.7596282958984375, + "learning_rate": 6.960064807224518e-06, + "logits/chosen": -0.3873249888420105, + "logits/rejected": -0.42267993092536926, + "logps/chosen": -55.90580749511719, + "logps/rejected": -95.552490234375, + "loss": 0.7438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1053287982940674, + "rewards/margins": 4.618088722229004, + "rewards/rejected": -1.5127594470977783, + "step": 7430 + }, + { + "epoch": 1.86, + "grad_norm": 9.517412185668945, + "learning_rate": 6.959341706048289e-06, + "logits/chosen": -0.4096285104751587, + "logits/rejected": -0.5121587514877319, + "logps/chosen": -57.49470520019531, + "logps/rejected": -81.78945922851562, + "loss": 0.7665, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6879448890686035, + "rewards/margins": 4.471528053283691, + "rewards/rejected": -1.783583402633667, + "step": 7431 + }, + { + "epoch": 1.86, + "grad_norm": 18.971162796020508, + "learning_rate": 6.958618556454641e-06, + "logits/chosen": -0.4899868965148926, + "logits/rejected": -0.579433023929596, + "logps/chosen": -50.791046142578125, + "logps/rejected": -75.95211791992188, + "loss": 0.7666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.970384359359741, + "rewards/margins": 4.70806360244751, + "rewards/rejected": -1.737679362297058, + "step": 7432 + }, + { + "epoch": 1.86, + "grad_norm": 4.061665058135986, + "learning_rate": 6.957895358461448e-06, + "logits/chosen": -0.39943307638168335, + "logits/rejected": -0.5161280632019043, + "logps/chosen": -61.606048583984375, + "logps/rejected": -93.88724517822266, + "loss": 0.6694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1716878414154053, + "rewards/margins": 5.271032333374023, + "rewards/rejected": -2.099344491958618, + "step": 7433 + }, + { + "epoch": 1.86, + "grad_norm": 3.8803234100341797, + "learning_rate": 6.957172112086576e-06, + "logits/chosen": -0.4704572856426239, + "logits/rejected": -0.5530046820640564, + "logps/chosen": -63.59545135498047, + "logps/rejected": -89.44025421142578, + "loss": 0.7827, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.668590545654297, + "rewards/margins": 5.418481349945068, + "rewards/rejected": -2.7498903274536133, + "step": 7434 + }, + { + "epoch": 1.86, + "grad_norm": 3.8621087074279785, + "learning_rate": 6.9564488173479005e-06, + "logits/chosen": -0.4363456964492798, + "logits/rejected": -0.5396993160247803, + "logps/chosen": -51.38054656982422, + "logps/rejected": -83.97621154785156, + "loss": 0.6371, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8487651348114014, + "rewards/margins": 5.702604293823242, + "rewards/rejected": -2.85383939743042, + "step": 7435 + }, + { + "epoch": 1.86, + "grad_norm": 8.426812171936035, + "learning_rate": 6.955725474263293e-06, + "logits/chosen": -0.4478260576725006, + "logits/rejected": -0.4530138373374939, + "logps/chosen": -54.90472412109375, + "logps/rejected": -88.56556701660156, + "loss": 0.7473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.654974937438965, + "rewards/margins": 4.474002838134766, + "rewards/rejected": -1.8190282583236694, + "step": 7436 + }, + { + "epoch": 1.86, + "grad_norm": 4.96568489074707, + "learning_rate": 6.955002082850628e-06, + "logits/chosen": -0.5150178670883179, + "logits/rejected": -0.6537960767745972, + "logps/chosen": -57.481544494628906, + "logps/rejected": -90.65231323242188, + "loss": 0.6873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.622664213180542, + "rewards/margins": 6.629253387451172, + "rewards/rejected": -4.006588935852051, + "step": 7437 + }, + { + "epoch": 1.86, + "grad_norm": 6.248980522155762, + "learning_rate": 6.954278643127783e-06, + "logits/chosen": -0.45761311054229736, + "logits/rejected": -0.54677414894104, + "logps/chosen": -62.175445556640625, + "logps/rejected": -91.5655517578125, + "loss": 0.8719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.851346254348755, + "rewards/margins": 5.536856651306152, + "rewards/rejected": -2.6855101585388184, + "step": 7438 + }, + { + "epoch": 1.86, + "grad_norm": 9.435941696166992, + "learning_rate": 6.953555155112635e-06, + "logits/chosen": -0.4701746702194214, + "logits/rejected": -0.5922846794128418, + "logps/chosen": -62.1840934753418, + "logps/rejected": -89.19061279296875, + "loss": 0.6633, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9286365509033203, + "rewards/margins": 5.775972366333008, + "rewards/rejected": -2.8473360538482666, + "step": 7439 + }, + { + "epoch": 1.86, + "grad_norm": 4.675570964813232, + "learning_rate": 6.95283161882306e-06, + "logits/chosen": -0.4350220561027527, + "logits/rejected": -0.5020866990089417, + "logps/chosen": -57.78496170043945, + "logps/rejected": -103.72709655761719, + "loss": 0.6677, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8074355125427246, + "rewards/margins": 5.3424811363220215, + "rewards/rejected": -2.5350446701049805, + "step": 7440 + }, + { + "epoch": 1.86, + "grad_norm": 5.930598258972168, + "learning_rate": 6.952108034276938e-06, + "logits/chosen": -0.4348985254764557, + "logits/rejected": -0.5425367951393127, + "logps/chosen": -63.170562744140625, + "logps/rejected": -92.53524017333984, + "loss": 0.6937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.924376964569092, + "rewards/margins": 5.0744194984436035, + "rewards/rejected": -2.1500425338745117, + "step": 7441 + }, + { + "epoch": 1.86, + "grad_norm": 6.834926605224609, + "learning_rate": 6.951384401492151e-06, + "logits/chosen": -0.4387352764606476, + "logits/rejected": -0.46498560905456543, + "logps/chosen": -59.52755355834961, + "logps/rejected": -97.88268280029297, + "loss": 0.742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9667747020721436, + "rewards/margins": 5.522707939147949, + "rewards/rejected": -2.5559334754943848, + "step": 7442 + }, + { + "epoch": 1.86, + "grad_norm": 5.746587753295898, + "learning_rate": 6.950660720486581e-06, + "logits/chosen": -0.4450725317001343, + "logits/rejected": -0.4999011754989624, + "logps/chosen": -65.8638687133789, + "logps/rejected": -91.42438507080078, + "loss": 0.8308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0858755111694336, + "rewards/margins": 5.232861042022705, + "rewards/rejected": -2.1469855308532715, + "step": 7443 + }, + { + "epoch": 1.86, + "grad_norm": 7.518399715423584, + "learning_rate": 6.949936991278108e-06, + "logits/chosen": -0.45712369680404663, + "logits/rejected": -0.5372844338417053, + "logps/chosen": -59.18421936035156, + "logps/rejected": -73.02606964111328, + "loss": 0.8327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931246280670166, + "rewards/margins": 4.717823505401611, + "rewards/rejected": -1.7865769863128662, + "step": 7444 + }, + { + "epoch": 1.86, + "grad_norm": 2.877455234527588, + "learning_rate": 6.949213213884619e-06, + "logits/chosen": -0.5459730625152588, + "logits/rejected": -0.6238433122634888, + "logps/chosen": -54.79869079589844, + "logps/rejected": -94.27352905273438, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.023775100708008, + "rewards/margins": 6.490280628204346, + "rewards/rejected": -3.466505527496338, + "step": 7445 + }, + { + "epoch": 1.86, + "grad_norm": 2.888699531555176, + "learning_rate": 6.948489388323998e-06, + "logits/chosen": -0.42513611912727356, + "logits/rejected": -0.5181973576545715, + "logps/chosen": -56.63783645629883, + "logps/rejected": -85.71009826660156, + "loss": 0.6324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.149354934692383, + "rewards/margins": 4.926877021789551, + "rewards/rejected": -1.777522325515747, + "step": 7446 + }, + { + "epoch": 1.86, + "grad_norm": 5.878773212432861, + "learning_rate": 6.947765514614131e-06, + "logits/chosen": -0.4029514193534851, + "logits/rejected": -0.5227072238922119, + "logps/chosen": -64.08200073242188, + "logps/rejected": -87.9587631225586, + "loss": 0.8377, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8410418033599854, + "rewards/margins": 4.868529796600342, + "rewards/rejected": -2.0274877548217773, + "step": 7447 + }, + { + "epoch": 1.86, + "grad_norm": 4.0042619705200195, + "learning_rate": 6.947041592772906e-06, + "logits/chosen": -0.4308721423149109, + "logits/rejected": -0.5699487328529358, + "logps/chosen": -58.326663970947266, + "logps/rejected": -95.75458526611328, + "loss": 0.6071, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.194780111312866, + "rewards/margins": 6.168630123138428, + "rewards/rejected": -2.9738502502441406, + "step": 7448 + }, + { + "epoch": 1.86, + "grad_norm": 3.3603508472442627, + "learning_rate": 6.946317622818215e-06, + "logits/chosen": -0.4806862771511078, + "logits/rejected": -0.5572409629821777, + "logps/chosen": -49.2115592956543, + "logps/rejected": -81.97540283203125, + "loss": 0.6142, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8993496894836426, + "rewards/margins": 5.520135402679443, + "rewards/rejected": -2.620785713195801, + "step": 7449 + }, + { + "epoch": 1.86, + "grad_norm": 2.9167697429656982, + "learning_rate": 6.945593604767941e-06, + "logits/chosen": -0.38497495651245117, + "logits/rejected": -0.5065633058547974, + "logps/chosen": -55.26679992675781, + "logps/rejected": -87.93831634521484, + "loss": 0.6176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2316741943359375, + "rewards/margins": 5.801580429077148, + "rewards/rejected": -2.56990647315979, + "step": 7450 + }, + { + "epoch": 1.86, + "grad_norm": 5.073827266693115, + "learning_rate": 6.944869538639983e-06, + "logits/chosen": -0.4558800458908081, + "logits/rejected": -0.5404587984085083, + "logps/chosen": -42.37489700317383, + "logps/rejected": -89.27594757080078, + "loss": 0.5647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8941421508789062, + "rewards/margins": 5.394138336181641, + "rewards/rejected": -2.499995708465576, + "step": 7451 + }, + { + "epoch": 1.86, + "grad_norm": 6.47452449798584, + "learning_rate": 6.944145424452229e-06, + "logits/chosen": -0.4394068121910095, + "logits/rejected": -0.4862443208694458, + "logps/chosen": -50.11300277709961, + "logps/rejected": -84.91922760009766, + "loss": 0.7921, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.051032304763794, + "rewards/margins": 5.601108551025391, + "rewards/rejected": -2.550076484680176, + "step": 7452 + }, + { + "epoch": 1.86, + "grad_norm": 3.4952709674835205, + "learning_rate": 6.943421262222574e-06, + "logits/chosen": -0.4127904176712036, + "logits/rejected": -0.5799237489700317, + "logps/chosen": -51.938446044921875, + "logps/rejected": -78.76976776123047, + "loss": 0.6132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.292078971862793, + "rewards/margins": 6.488882064819336, + "rewards/rejected": -3.1968026161193848, + "step": 7453 + }, + { + "epoch": 1.86, + "grad_norm": 4.704322338104248, + "learning_rate": 6.9426970519689126e-06, + "logits/chosen": -0.4819260537624359, + "logits/rejected": -0.5354814529418945, + "logps/chosen": -52.74081802368164, + "logps/rejected": -105.78011322021484, + "loss": 0.6138, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.961207389831543, + "rewards/margins": 6.636383056640625, + "rewards/rejected": -3.675175666809082, + "step": 7454 + }, + { + "epoch": 1.87, + "grad_norm": 3.1106791496276855, + "learning_rate": 6.941972793709141e-06, + "logits/chosen": -0.44662436842918396, + "logits/rejected": -0.5036655068397522, + "logps/chosen": -51.809383392333984, + "logps/rejected": -91.39651489257812, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.802899122238159, + "rewards/margins": 5.568699836730957, + "rewards/rejected": -2.7658002376556396, + "step": 7455 + }, + { + "epoch": 1.87, + "grad_norm": 5.960238933563232, + "learning_rate": 6.941248487461155e-06, + "logits/chosen": -0.46367818117141724, + "logits/rejected": -0.5493446588516235, + "logps/chosen": -60.24810028076172, + "logps/rejected": -100.34066772460938, + "loss": 0.6949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9694135189056396, + "rewards/margins": 5.293271064758301, + "rewards/rejected": -2.323857307434082, + "step": 7456 + }, + { + "epoch": 1.87, + "grad_norm": 4.728906154632568, + "learning_rate": 6.940524133242854e-06, + "logits/chosen": -0.36129117012023926, + "logits/rejected": -0.4564207196235657, + "logps/chosen": -60.56214904785156, + "logps/rejected": -82.14051055908203, + "loss": 0.7324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.989321231842041, + "rewards/margins": 4.48480224609375, + "rewards/rejected": -1.495481014251709, + "step": 7457 + }, + { + "epoch": 1.87, + "grad_norm": 3.2093863487243652, + "learning_rate": 6.939799731072138e-06, + "logits/chosen": -0.542818546295166, + "logits/rejected": -0.568699300289154, + "logps/chosen": -54.9892692565918, + "logps/rejected": -108.78572845458984, + "loss": 0.6082, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1606853008270264, + "rewards/margins": 6.395330429077148, + "rewards/rejected": -3.234645128250122, + "step": 7458 + }, + { + "epoch": 1.87, + "grad_norm": 4.758244037628174, + "learning_rate": 6.939075280966907e-06, + "logits/chosen": -0.4183594286441803, + "logits/rejected": -0.46328043937683105, + "logps/chosen": -51.9471321105957, + "logps/rejected": -86.18531799316406, + "loss": 0.7505, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.010504722595215, + "rewards/margins": 4.425832748413086, + "rewards/rejected": -1.415327787399292, + "step": 7459 + }, + { + "epoch": 1.87, + "grad_norm": 5.59932279586792, + "learning_rate": 6.938350782945065e-06, + "logits/chosen": -0.5122774839401245, + "logits/rejected": -0.5677512884140015, + "logps/chosen": -63.94316864013672, + "logps/rejected": -94.19812774658203, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2874770164489746, + "rewards/margins": 5.216366291046143, + "rewards/rejected": -1.9288893938064575, + "step": 7460 + }, + { + "epoch": 1.87, + "grad_norm": 10.55352783203125, + "learning_rate": 6.937626237024513e-06, + "logits/chosen": -0.4731438457965851, + "logits/rejected": -0.5639484524726868, + "logps/chosen": -59.46686553955078, + "logps/rejected": -90.32052612304688, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7244856357574463, + "rewards/margins": 6.025623321533203, + "rewards/rejected": -3.301137924194336, + "step": 7461 + }, + { + "epoch": 1.87, + "grad_norm": 6.746569633483887, + "learning_rate": 6.936901643223154e-06, + "logits/chosen": -0.32508328557014465, + "logits/rejected": -0.41766589879989624, + "logps/chosen": -68.31739807128906, + "logps/rejected": -97.97028350830078, + "loss": 0.7799, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0177536010742188, + "rewards/margins": 5.891519546508789, + "rewards/rejected": -2.873765707015991, + "step": 7462 + }, + { + "epoch": 1.87, + "grad_norm": 4.67694091796875, + "learning_rate": 6.9361770015588966e-06, + "logits/chosen": -0.5065507292747498, + "logits/rejected": -0.5627609491348267, + "logps/chosen": -47.18514633178711, + "logps/rejected": -97.47406005859375, + "loss": 0.6666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9493985176086426, + "rewards/margins": 5.807838439941406, + "rewards/rejected": -2.8584399223327637, + "step": 7463 + }, + { + "epoch": 1.87, + "grad_norm": 8.781411170959473, + "learning_rate": 6.935452312049646e-06, + "logits/chosen": -0.40692824125289917, + "logits/rejected": -0.4794781506061554, + "logps/chosen": -52.53759765625, + "logps/rejected": -89.95333099365234, + "loss": 0.8585, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.969379425048828, + "rewards/margins": 5.071019172668457, + "rewards/rejected": -2.10163950920105, + "step": 7464 + }, + { + "epoch": 1.87, + "grad_norm": 3.192955493927002, + "learning_rate": 6.93472757471331e-06, + "logits/chosen": -0.43205612897872925, + "logits/rejected": -0.5529234409332275, + "logps/chosen": -57.767765045166016, + "logps/rejected": -91.6042251586914, + "loss": 0.6261, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.123586654663086, + "rewards/margins": 6.127232551574707, + "rewards/rejected": -3.003645896911621, + "step": 7465 + }, + { + "epoch": 1.87, + "grad_norm": 4.268892765045166, + "learning_rate": 6.934002789567797e-06, + "logits/chosen": -0.37612706422805786, + "logits/rejected": -0.4825819730758667, + "logps/chosen": -54.48724365234375, + "logps/rejected": -108.7929916381836, + "loss": 0.6658, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.84505033493042, + "rewards/margins": 5.5105695724487305, + "rewards/rejected": -2.6655187606811523, + "step": 7466 + }, + { + "epoch": 1.87, + "grad_norm": 6.15437126159668, + "learning_rate": 6.9332779566310195e-06, + "logits/chosen": -0.4462655186653137, + "logits/rejected": -0.582796573638916, + "logps/chosen": -57.95085144042969, + "logps/rejected": -83.79926300048828, + "loss": 0.6883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.801591634750366, + "rewards/margins": 5.503678321838379, + "rewards/rejected": -2.7020864486694336, + "step": 7467 + }, + { + "epoch": 1.87, + "grad_norm": 4.05079460144043, + "learning_rate": 6.932553075920886e-06, + "logits/chosen": -0.4661426544189453, + "logits/rejected": -0.6113327145576477, + "logps/chosen": -54.50823211669922, + "logps/rejected": -87.95586395263672, + "loss": 0.6055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.104623556137085, + "rewards/margins": 5.476655006408691, + "rewards/rejected": -2.3720319271087646, + "step": 7468 + }, + { + "epoch": 1.87, + "grad_norm": 7.649241924285889, + "learning_rate": 6.9318281474553115e-06, + "logits/chosen": -0.41826027631759644, + "logits/rejected": -0.4963369071483612, + "logps/chosen": -54.231815338134766, + "logps/rejected": -92.73838806152344, + "loss": 0.6974, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0653929710388184, + "rewards/margins": 5.471901893615723, + "rewards/rejected": -2.406508684158325, + "step": 7469 + }, + { + "epoch": 1.87, + "grad_norm": 10.901058197021484, + "learning_rate": 6.9311031712522085e-06, + "logits/chosen": -0.4794887900352478, + "logits/rejected": -0.5544441938400269, + "logps/chosen": -57.4976921081543, + "logps/rejected": -85.69806671142578, + "loss": 0.7806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.277430772781372, + "rewards/margins": 5.522150993347168, + "rewards/rejected": -2.244720697402954, + "step": 7470 + }, + { + "epoch": 1.87, + "grad_norm": 5.122284889221191, + "learning_rate": 6.930378147329491e-06, + "logits/chosen": -0.43827661871910095, + "logits/rejected": -0.5318785905838013, + "logps/chosen": -62.183834075927734, + "logps/rejected": -81.984619140625, + "loss": 0.7282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9646143913269043, + "rewards/margins": 5.173147678375244, + "rewards/rejected": -2.2085330486297607, + "step": 7471 + }, + { + "epoch": 1.87, + "grad_norm": 5.250616550445557, + "learning_rate": 6.929653075705077e-06, + "logits/chosen": -0.4927231967449188, + "logits/rejected": -0.5640451312065125, + "logps/chosen": -60.889949798583984, + "logps/rejected": -79.68618774414062, + "loss": 0.7188, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8443233966827393, + "rewards/margins": 5.2342329025268555, + "rewards/rejected": -2.389909505844116, + "step": 7472 + }, + { + "epoch": 1.87, + "grad_norm": 3.007854700088501, + "learning_rate": 6.928927956396882e-06, + "logits/chosen": -0.510939359664917, + "logits/rejected": -0.5861403346061707, + "logps/chosen": -53.49049758911133, + "logps/rejected": -102.56858825683594, + "loss": 0.6353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9499337673187256, + "rewards/margins": 6.506946086883545, + "rewards/rejected": -3.5570123195648193, + "step": 7473 + }, + { + "epoch": 1.87, + "grad_norm": 3.921401023864746, + "learning_rate": 6.928202789422828e-06, + "logits/chosen": -0.47764092683792114, + "logits/rejected": -0.4864928424358368, + "logps/chosen": -51.6024055480957, + "logps/rejected": -97.90316772460938, + "loss": 0.7057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.034083366394043, + "rewards/margins": 5.3060150146484375, + "rewards/rejected": -2.2719314098358154, + "step": 7474 + }, + { + "epoch": 1.87, + "grad_norm": 9.196646690368652, + "learning_rate": 6.927477574800829e-06, + "logits/chosen": -0.3955739140510559, + "logits/rejected": -0.44744253158569336, + "logps/chosen": -55.17232894897461, + "logps/rejected": -94.5195083618164, + "loss": 0.7169, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2399587631225586, + "rewards/margins": 5.341066837310791, + "rewards/rejected": -2.1011083126068115, + "step": 7475 + }, + { + "epoch": 1.87, + "grad_norm": 13.32700252532959, + "learning_rate": 6.926752312548811e-06, + "logits/chosen": -0.4785957336425781, + "logits/rejected": -0.5762476325035095, + "logps/chosen": -62.12687683105469, + "logps/rejected": -81.48184204101562, + "loss": 0.7778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7890214920043945, + "rewards/margins": 5.2513532638549805, + "rewards/rejected": -2.462331771850586, + "step": 7476 + }, + { + "epoch": 1.87, + "grad_norm": 11.732429504394531, + "learning_rate": 6.926027002684695e-06, + "logits/chosen": -0.4151557683944702, + "logits/rejected": -0.4922393262386322, + "logps/chosen": -57.63265609741211, + "logps/rejected": -100.7357177734375, + "loss": 0.6701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6677260398864746, + "rewards/margins": 4.546782493591309, + "rewards/rejected": -1.8790560960769653, + "step": 7477 + }, + { + "epoch": 1.87, + "grad_norm": 7.155299663543701, + "learning_rate": 6.925301645226401e-06, + "logits/chosen": -0.48313164710998535, + "logits/rejected": -0.5878890752792358, + "logps/chosen": -64.03836059570312, + "logps/rejected": -82.75228881835938, + "loss": 0.7313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.427051305770874, + "rewards/margins": 5.937647342681885, + "rewards/rejected": -2.51059627532959, + "step": 7478 + }, + { + "epoch": 1.87, + "grad_norm": 21.81519889831543, + "learning_rate": 6.924576240191856e-06, + "logits/chosen": -0.3854762017726898, + "logits/rejected": -0.49609053134918213, + "logps/chosen": -70.32196044921875, + "logps/rejected": -93.88703155517578, + "loss": 0.8457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.1986844539642334, + "rewards/margins": 4.693355083465576, + "rewards/rejected": -2.494670867919922, + "step": 7479 + }, + { + "epoch": 1.87, + "grad_norm": 4.336169719696045, + "learning_rate": 6.9238507875989855e-06, + "logits/chosen": -0.48241204023361206, + "logits/rejected": -0.5645026564598083, + "logps/chosen": -48.650760650634766, + "logps/rejected": -90.1388168334961, + "loss": 0.6376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.007270336151123, + "rewards/margins": 6.056262016296387, + "rewards/rejected": -3.048992156982422, + "step": 7480 + }, + { + "epoch": 1.87, + "grad_norm": 4.375109672546387, + "learning_rate": 6.923125287465715e-06, + "logits/chosen": -0.4472598433494568, + "logits/rejected": -0.4928269386291504, + "logps/chosen": -54.49093246459961, + "logps/rejected": -93.87220001220703, + "loss": 0.6752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8535633087158203, + "rewards/margins": 5.093689441680908, + "rewards/rejected": -2.2401256561279297, + "step": 7481 + }, + { + "epoch": 1.87, + "grad_norm": 8.081730842590332, + "learning_rate": 6.922399739809974e-06, + "logits/chosen": -0.45355165004730225, + "logits/rejected": -0.5494720935821533, + "logps/chosen": -55.276222229003906, + "logps/rejected": -79.58960723876953, + "loss": 0.8299, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6422815322875977, + "rewards/margins": 4.86073637008667, + "rewards/rejected": -2.2184548377990723, + "step": 7482 + }, + { + "epoch": 1.87, + "grad_norm": 10.665728569030762, + "learning_rate": 6.921674144649691e-06, + "logits/chosen": -0.443991482257843, + "logits/rejected": -0.5419678092002869, + "logps/chosen": -46.71453094482422, + "logps/rejected": -93.03895568847656, + "loss": 0.6901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.104584217071533, + "rewards/margins": 6.405212879180908, + "rewards/rejected": -3.3006293773651123, + "step": 7483 + }, + { + "epoch": 1.87, + "grad_norm": 7.01030158996582, + "learning_rate": 6.920948502002795e-06, + "logits/chosen": -0.41452133655548096, + "logits/rejected": -0.453030526638031, + "logps/chosen": -53.61505126953125, + "logps/rejected": -104.4617691040039, + "loss": 0.6861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.380643606185913, + "rewards/margins": 6.17834997177124, + "rewards/rejected": -2.797706365585327, + "step": 7484 + }, + { + "epoch": 1.87, + "grad_norm": 7.751875400543213, + "learning_rate": 6.920222811887218e-06, + "logits/chosen": -0.4401741027832031, + "logits/rejected": -0.5099719762802124, + "logps/chosen": -61.294456481933594, + "logps/rejected": -97.58280944824219, + "loss": 0.8257, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7651329040527344, + "rewards/margins": 5.132663726806641, + "rewards/rejected": -2.3675308227539062, + "step": 7485 + }, + { + "epoch": 1.87, + "grad_norm": 10.943343162536621, + "learning_rate": 6.919497074320895e-06, + "logits/chosen": -0.48653027415275574, + "logits/rejected": -0.5665345191955566, + "logps/chosen": -59.64691162109375, + "logps/rejected": -89.48921966552734, + "loss": 0.868, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.703990936279297, + "rewards/margins": 4.191072463989258, + "rewards/rejected": -1.487081527709961, + "step": 7486 + }, + { + "epoch": 1.87, + "grad_norm": 3.554705858230591, + "learning_rate": 6.918771289321754e-06, + "logits/chosen": -0.4089910387992859, + "logits/rejected": -0.5532744526863098, + "logps/chosen": -72.16632080078125, + "logps/rejected": -96.24301147460938, + "loss": 0.6057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4665417671203613, + "rewards/margins": 5.993035793304443, + "rewards/rejected": -2.526494026184082, + "step": 7487 + }, + { + "epoch": 1.87, + "grad_norm": 5.304089546203613, + "learning_rate": 6.918045456907736e-06, + "logits/chosen": -0.3774034380912781, + "logits/rejected": -0.5024886131286621, + "logps/chosen": -61.362701416015625, + "logps/rejected": -80.71603393554688, + "loss": 0.7368, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.89829683303833, + "rewards/margins": 4.704156398773193, + "rewards/rejected": -1.8058598041534424, + "step": 7488 + }, + { + "epoch": 1.87, + "grad_norm": 8.76972770690918, + "learning_rate": 6.917319577096775e-06, + "logits/chosen": -0.37252548336982727, + "logits/rejected": -0.5039899349212646, + "logps/chosen": -65.13475036621094, + "logps/rejected": -83.81085968017578, + "loss": 0.7245, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.911825656890869, + "rewards/margins": 5.185619831085205, + "rewards/rejected": -2.273794174194336, + "step": 7489 + }, + { + "epoch": 1.87, + "grad_norm": 7.093813896179199, + "learning_rate": 6.9165936499068065e-06, + "logits/chosen": -0.4300101399421692, + "logits/rejected": -0.4533420205116272, + "logps/chosen": -57.299705505371094, + "logps/rejected": -89.0325927734375, + "loss": 0.8282, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8503758907318115, + "rewards/margins": 3.4409546852111816, + "rewards/rejected": -0.5905787944793701, + "step": 7490 + }, + { + "epoch": 1.87, + "grad_norm": 3.132915735244751, + "learning_rate": 6.915867675355771e-06, + "logits/chosen": -0.5044909119606018, + "logits/rejected": -0.5714043378829956, + "logps/chosen": -45.264251708984375, + "logps/rejected": -88.45730590820312, + "loss": 0.5902, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.147995948791504, + "rewards/margins": 5.3674774169921875, + "rewards/rejected": -2.2194809913635254, + "step": 7491 + }, + { + "epoch": 1.87, + "grad_norm": 4.60641622543335, + "learning_rate": 6.915141653461608e-06, + "logits/chosen": -0.38518285751342773, + "logits/rejected": -0.49013814330101013, + "logps/chosen": -54.695831298828125, + "logps/rejected": -97.76701354980469, + "loss": 0.6106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2700860500335693, + "rewards/margins": 6.2281813621521, + "rewards/rejected": -2.958095073699951, + "step": 7492 + }, + { + "epoch": 1.87, + "grad_norm": 5.593782901763916, + "learning_rate": 6.914415584242256e-06, + "logits/chosen": -0.4045884609222412, + "logits/rejected": -0.5104507803916931, + "logps/chosen": -66.66426086425781, + "logps/rejected": -91.61890411376953, + "loss": 0.7354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.77540922164917, + "rewards/margins": 4.873426914215088, + "rewards/rejected": -2.098017454147339, + "step": 7493 + }, + { + "epoch": 1.87, + "grad_norm": 10.444618225097656, + "learning_rate": 6.913689467715661e-06, + "logits/chosen": -0.4854537546634674, + "logits/rejected": -0.5485203862190247, + "logps/chosen": -52.079891204833984, + "logps/rejected": -100.64707946777344, + "loss": 0.7262, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.891481399536133, + "rewards/margins": 5.823458671569824, + "rewards/rejected": -2.9319777488708496, + "step": 7494 + }, + { + "epoch": 1.88, + "grad_norm": 18.963333129882812, + "learning_rate": 6.9129633038997625e-06, + "logits/chosen": -0.4364921450614929, + "logits/rejected": -0.5237303376197815, + "logps/chosen": -58.0308723449707, + "logps/rejected": -84.41485595703125, + "loss": 0.8587, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3419618606567383, + "rewards/margins": 5.150718688964844, + "rewards/rejected": -2.8087570667266846, + "step": 7495 + }, + { + "epoch": 1.88, + "grad_norm": 11.951203346252441, + "learning_rate": 6.912237092812505e-06, + "logits/chosen": -0.5491700172424316, + "logits/rejected": -0.5634104609489441, + "logps/chosen": -53.6494255065918, + "logps/rejected": -94.12213134765625, + "loss": 0.8377, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.897244930267334, + "rewards/margins": 4.525472640991211, + "rewards/rejected": -1.6282274723052979, + "step": 7496 + }, + { + "epoch": 1.88, + "grad_norm": 3.657589912414551, + "learning_rate": 6.911510834471836e-06, + "logits/chosen": -0.3721204102039337, + "logits/rejected": -0.4596836566925049, + "logps/chosen": -51.39566421508789, + "logps/rejected": -82.0750961303711, + "loss": 0.5812, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0170891284942627, + "rewards/margins": 5.415107250213623, + "rewards/rejected": -2.3980185985565186, + "step": 7497 + }, + { + "epoch": 1.88, + "grad_norm": 2.939426898956299, + "learning_rate": 6.910784528895702e-06, + "logits/chosen": -0.42574530839920044, + "logits/rejected": -0.5471786260604858, + "logps/chosen": -58.551334381103516, + "logps/rejected": -88.54347229003906, + "loss": 0.6542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8302721977233887, + "rewards/margins": 6.475821018218994, + "rewards/rejected": -3.6455488204956055, + "step": 7498 + }, + { + "epoch": 1.88, + "grad_norm": 15.319087028503418, + "learning_rate": 6.910058176102048e-06, + "logits/chosen": -0.44776901602745056, + "logits/rejected": -0.5157145261764526, + "logps/chosen": -56.7720947265625, + "logps/rejected": -84.68804931640625, + "loss": 0.8325, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9445621967315674, + "rewards/margins": 4.161161422729492, + "rewards/rejected": -1.2165989875793457, + "step": 7499 + }, + { + "epoch": 1.88, + "grad_norm": 3.2183640003204346, + "learning_rate": 6.909331776108826e-06, + "logits/chosen": -0.4776180684566498, + "logits/rejected": -0.5579351186752319, + "logps/chosen": -52.144630432128906, + "logps/rejected": -103.18534851074219, + "loss": 0.6, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9560065269470215, + "rewards/margins": 5.845816135406494, + "rewards/rejected": -2.8898091316223145, + "step": 7500 + }, + { + "epoch": 1.88, + "grad_norm": 4.446127414703369, + "learning_rate": 6.908605328933986e-06, + "logits/chosen": -0.4570155143737793, + "logits/rejected": -0.48064273595809937, + "logps/chosen": -53.34104919433594, + "logps/rejected": -83.62711334228516, + "loss": 0.7711, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.931581974029541, + "rewards/margins": 4.488725185394287, + "rewards/rejected": -1.5571436882019043, + "step": 7501 + }, + { + "epoch": 1.88, + "grad_norm": 4.152524948120117, + "learning_rate": 6.9078788345954765e-06, + "logits/chosen": -0.3869776129722595, + "logits/rejected": -0.47108182311058044, + "logps/chosen": -60.23731231689453, + "logps/rejected": -98.05138397216797, + "loss": 0.7203, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.967104911804199, + "rewards/margins": 5.136645317077637, + "rewards/rejected": -2.1695404052734375, + "step": 7502 + }, + { + "epoch": 1.88, + "grad_norm": 2.8632876873016357, + "learning_rate": 6.907152293111252e-06, + "logits/chosen": -0.41293489933013916, + "logits/rejected": -0.48525866866111755, + "logps/chosen": -62.59617614746094, + "logps/rejected": -83.95732116699219, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9541332721710205, + "rewards/margins": 5.660056114196777, + "rewards/rejected": -2.705923080444336, + "step": 7503 + }, + { + "epoch": 1.88, + "grad_norm": 3.7891063690185547, + "learning_rate": 6.9064257044992655e-06, + "logits/chosen": -0.47647416591644287, + "logits/rejected": -0.576632022857666, + "logps/chosen": -44.786373138427734, + "logps/rejected": -80.40306091308594, + "loss": 0.5886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177154064178467, + "rewards/margins": 4.8212690353393555, + "rewards/rejected": -1.6441153287887573, + "step": 7504 + }, + { + "epoch": 1.88, + "grad_norm": 4.160882949829102, + "learning_rate": 6.905699068777473e-06, + "logits/chosen": -0.44936704635620117, + "logits/rejected": -0.5181776285171509, + "logps/chosen": -51.31858444213867, + "logps/rejected": -102.64813232421875, + "loss": 0.7196, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8059802055358887, + "rewards/margins": 5.33845853805542, + "rewards/rejected": -2.5324783325195312, + "step": 7505 + }, + { + "epoch": 1.88, + "grad_norm": 7.1812663078308105, + "learning_rate": 6.9049723859638295e-06, + "logits/chosen": -0.48472344875335693, + "logits/rejected": -0.5106136202812195, + "logps/chosen": -48.689842224121094, + "logps/rejected": -106.3243408203125, + "loss": 0.7306, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.021415948867798, + "rewards/margins": 5.240145683288574, + "rewards/rejected": -2.2187294960021973, + "step": 7506 + }, + { + "epoch": 1.88, + "grad_norm": 7.48154878616333, + "learning_rate": 6.904245656076292e-06, + "logits/chosen": -0.5214112997055054, + "logits/rejected": -0.49415552616119385, + "logps/chosen": -50.73086166381836, + "logps/rejected": -108.52714538574219, + "loss": 0.8394, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9133923053741455, + "rewards/margins": 4.870686054229736, + "rewards/rejected": -1.9572936296463013, + "step": 7507 + }, + { + "epoch": 1.88, + "grad_norm": 4.380408763885498, + "learning_rate": 6.903518879132818e-06, + "logits/chosen": -0.41411420702934265, + "logits/rejected": -0.49198809266090393, + "logps/chosen": -55.73161697387695, + "logps/rejected": -97.79887390136719, + "loss": 0.6707, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7713112831115723, + "rewards/margins": 5.6398515701293945, + "rewards/rejected": -2.8685402870178223, + "step": 7508 + }, + { + "epoch": 1.88, + "grad_norm": 14.146224021911621, + "learning_rate": 6.902792055151368e-06, + "logits/chosen": -0.38391244411468506, + "logits/rejected": -0.46663904190063477, + "logps/chosen": -60.52680206298828, + "logps/rejected": -95.29466247558594, + "loss": 0.7317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9831080436706543, + "rewards/margins": 5.3335113525390625, + "rewards/rejected": -2.3504037857055664, + "step": 7509 + }, + { + "epoch": 1.88, + "grad_norm": 7.546590805053711, + "learning_rate": 6.902065184149903e-06, + "logits/chosen": -0.44901344180107117, + "logits/rejected": -0.5258215665817261, + "logps/chosen": -54.5485725402832, + "logps/rejected": -99.31381225585938, + "loss": 0.7573, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.15501070022583, + "rewards/margins": 5.579599857330322, + "rewards/rejected": -2.424589157104492, + "step": 7510 + }, + { + "epoch": 1.88, + "grad_norm": 5.332630634307861, + "learning_rate": 6.901338266146384e-06, + "logits/chosen": -0.5105261206626892, + "logits/rejected": -0.5994676947593689, + "logps/chosen": -65.43901062011719, + "logps/rejected": -106.78807067871094, + "loss": 0.7731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.998781442642212, + "rewards/margins": 6.781614780426025, + "rewards/rejected": -3.7828338146209717, + "step": 7511 + }, + { + "epoch": 1.88, + "grad_norm": 4.052191257476807, + "learning_rate": 6.900611301158775e-06, + "logits/chosen": -0.4558166265487671, + "logits/rejected": -0.5483844876289368, + "logps/chosen": -55.82625961303711, + "logps/rejected": -92.89970397949219, + "loss": 0.6438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.798989772796631, + "rewards/margins": 5.6874518394470215, + "rewards/rejected": -2.8884623050689697, + "step": 7512 + }, + { + "epoch": 1.88, + "grad_norm": 11.63176155090332, + "learning_rate": 6.899884289205038e-06, + "logits/chosen": -0.4099549353122711, + "logits/rejected": -0.48386844992637634, + "logps/chosen": -60.63747787475586, + "logps/rejected": -86.10453033447266, + "loss": 0.7203, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.74955153465271, + "rewards/margins": 4.598846435546875, + "rewards/rejected": -1.8492951393127441, + "step": 7513 + }, + { + "epoch": 1.88, + "grad_norm": 18.500930786132812, + "learning_rate": 6.8991572303031396e-06, + "logits/chosen": -0.4704996943473816, + "logits/rejected": -0.5266557931900024, + "logps/chosen": -69.75071716308594, + "logps/rejected": -99.32303619384766, + "loss": 0.8442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.523435354232788, + "rewards/margins": 4.8525872230529785, + "rewards/rejected": -2.3291518688201904, + "step": 7514 + }, + { + "epoch": 1.88, + "grad_norm": 5.610967636108398, + "learning_rate": 6.898430124471047e-06, + "logits/chosen": -0.509207546710968, + "logits/rejected": -0.5854172110557556, + "logps/chosen": -55.376678466796875, + "logps/rejected": -110.19450378417969, + "loss": 0.6354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.744436502456665, + "rewards/margins": 6.666316032409668, + "rewards/rejected": -3.921879291534424, + "step": 7515 + }, + { + "epoch": 1.88, + "grad_norm": 5.326982498168945, + "learning_rate": 6.897702971726724e-06, + "logits/chosen": -0.44728320837020874, + "logits/rejected": -0.5980589389801025, + "logps/chosen": -54.5770378112793, + "logps/rejected": -83.08990478515625, + "loss": 0.5878, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9516940116882324, + "rewards/margins": 6.227150917053223, + "rewards/rejected": -3.275456428527832, + "step": 7516 + }, + { + "epoch": 1.88, + "grad_norm": 8.6843900680542, + "learning_rate": 6.896975772088146e-06, + "logits/chosen": -0.4588325023651123, + "logits/rejected": -0.5001218914985657, + "logps/chosen": -66.9246597290039, + "logps/rejected": -96.78688049316406, + "loss": 0.7924, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.635204792022705, + "rewards/margins": 5.0188751220703125, + "rewards/rejected": -2.3836700916290283, + "step": 7517 + }, + { + "epoch": 1.88, + "grad_norm": 6.971105575561523, + "learning_rate": 6.896248525573276e-06, + "logits/chosen": -0.37776079773902893, + "logits/rejected": -0.4631333649158478, + "logps/chosen": -66.01248168945312, + "logps/rejected": -108.50151062011719, + "loss": 0.6782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.55850887298584, + "rewards/margins": 5.7878594398498535, + "rewards/rejected": -3.2293508052825928, + "step": 7518 + }, + { + "epoch": 1.88, + "grad_norm": 8.419977188110352, + "learning_rate": 6.895521232200089e-06, + "logits/chosen": -0.4661250412464142, + "logits/rejected": -0.5198508501052856, + "logps/chosen": -55.90073776245117, + "logps/rejected": -89.94105529785156, + "loss": 0.8888, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8814823627471924, + "rewards/margins": 5.212896347045898, + "rewards/rejected": -2.331413745880127, + "step": 7519 + }, + { + "epoch": 1.88, + "grad_norm": 5.2179179191589355, + "learning_rate": 6.894793891986557e-06, + "logits/chosen": -0.43697863817214966, + "logits/rejected": -0.5242790579795837, + "logps/chosen": -59.015201568603516, + "logps/rejected": -87.21363830566406, + "loss": 0.7238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0138235092163086, + "rewards/margins": 5.530709266662598, + "rewards/rejected": -2.516885995864868, + "step": 7520 + }, + { + "epoch": 1.88, + "grad_norm": 23.23768424987793, + "learning_rate": 6.8940665049506516e-06, + "logits/chosen": -0.4428808391094208, + "logits/rejected": -0.5904926657676697, + "logps/chosen": -61.264808654785156, + "logps/rejected": -88.8494644165039, + "loss": 0.7144, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.793126344680786, + "rewards/margins": 6.85739803314209, + "rewards/rejected": -4.064272403717041, + "step": 7521 + }, + { + "epoch": 1.88, + "grad_norm": 5.416138172149658, + "learning_rate": 6.893339071110348e-06, + "logits/chosen": -0.5378990769386292, + "logits/rejected": -0.6250123381614685, + "logps/chosen": -46.98853302001953, + "logps/rejected": -87.2107925415039, + "loss": 0.6527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0687499046325684, + "rewards/margins": 5.739227771759033, + "rewards/rejected": -2.670478343963623, + "step": 7522 + }, + { + "epoch": 1.88, + "grad_norm": 4.245150566101074, + "learning_rate": 6.8926115904836226e-06, + "logits/chosen": -0.45236077904701233, + "logits/rejected": -0.5069608688354492, + "logps/chosen": -51.516998291015625, + "logps/rejected": -99.5069580078125, + "loss": 0.6749, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.852783441543579, + "rewards/margins": 5.110406398773193, + "rewards/rejected": -2.2576231956481934, + "step": 7523 + }, + { + "epoch": 1.88, + "grad_norm": 5.018786430358887, + "learning_rate": 6.891884063088452e-06, + "logits/chosen": -0.5141529440879822, + "logits/rejected": -0.5774492621421814, + "logps/chosen": -45.69590759277344, + "logps/rejected": -83.70187377929688, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.756742000579834, + "rewards/margins": 5.287866115570068, + "rewards/rejected": -2.5311243534088135, + "step": 7524 + }, + { + "epoch": 1.88, + "grad_norm": 12.574990272521973, + "learning_rate": 6.891156488942812e-06, + "logits/chosen": -0.405906081199646, + "logits/rejected": -0.465648889541626, + "logps/chosen": -53.68448257446289, + "logps/rejected": -98.10891723632812, + "loss": 0.7976, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6374566555023193, + "rewards/margins": 5.178972244262695, + "rewards/rejected": -2.541515350341797, + "step": 7525 + }, + { + "epoch": 1.88, + "grad_norm": 5.682594299316406, + "learning_rate": 6.890428868064686e-06, + "logits/chosen": -0.444935142993927, + "logits/rejected": -0.5391484498977661, + "logps/chosen": -55.725337982177734, + "logps/rejected": -90.63770294189453, + "loss": 0.7081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.006624221801758, + "rewards/margins": 5.383429527282715, + "rewards/rejected": -2.376805067062378, + "step": 7526 + }, + { + "epoch": 1.88, + "grad_norm": 7.684499740600586, + "learning_rate": 6.88970120047205e-06, + "logits/chosen": -0.4557686448097229, + "logits/rejected": -0.5249126553535461, + "logps/chosen": -49.96039581298828, + "logps/rejected": -101.66187286376953, + "loss": 0.6287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.08066463470459, + "rewards/margins": 6.268446445465088, + "rewards/rejected": -3.187781810760498, + "step": 7527 + }, + { + "epoch": 1.88, + "grad_norm": 3.0643458366394043, + "learning_rate": 6.888973486182888e-06, + "logits/chosen": -0.48284173011779785, + "logits/rejected": -0.5898013710975647, + "logps/chosen": -44.658233642578125, + "logps/rejected": -71.61519622802734, + "loss": 0.6322, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1329643726348877, + "rewards/margins": 5.750638484954834, + "rewards/rejected": -2.6176743507385254, + "step": 7528 + }, + { + "epoch": 1.88, + "grad_norm": 6.264185905456543, + "learning_rate": 6.888245725215184e-06, + "logits/chosen": -0.5477433204650879, + "logits/rejected": -0.5750727653503418, + "logps/chosen": -56.53129577636719, + "logps/rejected": -93.43928527832031, + "loss": 0.7579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094635486602783, + "rewards/margins": 5.406764984130859, + "rewards/rejected": -2.3121302127838135, + "step": 7529 + }, + { + "epoch": 1.88, + "grad_norm": 5.68253231048584, + "learning_rate": 6.887517917586917e-06, + "logits/chosen": -0.4356144666671753, + "logits/rejected": -0.5443400144577026, + "logps/chosen": -63.373146057128906, + "logps/rejected": -100.72130584716797, + "loss": 0.7002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.674919605255127, + "rewards/margins": 5.662901878356934, + "rewards/rejected": -2.9879822731018066, + "step": 7530 + }, + { + "epoch": 1.88, + "grad_norm": 4.473787307739258, + "learning_rate": 6.886790063316075e-06, + "logits/chosen": -0.41023242473602295, + "logits/rejected": -0.5122220516204834, + "logps/chosen": -64.4463882446289, + "logps/rejected": -100.93153381347656, + "loss": 0.6372, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880729913711548, + "rewards/margins": 5.2720627784729, + "rewards/rejected": -2.3913326263427734, + "step": 7531 + }, + { + "epoch": 1.88, + "grad_norm": 4.070603370666504, + "learning_rate": 6.886062162420645e-06, + "logits/chosen": -0.5153954029083252, + "logits/rejected": -0.5629523992538452, + "logps/chosen": -48.35176086425781, + "logps/rejected": -93.94902038574219, + "loss": 0.6748, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3718180656433105, + "rewards/margins": 5.494482517242432, + "rewards/rejected": -2.122664213180542, + "step": 7532 + }, + { + "epoch": 1.88, + "grad_norm": 18.679052352905273, + "learning_rate": 6.885334214918612e-06, + "logits/chosen": -0.46146273612976074, + "logits/rejected": -0.524573564529419, + "logps/chosen": -52.274147033691406, + "logps/rejected": -92.09237670898438, + "loss": 0.7527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.244518995285034, + "rewards/margins": 4.739621162414551, + "rewards/rejected": -1.4951021671295166, + "step": 7533 + }, + { + "epoch": 1.88, + "grad_norm": 6.63615083694458, + "learning_rate": 6.884606220827965e-06, + "logits/chosen": -0.4958057701587677, + "logits/rejected": -0.5402611494064331, + "logps/chosen": -49.871341705322266, + "logps/rejected": -92.9642333984375, + "loss": 0.7371, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6752655506134033, + "rewards/margins": 5.198493003845215, + "rewards/rejected": -2.5232272148132324, + "step": 7534 + }, + { + "epoch": 1.89, + "grad_norm": 4.485604286193848, + "learning_rate": 6.883878180166695e-06, + "logits/chosen": -0.479729026556015, + "logits/rejected": -0.5671601295471191, + "logps/chosen": -46.628440856933594, + "logps/rejected": -74.3386001586914, + "loss": 0.6486, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0065386295318604, + "rewards/margins": 4.254000186920166, + "rewards/rejected": -1.2474615573883057, + "step": 7535 + }, + { + "epoch": 1.89, + "grad_norm": 5.908108711242676, + "learning_rate": 6.88315009295279e-06, + "logits/chosen": -0.42382463812828064, + "logits/rejected": -0.4984169006347656, + "logps/chosen": -53.31636047363281, + "logps/rejected": -87.58061218261719, + "loss": 0.6646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6602704524993896, + "rewards/margins": 5.512343406677246, + "rewards/rejected": -2.8520731925964355, + "step": 7536 + }, + { + "epoch": 1.89, + "grad_norm": 7.989561557769775, + "learning_rate": 6.882421959204244e-06, + "logits/chosen": -0.4778788685798645, + "logits/rejected": -0.5074045658111572, + "logps/chosen": -46.755706787109375, + "logps/rejected": -112.84519958496094, + "loss": 0.6472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0116214752197266, + "rewards/margins": 5.2702813148498535, + "rewards/rejected": -2.2586593627929688, + "step": 7537 + }, + { + "epoch": 1.89, + "grad_norm": 3.095289468765259, + "learning_rate": 6.881693778939049e-06, + "logits/chosen": -0.4299706816673279, + "logits/rejected": -0.5424342155456543, + "logps/chosen": -50.181114196777344, + "logps/rejected": -83.60533905029297, + "loss": 0.614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1156294345855713, + "rewards/margins": 4.878037452697754, + "rewards/rejected": -1.762407660484314, + "step": 7538 + }, + { + "epoch": 1.89, + "grad_norm": 3.667346954345703, + "learning_rate": 6.880965552175198e-06, + "logits/chosen": -0.37778952717781067, + "logits/rejected": -0.5329758524894714, + "logps/chosen": -55.78974533081055, + "logps/rejected": -66.3182601928711, + "loss": 0.68, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.294952869415283, + "rewards/margins": 5.5417304039001465, + "rewards/rejected": -2.2467780113220215, + "step": 7539 + }, + { + "epoch": 1.89, + "grad_norm": 3.982104539871216, + "learning_rate": 6.880237278930689e-06, + "logits/chosen": -0.4917864203453064, + "logits/rejected": -0.6168388724327087, + "logps/chosen": -62.9919548034668, + "logps/rejected": -83.19499206542969, + "loss": 0.7163, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0828263759613037, + "rewards/margins": 5.4716668128967285, + "rewards/rejected": -2.3888401985168457, + "step": 7540 + }, + { + "epoch": 1.89, + "grad_norm": 3.7124979496002197, + "learning_rate": 6.879508959223517e-06, + "logits/chosen": -0.4128064513206482, + "logits/rejected": -0.5310017466545105, + "logps/chosen": -58.34677505493164, + "logps/rejected": -90.6198959350586, + "loss": 0.6641, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.043283700942993, + "rewards/margins": 5.103731155395508, + "rewards/rejected": -2.0604472160339355, + "step": 7541 + }, + { + "epoch": 1.89, + "grad_norm": 3.6718146800994873, + "learning_rate": 6.878780593071679e-06, + "logits/chosen": -0.4534032344818115, + "logits/rejected": -0.5246341228485107, + "logps/chosen": -61.56510543823242, + "logps/rejected": -90.54544067382812, + "loss": 0.7491, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.640885829925537, + "rewards/margins": 5.328429698944092, + "rewards/rejected": -2.6875438690185547, + "step": 7542 + }, + { + "epoch": 1.89, + "grad_norm": 4.066495895385742, + "learning_rate": 6.878052180493174e-06, + "logits/chosen": -0.37989112734794617, + "logits/rejected": -0.36470240354537964, + "logps/chosen": -58.49940872192383, + "logps/rejected": -112.94635009765625, + "loss": 0.6121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.986215829849243, + "rewards/margins": 5.877940654754639, + "rewards/rejected": -2.8917253017425537, + "step": 7543 + }, + { + "epoch": 1.89, + "grad_norm": 4.185839653015137, + "learning_rate": 6.8773237215060016e-06, + "logits/chosen": -0.4846917688846588, + "logits/rejected": -0.5628098249435425, + "logps/chosen": -52.18579864501953, + "logps/rejected": -88.75489807128906, + "loss": 0.6329, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8657138347625732, + "rewards/margins": 5.762747764587402, + "rewards/rejected": -2.897033929824829, + "step": 7544 + }, + { + "epoch": 1.89, + "grad_norm": 16.6934757232666, + "learning_rate": 6.8765952161281645e-06, + "logits/chosen": -0.29381871223449707, + "logits/rejected": -0.4859544336795807, + "logps/chosen": -65.35006713867188, + "logps/rejected": -82.8330078125, + "loss": 0.7686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7370517253875732, + "rewards/margins": 4.89998197555542, + "rewards/rejected": -2.1629300117492676, + "step": 7545 + }, + { + "epoch": 1.89, + "grad_norm": 5.0042595863342285, + "learning_rate": 6.875866664377663e-06, + "logits/chosen": -0.514627993106842, + "logits/rejected": -0.5961059331893921, + "logps/chosen": -51.96437072753906, + "logps/rejected": -90.64070892333984, + "loss": 0.6521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1327710151672363, + "rewards/margins": 6.460064888000488, + "rewards/rejected": -3.327293634414673, + "step": 7546 + }, + { + "epoch": 1.89, + "grad_norm": 3.639120101928711, + "learning_rate": 6.8751380662725e-06, + "logits/chosen": -0.4508948028087616, + "logits/rejected": -0.5335604548454285, + "logps/chosen": -48.76428985595703, + "logps/rejected": -75.5203857421875, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.899162530899048, + "rewards/margins": 5.137131690979004, + "rewards/rejected": -2.237969398498535, + "step": 7547 + }, + { + "epoch": 1.89, + "grad_norm": 5.605374336242676, + "learning_rate": 6.874409421830683e-06, + "logits/chosen": -0.5260880589485168, + "logits/rejected": -0.630967378616333, + "logps/chosen": -52.12898254394531, + "logps/rejected": -88.4732894897461, + "loss": 0.6431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6232550144195557, + "rewards/margins": 6.316886901855469, + "rewards/rejected": -3.693631887435913, + "step": 7548 + }, + { + "epoch": 1.89, + "grad_norm": 17.54230499267578, + "learning_rate": 6.8736807310702135e-06, + "logits/chosen": -0.4237956702709198, + "logits/rejected": -0.5189470052719116, + "logps/chosen": -62.54108428955078, + "logps/rejected": -97.9201431274414, + "loss": 0.6845, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2822587490081787, + "rewards/margins": 5.779024600982666, + "rewards/rejected": -2.496765613555908, + "step": 7549 + }, + { + "epoch": 1.89, + "grad_norm": 4.266198635101318, + "learning_rate": 6.872951994009102e-06, + "logits/chosen": -0.4818994104862213, + "logits/rejected": -0.5516853928565979, + "logps/chosen": -62.090431213378906, + "logps/rejected": -91.27661895751953, + "loss": 0.7251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8733999729156494, + "rewards/margins": 5.000306129455566, + "rewards/rejected": -2.126906633377075, + "step": 7550 + }, + { + "epoch": 1.89, + "grad_norm": 12.783350944519043, + "learning_rate": 6.872223210665353e-06, + "logits/chosen": -0.3820595145225525, + "logits/rejected": -0.5060641765594482, + "logps/chosen": -60.41823196411133, + "logps/rejected": -99.77992248535156, + "loss": 0.7489, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.443438768386841, + "rewards/margins": 5.27916145324707, + "rewards/rejected": -2.8357224464416504, + "step": 7551 + }, + { + "epoch": 1.89, + "grad_norm": 6.350446701049805, + "learning_rate": 6.871494381056978e-06, + "logits/chosen": -0.5404231548309326, + "logits/rejected": -0.5979411602020264, + "logps/chosen": -56.70244598388672, + "logps/rejected": -96.02565002441406, + "loss": 0.813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7133026123046875, + "rewards/margins": 5.272674083709717, + "rewards/rejected": -2.55937123298645, + "step": 7552 + }, + { + "epoch": 1.89, + "grad_norm": 3.2001848220825195, + "learning_rate": 6.870765505201985e-06, + "logits/chosen": -0.4760187864303589, + "logits/rejected": -0.582342803478241, + "logps/chosen": -52.26189041137695, + "logps/rejected": -83.5163345336914, + "loss": 0.6453, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.077298879623413, + "rewards/margins": 5.532329559326172, + "rewards/rejected": -2.4550302028656006, + "step": 7553 + }, + { + "epoch": 1.89, + "grad_norm": 8.075263977050781, + "learning_rate": 6.870036583118388e-06, + "logits/chosen": -0.48247089982032776, + "logits/rejected": -0.5522017478942871, + "logps/chosen": -63.521453857421875, + "logps/rejected": -98.65534210205078, + "loss": 0.8203, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.560575008392334, + "rewards/margins": 5.630831241607666, + "rewards/rejected": -3.070255756378174, + "step": 7554 + }, + { + "epoch": 1.89, + "grad_norm": 16.458024978637695, + "learning_rate": 6.869307614824196e-06, + "logits/chosen": -0.43873679637908936, + "logits/rejected": -0.5132378339767456, + "logps/chosen": -63.986263275146484, + "logps/rejected": -95.78423309326172, + "loss": 0.806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.901918649673462, + "rewards/margins": 5.250993251800537, + "rewards/rejected": -2.3490748405456543, + "step": 7555 + }, + { + "epoch": 1.89, + "grad_norm": 5.137571334838867, + "learning_rate": 6.868578600337427e-06, + "logits/chosen": -0.4545445442199707, + "logits/rejected": -0.5067709684371948, + "logps/chosen": -77.60920715332031, + "logps/rejected": -116.19651794433594, + "loss": 0.7513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1305603981018066, + "rewards/margins": 6.074554443359375, + "rewards/rejected": -2.9439940452575684, + "step": 7556 + }, + { + "epoch": 1.89, + "grad_norm": 7.729434013366699, + "learning_rate": 6.8678495396760915e-06, + "logits/chosen": -0.51160728931427, + "logits/rejected": -0.6246292591094971, + "logps/chosen": -55.19465637207031, + "logps/rejected": -99.73757934570312, + "loss": 0.7084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7312750816345215, + "rewards/margins": 6.32471227645874, + "rewards/rejected": -3.5934371948242188, + "step": 7557 + }, + { + "epoch": 1.89, + "grad_norm": 10.128779411315918, + "learning_rate": 6.867120432858209e-06, + "logits/chosen": -0.3858504295349121, + "logits/rejected": -0.4703270494937897, + "logps/chosen": -48.5733757019043, + "logps/rejected": -92.82683563232422, + "loss": 0.5393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0982866287231445, + "rewards/margins": 6.337670803070068, + "rewards/rejected": -3.2393839359283447, + "step": 7558 + }, + { + "epoch": 1.89, + "grad_norm": 9.10605525970459, + "learning_rate": 6.866391279901792e-06, + "logits/chosen": -0.3707752525806427, + "logits/rejected": -0.5105109214782715, + "logps/chosen": -64.58847045898438, + "logps/rejected": -85.60327911376953, + "loss": 0.6889, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.044635772705078, + "rewards/margins": 5.149409770965576, + "rewards/rejected": -2.104773759841919, + "step": 7559 + }, + { + "epoch": 1.89, + "grad_norm": 7.776904582977295, + "learning_rate": 6.865662080824864e-06, + "logits/chosen": -0.4472341537475586, + "logits/rejected": -0.5122787952423096, + "logps/chosen": -44.312156677246094, + "logps/rejected": -88.62705993652344, + "loss": 0.7293, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0611183643341064, + "rewards/margins": 5.854900360107422, + "rewards/rejected": -2.7937819957733154, + "step": 7560 + }, + { + "epoch": 1.89, + "grad_norm": 12.445829391479492, + "learning_rate": 6.8649328356454416e-06, + "logits/chosen": -0.47673261165618896, + "logits/rejected": -0.5031439065933228, + "logps/chosen": -53.917266845703125, + "logps/rejected": -80.94976043701172, + "loss": 0.7121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9240245819091797, + "rewards/margins": 4.354397296905518, + "rewards/rejected": -1.4303728342056274, + "step": 7561 + }, + { + "epoch": 1.89, + "grad_norm": 5.343075752258301, + "learning_rate": 6.864203544381542e-06, + "logits/chosen": -0.3634985685348511, + "logits/rejected": -0.4633941352367401, + "logps/chosen": -57.75645446777344, + "logps/rejected": -96.6954345703125, + "loss": 0.6644, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020375967025757, + "rewards/margins": 5.656997203826904, + "rewards/rejected": -2.6366212368011475, + "step": 7562 + }, + { + "epoch": 1.89, + "grad_norm": 10.653332710266113, + "learning_rate": 6.863474207051193e-06, + "logits/chosen": -0.42466461658477783, + "logits/rejected": -0.4740821421146393, + "logps/chosen": -50.083648681640625, + "logps/rejected": -96.68949127197266, + "loss": 0.6835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.835916519165039, + "rewards/margins": 4.733522891998291, + "rewards/rejected": -1.8976062536239624, + "step": 7563 + }, + { + "epoch": 1.89, + "grad_norm": 3.848055839538574, + "learning_rate": 6.862744823672413e-06, + "logits/chosen": -0.47664985060691833, + "logits/rejected": -0.5883028507232666, + "logps/chosen": -68.17779541015625, + "logps/rejected": -86.47212219238281, + "loss": 0.7649, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.87630295753479, + "rewards/margins": 5.28413200378418, + "rewards/rejected": -2.4078290462493896, + "step": 7564 + }, + { + "epoch": 1.89, + "grad_norm": 4.06640100479126, + "learning_rate": 6.862015394263226e-06, + "logits/chosen": -0.4962739944458008, + "logits/rejected": -0.5581369400024414, + "logps/chosen": -50.24861145019531, + "logps/rejected": -87.82147216796875, + "loss": 0.6928, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1242735385894775, + "rewards/margins": 4.937723159790039, + "rewards/rejected": -1.8134498596191406, + "step": 7565 + }, + { + "epoch": 1.89, + "grad_norm": 4.547267436981201, + "learning_rate": 6.86128591884166e-06, + "logits/chosen": -0.44831669330596924, + "logits/rejected": -0.549501895904541, + "logps/chosen": -64.5854263305664, + "logps/rejected": -93.59351348876953, + "loss": 0.6766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.250652551651001, + "rewards/margins": 6.29709529876709, + "rewards/rejected": -3.0464439392089844, + "step": 7566 + }, + { + "epoch": 1.89, + "grad_norm": 7.187270641326904, + "learning_rate": 6.860556397425739e-06, + "logits/chosen": -0.37164708971977234, + "logits/rejected": -0.4200923442840576, + "logps/chosen": -70.32299041748047, + "logps/rejected": -96.0796127319336, + "loss": 0.8204, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7386488914489746, + "rewards/margins": 4.669825077056885, + "rewards/rejected": -1.9311763048171997, + "step": 7567 + }, + { + "epoch": 1.89, + "grad_norm": 5.1839728355407715, + "learning_rate": 6.859826830033489e-06, + "logits/chosen": -0.4591403305530548, + "logits/rejected": -0.5222777724266052, + "logps/chosen": -58.26079559326172, + "logps/rejected": -83.91902923583984, + "loss": 0.7068, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2626914978027344, + "rewards/margins": 3.85549259185791, + "rewards/rejected": -0.5928010940551758, + "step": 7568 + }, + { + "epoch": 1.89, + "grad_norm": 9.542434692382812, + "learning_rate": 6.859097216682942e-06, + "logits/chosen": -0.3662586808204651, + "logits/rejected": -0.43633782863616943, + "logps/chosen": -57.843109130859375, + "logps/rejected": -86.36369323730469, + "loss": 0.8032, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7061386108398438, + "rewards/margins": 3.426741600036621, + "rewards/rejected": -0.7206026911735535, + "step": 7569 + }, + { + "epoch": 1.89, + "grad_norm": 4.974776268005371, + "learning_rate": 6.858367557392124e-06, + "logits/chosen": -0.44244569540023804, + "logits/rejected": -0.5071554183959961, + "logps/chosen": -54.866275787353516, + "logps/rejected": -93.23209381103516, + "loss": 0.735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.786161184310913, + "rewards/margins": 5.23344612121582, + "rewards/rejected": -2.447284698486328, + "step": 7570 + }, + { + "epoch": 1.89, + "grad_norm": 3.7264182567596436, + "learning_rate": 6.857637852179067e-06, + "logits/chosen": -0.38277435302734375, + "logits/rejected": -0.4463302493095398, + "logps/chosen": -45.41454315185547, + "logps/rejected": -111.70280456542969, + "loss": 0.6857, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0613303184509277, + "rewards/margins": 5.558139801025391, + "rewards/rejected": -2.496809720993042, + "step": 7571 + }, + { + "epoch": 1.89, + "grad_norm": 5.573183059692383, + "learning_rate": 6.856908101061803e-06, + "logits/chosen": -0.49093714356422424, + "logits/rejected": -0.5866534113883972, + "logps/chosen": -59.36016845703125, + "logps/rejected": -80.64818572998047, + "loss": 0.6848, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.063474655151367, + "rewards/margins": 5.0733208656311035, + "rewards/rejected": -2.0098459720611572, + "step": 7572 + }, + { + "epoch": 1.89, + "grad_norm": 5.045497417449951, + "learning_rate": 6.856178304058365e-06, + "logits/chosen": -0.42761746048927307, + "logits/rejected": -0.4889480769634247, + "logps/chosen": -50.44472885131836, + "logps/rejected": -84.18619537353516, + "loss": 0.6758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.021596670150757, + "rewards/margins": 4.679056644439697, + "rewards/rejected": -1.6574599742889404, + "step": 7573 + }, + { + "epoch": 1.89, + "grad_norm": 6.302188873291016, + "learning_rate": 6.855448461186785e-06, + "logits/chosen": -0.4645952880382538, + "logits/rejected": -0.5914552807807922, + "logps/chosen": -48.41466522216797, + "logps/rejected": -80.58734893798828, + "loss": 0.6, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.840547561645508, + "rewards/margins": 4.943378448486328, + "rewards/rejected": -2.1028311252593994, + "step": 7574 + }, + { + "epoch": 1.9, + "grad_norm": 2.3179166316986084, + "learning_rate": 6.854718572465102e-06, + "logits/chosen": -0.41818714141845703, + "logits/rejected": -0.49595919251441956, + "logps/chosen": -51.968505859375, + "logps/rejected": -95.74116516113281, + "loss": 0.5917, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0114428997039795, + "rewards/margins": 5.4408793449401855, + "rewards/rejected": -2.4294369220733643, + "step": 7575 + }, + { + "epoch": 1.9, + "grad_norm": 3.2492129802703857, + "learning_rate": 6.85398863791135e-06, + "logits/chosen": -0.4602636694908142, + "logits/rejected": -0.5680567622184753, + "logps/chosen": -50.98023986816406, + "logps/rejected": -78.22322082519531, + "loss": 0.6095, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8536629676818848, + "rewards/margins": 5.1603288650512695, + "rewards/rejected": -2.306666374206543, + "step": 7576 + }, + { + "epoch": 1.9, + "grad_norm": 4.57193660736084, + "learning_rate": 6.853258657543567e-06, + "logits/chosen": -0.531610369682312, + "logits/rejected": -0.6026846170425415, + "logps/chosen": -56.9334602355957, + "logps/rejected": -89.05329895019531, + "loss": 0.6724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7250640392303467, + "rewards/margins": 4.510391712188721, + "rewards/rejected": -1.7853275537490845, + "step": 7577 + }, + { + "epoch": 1.9, + "grad_norm": 12.783904075622559, + "learning_rate": 6.852528631379791e-06, + "logits/chosen": -0.4737088084220886, + "logits/rejected": -0.5044717192649841, + "logps/chosen": -59.71488571166992, + "logps/rejected": -92.56119537353516, + "loss": 0.8101, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0620198249816895, + "rewards/margins": 4.228768825531006, + "rewards/rejected": -1.166749119758606, + "step": 7578 + }, + { + "epoch": 1.9, + "grad_norm": 7.0494065284729, + "learning_rate": 6.851798559438061e-06, + "logits/chosen": -0.3526557981967926, + "logits/rejected": -0.48375269770622253, + "logps/chosen": -71.14334106445312, + "logps/rejected": -80.80636596679688, + "loss": 0.6838, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9719038009643555, + "rewards/margins": 5.516267776489258, + "rewards/rejected": -2.5443639755249023, + "step": 7579 + }, + { + "epoch": 1.9, + "grad_norm": 5.519151210784912, + "learning_rate": 6.85106844173642e-06, + "logits/chosen": -0.4171447455883026, + "logits/rejected": -0.5344829559326172, + "logps/chosen": -53.23646545410156, + "logps/rejected": -93.70942687988281, + "loss": 0.5663, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8844752311706543, + "rewards/margins": 6.162440299987793, + "rewards/rejected": -3.2779648303985596, + "step": 7580 + }, + { + "epoch": 1.9, + "grad_norm": 2.852325201034546, + "learning_rate": 6.8503382782929095e-06, + "logits/chosen": -0.4774141013622284, + "logits/rejected": -0.5998492240905762, + "logps/chosen": -59.10191345214844, + "logps/rejected": -80.14138793945312, + "loss": 0.6176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.038480281829834, + "rewards/margins": 5.767371654510498, + "rewards/rejected": -2.728891134262085, + "step": 7581 + }, + { + "epoch": 1.9, + "grad_norm": 5.2973737716674805, + "learning_rate": 6.849608069125571e-06, + "logits/chosen": -0.4282965660095215, + "logits/rejected": -0.5046875476837158, + "logps/chosen": -57.575965881347656, + "logps/rejected": -74.51821899414062, + "loss": 0.8333, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9005794525146484, + "rewards/margins": 4.41190767288208, + "rewards/rejected": -1.5113282203674316, + "step": 7582 + }, + { + "epoch": 1.9, + "grad_norm": 7.137880802154541, + "learning_rate": 6.8488778142524505e-06, + "logits/chosen": -0.46193286776542664, + "logits/rejected": -0.5380088686943054, + "logps/chosen": -59.2004280090332, + "logps/rejected": -88.37278747558594, + "loss": 0.8014, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1058738231658936, + "rewards/margins": 5.405416488647461, + "rewards/rejected": -2.299542188644409, + "step": 7583 + }, + { + "epoch": 1.9, + "grad_norm": 6.827965259552002, + "learning_rate": 6.8481475136915935e-06, + "logits/chosen": -0.45159685611724854, + "logits/rejected": -0.5267825126647949, + "logps/chosen": -54.62156677246094, + "logps/rejected": -96.89026641845703, + "loss": 0.6989, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.152939558029175, + "rewards/margins": 5.891858100891113, + "rewards/rejected": -2.7389183044433594, + "step": 7584 + }, + { + "epoch": 1.9, + "grad_norm": 14.844718933105469, + "learning_rate": 6.847417167461043e-06, + "logits/chosen": -0.4886540472507477, + "logits/rejected": -0.5774579048156738, + "logps/chosen": -85.33003234863281, + "logps/rejected": -89.55365753173828, + "loss": 0.7851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.881248950958252, + "rewards/margins": 5.136830806732178, + "rewards/rejected": -2.2555816173553467, + "step": 7585 + }, + { + "epoch": 1.9, + "grad_norm": 5.433290958404541, + "learning_rate": 6.846686775578851e-06, + "logits/chosen": -0.45258206129074097, + "logits/rejected": -0.5061187744140625, + "logps/chosen": -49.4570426940918, + "logps/rejected": -87.07560729980469, + "loss": 0.6896, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.322190523147583, + "rewards/margins": 4.603605270385742, + "rewards/rejected": -1.28141450881958, + "step": 7586 + }, + { + "epoch": 1.9, + "grad_norm": 4.047146320343018, + "learning_rate": 6.845956338063064e-06, + "logits/chosen": -0.3516751825809479, + "logits/rejected": -0.4818023443222046, + "logps/chosen": -53.45491409301758, + "logps/rejected": -67.37032318115234, + "loss": 0.6904, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0810604095458984, + "rewards/margins": 4.75706672668457, + "rewards/rejected": -1.6760064363479614, + "step": 7587 + }, + { + "epoch": 1.9, + "grad_norm": 11.325018882751465, + "learning_rate": 6.845225854931733e-06, + "logits/chosen": -0.4350408613681793, + "logits/rejected": -0.5136500597000122, + "logps/chosen": -58.93617248535156, + "logps/rejected": -86.56118774414062, + "loss": 1.1161, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.549189329147339, + "rewards/margins": 4.358452796936035, + "rewards/rejected": -1.809262990951538, + "step": 7588 + }, + { + "epoch": 1.9, + "grad_norm": 37.276058197021484, + "learning_rate": 6.844495326202908e-06, + "logits/chosen": -0.3915245831012726, + "logits/rejected": -0.4693232774734497, + "logps/chosen": -60.55135726928711, + "logps/rejected": -86.31668090820312, + "loss": 0.8003, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.918677568435669, + "rewards/margins": 5.07581901550293, + "rewards/rejected": -2.157141923904419, + "step": 7589 + }, + { + "epoch": 1.9, + "grad_norm": 6.021236896514893, + "learning_rate": 6.8437647518946414e-06, + "logits/chosen": -0.4409010708332062, + "logits/rejected": -0.5120912194252014, + "logps/chosen": -50.80849075317383, + "logps/rejected": -100.26041412353516, + "loss": 0.6521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.692472457885742, + "rewards/margins": 5.496146202087402, + "rewards/rejected": -2.8036739826202393, + "step": 7590 + }, + { + "epoch": 1.9, + "grad_norm": 3.921002149581909, + "learning_rate": 6.843034132024987e-06, + "logits/chosen": -0.4552173614501953, + "logits/rejected": -0.5588920712471008, + "logps/chosen": -49.188262939453125, + "logps/rejected": -80.84719848632812, + "loss": 0.5302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2184414863586426, + "rewards/margins": 6.32652473449707, + "rewards/rejected": -3.1080832481384277, + "step": 7591 + }, + { + "epoch": 1.9, + "grad_norm": 6.133358478546143, + "learning_rate": 6.842303466611999e-06, + "logits/chosen": -0.4854840636253357, + "logits/rejected": -0.561779260635376, + "logps/chosen": -58.262779235839844, + "logps/rejected": -88.05903625488281, + "loss": 0.8071, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.994968891143799, + "rewards/margins": 5.793707370758057, + "rewards/rejected": -2.7987382411956787, + "step": 7592 + }, + { + "epoch": 1.9, + "grad_norm": 5.197835922241211, + "learning_rate": 6.841572755673734e-06, + "logits/chosen": -0.4968695640563965, + "logits/rejected": -0.5861133933067322, + "logps/chosen": -57.08797836303711, + "logps/rejected": -89.27802276611328, + "loss": 0.5991, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860654354095459, + "rewards/margins": 6.022863388061523, + "rewards/rejected": -3.1622092723846436, + "step": 7593 + }, + { + "epoch": 1.9, + "grad_norm": 4.729925632476807, + "learning_rate": 6.840841999228245e-06, + "logits/chosen": -0.42993927001953125, + "logits/rejected": -0.49561426043510437, + "logps/chosen": -49.37456130981445, + "logps/rejected": -87.45637512207031, + "loss": 0.6069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.880319118499756, + "rewards/margins": 5.900640487670898, + "rewards/rejected": -3.0203213691711426, + "step": 7594 + }, + { + "epoch": 1.9, + "grad_norm": 4.5847930908203125, + "learning_rate": 6.840111197293594e-06, + "logits/chosen": -0.39637628197669983, + "logits/rejected": -0.5108466148376465, + "logps/chosen": -67.15660095214844, + "logps/rejected": -91.93487548828125, + "loss": 0.6996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.891038417816162, + "rewards/margins": 6.38333797454834, + "rewards/rejected": -3.4922990798950195, + "step": 7595 + }, + { + "epoch": 1.9, + "grad_norm": 21.323402404785156, + "learning_rate": 6.839380349887836e-06, + "logits/chosen": -0.5085446834564209, + "logits/rejected": -0.5553361177444458, + "logps/chosen": -49.90401840209961, + "logps/rejected": -93.54889678955078, + "loss": 0.8725, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.709998846054077, + "rewards/margins": 4.651959419250488, + "rewards/rejected": -1.9419606924057007, + "step": 7596 + }, + { + "epoch": 1.9, + "grad_norm": 12.744868278503418, + "learning_rate": 6.8386494570290365e-06, + "logits/chosen": -0.4117664098739624, + "logits/rejected": -0.3910834491252899, + "logps/chosen": -49.92131042480469, + "logps/rejected": -110.90826416015625, + "loss": 0.7267, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8144779205322266, + "rewards/margins": 5.174519062042236, + "rewards/rejected": -2.3600409030914307, + "step": 7597 + }, + { + "epoch": 1.9, + "grad_norm": 9.25040054321289, + "learning_rate": 6.837918518735251e-06, + "logits/chosen": -0.4791184067726135, + "logits/rejected": -0.5678092241287231, + "logps/chosen": -58.037845611572266, + "logps/rejected": -92.98493957519531, + "loss": 0.7363, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9037423133850098, + "rewards/margins": 6.484616279602051, + "rewards/rejected": -3.580873727798462, + "step": 7598 + }, + { + "epoch": 1.9, + "grad_norm": 7.225922584533691, + "learning_rate": 6.837187535024543e-06, + "logits/chosen": -0.44251295924186707, + "logits/rejected": -0.5441499352455139, + "logps/chosen": -52.60089874267578, + "logps/rejected": -86.2894287109375, + "loss": 0.6502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7688608169555664, + "rewards/margins": 5.522483825683594, + "rewards/rejected": -2.7536227703094482, + "step": 7599 + }, + { + "epoch": 1.9, + "grad_norm": 2.50950026512146, + "learning_rate": 6.836456505914979e-06, + "logits/chosen": -0.4117147922515869, + "logits/rejected": -0.5159242153167725, + "logps/chosen": -53.58546447753906, + "logps/rejected": -100.05575561523438, + "loss": 0.5752, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225884437561035, + "rewards/margins": 6.701569557189941, + "rewards/rejected": -3.475684642791748, + "step": 7600 + }, + { + "epoch": 1.9, + "grad_norm": 8.017794609069824, + "learning_rate": 6.83572543142462e-06, + "logits/chosen": -0.35012415051460266, + "logits/rejected": -0.4258614778518677, + "logps/chosen": -63.6083984375, + "logps/rejected": -85.94014739990234, + "loss": 0.7748, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9121079444885254, + "rewards/margins": 5.301344871520996, + "rewards/rejected": -2.3892366886138916, + "step": 7601 + }, + { + "epoch": 1.9, + "grad_norm": 9.406342506408691, + "learning_rate": 6.834994311571535e-06, + "logits/chosen": -0.44856587052345276, + "logits/rejected": -0.487678200006485, + "logps/chosen": -60.40123748779297, + "logps/rejected": -91.93998718261719, + "loss": 0.7695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.605290651321411, + "rewards/margins": 4.893509387969971, + "rewards/rejected": -2.288219451904297, + "step": 7602 + }, + { + "epoch": 1.9, + "grad_norm": 6.41974401473999, + "learning_rate": 6.834263146373786e-06, + "logits/chosen": -0.4168074429035187, + "logits/rejected": -0.50780189037323, + "logps/chosen": -59.640220642089844, + "logps/rejected": -88.94861602783203, + "loss": 0.7771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6726880073547363, + "rewards/margins": 4.857243061065674, + "rewards/rejected": -2.1845555305480957, + "step": 7603 + }, + { + "epoch": 1.9, + "grad_norm": 6.233143329620361, + "learning_rate": 6.833531935849446e-06, + "logits/chosen": -0.4935353994369507, + "logits/rejected": -0.5844756364822388, + "logps/chosen": -46.72748565673828, + "logps/rejected": -75.79429626464844, + "loss": 0.7573, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8014957904815674, + "rewards/margins": 5.237157821655273, + "rewards/rejected": -2.435661792755127, + "step": 7604 + }, + { + "epoch": 1.9, + "grad_norm": 6.212472915649414, + "learning_rate": 6.832800680016579e-06, + "logits/chosen": -0.4542708396911621, + "logits/rejected": -0.4816695749759674, + "logps/chosen": -63.09477615356445, + "logps/rejected": -124.813720703125, + "loss": 0.739, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.815155506134033, + "rewards/margins": 6.605528831481934, + "rewards/rejected": -3.7903733253479004, + "step": 7605 + }, + { + "epoch": 1.9, + "grad_norm": 6.616486072540283, + "learning_rate": 6.8320693788932605e-06, + "logits/chosen": -0.44254186749458313, + "logits/rejected": -0.554973840713501, + "logps/chosen": -64.05484008789062, + "logps/rejected": -90.81568908691406, + "loss": 0.7028, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.79598069190979, + "rewards/margins": 5.506997585296631, + "rewards/rejected": -2.71101713180542, + "step": 7606 + }, + { + "epoch": 1.9, + "grad_norm": 10.899957656860352, + "learning_rate": 6.831338032497557e-06, + "logits/chosen": -0.41957879066467285, + "logits/rejected": -0.4457166790962219, + "logps/chosen": -60.42580795288086, + "logps/rejected": -99.30662536621094, + "loss": 0.7244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.821910858154297, + "rewards/margins": 5.2585129737854, + "rewards/rejected": -2.4366021156311035, + "step": 7607 + }, + { + "epoch": 1.9, + "grad_norm": 3.360586643218994, + "learning_rate": 6.8306066408475435e-06, + "logits/chosen": -0.46832048892974854, + "logits/rejected": -0.5855319499969482, + "logps/chosen": -58.681453704833984, + "logps/rejected": -90.33118438720703, + "loss": 0.6571, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032238006591797, + "rewards/margins": 5.341419696807861, + "rewards/rejected": -2.3091814517974854, + "step": 7608 + }, + { + "epoch": 1.9, + "grad_norm": 6.681711196899414, + "learning_rate": 6.829875203961295e-06, + "logits/chosen": -0.4388655424118042, + "logits/rejected": -0.5086520910263062, + "logps/chosen": -63.413143157958984, + "logps/rejected": -87.38581085205078, + "loss": 0.6974, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.163818359375, + "rewards/margins": 5.241802215576172, + "rewards/rejected": -2.077984094619751, + "step": 7609 + }, + { + "epoch": 1.9, + "grad_norm": 6.569335460662842, + "learning_rate": 6.8291437218568815e-06, + "logits/chosen": -0.47252804040908813, + "logits/rejected": -0.5796076059341431, + "logps/chosen": -54.623680114746094, + "logps/rejected": -79.18563079833984, + "loss": 0.6522, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.817136764526367, + "rewards/margins": 5.12935209274292, + "rewards/rejected": -2.3122150897979736, + "step": 7610 + }, + { + "epoch": 1.9, + "grad_norm": 3.6623384952545166, + "learning_rate": 6.82841219455238e-06, + "logits/chosen": -0.3648621737957001, + "logits/rejected": -0.4464147090911865, + "logps/chosen": -63.680152893066406, + "logps/rejected": -86.55751037597656, + "loss": 0.6659, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.257558822631836, + "rewards/margins": 5.568833827972412, + "rewards/rejected": -2.311274290084839, + "step": 7611 + }, + { + "epoch": 1.9, + "grad_norm": 4.848849296569824, + "learning_rate": 6.827680622065871e-06, + "logits/chosen": -0.3954850733280182, + "logits/rejected": -0.4681259095668793, + "logps/chosen": -70.39582824707031, + "logps/rejected": -97.59359741210938, + "loss": 0.733, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1600534915924072, + "rewards/margins": 5.39779806137085, + "rewards/rejected": -2.2377445697784424, + "step": 7612 + }, + { + "epoch": 1.9, + "grad_norm": 13.362192153930664, + "learning_rate": 6.8269490044154274e-06, + "logits/chosen": -0.39608538150787354, + "logits/rejected": -0.4418364465236664, + "logps/chosen": -58.219242095947266, + "logps/rejected": -99.2907943725586, + "loss": 0.7916, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.19132661819458, + "rewards/margins": 4.673823356628418, + "rewards/rejected": -1.4824965000152588, + "step": 7613 + }, + { + "epoch": 1.9, + "grad_norm": 6.23155403137207, + "learning_rate": 6.826217341619132e-06, + "logits/chosen": -0.3249164819717407, + "logits/rejected": -0.415484219789505, + "logps/chosen": -56.36402130126953, + "logps/rejected": -97.50794982910156, + "loss": 0.7693, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7917184829711914, + "rewards/margins": 4.968998432159424, + "rewards/rejected": -2.177279472351074, + "step": 7614 + }, + { + "epoch": 1.91, + "grad_norm": 3.9532554149627686, + "learning_rate": 6.825485633695063e-06, + "logits/chosen": -0.4675649404525757, + "logits/rejected": -0.552688717842102, + "logps/chosen": -55.37209701538086, + "logps/rejected": -89.66154479980469, + "loss": 0.6387, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9166393280029297, + "rewards/margins": 4.805356502532959, + "rewards/rejected": -1.8887171745300293, + "step": 7615 + }, + { + "epoch": 1.91, + "grad_norm": 3.5039279460906982, + "learning_rate": 6.824753880661303e-06, + "logits/chosen": -0.5059604644775391, + "logits/rejected": -0.6154143214225769, + "logps/chosen": -58.12416458129883, + "logps/rejected": -88.11393737792969, + "loss": 0.725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056150197982788, + "rewards/margins": 5.569363594055176, + "rewards/rejected": -2.513213634490967, + "step": 7616 + }, + { + "epoch": 1.91, + "grad_norm": 6.751143932342529, + "learning_rate": 6.8240220825359335e-06, + "logits/chosen": -0.3852030336856842, + "logits/rejected": -0.4221547842025757, + "logps/chosen": -57.38336944580078, + "logps/rejected": -94.03277587890625, + "loss": 0.7414, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.033858299255371, + "rewards/margins": 4.516441345214844, + "rewards/rejected": -1.4825831651687622, + "step": 7617 + }, + { + "epoch": 1.91, + "grad_norm": 3.3402836322784424, + "learning_rate": 6.823290239337039e-06, + "logits/chosen": -0.43297499418258667, + "logits/rejected": -0.4943634569644928, + "logps/chosen": -59.30955505371094, + "logps/rejected": -92.16988372802734, + "loss": 0.7473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3112053871154785, + "rewards/margins": 4.777796745300293, + "rewards/rejected": -1.4665910005569458, + "step": 7618 + }, + { + "epoch": 1.91, + "grad_norm": 5.969826698303223, + "learning_rate": 6.822558351082702e-06, + "logits/chosen": -0.33363470435142517, + "logits/rejected": -0.41989943385124207, + "logps/chosen": -52.91259002685547, + "logps/rejected": -94.69969177246094, + "loss": 0.5583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9616622924804688, + "rewards/margins": 5.432034492492676, + "rewards/rejected": -2.470371961593628, + "step": 7619 + }, + { + "epoch": 1.91, + "grad_norm": 8.211673736572266, + "learning_rate": 6.821826417791011e-06, + "logits/chosen": -0.327833890914917, + "logits/rejected": -0.4471859335899353, + "logps/chosen": -60.0735969543457, + "logps/rejected": -73.26681518554688, + "loss": 0.7812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6389834880828857, + "rewards/margins": 5.07937479019165, + "rewards/rejected": -2.440392017364502, + "step": 7620 + }, + { + "epoch": 1.91, + "grad_norm": 3.637856960296631, + "learning_rate": 6.8210944394800505e-06, + "logits/chosen": -0.38454997539520264, + "logits/rejected": -0.470575749874115, + "logps/chosen": -57.05936050415039, + "logps/rejected": -92.48672485351562, + "loss": 0.7257, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3202264308929443, + "rewards/margins": 5.264763832092285, + "rewards/rejected": -1.94453763961792, + "step": 7621 + }, + { + "epoch": 1.91, + "grad_norm": 3.75376033782959, + "learning_rate": 6.820362416167909e-06, + "logits/chosen": -0.4700675308704376, + "logits/rejected": -0.5061805844306946, + "logps/chosen": -53.00226974487305, + "logps/rejected": -97.07440185546875, + "loss": 0.6903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9944441318511963, + "rewards/margins": 5.428136825561523, + "rewards/rejected": -2.4336929321289062, + "step": 7622 + }, + { + "epoch": 1.91, + "grad_norm": 4.387667655944824, + "learning_rate": 6.819630347872677e-06, + "logits/chosen": -0.4512558877468109, + "logits/rejected": -0.5024634599685669, + "logps/chosen": -58.99081039428711, + "logps/rejected": -96.52405548095703, + "loss": 0.6819, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3998987674713135, + "rewards/margins": 5.230340003967285, + "rewards/rejected": -1.8304415941238403, + "step": 7623 + }, + { + "epoch": 1.91, + "grad_norm": 3.5291199684143066, + "learning_rate": 6.818898234612443e-06, + "logits/chosen": -0.3948748707771301, + "logits/rejected": -0.5135162472724915, + "logps/chosen": -54.692237854003906, + "logps/rejected": -88.49272155761719, + "loss": 0.6411, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3408257961273193, + "rewards/margins": 6.239242076873779, + "rewards/rejected": -2.898416519165039, + "step": 7624 + }, + { + "epoch": 1.91, + "grad_norm": 6.543626308441162, + "learning_rate": 6.818166076405298e-06, + "logits/chosen": -0.34229201078414917, + "logits/rejected": -0.45445284247398376, + "logps/chosen": -67.38671112060547, + "logps/rejected": -82.65227508544922, + "loss": 0.7782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8312909603118896, + "rewards/margins": 4.396087169647217, + "rewards/rejected": -1.5647965669631958, + "step": 7625 + }, + { + "epoch": 1.91, + "grad_norm": 4.373072147369385, + "learning_rate": 6.8174338732693375e-06, + "logits/chosen": -0.4741995632648468, + "logits/rejected": -0.5440605878829956, + "logps/chosen": -58.27628707885742, + "logps/rejected": -90.5550537109375, + "loss": 0.7025, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.903198719024658, + "rewards/margins": 4.622284412384033, + "rewards/rejected": -1.719085454940796, + "step": 7626 + }, + { + "epoch": 1.91, + "grad_norm": 8.493180274963379, + "learning_rate": 6.8167016252226525e-06, + "logits/chosen": -0.5458512306213379, + "logits/rejected": -0.5639675259590149, + "logps/chosen": -74.7750015258789, + "logps/rejected": -80.00511169433594, + "loss": 0.7972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.057875633239746, + "rewards/margins": 4.803257942199707, + "rewards/rejected": -1.7453821897506714, + "step": 7627 + }, + { + "epoch": 1.91, + "grad_norm": 5.753317356109619, + "learning_rate": 6.8159693322833365e-06, + "logits/chosen": -0.35175418853759766, + "logits/rejected": -0.42002564668655396, + "logps/chosen": -67.8082275390625, + "logps/rejected": -92.73771667480469, + "loss": 0.8253, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9903907775878906, + "rewards/margins": 4.596309185028076, + "rewards/rejected": -1.605918526649475, + "step": 7628 + }, + { + "epoch": 1.91, + "grad_norm": 3.404712677001953, + "learning_rate": 6.815236994469489e-06, + "logits/chosen": -0.5288129448890686, + "logits/rejected": -0.5813263654708862, + "logps/chosen": -49.591217041015625, + "logps/rejected": -88.66017150878906, + "loss": 0.6612, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.94537353515625, + "rewards/margins": 5.178919315338135, + "rewards/rejected": -2.2335457801818848, + "step": 7629 + }, + { + "epoch": 1.91, + "grad_norm": 9.849645614624023, + "learning_rate": 6.814504611799202e-06, + "logits/chosen": -0.36681145429611206, + "logits/rejected": -0.40614113211631775, + "logps/chosen": -61.603858947753906, + "logps/rejected": -87.93611907958984, + "loss": 0.7987, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4038584232330322, + "rewards/margins": 4.3858489990234375, + "rewards/rejected": -0.98199063539505, + "step": 7630 + }, + { + "epoch": 1.91, + "grad_norm": 6.20664644241333, + "learning_rate": 6.813772184290577e-06, + "logits/chosen": -0.3763798475265503, + "logits/rejected": -0.5019158124923706, + "logps/chosen": -62.9835205078125, + "logps/rejected": -82.7005844116211, + "loss": 0.7964, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.832690954208374, + "rewards/margins": 4.4268364906311035, + "rewards/rejected": -1.5941457748413086, + "step": 7631 + }, + { + "epoch": 1.91, + "grad_norm": 5.895796775817871, + "learning_rate": 6.813039711961713e-06, + "logits/chosen": -0.4639233648777008, + "logits/rejected": -0.5532766580581665, + "logps/chosen": -62.09053421020508, + "logps/rejected": -90.78709411621094, + "loss": 0.6943, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.792248010635376, + "rewards/margins": 4.252366065979004, + "rewards/rejected": -1.4601179361343384, + "step": 7632 + }, + { + "epoch": 1.91, + "grad_norm": 5.1031036376953125, + "learning_rate": 6.812307194830709e-06, + "logits/chosen": -0.4655686616897583, + "logits/rejected": -0.5134553909301758, + "logps/chosen": -62.237579345703125, + "logps/rejected": -92.4842758178711, + "loss": 0.6657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3146986961364746, + "rewards/margins": 4.8861188888549805, + "rewards/rejected": -1.5714200735092163, + "step": 7633 + }, + { + "epoch": 1.91, + "grad_norm": 2.9710888862609863, + "learning_rate": 6.811574632915666e-06, + "logits/chosen": -0.39434778690338135, + "logits/rejected": -0.5316107869148254, + "logps/chosen": -67.59554290771484, + "logps/rejected": -81.83882141113281, + "loss": 0.6716, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0332894325256348, + "rewards/margins": 5.435139179229736, + "rewards/rejected": -2.4018497467041016, + "step": 7634 + }, + { + "epoch": 1.91, + "grad_norm": 5.630308628082275, + "learning_rate": 6.810842026234687e-06, + "logits/chosen": -0.43467649817466736, + "logits/rejected": -0.5085474848747253, + "logps/chosen": -54.78678894042969, + "logps/rejected": -87.71869659423828, + "loss": 0.7789, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9999284744262695, + "rewards/margins": 4.6336517333984375, + "rewards/rejected": -1.6337230205535889, + "step": 7635 + }, + { + "epoch": 1.91, + "grad_norm": 5.241967678070068, + "learning_rate": 6.810109374805875e-06, + "logits/chosen": -0.47347214818000793, + "logits/rejected": -0.5583163499832153, + "logps/chosen": -52.16893768310547, + "logps/rejected": -88.66616821289062, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.258082866668701, + "rewards/margins": 6.6717143058776855, + "rewards/rejected": -3.413630962371826, + "step": 7636 + }, + { + "epoch": 1.91, + "grad_norm": 4.951119422912598, + "learning_rate": 6.8093766786473355e-06, + "logits/chosen": -0.40898120403289795, + "logits/rejected": -0.541420578956604, + "logps/chosen": -64.45816802978516, + "logps/rejected": -77.92395782470703, + "loss": 0.7673, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0885610580444336, + "rewards/margins": 4.564641952514648, + "rewards/rejected": -1.4760808944702148, + "step": 7637 + }, + { + "epoch": 1.91, + "grad_norm": 4.04103422164917, + "learning_rate": 6.8086439377771755e-06, + "logits/chosen": -0.4325869083404541, + "logits/rejected": -0.5203933119773865, + "logps/chosen": -41.45981979370117, + "logps/rejected": -75.24508666992188, + "loss": 0.6358, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.080826997756958, + "rewards/margins": 5.955539703369141, + "rewards/rejected": -2.8747129440307617, + "step": 7638 + }, + { + "epoch": 1.91, + "grad_norm": 5.333274841308594, + "learning_rate": 6.8079111522134975e-06, + "logits/chosen": -0.4685448706150055, + "logits/rejected": -0.5176175236701965, + "logps/chosen": -52.29100799560547, + "logps/rejected": -94.92496490478516, + "loss": 0.6713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.981531858444214, + "rewards/margins": 5.963467597961426, + "rewards/rejected": -2.981935739517212, + "step": 7639 + }, + { + "epoch": 1.91, + "grad_norm": 5.277716159820557, + "learning_rate": 6.8071783219744126e-06, + "logits/chosen": -0.4396384358406067, + "logits/rejected": -0.5240445137023926, + "logps/chosen": -50.25372314453125, + "logps/rejected": -75.60017395019531, + "loss": 0.7097, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.043938159942627, + "rewards/margins": 4.644931316375732, + "rewards/rejected": -1.600993275642395, + "step": 7640 + }, + { + "epoch": 1.91, + "grad_norm": 17.797536849975586, + "learning_rate": 6.8064454470780295e-06, + "logits/chosen": -0.42572757601737976, + "logits/rejected": -0.4434582591056824, + "logps/chosen": -60.96265411376953, + "logps/rejected": -91.61701202392578, + "loss": 0.8135, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8467178344726562, + "rewards/margins": 4.32590389251709, + "rewards/rejected": -1.479185938835144, + "step": 7641 + }, + { + "epoch": 1.91, + "grad_norm": 7.757429599761963, + "learning_rate": 6.805712527542457e-06, + "logits/chosen": -0.49626341462135315, + "logits/rejected": -0.5598626136779785, + "logps/chosen": -58.63602828979492, + "logps/rejected": -86.63941955566406, + "loss": 0.7316, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8260457515716553, + "rewards/margins": 5.02632999420166, + "rewards/rejected": -2.2002837657928467, + "step": 7642 + }, + { + "epoch": 1.91, + "grad_norm": 19.994279861450195, + "learning_rate": 6.80497956338581e-06, + "logits/chosen": -0.48591145873069763, + "logits/rejected": -0.5828890800476074, + "logps/chosen": -58.77296447753906, + "logps/rejected": -80.0829849243164, + "loss": 0.7837, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8277711868286133, + "rewards/margins": 5.365849494934082, + "rewards/rejected": -2.5380780696868896, + "step": 7643 + }, + { + "epoch": 1.91, + "grad_norm": 6.3434295654296875, + "learning_rate": 6.804246554626196e-06, + "logits/chosen": -0.42113828659057617, + "logits/rejected": -0.5751339197158813, + "logps/chosen": -51.9040641784668, + "logps/rejected": -78.61726379394531, + "loss": 0.6514, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.880218505859375, + "rewards/margins": 5.931361675262451, + "rewards/rejected": -3.051142930984497, + "step": 7644 + }, + { + "epoch": 1.91, + "grad_norm": 3.5270676612854004, + "learning_rate": 6.80351350128173e-06, + "logits/chosen": -0.5070716142654419, + "logits/rejected": -0.5547128319740295, + "logps/chosen": -59.075923919677734, + "logps/rejected": -95.08055877685547, + "loss": 0.6706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.001295804977417, + "rewards/margins": 6.016573905944824, + "rewards/rejected": -3.015277624130249, + "step": 7645 + }, + { + "epoch": 1.91, + "grad_norm": 8.158560752868652, + "learning_rate": 6.802780403370528e-06, + "logits/chosen": -0.45357710123062134, + "logits/rejected": -0.5422461032867432, + "logps/chosen": -48.827640533447266, + "logps/rejected": -78.96688079833984, + "loss": 0.647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9893579483032227, + "rewards/margins": 5.322343349456787, + "rewards/rejected": -2.3329856395721436, + "step": 7646 + }, + { + "epoch": 1.91, + "grad_norm": 6.9028096199035645, + "learning_rate": 6.8020472609107054e-06, + "logits/chosen": -0.4166865348815918, + "logits/rejected": -0.5573160648345947, + "logps/chosen": -53.30225372314453, + "logps/rejected": -74.27603149414062, + "loss": 0.6555, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103602886199951, + "rewards/margins": 5.792500019073486, + "rewards/rejected": -2.6888976097106934, + "step": 7647 + }, + { + "epoch": 1.91, + "grad_norm": 2.401059865951538, + "learning_rate": 6.801314073920377e-06, + "logits/chosen": -0.5282093286514282, + "logits/rejected": -0.5811914801597595, + "logps/chosen": -53.327938079833984, + "logps/rejected": -88.46408081054688, + "loss": 0.7311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9509389400482178, + "rewards/margins": 4.778065204620361, + "rewards/rejected": -1.8271267414093018, + "step": 7648 + }, + { + "epoch": 1.91, + "grad_norm": 3.885744333267212, + "learning_rate": 6.800580842417662e-06, + "logits/chosen": -0.5230265259742737, + "logits/rejected": -0.627415657043457, + "logps/chosen": -51.698829650878906, + "logps/rejected": -83.28091430664062, + "loss": 0.6735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.094691514968872, + "rewards/margins": 5.623612880706787, + "rewards/rejected": -2.528921604156494, + "step": 7649 + }, + { + "epoch": 1.91, + "grad_norm": 8.071075439453125, + "learning_rate": 6.799847566420679e-06, + "logits/chosen": -0.3978198766708374, + "logits/rejected": -0.44484013319015503, + "logps/chosen": -50.69195556640625, + "logps/rejected": -94.28629302978516, + "loss": 0.6546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7252602577209473, + "rewards/margins": 4.996041297912598, + "rewards/rejected": -2.2707808017730713, + "step": 7650 + }, + { + "epoch": 1.91, + "grad_norm": 4.246086597442627, + "learning_rate": 6.799114245947549e-06, + "logits/chosen": -0.4382261633872986, + "logits/rejected": -0.5432168245315552, + "logps/chosen": -50.54436492919922, + "logps/rejected": -84.02198028564453, + "loss": 0.5831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8379995822906494, + "rewards/margins": 5.112364768981934, + "rewards/rejected": -2.2743656635284424, + "step": 7651 + }, + { + "epoch": 1.91, + "grad_norm": 4.035645961761475, + "learning_rate": 6.798380881016393e-06, + "logits/chosen": -0.4216180741786957, + "logits/rejected": -0.46450334787368774, + "logps/chosen": -58.489620208740234, + "logps/rejected": -85.08293914794922, + "loss": 0.6742, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1561906337738037, + "rewards/margins": 4.263364791870117, + "rewards/rejected": -1.1071739196777344, + "step": 7652 + }, + { + "epoch": 1.91, + "grad_norm": 7.287083625793457, + "learning_rate": 6.797647471645332e-06, + "logits/chosen": -0.44411587715148926, + "logits/rejected": -0.4542473256587982, + "logps/chosen": -57.7636604309082, + "logps/rejected": -107.85033416748047, + "loss": 0.7024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6753485202789307, + "rewards/margins": 5.411155700683594, + "rewards/rejected": -2.735806941986084, + "step": 7653 + }, + { + "epoch": 1.91, + "grad_norm": 5.393913269042969, + "learning_rate": 6.796914017852489e-06, + "logits/chosen": -0.45978185534477234, + "logits/rejected": -0.5126807689666748, + "logps/chosen": -67.68061828613281, + "logps/rejected": -85.20022583007812, + "loss": 0.8278, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0848567485809326, + "rewards/margins": 4.386187553405762, + "rewards/rejected": -1.301330804824829, + "step": 7654 + }, + { + "epoch": 1.92, + "grad_norm": 3.9509458541870117, + "learning_rate": 6.79618051965599e-06, + "logits/chosen": -0.4656614065170288, + "logits/rejected": -0.5342633128166199, + "logps/chosen": -54.73136520385742, + "logps/rejected": -90.64783477783203, + "loss": 0.6566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.065359115600586, + "rewards/margins": 6.218193531036377, + "rewards/rejected": -3.152834177017212, + "step": 7655 + }, + { + "epoch": 1.92, + "grad_norm": 4.667917251586914, + "learning_rate": 6.795446977073959e-06, + "logits/chosen": -0.416461318731308, + "logits/rejected": -0.5338780879974365, + "logps/chosen": -55.51023483276367, + "logps/rejected": -85.72174835205078, + "loss": 0.573, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.527797222137451, + "rewards/margins": 6.339461803436279, + "rewards/rejected": -3.811664581298828, + "step": 7656 + }, + { + "epoch": 1.92, + "grad_norm": 15.52291488647461, + "learning_rate": 6.794713390124526e-06, + "logits/chosen": -0.5347276329994202, + "logits/rejected": -0.5519330501556396, + "logps/chosen": -59.631649017333984, + "logps/rejected": -104.45675659179688, + "loss": 0.9057, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.530324935913086, + "rewards/margins": 5.610377311706543, + "rewards/rejected": -3.080052375793457, + "step": 7657 + }, + { + "epoch": 1.92, + "grad_norm": 3.6722726821899414, + "learning_rate": 6.793979758825814e-06, + "logits/chosen": -0.410783052444458, + "logits/rejected": -0.4854108989238739, + "logps/chosen": -54.299232482910156, + "logps/rejected": -94.16130828857422, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1545889377593994, + "rewards/margins": 6.392470836639404, + "rewards/rejected": -3.237881660461426, + "step": 7658 + }, + { + "epoch": 1.92, + "grad_norm": 5.058290004730225, + "learning_rate": 6.793246083195954e-06, + "logits/chosen": -0.4673493504524231, + "logits/rejected": -0.5262848138809204, + "logps/chosen": -64.41670989990234, + "logps/rejected": -95.5296401977539, + "loss": 0.6278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0286855697631836, + "rewards/margins": 5.961827754974365, + "rewards/rejected": -2.9331419467926025, + "step": 7659 + }, + { + "epoch": 1.92, + "grad_norm": 10.329567909240723, + "learning_rate": 6.792512363253076e-06, + "logits/chosen": -0.41237521171569824, + "logits/rejected": -0.4777756929397583, + "logps/chosen": -57.13732147216797, + "logps/rejected": -85.60507202148438, + "loss": 0.8129, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.801846504211426, + "rewards/margins": 4.527304172515869, + "rewards/rejected": -1.7254576683044434, + "step": 7660 + }, + { + "epoch": 1.92, + "grad_norm": 8.332476615905762, + "learning_rate": 6.7917785990153126e-06, + "logits/chosen": -0.5082665085792542, + "logits/rejected": -0.5438117384910583, + "logps/chosen": -51.97108840942383, + "logps/rejected": -82.73585510253906, + "loss": 0.7596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8780455589294434, + "rewards/margins": 5.278243064880371, + "rewards/rejected": -2.4001972675323486, + "step": 7661 + }, + { + "epoch": 1.92, + "grad_norm": 4.591156005859375, + "learning_rate": 6.791044790500791e-06, + "logits/chosen": -0.39535224437713623, + "logits/rejected": -0.49348506331443787, + "logps/chosen": -61.54242706298828, + "logps/rejected": -105.05197143554688, + "loss": 0.6082, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.783802032470703, + "rewards/margins": 5.928269386291504, + "rewards/rejected": -3.1444668769836426, + "step": 7662 + }, + { + "epoch": 1.92, + "grad_norm": 14.621843338012695, + "learning_rate": 6.790310937727651e-06, + "logits/chosen": -0.5130934715270996, + "logits/rejected": -0.6314244270324707, + "logps/chosen": -64.22338104248047, + "logps/rejected": -88.69903564453125, + "loss": 0.9833, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7547802925109863, + "rewards/margins": 5.084591865539551, + "rewards/rejected": -2.3298115730285645, + "step": 7663 + }, + { + "epoch": 1.92, + "grad_norm": 4.9543609619140625, + "learning_rate": 6.7895770407140216e-06, + "logits/chosen": -0.48181667923927307, + "logits/rejected": -0.4685346782207489, + "logps/chosen": -54.83055877685547, + "logps/rejected": -114.53807067871094, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972687244415283, + "rewards/margins": 7.3453288078308105, + "rewards/rejected": -4.372641563415527, + "step": 7664 + }, + { + "epoch": 1.92, + "grad_norm": 9.464693069458008, + "learning_rate": 6.788843099478041e-06, + "logits/chosen": -0.45710647106170654, + "logits/rejected": -0.49099111557006836, + "logps/chosen": -58.905059814453125, + "logps/rejected": -110.03690338134766, + "loss": 0.7965, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9430177211761475, + "rewards/margins": 5.341880798339844, + "rewards/rejected": -2.398862838745117, + "step": 7665 + }, + { + "epoch": 1.92, + "grad_norm": 4.3927321434021, + "learning_rate": 6.788109114037844e-06, + "logits/chosen": -0.4354502558708191, + "logits/rejected": -0.5570589303970337, + "logps/chosen": -57.281005859375, + "logps/rejected": -94.97126007080078, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.409830331802368, + "rewards/margins": 7.2942094802856445, + "rewards/rejected": -3.8843801021575928, + "step": 7666 + }, + { + "epoch": 1.92, + "grad_norm": 14.867512702941895, + "learning_rate": 6.787375084411569e-06, + "logits/chosen": -0.44730955362319946, + "logits/rejected": -0.47889217734336853, + "logps/chosen": -55.220733642578125, + "logps/rejected": -93.06656646728516, + "loss": 0.9783, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6673147678375244, + "rewards/margins": 5.216631889343262, + "rewards/rejected": -2.549316883087158, + "step": 7667 + }, + { + "epoch": 1.92, + "grad_norm": 4.425867080688477, + "learning_rate": 6.786641010617354e-06, + "logits/chosen": -0.43840864300727844, + "logits/rejected": -0.5224789977073669, + "logps/chosen": -58.790313720703125, + "logps/rejected": -95.92718505859375, + "loss": 0.7697, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0508065223693848, + "rewards/margins": 6.537793159484863, + "rewards/rejected": -3.4869863986968994, + "step": 7668 + }, + { + "epoch": 1.92, + "grad_norm": 9.766420364379883, + "learning_rate": 6.785906892673339e-06, + "logits/chosen": -0.4647758901119232, + "logits/rejected": -0.5223840475082397, + "logps/chosen": -46.14120101928711, + "logps/rejected": -81.94720458984375, + "loss": 0.766, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.874445915222168, + "rewards/margins": 5.3327860832214355, + "rewards/rejected": -2.4583404064178467, + "step": 7669 + }, + { + "epoch": 1.92, + "grad_norm": 20.531057357788086, + "learning_rate": 6.785172730597668e-06, + "logits/chosen": -0.5459198951721191, + "logits/rejected": -0.5771603584289551, + "logps/chosen": -50.27245330810547, + "logps/rejected": -101.77452850341797, + "loss": 0.9067, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.950737714767456, + "rewards/margins": 5.3835272789001465, + "rewards/rejected": -2.432788848876953, + "step": 7670 + }, + { + "epoch": 1.92, + "grad_norm": 4.180569648742676, + "learning_rate": 6.784438524408477e-06, + "logits/chosen": -0.4275361895561218, + "logits/rejected": -0.5396714210510254, + "logps/chosen": -64.60531616210938, + "logps/rejected": -99.75577545166016, + "loss": 0.6884, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.817295551300049, + "rewards/margins": 6.529689788818359, + "rewards/rejected": -3.7123942375183105, + "step": 7671 + }, + { + "epoch": 1.92, + "grad_norm": 13.786069869995117, + "learning_rate": 6.783704274123913e-06, + "logits/chosen": -0.43488335609436035, + "logits/rejected": -0.5751137733459473, + "logps/chosen": -55.06230163574219, + "logps/rejected": -77.56529998779297, + "loss": 0.7152, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1068477630615234, + "rewards/margins": 5.930617332458496, + "rewards/rejected": -2.8237693309783936, + "step": 7672 + }, + { + "epoch": 1.92, + "grad_norm": 9.73066520690918, + "learning_rate": 6.782969979762119e-06, + "logits/chosen": -0.4704611301422119, + "logits/rejected": -0.5591486692428589, + "logps/chosen": -56.7791862487793, + "logps/rejected": -79.08572387695312, + "loss": 0.8634, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7291598320007324, + "rewards/margins": 5.292445659637451, + "rewards/rejected": -2.5632853507995605, + "step": 7673 + }, + { + "epoch": 1.92, + "grad_norm": 7.940004348754883, + "learning_rate": 6.782235641341241e-06, + "logits/chosen": -0.4269411265850067, + "logits/rejected": -0.5081309676170349, + "logps/chosen": -64.05000305175781, + "logps/rejected": -93.48846435546875, + "loss": 0.8075, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.621192693710327, + "rewards/margins": 4.501706600189209, + "rewards/rejected": -1.8805134296417236, + "step": 7674 + }, + { + "epoch": 1.92, + "grad_norm": 5.319779872894287, + "learning_rate": 6.781501258879425e-06, + "logits/chosen": -0.38386765122413635, + "logits/rejected": -0.48194220662117004, + "logps/chosen": -54.321807861328125, + "logps/rejected": -93.25031280517578, + "loss": 0.6147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1035425662994385, + "rewards/margins": 5.9839372634887695, + "rewards/rejected": -2.880394458770752, + "step": 7675 + }, + { + "epoch": 1.92, + "grad_norm": 5.781062126159668, + "learning_rate": 6.780766832394816e-06, + "logits/chosen": -0.38610097765922546, + "logits/rejected": -0.5262880325317383, + "logps/chosen": -73.36964416503906, + "logps/rejected": -95.49333190917969, + "loss": 0.8929, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5960752964019775, + "rewards/margins": 5.8805365562438965, + "rewards/rejected": -3.284461498260498, + "step": 7676 + }, + { + "epoch": 1.92, + "grad_norm": 6.732096195220947, + "learning_rate": 6.780032361905567e-06, + "logits/chosen": -0.36582300066947937, + "logits/rejected": -0.4314079284667969, + "logps/chosen": -48.25007629394531, + "logps/rejected": -87.75786590576172, + "loss": 0.5849, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.122821807861328, + "rewards/margins": 5.067942142486572, + "rewards/rejected": -1.9451199769973755, + "step": 7677 + }, + { + "epoch": 1.92, + "grad_norm": 4.082574844360352, + "learning_rate": 6.779297847429822e-06, + "logits/chosen": -0.4008365571498871, + "logits/rejected": -0.5073256492614746, + "logps/chosen": -58.58414077758789, + "logps/rejected": -83.15132904052734, + "loss": 0.6981, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.877500534057617, + "rewards/margins": 4.957762718200684, + "rewards/rejected": -2.0802624225616455, + "step": 7678 + }, + { + "epoch": 1.92, + "grad_norm": 6.496105670928955, + "learning_rate": 6.778563288985737e-06, + "logits/chosen": -0.4002118706703186, + "logits/rejected": -0.5845122933387756, + "logps/chosen": -64.99423217773438, + "logps/rejected": -75.51870727539062, + "loss": 0.6991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.596351385116577, + "rewards/margins": 4.551126003265381, + "rewards/rejected": -1.954774260520935, + "step": 7679 + }, + { + "epoch": 1.92, + "grad_norm": 4.434053421020508, + "learning_rate": 6.77782868659146e-06, + "logits/chosen": -0.4813644587993622, + "logits/rejected": -0.5274560451507568, + "logps/chosen": -54.56619644165039, + "logps/rejected": -86.42874145507812, + "loss": 0.7423, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.920020818710327, + "rewards/margins": 4.3717360496521, + "rewards/rejected": -1.4517147541046143, + "step": 7680 + }, + { + "epoch": 1.92, + "grad_norm": 5.213713645935059, + "learning_rate": 6.777094040265146e-06, + "logits/chosen": -0.4668109118938446, + "logits/rejected": -0.5622548460960388, + "logps/chosen": -53.181793212890625, + "logps/rejected": -90.57453918457031, + "loss": 0.6963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.811985969543457, + "rewards/margins": 5.552446365356445, + "rewards/rejected": -2.740460157394409, + "step": 7681 + }, + { + "epoch": 1.92, + "grad_norm": 3.516730785369873, + "learning_rate": 6.776359350024948e-06, + "logits/chosen": -0.42199423909187317, + "logits/rejected": -0.5908092856407166, + "logps/chosen": -50.960121154785156, + "logps/rejected": -82.77095794677734, + "loss": 0.5843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0663564205169678, + "rewards/margins": 5.962686538696289, + "rewards/rejected": -2.896329879760742, + "step": 7682 + }, + { + "epoch": 1.92, + "grad_norm": 3.9965386390686035, + "learning_rate": 6.77562461588902e-06, + "logits/chosen": -0.43367719650268555, + "logits/rejected": -0.5759811401367188, + "logps/chosen": -43.11117935180664, + "logps/rejected": -95.17825317382812, + "loss": 0.5786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3008294105529785, + "rewards/margins": 5.520989894866943, + "rewards/rejected": -2.220160961151123, + "step": 7683 + }, + { + "epoch": 1.92, + "grad_norm": 4.234814643859863, + "learning_rate": 6.774889837875522e-06, + "logits/chosen": -0.41577261686325073, + "logits/rejected": -0.4551694095134735, + "logps/chosen": -63.06339645385742, + "logps/rejected": -89.85177612304688, + "loss": 0.7254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.901848793029785, + "rewards/margins": 4.371772766113281, + "rewards/rejected": -1.4699242115020752, + "step": 7684 + }, + { + "epoch": 1.92, + "grad_norm": 4.907370090484619, + "learning_rate": 6.7741550160026035e-06, + "logits/chosen": -0.43880295753479004, + "logits/rejected": -0.5079765319824219, + "logps/chosen": -68.91629028320312, + "logps/rejected": -99.01622009277344, + "loss": 0.7365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2245612144470215, + "rewards/margins": 5.131442070007324, + "rewards/rejected": -1.906880497932434, + "step": 7685 + }, + { + "epoch": 1.92, + "grad_norm": 2.6812639236450195, + "learning_rate": 6.773420150288432e-06, + "logits/chosen": -0.4922162890434265, + "logits/rejected": -0.49956774711608887, + "logps/chosen": -54.43144226074219, + "logps/rejected": -112.36911010742188, + "loss": 0.6789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3708245754241943, + "rewards/margins": 5.865914344787598, + "rewards/rejected": -2.4950900077819824, + "step": 7686 + }, + { + "epoch": 1.92, + "grad_norm": 3.282982587814331, + "learning_rate": 6.77268524075116e-06, + "logits/chosen": -0.4224682152271271, + "logits/rejected": -0.4587417542934418, + "logps/chosen": -55.94306945800781, + "logps/rejected": -78.1220703125, + "loss": 0.6879, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0194504261016846, + "rewards/margins": 4.792245388031006, + "rewards/rejected": -1.7727948427200317, + "step": 7687 + }, + { + "epoch": 1.92, + "grad_norm": 9.01318073272705, + "learning_rate": 6.77195028740895e-06, + "logits/chosen": -0.48263049125671387, + "logits/rejected": -0.592616081237793, + "logps/chosen": -58.20762252807617, + "logps/rejected": -85.41117095947266, + "loss": 0.7829, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.753000020980835, + "rewards/margins": 4.891944408416748, + "rewards/rejected": -2.138944387435913, + "step": 7688 + }, + { + "epoch": 1.92, + "grad_norm": 4.327275276184082, + "learning_rate": 6.7712152902799646e-06, + "logits/chosen": -0.4647623896598816, + "logits/rejected": -0.5650925040245056, + "logps/chosen": -55.00861358642578, + "logps/rejected": -82.4945068359375, + "loss": 0.6983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0981767177581787, + "rewards/margins": 5.4174323081970215, + "rewards/rejected": -2.3192551136016846, + "step": 7689 + }, + { + "epoch": 1.92, + "grad_norm": 3.525667190551758, + "learning_rate": 6.770480249382366e-06, + "logits/chosen": -0.43811437487602234, + "logits/rejected": -0.5068207383155823, + "logps/chosen": -48.569984436035156, + "logps/rejected": -93.55489349365234, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.732773542404175, + "rewards/margins": 5.298744201660156, + "rewards/rejected": -2.5659708976745605, + "step": 7690 + }, + { + "epoch": 1.92, + "grad_norm": 3.9752559661865234, + "learning_rate": 6.769745164734316e-06, + "logits/chosen": -0.44435757398605347, + "logits/rejected": -0.538426399230957, + "logps/chosen": -56.659698486328125, + "logps/rejected": -94.82279968261719, + "loss": 0.6406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0346436500549316, + "rewards/margins": 5.242745399475098, + "rewards/rejected": -2.2081024646759033, + "step": 7691 + }, + { + "epoch": 1.92, + "grad_norm": 17.341129302978516, + "learning_rate": 6.769010036353981e-06, + "logits/chosen": -0.3708897531032562, + "logits/rejected": -0.4446222186088562, + "logps/chosen": -62.93191146850586, + "logps/rejected": -95.25161743164062, + "loss": 0.9662, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6312713623046875, + "rewards/margins": 4.619400978088379, + "rewards/rejected": -1.9881293773651123, + "step": 7692 + }, + { + "epoch": 1.92, + "grad_norm": 3.399278402328491, + "learning_rate": 6.768274864259528e-06, + "logits/chosen": -0.4917658269405365, + "logits/rejected": -0.6073437929153442, + "logps/chosen": -53.98139572143555, + "logps/rejected": -79.94598388671875, + "loss": 0.6885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8785767555236816, + "rewards/margins": 5.897104263305664, + "rewards/rejected": -3.0185275077819824, + "step": 7693 + }, + { + "epoch": 1.92, + "grad_norm": 7.1327033042907715, + "learning_rate": 6.767539648469119e-06, + "logits/chosen": -0.4131242334842682, + "logits/rejected": -0.587710976600647, + "logps/chosen": -63.36919403076172, + "logps/rejected": -65.39859771728516, + "loss": 0.777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.711857557296753, + "rewards/margins": 5.149431228637695, + "rewards/rejected": -2.437572956085205, + "step": 7694 + }, + { + "epoch": 1.93, + "grad_norm": 14.95699691772461, + "learning_rate": 6.766804389000928e-06, + "logits/chosen": -0.44143927097320557, + "logits/rejected": -0.4863170087337494, + "logps/chosen": -71.3072738647461, + "logps/rejected": -86.52892303466797, + "loss": 0.9498, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.58113431930542, + "rewards/margins": 4.183245658874512, + "rewards/rejected": -1.6021114587783813, + "step": 7695 + }, + { + "epoch": 1.93, + "grad_norm": 4.151925563812256, + "learning_rate": 6.766069085873121e-06, + "logits/chosen": -0.49286365509033203, + "logits/rejected": -0.5909376740455627, + "logps/chosen": -43.800628662109375, + "logps/rejected": -81.2359619140625, + "loss": 0.6234, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.224543333053589, + "rewards/margins": 4.987152576446533, + "rewards/rejected": -1.7626092433929443, + "step": 7696 + }, + { + "epoch": 1.93, + "grad_norm": 7.078448295593262, + "learning_rate": 6.7653337391038665e-06, + "logits/chosen": -0.47223231196403503, + "logits/rejected": -0.5505444407463074, + "logps/chosen": -59.347103118896484, + "logps/rejected": -82.09938049316406, + "loss": 0.7298, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8651950359344482, + "rewards/margins": 5.080560684204102, + "rewards/rejected": -2.2153661251068115, + "step": 7697 + }, + { + "epoch": 1.93, + "grad_norm": 2.9271397590637207, + "learning_rate": 6.764598348711339e-06, + "logits/chosen": -0.39882972836494446, + "logits/rejected": -0.5334968566894531, + "logps/chosen": -52.14097213745117, + "logps/rejected": -80.9981460571289, + "loss": 0.6279, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.019449234008789, + "rewards/margins": 5.551360607147217, + "rewards/rejected": -2.5319113731384277, + "step": 7698 + }, + { + "epoch": 1.93, + "grad_norm": 7.924300670623779, + "learning_rate": 6.763862914713708e-06, + "logits/chosen": -0.3729129731655121, + "logits/rejected": -0.4913147985935211, + "logps/chosen": -60.12752151489258, + "logps/rejected": -81.1137466430664, + "loss": 0.6928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0955734252929688, + "rewards/margins": 5.857278823852539, + "rewards/rejected": -2.7617053985595703, + "step": 7699 + }, + { + "epoch": 1.93, + "grad_norm": 3.5550129413604736, + "learning_rate": 6.763127437129151e-06, + "logits/chosen": -0.4051288366317749, + "logits/rejected": -0.5017824769020081, + "logps/chosen": -61.33588409423828, + "logps/rejected": -86.46919250488281, + "loss": 0.6656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9529225826263428, + "rewards/margins": 5.513455867767334, + "rewards/rejected": -2.560533046722412, + "step": 7700 + }, + { + "epoch": 1.93, + "grad_norm": 8.56185531616211, + "learning_rate": 6.762391915975838e-06, + "logits/chosen": -0.511832058429718, + "logits/rejected": -0.5957032442092896, + "logps/chosen": -51.56486511230469, + "logps/rejected": -95.79562377929688, + "loss": 0.7482, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990419864654541, + "rewards/margins": 4.171789646148682, + "rewards/rejected": -1.1813700199127197, + "step": 7701 + }, + { + "epoch": 1.93, + "grad_norm": 5.50494384765625, + "learning_rate": 6.761656351271946e-06, + "logits/chosen": -0.4367152154445648, + "logits/rejected": -0.47680675983428955, + "logps/chosen": -55.38533020019531, + "logps/rejected": -94.04782104492188, + "loss": 0.7757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8554017543792725, + "rewards/margins": 3.8979125022888184, + "rewards/rejected": -1.042510747909546, + "step": 7702 + }, + { + "epoch": 1.93, + "grad_norm": 4.383598327636719, + "learning_rate": 6.760920743035652e-06, + "logits/chosen": -0.439637154340744, + "logits/rejected": -0.53280109167099, + "logps/chosen": -55.515403747558594, + "logps/rejected": -90.86653137207031, + "loss": 0.6767, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.720048666000366, + "rewards/margins": 5.951435565948486, + "rewards/rejected": -3.2313873767852783, + "step": 7703 + }, + { + "epoch": 1.93, + "grad_norm": 6.979673385620117, + "learning_rate": 6.7601850912851345e-06, + "logits/chosen": -0.44063371419906616, + "logits/rejected": -0.5562052130699158, + "logps/chosen": -57.73570251464844, + "logps/rejected": -81.3460693359375, + "loss": 0.7214, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7417409420013428, + "rewards/margins": 5.44305419921875, + "rewards/rejected": -2.7013134956359863, + "step": 7704 + }, + { + "epoch": 1.93, + "grad_norm": 10.10916519165039, + "learning_rate": 6.759449396038569e-06, + "logits/chosen": -0.42716121673583984, + "logits/rejected": -0.5245241522789001, + "logps/chosen": -51.13737869262695, + "logps/rejected": -92.81719207763672, + "loss": 0.7978, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.657214403152466, + "rewards/margins": 5.3549604415893555, + "rewards/rejected": -2.6977460384368896, + "step": 7705 + }, + { + "epoch": 1.93, + "grad_norm": 6.143057823181152, + "learning_rate": 6.758713657314138e-06, + "logits/chosen": -0.39239874482154846, + "logits/rejected": -0.46986061334609985, + "logps/chosen": -64.73268127441406, + "logps/rejected": -80.79718017578125, + "loss": 0.6685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105414867401123, + "rewards/margins": 4.847670078277588, + "rewards/rejected": -1.7422552108764648, + "step": 7706 + }, + { + "epoch": 1.93, + "grad_norm": 4.640702247619629, + "learning_rate": 6.757977875130024e-06, + "logits/chosen": -0.45331817865371704, + "logits/rejected": -0.5484687089920044, + "logps/chosen": -53.97504806518555, + "logps/rejected": -104.97350311279297, + "loss": 0.6487, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9373581409454346, + "rewards/margins": 6.247491836547852, + "rewards/rejected": -3.310133218765259, + "step": 7707 + }, + { + "epoch": 1.93, + "grad_norm": 3.312955379486084, + "learning_rate": 6.7572420495044046e-06, + "logits/chosen": -0.4822874665260315, + "logits/rejected": -0.5927518606185913, + "logps/chosen": -53.360897064208984, + "logps/rejected": -99.89327239990234, + "loss": 0.6163, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.283752679824829, + "rewards/margins": 6.52664852142334, + "rewards/rejected": -3.242896318435669, + "step": 7708 + }, + { + "epoch": 1.93, + "grad_norm": 4.13822078704834, + "learning_rate": 6.756506180455467e-06, + "logits/chosen": -0.4971016049385071, + "logits/rejected": -0.6298072338104248, + "logps/chosen": -50.54996871948242, + "logps/rejected": -88.25194549560547, + "loss": 0.6221, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9338631629943848, + "rewards/margins": 6.3010993003845215, + "rewards/rejected": -3.3672361373901367, + "step": 7709 + }, + { + "epoch": 1.93, + "grad_norm": 5.621150016784668, + "learning_rate": 6.755770268001392e-06, + "logits/chosen": -0.537674605846405, + "logits/rejected": -0.5601257085800171, + "logps/chosen": -41.83256912231445, + "logps/rejected": -96.6275405883789, + "loss": 0.6876, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0783510208129883, + "rewards/margins": 5.867611885070801, + "rewards/rejected": -2.7892606258392334, + "step": 7710 + }, + { + "epoch": 1.93, + "grad_norm": 2.4260706901550293, + "learning_rate": 6.755034312160367e-06, + "logits/chosen": -0.43939343094825745, + "logits/rejected": -0.4923055171966553, + "logps/chosen": -61.0108528137207, + "logps/rejected": -100.43815612792969, + "loss": 0.7012, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.489255905151367, + "rewards/margins": 5.919029235839844, + "rewards/rejected": -2.4297735691070557, + "step": 7711 + }, + { + "epoch": 1.93, + "grad_norm": 6.009151935577393, + "learning_rate": 6.754298312950576e-06, + "logits/chosen": -0.4542846083641052, + "logits/rejected": -0.5141363143920898, + "logps/chosen": -64.1565170288086, + "logps/rejected": -81.63774871826172, + "loss": 0.7388, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9349725246429443, + "rewards/margins": 3.4996256828308105, + "rewards/rejected": -0.5646528005599976, + "step": 7712 + }, + { + "epoch": 1.93, + "grad_norm": 8.682639122009277, + "learning_rate": 6.753562270390209e-06, + "logits/chosen": -0.4601239264011383, + "logits/rejected": -0.5571349263191223, + "logps/chosen": -58.60826873779297, + "logps/rejected": -95.85224914550781, + "loss": 0.694, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5513763427734375, + "rewards/margins": 5.286074638366699, + "rewards/rejected": -2.73469877243042, + "step": 7713 + }, + { + "epoch": 1.93, + "grad_norm": 4.157597541809082, + "learning_rate": 6.7528261844974515e-06, + "logits/chosen": -0.40747255086898804, + "logits/rejected": -0.4945400357246399, + "logps/chosen": -66.96663665771484, + "logps/rejected": -90.56324768066406, + "loss": 0.7596, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9030940532684326, + "rewards/margins": 5.208094120025635, + "rewards/rejected": -2.305000066757202, + "step": 7714 + }, + { + "epoch": 1.93, + "grad_norm": 6.711552619934082, + "learning_rate": 6.752090055290497e-06, + "logits/chosen": -0.5003647208213806, + "logits/rejected": -0.5539705753326416, + "logps/chosen": -68.25164031982422, + "logps/rejected": -109.57222747802734, + "loss": 0.6969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.720818042755127, + "rewards/margins": 5.493250370025635, + "rewards/rejected": -2.772432804107666, + "step": 7715 + }, + { + "epoch": 1.93, + "grad_norm": 5.864592552185059, + "learning_rate": 6.751353882787533e-06, + "logits/chosen": -0.4386271834373474, + "logits/rejected": -0.5581788420677185, + "logps/chosen": -60.93345642089844, + "logps/rejected": -95.62483215332031, + "loss": 0.6445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4116668701171875, + "rewards/margins": 5.902719020843506, + "rewards/rejected": -3.4910526275634766, + "step": 7716 + }, + { + "epoch": 1.93, + "grad_norm": 8.084755897521973, + "learning_rate": 6.750617667006749e-06, + "logits/chosen": -0.48741286993026733, + "logits/rejected": -0.566786527633667, + "logps/chosen": -59.572906494140625, + "logps/rejected": -82.20391845703125, + "loss": 0.8229, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.987379550933838, + "rewards/margins": 4.266656398773193, + "rewards/rejected": -1.2792773246765137, + "step": 7717 + }, + { + "epoch": 1.93, + "grad_norm": 3.7029879093170166, + "learning_rate": 6.749881407966345e-06, + "logits/chosen": -0.4923389256000519, + "logits/rejected": -0.5888835191726685, + "logps/chosen": -43.04878234863281, + "logps/rejected": -88.7362060546875, + "loss": 0.6693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7989230155944824, + "rewards/margins": 5.620587348937988, + "rewards/rejected": -2.821664333343506, + "step": 7718 + }, + { + "epoch": 1.93, + "grad_norm": 14.532015800476074, + "learning_rate": 6.749145105684507e-06, + "logits/chosen": -0.49586227536201477, + "logits/rejected": -0.641010582447052, + "logps/chosen": -55.43369674682617, + "logps/rejected": -86.23413848876953, + "loss": 0.807, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0743329524993896, + "rewards/margins": 6.126476287841797, + "rewards/rejected": -3.0521438121795654, + "step": 7719 + }, + { + "epoch": 1.93, + "grad_norm": 12.548582077026367, + "learning_rate": 6.748408760179434e-06, + "logits/chosen": -0.41379910707473755, + "logits/rejected": -0.5005162954330444, + "logps/chosen": -56.51152038574219, + "logps/rejected": -81.86306762695312, + "loss": 0.7459, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.922909736633301, + "rewards/margins": 4.939435005187988, + "rewards/rejected": -2.0165252685546875, + "step": 7720 + }, + { + "epoch": 1.93, + "grad_norm": 4.244461536407471, + "learning_rate": 6.747672371469319e-06, + "logits/chosen": -0.4599187970161438, + "logits/rejected": -0.5834593772888184, + "logps/chosen": -48.099395751953125, + "logps/rejected": -72.02938842773438, + "loss": 0.7181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9295287132263184, + "rewards/margins": 5.542511940002441, + "rewards/rejected": -2.612982749938965, + "step": 7721 + }, + { + "epoch": 1.93, + "grad_norm": 6.128978252410889, + "learning_rate": 6.746935939572364e-06, + "logits/chosen": -0.4926259219646454, + "logits/rejected": -0.5260714292526245, + "logps/chosen": -53.368526458740234, + "logps/rejected": -77.67030334472656, + "loss": 0.9209, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.945817470550537, + "rewards/margins": 4.203770160675049, + "rewards/rejected": -1.25795316696167, + "step": 7722 + }, + { + "epoch": 1.93, + "grad_norm": 3.0873985290527344, + "learning_rate": 6.746199464506761e-06, + "logits/chosen": -0.4222216010093689, + "logits/rejected": -0.5711871981620789, + "logps/chosen": -58.34616470336914, + "logps/rejected": -77.7916488647461, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1127095222473145, + "rewards/margins": 5.535740852355957, + "rewards/rejected": -2.4230315685272217, + "step": 7723 + }, + { + "epoch": 1.93, + "grad_norm": 7.364283561706543, + "learning_rate": 6.745462946290713e-06, + "logits/chosen": -0.46065640449523926, + "logits/rejected": -0.5271169543266296, + "logps/chosen": -59.9496955871582, + "logps/rejected": -119.40453338623047, + "loss": 0.75, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.707024097442627, + "rewards/margins": 5.539722442626953, + "rewards/rejected": -2.832698345184326, + "step": 7724 + }, + { + "epoch": 1.93, + "grad_norm": 6.475995063781738, + "learning_rate": 6.744726384942419e-06, + "logits/chosen": -0.42941126227378845, + "logits/rejected": -0.5271435976028442, + "logps/chosen": -60.10035705566406, + "logps/rejected": -85.3565673828125, + "loss": 0.8395, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.922875165939331, + "rewards/margins": 5.303750514984131, + "rewards/rejected": -2.3808746337890625, + "step": 7725 + }, + { + "epoch": 1.93, + "grad_norm": 7.778172016143799, + "learning_rate": 6.74398978048008e-06, + "logits/chosen": -0.44282957911491394, + "logits/rejected": -0.48987236618995667, + "logps/chosen": -46.956275939941406, + "logps/rejected": -97.28026580810547, + "loss": 0.7578, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9681384563446045, + "rewards/margins": 4.427768230438232, + "rewards/rejected": -1.4596296548843384, + "step": 7726 + }, + { + "epoch": 1.93, + "grad_norm": 3.4891767501831055, + "learning_rate": 6.7432531329219e-06, + "logits/chosen": -0.4307044446468353, + "logits/rejected": -0.5454239845275879, + "logps/chosen": -55.674827575683594, + "logps/rejected": -84.1561050415039, + "loss": 0.6665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.967710018157959, + "rewards/margins": 5.354741096496582, + "rewards/rejected": -2.387031316757202, + "step": 7727 + }, + { + "epoch": 1.93, + "grad_norm": 3.341797113418579, + "learning_rate": 6.742516442286078e-06, + "logits/chosen": -0.4047950506210327, + "logits/rejected": -0.49359726905822754, + "logps/chosen": -49.36861038208008, + "logps/rejected": -79.54905700683594, + "loss": 0.6378, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1276447772979736, + "rewards/margins": 4.971073627471924, + "rewards/rejected": -1.8434289693832397, + "step": 7728 + }, + { + "epoch": 1.93, + "grad_norm": 4.695243835449219, + "learning_rate": 6.741779708590824e-06, + "logits/chosen": -0.47952204942703247, + "logits/rejected": -0.5142403244972229, + "logps/chosen": -51.13204574584961, + "logps/rejected": -88.78365325927734, + "loss": 0.7131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.927847146987915, + "rewards/margins": 5.161042213439941, + "rewards/rejected": -2.233194589614868, + "step": 7729 + }, + { + "epoch": 1.93, + "grad_norm": 5.626506328582764, + "learning_rate": 6.741042931854341e-06, + "logits/chosen": -0.43459761142730713, + "logits/rejected": -0.5037413239479065, + "logps/chosen": -56.69382858276367, + "logps/rejected": -90.78592681884766, + "loss": 0.775, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.089148759841919, + "rewards/margins": 4.8853020668029785, + "rewards/rejected": -1.7961530685424805, + "step": 7730 + }, + { + "epoch": 1.93, + "grad_norm": 5.643693447113037, + "learning_rate": 6.740306112094832e-06, + "logits/chosen": -0.4615189731121063, + "logits/rejected": -0.5449984073638916, + "logps/chosen": -49.52607345581055, + "logps/rejected": -75.4973373413086, + "loss": 0.7958, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0440726280212402, + "rewards/margins": 4.690945625305176, + "rewards/rejected": -1.6468729972839355, + "step": 7731 + }, + { + "epoch": 1.93, + "grad_norm": 6.161096096038818, + "learning_rate": 6.739569249330511e-06, + "logits/chosen": -0.39411261677742004, + "logits/rejected": -0.49211204051971436, + "logps/chosen": -64.310302734375, + "logps/rejected": -82.73036193847656, + "loss": 0.8621, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1712186336517334, + "rewards/margins": 4.695300102233887, + "rewards/rejected": -1.5240812301635742, + "step": 7732 + }, + { + "epoch": 1.93, + "grad_norm": 3.275102376937866, + "learning_rate": 6.738832343579582e-06, + "logits/chosen": -0.4868468940258026, + "logits/rejected": -0.5258538722991943, + "logps/chosen": -54.773582458496094, + "logps/rejected": -99.16199493408203, + "loss": 0.6235, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8085989952087402, + "rewards/margins": 5.350007057189941, + "rewards/rejected": -2.541408061981201, + "step": 7733 + }, + { + "epoch": 1.93, + "grad_norm": 3.7103991508483887, + "learning_rate": 6.738095394860258e-06, + "logits/chosen": -0.3746994733810425, + "logits/rejected": -0.4740302562713623, + "logps/chosen": -58.274539947509766, + "logps/rejected": -99.62503051757812, + "loss": 0.6361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.11411714553833, + "rewards/margins": 5.3132524490356445, + "rewards/rejected": -2.1991353034973145, + "step": 7734 + }, + { + "epoch": 1.94, + "grad_norm": 2.969135046005249, + "learning_rate": 6.737358403190746e-06, + "logits/chosen": -0.40221813321113586, + "logits/rejected": -0.5016313195228577, + "logps/chosen": -47.37537384033203, + "logps/rejected": -86.20680236816406, + "loss": 0.6466, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9386701583862305, + "rewards/margins": 5.691067695617676, + "rewards/rejected": -2.7523975372314453, + "step": 7735 + }, + { + "epoch": 1.94, + "grad_norm": 10.59122371673584, + "learning_rate": 6.736621368589263e-06, + "logits/chosen": -0.5122605562210083, + "logits/rejected": -0.6312592029571533, + "logps/chosen": -55.78754425048828, + "logps/rejected": -82.95831298828125, + "loss": 0.7515, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8722314834594727, + "rewards/margins": 4.614654541015625, + "rewards/rejected": -1.7424230575561523, + "step": 7736 + }, + { + "epoch": 1.94, + "grad_norm": 5.353503704071045, + "learning_rate": 6.735884291074014e-06, + "logits/chosen": -0.382699191570282, + "logits/rejected": -0.4785556197166443, + "logps/chosen": -55.01308059692383, + "logps/rejected": -81.56171417236328, + "loss": 0.8029, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8748960494995117, + "rewards/margins": 4.642064571380615, + "rewards/rejected": -1.7671682834625244, + "step": 7737 + }, + { + "epoch": 1.94, + "grad_norm": 13.171319007873535, + "learning_rate": 6.735147170663221e-06, + "logits/chosen": -0.41274788975715637, + "logits/rejected": -0.5347650647163391, + "logps/chosen": -53.118656158447266, + "logps/rejected": -88.95494842529297, + "loss": 0.652, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7645132541656494, + "rewards/margins": 5.791085243225098, + "rewards/rejected": -3.026571750640869, + "step": 7738 + }, + { + "epoch": 1.94, + "grad_norm": 5.534273147583008, + "learning_rate": 6.734410007375095e-06, + "logits/chosen": -0.47530046105384827, + "logits/rejected": -0.5385675430297852, + "logps/chosen": -55.400604248046875, + "logps/rejected": -69.71639251708984, + "loss": 0.7801, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.097722291946411, + "rewards/margins": 4.470606803894043, + "rewards/rejected": -1.3728845119476318, + "step": 7739 + }, + { + "epoch": 1.94, + "grad_norm": 7.15962553024292, + "learning_rate": 6.733672801227853e-06, + "logits/chosen": -0.4792183041572571, + "logits/rejected": -0.5526885390281677, + "logps/chosen": -59.706932067871094, + "logps/rejected": -88.1058120727539, + "loss": 0.7052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.748321294784546, + "rewards/margins": 5.356197357177734, + "rewards/rejected": -2.6078765392303467, + "step": 7740 + }, + { + "epoch": 1.94, + "grad_norm": 6.683245658874512, + "learning_rate": 6.7329355522397134e-06, + "logits/chosen": -0.40431568026542664, + "logits/rejected": -0.44977736473083496, + "logps/chosen": -63.923500061035156, + "logps/rejected": -93.38006591796875, + "loss": 0.803, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2075116634368896, + "rewards/margins": 4.538377285003662, + "rewards/rejected": -1.3308659791946411, + "step": 7741 + }, + { + "epoch": 1.94, + "grad_norm": 4.045650005340576, + "learning_rate": 6.7321982604288915e-06, + "logits/chosen": -0.48792093992233276, + "logits/rejected": -0.557563841342926, + "logps/chosen": -51.57206726074219, + "logps/rejected": -81.19974517822266, + "loss": 0.7224, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9626495838165283, + "rewards/margins": 5.198275566101074, + "rewards/rejected": -2.235626220703125, + "step": 7742 + }, + { + "epoch": 1.94, + "grad_norm": 1.9804869890213013, + "learning_rate": 6.7314609258136085e-06, + "logits/chosen": -0.43196162581443787, + "logits/rejected": -0.5196906328201294, + "logps/chosen": -44.066017150878906, + "logps/rejected": -89.32854461669922, + "loss": 0.5622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0134732723236084, + "rewards/margins": 5.509901523590088, + "rewards/rejected": -2.4964284896850586, + "step": 7743 + }, + { + "epoch": 1.94, + "grad_norm": 3.6401827335357666, + "learning_rate": 6.730723548412083e-06, + "logits/chosen": -0.5001099109649658, + "logits/rejected": -0.5783154964447021, + "logps/chosen": -51.29827880859375, + "logps/rejected": -88.713134765625, + "loss": 0.5879, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2313265800476074, + "rewards/margins": 5.835756301879883, + "rewards/rejected": -2.6044299602508545, + "step": 7744 + }, + { + "epoch": 1.94, + "grad_norm": 2.636509418487549, + "learning_rate": 6.729986128242541e-06, + "logits/chosen": -0.4467063546180725, + "logits/rejected": -0.5345006585121155, + "logps/chosen": -52.094635009765625, + "logps/rejected": -87.81432342529297, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.966597080230713, + "rewards/margins": 5.626829624176025, + "rewards/rejected": -2.6602330207824707, + "step": 7745 + }, + { + "epoch": 1.94, + "grad_norm": 6.845312595367432, + "learning_rate": 6.7292486653231995e-06, + "logits/chosen": -0.4333099126815796, + "logits/rejected": -0.5144544839859009, + "logps/chosen": -50.86981201171875, + "logps/rejected": -90.24718475341797, + "loss": 0.6767, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.916140079498291, + "rewards/margins": 5.892181873321533, + "rewards/rejected": -2.9760422706604004, + "step": 7746 + }, + { + "epoch": 1.94, + "grad_norm": 3.1000943183898926, + "learning_rate": 6.728511159672284e-06, + "logits/chosen": -0.4114547073841095, + "logits/rejected": -0.4744577705860138, + "logps/chosen": -55.869834899902344, + "logps/rejected": -104.39505004882812, + "loss": 0.663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.961272954940796, + "rewards/margins": 5.314723491668701, + "rewards/rejected": -2.353450059890747, + "step": 7747 + }, + { + "epoch": 1.94, + "grad_norm": 3.801527500152588, + "learning_rate": 6.7277736113080204e-06, + "logits/chosen": -0.45152747631073, + "logits/rejected": -0.47648710012435913, + "logps/chosen": -51.78037643432617, + "logps/rejected": -86.25433349609375, + "loss": 0.7804, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8845510482788086, + "rewards/margins": 4.707879066467285, + "rewards/rejected": -1.8233275413513184, + "step": 7748 + }, + { + "epoch": 1.94, + "grad_norm": 6.268484592437744, + "learning_rate": 6.727036020248632e-06, + "logits/chosen": -0.4369755983352661, + "logits/rejected": -0.5321828722953796, + "logps/chosen": -55.60089874267578, + "logps/rejected": -83.27001190185547, + "loss": 0.6835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7080817222595215, + "rewards/margins": 4.857771873474121, + "rewards/rejected": -2.1496899127960205, + "step": 7749 + }, + { + "epoch": 1.94, + "grad_norm": 3.7941105365753174, + "learning_rate": 6.7262983865123475e-06, + "logits/chosen": -0.4055357277393341, + "logits/rejected": -0.48372772336006165, + "logps/chosen": -61.140777587890625, + "logps/rejected": -93.7945556640625, + "loss": 0.7499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9920806884765625, + "rewards/margins": 5.376182556152344, + "rewards/rejected": -2.3841021060943604, + "step": 7750 + }, + { + "epoch": 1.94, + "grad_norm": 3.8758175373077393, + "learning_rate": 6.7255607101173924e-06, + "logits/chosen": -0.5074331164360046, + "logits/rejected": -0.5516677498817444, + "logps/chosen": -45.276161193847656, + "logps/rejected": -82.62474060058594, + "loss": 0.6354, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9927875995635986, + "rewards/margins": 5.76526403427124, + "rewards/rejected": -2.7724761962890625, + "step": 7751 + }, + { + "epoch": 1.94, + "grad_norm": 5.155344486236572, + "learning_rate": 6.724822991081998e-06, + "logits/chosen": -0.4650416076183319, + "logits/rejected": -0.5687941312789917, + "logps/chosen": -57.34518051147461, + "logps/rejected": -91.52618408203125, + "loss": 0.6372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.793968677520752, + "rewards/margins": 5.719714164733887, + "rewards/rejected": -2.9257454872131348, + "step": 7752 + }, + { + "epoch": 1.94, + "grad_norm": 5.894489288330078, + "learning_rate": 6.724085229424393e-06, + "logits/chosen": -0.4351767599582672, + "logits/rejected": -0.583835244178772, + "logps/chosen": -58.825435638427734, + "logps/rejected": -73.95791625976562, + "loss": 0.6788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6491928100585938, + "rewards/margins": 5.370849132537842, + "rewards/rejected": -2.721656322479248, + "step": 7753 + }, + { + "epoch": 1.94, + "grad_norm": 3.2360358238220215, + "learning_rate": 6.723347425162807e-06, + "logits/chosen": -0.42787668108940125, + "logits/rejected": -0.6181950569152832, + "logps/chosen": -56.47895431518555, + "logps/rejected": -67.42756652832031, + "loss": 0.6226, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8438899517059326, + "rewards/margins": 5.343045234680176, + "rewards/rejected": -2.499155044555664, + "step": 7754 + }, + { + "epoch": 1.94, + "grad_norm": 5.277298450469971, + "learning_rate": 6.722609578315474e-06, + "logits/chosen": -0.5669835805892944, + "logits/rejected": -0.6180689930915833, + "logps/chosen": -50.86803436279297, + "logps/rejected": -92.32866668701172, + "loss": 0.7416, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9444382190704346, + "rewards/margins": 5.657050132751465, + "rewards/rejected": -2.7126126289367676, + "step": 7755 + }, + { + "epoch": 1.94, + "grad_norm": 5.195717811584473, + "learning_rate": 6.721871688900626e-06, + "logits/chosen": -0.4963921010494232, + "logits/rejected": -0.5717991590499878, + "logps/chosen": -50.90421676635742, + "logps/rejected": -89.3653793334961, + "loss": 0.7487, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1556549072265625, + "rewards/margins": 4.654804706573486, + "rewards/rejected": -1.4991497993469238, + "step": 7756 + }, + { + "epoch": 1.94, + "grad_norm": 14.684391021728516, + "learning_rate": 6.7211337569364965e-06, + "logits/chosen": -0.400773823261261, + "logits/rejected": -0.510526180267334, + "logps/chosen": -66.08240509033203, + "logps/rejected": -80.41554260253906, + "loss": 0.6938, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977275848388672, + "rewards/margins": 5.355698585510254, + "rewards/rejected": -2.3784232139587402, + "step": 7757 + }, + { + "epoch": 1.94, + "grad_norm": 9.364693641662598, + "learning_rate": 6.720395782441321e-06, + "logits/chosen": -0.4909966289997101, + "logits/rejected": -0.5707101821899414, + "logps/chosen": -58.958126068115234, + "logps/rejected": -81.80773162841797, + "loss": 0.7389, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0520660877227783, + "rewards/margins": 5.430548191070557, + "rewards/rejected": -2.3784823417663574, + "step": 7758 + }, + { + "epoch": 1.94, + "grad_norm": 5.9247002601623535, + "learning_rate": 6.719657765433336e-06, + "logits/chosen": -0.47436657547950745, + "logits/rejected": -0.534618616104126, + "logps/chosen": -62.92660140991211, + "logps/rejected": -91.30311584472656, + "loss": 0.7252, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8266711235046387, + "rewards/margins": 5.253569602966309, + "rewards/rejected": -2.426898241043091, + "step": 7759 + }, + { + "epoch": 1.94, + "grad_norm": 4.818549156188965, + "learning_rate": 6.71891970593078e-06, + "logits/chosen": -0.39309990406036377, + "logits/rejected": -0.4776785969734192, + "logps/chosen": -52.96552658081055, + "logps/rejected": -104.57059478759766, + "loss": 0.6932, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.109389305114746, + "rewards/margins": 6.792131423950195, + "rewards/rejected": -3.682742118835449, + "step": 7760 + }, + { + "epoch": 1.94, + "grad_norm": 5.215530872344971, + "learning_rate": 6.718181603951889e-06, + "logits/chosen": -0.40908926725387573, + "logits/rejected": -0.51181560754776, + "logps/chosen": -47.04759979248047, + "logps/rejected": -82.79107666015625, + "loss": 0.6588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.035844087600708, + "rewards/margins": 5.744264125823975, + "rewards/rejected": -2.7084195613861084, + "step": 7761 + }, + { + "epoch": 1.94, + "grad_norm": 10.76789379119873, + "learning_rate": 6.717443459514903e-06, + "logits/chosen": -0.4763679802417755, + "logits/rejected": -0.591867983341217, + "logps/chosen": -48.310264587402344, + "logps/rejected": -103.18010711669922, + "loss": 0.5994, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0015389919281006, + "rewards/margins": 6.824379920959473, + "rewards/rejected": -3.822840929031372, + "step": 7762 + }, + { + "epoch": 1.94, + "grad_norm": 14.348876953125, + "learning_rate": 6.716705272638061e-06, + "logits/chosen": -0.5218591094017029, + "logits/rejected": -0.5522705316543579, + "logps/chosen": -56.05052947998047, + "logps/rejected": -108.05522918701172, + "loss": 0.7337, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.688041925430298, + "rewards/margins": 5.630900859832764, + "rewards/rejected": -2.942858934402466, + "step": 7763 + }, + { + "epoch": 1.94, + "grad_norm": 13.690716743469238, + "learning_rate": 6.7159670433396066e-06, + "logits/chosen": -0.43459761142730713, + "logits/rejected": -0.5407260060310364, + "logps/chosen": -62.004676818847656, + "logps/rejected": -85.62901306152344, + "loss": 0.6872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.867936611175537, + "rewards/margins": 5.895524024963379, + "rewards/rejected": -3.027587890625, + "step": 7764 + }, + { + "epoch": 1.94, + "grad_norm": 4.550796985626221, + "learning_rate": 6.715228771637782e-06, + "logits/chosen": -0.520399808883667, + "logits/rejected": -0.5557699799537659, + "logps/chosen": -50.944236755371094, + "logps/rejected": -92.09659576416016, + "loss": 0.6631, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9686381816864014, + "rewards/margins": 4.996521949768066, + "rewards/rejected": -2.027883768081665, + "step": 7765 + }, + { + "epoch": 1.94, + "grad_norm": 4.7873005867004395, + "learning_rate": 6.714490457550831e-06, + "logits/chosen": -0.501273512840271, + "logits/rejected": -0.6115530133247375, + "logps/chosen": -56.13762283325195, + "logps/rejected": -81.26541900634766, + "loss": 0.6453, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9081759452819824, + "rewards/margins": 5.466454029083252, + "rewards/rejected": -2.5582778453826904, + "step": 7766 + }, + { + "epoch": 1.94, + "grad_norm": 19.474687576293945, + "learning_rate": 6.713752101096995e-06, + "logits/chosen": -0.43966591358184814, + "logits/rejected": -0.5056349039077759, + "logps/chosen": -73.51754760742188, + "logps/rejected": -89.77227020263672, + "loss": 0.9782, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.3893074989318848, + "rewards/margins": 4.657210350036621, + "rewards/rejected": -2.2679033279418945, + "step": 7767 + }, + { + "epoch": 1.94, + "grad_norm": 2.821559429168701, + "learning_rate": 6.713013702294522e-06, + "logits/chosen": -0.5468707084655762, + "logits/rejected": -0.5535669326782227, + "logps/chosen": -58.8249397277832, + "logps/rejected": -99.3902359008789, + "loss": 0.6531, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3672053813934326, + "rewards/margins": 5.676694393157959, + "rewards/rejected": -2.3094890117645264, + "step": 7768 + }, + { + "epoch": 1.94, + "grad_norm": 14.585725784301758, + "learning_rate": 6.712275261161658e-06, + "logits/chosen": -0.47588014602661133, + "logits/rejected": -0.5429922938346863, + "logps/chosen": -60.15849685668945, + "logps/rejected": -94.28152465820312, + "loss": 0.8015, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.736588954925537, + "rewards/margins": 5.251163482666016, + "rewards/rejected": -2.514575242996216, + "step": 7769 + }, + { + "epoch": 1.94, + "grad_norm": 11.03294849395752, + "learning_rate": 6.711536777716654e-06, + "logits/chosen": -0.49557140469551086, + "logits/rejected": -0.5935049057006836, + "logps/chosen": -55.800193786621094, + "logps/rejected": -97.09434509277344, + "loss": 0.6301, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0487725734710693, + "rewards/margins": 5.682360649108887, + "rewards/rejected": -2.633587598800659, + "step": 7770 + }, + { + "epoch": 1.94, + "grad_norm": 5.374124526977539, + "learning_rate": 6.710798251977753e-06, + "logits/chosen": -0.46053844690322876, + "logits/rejected": -0.5480297207832336, + "logps/chosen": -54.18768310546875, + "logps/rejected": -105.75065612792969, + "loss": 0.6072, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.868443250656128, + "rewards/margins": 5.788748741149902, + "rewards/rejected": -2.9203054904937744, + "step": 7771 + }, + { + "epoch": 1.94, + "grad_norm": 17.249177932739258, + "learning_rate": 6.7100596839632085e-06, + "logits/chosen": -0.44196373224258423, + "logits/rejected": -0.545300304889679, + "logps/chosen": -69.49374389648438, + "logps/rejected": -93.84125518798828, + "loss": 0.8449, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7420871257781982, + "rewards/margins": 5.557580471038818, + "rewards/rejected": -2.815493583679199, + "step": 7772 + }, + { + "epoch": 1.94, + "grad_norm": 3.09562611579895, + "learning_rate": 6.7093210736912695e-06, + "logits/chosen": -0.49617910385131836, + "logits/rejected": -0.5794885158538818, + "logps/chosen": -56.916748046875, + "logps/rejected": -90.47994995117188, + "loss": 0.683, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0179991722106934, + "rewards/margins": 5.565129280090332, + "rewards/rejected": -2.547130584716797, + "step": 7773 + }, + { + "epoch": 1.94, + "grad_norm": 6.082251071929932, + "learning_rate": 6.7085824211801886e-06, + "logits/chosen": -0.5766561031341553, + "logits/rejected": -0.6326888203620911, + "logps/chosen": -45.973236083984375, + "logps/rejected": -102.6396484375, + "loss": 0.638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.123061418533325, + "rewards/margins": 5.620113372802734, + "rewards/rejected": -2.49705171585083, + "step": 7774 + }, + { + "epoch": 1.95, + "grad_norm": 6.542344093322754, + "learning_rate": 6.70784372644822e-06, + "logits/chosen": -0.4768884778022766, + "logits/rejected": -0.5782265067100525, + "logps/chosen": -63.222747802734375, + "logps/rejected": -89.39065551757812, + "loss": 0.7588, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0139453411102295, + "rewards/margins": 4.681797027587891, + "rewards/rejected": -1.6678521633148193, + "step": 7775 + }, + { + "epoch": 1.95, + "grad_norm": 4.893989086151123, + "learning_rate": 6.707104989513615e-06, + "logits/chosen": -0.5311443209648132, + "logits/rejected": -0.6162519454956055, + "logps/chosen": -54.349891662597656, + "logps/rejected": -87.25772857666016, + "loss": 0.6707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.167184829711914, + "rewards/margins": 5.269351482391357, + "rewards/rejected": -2.102166175842285, + "step": 7776 + }, + { + "epoch": 1.95, + "grad_norm": 5.103865623474121, + "learning_rate": 6.706366210394632e-06, + "logits/chosen": -0.45886653661727905, + "logits/rejected": -0.5420113801956177, + "logps/chosen": -58.449424743652344, + "logps/rejected": -99.15434265136719, + "loss": 0.729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.718046188354492, + "rewards/margins": 5.609959602355957, + "rewards/rejected": -2.8919131755828857, + "step": 7777 + }, + { + "epoch": 1.95, + "grad_norm": 5.878907680511475, + "learning_rate": 6.705627389109522e-06, + "logits/chosen": -0.5030308961868286, + "logits/rejected": -0.5218233466148376, + "logps/chosen": -52.85033416748047, + "logps/rejected": -102.07492065429688, + "loss": 0.637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0826575756073, + "rewards/margins": 6.0606608390808105, + "rewards/rejected": -2.978003740310669, + "step": 7778 + }, + { + "epoch": 1.95, + "grad_norm": 5.051436424255371, + "learning_rate": 6.704888525676549e-06, + "logits/chosen": -0.5116048455238342, + "logits/rejected": -0.586763858795166, + "logps/chosen": -54.5546989440918, + "logps/rejected": -86.6090316772461, + "loss": 0.6336, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.169572114944458, + "rewards/margins": 5.823738098144531, + "rewards/rejected": -2.6541659832000732, + "step": 7779 + }, + { + "epoch": 1.95, + "grad_norm": 6.6216864585876465, + "learning_rate": 6.704149620113962e-06, + "logits/chosen": -0.5103504061698914, + "logits/rejected": -0.5822300910949707, + "logps/chosen": -48.25682067871094, + "logps/rejected": -86.24971008300781, + "loss": 0.7365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.195913553237915, + "rewards/margins": 5.9450836181640625, + "rewards/rejected": -2.7491695880889893, + "step": 7780 + }, + { + "epoch": 1.95, + "grad_norm": 7.433215141296387, + "learning_rate": 6.703410672440029e-06, + "logits/chosen": -0.46719449758529663, + "logits/rejected": -0.6080129146575928, + "logps/chosen": -61.55739212036133, + "logps/rejected": -94.80133056640625, + "loss": 0.6627, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.820936918258667, + "rewards/margins": 6.269863128662109, + "rewards/rejected": -3.4489262104034424, + "step": 7781 + }, + { + "epoch": 1.95, + "grad_norm": 4.223278999328613, + "learning_rate": 6.7026716826730056e-06, + "logits/chosen": -0.4605828821659088, + "logits/rejected": -0.5537393689155579, + "logps/chosen": -48.45140838623047, + "logps/rejected": -91.76669311523438, + "loss": 0.6288, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1667885780334473, + "rewards/margins": 6.162560939788818, + "rewards/rejected": -2.99577260017395, + "step": 7782 + }, + { + "epoch": 1.95, + "grad_norm": 6.8084797859191895, + "learning_rate": 6.701932650831153e-06, + "logits/chosen": -0.49386292695999146, + "logits/rejected": -0.5387825965881348, + "logps/chosen": -62.73686218261719, + "logps/rejected": -108.74288940429688, + "loss": 0.7677, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.801635980606079, + "rewards/margins": 5.368908882141113, + "rewards/rejected": -2.5672731399536133, + "step": 7783 + }, + { + "epoch": 1.95, + "grad_norm": 4.456422805786133, + "learning_rate": 6.7011935769327365e-06, + "logits/chosen": -0.4976709187030792, + "logits/rejected": -0.5706095099449158, + "logps/chosen": -52.413429260253906, + "logps/rejected": -93.52310943603516, + "loss": 0.6412, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.231642246246338, + "rewards/margins": 5.387624263763428, + "rewards/rejected": -2.1559813022613525, + "step": 7784 + }, + { + "epoch": 1.95, + "grad_norm": 7.648993968963623, + "learning_rate": 6.700454460996015e-06, + "logits/chosen": -0.44799497723579407, + "logits/rejected": -0.5307037830352783, + "logps/chosen": -55.230628967285156, + "logps/rejected": -99.96564483642578, + "loss": 0.7672, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.065101146697998, + "rewards/margins": 5.902372360229492, + "rewards/rejected": -2.837270498275757, + "step": 7785 + }, + { + "epoch": 1.95, + "grad_norm": 29.556259155273438, + "learning_rate": 6.699715303039257e-06, + "logits/chosen": -0.4952853322029114, + "logits/rejected": -0.5911332368850708, + "logps/chosen": -50.30217742919922, + "logps/rejected": -109.25497436523438, + "loss": 0.5673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2229537963867188, + "rewards/margins": 7.5648016929626465, + "rewards/rejected": -4.341848850250244, + "step": 7786 + }, + { + "epoch": 1.95, + "grad_norm": 14.724403381347656, + "learning_rate": 6.698976103080724e-06, + "logits/chosen": -0.4167405664920807, + "logits/rejected": -0.4608580768108368, + "logps/chosen": -65.7153549194336, + "logps/rejected": -94.8055191040039, + "loss": 0.9623, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6494386196136475, + "rewards/margins": 3.6630873680114746, + "rewards/rejected": -1.0136489868164062, + "step": 7787 + }, + { + "epoch": 1.95, + "grad_norm": 10.435150146484375, + "learning_rate": 6.698236861138685e-06, + "logits/chosen": -0.461688756942749, + "logits/rejected": -0.5742225646972656, + "logps/chosen": -53.65106964111328, + "logps/rejected": -83.3424072265625, + "loss": 0.6742, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2048397064208984, + "rewards/margins": 6.279698371887207, + "rewards/rejected": -3.0748589038848877, + "step": 7788 + }, + { + "epoch": 1.95, + "grad_norm": 6.794879913330078, + "learning_rate": 6.6974975772314075e-06, + "logits/chosen": -0.43508249521255493, + "logits/rejected": -0.5152363777160645, + "logps/chosen": -55.02174758911133, + "logps/rejected": -113.31608581542969, + "loss": 0.6946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7691543102264404, + "rewards/margins": 5.895317077636719, + "rewards/rejected": -3.1261630058288574, + "step": 7789 + }, + { + "epoch": 1.95, + "grad_norm": 4.589243412017822, + "learning_rate": 6.69675825137716e-06, + "logits/chosen": -0.49696967005729675, + "logits/rejected": -0.5829516649246216, + "logps/chosen": -57.74756622314453, + "logps/rejected": -93.58126831054688, + "loss": 0.7258, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019649028778076, + "rewards/margins": 6.058783531188965, + "rewards/rejected": -3.039134979248047, + "step": 7790 + }, + { + "epoch": 1.95, + "grad_norm": 7.010815143585205, + "learning_rate": 6.69601888359421e-06, + "logits/chosen": -0.4050133228302002, + "logits/rejected": -0.4768449366092682, + "logps/chosen": -58.02850341796875, + "logps/rejected": -109.09400939941406, + "loss": 0.7686, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6856496334075928, + "rewards/margins": 4.992491722106934, + "rewards/rejected": -2.306842088699341, + "step": 7791 + }, + { + "epoch": 1.95, + "grad_norm": 8.4349946975708, + "learning_rate": 6.6952794739008285e-06, + "logits/chosen": -0.5053595900535583, + "logits/rejected": -0.6018227338790894, + "logps/chosen": -56.36429977416992, + "logps/rejected": -76.5987548828125, + "loss": 0.6949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0021097660064697, + "rewards/margins": 5.727322578430176, + "rewards/rejected": -2.725213050842285, + "step": 7792 + }, + { + "epoch": 1.95, + "grad_norm": 3.7442784309387207, + "learning_rate": 6.694540022315291e-06, + "logits/chosen": -0.48099055886268616, + "logits/rejected": -0.5603012442588806, + "logps/chosen": -48.278018951416016, + "logps/rejected": -79.58049774169922, + "loss": 0.6604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2453014850616455, + "rewards/margins": 5.875569820404053, + "rewards/rejected": -2.6302683353424072, + "step": 7793 + }, + { + "epoch": 1.95, + "grad_norm": 3.058305263519287, + "learning_rate": 6.693800528855864e-06, + "logits/chosen": -0.45513996481895447, + "logits/rejected": -0.5386313199996948, + "logps/chosen": -56.244850158691406, + "logps/rejected": -97.16351318359375, + "loss": 0.6466, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.973465919494629, + "rewards/margins": 6.383607864379883, + "rewards/rejected": -3.4101414680480957, + "step": 7794 + }, + { + "epoch": 1.95, + "grad_norm": 5.162930488586426, + "learning_rate": 6.693060993540826e-06, + "logits/chosen": -0.44809478521347046, + "logits/rejected": -0.4937385320663452, + "logps/chosen": -57.71125411987305, + "logps/rejected": -108.19374084472656, + "loss": 0.785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6933436393737793, + "rewards/margins": 5.544438362121582, + "rewards/rejected": -2.851094961166382, + "step": 7795 + }, + { + "epoch": 1.95, + "grad_norm": 6.729604721069336, + "learning_rate": 6.69232141638845e-06, + "logits/chosen": -0.4656490683555603, + "logits/rejected": -0.5343908667564392, + "logps/chosen": -61.40025329589844, + "logps/rejected": -98.08604431152344, + "loss": 0.9098, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.713111639022827, + "rewards/margins": 4.600967884063721, + "rewards/rejected": -1.8878562450408936, + "step": 7796 + }, + { + "epoch": 1.95, + "grad_norm": 5.1041789054870605, + "learning_rate": 6.691581797417012e-06, + "logits/chosen": -0.47760528326034546, + "logits/rejected": -0.5580564141273499, + "logps/chosen": -65.14279174804688, + "logps/rejected": -97.02447509765625, + "loss": 0.7373, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9307892322540283, + "rewards/margins": 5.607430458068848, + "rewards/rejected": -2.6766414642333984, + "step": 7797 + }, + { + "epoch": 1.95, + "grad_norm": 4.727845191955566, + "learning_rate": 6.690842136644788e-06, + "logits/chosen": -0.5150663256645203, + "logits/rejected": -0.6192451119422913, + "logps/chosen": -70.37992858886719, + "logps/rejected": -92.59950256347656, + "loss": 0.7023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.205065965652466, + "rewards/margins": 6.583808898925781, + "rewards/rejected": -3.3787424564361572, + "step": 7798 + }, + { + "epoch": 1.95, + "grad_norm": 6.308542728424072, + "learning_rate": 6.690102434090056e-06, + "logits/chosen": -0.4624493718147278, + "logits/rejected": -0.49372348189353943, + "logps/chosen": -46.71615219116211, + "logps/rejected": -88.90989685058594, + "loss": 0.7153, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8274786472320557, + "rewards/margins": 5.095766544342041, + "rewards/rejected": -2.2682878971099854, + "step": 7799 + }, + { + "epoch": 1.95, + "grad_norm": 5.156953811645508, + "learning_rate": 6.689362689771096e-06, + "logits/chosen": -0.5377298593521118, + "logits/rejected": -0.5937964916229248, + "logps/chosen": -54.960575103759766, + "logps/rejected": -93.3534927368164, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0343635082244873, + "rewards/margins": 5.321573734283447, + "rewards/rejected": -2.287209987640381, + "step": 7800 + }, + { + "epoch": 1.95, + "grad_norm": 4.475849151611328, + "learning_rate": 6.688622903706187e-06, + "logits/chosen": -0.4398658871650696, + "logits/rejected": -0.565639078617096, + "logps/chosen": -71.13728332519531, + "logps/rejected": -83.70050048828125, + "loss": 0.8232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.912444829940796, + "rewards/margins": 5.291610240936279, + "rewards/rejected": -2.3791654109954834, + "step": 7801 + }, + { + "epoch": 1.95, + "grad_norm": 5.269418239593506, + "learning_rate": 6.68788307591361e-06, + "logits/chosen": -0.4608132541179657, + "logits/rejected": -0.5054896473884583, + "logps/chosen": -49.619754791259766, + "logps/rejected": -94.98849487304688, + "loss": 0.6569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0322418212890625, + "rewards/margins": 6.1608147621154785, + "rewards/rejected": -3.128572702407837, + "step": 7802 + }, + { + "epoch": 1.95, + "grad_norm": 4.291522979736328, + "learning_rate": 6.687143206411645e-06, + "logits/chosen": -0.4640476107597351, + "logits/rejected": -0.5556167364120483, + "logps/chosen": -44.278324127197266, + "logps/rejected": -87.7991714477539, + "loss": 0.6175, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9423327445983887, + "rewards/margins": 6.116220474243164, + "rewards/rejected": -3.173887252807617, + "step": 7803 + }, + { + "epoch": 1.95, + "grad_norm": 4.534447193145752, + "learning_rate": 6.68640329521858e-06, + "logits/chosen": -0.45133233070373535, + "logits/rejected": -0.48203516006469727, + "logps/chosen": -73.72920227050781, + "logps/rejected": -79.87971496582031, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.368924617767334, + "rewards/margins": 5.419526100158691, + "rewards/rejected": -2.0506012439727783, + "step": 7804 + }, + { + "epoch": 1.95, + "grad_norm": 7.249303340911865, + "learning_rate": 6.685663342352693e-06, + "logits/chosen": -0.43115371465682983, + "logits/rejected": -0.5434473156929016, + "logps/chosen": -53.09938430786133, + "logps/rejected": -85.45563507080078, + "loss": 0.6972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056978940963745, + "rewards/margins": 5.703625202178955, + "rewards/rejected": -2.646646738052368, + "step": 7805 + }, + { + "epoch": 1.95, + "grad_norm": 8.902642250061035, + "learning_rate": 6.684923347832273e-06, + "logits/chosen": -0.4363648593425751, + "logits/rejected": -0.5258390307426453, + "logps/chosen": -67.53146362304688, + "logps/rejected": -80.50387573242188, + "loss": 0.7981, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.941990852355957, + "rewards/margins": 4.242035865783691, + "rewards/rejected": -1.3000448942184448, + "step": 7806 + }, + { + "epoch": 1.95, + "grad_norm": 3.8263309001922607, + "learning_rate": 6.684183311675606e-06, + "logits/chosen": -0.4181264042854309, + "logits/rejected": -0.5611648559570312, + "logps/chosen": -60.878726959228516, + "logps/rejected": -71.78189086914062, + "loss": 0.6845, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9114441871643066, + "rewards/margins": 5.636895179748535, + "rewards/rejected": -2.7254507541656494, + "step": 7807 + }, + { + "epoch": 1.95, + "grad_norm": 10.670064926147461, + "learning_rate": 6.683443233900977e-06, + "logits/chosen": -0.5021293759346008, + "logits/rejected": -0.5234342813491821, + "logps/chosen": -49.043853759765625, + "logps/rejected": -100.39877319335938, + "loss": 0.7238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.786271572113037, + "rewards/margins": 5.89927339553833, + "rewards/rejected": -3.113001585006714, + "step": 7808 + }, + { + "epoch": 1.95, + "grad_norm": 4.972908020019531, + "learning_rate": 6.682703114526674e-06, + "logits/chosen": -0.4564916789531708, + "logits/rejected": -0.5397732853889465, + "logps/chosen": -52.36980056762695, + "logps/rejected": -81.85958862304688, + "loss": 0.7333, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.179492950439453, + "rewards/margins": 5.935595512390137, + "rewards/rejected": -2.7561025619506836, + "step": 7809 + }, + { + "epoch": 1.95, + "grad_norm": 4.600259780883789, + "learning_rate": 6.681962953570989e-06, + "logits/chosen": -0.49605849385261536, + "logits/rejected": -0.5684238076210022, + "logps/chosen": -53.947547912597656, + "logps/rejected": -100.59159088134766, + "loss": 0.6552, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.808037042617798, + "rewards/margins": 6.036738872528076, + "rewards/rejected": -3.2287018299102783, + "step": 7810 + }, + { + "epoch": 1.95, + "grad_norm": 2.550899028778076, + "learning_rate": 6.681222751052209e-06, + "logits/chosen": -0.42830154299736023, + "logits/rejected": -0.5178228616714478, + "logps/chosen": -60.0947265625, + "logps/rejected": -95.02973937988281, + "loss": 0.6226, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.921828508377075, + "rewards/margins": 6.179409027099609, + "rewards/rejected": -3.257580280303955, + "step": 7811 + }, + { + "epoch": 1.95, + "grad_norm": 4.592648029327393, + "learning_rate": 6.680482506988627e-06, + "logits/chosen": -0.4772863984107971, + "logits/rejected": -0.5259959101676941, + "logps/chosen": -45.77544403076172, + "logps/rejected": -94.06319427490234, + "loss": 0.6358, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7337698936462402, + "rewards/margins": 5.239162921905518, + "rewards/rejected": -2.5053930282592773, + "step": 7812 + }, + { + "epoch": 1.95, + "grad_norm": 4.707428455352783, + "learning_rate": 6.679742221398535e-06, + "logits/chosen": -0.4550149440765381, + "logits/rejected": -0.5730454921722412, + "logps/chosen": -69.01000213623047, + "logps/rejected": -82.75240325927734, + "loss": 0.6754, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.866196870803833, + "rewards/margins": 5.350340843200684, + "rewards/rejected": -2.4841439723968506, + "step": 7813 + }, + { + "epoch": 1.95, + "grad_norm": 6.517017841339111, + "learning_rate": 6.679001894300225e-06, + "logits/chosen": -0.4646540880203247, + "logits/rejected": -0.5355947613716125, + "logps/chosen": -60.624114990234375, + "logps/rejected": -102.88471221923828, + "loss": 0.7481, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0700573921203613, + "rewards/margins": 5.874475479125977, + "rewards/rejected": -2.8044188022613525, + "step": 7814 + }, + { + "epoch": 1.96, + "grad_norm": 3.839521646499634, + "learning_rate": 6.678261525711996e-06, + "logits/chosen": -0.5084913969039917, + "logits/rejected": -0.6006119251251221, + "logps/chosen": -47.629150390625, + "logps/rejected": -79.23641967773438, + "loss": 0.6449, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.147228717803955, + "rewards/margins": 5.8069610595703125, + "rewards/rejected": -2.659731864929199, + "step": 7815 + }, + { + "epoch": 1.96, + "grad_norm": 7.856467247009277, + "learning_rate": 6.677521115652137e-06, + "logits/chosen": -0.4900086522102356, + "logits/rejected": -0.4992358982563019, + "logps/chosen": -60.88440704345703, + "logps/rejected": -120.17594146728516, + "loss": 0.7585, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.790842294692993, + "rewards/margins": 5.490156173706055, + "rewards/rejected": -2.6993134021759033, + "step": 7816 + }, + { + "epoch": 1.96, + "grad_norm": 6.021834850311279, + "learning_rate": 6.676780664138945e-06, + "logits/chosen": -0.4927683472633362, + "logits/rejected": -0.5897629261016846, + "logps/chosen": -60.377777099609375, + "logps/rejected": -91.77288055419922, + "loss": 0.6789, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.044433355331421, + "rewards/margins": 6.403310775756836, + "rewards/rejected": -3.358877420425415, + "step": 7817 + }, + { + "epoch": 1.96, + "grad_norm": 9.063966751098633, + "learning_rate": 6.676040171190723e-06, + "logits/chosen": -0.5188024044036865, + "logits/rejected": -0.5423295497894287, + "logps/chosen": -47.34600830078125, + "logps/rejected": -103.4404067993164, + "loss": 0.7355, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9042582511901855, + "rewards/margins": 6.089648723602295, + "rewards/rejected": -3.1853904724121094, + "step": 7818 + }, + { + "epoch": 1.96, + "grad_norm": 10.67287826538086, + "learning_rate": 6.675299636825764e-06, + "logits/chosen": -0.4929002523422241, + "logits/rejected": -0.5598956942558289, + "logps/chosen": -56.31920623779297, + "logps/rejected": -86.7490234375, + "loss": 0.8781, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.564131259918213, + "rewards/margins": 4.516058444976807, + "rewards/rejected": -1.9519271850585938, + "step": 7819 + }, + { + "epoch": 1.96, + "grad_norm": 4.446218013763428, + "learning_rate": 6.674559061062369e-06, + "logits/chosen": -0.4149479866027832, + "logits/rejected": -0.5348480939865112, + "logps/chosen": -59.911251068115234, + "logps/rejected": -83.19486236572266, + "loss": 0.7423, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2608048915863037, + "rewards/margins": 4.920840740203857, + "rewards/rejected": -1.660036325454712, + "step": 7820 + }, + { + "epoch": 1.96, + "grad_norm": 4.498970985412598, + "learning_rate": 6.673818443918838e-06, + "logits/chosen": -0.4586181044578552, + "logits/rejected": -0.5100479125976562, + "logps/chosen": -52.34215545654297, + "logps/rejected": -107.97465515136719, + "loss": 0.6156, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074387550354004, + "rewards/margins": 6.301147937774658, + "rewards/rejected": -3.226759910583496, + "step": 7821 + }, + { + "epoch": 1.96, + "grad_norm": 5.290565490722656, + "learning_rate": 6.673077785413473e-06, + "logits/chosen": -0.4413805603981018, + "logits/rejected": -0.4869561195373535, + "logps/chosen": -45.328678131103516, + "logps/rejected": -95.84236907958984, + "loss": 0.6373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.070744037628174, + "rewards/margins": 5.640610694885254, + "rewards/rejected": -2.56986665725708, + "step": 7822 + }, + { + "epoch": 1.96, + "grad_norm": 7.720234394073486, + "learning_rate": 6.672337085564577e-06, + "logits/chosen": -0.3707400858402252, + "logits/rejected": -0.4857103228569031, + "logps/chosen": -67.96022033691406, + "logps/rejected": -98.0411605834961, + "loss": 0.7778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7112438678741455, + "rewards/margins": 5.6424150466918945, + "rewards/rejected": -2.93117094039917, + "step": 7823 + }, + { + "epoch": 1.96, + "grad_norm": 7.398331642150879, + "learning_rate": 6.671596344390452e-06, + "logits/chosen": -0.40349048376083374, + "logits/rejected": -0.4902866780757904, + "logps/chosen": -60.816410064697266, + "logps/rejected": -90.06388092041016, + "loss": 0.7343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6223862171173096, + "rewards/margins": 4.883965492248535, + "rewards/rejected": -2.2615790367126465, + "step": 7824 + }, + { + "epoch": 1.96, + "grad_norm": 10.95743465423584, + "learning_rate": 6.670855561909404e-06, + "logits/chosen": -0.446590781211853, + "logits/rejected": -0.5389670729637146, + "logps/chosen": -66.99340057373047, + "logps/rejected": -97.4434814453125, + "loss": 0.9184, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.798198699951172, + "rewards/margins": 5.754550933837891, + "rewards/rejected": -2.9563519954681396, + "step": 7825 + }, + { + "epoch": 1.96, + "grad_norm": 4.739680290222168, + "learning_rate": 6.670114738139738e-06, + "logits/chosen": -0.5057441592216492, + "logits/rejected": -0.5700139403343201, + "logps/chosen": -75.3818588256836, + "logps/rejected": -121.60223388671875, + "loss": 0.8124, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.595522165298462, + "rewards/margins": 5.611300945281982, + "rewards/rejected": -3.0157785415649414, + "step": 7826 + }, + { + "epoch": 1.96, + "grad_norm": 3.8672080039978027, + "learning_rate": 6.669373873099761e-06, + "logits/chosen": -0.4212723672389984, + "logits/rejected": -0.5228091478347778, + "logps/chosen": -61.30426788330078, + "logps/rejected": -91.21322631835938, + "loss": 0.7116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0476369857788086, + "rewards/margins": 5.739062309265137, + "rewards/rejected": -2.691425085067749, + "step": 7827 + }, + { + "epoch": 1.96, + "grad_norm": 3.6963515281677246, + "learning_rate": 6.668632966807778e-06, + "logits/chosen": -0.5014504790306091, + "logits/rejected": -0.625866711139679, + "logps/chosen": -55.81877517700195, + "logps/rejected": -107.94733428955078, + "loss": 0.6816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8560667037963867, + "rewards/margins": 6.001261234283447, + "rewards/rejected": -3.1451947689056396, + "step": 7828 + }, + { + "epoch": 1.96, + "grad_norm": 2.8191442489624023, + "learning_rate": 6.667892019282101e-06, + "logits/chosen": -0.5032917261123657, + "logits/rejected": -0.5692209005355835, + "logps/chosen": -53.192928314208984, + "logps/rejected": -87.3148193359375, + "loss": 0.6254, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8556571006774902, + "rewards/margins": 5.988452911376953, + "rewards/rejected": -3.132795572280884, + "step": 7829 + }, + { + "epoch": 1.96, + "grad_norm": 4.927189826965332, + "learning_rate": 6.667151030541038e-06, + "logits/chosen": -0.5379389524459839, + "logits/rejected": -0.640326976776123, + "logps/chosen": -52.00762176513672, + "logps/rejected": -79.84423065185547, + "loss": 0.6935, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8756814002990723, + "rewards/margins": 4.8446736335754395, + "rewards/rejected": -1.9689927101135254, + "step": 7830 + }, + { + "epoch": 1.96, + "grad_norm": 7.485104084014893, + "learning_rate": 6.666410000602899e-06, + "logits/chosen": -0.34852656722068787, + "logits/rejected": -0.4253094792366028, + "logps/chosen": -71.63948822021484, + "logps/rejected": -81.48978424072266, + "loss": 0.8257, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980454683303833, + "rewards/margins": 4.888615131378174, + "rewards/rejected": -1.908160924911499, + "step": 7831 + }, + { + "epoch": 1.96, + "grad_norm": 3.359130382537842, + "learning_rate": 6.665668929485998e-06, + "logits/chosen": -0.4766881763935089, + "logits/rejected": -0.5700721144676208, + "logps/chosen": -48.254241943359375, + "logps/rejected": -92.40155029296875, + "loss": 0.5922, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.325915813446045, + "rewards/margins": 6.662168025970459, + "rewards/rejected": -3.336252450942993, + "step": 7832 + }, + { + "epoch": 1.96, + "grad_norm": 3.8302245140075684, + "learning_rate": 6.664927817208645e-06, + "logits/chosen": -0.38561397790908813, + "logits/rejected": -0.410672664642334, + "logps/chosen": -46.785926818847656, + "logps/rejected": -87.36720275878906, + "loss": 0.7002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.235598564147949, + "rewards/margins": 4.8453216552734375, + "rewards/rejected": -1.6097233295440674, + "step": 7833 + }, + { + "epoch": 1.96, + "grad_norm": 6.154936790466309, + "learning_rate": 6.664186663789155e-06, + "logits/chosen": -0.41021859645843506, + "logits/rejected": -0.4880605936050415, + "logps/chosen": -56.50172424316406, + "logps/rejected": -84.28954315185547, + "loss": 0.7087, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6593387126922607, + "rewards/margins": 4.719443321228027, + "rewards/rejected": -2.0601048469543457, + "step": 7834 + }, + { + "epoch": 1.96, + "grad_norm": 7.816266059875488, + "learning_rate": 6.663445469245842e-06, + "logits/chosen": -0.4655100405216217, + "logits/rejected": -0.5183043479919434, + "logps/chosen": -52.72932815551758, + "logps/rejected": -75.37036895751953, + "loss": 0.7654, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8636670112609863, + "rewards/margins": 4.009960651397705, + "rewards/rejected": -1.146294116973877, + "step": 7835 + }, + { + "epoch": 1.96, + "grad_norm": 5.8679118156433105, + "learning_rate": 6.662704233597023e-06, + "logits/chosen": -0.39457398653030396, + "logits/rejected": -0.43902522325515747, + "logps/chosen": -64.76683044433594, + "logps/rejected": -106.97158813476562, + "loss": 0.7167, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.178697109222412, + "rewards/margins": 5.360915660858154, + "rewards/rejected": -2.182218551635742, + "step": 7836 + }, + { + "epoch": 1.96, + "grad_norm": 29.80010223388672, + "learning_rate": 6.661962956861012e-06, + "logits/chosen": -0.44438785314559937, + "logits/rejected": -0.5568448305130005, + "logps/chosen": -65.73175048828125, + "logps/rejected": -77.86520385742188, + "loss": 0.8583, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0350255966186523, + "rewards/margins": 4.2239274978637695, + "rewards/rejected": -1.1889019012451172, + "step": 7837 + }, + { + "epoch": 1.96, + "grad_norm": 3.7384510040283203, + "learning_rate": 6.6612216390561304e-06, + "logits/chosen": -0.36579760909080505, + "logits/rejected": -0.40794888138771057, + "logps/chosen": -57.964481353759766, + "logps/rejected": -86.16438293457031, + "loss": 0.8723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1513419151306152, + "rewards/margins": 4.886550426483154, + "rewards/rejected": -1.7352087497711182, + "step": 7838 + }, + { + "epoch": 1.96, + "grad_norm": 8.023833274841309, + "learning_rate": 6.660480280200694e-06, + "logits/chosen": -0.4697975516319275, + "logits/rejected": -0.5269864797592163, + "logps/chosen": -46.713592529296875, + "logps/rejected": -88.9444351196289, + "loss": 0.7616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.964046001434326, + "rewards/margins": 5.277583599090576, + "rewards/rejected": -2.3135385513305664, + "step": 7839 + }, + { + "epoch": 1.96, + "grad_norm": 12.446737289428711, + "learning_rate": 6.659738880313025e-06, + "logits/chosen": -0.42476823925971985, + "logits/rejected": -0.5319781303405762, + "logps/chosen": -64.40812683105469, + "logps/rejected": -80.62116241455078, + "loss": 0.9141, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.897146463394165, + "rewards/margins": 4.912744045257568, + "rewards/rejected": -2.0155978202819824, + "step": 7840 + }, + { + "epoch": 1.96, + "grad_norm": 11.212045669555664, + "learning_rate": 6.6589974394114435e-06, + "logits/chosen": -0.48389214277267456, + "logits/rejected": -0.5460894107818604, + "logps/chosen": -54.01780319213867, + "logps/rejected": -91.52781677246094, + "loss": 0.7984, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.694483995437622, + "rewards/margins": 5.195252418518066, + "rewards/rejected": -2.5007684230804443, + "step": 7841 + }, + { + "epoch": 1.96, + "grad_norm": 12.170859336853027, + "learning_rate": 6.658255957514268e-06, + "logits/chosen": -0.4903441071510315, + "logits/rejected": -0.5673291087150574, + "logps/chosen": -61.56117630004883, + "logps/rejected": -99.60205078125, + "loss": 0.7566, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1398279666900635, + "rewards/margins": 5.176085472106934, + "rewards/rejected": -2.0362579822540283, + "step": 7842 + }, + { + "epoch": 1.96, + "grad_norm": 4.59035587310791, + "learning_rate": 6.657514434639826e-06, + "logits/chosen": -0.4473278522491455, + "logits/rejected": -0.5376531481742859, + "logps/chosen": -67.27252197265625, + "logps/rejected": -105.29669952392578, + "loss": 0.8399, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.708221197128296, + "rewards/margins": 6.432793617248535, + "rewards/rejected": -3.7245726585388184, + "step": 7843 + }, + { + "epoch": 1.96, + "grad_norm": 5.73043966293335, + "learning_rate": 6.6567728708064385e-06, + "logits/chosen": -0.4011469781398773, + "logits/rejected": -0.44763752818107605, + "logps/chosen": -59.46698760986328, + "logps/rejected": -105.7635498046875, + "loss": 0.714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.341153621673584, + "rewards/margins": 6.17703914642334, + "rewards/rejected": -2.835885524749756, + "step": 7844 + }, + { + "epoch": 1.96, + "grad_norm": 2.579197645187378, + "learning_rate": 6.656031266032432e-06, + "logits/chosen": -0.49743786454200745, + "logits/rejected": -0.6296386122703552, + "logps/chosen": -49.244140625, + "logps/rejected": -88.46011352539062, + "loss": 0.6105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1218953132629395, + "rewards/margins": 7.367893218994141, + "rewards/rejected": -4.245998382568359, + "step": 7845 + }, + { + "epoch": 1.96, + "grad_norm": 5.175207614898682, + "learning_rate": 6.6552896203361294e-06, + "logits/chosen": -0.38787227869033813, + "logits/rejected": -0.49709320068359375, + "logps/chosen": -51.39069747924805, + "logps/rejected": -95.95864868164062, + "loss": 0.6345, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9017810821533203, + "rewards/margins": 6.09306001663208, + "rewards/rejected": -3.1912801265716553, + "step": 7846 + }, + { + "epoch": 1.96, + "grad_norm": 19.212120056152344, + "learning_rate": 6.654547933735862e-06, + "logits/chosen": -0.41313502192497253, + "logits/rejected": -0.5722088813781738, + "logps/chosen": -64.78353118896484, + "logps/rejected": -75.14463806152344, + "loss": 0.7454, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.612895965576172, + "rewards/margins": 5.149541854858398, + "rewards/rejected": -2.5366458892822266, + "step": 7847 + }, + { + "epoch": 1.96, + "grad_norm": 4.353224277496338, + "learning_rate": 6.653806206249954e-06, + "logits/chosen": -0.29912814497947693, + "logits/rejected": -0.3629932105541229, + "logps/chosen": -49.00001525878906, + "logps/rejected": -87.92848205566406, + "loss": 0.7644, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0607573986053467, + "rewards/margins": 4.742992877960205, + "rewards/rejected": -1.6822353601455688, + "step": 7848 + }, + { + "epoch": 1.96, + "grad_norm": 3.165902853012085, + "learning_rate": 6.6530644378967355e-06, + "logits/chosen": -0.43101629614830017, + "logits/rejected": -0.5279409885406494, + "logps/chosen": -61.141578674316406, + "logps/rejected": -95.87464141845703, + "loss": 0.623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7251205444335938, + "rewards/margins": 5.574571132659912, + "rewards/rejected": -2.8494508266448975, + "step": 7849 + }, + { + "epoch": 1.96, + "grad_norm": 6.874777317047119, + "learning_rate": 6.6523226286945375e-06, + "logits/chosen": -0.49325430393218994, + "logits/rejected": -0.5588054656982422, + "logps/chosen": -46.12303161621094, + "logps/rejected": -89.55319213867188, + "loss": 0.7258, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0860559940338135, + "rewards/margins": 5.008168697357178, + "rewards/rejected": -1.9221125841140747, + "step": 7850 + }, + { + "epoch": 1.96, + "grad_norm": 5.59643030166626, + "learning_rate": 6.651580778661689e-06, + "logits/chosen": -0.43328922986984253, + "logits/rejected": -0.5129511952400208, + "logps/chosen": -54.716339111328125, + "logps/rejected": -89.98342895507812, + "loss": 0.7647, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.929816246032715, + "rewards/margins": 5.266500473022461, + "rewards/rejected": -2.336683988571167, + "step": 7851 + }, + { + "epoch": 1.96, + "grad_norm": 9.754571914672852, + "learning_rate": 6.650838887816523e-06, + "logits/chosen": -0.4349634647369385, + "logits/rejected": -0.5880980491638184, + "logps/chosen": -73.4593505859375, + "logps/rejected": -79.34846496582031, + "loss": 0.788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4814040660858154, + "rewards/margins": 6.274742126464844, + "rewards/rejected": -3.79333758354187, + "step": 7852 + }, + { + "epoch": 1.96, + "grad_norm": 3.7260630130767822, + "learning_rate": 6.650096956177372e-06, + "logits/chosen": -0.4648151993751526, + "logits/rejected": -0.5482946634292603, + "logps/chosen": -58.45515823364258, + "logps/rejected": -92.67417907714844, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3281309604644775, + "rewards/margins": 6.101906776428223, + "rewards/rejected": -2.773776054382324, + "step": 7853 + }, + { + "epoch": 1.96, + "grad_norm": 6.112572193145752, + "learning_rate": 6.64935498376257e-06, + "logits/chosen": -0.49182963371276855, + "logits/rejected": -0.5668822526931763, + "logps/chosen": -53.23602294921875, + "logps/rejected": -89.98834228515625, + "loss": 0.7299, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1169567108154297, + "rewards/margins": 4.71980619430542, + "rewards/rejected": -1.6028496026992798, + "step": 7854 + }, + { + "epoch": 1.97, + "grad_norm": 9.524459838867188, + "learning_rate": 6.648612970590453e-06, + "logits/chosen": -0.47120413184165955, + "logits/rejected": -0.5440660715103149, + "logps/chosen": -64.12380981445312, + "logps/rejected": -90.13175964355469, + "loss": 0.758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.130547523498535, + "rewards/margins": 5.4072160720825195, + "rewards/rejected": -2.2766690254211426, + "step": 7855 + }, + { + "epoch": 1.97, + "grad_norm": 4.449542045593262, + "learning_rate": 6.6478709166793554e-06, + "logits/chosen": -0.48929738998413086, + "logits/rejected": -0.589110791683197, + "logps/chosen": -62.119625091552734, + "logps/rejected": -83.26836395263672, + "loss": 0.7356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4027976989746094, + "rewards/margins": 4.965813159942627, + "rewards/rejected": -2.5630154609680176, + "step": 7856 + }, + { + "epoch": 1.97, + "grad_norm": 6.784121036529541, + "learning_rate": 6.647128822047613e-06, + "logits/chosen": -0.49121102690696716, + "logits/rejected": -0.5730845928192139, + "logps/chosen": -50.67271041870117, + "logps/rejected": -74.99322509765625, + "loss": 0.7354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0020408630371094, + "rewards/margins": 5.304137706756592, + "rewards/rejected": -2.302095890045166, + "step": 7857 + }, + { + "epoch": 1.97, + "grad_norm": 3.537177085876465, + "learning_rate": 6.646386686713568e-06, + "logits/chosen": -0.47458699345588684, + "logits/rejected": -0.5563365817070007, + "logps/chosen": -55.653934478759766, + "logps/rejected": -103.47637939453125, + "loss": 0.7392, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8585591316223145, + "rewards/margins": 5.752458572387695, + "rewards/rejected": -2.893899440765381, + "step": 7858 + }, + { + "epoch": 1.97, + "grad_norm": 6.555119514465332, + "learning_rate": 6.645644510695557e-06, + "logits/chosen": -0.35932013392448425, + "logits/rejected": -0.4607745409011841, + "logps/chosen": -72.9219741821289, + "logps/rejected": -112.84973907470703, + "loss": 0.7794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.735741138458252, + "rewards/margins": 5.681588649749756, + "rewards/rejected": -2.945847511291504, + "step": 7859 + }, + { + "epoch": 1.97, + "grad_norm": 6.176486492156982, + "learning_rate": 6.644902294011917e-06, + "logits/chosen": -0.47514453530311584, + "logits/rejected": -0.5410453081130981, + "logps/chosen": -65.81024932861328, + "logps/rejected": -129.0127410888672, + "loss": 0.7523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0925521850585938, + "rewards/margins": 6.982354164123535, + "rewards/rejected": -3.889802932739258, + "step": 7860 + }, + { + "epoch": 1.97, + "grad_norm": 3.8785619735717773, + "learning_rate": 6.6441600366809935e-06, + "logits/chosen": -0.45343512296676636, + "logits/rejected": -0.5380200743675232, + "logps/chosen": -62.00797653198242, + "logps/rejected": -86.31534576416016, + "loss": 0.7429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9850704669952393, + "rewards/margins": 4.9216508865356445, + "rewards/rejected": -1.9365804195404053, + "step": 7861 + }, + { + "epoch": 1.97, + "grad_norm": 6.597679615020752, + "learning_rate": 6.643417738721126e-06, + "logits/chosen": -0.500238835811615, + "logits/rejected": -0.520588219165802, + "logps/chosen": -52.663265228271484, + "logps/rejected": -87.430908203125, + "loss": 0.7256, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.184830665588379, + "rewards/margins": 4.9156951904296875, + "rewards/rejected": -1.7308645248413086, + "step": 7862 + }, + { + "epoch": 1.97, + "grad_norm": 4.268352031707764, + "learning_rate": 6.642675400150659e-06, + "logits/chosen": -0.41086792945861816, + "logits/rejected": -0.4730813205242157, + "logps/chosen": -46.37698745727539, + "logps/rejected": -98.35665893554688, + "loss": 0.6261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0729784965515137, + "rewards/margins": 6.525484561920166, + "rewards/rejected": -3.4525060653686523, + "step": 7863 + }, + { + "epoch": 1.97, + "grad_norm": 5.728391647338867, + "learning_rate": 6.641933020987936e-06, + "logits/chosen": -0.486019104719162, + "logits/rejected": -0.5635896921157837, + "logps/chosen": -50.00370788574219, + "logps/rejected": -86.35106658935547, + "loss": 0.7897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118734121322632, + "rewards/margins": 5.40739107131958, + "rewards/rejected": -2.2886571884155273, + "step": 7864 + }, + { + "epoch": 1.97, + "grad_norm": 5.0881667137146, + "learning_rate": 6.6411906012513004e-06, + "logits/chosen": -0.5113183259963989, + "logits/rejected": -0.5967196226119995, + "logps/chosen": -56.24405288696289, + "logps/rejected": -82.94984436035156, + "loss": 0.7804, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.206037998199463, + "rewards/margins": 5.280120849609375, + "rewards/rejected": -2.074083089828491, + "step": 7865 + }, + { + "epoch": 1.97, + "grad_norm": 3.5717694759368896, + "learning_rate": 6.640448140959099e-06, + "logits/chosen": -0.4459589719772339, + "logits/rejected": -0.5466510057449341, + "logps/chosen": -72.25067901611328, + "logps/rejected": -95.88542938232422, + "loss": 0.7291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1828324794769287, + "rewards/margins": 6.101202011108398, + "rewards/rejected": -2.9183688163757324, + "step": 7866 + }, + { + "epoch": 1.97, + "grad_norm": 8.580973625183105, + "learning_rate": 6.639705640129681e-06, + "logits/chosen": -0.4942115545272827, + "logits/rejected": -0.5971240401268005, + "logps/chosen": -54.47142791748047, + "logps/rejected": -94.81258392333984, + "loss": 0.8226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4684712886810303, + "rewards/margins": 5.4531097412109375, + "rewards/rejected": -2.9846389293670654, + "step": 7867 + }, + { + "epoch": 1.97, + "grad_norm": 5.474248886108398, + "learning_rate": 6.638963098781391e-06, + "logits/chosen": -0.454365074634552, + "logits/rejected": -0.536704421043396, + "logps/chosen": -55.41145324707031, + "logps/rejected": -90.00593566894531, + "loss": 0.7397, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8852920532226562, + "rewards/margins": 5.058663368225098, + "rewards/rejected": -2.1733715534210205, + "step": 7868 + }, + { + "epoch": 1.97, + "grad_norm": 6.644804954528809, + "learning_rate": 6.63822051693258e-06, + "logits/chosen": -0.44035670161247253, + "logits/rejected": -0.5354865789413452, + "logps/chosen": -63.86737060546875, + "logps/rejected": -89.68875122070312, + "loss": 0.8517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.991633415222168, + "rewards/margins": 4.846722602844238, + "rewards/rejected": -1.8550894260406494, + "step": 7869 + }, + { + "epoch": 1.97, + "grad_norm": 3.517202138900757, + "learning_rate": 6.637477894601598e-06, + "logits/chosen": -0.4994381070137024, + "logits/rejected": -0.53800368309021, + "logps/chosen": -50.732975006103516, + "logps/rejected": -84.81193542480469, + "loss": 0.7357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9285104274749756, + "rewards/margins": 5.056214809417725, + "rewards/rejected": -2.127704620361328, + "step": 7870 + }, + { + "epoch": 1.97, + "grad_norm": 15.121657371520996, + "learning_rate": 6.636735231806795e-06, + "logits/chosen": -0.45824676752090454, + "logits/rejected": -0.587554931640625, + "logps/chosen": -60.766971588134766, + "logps/rejected": -84.68595123291016, + "loss": 0.6943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0153889656066895, + "rewards/margins": 5.649247646331787, + "rewards/rejected": -2.6338586807250977, + "step": 7871 + }, + { + "epoch": 1.97, + "grad_norm": 5.845269680023193, + "learning_rate": 6.635992528566524e-06, + "logits/chosen": -0.5066213011741638, + "logits/rejected": -0.5512920022010803, + "logps/chosen": -52.792293548583984, + "logps/rejected": -98.50587463378906, + "loss": 0.7228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0298755168914795, + "rewards/margins": 5.4377031326293945, + "rewards/rejected": -2.407827615737915, + "step": 7872 + }, + { + "epoch": 1.97, + "grad_norm": 7.036728382110596, + "learning_rate": 6.635249784899138e-06, + "logits/chosen": -0.45367270708084106, + "logits/rejected": -0.5307968258857727, + "logps/chosen": -63.74016571044922, + "logps/rejected": -93.814453125, + "loss": 0.8054, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.731339931488037, + "rewards/margins": 5.1568403244018555, + "rewards/rejected": -2.42549991607666, + "step": 7873 + }, + { + "epoch": 1.97, + "grad_norm": 4.1122965812683105, + "learning_rate": 6.634507000822988e-06, + "logits/chosen": -0.48758578300476074, + "logits/rejected": -0.6291662454605103, + "logps/chosen": -50.437255859375, + "logps/rejected": -93.1488037109375, + "loss": 0.6509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3491170406341553, + "rewards/margins": 6.621285438537598, + "rewards/rejected": -3.2721681594848633, + "step": 7874 + }, + { + "epoch": 1.97, + "grad_norm": 5.19146203994751, + "learning_rate": 6.633764176356434e-06, + "logits/chosen": -0.4950878620147705, + "logits/rejected": -0.5572428703308105, + "logps/chosen": -63.20386505126953, + "logps/rejected": -109.38063049316406, + "loss": 0.7596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.133380174636841, + "rewards/margins": 6.343253135681152, + "rewards/rejected": -3.2098724842071533, + "step": 7875 + }, + { + "epoch": 1.97, + "grad_norm": 4.826361179351807, + "learning_rate": 6.633021311517829e-06, + "logits/chosen": -0.4337470531463623, + "logits/rejected": -0.5342302918434143, + "logps/chosen": -66.46968841552734, + "logps/rejected": -96.8358154296875, + "loss": 0.6956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.756791591644287, + "rewards/margins": 6.176620006561279, + "rewards/rejected": -3.419828414916992, + "step": 7876 + }, + { + "epoch": 1.97, + "grad_norm": 20.99466896057129, + "learning_rate": 6.63227840632553e-06, + "logits/chosen": -0.4021645188331604, + "logits/rejected": -0.4861574172973633, + "logps/chosen": -72.79071044921875, + "logps/rejected": -79.66453552246094, + "loss": 0.9506, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.9500887393951416, + "rewards/margins": 4.994258880615234, + "rewards/rejected": -2.0441701412200928, + "step": 7877 + }, + { + "epoch": 1.97, + "grad_norm": 5.058096885681152, + "learning_rate": 6.631535460797895e-06, + "logits/chosen": -0.47484949231147766, + "logits/rejected": -0.5279474854469299, + "logps/chosen": -51.74006271362305, + "logps/rejected": -87.56598663330078, + "loss": 0.8716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0038228034973145, + "rewards/margins": 5.026678085327148, + "rewards/rejected": -2.022855281829834, + "step": 7878 + }, + { + "epoch": 1.97, + "grad_norm": 11.35452651977539, + "learning_rate": 6.6307924749532845e-06, + "logits/chosen": -0.36443257331848145, + "logits/rejected": -0.4344159662723541, + "logps/chosen": -50.1781120300293, + "logps/rejected": -104.17229461669922, + "loss": 0.6554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.962088108062744, + "rewards/margins": 5.38069486618042, + "rewards/rejected": -2.418606758117676, + "step": 7879 + }, + { + "epoch": 1.97, + "grad_norm": 5.569881439208984, + "learning_rate": 6.6300494488100556e-06, + "logits/chosen": -0.5690116286277771, + "logits/rejected": -0.6283811926841736, + "logps/chosen": -51.705528259277344, + "logps/rejected": -95.27379608154297, + "loss": 0.6314, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8345322608947754, + "rewards/margins": 5.208930015563965, + "rewards/rejected": -2.3743982315063477, + "step": 7880 + }, + { + "epoch": 1.97, + "grad_norm": 4.185871601104736, + "learning_rate": 6.629306382386573e-06, + "logits/chosen": -0.44085589051246643, + "logits/rejected": -0.520728588104248, + "logps/chosen": -54.72214126586914, + "logps/rejected": -83.94033813476562, + "loss": 0.6966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.015665292739868, + "rewards/margins": 5.511351585388184, + "rewards/rejected": -2.4956867694854736, + "step": 7881 + }, + { + "epoch": 1.97, + "grad_norm": 8.493249893188477, + "learning_rate": 6.628563275701196e-06, + "logits/chosen": -0.44658899307250977, + "logits/rejected": -0.5120126605033875, + "logps/chosen": -61.01209259033203, + "logps/rejected": -90.09651947021484, + "loss": 0.8192, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.865675687789917, + "rewards/margins": 5.550199031829834, + "rewards/rejected": -2.684523582458496, + "step": 7882 + }, + { + "epoch": 1.97, + "grad_norm": 6.022536754608154, + "learning_rate": 6.627820128772288e-06, + "logits/chosen": -0.4598374664783478, + "logits/rejected": -0.5303057432174683, + "logps/chosen": -48.48793411254883, + "logps/rejected": -86.978759765625, + "loss": 0.5945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1181201934814453, + "rewards/margins": 5.555914402008057, + "rewards/rejected": -2.4377939701080322, + "step": 7883 + }, + { + "epoch": 1.97, + "grad_norm": 3.8815953731536865, + "learning_rate": 6.6270769416182135e-06, + "logits/chosen": -0.4521493911743164, + "logits/rejected": -0.5078226327896118, + "logps/chosen": -63.28471755981445, + "logps/rejected": -121.42340087890625, + "loss": 0.6538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6876797676086426, + "rewards/margins": 5.9696431159973145, + "rewards/rejected": -3.28196382522583, + "step": 7884 + }, + { + "epoch": 1.97, + "grad_norm": 4.91505241394043, + "learning_rate": 6.626333714257336e-06, + "logits/chosen": -0.37101393938064575, + "logits/rejected": -0.46171510219573975, + "logps/chosen": -54.64213943481445, + "logps/rejected": -112.29601287841797, + "loss": 0.6499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8601841926574707, + "rewards/margins": 7.087810039520264, + "rewards/rejected": -4.227625846862793, + "step": 7885 + }, + { + "epoch": 1.97, + "grad_norm": 9.545308113098145, + "learning_rate": 6.625590446708024e-06, + "logits/chosen": -0.4310421049594879, + "logits/rejected": -0.49092891812324524, + "logps/chosen": -53.724456787109375, + "logps/rejected": -95.47750091552734, + "loss": 0.755, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8302416801452637, + "rewards/margins": 5.354964733123779, + "rewards/rejected": -2.5247225761413574, + "step": 7886 + }, + { + "epoch": 1.97, + "grad_norm": 4.799771308898926, + "learning_rate": 6.624847138988642e-06, + "logits/chosen": -0.4776064455509186, + "logits/rejected": -0.5752259492874146, + "logps/chosen": -65.53010559082031, + "logps/rejected": -112.3721694946289, + "loss": 0.7643, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.725900173187256, + "rewards/margins": 6.035537242889404, + "rewards/rejected": -3.3096370697021484, + "step": 7887 + }, + { + "epoch": 1.97, + "grad_norm": 8.296509742736816, + "learning_rate": 6.62410379111756e-06, + "logits/chosen": -0.46981436014175415, + "logits/rejected": -0.5692967772483826, + "logps/chosen": -62.917320251464844, + "logps/rejected": -97.03465270996094, + "loss": 0.7633, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.098487377166748, + "rewards/margins": 5.502780437469482, + "rewards/rejected": -2.4042928218841553, + "step": 7888 + }, + { + "epoch": 1.97, + "grad_norm": 8.710245132446289, + "learning_rate": 6.6233604031131446e-06, + "logits/chosen": -0.35402095317840576, + "logits/rejected": -0.4231262803077698, + "logps/chosen": -50.12495040893555, + "logps/rejected": -82.88801574707031, + "loss": 0.644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0584876537323, + "rewards/margins": 5.019935607910156, + "rewards/rejected": -1.9614481925964355, + "step": 7889 + }, + { + "epoch": 1.97, + "grad_norm": 10.970053672790527, + "learning_rate": 6.622616974993768e-06, + "logits/chosen": -0.4631442725658417, + "logits/rejected": -0.5525786876678467, + "logps/chosen": -58.10960388183594, + "logps/rejected": -80.88406372070312, + "loss": 0.7191, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.498426914215088, + "rewards/margins": 5.0794453620910645, + "rewards/rejected": -2.5810186862945557, + "step": 7890 + }, + { + "epoch": 1.97, + "grad_norm": 7.265768527984619, + "learning_rate": 6.621873506777799e-06, + "logits/chosen": -0.41181349754333496, + "logits/rejected": -0.5116613507270813, + "logps/chosen": -57.631710052490234, + "logps/rejected": -98.69502258300781, + "loss": 0.6748, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.66762113571167, + "rewards/margins": 6.129385948181152, + "rewards/rejected": -3.4617648124694824, + "step": 7891 + }, + { + "epoch": 1.97, + "grad_norm": 11.360613822937012, + "learning_rate": 6.621129998483611e-06, + "logits/chosen": -0.442198783159256, + "logits/rejected": -0.5583012700080872, + "logps/chosen": -61.455650329589844, + "logps/rejected": -79.93391418457031, + "loss": 0.8061, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.927579879760742, + "rewards/margins": 5.0302934646606445, + "rewards/rejected": -2.1027133464813232, + "step": 7892 + }, + { + "epoch": 1.97, + "grad_norm": 6.351663589477539, + "learning_rate": 6.620386450129578e-06, + "logits/chosen": -0.46567219495773315, + "logits/rejected": -0.4907320439815521, + "logps/chosen": -61.688514709472656, + "logps/rejected": -98.00211334228516, + "loss": 0.8682, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1499710083007812, + "rewards/margins": 4.837155342102051, + "rewards/rejected": -1.6871839761734009, + "step": 7893 + }, + { + "epoch": 1.97, + "grad_norm": 9.003518104553223, + "learning_rate": 6.6196428617340706e-06, + "logits/chosen": -0.48909422755241394, + "logits/rejected": -0.5648956298828125, + "logps/chosen": -51.88298034667969, + "logps/rejected": -87.99705505371094, + "loss": 0.7344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8977136611938477, + "rewards/margins": 5.573597431182861, + "rewards/rejected": -2.6758840084075928, + "step": 7894 + }, + { + "epoch": 1.98, + "grad_norm": 6.131275177001953, + "learning_rate": 6.618899233315466e-06, + "logits/chosen": -0.466133713722229, + "logits/rejected": -0.573654055595398, + "logps/chosen": -51.351070404052734, + "logps/rejected": -105.41246795654297, + "loss": 0.6274, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9097704887390137, + "rewards/margins": 6.615742206573486, + "rewards/rejected": -3.7059717178344727, + "step": 7895 + }, + { + "epoch": 1.98, + "grad_norm": 6.4945292472839355, + "learning_rate": 6.618155564892138e-06, + "logits/chosen": -0.47356289625167847, + "logits/rejected": -0.5624482035636902, + "logps/chosen": -63.084110260009766, + "logps/rejected": -94.27589416503906, + "loss": 0.8166, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7866768836975098, + "rewards/margins": 5.162450790405273, + "rewards/rejected": -2.3757741451263428, + "step": 7896 + }, + { + "epoch": 1.98, + "grad_norm": 11.542259216308594, + "learning_rate": 6.617411856482466e-06, + "logits/chosen": -0.3710097074508667, + "logits/rejected": -0.4802769720554352, + "logps/chosen": -64.28739929199219, + "logps/rejected": -86.66101837158203, + "loss": 0.7555, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.701878786087036, + "rewards/margins": 4.64762020111084, + "rewards/rejected": -1.9457415342330933, + "step": 7897 + }, + { + "epoch": 1.98, + "grad_norm": 4.911020278930664, + "learning_rate": 6.6166681081048276e-06, + "logits/chosen": -0.49231386184692383, + "logits/rejected": -0.5456039309501648, + "logps/chosen": -53.25918197631836, + "logps/rejected": -92.64266967773438, + "loss": 0.7699, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.464348793029785, + "rewards/margins": 5.050426483154297, + "rewards/rejected": -1.5860779285430908, + "step": 7898 + }, + { + "epoch": 1.98, + "grad_norm": 3.78446888923645, + "learning_rate": 6.615924319777599e-06, + "logits/chosen": -0.42718952894210815, + "logits/rejected": -0.5088061094284058, + "logps/chosen": -53.23338317871094, + "logps/rejected": -95.03433227539062, + "loss": 0.6511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6875133514404297, + "rewards/margins": 5.085421562194824, + "rewards/rejected": -2.3979077339172363, + "step": 7899 + }, + { + "epoch": 1.98, + "grad_norm": 7.527228355407715, + "learning_rate": 6.615180491519162e-06, + "logits/chosen": -0.4070929288864136, + "logits/rejected": -0.4656829833984375, + "logps/chosen": -56.410919189453125, + "logps/rejected": -81.72591400146484, + "loss": 0.7972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8541879653930664, + "rewards/margins": 4.407939910888672, + "rewards/rejected": -1.5537521839141846, + "step": 7900 + }, + { + "epoch": 1.98, + "grad_norm": 3.90846586227417, + "learning_rate": 6.614436623347898e-06, + "logits/chosen": -0.4249105751514435, + "logits/rejected": -0.5458398461341858, + "logps/chosen": -58.20945739746094, + "logps/rejected": -81.42704772949219, + "loss": 0.6984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0654776096343994, + "rewards/margins": 5.731027126312256, + "rewards/rejected": -2.6655497550964355, + "step": 7901 + }, + { + "epoch": 1.98, + "grad_norm": 4.390946388244629, + "learning_rate": 6.613692715282187e-06, + "logits/chosen": -0.4545220732688904, + "logits/rejected": -0.5169258117675781, + "logps/chosen": -51.94441223144531, + "logps/rejected": -89.79513549804688, + "loss": 0.7845, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.864429473876953, + "rewards/margins": 4.774439334869385, + "rewards/rejected": -1.9100102186203003, + "step": 7902 + }, + { + "epoch": 1.98, + "grad_norm": 5.6764349937438965, + "learning_rate": 6.612948767340413e-06, + "logits/chosen": -0.4265294075012207, + "logits/rejected": -0.5468268990516663, + "logps/chosen": -53.12567901611328, + "logps/rejected": -80.2846908569336, + "loss": 0.6744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.992459297180176, + "rewards/margins": 5.316418647766113, + "rewards/rejected": -2.3239593505859375, + "step": 7903 + }, + { + "epoch": 1.98, + "grad_norm": 3.941493272781372, + "learning_rate": 6.61220477954096e-06, + "logits/chosen": -0.40755051374435425, + "logits/rejected": -0.5068193078041077, + "logps/chosen": -49.25225067138672, + "logps/rejected": -90.54592895507812, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0296521186828613, + "rewards/margins": 5.796788692474365, + "rewards/rejected": -2.767136573791504, + "step": 7904 + }, + { + "epoch": 1.98, + "grad_norm": 7.872483730316162, + "learning_rate": 6.61146075190221e-06, + "logits/chosen": -0.37870460748672485, + "logits/rejected": -0.5034009218215942, + "logps/chosen": -66.85916137695312, + "logps/rejected": -78.59754943847656, + "loss": 0.7463, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8179636001586914, + "rewards/margins": 4.783658981323242, + "rewards/rejected": -1.9656951427459717, + "step": 7905 + }, + { + "epoch": 1.98, + "grad_norm": 3.0081207752227783, + "learning_rate": 6.610716684442553e-06, + "logits/chosen": -0.37985941767692566, + "logits/rejected": -0.46068230271339417, + "logps/chosen": -53.121307373046875, + "logps/rejected": -82.5140609741211, + "loss": 0.5738, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3129913806915283, + "rewards/margins": 5.302745342254639, + "rewards/rejected": -1.9897539615631104, + "step": 7906 + }, + { + "epoch": 1.98, + "grad_norm": 4.5255303382873535, + "learning_rate": 6.6099725771803725e-06, + "logits/chosen": -0.3980327546596527, + "logits/rejected": -0.4723207950592041, + "logps/chosen": -50.99278259277344, + "logps/rejected": -90.37938690185547, + "loss": 0.717, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8257904052734375, + "rewards/margins": 5.810421466827393, + "rewards/rejected": -2.984631061553955, + "step": 7907 + }, + { + "epoch": 1.98, + "grad_norm": 4.3361592292785645, + "learning_rate": 6.609228430134058e-06, + "logits/chosen": -0.4310445189476013, + "logits/rejected": -0.5485429763793945, + "logps/chosen": -55.4956169128418, + "logps/rejected": -85.36984252929688, + "loss": 0.6807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1921911239624023, + "rewards/margins": 7.088325500488281, + "rewards/rejected": -3.896134376525879, + "step": 7908 + }, + { + "epoch": 1.98, + "grad_norm": 4.107460021972656, + "learning_rate": 6.608484243321995e-06, + "logits/chosen": -0.39895200729370117, + "logits/rejected": -0.4862404763698578, + "logps/chosen": -58.068267822265625, + "logps/rejected": -93.56219482421875, + "loss": 0.6886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1959023475646973, + "rewards/margins": 5.1680426597595215, + "rewards/rejected": -1.9721404314041138, + "step": 7909 + }, + { + "epoch": 1.98, + "grad_norm": 6.989278793334961, + "learning_rate": 6.6077400167625784e-06, + "logits/chosen": -0.470848947763443, + "logits/rejected": -0.5390197038650513, + "logps/chosen": -50.46562957763672, + "logps/rejected": -71.99385833740234, + "loss": 0.8321, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.948218584060669, + "rewards/margins": 3.9626846313476562, + "rewards/rejected": -1.0144660472869873, + "step": 7910 + }, + { + "epoch": 1.98, + "grad_norm": 6.424853324890137, + "learning_rate": 6.606995750474194e-06, + "logits/chosen": -0.39368996024131775, + "logits/rejected": -0.45741453766822815, + "logps/chosen": -72.00970458984375, + "logps/rejected": -112.63829803466797, + "loss": 0.736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9945931434631348, + "rewards/margins": 5.40751838684082, + "rewards/rejected": -2.4129254817962646, + "step": 7911 + }, + { + "epoch": 1.98, + "grad_norm": 6.492954730987549, + "learning_rate": 6.606251444475236e-06, + "logits/chosen": -0.40612953901290894, + "logits/rejected": -0.47316208481788635, + "logps/chosen": -54.89372253417969, + "logps/rejected": -86.02945709228516, + "loss": 0.8238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.999696731567383, + "rewards/margins": 4.567526817321777, + "rewards/rejected": -1.5678298473358154, + "step": 7912 + }, + { + "epoch": 1.98, + "grad_norm": 4.518408298492432, + "learning_rate": 6.6055070987840965e-06, + "logits/chosen": -0.3771858513355255, + "logits/rejected": -0.5134832262992859, + "logps/chosen": -55.81777572631836, + "logps/rejected": -85.41582489013672, + "loss": 0.7147, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9695804119110107, + "rewards/margins": 6.377993583679199, + "rewards/rejected": -3.4084129333496094, + "step": 7913 + }, + { + "epoch": 1.98, + "grad_norm": 7.985358238220215, + "learning_rate": 6.604762713419168e-06, + "logits/chosen": -0.49981582164764404, + "logits/rejected": -0.527937650680542, + "logps/chosen": -50.92689514160156, + "logps/rejected": -85.01862335205078, + "loss": 0.8883, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6621592044830322, + "rewards/margins": 4.284884929656982, + "rewards/rejected": -1.6227259635925293, + "step": 7914 + }, + { + "epoch": 1.98, + "grad_norm": 4.828819274902344, + "learning_rate": 6.604018288398847e-06, + "logits/chosen": -0.41261348128318787, + "logits/rejected": -0.5080211758613586, + "logps/chosen": -58.360652923583984, + "logps/rejected": -91.872314453125, + "loss": 0.7901, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3136847019195557, + "rewards/margins": 5.2564191818237305, + "rewards/rejected": -1.9427341222763062, + "step": 7915 + }, + { + "epoch": 1.98, + "grad_norm": 4.98480224609375, + "learning_rate": 6.603273823741529e-06, + "logits/chosen": -0.4133843779563904, + "logits/rejected": -0.5172897577285767, + "logps/chosen": -46.65944290161133, + "logps/rejected": -79.18133544921875, + "loss": 0.69, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.805182695388794, + "rewards/margins": 5.342987060546875, + "rewards/rejected": -2.537804365158081, + "step": 7916 + }, + { + "epoch": 1.98, + "grad_norm": 3.895975112915039, + "learning_rate": 6.602529319465607e-06, + "logits/chosen": -0.41809892654418945, + "logits/rejected": -0.5251430869102478, + "logps/chosen": -62.2190055847168, + "logps/rejected": -95.14350128173828, + "loss": 0.667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.853363513946533, + "rewards/margins": 5.919558048248291, + "rewards/rejected": -3.066194772720337, + "step": 7917 + }, + { + "epoch": 1.98, + "grad_norm": 8.624505996704102, + "learning_rate": 6.601784775589483e-06, + "logits/chosen": -0.5240408182144165, + "logits/rejected": -0.5916290879249573, + "logps/chosen": -44.80973815917969, + "logps/rejected": -78.08113098144531, + "loss": 0.7164, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9700734615325928, + "rewards/margins": 4.942968845367432, + "rewards/rejected": -1.9728955030441284, + "step": 7918 + }, + { + "epoch": 1.98, + "grad_norm": 8.857898712158203, + "learning_rate": 6.601040192131551e-06, + "logits/chosen": -0.39023739099502563, + "logits/rejected": -0.452370285987854, + "logps/chosen": -59.56402587890625, + "logps/rejected": -99.89453125, + "loss": 0.6907, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.986632823944092, + "rewards/margins": 5.228148937225342, + "rewards/rejected": -2.2415153980255127, + "step": 7919 + }, + { + "epoch": 1.98, + "grad_norm": 16.329132080078125, + "learning_rate": 6.6002955691102156e-06, + "logits/chosen": -0.4930400252342224, + "logits/rejected": -0.5264661908149719, + "logps/chosen": -49.67997741699219, + "logps/rejected": -117.77164459228516, + "loss": 0.7987, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0492513179779053, + "rewards/margins": 6.665812015533447, + "rewards/rejected": -3.6165611743927, + "step": 7920 + }, + { + "epoch": 1.98, + "grad_norm": 2.3484082221984863, + "learning_rate": 6.599550906543872e-06, + "logits/chosen": -0.5064448118209839, + "logits/rejected": -0.6128570437431335, + "logps/chosen": -57.49921417236328, + "logps/rejected": -85.52651977539062, + "loss": 0.6304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.215569257736206, + "rewards/margins": 5.687526226043701, + "rewards/rejected": -2.471956729888916, + "step": 7921 + }, + { + "epoch": 1.98, + "grad_norm": 3.2349720001220703, + "learning_rate": 6.598806204450927e-06, + "logits/chosen": -0.4658724069595337, + "logits/rejected": -0.5600718855857849, + "logps/chosen": -62.38979721069336, + "logps/rejected": -106.64840698242188, + "loss": 0.6771, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9570932388305664, + "rewards/margins": 6.3414506912231445, + "rewards/rejected": -3.384356737136841, + "step": 7922 + }, + { + "epoch": 1.98, + "grad_norm": 7.565133571624756, + "learning_rate": 6.598061462849779e-06, + "logits/chosen": -0.48626312613487244, + "logits/rejected": -0.49182409048080444, + "logps/chosen": -55.036376953125, + "logps/rejected": -78.3853530883789, + "loss": 0.7761, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9734952449798584, + "rewards/margins": 4.520449161529541, + "rewards/rejected": -1.5469534397125244, + "step": 7923 + }, + { + "epoch": 1.98, + "grad_norm": 19.537153244018555, + "learning_rate": 6.597316681758831e-06, + "logits/chosen": -0.42375439405441284, + "logits/rejected": -0.5429455041885376, + "logps/chosen": -55.953338623046875, + "logps/rejected": -104.44221496582031, + "loss": 0.6576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.956162214279175, + "rewards/margins": 6.393048286437988, + "rewards/rejected": -3.4368860721588135, + "step": 7924 + }, + { + "epoch": 1.98, + "grad_norm": 5.439802169799805, + "learning_rate": 6.596571861196491e-06, + "logits/chosen": -0.4785381555557251, + "logits/rejected": -0.5667741894721985, + "logps/chosen": -40.140846252441406, + "logps/rejected": -89.43089294433594, + "loss": 0.6492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0766429901123047, + "rewards/margins": 6.007150650024414, + "rewards/rejected": -2.9305076599121094, + "step": 7925 + }, + { + "epoch": 1.98, + "grad_norm": 4.770374774932861, + "learning_rate": 6.59582700118116e-06, + "logits/chosen": -0.3906102776527405, + "logits/rejected": -0.4886842370033264, + "logps/chosen": -55.502235412597656, + "logps/rejected": -81.97232055664062, + "loss": 0.641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.138913631439209, + "rewards/margins": 5.58273983001709, + "rewards/rejected": -2.44382643699646, + "step": 7926 + }, + { + "epoch": 1.98, + "grad_norm": 12.581367492675781, + "learning_rate": 6.5950821017312486e-06, + "logits/chosen": -0.4102536141872406, + "logits/rejected": -0.530773937702179, + "logps/chosen": -54.420074462890625, + "logps/rejected": -95.10121154785156, + "loss": 0.6328, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8757500648498535, + "rewards/margins": 6.7147040367126465, + "rewards/rejected": -3.8389546871185303, + "step": 7927 + }, + { + "epoch": 1.98, + "grad_norm": 4.073361396789551, + "learning_rate": 6.59433716286516e-06, + "logits/chosen": -0.3847498297691345, + "logits/rejected": -0.4818204641342163, + "logps/chosen": -54.64018249511719, + "logps/rejected": -95.59385681152344, + "loss": 0.6698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1346566677093506, + "rewards/margins": 6.3273701667785645, + "rewards/rejected": -3.1927130222320557, + "step": 7928 + }, + { + "epoch": 1.98, + "grad_norm": 7.230688571929932, + "learning_rate": 6.593592184601305e-06, + "logits/chosen": -0.44753777980804443, + "logits/rejected": -0.46714547276496887, + "logps/chosen": -70.31849670410156, + "logps/rejected": -92.87750244140625, + "loss": 0.8483, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.818142890930176, + "rewards/margins": 3.9150278568267822, + "rewards/rejected": -1.0968852043151855, + "step": 7929 + }, + { + "epoch": 1.98, + "grad_norm": 4.695334434509277, + "learning_rate": 6.5928471669580925e-06, + "logits/chosen": -0.47903019189834595, + "logits/rejected": -0.5061090588569641, + "logps/chosen": -47.45927429199219, + "logps/rejected": -95.24610137939453, + "loss": 0.7344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9623866081237793, + "rewards/margins": 4.966020584106445, + "rewards/rejected": -2.003633975982666, + "step": 7930 + }, + { + "epoch": 1.98, + "grad_norm": 6.6461992263793945, + "learning_rate": 6.592102109953932e-06, + "logits/chosen": -0.4468882977962494, + "logits/rejected": -0.5158869624137878, + "logps/chosen": -59.996116638183594, + "logps/rejected": -94.1528549194336, + "loss": 0.7425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.774740219116211, + "rewards/margins": 5.518428802490234, + "rewards/rejected": -2.7436885833740234, + "step": 7931 + }, + { + "epoch": 1.98, + "grad_norm": 2.6060264110565186, + "learning_rate": 6.5913570136072345e-06, + "logits/chosen": -0.42862215638160706, + "logits/rejected": -0.5989763140678406, + "logps/chosen": -61.90104675292969, + "logps/rejected": -78.46183776855469, + "loss": 0.576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03611159324646, + "rewards/margins": 6.133755683898926, + "rewards/rejected": -3.097644090652466, + "step": 7932 + }, + { + "epoch": 1.98, + "grad_norm": 14.63020133972168, + "learning_rate": 6.5906118779364125e-06, + "logits/chosen": -0.5086760520935059, + "logits/rejected": -0.5799633264541626, + "logps/chosen": -58.51081466674805, + "logps/rejected": -80.64967346191406, + "loss": 0.7713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.068004608154297, + "rewards/margins": 5.4544878005981445, + "rewards/rejected": -2.3864827156066895, + "step": 7933 + }, + { + "epoch": 1.98, + "grad_norm": 11.752464294433594, + "learning_rate": 6.58986670295988e-06, + "logits/chosen": -0.39003509283065796, + "logits/rejected": -0.4508585035800934, + "logps/chosen": -59.69643020629883, + "logps/rejected": -91.2917251586914, + "loss": 0.7358, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1741232872009277, + "rewards/margins": 4.578834056854248, + "rewards/rejected": -1.4047107696533203, + "step": 7934 + }, + { + "epoch": 1.99, + "grad_norm": 6.210545539855957, + "learning_rate": 6.589121488696049e-06, + "logits/chosen": -0.46200063824653625, + "logits/rejected": -0.5648162961006165, + "logps/chosen": -52.02450942993164, + "logps/rejected": -92.2292709350586, + "loss": 0.7906, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0238261222839355, + "rewards/margins": 5.407500743865967, + "rewards/rejected": -2.3836748600006104, + "step": 7935 + }, + { + "epoch": 1.99, + "grad_norm": 10.522716522216797, + "learning_rate": 6.588376235163337e-06, + "logits/chosen": -0.43462565541267395, + "logits/rejected": -0.4603905975818634, + "logps/chosen": -54.330230712890625, + "logps/rejected": -84.7866439819336, + "loss": 0.8636, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9859085083007812, + "rewards/margins": 3.9584708213806152, + "rewards/rejected": -0.9725617170333862, + "step": 7936 + }, + { + "epoch": 1.99, + "grad_norm": 4.916130065917969, + "learning_rate": 6.587630942380159e-06, + "logits/chosen": -0.40441837906837463, + "logits/rejected": -0.4942222833633423, + "logps/chosen": -56.60655975341797, + "logps/rejected": -108.39434814453125, + "loss": 0.6048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.88924503326416, + "rewards/margins": 6.670334815979004, + "rewards/rejected": -3.7810897827148438, + "step": 7937 + }, + { + "epoch": 1.99, + "grad_norm": 3.088728427886963, + "learning_rate": 6.58688561036493e-06, + "logits/chosen": -0.4546552300453186, + "logits/rejected": -0.5301923751831055, + "logps/chosen": -53.892601013183594, + "logps/rejected": -93.73094177246094, + "loss": 0.5756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.878868818283081, + "rewards/margins": 6.033106803894043, + "rewards/rejected": -3.154237747192383, + "step": 7938 + }, + { + "epoch": 1.99, + "grad_norm": 14.446730613708496, + "learning_rate": 6.586140239136072e-06, + "logits/chosen": -0.32875099778175354, + "logits/rejected": -0.43188539147377014, + "logps/chosen": -56.426116943359375, + "logps/rejected": -76.88519287109375, + "loss": 0.6615, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2224619388580322, + "rewards/margins": 4.992697238922119, + "rewards/rejected": -1.770235538482666, + "step": 7939 + }, + { + "epoch": 1.99, + "grad_norm": 4.187066078186035, + "learning_rate": 6.585394828711998e-06, + "logits/chosen": -0.4529540538787842, + "logits/rejected": -0.5805799961090088, + "logps/chosen": -54.970703125, + "logps/rejected": -95.15394592285156, + "loss": 0.5807, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.159459114074707, + "rewards/margins": 6.899198532104492, + "rewards/rejected": -3.7397396564483643, + "step": 7940 + }, + { + "epoch": 1.99, + "grad_norm": 14.97332763671875, + "learning_rate": 6.584649379111135e-06, + "logits/chosen": -0.4598706364631653, + "logits/rejected": -0.5840061902999878, + "logps/chosen": -56.5017204284668, + "logps/rejected": -83.44597625732422, + "loss": 0.6707, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.73136830329895, + "rewards/margins": 5.678605079650879, + "rewards/rejected": -2.9472367763519287, + "step": 7941 + }, + { + "epoch": 1.99, + "grad_norm": 14.5729341506958, + "learning_rate": 6.583903890351899e-06, + "logits/chosen": -0.5105410218238831, + "logits/rejected": -0.5944926738739014, + "logps/chosen": -55.26270294189453, + "logps/rejected": -106.40713500976562, + "loss": 0.7787, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.910581111907959, + "rewards/margins": 6.250443935394287, + "rewards/rejected": -3.3398630619049072, + "step": 7942 + }, + { + "epoch": 1.99, + "grad_norm": 15.617938995361328, + "learning_rate": 6.583158362452713e-06, + "logits/chosen": -0.38725587725639343, + "logits/rejected": -0.46443960070610046, + "logps/chosen": -65.85111999511719, + "logps/rejected": -91.75625610351562, + "loss": 0.7254, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8928990364074707, + "rewards/margins": 4.6837944984436035, + "rewards/rejected": -1.7908952236175537, + "step": 7943 + }, + { + "epoch": 1.99, + "grad_norm": 6.29276180267334, + "learning_rate": 6.582412795432e-06, + "logits/chosen": -0.5059897899627686, + "logits/rejected": -0.58455491065979, + "logps/chosen": -47.01240921020508, + "logps/rejected": -86.7960433959961, + "loss": 0.7186, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0271687507629395, + "rewards/margins": 5.543021202087402, + "rewards/rejected": -2.515852212905884, + "step": 7944 + }, + { + "epoch": 1.99, + "grad_norm": 4.648402690887451, + "learning_rate": 6.581667189308185e-06, + "logits/chosen": -0.41098347306251526, + "logits/rejected": -0.51451176404953, + "logps/chosen": -64.6175308227539, + "logps/rejected": -92.03910064697266, + "loss": 0.7539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046306610107422, + "rewards/margins": 6.577852249145508, + "rewards/rejected": -3.5315451622009277, + "step": 7945 + }, + { + "epoch": 1.99, + "grad_norm": 9.922074317932129, + "learning_rate": 6.580921544099688e-06, + "logits/chosen": -0.4670424461364746, + "logits/rejected": -0.5203161835670471, + "logps/chosen": -52.99491882324219, + "logps/rejected": -101.04825592041016, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6693148612976074, + "rewards/margins": 6.289144515991211, + "rewards/rejected": -3.6198301315307617, + "step": 7946 + }, + { + "epoch": 1.99, + "grad_norm": 3.7574708461761475, + "learning_rate": 6.580175859824942e-06, + "logits/chosen": -0.4605654776096344, + "logits/rejected": -0.5423535704612732, + "logps/chosen": -50.47389602661133, + "logps/rejected": -80.93281555175781, + "loss": 0.6502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0060625076293945, + "rewards/margins": 5.2663750648498535, + "rewards/rejected": -2.260313034057617, + "step": 7947 + }, + { + "epoch": 1.99, + "grad_norm": 4.642513275146484, + "learning_rate": 6.579430136502369e-06, + "logits/chosen": -0.4728870093822479, + "logits/rejected": -0.5443876385688782, + "logps/chosen": -45.757911682128906, + "logps/rejected": -87.43011474609375, + "loss": 0.5848, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1910006999969482, + "rewards/margins": 6.206873416900635, + "rewards/rejected": -3.0158727169036865, + "step": 7948 + }, + { + "epoch": 1.99, + "grad_norm": 11.979119300842285, + "learning_rate": 6.578684374150395e-06, + "logits/chosen": -0.4689774215221405, + "logits/rejected": -0.5512243509292603, + "logps/chosen": -47.2000732421875, + "logps/rejected": -100.74478149414062, + "loss": 0.8542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5518648624420166, + "rewards/margins": 6.068408012390137, + "rewards/rejected": -3.51654314994812, + "step": 7949 + }, + { + "epoch": 1.99, + "grad_norm": 4.061631679534912, + "learning_rate": 6.577938572787453e-06, + "logits/chosen": -0.3647041916847229, + "logits/rejected": -0.40779781341552734, + "logps/chosen": -67.32510375976562, + "logps/rejected": -107.05358123779297, + "loss": 0.7664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9832029342651367, + "rewards/margins": 4.42495059967041, + "rewards/rejected": -1.4417476654052734, + "step": 7950 + }, + { + "epoch": 1.99, + "grad_norm": 6.480958938598633, + "learning_rate": 6.5771927324319705e-06, + "logits/chosen": -0.39420032501220703, + "logits/rejected": -0.43440043926239014, + "logps/chosen": -64.6614990234375, + "logps/rejected": -111.87035369873047, + "loss": 0.679, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.586672782897949, + "rewards/margins": 5.627518653869629, + "rewards/rejected": -3.0408453941345215, + "step": 7951 + }, + { + "epoch": 1.99, + "grad_norm": 7.254005432128906, + "learning_rate": 6.576446853102377e-06, + "logits/chosen": -0.48168766498565674, + "logits/rejected": -0.503940224647522, + "logps/chosen": -56.32096481323242, + "logps/rejected": -88.54391479492188, + "loss": 0.8334, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9054763317108154, + "rewards/margins": 4.162565231323242, + "rewards/rejected": -1.2570891380310059, + "step": 7952 + }, + { + "epoch": 1.99, + "grad_norm": 5.045097827911377, + "learning_rate": 6.575700934817105e-06, + "logits/chosen": -0.46282726526260376, + "logits/rejected": -0.5619103908538818, + "logps/chosen": -60.31962203979492, + "logps/rejected": -95.35372161865234, + "loss": 0.7382, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9610509872436523, + "rewards/margins": 6.021298885345459, + "rewards/rejected": -3.0602481365203857, + "step": 7953 + }, + { + "epoch": 1.99, + "grad_norm": 4.8222336769104, + "learning_rate": 6.5749549775945885e-06, + "logits/chosen": -0.49013978242874146, + "logits/rejected": -0.549275815486908, + "logps/chosen": -48.925498962402344, + "logps/rejected": -94.26083374023438, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1528801918029785, + "rewards/margins": 4.9865264892578125, + "rewards/rejected": -1.8336459398269653, + "step": 7954 + }, + { + "epoch": 1.99, + "grad_norm": 4.050932884216309, + "learning_rate": 6.574208981453258e-06, + "logits/chosen": -0.37786880135536194, + "logits/rejected": -0.46315985918045044, + "logps/chosen": -62.44319152832031, + "logps/rejected": -91.0745620727539, + "loss": 0.6617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7799630165100098, + "rewards/margins": 5.765436172485352, + "rewards/rejected": -2.985473155975342, + "step": 7955 + }, + { + "epoch": 1.99, + "grad_norm": 3.438922882080078, + "learning_rate": 6.573462946411549e-06, + "logits/chosen": -0.5061304569244385, + "logits/rejected": -0.6032358407974243, + "logps/chosen": -59.68968200683594, + "logps/rejected": -98.92353820800781, + "loss": 0.6284, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9753479957580566, + "rewards/margins": 6.502281665802002, + "rewards/rejected": -3.5269336700439453, + "step": 7956 + }, + { + "epoch": 1.99, + "grad_norm": 7.4229655265808105, + "learning_rate": 6.572716872487898e-06, + "logits/chosen": -0.4721832871437073, + "logits/rejected": -0.5577234625816345, + "logps/chosen": -51.62350845336914, + "logps/rejected": -89.31021881103516, + "loss": 0.6803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9709603786468506, + "rewards/margins": 5.178527355194092, + "rewards/rejected": -2.2075679302215576, + "step": 7957 + }, + { + "epoch": 1.99, + "grad_norm": 5.836289882659912, + "learning_rate": 6.571970759700739e-06, + "logits/chosen": -0.47026583552360535, + "logits/rejected": -0.5472866892814636, + "logps/chosen": -57.4554328918457, + "logps/rejected": -100.85245513916016, + "loss": 0.7102, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.937537670135498, + "rewards/margins": 5.91170072555542, + "rewards/rejected": -2.974163293838501, + "step": 7958 + }, + { + "epoch": 1.99, + "grad_norm": 3.710460901260376, + "learning_rate": 6.571224608068512e-06, + "logits/chosen": -0.49264851212501526, + "logits/rejected": -0.5934510231018066, + "logps/chosen": -55.15523147583008, + "logps/rejected": -84.82084655761719, + "loss": 0.6154, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.127426862716675, + "rewards/margins": 4.710800647735596, + "rewards/rejected": -1.5833734273910522, + "step": 7959 + }, + { + "epoch": 1.99, + "grad_norm": 6.775718688964844, + "learning_rate": 6.570478417609651e-06, + "logits/chosen": -0.4709192216396332, + "logits/rejected": -0.5345722436904907, + "logps/chosen": -57.59553527832031, + "logps/rejected": -92.28730773925781, + "loss": 0.8464, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6411707401275635, + "rewards/margins": 5.448790073394775, + "rewards/rejected": -2.807619094848633, + "step": 7960 + }, + { + "epoch": 1.99, + "grad_norm": 7.3787150382995605, + "learning_rate": 6.5697321883426005e-06, + "logits/chosen": -0.47282978892326355, + "logits/rejected": -0.5845797061920166, + "logps/chosen": -66.79430389404297, + "logps/rejected": -87.57915496826172, + "loss": 0.8576, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.903719902038574, + "rewards/margins": 5.376162528991699, + "rewards/rejected": -2.472442388534546, + "step": 7961 + }, + { + "epoch": 1.99, + "grad_norm": 6.835818767547607, + "learning_rate": 6.568985920285797e-06, + "logits/chosen": -0.4756986200809479, + "logits/rejected": -0.5292931795120239, + "logps/chosen": -71.09236907958984, + "logps/rejected": -89.9344482421875, + "loss": 0.7029, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7667970657348633, + "rewards/margins": 5.778191566467285, + "rewards/rejected": -3.0113942623138428, + "step": 7962 + }, + { + "epoch": 1.99, + "grad_norm": 14.758862495422363, + "learning_rate": 6.568239613457682e-06, + "logits/chosen": -0.42483431100845337, + "logits/rejected": -0.4180521070957184, + "logps/chosen": -54.40288162231445, + "logps/rejected": -123.35655212402344, + "loss": 0.7823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1787679195404053, + "rewards/margins": 5.2954254150390625, + "rewards/rejected": -2.116657257080078, + "step": 7963 + }, + { + "epoch": 1.99, + "grad_norm": 5.855493068695068, + "learning_rate": 6.5674932678767e-06, + "logits/chosen": -0.44304734468460083, + "logits/rejected": -0.5285521149635315, + "logps/chosen": -58.38895034790039, + "logps/rejected": -96.1795425415039, + "loss": 0.815, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7973361015319824, + "rewards/margins": 5.245887279510498, + "rewards/rejected": -2.4485514163970947, + "step": 7964 + }, + { + "epoch": 1.99, + "grad_norm": 25.553232192993164, + "learning_rate": 6.56674688356129e-06, + "logits/chosen": -0.4087989926338196, + "logits/rejected": -0.4710421860218048, + "logps/chosen": -57.061134338378906, + "logps/rejected": -94.79027557373047, + "loss": 0.9897, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5437114238739014, + "rewards/margins": 4.787038803100586, + "rewards/rejected": -2.2433278560638428, + "step": 7965 + }, + { + "epoch": 1.99, + "grad_norm": 16.539997100830078, + "learning_rate": 6.5660004605299e-06, + "logits/chosen": -0.517290472984314, + "logits/rejected": -0.5748525857925415, + "logps/chosen": -63.20951843261719, + "logps/rejected": -82.45686340332031, + "loss": 0.7979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.097622871398926, + "rewards/margins": 5.436882019042969, + "rewards/rejected": -2.339258909225464, + "step": 7966 + }, + { + "epoch": 1.99, + "grad_norm": 6.6907806396484375, + "learning_rate": 6.565253998800971e-06, + "logits/chosen": -0.4844982326030731, + "logits/rejected": -0.6098247766494751, + "logps/chosen": -54.312469482421875, + "logps/rejected": -87.59659576416016, + "loss": 0.6793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0259976387023926, + "rewards/margins": 7.153721809387207, + "rewards/rejected": -4.1277241706848145, + "step": 7967 + }, + { + "epoch": 1.99, + "grad_norm": 8.012757301330566, + "learning_rate": 6.5645074983929524e-06, + "logits/chosen": -0.4629714787006378, + "logits/rejected": -0.5644066333770752, + "logps/chosen": -56.364349365234375, + "logps/rejected": -73.24506378173828, + "loss": 0.8069, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8137896060943604, + "rewards/margins": 4.537306785583496, + "rewards/rejected": -1.7235170602798462, + "step": 7968 + }, + { + "epoch": 1.99, + "grad_norm": 2.9003820419311523, + "learning_rate": 6.5637609593242875e-06, + "logits/chosen": -0.49455171823501587, + "logits/rejected": -0.5784578323364258, + "logps/chosen": -46.777809143066406, + "logps/rejected": -96.92263793945312, + "loss": 0.5378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2287447452545166, + "rewards/margins": 6.8806586265563965, + "rewards/rejected": -3.651913642883301, + "step": 7969 + }, + { + "epoch": 1.99, + "grad_norm": 4.755362033843994, + "learning_rate": 6.563014381613428e-06, + "logits/chosen": -0.4853980243206024, + "logits/rejected": -0.5490487217903137, + "logps/chosen": -58.80542755126953, + "logps/rejected": -82.98609924316406, + "loss": 0.7216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8794150352478027, + "rewards/margins": 4.541862964630127, + "rewards/rejected": -1.6624486446380615, + "step": 7970 + }, + { + "epoch": 1.99, + "grad_norm": 20.523635864257812, + "learning_rate": 6.56226776527882e-06, + "logits/chosen": -0.47974175214767456, + "logits/rejected": -0.582489550113678, + "logps/chosen": -54.31199264526367, + "logps/rejected": -91.19448852539062, + "loss": 0.8957, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.644955635070801, + "rewards/margins": 5.540962219238281, + "rewards/rejected": -2.8960070610046387, + "step": 7971 + }, + { + "epoch": 1.99, + "grad_norm": 5.541330337524414, + "learning_rate": 6.5615211103389135e-06, + "logits/chosen": -0.4486023187637329, + "logits/rejected": -0.5346431732177734, + "logps/chosen": -59.349613189697266, + "logps/rejected": -88.55783081054688, + "loss": 0.6962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0325443744659424, + "rewards/margins": 5.048357009887695, + "rewards/rejected": -2.0158121585845947, + "step": 7972 + }, + { + "epoch": 1.99, + "grad_norm": 8.347002983093262, + "learning_rate": 6.56077441681216e-06, + "logits/chosen": -0.5449520349502563, + "logits/rejected": -0.5897560119628906, + "logps/chosen": -50.881690979003906, + "logps/rejected": -93.26966094970703, + "loss": 0.6339, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8952794075012207, + "rewards/margins": 5.871386528015137, + "rewards/rejected": -2.976107358932495, + "step": 7973 + }, + { + "epoch": 1.99, + "grad_norm": 3.8460583686828613, + "learning_rate": 6.56002768471701e-06, + "logits/chosen": -0.35084694623947144, + "logits/rejected": -0.4855797588825226, + "logps/chosen": -64.13016510009766, + "logps/rejected": -80.47477722167969, + "loss": 0.5993, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9203314781188965, + "rewards/margins": 5.437047004699707, + "rewards/rejected": -2.5167160034179688, + "step": 7974 + }, + { + "epoch": 2.0, + "grad_norm": 5.152435779571533, + "learning_rate": 6.559280914071917e-06, + "logits/chosen": -0.459125816822052, + "logits/rejected": -0.5300100445747375, + "logps/chosen": -56.37616729736328, + "logps/rejected": -82.15068817138672, + "loss": 0.7149, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.206827163696289, + "rewards/margins": 5.046844482421875, + "rewards/rejected": -1.8400176763534546, + "step": 7975 + }, + { + "epoch": 2.0, + "grad_norm": 4.003619194030762, + "learning_rate": 6.558534104895333e-06, + "logits/chosen": -0.4280495345592499, + "logits/rejected": -0.5021226406097412, + "logps/chosen": -47.712013244628906, + "logps/rejected": -77.36381530761719, + "loss": 0.6053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.004324436187744, + "rewards/margins": 5.479522705078125, + "rewards/rejected": -2.47519850730896, + "step": 7976 + }, + { + "epoch": 2.0, + "grad_norm": 3.6081044673919678, + "learning_rate": 6.5577872572057155e-06, + "logits/chosen": -0.357876718044281, + "logits/rejected": -0.463820219039917, + "logps/chosen": -59.21485137939453, + "logps/rejected": -94.2165298461914, + "loss": 0.615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.021846055984497, + "rewards/margins": 5.710302829742432, + "rewards/rejected": -2.6884567737579346, + "step": 7977 + }, + { + "epoch": 2.0, + "grad_norm": 4.9747209548950195, + "learning_rate": 6.5570403710215156e-06, + "logits/chosen": -0.3782198131084442, + "logits/rejected": -0.4271776080131531, + "logps/chosen": -59.698856353759766, + "logps/rejected": -87.05572509765625, + "loss": 0.7446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0914878845214844, + "rewards/margins": 4.197424411773682, + "rewards/rejected": -1.1059366464614868, + "step": 7978 + }, + { + "epoch": 2.0, + "grad_norm": 2.911529779434204, + "learning_rate": 6.556293446361194e-06, + "logits/chosen": -0.3960428237915039, + "logits/rejected": -0.4822501242160797, + "logps/chosen": -65.90489959716797, + "logps/rejected": -108.90738677978516, + "loss": 0.668, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.941711187362671, + "rewards/margins": 6.602741241455078, + "rewards/rejected": -3.6610302925109863, + "step": 7979 + }, + { + "epoch": 2.0, + "grad_norm": 4.5761003494262695, + "learning_rate": 6.555546483243205e-06, + "logits/chosen": -0.43588027358055115, + "logits/rejected": -0.5267103910446167, + "logps/chosen": -53.174110412597656, + "logps/rejected": -88.57330322265625, + "loss": 0.6376, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.050753116607666, + "rewards/margins": 5.9457879066467285, + "rewards/rejected": -2.8950347900390625, + "step": 7980 + }, + { + "epoch": 2.0, + "grad_norm": 4.177983283996582, + "learning_rate": 6.554799481686007e-06, + "logits/chosen": -0.4834761917591095, + "logits/rejected": -0.49986356496810913, + "logps/chosen": -50.27234649658203, + "logps/rejected": -97.85285949707031, + "loss": 0.6316, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8732943534851074, + "rewards/margins": 5.085932731628418, + "rewards/rejected": -2.2126379013061523, + "step": 7981 + }, + { + "epoch": 2.0, + "grad_norm": 8.941060066223145, + "learning_rate": 6.554052441708063e-06, + "logits/chosen": -0.46807771921157837, + "logits/rejected": -0.6096862554550171, + "logps/chosen": -59.478599548339844, + "logps/rejected": -84.16583251953125, + "loss": 0.8179, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.765608787536621, + "rewards/margins": 5.9089035987854, + "rewards/rejected": -3.1432950496673584, + "step": 7982 + }, + { + "epoch": 2.0, + "grad_norm": 6.456771373748779, + "learning_rate": 6.553305363327827e-06, + "logits/chosen": -0.41826483607292175, + "logits/rejected": -0.5280646681785583, + "logps/chosen": -64.60718536376953, + "logps/rejected": -81.84640502929688, + "loss": 0.8709, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6817455291748047, + "rewards/margins": 3.9275240898132324, + "rewards/rejected": -1.2457783222198486, + "step": 7983 + }, + { + "epoch": 2.0, + "grad_norm": 5.2876410484313965, + "learning_rate": 6.552558246563765e-06, + "logits/chosen": -0.4261060357093811, + "logits/rejected": -0.5389876365661621, + "logps/chosen": -56.271846771240234, + "logps/rejected": -91.60073852539062, + "loss": 0.6848, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0652735233306885, + "rewards/margins": 6.074928283691406, + "rewards/rejected": -3.0096540451049805, + "step": 7984 + }, + { + "epoch": 2.0, + "grad_norm": 3.3503286838531494, + "learning_rate": 6.551811091434337e-06, + "logits/chosen": -0.4081105589866638, + "logits/rejected": -0.4502016305923462, + "logps/chosen": -54.70615768432617, + "logps/rejected": -91.8565673828125, + "loss": 0.6889, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031497001647949, + "rewards/margins": 5.3882060050964355, + "rewards/rejected": -2.3567094802856445, + "step": 7985 + }, + { + "epoch": 2.0, + "grad_norm": 4.2706708908081055, + "learning_rate": 6.551063897958006e-06, + "logits/chosen": -0.45675578713417053, + "logits/rejected": -0.5215047597885132, + "logps/chosen": -53.97224807739258, + "logps/rejected": -108.55874633789062, + "loss": 0.7087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2543013095855713, + "rewards/margins": 6.136064052581787, + "rewards/rejected": -2.8817625045776367, + "step": 7986 + }, + { + "epoch": 2.0, + "grad_norm": 3.579648494720459, + "learning_rate": 6.550316666153237e-06, + "logits/chosen": -0.4613352119922638, + "logits/rejected": -0.5575203895568848, + "logps/chosen": -51.3823356628418, + "logps/rejected": -92.47579956054688, + "loss": 0.5722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.657837390899658, + "rewards/margins": 6.6627702713012695, + "rewards/rejected": -4.0049333572387695, + "step": 7987 + }, + { + "epoch": 2.0, + "grad_norm": 16.504213333129883, + "learning_rate": 6.5495693960384946e-06, + "logits/chosen": -0.4380650520324707, + "logits/rejected": -0.568783164024353, + "logps/chosen": -62.09614562988281, + "logps/rejected": -87.86101531982422, + "loss": 0.6857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.728130340576172, + "rewards/margins": 5.876033306121826, + "rewards/rejected": -3.147902250289917, + "step": 7988 + }, + { + "epoch": 2.0, + "grad_norm": 6.490471839904785, + "learning_rate": 6.548822087632242e-06, + "logits/chosen": -0.4368715286254883, + "logits/rejected": -0.5424467325210571, + "logps/chosen": -54.240028381347656, + "logps/rejected": -80.96772003173828, + "loss": 0.8056, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.590679407119751, + "rewards/margins": 4.83054780960083, + "rewards/rejected": -2.2398688793182373, + "step": 7989 + }, + { + "epoch": 2.0, + "grad_norm": 6.4898786544799805, + "learning_rate": 6.54807474095295e-06, + "logits/chosen": -0.3766486346721649, + "logits/rejected": -0.48952966928482056, + "logps/chosen": -58.414974212646484, + "logps/rejected": -79.90132904052734, + "loss": 0.7466, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6680517196655273, + "rewards/margins": 5.118587493896484, + "rewards/rejected": -2.450536012649536, + "step": 7990 + }, + { + "epoch": 2.0, + "grad_norm": 10.81192398071289, + "learning_rate": 6.547327356019085e-06, + "logits/chosen": -0.3629819452762604, + "logits/rejected": -0.464885950088501, + "logps/chosen": -66.49047088623047, + "logps/rejected": -93.21908569335938, + "loss": 0.7887, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.822845697402954, + "rewards/margins": 4.9457106590271, + "rewards/rejected": -2.1228652000427246, + "step": 7991 + }, + { + "epoch": 2.0, + "grad_norm": 5.083428382873535, + "learning_rate": 6.546579932849113e-06, + "logits/chosen": -0.4735613465309143, + "logits/rejected": -0.5751364827156067, + "logps/chosen": -53.40218734741211, + "logps/rejected": -102.6492691040039, + "loss": 0.6401, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.348935127258301, + "rewards/margins": 7.220070838928223, + "rewards/rejected": -3.871135950088501, + "step": 7992 + }, + { + "epoch": 2.0, + "grad_norm": 4.249195098876953, + "learning_rate": 6.545832471461508e-06, + "logits/chosen": -0.3863031268119812, + "logits/rejected": -0.468930721282959, + "logps/chosen": -48.880863189697266, + "logps/rejected": -101.799560546875, + "loss": 0.681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0875730514526367, + "rewards/margins": 5.45962381362915, + "rewards/rejected": -2.372051239013672, + "step": 7993 + }, + { + "epoch": 2.0, + "grad_norm": 9.290834426879883, + "learning_rate": 6.545084971874738e-06, + "logits/chosen": -0.5304199457168579, + "logits/rejected": -0.5436208248138428, + "logps/chosen": -54.373741149902344, + "logps/rejected": -97.09475708007812, + "loss": 0.7188, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8822057247161865, + "rewards/margins": 3.296353816986084, + "rewards/rejected": -0.4141480624675751, + "step": 7994 + }, + { + "epoch": 2.0, + "grad_norm": 3.2476084232330322, + "learning_rate": 6.544337434107274e-06, + "logits/chosen": -0.5347362756729126, + "logits/rejected": -0.639525294303894, + "logps/chosen": -54.10075759887695, + "logps/rejected": -75.09481811523438, + "loss": 0.6817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0839812755584717, + "rewards/margins": 5.237942695617676, + "rewards/rejected": -2.153961181640625, + "step": 7995 + }, + { + "epoch": 2.0, + "grad_norm": 9.339176177978516, + "learning_rate": 6.543589858177591e-06, + "logits/chosen": -0.42382174730300903, + "logits/rejected": -0.4704754650592804, + "logps/chosen": -55.945472717285156, + "logps/rejected": -101.92842102050781, + "loss": 0.7284, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1292076110839844, + "rewards/margins": 4.697688579559326, + "rewards/rejected": -1.5684807300567627, + "step": 7996 + }, + { + "epoch": 2.0, + "grad_norm": 3.3544085025787354, + "learning_rate": 6.542842244104159e-06, + "logits/chosen": -0.507287323474884, + "logits/rejected": -0.587448000907898, + "logps/chosen": -45.814849853515625, + "logps/rejected": -94.68211364746094, + "loss": 0.5443, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3400025367736816, + "rewards/margins": 6.169398784637451, + "rewards/rejected": -2.8293967247009277, + "step": 7997 + }, + { + "epoch": 2.0, + "grad_norm": 3.0880191326141357, + "learning_rate": 6.5420945919054545e-06, + "logits/chosen": -0.3850526809692383, + "logits/rejected": -0.4700808525085449, + "logps/chosen": -59.76865768432617, + "logps/rejected": -104.79127502441406, + "loss": 0.5739, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0497350692749023, + "rewards/margins": 5.904455184936523, + "rewards/rejected": -2.8547203540802, + "step": 7998 + }, + { + "epoch": 2.0, + "grad_norm": 7.476234436035156, + "learning_rate": 6.541346901599953e-06, + "logits/chosen": -0.484894335269928, + "logits/rejected": -0.5120766758918762, + "logps/chosen": -67.37063598632812, + "logps/rejected": -92.19152069091797, + "loss": 0.7323, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.073906898498535, + "rewards/margins": 4.464324951171875, + "rewards/rejected": -1.3904180526733398, + "step": 7999 + }, + { + "epoch": 2.0, + "grad_norm": 7.83256721496582, + "learning_rate": 6.5405991732061305e-06, + "logits/chosen": -0.42614084482192993, + "logits/rejected": -0.5184463858604431, + "logps/chosen": -77.41603088378906, + "logps/rejected": -89.98344421386719, + "loss": 0.7304, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.011960029602051, + "rewards/margins": 5.789498805999756, + "rewards/rejected": -2.777538537979126, + "step": 8000 + }, + { + "epoch": 2.0, + "grad_norm": 5.985630512237549, + "learning_rate": 6.5398514067424625e-06, + "logits/chosen": -0.36166656017303467, + "logits/rejected": -0.43929219245910645, + "logps/chosen": -53.91242980957031, + "logps/rejected": -97.81114196777344, + "loss": 0.5913, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.93341326713562, + "rewards/margins": 5.543635368347168, + "rewards/rejected": -2.610222101211548, + "step": 8001 + }, + { + "epoch": 2.0, + "grad_norm": 5.500566482543945, + "learning_rate": 6.53910360222743e-06, + "logits/chosen": -0.41777652502059937, + "logits/rejected": -0.533435583114624, + "logps/chosen": -55.2421875, + "logps/rejected": -84.1972885131836, + "loss": 0.7833, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.838653326034546, + "rewards/margins": 5.53756046295166, + "rewards/rejected": -2.698906421661377, + "step": 8002 + }, + { + "epoch": 2.0, + "grad_norm": 4.10089635848999, + "learning_rate": 6.5383557596795085e-06, + "logits/chosen": -0.4246978759765625, + "logits/rejected": -0.49102917313575745, + "logps/chosen": -55.56208419799805, + "logps/rejected": -87.40685272216797, + "loss": 0.6592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8282158374786377, + "rewards/margins": 5.5800089836120605, + "rewards/rejected": -2.7517929077148438, + "step": 8003 + }, + { + "epoch": 2.0, + "grad_norm": 1.9019135236740112, + "learning_rate": 6.5376078791171804e-06, + "logits/chosen": -0.4224448800086975, + "logits/rejected": -0.4981295168399811, + "logps/chosen": -60.20083236694336, + "logps/rejected": -104.25716400146484, + "loss": 0.5662, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0529367923736572, + "rewards/margins": 6.424228668212891, + "rewards/rejected": -3.3712918758392334, + "step": 8004 + }, + { + "epoch": 2.0, + "grad_norm": 4.802238941192627, + "learning_rate": 6.536859960558928e-06, + "logits/chosen": -0.4357936382293701, + "logits/rejected": -0.505902111530304, + "logps/chosen": -48.98394012451172, + "logps/rejected": -118.18624114990234, + "loss": 0.5572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0386366844177246, + "rewards/margins": 7.147678375244141, + "rewards/rejected": -4.109041690826416, + "step": 8005 + }, + { + "epoch": 2.0, + "grad_norm": 14.015247344970703, + "learning_rate": 6.536112004023228e-06, + "logits/chosen": -0.44830286502838135, + "logits/rejected": -0.5568675994873047, + "logps/chosen": -49.69530487060547, + "logps/rejected": -97.26553344726562, + "loss": 0.6515, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0599679946899414, + "rewards/margins": 6.050652503967285, + "rewards/rejected": -2.9906845092773438, + "step": 8006 + }, + { + "epoch": 2.0, + "grad_norm": 6.310832500457764, + "learning_rate": 6.53536400952857e-06, + "logits/chosen": -0.5125231146812439, + "logits/rejected": -0.5984731912612915, + "logps/chosen": -54.868408203125, + "logps/rejected": -86.76873779296875, + "loss": 0.7297, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8103857040405273, + "rewards/margins": 5.406816482543945, + "rewards/rejected": -2.596431016921997, + "step": 8007 + }, + { + "epoch": 2.0, + "grad_norm": 9.221614837646484, + "learning_rate": 6.534615977093433e-06, + "logits/chosen": -0.5521609783172607, + "logits/rejected": -0.6201228499412537, + "logps/chosen": -55.66750717163086, + "logps/rejected": -106.13330078125, + "loss": 0.6734, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.950037956237793, + "rewards/margins": 7.278438568115234, + "rewards/rejected": -4.328401565551758, + "step": 8008 + }, + { + "epoch": 2.0, + "grad_norm": 6.334353923797607, + "learning_rate": 6.533867906736301e-06, + "logits/chosen": -0.4606361985206604, + "logits/rejected": -0.5767421722412109, + "logps/chosen": -59.617286682128906, + "logps/rejected": -103.58595275878906, + "loss": 0.6839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.817751407623291, + "rewards/margins": 6.402733325958252, + "rewards/rejected": -3.584981918334961, + "step": 8009 + }, + { + "epoch": 2.0, + "grad_norm": 9.006210327148438, + "learning_rate": 6.533119798475663e-06, + "logits/chosen": -0.45748215913772583, + "logits/rejected": -0.584037184715271, + "logps/chosen": -57.91667556762695, + "logps/rejected": -84.61856842041016, + "loss": 0.6327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.901881217956543, + "rewards/margins": 6.032961368560791, + "rewards/rejected": -3.13107967376709, + "step": 8010 + }, + { + "epoch": 2.0, + "grad_norm": 19.45161247253418, + "learning_rate": 6.532371652330005e-06, + "logits/chosen": -0.5048739314079285, + "logits/rejected": -0.5281265377998352, + "logps/chosen": -58.14185333251953, + "logps/rejected": -99.3467025756836, + "loss": 0.8061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5279436111450195, + "rewards/margins": 5.126335620880127, + "rewards/rejected": -2.5983920097351074, + "step": 8011 + }, + { + "epoch": 2.0, + "grad_norm": 12.766319274902344, + "learning_rate": 6.531623468317811e-06, + "logits/chosen": -0.4381767213344574, + "logits/rejected": -0.5178259015083313, + "logps/chosen": -68.43098449707031, + "logps/rejected": -81.77486419677734, + "loss": 0.8159, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5201778411865234, + "rewards/margins": 4.516432285308838, + "rewards/rejected": -1.996254563331604, + "step": 8012 + }, + { + "epoch": 2.0, + "grad_norm": 7.812234878540039, + "learning_rate": 6.5308752464575755e-06, + "logits/chosen": -0.47024673223495483, + "logits/rejected": -0.4823833107948303, + "logps/chosen": -49.361289978027344, + "logps/rejected": -102.6251220703125, + "loss": 0.7778, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.844527244567871, + "rewards/margins": 4.847776412963867, + "rewards/rejected": -2.0032496452331543, + "step": 8013 + }, + { + "epoch": 2.0, + "grad_norm": 7.644888401031494, + "learning_rate": 6.530126986767783e-06, + "logits/chosen": -0.46583688259124756, + "logits/rejected": -0.5685086846351624, + "logps/chosen": -64.71432495117188, + "logps/rejected": -94.41077423095703, + "loss": 0.7685, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6894216537475586, + "rewards/margins": 5.571702003479004, + "rewards/rejected": -2.882279872894287, + "step": 8014 + }, + { + "epoch": 2.01, + "grad_norm": 8.07319450378418, + "learning_rate": 6.529378689266923e-06, + "logits/chosen": -0.43612584471702576, + "logits/rejected": -0.5792497396469116, + "logps/chosen": -64.68704986572266, + "logps/rejected": -86.94743347167969, + "loss": 0.6661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8980557918548584, + "rewards/margins": 6.4883036613464355, + "rewards/rejected": -3.5902481079101562, + "step": 8015 + }, + { + "epoch": 2.01, + "grad_norm": 4.051328659057617, + "learning_rate": 6.5286303539734915e-06, + "logits/chosen": -0.5269010663032532, + "logits/rejected": -0.5742641091346741, + "logps/chosen": -59.83051681518555, + "logps/rejected": -128.29811096191406, + "loss": 0.643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0573318004608154, + "rewards/margins": 6.404980659484863, + "rewards/rejected": -3.3476483821868896, + "step": 8016 + }, + { + "epoch": 2.01, + "grad_norm": 15.965852737426758, + "learning_rate": 6.527881980905977e-06, + "logits/chosen": -0.4779652953147888, + "logits/rejected": -0.5382096767425537, + "logps/chosen": -50.077308654785156, + "logps/rejected": -82.46424865722656, + "loss": 0.7972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3599696159362793, + "rewards/margins": 6.386922359466553, + "rewards/rejected": -3.0269529819488525, + "step": 8017 + }, + { + "epoch": 2.01, + "grad_norm": 2.1237218379974365, + "learning_rate": 6.527133570082873e-06, + "logits/chosen": -0.4238893389701843, + "logits/rejected": -0.585745096206665, + "logps/chosen": -68.01496887207031, + "logps/rejected": -92.57379913330078, + "loss": 0.6683, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8253955841064453, + "rewards/margins": 6.835158348083496, + "rewards/rejected": -4.009763240814209, + "step": 8018 + }, + { + "epoch": 2.01, + "grad_norm": 10.67542552947998, + "learning_rate": 6.526385121522675e-06, + "logits/chosen": -0.5197344422340393, + "logits/rejected": -0.5687097311019897, + "logps/chosen": -55.38550567626953, + "logps/rejected": -93.75199890136719, + "loss": 0.7915, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9326891899108887, + "rewards/margins": 5.227124214172363, + "rewards/rejected": -2.2944350242614746, + "step": 8019 + }, + { + "epoch": 2.01, + "grad_norm": 5.889989852905273, + "learning_rate": 6.525636635243877e-06, + "logits/chosen": -0.38314977288246155, + "logits/rejected": -0.46220189332962036, + "logps/chosen": -58.31747817993164, + "logps/rejected": -87.54737854003906, + "loss": 0.8322, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0282299518585205, + "rewards/margins": 5.434216499328613, + "rewards/rejected": -2.4059860706329346, + "step": 8020 + }, + { + "epoch": 2.01, + "grad_norm": 5.517740726470947, + "learning_rate": 6.524888111264975e-06, + "logits/chosen": -0.4372592568397522, + "logits/rejected": -0.5029658675193787, + "logps/chosen": -71.05073547363281, + "logps/rejected": -105.21890258789062, + "loss": 0.7457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7673277854919434, + "rewards/margins": 6.284267902374268, + "rewards/rejected": -3.5169403553009033, + "step": 8021 + }, + { + "epoch": 2.01, + "grad_norm": 2.7534470558166504, + "learning_rate": 6.524139549604466e-06, + "logits/chosen": -0.40068677067756653, + "logits/rejected": -0.49197515845298767, + "logps/chosen": -61.37056350708008, + "logps/rejected": -78.88282012939453, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.925994634628296, + "rewards/margins": 4.915055751800537, + "rewards/rejected": -1.9890614748001099, + "step": 8022 + }, + { + "epoch": 2.01, + "grad_norm": 4.605807304382324, + "learning_rate": 6.523390950280848e-06, + "logits/chosen": -0.45609039068222046, + "logits/rejected": -0.525153398513794, + "logps/chosen": -58.85480880737305, + "logps/rejected": -91.06614685058594, + "loss": 0.6685, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.97318172454834, + "rewards/margins": 5.632030487060547, + "rewards/rejected": -2.658848762512207, + "step": 8023 + }, + { + "epoch": 2.01, + "grad_norm": 4.464471340179443, + "learning_rate": 6.5226423133126175e-06, + "logits/chosen": -0.5094784498214722, + "logits/rejected": -0.5977902412414551, + "logps/chosen": -49.518035888671875, + "logps/rejected": -82.51625061035156, + "loss": 0.742, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9503300189971924, + "rewards/margins": 4.8051438331604, + "rewards/rejected": -1.8548139333724976, + "step": 8024 + }, + { + "epoch": 2.01, + "grad_norm": 4.839216709136963, + "learning_rate": 6.521893638718277e-06, + "logits/chosen": -0.4489428400993347, + "logits/rejected": -0.6017919182777405, + "logps/chosen": -57.725807189941406, + "logps/rejected": -101.54910278320312, + "loss": 0.6866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.119671106338501, + "rewards/margins": 6.98261833190918, + "rewards/rejected": -3.8629469871520996, + "step": 8025 + }, + { + "epoch": 2.01, + "grad_norm": 4.258254528045654, + "learning_rate": 6.521144926516327e-06, + "logits/chosen": -0.3743167221546173, + "logits/rejected": -0.5261600613594055, + "logps/chosen": -60.76713562011719, + "logps/rejected": -95.77457427978516, + "loss": 0.6004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6920177936553955, + "rewards/margins": 6.518669605255127, + "rewards/rejected": -3.8266522884368896, + "step": 8026 + }, + { + "epoch": 2.01, + "grad_norm": 4.978341102600098, + "learning_rate": 6.520396176725267e-06, + "logits/chosen": -0.4558151960372925, + "logits/rejected": -0.53692626953125, + "logps/chosen": -55.24593734741211, + "logps/rejected": -82.36540222167969, + "loss": 0.7965, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0488646030426025, + "rewards/margins": 5.109461307525635, + "rewards/rejected": -2.0605969429016113, + "step": 8027 + }, + { + "epoch": 2.01, + "grad_norm": 4.407245635986328, + "learning_rate": 6.519647389363599e-06, + "logits/chosen": -0.4830529987812042, + "logits/rejected": -0.5677077174186707, + "logps/chosen": -48.50322723388672, + "logps/rejected": -73.45015716552734, + "loss": 0.5499, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0428829193115234, + "rewards/margins": 5.82633638381958, + "rewards/rejected": -2.7834537029266357, + "step": 8028 + }, + { + "epoch": 2.01, + "grad_norm": 3.492058277130127, + "learning_rate": 6.518898564449829e-06, + "logits/chosen": -0.5523892045021057, + "logits/rejected": -0.6610062122344971, + "logps/chosen": -59.01472473144531, + "logps/rejected": -83.29054260253906, + "loss": 0.6695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.083385467529297, + "rewards/margins": 5.098912239074707, + "rewards/rejected": -2.0155272483825684, + "step": 8029 + }, + { + "epoch": 2.01, + "grad_norm": 4.269946098327637, + "learning_rate": 6.518149702002461e-06, + "logits/chosen": -0.3913136124610901, + "logits/rejected": -0.4703311622142792, + "logps/chosen": -52.882362365722656, + "logps/rejected": -77.42930603027344, + "loss": 0.8132, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.057642936706543, + "rewards/margins": 4.4008355140686035, + "rewards/rejected": -1.343192458152771, + "step": 8030 + }, + { + "epoch": 2.01, + "grad_norm": 10.616104125976562, + "learning_rate": 6.517400802039997e-06, + "logits/chosen": -0.4095594882965088, + "logits/rejected": -0.44439175724983215, + "logps/chosen": -55.42178726196289, + "logps/rejected": -99.67707824707031, + "loss": 0.825, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.235581874847412, + "rewards/margins": 4.5409135818481445, + "rewards/rejected": -1.3053313493728638, + "step": 8031 + }, + { + "epoch": 2.01, + "grad_norm": 3.593381643295288, + "learning_rate": 6.516651864580945e-06, + "logits/chosen": -0.4766269028186798, + "logits/rejected": -0.5493992567062378, + "logps/chosen": -55.032958984375, + "logps/rejected": -87.64854431152344, + "loss": 0.6968, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1992740631103516, + "rewards/margins": 4.6194353103637695, + "rewards/rejected": -1.4201607704162598, + "step": 8032 + }, + { + "epoch": 2.01, + "grad_norm": 13.040608406066895, + "learning_rate": 6.515902889643814e-06, + "logits/chosen": -0.42412060499191284, + "logits/rejected": -0.5230148434638977, + "logps/chosen": -63.988075256347656, + "logps/rejected": -93.63487243652344, + "loss": 0.7103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.027340888977051, + "rewards/margins": 6.148090839385986, + "rewards/rejected": -3.1207501888275146, + "step": 8033 + }, + { + "epoch": 2.01, + "grad_norm": 3.9032716751098633, + "learning_rate": 6.51515387724711e-06, + "logits/chosen": -0.5186880826950073, + "logits/rejected": -0.6347787380218506, + "logps/chosen": -62.090457916259766, + "logps/rejected": -102.38041687011719, + "loss": 0.6205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.028751850128174, + "rewards/margins": 7.076679706573486, + "rewards/rejected": -4.047928333282471, + "step": 8034 + }, + { + "epoch": 2.01, + "grad_norm": 2.2638885974884033, + "learning_rate": 6.514404827409341e-06, + "logits/chosen": -0.3928583860397339, + "logits/rejected": -0.48870718479156494, + "logps/chosen": -59.4271125793457, + "logps/rejected": -81.88623046875, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.941593885421753, + "rewards/margins": 5.458876132965088, + "rewards/rejected": -2.517282009124756, + "step": 8035 + }, + { + "epoch": 2.01, + "grad_norm": 8.981700897216797, + "learning_rate": 6.5136557401490185e-06, + "logits/chosen": -0.47209054231643677, + "logits/rejected": -0.5531607270240784, + "logps/chosen": -55.2139778137207, + "logps/rejected": -89.42839050292969, + "loss": 0.8147, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9425318241119385, + "rewards/margins": 4.97074556350708, + "rewards/rejected": -2.0282135009765625, + "step": 8036 + }, + { + "epoch": 2.01, + "grad_norm": 5.975010395050049, + "learning_rate": 6.5129066154846535e-06, + "logits/chosen": -0.4366631805896759, + "logits/rejected": -0.49514344334602356, + "logps/chosen": -55.9703254699707, + "logps/rejected": -101.51689910888672, + "loss": 0.7225, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1694204807281494, + "rewards/margins": 5.7478485107421875, + "rewards/rejected": -2.578428030014038, + "step": 8037 + }, + { + "epoch": 2.01, + "grad_norm": 11.94056224822998, + "learning_rate": 6.5121574534347556e-06, + "logits/chosen": -0.48286956548690796, + "logits/rejected": -0.5681729912757874, + "logps/chosen": -53.25322723388672, + "logps/rejected": -103.37555694580078, + "loss": 0.7561, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.998978614807129, + "rewards/margins": 6.096654891967773, + "rewards/rejected": -3.0976755619049072, + "step": 8038 + }, + { + "epoch": 2.01, + "grad_norm": 8.472468376159668, + "learning_rate": 6.511408254017839e-06, + "logits/chosen": -0.40346112847328186, + "logits/rejected": -0.5084577798843384, + "logps/chosen": -62.936832427978516, + "logps/rejected": -96.705322265625, + "loss": 0.7544, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5220863819122314, + "rewards/margins": 4.99901008605957, + "rewards/rejected": -2.476923942565918, + "step": 8039 + }, + { + "epoch": 2.01, + "grad_norm": 8.197805404663086, + "learning_rate": 6.510659017252417e-06, + "logits/chosen": -0.49209025502204895, + "logits/rejected": -0.552245020866394, + "logps/chosen": -49.976104736328125, + "logps/rejected": -99.91194915771484, + "loss": 0.901, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7319321632385254, + "rewards/margins": 5.355626106262207, + "rewards/rejected": -2.6236939430236816, + "step": 8040 + }, + { + "epoch": 2.01, + "grad_norm": 5.32198429107666, + "learning_rate": 6.509909743157004e-06, + "logits/chosen": -0.3826148211956024, + "logits/rejected": -0.4829709827899933, + "logps/chosen": -47.50577163696289, + "logps/rejected": -70.20907592773438, + "loss": 0.7035, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.087491035461426, + "rewards/margins": 5.00676155090332, + "rewards/rejected": -1.9192708730697632, + "step": 8041 + }, + { + "epoch": 2.01, + "grad_norm": 3.101428508758545, + "learning_rate": 6.509160431750114e-06, + "logits/chosen": -0.4979639947414398, + "logits/rejected": -0.5800085067749023, + "logps/chosen": -55.255027770996094, + "logps/rejected": -99.74198913574219, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9797863960266113, + "rewards/margins": 6.089465618133545, + "rewards/rejected": -3.1096789836883545, + "step": 8042 + }, + { + "epoch": 2.01, + "grad_norm": 9.506437301635742, + "learning_rate": 6.508411083050265e-06, + "logits/chosen": -0.5462936162948608, + "logits/rejected": -0.6023961305618286, + "logps/chosen": -54.19401550292969, + "logps/rejected": -95.2049331665039, + "loss": 0.7525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.794400691986084, + "rewards/margins": 5.416898727416992, + "rewards/rejected": -2.622497797012329, + "step": 8043 + }, + { + "epoch": 2.01, + "grad_norm": 3.818342685699463, + "learning_rate": 6.5076616970759745e-06, + "logits/chosen": -0.4893489480018616, + "logits/rejected": -0.5809999704360962, + "logps/chosen": -52.88666534423828, + "logps/rejected": -98.78108978271484, + "loss": 0.6017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.018129587173462, + "rewards/margins": 6.095475196838379, + "rewards/rejected": -3.077345609664917, + "step": 8044 + }, + { + "epoch": 2.01, + "grad_norm": 6.111505508422852, + "learning_rate": 6.50691227384576e-06, + "logits/chosen": -0.5025976896286011, + "logits/rejected": -0.5334716439247131, + "logps/chosen": -58.69624328613281, + "logps/rejected": -91.75845336914062, + "loss": 0.8385, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.554572582244873, + "rewards/margins": 4.705188751220703, + "rewards/rejected": -2.150615930557251, + "step": 8045 + }, + { + "epoch": 2.01, + "grad_norm": 5.331265449523926, + "learning_rate": 6.506162813378139e-06, + "logits/chosen": -0.4710361957550049, + "logits/rejected": -0.5677657723426819, + "logps/chosen": -53.044010162353516, + "logps/rejected": -91.71770477294922, + "loss": 0.7337, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1844329833984375, + "rewards/margins": 6.398554801940918, + "rewards/rejected": -3.2141218185424805, + "step": 8046 + }, + { + "epoch": 2.01, + "grad_norm": 8.296518325805664, + "learning_rate": 6.505413315691634e-06, + "logits/chosen": -0.5066363215446472, + "logits/rejected": -0.6366031765937805, + "logps/chosen": -46.34391403198242, + "logps/rejected": -95.17225646972656, + "loss": 0.587, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.784667491912842, + "rewards/margins": 6.079577922821045, + "rewards/rejected": -3.294910430908203, + "step": 8047 + }, + { + "epoch": 2.01, + "grad_norm": 3.240724563598633, + "learning_rate": 6.504663780804765e-06, + "logits/chosen": -0.3632792830467224, + "logits/rejected": -0.43352338671684265, + "logps/chosen": -69.69839477539062, + "logps/rejected": -110.79639434814453, + "loss": 0.6826, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8087096214294434, + "rewards/margins": 6.283512115478516, + "rewards/rejected": -3.474802017211914, + "step": 8048 + }, + { + "epoch": 2.01, + "grad_norm": 26.992244720458984, + "learning_rate": 6.5039142087360525e-06, + "logits/chosen": -0.5342093706130981, + "logits/rejected": -0.5742263197898865, + "logps/chosen": -56.036964416503906, + "logps/rejected": -92.08981323242188, + "loss": 0.7616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1793861389160156, + "rewards/margins": 5.236032962799072, + "rewards/rejected": -2.0566465854644775, + "step": 8049 + }, + { + "epoch": 2.01, + "grad_norm": 4.748141765594482, + "learning_rate": 6.503164599504022e-06, + "logits/chosen": -0.3497411906719208, + "logits/rejected": -0.4808056652545929, + "logps/chosen": -59.76044464111328, + "logps/rejected": -93.29579162597656, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0134682655334473, + "rewards/margins": 5.568324089050293, + "rewards/rejected": -2.554856061935425, + "step": 8050 + }, + { + "epoch": 2.01, + "grad_norm": 10.761569023132324, + "learning_rate": 6.502414953127194e-06, + "logits/chosen": -0.4547634422779083, + "logits/rejected": -0.5511300563812256, + "logps/chosen": -57.893226623535156, + "logps/rejected": -93.2608413696289, + "loss": 0.7655, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.772981643676758, + "rewards/margins": 5.407191276550293, + "rewards/rejected": -2.6342098712921143, + "step": 8051 + }, + { + "epoch": 2.01, + "grad_norm": 5.816640377044678, + "learning_rate": 6.501665269624093e-06, + "logits/chosen": -0.4246768653392792, + "logits/rejected": -0.5031068921089172, + "logps/chosen": -66.16952514648438, + "logps/rejected": -101.98536682128906, + "loss": 0.8217, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.114710569381714, + "rewards/margins": 4.660041809082031, + "rewards/rejected": -1.545331358909607, + "step": 8052 + }, + { + "epoch": 2.01, + "grad_norm": 6.715337753295898, + "learning_rate": 6.500915549013248e-06, + "logits/chosen": -0.5316977500915527, + "logits/rejected": -0.6226040124893188, + "logps/chosen": -48.13965606689453, + "logps/rejected": -113.56102752685547, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.713275671005249, + "rewards/margins": 6.742320537567139, + "rewards/rejected": -4.029045104980469, + "step": 8053 + }, + { + "epoch": 2.01, + "grad_norm": 5.456141471862793, + "learning_rate": 6.500165791313185e-06, + "logits/chosen": -0.5119531750679016, + "logits/rejected": -0.5825241208076477, + "logps/chosen": -57.61191177368164, + "logps/rejected": -114.83342742919922, + "loss": 0.7338, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.878594160079956, + "rewards/margins": 6.539989948272705, + "rewards/rejected": -3.661396026611328, + "step": 8054 + }, + { + "epoch": 2.02, + "grad_norm": 3.3823702335357666, + "learning_rate": 6.499415996542426e-06, + "logits/chosen": -0.44624239206314087, + "logits/rejected": -0.5754921436309814, + "logps/chosen": -48.40006637573242, + "logps/rejected": -110.20580291748047, + "loss": 0.5497, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8483924865722656, + "rewards/margins": 7.070144176483154, + "rewards/rejected": -4.221752166748047, + "step": 8055 + }, + { + "epoch": 2.02, + "grad_norm": 2.7320802211761475, + "learning_rate": 6.498666164719505e-06, + "logits/chosen": -0.4746399521827698, + "logits/rejected": -0.5042850971221924, + "logps/chosen": -53.53810501098633, + "logps/rejected": -95.77923583984375, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0767018795013428, + "rewards/margins": 5.285456657409668, + "rewards/rejected": -2.2087550163269043, + "step": 8056 + }, + { + "epoch": 2.02, + "grad_norm": 12.76455307006836, + "learning_rate": 6.497916295862949e-06, + "logits/chosen": -0.45939555764198303, + "logits/rejected": -0.5168197751045227, + "logps/chosen": -65.85243225097656, + "logps/rejected": -88.39019775390625, + "loss": 0.877, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5100769996643066, + "rewards/margins": 4.295445442199707, + "rewards/rejected": -1.7853683233261108, + "step": 8057 + }, + { + "epoch": 2.02, + "grad_norm": 6.480767726898193, + "learning_rate": 6.497166389991286e-06, + "logits/chosen": -0.4172402024269104, + "logits/rejected": -0.5089596509933472, + "logps/chosen": -60.22865295410156, + "logps/rejected": -96.23892974853516, + "loss": 0.6739, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.060962677001953, + "rewards/margins": 6.320461273193359, + "rewards/rejected": -3.259498357772827, + "step": 8058 + }, + { + "epoch": 2.02, + "grad_norm": 5.096374034881592, + "learning_rate": 6.496416447123052e-06, + "logits/chosen": -0.47299882769584656, + "logits/rejected": -0.5609681010246277, + "logps/chosen": -55.52086639404297, + "logps/rejected": -102.87028503417969, + "loss": 0.655, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0128536224365234, + "rewards/margins": 5.981170654296875, + "rewards/rejected": -2.9683172702789307, + "step": 8059 + }, + { + "epoch": 2.02, + "grad_norm": 13.799845695495605, + "learning_rate": 6.495666467276775e-06, + "logits/chosen": -0.5037561058998108, + "logits/rejected": -0.5443449020385742, + "logps/chosen": -58.565643310546875, + "logps/rejected": -95.00898742675781, + "loss": 0.7238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5874693393707275, + "rewards/margins": 3.986212730407715, + "rewards/rejected": -1.398743748664856, + "step": 8060 + }, + { + "epoch": 2.02, + "grad_norm": 4.147736072540283, + "learning_rate": 6.494916450470987e-06, + "logits/chosen": -0.44282376766204834, + "logits/rejected": -0.48096567392349243, + "logps/chosen": -49.32380676269531, + "logps/rejected": -92.74405670166016, + "loss": 0.636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7785799503326416, + "rewards/margins": 5.541328430175781, + "rewards/rejected": -2.7627482414245605, + "step": 8061 + }, + { + "epoch": 2.02, + "grad_norm": 14.502522468566895, + "learning_rate": 6.494166396724226e-06, + "logits/chosen": -0.5734496712684631, + "logits/rejected": -0.6155219078063965, + "logps/chosen": -60.683746337890625, + "logps/rejected": -104.87208557128906, + "loss": 0.7557, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.872941017150879, + "rewards/margins": 5.818820953369141, + "rewards/rejected": -2.94588041305542, + "step": 8062 + }, + { + "epoch": 2.02, + "grad_norm": 3.675539016723633, + "learning_rate": 6.4934163060550224e-06, + "logits/chosen": -0.5583778023719788, + "logits/rejected": -0.6350424885749817, + "logps/chosen": -52.74946594238281, + "logps/rejected": -89.23136138916016, + "loss": 0.6301, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799778938293457, + "rewards/margins": 6.214254379272461, + "rewards/rejected": -3.414475202560425, + "step": 8063 + }, + { + "epoch": 2.02, + "grad_norm": 4.653238296508789, + "learning_rate": 6.492666178481915e-06, + "logits/chosen": -0.4689965844154358, + "logits/rejected": -0.5487258434295654, + "logps/chosen": -60.664180755615234, + "logps/rejected": -86.18782806396484, + "loss": 0.7023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8429205417633057, + "rewards/margins": 5.118542194366455, + "rewards/rejected": -2.2756221294403076, + "step": 8064 + }, + { + "epoch": 2.02, + "grad_norm": 5.253628730773926, + "learning_rate": 6.491916014023436e-06, + "logits/chosen": -0.4772522449493408, + "logits/rejected": -0.5050914287567139, + "logps/chosen": -49.58745193481445, + "logps/rejected": -100.4354476928711, + "loss": 0.6091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4170262813568115, + "rewards/margins": 5.847840785980225, + "rewards/rejected": -2.430814743041992, + "step": 8065 + }, + { + "epoch": 2.02, + "grad_norm": 7.934152126312256, + "learning_rate": 6.491165812698127e-06, + "logits/chosen": -0.5072435140609741, + "logits/rejected": -0.5379077196121216, + "logps/chosen": -55.72612380981445, + "logps/rejected": -93.17647552490234, + "loss": 0.6942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7308602333068848, + "rewards/margins": 5.64199686050415, + "rewards/rejected": -2.9111366271972656, + "step": 8066 + }, + { + "epoch": 2.02, + "grad_norm": 17.009897232055664, + "learning_rate": 6.490415574524525e-06, + "logits/chosen": -0.40568286180496216, + "logits/rejected": -0.5000212788581848, + "logps/chosen": -60.072731018066406, + "logps/rejected": -99.85989379882812, + "loss": 0.7641, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8314690589904785, + "rewards/margins": 5.969871997833252, + "rewards/rejected": -3.1384029388427734, + "step": 8067 + }, + { + "epoch": 2.02, + "grad_norm": 10.812417030334473, + "learning_rate": 6.489665299521169e-06, + "logits/chosen": -0.45098787546157837, + "logits/rejected": -0.518599808216095, + "logps/chosen": -54.84827423095703, + "logps/rejected": -88.40271759033203, + "loss": 0.835, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9534952640533447, + "rewards/margins": 5.350472450256348, + "rewards/rejected": -2.396977186203003, + "step": 8068 + }, + { + "epoch": 2.02, + "grad_norm": 8.79601764678955, + "learning_rate": 6.488914987706598e-06, + "logits/chosen": -0.48688969016075134, + "logits/rejected": -0.6034315824508667, + "logps/chosen": -55.708072662353516, + "logps/rejected": -74.80681610107422, + "loss": 0.7269, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0122828483581543, + "rewards/margins": 4.563498497009277, + "rewards/rejected": -1.5512161254882812, + "step": 8069 + }, + { + "epoch": 2.02, + "grad_norm": 5.7513837814331055, + "learning_rate": 6.488164639099354e-06, + "logits/chosen": -0.49104660749435425, + "logits/rejected": -0.5181288719177246, + "logps/chosen": -43.35889434814453, + "logps/rejected": -126.57133483886719, + "loss": 0.5809, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.007096529006958, + "rewards/margins": 6.8378472328186035, + "rewards/rejected": -3.8307509422302246, + "step": 8070 + }, + { + "epoch": 2.02, + "grad_norm": 3.1441810131073, + "learning_rate": 6.4874142537179806e-06, + "logits/chosen": -0.4772348999977112, + "logits/rejected": -0.5754092931747437, + "logps/chosen": -53.20425796508789, + "logps/rejected": -94.24121856689453, + "loss": 0.6421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.932124614715576, + "rewards/margins": 6.017688751220703, + "rewards/rejected": -3.085563898086548, + "step": 8071 + }, + { + "epoch": 2.02, + "grad_norm": 58.60373306274414, + "learning_rate": 6.486663831581016e-06, + "logits/chosen": -0.5874454379081726, + "logits/rejected": -0.6347303986549377, + "logps/chosen": -48.92136764526367, + "logps/rejected": -97.25883483886719, + "loss": 0.7878, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.877110481262207, + "rewards/margins": 6.195733547210693, + "rewards/rejected": -3.3186237812042236, + "step": 8072 + }, + { + "epoch": 2.02, + "grad_norm": 8.348686218261719, + "learning_rate": 6.485913372707009e-06, + "logits/chosen": -0.48811811208724976, + "logits/rejected": -0.5818004608154297, + "logps/chosen": -53.00035858154297, + "logps/rejected": -83.95689392089844, + "loss": 0.691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3615705966949463, + "rewards/margins": 5.541464328765869, + "rewards/rejected": -2.179893732070923, + "step": 8073 + }, + { + "epoch": 2.02, + "grad_norm": 6.031437397003174, + "learning_rate": 6.485162877114501e-06, + "logits/chosen": -0.4284117519855499, + "logits/rejected": -0.44684600830078125, + "logps/chosen": -55.49555587768555, + "logps/rejected": -102.48262786865234, + "loss": 0.7354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.868539810180664, + "rewards/margins": 5.114613056182861, + "rewards/rejected": -2.2460732460021973, + "step": 8074 + }, + { + "epoch": 2.02, + "grad_norm": 2.6887857913970947, + "learning_rate": 6.4844123448220396e-06, + "logits/chosen": -0.39636319875717163, + "logits/rejected": -0.5269618630409241, + "logps/chosen": -53.06442642211914, + "logps/rejected": -81.71324157714844, + "loss": 0.5459, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1628434658050537, + "rewards/margins": 5.781940460205078, + "rewards/rejected": -2.619096279144287, + "step": 8075 + }, + { + "epoch": 2.02, + "grad_norm": 8.188313484191895, + "learning_rate": 6.483661775848169e-06, + "logits/chosen": -0.5205790996551514, + "logits/rejected": -0.5681419372558594, + "logps/chosen": -59.14678192138672, + "logps/rejected": -105.28551483154297, + "loss": 0.7179, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2503528594970703, + "rewards/margins": 5.842438697814941, + "rewards/rejected": -2.592085838317871, + "step": 8076 + }, + { + "epoch": 2.02, + "grad_norm": 3.8920435905456543, + "learning_rate": 6.48291117021144e-06, + "logits/chosen": -0.4718564450740814, + "logits/rejected": -0.5918453335762024, + "logps/chosen": -54.08552169799805, + "logps/rejected": -86.2369155883789, + "loss": 0.7039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8521652221679688, + "rewards/margins": 5.763352394104004, + "rewards/rejected": -2.911186695098877, + "step": 8077 + }, + { + "epoch": 2.02, + "grad_norm": 5.170870780944824, + "learning_rate": 6.482160527930397e-06, + "logits/chosen": -0.4563212990760803, + "logits/rejected": -0.49324148893356323, + "logps/chosen": -55.51248550415039, + "logps/rejected": -99.54190826416016, + "loss": 0.6484, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7644906044006348, + "rewards/margins": 5.066426753997803, + "rewards/rejected": -2.301936626434326, + "step": 8078 + }, + { + "epoch": 2.02, + "grad_norm": 11.668657302856445, + "learning_rate": 6.481409849023591e-06, + "logits/chosen": -0.48857980966567993, + "logits/rejected": -0.5547480583190918, + "logps/chosen": -68.34198760986328, + "logps/rejected": -88.29557800292969, + "loss": 0.8328, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9775736331939697, + "rewards/margins": 4.876865386962891, + "rewards/rejected": -1.8992915153503418, + "step": 8079 + }, + { + "epoch": 2.02, + "grad_norm": 4.696839332580566, + "learning_rate": 6.480659133509573e-06, + "logits/chosen": -0.40893474221229553, + "logits/rejected": -0.52467942237854, + "logps/chosen": -60.61018753051758, + "logps/rejected": -90.27330017089844, + "loss": 0.6998, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7905893325805664, + "rewards/margins": 5.738144874572754, + "rewards/rejected": -2.9475555419921875, + "step": 8080 + }, + { + "epoch": 2.02, + "grad_norm": 4.1443681716918945, + "learning_rate": 6.479908381406891e-06, + "logits/chosen": -0.5713068246841431, + "logits/rejected": -0.6480154991149902, + "logps/chosen": -49.898704528808594, + "logps/rejected": -76.273681640625, + "loss": 0.7344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9430861473083496, + "rewards/margins": 5.081510543823242, + "rewards/rejected": -2.1384243965148926, + "step": 8081 + }, + { + "epoch": 2.02, + "grad_norm": 30.459163665771484, + "learning_rate": 6.4791575927341e-06, + "logits/chosen": -0.5423908829689026, + "logits/rejected": -0.5936414003372192, + "logps/chosen": -60.85622787475586, + "logps/rejected": -99.05857849121094, + "loss": 0.6774, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8304738998413086, + "rewards/margins": 5.825497150421143, + "rewards/rejected": -2.995023250579834, + "step": 8082 + }, + { + "epoch": 2.02, + "grad_norm": 3.6654486656188965, + "learning_rate": 6.478406767509751e-06, + "logits/chosen": -0.5079491138458252, + "logits/rejected": -0.5764244794845581, + "logps/chosen": -54.06881332397461, + "logps/rejected": -83.99710845947266, + "loss": 0.6868, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.003366708755493, + "rewards/margins": 5.926507949829102, + "rewards/rejected": -2.9231412410736084, + "step": 8083 + }, + { + "epoch": 2.02, + "grad_norm": 3.065188407897949, + "learning_rate": 6.477655905752399e-06, + "logits/chosen": -0.494700163602829, + "logits/rejected": -0.6147412061691284, + "logps/chosen": -62.10887908935547, + "logps/rejected": -83.67709350585938, + "loss": 0.7291, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9923994541168213, + "rewards/margins": 6.41586446762085, + "rewards/rejected": -3.4234650135040283, + "step": 8084 + }, + { + "epoch": 2.02, + "grad_norm": 2.8557872772216797, + "learning_rate": 6.476905007480597e-06, + "logits/chosen": -0.427955687046051, + "logits/rejected": -0.5405099987983704, + "logps/chosen": -70.17776489257812, + "logps/rejected": -92.09012603759766, + "loss": 0.5967, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.224102735519409, + "rewards/margins": 6.049496650695801, + "rewards/rejected": -2.8253941535949707, + "step": 8085 + }, + { + "epoch": 2.02, + "grad_norm": 8.020407676696777, + "learning_rate": 6.476154072712904e-06, + "logits/chosen": -0.4344649016857147, + "logits/rejected": -0.49926698207855225, + "logps/chosen": -58.09348678588867, + "logps/rejected": -99.39286804199219, + "loss": 0.6649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9465465545654297, + "rewards/margins": 5.874525547027588, + "rewards/rejected": -2.927978515625, + "step": 8086 + }, + { + "epoch": 2.02, + "grad_norm": 4.404993057250977, + "learning_rate": 6.47540310146787e-06, + "logits/chosen": -0.5351876616477966, + "logits/rejected": -0.6159605979919434, + "logps/chosen": -57.04752731323242, + "logps/rejected": -82.88966369628906, + "loss": 0.6886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1695151329040527, + "rewards/margins": 5.646434783935547, + "rewards/rejected": -2.476919651031494, + "step": 8087 + }, + { + "epoch": 2.02, + "grad_norm": 3.5655834674835205, + "learning_rate": 6.474652093764057e-06, + "logits/chosen": -0.5274848937988281, + "logits/rejected": -0.5666849613189697, + "logps/chosen": -54.997589111328125, + "logps/rejected": -102.99950408935547, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9840357303619385, + "rewards/margins": 5.597906589508057, + "rewards/rejected": -2.6138710975646973, + "step": 8088 + }, + { + "epoch": 2.02, + "grad_norm": 5.03394079208374, + "learning_rate": 6.473901049620024e-06, + "logits/chosen": -0.4220103323459625, + "logits/rejected": -0.5291410088539124, + "logps/chosen": -52.689083099365234, + "logps/rejected": -86.98126220703125, + "loss": 0.6629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0429232120513916, + "rewards/margins": 5.3348517417907715, + "rewards/rejected": -2.291928291320801, + "step": 8089 + }, + { + "epoch": 2.02, + "grad_norm": 5.191244125366211, + "learning_rate": 6.473149969054326e-06, + "logits/chosen": -0.3851470351219177, + "logits/rejected": -0.5076472163200378, + "logps/chosen": -68.58277893066406, + "logps/rejected": -93.66651153564453, + "loss": 0.7269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.74168062210083, + "rewards/margins": 5.38491153717041, + "rewards/rejected": -2.6432313919067383, + "step": 8090 + }, + { + "epoch": 2.02, + "grad_norm": 6.409666061401367, + "learning_rate": 6.472398852085529e-06, + "logits/chosen": -0.42051005363464355, + "logits/rejected": -0.46347030997276306, + "logps/chosen": -64.37966918945312, + "logps/rejected": -86.79193115234375, + "loss": 0.8674, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9860148429870605, + "rewards/margins": 4.76483154296875, + "rewards/rejected": -1.7788174152374268, + "step": 8091 + }, + { + "epoch": 2.02, + "grad_norm": 4.993552207946777, + "learning_rate": 6.471647698732186e-06, + "logits/chosen": -0.5233270525932312, + "logits/rejected": -0.6277821660041809, + "logps/chosen": -58.02499008178711, + "logps/rejected": -90.19558715820312, + "loss": 0.6923, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.02117919921875, + "rewards/margins": 5.413650989532471, + "rewards/rejected": -2.3924715518951416, + "step": 8092 + }, + { + "epoch": 2.02, + "grad_norm": 2.9963269233703613, + "learning_rate": 6.470896509012865e-06, + "logits/chosen": -0.48365169763565063, + "logits/rejected": -0.5542421936988831, + "logps/chosen": -42.43653869628906, + "logps/rejected": -75.99651336669922, + "loss": 0.692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.113605260848999, + "rewards/margins": 5.83693265914917, + "rewards/rejected": -2.723327159881592, + "step": 8093 + }, + { + "epoch": 2.02, + "grad_norm": 5.965826511383057, + "learning_rate": 6.470145282946127e-06, + "logits/chosen": -0.42626646161079407, + "logits/rejected": -0.5224770307540894, + "logps/chosen": -65.15928649902344, + "logps/rejected": -92.6640396118164, + "loss": 0.7649, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.885646104812622, + "rewards/margins": 5.442522048950195, + "rewards/rejected": -2.5568759441375732, + "step": 8094 + }, + { + "epoch": 2.03, + "grad_norm": 14.689071655273438, + "learning_rate": 6.469394020550534e-06, + "logits/chosen": -0.46509218215942383, + "logits/rejected": -0.5416727066040039, + "logps/chosen": -59.05523681640625, + "logps/rejected": -98.77068328857422, + "loss": 0.7854, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1424200534820557, + "rewards/margins": 4.406893253326416, + "rewards/rejected": -1.2644734382629395, + "step": 8095 + }, + { + "epoch": 2.03, + "grad_norm": 12.109091758728027, + "learning_rate": 6.468642721844655e-06, + "logits/chosen": -0.4683448374271393, + "logits/rejected": -0.5564828515052795, + "logps/chosen": -56.351505279541016, + "logps/rejected": -88.06571960449219, + "loss": 0.7524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.741668462753296, + "rewards/margins": 5.764391899108887, + "rewards/rejected": -3.02272367477417, + "step": 8096 + }, + { + "epoch": 2.03, + "grad_norm": 16.76322364807129, + "learning_rate": 6.467891386847049e-06, + "logits/chosen": -0.45126959681510925, + "logits/rejected": -0.5140907168388367, + "logps/chosen": -56.98414611816406, + "logps/rejected": -92.49945831298828, + "loss": 0.7724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8619775772094727, + "rewards/margins": 5.265027046203613, + "rewards/rejected": -2.4030494689941406, + "step": 8097 + }, + { + "epoch": 2.03, + "grad_norm": 3.6800880432128906, + "learning_rate": 6.467140015576288e-06, + "logits/chosen": -0.513176441192627, + "logits/rejected": -0.6001952886581421, + "logps/chosen": -56.376651763916016, + "logps/rejected": -89.75526428222656, + "loss": 0.6772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.92445969581604, + "rewards/margins": 5.681953430175781, + "rewards/rejected": -2.757493495941162, + "step": 8098 + }, + { + "epoch": 2.03, + "grad_norm": 3.3134233951568604, + "learning_rate": 6.4663886080509344e-06, + "logits/chosen": -0.5573183298110962, + "logits/rejected": -0.6516901850700378, + "logps/chosen": -52.53971481323242, + "logps/rejected": -101.1181869506836, + "loss": 0.6035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9147000312805176, + "rewards/margins": 6.686460018157959, + "rewards/rejected": -3.7717602252960205, + "step": 8099 + }, + { + "epoch": 2.03, + "grad_norm": 3.1763689517974854, + "learning_rate": 6.46563716428956e-06, + "logits/chosen": -0.44304877519607544, + "logits/rejected": -0.5514587163925171, + "logps/chosen": -54.44719314575195, + "logps/rejected": -104.61493682861328, + "loss": 0.5968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0822434425354004, + "rewards/margins": 5.758474826812744, + "rewards/rejected": -2.67623233795166, + "step": 8100 + }, + { + "epoch": 2.03, + "grad_norm": 9.0866060256958, + "learning_rate": 6.464885684310731e-06, + "logits/chosen": -0.536679208278656, + "logits/rejected": -0.6027952432632446, + "logps/chosen": -50.20751190185547, + "logps/rejected": -77.34451293945312, + "loss": 0.8631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6047627925872803, + "rewards/margins": 4.399387836456299, + "rewards/rejected": -1.7946250438690186, + "step": 8101 + }, + { + "epoch": 2.03, + "grad_norm": 9.43324089050293, + "learning_rate": 6.4641341681330205e-06, + "logits/chosen": -0.4666164815425873, + "logits/rejected": -0.5765700340270996, + "logps/chosen": -61.266456604003906, + "logps/rejected": -95.61210632324219, + "loss": 0.6836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.911468744277954, + "rewards/margins": 5.977798938751221, + "rewards/rejected": -3.0663304328918457, + "step": 8102 + }, + { + "epoch": 2.03, + "grad_norm": 4.776383399963379, + "learning_rate": 6.463382615774995e-06, + "logits/chosen": -0.4567742943763733, + "logits/rejected": -0.4948226511478424, + "logps/chosen": -52.635719299316406, + "logps/rejected": -85.72242736816406, + "loss": 0.644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.192307949066162, + "rewards/margins": 6.257543563842773, + "rewards/rejected": -3.0652356147766113, + "step": 8103 + }, + { + "epoch": 2.03, + "grad_norm": 4.251107692718506, + "learning_rate": 6.46263102725523e-06, + "logits/chosen": -0.4660007357597351, + "logits/rejected": -0.508922815322876, + "logps/chosen": -43.945457458496094, + "logps/rejected": -91.39974975585938, + "loss": 0.7041, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2470407485961914, + "rewards/margins": 5.145178318023682, + "rewards/rejected": -1.8981375694274902, + "step": 8104 + }, + { + "epoch": 2.03, + "grad_norm": 8.02361011505127, + "learning_rate": 6.4618794025922975e-06, + "logits/chosen": -0.43036994338035583, + "logits/rejected": -0.5324968099594116, + "logps/chosen": -69.11505126953125, + "logps/rejected": -86.08264923095703, + "loss": 0.9675, + "rewards/accuracies": 0.78125, + "rewards/chosen": 3.003948211669922, + "rewards/margins": 5.035852432250977, + "rewards/rejected": -2.0319039821624756, + "step": 8105 + }, + { + "epoch": 2.03, + "grad_norm": 6.087265491485596, + "learning_rate": 6.461127741804769e-06, + "logits/chosen": -0.49449634552001953, + "logits/rejected": -0.5868875980377197, + "logps/chosen": -58.91512680053711, + "logps/rejected": -88.80559539794922, + "loss": 0.6751, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.118925094604492, + "rewards/margins": 5.181529998779297, + "rewards/rejected": -2.0626041889190674, + "step": 8106 + }, + { + "epoch": 2.03, + "grad_norm": 3.7852602005004883, + "learning_rate": 6.460376044911219e-06, + "logits/chosen": -0.5029907822608948, + "logits/rejected": -0.6233552694320679, + "logps/chosen": -51.70259094238281, + "logps/rejected": -82.93862915039062, + "loss": 0.6212, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763777017593384, + "rewards/margins": 5.67815637588501, + "rewards/rejected": -2.914379596710205, + "step": 8107 + }, + { + "epoch": 2.03, + "grad_norm": 2.6502084732055664, + "learning_rate": 6.459624311930225e-06, + "logits/chosen": -0.4739980101585388, + "logits/rejected": -0.5573976039886475, + "logps/chosen": -50.51758575439453, + "logps/rejected": -115.1641845703125, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.079538106918335, + "rewards/margins": 7.147106170654297, + "rewards/rejected": -4.067567825317383, + "step": 8108 + }, + { + "epoch": 2.03, + "grad_norm": 3.919403314590454, + "learning_rate": 6.458872542880361e-06, + "logits/chosen": -0.47690507769584656, + "logits/rejected": -0.5596852898597717, + "logps/chosen": -59.05943298339844, + "logps/rejected": -93.65390014648438, + "loss": 0.6501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.043424367904663, + "rewards/margins": 5.8362531661987305, + "rewards/rejected": -2.7928287982940674, + "step": 8109 + }, + { + "epoch": 2.03, + "grad_norm": 3.9796555042266846, + "learning_rate": 6.458120737780205e-06, + "logits/chosen": -0.38560113310813904, + "logits/rejected": -0.4740036725997925, + "logps/chosen": -55.64371109008789, + "logps/rejected": -98.35551452636719, + "loss": 0.625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3306405544281006, + "rewards/margins": 5.61046838760376, + "rewards/rejected": -2.279827833175659, + "step": 8110 + }, + { + "epoch": 2.03, + "grad_norm": 4.95226526260376, + "learning_rate": 6.457368896648335e-06, + "logits/chosen": -0.5577352046966553, + "logits/rejected": -0.5634020566940308, + "logps/chosen": -52.98038101196289, + "logps/rejected": -92.80854034423828, + "loss": 0.7766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0268383026123047, + "rewards/margins": 4.268641471862793, + "rewards/rejected": -1.2418034076690674, + "step": 8111 + }, + { + "epoch": 2.03, + "grad_norm": 4.253625392913818, + "learning_rate": 6.456617019503328e-06, + "logits/chosen": -0.44910210371017456, + "logits/rejected": -0.5595059990882874, + "logps/chosen": -56.4195556640625, + "logps/rejected": -90.42558288574219, + "loss": 0.6493, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1694753170013428, + "rewards/margins": 5.476292610168457, + "rewards/rejected": -2.306816816329956, + "step": 8112 + }, + { + "epoch": 2.03, + "grad_norm": 3.9818642139434814, + "learning_rate": 6.4558651063637665e-06, + "logits/chosen": -0.47851642966270447, + "logits/rejected": -0.547660768032074, + "logps/chosen": -58.72697448730469, + "logps/rejected": -110.13302612304688, + "loss": 0.6456, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.791395664215088, + "rewards/margins": 5.904043197631836, + "rewards/rejected": -3.1126482486724854, + "step": 8113 + }, + { + "epoch": 2.03, + "grad_norm": 9.063719749450684, + "learning_rate": 6.45511315724823e-06, + "logits/chosen": -0.4844571352005005, + "logits/rejected": -0.5218117237091064, + "logps/chosen": -44.89741897583008, + "logps/rejected": -95.00994873046875, + "loss": 0.7353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0869524478912354, + "rewards/margins": 5.307021141052246, + "rewards/rejected": -2.2200684547424316, + "step": 8114 + }, + { + "epoch": 2.03, + "grad_norm": 18.596683502197266, + "learning_rate": 6.454361172175298e-06, + "logits/chosen": -0.5248818397521973, + "logits/rejected": -0.567206621170044, + "logps/chosen": -57.04391860961914, + "logps/rejected": -104.73585510253906, + "loss": 0.8891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.447977066040039, + "rewards/margins": 5.368204116821289, + "rewards/rejected": -2.92022705078125, + "step": 8115 + }, + { + "epoch": 2.03, + "grad_norm": 4.786584854125977, + "learning_rate": 6.453609151163557e-06, + "logits/chosen": -0.4940756559371948, + "logits/rejected": -0.581544041633606, + "logps/chosen": -61.09899139404297, + "logps/rejected": -81.71249389648438, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9011073112487793, + "rewards/margins": 4.991430759429932, + "rewards/rejected": -2.0903234481811523, + "step": 8116 + }, + { + "epoch": 2.03, + "grad_norm": 6.663055896759033, + "learning_rate": 6.452857094231586e-06, + "logits/chosen": -0.4286908209323883, + "logits/rejected": -0.49542564153671265, + "logps/chosen": -61.773719787597656, + "logps/rejected": -92.09505462646484, + "loss": 0.6778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.327725410461426, + "rewards/margins": 5.4564642906188965, + "rewards/rejected": -2.1287388801574707, + "step": 8117 + }, + { + "epoch": 2.03, + "grad_norm": 9.61854076385498, + "learning_rate": 6.45210500139797e-06, + "logits/chosen": -0.4352940618991852, + "logits/rejected": -0.5045687556266785, + "logps/chosen": -55.46477508544922, + "logps/rejected": -86.61859130859375, + "loss": 0.6983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8635847568511963, + "rewards/margins": 5.631033897399902, + "rewards/rejected": -2.767449140548706, + "step": 8118 + }, + { + "epoch": 2.03, + "grad_norm": 4.3732147216796875, + "learning_rate": 6.451352872681296e-06, + "logits/chosen": -0.39244723320007324, + "logits/rejected": -0.4419932961463928, + "logps/chosen": -57.74969482421875, + "logps/rejected": -105.61241149902344, + "loss": 0.6991, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.827836513519287, + "rewards/margins": 5.555055618286133, + "rewards/rejected": -2.7272191047668457, + "step": 8119 + }, + { + "epoch": 2.03, + "grad_norm": 3.728905439376831, + "learning_rate": 6.45060070810015e-06, + "logits/chosen": -0.3925538957118988, + "logits/rejected": -0.49510106444358826, + "logps/chosen": -58.36555862426758, + "logps/rejected": -82.05989074707031, + "loss": 0.7156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6759896278381348, + "rewards/margins": 4.71825647354126, + "rewards/rejected": -2.042267084121704, + "step": 8120 + }, + { + "epoch": 2.03, + "grad_norm": 3.553467035293579, + "learning_rate": 6.449848507673116e-06, + "logits/chosen": -0.4944237768650055, + "logits/rejected": -0.5719311237335205, + "logps/chosen": -50.5931510925293, + "logps/rejected": -83.22477722167969, + "loss": 0.6343, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9109199047088623, + "rewards/margins": 4.674631118774414, + "rewards/rejected": -1.7637113332748413, + "step": 8121 + }, + { + "epoch": 2.03, + "grad_norm": 3.414137125015259, + "learning_rate": 6.4490962714187845e-06, + "logits/chosen": -0.3635236620903015, + "logits/rejected": -0.46126052737236023, + "logps/chosen": -55.89636993408203, + "logps/rejected": -103.29286193847656, + "loss": 0.5851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9129974842071533, + "rewards/margins": 5.628879547119141, + "rewards/rejected": -2.715881824493408, + "step": 8122 + }, + { + "epoch": 2.03, + "grad_norm": 4.346716403961182, + "learning_rate": 6.448343999355742e-06, + "logits/chosen": -0.49962636828422546, + "logits/rejected": -0.5699114799499512, + "logps/chosen": -64.33860778808594, + "logps/rejected": -87.56295013427734, + "loss": 0.6984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.19952392578125, + "rewards/margins": 4.945539951324463, + "rewards/rejected": -1.7460161447525024, + "step": 8123 + }, + { + "epoch": 2.03, + "grad_norm": 3.2193901538848877, + "learning_rate": 6.447591691502577e-06, + "logits/chosen": -0.5169599056243896, + "logits/rejected": -0.6028038263320923, + "logps/chosen": -66.60651397705078, + "logps/rejected": -103.71481323242188, + "loss": 0.627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3346829414367676, + "rewards/margins": 7.1031317710876465, + "rewards/rejected": -3.7684483528137207, + "step": 8124 + }, + { + "epoch": 2.03, + "grad_norm": 8.332488059997559, + "learning_rate": 6.446839347877885e-06, + "logits/chosen": -0.46566474437713623, + "logits/rejected": -0.5389901995658875, + "logps/chosen": -54.94816207885742, + "logps/rejected": -84.731689453125, + "loss": 0.7566, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.991213798522949, + "rewards/margins": 5.53776741027832, + "rewards/rejected": -2.54655385017395, + "step": 8125 + }, + { + "epoch": 2.03, + "grad_norm": 3.775419235229492, + "learning_rate": 6.446086968500251e-06, + "logits/chosen": -0.48409023880958557, + "logits/rejected": -0.5273188352584839, + "logps/chosen": -49.32643127441406, + "logps/rejected": -93.95775604248047, + "loss": 0.5656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.090386390686035, + "rewards/margins": 6.475429058074951, + "rewards/rejected": -3.385042667388916, + "step": 8126 + }, + { + "epoch": 2.03, + "grad_norm": 3.4435360431671143, + "learning_rate": 6.445334553388272e-06, + "logits/chosen": -0.4711382985115051, + "logits/rejected": -0.6006349325180054, + "logps/chosen": -42.86030197143555, + "logps/rejected": -75.309814453125, + "loss": 0.592, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.263939619064331, + "rewards/margins": 5.456718921661377, + "rewards/rejected": -2.192779064178467, + "step": 8127 + }, + { + "epoch": 2.03, + "grad_norm": 6.6663737297058105, + "learning_rate": 6.444582102560537e-06, + "logits/chosen": -0.4060070514678955, + "logits/rejected": -0.48388999700546265, + "logps/chosen": -51.858726501464844, + "logps/rejected": -95.23111724853516, + "loss": 0.6252, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9473931789398193, + "rewards/margins": 6.035846710205078, + "rewards/rejected": -3.088453769683838, + "step": 8128 + }, + { + "epoch": 2.03, + "grad_norm": 5.197813987731934, + "learning_rate": 6.443829616035643e-06, + "logits/chosen": -0.47322285175323486, + "logits/rejected": -0.5377597808837891, + "logps/chosen": -60.072471618652344, + "logps/rejected": -98.4300308227539, + "loss": 0.7835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.074765920639038, + "rewards/margins": 5.566967487335205, + "rewards/rejected": -2.4922008514404297, + "step": 8129 + }, + { + "epoch": 2.03, + "grad_norm": 2.388883590698242, + "learning_rate": 6.443077093832183e-06, + "logits/chosen": -0.48156094551086426, + "logits/rejected": -0.5641165971755981, + "logps/chosen": -52.477088928222656, + "logps/rejected": -83.47603607177734, + "loss": 0.5655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.29120135307312, + "rewards/margins": 6.369214057922363, + "rewards/rejected": -3.0780131816864014, + "step": 8130 + }, + { + "epoch": 2.03, + "grad_norm": 6.23306131362915, + "learning_rate": 6.442324535968752e-06, + "logits/chosen": -0.4341695010662079, + "logits/rejected": -0.4882024824619293, + "logps/chosen": -50.911842346191406, + "logps/rejected": -96.02613067626953, + "loss": 0.743, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.914896249771118, + "rewards/margins": 5.197641372680664, + "rewards/rejected": -2.282745361328125, + "step": 8131 + }, + { + "epoch": 2.03, + "grad_norm": 11.144055366516113, + "learning_rate": 6.441571942463949e-06, + "logits/chosen": -0.6040874719619751, + "logits/rejected": -0.6755102872848511, + "logps/chosen": -53.14091110229492, + "logps/rejected": -102.57782745361328, + "loss": 0.6084, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.011411428451538, + "rewards/margins": 6.849018096923828, + "rewards/rejected": -3.837606906890869, + "step": 8132 + }, + { + "epoch": 2.03, + "grad_norm": 3.914773464202881, + "learning_rate": 6.44081931333637e-06, + "logits/chosen": -0.43926215171813965, + "logits/rejected": -0.5659084916114807, + "logps/chosen": -49.355430603027344, + "logps/rejected": -91.91971588134766, + "loss": 0.5802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.923967123031616, + "rewards/margins": 6.556217670440674, + "rewards/rejected": -3.6322505474090576, + "step": 8133 + }, + { + "epoch": 2.03, + "grad_norm": 5.945909023284912, + "learning_rate": 6.440066648604613e-06, + "logits/chosen": -0.407387375831604, + "logits/rejected": -0.4709022045135498, + "logps/chosen": -66.43260955810547, + "logps/rejected": -94.42218017578125, + "loss": 0.8324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.775890350341797, + "rewards/margins": 4.538689136505127, + "rewards/rejected": -1.7627989053726196, + "step": 8134 + }, + { + "epoch": 2.04, + "grad_norm": 8.821505546569824, + "learning_rate": 6.439313948287278e-06, + "logits/chosen": -0.49023717641830444, + "logits/rejected": -0.5510233044624329, + "logps/chosen": -57.6077766418457, + "logps/rejected": -100.49008178710938, + "loss": 0.6756, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880394458770752, + "rewards/margins": 6.954799175262451, + "rewards/rejected": -4.074404716491699, + "step": 8135 + }, + { + "epoch": 2.04, + "grad_norm": 4.540314674377441, + "learning_rate": 6.438561212402963e-06, + "logits/chosen": -0.4525569975376129, + "logits/rejected": -0.5410391092300415, + "logps/chosen": -59.45098876953125, + "logps/rejected": -100.56131744384766, + "loss": 0.6089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2030599117279053, + "rewards/margins": 5.829763412475586, + "rewards/rejected": -2.6267032623291016, + "step": 8136 + }, + { + "epoch": 2.04, + "grad_norm": 6.7768096923828125, + "learning_rate": 6.4378084409702726e-06, + "logits/chosen": -0.5530495643615723, + "logits/rejected": -0.5641138553619385, + "logps/chosen": -45.490821838378906, + "logps/rejected": -98.48486328125, + "loss": 0.6362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0774805545806885, + "rewards/margins": 6.153104782104492, + "rewards/rejected": -3.075624465942383, + "step": 8137 + }, + { + "epoch": 2.04, + "grad_norm": 8.669791221618652, + "learning_rate": 6.437055634007802e-06, + "logits/chosen": -0.4618484377861023, + "logits/rejected": -0.5385655760765076, + "logps/chosen": -62.451210021972656, + "logps/rejected": -80.06825256347656, + "loss": 0.7437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7866978645324707, + "rewards/margins": 4.9510626792907715, + "rewards/rejected": -2.1643645763397217, + "step": 8138 + }, + { + "epoch": 2.04, + "grad_norm": 10.057659149169922, + "learning_rate": 6.436302791534163e-06, + "logits/chosen": -0.43440669775009155, + "logits/rejected": -0.5567449927330017, + "logps/chosen": -59.673622131347656, + "logps/rejected": -88.77681732177734, + "loss": 0.7086, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.048088550567627, + "rewards/margins": 6.069173336029053, + "rewards/rejected": -3.0210845470428467, + "step": 8139 + }, + { + "epoch": 2.04, + "grad_norm": 4.904883861541748, + "learning_rate": 6.435549913567951e-06, + "logits/chosen": -0.3937344253063202, + "logits/rejected": -0.42630964517593384, + "logps/chosen": -52.77903747558594, + "logps/rejected": -97.78337097167969, + "loss": 0.6699, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.153043508529663, + "rewards/margins": 5.440053939819336, + "rewards/rejected": -2.2870099544525146, + "step": 8140 + }, + { + "epoch": 2.04, + "grad_norm": 13.249185562133789, + "learning_rate": 6.434797000127775e-06, + "logits/chosen": -0.45770135521888733, + "logits/rejected": -0.5551948547363281, + "logps/chosen": -71.936279296875, + "logps/rejected": -92.89087677001953, + "loss": 0.7927, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.693516492843628, + "rewards/margins": 5.331424713134766, + "rewards/rejected": -2.6379079818725586, + "step": 8141 + }, + { + "epoch": 2.04, + "grad_norm": 4.562607765197754, + "learning_rate": 6.434044051232239e-06, + "logits/chosen": -0.502238392829895, + "logits/rejected": -0.5135611295700073, + "logps/chosen": -52.13148880004883, + "logps/rejected": -117.00067138671875, + "loss": 0.6766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7822213172912598, + "rewards/margins": 6.854964256286621, + "rewards/rejected": -4.072741985321045, + "step": 8142 + }, + { + "epoch": 2.04, + "grad_norm": 4.606217861175537, + "learning_rate": 6.433291066899949e-06, + "logits/chosen": -0.5088494420051575, + "logits/rejected": -0.5286627411842346, + "logps/chosen": -44.20268249511719, + "logps/rejected": -92.07886505126953, + "loss": 0.6638, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.906221866607666, + "rewards/margins": 4.321192741394043, + "rewards/rejected": -1.414970874786377, + "step": 8143 + }, + { + "epoch": 2.04, + "grad_norm": 5.490513801574707, + "learning_rate": 6.432538047149513e-06, + "logits/chosen": -0.5468869805335999, + "logits/rejected": -0.5875556468963623, + "logps/chosen": -51.608280181884766, + "logps/rejected": -99.51212310791016, + "loss": 0.6387, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.736999750137329, + "rewards/margins": 5.244241714477539, + "rewards/rejected": -2.5072426795959473, + "step": 8144 + }, + { + "epoch": 2.04, + "grad_norm": 3.783766984939575, + "learning_rate": 6.4317849919995364e-06, + "logits/chosen": -0.5809638500213623, + "logits/rejected": -0.6623629331588745, + "logps/chosen": -39.730953216552734, + "logps/rejected": -83.00205993652344, + "loss": 0.5727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114210367202759, + "rewards/margins": 6.243801116943359, + "rewards/rejected": -3.1295907497406006, + "step": 8145 + }, + { + "epoch": 2.04, + "grad_norm": 6.0937604904174805, + "learning_rate": 6.4310319014686315e-06, + "logits/chosen": -0.43104952573776245, + "logits/rejected": -0.5699909329414368, + "logps/chosen": -72.39107513427734, + "logps/rejected": -83.89922332763672, + "loss": 0.7081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1526167392730713, + "rewards/margins": 5.741644382476807, + "rewards/rejected": -2.5890278816223145, + "step": 8146 + }, + { + "epoch": 2.04, + "grad_norm": 5.207399368286133, + "learning_rate": 6.430278775575405e-06, + "logits/chosen": -0.5652979612350464, + "logits/rejected": -0.6299812197685242, + "logps/chosen": -50.768577575683594, + "logps/rejected": -109.29721069335938, + "loss": 0.6087, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7055325508117676, + "rewards/margins": 7.1212568283081055, + "rewards/rejected": -4.415724277496338, + "step": 8147 + }, + { + "epoch": 2.04, + "grad_norm": 5.07316780090332, + "learning_rate": 6.42952561433847e-06, + "logits/chosen": -0.5107874870300293, + "logits/rejected": -0.5807240605354309, + "logps/chosen": -71.50646209716797, + "logps/rejected": -96.07988739013672, + "loss": 0.7548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808274269104004, + "rewards/margins": 5.96366548538208, + "rewards/rejected": -3.155391216278076, + "step": 8148 + }, + { + "epoch": 2.04, + "grad_norm": 18.249252319335938, + "learning_rate": 6.428772417776436e-06, + "logits/chosen": -0.4791598916053772, + "logits/rejected": -0.5954297780990601, + "logps/chosen": -60.445823669433594, + "logps/rejected": -79.93009948730469, + "loss": 0.7969, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7109668254852295, + "rewards/margins": 5.653260707855225, + "rewards/rejected": -2.942293643951416, + "step": 8149 + }, + { + "epoch": 2.04, + "grad_norm": 8.47299575805664, + "learning_rate": 6.428019185907914e-06, + "logits/chosen": -0.46469002962112427, + "logits/rejected": -0.5204100012779236, + "logps/chosen": -57.42546463012695, + "logps/rejected": -96.03927612304688, + "loss": 0.712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.28578519821167, + "rewards/margins": 5.805273056030273, + "rewards/rejected": -2.5194880962371826, + "step": 8150 + }, + { + "epoch": 2.04, + "grad_norm": 2.6124892234802246, + "learning_rate": 6.42726591875152e-06, + "logits/chosen": -0.44455236196517944, + "logits/rejected": -0.5465136170387268, + "logps/chosen": -58.37019348144531, + "logps/rejected": -89.86044311523438, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9870479106903076, + "rewards/margins": 5.8876633644104, + "rewards/rejected": -2.9006149768829346, + "step": 8151 + }, + { + "epoch": 2.04, + "grad_norm": 5.637746810913086, + "learning_rate": 6.426512616325868e-06, + "logits/chosen": -0.4750262498855591, + "logits/rejected": -0.5587371587753296, + "logps/chosen": -59.65141296386719, + "logps/rejected": -91.18653106689453, + "loss": 0.6498, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8524816036224365, + "rewards/margins": 5.660614490509033, + "rewards/rejected": -2.808133125305176, + "step": 8152 + }, + { + "epoch": 2.04, + "grad_norm": 4.521706581115723, + "learning_rate": 6.425759278649573e-06, + "logits/chosen": -0.4850786626338959, + "logits/rejected": -0.5487145185470581, + "logps/chosen": -62.93742752075195, + "logps/rejected": -96.97941589355469, + "loss": 0.7616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8856892585754395, + "rewards/margins": 5.472217559814453, + "rewards/rejected": -2.5865278244018555, + "step": 8153 + }, + { + "epoch": 2.04, + "grad_norm": 22.11289405822754, + "learning_rate": 6.425005905741247e-06, + "logits/chosen": -0.45336467027664185, + "logits/rejected": -0.5683596134185791, + "logps/chosen": -65.60176086425781, + "logps/rejected": -89.61201477050781, + "loss": 0.7676, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6779584884643555, + "rewards/margins": 5.976372241973877, + "rewards/rejected": -3.2984137535095215, + "step": 8154 + }, + { + "epoch": 2.04, + "grad_norm": 4.5311737060546875, + "learning_rate": 6.424252497619511e-06, + "logits/chosen": -0.4267885088920593, + "logits/rejected": -0.4957365393638611, + "logps/chosen": -59.57624053955078, + "logps/rejected": -98.74967956542969, + "loss": 0.6466, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.622830629348755, + "rewards/margins": 5.959695816040039, + "rewards/rejected": -3.336864948272705, + "step": 8155 + }, + { + "epoch": 2.04, + "grad_norm": 2.4633631706237793, + "learning_rate": 6.423499054302979e-06, + "logits/chosen": -0.4946364164352417, + "logits/rejected": -0.6029181480407715, + "logps/chosen": -51.344818115234375, + "logps/rejected": -106.50556945800781, + "loss": 0.6067, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1145596504211426, + "rewards/margins": 6.306354999542236, + "rewards/rejected": -3.1917951107025146, + "step": 8156 + }, + { + "epoch": 2.04, + "grad_norm": 4.088324069976807, + "learning_rate": 6.4227455758102744e-06, + "logits/chosen": -0.5318958759307861, + "logits/rejected": -0.5893254280090332, + "logps/chosen": -42.798973083496094, + "logps/rejected": -90.68406677246094, + "loss": 0.6419, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.204206943511963, + "rewards/margins": 6.418161869049072, + "rewards/rejected": -3.2139551639556885, + "step": 8157 + }, + { + "epoch": 2.04, + "grad_norm": 3.4467782974243164, + "learning_rate": 6.42199206216001e-06, + "logits/chosen": -0.4895229637622833, + "logits/rejected": -0.5762590169906616, + "logps/chosen": -52.269039154052734, + "logps/rejected": -79.58846282958984, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2824862003326416, + "rewards/margins": 5.712525367736816, + "rewards/rejected": -2.430039167404175, + "step": 8158 + }, + { + "epoch": 2.04, + "grad_norm": 5.72257661819458, + "learning_rate": 6.421238513370812e-06, + "logits/chosen": -0.5081742405891418, + "logits/rejected": -0.6098926663398743, + "logps/chosen": -54.07798385620117, + "logps/rejected": -88.24365997314453, + "loss": 0.5923, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.816835641860962, + "rewards/margins": 6.319354057312012, + "rewards/rejected": -3.50251841545105, + "step": 8159 + }, + { + "epoch": 2.04, + "grad_norm": 3.828890323638916, + "learning_rate": 6.420484929461298e-06, + "logits/chosen": -0.46469223499298096, + "logits/rejected": -0.5229666233062744, + "logps/chosen": -63.137596130371094, + "logps/rejected": -104.96804809570312, + "loss": 0.6895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8688607215881348, + "rewards/margins": 6.417059421539307, + "rewards/rejected": -3.548198699951172, + "step": 8160 + }, + { + "epoch": 2.04, + "grad_norm": 6.114458084106445, + "learning_rate": 6.4197313104500904e-06, + "logits/chosen": -0.4685975909233093, + "logits/rejected": -0.571794331073761, + "logps/chosen": -54.44782638549805, + "logps/rejected": -90.67976379394531, + "loss": 0.7364, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.76889967918396, + "rewards/margins": 6.052475929260254, + "rewards/rejected": -3.283576250076294, + "step": 8161 + }, + { + "epoch": 2.04, + "grad_norm": 2.8596370220184326, + "learning_rate": 6.4189776563558116e-06, + "logits/chosen": -0.5249063968658447, + "logits/rejected": -0.5733120441436768, + "logps/chosen": -49.753299713134766, + "logps/rejected": -94.1452865600586, + "loss": 0.5924, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.284125804901123, + "rewards/margins": 6.211535453796387, + "rewards/rejected": -2.927410364151001, + "step": 8162 + }, + { + "epoch": 2.04, + "grad_norm": 15.15070915222168, + "learning_rate": 6.418223967197086e-06, + "logits/chosen": -0.5162640810012817, + "logits/rejected": -0.6333040595054626, + "logps/chosen": -51.685604095458984, + "logps/rejected": -84.05579376220703, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9780025482177734, + "rewards/margins": 6.173676013946533, + "rewards/rejected": -3.1956729888916016, + "step": 8163 + }, + { + "epoch": 2.04, + "grad_norm": 7.113704681396484, + "learning_rate": 6.4174702429925365e-06, + "logits/chosen": -0.5291079878807068, + "logits/rejected": -0.5240380167961121, + "logps/chosen": -53.989097595214844, + "logps/rejected": -113.44904327392578, + "loss": 0.6372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9129092693328857, + "rewards/margins": 5.493098735809326, + "rewards/rejected": -2.5801894664764404, + "step": 8164 + }, + { + "epoch": 2.04, + "grad_norm": 6.505702495574951, + "learning_rate": 6.416716483760789e-06, + "logits/chosen": -0.5320209264755249, + "logits/rejected": -0.5867597460746765, + "logps/chosen": -47.03510284423828, + "logps/rejected": -80.50702667236328, + "loss": 0.6445, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2291152477264404, + "rewards/margins": 5.723658561706543, + "rewards/rejected": -2.4945433139801025, + "step": 8165 + }, + { + "epoch": 2.04, + "grad_norm": 3.6359646320343018, + "learning_rate": 6.415962689520474e-06, + "logits/chosen": -0.4624262750148773, + "logits/rejected": -0.5547553896903992, + "logps/chosen": -62.95460891723633, + "logps/rejected": -97.73696899414062, + "loss": 0.6337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8166422843933105, + "rewards/margins": 5.4267706871032715, + "rewards/rejected": -2.6101279258728027, + "step": 8166 + }, + { + "epoch": 2.04, + "grad_norm": 5.506829261779785, + "learning_rate": 6.415208860290212e-06, + "logits/chosen": -0.5094075202941895, + "logits/rejected": -0.5933427810668945, + "logps/chosen": -59.566429138183594, + "logps/rejected": -86.88758087158203, + "loss": 0.7653, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.787876605987549, + "rewards/margins": 5.83126163482666, + "rewards/rejected": -3.0433852672576904, + "step": 8167 + }, + { + "epoch": 2.04, + "grad_norm": 5.7320756912231445, + "learning_rate": 6.414454996088635e-06, + "logits/chosen": -0.4954119920730591, + "logits/rejected": -0.5401276350021362, + "logps/chosen": -58.735252380371094, + "logps/rejected": -100.23060607910156, + "loss": 0.7353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.19769287109375, + "rewards/margins": 5.540605545043945, + "rewards/rejected": -2.3429126739501953, + "step": 8168 + }, + { + "epoch": 2.04, + "grad_norm": 4.113963603973389, + "learning_rate": 6.4137010969343695e-06, + "logits/chosen": -0.4736091196537018, + "logits/rejected": -0.5543648600578308, + "logps/chosen": -54.082611083984375, + "logps/rejected": -93.66642761230469, + "loss": 0.6845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9372973442077637, + "rewards/margins": 5.661822319030762, + "rewards/rejected": -2.72452449798584, + "step": 8169 + }, + { + "epoch": 2.04, + "grad_norm": 8.51433277130127, + "learning_rate": 6.412947162846046e-06, + "logits/chosen": -0.4207584261894226, + "logits/rejected": -0.5428928732872009, + "logps/chosen": -60.76993179321289, + "logps/rejected": -92.58489990234375, + "loss": 0.7373, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.711637496948242, + "rewards/margins": 5.770720481872559, + "rewards/rejected": -3.0590829849243164, + "step": 8170 + }, + { + "epoch": 2.04, + "grad_norm": 3.219327688217163, + "learning_rate": 6.412193193842298e-06, + "logits/chosen": -0.5273230671882629, + "logits/rejected": -0.6236525774002075, + "logps/chosen": -50.063865661621094, + "logps/rejected": -96.02327728271484, + "loss": 0.566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1497795581817627, + "rewards/margins": 6.612894058227539, + "rewards/rejected": -3.4631152153015137, + "step": 8171 + }, + { + "epoch": 2.04, + "grad_norm": 5.06164026260376, + "learning_rate": 6.411439189941751e-06, + "logits/chosen": -0.4218123257160187, + "logits/rejected": -0.5324572324752808, + "logps/chosen": -60.200035095214844, + "logps/rejected": -81.84896087646484, + "loss": 0.7203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9723031520843506, + "rewards/margins": 4.829947471618652, + "rewards/rejected": -1.85764479637146, + "step": 8172 + }, + { + "epoch": 2.04, + "grad_norm": 15.44734001159668, + "learning_rate": 6.410685151163041e-06, + "logits/chosen": -0.4902251660823822, + "logits/rejected": -0.5900330543518066, + "logps/chosen": -63.0750846862793, + "logps/rejected": -106.39273071289062, + "loss": 0.6717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9246578216552734, + "rewards/margins": 7.732449054718018, + "rewards/rejected": -4.807790756225586, + "step": 8173 + }, + { + "epoch": 2.04, + "grad_norm": 10.722797393798828, + "learning_rate": 6.409931077524801e-06, + "logits/chosen": -0.5198297500610352, + "logits/rejected": -0.6269775629043579, + "logps/chosen": -54.01768493652344, + "logps/rejected": -78.2540054321289, + "loss": 0.8345, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6369731426239014, + "rewards/margins": 5.388532638549805, + "rewards/rejected": -2.7515594959259033, + "step": 8174 + }, + { + "epoch": 2.05, + "grad_norm": 4.467505931854248, + "learning_rate": 6.409176969045664e-06, + "logits/chosen": -0.4694811701774597, + "logits/rejected": -0.5575576424598694, + "logps/chosen": -56.424720764160156, + "logps/rejected": -92.95936584472656, + "loss": 0.6282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.967503309249878, + "rewards/margins": 6.593552112579346, + "rewards/rejected": -3.626049041748047, + "step": 8175 + }, + { + "epoch": 2.05, + "grad_norm": 9.94419002532959, + "learning_rate": 6.408422825744265e-06, + "logits/chosen": -0.4703119397163391, + "logits/rejected": -0.5115389823913574, + "logps/chosen": -63.844032287597656, + "logps/rejected": -102.34323120117188, + "loss": 0.78, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.981259346008301, + "rewards/margins": 5.4777398109436035, + "rewards/rejected": -2.496480703353882, + "step": 8176 + }, + { + "epoch": 2.05, + "grad_norm": 5.85857629776001, + "learning_rate": 6.407668647639241e-06, + "logits/chosen": -0.4683735966682434, + "logits/rejected": -0.480942964553833, + "logps/chosen": -51.08639144897461, + "logps/rejected": -98.05986022949219, + "loss": 0.6773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9927642345428467, + "rewards/margins": 4.788058280944824, + "rewards/rejected": -1.795294165611267, + "step": 8177 + }, + { + "epoch": 2.05, + "grad_norm": 16.447614669799805, + "learning_rate": 6.406914434749227e-06, + "logits/chosen": -0.5267863273620605, + "logits/rejected": -0.5969375371932983, + "logps/chosen": -52.77581787109375, + "logps/rejected": -98.44287109375, + "loss": 0.7191, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6698715686798096, + "rewards/margins": 7.188534736633301, + "rewards/rejected": -4.518662929534912, + "step": 8178 + }, + { + "epoch": 2.05, + "grad_norm": 6.157422065734863, + "learning_rate": 6.4061601870928605e-06, + "logits/chosen": -0.43604838848114014, + "logits/rejected": -0.551525354385376, + "logps/chosen": -54.17851638793945, + "logps/rejected": -88.19300842285156, + "loss": 0.7612, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6878974437713623, + "rewards/margins": 6.293042182922363, + "rewards/rejected": -3.6051454544067383, + "step": 8179 + }, + { + "epoch": 2.05, + "grad_norm": 8.257160186767578, + "learning_rate": 6.405405904688781e-06, + "logits/chosen": -0.501380980014801, + "logits/rejected": -0.580392599105835, + "logps/chosen": -54.43939971923828, + "logps/rejected": -85.94693756103516, + "loss": 0.7479, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9203343391418457, + "rewards/margins": 5.600100040435791, + "rewards/rejected": -2.679765224456787, + "step": 8180 + }, + { + "epoch": 2.05, + "grad_norm": 6.250667572021484, + "learning_rate": 6.4046515875556245e-06, + "logits/chosen": -0.5662710070610046, + "logits/rejected": -0.6217435598373413, + "logps/chosen": -58.737640380859375, + "logps/rejected": -93.5442123413086, + "loss": 0.7669, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.739758014678955, + "rewards/margins": 5.1878557205200195, + "rewards/rejected": -2.4480972290039062, + "step": 8181 + }, + { + "epoch": 2.05, + "grad_norm": 11.381248474121094, + "learning_rate": 6.403897235712035e-06, + "logits/chosen": -0.46629422903060913, + "logits/rejected": -0.5589908361434937, + "logps/chosen": -53.91339111328125, + "logps/rejected": -94.4347152709961, + "loss": 0.8929, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0467519760131836, + "rewards/margins": 6.4095282554626465, + "rewards/rejected": -3.362776279449463, + "step": 8182 + }, + { + "epoch": 2.05, + "grad_norm": 5.776915073394775, + "learning_rate": 6.403142849176653e-06, + "logits/chosen": -0.5026271939277649, + "logits/rejected": -0.5786086916923523, + "logps/chosen": -58.26386642456055, + "logps/rejected": -99.09835815429688, + "loss": 0.7163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6955251693725586, + "rewards/margins": 5.476850986480713, + "rewards/rejected": -2.7813258171081543, + "step": 8183 + }, + { + "epoch": 2.05, + "grad_norm": 9.963915824890137, + "learning_rate": 6.402388427968116e-06, + "logits/chosen": -0.4277632236480713, + "logits/rejected": -0.5000869035720825, + "logps/chosen": -57.40333557128906, + "logps/rejected": -100.1180419921875, + "loss": 0.7489, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.876642942428589, + "rewards/margins": 5.350526809692383, + "rewards/rejected": -2.473884105682373, + "step": 8184 + }, + { + "epoch": 2.05, + "grad_norm": 3.493838310241699, + "learning_rate": 6.40163397210507e-06, + "logits/chosen": -0.3986407518386841, + "logits/rejected": -0.4790855646133423, + "logps/chosen": -56.05729675292969, + "logps/rejected": -112.2476577758789, + "loss": 0.6269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9254517555236816, + "rewards/margins": 6.6775336265563965, + "rewards/rejected": -3.7520816326141357, + "step": 8185 + }, + { + "epoch": 2.05, + "grad_norm": 5.047699928283691, + "learning_rate": 6.400879481606159e-06, + "logits/chosen": -0.4970811605453491, + "logits/rejected": -0.5591152906417847, + "logps/chosen": -48.31657791137695, + "logps/rejected": -92.72721099853516, + "loss": 0.6744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3624892234802246, + "rewards/margins": 5.770428657531738, + "rewards/rejected": -2.407939910888672, + "step": 8186 + }, + { + "epoch": 2.05, + "grad_norm": 9.850409507751465, + "learning_rate": 6.400124956490025e-06, + "logits/chosen": -0.40790972113609314, + "logits/rejected": -0.5040444135665894, + "logps/chosen": -60.79174041748047, + "logps/rejected": -96.86156463623047, + "loss": 0.6653, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.867246150970459, + "rewards/margins": 5.95768928527832, + "rewards/rejected": -3.0904438495635986, + "step": 8187 + }, + { + "epoch": 2.05, + "grad_norm": 8.800455093383789, + "learning_rate": 6.399370396775313e-06, + "logits/chosen": -0.47222214937210083, + "logits/rejected": -0.5554621815681458, + "logps/chosen": -59.20138168334961, + "logps/rejected": -90.50346374511719, + "loss": 0.8345, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9314825534820557, + "rewards/margins": 5.430860996246338, + "rewards/rejected": -2.4993784427642822, + "step": 8188 + }, + { + "epoch": 2.05, + "grad_norm": 11.506503105163574, + "learning_rate": 6.398615802480672e-06, + "logits/chosen": -0.5127233862876892, + "logits/rejected": -0.6227297186851501, + "logps/chosen": -56.22706604003906, + "logps/rejected": -75.32941436767578, + "loss": 0.7675, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0579707622528076, + "rewards/margins": 5.3072028160095215, + "rewards/rejected": -2.249232530593872, + "step": 8189 + }, + { + "epoch": 2.05, + "grad_norm": 3.8181846141815186, + "learning_rate": 6.397861173624745e-06, + "logits/chosen": -0.46473008394241333, + "logits/rejected": -0.5001633763313293, + "logps/chosen": -47.1520881652832, + "logps/rejected": -93.67613983154297, + "loss": 0.6733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.112060070037842, + "rewards/margins": 5.847995758056641, + "rewards/rejected": -2.735936164855957, + "step": 8190 + }, + { + "epoch": 2.05, + "grad_norm": 4.4006547927856445, + "learning_rate": 6.3971065102261834e-06, + "logits/chosen": -0.41607776284217834, + "logits/rejected": -0.498260498046875, + "logps/chosen": -58.30624771118164, + "logps/rejected": -107.64813232421875, + "loss": 0.6437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8171377182006836, + "rewards/margins": 6.873244285583496, + "rewards/rejected": -4.056107044219971, + "step": 8191 + }, + { + "epoch": 2.05, + "grad_norm": 4.581463813781738, + "learning_rate": 6.396351812303633e-06, + "logits/chosen": -0.4809649586677551, + "logits/rejected": -0.5395994186401367, + "logps/chosen": -54.952972412109375, + "logps/rejected": -88.08494567871094, + "loss": 0.7803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.018691301345825, + "rewards/margins": 5.051109313964844, + "rewards/rejected": -2.0324184894561768, + "step": 8192 + }, + { + "epoch": 2.05, + "grad_norm": 8.101645469665527, + "learning_rate": 6.3955970798757435e-06, + "logits/chosen": -0.5190922617912292, + "logits/rejected": -0.5688002705574036, + "logps/chosen": -53.8054084777832, + "logps/rejected": -104.68748474121094, + "loss": 0.6596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7651944160461426, + "rewards/margins": 6.1112380027771, + "rewards/rejected": -3.3460440635681152, + "step": 8193 + }, + { + "epoch": 2.05, + "grad_norm": 2.9363648891448975, + "learning_rate": 6.3948423129611684e-06, + "logits/chosen": -0.427415668964386, + "logits/rejected": -0.5252919793128967, + "logps/chosen": -61.769447326660156, + "logps/rejected": -105.48356628417969, + "loss": 0.6002, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.852185010910034, + "rewards/margins": 6.450610160827637, + "rewards/rejected": -3.5984251499176025, + "step": 8194 + }, + { + "epoch": 2.05, + "grad_norm": 3.902031898498535, + "learning_rate": 6.394087511578554e-06, + "logits/chosen": -0.5042410492897034, + "logits/rejected": -0.5152706503868103, + "logps/chosen": -54.482276916503906, + "logps/rejected": -90.47271728515625, + "loss": 0.7503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1247642040252686, + "rewards/margins": 4.921567440032959, + "rewards/rejected": -1.79680335521698, + "step": 8195 + }, + { + "epoch": 2.05, + "grad_norm": 4.867344379425049, + "learning_rate": 6.393332675746554e-06, + "logits/chosen": -0.4302300810813904, + "logits/rejected": -0.5122384428977966, + "logps/chosen": -58.05305480957031, + "logps/rejected": -94.98603820800781, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0389809608459473, + "rewards/margins": 5.512174129486084, + "rewards/rejected": -2.4731929302215576, + "step": 8196 + }, + { + "epoch": 2.05, + "grad_norm": 6.965622425079346, + "learning_rate": 6.392577805483821e-06, + "logits/chosen": -0.5702268481254578, + "logits/rejected": -0.6969149112701416, + "logps/chosen": -58.713340759277344, + "logps/rejected": -85.72956085205078, + "loss": 0.6735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8160715103149414, + "rewards/margins": 6.2161641120910645, + "rewards/rejected": -3.400092601776123, + "step": 8197 + }, + { + "epoch": 2.05, + "grad_norm": 3.9754037857055664, + "learning_rate": 6.3918229008090105e-06, + "logits/chosen": -0.49989479780197144, + "logits/rejected": -0.540658175945282, + "logps/chosen": -49.39298629760742, + "logps/rejected": -105.34483337402344, + "loss": 0.6615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.960977077484131, + "rewards/margins": 5.746235370635986, + "rewards/rejected": -2.7852582931518555, + "step": 8198 + }, + { + "epoch": 2.05, + "grad_norm": 7.57082462310791, + "learning_rate": 6.391067961740776e-06, + "logits/chosen": -0.4712701141834259, + "logits/rejected": -0.5937590003013611, + "logps/chosen": -55.68895721435547, + "logps/rejected": -95.67286682128906, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.84236216545105, + "rewards/margins": 7.141141891479492, + "rewards/rejected": -4.29878044128418, + "step": 8199 + }, + { + "epoch": 2.05, + "grad_norm": 8.999395370483398, + "learning_rate": 6.390312988297773e-06, + "logits/chosen": -0.4614269733428955, + "logits/rejected": -0.5441005825996399, + "logps/chosen": -66.59418487548828, + "logps/rejected": -114.23374938964844, + "loss": 0.6874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5901522636413574, + "rewards/margins": 6.9664225578308105, + "rewards/rejected": -4.376269817352295, + "step": 8200 + }, + { + "epoch": 2.05, + "grad_norm": 8.30452823638916, + "learning_rate": 6.389557980498655e-06, + "logits/chosen": -0.45669108629226685, + "logits/rejected": -0.4971289038658142, + "logps/chosen": -46.17204284667969, + "logps/rejected": -89.0323715209961, + "loss": 0.6851, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.105854034423828, + "rewards/margins": 5.572155952453613, + "rewards/rejected": -2.4663028717041016, + "step": 8201 + }, + { + "epoch": 2.05, + "grad_norm": 5.539242744445801, + "learning_rate": 6.388802938362083e-06, + "logits/chosen": -0.46382883191108704, + "logits/rejected": -0.5723384022712708, + "logps/chosen": -56.21610641479492, + "logps/rejected": -88.8044662475586, + "loss": 0.6231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057037830352783, + "rewards/margins": 6.874734878540039, + "rewards/rejected": -3.817697048187256, + "step": 8202 + }, + { + "epoch": 2.05, + "grad_norm": 5.451137065887451, + "learning_rate": 6.388047861906714e-06, + "logits/chosen": -0.42953163385391235, + "logits/rejected": -0.5316909551620483, + "logps/chosen": -63.86467742919922, + "logps/rejected": -106.63357543945312, + "loss": 0.6394, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.650805711746216, + "rewards/margins": 7.537899971008301, + "rewards/rejected": -4.887094020843506, + "step": 8203 + }, + { + "epoch": 2.05, + "grad_norm": 7.119174957275391, + "learning_rate": 6.387292751151204e-06, + "logits/chosen": -0.4171420931816101, + "logits/rejected": -0.4855988025665283, + "logps/chosen": -63.3876838684082, + "logps/rejected": -99.25498962402344, + "loss": 0.776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8180642127990723, + "rewards/margins": 6.309050559997559, + "rewards/rejected": -3.490985870361328, + "step": 8204 + }, + { + "epoch": 2.05, + "grad_norm": 6.888065814971924, + "learning_rate": 6.386537606114216e-06, + "logits/chosen": -0.3765415549278259, + "logits/rejected": -0.4560042917728424, + "logps/chosen": -60.21287536621094, + "logps/rejected": -88.4241943359375, + "loss": 0.766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.673593044281006, + "rewards/margins": 4.430002689361572, + "rewards/rejected": -1.7564096450805664, + "step": 8205 + }, + { + "epoch": 2.05, + "grad_norm": 3.792531967163086, + "learning_rate": 6.385782426814407e-06, + "logits/chosen": -0.46073195338249207, + "logits/rejected": -0.5653102397918701, + "logps/chosen": -48.479835510253906, + "logps/rejected": -94.02890014648438, + "loss": 0.5289, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.846135139465332, + "rewards/margins": 5.7026801109313965, + "rewards/rejected": -2.8565456867218018, + "step": 8206 + }, + { + "epoch": 2.05, + "grad_norm": 5.434072971343994, + "learning_rate": 6.385027213270442e-06, + "logits/chosen": -0.4490046501159668, + "logits/rejected": -0.5020559430122375, + "logps/chosen": -61.792327880859375, + "logps/rejected": -124.56597900390625, + "loss": 0.6409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8947038650512695, + "rewards/margins": 6.407951354980469, + "rewards/rejected": -3.513247489929199, + "step": 8207 + }, + { + "epoch": 2.05, + "grad_norm": 4.423379898071289, + "learning_rate": 6.384271965500981e-06, + "logits/chosen": -0.5473489761352539, + "logits/rejected": -0.620712399482727, + "logps/chosen": -56.70106887817383, + "logps/rejected": -92.52828216552734, + "loss": 0.7035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8951942920684814, + "rewards/margins": 5.821501731872559, + "rewards/rejected": -2.926307201385498, + "step": 8208 + }, + { + "epoch": 2.05, + "grad_norm": 4.764664173126221, + "learning_rate": 6.383516683524687e-06, + "logits/chosen": -0.4892430007457733, + "logits/rejected": -0.5608202219009399, + "logps/chosen": -50.32059097290039, + "logps/rejected": -90.32307434082031, + "loss": 0.6094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.006342649459839, + "rewards/margins": 5.751005172729492, + "rewards/rejected": -2.7446627616882324, + "step": 8209 + }, + { + "epoch": 2.05, + "grad_norm": 3.8965415954589844, + "learning_rate": 6.382761367360225e-06, + "logits/chosen": -0.508385181427002, + "logits/rejected": -0.5223661065101624, + "logps/chosen": -65.18443298339844, + "logps/rejected": -100.47789001464844, + "loss": 0.7039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.01621675491333, + "rewards/margins": 6.121632099151611, + "rewards/rejected": -3.1054155826568604, + "step": 8210 + }, + { + "epoch": 2.05, + "grad_norm": 5.237210750579834, + "learning_rate": 6.382006017026257e-06, + "logits/chosen": -0.4550178647041321, + "logits/rejected": -0.5304456353187561, + "logps/chosen": -59.768516540527344, + "logps/rejected": -79.64198303222656, + "loss": 0.783, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9927473068237305, + "rewards/margins": 4.665904998779297, + "rewards/rejected": -1.673158049583435, + "step": 8211 + }, + { + "epoch": 2.05, + "grad_norm": 3.1534876823425293, + "learning_rate": 6.381250632541452e-06, + "logits/chosen": -0.4738588333129883, + "logits/rejected": -0.5738983154296875, + "logps/chosen": -48.11127471923828, + "logps/rejected": -96.78019714355469, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.131061315536499, + "rewards/margins": 6.359279632568359, + "rewards/rejected": -3.2282180786132812, + "step": 8212 + }, + { + "epoch": 2.05, + "grad_norm": 2.767634630203247, + "learning_rate": 6.380495213924475e-06, + "logits/chosen": -0.4317070245742798, + "logits/rejected": -0.6036068201065063, + "logps/chosen": -50.677486419677734, + "logps/rejected": -78.8400650024414, + "loss": 0.5603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0461013317108154, + "rewards/margins": 6.048676490783691, + "rewards/rejected": -3.002574920654297, + "step": 8213 + }, + { + "epoch": 2.05, + "grad_norm": 18.39146614074707, + "learning_rate": 6.379739761193992e-06, + "logits/chosen": -0.4704316258430481, + "logits/rejected": -0.530816376209259, + "logps/chosen": -74.09515380859375, + "logps/rejected": -122.7702407836914, + "loss": 0.8228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6428229808807373, + "rewards/margins": 6.615873336791992, + "rewards/rejected": -3.973050117492676, + "step": 8214 + }, + { + "epoch": 2.06, + "grad_norm": 4.036341667175293, + "learning_rate": 6.3789842743686725e-06, + "logits/chosen": -0.48873475193977356, + "logits/rejected": -0.5645664930343628, + "logps/chosen": -72.16187286376953, + "logps/rejected": -103.7928466796875, + "loss": 0.7057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7565419673919678, + "rewards/margins": 6.290951251983643, + "rewards/rejected": -3.534409761428833, + "step": 8215 + }, + { + "epoch": 2.06, + "grad_norm": 9.843923568725586, + "learning_rate": 6.378228753467184e-06, + "logits/chosen": -0.4952300190925598, + "logits/rejected": -0.5748975276947021, + "logps/chosen": -59.78143310546875, + "logps/rejected": -96.23833465576172, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0058536529541016, + "rewards/margins": 6.219178676605225, + "rewards/rejected": -3.213324785232544, + "step": 8216 + }, + { + "epoch": 2.06, + "grad_norm": 7.48940372467041, + "learning_rate": 6.377473198508198e-06, + "logits/chosen": -0.4651251435279846, + "logits/rejected": -0.5513834953308105, + "logps/chosen": -67.33251953125, + "logps/rejected": -100.74885559082031, + "loss": 0.7909, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.043698787689209, + "rewards/margins": 6.273032188415527, + "rewards/rejected": -3.2293338775634766, + "step": 8217 + }, + { + "epoch": 2.06, + "grad_norm": 4.327616214752197, + "learning_rate": 6.376717609510383e-06, + "logits/chosen": -0.39956948161125183, + "logits/rejected": -0.485270619392395, + "logps/chosen": -53.39717483520508, + "logps/rejected": -81.4510498046875, + "loss": 0.6749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.820803642272949, + "rewards/margins": 5.656088352203369, + "rewards/rejected": -2.835284471511841, + "step": 8218 + }, + { + "epoch": 2.06, + "grad_norm": 6.510993003845215, + "learning_rate": 6.375961986492414e-06, + "logits/chosen": -0.4074366092681885, + "logits/rejected": -0.4941222071647644, + "logps/chosen": -62.25122833251953, + "logps/rejected": -90.15644836425781, + "loss": 0.7653, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9924941062927246, + "rewards/margins": 5.393802642822266, + "rewards/rejected": -2.401308298110962, + "step": 8219 + }, + { + "epoch": 2.06, + "grad_norm": 4.565569877624512, + "learning_rate": 6.375206329472959e-06, + "logits/chosen": -0.48506689071655273, + "logits/rejected": -0.603078305721283, + "logps/chosen": -49.468017578125, + "logps/rejected": -97.80664825439453, + "loss": 0.6949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8860065937042236, + "rewards/margins": 6.208913326263428, + "rewards/rejected": -3.322906970977783, + "step": 8220 + }, + { + "epoch": 2.06, + "grad_norm": 5.568330764770508, + "learning_rate": 6.3744506384706915e-06, + "logits/chosen": -0.49926987290382385, + "logits/rejected": -0.5330763459205627, + "logps/chosen": -50.10710906982422, + "logps/rejected": -104.84823608398438, + "loss": 0.7701, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4451422691345215, + "rewards/margins": 6.782036781311035, + "rewards/rejected": -3.3368935585021973, + "step": 8221 + }, + { + "epoch": 2.06, + "grad_norm": 2.9536216259002686, + "learning_rate": 6.373694913504288e-06, + "logits/chosen": -0.4634472131729126, + "logits/rejected": -0.5610988140106201, + "logps/chosen": -53.47501754760742, + "logps/rejected": -93.56591033935547, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085034132003784, + "rewards/margins": 6.497813701629639, + "rewards/rejected": -3.4127795696258545, + "step": 8222 + }, + { + "epoch": 2.06, + "grad_norm": 4.709991455078125, + "learning_rate": 6.372939154592423e-06, + "logits/chosen": -0.42677241563796997, + "logits/rejected": -0.6072723865509033, + "logps/chosen": -52.608863830566406, + "logps/rejected": -70.99127960205078, + "loss": 0.6505, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0645337104797363, + "rewards/margins": 6.178092002868652, + "rewards/rejected": -3.1135592460632324, + "step": 8223 + }, + { + "epoch": 2.06, + "grad_norm": 21.195064544677734, + "learning_rate": 6.37218336175377e-06, + "logits/chosen": -0.47033244371414185, + "logits/rejected": -0.536313533782959, + "logps/chosen": -60.96632766723633, + "logps/rejected": -96.0149917602539, + "loss": 0.7837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.149981737136841, + "rewards/margins": 5.296902179718018, + "rewards/rejected": -2.1469204425811768, + "step": 8224 + }, + { + "epoch": 2.06, + "grad_norm": 3.3124682903289795, + "learning_rate": 6.371427535007008e-06, + "logits/chosen": -0.4756840765476227, + "logits/rejected": -0.5506911873817444, + "logps/chosen": -55.70502853393555, + "logps/rejected": -99.94353485107422, + "loss": 0.6336, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.210730791091919, + "rewards/margins": 6.291023254394531, + "rewards/rejected": -3.080292224884033, + "step": 8225 + }, + { + "epoch": 2.06, + "grad_norm": 4.817370414733887, + "learning_rate": 6.370671674370811e-06, + "logits/chosen": -0.44491082429885864, + "logits/rejected": -0.5330001711845398, + "logps/chosen": -42.4622917175293, + "logps/rejected": -88.53524017333984, + "loss": 0.6263, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7837114334106445, + "rewards/margins": 5.593819618225098, + "rewards/rejected": -2.8101084232330322, + "step": 8226 + }, + { + "epoch": 2.06, + "grad_norm": 10.298635482788086, + "learning_rate": 6.36991577986386e-06, + "logits/chosen": -0.39415496587753296, + "logits/rejected": -0.4558740556240082, + "logps/chosen": -68.70111083984375, + "logps/rejected": -96.18010711669922, + "loss": 0.7918, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.85379695892334, + "rewards/margins": 4.427670955657959, + "rewards/rejected": -1.573873519897461, + "step": 8227 + }, + { + "epoch": 2.06, + "grad_norm": 3.781418800354004, + "learning_rate": 6.369159851504833e-06, + "logits/chosen": -0.4318813681602478, + "logits/rejected": -0.5342053174972534, + "logps/chosen": -53.98280334472656, + "logps/rejected": -93.93045806884766, + "loss": 0.6448, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0507686138153076, + "rewards/margins": 5.986939430236816, + "rewards/rejected": -2.9361705780029297, + "step": 8228 + }, + { + "epoch": 2.06, + "grad_norm": 2.661980152130127, + "learning_rate": 6.36840388931241e-06, + "logits/chosen": -0.4615345299243927, + "logits/rejected": -0.4792509078979492, + "logps/chosen": -53.832557678222656, + "logps/rejected": -114.05498504638672, + "loss": 0.5753, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.058478832244873, + "rewards/margins": 6.408141136169434, + "rewards/rejected": -3.3496623039245605, + "step": 8229 + }, + { + "epoch": 2.06, + "grad_norm": 4.231681823730469, + "learning_rate": 6.367647893305271e-06, + "logits/chosen": -0.46129268407821655, + "logits/rejected": -0.5298911333084106, + "logps/chosen": -54.16320037841797, + "logps/rejected": -97.42304229736328, + "loss": 0.6267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.106710910797119, + "rewards/margins": 5.476884365081787, + "rewards/rejected": -2.370173454284668, + "step": 8230 + }, + { + "epoch": 2.06, + "grad_norm": 3.3098301887512207, + "learning_rate": 6.366891863502097e-06, + "logits/chosen": -0.5016081929206848, + "logits/rejected": -0.5183660387992859, + "logps/chosen": -49.90216827392578, + "logps/rejected": -103.49695587158203, + "loss": 0.7008, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1992526054382324, + "rewards/margins": 5.025585651397705, + "rewards/rejected": -1.8263331651687622, + "step": 8231 + }, + { + "epoch": 2.06, + "grad_norm": 8.896896362304688, + "learning_rate": 6.366135799921573e-06, + "logits/chosen": -0.5171394348144531, + "logits/rejected": -0.5697371959686279, + "logps/chosen": -63.17881774902344, + "logps/rejected": -104.8434066772461, + "loss": 0.7603, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8991873264312744, + "rewards/margins": 5.182429313659668, + "rewards/rejected": -2.2832424640655518, + "step": 8232 + }, + { + "epoch": 2.06, + "grad_norm": 3.2964799404144287, + "learning_rate": 6.365379702582378e-06, + "logits/chosen": -0.5590622425079346, + "logits/rejected": -0.6364111304283142, + "logps/chosen": -51.1592903137207, + "logps/rejected": -80.87975311279297, + "loss": 0.5915, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2471680641174316, + "rewards/margins": 5.688840866088867, + "rewards/rejected": -2.4416728019714355, + "step": 8233 + }, + { + "epoch": 2.06, + "grad_norm": 7.074417591094971, + "learning_rate": 6.364623571503201e-06, + "logits/chosen": -0.4824681878089905, + "logits/rejected": -0.5405783653259277, + "logps/chosen": -55.17372512817383, + "logps/rejected": -92.18315887451172, + "loss": 0.7911, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.894047975540161, + "rewards/margins": 5.090264320373535, + "rewards/rejected": -2.1962153911590576, + "step": 8234 + }, + { + "epoch": 2.06, + "grad_norm": 3.5351874828338623, + "learning_rate": 6.363867406702722e-06, + "logits/chosen": -0.4571364223957062, + "logits/rejected": -0.5377639532089233, + "logps/chosen": -55.33991241455078, + "logps/rejected": -99.25077056884766, + "loss": 0.6667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904071569442749, + "rewards/margins": 6.237794399261475, + "rewards/rejected": -3.3337225914001465, + "step": 8235 + }, + { + "epoch": 2.06, + "grad_norm": 11.095337867736816, + "learning_rate": 6.36311120819963e-06, + "logits/chosen": -0.44600909948349, + "logits/rejected": -0.5431385040283203, + "logps/chosen": -53.384254455566406, + "logps/rejected": -107.43576049804688, + "loss": 0.655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9824905395507812, + "rewards/margins": 6.883255958557129, + "rewards/rejected": -3.9007656574249268, + "step": 8236 + }, + { + "epoch": 2.06, + "grad_norm": 5.305983543395996, + "learning_rate": 6.36235497601261e-06, + "logits/chosen": -0.3341047465801239, + "logits/rejected": -0.45641353726387024, + "logps/chosen": -63.575469970703125, + "logps/rejected": -97.12721252441406, + "loss": 0.5905, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.80717396736145, + "rewards/margins": 6.02248477935791, + "rewards/rejected": -3.215310573577881, + "step": 8237 + }, + { + "epoch": 2.06, + "grad_norm": 4.230191707611084, + "learning_rate": 6.361598710160348e-06, + "logits/chosen": -0.49876153469085693, + "logits/rejected": -0.5735828876495361, + "logps/chosen": -52.183963775634766, + "logps/rejected": -98.9202880859375, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9633290767669678, + "rewards/margins": 6.3943071365356445, + "rewards/rejected": -3.4309780597686768, + "step": 8238 + }, + { + "epoch": 2.06, + "grad_norm": 3.6614036560058594, + "learning_rate": 6.360842410661535e-06, + "logits/chosen": -0.442363440990448, + "logits/rejected": -0.5576187372207642, + "logps/chosen": -61.25003433227539, + "logps/rejected": -98.319580078125, + "loss": 0.648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8836097717285156, + "rewards/margins": 6.041217803955078, + "rewards/rejected": -3.1576080322265625, + "step": 8239 + }, + { + "epoch": 2.06, + "grad_norm": 2.4807324409484863, + "learning_rate": 6.360086077534859e-06, + "logits/chosen": -0.5034642815589905, + "logits/rejected": -0.5842865109443665, + "logps/chosen": -50.76108169555664, + "logps/rejected": -98.11901092529297, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860398530960083, + "rewards/margins": 5.69428825378418, + "rewards/rejected": -2.833889961242676, + "step": 8240 + }, + { + "epoch": 2.06, + "grad_norm": 6.258822917938232, + "learning_rate": 6.359329710799007e-06, + "logits/chosen": -0.48561543226242065, + "logits/rejected": -0.5588386058807373, + "logps/chosen": -54.628631591796875, + "logps/rejected": -90.5014877319336, + "loss": 0.5902, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8689656257629395, + "rewards/margins": 5.344090461730957, + "rewards/rejected": -2.4751248359680176, + "step": 8241 + }, + { + "epoch": 2.06, + "grad_norm": 3.550626039505005, + "learning_rate": 6.3585733104726736e-06, + "logits/chosen": -0.4644702076911926, + "logits/rejected": -0.4946780502796173, + "logps/chosen": -69.29432678222656, + "logps/rejected": -92.77911376953125, + "loss": 0.7337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.978938579559326, + "rewards/margins": 4.98969841003418, + "rewards/rejected": -2.0107600688934326, + "step": 8242 + }, + { + "epoch": 2.06, + "grad_norm": 12.08102035522461, + "learning_rate": 6.357816876574549e-06, + "logits/chosen": -0.5268805027008057, + "logits/rejected": -0.6220961809158325, + "logps/chosen": -59.7902946472168, + "logps/rejected": -92.04922485351562, + "loss": 0.9022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.890164375305176, + "rewards/margins": 6.499238014221191, + "rewards/rejected": -3.609074115753174, + "step": 8243 + }, + { + "epoch": 2.06, + "grad_norm": 6.0917816162109375, + "learning_rate": 6.357060409123326e-06, + "logits/chosen": -0.4312968850135803, + "logits/rejected": -0.5690851807594299, + "logps/chosen": -54.88981628417969, + "logps/rejected": -98.98178100585938, + "loss": 0.6087, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7447896003723145, + "rewards/margins": 7.032101631164551, + "rewards/rejected": -4.287312030792236, + "step": 8244 + }, + { + "epoch": 2.06, + "grad_norm": 6.390414714813232, + "learning_rate": 6.356303908137695e-06, + "logits/chosen": -0.49731379747390747, + "logits/rejected": -0.5900935530662537, + "logps/chosen": -58.64699935913086, + "logps/rejected": -95.76280212402344, + "loss": 0.7747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0016403198242188, + "rewards/margins": 6.63048791885376, + "rewards/rejected": -3.62884783744812, + "step": 8245 + }, + { + "epoch": 2.06, + "grad_norm": 12.217402458190918, + "learning_rate": 6.355547373636352e-06, + "logits/chosen": -0.44508135318756104, + "logits/rejected": -0.4893784821033478, + "logps/chosen": -59.6874885559082, + "logps/rejected": -108.2938461303711, + "loss": 0.7564, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9395389556884766, + "rewards/margins": 5.383070945739746, + "rewards/rejected": -2.4435322284698486, + "step": 8246 + }, + { + "epoch": 2.06, + "grad_norm": 3.270812749862671, + "learning_rate": 6.354790805637991e-06, + "logits/chosen": -0.5331653356552124, + "logits/rejected": -0.652053952217102, + "logps/chosen": -58.46731948852539, + "logps/rejected": -91.98944091796875, + "loss": 0.6246, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.029919147491455, + "rewards/margins": 6.863785266876221, + "rewards/rejected": -3.8338663578033447, + "step": 8247 + }, + { + "epoch": 2.06, + "grad_norm": 12.036766052246094, + "learning_rate": 6.354034204161311e-06, + "logits/chosen": -0.44549787044525146, + "logits/rejected": -0.4972873032093048, + "logps/chosen": -63.73910140991211, + "logps/rejected": -99.088623046875, + "loss": 0.8969, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7626571655273438, + "rewards/margins": 5.043056488037109, + "rewards/rejected": -2.280399799346924, + "step": 8248 + }, + { + "epoch": 2.06, + "grad_norm": 3.276463747024536, + "learning_rate": 6.353277569225003e-06, + "logits/chosen": -0.524755597114563, + "logits/rejected": -0.6306408643722534, + "logps/chosen": -56.45448303222656, + "logps/rejected": -86.08000183105469, + "loss": 0.6008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.944549083709717, + "rewards/margins": 5.304646015167236, + "rewards/rejected": -2.3600969314575195, + "step": 8249 + }, + { + "epoch": 2.06, + "grad_norm": 2.833813190460205, + "learning_rate": 6.352520900847767e-06, + "logits/chosen": -0.5342098474502563, + "logits/rejected": -0.6453262567520142, + "logps/chosen": -65.23833465576172, + "logps/rejected": -79.91291809082031, + "loss": 0.647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.312959671020508, + "rewards/margins": 6.320725917816162, + "rewards/rejected": -3.0077662467956543, + "step": 8250 + }, + { + "epoch": 2.06, + "grad_norm": 4.728328227996826, + "learning_rate": 6.351764199048304e-06, + "logits/chosen": -0.46592965722084045, + "logits/rejected": -0.5099257230758667, + "logps/chosen": -53.62751770019531, + "logps/rejected": -106.13902282714844, + "loss": 0.6298, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7442216873168945, + "rewards/margins": 5.781053066253662, + "rewards/rejected": -3.0368313789367676, + "step": 8251 + }, + { + "epoch": 2.06, + "grad_norm": 19.155746459960938, + "learning_rate": 6.351007463845307e-06, + "logits/chosen": -0.49689817428588867, + "logits/rejected": -0.5580900311470032, + "logps/chosen": -45.8382568359375, + "logps/rejected": -91.90057373046875, + "loss": 0.7091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.987363576889038, + "rewards/margins": 6.002540588378906, + "rewards/rejected": -3.015177011489868, + "step": 8252 + }, + { + "epoch": 2.06, + "grad_norm": 4.108516693115234, + "learning_rate": 6.350250695257478e-06, + "logits/chosen": -0.47845205664634705, + "logits/rejected": -0.5735681056976318, + "logps/chosen": -57.014801025390625, + "logps/rejected": -98.22030639648438, + "loss": 0.6653, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9786183834075928, + "rewards/margins": 6.756474018096924, + "rewards/rejected": -3.777855634689331, + "step": 8253 + }, + { + "epoch": 2.06, + "grad_norm": 1.7397252321243286, + "learning_rate": 6.3494938933035196e-06, + "logits/chosen": -0.4209236204624176, + "logits/rejected": -0.5567165613174438, + "logps/chosen": -65.57711791992188, + "logps/rejected": -108.6137466430664, + "loss": 0.5265, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.117847204208374, + "rewards/margins": 7.7455735206604, + "rewards/rejected": -4.627727031707764, + "step": 8254 + }, + { + "epoch": 2.07, + "grad_norm": 4.1505126953125, + "learning_rate": 6.348737058002132e-06, + "logits/chosen": -0.47554558515548706, + "logits/rejected": -0.5404978394508362, + "logps/chosen": -51.329437255859375, + "logps/rejected": -93.99697875976562, + "loss": 0.6459, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1814041137695312, + "rewards/margins": 5.680359840393066, + "rewards/rejected": -2.498955249786377, + "step": 8255 + }, + { + "epoch": 2.07, + "grad_norm": 7.906711101531982, + "learning_rate": 6.347980189372015e-06, + "logits/chosen": -0.4984334111213684, + "logits/rejected": -0.5568580627441406, + "logps/chosen": -55.10997772216797, + "logps/rejected": -106.75198364257812, + "loss": 0.6471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4801957607269287, + "rewards/margins": 5.637531280517578, + "rewards/rejected": -3.1573357582092285, + "step": 8256 + }, + { + "epoch": 2.07, + "grad_norm": 18.50690269470215, + "learning_rate": 6.347223287431876e-06, + "logits/chosen": -0.38246604800224304, + "logits/rejected": -0.47209030389785767, + "logps/chosen": -61.891685485839844, + "logps/rejected": -103.42569732666016, + "loss": 0.7948, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.806607246398926, + "rewards/margins": 5.259564399719238, + "rewards/rejected": -2.4529566764831543, + "step": 8257 + }, + { + "epoch": 2.07, + "grad_norm": 8.126667022705078, + "learning_rate": 6.346466352200416e-06, + "logits/chosen": -0.4963948130607605, + "logits/rejected": -0.5788624286651611, + "logps/chosen": -48.08102035522461, + "logps/rejected": -83.60755920410156, + "loss": 0.5779, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1735446453094482, + "rewards/margins": 6.155786514282227, + "rewards/rejected": -2.982241630554199, + "step": 8258 + }, + { + "epoch": 2.07, + "grad_norm": 2.803074359893799, + "learning_rate": 6.345709383696339e-06, + "logits/chosen": -0.4523284137248993, + "logits/rejected": -0.5596222877502441, + "logps/chosen": -59.80632400512695, + "logps/rejected": -90.1318588256836, + "loss": 0.5731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118244171142578, + "rewards/margins": 6.658528804779053, + "rewards/rejected": -3.5402843952178955, + "step": 8259 + }, + { + "epoch": 2.07, + "grad_norm": 3.4866631031036377, + "learning_rate": 6.344952381938354e-06, + "logits/chosen": -0.5258438587188721, + "logits/rejected": -0.6245081424713135, + "logps/chosen": -61.42315673828125, + "logps/rejected": -95.71229553222656, + "loss": 0.6352, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1147217750549316, + "rewards/margins": 6.1160078048706055, + "rewards/rejected": -3.0012857913970947, + "step": 8260 + }, + { + "epoch": 2.07, + "grad_norm": 3.319009304046631, + "learning_rate": 6.344195346945163e-06, + "logits/chosen": -0.37497562170028687, + "logits/rejected": -0.47322726249694824, + "logps/chosen": -70.6076889038086, + "logps/rejected": -98.03822326660156, + "loss": 0.6647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9608442783355713, + "rewards/margins": 4.898685932159424, + "rewards/rejected": -1.9378418922424316, + "step": 8261 + }, + { + "epoch": 2.07, + "grad_norm": 27.723922729492188, + "learning_rate": 6.3434382787354764e-06, + "logits/chosen": -0.4571213126182556, + "logits/rejected": -0.4893382787704468, + "logps/chosen": -55.115272521972656, + "logps/rejected": -94.62651824951172, + "loss": 0.9145, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.773446559906006, + "rewards/margins": 4.867005348205566, + "rewards/rejected": -2.0935590267181396, + "step": 8262 + }, + { + "epoch": 2.07, + "grad_norm": 11.620266914367676, + "learning_rate": 6.342681177328001e-06, + "logits/chosen": -0.4449659585952759, + "logits/rejected": -0.5132677555084229, + "logps/chosen": -65.26506805419922, + "logps/rejected": -97.85798645019531, + "loss": 0.7493, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.052731990814209, + "rewards/margins": 5.070076942443848, + "rewards/rejected": -2.0173444747924805, + "step": 8263 + }, + { + "epoch": 2.07, + "grad_norm": 10.145733833312988, + "learning_rate": 6.341924042741444e-06, + "logits/chosen": -0.4617610573768616, + "logits/rejected": -0.5437187552452087, + "logps/chosen": -52.782691955566406, + "logps/rejected": -89.08631134033203, + "loss": 0.6078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2387924194335938, + "rewards/margins": 5.441577434539795, + "rewards/rejected": -2.202785015106201, + "step": 8264 + }, + { + "epoch": 2.07, + "grad_norm": 6.112497329711914, + "learning_rate": 6.341166874994518e-06, + "logits/chosen": -0.43947747349739075, + "logits/rejected": -0.5002440810203552, + "logps/chosen": -55.80713653564453, + "logps/rejected": -91.26665496826172, + "loss": 0.664, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8009564876556396, + "rewards/margins": 5.119619846343994, + "rewards/rejected": -2.3186631202697754, + "step": 8265 + }, + { + "epoch": 2.07, + "grad_norm": 3.667102813720703, + "learning_rate": 6.340409674105932e-06, + "logits/chosen": -0.4424310624599457, + "logits/rejected": -0.5855097770690918, + "logps/chosen": -52.81478500366211, + "logps/rejected": -72.98812866210938, + "loss": 0.5647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.156452178955078, + "rewards/margins": 6.101295471191406, + "rewards/rejected": -2.9448437690734863, + "step": 8266 + }, + { + "epoch": 2.07, + "grad_norm": 17.955978393554688, + "learning_rate": 6.339652440094396e-06, + "logits/chosen": -0.4530162811279297, + "logits/rejected": -0.528205394744873, + "logps/chosen": -63.56048583984375, + "logps/rejected": -97.12068176269531, + "loss": 0.7645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4268546104431152, + "rewards/margins": 5.877455234527588, + "rewards/rejected": -3.450601100921631, + "step": 8267 + }, + { + "epoch": 2.07, + "grad_norm": 14.530710220336914, + "learning_rate": 6.338895172978625e-06, + "logits/chosen": -0.40047329664230347, + "logits/rejected": -0.499426007270813, + "logps/chosen": -55.948707580566406, + "logps/rejected": -82.32162475585938, + "loss": 0.7754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6538050174713135, + "rewards/margins": 4.592219352722168, + "rewards/rejected": -1.9384140968322754, + "step": 8268 + }, + { + "epoch": 2.07, + "grad_norm": 12.87991714477539, + "learning_rate": 6.33813787277733e-06, + "logits/chosen": -0.4575998783111572, + "logits/rejected": -0.4707163870334625, + "logps/chosen": -58.84236526489258, + "logps/rejected": -101.15448760986328, + "loss": 0.8343, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.592555046081543, + "rewards/margins": 4.5050458908081055, + "rewards/rejected": -1.9124914407730103, + "step": 8269 + }, + { + "epoch": 2.07, + "grad_norm": 2.9719350337982178, + "learning_rate": 6.337380539509224e-06, + "logits/chosen": -0.456883043050766, + "logits/rejected": -0.5047220587730408, + "logps/chosen": -48.50483322143555, + "logps/rejected": -99.65884399414062, + "loss": 0.5447, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0865936279296875, + "rewards/margins": 6.526005268096924, + "rewards/rejected": -3.43941068649292, + "step": 8270 + }, + { + "epoch": 2.07, + "grad_norm": 10.632646560668945, + "learning_rate": 6.336623173193022e-06, + "logits/chosen": -0.4558620750904083, + "logits/rejected": -0.5290902256965637, + "logps/chosen": -61.17469787597656, + "logps/rejected": -81.7137451171875, + "loss": 0.7761, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.985157012939453, + "rewards/margins": 5.325767517089844, + "rewards/rejected": -2.3406105041503906, + "step": 8271 + }, + { + "epoch": 2.07, + "grad_norm": 5.303518295288086, + "learning_rate": 6.335865773847441e-06, + "logits/chosen": -0.4455305337905884, + "logits/rejected": -0.49883806705474854, + "logps/chosen": -51.351158142089844, + "logps/rejected": -101.82200622558594, + "loss": 0.6231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0814013481140137, + "rewards/margins": 5.13736629486084, + "rewards/rejected": -2.055964946746826, + "step": 8272 + }, + { + "epoch": 2.07, + "grad_norm": 3.7879106998443604, + "learning_rate": 6.3351083414911955e-06, + "logits/chosen": -0.4780743420124054, + "logits/rejected": -0.5508061051368713, + "logps/chosen": -53.00640869140625, + "logps/rejected": -93.1565170288086, + "loss": 0.6032, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0357985496520996, + "rewards/margins": 5.957871437072754, + "rewards/rejected": -2.9220733642578125, + "step": 8273 + }, + { + "epoch": 2.07, + "grad_norm": 5.156782150268555, + "learning_rate": 6.334350876143002e-06, + "logits/chosen": -0.4258112907409668, + "logits/rejected": -0.5203675627708435, + "logps/chosen": -49.70848083496094, + "logps/rejected": -91.18827819824219, + "loss": 0.6856, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2546207904815674, + "rewards/margins": 6.092164993286133, + "rewards/rejected": -2.8375444412231445, + "step": 8274 + }, + { + "epoch": 2.07, + "grad_norm": 11.505626678466797, + "learning_rate": 6.333593377821581e-06, + "logits/chosen": -0.4148488938808441, + "logits/rejected": -0.4852057695388794, + "logps/chosen": -58.829490661621094, + "logps/rejected": -92.31651306152344, + "loss": 0.7117, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2675647735595703, + "rewards/margins": 6.12271785736084, + "rewards/rejected": -2.8551526069641113, + "step": 8275 + }, + { + "epoch": 2.07, + "grad_norm": 3.0930705070495605, + "learning_rate": 6.332835846545647e-06, + "logits/chosen": -0.4923238456249237, + "logits/rejected": -0.5782176852226257, + "logps/chosen": -49.544700622558594, + "logps/rejected": -96.98910522460938, + "loss": 0.5993, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1773922443389893, + "rewards/margins": 6.147228240966797, + "rewards/rejected": -2.9698357582092285, + "step": 8276 + }, + { + "epoch": 2.07, + "grad_norm": 6.4874420166015625, + "learning_rate": 6.332078282333922e-06, + "logits/chosen": -0.4866165518760681, + "logits/rejected": -0.5190610289573669, + "logps/chosen": -46.13441467285156, + "logps/rejected": -90.55765533447266, + "loss": 0.7931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0336008071899414, + "rewards/margins": 5.703668117523193, + "rewards/rejected": -2.670067310333252, + "step": 8277 + }, + { + "epoch": 2.07, + "grad_norm": 6.004758358001709, + "learning_rate": 6.331320685205125e-06, + "logits/chosen": -0.49197837710380554, + "logits/rejected": -0.5754692554473877, + "logps/chosen": -56.3773078918457, + "logps/rejected": -84.26375579833984, + "loss": 0.6535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.339902877807617, + "rewards/margins": 5.242565631866455, + "rewards/rejected": -1.902662754058838, + "step": 8278 + }, + { + "epoch": 2.07, + "grad_norm": 5.291046619415283, + "learning_rate": 6.33056305517798e-06, + "logits/chosen": -0.4814448058605194, + "logits/rejected": -0.5377368927001953, + "logps/chosen": -56.398075103759766, + "logps/rejected": -96.79070281982422, + "loss": 0.7356, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.024341583251953, + "rewards/margins": 5.797946929931641, + "rewards/rejected": -2.7736058235168457, + "step": 8279 + }, + { + "epoch": 2.07, + "grad_norm": 6.729601860046387, + "learning_rate": 6.329805392271206e-06, + "logits/chosen": -0.42134493589401245, + "logits/rejected": -0.5101615786552429, + "logps/chosen": -54.1515007019043, + "logps/rejected": -82.07520294189453, + "loss": 0.6217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9144344329833984, + "rewards/margins": 6.014575958251953, + "rewards/rejected": -3.100141763687134, + "step": 8280 + }, + { + "epoch": 2.07, + "grad_norm": 8.235504150390625, + "learning_rate": 6.329047696503526e-06, + "logits/chosen": -0.5343879461288452, + "logits/rejected": -0.5272631049156189, + "logps/chosen": -53.714927673339844, + "logps/rejected": -101.85723114013672, + "loss": 0.9226, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9205567836761475, + "rewards/margins": 4.633158206939697, + "rewards/rejected": -1.712601661682129, + "step": 8281 + }, + { + "epoch": 2.07, + "grad_norm": 5.654296875, + "learning_rate": 6.3282899678936636e-06, + "logits/chosen": -0.45022547245025635, + "logits/rejected": -0.5886824727058411, + "logps/chosen": -50.69166946411133, + "logps/rejected": -79.7609634399414, + "loss": 0.7102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8471004962921143, + "rewards/margins": 5.182538032531738, + "rewards/rejected": -2.335437297821045, + "step": 8282 + }, + { + "epoch": 2.07, + "grad_norm": 5.006932735443115, + "learning_rate": 6.327532206460343e-06, + "logits/chosen": -0.49063900113105774, + "logits/rejected": -0.5260741710662842, + "logps/chosen": -50.38722229003906, + "logps/rejected": -94.03691101074219, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1104862689971924, + "rewards/margins": 6.160978317260742, + "rewards/rejected": -3.050492286682129, + "step": 8283 + }, + { + "epoch": 2.07, + "grad_norm": 3.332216739654541, + "learning_rate": 6.3267744122222895e-06, + "logits/chosen": -0.5178627371788025, + "logits/rejected": -0.6373425722122192, + "logps/chosen": -60.32940673828125, + "logps/rejected": -98.47575378417969, + "loss": 0.611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1126582622528076, + "rewards/margins": 7.142400741577148, + "rewards/rejected": -4.029742240905762, + "step": 8284 + }, + { + "epoch": 2.07, + "grad_norm": 6.002993106842041, + "learning_rate": 6.32601658519823e-06, + "logits/chosen": -0.48255300521850586, + "logits/rejected": -0.5794733166694641, + "logps/chosen": -61.1832160949707, + "logps/rejected": -94.92169189453125, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6678051948547363, + "rewards/margins": 5.693479061126709, + "rewards/rejected": -3.0256736278533936, + "step": 8285 + }, + { + "epoch": 2.07, + "grad_norm": 4.401938438415527, + "learning_rate": 6.3252587254068896e-06, + "logits/chosen": -0.5106470584869385, + "logits/rejected": -0.6023125052452087, + "logps/chosen": -47.20866394042969, + "logps/rejected": -77.77328491210938, + "loss": 0.6186, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118562936782837, + "rewards/margins": 5.876925468444824, + "rewards/rejected": -2.7583630084991455, + "step": 8286 + }, + { + "epoch": 2.07, + "grad_norm": 13.895116806030273, + "learning_rate": 6.3245008328669965e-06, + "logits/chosen": -0.534648597240448, + "logits/rejected": -0.6139554977416992, + "logps/chosen": -58.173553466796875, + "logps/rejected": -98.11730194091797, + "loss": 0.699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1938233375549316, + "rewards/margins": 6.727295398712158, + "rewards/rejected": -3.5334718227386475, + "step": 8287 + }, + { + "epoch": 2.07, + "grad_norm": 3.987976551055908, + "learning_rate": 6.3237429075972785e-06, + "logits/chosen": -0.49834513664245605, + "logits/rejected": -0.5556640625, + "logps/chosen": -60.395233154296875, + "logps/rejected": -98.57202911376953, + "loss": 0.7363, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9068617820739746, + "rewards/margins": 5.757264137268066, + "rewards/rejected": -2.8504021167755127, + "step": 8288 + }, + { + "epoch": 2.07, + "grad_norm": 6.989846229553223, + "learning_rate": 6.322984949616466e-06, + "logits/chosen": -0.5233162045478821, + "logits/rejected": -0.5571337342262268, + "logps/chosen": -57.05230712890625, + "logps/rejected": -101.2023696899414, + "loss": 0.8042, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.639724016189575, + "rewards/margins": 5.1217570304870605, + "rewards/rejected": -2.4820332527160645, + "step": 8289 + }, + { + "epoch": 2.07, + "grad_norm": 6.2575154304504395, + "learning_rate": 6.322226958943287e-06, + "logits/chosen": -0.4574839174747467, + "logits/rejected": -0.5303065776824951, + "logps/chosen": -55.558937072753906, + "logps/rejected": -86.30721282958984, + "loss": 0.7385, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.756873369216919, + "rewards/margins": 5.088754653930664, + "rewards/rejected": -2.3318817615509033, + "step": 8290 + }, + { + "epoch": 2.07, + "grad_norm": 3.396231174468994, + "learning_rate": 6.321468935596476e-06, + "logits/chosen": -0.4521123170852661, + "logits/rejected": -0.5316970944404602, + "logps/chosen": -54.58537673950195, + "logps/rejected": -97.35242462158203, + "loss": 0.5897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1428892612457275, + "rewards/margins": 5.827010154724121, + "rewards/rejected": -2.6841204166412354, + "step": 8291 + }, + { + "epoch": 2.07, + "grad_norm": 17.654552459716797, + "learning_rate": 6.32071087959476e-06, + "logits/chosen": -0.5272071957588196, + "logits/rejected": -0.5850127339363098, + "logps/chosen": -54.522056579589844, + "logps/rejected": -87.05026245117188, + "loss": 0.7617, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.890841484069824, + "rewards/margins": 5.597043037414551, + "rewards/rejected": -2.7062017917633057, + "step": 8292 + }, + { + "epoch": 2.07, + "grad_norm": 4.843214988708496, + "learning_rate": 6.319952790956873e-06, + "logits/chosen": -0.4380195140838623, + "logits/rejected": -0.5043269991874695, + "logps/chosen": -51.03045654296875, + "logps/rejected": -86.04780578613281, + "loss": 0.6636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0257301330566406, + "rewards/margins": 6.0799455642700195, + "rewards/rejected": -3.0542147159576416, + "step": 8293 + }, + { + "epoch": 2.07, + "grad_norm": 5.620090007781982, + "learning_rate": 6.319194669701551e-06, + "logits/chosen": -0.4751511812210083, + "logits/rejected": -0.555092990398407, + "logps/chosen": -55.52992248535156, + "logps/rejected": -85.65367889404297, + "loss": 0.6459, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.912914752960205, + "rewards/margins": 5.784669876098633, + "rewards/rejected": -2.8717546463012695, + "step": 8294 + }, + { + "epoch": 2.08, + "grad_norm": 7.564356803894043, + "learning_rate": 6.318436515847525e-06, + "logits/chosen": -0.45597174763679504, + "logits/rejected": -0.5257633924484253, + "logps/chosen": -57.50090026855469, + "logps/rejected": -104.5743179321289, + "loss": 0.6439, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0588366985321045, + "rewards/margins": 6.4250946044921875, + "rewards/rejected": -3.3662586212158203, + "step": 8295 + }, + { + "epoch": 2.08, + "grad_norm": 5.683372974395752, + "learning_rate": 6.317678329413528e-06, + "logits/chosen": -0.45629310607910156, + "logits/rejected": -0.5141927599906921, + "logps/chosen": -52.93215560913086, + "logps/rejected": -88.85673522949219, + "loss": 0.7498, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.815713882446289, + "rewards/margins": 5.620203495025635, + "rewards/rejected": -2.804489850997925, + "step": 8296 + }, + { + "epoch": 2.08, + "grad_norm": 4.509268760681152, + "learning_rate": 6.3169201104182995e-06, + "logits/chosen": -0.47127121686935425, + "logits/rejected": -0.5681747198104858, + "logps/chosen": -56.328731536865234, + "logps/rejected": -106.7665023803711, + "loss": 0.6423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7355289459228516, + "rewards/margins": 6.64647912979126, + "rewards/rejected": -3.910949945449829, + "step": 8297 + }, + { + "epoch": 2.08, + "grad_norm": 6.374074935913086, + "learning_rate": 6.316161858880575e-06, + "logits/chosen": -0.49136245250701904, + "logits/rejected": -0.5732380747795105, + "logps/chosen": -53.96446990966797, + "logps/rejected": -110.7489013671875, + "loss": 0.6608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1086082458496094, + "rewards/margins": 5.709610462188721, + "rewards/rejected": -2.6010022163391113, + "step": 8298 + }, + { + "epoch": 2.08, + "grad_norm": 3.7771427631378174, + "learning_rate": 6.315403574819089e-06, + "logits/chosen": -0.46779415011405945, + "logits/rejected": -0.5354292988777161, + "logps/chosen": -63.405391693115234, + "logps/rejected": -89.43666076660156, + "loss": 0.7313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.014742612838745, + "rewards/margins": 5.4212517738342285, + "rewards/rejected": -2.4065091609954834, + "step": 8299 + }, + { + "epoch": 2.08, + "grad_norm": 4.100815773010254, + "learning_rate": 6.314645258252583e-06, + "logits/chosen": -0.41452959179878235, + "logits/rejected": -0.5348939895629883, + "logps/chosen": -53.28034210205078, + "logps/rejected": -83.1507568359375, + "loss": 0.6566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0420780181884766, + "rewards/margins": 6.789483070373535, + "rewards/rejected": -3.7474048137664795, + "step": 8300 + }, + { + "epoch": 2.08, + "grad_norm": 5.770185470581055, + "learning_rate": 6.313886909199794e-06, + "logits/chosen": -0.4039681553840637, + "logits/rejected": -0.4871103763580322, + "logps/chosen": -55.50881576538086, + "logps/rejected": -99.00711822509766, + "loss": 0.6433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.027681350708008, + "rewards/margins": 6.051783561706543, + "rewards/rejected": -3.0241024494171143, + "step": 8301 + }, + { + "epoch": 2.08, + "grad_norm": 5.43843412399292, + "learning_rate": 6.3131285276794615e-06, + "logits/chosen": -0.5105257630348206, + "logits/rejected": -0.6055689454078674, + "logps/chosen": -52.399497985839844, + "logps/rejected": -97.84086608886719, + "loss": 0.6405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.837827205657959, + "rewards/margins": 6.382778644561768, + "rewards/rejected": -3.544952154159546, + "step": 8302 + }, + { + "epoch": 2.08, + "grad_norm": 5.9565839767456055, + "learning_rate": 6.312370113710329e-06, + "logits/chosen": -0.4444701075553894, + "logits/rejected": -0.5010215640068054, + "logps/chosen": -47.72650909423828, + "logps/rejected": -89.58882141113281, + "loss": 0.654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.92555570602417, + "rewards/margins": 5.814940452575684, + "rewards/rejected": -2.8893845081329346, + "step": 8303 + }, + { + "epoch": 2.08, + "grad_norm": 3.3168416023254395, + "learning_rate": 6.311611667311133e-06, + "logits/chosen": -0.3898733854293823, + "logits/rejected": -0.4647567868232727, + "logps/chosen": -48.3213996887207, + "logps/rejected": -125.92958068847656, + "loss": 0.6187, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0554189682006836, + "rewards/margins": 6.754575729370117, + "rewards/rejected": -3.6991565227508545, + "step": 8304 + }, + { + "epoch": 2.08, + "grad_norm": 2.9919474124908447, + "learning_rate": 6.310853188500616e-06, + "logits/chosen": -0.4566287398338318, + "logits/rejected": -0.5903953313827515, + "logps/chosen": -52.50626754760742, + "logps/rejected": -94.34638214111328, + "loss": 0.5812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2143640518188477, + "rewards/margins": 7.099757194519043, + "rewards/rejected": -3.885392665863037, + "step": 8305 + }, + { + "epoch": 2.08, + "grad_norm": 7.408533096313477, + "learning_rate": 6.3100946772975246e-06, + "logits/chosen": -0.49167415499687195, + "logits/rejected": -0.5437920093536377, + "logps/chosen": -49.94029235839844, + "logps/rejected": -86.64570617675781, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0984816551208496, + "rewards/margins": 5.378717422485352, + "rewards/rejected": -2.280236005783081, + "step": 8306 + }, + { + "epoch": 2.08, + "grad_norm": 9.636246681213379, + "learning_rate": 6.309336133720599e-06, + "logits/chosen": -0.4690314531326294, + "logits/rejected": -0.5841660499572754, + "logps/chosen": -62.59257507324219, + "logps/rejected": -81.3232650756836, + "loss": 0.7437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.654405117034912, + "rewards/margins": 4.903453826904297, + "rewards/rejected": -2.2490487098693848, + "step": 8307 + }, + { + "epoch": 2.08, + "grad_norm": 15.722209930419922, + "learning_rate": 6.308577557788587e-06, + "logits/chosen": -0.3748365044593811, + "logits/rejected": -0.4691123962402344, + "logps/chosen": -62.4444465637207, + "logps/rejected": -89.37889099121094, + "loss": 0.8884, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7428789138793945, + "rewards/margins": 5.440563201904297, + "rewards/rejected": -2.6976842880249023, + "step": 8308 + }, + { + "epoch": 2.08, + "grad_norm": 6.820403575897217, + "learning_rate": 6.30781894952023e-06, + "logits/chosen": -0.4692041873931885, + "logits/rejected": -0.5374398231506348, + "logps/chosen": -51.94728088378906, + "logps/rejected": -98.9061050415039, + "loss": 0.639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1646766662597656, + "rewards/margins": 6.455136299133301, + "rewards/rejected": -3.2904598712921143, + "step": 8309 + }, + { + "epoch": 2.08, + "grad_norm": 7.193438529968262, + "learning_rate": 6.307060308934276e-06, + "logits/chosen": -0.4589923620223999, + "logits/rejected": -0.4884326457977295, + "logps/chosen": -54.84622573852539, + "logps/rejected": -103.70631408691406, + "loss": 0.7443, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5144972801208496, + "rewards/margins": 5.173896789550781, + "rewards/rejected": -2.6593990325927734, + "step": 8310 + }, + { + "epoch": 2.08, + "grad_norm": 6.805700778961182, + "learning_rate": 6.3063016360494715e-06, + "logits/chosen": -0.4493800103664398, + "logits/rejected": -0.5377787351608276, + "logps/chosen": -56.33038330078125, + "logps/rejected": -112.40364837646484, + "loss": 0.7423, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1486258506774902, + "rewards/margins": 6.17681360244751, + "rewards/rejected": -3.0281875133514404, + "step": 8311 + }, + { + "epoch": 2.08, + "grad_norm": 3.902529716491699, + "learning_rate": 6.305542930884565e-06, + "logits/chosen": -0.5399772524833679, + "logits/rejected": -0.5975573658943176, + "logps/chosen": -56.05067825317383, + "logps/rejected": -104.20894622802734, + "loss": 0.6751, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.346022844314575, + "rewards/margins": 6.98623514175415, + "rewards/rejected": -3.640212059020996, + "step": 8312 + }, + { + "epoch": 2.08, + "grad_norm": 3.762701988220215, + "learning_rate": 6.304784193458302e-06, + "logits/chosen": -0.4618082642555237, + "logits/rejected": -0.5288167595863342, + "logps/chosen": -58.79521942138672, + "logps/rejected": -103.84584045410156, + "loss": 0.6507, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7076752185821533, + "rewards/margins": 5.445749282836914, + "rewards/rejected": -2.73807430267334, + "step": 8313 + }, + { + "epoch": 2.08, + "grad_norm": 10.513155937194824, + "learning_rate": 6.304025423789435e-06, + "logits/chosen": -0.46087342500686646, + "logits/rejected": -0.514166533946991, + "logps/chosen": -69.01618194580078, + "logps/rejected": -99.42619323730469, + "loss": 0.8999, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9303829669952393, + "rewards/margins": 5.178882598876953, + "rewards/rejected": -2.248499631881714, + "step": 8314 + }, + { + "epoch": 2.08, + "grad_norm": 3.6811819076538086, + "learning_rate": 6.303266621896713e-06, + "logits/chosen": -0.43202799558639526, + "logits/rejected": -0.5219457149505615, + "logps/chosen": -54.97263717651367, + "logps/rejected": -89.52641296386719, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0773403644561768, + "rewards/margins": 5.697294235229492, + "rewards/rejected": -2.6199538707733154, + "step": 8315 + }, + { + "epoch": 2.08, + "grad_norm": 4.004794120788574, + "learning_rate": 6.302507787798886e-06, + "logits/chosen": -0.5090413093566895, + "logits/rejected": -0.5418095588684082, + "logps/chosen": -47.685821533203125, + "logps/rejected": -98.48949432373047, + "loss": 0.6133, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.924558639526367, + "rewards/margins": 5.253334999084473, + "rewards/rejected": -2.3287768363952637, + "step": 8316 + }, + { + "epoch": 2.08, + "grad_norm": 4.935268878936768, + "learning_rate": 6.301748921514706e-06, + "logits/chosen": -0.5271406769752502, + "logits/rejected": -0.6674717664718628, + "logps/chosen": -54.66878890991211, + "logps/rejected": -85.3242416381836, + "loss": 0.6731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.657458543777466, + "rewards/margins": 5.641881465911865, + "rewards/rejected": -2.984422206878662, + "step": 8317 + }, + { + "epoch": 2.08, + "grad_norm": 7.382977485656738, + "learning_rate": 6.300990023062926e-06, + "logits/chosen": -0.40153467655181885, + "logits/rejected": -0.5138663649559021, + "logps/chosen": -61.18360137939453, + "logps/rejected": -90.60206604003906, + "loss": 0.7581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4912819862365723, + "rewards/margins": 6.37544059753418, + "rewards/rejected": -3.8841590881347656, + "step": 8318 + }, + { + "epoch": 2.08, + "grad_norm": 5.701155662536621, + "learning_rate": 6.300231092462299e-06, + "logits/chosen": -0.5257009267807007, + "logits/rejected": -0.5951778888702393, + "logps/chosen": -50.625370025634766, + "logps/rejected": -87.73947143554688, + "loss": 0.6345, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8609979152679443, + "rewards/margins": 5.298632621765137, + "rewards/rejected": -2.4376351833343506, + "step": 8319 + }, + { + "epoch": 2.08, + "grad_norm": 5.204965114593506, + "learning_rate": 6.2994721297315786e-06, + "logits/chosen": -0.4334103465080261, + "logits/rejected": -0.5043959021568298, + "logps/chosen": -56.418025970458984, + "logps/rejected": -101.30229187011719, + "loss": 0.7299, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.81687331199646, + "rewards/margins": 5.29836893081665, + "rewards/rejected": -2.4814951419830322, + "step": 8320 + }, + { + "epoch": 2.08, + "grad_norm": 6.322991371154785, + "learning_rate": 6.298713134889518e-06, + "logits/chosen": -0.4630364775657654, + "logits/rejected": -0.5254602432250977, + "logps/chosen": -53.53538131713867, + "logps/rejected": -101.56829071044922, + "loss": 0.7131, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8723909854888916, + "rewards/margins": 5.30513858795166, + "rewards/rejected": -2.432748317718506, + "step": 8321 + }, + { + "epoch": 2.08, + "grad_norm": 6.303607940673828, + "learning_rate": 6.297954107954876e-06, + "logits/chosen": -0.6323232054710388, + "logits/rejected": -0.719548761844635, + "logps/chosen": -52.85090255737305, + "logps/rejected": -79.22953033447266, + "loss": 0.6509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7847397327423096, + "rewards/margins": 6.055448055267334, + "rewards/rejected": -3.270707845687866, + "step": 8322 + }, + { + "epoch": 2.08, + "grad_norm": 5.796185493469238, + "learning_rate": 6.297195048946409e-06, + "logits/chosen": -0.5728808045387268, + "logits/rejected": -0.6685868501663208, + "logps/chosen": -57.515804290771484, + "logps/rejected": -87.10734558105469, + "loss": 0.7024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.648892402648926, + "rewards/margins": 6.45193338394165, + "rewards/rejected": -3.8030409812927246, + "step": 8323 + }, + { + "epoch": 2.08, + "grad_norm": 4.304732799530029, + "learning_rate": 6.29643595788287e-06, + "logits/chosen": -0.5317773222923279, + "logits/rejected": -0.5724207162857056, + "logps/chosen": -55.11846160888672, + "logps/rejected": -109.58514404296875, + "loss": 0.6648, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8529956340789795, + "rewards/margins": 6.05226993560791, + "rewards/rejected": -3.1992740631103516, + "step": 8324 + }, + { + "epoch": 2.08, + "grad_norm": 4.981015682220459, + "learning_rate": 6.29567683478302e-06, + "logits/chosen": -0.45092999935150146, + "logits/rejected": -0.5565529465675354, + "logps/chosen": -62.356910705566406, + "logps/rejected": -96.21662902832031, + "loss": 0.6748, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6819374561309814, + "rewards/margins": 6.6563639640808105, + "rewards/rejected": -3.974426746368408, + "step": 8325 + }, + { + "epoch": 2.08, + "grad_norm": 4.9956488609313965, + "learning_rate": 6.294917679665619e-06, + "logits/chosen": -0.4717446267604828, + "logits/rejected": -0.5633990168571472, + "logps/chosen": -63.74971389770508, + "logps/rejected": -95.04560852050781, + "loss": 0.6935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.327265501022339, + "rewards/margins": 5.92213249206543, + "rewards/rejected": -2.594866991043091, + "step": 8326 + }, + { + "epoch": 2.08, + "grad_norm": 3.3125078678131104, + "learning_rate": 6.2941584925494224e-06, + "logits/chosen": -0.5266851782798767, + "logits/rejected": -0.5793095231056213, + "logps/chosen": -62.04265594482422, + "logps/rejected": -128.5302734375, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.769801139831543, + "rewards/margins": 6.9610700607299805, + "rewards/rejected": -4.1912689208984375, + "step": 8327 + }, + { + "epoch": 2.08, + "grad_norm": 3.912463426589966, + "learning_rate": 6.293399273453195e-06, + "logits/chosen": -0.44439685344696045, + "logits/rejected": -0.5381784439086914, + "logps/chosen": -64.09404754638672, + "logps/rejected": -115.38679504394531, + "loss": 0.6929, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0916380882263184, + "rewards/margins": 6.816972255706787, + "rewards/rejected": -3.725334405899048, + "step": 8328 + }, + { + "epoch": 2.08, + "grad_norm": 12.55228042602539, + "learning_rate": 6.292640022395694e-06, + "logits/chosen": -0.45743852853775024, + "logits/rejected": -0.5142289400100708, + "logps/chosen": -59.11367416381836, + "logps/rejected": -90.4706039428711, + "loss": 0.778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6303927898406982, + "rewards/margins": 5.114129543304443, + "rewards/rejected": -2.483736515045166, + "step": 8329 + }, + { + "epoch": 2.08, + "grad_norm": 12.410862922668457, + "learning_rate": 6.291880739395683e-06, + "logits/chosen": -0.4354448616504669, + "logits/rejected": -0.4894498884677887, + "logps/chosen": -54.75476837158203, + "logps/rejected": -95.21367645263672, + "loss": 0.6913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9441323280334473, + "rewards/margins": 4.42971134185791, + "rewards/rejected": -1.4855788946151733, + "step": 8330 + }, + { + "epoch": 2.08, + "grad_norm": 7.123181343078613, + "learning_rate": 6.2911214244719265e-06, + "logits/chosen": -0.47343090176582336, + "logits/rejected": -0.5536882281303406, + "logps/chosen": -52.87851333618164, + "logps/rejected": -83.46382141113281, + "loss": 0.6995, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0809459686279297, + "rewards/margins": 4.889568328857422, + "rewards/rejected": -1.8086220026016235, + "step": 8331 + }, + { + "epoch": 2.08, + "grad_norm": 8.455367088317871, + "learning_rate": 6.290362077643186e-06, + "logits/chosen": -0.5108471512794495, + "logits/rejected": -0.6221591830253601, + "logps/chosen": -62.556739807128906, + "logps/rejected": -83.4189682006836, + "loss": 0.8102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9076480865478516, + "rewards/margins": 5.536870956420898, + "rewards/rejected": -2.629222869873047, + "step": 8332 + }, + { + "epoch": 2.08, + "grad_norm": 6.906117916107178, + "learning_rate": 6.2896026989282254e-06, + "logits/chosen": -0.46837595105171204, + "logits/rejected": -0.49701640009880066, + "logps/chosen": -65.50377655029297, + "logps/rejected": -108.66629791259766, + "loss": 0.7916, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.722368001937866, + "rewards/margins": 5.876871109008789, + "rewards/rejected": -3.1545026302337646, + "step": 8333 + }, + { + "epoch": 2.08, + "grad_norm": 5.760822296142578, + "learning_rate": 6.2888432883458115e-06, + "logits/chosen": -0.38698387145996094, + "logits/rejected": -0.5115177631378174, + "logps/chosen": -72.13167572021484, + "logps/rejected": -91.4908447265625, + "loss": 0.7109, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.57175874710083, + "rewards/margins": 5.070750713348389, + "rewards/rejected": -2.4989919662475586, + "step": 8334 + }, + { + "epoch": 2.09, + "grad_norm": 5.258785247802734, + "learning_rate": 6.288083845914709e-06, + "logits/chosen": -0.560100257396698, + "logits/rejected": -0.6292541027069092, + "logps/chosen": -46.325626373291016, + "logps/rejected": -88.51046752929688, + "loss": 0.61, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2468173503875732, + "rewards/margins": 6.361318111419678, + "rewards/rejected": -3.1145005226135254, + "step": 8335 + }, + { + "epoch": 2.09, + "grad_norm": 7.257627010345459, + "learning_rate": 6.287324371653683e-06, + "logits/chosen": -0.4416404664516449, + "logits/rejected": -0.5251306891441345, + "logps/chosen": -52.73929977416992, + "logps/rejected": -110.40205383300781, + "loss": 0.5845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.665510654449463, + "rewards/margins": 6.401581764221191, + "rewards/rejected": -3.736071825027466, + "step": 8336 + }, + { + "epoch": 2.09, + "grad_norm": 6.731166362762451, + "learning_rate": 6.286564865581503e-06, + "logits/chosen": -0.4855670630931854, + "logits/rejected": -0.47359299659729004, + "logps/chosen": -47.10395050048828, + "logps/rejected": -110.27481842041016, + "loss": 0.7283, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0226311683654785, + "rewards/margins": 5.9980926513671875, + "rewards/rejected": -2.97546124458313, + "step": 8337 + }, + { + "epoch": 2.09, + "grad_norm": 7.445951461791992, + "learning_rate": 6.285805327716938e-06, + "logits/chosen": -0.4163239598274231, + "logits/rejected": -0.47797173261642456, + "logps/chosen": -50.909000396728516, + "logps/rejected": -97.86764526367188, + "loss": 0.7541, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0389533042907715, + "rewards/margins": 5.416128158569336, + "rewards/rejected": -2.3771748542785645, + "step": 8338 + }, + { + "epoch": 2.09, + "grad_norm": 3.429157257080078, + "learning_rate": 6.285045758078754e-06, + "logits/chosen": -0.5275911092758179, + "logits/rejected": -0.6278719902038574, + "logps/chosen": -47.128936767578125, + "logps/rejected": -82.55213928222656, + "loss": 0.5826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0150146484375, + "rewards/margins": 5.877874374389648, + "rewards/rejected": -2.8628594875335693, + "step": 8339 + }, + { + "epoch": 2.09, + "grad_norm": 3.7870397567749023, + "learning_rate": 6.2842861566857234e-06, + "logits/chosen": -0.5084049701690674, + "logits/rejected": -0.6311612725257874, + "logps/chosen": -44.63427734375, + "logps/rejected": -83.5139389038086, + "loss": 0.5324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0137696266174316, + "rewards/margins": 5.941747188568115, + "rewards/rejected": -2.9279778003692627, + "step": 8340 + }, + { + "epoch": 2.09, + "grad_norm": 13.382471084594727, + "learning_rate": 6.2835265235566165e-06, + "logits/chosen": -0.4829883575439453, + "logits/rejected": -0.5485028624534607, + "logps/chosen": -39.481231689453125, + "logps/rejected": -97.40498352050781, + "loss": 0.5854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2359414100646973, + "rewards/margins": 6.394073486328125, + "rewards/rejected": -3.158132314682007, + "step": 8341 + }, + { + "epoch": 2.09, + "grad_norm": 3.1605401039123535, + "learning_rate": 6.282766858710203e-06, + "logits/chosen": -0.49093252420425415, + "logits/rejected": -0.5982397198677063, + "logps/chosen": -49.53873062133789, + "logps/rejected": -93.2713394165039, + "loss": 0.5728, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.889545440673828, + "rewards/margins": 6.1590166091918945, + "rewards/rejected": -3.2694716453552246, + "step": 8342 + }, + { + "epoch": 2.09, + "grad_norm": 8.910747528076172, + "learning_rate": 6.282007162165256e-06, + "logits/chosen": -0.4471919536590576, + "logits/rejected": -0.5357162356376648, + "logps/chosen": -57.80730438232422, + "logps/rejected": -87.22540283203125, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.057596206665039, + "rewards/margins": 5.736546516418457, + "rewards/rejected": -2.678950309753418, + "step": 8343 + }, + { + "epoch": 2.09, + "grad_norm": 8.177871704101562, + "learning_rate": 6.281247433940549e-06, + "logits/chosen": -0.42787015438079834, + "logits/rejected": -0.5129026174545288, + "logps/chosen": -54.31128692626953, + "logps/rejected": -91.1650161743164, + "loss": 0.7799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9904637336730957, + "rewards/margins": 5.477102279663086, + "rewards/rejected": -2.486638069152832, + "step": 8344 + }, + { + "epoch": 2.09, + "grad_norm": 15.495635032653809, + "learning_rate": 6.280487674054854e-06, + "logits/chosen": -0.49704432487487793, + "logits/rejected": -0.5380938053131104, + "logps/chosen": -50.388328552246094, + "logps/rejected": -90.95673370361328, + "loss": 0.7874, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7993433475494385, + "rewards/margins": 5.118213653564453, + "rewards/rejected": -2.3188695907592773, + "step": 8345 + }, + { + "epoch": 2.09, + "grad_norm": 6.330806255340576, + "learning_rate": 6.279727882526949e-06, + "logits/chosen": -0.4801955223083496, + "logits/rejected": -0.5881401896476746, + "logps/chosen": -45.69863510131836, + "logps/rejected": -87.4592056274414, + "loss": 0.6721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0070505142211914, + "rewards/margins": 7.160359859466553, + "rewards/rejected": -4.153309345245361, + "step": 8346 + }, + { + "epoch": 2.09, + "grad_norm": 4.020204067230225, + "learning_rate": 6.278968059375604e-06, + "logits/chosen": -0.4267692267894745, + "logits/rejected": -0.521045446395874, + "logps/chosen": -56.56214141845703, + "logps/rejected": -100.19786834716797, + "loss": 0.6722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9363162517547607, + "rewards/margins": 5.197357177734375, + "rewards/rejected": -2.2610414028167725, + "step": 8347 + }, + { + "epoch": 2.09, + "grad_norm": 5.443017482757568, + "learning_rate": 6.2782082046195995e-06, + "logits/chosen": -0.4860824942588806, + "logits/rejected": -0.5976957082748413, + "logps/chosen": -56.86603546142578, + "logps/rejected": -96.7525634765625, + "loss": 0.7068, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1627440452575684, + "rewards/margins": 6.501702308654785, + "rewards/rejected": -3.338958740234375, + "step": 8348 + }, + { + "epoch": 2.09, + "grad_norm": 4.51864767074585, + "learning_rate": 6.27744831827771e-06, + "logits/chosen": -0.5495997071266174, + "logits/rejected": -0.6045737266540527, + "logps/chosen": -42.94075393676758, + "logps/rejected": -82.62921142578125, + "loss": 0.6616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.989344835281372, + "rewards/margins": 5.5331268310546875, + "rewards/rejected": -2.5437822341918945, + "step": 8349 + }, + { + "epoch": 2.09, + "grad_norm": 5.695225238800049, + "learning_rate": 6.276688400368713e-06, + "logits/chosen": -0.4362924098968506, + "logits/rejected": -0.503170371055603, + "logps/chosen": -52.39942932128906, + "logps/rejected": -94.51068878173828, + "loss": 0.6777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103511095046997, + "rewards/margins": 5.972965240478516, + "rewards/rejected": -2.8694539070129395, + "step": 8350 + }, + { + "epoch": 2.09, + "grad_norm": 5.095293998718262, + "learning_rate": 6.2759284509113905e-06, + "logits/chosen": -0.45782020688056946, + "logits/rejected": -0.5358946919441223, + "logps/chosen": -62.75592803955078, + "logps/rejected": -92.89280700683594, + "loss": 0.7238, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9731528759002686, + "rewards/margins": 4.567844390869141, + "rewards/rejected": -1.594691514968872, + "step": 8351 + }, + { + "epoch": 2.09, + "grad_norm": 9.582813262939453, + "learning_rate": 6.275168469924517e-06, + "logits/chosen": -0.4991386830806732, + "logits/rejected": -0.5201118588447571, + "logps/chosen": -50.33064651489258, + "logps/rejected": -81.27603149414062, + "loss": 0.6992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.775432825088501, + "rewards/margins": 4.198934078216553, + "rewards/rejected": -1.4235014915466309, + "step": 8352 + }, + { + "epoch": 2.09, + "grad_norm": 7.038720607757568, + "learning_rate": 6.274408457426873e-06, + "logits/chosen": -0.5007206201553345, + "logits/rejected": -0.6249709725379944, + "logps/chosen": -58.172401428222656, + "logps/rejected": -70.59512329101562, + "loss": 0.6772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0673537254333496, + "rewards/margins": 5.114639759063721, + "rewards/rejected": -2.047285795211792, + "step": 8353 + }, + { + "epoch": 2.09, + "grad_norm": 7.9180755615234375, + "learning_rate": 6.273648413437243e-06, + "logits/chosen": -0.4839947819709778, + "logits/rejected": -0.5700236558914185, + "logps/chosen": -59.30799102783203, + "logps/rejected": -108.13151550292969, + "loss": 0.642, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.920757293701172, + "rewards/margins": 5.215551853179932, + "rewards/rejected": -2.2947940826416016, + "step": 8354 + }, + { + "epoch": 2.09, + "grad_norm": 7.147372722625732, + "learning_rate": 6.272888337974406e-06, + "logits/chosen": -0.40986624360084534, + "logits/rejected": -0.4528130888938904, + "logps/chosen": -58.774505615234375, + "logps/rejected": -93.50498962402344, + "loss": 0.6406, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.375570774078369, + "rewards/margins": 5.285177707672119, + "rewards/rejected": -1.9096065759658813, + "step": 8355 + }, + { + "epoch": 2.09, + "grad_norm": 4.329803466796875, + "learning_rate": 6.272128231057142e-06, + "logits/chosen": -0.4824603796005249, + "logits/rejected": -0.5803958773612976, + "logps/chosen": -68.31927490234375, + "logps/rejected": -89.98210906982422, + "loss": 0.6895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.092879056930542, + "rewards/margins": 5.655062675476074, + "rewards/rejected": -2.5621836185455322, + "step": 8356 + }, + { + "epoch": 2.09, + "grad_norm": 2.6815297603607178, + "learning_rate": 6.271368092704237e-06, + "logits/chosen": -0.5124682188034058, + "logits/rejected": -0.5667329430580139, + "logps/chosen": -58.44038009643555, + "logps/rejected": -104.99129486083984, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.122032880783081, + "rewards/margins": 6.410747528076172, + "rewards/rejected": -3.288714647293091, + "step": 8357 + }, + { + "epoch": 2.09, + "grad_norm": 6.020215034484863, + "learning_rate": 6.270607922934477e-06, + "logits/chosen": -0.40612250566482544, + "logits/rejected": -0.47249752283096313, + "logps/chosen": -51.14055633544922, + "logps/rejected": -99.59440612792969, + "loss": 0.6014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9505717754364014, + "rewards/margins": 5.906944274902344, + "rewards/rejected": -2.9563724994659424, + "step": 8358 + }, + { + "epoch": 2.09, + "grad_norm": 9.621192932128906, + "learning_rate": 6.26984772176664e-06, + "logits/chosen": -0.462373822927475, + "logits/rejected": -0.5299561619758606, + "logps/chosen": -56.00204849243164, + "logps/rejected": -94.42524719238281, + "loss": 0.723, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.009671688079834, + "rewards/margins": 5.399590969085693, + "rewards/rejected": -2.3899190425872803, + "step": 8359 + }, + { + "epoch": 2.09, + "grad_norm": 6.484712600708008, + "learning_rate": 6.269087489219517e-06, + "logits/chosen": -0.5434128046035767, + "logits/rejected": -0.6113318800926208, + "logps/chosen": -47.67840576171875, + "logps/rejected": -86.20699310302734, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.279844045639038, + "rewards/margins": 5.906726360321045, + "rewards/rejected": -2.626882314682007, + "step": 8360 + }, + { + "epoch": 2.09, + "grad_norm": 8.640273094177246, + "learning_rate": 6.268327225311891e-06, + "logits/chosen": -0.39150407910346985, + "logits/rejected": -0.4579687714576721, + "logps/chosen": -62.435638427734375, + "logps/rejected": -118.19267272949219, + "loss": 0.7279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.86930513381958, + "rewards/margins": 6.444646835327148, + "rewards/rejected": -3.5753414630889893, + "step": 8361 + }, + { + "epoch": 2.09, + "grad_norm": 9.813105583190918, + "learning_rate": 6.267566930062551e-06, + "logits/chosen": -0.4487925171852112, + "logits/rejected": -0.5178425908088684, + "logps/chosen": -57.973087310791016, + "logps/rejected": -105.13113403320312, + "loss": 0.6446, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.847294569015503, + "rewards/margins": 6.353756904602051, + "rewards/rejected": -3.506462574005127, + "step": 8362 + }, + { + "epoch": 2.09, + "grad_norm": 10.521766662597656, + "learning_rate": 6.266806603490283e-06, + "logits/chosen": -0.4608815610408783, + "logits/rejected": -0.4585415720939636, + "logps/chosen": -54.14710235595703, + "logps/rejected": -103.80094146728516, + "loss": 0.7772, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.112717866897583, + "rewards/margins": 5.407344341278076, + "rewards/rejected": -2.294626474380493, + "step": 8363 + }, + { + "epoch": 2.09, + "grad_norm": 5.69795036315918, + "learning_rate": 6.266046245613879e-06, + "logits/chosen": -0.4437967836856842, + "logits/rejected": -0.5461080074310303, + "logps/chosen": -52.231502532958984, + "logps/rejected": -83.94757843017578, + "loss": 0.62, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.918246269226074, + "rewards/margins": 5.956090927124023, + "rewards/rejected": -3.0378451347351074, + "step": 8364 + }, + { + "epoch": 2.09, + "grad_norm": 2.6188082695007324, + "learning_rate": 6.265285856452123e-06, + "logits/chosen": -0.46282240748405457, + "logits/rejected": -0.5755977034568787, + "logps/chosen": -53.898193359375, + "logps/rejected": -91.77983093261719, + "loss": 0.622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9882686138153076, + "rewards/margins": 6.767683029174805, + "rewards/rejected": -3.779414415359497, + "step": 8365 + }, + { + "epoch": 2.09, + "grad_norm": 3.837848424911499, + "learning_rate": 6.2645254360238095e-06, + "logits/chosen": -0.4392685890197754, + "logits/rejected": -0.4508417546749115, + "logps/chosen": -45.23603820800781, + "logps/rejected": -101.93389892578125, + "loss": 0.6283, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0783238410949707, + "rewards/margins": 5.466644287109375, + "rewards/rejected": -2.388319969177246, + "step": 8366 + }, + { + "epoch": 2.09, + "grad_norm": 5.953733444213867, + "learning_rate": 6.263764984347726e-06, + "logits/chosen": -0.45252546668052673, + "logits/rejected": -0.570218026638031, + "logps/chosen": -60.28458786010742, + "logps/rejected": -89.73477935791016, + "loss": 0.6885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.878067970275879, + "rewards/margins": 5.759254455566406, + "rewards/rejected": -2.8811862468719482, + "step": 8367 + }, + { + "epoch": 2.09, + "grad_norm": 5.482954502105713, + "learning_rate": 6.2630045014426655e-06, + "logits/chosen": -0.5120739340782166, + "logits/rejected": -0.5174230933189392, + "logps/chosen": -49.64933395385742, + "logps/rejected": -85.70436096191406, + "loss": 0.6854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9253199100494385, + "rewards/margins": 4.782888889312744, + "rewards/rejected": -1.857568621635437, + "step": 8368 + }, + { + "epoch": 2.09, + "grad_norm": 3.8900678157806396, + "learning_rate": 6.262243987327422e-06, + "logits/chosen": -0.49462252855300903, + "logits/rejected": -0.5144245624542236, + "logps/chosen": -49.77857971191406, + "logps/rejected": -107.62211608886719, + "loss": 0.6887, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.626188039779663, + "rewards/margins": 5.216573715209961, + "rewards/rejected": -2.5903851985931396, + "step": 8369 + }, + { + "epoch": 2.09, + "grad_norm": 5.694381237030029, + "learning_rate": 6.2614834420207845e-06, + "logits/chosen": -0.46086350083351135, + "logits/rejected": -0.47187742590904236, + "logps/chosen": -50.72058868408203, + "logps/rejected": -96.69933319091797, + "loss": 0.6493, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1455578804016113, + "rewards/margins": 4.595396518707275, + "rewards/rejected": -1.4498385190963745, + "step": 8370 + }, + { + "epoch": 2.09, + "grad_norm": 6.033020973205566, + "learning_rate": 6.260722865541552e-06, + "logits/chosen": -0.45101475715637207, + "logits/rejected": -0.4681878983974457, + "logps/chosen": -56.5108642578125, + "logps/rejected": -88.78138732910156, + "loss": 0.7114, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0517477989196777, + "rewards/margins": 3.677262544631958, + "rewards/rejected": -0.6255145072937012, + "step": 8371 + }, + { + "epoch": 2.09, + "grad_norm": 3.9926531314849854, + "learning_rate": 6.259962257908515e-06, + "logits/chosen": -0.39507022500038147, + "logits/rejected": -0.43704771995544434, + "logps/chosen": -44.73794174194336, + "logps/rejected": -97.29043579101562, + "loss": 0.5478, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051431894302368, + "rewards/margins": 5.632887363433838, + "rewards/rejected": -2.581455945968628, + "step": 8372 + }, + { + "epoch": 2.09, + "grad_norm": 3.9550533294677734, + "learning_rate": 6.2592016191404704e-06, + "logits/chosen": -0.5385850667953491, + "logits/rejected": -0.601381778717041, + "logps/chosen": -48.13214874267578, + "logps/rejected": -83.52071380615234, + "loss": 0.7052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7082245349884033, + "rewards/margins": 5.429026126861572, + "rewards/rejected": -2.720802068710327, + "step": 8373 + }, + { + "epoch": 2.09, + "grad_norm": 5.932597637176514, + "learning_rate": 6.258440949256215e-06, + "logits/chosen": -0.4296402931213379, + "logits/rejected": -0.6091635227203369, + "logps/chosen": -74.32464599609375, + "logps/rejected": -81.99385070800781, + "loss": 0.6779, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.994363307952881, + "rewards/margins": 5.853053569793701, + "rewards/rejected": -2.858690023422241, + "step": 8374 + }, + { + "epoch": 2.1, + "grad_norm": 10.89142894744873, + "learning_rate": 6.257680248274546e-06, + "logits/chosen": -0.5545445680618286, + "logits/rejected": -0.6023150682449341, + "logps/chosen": -58.6163444519043, + "logps/rejected": -97.3386001586914, + "loss": 0.7543, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0276777744293213, + "rewards/margins": 5.227010726928711, + "rewards/rejected": -2.1993327140808105, + "step": 8375 + }, + { + "epoch": 2.1, + "grad_norm": 6.834461212158203, + "learning_rate": 6.256919516214259e-06, + "logits/chosen": -0.4185631275177002, + "logits/rejected": -0.4597238302230835, + "logps/chosen": -56.432823181152344, + "logps/rejected": -89.21720123291016, + "loss": 0.6892, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.917722225189209, + "rewards/margins": 4.638437271118164, + "rewards/rejected": -1.720715045928955, + "step": 8376 + }, + { + "epoch": 2.1, + "grad_norm": 16.79273223876953, + "learning_rate": 6.256158753094154e-06, + "logits/chosen": -0.5231605768203735, + "logits/rejected": -0.5768165588378906, + "logps/chosen": -52.39049530029297, + "logps/rejected": -108.60726928710938, + "loss": 0.7583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.949552059173584, + "rewards/margins": 6.281490802764893, + "rewards/rejected": -3.3319382667541504, + "step": 8377 + }, + { + "epoch": 2.1, + "grad_norm": 5.610520362854004, + "learning_rate": 6.255397958933031e-06, + "logits/chosen": -0.43811434507369995, + "logits/rejected": -0.5171886086463928, + "logps/chosen": -60.72452926635742, + "logps/rejected": -96.79839324951172, + "loss": 0.7467, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1604082584381104, + "rewards/margins": 5.022115707397461, + "rewards/rejected": -1.8617075681686401, + "step": 8378 + }, + { + "epoch": 2.1, + "grad_norm": 6.992171764373779, + "learning_rate": 6.254637133749687e-06, + "logits/chosen": -0.4041709303855896, + "logits/rejected": -0.42481762170791626, + "logps/chosen": -63.681610107421875, + "logps/rejected": -118.25978088378906, + "loss": 0.7896, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.678069829940796, + "rewards/margins": 5.26063871383667, + "rewards/rejected": -2.582569122314453, + "step": 8379 + }, + { + "epoch": 2.1, + "grad_norm": 5.565215110778809, + "learning_rate": 6.253876277562927e-06, + "logits/chosen": -0.405659019947052, + "logits/rejected": -0.513765811920166, + "logps/chosen": -65.28050994873047, + "logps/rejected": -105.71269226074219, + "loss": 0.7078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991522789001465, + "rewards/margins": 5.386894702911377, + "rewards/rejected": -2.395372152328491, + "step": 8380 + }, + { + "epoch": 2.1, + "grad_norm": 3.7141435146331787, + "learning_rate": 6.25311539039155e-06, + "logits/chosen": -0.4347177743911743, + "logits/rejected": -0.5231761932373047, + "logps/chosen": -53.690364837646484, + "logps/rejected": -94.97747802734375, + "loss": 0.6614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8870251178741455, + "rewards/margins": 5.840274333953857, + "rewards/rejected": -2.953249454498291, + "step": 8381 + }, + { + "epoch": 2.1, + "grad_norm": 4.125096797943115, + "learning_rate": 6.2523544722543586e-06, + "logits/chosen": -0.5857007503509521, + "logits/rejected": -0.6239393353462219, + "logps/chosen": -51.862422943115234, + "logps/rejected": -103.50921630859375, + "loss": 0.6453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.071233034133911, + "rewards/margins": 5.898237228393555, + "rewards/rejected": -2.827004909515381, + "step": 8382 + }, + { + "epoch": 2.1, + "grad_norm": 2.267033338546753, + "learning_rate": 6.251593523170158e-06, + "logits/chosen": -0.4764442443847656, + "logits/rejected": -0.5848269462585449, + "logps/chosen": -44.08189010620117, + "logps/rejected": -77.54261016845703, + "loss": 0.5373, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.352965831756592, + "rewards/margins": 6.249393463134766, + "rewards/rejected": -2.896428108215332, + "step": 8383 + }, + { + "epoch": 2.1, + "grad_norm": 5.5188469886779785, + "learning_rate": 6.2508325431577475e-06, + "logits/chosen": -0.5572155714035034, + "logits/rejected": -0.6660006046295166, + "logps/chosen": -58.501365661621094, + "logps/rejected": -94.72550964355469, + "loss": 0.778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7807693481445312, + "rewards/margins": 6.634091854095459, + "rewards/rejected": -3.8533225059509277, + "step": 8384 + }, + { + "epoch": 2.1, + "grad_norm": 3.7925028800964355, + "learning_rate": 6.250071532235935e-06, + "logits/chosen": -0.4697621464729309, + "logits/rejected": -0.5372912883758545, + "logps/chosen": -54.42497634887695, + "logps/rejected": -92.00237274169922, + "loss": 0.6143, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1112818717956543, + "rewards/margins": 4.994879245758057, + "rewards/rejected": -1.8835967779159546, + "step": 8385 + }, + { + "epoch": 2.1, + "grad_norm": 4.316070079803467, + "learning_rate": 6.249310490423526e-06, + "logits/chosen": -0.44917160272598267, + "logits/rejected": -0.5130259990692139, + "logps/chosen": -59.33803176879883, + "logps/rejected": -97.28754425048828, + "loss": 0.6781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.825244665145874, + "rewards/margins": 5.360313415527344, + "rewards/rejected": -2.5350685119628906, + "step": 8386 + }, + { + "epoch": 2.1, + "grad_norm": 4.198243141174316, + "learning_rate": 6.248549417739325e-06, + "logits/chosen": -0.46519172191619873, + "logits/rejected": -0.5374191999435425, + "logps/chosen": -54.144371032714844, + "logps/rejected": -79.5834732055664, + "loss": 0.6911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1396126747131348, + "rewards/margins": 5.468634128570557, + "rewards/rejected": -2.3290209770202637, + "step": 8387 + }, + { + "epoch": 2.1, + "grad_norm": 5.788479328155518, + "learning_rate": 6.24778831420214e-06, + "logits/chosen": -0.5278915166854858, + "logits/rejected": -0.5598433613777161, + "logps/chosen": -58.73727035522461, + "logps/rejected": -100.62416076660156, + "loss": 0.7805, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.200977325439453, + "rewards/margins": 5.277838230133057, + "rewards/rejected": -2.0768611431121826, + "step": 8388 + }, + { + "epoch": 2.1, + "grad_norm": 2.5967319011688232, + "learning_rate": 6.24702717983078e-06, + "logits/chosen": -0.506683886051178, + "logits/rejected": -0.5688113570213318, + "logps/chosen": -52.46350860595703, + "logps/rejected": -110.92191314697266, + "loss": 0.5781, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1467747688293457, + "rewards/margins": 6.183035850524902, + "rewards/rejected": -3.0362610816955566, + "step": 8389 + }, + { + "epoch": 2.1, + "grad_norm": 5.548328876495361, + "learning_rate": 6.246266014644051e-06, + "logits/chosen": -0.4449554681777954, + "logits/rejected": -0.5279370546340942, + "logps/chosen": -58.786373138427734, + "logps/rejected": -92.7162857055664, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9119162559509277, + "rewards/margins": 5.252293586730957, + "rewards/rejected": -2.34037709236145, + "step": 8390 + }, + { + "epoch": 2.1, + "grad_norm": 3.950122117996216, + "learning_rate": 6.245504818660763e-06, + "logits/chosen": -0.4561687111854553, + "logits/rejected": -0.5172693133354187, + "logps/chosen": -52.10744094848633, + "logps/rejected": -105.15155029296875, + "loss": 0.6171, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9067165851593018, + "rewards/margins": 6.060512065887451, + "rewards/rejected": -3.1537954807281494, + "step": 8391 + }, + { + "epoch": 2.1, + "grad_norm": 5.318702697753906, + "learning_rate": 6.244743591899727e-06, + "logits/chosen": -0.5130558609962463, + "logits/rejected": -0.6396726369857788, + "logps/chosen": -50.12357711791992, + "logps/rejected": -70.19828033447266, + "loss": 0.6282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.96321964263916, + "rewards/margins": 6.07224178314209, + "rewards/rejected": -3.1090221405029297, + "step": 8392 + }, + { + "epoch": 2.1, + "grad_norm": 5.661214351654053, + "learning_rate": 6.2439823343797515e-06, + "logits/chosen": -0.4025176465511322, + "logits/rejected": -0.5801421999931335, + "logps/chosen": -63.94580841064453, + "logps/rejected": -75.35539245605469, + "loss": 0.6875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.034181594848633, + "rewards/margins": 6.122506141662598, + "rewards/rejected": -3.088325023651123, + "step": 8393 + }, + { + "epoch": 2.1, + "grad_norm": 4.298539161682129, + "learning_rate": 6.243221046119651e-06, + "logits/chosen": -0.4863739609718323, + "logits/rejected": -0.5150784850120544, + "logps/chosen": -51.90253448486328, + "logps/rejected": -109.78083801269531, + "loss": 0.6134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0977938175201416, + "rewards/margins": 5.962250232696533, + "rewards/rejected": -2.86445689201355, + "step": 8394 + }, + { + "epoch": 2.1, + "grad_norm": 3.3563759326934814, + "learning_rate": 6.242459727138235e-06, + "logits/chosen": -0.4059569835662842, + "logits/rejected": -0.5098252892494202, + "logps/chosen": -64.51361846923828, + "logps/rejected": -100.37399291992188, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.807433605194092, + "rewards/margins": 5.996319770812988, + "rewards/rejected": -3.1888856887817383, + "step": 8395 + }, + { + "epoch": 2.1, + "grad_norm": 5.121533393859863, + "learning_rate": 6.241698377454317e-06, + "logits/chosen": -0.5434398055076599, + "logits/rejected": -0.6025280356407166, + "logps/chosen": -56.04582595825195, + "logps/rejected": -101.72190856933594, + "loss": 0.6814, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.765726327896118, + "rewards/margins": 5.9339599609375, + "rewards/rejected": -3.168233633041382, + "step": 8396 + }, + { + "epoch": 2.1, + "grad_norm": 3.854402780532837, + "learning_rate": 6.240936997086712e-06, + "logits/chosen": -0.46344077587127686, + "logits/rejected": -0.4890056848526001, + "logps/chosen": -53.22063446044922, + "logps/rejected": -98.6452865600586, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9425504207611084, + "rewards/margins": 5.2900919914245605, + "rewards/rejected": -2.347541570663452, + "step": 8397 + }, + { + "epoch": 2.1, + "grad_norm": 5.248754501342773, + "learning_rate": 6.240175586054235e-06, + "logits/chosen": -0.47545790672302246, + "logits/rejected": -0.5691368579864502, + "logps/chosen": -60.71672821044922, + "logps/rejected": -83.5189208984375, + "loss": 0.7206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7867331504821777, + "rewards/margins": 5.006197929382324, + "rewards/rejected": -2.2194650173187256, + "step": 8398 + }, + { + "epoch": 2.1, + "grad_norm": 3.333601236343384, + "learning_rate": 6.2394141443756985e-06, + "logits/chosen": -0.44084668159484863, + "logits/rejected": -0.5215232968330383, + "logps/chosen": -59.68714904785156, + "logps/rejected": -90.2410659790039, + "loss": 0.6354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8331260681152344, + "rewards/margins": 5.8352179527282715, + "rewards/rejected": -3.002091646194458, + "step": 8399 + }, + { + "epoch": 2.1, + "grad_norm": 3.749608278274536, + "learning_rate": 6.238652672069921e-06, + "logits/chosen": -0.5489497184753418, + "logits/rejected": -0.6200985312461853, + "logps/chosen": -53.202091217041016, + "logps/rejected": -90.00263214111328, + "loss": 0.6534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.999812602996826, + "rewards/margins": 5.5268731117248535, + "rewards/rejected": -2.5270607471466064, + "step": 8400 + }, + { + "epoch": 2.1, + "grad_norm": 5.091055870056152, + "learning_rate": 6.23789116915572e-06, + "logits/chosen": -0.4646577835083008, + "logits/rejected": -0.5374716520309448, + "logps/chosen": -60.286773681640625, + "logps/rejected": -106.26309967041016, + "loss": 0.61, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2515246868133545, + "rewards/margins": 6.096007823944092, + "rewards/rejected": -2.844482421875, + "step": 8401 + }, + { + "epoch": 2.1, + "grad_norm": 6.194533348083496, + "learning_rate": 6.237129635651909e-06, + "logits/chosen": -0.5027137398719788, + "logits/rejected": -0.5817765593528748, + "logps/chosen": -58.210792541503906, + "logps/rejected": -95.75725555419922, + "loss": 0.7098, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2210276126861572, + "rewards/margins": 5.32292366027832, + "rewards/rejected": -2.101895809173584, + "step": 8402 + }, + { + "epoch": 2.1, + "grad_norm": 6.0984063148498535, + "learning_rate": 6.23636807157731e-06, + "logits/chosen": -0.46439242362976074, + "logits/rejected": -0.5381650924682617, + "logps/chosen": -52.540374755859375, + "logps/rejected": -100.2584228515625, + "loss": 0.6874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.735058307647705, + "rewards/margins": 6.0564045906066895, + "rewards/rejected": -3.3213460445404053, + "step": 8403 + }, + { + "epoch": 2.1, + "grad_norm": 9.025252342224121, + "learning_rate": 6.23560647695074e-06, + "logits/chosen": -0.41895532608032227, + "logits/rejected": -0.5333571434020996, + "logps/chosen": -67.06271362304688, + "logps/rejected": -93.11821746826172, + "loss": 0.7595, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.967390537261963, + "rewards/margins": 5.762477874755859, + "rewards/rejected": -2.7950875759124756, + "step": 8404 + }, + { + "epoch": 2.1, + "grad_norm": 6.1314849853515625, + "learning_rate": 6.23484485179102e-06, + "logits/chosen": -0.49134427309036255, + "logits/rejected": -0.5774640440940857, + "logps/chosen": -52.0916748046875, + "logps/rejected": -97.2398452758789, + "loss": 0.6836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0278780460357666, + "rewards/margins": 6.810028076171875, + "rewards/rejected": -3.7821505069732666, + "step": 8405 + }, + { + "epoch": 2.1, + "grad_norm": 3.4231104850769043, + "learning_rate": 6.234083196116972e-06, + "logits/chosen": -0.4321650266647339, + "logits/rejected": -0.49159884452819824, + "logps/chosen": -57.65983200073242, + "logps/rejected": -91.91712188720703, + "loss": 0.6456, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0967893600463867, + "rewards/margins": 5.348991870880127, + "rewards/rejected": -2.252201795578003, + "step": 8406 + }, + { + "epoch": 2.1, + "grad_norm": 3.1704297065734863, + "learning_rate": 6.233321509947414e-06, + "logits/chosen": -0.46757572889328003, + "logits/rejected": -0.5771864652633667, + "logps/chosen": -51.42816162109375, + "logps/rejected": -103.69554901123047, + "loss": 0.604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3274595737457275, + "rewards/margins": 6.077003479003906, + "rewards/rejected": -2.7495439052581787, + "step": 8407 + }, + { + "epoch": 2.1, + "grad_norm": 3.8283069133758545, + "learning_rate": 6.23255979330117e-06, + "logits/chosen": -0.49168238043785095, + "logits/rejected": -0.6177526712417603, + "logps/chosen": -60.398834228515625, + "logps/rejected": -90.39321899414062, + "loss": 0.5829, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2063424587249756, + "rewards/margins": 6.469443321228027, + "rewards/rejected": -3.2631008625030518, + "step": 8408 + }, + { + "epoch": 2.1, + "grad_norm": 5.543299198150635, + "learning_rate": 6.2317980461970615e-06, + "logits/chosen": -0.4984567165374756, + "logits/rejected": -0.5771058797836304, + "logps/chosen": -55.09734344482422, + "logps/rejected": -92.88651275634766, + "loss": 0.6952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.045145034790039, + "rewards/margins": 5.581264972686768, + "rewards/rejected": -2.5361199378967285, + "step": 8409 + }, + { + "epoch": 2.1, + "grad_norm": 3.8087146282196045, + "learning_rate": 6.231036268653914e-06, + "logits/chosen": -0.43886956572532654, + "logits/rejected": -0.49641430377960205, + "logps/chosen": -60.08877182006836, + "logps/rejected": -104.33798217773438, + "loss": 0.6767, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.871565818786621, + "rewards/margins": 6.040935039520264, + "rewards/rejected": -3.1693689823150635, + "step": 8410 + }, + { + "epoch": 2.1, + "grad_norm": 5.875500679016113, + "learning_rate": 6.230274460690551e-06, + "logits/chosen": -0.5586968660354614, + "logits/rejected": -0.6216021776199341, + "logps/chosen": -49.44515609741211, + "logps/rejected": -92.56771087646484, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.985485553741455, + "rewards/margins": 6.9208526611328125, + "rewards/rejected": -3.9353671073913574, + "step": 8411 + }, + { + "epoch": 2.1, + "grad_norm": 7.233911037445068, + "learning_rate": 6.229512622325797e-06, + "logits/chosen": -0.5556144714355469, + "logits/rejected": -0.6438300013542175, + "logps/chosen": -48.56813049316406, + "logps/rejected": -84.71601867675781, + "loss": 0.7064, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.953333854675293, + "rewards/margins": 5.177243709564209, + "rewards/rejected": -2.223910093307495, + "step": 8412 + }, + { + "epoch": 2.1, + "grad_norm": 7.895904064178467, + "learning_rate": 6.228750753578479e-06, + "logits/chosen": -0.42504289746284485, + "logits/rejected": -0.5000411868095398, + "logps/chosen": -62.51558303833008, + "logps/rejected": -105.47533416748047, + "loss": 0.725, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.991548538208008, + "rewards/margins": 5.588429927825928, + "rewards/rejected": -2.5968809127807617, + "step": 8413 + }, + { + "epoch": 2.1, + "grad_norm": 5.047516345977783, + "learning_rate": 6.227988854467422e-06, + "logits/chosen": -0.536632776260376, + "logits/rejected": -0.5806223750114441, + "logps/chosen": -43.69819259643555, + "logps/rejected": -88.80447387695312, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0239336490631104, + "rewards/margins": 5.9965925216674805, + "rewards/rejected": -2.97265887260437, + "step": 8414 + }, + { + "epoch": 2.11, + "grad_norm": 8.698654174804688, + "learning_rate": 6.227226925011456e-06, + "logits/chosen": -0.5199316740036011, + "logits/rejected": -0.6082299947738647, + "logps/chosen": -58.23067855834961, + "logps/rejected": -111.51799011230469, + "loss": 0.7234, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.794398307800293, + "rewards/margins": 5.079423904418945, + "rewards/rejected": -2.2850260734558105, + "step": 8415 + }, + { + "epoch": 2.11, + "grad_norm": 14.390972137451172, + "learning_rate": 6.226464965229405e-06, + "logits/chosen": -0.5270570516586304, + "logits/rejected": -0.6248043775558472, + "logps/chosen": -57.60376739501953, + "logps/rejected": -91.36680603027344, + "loss": 0.6776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.987922191619873, + "rewards/margins": 5.793224334716797, + "rewards/rejected": -2.805302143096924, + "step": 8416 + }, + { + "epoch": 2.11, + "grad_norm": 6.295035362243652, + "learning_rate": 6.225702975140103e-06, + "logits/chosen": -0.46963056921958923, + "logits/rejected": -0.5349709391593933, + "logps/chosen": -58.327484130859375, + "logps/rejected": -91.44513702392578, + "loss": 0.6717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9772863388061523, + "rewards/margins": 5.168604373931885, + "rewards/rejected": -2.1913182735443115, + "step": 8417 + }, + { + "epoch": 2.11, + "grad_norm": 12.438264846801758, + "learning_rate": 6.224940954762375e-06, + "logits/chosen": -0.5318323969841003, + "logits/rejected": -0.5594018697738647, + "logps/chosen": -55.96232223510742, + "logps/rejected": -101.53763580322266, + "loss": 0.7961, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1634225845336914, + "rewards/margins": 6.0755510330200195, + "rewards/rejected": -2.912128210067749, + "step": 8418 + }, + { + "epoch": 2.11, + "grad_norm": 5.909511566162109, + "learning_rate": 6.224178904115053e-06, + "logits/chosen": -0.5046123266220093, + "logits/rejected": -0.657810628414154, + "logps/chosen": -66.82685852050781, + "logps/rejected": -87.55712890625, + "loss": 0.7849, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7694878578186035, + "rewards/margins": 5.895864486694336, + "rewards/rejected": -3.1263771057128906, + "step": 8419 + }, + { + "epoch": 2.11, + "grad_norm": 2.084354877471924, + "learning_rate": 6.223416823216968e-06, + "logits/chosen": -0.5349476933479309, + "logits/rejected": -0.6261833310127258, + "logps/chosen": -55.792606353759766, + "logps/rejected": -90.72479248046875, + "loss": 0.5959, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.807316780090332, + "rewards/margins": 6.080498218536377, + "rewards/rejected": -3.273181915283203, + "step": 8420 + }, + { + "epoch": 2.11, + "grad_norm": 3.412428855895996, + "learning_rate": 6.222654712086953e-06, + "logits/chosen": -0.5156406760215759, + "logits/rejected": -0.6174907088279724, + "logps/chosen": -51.807193756103516, + "logps/rejected": -91.80079650878906, + "loss": 0.5973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0392587184906006, + "rewards/margins": 6.482365608215332, + "rewards/rejected": -3.4431066513061523, + "step": 8421 + }, + { + "epoch": 2.11, + "grad_norm": 4.40304708480835, + "learning_rate": 6.2218925707438385e-06, + "logits/chosen": -0.6468544602394104, + "logits/rejected": -0.7206653356552124, + "logps/chosen": -49.9854736328125, + "logps/rejected": -93.17198944091797, + "loss": 0.6526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.204592227935791, + "rewards/margins": 6.963831901550293, + "rewards/rejected": -3.759239912033081, + "step": 8422 + }, + { + "epoch": 2.11, + "grad_norm": 2.91013240814209, + "learning_rate": 6.2211303992064595e-06, + "logits/chosen": -0.40641528367996216, + "logits/rejected": -0.5518921613693237, + "logps/chosen": -63.377586364746094, + "logps/rejected": -86.3443603515625, + "loss": 0.632, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0531067848205566, + "rewards/margins": 6.439793586730957, + "rewards/rejected": -3.3866868019104004, + "step": 8423 + }, + { + "epoch": 2.11, + "grad_norm": 3.5781641006469727, + "learning_rate": 6.220368197493651e-06, + "logits/chosen": -0.45965513586997986, + "logits/rejected": -0.5272356271743774, + "logps/chosen": -47.407798767089844, + "logps/rejected": -92.64271545410156, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.095696210861206, + "rewards/margins": 6.1848368644714355, + "rewards/rejected": -3.089141368865967, + "step": 8424 + }, + { + "epoch": 2.11, + "grad_norm": 7.7324442863464355, + "learning_rate": 6.219605965624244e-06, + "logits/chosen": -0.4272422194480896, + "logits/rejected": -0.5252792835235596, + "logps/chosen": -58.88951110839844, + "logps/rejected": -93.95240783691406, + "loss": 0.6321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9332170486450195, + "rewards/margins": 5.868254661560059, + "rewards/rejected": -2.935037612915039, + "step": 8425 + }, + { + "epoch": 2.11, + "grad_norm": 3.844745397567749, + "learning_rate": 6.218843703617078e-06, + "logits/chosen": -0.46188926696777344, + "logits/rejected": -0.5692635178565979, + "logps/chosen": -63.883750915527344, + "logps/rejected": -93.12486267089844, + "loss": 0.6918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.154306411743164, + "rewards/margins": 5.610680103302002, + "rewards/rejected": -2.456373929977417, + "step": 8426 + }, + { + "epoch": 2.11, + "grad_norm": 8.851520538330078, + "learning_rate": 6.218081411490986e-06, + "logits/chosen": -0.45396125316619873, + "logits/rejected": -0.48989635705947876, + "logps/chosen": -53.083282470703125, + "logps/rejected": -101.600341796875, + "loss": 0.6699, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9401369094848633, + "rewards/margins": 6.2492475509643555, + "rewards/rejected": -3.309110641479492, + "step": 8427 + }, + { + "epoch": 2.11, + "grad_norm": 9.139492988586426, + "learning_rate": 6.217319089264807e-06, + "logits/chosen": -0.44226837158203125, + "logits/rejected": -0.6124627590179443, + "logps/chosen": -73.90484619140625, + "logps/rejected": -84.92265319824219, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7746737003326416, + "rewards/margins": 5.848815441131592, + "rewards/rejected": -3.074141502380371, + "step": 8428 + }, + { + "epoch": 2.11, + "grad_norm": 8.431703567504883, + "learning_rate": 6.216556736957379e-06, + "logits/chosen": -0.5140572786331177, + "logits/rejected": -0.6151098608970642, + "logps/chosen": -66.98056030273438, + "logps/rejected": -97.16746520996094, + "loss": 0.7075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.907397985458374, + "rewards/margins": 6.241400718688965, + "rewards/rejected": -3.334002733230591, + "step": 8429 + }, + { + "epoch": 2.11, + "grad_norm": 1.909637689590454, + "learning_rate": 6.21579435458754e-06, + "logits/chosen": -0.5255759954452515, + "logits/rejected": -0.5676091909408569, + "logps/chosen": -53.64057540893555, + "logps/rejected": -127.50524139404297, + "loss": 0.5705, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8408725261688232, + "rewards/margins": 7.999944686889648, + "rewards/rejected": -5.159071922302246, + "step": 8430 + }, + { + "epoch": 2.11, + "grad_norm": 7.69758939743042, + "learning_rate": 6.215031942174129e-06, + "logits/chosen": -0.4467039704322815, + "logits/rejected": -0.5071682333946228, + "logps/chosen": -55.88145446777344, + "logps/rejected": -107.93888854980469, + "loss": 0.6936, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.73936128616333, + "rewards/margins": 6.1853156089782715, + "rewards/rejected": -3.4459545612335205, + "step": 8431 + }, + { + "epoch": 2.11, + "grad_norm": 16.84478759765625, + "learning_rate": 6.214269499735987e-06, + "logits/chosen": -0.42388975620269775, + "logits/rejected": -0.47800830006599426, + "logps/chosen": -59.47319030761719, + "logps/rejected": -89.82095336914062, + "loss": 0.9966, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5577874183654785, + "rewards/margins": 4.189305782318115, + "rewards/rejected": -1.6315183639526367, + "step": 8432 + }, + { + "epoch": 2.11, + "grad_norm": 14.223309516906738, + "learning_rate": 6.213507027291954e-06, + "logits/chosen": -0.49710288643836975, + "logits/rejected": -0.5659599900245667, + "logps/chosen": -53.308292388916016, + "logps/rejected": -91.59196472167969, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.929769992828369, + "rewards/margins": 5.630341053009033, + "rewards/rejected": -2.7005715370178223, + "step": 8433 + }, + { + "epoch": 2.11, + "grad_norm": 6.14691162109375, + "learning_rate": 6.212744524860872e-06, + "logits/chosen": -0.39083799719810486, + "logits/rejected": -0.49063488841056824, + "logps/chosen": -67.17476654052734, + "logps/rejected": -87.61091613769531, + "loss": 0.7783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.768113136291504, + "rewards/margins": 4.8319315910339355, + "rewards/rejected": -2.06381893157959, + "step": 8434 + }, + { + "epoch": 2.11, + "grad_norm": 3.8196067810058594, + "learning_rate": 6.211981992461583e-06, + "logits/chosen": -0.5118436217308044, + "logits/rejected": -0.6037393808364868, + "logps/chosen": -52.54490661621094, + "logps/rejected": -105.79027557373047, + "loss": 0.6638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7715792655944824, + "rewards/margins": 6.9021100997924805, + "rewards/rejected": -4.130530834197998, + "step": 8435 + }, + { + "epoch": 2.11, + "grad_norm": 5.2768635749816895, + "learning_rate": 6.2112194301129284e-06, + "logits/chosen": -0.5124154090881348, + "logits/rejected": -0.5934793949127197, + "logps/chosen": -55.70893478393555, + "logps/rejected": -74.51165771484375, + "loss": 0.7321, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.868053674697876, + "rewards/margins": 5.668274879455566, + "rewards/rejected": -2.8002219200134277, + "step": 8436 + }, + { + "epoch": 2.11, + "grad_norm": 4.153006553649902, + "learning_rate": 6.210456837833754e-06, + "logits/chosen": -0.4463857114315033, + "logits/rejected": -0.5008317828178406, + "logps/chosen": -67.09740447998047, + "logps/rejected": -94.3948974609375, + "loss": 0.7305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.250602960586548, + "rewards/margins": 5.42575740814209, + "rewards/rejected": -2.175154685974121, + "step": 8437 + }, + { + "epoch": 2.11, + "grad_norm": 3.4648168087005615, + "learning_rate": 6.209694215642904e-06, + "logits/chosen": -0.5389891266822815, + "logits/rejected": -0.6483768820762634, + "logps/chosen": -58.86623001098633, + "logps/rejected": -93.34281158447266, + "loss": 0.5767, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5148255825042725, + "rewards/margins": 7.140185356140137, + "rewards/rejected": -3.6253602504730225, + "step": 8438 + }, + { + "epoch": 2.11, + "grad_norm": 3.986246109008789, + "learning_rate": 6.208931563559223e-06, + "logits/chosen": -0.5184072852134705, + "logits/rejected": -0.5918875932693481, + "logps/chosen": -48.586612701416016, + "logps/rejected": -95.71308898925781, + "loss": 0.644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5566885471343994, + "rewards/margins": 6.047978401184082, + "rewards/rejected": -3.4912900924682617, + "step": 8439 + }, + { + "epoch": 2.11, + "grad_norm": 3.0639302730560303, + "learning_rate": 6.208168881601556e-06, + "logits/chosen": -0.5466970205307007, + "logits/rejected": -0.5854600071907043, + "logps/chosen": -60.568729400634766, + "logps/rejected": -100.53724670410156, + "loss": 0.8001, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7229273319244385, + "rewards/margins": 6.261803150177002, + "rewards/rejected": -3.5388755798339844, + "step": 8440 + }, + { + "epoch": 2.11, + "grad_norm": 2.8554980754852295, + "learning_rate": 6.207406169788754e-06, + "logits/chosen": -0.4299584925174713, + "logits/rejected": -0.5093302726745605, + "logps/chosen": -50.933006286621094, + "logps/rejected": -93.877685546875, + "loss": 0.6089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1136465072631836, + "rewards/margins": 6.426254749298096, + "rewards/rejected": -3.312608003616333, + "step": 8441 + }, + { + "epoch": 2.11, + "grad_norm": 5.839544773101807, + "learning_rate": 6.206643428139658e-06, + "logits/chosen": -0.43743664026260376, + "logits/rejected": -0.5182914137840271, + "logps/chosen": -52.569679260253906, + "logps/rejected": -92.27644348144531, + "loss": 0.6173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8400726318359375, + "rewards/margins": 6.478359699249268, + "rewards/rejected": -3.63828706741333, + "step": 8442 + }, + { + "epoch": 2.11, + "grad_norm": 6.6429572105407715, + "learning_rate": 6.205880656673121e-06, + "logits/chosen": -0.4277127683162689, + "logits/rejected": -0.5501527786254883, + "logps/chosen": -56.938297271728516, + "logps/rejected": -81.67681121826172, + "loss": 0.7322, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.923354148864746, + "rewards/margins": 5.10606575012207, + "rewards/rejected": -2.182711362838745, + "step": 8443 + }, + { + "epoch": 2.11, + "grad_norm": 6.339224815368652, + "learning_rate": 6.205117855407991e-06, + "logits/chosen": -0.46776169538497925, + "logits/rejected": -0.5749900341033936, + "logps/chosen": -48.06370544433594, + "logps/rejected": -107.55220031738281, + "loss": 0.5326, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.876817464828491, + "rewards/margins": 7.143097877502441, + "rewards/rejected": -4.266280174255371, + "step": 8444 + }, + { + "epoch": 2.11, + "grad_norm": 1.5820379257202148, + "learning_rate": 6.204355024363116e-06, + "logits/chosen": -0.4861454963684082, + "logits/rejected": -0.5953728556632996, + "logps/chosen": -60.74971389770508, + "logps/rejected": -92.10870361328125, + "loss": 0.574, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.033113479614258, + "rewards/margins": 7.288912773132324, + "rewards/rejected": -4.255799293518066, + "step": 8445 + }, + { + "epoch": 2.11, + "grad_norm": 8.336276054382324, + "learning_rate": 6.203592163557347e-06, + "logits/chosen": -0.40183985233306885, + "logits/rejected": -0.50234055519104, + "logps/chosen": -51.169029235839844, + "logps/rejected": -90.67446899414062, + "loss": 0.602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.975961685180664, + "rewards/margins": 5.52778959274292, + "rewards/rejected": -2.551827907562256, + "step": 8446 + }, + { + "epoch": 2.11, + "grad_norm": 5.375890254974365, + "learning_rate": 6.202829273009537e-06, + "logits/chosen": -0.4449966549873352, + "logits/rejected": -0.5564008951187134, + "logps/chosen": -58.18128204345703, + "logps/rejected": -88.58908081054688, + "loss": 0.6847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9752273559570312, + "rewards/margins": 5.643328666687012, + "rewards/rejected": -2.6681015491485596, + "step": 8447 + }, + { + "epoch": 2.11, + "grad_norm": 6.745488166809082, + "learning_rate": 6.202066352738534e-06, + "logits/chosen": -0.49181386828422546, + "logits/rejected": -0.5519811511039734, + "logps/chosen": -45.679649353027344, + "logps/rejected": -85.36505126953125, + "loss": 0.6662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.94342041015625, + "rewards/margins": 5.99894380569458, + "rewards/rejected": -3.055523633956909, + "step": 8448 + }, + { + "epoch": 2.11, + "grad_norm": 4.232004642486572, + "learning_rate": 6.201303402763195e-06, + "logits/chosen": -0.4540846347808838, + "logits/rejected": -0.5151274800300598, + "logps/chosen": -56.45487594604492, + "logps/rejected": -84.95282745361328, + "loss": 0.7223, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7531471252441406, + "rewards/margins": 5.078747749328613, + "rewards/rejected": -2.3256006240844727, + "step": 8449 + }, + { + "epoch": 2.11, + "grad_norm": 21.523012161254883, + "learning_rate": 6.200540423102371e-06, + "logits/chosen": -0.4761492609977722, + "logits/rejected": -0.4648350775241852, + "logps/chosen": -54.85894775390625, + "logps/rejected": -102.85775756835938, + "loss": 0.8554, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8300669193267822, + "rewards/margins": 5.510025978088379, + "rewards/rejected": -2.6799590587615967, + "step": 8450 + }, + { + "epoch": 2.11, + "grad_norm": 13.675671577453613, + "learning_rate": 6.1997774137749145e-06, + "logits/chosen": -0.4845539927482605, + "logits/rejected": -0.5230282545089722, + "logps/chosen": -52.53952407836914, + "logps/rejected": -91.5249252319336, + "loss": 0.6902, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9813289642333984, + "rewards/margins": 4.681867599487305, + "rewards/rejected": -1.700538992881775, + "step": 8451 + }, + { + "epoch": 2.11, + "grad_norm": 5.597290992736816, + "learning_rate": 6.199014374799682e-06, + "logits/chosen": -0.4660947322845459, + "logits/rejected": -0.5110476613044739, + "logps/chosen": -56.89510726928711, + "logps/rejected": -104.71726989746094, + "loss": 0.6639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.11238694190979, + "rewards/margins": 5.282553672790527, + "rewards/rejected": -2.170166492462158, + "step": 8452 + }, + { + "epoch": 2.11, + "grad_norm": 3.290013551712036, + "learning_rate": 6.19825130619553e-06, + "logits/chosen": -0.5118008852005005, + "logits/rejected": -0.5578588247299194, + "logps/chosen": -53.61265182495117, + "logps/rejected": -94.94686126708984, + "loss": 0.7228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1912426948547363, + "rewards/margins": 6.245602130889893, + "rewards/rejected": -3.054359197616577, + "step": 8453 + }, + { + "epoch": 2.11, + "grad_norm": 7.728716850280762, + "learning_rate": 6.197488207981313e-06, + "logits/chosen": -0.4922562837600708, + "logits/rejected": -0.5947843790054321, + "logps/chosen": -62.35724639892578, + "logps/rejected": -76.10000610351562, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.065373182296753, + "rewards/margins": 5.141159534454346, + "rewards/rejected": -2.075786590576172, + "step": 8454 + }, + { + "epoch": 2.12, + "grad_norm": 8.787726402282715, + "learning_rate": 6.1967250801758904e-06, + "logits/chosen": -0.5382143259048462, + "logits/rejected": -0.631850004196167, + "logps/chosen": -50.928741455078125, + "logps/rejected": -99.93864440917969, + "loss": 0.6695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0442304611206055, + "rewards/margins": 6.242591857910156, + "rewards/rejected": -3.1983606815338135, + "step": 8455 + }, + { + "epoch": 2.12, + "grad_norm": 5.790552139282227, + "learning_rate": 6.195961922798116e-06, + "logits/chosen": -0.4890938401222229, + "logits/rejected": -0.5703557729721069, + "logps/chosen": -48.214500427246094, + "logps/rejected": -93.23237609863281, + "loss": 0.6645, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.031667709350586, + "rewards/margins": 6.349152565002441, + "rewards/rejected": -3.3174853324890137, + "step": 8456 + }, + { + "epoch": 2.12, + "grad_norm": 3.0400989055633545, + "learning_rate": 6.1951987358668516e-06, + "logits/chosen": -0.5170869827270508, + "logits/rejected": -0.6419655084609985, + "logps/chosen": -60.69398498535156, + "logps/rejected": -86.60095977783203, + "loss": 0.6005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.377423048019409, + "rewards/margins": 6.731094837188721, + "rewards/rejected": -3.3536717891693115, + "step": 8457 + }, + { + "epoch": 2.12, + "grad_norm": 3.3720297813415527, + "learning_rate": 6.194435519400955e-06, + "logits/chosen": -0.49061906337738037, + "logits/rejected": -0.5707305669784546, + "logps/chosen": -62.40060043334961, + "logps/rejected": -91.03877258300781, + "loss": 0.6864, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0501723289489746, + "rewards/margins": 6.048795223236084, + "rewards/rejected": -2.9986228942871094, + "step": 8458 + }, + { + "epoch": 2.12, + "grad_norm": 6.568853378295898, + "learning_rate": 6.193672273419286e-06, + "logits/chosen": -0.4975486993789673, + "logits/rejected": -0.5792877078056335, + "logps/chosen": -56.60480499267578, + "logps/rejected": -84.43315887451172, + "loss": 0.7655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0834262371063232, + "rewards/margins": 5.03139591217041, + "rewards/rejected": -1.947969913482666, + "step": 8459 + }, + { + "epoch": 2.12, + "grad_norm": 1.7915620803833008, + "learning_rate": 6.192908997940706e-06, + "logits/chosen": -0.43980199098587036, + "logits/rejected": -0.5441663265228271, + "logps/chosen": -57.5228271484375, + "logps/rejected": -112.69107818603516, + "loss": 0.5618, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026620864868164, + "rewards/margins": 7.388079643249512, + "rewards/rejected": -4.361458778381348, + "step": 8460 + }, + { + "epoch": 2.12, + "grad_norm": 5.478672504425049, + "learning_rate": 6.192145692984075e-06, + "logits/chosen": -0.5899427533149719, + "logits/rejected": -0.6504544615745544, + "logps/chosen": -56.00929260253906, + "logps/rejected": -98.16236877441406, + "loss": 0.7156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.186479091644287, + "rewards/margins": 5.441834449768066, + "rewards/rejected": -2.255354881286621, + "step": 8461 + }, + { + "epoch": 2.12, + "grad_norm": 3.4907119274139404, + "learning_rate": 6.191382358568256e-06, + "logits/chosen": -0.5021907687187195, + "logits/rejected": -0.5675248503684998, + "logps/chosen": -64.02276611328125, + "logps/rejected": -91.541748046875, + "loss": 0.7361, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8962931632995605, + "rewards/margins": 5.867220401763916, + "rewards/rejected": -2.9709267616271973, + "step": 8462 + }, + { + "epoch": 2.12, + "grad_norm": 3.4242804050445557, + "learning_rate": 6.190618994712112e-06, + "logits/chosen": -0.5348231792449951, + "logits/rejected": -0.6149476766586304, + "logps/chosen": -51.76735305786133, + "logps/rejected": -79.530517578125, + "loss": 0.6753, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1179749965667725, + "rewards/margins": 5.4459004402160645, + "rewards/rejected": -2.327924966812134, + "step": 8463 + }, + { + "epoch": 2.12, + "grad_norm": 5.897436141967773, + "learning_rate": 6.189855601434508e-06, + "logits/chosen": -0.4841190576553345, + "logits/rejected": -0.6206007599830627, + "logps/chosen": -55.375404357910156, + "logps/rejected": -74.54826354980469, + "loss": 0.595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.911742687225342, + "rewards/margins": 5.932391166687012, + "rewards/rejected": -3.020648241043091, + "step": 8464 + }, + { + "epoch": 2.12, + "grad_norm": 6.683719158172607, + "learning_rate": 6.189092178754303e-06, + "logits/chosen": -0.4806307554244995, + "logits/rejected": -0.5302150249481201, + "logps/chosen": -54.21635437011719, + "logps/rejected": -95.75863647460938, + "loss": 0.7218, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.686511754989624, + "rewards/margins": 4.477591037750244, + "rewards/rejected": -1.791078805923462, + "step": 8465 + }, + { + "epoch": 2.12, + "grad_norm": 6.045158386230469, + "learning_rate": 6.188328726690369e-06, + "logits/chosen": -0.45099103450775146, + "logits/rejected": -0.5078601837158203, + "logps/chosen": -49.739994049072266, + "logps/rejected": -110.75186157226562, + "loss": 0.5591, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6901752948760986, + "rewards/margins": 5.8349809646606445, + "rewards/rejected": -3.144805431365967, + "step": 8466 + }, + { + "epoch": 2.12, + "grad_norm": 5.497316360473633, + "learning_rate": 6.187565245261566e-06, + "logits/chosen": -0.5072659850120544, + "logits/rejected": -0.5549317002296448, + "logps/chosen": -53.89529037475586, + "logps/rejected": -107.0787353515625, + "loss": 0.7076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7644848823547363, + "rewards/margins": 4.779428005218506, + "rewards/rejected": -2.0149428844451904, + "step": 8467 + }, + { + "epoch": 2.12, + "grad_norm": 4.563650608062744, + "learning_rate": 6.186801734486762e-06, + "logits/chosen": -0.49700847268104553, + "logits/rejected": -0.5833350419998169, + "logps/chosen": -57.43574523925781, + "logps/rejected": -91.8674087524414, + "loss": 0.7517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7526752948760986, + "rewards/margins": 5.55790901184082, + "rewards/rejected": -2.805233955383301, + "step": 8468 + }, + { + "epoch": 2.12, + "grad_norm": 2.8723247051239014, + "learning_rate": 6.186038194384828e-06, + "logits/chosen": -0.43578484654426575, + "logits/rejected": -0.5506529808044434, + "logps/chosen": -52.31589126586914, + "logps/rejected": -91.04496765136719, + "loss": 0.5707, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9258360862731934, + "rewards/margins": 6.124059677124023, + "rewards/rejected": -3.19822359085083, + "step": 8469 + }, + { + "epoch": 2.12, + "grad_norm": 3.6419224739074707, + "learning_rate": 6.185274624974627e-06, + "logits/chosen": -0.37496188282966614, + "logits/rejected": -0.42309191823005676, + "logps/chosen": -67.43034362792969, + "logps/rejected": -91.84099578857422, + "loss": 0.64, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9985897541046143, + "rewards/margins": 5.074376106262207, + "rewards/rejected": -2.0757861137390137, + "step": 8470 + }, + { + "epoch": 2.12, + "grad_norm": 5.283976078033447, + "learning_rate": 6.184511026275027e-06, + "logits/chosen": -0.4912407100200653, + "logits/rejected": -0.628830075263977, + "logps/chosen": -60.9989013671875, + "logps/rejected": -89.0773696899414, + "loss": 0.7286, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.946655035018921, + "rewards/margins": 5.670918941497803, + "rewards/rejected": -2.724263906478882, + "step": 8471 + }, + { + "epoch": 2.12, + "grad_norm": 2.8035287857055664, + "learning_rate": 6.183747398304902e-06, + "logits/chosen": -0.45226290822029114, + "logits/rejected": -0.5664443373680115, + "logps/chosen": -59.86665344238281, + "logps/rejected": -86.21694946289062, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8993124961853027, + "rewards/margins": 6.004315376281738, + "rewards/rejected": -3.1050033569335938, + "step": 8472 + }, + { + "epoch": 2.12, + "grad_norm": 4.098289966583252, + "learning_rate": 6.18298374108312e-06, + "logits/chosen": -0.5057569742202759, + "logits/rejected": -0.5733422040939331, + "logps/chosen": -53.04794692993164, + "logps/rejected": -85.43539428710938, + "loss": 0.7195, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0026330947875977, + "rewards/margins": 5.156016826629639, + "rewards/rejected": -2.153383731842041, + "step": 8473 + }, + { + "epoch": 2.12, + "grad_norm": 8.296348571777344, + "learning_rate": 6.182220054628552e-06, + "logits/chosen": -0.4945881962776184, + "logits/rejected": -0.4951975345611572, + "logps/chosen": -59.479339599609375, + "logps/rejected": -100.12711334228516, + "loss": 0.8741, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7497200965881348, + "rewards/margins": 3.7554495334625244, + "rewards/rejected": -1.0057302713394165, + "step": 8474 + }, + { + "epoch": 2.12, + "grad_norm": 8.728062629699707, + "learning_rate": 6.181456338960067e-06, + "logits/chosen": -0.48431599140167236, + "logits/rejected": -0.4601273238658905, + "logps/chosen": -50.17144775390625, + "logps/rejected": -109.83301544189453, + "loss": 0.6935, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1267807483673096, + "rewards/margins": 4.781798362731934, + "rewards/rejected": -1.6550172567367554, + "step": 8475 + }, + { + "epoch": 2.12, + "grad_norm": 11.726463317871094, + "learning_rate": 6.180692594096539e-06, + "logits/chosen": -0.5527217388153076, + "logits/rejected": -0.6473602056503296, + "logps/chosen": -56.41339874267578, + "logps/rejected": -82.45864868164062, + "loss": 0.8192, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5290002822875977, + "rewards/margins": 5.020054340362549, + "rewards/rejected": -2.491053581237793, + "step": 8476 + }, + { + "epoch": 2.12, + "grad_norm": 4.384295463562012, + "learning_rate": 6.179928820056842e-06, + "logits/chosen": -0.49231040477752686, + "logits/rejected": -0.6135548949241638, + "logps/chosen": -67.79463958740234, + "logps/rejected": -91.78226470947266, + "loss": 0.7943, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7991104125976562, + "rewards/margins": 5.450055122375488, + "rewards/rejected": -2.650944948196411, + "step": 8477 + }, + { + "epoch": 2.12, + "grad_norm": 4.319016456604004, + "learning_rate": 6.1791650168598505e-06, + "logits/chosen": -0.43583354353904724, + "logits/rejected": -0.5201708078384399, + "logps/chosen": -59.9007453918457, + "logps/rejected": -94.15705108642578, + "loss": 0.6764, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9020016193389893, + "rewards/margins": 5.9159698486328125, + "rewards/rejected": -3.0139684677124023, + "step": 8478 + }, + { + "epoch": 2.12, + "grad_norm": 6.879019260406494, + "learning_rate": 6.1784011845244354e-06, + "logits/chosen": -0.49600130319595337, + "logits/rejected": -0.5541064739227295, + "logps/chosen": -56.479759216308594, + "logps/rejected": -95.7088623046875, + "loss": 0.7177, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6364006996154785, + "rewards/margins": 6.120308876037598, + "rewards/rejected": -3.483907699584961, + "step": 8479 + }, + { + "epoch": 2.12, + "grad_norm": 3.537327289581299, + "learning_rate": 6.177637323069473e-06, + "logits/chosen": -0.4789845645427704, + "logits/rejected": -0.5734531879425049, + "logps/chosen": -61.45538330078125, + "logps/rejected": -96.91266632080078, + "loss": 0.6847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.062932014465332, + "rewards/margins": 6.100340843200684, + "rewards/rejected": -3.0374088287353516, + "step": 8480 + }, + { + "epoch": 2.12, + "grad_norm": 6.4062180519104, + "learning_rate": 6.176873432513841e-06, + "logits/chosen": -0.5377483367919922, + "logits/rejected": -0.634166419506073, + "logps/chosen": -56.88039779663086, + "logps/rejected": -85.41731262207031, + "loss": 0.6888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6992461681365967, + "rewards/margins": 5.369385242462158, + "rewards/rejected": -2.6701395511627197, + "step": 8481 + }, + { + "epoch": 2.12, + "grad_norm": 3.3794775009155273, + "learning_rate": 6.176109512876414e-06, + "logits/chosen": -0.4654102921485901, + "logits/rejected": -0.5226922631263733, + "logps/chosen": -48.20780944824219, + "logps/rejected": -102.35962677001953, + "loss": 0.5912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1941351890563965, + "rewards/margins": 6.44379186630249, + "rewards/rejected": -3.249656915664673, + "step": 8482 + }, + { + "epoch": 2.12, + "grad_norm": 6.574061393737793, + "learning_rate": 6.1753455641760716e-06, + "logits/chosen": -0.4485991299152374, + "logits/rejected": -0.559107780456543, + "logps/chosen": -63.683250427246094, + "logps/rejected": -84.98746490478516, + "loss": 0.7546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8348357677459717, + "rewards/margins": 5.552781581878662, + "rewards/rejected": -2.7179455757141113, + "step": 8483 + }, + { + "epoch": 2.12, + "grad_norm": 4.512369155883789, + "learning_rate": 6.174581586431688e-06, + "logits/chosen": -0.4204254746437073, + "logits/rejected": -0.5664490461349487, + "logps/chosen": -56.665321350097656, + "logps/rejected": -84.0681381225586, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1063716411590576, + "rewards/margins": 6.201329231262207, + "rewards/rejected": -3.0949578285217285, + "step": 8484 + }, + { + "epoch": 2.12, + "grad_norm": 7.8823699951171875, + "learning_rate": 6.173817579662145e-06, + "logits/chosen": -0.4395487904548645, + "logits/rejected": -0.5130597352981567, + "logps/chosen": -49.996891021728516, + "logps/rejected": -114.1205062866211, + "loss": 0.6865, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.782684326171875, + "rewards/margins": 6.906401634216309, + "rewards/rejected": -4.123717308044434, + "step": 8485 + }, + { + "epoch": 2.12, + "grad_norm": 3.8600385189056396, + "learning_rate": 6.17305354388632e-06, + "logits/chosen": -0.4597872495651245, + "logits/rejected": -0.5724353194236755, + "logps/chosen": -57.40937805175781, + "logps/rejected": -96.54013061523438, + "loss": 0.6152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1431901454925537, + "rewards/margins": 6.745906829833984, + "rewards/rejected": -3.602717161178589, + "step": 8486 + }, + { + "epoch": 2.12, + "grad_norm": 4.857093811035156, + "learning_rate": 6.1722894791230966e-06, + "logits/chosen": -0.5318623781204224, + "logits/rejected": -0.5711783766746521, + "logps/chosen": -48.04677963256836, + "logps/rejected": -93.99644470214844, + "loss": 0.6003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.208923578262329, + "rewards/margins": 5.539403915405273, + "rewards/rejected": -2.3304800987243652, + "step": 8487 + }, + { + "epoch": 2.12, + "grad_norm": 4.374650478363037, + "learning_rate": 6.1715253853913514e-06, + "logits/chosen": -0.5338177680969238, + "logits/rejected": -0.6173532605171204, + "logps/chosen": -53.253997802734375, + "logps/rejected": -90.59795379638672, + "loss": 0.6791, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4262144565582275, + "rewards/margins": 6.186349868774414, + "rewards/rejected": -2.7601358890533447, + "step": 8488 + }, + { + "epoch": 2.12, + "grad_norm": 7.739315986633301, + "learning_rate": 6.170761262709969e-06, + "logits/chosen": -0.49954843521118164, + "logits/rejected": -0.5265587568283081, + "logps/chosen": -51.90909194946289, + "logps/rejected": -96.89517211914062, + "loss": 0.7788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.114882469177246, + "rewards/margins": 4.319159984588623, + "rewards/rejected": -1.2042770385742188, + "step": 8489 + }, + { + "epoch": 2.12, + "grad_norm": 3.9872164726257324, + "learning_rate": 6.169997111097829e-06, + "logits/chosen": -0.5324233770370483, + "logits/rejected": -0.538565993309021, + "logps/chosen": -50.25886917114258, + "logps/rejected": -108.82447814941406, + "loss": 0.6726, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094973564147949, + "rewards/margins": 5.703070163726807, + "rewards/rejected": -2.6080968379974365, + "step": 8490 + }, + { + "epoch": 2.12, + "grad_norm": 4.438342571258545, + "learning_rate": 6.169232930573817e-06, + "logits/chosen": -0.5266301035881042, + "logits/rejected": -0.5598745942115784, + "logps/chosen": -41.65110778808594, + "logps/rejected": -99.6647720336914, + "loss": 0.6179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0948400497436523, + "rewards/margins": 6.647726535797119, + "rewards/rejected": -3.552886962890625, + "step": 8491 + }, + { + "epoch": 2.12, + "grad_norm": 3.202791929244995, + "learning_rate": 6.168468721156816e-06, + "logits/chosen": -0.5689311027526855, + "logits/rejected": -0.6480718851089478, + "logps/chosen": -49.698211669921875, + "logps/rejected": -100.9547119140625, + "loss": 0.6861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2304677963256836, + "rewards/margins": 7.182177543640137, + "rewards/rejected": -3.9517099857330322, + "step": 8492 + }, + { + "epoch": 2.12, + "grad_norm": 5.736093044281006, + "learning_rate": 6.167704482865709e-06, + "logits/chosen": -0.3696713149547577, + "logits/rejected": -0.4325857162475586, + "logps/chosen": -55.29071807861328, + "logps/rejected": -99.06886291503906, + "loss": 0.7349, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.876887798309326, + "rewards/margins": 5.477005958557129, + "rewards/rejected": -2.600118398666382, + "step": 8493 + }, + { + "epoch": 2.12, + "grad_norm": 3.076298475265503, + "learning_rate": 6.1669402157193835e-06, + "logits/chosen": -0.49085533618927, + "logits/rejected": -0.5163204669952393, + "logps/chosen": -49.36495590209961, + "logps/rejected": -79.48809814453125, + "loss": 0.6564, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.151247262954712, + "rewards/margins": 4.631359577178955, + "rewards/rejected": -1.4801125526428223, + "step": 8494 + }, + { + "epoch": 2.13, + "grad_norm": 6.682573318481445, + "learning_rate": 6.166175919736723e-06, + "logits/chosen": -0.5289319157600403, + "logits/rejected": -0.6021636724472046, + "logps/chosen": -64.44463348388672, + "logps/rejected": -101.40072631835938, + "loss": 0.741, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8610775470733643, + "rewards/margins": 5.817712306976318, + "rewards/rejected": -2.956634283065796, + "step": 8495 + }, + { + "epoch": 2.13, + "grad_norm": 4.498961925506592, + "learning_rate": 6.165411594936614e-06, + "logits/chosen": -0.5153245329856873, + "logits/rejected": -0.5703901648521423, + "logps/chosen": -58.64276885986328, + "logps/rejected": -95.29548645019531, + "loss": 0.695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.060659170150757, + "rewards/margins": 5.578383445739746, + "rewards/rejected": -2.51772403717041, + "step": 8496 + }, + { + "epoch": 2.13, + "grad_norm": 5.21584415435791, + "learning_rate": 6.164647241337947e-06, + "logits/chosen": -0.4404241740703583, + "logits/rejected": -0.5087444186210632, + "logps/chosen": -55.237815856933594, + "logps/rejected": -92.49391174316406, + "loss": 0.6462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.189051389694214, + "rewards/margins": 5.146535873413086, + "rewards/rejected": -1.9574838876724243, + "step": 8497 + }, + { + "epoch": 2.13, + "grad_norm": 24.164804458618164, + "learning_rate": 6.163882858959608e-06, + "logits/chosen": -0.5526745915412903, + "logits/rejected": -0.5951623916625977, + "logps/chosen": -48.91265869140625, + "logps/rejected": -95.6514663696289, + "loss": 0.7686, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7304775714874268, + "rewards/margins": 5.488927364349365, + "rewards/rejected": -2.7584497928619385, + "step": 8498 + }, + { + "epoch": 2.13, + "grad_norm": 5.446592807769775, + "learning_rate": 6.163118447820484e-06, + "logits/chosen": -0.4405186176300049, + "logits/rejected": -0.5592795014381409, + "logps/chosen": -64.68896484375, + "logps/rejected": -103.33949279785156, + "loss": 0.7063, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7653584480285645, + "rewards/margins": 6.771117687225342, + "rewards/rejected": -4.005759239196777, + "step": 8499 + }, + { + "epoch": 2.13, + "grad_norm": 4.4632649421691895, + "learning_rate": 6.162354007939467e-06, + "logits/chosen": -0.4750525653362274, + "logits/rejected": -0.5748559236526489, + "logps/chosen": -62.06591033935547, + "logps/rejected": -107.65290069580078, + "loss": 0.7128, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8302338123321533, + "rewards/margins": 5.782902240753174, + "rewards/rejected": -2.9526681900024414, + "step": 8500 + }, + { + "epoch": 2.13, + "grad_norm": 8.738640785217285, + "learning_rate": 6.161589539335446e-06, + "logits/chosen": -0.5687330365180969, + "logits/rejected": -0.5975252389907837, + "logps/chosen": -50.66826629638672, + "logps/rejected": -99.0995101928711, + "loss": 0.7843, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.809903383255005, + "rewards/margins": 5.297994613647461, + "rewards/rejected": -2.488090991973877, + "step": 8501 + }, + { + "epoch": 2.13, + "grad_norm": 6.818868637084961, + "learning_rate": 6.160825042027312e-06, + "logits/chosen": -0.555950403213501, + "logits/rejected": -0.5948446989059448, + "logps/chosen": -52.99602127075195, + "logps/rejected": -86.01823425292969, + "loss": 0.7339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0044567584991455, + "rewards/margins": 5.730490684509277, + "rewards/rejected": -2.726034164428711, + "step": 8502 + }, + { + "epoch": 2.13, + "grad_norm": 9.169031143188477, + "learning_rate": 6.160060516033957e-06, + "logits/chosen": -0.4735925495624542, + "logits/rejected": -0.5497219562530518, + "logps/chosen": -53.16239547729492, + "logps/rejected": -90.45191192626953, + "loss": 0.7727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.768170118331909, + "rewards/margins": 5.332481861114502, + "rewards/rejected": -2.5643112659454346, + "step": 8503 + }, + { + "epoch": 2.13, + "grad_norm": 6.019094944000244, + "learning_rate": 6.159295961374272e-06, + "logits/chosen": -0.5705262422561646, + "logits/rejected": -0.651928186416626, + "logps/chosen": -52.39929962158203, + "logps/rejected": -88.90599822998047, + "loss": 0.6985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.111201286315918, + "rewards/margins": 4.92340087890625, + "rewards/rejected": -1.8121989965438843, + "step": 8504 + }, + { + "epoch": 2.13, + "grad_norm": 3.728388786315918, + "learning_rate": 6.158531378067151e-06, + "logits/chosen": -0.4927169680595398, + "logits/rejected": -0.5840212106704712, + "logps/chosen": -57.20657730102539, + "logps/rejected": -78.41505432128906, + "loss": 0.7339, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.850207805633545, + "rewards/margins": 5.170051574707031, + "rewards/rejected": -2.3198437690734863, + "step": 8505 + }, + { + "epoch": 2.13, + "grad_norm": 4.93502140045166, + "learning_rate": 6.157766766131486e-06, + "logits/chosen": -0.41214168071746826, + "logits/rejected": -0.5001401305198669, + "logps/chosen": -62.392059326171875, + "logps/rejected": -96.80887603759766, + "loss": 0.6984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9068334102630615, + "rewards/margins": 5.9529876708984375, + "rewards/rejected": -3.046154737472534, + "step": 8506 + }, + { + "epoch": 2.13, + "grad_norm": 3.4457249641418457, + "learning_rate": 6.157002125586175e-06, + "logits/chosen": -0.3775818347930908, + "logits/rejected": -0.509433388710022, + "logps/chosen": -69.785400390625, + "logps/rejected": -96.4117202758789, + "loss": 0.6044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9271793365478516, + "rewards/margins": 6.416243553161621, + "rewards/rejected": -3.4890642166137695, + "step": 8507 + }, + { + "epoch": 2.13, + "grad_norm": 3.7479827404022217, + "learning_rate": 6.156237456450109e-06, + "logits/chosen": -0.44660213589668274, + "logits/rejected": -0.5538508892059326, + "logps/chosen": -55.710182189941406, + "logps/rejected": -94.77882385253906, + "loss": 0.6375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.312567949295044, + "rewards/margins": 5.501652717590332, + "rewards/rejected": -2.189085006713867, + "step": 8508 + }, + { + "epoch": 2.13, + "grad_norm": 5.959882736206055, + "learning_rate": 6.155472758742187e-06, + "logits/chosen": -0.4518167972564697, + "logits/rejected": -0.5632860064506531, + "logps/chosen": -51.234703063964844, + "logps/rejected": -81.16925048828125, + "loss": 0.6958, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0564699172973633, + "rewards/margins": 5.342971324920654, + "rewards/rejected": -2.2865021228790283, + "step": 8509 + }, + { + "epoch": 2.13, + "grad_norm": 3.8986246585845947, + "learning_rate": 6.1547080324813045e-06, + "logits/chosen": -0.501355767250061, + "logits/rejected": -0.5890025496482849, + "logps/chosen": -54.10703659057617, + "logps/rejected": -79.92820739746094, + "loss": 0.6708, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1437184810638428, + "rewards/margins": 5.837992191314697, + "rewards/rejected": -2.6942734718322754, + "step": 8510 + }, + { + "epoch": 2.13, + "grad_norm": 7.339400768280029, + "learning_rate": 6.1539432776863566e-06, + "logits/chosen": -0.4389030933380127, + "logits/rejected": -0.5084364414215088, + "logps/chosen": -53.338096618652344, + "logps/rejected": -91.60282897949219, + "loss": 0.7438, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.101580858230591, + "rewards/margins": 5.800361633300781, + "rewards/rejected": -2.6987810134887695, + "step": 8511 + }, + { + "epoch": 2.13, + "grad_norm": 5.333484649658203, + "learning_rate": 6.153178494376244e-06, + "logits/chosen": -0.5263314843177795, + "logits/rejected": -0.6248616576194763, + "logps/chosen": -64.353759765625, + "logps/rejected": -77.15337371826172, + "loss": 0.8551, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.793599843978882, + "rewards/margins": 5.862142562866211, + "rewards/rejected": -3.068542957305908, + "step": 8512 + }, + { + "epoch": 2.13, + "grad_norm": 2.5220677852630615, + "learning_rate": 6.1524136825698635e-06, + "logits/chosen": -0.6036094427108765, + "logits/rejected": -0.703788161277771, + "logps/chosen": -41.67342758178711, + "logps/rejected": -86.53506469726562, + "loss": 0.5437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.091127634048462, + "rewards/margins": 6.036232948303223, + "rewards/rejected": -2.9451050758361816, + "step": 8513 + }, + { + "epoch": 2.13, + "grad_norm": 4.586297512054443, + "learning_rate": 6.1516488422861155e-06, + "logits/chosen": -0.4633066952228546, + "logits/rejected": -0.5049384236335754, + "logps/chosen": -75.28347778320312, + "logps/rejected": -104.97042846679688, + "loss": 0.7162, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.904364824295044, + "rewards/margins": 6.378851413726807, + "rewards/rejected": -3.474486827850342, + "step": 8514 + }, + { + "epoch": 2.13, + "grad_norm": 3.777214765548706, + "learning_rate": 6.150883973543901e-06, + "logits/chosen": -0.4557473957538605, + "logits/rejected": -0.5843390226364136, + "logps/chosen": -57.768516540527344, + "logps/rejected": -81.79512786865234, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2164125442504883, + "rewards/margins": 5.586869239807129, + "rewards/rejected": -2.3704562187194824, + "step": 8515 + }, + { + "epoch": 2.13, + "grad_norm": 4.5317792892456055, + "learning_rate": 6.1501190763621175e-06, + "logits/chosen": -0.48880326747894287, + "logits/rejected": -0.5246143341064453, + "logps/chosen": -60.48057556152344, + "logps/rejected": -91.66584777832031, + "loss": 0.7637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.939663887023926, + "rewards/margins": 4.945521831512451, + "rewards/rejected": -2.0058584213256836, + "step": 8516 + }, + { + "epoch": 2.13, + "grad_norm": 39.19050216674805, + "learning_rate": 6.149354150759669e-06, + "logits/chosen": -0.43621817231178284, + "logits/rejected": -0.5346652865409851, + "logps/chosen": -54.839393615722656, + "logps/rejected": -89.1630630493164, + "loss": 0.6978, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7745018005371094, + "rewards/margins": 4.966779708862305, + "rewards/rejected": -2.1922779083251953, + "step": 8517 + }, + { + "epoch": 2.13, + "grad_norm": 9.687295913696289, + "learning_rate": 6.148589196755455e-06, + "logits/chosen": -0.4534960389137268, + "logits/rejected": -0.5688712000846863, + "logps/chosen": -55.35963821411133, + "logps/rejected": -83.43607330322266, + "loss": 0.6867, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.900750160217285, + "rewards/margins": 5.909787654876709, + "rewards/rejected": -3.009037733078003, + "step": 8518 + }, + { + "epoch": 2.13, + "grad_norm": 13.718449592590332, + "learning_rate": 6.147824214368383e-06, + "logits/chosen": -0.49206864833831787, + "logits/rejected": -0.6229990720748901, + "logps/chosen": -66.9603500366211, + "logps/rejected": -85.32608795166016, + "loss": 0.8152, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2108893394470215, + "rewards/margins": 5.61063289642334, + "rewards/rejected": -2.3997433185577393, + "step": 8519 + }, + { + "epoch": 2.13, + "grad_norm": 29.345670700073242, + "learning_rate": 6.147059203617353e-06, + "logits/chosen": -0.5010194778442383, + "logits/rejected": -0.561152994632721, + "logps/chosen": -52.74908447265625, + "logps/rejected": -94.54090881347656, + "loss": 0.8897, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.972804307937622, + "rewards/margins": 4.437189102172852, + "rewards/rejected": -1.4643847942352295, + "step": 8520 + }, + { + "epoch": 2.13, + "grad_norm": 4.4342474937438965, + "learning_rate": 6.14629416452127e-06, + "logits/chosen": -0.43605151772499084, + "logits/rejected": -0.4762552082538605, + "logps/chosen": -51.693931579589844, + "logps/rejected": -95.18797302246094, + "loss": 0.6426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0297505855560303, + "rewards/margins": 4.6880879402160645, + "rewards/rejected": -1.6583375930786133, + "step": 8521 + }, + { + "epoch": 2.13, + "grad_norm": 8.115860939025879, + "learning_rate": 6.145529097099039e-06, + "logits/chosen": -0.46126192808151245, + "logits/rejected": -0.539975106716156, + "logps/chosen": -49.02485656738281, + "logps/rejected": -86.74235534667969, + "loss": 0.6744, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0607290267944336, + "rewards/margins": 5.721017360687256, + "rewards/rejected": -2.6602883338928223, + "step": 8522 + }, + { + "epoch": 2.13, + "grad_norm": 4.385700225830078, + "learning_rate": 6.144764001369565e-06, + "logits/chosen": -0.4939442276954651, + "logits/rejected": -0.5658490657806396, + "logps/chosen": -56.137245178222656, + "logps/rejected": -91.5281753540039, + "loss": 0.6949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049299716949463, + "rewards/margins": 6.080353260040283, + "rewards/rejected": -3.031053304672241, + "step": 8523 + }, + { + "epoch": 2.13, + "grad_norm": 6.525774955749512, + "learning_rate": 6.143998877351756e-06, + "logits/chosen": -0.45461201667785645, + "logits/rejected": -0.5690008997917175, + "logps/chosen": -71.13838958740234, + "logps/rejected": -93.38737487792969, + "loss": 0.7416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.23551869392395, + "rewards/margins": 6.531272888183594, + "rewards/rejected": -3.2957541942596436, + "step": 8524 + }, + { + "epoch": 2.13, + "grad_norm": 3.0990097522735596, + "learning_rate": 6.143233725064516e-06, + "logits/chosen": -0.4842122495174408, + "logits/rejected": -0.602534294128418, + "logps/chosen": -59.922767639160156, + "logps/rejected": -102.24664306640625, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.681760311126709, + "rewards/margins": 6.678659915924072, + "rewards/rejected": -3.996898889541626, + "step": 8525 + }, + { + "epoch": 2.13, + "grad_norm": 4.309584140777588, + "learning_rate": 6.142468544526757e-06, + "logits/chosen": -0.5100357532501221, + "logits/rejected": -0.606667697429657, + "logps/chosen": -49.88842010498047, + "logps/rejected": -104.74011993408203, + "loss": 0.6168, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9410362243652344, + "rewards/margins": 6.354724884033203, + "rewards/rejected": -3.4136886596679688, + "step": 8526 + }, + { + "epoch": 2.13, + "grad_norm": 3.4717044830322266, + "learning_rate": 6.141703335757383e-06, + "logits/chosen": -0.43123680353164673, + "logits/rejected": -0.5549027323722839, + "logps/chosen": -51.70136642456055, + "logps/rejected": -98.77926635742188, + "loss": 0.5044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1563994884490967, + "rewards/margins": 7.3030500411987305, + "rewards/rejected": -4.146650314331055, + "step": 8527 + }, + { + "epoch": 2.13, + "grad_norm": 3.255512237548828, + "learning_rate": 6.140938098775306e-06, + "logits/chosen": -0.5548138618469238, + "logits/rejected": -0.6371105909347534, + "logps/chosen": -48.099853515625, + "logps/rejected": -104.70745849609375, + "loss": 0.5934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0972888469696045, + "rewards/margins": 5.736748695373535, + "rewards/rejected": -2.6394598484039307, + "step": 8528 + }, + { + "epoch": 2.13, + "grad_norm": 9.248051643371582, + "learning_rate": 6.140172833599436e-06, + "logits/chosen": -0.4696120321750641, + "logits/rejected": -0.5504676103591919, + "logps/chosen": -61.605224609375, + "logps/rejected": -110.24601745605469, + "loss": 0.6414, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8271853923797607, + "rewards/margins": 6.895082473754883, + "rewards/rejected": -4.067897319793701, + "step": 8529 + }, + { + "epoch": 2.13, + "grad_norm": 2.9990811347961426, + "learning_rate": 6.139407540248682e-06, + "logits/chosen": -0.44683367013931274, + "logits/rejected": -0.5009213089942932, + "logps/chosen": -44.88365936279297, + "logps/rejected": -89.34526062011719, + "loss": 0.5955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0691683292388916, + "rewards/margins": 5.618226051330566, + "rewards/rejected": -2.5490572452545166, + "step": 8530 + }, + { + "epoch": 2.13, + "grad_norm": 4.947289943695068, + "learning_rate": 6.138642218741955e-06, + "logits/chosen": -0.40260612964630127, + "logits/rejected": -0.5264779925346375, + "logps/chosen": -56.25585174560547, + "logps/rejected": -94.41966247558594, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8587427139282227, + "rewards/margins": 6.284916877746582, + "rewards/rejected": -3.4261741638183594, + "step": 8531 + }, + { + "epoch": 2.13, + "grad_norm": 4.465793132781982, + "learning_rate": 6.1378768690981686e-06, + "logits/chosen": -0.46492552757263184, + "logits/rejected": -0.5402534008026123, + "logps/chosen": -49.55498123168945, + "logps/rejected": -91.93964385986328, + "loss": 0.6814, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.950953722000122, + "rewards/margins": 5.117856502532959, + "rewards/rejected": -2.166902780532837, + "step": 8532 + }, + { + "epoch": 2.13, + "grad_norm": 11.334491729736328, + "learning_rate": 6.137111491336234e-06, + "logits/chosen": -0.513978898525238, + "logits/rejected": -0.5353794097900391, + "logps/chosen": -53.488014221191406, + "logps/rejected": -87.91214752197266, + "loss": 0.7539, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.13236403465271, + "rewards/margins": 4.392268657684326, + "rewards/rejected": -1.2599042654037476, + "step": 8533 + }, + { + "epoch": 2.13, + "grad_norm": 16.643526077270508, + "learning_rate": 6.1363460854750646e-06, + "logits/chosen": -0.4242280423641205, + "logits/rejected": -0.4760749340057373, + "logps/chosen": -61.551692962646484, + "logps/rejected": -86.62918853759766, + "loss": 1.0463, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.908863067626953, + "rewards/margins": 4.249141216278076, + "rewards/rejected": -1.340278148651123, + "step": 8534 + }, + { + "epoch": 2.14, + "grad_norm": 8.194181442260742, + "learning_rate": 6.135580651533576e-06, + "logits/chosen": -0.5203700661659241, + "logits/rejected": -0.6079673171043396, + "logps/chosen": -59.1375732421875, + "logps/rejected": -84.34506225585938, + "loss": 0.7284, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.111893653869629, + "rewards/margins": 5.520001411437988, + "rewards/rejected": -2.4081077575683594, + "step": 8535 + }, + { + "epoch": 2.14, + "grad_norm": 6.135183811187744, + "learning_rate": 6.13481518953068e-06, + "logits/chosen": -0.46252524852752686, + "logits/rejected": -0.47191816568374634, + "logps/chosen": -63.951419830322266, + "logps/rejected": -101.74125671386719, + "loss": 0.721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799222469329834, + "rewards/margins": 4.800464153289795, + "rewards/rejected": -2.00124192237854, + "step": 8536 + }, + { + "epoch": 2.14, + "grad_norm": 3.802492380142212, + "learning_rate": 6.134049699485294e-06, + "logits/chosen": -0.5259539484977722, + "logits/rejected": -0.620881974697113, + "logps/chosen": -50.52061462402344, + "logps/rejected": -102.89389038085938, + "loss": 0.6547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.947052240371704, + "rewards/margins": 7.058120250701904, + "rewards/rejected": -4.111068248748779, + "step": 8537 + }, + { + "epoch": 2.14, + "grad_norm": 6.760519504547119, + "learning_rate": 6.133284181416335e-06, + "logits/chosen": -0.4596843123435974, + "logits/rejected": -0.5484127998352051, + "logps/chosen": -59.79312515258789, + "logps/rejected": -78.96524047851562, + "loss": 0.7536, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8726871013641357, + "rewards/margins": 5.801530361175537, + "rewards/rejected": -2.928842544555664, + "step": 8538 + }, + { + "epoch": 2.14, + "grad_norm": 16.26004981994629, + "learning_rate": 6.132518635342717e-06, + "logits/chosen": -0.45903515815734863, + "logits/rejected": -0.5751541256904602, + "logps/chosen": -69.01371765136719, + "logps/rejected": -100.04264068603516, + "loss": 0.7939, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.457159996032715, + "rewards/margins": 5.476604461669922, + "rewards/rejected": -3.019444465637207, + "step": 8539 + }, + { + "epoch": 2.14, + "grad_norm": 8.144657135009766, + "learning_rate": 6.13175306128336e-06, + "logits/chosen": -0.6138482093811035, + "logits/rejected": -0.6485080718994141, + "logps/chosen": -49.70594787597656, + "logps/rejected": -98.74060821533203, + "loss": 0.7501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8712692260742188, + "rewards/margins": 5.807507514953613, + "rewards/rejected": -2.9362387657165527, + "step": 8540 + }, + { + "epoch": 2.14, + "grad_norm": 3.7443361282348633, + "learning_rate": 6.13098745925718e-06, + "logits/chosen": -0.5052801966667175, + "logits/rejected": -0.5942766666412354, + "logps/chosen": -57.34806442260742, + "logps/rejected": -87.81266021728516, + "loss": 0.6946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9100842475891113, + "rewards/margins": 5.616350173950195, + "rewards/rejected": -2.706266403198242, + "step": 8541 + }, + { + "epoch": 2.14, + "grad_norm": 7.274704456329346, + "learning_rate": 6.1302218292830975e-06, + "logits/chosen": -0.47700873017311096, + "logits/rejected": -0.57173752784729, + "logps/chosen": -59.91054153442383, + "logps/rejected": -98.50975036621094, + "loss": 0.7585, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1536865234375, + "rewards/margins": 5.789224624633789, + "rewards/rejected": -2.6355385780334473, + "step": 8542 + }, + { + "epoch": 2.14, + "grad_norm": 6.564849376678467, + "learning_rate": 6.1294561713800305e-06, + "logits/chosen": -0.4042366147041321, + "logits/rejected": -0.5039148330688477, + "logps/chosen": -59.98915481567383, + "logps/rejected": -80.02898406982422, + "loss": 0.7528, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8088531494140625, + "rewards/margins": 3.9683382511138916, + "rewards/rejected": -1.159485101699829, + "step": 8543 + }, + { + "epoch": 2.14, + "grad_norm": 4.461559772491455, + "learning_rate": 6.128690485566903e-06, + "logits/chosen": -0.44226574897766113, + "logits/rejected": -0.5365375280380249, + "logps/chosen": -62.562747955322266, + "logps/rejected": -105.89016723632812, + "loss": 0.7194, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.656456708908081, + "rewards/margins": 6.270800590515137, + "rewards/rejected": -3.6143438816070557, + "step": 8544 + }, + { + "epoch": 2.14, + "grad_norm": 4.884516716003418, + "learning_rate": 6.127924771862629e-06, + "logits/chosen": -0.4636796712875366, + "logits/rejected": -0.5634993314743042, + "logps/chosen": -55.05683517456055, + "logps/rejected": -84.78073120117188, + "loss": 0.6109, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0747575759887695, + "rewards/margins": 5.71305513381958, + "rewards/rejected": -2.6382973194122314, + "step": 8545 + }, + { + "epoch": 2.14, + "grad_norm": 5.300649642944336, + "learning_rate": 6.127159030286137e-06, + "logits/chosen": -0.4409422278404236, + "logits/rejected": -0.4700773358345032, + "logps/chosen": -46.500946044921875, + "logps/rejected": -113.25067901611328, + "loss": 0.6501, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.154085159301758, + "rewards/margins": 6.222871780395508, + "rewards/rejected": -3.068786382675171, + "step": 8546 + }, + { + "epoch": 2.14, + "grad_norm": 4.08673620223999, + "learning_rate": 6.126393260856346e-06, + "logits/chosen": -0.45812198519706726, + "logits/rejected": -0.5836262702941895, + "logps/chosen": -53.31412124633789, + "logps/rejected": -96.876953125, + "loss": 0.5778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9662983417510986, + "rewards/margins": 7.402017593383789, + "rewards/rejected": -4.435719013214111, + "step": 8547 + }, + { + "epoch": 2.14, + "grad_norm": 2.262371301651001, + "learning_rate": 6.125627463592179e-06, + "logits/chosen": -0.4604988098144531, + "logits/rejected": -0.5749426484107971, + "logps/chosen": -63.10224914550781, + "logps/rejected": -99.69804382324219, + "loss": 0.593, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.98964262008667, + "rewards/margins": 6.7302446365356445, + "rewards/rejected": -3.7406022548675537, + "step": 8548 + }, + { + "epoch": 2.14, + "grad_norm": 5.052637100219727, + "learning_rate": 6.12486163851256e-06, + "logits/chosen": -0.5505478382110596, + "logits/rejected": -0.6560816764831543, + "logps/chosen": -53.58507537841797, + "logps/rejected": -81.07176208496094, + "loss": 0.7227, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.007176637649536, + "rewards/margins": 5.750669002532959, + "rewards/rejected": -2.7434918880462646, + "step": 8549 + }, + { + "epoch": 2.14, + "grad_norm": 4.358017444610596, + "learning_rate": 6.124095785636415e-06, + "logits/chosen": -0.42085644602775574, + "logits/rejected": -0.4737170338630676, + "logps/chosen": -49.10981750488281, + "logps/rejected": -90.00664520263672, + "loss": 0.677, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1363186836242676, + "rewards/margins": 4.938454627990723, + "rewards/rejected": -1.8021355867385864, + "step": 8550 + }, + { + "epoch": 2.14, + "grad_norm": 6.362608432769775, + "learning_rate": 6.123329904982665e-06, + "logits/chosen": -0.3584836721420288, + "logits/rejected": -0.4334189295768738, + "logps/chosen": -56.156776428222656, + "logps/rejected": -89.7504653930664, + "loss": 0.7593, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7729244232177734, + "rewards/margins": 5.0062031745910645, + "rewards/rejected": -2.233278512954712, + "step": 8551 + }, + { + "epoch": 2.14, + "grad_norm": 2.7616004943847656, + "learning_rate": 6.12256399657024e-06, + "logits/chosen": -0.5169888138771057, + "logits/rejected": -0.6175976991653442, + "logps/chosen": -47.38750076293945, + "logps/rejected": -90.94963073730469, + "loss": 0.5694, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.162785291671753, + "rewards/margins": 5.664911270141602, + "rewards/rejected": -2.5021257400512695, + "step": 8552 + }, + { + "epoch": 2.14, + "grad_norm": 10.964615821838379, + "learning_rate": 6.121798060418065e-06, + "logits/chosen": -0.461660772562027, + "logits/rejected": -0.5606951713562012, + "logps/chosen": -57.603919982910156, + "logps/rejected": -82.2153091430664, + "loss": 0.6881, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.924804210662842, + "rewards/margins": 5.359350681304932, + "rewards/rejected": -2.43454647064209, + "step": 8553 + }, + { + "epoch": 2.14, + "grad_norm": 5.697349548339844, + "learning_rate": 6.121032096545064e-06, + "logits/chosen": -0.4793947637081146, + "logits/rejected": -0.5249330997467041, + "logps/chosen": -66.05225372314453, + "logps/rejected": -101.32398223876953, + "loss": 0.6748, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9307937622070312, + "rewards/margins": 4.970170021057129, + "rewards/rejected": -2.0393762588500977, + "step": 8554 + }, + { + "epoch": 2.14, + "grad_norm": 8.586620330810547, + "learning_rate": 6.120266104970171e-06, + "logits/chosen": -0.4626051187515259, + "logits/rejected": -0.5736596584320068, + "logps/chosen": -58.02601623535156, + "logps/rejected": -93.16405487060547, + "loss": 0.8532, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.970033884048462, + "rewards/margins": 6.199695110321045, + "rewards/rejected": -3.229661464691162, + "step": 8555 + }, + { + "epoch": 2.14, + "grad_norm": 4.034226417541504, + "learning_rate": 6.119500085712309e-06, + "logits/chosen": -0.478096604347229, + "logits/rejected": -0.5816872715950012, + "logps/chosen": -59.696449279785156, + "logps/rejected": -78.87427520751953, + "loss": 0.6343, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9683663845062256, + "rewards/margins": 5.159693717956543, + "rewards/rejected": -2.1913270950317383, + "step": 8556 + }, + { + "epoch": 2.14, + "grad_norm": 6.623554706573486, + "learning_rate": 6.118734038790408e-06, + "logits/chosen": -0.460218608379364, + "logits/rejected": -0.5075612664222717, + "logps/chosen": -53.40398406982422, + "logps/rejected": -88.29006958007812, + "loss": 0.822, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9462313652038574, + "rewards/margins": 5.124246120452881, + "rewards/rejected": -2.1780147552490234, + "step": 8557 + }, + { + "epoch": 2.14, + "grad_norm": 4.4013543128967285, + "learning_rate": 6.117967964223402e-06, + "logits/chosen": -0.47631150484085083, + "logits/rejected": -0.5690025091171265, + "logps/chosen": -55.71039581298828, + "logps/rejected": -99.28057098388672, + "loss": 0.6429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.251807928085327, + "rewards/margins": 5.889955997467041, + "rewards/rejected": -2.638148069381714, + "step": 8558 + }, + { + "epoch": 2.14, + "grad_norm": 4.260580062866211, + "learning_rate": 6.117201862030217e-06, + "logits/chosen": -0.42777031660079956, + "logits/rejected": -0.4887198805809021, + "logps/chosen": -47.6163330078125, + "logps/rejected": -95.02843475341797, + "loss": 0.6212, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.220618963241577, + "rewards/margins": 5.72784423828125, + "rewards/rejected": -2.50722599029541, + "step": 8559 + }, + { + "epoch": 2.14, + "grad_norm": 5.686953544616699, + "learning_rate": 6.116435732229785e-06, + "logits/chosen": -0.44558361172676086, + "logits/rejected": -0.5469729900360107, + "logps/chosen": -66.11239624023438, + "logps/rejected": -78.76573181152344, + "loss": 0.7728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8012046813964844, + "rewards/margins": 4.325754165649414, + "rewards/rejected": -1.5245496034622192, + "step": 8560 + }, + { + "epoch": 2.14, + "grad_norm": 6.036674976348877, + "learning_rate": 6.115669574841039e-06, + "logits/chosen": -0.39709600806236267, + "logits/rejected": -0.48982685804367065, + "logps/chosen": -59.99861526489258, + "logps/rejected": -102.5615005493164, + "loss": 0.7574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9439971446990967, + "rewards/margins": 5.544764518737793, + "rewards/rejected": -2.600767135620117, + "step": 8561 + }, + { + "epoch": 2.14, + "grad_norm": 5.574467182159424, + "learning_rate": 6.114903389882912e-06, + "logits/chosen": -0.5537921786308289, + "logits/rejected": -0.6341174244880676, + "logps/chosen": -53.79059600830078, + "logps/rejected": -84.0981216430664, + "loss": 0.7005, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.152317523956299, + "rewards/margins": 5.101827144622803, + "rewards/rejected": -1.9495097398757935, + "step": 8562 + }, + { + "epoch": 2.14, + "grad_norm": 2.9429192543029785, + "learning_rate": 6.1141371773743345e-06, + "logits/chosen": -0.4419589638710022, + "logits/rejected": -0.5451972484588623, + "logps/chosen": -54.630043029785156, + "logps/rejected": -99.32090759277344, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0250706672668457, + "rewards/margins": 6.602620601654053, + "rewards/rejected": -3.577550172805786, + "step": 8563 + }, + { + "epoch": 2.14, + "grad_norm": 1.9478272199630737, + "learning_rate": 6.113370937334244e-06, + "logits/chosen": -0.44537559151649475, + "logits/rejected": -0.5600290894508362, + "logps/chosen": -54.86533737182617, + "logps/rejected": -105.82989501953125, + "loss": 0.5818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9454126358032227, + "rewards/margins": 6.913687229156494, + "rewards/rejected": -3.9682748317718506, + "step": 8564 + }, + { + "epoch": 2.14, + "grad_norm": 8.04269790649414, + "learning_rate": 6.112604669781572e-06, + "logits/chosen": -0.47998225688934326, + "logits/rejected": -0.5661773681640625, + "logps/chosen": -57.21345520019531, + "logps/rejected": -99.3907699584961, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.944340229034424, + "rewards/margins": 6.293740272521973, + "rewards/rejected": -3.3493998050689697, + "step": 8565 + }, + { + "epoch": 2.14, + "grad_norm": 5.246053218841553, + "learning_rate": 6.111838374735257e-06, + "logits/chosen": -0.5419235825538635, + "logits/rejected": -0.5873481631278992, + "logps/chosen": -57.56573486328125, + "logps/rejected": -98.12218475341797, + "loss": 0.6932, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.166133165359497, + "rewards/margins": 5.601625919342041, + "rewards/rejected": -2.4354922771453857, + "step": 8566 + }, + { + "epoch": 2.14, + "grad_norm": 6.277742862701416, + "learning_rate": 6.111072052214233e-06, + "logits/chosen": -0.47215956449508667, + "logits/rejected": -0.5543249249458313, + "logps/chosen": -49.68438720703125, + "logps/rejected": -91.67904663085938, + "loss": 0.6532, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.186936378479004, + "rewards/margins": 6.306539058685303, + "rewards/rejected": -3.119602680206299, + "step": 8567 + }, + { + "epoch": 2.14, + "grad_norm": 7.893098831176758, + "learning_rate": 6.110305702237435e-06, + "logits/chosen": -0.5136067271232605, + "logits/rejected": -0.5763424634933472, + "logps/chosen": -62.51842498779297, + "logps/rejected": -83.28821563720703, + "loss": 0.7884, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1261467933654785, + "rewards/margins": 4.673586368560791, + "rewards/rejected": -1.547439455986023, + "step": 8568 + }, + { + "epoch": 2.14, + "grad_norm": 4.236950397491455, + "learning_rate": 6.1095393248238035e-06, + "logits/chosen": -0.47584855556488037, + "logits/rejected": -0.6066120862960815, + "logps/chosen": -65.92631530761719, + "logps/rejected": -79.02157592773438, + "loss": 0.6473, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.896712064743042, + "rewards/margins": 5.5939717292785645, + "rewards/rejected": -2.6972601413726807, + "step": 8569 + }, + { + "epoch": 2.14, + "grad_norm": 5.6903157234191895, + "learning_rate": 6.108772919992274e-06, + "logits/chosen": -0.49438565969467163, + "logits/rejected": -0.5788198709487915, + "logps/chosen": -63.685333251953125, + "logps/rejected": -84.71644592285156, + "loss": 0.7084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8050198554992676, + "rewards/margins": 5.5361328125, + "rewards/rejected": -2.7311131954193115, + "step": 8570 + }, + { + "epoch": 2.14, + "grad_norm": 5.340285778045654, + "learning_rate": 6.108006487761788e-06, + "logits/chosen": -0.5031064748764038, + "logits/rejected": -0.5759552121162415, + "logps/chosen": -47.999351501464844, + "logps/rejected": -91.80384826660156, + "loss": 0.6816, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2003397941589355, + "rewards/margins": 5.136234283447266, + "rewards/rejected": -1.935894250869751, + "step": 8571 + }, + { + "epoch": 2.14, + "grad_norm": 30.32433319091797, + "learning_rate": 6.1072400281512814e-06, + "logits/chosen": -0.46102631092071533, + "logits/rejected": -0.5656338334083557, + "logps/chosen": -54.49912643432617, + "logps/rejected": -81.81395721435547, + "loss": 0.7598, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1471762657165527, + "rewards/margins": 4.999613285064697, + "rewards/rejected": -1.852436900138855, + "step": 8572 + }, + { + "epoch": 2.14, + "grad_norm": 3.297469139099121, + "learning_rate": 6.1064735411796976e-06, + "logits/chosen": -0.44755837321281433, + "logits/rejected": -0.49067115783691406, + "logps/chosen": -55.22594451904297, + "logps/rejected": -107.90640258789062, + "loss": 0.6498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9280073642730713, + "rewards/margins": 6.558043956756592, + "rewards/rejected": -3.6300368309020996, + "step": 8573 + }, + { + "epoch": 2.14, + "grad_norm": 4.279537677764893, + "learning_rate": 6.105707026865975e-06, + "logits/chosen": -0.5859843492507935, + "logits/rejected": -0.6236328482627869, + "logps/chosen": -55.56074905395508, + "logps/rejected": -91.90270233154297, + "loss": 0.7282, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.230290651321411, + "rewards/margins": 5.918967247009277, + "rewards/rejected": -2.6886777877807617, + "step": 8574 + }, + { + "epoch": 2.15, + "grad_norm": 3.9654083251953125, + "learning_rate": 6.104940485229055e-06, + "logits/chosen": -0.532296359539032, + "logits/rejected": -0.6253697872161865, + "logps/chosen": -56.75553512573242, + "logps/rejected": -102.7446517944336, + "loss": 0.6495, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1497602462768555, + "rewards/margins": 5.872714519500732, + "rewards/rejected": -2.7229535579681396, + "step": 8575 + }, + { + "epoch": 2.15, + "grad_norm": 6.735373020172119, + "learning_rate": 6.104173916287881e-06, + "logits/chosen": -0.4395070970058441, + "logits/rejected": -0.553173840045929, + "logps/chosen": -72.24393463134766, + "logps/rejected": -92.83648681640625, + "loss": 0.7506, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.899707317352295, + "rewards/margins": 5.218773365020752, + "rewards/rejected": -2.319066286087036, + "step": 8576 + }, + { + "epoch": 2.15, + "grad_norm": 3.7719380855560303, + "learning_rate": 6.103407320061394e-06, + "logits/chosen": -0.5158169269561768, + "logits/rejected": -0.6395512819290161, + "logps/chosen": -57.67998504638672, + "logps/rejected": -82.55976104736328, + "loss": 0.6479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.130903959274292, + "rewards/margins": 6.727104663848877, + "rewards/rejected": -3.596200942993164, + "step": 8577 + }, + { + "epoch": 2.15, + "grad_norm": 5.393953323364258, + "learning_rate": 6.1026406965685404e-06, + "logits/chosen": -0.5320578217506409, + "logits/rejected": -0.5922520756721497, + "logps/chosen": -55.649497985839844, + "logps/rejected": -92.1721420288086, + "loss": 0.6842, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085038185119629, + "rewards/margins": 5.585931777954102, + "rewards/rejected": -2.5008938312530518, + "step": 8578 + }, + { + "epoch": 2.15, + "grad_norm": 5.581700801849365, + "learning_rate": 6.101874045828261e-06, + "logits/chosen": -0.4705391526222229, + "logits/rejected": -0.5462630987167358, + "logps/chosen": -50.64311981201172, + "logps/rejected": -86.97372436523438, + "loss": 0.7029, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8783349990844727, + "rewards/margins": 4.83090877532959, + "rewards/rejected": -1.952574372291565, + "step": 8579 + }, + { + "epoch": 2.15, + "grad_norm": 4.937509536743164, + "learning_rate": 6.101107367859502e-06, + "logits/chosen": -0.5327470898628235, + "logits/rejected": -0.5516921281814575, + "logps/chosen": -46.60784912109375, + "logps/rejected": -99.63330078125, + "loss": 0.6467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8882110118865967, + "rewards/margins": 5.612174987792969, + "rewards/rejected": -2.723963975906372, + "step": 8580 + }, + { + "epoch": 2.15, + "grad_norm": 6.517430782318115, + "learning_rate": 6.10034066268121e-06, + "logits/chosen": -0.4466787278652191, + "logits/rejected": -0.511520504951477, + "logps/chosen": -62.477577209472656, + "logps/rejected": -80.54931640625, + "loss": 0.8436, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8129119873046875, + "rewards/margins": 3.8530426025390625, + "rewards/rejected": -1.0401307344436646, + "step": 8581 + }, + { + "epoch": 2.15, + "grad_norm": 7.237023830413818, + "learning_rate": 6.099573930312329e-06, + "logits/chosen": -0.4766133427619934, + "logits/rejected": -0.5331430435180664, + "logps/chosen": -53.319297790527344, + "logps/rejected": -103.70539855957031, + "loss": 0.7212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7565503120422363, + "rewards/margins": 5.910033226013184, + "rewards/rejected": -3.1534833908081055, + "step": 8582 + }, + { + "epoch": 2.15, + "grad_norm": 9.670380592346191, + "learning_rate": 6.098807170771807e-06, + "logits/chosen": -0.43362611532211304, + "logits/rejected": -0.49165964126586914, + "logps/chosen": -60.1876106262207, + "logps/rejected": -105.2426528930664, + "loss": 0.5901, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.069169044494629, + "rewards/margins": 6.847469806671143, + "rewards/rejected": -3.778301239013672, + "step": 8583 + }, + { + "epoch": 2.15, + "grad_norm": 12.37632942199707, + "learning_rate": 6.098040384078589e-06, + "logits/chosen": -0.4858977496623993, + "logits/rejected": -0.534697413444519, + "logps/chosen": -61.86254119873047, + "logps/rejected": -123.87286376953125, + "loss": 0.718, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.490778684616089, + "rewards/margins": 5.367461204528809, + "rewards/rejected": -2.8766825199127197, + "step": 8584 + }, + { + "epoch": 2.15, + "grad_norm": 9.412141799926758, + "learning_rate": 6.097273570251627e-06, + "logits/chosen": -0.4649280309677124, + "logits/rejected": -0.5460981130599976, + "logps/chosen": -61.27959442138672, + "logps/rejected": -98.67253875732422, + "loss": 0.7374, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6631760597229004, + "rewards/margins": 5.997928142547607, + "rewards/rejected": -3.3347525596618652, + "step": 8585 + }, + { + "epoch": 2.15, + "grad_norm": 3.5863373279571533, + "learning_rate": 6.096506729309868e-06, + "logits/chosen": -0.5210559368133545, + "logits/rejected": -0.5974902510643005, + "logps/chosen": -46.886653900146484, + "logps/rejected": -91.07630157470703, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2898643016815186, + "rewards/margins": 6.008695602416992, + "rewards/rejected": -2.7188310623168945, + "step": 8586 + }, + { + "epoch": 2.15, + "grad_norm": 4.205886363983154, + "learning_rate": 6.095739861272262e-06, + "logits/chosen": -0.4785349369049072, + "logits/rejected": -0.5567810535430908, + "logps/chosen": -55.116722106933594, + "logps/rejected": -104.27783966064453, + "loss": 0.7442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3424696922302246, + "rewards/margins": 6.472138404846191, + "rewards/rejected": -3.129668712615967, + "step": 8587 + }, + { + "epoch": 2.15, + "grad_norm": 3.54476261138916, + "learning_rate": 6.094972966157757e-06, + "logits/chosen": -0.5244501829147339, + "logits/rejected": -0.5967577695846558, + "logps/chosen": -56.63764190673828, + "logps/rejected": -101.26494598388672, + "loss": 0.6691, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.626741886138916, + "rewards/margins": 6.311423301696777, + "rewards/rejected": -3.6846814155578613, + "step": 8588 + }, + { + "epoch": 2.15, + "grad_norm": 4.956941604614258, + "learning_rate": 6.094206043985307e-06, + "logits/chosen": -0.5226913690567017, + "logits/rejected": -0.5778655409812927, + "logps/chosen": -57.02978515625, + "logps/rejected": -90.87873077392578, + "loss": 0.6447, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9405057430267334, + "rewards/margins": 5.240304946899414, + "rewards/rejected": -2.2997989654541016, + "step": 8589 + }, + { + "epoch": 2.15, + "grad_norm": 6.949780464172363, + "learning_rate": 6.093439094773861e-06, + "logits/chosen": -0.4632895588874817, + "logits/rejected": -0.549159049987793, + "logps/chosen": -55.254066467285156, + "logps/rejected": -100.34272766113281, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9331140518188477, + "rewards/margins": 5.694606781005859, + "rewards/rejected": -2.76149320602417, + "step": 8590 + }, + { + "epoch": 2.15, + "grad_norm": 8.632354736328125, + "learning_rate": 6.092672118542371e-06, + "logits/chosen": -0.4544854462146759, + "logits/rejected": -0.6215327978134155, + "logps/chosen": -65.00421142578125, + "logps/rejected": -102.95584106445312, + "loss": 0.6205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1245217323303223, + "rewards/margins": 6.6299333572387695, + "rewards/rejected": -3.5054118633270264, + "step": 8591 + }, + { + "epoch": 2.15, + "grad_norm": 10.115126609802246, + "learning_rate": 6.091905115309793e-06, + "logits/chosen": -0.530493438243866, + "logits/rejected": -0.5932787656784058, + "logps/chosen": -46.14470291137695, + "logps/rejected": -94.21951293945312, + "loss": 0.7808, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.537421226501465, + "rewards/margins": 5.125740051269531, + "rewards/rejected": -2.5883193016052246, + "step": 8592 + }, + { + "epoch": 2.15, + "grad_norm": 23.369670867919922, + "learning_rate": 6.091138085095076e-06, + "logits/chosen": -0.5665510892868042, + "logits/rejected": -0.6464506387710571, + "logps/chosen": -56.96717834472656, + "logps/rejected": -94.3442611694336, + "loss": 0.7489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.660203218460083, + "rewards/margins": 4.9622602462768555, + "rewards/rejected": -2.3020572662353516, + "step": 8593 + }, + { + "epoch": 2.15, + "grad_norm": 6.249392986297607, + "learning_rate": 6.090371027917177e-06, + "logits/chosen": -0.48644545674324036, + "logits/rejected": -0.5595386028289795, + "logps/chosen": -56.273136138916016, + "logps/rejected": -107.66619873046875, + "loss": 0.6357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.795330762863159, + "rewards/margins": 6.4658918380737305, + "rewards/rejected": -3.6705617904663086, + "step": 8594 + }, + { + "epoch": 2.15, + "grad_norm": 6.838075637817383, + "learning_rate": 6.08960394379505e-06, + "logits/chosen": -0.47111615538597107, + "logits/rejected": -0.5377486944198608, + "logps/chosen": -60.60122299194336, + "logps/rejected": -90.35169219970703, + "loss": 0.7889, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.410208225250244, + "rewards/margins": 4.943368434906006, + "rewards/rejected": -2.5331599712371826, + "step": 8595 + }, + { + "epoch": 2.15, + "grad_norm": 6.652609348297119, + "learning_rate": 6.088836832747651e-06, + "logits/chosen": -0.4793241322040558, + "logits/rejected": -0.6136856079101562, + "logps/chosen": -55.866455078125, + "logps/rejected": -83.184814453125, + "loss": 0.6303, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9281694889068604, + "rewards/margins": 6.253451347351074, + "rewards/rejected": -3.325281858444214, + "step": 8596 + }, + { + "epoch": 2.15, + "grad_norm": 5.96893835067749, + "learning_rate": 6.088069694793936e-06, + "logits/chosen": -0.5059472322463989, + "logits/rejected": -0.5443096160888672, + "logps/chosen": -55.332664489746094, + "logps/rejected": -87.51811218261719, + "loss": 0.7086, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.803562879562378, + "rewards/margins": 4.945975303649902, + "rewards/rejected": -2.142413377761841, + "step": 8597 + }, + { + "epoch": 2.15, + "grad_norm": 6.396735191345215, + "learning_rate": 6.087302529952861e-06, + "logits/chosen": -0.5598251223564148, + "logits/rejected": -0.6602185368537903, + "logps/chosen": -67.14623260498047, + "logps/rejected": -98.98342895507812, + "loss": 0.8229, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0498480796813965, + "rewards/margins": 5.570694923400879, + "rewards/rejected": -2.5208468437194824, + "step": 8598 + }, + { + "epoch": 2.15, + "grad_norm": 12.561507225036621, + "learning_rate": 6.086535338243383e-06, + "logits/chosen": -0.5609344244003296, + "logits/rejected": -0.6149576306343079, + "logps/chosen": -50.941497802734375, + "logps/rejected": -91.1069107055664, + "loss": 0.7215, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7643558979034424, + "rewards/margins": 5.4523444175720215, + "rewards/rejected": -2.6879889965057373, + "step": 8599 + }, + { + "epoch": 2.15, + "grad_norm": 7.4486236572265625, + "learning_rate": 6.085768119684463e-06, + "logits/chosen": -0.4972149729728699, + "logits/rejected": -0.6014281511306763, + "logps/chosen": -59.01783752441406, + "logps/rejected": -78.98297119140625, + "loss": 0.722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6865296363830566, + "rewards/margins": 4.70958137512207, + "rewards/rejected": -2.0230512619018555, + "step": 8600 + }, + { + "epoch": 2.15, + "grad_norm": 25.38317108154297, + "learning_rate": 6.085000874295057e-06, + "logits/chosen": -0.4868101179599762, + "logits/rejected": -0.5115576386451721, + "logps/chosen": -47.83502960205078, + "logps/rejected": -83.5191879272461, + "loss": 0.7666, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8661646842956543, + "rewards/margins": 3.958925724029541, + "rewards/rejected": -1.0927608013153076, + "step": 8601 + }, + { + "epoch": 2.15, + "grad_norm": 6.241402626037598, + "learning_rate": 6.084233602094125e-06, + "logits/chosen": -0.4587874412536621, + "logits/rejected": -0.5619513392448425, + "logps/chosen": -64.24176025390625, + "logps/rejected": -71.450927734375, + "loss": 0.7558, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9699089527130127, + "rewards/margins": 5.394239902496338, + "rewards/rejected": -2.424330711364746, + "step": 8602 + }, + { + "epoch": 2.15, + "grad_norm": 6.0697455406188965, + "learning_rate": 6.083466303100628e-06, + "logits/chosen": -0.4583676755428314, + "logits/rejected": -0.5027333498001099, + "logps/chosen": -63.01979064941406, + "logps/rejected": -111.42631530761719, + "loss": 0.6883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.009262800216675, + "rewards/margins": 5.976097106933594, + "rewards/rejected": -2.96683406829834, + "step": 8603 + }, + { + "epoch": 2.15, + "grad_norm": 15.829235076904297, + "learning_rate": 6.082698977333526e-06, + "logits/chosen": -0.44567686319351196, + "logits/rejected": -0.5079947113990784, + "logps/chosen": -60.98863983154297, + "logps/rejected": -104.21138000488281, + "loss": 0.8095, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9439620971679688, + "rewards/margins": 5.682364463806152, + "rewards/rejected": -2.7384016513824463, + "step": 8604 + }, + { + "epoch": 2.15, + "grad_norm": 4.362473964691162, + "learning_rate": 6.081931624811781e-06, + "logits/chosen": -0.6019667387008667, + "logits/rejected": -0.655005693435669, + "logps/chosen": -77.95552062988281, + "logps/rejected": -87.40618133544922, + "loss": 0.6957, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.826383113861084, + "rewards/margins": 6.225174427032471, + "rewards/rejected": -3.398791551589966, + "step": 8605 + }, + { + "epoch": 2.15, + "grad_norm": 12.274934768676758, + "learning_rate": 6.081164245554355e-06, + "logits/chosen": -0.4776880443096161, + "logits/rejected": -0.5402368903160095, + "logps/chosen": -67.17566680908203, + "logps/rejected": -118.71533203125, + "loss": 0.8094, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.088268518447876, + "rewards/margins": 5.975732803344727, + "rewards/rejected": -2.887464761734009, + "step": 8606 + }, + { + "epoch": 2.15, + "grad_norm": 5.324131488800049, + "learning_rate": 6.0803968395802094e-06, + "logits/chosen": -0.47704559564590454, + "logits/rejected": -0.5344977974891663, + "logps/chosen": -73.25910949707031, + "logps/rejected": -102.24564361572266, + "loss": 0.7264, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5436744689941406, + "rewards/margins": 5.4826202392578125, + "rewards/rejected": -2.938945770263672, + "step": 8607 + }, + { + "epoch": 2.15, + "grad_norm": 7.6077494621276855, + "learning_rate": 6.079629406908309e-06, + "logits/chosen": -0.5446105003356934, + "logits/rejected": -0.6058942079544067, + "logps/chosen": -56.76799774169922, + "logps/rejected": -88.37154388427734, + "loss": 0.7333, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.901332139968872, + "rewards/margins": 5.602650165557861, + "rewards/rejected": -2.70131778717041, + "step": 8608 + }, + { + "epoch": 2.15, + "grad_norm": 21.116802215576172, + "learning_rate": 6.078861947557618e-06, + "logits/chosen": -0.5113634467124939, + "logits/rejected": -0.5773259997367859, + "logps/chosen": -54.28114318847656, + "logps/rejected": -88.1318359375, + "loss": 0.7946, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.869702100753784, + "rewards/margins": 4.994752407073975, + "rewards/rejected": -2.1250500679016113, + "step": 8609 + }, + { + "epoch": 2.15, + "grad_norm": 4.470364570617676, + "learning_rate": 6.0780944615471016e-06, + "logits/chosen": -0.49411413073539734, + "logits/rejected": -0.545019268989563, + "logps/chosen": -56.884639739990234, + "logps/rejected": -117.9566879272461, + "loss": 0.6331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.861436128616333, + "rewards/margins": 6.9459710121154785, + "rewards/rejected": -4.084534645080566, + "step": 8610 + }, + { + "epoch": 2.15, + "grad_norm": 6.608721733093262, + "learning_rate": 6.077326948895722e-06, + "logits/chosen": -0.5219665765762329, + "logits/rejected": -0.5638371706008911, + "logps/chosen": -58.391212463378906, + "logps/rejected": -85.83411407470703, + "loss": 0.7426, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9652976989746094, + "rewards/margins": 4.705361843109131, + "rewards/rejected": -1.7400645017623901, + "step": 8611 + }, + { + "epoch": 2.15, + "grad_norm": 5.580715656280518, + "learning_rate": 6.07655940962245e-06, + "logits/chosen": -0.47154897451400757, + "logits/rejected": -0.5796759128570557, + "logps/chosen": -57.5590705871582, + "logps/rejected": -98.97274780273438, + "loss": 0.6634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0433449745178223, + "rewards/margins": 6.287476062774658, + "rewards/rejected": -3.244131088256836, + "step": 8612 + }, + { + "epoch": 2.15, + "grad_norm": 3.510688066482544, + "learning_rate": 6.075791843746248e-06, + "logits/chosen": -0.4651879668235779, + "logits/rejected": -0.5641943216323853, + "logps/chosen": -54.321346282958984, + "logps/rejected": -85.72772216796875, + "loss": 0.6746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.376049280166626, + "rewards/margins": 6.651110649108887, + "rewards/rejected": -3.27506160736084, + "step": 8613 + }, + { + "epoch": 2.15, + "grad_norm": 7.110760688781738, + "learning_rate": 6.0750242512860865e-06, + "logits/chosen": -0.5450906157493591, + "logits/rejected": -0.611015796661377, + "logps/chosen": -47.725887298583984, + "logps/rejected": -98.75355529785156, + "loss": 0.736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.680394411087036, + "rewards/margins": 5.307042121887207, + "rewards/rejected": -2.626647472381592, + "step": 8614 + }, + { + "epoch": 2.16, + "grad_norm": 2.528116226196289, + "learning_rate": 6.074256632260933e-06, + "logits/chosen": -0.5182247757911682, + "logits/rejected": -0.5331971645355225, + "logps/chosen": -53.89141845703125, + "logps/rejected": -113.6461181640625, + "loss": 0.6, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.93524432182312, + "rewards/margins": 5.912111282348633, + "rewards/rejected": -2.9768667221069336, + "step": 8615 + }, + { + "epoch": 2.16, + "grad_norm": 3.52978515625, + "learning_rate": 6.073488986689755e-06, + "logits/chosen": -0.4421875476837158, + "logits/rejected": -0.530523419380188, + "logps/chosen": -54.38734817504883, + "logps/rejected": -95.21556854248047, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2696192264556885, + "rewards/margins": 6.045100688934326, + "rewards/rejected": -2.7754812240600586, + "step": 8616 + }, + { + "epoch": 2.16, + "grad_norm": 11.779818534851074, + "learning_rate": 6.072721314591521e-06, + "logits/chosen": -0.5077090263366699, + "logits/rejected": -0.5842560529708862, + "logps/chosen": -45.39982223510742, + "logps/rejected": -91.78929138183594, + "loss": 0.6432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7929153442382812, + "rewards/margins": 5.765383720397949, + "rewards/rejected": -2.972468614578247, + "step": 8617 + }, + { + "epoch": 2.16, + "grad_norm": 3.5762598514556885, + "learning_rate": 6.071953615985203e-06, + "logits/chosen": -0.43901294469833374, + "logits/rejected": -0.5205889344215393, + "logps/chosen": -65.67962646484375, + "logps/rejected": -103.69479370117188, + "loss": 0.7134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8129632472991943, + "rewards/margins": 5.424455642700195, + "rewards/rejected": -2.611492395401001, + "step": 8618 + }, + { + "epoch": 2.16, + "grad_norm": 3.7179923057556152, + "learning_rate": 6.071185890889772e-06, + "logits/chosen": -0.5237234830856323, + "logits/rejected": -0.6548401117324829, + "logps/chosen": -54.92672348022461, + "logps/rejected": -77.416259765625, + "loss": 0.6782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7903575897216797, + "rewards/margins": 5.288081169128418, + "rewards/rejected": -2.4977238178253174, + "step": 8619 + }, + { + "epoch": 2.16, + "grad_norm": 3.16552734375, + "learning_rate": 6.070418139324195e-06, + "logits/chosen": -0.4708971381187439, + "logits/rejected": -0.5205792784690857, + "logps/chosen": -53.36613082885742, + "logps/rejected": -100.68798065185547, + "loss": 0.5817, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.134249687194824, + "rewards/margins": 6.128296375274658, + "rewards/rejected": -2.994046688079834, + "step": 8620 + }, + { + "epoch": 2.16, + "grad_norm": 4.792099952697754, + "learning_rate": 6.069650361307451e-06, + "logits/chosen": -0.43164390325546265, + "logits/rejected": -0.5318237543106079, + "logps/chosen": -74.30731201171875, + "logps/rejected": -95.35477447509766, + "loss": 0.6537, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.378369092941284, + "rewards/margins": 5.5598602294921875, + "rewards/rejected": -2.181490898132324, + "step": 8621 + }, + { + "epoch": 2.16, + "grad_norm": 3.9476711750030518, + "learning_rate": 6.068882556858507e-06, + "logits/chosen": -0.564355731010437, + "logits/rejected": -0.6503582000732422, + "logps/chosen": -51.09426498413086, + "logps/rejected": -84.65766906738281, + "loss": 0.6111, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.018218755722046, + "rewards/margins": 5.521926403045654, + "rewards/rejected": -2.5037074089050293, + "step": 8622 + }, + { + "epoch": 2.16, + "grad_norm": 4.738649368286133, + "learning_rate": 6.068114725996339e-06, + "logits/chosen": -0.5445067882537842, + "logits/rejected": -0.6800014972686768, + "logps/chosen": -54.72123718261719, + "logps/rejected": -95.9999771118164, + "loss": 0.6504, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8999736309051514, + "rewards/margins": 6.684431552886963, + "rewards/rejected": -3.7844576835632324, + "step": 8623 + }, + { + "epoch": 2.16, + "grad_norm": 3.497985363006592, + "learning_rate": 6.06734686873992e-06, + "logits/chosen": -0.5245024561882019, + "logits/rejected": -0.6131529808044434, + "logps/chosen": -57.399818420410156, + "logps/rejected": -100.29739379882812, + "loss": 0.6276, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9248392581939697, + "rewards/margins": 6.685367107391357, + "rewards/rejected": -3.760528087615967, + "step": 8624 + }, + { + "epoch": 2.16, + "grad_norm": 3.6801412105560303, + "learning_rate": 6.0665789851082245e-06, + "logits/chosen": -0.4538264572620392, + "logits/rejected": -0.5336400866508484, + "logps/chosen": -51.841156005859375, + "logps/rejected": -84.36119842529297, + "loss": 0.6364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.263845682144165, + "rewards/margins": 6.292102813720703, + "rewards/rejected": -3.028256893157959, + "step": 8625 + }, + { + "epoch": 2.16, + "grad_norm": 6.048447608947754, + "learning_rate": 6.065811075120227e-06, + "logits/chosen": -0.5830955505371094, + "logits/rejected": -0.6707707047462463, + "logps/chosen": -48.41403579711914, + "logps/rejected": -78.37459564208984, + "loss": 0.7041, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.062643051147461, + "rewards/margins": 5.878934860229492, + "rewards/rejected": -2.8162918090820312, + "step": 8626 + }, + { + "epoch": 2.16, + "grad_norm": 18.275714874267578, + "learning_rate": 6.065043138794905e-06, + "logits/chosen": -0.5047438740730286, + "logits/rejected": -0.5780779719352722, + "logps/chosen": -52.151004791259766, + "logps/rejected": -91.58955383300781, + "loss": 0.7282, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6546547412872314, + "rewards/margins": 5.771781921386719, + "rewards/rejected": -3.117126703262329, + "step": 8627 + }, + { + "epoch": 2.16, + "grad_norm": 10.080428123474121, + "learning_rate": 6.064275176151233e-06, + "logits/chosen": -0.38309189677238464, + "logits/rejected": -0.5314350724220276, + "logps/chosen": -67.16524505615234, + "logps/rejected": -78.39077758789062, + "loss": 0.7045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8695974349975586, + "rewards/margins": 5.974551200866699, + "rewards/rejected": -3.1049530506134033, + "step": 8628 + }, + { + "epoch": 2.16, + "grad_norm": 25.421913146972656, + "learning_rate": 6.063507187208191e-06, + "logits/chosen": -0.5324011445045471, + "logits/rejected": -0.5632358193397522, + "logps/chosen": -49.51375198364258, + "logps/rejected": -98.66133117675781, + "loss": 0.8825, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7553303241729736, + "rewards/margins": 5.332622528076172, + "rewards/rejected": -2.5772924423217773, + "step": 8629 + }, + { + "epoch": 2.16, + "grad_norm": 3.9103915691375732, + "learning_rate": 6.062739171984755e-06, + "logits/chosen": -0.5227209329605103, + "logits/rejected": -0.6316730976104736, + "logps/chosen": -58.55910873413086, + "logps/rejected": -89.92866516113281, + "loss": 0.7311, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0565006732940674, + "rewards/margins": 5.935723304748535, + "rewards/rejected": -2.8792226314544678, + "step": 8630 + }, + { + "epoch": 2.16, + "grad_norm": 8.13394832611084, + "learning_rate": 6.0619711304999026e-06, + "logits/chosen": -0.531593382358551, + "logits/rejected": -0.6325693130493164, + "logps/chosen": -54.40958786010742, + "logps/rejected": -82.24818420410156, + "loss": 0.6669, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8468856811523438, + "rewards/margins": 4.811786651611328, + "rewards/rejected": -1.964901089668274, + "step": 8631 + }, + { + "epoch": 2.16, + "grad_norm": 9.861454010009766, + "learning_rate": 6.061203062772615e-06, + "logits/chosen": -0.44857704639434814, + "logits/rejected": -0.4357171356678009, + "logps/chosen": -61.79629135131836, + "logps/rejected": -107.68909454345703, + "loss": 0.8317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7978639602661133, + "rewards/margins": 4.344496250152588, + "rewards/rejected": -1.5466320514678955, + "step": 8632 + }, + { + "epoch": 2.16, + "grad_norm": 3.0396790504455566, + "learning_rate": 6.060434968821871e-06, + "logits/chosen": -0.49700480699539185, + "logits/rejected": -0.5915710926055908, + "logps/chosen": -53.08473587036133, + "logps/rejected": -93.11427307128906, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1893372535705566, + "rewards/margins": 6.306149482727051, + "rewards/rejected": -3.116812229156494, + "step": 8633 + }, + { + "epoch": 2.16, + "grad_norm": 10.69268798828125, + "learning_rate": 6.059666848666649e-06, + "logits/chosen": -0.4678969979286194, + "logits/rejected": -0.5631940364837646, + "logps/chosen": -49.33149337768555, + "logps/rejected": -85.11738586425781, + "loss": 0.7391, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.805371046066284, + "rewards/margins": 4.757710933685303, + "rewards/rejected": -1.9523402452468872, + "step": 8634 + }, + { + "epoch": 2.16, + "grad_norm": 3.56630802154541, + "learning_rate": 6.058898702325935e-06, + "logits/chosen": -0.47704651951789856, + "logits/rejected": -0.6111553907394409, + "logps/chosen": -58.37432098388672, + "logps/rejected": -93.06031799316406, + "loss": 0.6195, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880800724029541, + "rewards/margins": 7.218083381652832, + "rewards/rejected": -4.337283134460449, + "step": 8635 + }, + { + "epoch": 2.16, + "grad_norm": 7.146730899810791, + "learning_rate": 6.058130529818705e-06, + "logits/chosen": -0.47463124990463257, + "logits/rejected": -0.5879952907562256, + "logps/chosen": -53.20518112182617, + "logps/rejected": -92.1727523803711, + "loss": 0.7621, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.005953311920166, + "rewards/margins": 5.898303031921387, + "rewards/rejected": -2.8923497200012207, + "step": 8636 + }, + { + "epoch": 2.16, + "grad_norm": 17.897499084472656, + "learning_rate": 6.057362331163946e-06, + "logits/chosen": -0.4315129816532135, + "logits/rejected": -0.533452033996582, + "logps/chosen": -56.409358978271484, + "logps/rejected": -103.36619567871094, + "loss": 0.6839, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1246869564056396, + "rewards/margins": 6.220041275024414, + "rewards/rejected": -3.095353841781616, + "step": 8637 + }, + { + "epoch": 2.16, + "grad_norm": 4.863635540008545, + "learning_rate": 6.05659410638064e-06, + "logits/chosen": -0.48082974553108215, + "logits/rejected": -0.5434719920158386, + "logps/chosen": -56.23614501953125, + "logps/rejected": -134.79730224609375, + "loss": 0.6296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.706021785736084, + "rewards/margins": 6.932126998901367, + "rewards/rejected": -4.226105213165283, + "step": 8638 + }, + { + "epoch": 2.16, + "grad_norm": 4.667004585266113, + "learning_rate": 6.055825855487768e-06, + "logits/chosen": -0.461031973361969, + "logits/rejected": -0.5036176443099976, + "logps/chosen": -56.70862579345703, + "logps/rejected": -90.74365234375, + "loss": 0.6978, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1270554065704346, + "rewards/margins": 5.61924934387207, + "rewards/rejected": -2.492194175720215, + "step": 8639 + }, + { + "epoch": 2.16, + "grad_norm": 5.413978576660156, + "learning_rate": 6.055057578504316e-06, + "logits/chosen": -0.4665847420692444, + "logits/rejected": -0.574730634689331, + "logps/chosen": -72.45152282714844, + "logps/rejected": -98.55940246582031, + "loss": 0.7324, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8188109397888184, + "rewards/margins": 4.940480709075928, + "rewards/rejected": -2.1216695308685303, + "step": 8640 + }, + { + "epoch": 2.16, + "grad_norm": 4.537370681762695, + "learning_rate": 6.054289275449269e-06, + "logits/chosen": -0.5159180760383606, + "logits/rejected": -0.5493950843811035, + "logps/chosen": -49.02225112915039, + "logps/rejected": -112.07121276855469, + "loss": 0.6256, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.801265001296997, + "rewards/margins": 6.818042278289795, + "rewards/rejected": -4.016777515411377, + "step": 8641 + }, + { + "epoch": 2.16, + "grad_norm": 4.186224937438965, + "learning_rate": 6.0535209463416144e-06, + "logits/chosen": -0.4591103792190552, + "logits/rejected": -0.5478779077529907, + "logps/chosen": -59.77082443237305, + "logps/rejected": -105.12229919433594, + "loss": 0.6536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.059260368347168, + "rewards/margins": 7.303621768951416, + "rewards/rejected": -4.24436092376709, + "step": 8642 + }, + { + "epoch": 2.16, + "grad_norm": 6.2062482833862305, + "learning_rate": 6.052752591200334e-06, + "logits/chosen": -0.5151175856590271, + "logits/rejected": -0.555954098701477, + "logps/chosen": -61.391563415527344, + "logps/rejected": -105.45828247070312, + "loss": 0.7238, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8193607330322266, + "rewards/margins": 5.521181583404541, + "rewards/rejected": -2.7018210887908936, + "step": 8643 + }, + { + "epoch": 2.16, + "grad_norm": 7.3076252937316895, + "learning_rate": 6.051984210044419e-06, + "logits/chosen": -0.42046260833740234, + "logits/rejected": -0.5119103193283081, + "logps/chosen": -51.94682693481445, + "logps/rejected": -87.22879028320312, + "loss": 0.6783, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9726204872131348, + "rewards/margins": 5.562919616699219, + "rewards/rejected": -2.590299367904663, + "step": 8644 + }, + { + "epoch": 2.16, + "grad_norm": 3.7835006713867188, + "learning_rate": 6.051215802892855e-06, + "logits/chosen": -0.5294063091278076, + "logits/rejected": -0.6537542939186096, + "logps/chosen": -67.45693969726562, + "logps/rejected": -91.00971984863281, + "loss": 0.7442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.511152744293213, + "rewards/margins": 6.02640438079834, + "rewards/rejected": -3.515251398086548, + "step": 8645 + }, + { + "epoch": 2.16, + "grad_norm": 4.001434326171875, + "learning_rate": 6.05044736976463e-06, + "logits/chosen": -0.5860820412635803, + "logits/rejected": -0.6876624822616577, + "logps/chosen": -57.51768493652344, + "logps/rejected": -110.05644989013672, + "loss": 0.6582, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2829489707946777, + "rewards/margins": 6.844577312469482, + "rewards/rejected": -3.5616278648376465, + "step": 8646 + }, + { + "epoch": 2.16, + "grad_norm": 2.8545145988464355, + "learning_rate": 6.049678910678735e-06, + "logits/chosen": -0.4777366518974304, + "logits/rejected": -0.5936201810836792, + "logps/chosen": -50.93595886230469, + "logps/rejected": -92.91832733154297, + "loss": 0.6198, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.985257148742676, + "rewards/margins": 6.8498663902282715, + "rewards/rejected": -3.864609479904175, + "step": 8647 + }, + { + "epoch": 2.16, + "grad_norm": 5.446430206298828, + "learning_rate": 6.048910425654155e-06, + "logits/chosen": -0.44059228897094727, + "logits/rejected": -0.5035892724990845, + "logps/chosen": -56.127315521240234, + "logps/rejected": -91.18379211425781, + "loss": 0.6536, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.201596975326538, + "rewards/margins": 6.215327262878418, + "rewards/rejected": -3.013730764389038, + "step": 8648 + }, + { + "epoch": 2.16, + "grad_norm": 3.055622100830078, + "learning_rate": 6.048141914709884e-06, + "logits/chosen": -0.5123763680458069, + "logits/rejected": -0.49122676253318787, + "logps/chosen": -40.603477478027344, + "logps/rejected": -119.29591369628906, + "loss": 0.6448, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0884439945220947, + "rewards/margins": 6.125409126281738, + "rewards/rejected": -3.0369644165039062, + "step": 8649 + }, + { + "epoch": 2.16, + "grad_norm": 4.638376235961914, + "learning_rate": 6.0473733778649115e-06, + "logits/chosen": -0.47067585587501526, + "logits/rejected": -0.5800422430038452, + "logps/chosen": -61.183372497558594, + "logps/rejected": -99.08122253417969, + "loss": 0.649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9026272296905518, + "rewards/margins": 6.548481464385986, + "rewards/rejected": -3.6458537578582764, + "step": 8650 + }, + { + "epoch": 2.16, + "grad_norm": 8.756531715393066, + "learning_rate": 6.046604815138228e-06, + "logits/chosen": -0.4786826968193054, + "logits/rejected": -0.5261862277984619, + "logps/chosen": -54.716854095458984, + "logps/rejected": -98.56417083740234, + "loss": 0.6973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0011820793151855, + "rewards/margins": 5.4289116859436035, + "rewards/rejected": -2.427729368209839, + "step": 8651 + }, + { + "epoch": 2.16, + "grad_norm": 3.38727068901062, + "learning_rate": 6.045836226548827e-06, + "logits/chosen": -0.6408272385597229, + "logits/rejected": -0.7204078435897827, + "logps/chosen": -91.96035766601562, + "logps/rejected": -84.64054870605469, + "loss": 0.6423, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3658711910247803, + "rewards/margins": 6.2464447021484375, + "rewards/rejected": -3.8805735111236572, + "step": 8652 + }, + { + "epoch": 2.16, + "grad_norm": 5.0561323165893555, + "learning_rate": 6.0450676121157e-06, + "logits/chosen": -0.45996713638305664, + "logits/rejected": -0.5428176522254944, + "logps/chosen": -57.04777526855469, + "logps/rejected": -90.11626434326172, + "loss": 0.7501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.930238962173462, + "rewards/margins": 5.79302453994751, + "rewards/rejected": -2.8627851009368896, + "step": 8653 + }, + { + "epoch": 2.16, + "grad_norm": 4.791961193084717, + "learning_rate": 6.044298971857841e-06, + "logits/chosen": -0.4341915249824524, + "logits/rejected": -0.49533918499946594, + "logps/chosen": -61.57981872558594, + "logps/rejected": -96.62112426757812, + "loss": 0.6815, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7559986114501953, + "rewards/margins": 5.132342338562012, + "rewards/rejected": -2.3763437271118164, + "step": 8654 + }, + { + "epoch": 2.17, + "grad_norm": 6.169352054595947, + "learning_rate": 6.043530305794244e-06, + "logits/chosen": -0.4708648920059204, + "logits/rejected": -0.5600175261497498, + "logps/chosen": -58.181278228759766, + "logps/rejected": -93.54508972167969, + "loss": 0.7164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.992856740951538, + "rewards/margins": 5.165242671966553, + "rewards/rejected": -2.1723856925964355, + "step": 8655 + }, + { + "epoch": 2.17, + "grad_norm": 7.0366363525390625, + "learning_rate": 6.042761613943903e-06, + "logits/chosen": -0.4973156750202179, + "logits/rejected": -0.5444357395172119, + "logps/chosen": -72.9900894165039, + "logps/rejected": -97.85916137695312, + "loss": 0.7656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.877079963684082, + "rewards/margins": 5.161482334136963, + "rewards/rejected": -2.2844021320343018, + "step": 8656 + }, + { + "epoch": 2.17, + "grad_norm": 16.641807556152344, + "learning_rate": 6.0419928963258125e-06, + "logits/chosen": -0.47145184874534607, + "logits/rejected": -0.5620655417442322, + "logps/chosen": -65.20928955078125, + "logps/rejected": -84.76984405517578, + "loss": 0.8004, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3716859817504883, + "rewards/margins": 5.447275161743164, + "rewards/rejected": -2.075589656829834, + "step": 8657 + }, + { + "epoch": 2.17, + "grad_norm": 3.2688241004943848, + "learning_rate": 6.041224152958971e-06, + "logits/chosen": -0.4402313530445099, + "logits/rejected": -0.5498032569885254, + "logps/chosen": -57.08236312866211, + "logps/rejected": -80.54798889160156, + "loss": 0.5876, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0134918689727783, + "rewards/margins": 5.355728626251221, + "rewards/rejected": -2.3422367572784424, + "step": 8658 + }, + { + "epoch": 2.17, + "grad_norm": 6.406438827514648, + "learning_rate": 6.0404553838623715e-06, + "logits/chosen": -0.5481503009796143, + "logits/rejected": -0.6344999074935913, + "logps/chosen": -58.71864318847656, + "logps/rejected": -79.4296875, + "loss": 0.7912, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.050137519836426, + "rewards/margins": 4.610799789428711, + "rewards/rejected": -1.5606622695922852, + "step": 8659 + }, + { + "epoch": 2.17, + "grad_norm": 4.549278736114502, + "learning_rate": 6.0396865890550134e-06, + "logits/chosen": -0.470874160528183, + "logits/rejected": -0.528782069683075, + "logps/chosen": -63.703155517578125, + "logps/rejected": -95.0049819946289, + "loss": 0.7746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.844771146774292, + "rewards/margins": 5.248296737670898, + "rewards/rejected": -2.4035251140594482, + "step": 8660 + }, + { + "epoch": 2.17, + "grad_norm": 6.103726387023926, + "learning_rate": 6.038917768555894e-06, + "logits/chosen": -0.4603353440761566, + "logits/rejected": -0.530130922794342, + "logps/chosen": -51.998809814453125, + "logps/rejected": -97.15930938720703, + "loss": 0.6492, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.382457733154297, + "rewards/margins": 5.664503574371338, + "rewards/rejected": -2.282045602798462, + "step": 8661 + }, + { + "epoch": 2.17, + "grad_norm": 3.2427902221679688, + "learning_rate": 6.038148922384012e-06, + "logits/chosen": -0.45095449686050415, + "logits/rejected": -0.5275976061820984, + "logps/chosen": -65.11003112792969, + "logps/rejected": -96.22570037841797, + "loss": 0.6318, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7895989418029785, + "rewards/margins": 6.012629508972168, + "rewards/rejected": -3.2230300903320312, + "step": 8662 + }, + { + "epoch": 2.17, + "grad_norm": 7.144091606140137, + "learning_rate": 6.037380050558364e-06, + "logits/chosen": -0.48442110419273376, + "logits/rejected": -0.5425711870193481, + "logps/chosen": -62.33390808105469, + "logps/rejected": -93.25135040283203, + "loss": 0.7301, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3323402404785156, + "rewards/margins": 5.066038608551025, + "rewards/rejected": -1.7336983680725098, + "step": 8663 + }, + { + "epoch": 2.17, + "grad_norm": 3.594320058822632, + "learning_rate": 6.0366111530979535e-06, + "logits/chosen": -0.5054008960723877, + "logits/rejected": -0.5938619375228882, + "logps/chosen": -57.080684661865234, + "logps/rejected": -101.60713195800781, + "loss": 0.6651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8712973594665527, + "rewards/margins": 6.533219337463379, + "rewards/rejected": -3.661921739578247, + "step": 8664 + }, + { + "epoch": 2.17, + "grad_norm": 3.528597831726074, + "learning_rate": 6.035842230021778e-06, + "logits/chosen": -0.45838481187820435, + "logits/rejected": -0.520971953868866, + "logps/chosen": -44.615997314453125, + "logps/rejected": -93.2901611328125, + "loss": 0.5386, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.138145923614502, + "rewards/margins": 6.566649436950684, + "rewards/rejected": -3.4285032749176025, + "step": 8665 + }, + { + "epoch": 2.17, + "grad_norm": 11.178019523620605, + "learning_rate": 6.035073281348839e-06, + "logits/chosen": -0.4663234353065491, + "logits/rejected": -0.5778228044509888, + "logps/chosen": -67.43594360351562, + "logps/rejected": -104.74940490722656, + "loss": 0.6886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.902297019958496, + "rewards/margins": 6.915824890136719, + "rewards/rejected": -4.013528347015381, + "step": 8666 + }, + { + "epoch": 2.17, + "grad_norm": 3.5900444984436035, + "learning_rate": 6.03430430709814e-06, + "logits/chosen": -0.5614187717437744, + "logits/rejected": -0.6301470994949341, + "logps/chosen": -52.033626556396484, + "logps/rejected": -95.55174255371094, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1343204975128174, + "rewards/margins": 6.144771575927734, + "rewards/rejected": -3.010450839996338, + "step": 8667 + }, + { + "epoch": 2.17, + "grad_norm": 4.834979057312012, + "learning_rate": 6.033535307288679e-06, + "logits/chosen": -0.5523586869239807, + "logits/rejected": -0.6036503314971924, + "logps/chosen": -61.898929595947266, + "logps/rejected": -94.86158752441406, + "loss": 0.8047, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1938326358795166, + "rewards/margins": 5.9515180587768555, + "rewards/rejected": -2.7576847076416016, + "step": 8668 + }, + { + "epoch": 2.17, + "grad_norm": 13.01874828338623, + "learning_rate": 6.032766281939463e-06, + "logits/chosen": -0.4956878423690796, + "logits/rejected": -0.5770023465156555, + "logps/chosen": -61.60941696166992, + "logps/rejected": -97.25618743896484, + "loss": 0.7748, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.785771369934082, + "rewards/margins": 4.604866981506348, + "rewards/rejected": -1.8190957307815552, + "step": 8669 + }, + { + "epoch": 2.17, + "grad_norm": 13.341133117675781, + "learning_rate": 6.031997231069492e-06, + "logits/chosen": -0.4898484945297241, + "logits/rejected": -0.5976430773735046, + "logps/chosen": -69.35736846923828, + "logps/rejected": -100.61653900146484, + "loss": 0.8219, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6049458980560303, + "rewards/margins": 6.378420352935791, + "rewards/rejected": -3.7734739780426025, + "step": 8670 + }, + { + "epoch": 2.17, + "grad_norm": 4.543042182922363, + "learning_rate": 6.031228154697772e-06, + "logits/chosen": -0.4744908809661865, + "logits/rejected": -0.6151012778282166, + "logps/chosen": -60.00741958618164, + "logps/rejected": -95.4308853149414, + "loss": 0.6439, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1349902153015137, + "rewards/margins": 6.895800590515137, + "rewards/rejected": -3.760809898376465, + "step": 8671 + }, + { + "epoch": 2.17, + "grad_norm": 8.420543670654297, + "learning_rate": 6.03045905284331e-06, + "logits/chosen": -0.4621959328651428, + "logits/rejected": -0.5297998189926147, + "logps/chosen": -62.91989517211914, + "logps/rejected": -111.87955474853516, + "loss": 0.7864, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.784989833831787, + "rewards/margins": 6.7486724853515625, + "rewards/rejected": -3.9636826515197754, + "step": 8672 + }, + { + "epoch": 2.17, + "grad_norm": 5.4044365882873535, + "learning_rate": 6.029689925525106e-06, + "logits/chosen": -0.5407451391220093, + "logits/rejected": -0.598138689994812, + "logps/chosen": -49.20818328857422, + "logps/rejected": -105.28507995605469, + "loss": 0.6834, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9693403244018555, + "rewards/margins": 6.019377708435059, + "rewards/rejected": -3.0500380992889404, + "step": 8673 + }, + { + "epoch": 2.17, + "grad_norm": 13.317789077758789, + "learning_rate": 6.02892077276217e-06, + "logits/chosen": -0.4473779797554016, + "logits/rejected": -0.6029825806617737, + "logps/chosen": -62.23824691772461, + "logps/rejected": -82.35897827148438, + "loss": 0.7439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9219954013824463, + "rewards/margins": 6.639819622039795, + "rewards/rejected": -3.7178244590759277, + "step": 8674 + }, + { + "epoch": 2.17, + "grad_norm": 8.065643310546875, + "learning_rate": 6.028151594573506e-06, + "logits/chosen": -0.5283297896385193, + "logits/rejected": -0.5654581785202026, + "logps/chosen": -68.50096893310547, + "logps/rejected": -99.69662475585938, + "loss": 0.9407, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8712844848632812, + "rewards/margins": 5.816878318786621, + "rewards/rejected": -2.94559383392334, + "step": 8675 + }, + { + "epoch": 2.17, + "grad_norm": 5.603522300720215, + "learning_rate": 6.027382390978125e-06, + "logits/chosen": -0.5679275393486023, + "logits/rejected": -0.6332369446754456, + "logps/chosen": -53.88585662841797, + "logps/rejected": -82.5920639038086, + "loss": 0.7584, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8627681732177734, + "rewards/margins": 5.309689521789551, + "rewards/rejected": -2.4469215869903564, + "step": 8676 + }, + { + "epoch": 2.17, + "grad_norm": 4.592607021331787, + "learning_rate": 6.02661316199503e-06, + "logits/chosen": -0.500950038433075, + "logits/rejected": -0.5548298358917236, + "logps/chosen": -50.73052215576172, + "logps/rejected": -79.47402954101562, + "loss": 0.7107, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9392788410186768, + "rewards/margins": 4.688592910766602, + "rewards/rejected": -1.7493140697479248, + "step": 8677 + }, + { + "epoch": 2.17, + "grad_norm": 6.964572429656982, + "learning_rate": 6.025843907643233e-06, + "logits/chosen": -0.40455788373947144, + "logits/rejected": -0.5202596783638, + "logps/chosen": -60.40955352783203, + "logps/rejected": -89.85436248779297, + "loss": 0.7778, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.09466552734375, + "rewards/margins": 5.382720947265625, + "rewards/rejected": -2.288055658340454, + "step": 8678 + }, + { + "epoch": 2.17, + "grad_norm": 5.42457914352417, + "learning_rate": 6.02507462794174e-06, + "logits/chosen": -0.47031524777412415, + "logits/rejected": -0.5556544065475464, + "logps/chosen": -56.52477264404297, + "logps/rejected": -92.50752258300781, + "loss": 0.6094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8764703273773193, + "rewards/margins": 6.303464889526367, + "rewards/rejected": -3.426994562149048, + "step": 8679 + }, + { + "epoch": 2.17, + "grad_norm": 4.637300491333008, + "learning_rate": 6.024305322909565e-06, + "logits/chosen": -0.5380942225456238, + "logits/rejected": -0.6002028584480286, + "logps/chosen": -51.31462478637695, + "logps/rejected": -89.81414031982422, + "loss": 0.6883, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8700015544891357, + "rewards/margins": 5.538814067840576, + "rewards/rejected": -2.6688122749328613, + "step": 8680 + }, + { + "epoch": 2.17, + "grad_norm": 3.5554463863372803, + "learning_rate": 6.023535992565716e-06, + "logits/chosen": -0.5001459121704102, + "logits/rejected": -0.5550001263618469, + "logps/chosen": -43.76226043701172, + "logps/rejected": -100.9052734375, + "loss": 0.6254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0852861404418945, + "rewards/margins": 5.880467414855957, + "rewards/rejected": -2.7951817512512207, + "step": 8681 + }, + { + "epoch": 2.17, + "grad_norm": 6.1477274894714355, + "learning_rate": 6.022766636929203e-06, + "logits/chosen": -0.487282395362854, + "logits/rejected": -0.6126604676246643, + "logps/chosen": -70.13786315917969, + "logps/rejected": -92.84564208984375, + "loss": 0.7768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.197948694229126, + "rewards/margins": 5.9487504959106445, + "rewards/rejected": -2.7508015632629395, + "step": 8682 + }, + { + "epoch": 2.17, + "grad_norm": 5.229752063751221, + "learning_rate": 6.021997256019037e-06, + "logits/chosen": -0.4529891014099121, + "logits/rejected": -0.5533549189567566, + "logps/chosen": -56.80417251586914, + "logps/rejected": -78.51854705810547, + "loss": 0.7602, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9428045749664307, + "rewards/margins": 5.475869178771973, + "rewards/rejected": -2.533064842224121, + "step": 8683 + }, + { + "epoch": 2.17, + "grad_norm": 5.508463382720947, + "learning_rate": 6.021227849854234e-06, + "logits/chosen": -0.4845016598701477, + "logits/rejected": -0.5299522280693054, + "logps/chosen": -56.91673278808594, + "logps/rejected": -94.22391510009766, + "loss": 0.7699, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9925179481506348, + "rewards/margins": 4.770481109619141, + "rewards/rejected": -1.7779626846313477, + "step": 8684 + }, + { + "epoch": 2.17, + "grad_norm": 24.016254425048828, + "learning_rate": 6.0204584184538055e-06, + "logits/chosen": -0.44341981410980225, + "logits/rejected": -0.5934985876083374, + "logps/chosen": -61.71306610107422, + "logps/rejected": -84.49333190917969, + "loss": 0.6634, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6309080123901367, + "rewards/margins": 6.1911725997924805, + "rewards/rejected": -3.5602641105651855, + "step": 8685 + }, + { + "epoch": 2.17, + "grad_norm": 4.031693458557129, + "learning_rate": 6.01968896183676e-06, + "logits/chosen": -0.45363056659698486, + "logits/rejected": -0.5541859865188599, + "logps/chosen": -65.0381851196289, + "logps/rejected": -93.76727294921875, + "loss": 0.8488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9303011894226074, + "rewards/margins": 4.811652183532715, + "rewards/rejected": -1.8813506364822388, + "step": 8686 + }, + { + "epoch": 2.17, + "grad_norm": 6.1606669425964355, + "learning_rate": 6.01891948002212e-06, + "logits/chosen": -0.5076603889465332, + "logits/rejected": -0.6338167786598206, + "logps/chosen": -56.122467041015625, + "logps/rejected": -74.86944580078125, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8744192123413086, + "rewards/margins": 5.970943450927734, + "rewards/rejected": -3.096524238586426, + "step": 8687 + }, + { + "epoch": 2.17, + "grad_norm": 3.9180948734283447, + "learning_rate": 6.018149973028893e-06, + "logits/chosen": -0.5193759799003601, + "logits/rejected": -0.5525437593460083, + "logps/chosen": -50.67488098144531, + "logps/rejected": -97.27850341796875, + "loss": 0.7076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1960136890411377, + "rewards/margins": 5.504900932312012, + "rewards/rejected": -2.308887481689453, + "step": 8688 + }, + { + "epoch": 2.17, + "grad_norm": 6.9739532470703125, + "learning_rate": 6.017380440876098e-06, + "logits/chosen": -0.49154147505760193, + "logits/rejected": -0.6259880065917969, + "logps/chosen": -63.7589111328125, + "logps/rejected": -89.37300109863281, + "loss": 0.7406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858241081237793, + "rewards/margins": 5.813309192657471, + "rewards/rejected": -2.9550681114196777, + "step": 8689 + }, + { + "epoch": 2.17, + "grad_norm": 5.767848968505859, + "learning_rate": 6.016610883582752e-06, + "logits/chosen": -0.5363858342170715, + "logits/rejected": -0.5866792798042297, + "logps/chosen": -45.48849868774414, + "logps/rejected": -88.27916717529297, + "loss": 0.6426, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.096102237701416, + "rewards/margins": 4.920394420623779, + "rewards/rejected": -1.8242921829223633, + "step": 8690 + }, + { + "epoch": 2.17, + "grad_norm": 3.888413667678833, + "learning_rate": 6.015841301167867e-06, + "logits/chosen": -0.5137187242507935, + "logits/rejected": -0.5718059539794922, + "logps/chosen": -48.22960662841797, + "logps/rejected": -90.44221496582031, + "loss": 0.6887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046618938446045, + "rewards/margins": 6.237608909606934, + "rewards/rejected": -3.1909894943237305, + "step": 8691 + }, + { + "epoch": 2.17, + "grad_norm": 4.185261249542236, + "learning_rate": 6.015071693650465e-06, + "logits/chosen": -0.43391430377960205, + "logits/rejected": -0.5473482608795166, + "logps/chosen": -59.677486419677734, + "logps/rejected": -84.5452651977539, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.278717041015625, + "rewards/margins": 5.8386030197143555, + "rewards/rejected": -2.5598859786987305, + "step": 8692 + }, + { + "epoch": 2.17, + "grad_norm": 11.778050422668457, + "learning_rate": 6.01430206104956e-06, + "logits/chosen": -0.49870818853378296, + "logits/rejected": -0.5440057516098022, + "logps/chosen": -66.88192749023438, + "logps/rejected": -94.92462921142578, + "loss": 0.8996, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9741291999816895, + "rewards/margins": 4.598949432373047, + "rewards/rejected": -1.6248201131820679, + "step": 8693 + }, + { + "epoch": 2.17, + "grad_norm": 3.5821969509124756, + "learning_rate": 6.013532403384173e-06, + "logits/chosen": -0.4309774935245514, + "logits/rejected": -0.574350118637085, + "logps/chosen": -55.90899658203125, + "logps/rejected": -89.41270446777344, + "loss": 0.6186, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.022343873977661, + "rewards/margins": 6.592476844787598, + "rewards/rejected": -3.5701332092285156, + "step": 8694 + }, + { + "epoch": 2.18, + "grad_norm": 2.4334909915924072, + "learning_rate": 6.012762720673322e-06, + "logits/chosen": -0.5079385638237, + "logits/rejected": -0.6206765174865723, + "logps/chosen": -53.63072967529297, + "logps/rejected": -81.4252700805664, + "loss": 0.5779, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0565826892852783, + "rewards/margins": 5.990577697753906, + "rewards/rejected": -2.933995485305786, + "step": 8695 + }, + { + "epoch": 2.18, + "grad_norm": 8.248030662536621, + "learning_rate": 6.011993012936028e-06, + "logits/chosen": -0.5284509062767029, + "logits/rejected": -0.619777500629425, + "logps/chosen": -51.95257568359375, + "logps/rejected": -97.1526870727539, + "loss": 0.6832, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.839130163192749, + "rewards/margins": 6.302407264709473, + "rewards/rejected": -3.463277578353882, + "step": 8696 + }, + { + "epoch": 2.18, + "grad_norm": 2.967871904373169, + "learning_rate": 6.01122328019131e-06, + "logits/chosen": -0.4131258726119995, + "logits/rejected": -0.4946121573448181, + "logps/chosen": -62.279441833496094, + "logps/rejected": -93.87675476074219, + "loss": 0.6861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.945577383041382, + "rewards/margins": 5.770987510681152, + "rewards/rejected": -2.8254103660583496, + "step": 8697 + }, + { + "epoch": 2.18, + "grad_norm": 6.969301223754883, + "learning_rate": 6.010453522458188e-06, + "logits/chosen": -0.4672723412513733, + "logits/rejected": -0.5752291083335876, + "logps/chosen": -59.3784294128418, + "logps/rejected": -114.62458038330078, + "loss": 0.6686, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.783287286758423, + "rewards/margins": 7.263439178466797, + "rewards/rejected": -4.4801530838012695, + "step": 8698 + }, + { + "epoch": 2.18, + "grad_norm": 4.064791679382324, + "learning_rate": 6.009683739755686e-06, + "logits/chosen": -0.5181320905685425, + "logits/rejected": -0.5818905234336853, + "logps/chosen": -50.036537170410156, + "logps/rejected": -90.52351379394531, + "loss": 0.6826, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2967286109924316, + "rewards/margins": 5.238694190979004, + "rewards/rejected": -1.941965937614441, + "step": 8699 + }, + { + "epoch": 2.18, + "grad_norm": 2.874293088912964, + "learning_rate": 6.008913932102823e-06, + "logits/chosen": -0.4039002060890198, + "logits/rejected": -0.45501816272735596, + "logps/chosen": -69.55128479003906, + "logps/rejected": -106.91570281982422, + "loss": 0.6752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.001805543899536, + "rewards/margins": 5.617221355438232, + "rewards/rejected": -2.6154162883758545, + "step": 8700 + }, + { + "epoch": 2.18, + "grad_norm": 7.294665336608887, + "learning_rate": 6.008144099518626e-06, + "logits/chosen": -0.45866745710372925, + "logits/rejected": -0.5662448406219482, + "logps/chosen": -60.53034210205078, + "logps/rejected": -88.17530822753906, + "loss": 0.6661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.471614122390747, + "rewards/margins": 5.7826080322265625, + "rewards/rejected": -3.3109934329986572, + "step": 8701 + }, + { + "epoch": 2.18, + "grad_norm": 13.93594741821289, + "learning_rate": 6.007374242022115e-06, + "logits/chosen": -0.5148975253105164, + "logits/rejected": -0.5280841588973999, + "logps/chosen": -55.87247085571289, + "logps/rejected": -102.43999481201172, + "loss": 0.7658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6494438648223877, + "rewards/margins": 4.508363723754883, + "rewards/rejected": -1.858919382095337, + "step": 8702 + }, + { + "epoch": 2.18, + "grad_norm": 12.019851684570312, + "learning_rate": 6.006604359632314e-06, + "logits/chosen": -0.4681955873966217, + "logits/rejected": -0.5334908366203308, + "logps/chosen": -63.19353485107422, + "logps/rejected": -103.26449584960938, + "loss": 0.7133, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9546666145324707, + "rewards/margins": 6.506743431091309, + "rewards/rejected": -3.552077293395996, + "step": 8703 + }, + { + "epoch": 2.18, + "grad_norm": 12.036235809326172, + "learning_rate": 6.005834452368251e-06, + "logits/chosen": -0.5282216668128967, + "logits/rejected": -0.6110131144523621, + "logps/chosen": -47.0398063659668, + "logps/rejected": -96.77912902832031, + "loss": 0.6203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8422999382019043, + "rewards/margins": 6.913072109222412, + "rewards/rejected": -4.070772171020508, + "step": 8704 + }, + { + "epoch": 2.18, + "grad_norm": 6.50285530090332, + "learning_rate": 6.005064520248947e-06, + "logits/chosen": -0.48555198311805725, + "logits/rejected": -0.6381357908248901, + "logps/chosen": -57.60121154785156, + "logps/rejected": -97.54700469970703, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.896850109100342, + "rewards/margins": 6.7329912185668945, + "rewards/rejected": -3.836141347885132, + "step": 8705 + }, + { + "epoch": 2.18, + "grad_norm": 3.151228666305542, + "learning_rate": 6.004294563293429e-06, + "logits/chosen": -0.4526015520095825, + "logits/rejected": -0.5443915128707886, + "logps/chosen": -60.29021072387695, + "logps/rejected": -96.48246002197266, + "loss": 0.6241, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.313643455505371, + "rewards/margins": 6.56645393371582, + "rewards/rejected": -3.2528107166290283, + "step": 8706 + }, + { + "epoch": 2.18, + "grad_norm": 11.047879219055176, + "learning_rate": 6.003524581520726e-06, + "logits/chosen": -0.5436469316482544, + "logits/rejected": -0.5944231748580933, + "logps/chosen": -53.382774353027344, + "logps/rejected": -85.52906799316406, + "loss": 0.7677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1346192359924316, + "rewards/margins": 5.2978620529174805, + "rewards/rejected": -2.163243055343628, + "step": 8707 + }, + { + "epoch": 2.18, + "grad_norm": 4.267085552215576, + "learning_rate": 6.002754574949862e-06, + "logits/chosen": -0.47562772035598755, + "logits/rejected": -0.5815871357917786, + "logps/chosen": -51.74159622192383, + "logps/rejected": -79.44725036621094, + "loss": 0.627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.025263547897339, + "rewards/margins": 5.562039852142334, + "rewards/rejected": -2.536776304244995, + "step": 8708 + }, + { + "epoch": 2.18, + "grad_norm": 5.33732271194458, + "learning_rate": 6.001984543599866e-06, + "logits/chosen": -0.5490601062774658, + "logits/rejected": -0.6696580648422241, + "logps/chosen": -57.24618911743164, + "logps/rejected": -79.23753356933594, + "loss": 0.6418, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8342232704162598, + "rewards/margins": 6.4367995262146, + "rewards/rejected": -3.60257625579834, + "step": 8709 + }, + { + "epoch": 2.18, + "grad_norm": 4.134236812591553, + "learning_rate": 6.001214487489766e-06, + "logits/chosen": -0.4832812547683716, + "logits/rejected": -0.5711472630500793, + "logps/chosen": -53.82769775390625, + "logps/rejected": -107.22908782958984, + "loss": 0.6219, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.909383535385132, + "rewards/margins": 6.710440635681152, + "rewards/rejected": -3.8010573387145996, + "step": 8710 + }, + { + "epoch": 2.18, + "grad_norm": 15.5825777053833, + "learning_rate": 6.000444406638591e-06, + "logits/chosen": -0.49423015117645264, + "logits/rejected": -0.6261444091796875, + "logps/chosen": -64.82344818115234, + "logps/rejected": -88.80290222167969, + "loss": 0.8669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5602004528045654, + "rewards/margins": 6.124186992645264, + "rewards/rejected": -3.5639865398406982, + "step": 8711 + }, + { + "epoch": 2.18, + "grad_norm": 3.9847233295440674, + "learning_rate": 5.999674301065372e-06, + "logits/chosen": -0.5253705382347107, + "logits/rejected": -0.6324952840805054, + "logps/chosen": -53.321075439453125, + "logps/rejected": -89.69572448730469, + "loss": 0.7235, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0648181438446045, + "rewards/margins": 6.019064426422119, + "rewards/rejected": -2.9542458057403564, + "step": 8712 + }, + { + "epoch": 2.18, + "grad_norm": 8.715545654296875, + "learning_rate": 5.998904170789137e-06, + "logits/chosen": -0.5137951970100403, + "logits/rejected": -0.6003721356391907, + "logps/chosen": -46.1606559753418, + "logps/rejected": -108.13072967529297, + "loss": 0.6441, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9180383682250977, + "rewards/margins": 7.144001007080078, + "rewards/rejected": -4.2259626388549805, + "step": 8713 + }, + { + "epoch": 2.18, + "grad_norm": 4.3228983879089355, + "learning_rate": 5.998134015828916e-06, + "logits/chosen": -0.4165303707122803, + "logits/rejected": -0.5077933073043823, + "logps/chosen": -49.19737243652344, + "logps/rejected": -91.09236907958984, + "loss": 0.6082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0346415042877197, + "rewards/margins": 5.840753078460693, + "rewards/rejected": -2.8061118125915527, + "step": 8714 + }, + { + "epoch": 2.18, + "grad_norm": 3.677725315093994, + "learning_rate": 5.997363836203744e-06, + "logits/chosen": -0.4153618812561035, + "logits/rejected": -0.5223869681358337, + "logps/chosen": -71.99415588378906, + "logps/rejected": -100.46378326416016, + "loss": 0.6576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.946467161178589, + "rewards/margins": 6.0420637130737305, + "rewards/rejected": -3.0955963134765625, + "step": 8715 + }, + { + "epoch": 2.18, + "grad_norm": 3.307352066040039, + "learning_rate": 5.996593631932649e-06, + "logits/chosen": -0.5503137707710266, + "logits/rejected": -0.6084356904029846, + "logps/chosen": -41.37316131591797, + "logps/rejected": -86.81495666503906, + "loss": 0.6833, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.881436347961426, + "rewards/margins": 5.7485880851745605, + "rewards/rejected": -2.8671517372131348, + "step": 8716 + }, + { + "epoch": 2.18, + "grad_norm": 4.867707252502441, + "learning_rate": 5.995823403034666e-06, + "logits/chosen": -0.5634890198707581, + "logits/rejected": -0.6840890645980835, + "logps/chosen": -50.983909606933594, + "logps/rejected": -81.98131561279297, + "loss": 0.6531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1371097564697266, + "rewards/margins": 6.105216026306152, + "rewards/rejected": -2.968106269836426, + "step": 8717 + }, + { + "epoch": 2.18, + "grad_norm": 5.385268688201904, + "learning_rate": 5.995053149528828e-06, + "logits/chosen": -0.4321833550930023, + "logits/rejected": -0.503385066986084, + "logps/chosen": -56.38515090942383, + "logps/rejected": -82.3494873046875, + "loss": 0.8192, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0889604091644287, + "rewards/margins": 3.9946377277374268, + "rewards/rejected": -0.9056771993637085, + "step": 8718 + }, + { + "epoch": 2.18, + "grad_norm": 2.779787063598633, + "learning_rate": 5.9942828714341685e-06, + "logits/chosen": -0.49080824851989746, + "logits/rejected": -0.6328280568122864, + "logps/chosen": -49.65362548828125, + "logps/rejected": -70.54612731933594, + "loss": 0.5728, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.732990264892578, + "rewards/margins": 6.072486877441406, + "rewards/rejected": -3.339496374130249, + "step": 8719 + }, + { + "epoch": 2.18, + "grad_norm": 6.761359214782715, + "learning_rate": 5.993512568769719e-06, + "logits/chosen": -0.48318231105804443, + "logits/rejected": -0.575722336769104, + "logps/chosen": -47.29015350341797, + "logps/rejected": -88.28495025634766, + "loss": 0.6377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177168369293213, + "rewards/margins": 5.283368110656738, + "rewards/rejected": -2.1061995029449463, + "step": 8720 + }, + { + "epoch": 2.18, + "grad_norm": 2.8949520587921143, + "learning_rate": 5.992742241554521e-06, + "logits/chosen": -0.4663320481777191, + "logits/rejected": -0.5826578140258789, + "logps/chosen": -51.742393493652344, + "logps/rejected": -88.4804916381836, + "loss": 0.584, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9905107021331787, + "rewards/margins": 6.123347759246826, + "rewards/rejected": -3.1328375339508057, + "step": 8721 + }, + { + "epoch": 2.18, + "grad_norm": 6.321597576141357, + "learning_rate": 5.991971889807605e-06, + "logits/chosen": -0.5170214772224426, + "logits/rejected": -0.6133747100830078, + "logps/chosen": -56.313087463378906, + "logps/rejected": -86.68167877197266, + "loss": 0.7459, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8152146339416504, + "rewards/margins": 5.785305976867676, + "rewards/rejected": -2.9700911045074463, + "step": 8722 + }, + { + "epoch": 2.18, + "grad_norm": 3.2805685997009277, + "learning_rate": 5.991201513548006e-06, + "logits/chosen": -0.5319005250930786, + "logits/rejected": -0.6357275247573853, + "logps/chosen": -53.61565017700195, + "logps/rejected": -83.69071960449219, + "loss": 0.6791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.21560001373291, + "rewards/margins": 5.6114912033081055, + "rewards/rejected": -2.395890951156616, + "step": 8723 + }, + { + "epoch": 2.18, + "grad_norm": 4.968726634979248, + "learning_rate": 5.990431112794766e-06, + "logits/chosen": -0.45402592420578003, + "logits/rejected": -0.5329551696777344, + "logps/chosen": -66.7130355834961, + "logps/rejected": -98.13249969482422, + "loss": 0.7321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9071195125579834, + "rewards/margins": 5.844457149505615, + "rewards/rejected": -2.9373373985290527, + "step": 8724 + }, + { + "epoch": 2.18, + "grad_norm": 6.347146034240723, + "learning_rate": 5.98966068756692e-06, + "logits/chosen": -0.4349496364593506, + "logits/rejected": -0.5050215125083923, + "logps/chosen": -64.20181274414062, + "logps/rejected": -95.75818634033203, + "loss": 0.773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0481278896331787, + "rewards/margins": 5.495678424835205, + "rewards/rejected": -2.4475507736206055, + "step": 8725 + }, + { + "epoch": 2.18, + "grad_norm": 3.056601047515869, + "learning_rate": 5.988890237883505e-06, + "logits/chosen": -0.4875144064426422, + "logits/rejected": -0.5293043851852417, + "logps/chosen": -44.52494430541992, + "logps/rejected": -98.58161163330078, + "loss": 0.5801, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3001930713653564, + "rewards/margins": 5.755150318145752, + "rewards/rejected": -2.4549572467803955, + "step": 8726 + }, + { + "epoch": 2.18, + "grad_norm": 5.763659477233887, + "learning_rate": 5.988119763763559e-06, + "logits/chosen": -0.4299532175064087, + "logits/rejected": -0.5520087480545044, + "logps/chosen": -53.040184020996094, + "logps/rejected": -86.03572082519531, + "loss": 0.6282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.202573537826538, + "rewards/margins": 4.972135066986084, + "rewards/rejected": -1.7695610523223877, + "step": 8727 + }, + { + "epoch": 2.18, + "grad_norm": 3.706880569458008, + "learning_rate": 5.9873492652261245e-06, + "logits/chosen": -0.4679965078830719, + "logits/rejected": -0.5429185628890991, + "logps/chosen": -51.19597244262695, + "logps/rejected": -81.4386978149414, + "loss": 0.6565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.125300407409668, + "rewards/margins": 5.302867889404297, + "rewards/rejected": -2.17756724357605, + "step": 8728 + }, + { + "epoch": 2.18, + "grad_norm": 2.9430508613586426, + "learning_rate": 5.986578742290239e-06, + "logits/chosen": -0.5292670726776123, + "logits/rejected": -0.6087514162063599, + "logps/chosen": -42.43921661376953, + "logps/rejected": -79.83979797363281, + "loss": 0.6207, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.102187395095825, + "rewards/margins": 5.878721714019775, + "rewards/rejected": -2.7765350341796875, + "step": 8729 + }, + { + "epoch": 2.18, + "grad_norm": 5.450256824493408, + "learning_rate": 5.985808194974943e-06, + "logits/chosen": -0.5253744721412659, + "logits/rejected": -0.6422861814498901, + "logps/chosen": -47.63568878173828, + "logps/rejected": -93.71931457519531, + "loss": 0.534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9287123680114746, + "rewards/margins": 6.6304802894592285, + "rewards/rejected": -3.701767921447754, + "step": 8730 + }, + { + "epoch": 2.18, + "grad_norm": 14.262191772460938, + "learning_rate": 5.985037623299278e-06, + "logits/chosen": -0.4861869513988495, + "logits/rejected": -0.5360894203186035, + "logps/chosen": -53.16387939453125, + "logps/rejected": -91.44215393066406, + "loss": 0.794, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1137986183166504, + "rewards/margins": 5.059067726135254, + "rewards/rejected": -1.9452693462371826, + "step": 8731 + }, + { + "epoch": 2.18, + "grad_norm": 2.2554879188537598, + "learning_rate": 5.984267027282286e-06, + "logits/chosen": -0.47898873686790466, + "logits/rejected": -0.5648108720779419, + "logps/chosen": -53.47008514404297, + "logps/rejected": -92.39228820800781, + "loss": 0.6214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0676326751708984, + "rewards/margins": 6.27712345123291, + "rewards/rejected": -3.209491014480591, + "step": 8732 + }, + { + "epoch": 2.18, + "grad_norm": 33.92911148071289, + "learning_rate": 5.983496406943009e-06, + "logits/chosen": -0.4579692482948303, + "logits/rejected": -0.5573217272758484, + "logps/chosen": -63.64991760253906, + "logps/rejected": -91.33905792236328, + "loss": 0.8819, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6701741218566895, + "rewards/margins": 4.4121575355529785, + "rewards/rejected": -1.7419829368591309, + "step": 8733 + }, + { + "epoch": 2.18, + "grad_norm": 6.795610427856445, + "learning_rate": 5.982725762300489e-06, + "logits/chosen": -0.4767282009124756, + "logits/rejected": -0.5617036819458008, + "logps/chosen": -45.211734771728516, + "logps/rejected": -84.61665344238281, + "loss": 0.7007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.059886932373047, + "rewards/margins": 6.21923828125, + "rewards/rejected": -3.159350633621216, + "step": 8734 + }, + { + "epoch": 2.19, + "grad_norm": 5.518632411956787, + "learning_rate": 5.98195509337377e-06, + "logits/chosen": -0.49422088265419006, + "logits/rejected": -0.5488013625144958, + "logps/chosen": -47.179344177246094, + "logps/rejected": -105.75271606445312, + "loss": 0.59, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2135491371154785, + "rewards/margins": 6.893251419067383, + "rewards/rejected": -3.6797027587890625, + "step": 8735 + }, + { + "epoch": 2.19, + "grad_norm": 4.997684955596924, + "learning_rate": 5.981184400181897e-06, + "logits/chosen": -0.5061626434326172, + "logits/rejected": -0.5785166621208191, + "logps/chosen": -59.70274353027344, + "logps/rejected": -87.31136322021484, + "loss": 0.8071, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7746124267578125, + "rewards/margins": 5.336117267608643, + "rewards/rejected": -2.56150484085083, + "step": 8736 + }, + { + "epoch": 2.19, + "grad_norm": 8.872027397155762, + "learning_rate": 5.980413682743913e-06, + "logits/chosen": -0.5104723572731018, + "logits/rejected": -0.5847805142402649, + "logps/chosen": -55.186622619628906, + "logps/rejected": -93.22434997558594, + "loss": 0.6982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.853438377380371, + "rewards/margins": 5.704537868499756, + "rewards/rejected": -2.8510994911193848, + "step": 8737 + }, + { + "epoch": 2.19, + "grad_norm": 3.5551364421844482, + "learning_rate": 5.979642941078865e-06, + "logits/chosen": -0.5159170627593994, + "logits/rejected": -0.6262511014938354, + "logps/chosen": -59.452762603759766, + "logps/rejected": -83.17959594726562, + "loss": 0.7278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.437643527984619, + "rewards/margins": 6.064222812652588, + "rewards/rejected": -2.626579523086548, + "step": 8738 + }, + { + "epoch": 2.19, + "grad_norm": 3.6838343143463135, + "learning_rate": 5.978872175205798e-06, + "logits/chosen": -0.48660042881965637, + "logits/rejected": -0.5944138765335083, + "logps/chosen": -63.222618103027344, + "logps/rejected": -90.31010437011719, + "loss": 0.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.690013885498047, + "rewards/margins": 5.602982044219971, + "rewards/rejected": -2.9129676818847656, + "step": 8739 + }, + { + "epoch": 2.19, + "grad_norm": 10.97460651397705, + "learning_rate": 5.978101385143757e-06, + "logits/chosen": -0.3222191333770752, + "logits/rejected": -0.4202832877635956, + "logps/chosen": -60.71662521362305, + "logps/rejected": -92.9952392578125, + "loss": 0.7319, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.681572437286377, + "rewards/margins": 5.3093390464782715, + "rewards/rejected": -2.6277666091918945, + "step": 8740 + }, + { + "epoch": 2.19, + "grad_norm": 3.7093441486358643, + "learning_rate": 5.977330570911791e-06, + "logits/chosen": -0.47498461604118347, + "logits/rejected": -0.5868924856185913, + "logps/chosen": -58.17442321777344, + "logps/rejected": -92.98522186279297, + "loss": 0.6812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9725759029388428, + "rewards/margins": 5.927910804748535, + "rewards/rejected": -2.9553349018096924, + "step": 8741 + }, + { + "epoch": 2.19, + "grad_norm": 4.253256797790527, + "learning_rate": 5.976559732528948e-06, + "logits/chosen": -0.6007344722747803, + "logits/rejected": -0.7046520113945007, + "logps/chosen": -47.851741790771484, + "logps/rejected": -91.81266021728516, + "loss": 0.6782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1533026695251465, + "rewards/margins": 6.238100051879883, + "rewards/rejected": -3.0847978591918945, + "step": 8742 + }, + { + "epoch": 2.19, + "grad_norm": 3.0772359371185303, + "learning_rate": 5.975788870014273e-06, + "logits/chosen": -0.5226656198501587, + "logits/rejected": -0.5823563933372498, + "logps/chosen": -52.4727897644043, + "logps/rejected": -103.27857208251953, + "loss": 0.6277, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8589537143707275, + "rewards/margins": 6.3996405601501465, + "rewards/rejected": -3.540687084197998, + "step": 8743 + }, + { + "epoch": 2.19, + "grad_norm": 5.5930399894714355, + "learning_rate": 5.97501798338682e-06, + "logits/chosen": -0.5133680105209351, + "logits/rejected": -0.6345595717430115, + "logps/chosen": -61.22180938720703, + "logps/rejected": -76.5524673461914, + "loss": 0.7852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0180163383483887, + "rewards/margins": 5.0580339431762695, + "rewards/rejected": -2.040017604827881, + "step": 8744 + }, + { + "epoch": 2.19, + "grad_norm": 6.09093713760376, + "learning_rate": 5.974247072665636e-06, + "logits/chosen": -0.4701473116874695, + "logits/rejected": -0.5480021834373474, + "logps/chosen": -56.94731140136719, + "logps/rejected": -99.98686981201172, + "loss": 0.6841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9161148071289062, + "rewards/margins": 5.255104064941406, + "rewards/rejected": -2.3389899730682373, + "step": 8745 + }, + { + "epoch": 2.19, + "grad_norm": 23.304889678955078, + "learning_rate": 5.973476137869767e-06, + "logits/chosen": -0.5273051857948303, + "logits/rejected": -0.6012884974479675, + "logps/chosen": -66.07315826416016, + "logps/rejected": -92.87055969238281, + "loss": 0.878, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.68981671333313, + "rewards/margins": 5.132020950317383, + "rewards/rejected": -2.442204236984253, + "step": 8746 + }, + { + "epoch": 2.19, + "grad_norm": 2.960885763168335, + "learning_rate": 5.97270517901827e-06, + "logits/chosen": -0.5504799485206604, + "logits/rejected": -0.6631137132644653, + "logps/chosen": -55.70966720581055, + "logps/rejected": -93.03260803222656, + "loss": 0.6048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9288277626037598, + "rewards/margins": 6.138845920562744, + "rewards/rejected": -3.2100181579589844, + "step": 8747 + }, + { + "epoch": 2.19, + "grad_norm": 4.578742027282715, + "learning_rate": 5.971934196130191e-06, + "logits/chosen": -0.490430623292923, + "logits/rejected": -0.5240938067436218, + "logps/chosen": -47.42053985595703, + "logps/rejected": -107.77822875976562, + "loss": 0.5673, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0436513423919678, + "rewards/margins": 6.168457508087158, + "rewards/rejected": -3.1248059272766113, + "step": 8748 + }, + { + "epoch": 2.19, + "grad_norm": 3.1810390949249268, + "learning_rate": 5.971163189224586e-06, + "logits/chosen": -0.49211663007736206, + "logits/rejected": -0.589858889579773, + "logps/chosen": -48.4493408203125, + "logps/rejected": -89.10528564453125, + "loss": 0.5782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0709822177886963, + "rewards/margins": 5.491977691650391, + "rewards/rejected": -2.420994997024536, + "step": 8749 + }, + { + "epoch": 2.19, + "grad_norm": 4.24805212020874, + "learning_rate": 5.970392158320505e-06, + "logits/chosen": -0.3998371958732605, + "logits/rejected": -0.49139001965522766, + "logps/chosen": -58.18370056152344, + "logps/rejected": -102.43548583984375, + "loss": 0.637, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0011098384857178, + "rewards/margins": 6.244253158569336, + "rewards/rejected": -3.2431435585021973, + "step": 8750 + }, + { + "epoch": 2.19, + "grad_norm": 7.15388822555542, + "learning_rate": 5.969621103437003e-06, + "logits/chosen": -0.5232100486755371, + "logits/rejected": -0.6626855134963989, + "logps/chosen": -56.82674026489258, + "logps/rejected": -89.69133758544922, + "loss": 0.6919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4872665405273438, + "rewards/margins": 6.612412452697754, + "rewards/rejected": -3.12514591217041, + "step": 8751 + }, + { + "epoch": 2.19, + "grad_norm": 11.106064796447754, + "learning_rate": 5.96885002459313e-06, + "logits/chosen": -0.5694494843482971, + "logits/rejected": -0.6659989356994629, + "logps/chosen": -50.55671310424805, + "logps/rejected": -91.2597885131836, + "loss": 0.7092, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.101442337036133, + "rewards/margins": 5.1586222648620605, + "rewards/rejected": -2.057180166244507, + "step": 8752 + }, + { + "epoch": 2.19, + "grad_norm": 2.3216021060943604, + "learning_rate": 5.968078921807945e-06, + "logits/chosen": -0.4699118733406067, + "logits/rejected": -0.5466015934944153, + "logps/chosen": -51.01387023925781, + "logps/rejected": -95.16107177734375, + "loss": 0.5573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.147279739379883, + "rewards/margins": 6.275940895080566, + "rewards/rejected": -3.128661632537842, + "step": 8753 + }, + { + "epoch": 2.19, + "grad_norm": 3.885061740875244, + "learning_rate": 5.967307795100498e-06, + "logits/chosen": -0.500278890132904, + "logits/rejected": -0.5865803956985474, + "logps/chosen": -64.25668334960938, + "logps/rejected": -110.3959732055664, + "loss": 0.696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1739861965179443, + "rewards/margins": 7.177708148956299, + "rewards/rejected": -4.003722190856934, + "step": 8754 + }, + { + "epoch": 2.19, + "grad_norm": 3.7532293796539307, + "learning_rate": 5.966536644489847e-06, + "logits/chosen": -0.43446430563926697, + "logits/rejected": -0.5184502601623535, + "logps/chosen": -70.22233581542969, + "logps/rejected": -92.91925048828125, + "loss": 0.7056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3474843502044678, + "rewards/margins": 5.6364593505859375, + "rewards/rejected": -2.288975238800049, + "step": 8755 + }, + { + "epoch": 2.19, + "grad_norm": 5.39702033996582, + "learning_rate": 5.965765469995048e-06, + "logits/chosen": -0.47271624207496643, + "logits/rejected": -0.5392816066741943, + "logps/chosen": -56.697120666503906, + "logps/rejected": -92.34774780273438, + "loss": 0.6887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.823355197906494, + "rewards/margins": 5.912901878356934, + "rewards/rejected": -3.0895471572875977, + "step": 8756 + }, + { + "epoch": 2.19, + "grad_norm": 8.826919555664062, + "learning_rate": 5.964994271635156e-06, + "logits/chosen": -0.5329580903053284, + "logits/rejected": -0.5966018438339233, + "logps/chosen": -62.1514778137207, + "logps/rejected": -94.92996978759766, + "loss": 0.7579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.643852710723877, + "rewards/margins": 5.661032676696777, + "rewards/rejected": -3.0171799659729004, + "step": 8757 + }, + { + "epoch": 2.19, + "grad_norm": 7.352463245391846, + "learning_rate": 5.964223049429229e-06, + "logits/chosen": -0.4947291314601898, + "logits/rejected": -0.5769263505935669, + "logps/chosen": -63.078617095947266, + "logps/rejected": -118.95205688476562, + "loss": 0.7951, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1358401775360107, + "rewards/margins": 6.845357894897461, + "rewards/rejected": -3.709517478942871, + "step": 8758 + }, + { + "epoch": 2.19, + "grad_norm": 5.116219997406006, + "learning_rate": 5.963451803396326e-06, + "logits/chosen": -0.4594685733318329, + "logits/rejected": -0.5681061744689941, + "logps/chosen": -60.13338088989258, + "logps/rejected": -90.67417907714844, + "loss": 0.6548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9356417655944824, + "rewards/margins": 5.930031776428223, + "rewards/rejected": -2.9943900108337402, + "step": 8759 + }, + { + "epoch": 2.19, + "grad_norm": 5.839029312133789, + "learning_rate": 5.962680533555504e-06, + "logits/chosen": -0.45448845624923706, + "logits/rejected": -0.5374205708503723, + "logps/chosen": -55.395973205566406, + "logps/rejected": -84.87694549560547, + "loss": 0.6404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.030486822128296, + "rewards/margins": 5.959656238555908, + "rewards/rejected": -2.9291694164276123, + "step": 8760 + }, + { + "epoch": 2.19, + "grad_norm": 3.1588134765625, + "learning_rate": 5.961909239925821e-06, + "logits/chosen": -0.5152610540390015, + "logits/rejected": -0.5566781759262085, + "logps/chosen": -57.9581298828125, + "logps/rejected": -82.55040740966797, + "loss": 0.6972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3942389488220215, + "rewards/margins": 5.719951629638672, + "rewards/rejected": -2.3257129192352295, + "step": 8761 + }, + { + "epoch": 2.19, + "grad_norm": 8.533239364624023, + "learning_rate": 5.96113792252634e-06, + "logits/chosen": -0.5563852190971375, + "logits/rejected": -0.6555877923965454, + "logps/chosen": -47.88368225097656, + "logps/rejected": -67.97466278076172, + "loss": 0.6866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.025338888168335, + "rewards/margins": 5.8175835609436035, + "rewards/rejected": -2.7922444343566895, + "step": 8762 + }, + { + "epoch": 2.19, + "grad_norm": 2.7124714851379395, + "learning_rate": 5.960366581376117e-06, + "logits/chosen": -0.46213218569755554, + "logits/rejected": -0.5698897838592529, + "logps/chosen": -52.78363037109375, + "logps/rejected": -92.19857788085938, + "loss": 0.5583, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.687675714492798, + "rewards/margins": 6.907735347747803, + "rewards/rejected": -4.220059394836426, + "step": 8763 + }, + { + "epoch": 2.19, + "grad_norm": 9.652596473693848, + "learning_rate": 5.959595216494214e-06, + "logits/chosen": -0.45210736989974976, + "logits/rejected": -0.5046809911727905, + "logps/chosen": -63.53651428222656, + "logps/rejected": -109.87479400634766, + "loss": 0.7513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.055758476257324, + "rewards/margins": 5.375696182250977, + "rewards/rejected": -2.3199381828308105, + "step": 8764 + }, + { + "epoch": 2.19, + "grad_norm": 3.22464656829834, + "learning_rate": 5.9588238278996936e-06, + "logits/chosen": -0.4044945240020752, + "logits/rejected": -0.5202289819717407, + "logps/chosen": -66.33456420898438, + "logps/rejected": -102.32910919189453, + "loss": 0.653, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1564533710479736, + "rewards/margins": 6.204682350158691, + "rewards/rejected": -3.0482287406921387, + "step": 8765 + }, + { + "epoch": 2.19, + "grad_norm": 3.294778347015381, + "learning_rate": 5.958052415611615e-06, + "logits/chosen": -0.47050708532333374, + "logits/rejected": -0.563913106918335, + "logps/chosen": -54.008827209472656, + "logps/rejected": -93.03182220458984, + "loss": 0.5867, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2959132194519043, + "rewards/margins": 6.6759185791015625, + "rewards/rejected": -3.3800058364868164, + "step": 8766 + }, + { + "epoch": 2.19, + "grad_norm": 5.984947204589844, + "learning_rate": 5.957280979649043e-06, + "logits/chosen": -0.4923173189163208, + "logits/rejected": -0.5464153289794922, + "logps/chosen": -57.006019592285156, + "logps/rejected": -90.28826141357422, + "loss": 0.7666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.743006467819214, + "rewards/margins": 5.033873081207275, + "rewards/rejected": -2.2908670902252197, + "step": 8767 + }, + { + "epoch": 2.19, + "grad_norm": 4.160638332366943, + "learning_rate": 5.95650952003104e-06, + "logits/chosen": -0.4875466823577881, + "logits/rejected": -0.558414876461029, + "logps/chosen": -53.48261642456055, + "logps/rejected": -95.53770446777344, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.988201856613159, + "rewards/margins": 5.889419078826904, + "rewards/rejected": -2.901217222213745, + "step": 8768 + }, + { + "epoch": 2.19, + "grad_norm": 2.926858425140381, + "learning_rate": 5.955738036776668e-06, + "logits/chosen": -0.4581259489059448, + "logits/rejected": -0.5488308668136597, + "logps/chosen": -54.162776947021484, + "logps/rejected": -108.10425567626953, + "loss": 0.5459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.963857650756836, + "rewards/margins": 6.205651760101318, + "rewards/rejected": -3.241793632507324, + "step": 8769 + }, + { + "epoch": 2.19, + "grad_norm": 4.230539798736572, + "learning_rate": 5.954966529904995e-06, + "logits/chosen": -0.47549548745155334, + "logits/rejected": -0.5611351132392883, + "logps/chosen": -47.40850067138672, + "logps/rejected": -89.96214294433594, + "loss": 0.6563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.241420030593872, + "rewards/margins": 6.122529029846191, + "rewards/rejected": -2.881108522415161, + "step": 8770 + }, + { + "epoch": 2.19, + "grad_norm": 5.164734363555908, + "learning_rate": 5.954194999435081e-06, + "logits/chosen": -0.5423352718353271, + "logits/rejected": -0.582788348197937, + "logps/chosen": -54.53789520263672, + "logps/rejected": -99.81039428710938, + "loss": 0.7477, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.738337993621826, + "rewards/margins": 4.728867530822754, + "rewards/rejected": -1.9905290603637695, + "step": 8771 + }, + { + "epoch": 2.19, + "grad_norm": 4.655275821685791, + "learning_rate": 5.953423445385995e-06, + "logits/chosen": -0.5270372629165649, + "logits/rejected": -0.6045475006103516, + "logps/chosen": -50.725669860839844, + "logps/rejected": -91.93728637695312, + "loss": 0.5906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0017898082733154, + "rewards/margins": 6.395214557647705, + "rewards/rejected": -3.3934247493743896, + "step": 8772 + }, + { + "epoch": 2.19, + "grad_norm": 9.935503959655762, + "learning_rate": 5.952651867776801e-06, + "logits/chosen": -0.5041692852973938, + "logits/rejected": -0.6292564868927002, + "logps/chosen": -59.147056579589844, + "logps/rejected": -73.28828430175781, + "loss": 0.7124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9819083213806152, + "rewards/margins": 5.69071626663208, + "rewards/rejected": -2.708807945251465, + "step": 8773 + }, + { + "epoch": 2.19, + "grad_norm": 22.106918334960938, + "learning_rate": 5.951880266626566e-06, + "logits/chosen": -0.5558326244354248, + "logits/rejected": -0.6176214814186096, + "logps/chosen": -51.124935150146484, + "logps/rejected": -80.134765625, + "loss": 0.7495, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1669344902038574, + "rewards/margins": 5.355232238769531, + "rewards/rejected": -2.188297748565674, + "step": 8774 + }, + { + "epoch": 2.2, + "grad_norm": 3.901940107345581, + "learning_rate": 5.951108641954357e-06, + "logits/chosen": -0.4335304796695709, + "logits/rejected": -0.5710842609405518, + "logps/chosen": -54.413211822509766, + "logps/rejected": -84.13121795654297, + "loss": 0.5898, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7709460258483887, + "rewards/margins": 6.694892883300781, + "rewards/rejected": -3.9239463806152344, + "step": 8775 + }, + { + "epoch": 2.2, + "grad_norm": 18.344026565551758, + "learning_rate": 5.9503369937792435e-06, + "logits/chosen": -0.5537128448486328, + "logits/rejected": -0.650688111782074, + "logps/chosen": -53.41073226928711, + "logps/rejected": -92.8875961303711, + "loss": 0.702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7924387454986572, + "rewards/margins": 5.747619152069092, + "rewards/rejected": -2.9551806449890137, + "step": 8776 + }, + { + "epoch": 2.2, + "grad_norm": 5.825515270233154, + "learning_rate": 5.94956532212029e-06, + "logits/chosen": -0.5071499943733215, + "logits/rejected": -0.5372027158737183, + "logps/chosen": -70.52010345458984, + "logps/rejected": -97.41352844238281, + "loss": 0.7808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8794143199920654, + "rewards/margins": 4.276717185974121, + "rewards/rejected": -1.3973026275634766, + "step": 8777 + }, + { + "epoch": 2.2, + "grad_norm": 2.6020777225494385, + "learning_rate": 5.948793626996568e-06, + "logits/chosen": -0.4396923780441284, + "logits/rejected": -0.5434185862541199, + "logps/chosen": -55.165767669677734, + "logps/rejected": -104.79110717773438, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8285105228424072, + "rewards/margins": 6.675081253051758, + "rewards/rejected": -3.8465704917907715, + "step": 8778 + }, + { + "epoch": 2.2, + "grad_norm": 2.5147883892059326, + "learning_rate": 5.948021908427148e-06, + "logits/chosen": -0.569190263748169, + "logits/rejected": -0.656179666519165, + "logps/chosen": -47.834716796875, + "logps/rejected": -91.79216003417969, + "loss": 0.5712, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.972137689590454, + "rewards/margins": 6.927517890930176, + "rewards/rejected": -3.9553802013397217, + "step": 8779 + }, + { + "epoch": 2.2, + "grad_norm": 6.535794734954834, + "learning_rate": 5.9472501664310945e-06, + "logits/chosen": -0.49490755796432495, + "logits/rejected": -0.562910258769989, + "logps/chosen": -56.060760498046875, + "logps/rejected": -104.7795639038086, + "loss": 0.7351, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0684585571289062, + "rewards/margins": 5.399134635925293, + "rewards/rejected": -2.3306756019592285, + "step": 8780 + }, + { + "epoch": 2.2, + "grad_norm": 2.0513370037078857, + "learning_rate": 5.946478401027486e-06, + "logits/chosen": -0.5955991744995117, + "logits/rejected": -0.6468458771705627, + "logps/chosen": -45.54895782470703, + "logps/rejected": -102.71082305908203, + "loss": 0.5612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.314842939376831, + "rewards/margins": 7.259623050689697, + "rewards/rejected": -3.944779872894287, + "step": 8781 + }, + { + "epoch": 2.2, + "grad_norm": 3.1640210151672363, + "learning_rate": 5.945706612235387e-06, + "logits/chosen": -0.5448399782180786, + "logits/rejected": -0.6076411008834839, + "logps/chosen": -46.62757110595703, + "logps/rejected": -91.7071533203125, + "loss": 0.5705, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1825969219207764, + "rewards/margins": 5.712076663970947, + "rewards/rejected": -2.529479742050171, + "step": 8782 + }, + { + "epoch": 2.2, + "grad_norm": 6.1251606941223145, + "learning_rate": 5.944934800073872e-06, + "logits/chosen": -0.5408822298049927, + "logits/rejected": -0.637893557548523, + "logps/chosen": -60.0274772644043, + "logps/rejected": -86.2374267578125, + "loss": 0.7146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1877927780151367, + "rewards/margins": 5.173959255218506, + "rewards/rejected": -1.986167073249817, + "step": 8783 + }, + { + "epoch": 2.2, + "grad_norm": 6.791087627410889, + "learning_rate": 5.944162964562013e-06, + "logits/chosen": -0.47615352272987366, + "logits/rejected": -0.5261461138725281, + "logps/chosen": -50.41145706176758, + "logps/rejected": -93.23418426513672, + "loss": 0.704, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8722243309020996, + "rewards/margins": 5.12361478805542, + "rewards/rejected": -2.251389980316162, + "step": 8784 + }, + { + "epoch": 2.2, + "grad_norm": 2.8439197540283203, + "learning_rate": 5.943391105718883e-06, + "logits/chosen": -0.5300835371017456, + "logits/rejected": -0.585959255695343, + "logps/chosen": -50.67439270019531, + "logps/rejected": -110.17475128173828, + "loss": 0.6368, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0626063346862793, + "rewards/margins": 6.649312973022461, + "rewards/rejected": -3.5867066383361816, + "step": 8785 + }, + { + "epoch": 2.2, + "grad_norm": 11.020846366882324, + "learning_rate": 5.942619223563555e-06, + "logits/chosen": -0.523628294467926, + "logits/rejected": -0.6120439767837524, + "logps/chosen": -59.28847885131836, + "logps/rejected": -99.99055480957031, + "loss": 0.7588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7057864665985107, + "rewards/margins": 6.036701679229736, + "rewards/rejected": -3.3309154510498047, + "step": 8786 + }, + { + "epoch": 2.2, + "grad_norm": 4.12683629989624, + "learning_rate": 5.9418473181151045e-06, + "logits/chosen": -0.48871660232543945, + "logits/rejected": -0.5908352136611938, + "logps/chosen": -57.99903106689453, + "logps/rejected": -96.15109252929688, + "loss": 0.7157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0253233909606934, + "rewards/margins": 5.212221622467041, + "rewards/rejected": -2.1868977546691895, + "step": 8787 + }, + { + "epoch": 2.2, + "grad_norm": 6.893191814422607, + "learning_rate": 5.941075389392605e-06, + "logits/chosen": -0.5175067186355591, + "logits/rejected": -0.6020665168762207, + "logps/chosen": -50.732444763183594, + "logps/rejected": -90.18975830078125, + "loss": 0.6711, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1693921089172363, + "rewards/margins": 5.9155988693237305, + "rewards/rejected": -2.746206760406494, + "step": 8788 + }, + { + "epoch": 2.2, + "grad_norm": 3.0307414531707764, + "learning_rate": 5.94030343741513e-06, + "logits/chosen": -0.5767279863357544, + "logits/rejected": -0.6033446788787842, + "logps/chosen": -44.907554626464844, + "logps/rejected": -104.21766662597656, + "loss": 0.5725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0589237213134766, + "rewards/margins": 6.214661598205566, + "rewards/rejected": -3.1557376384735107, + "step": 8789 + }, + { + "epoch": 2.2, + "grad_norm": 4.060441017150879, + "learning_rate": 5.939531462201759e-06, + "logits/chosen": -0.5162680745124817, + "logits/rejected": -0.6273013353347778, + "logps/chosen": -52.179412841796875, + "logps/rejected": -88.31863403320312, + "loss": 0.5392, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1591477394104004, + "rewards/margins": 5.8323774337768555, + "rewards/rejected": -2.673229694366455, + "step": 8790 + }, + { + "epoch": 2.2, + "grad_norm": 9.870555877685547, + "learning_rate": 5.9387594637715655e-06, + "logits/chosen": -0.3866978585720062, + "logits/rejected": -0.4693486988544464, + "logps/chosen": -70.55552673339844, + "logps/rejected": -89.41213989257812, + "loss": 0.754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0739641189575195, + "rewards/margins": 5.446988582611084, + "rewards/rejected": -2.3730247020721436, + "step": 8791 + }, + { + "epoch": 2.2, + "grad_norm": 4.390090465545654, + "learning_rate": 5.937987442143626e-06, + "logits/chosen": -0.404563844203949, + "logits/rejected": -0.5091962218284607, + "logps/chosen": -58.376708984375, + "logps/rejected": -107.2055892944336, + "loss": 0.5617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1398377418518066, + "rewards/margins": 6.339165210723877, + "rewards/rejected": -3.199326992034912, + "step": 8792 + }, + { + "epoch": 2.2, + "grad_norm": 6.827944755554199, + "learning_rate": 5.9372153973370195e-06, + "logits/chosen": -0.5187423229217529, + "logits/rejected": -0.6178352236747742, + "logps/chosen": -55.954246520996094, + "logps/rejected": -115.34046936035156, + "loss": 0.7578, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.409691333770752, + "rewards/margins": 6.877305030822754, + "rewards/rejected": -4.467613697052002, + "step": 8793 + }, + { + "epoch": 2.2, + "grad_norm": 8.355077743530273, + "learning_rate": 5.936443329370825e-06, + "logits/chosen": -0.48525470495224, + "logits/rejected": -0.6037116646766663, + "logps/chosen": -78.43036651611328, + "logps/rejected": -89.95542907714844, + "loss": 0.7458, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.684299945831299, + "rewards/margins": 5.807704925537109, + "rewards/rejected": -3.1234049797058105, + "step": 8794 + }, + { + "epoch": 2.2, + "grad_norm": 6.022635459899902, + "learning_rate": 5.935671238264118e-06, + "logits/chosen": -0.4957693815231323, + "logits/rejected": -0.5680234432220459, + "logps/chosen": -46.669010162353516, + "logps/rejected": -89.05186462402344, + "loss": 0.654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7540946006774902, + "rewards/margins": 5.542151927947998, + "rewards/rejected": -2.788057565689087, + "step": 8795 + }, + { + "epoch": 2.2, + "grad_norm": 5.581099987030029, + "learning_rate": 5.934899124035981e-06, + "logits/chosen": -0.5998972654342651, + "logits/rejected": -0.6698437929153442, + "logps/chosen": -56.99391555786133, + "logps/rejected": -86.78855895996094, + "loss": 0.7357, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8286256790161133, + "rewards/margins": 6.142919540405273, + "rewards/rejected": -3.314293384552002, + "step": 8796 + }, + { + "epoch": 2.2, + "grad_norm": 3.8321871757507324, + "learning_rate": 5.934126986705491e-06, + "logits/chosen": -0.504761278629303, + "logits/rejected": -0.5761343836784363, + "logps/chosen": -61.26557922363281, + "logps/rejected": -84.65304565429688, + "loss": 0.7045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2762675285339355, + "rewards/margins": 5.1564717292785645, + "rewards/rejected": -1.8802039623260498, + "step": 8797 + }, + { + "epoch": 2.2, + "grad_norm": 4.998137950897217, + "learning_rate": 5.933354826291729e-06, + "logits/chosen": -0.493692547082901, + "logits/rejected": -0.5519276857376099, + "logps/chosen": -53.71683120727539, + "logps/rejected": -102.7052230834961, + "loss": 0.7436, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.055114984512329, + "rewards/margins": 4.853646278381348, + "rewards/rejected": -1.7985316514968872, + "step": 8798 + }, + { + "epoch": 2.2, + "grad_norm": 4.7300333976745605, + "learning_rate": 5.9325826428137785e-06, + "logits/chosen": -0.5334801077842712, + "logits/rejected": -0.6126323938369751, + "logps/chosen": -45.83258056640625, + "logps/rejected": -90.15608978271484, + "loss": 0.6158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.860952377319336, + "rewards/margins": 5.886275291442871, + "rewards/rejected": -3.025322675704956, + "step": 8799 + }, + { + "epoch": 2.2, + "grad_norm": 12.612580299377441, + "learning_rate": 5.931810436290719e-06, + "logits/chosen": -0.4444931745529175, + "logits/rejected": -0.5187299847602844, + "logps/chosen": -53.19770050048828, + "logps/rejected": -100.45979309082031, + "loss": 0.6931, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.71891450881958, + "rewards/margins": 6.055286407470703, + "rewards/rejected": -3.3363723754882812, + "step": 8800 + }, + { + "epoch": 2.2, + "grad_norm": 4.440515518188477, + "learning_rate": 5.93103820674163e-06, + "logits/chosen": -0.5293892621994019, + "logits/rejected": -0.5976763963699341, + "logps/chosen": -51.821380615234375, + "logps/rejected": -94.75191497802734, + "loss": 0.6801, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.656569004058838, + "rewards/margins": 5.897608757019043, + "rewards/rejected": -3.2410402297973633, + "step": 8801 + }, + { + "epoch": 2.2, + "grad_norm": 3.4791641235351562, + "learning_rate": 5.930265954185599e-06, + "logits/chosen": -0.5248490571975708, + "logits/rejected": -0.6325485110282898, + "logps/chosen": -65.23091888427734, + "logps/rejected": -92.60187530517578, + "loss": 0.6876, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8684942722320557, + "rewards/margins": 6.081002712249756, + "rewards/rejected": -3.212507963180542, + "step": 8802 + }, + { + "epoch": 2.2, + "grad_norm": 4.95112419128418, + "learning_rate": 5.929493678641705e-06, + "logits/chosen": -0.5108730792999268, + "logits/rejected": -0.5444433689117432, + "logps/chosen": -44.54121017456055, + "logps/rejected": -101.03897857666016, + "loss": 0.5471, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.008212089538574, + "rewards/margins": 5.822690963745117, + "rewards/rejected": -2.814478874206543, + "step": 8803 + }, + { + "epoch": 2.2, + "grad_norm": 4.060926914215088, + "learning_rate": 5.928721380129037e-06, + "logits/chosen": -0.4951661229133606, + "logits/rejected": -0.5812908411026001, + "logps/chosen": -49.2225456237793, + "logps/rejected": -92.96391296386719, + "loss": 0.6545, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2631185054779053, + "rewards/margins": 6.471842288970947, + "rewards/rejected": -3.208723783493042, + "step": 8804 + }, + { + "epoch": 2.2, + "grad_norm": 7.348896503448486, + "learning_rate": 5.927949058666672e-06, + "logits/chosen": -0.49686864018440247, + "logits/rejected": -0.5804442167282104, + "logps/chosen": -65.8907470703125, + "logps/rejected": -94.63434600830078, + "loss": 0.8333, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7466001510620117, + "rewards/margins": 4.337774276733398, + "rewards/rejected": -1.5911741256713867, + "step": 8805 + }, + { + "epoch": 2.2, + "grad_norm": 6.160462856292725, + "learning_rate": 5.9271767142737e-06, + "logits/chosen": -0.44793546199798584, + "logits/rejected": -0.5417184829711914, + "logps/chosen": -53.52433776855469, + "logps/rejected": -101.62347412109375, + "loss": 0.6923, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.062021255493164, + "rewards/margins": 5.116187572479248, + "rewards/rejected": -2.054166555404663, + "step": 8806 + }, + { + "epoch": 2.2, + "grad_norm": 6.797431945800781, + "learning_rate": 5.926404346969206e-06, + "logits/chosen": -0.486768513917923, + "logits/rejected": -0.5695450305938721, + "logps/chosen": -63.421722412109375, + "logps/rejected": -88.17660522460938, + "loss": 0.7994, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.774462938308716, + "rewards/margins": 5.670873165130615, + "rewards/rejected": -2.8964099884033203, + "step": 8807 + }, + { + "epoch": 2.2, + "grad_norm": 4.368398666381836, + "learning_rate": 5.9256319567722755e-06, + "logits/chosen": -0.4483416676521301, + "logits/rejected": -0.5844027400016785, + "logps/chosen": -69.74807739257812, + "logps/rejected": -106.30805969238281, + "loss": 0.648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.594916820526123, + "rewards/margins": 6.110239505767822, + "rewards/rejected": -3.5153231620788574, + "step": 8808 + }, + { + "epoch": 2.2, + "grad_norm": 18.544597625732422, + "learning_rate": 5.924859543701991e-06, + "logits/chosen": -0.50159752368927, + "logits/rejected": -0.589330792427063, + "logps/chosen": -57.42365264892578, + "logps/rejected": -97.28318786621094, + "loss": 0.8569, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.91046142578125, + "rewards/margins": 6.815362930297852, + "rewards/rejected": -3.9049017429351807, + "step": 8809 + }, + { + "epoch": 2.2, + "grad_norm": 17.933935165405273, + "learning_rate": 5.924087107777449e-06, + "logits/chosen": -0.5134455561637878, + "logits/rejected": -0.6181867718696594, + "logps/chosen": -57.681846618652344, + "logps/rejected": -89.06385040283203, + "loss": 0.7492, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.749298334121704, + "rewards/margins": 6.1408467292785645, + "rewards/rejected": -3.3915488719940186, + "step": 8810 + }, + { + "epoch": 2.2, + "grad_norm": 3.387375593185425, + "learning_rate": 5.92331464901773e-06, + "logits/chosen": -0.43181389570236206, + "logits/rejected": -0.5583975315093994, + "logps/chosen": -61.29541778564453, + "logps/rejected": -111.04344940185547, + "loss": 0.6175, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037306308746338, + "rewards/margins": 6.9543657302856445, + "rewards/rejected": -3.9170596599578857, + "step": 8811 + }, + { + "epoch": 2.2, + "grad_norm": 11.775415420532227, + "learning_rate": 5.922542167441921e-06, + "logits/chosen": -0.5763702988624573, + "logits/rejected": -0.6465808749198914, + "logps/chosen": -48.2819938659668, + "logps/rejected": -99.83889770507812, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0415797233581543, + "rewards/margins": 5.400289535522461, + "rewards/rejected": -2.3587098121643066, + "step": 8812 + }, + { + "epoch": 2.2, + "grad_norm": 11.417679786682129, + "learning_rate": 5.9217696630691166e-06, + "logits/chosen": -0.4598398506641388, + "logits/rejected": -0.5207479000091553, + "logps/chosen": -52.40909957885742, + "logps/rejected": -104.74264526367188, + "loss": 0.6081, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0390279293060303, + "rewards/margins": 5.891304016113281, + "rewards/rejected": -2.85227632522583, + "step": 8813 + }, + { + "epoch": 2.2, + "grad_norm": 3.367058038711548, + "learning_rate": 5.920997135918402e-06, + "logits/chosen": -0.5061969757080078, + "logits/rejected": -0.6195340156555176, + "logps/chosen": -53.153717041015625, + "logps/rejected": -91.31654357910156, + "loss": 0.5737, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9768152236938477, + "rewards/margins": 6.9003400802612305, + "rewards/rejected": -3.9235243797302246, + "step": 8814 + }, + { + "epoch": 2.21, + "grad_norm": 20.844907760620117, + "learning_rate": 5.920224586008869e-06, + "logits/chosen": -0.48015743494033813, + "logits/rejected": -0.5721220970153809, + "logps/chosen": -74.13639068603516, + "logps/rejected": -106.61178588867188, + "loss": 0.8585, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9099392890930176, + "rewards/margins": 5.340029716491699, + "rewards/rejected": -2.4300899505615234, + "step": 8815 + }, + { + "epoch": 2.21, + "grad_norm": 13.845842361450195, + "learning_rate": 5.919452013359608e-06, + "logits/chosen": -0.499869167804718, + "logits/rejected": -0.5640045404434204, + "logps/chosen": -51.197723388671875, + "logps/rejected": -81.60259246826172, + "loss": 0.6696, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.163693904876709, + "rewards/margins": 4.339537143707275, + "rewards/rejected": -1.1758434772491455, + "step": 8816 + }, + { + "epoch": 2.21, + "grad_norm": 5.75240421295166, + "learning_rate": 5.91867941798971e-06, + "logits/chosen": -0.4574306011199951, + "logits/rejected": -0.4545918405056, + "logps/chosen": -55.73024368286133, + "logps/rejected": -109.63331604003906, + "loss": 0.7105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9627373218536377, + "rewards/margins": 5.858841896057129, + "rewards/rejected": -2.896104097366333, + "step": 8817 + }, + { + "epoch": 2.21, + "grad_norm": 6.608936786651611, + "learning_rate": 5.9179067999182645e-06, + "logits/chosen": -0.46744683384895325, + "logits/rejected": -0.5426097512245178, + "logps/chosen": -53.848358154296875, + "logps/rejected": -82.7347640991211, + "loss": 0.658, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0577926635742188, + "rewards/margins": 4.8624067306518555, + "rewards/rejected": -1.8046135902404785, + "step": 8818 + }, + { + "epoch": 2.21, + "grad_norm": 2.5793068408966064, + "learning_rate": 5.917134159164368e-06, + "logits/chosen": -0.49400991201400757, + "logits/rejected": -0.5908414125442505, + "logps/chosen": -51.206321716308594, + "logps/rejected": -89.51598358154297, + "loss": 0.5602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4699513912200928, + "rewards/margins": 7.465992450714111, + "rewards/rejected": -3.9960415363311768, + "step": 8819 + }, + { + "epoch": 2.21, + "grad_norm": 7.3197784423828125, + "learning_rate": 5.91636149574711e-06, + "logits/chosen": -0.47994932532310486, + "logits/rejected": -0.5714840888977051, + "logps/chosen": -60.470890045166016, + "logps/rejected": -93.85710906982422, + "loss": 0.7914, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.905186414718628, + "rewards/margins": 5.492469787597656, + "rewards/rejected": -2.587283134460449, + "step": 8820 + }, + { + "epoch": 2.21, + "grad_norm": 6.349688529968262, + "learning_rate": 5.915588809685584e-06, + "logits/chosen": -0.6122618317604065, + "logits/rejected": -0.6220861077308655, + "logps/chosen": -55.4123649597168, + "logps/rejected": -96.28656768798828, + "loss": 0.6621, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.879997491836548, + "rewards/margins": 5.601877689361572, + "rewards/rejected": -2.7218804359436035, + "step": 8821 + }, + { + "epoch": 2.21, + "grad_norm": 6.6862359046936035, + "learning_rate": 5.9148161009988865e-06, + "logits/chosen": -0.5389276146888733, + "logits/rejected": -0.597384512424469, + "logps/chosen": -52.95661926269531, + "logps/rejected": -101.72400665283203, + "loss": 0.6485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.069627285003662, + "rewards/margins": 6.084536552429199, + "rewards/rejected": -3.014909505844116, + "step": 8822 + }, + { + "epoch": 2.21, + "grad_norm": 2.7963268756866455, + "learning_rate": 5.914043369706108e-06, + "logits/chosen": -0.5545679330825806, + "logits/rejected": -0.622418999671936, + "logps/chosen": -43.96913528442383, + "logps/rejected": -98.11538696289062, + "loss": 0.5245, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295614004135132, + "rewards/margins": 7.487623691558838, + "rewards/rejected": -4.192009449005127, + "step": 8823 + }, + { + "epoch": 2.21, + "grad_norm": 16.07367706298828, + "learning_rate": 5.913270615826347e-06, + "logits/chosen": -0.5506722331047058, + "logits/rejected": -0.5881353616714478, + "logps/chosen": -55.85502243041992, + "logps/rejected": -76.849609375, + "loss": 0.9711, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5176572799682617, + "rewards/margins": 3.598968744277954, + "rewards/rejected": -1.0813113451004028, + "step": 8824 + }, + { + "epoch": 2.21, + "grad_norm": 4.581408500671387, + "learning_rate": 5.912497839378697e-06, + "logits/chosen": -0.5660375356674194, + "logits/rejected": -0.5810535550117493, + "logps/chosen": -47.42497253417969, + "logps/rejected": -108.80866241455078, + "loss": 0.6236, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.873452663421631, + "rewards/margins": 5.9529500007629395, + "rewards/rejected": -3.0794968605041504, + "step": 8825 + }, + { + "epoch": 2.21, + "grad_norm": 4.607836723327637, + "learning_rate": 5.911725040382256e-06, + "logits/chosen": -0.5760185718536377, + "logits/rejected": -0.6536768674850464, + "logps/chosen": -52.872901916503906, + "logps/rejected": -84.66513061523438, + "loss": 0.701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1874136924743652, + "rewards/margins": 5.49985408782959, + "rewards/rejected": -2.312441110610962, + "step": 8826 + }, + { + "epoch": 2.21, + "grad_norm": 13.232986450195312, + "learning_rate": 5.910952218856117e-06, + "logits/chosen": -0.5414705872535706, + "logits/rejected": -0.6304906010627747, + "logps/chosen": -52.019779205322266, + "logps/rejected": -89.0699691772461, + "loss": 0.7357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.118588447570801, + "rewards/margins": 5.8077616691589355, + "rewards/rejected": -2.6891732215881348, + "step": 8827 + }, + { + "epoch": 2.21, + "grad_norm": 4.333131313323975, + "learning_rate": 5.910179374819383e-06, + "logits/chosen": -0.5005646347999573, + "logits/rejected": -0.6222048401832581, + "logps/chosen": -52.910865783691406, + "logps/rejected": -101.36151123046875, + "loss": 0.6246, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9706058502197266, + "rewards/margins": 7.078065872192383, + "rewards/rejected": -4.107460021972656, + "step": 8828 + }, + { + "epoch": 2.21, + "grad_norm": 3.6787402629852295, + "learning_rate": 5.909406508291146e-06, + "logits/chosen": -0.5001295804977417, + "logits/rejected": -0.6339737772941589, + "logps/chosen": -50.747108459472656, + "logps/rejected": -96.43392944335938, + "loss": 0.5327, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.167091131210327, + "rewards/margins": 6.84480094909668, + "rewards/rejected": -3.6777098178863525, + "step": 8829 + }, + { + "epoch": 2.21, + "grad_norm": 7.459095001220703, + "learning_rate": 5.908633619290508e-06, + "logits/chosen": -0.5452530384063721, + "logits/rejected": -0.6624106764793396, + "logps/chosen": -58.109535217285156, + "logps/rejected": -88.05098724365234, + "loss": 0.6135, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0924534797668457, + "rewards/margins": 6.52455472946167, + "rewards/rejected": -3.4321014881134033, + "step": 8830 + }, + { + "epoch": 2.21, + "grad_norm": 3.0171120166778564, + "learning_rate": 5.907860707836568e-06, + "logits/chosen": -0.5434277653694153, + "logits/rejected": -0.62998366355896, + "logps/chosen": -64.21379852294922, + "logps/rejected": -109.34877014160156, + "loss": 0.7012, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.036107063293457, + "rewards/margins": 7.928948402404785, + "rewards/rejected": -4.892841339111328, + "step": 8831 + }, + { + "epoch": 2.21, + "grad_norm": 21.86184310913086, + "learning_rate": 5.907087773948421e-06, + "logits/chosen": -0.47235316038131714, + "logits/rejected": -0.5672247409820557, + "logps/chosen": -62.31133270263672, + "logps/rejected": -91.25691223144531, + "loss": 0.8232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.746753215789795, + "rewards/margins": 5.609272003173828, + "rewards/rejected": -2.8625190258026123, + "step": 8832 + }, + { + "epoch": 2.21, + "grad_norm": 7.562829971313477, + "learning_rate": 5.9063148176451735e-06, + "logits/chosen": -0.5692667961120605, + "logits/rejected": -0.6253113150596619, + "logps/chosen": -50.45815658569336, + "logps/rejected": -106.39846801757812, + "loss": 0.7189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1507599353790283, + "rewards/margins": 6.577860355377197, + "rewards/rejected": -3.42710018157959, + "step": 8833 + }, + { + "epoch": 2.21, + "grad_norm": 4.1651835441589355, + "learning_rate": 5.905541838945921e-06, + "logits/chosen": -0.5493262410163879, + "logits/rejected": -0.6035507917404175, + "logps/chosen": -51.078975677490234, + "logps/rejected": -124.23564147949219, + "loss": 0.6486, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7296197414398193, + "rewards/margins": 7.244326591491699, + "rewards/rejected": -4.514706611633301, + "step": 8834 + }, + { + "epoch": 2.21, + "grad_norm": 7.688889980316162, + "learning_rate": 5.9047688378697674e-06, + "logits/chosen": -0.563052237033844, + "logits/rejected": -0.6139680743217468, + "logps/chosen": -63.14302062988281, + "logps/rejected": -108.99140930175781, + "loss": 0.6864, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.632957935333252, + "rewards/margins": 5.897421360015869, + "rewards/rejected": -3.264463424682617, + "step": 8835 + }, + { + "epoch": 2.21, + "grad_norm": 4.678411960601807, + "learning_rate": 5.903995814435814e-06, + "logits/chosen": -0.5241912007331848, + "logits/rejected": -0.6127835512161255, + "logps/chosen": -62.64870071411133, + "logps/rejected": -73.2679443359375, + "loss": 0.7649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.550835609436035, + "rewards/margins": 6.214554786682129, + "rewards/rejected": -2.6637187004089355, + "step": 8836 + }, + { + "epoch": 2.21, + "grad_norm": 5.005670547485352, + "learning_rate": 5.903222768663161e-06, + "logits/chosen": -0.4737702012062073, + "logits/rejected": -0.5772605538368225, + "logps/chosen": -53.94356918334961, + "logps/rejected": -93.38465118408203, + "loss": 0.7264, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5609169006347656, + "rewards/margins": 6.638329029083252, + "rewards/rejected": -4.077412128448486, + "step": 8837 + }, + { + "epoch": 2.21, + "grad_norm": 3.7728629112243652, + "learning_rate": 5.902449700570913e-06, + "logits/chosen": -0.5808438062667847, + "logits/rejected": -0.6511334180831909, + "logps/chosen": -57.87826156616211, + "logps/rejected": -101.48089599609375, + "loss": 0.7016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2496275901794434, + "rewards/margins": 6.777276515960693, + "rewards/rejected": -3.527648448944092, + "step": 8838 + }, + { + "epoch": 2.21, + "grad_norm": 7.213962078094482, + "learning_rate": 5.901676610178173e-06, + "logits/chosen": -0.4873715937137604, + "logits/rejected": -0.5420476794242859, + "logps/chosen": -61.99039077758789, + "logps/rejected": -88.68186950683594, + "loss": 0.7551, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0224571228027344, + "rewards/margins": 5.010702610015869, + "rewards/rejected": -1.9882457256317139, + "step": 8839 + }, + { + "epoch": 2.21, + "grad_norm": 5.66848611831665, + "learning_rate": 5.900903497504045e-06, + "logits/chosen": -0.4951786696910858, + "logits/rejected": -0.5520936846733093, + "logps/chosen": -49.09245300292969, + "logps/rejected": -98.65554809570312, + "loss": 0.6653, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8789658546447754, + "rewards/margins": 6.0106201171875, + "rewards/rejected": -3.1316542625427246, + "step": 8840 + }, + { + "epoch": 2.21, + "grad_norm": 4.40133810043335, + "learning_rate": 5.9001303625676335e-06, + "logits/chosen": -0.5264818072319031, + "logits/rejected": -0.6216955184936523, + "logps/chosen": -54.52743148803711, + "logps/rejected": -91.39356231689453, + "loss": 0.704, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7585461139678955, + "rewards/margins": 5.844459533691406, + "rewards/rejected": -3.0859131813049316, + "step": 8841 + }, + { + "epoch": 2.21, + "grad_norm": 3.8031575679779053, + "learning_rate": 5.899357205388043e-06, + "logits/chosen": -0.45286568999290466, + "logits/rejected": -0.5580896735191345, + "logps/chosen": -54.226287841796875, + "logps/rejected": -90.8933334350586, + "loss": 0.6376, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026651382446289, + "rewards/margins": 6.126031875610352, + "rewards/rejected": -3.0993802547454834, + "step": 8842 + }, + { + "epoch": 2.21, + "grad_norm": 5.325748443603516, + "learning_rate": 5.89858402598438e-06, + "logits/chosen": -0.465750515460968, + "logits/rejected": -0.5585801601409912, + "logps/chosen": -52.35650634765625, + "logps/rejected": -85.93025207519531, + "loss": 0.7172, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.129088878631592, + "rewards/margins": 5.706504821777344, + "rewards/rejected": -2.577415943145752, + "step": 8843 + }, + { + "epoch": 2.21, + "grad_norm": 5.078763008117676, + "learning_rate": 5.897810824375749e-06, + "logits/chosen": -0.40984299778938293, + "logits/rejected": -0.4939939081668854, + "logps/chosen": -56.355735778808594, + "logps/rejected": -90.94207000732422, + "loss": 0.7441, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1276326179504395, + "rewards/margins": 5.394990921020508, + "rewards/rejected": -2.2673580646514893, + "step": 8844 + }, + { + "epoch": 2.21, + "grad_norm": 9.5969820022583, + "learning_rate": 5.897037600581259e-06, + "logits/chosen": -0.49797314405441284, + "logits/rejected": -0.566276490688324, + "logps/chosen": -66.15425109863281, + "logps/rejected": -85.07176971435547, + "loss": 0.7152, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.792983055114746, + "rewards/margins": 5.09118127822876, + "rewards/rejected": -2.2981984615325928, + "step": 8845 + }, + { + "epoch": 2.21, + "grad_norm": 4.843038558959961, + "learning_rate": 5.8962643546200136e-06, + "logits/chosen": -0.5921202301979065, + "logits/rejected": -0.6529203653335571, + "logps/chosen": -59.70302200317383, + "logps/rejected": -109.34573364257812, + "loss": 0.7166, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2211833000183105, + "rewards/margins": 6.118196487426758, + "rewards/rejected": -2.897012948989868, + "step": 8846 + }, + { + "epoch": 2.21, + "grad_norm": 5.103734970092773, + "learning_rate": 5.895491086511126e-06, + "logits/chosen": -0.531101644039154, + "logits/rejected": -0.5903165936470032, + "logps/chosen": -59.740848541259766, + "logps/rejected": -94.74905395507812, + "loss": 0.7302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.613264322280884, + "rewards/margins": 4.954451560974121, + "rewards/rejected": -2.3411874771118164, + "step": 8847 + }, + { + "epoch": 2.21, + "grad_norm": 5.704371452331543, + "learning_rate": 5.894717796273699e-06, + "logits/chosen": -0.49723947048187256, + "logits/rejected": -0.5131756663322449, + "logps/chosen": -46.02919387817383, + "logps/rejected": -111.2234115600586, + "loss": 0.6674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0322701930999756, + "rewards/margins": 5.811985015869141, + "rewards/rejected": -2.779714584350586, + "step": 8848 + }, + { + "epoch": 2.21, + "grad_norm": 10.083513259887695, + "learning_rate": 5.893944483926843e-06, + "logits/chosen": -0.5110947489738464, + "logits/rejected": -0.5669571757316589, + "logps/chosen": -53.00261688232422, + "logps/rejected": -97.64407348632812, + "loss": 0.6897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8264498710632324, + "rewards/margins": 6.187824249267578, + "rewards/rejected": -3.3613743782043457, + "step": 8849 + }, + { + "epoch": 2.21, + "grad_norm": 7.221946716308594, + "learning_rate": 5.893171149489668e-06, + "logits/chosen": -0.49872368574142456, + "logits/rejected": -0.5815762877464294, + "logps/chosen": -50.330142974853516, + "logps/rejected": -86.74115753173828, + "loss": 0.5994, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0688858032226562, + "rewards/margins": 5.187595367431641, + "rewards/rejected": -2.1187095642089844, + "step": 8850 + }, + { + "epoch": 2.21, + "grad_norm": 29.919960021972656, + "learning_rate": 5.892397792981286e-06, + "logits/chosen": -0.5097469687461853, + "logits/rejected": -0.5663854479789734, + "logps/chosen": -69.21163177490234, + "logps/rejected": -112.69171142578125, + "loss": 0.6698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0641231536865234, + "rewards/margins": 6.254000186920166, + "rewards/rejected": -3.189877510070801, + "step": 8851 + }, + { + "epoch": 2.21, + "grad_norm": 4.122931003570557, + "learning_rate": 5.891624414420802e-06, + "logits/chosen": -0.5012372732162476, + "logits/rejected": -0.5733363032341003, + "logps/chosen": -56.804317474365234, + "logps/rejected": -105.68355560302734, + "loss": 0.6769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9670209884643555, + "rewards/margins": 5.886972427368164, + "rewards/rejected": -2.9199514389038086, + "step": 8852 + }, + { + "epoch": 2.21, + "grad_norm": 2.888756036758423, + "learning_rate": 5.890851013827333e-06, + "logits/chosen": -0.49978938698768616, + "logits/rejected": -0.6420638561248779, + "logps/chosen": -60.34681701660156, + "logps/rejected": -97.25233459472656, + "loss": 0.6144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1495015621185303, + "rewards/margins": 6.6145806312561035, + "rewards/rejected": -3.4650793075561523, + "step": 8853 + }, + { + "epoch": 2.21, + "grad_norm": 7.81585693359375, + "learning_rate": 5.890077591219989e-06, + "logits/chosen": -0.5200499892234802, + "logits/rejected": -0.5755167603492737, + "logps/chosen": -57.527889251708984, + "logps/rejected": -96.6778564453125, + "loss": 0.7627, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.876157283782959, + "rewards/margins": 5.373245716094971, + "rewards/rejected": -2.4970884323120117, + "step": 8854 + }, + { + "epoch": 2.22, + "grad_norm": 19.277416229248047, + "learning_rate": 5.889304146617878e-06, + "logits/chosen": -0.5106504559516907, + "logits/rejected": -0.5498038530349731, + "logps/chosen": -61.94871520996094, + "logps/rejected": -104.30778503417969, + "loss": 0.9016, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.774139642715454, + "rewards/margins": 5.293041706085205, + "rewards/rejected": -2.51890230178833, + "step": 8855 + }, + { + "epoch": 2.22, + "grad_norm": 7.116250514984131, + "learning_rate": 5.888530680040118e-06, + "logits/chosen": -0.49388664960861206, + "logits/rejected": -0.5477536916732788, + "logps/chosen": -57.8194580078125, + "logps/rejected": -94.36337280273438, + "loss": 0.6454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.995753765106201, + "rewards/margins": 6.000659942626953, + "rewards/rejected": -3.0049071311950684, + "step": 8856 + }, + { + "epoch": 2.22, + "grad_norm": 5.861141681671143, + "learning_rate": 5.887757191505818e-06, + "logits/chosen": -0.47160792350769043, + "logits/rejected": -0.5593180656433105, + "logps/chosen": -73.63884735107422, + "logps/rejected": -97.49490356445312, + "loss": 0.7915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6443850994110107, + "rewards/margins": 5.510693550109863, + "rewards/rejected": -2.8663084506988525, + "step": 8857 + }, + { + "epoch": 2.22, + "grad_norm": 3.578911304473877, + "learning_rate": 5.886983681034094e-06, + "logits/chosen": -0.4477137327194214, + "logits/rejected": -0.48450785875320435, + "logps/chosen": -53.01593017578125, + "logps/rejected": -97.10906982421875, + "loss": 0.5844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9203646183013916, + "rewards/margins": 6.0040388107299805, + "rewards/rejected": -3.083674430847168, + "step": 8858 + }, + { + "epoch": 2.22, + "grad_norm": 5.254626750946045, + "learning_rate": 5.886210148644059e-06, + "logits/chosen": -0.4755314290523529, + "logits/rejected": -0.5713031888008118, + "logps/chosen": -59.422218322753906, + "logps/rejected": -94.60313415527344, + "loss": 0.668, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.91963791847229, + "rewards/margins": 6.429396629333496, + "rewards/rejected": -3.509758949279785, + "step": 8859 + }, + { + "epoch": 2.22, + "grad_norm": 7.472779273986816, + "learning_rate": 5.8854365943548306e-06, + "logits/chosen": -0.4146159887313843, + "logits/rejected": -0.4833623468875885, + "logps/chosen": -55.626827239990234, + "logps/rejected": -91.36234283447266, + "loss": 0.7027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1799986362457275, + "rewards/margins": 5.953030109405518, + "rewards/rejected": -2.7730319499969482, + "step": 8860 + }, + { + "epoch": 2.22, + "grad_norm": 4.990023136138916, + "learning_rate": 5.8846630181855215e-06, + "logits/chosen": -0.4754745364189148, + "logits/rejected": -0.6191522479057312, + "logps/chosen": -57.14244079589844, + "logps/rejected": -87.20137023925781, + "loss": 0.6068, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.88590669631958, + "rewards/margins": 6.134971618652344, + "rewards/rejected": -3.2490649223327637, + "step": 8861 + }, + { + "epoch": 2.22, + "grad_norm": 7.156981468200684, + "learning_rate": 5.8838894201552474e-06, + "logits/chosen": -0.42592233419418335, + "logits/rejected": -0.4880888760089874, + "logps/chosen": -53.617149353027344, + "logps/rejected": -110.215576171875, + "loss": 0.6457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7042551040649414, + "rewards/margins": 6.2924580574035645, + "rewards/rejected": -3.588202714920044, + "step": 8862 + }, + { + "epoch": 2.22, + "grad_norm": 9.021795272827148, + "learning_rate": 5.883115800283126e-06, + "logits/chosen": -0.4999266564846039, + "logits/rejected": -0.5967806577682495, + "logps/chosen": -68.0654525756836, + "logps/rejected": -102.27838134765625, + "loss": 0.7651, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5908408164978027, + "rewards/margins": 5.689406394958496, + "rewards/rejected": -3.0985658168792725, + "step": 8863 + }, + { + "epoch": 2.22, + "grad_norm": 13.325334548950195, + "learning_rate": 5.882342158588273e-06, + "logits/chosen": -0.5689186453819275, + "logits/rejected": -0.6669676303863525, + "logps/chosen": -56.4749755859375, + "logps/rejected": -77.275390625, + "loss": 0.8797, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8702452182769775, + "rewards/margins": 5.3511433601379395, + "rewards/rejected": -2.480898141860962, + "step": 8864 + }, + { + "epoch": 2.22, + "grad_norm": 7.924570083618164, + "learning_rate": 5.881568495089809e-06, + "logits/chosen": -0.5052457451820374, + "logits/rejected": -0.554305374622345, + "logps/chosen": -51.9187126159668, + "logps/rejected": -103.38169860839844, + "loss": 0.7881, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.816392421722412, + "rewards/margins": 5.496232986450195, + "rewards/rejected": -2.679840564727783, + "step": 8865 + }, + { + "epoch": 2.22, + "grad_norm": 4.998674392700195, + "learning_rate": 5.880794809806848e-06, + "logits/chosen": -0.44422978162765503, + "logits/rejected": -0.5379339456558228, + "logps/chosen": -57.13522720336914, + "logps/rejected": -106.16775512695312, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9775969982147217, + "rewards/margins": 6.429407596588135, + "rewards/rejected": -3.451810598373413, + "step": 8866 + }, + { + "epoch": 2.22, + "grad_norm": 16.197948455810547, + "learning_rate": 5.8800211027585105e-06, + "logits/chosen": -0.44702252745628357, + "logits/rejected": -0.5855426788330078, + "logps/chosen": -63.374267578125, + "logps/rejected": -82.54944610595703, + "loss": 0.7134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.791451930999756, + "rewards/margins": 5.8645501136779785, + "rewards/rejected": -3.0730981826782227, + "step": 8867 + }, + { + "epoch": 2.22, + "grad_norm": 6.836610794067383, + "learning_rate": 5.879247373963917e-06, + "logits/chosen": -0.5033727288246155, + "logits/rejected": -0.5700839757919312, + "logps/chosen": -52.884315490722656, + "logps/rejected": -82.54507446289062, + "loss": 0.7806, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8415770530700684, + "rewards/margins": 5.010413646697998, + "rewards/rejected": -2.168837070465088, + "step": 8868 + }, + { + "epoch": 2.22, + "grad_norm": 13.280529975891113, + "learning_rate": 5.878473623442184e-06, + "logits/chosen": -0.5308431386947632, + "logits/rejected": -0.5977091193199158, + "logps/chosen": -49.5041389465332, + "logps/rejected": -110.29127502441406, + "loss": 0.5859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.059551239013672, + "rewards/margins": 6.057374477386475, + "rewards/rejected": -2.9978232383728027, + "step": 8869 + }, + { + "epoch": 2.22, + "grad_norm": 3.2172374725341797, + "learning_rate": 5.877699851212434e-06, + "logits/chosen": -0.4933798909187317, + "logits/rejected": -0.586239218711853, + "logps/chosen": -60.72392654418945, + "logps/rejected": -94.6609878540039, + "loss": 0.6497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.131560802459717, + "rewards/margins": 5.900598526000977, + "rewards/rejected": -2.7690374851226807, + "step": 8870 + }, + { + "epoch": 2.22, + "grad_norm": 3.7200348377227783, + "learning_rate": 5.876926057293787e-06, + "logits/chosen": -0.473175048828125, + "logits/rejected": -0.5971589088439941, + "logps/chosen": -66.70674133300781, + "logps/rejected": -91.32489013671875, + "loss": 0.6416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.834933280944824, + "rewards/margins": 6.283642292022705, + "rewards/rejected": -3.4487085342407227, + "step": 8871 + }, + { + "epoch": 2.22, + "grad_norm": 5.037492275238037, + "learning_rate": 5.876152241705364e-06, + "logits/chosen": -0.5403072237968445, + "logits/rejected": -0.5950462818145752, + "logps/chosen": -52.88408660888672, + "logps/rejected": -86.73138427734375, + "loss": 0.75, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9556946754455566, + "rewards/margins": 5.768664836883545, + "rewards/rejected": -2.812969923019409, + "step": 8872 + }, + { + "epoch": 2.22, + "grad_norm": 8.81489086151123, + "learning_rate": 5.875378404466288e-06, + "logits/chosen": -0.44477739930152893, + "logits/rejected": -0.5536561012268066, + "logps/chosen": -62.996551513671875, + "logps/rejected": -114.03951263427734, + "loss": 0.7327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.532973289489746, + "rewards/margins": 6.2259840965271, + "rewards/rejected": -3.6930105686187744, + "step": 8873 + }, + { + "epoch": 2.22, + "grad_norm": 7.198457717895508, + "learning_rate": 5.874604545595681e-06, + "logits/chosen": -0.4710251986980438, + "logits/rejected": -0.516725480556488, + "logps/chosen": -54.635379791259766, + "logps/rejected": -103.21317291259766, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8785712718963623, + "rewards/margins": 5.884173393249512, + "rewards/rejected": -3.005601644515991, + "step": 8874 + }, + { + "epoch": 2.22, + "grad_norm": 7.552540302276611, + "learning_rate": 5.873830665112663e-06, + "logits/chosen": -0.481315940618515, + "logits/rejected": -0.5641170740127563, + "logps/chosen": -59.354923248291016, + "logps/rejected": -94.75907897949219, + "loss": 0.6652, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1808340549468994, + "rewards/margins": 5.378253936767578, + "rewards/rejected": -2.1974196434020996, + "step": 8875 + }, + { + "epoch": 2.22, + "grad_norm": 6.679042816162109, + "learning_rate": 5.873056763036362e-06, + "logits/chosen": -0.4379693865776062, + "logits/rejected": -0.5015549063682556, + "logps/chosen": -56.92759704589844, + "logps/rejected": -99.79263305664062, + "loss": 0.6513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011460781097412, + "rewards/margins": 5.206628799438477, + "rewards/rejected": -2.1951680183410645, + "step": 8876 + }, + { + "epoch": 2.22, + "grad_norm": 5.6319098472595215, + "learning_rate": 5.872282839385899e-06, + "logits/chosen": -0.50108802318573, + "logits/rejected": -0.5919730067253113, + "logps/chosen": -47.725563049316406, + "logps/rejected": -87.70320892333984, + "loss": 0.6442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087888717651367, + "rewards/margins": 6.181375503540039, + "rewards/rejected": -3.093486785888672, + "step": 8877 + }, + { + "epoch": 2.22, + "grad_norm": 5.787680625915527, + "learning_rate": 5.8715088941804e-06, + "logits/chosen": -0.46239781379699707, + "logits/rejected": -0.5012038350105286, + "logps/chosen": -60.73092269897461, + "logps/rejected": -93.33296203613281, + "loss": 0.6632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.311476230621338, + "rewards/margins": 5.069221019744873, + "rewards/rejected": -1.757744550704956, + "step": 8878 + }, + { + "epoch": 2.22, + "grad_norm": 4.175792694091797, + "learning_rate": 5.870734927438989e-06, + "logits/chosen": -0.5332279205322266, + "logits/rejected": -0.5959781408309937, + "logps/chosen": -59.87409210205078, + "logps/rejected": -101.53466796875, + "loss": 0.6211, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0010812282562256, + "rewards/margins": 6.848223686218262, + "rewards/rejected": -3.847141742706299, + "step": 8879 + }, + { + "epoch": 2.22, + "grad_norm": 11.518227577209473, + "learning_rate": 5.869960939180791e-06, + "logits/chosen": -0.5508352518081665, + "logits/rejected": -0.6012946367263794, + "logps/chosen": -51.8146858215332, + "logps/rejected": -101.1187515258789, + "loss": 0.8519, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.131972074508667, + "rewards/margins": 5.911214351654053, + "rewards/rejected": -2.779242515563965, + "step": 8880 + }, + { + "epoch": 2.22, + "grad_norm": 10.091764450073242, + "learning_rate": 5.8691869294249335e-06, + "logits/chosen": -0.5157214999198914, + "logits/rejected": -0.5685010552406311, + "logps/chosen": -55.17805862426758, + "logps/rejected": -101.95455169677734, + "loss": 0.6766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.830915927886963, + "rewards/margins": 6.436252117156982, + "rewards/rejected": -3.6053359508514404, + "step": 8881 + }, + { + "epoch": 2.22, + "grad_norm": 9.835912704467773, + "learning_rate": 5.868412898190542e-06, + "logits/chosen": -0.4451298415660858, + "logits/rejected": -0.5406871438026428, + "logps/chosen": -57.01219940185547, + "logps/rejected": -87.6781997680664, + "loss": 0.6686, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9206671714782715, + "rewards/margins": 5.83780574798584, + "rewards/rejected": -2.91713809967041, + "step": 8882 + }, + { + "epoch": 2.22, + "grad_norm": 4.018226146697998, + "learning_rate": 5.867638845496744e-06, + "logits/chosen": -0.4942563474178314, + "logits/rejected": -0.5330671668052673, + "logps/chosen": -45.968170166015625, + "logps/rejected": -90.71383666992188, + "loss": 0.6323, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1319663524627686, + "rewards/margins": 5.34937858581543, + "rewards/rejected": -2.2174124717712402, + "step": 8883 + }, + { + "epoch": 2.22, + "grad_norm": 4.917675495147705, + "learning_rate": 5.866864771362668e-06, + "logits/chosen": -0.5854017734527588, + "logits/rejected": -0.652225136756897, + "logps/chosen": -52.06544494628906, + "logps/rejected": -88.93975067138672, + "loss": 0.6895, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.192822217941284, + "rewards/margins": 5.703181266784668, + "rewards/rejected": -2.510359287261963, + "step": 8884 + }, + { + "epoch": 2.22, + "grad_norm": 9.90657901763916, + "learning_rate": 5.866090675807441e-06, + "logits/chosen": -0.5740600824356079, + "logits/rejected": -0.6538817286491394, + "logps/chosen": -54.24464416503906, + "logps/rejected": -100.74696350097656, + "loss": 0.8896, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6080105304718018, + "rewards/margins": 6.048253536224365, + "rewards/rejected": -3.4402427673339844, + "step": 8885 + }, + { + "epoch": 2.22, + "grad_norm": 20.635805130004883, + "learning_rate": 5.865316558850193e-06, + "logits/chosen": -0.44373664259910583, + "logits/rejected": -0.557366132736206, + "logps/chosen": -55.04505920410156, + "logps/rejected": -80.91790771484375, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.842273712158203, + "rewards/margins": 6.145571708679199, + "rewards/rejected": -3.3032989501953125, + "step": 8886 + }, + { + "epoch": 2.22, + "grad_norm": 2.2674026489257812, + "learning_rate": 5.864542420510051e-06, + "logits/chosen": -0.4889698922634125, + "logits/rejected": -0.5596926212310791, + "logps/chosen": -46.011287689208984, + "logps/rejected": -102.9025650024414, + "loss": 0.5577, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.417726516723633, + "rewards/margins": 6.533907413482666, + "rewards/rejected": -3.1161813735961914, + "step": 8887 + }, + { + "epoch": 2.22, + "grad_norm": 2.7275519371032715, + "learning_rate": 5.863768260806148e-06, + "logits/chosen": -0.4692617952823639, + "logits/rejected": -0.5944924354553223, + "logps/chosen": -60.961036682128906, + "logps/rejected": -95.29867553710938, + "loss": 0.6142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1929848194122314, + "rewards/margins": 7.127871036529541, + "rewards/rejected": -3.934886932373047, + "step": 8888 + }, + { + "epoch": 2.22, + "grad_norm": 4.5315446853637695, + "learning_rate": 5.86299407975761e-06, + "logits/chosen": -0.5320273637771606, + "logits/rejected": -0.6310124397277832, + "logps/chosen": -52.834903717041016, + "logps/rejected": -80.87425994873047, + "loss": 0.6934, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0772573947906494, + "rewards/margins": 5.539299964904785, + "rewards/rejected": -2.4620425701141357, + "step": 8889 + }, + { + "epoch": 2.22, + "grad_norm": 5.317074775695801, + "learning_rate": 5.8622198773835725e-06, + "logits/chosen": -0.4730125665664673, + "logits/rejected": -0.5395209789276123, + "logps/chosen": -49.1646728515625, + "logps/rejected": -85.98851776123047, + "loss": 0.6292, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2323529720306396, + "rewards/margins": 5.769953727722168, + "rewards/rejected": -2.5376009941101074, + "step": 8890 + }, + { + "epoch": 2.22, + "grad_norm": 4.5957746505737305, + "learning_rate": 5.861445653703164e-06, + "logits/chosen": -0.4376831352710724, + "logits/rejected": -0.5408642888069153, + "logps/chosen": -64.90071868896484, + "logps/rejected": -84.66017150878906, + "loss": 0.707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9635345935821533, + "rewards/margins": 5.073527812957764, + "rewards/rejected": -2.1099936962127686, + "step": 8891 + }, + { + "epoch": 2.22, + "grad_norm": 2.9206626415252686, + "learning_rate": 5.860671408735517e-06, + "logits/chosen": -0.4266517758369446, + "logits/rejected": -0.5420493483543396, + "logps/chosen": -58.06855010986328, + "logps/rejected": -84.36144256591797, + "loss": 0.5962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7488396167755127, + "rewards/margins": 6.288590431213379, + "rewards/rejected": -3.5397510528564453, + "step": 8892 + }, + { + "epoch": 2.22, + "grad_norm": 5.512407302856445, + "learning_rate": 5.859897142499764e-06, + "logits/chosen": -0.5685998201370239, + "logits/rejected": -0.653124213218689, + "logps/chosen": -52.09864044189453, + "logps/rejected": -84.10157775878906, + "loss": 0.6987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1499242782592773, + "rewards/margins": 5.932283878326416, + "rewards/rejected": -2.7823591232299805, + "step": 8893 + }, + { + "epoch": 2.22, + "grad_norm": 3.741713523864746, + "learning_rate": 5.859122855015039e-06, + "logits/chosen": -0.46017172932624817, + "logits/rejected": -0.5457214117050171, + "logps/chosen": -49.888484954833984, + "logps/rejected": -93.82652282714844, + "loss": 0.6378, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.926987886428833, + "rewards/margins": 6.194543361663818, + "rewards/rejected": -3.2675554752349854, + "step": 8894 + }, + { + "epoch": 2.23, + "grad_norm": 11.48656940460205, + "learning_rate": 5.858348546300474e-06, + "logits/chosen": -0.5236750841140747, + "logits/rejected": -0.5953818559646606, + "logps/chosen": -59.10230255126953, + "logps/rejected": -94.88825225830078, + "loss": 0.6864, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.926391839981079, + "rewards/margins": 5.765044689178467, + "rewards/rejected": -2.838653326034546, + "step": 8895 + }, + { + "epoch": 2.23, + "grad_norm": 8.137499809265137, + "learning_rate": 5.857574216375203e-06, + "logits/chosen": -0.5003359317779541, + "logits/rejected": -0.47529172897338867, + "logps/chosen": -68.65917205810547, + "logps/rejected": -108.69549560546875, + "loss": 0.8196, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.684769630432129, + "rewards/margins": 4.700969219207764, + "rewards/rejected": -2.0161995887756348, + "step": 8896 + }, + { + "epoch": 2.23, + "grad_norm": 4.79754114151001, + "learning_rate": 5.856799865258361e-06, + "logits/chosen": -0.5051112174987793, + "logits/rejected": -0.5767013430595398, + "logps/chosen": -48.15019226074219, + "logps/rejected": -86.64341735839844, + "loss": 0.6901, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8821840286254883, + "rewards/margins": 5.340885162353516, + "rewards/rejected": -2.4587011337280273, + "step": 8897 + }, + { + "epoch": 2.23, + "grad_norm": 4.925898551940918, + "learning_rate": 5.856025492969082e-06, + "logits/chosen": -0.5000467896461487, + "logits/rejected": -0.6080109477043152, + "logps/chosen": -57.18146896362305, + "logps/rejected": -87.7908935546875, + "loss": 0.6856, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.108215570449829, + "rewards/margins": 4.92017936706543, + "rewards/rejected": -1.8119641542434692, + "step": 8898 + }, + { + "epoch": 2.23, + "grad_norm": 14.492650032043457, + "learning_rate": 5.855251099526504e-06, + "logits/chosen": -0.4127234220504761, + "logits/rejected": -0.48061665892601013, + "logps/chosen": -72.18608856201172, + "logps/rejected": -108.67926025390625, + "loss": 0.866, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.675867795944214, + "rewards/margins": 4.781123161315918, + "rewards/rejected": -2.105254888534546, + "step": 8899 + }, + { + "epoch": 2.23, + "grad_norm": 3.4327492713928223, + "learning_rate": 5.854476684949761e-06, + "logits/chosen": -0.48654401302337646, + "logits/rejected": -0.5456446409225464, + "logps/chosen": -55.56450653076172, + "logps/rejected": -86.81849670410156, + "loss": 0.6525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9695191383361816, + "rewards/margins": 4.941902160644531, + "rewards/rejected": -1.9723832607269287, + "step": 8900 + }, + { + "epoch": 2.23, + "grad_norm": 7.721398830413818, + "learning_rate": 5.8537022492579905e-06, + "logits/chosen": -0.4855125844478607, + "logits/rejected": -0.5404303669929504, + "logps/chosen": -63.753787994384766, + "logps/rejected": -94.39720916748047, + "loss": 0.8026, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0034098625183105, + "rewards/margins": 4.598598957061768, + "rewards/rejected": -1.5951889753341675, + "step": 8901 + }, + { + "epoch": 2.23, + "grad_norm": 4.265340805053711, + "learning_rate": 5.852927792470331e-06, + "logits/chosen": -0.45585814118385315, + "logits/rejected": -0.5178583860397339, + "logps/chosen": -54.94575881958008, + "logps/rejected": -97.83685302734375, + "loss": 0.6809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139193058013916, + "rewards/margins": 5.687950134277344, + "rewards/rejected": -2.5487568378448486, + "step": 8902 + }, + { + "epoch": 2.23, + "grad_norm": 13.466437339782715, + "learning_rate": 5.852153314605916e-06, + "logits/chosen": -0.5265302658081055, + "logits/rejected": -0.5520631670951843, + "logps/chosen": -53.212303161621094, + "logps/rejected": -101.84237670898438, + "loss": 0.727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000178337097168, + "rewards/margins": 5.202917098999023, + "rewards/rejected": -2.2027390003204346, + "step": 8903 + }, + { + "epoch": 2.23, + "grad_norm": 4.3587117195129395, + "learning_rate": 5.851378815683887e-06, + "logits/chosen": -0.5185083150863647, + "logits/rejected": -0.5570198893547058, + "logps/chosen": -44.587982177734375, + "logps/rejected": -102.3765869140625, + "loss": 0.6522, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.459590435028076, + "rewards/margins": 6.355693340301514, + "rewards/rejected": -2.8961024284362793, + "step": 8904 + }, + { + "epoch": 2.23, + "grad_norm": 5.757452964782715, + "learning_rate": 5.850604295723382e-06, + "logits/chosen": -0.5615564584732056, + "logits/rejected": -0.6185051202774048, + "logps/chosen": -51.99927520751953, + "logps/rejected": -81.86046600341797, + "loss": 0.6972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1238155364990234, + "rewards/margins": 4.44180154800415, + "rewards/rejected": -1.317986011505127, + "step": 8905 + }, + { + "epoch": 2.23, + "grad_norm": 4.2063822746276855, + "learning_rate": 5.84982975474354e-06, + "logits/chosen": -0.48334240913391113, + "logits/rejected": -0.540136992931366, + "logps/chosen": -56.130638122558594, + "logps/rejected": -111.1192626953125, + "loss": 0.7227, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1851463317871094, + "rewards/margins": 7.153497695922852, + "rewards/rejected": -3.9683516025543213, + "step": 8906 + }, + { + "epoch": 2.23, + "grad_norm": 6.144248962402344, + "learning_rate": 5.849055192763501e-06, + "logits/chosen": -0.3904631733894348, + "logits/rejected": -0.515093207359314, + "logps/chosen": -51.42927932739258, + "logps/rejected": -105.39762115478516, + "loss": 0.5874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.128424882888794, + "rewards/margins": 7.525549411773682, + "rewards/rejected": -4.397124767303467, + "step": 8907 + }, + { + "epoch": 2.23, + "grad_norm": 4.473593235015869, + "learning_rate": 5.848280609802406e-06, + "logits/chosen": -0.47492849826812744, + "logits/rejected": -0.5659223198890686, + "logps/chosen": -70.1671371459961, + "logps/rejected": -89.36910247802734, + "loss": 0.692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0811116695404053, + "rewards/margins": 4.9486775398254395, + "rewards/rejected": -1.8675662279129028, + "step": 8908 + }, + { + "epoch": 2.23, + "grad_norm": 30.387836456298828, + "learning_rate": 5.847506005879393e-06, + "logits/chosen": -0.5010433197021484, + "logits/rejected": -0.5699325799942017, + "logps/chosen": -53.90370178222656, + "logps/rejected": -84.88005828857422, + "loss": 0.928, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7722368240356445, + "rewards/margins": 4.746702671051025, + "rewards/rejected": -1.9744658470153809, + "step": 8909 + }, + { + "epoch": 2.23, + "grad_norm": 19.1242618560791, + "learning_rate": 5.846731381013606e-06, + "logits/chosen": -0.5204330682754517, + "logits/rejected": -0.5698118805885315, + "logps/chosen": -48.751426696777344, + "logps/rejected": -107.36846160888672, + "loss": 0.628, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808316469192505, + "rewards/margins": 6.168464183807373, + "rewards/rejected": -3.360147476196289, + "step": 8910 + }, + { + "epoch": 2.23, + "grad_norm": 5.609975814819336, + "learning_rate": 5.8459567352241875e-06, + "logits/chosen": -0.5285265445709229, + "logits/rejected": -0.6142470240592957, + "logps/chosen": -62.47004699707031, + "logps/rejected": -113.20948791503906, + "loss": 0.677, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.962118625640869, + "rewards/margins": 5.633837699890137, + "rewards/rejected": -2.6717190742492676, + "step": 8911 + }, + { + "epoch": 2.23, + "grad_norm": 7.190921306610107, + "learning_rate": 5.845182068530275e-06, + "logits/chosen": -0.4739569425582886, + "logits/rejected": -0.5815349221229553, + "logps/chosen": -59.477577209472656, + "logps/rejected": -99.74811553955078, + "loss": 0.6768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7433784008026123, + "rewards/margins": 6.627414226531982, + "rewards/rejected": -3.884035587310791, + "step": 8912 + }, + { + "epoch": 2.23, + "grad_norm": 5.8683905601501465, + "learning_rate": 5.844407380951017e-06, + "logits/chosen": -0.5702224969863892, + "logits/rejected": -0.6613936424255371, + "logps/chosen": -53.41522979736328, + "logps/rejected": -79.42391967773438, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1020991802215576, + "rewards/margins": 5.717282295227051, + "rewards/rejected": -2.615182638168335, + "step": 8913 + }, + { + "epoch": 2.23, + "grad_norm": 11.583231925964355, + "learning_rate": 5.8436326725055535e-06, + "logits/chosen": -0.45199382305145264, + "logits/rejected": -0.5036590099334717, + "logps/chosen": -60.406158447265625, + "logps/rejected": -101.43589782714844, + "loss": 0.6566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1892318725585938, + "rewards/margins": 6.191627502441406, + "rewards/rejected": -3.0023953914642334, + "step": 8914 + }, + { + "epoch": 2.23, + "grad_norm": 2.9971418380737305, + "learning_rate": 5.84285794321303e-06, + "logits/chosen": -0.49877670407295227, + "logits/rejected": -0.5726701021194458, + "logps/chosen": -54.175262451171875, + "logps/rejected": -107.62079620361328, + "loss": 0.5796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.975849151611328, + "rewards/margins": 7.175731658935547, + "rewards/rejected": -4.199882507324219, + "step": 8915 + }, + { + "epoch": 2.23, + "grad_norm": 24.095579147338867, + "learning_rate": 5.842083193092589e-06, + "logits/chosen": -0.4956005811691284, + "logits/rejected": -0.5846234560012817, + "logps/chosen": -55.200016021728516, + "logps/rejected": -83.79106140136719, + "loss": 0.7285, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.824273109436035, + "rewards/margins": 5.592164993286133, + "rewards/rejected": -2.767892599105835, + "step": 8916 + }, + { + "epoch": 2.23, + "grad_norm": 5.082993984222412, + "learning_rate": 5.841308422163379e-06, + "logits/chosen": -0.5145615935325623, + "logits/rejected": -0.5825466513633728, + "logps/chosen": -56.172607421875, + "logps/rejected": -88.2362060546875, + "loss": 0.7174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6974258422851562, + "rewards/margins": 5.7693915367126465, + "rewards/rejected": -3.0719656944274902, + "step": 8917 + }, + { + "epoch": 2.23, + "grad_norm": 8.598583221435547, + "learning_rate": 5.840533630444541e-06, + "logits/chosen": -0.5056493282318115, + "logits/rejected": -0.5569438934326172, + "logps/chosen": -59.724578857421875, + "logps/rejected": -88.36410522460938, + "loss": 0.7161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9373621940612793, + "rewards/margins": 4.687577247619629, + "rewards/rejected": -1.7502151727676392, + "step": 8918 + }, + { + "epoch": 2.23, + "grad_norm": 11.99007797241211, + "learning_rate": 5.839758817955223e-06, + "logits/chosen": -0.5412353873252869, + "logits/rejected": -0.5123003721237183, + "logps/chosen": -46.821903228759766, + "logps/rejected": -97.5252685546875, + "loss": 0.8511, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.299680233001709, + "rewards/margins": 4.216057777404785, + "rewards/rejected": -0.916377604007721, + "step": 8919 + }, + { + "epoch": 2.23, + "grad_norm": 12.644497871398926, + "learning_rate": 5.838983984714574e-06, + "logits/chosen": -0.36836400628089905, + "logits/rejected": -0.4233848750591278, + "logps/chosen": -62.870399475097656, + "logps/rejected": -93.2800521850586, + "loss": 0.7164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8196253776550293, + "rewards/margins": 5.57481575012207, + "rewards/rejected": -2.755190372467041, + "step": 8920 + }, + { + "epoch": 2.23, + "grad_norm": 4.043971538543701, + "learning_rate": 5.838209130741734e-06, + "logits/chosen": -0.429735392332077, + "logits/rejected": -0.49917471408843994, + "logps/chosen": -57.15153503417969, + "logps/rejected": -120.06563568115234, + "loss": 0.7417, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.34726619720459, + "rewards/margins": 7.646629333496094, + "rewards/rejected": -4.299363136291504, + "step": 8921 + }, + { + "epoch": 2.23, + "grad_norm": 3.919700860977173, + "learning_rate": 5.837434256055858e-06, + "logits/chosen": -0.48955491185188293, + "logits/rejected": -0.5592004060745239, + "logps/chosen": -54.74179458618164, + "logps/rejected": -100.35145568847656, + "loss": 0.6103, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9678573608398438, + "rewards/margins": 6.181103706359863, + "rewards/rejected": -3.2132465839385986, + "step": 8922 + }, + { + "epoch": 2.23, + "grad_norm": 4.196511268615723, + "learning_rate": 5.83665936067609e-06, + "logits/chosen": -0.4945961534976959, + "logits/rejected": -0.6315964460372925, + "logps/chosen": -75.06531524658203, + "logps/rejected": -93.53943634033203, + "loss": 0.7193, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.460096836090088, + "rewards/margins": 6.756757736206055, + "rewards/rejected": -4.296660423278809, + "step": 8923 + }, + { + "epoch": 2.23, + "grad_norm": 5.9483747482299805, + "learning_rate": 5.835884444621579e-06, + "logits/chosen": -0.48778343200683594, + "logits/rejected": -0.5099284052848816, + "logps/chosen": -49.15266799926758, + "logps/rejected": -98.41659545898438, + "loss": 0.6205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2819957733154297, + "rewards/margins": 6.21152400970459, + "rewards/rejected": -2.92952823638916, + "step": 8924 + }, + { + "epoch": 2.23, + "grad_norm": 4.563562870025635, + "learning_rate": 5.835109507911475e-06, + "logits/chosen": -0.5822397470474243, + "logits/rejected": -0.6496212482452393, + "logps/chosen": -43.85700225830078, + "logps/rejected": -102.3086929321289, + "loss": 0.6281, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.866035223007202, + "rewards/margins": 5.75458288192749, + "rewards/rejected": -2.888547420501709, + "step": 8925 + }, + { + "epoch": 2.23, + "grad_norm": 7.915970802307129, + "learning_rate": 5.8343345505649265e-06, + "logits/chosen": -0.44470974802970886, + "logits/rejected": -0.6073194742202759, + "logps/chosen": -59.71821975708008, + "logps/rejected": -83.876220703125, + "loss": 0.6745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8591151237487793, + "rewards/margins": 6.210770606994629, + "rewards/rejected": -3.351656436920166, + "step": 8926 + }, + { + "epoch": 2.23, + "grad_norm": 7.746796607971191, + "learning_rate": 5.833559572601084e-06, + "logits/chosen": -0.6173514723777771, + "logits/rejected": -0.6827844381332397, + "logps/chosen": -49.50941848754883, + "logps/rejected": -106.57020568847656, + "loss": 0.7273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1970605850219727, + "rewards/margins": 6.808022975921631, + "rewards/rejected": -3.6109628677368164, + "step": 8927 + }, + { + "epoch": 2.23, + "grad_norm": 6.422415256500244, + "learning_rate": 5.832784574039096e-06, + "logits/chosen": -0.44904625415802, + "logits/rejected": -0.5836893916130066, + "logps/chosen": -51.727577209472656, + "logps/rejected": -89.08516693115234, + "loss": 0.5754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.620286226272583, + "rewards/margins": 7.3530426025390625, + "rewards/rejected": -3.7327561378479004, + "step": 8928 + }, + { + "epoch": 2.23, + "grad_norm": 6.533350467681885, + "learning_rate": 5.8320095548981175e-06, + "logits/chosen": -0.42796099185943604, + "logits/rejected": -0.4991884231567383, + "logps/chosen": -59.85842514038086, + "logps/rejected": -95.94530487060547, + "loss": 0.812, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0440542697906494, + "rewards/margins": 5.057826519012451, + "rewards/rejected": -2.0137720108032227, + "step": 8929 + }, + { + "epoch": 2.23, + "grad_norm": 15.22661304473877, + "learning_rate": 5.831234515197297e-06, + "logits/chosen": -0.537531852722168, + "logits/rejected": -0.5721412897109985, + "logps/chosen": -48.87393569946289, + "logps/rejected": -105.74370574951172, + "loss": 0.8502, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7550320625305176, + "rewards/margins": 5.5128984451293945, + "rewards/rejected": -2.757866621017456, + "step": 8930 + }, + { + "epoch": 2.23, + "grad_norm": 5.2677001953125, + "learning_rate": 5.8304594549557894e-06, + "logits/chosen": -0.47845202684402466, + "logits/rejected": -0.5069115161895752, + "logps/chosen": -56.39131546020508, + "logps/rejected": -108.56422424316406, + "loss": 0.7183, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.884857416152954, + "rewards/margins": 5.3349833488464355, + "rewards/rejected": -2.4501256942749023, + "step": 8931 + }, + { + "epoch": 2.23, + "grad_norm": 6.37425422668457, + "learning_rate": 5.829684374192741e-06, + "logits/chosen": -0.42656904458999634, + "logits/rejected": -0.546331524848938, + "logps/chosen": -57.391029357910156, + "logps/rejected": -88.32554626464844, + "loss": 0.7173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.198094367980957, + "rewards/margins": 5.472894668579102, + "rewards/rejected": -2.2748005390167236, + "step": 8932 + }, + { + "epoch": 2.23, + "grad_norm": 4.68153715133667, + "learning_rate": 5.828909272927313e-06, + "logits/chosen": -0.477527379989624, + "logits/rejected": -0.5872677564620972, + "logps/chosen": -48.66315460205078, + "logps/rejected": -96.25959014892578, + "loss": 0.547, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8716940879821777, + "rewards/margins": 6.144179821014404, + "rewards/rejected": -3.2724852561950684, + "step": 8933 + }, + { + "epoch": 2.24, + "grad_norm": 3.865755319595337, + "learning_rate": 5.8281341511786545e-06, + "logits/chosen": -0.44685181975364685, + "logits/rejected": -0.5751886367797852, + "logps/chosen": -56.83634948730469, + "logps/rejected": -78.95957946777344, + "loss": 0.6379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8271756172180176, + "rewards/margins": 5.715356349945068, + "rewards/rejected": -2.88818097114563, + "step": 8934 + }, + { + "epoch": 2.24, + "grad_norm": 11.48543643951416, + "learning_rate": 5.827359008965919e-06, + "logits/chosen": -0.5573878884315491, + "logits/rejected": -0.6589112877845764, + "logps/chosen": -60.55724334716797, + "logps/rejected": -87.05028533935547, + "loss": 0.7488, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7902979850769043, + "rewards/margins": 4.855022430419922, + "rewards/rejected": -2.0647246837615967, + "step": 8935 + }, + { + "epoch": 2.24, + "grad_norm": 5.717777252197266, + "learning_rate": 5.826583846308264e-06, + "logits/chosen": -0.47690916061401367, + "logits/rejected": -0.5498909950256348, + "logps/chosen": -53.24713897705078, + "logps/rejected": -88.87770080566406, + "loss": 0.6349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.994816541671753, + "rewards/margins": 5.077606678009033, + "rewards/rejected": -2.082789897918701, + "step": 8936 + }, + { + "epoch": 2.24, + "grad_norm": 3.30964732170105, + "learning_rate": 5.825808663224843e-06, + "logits/chosen": -0.5715386271476746, + "logits/rejected": -0.600460410118103, + "logps/chosen": -50.57178497314453, + "logps/rejected": -110.6910629272461, + "loss": 0.6943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.109811305999756, + "rewards/margins": 6.835458755493164, + "rewards/rejected": -3.72564697265625, + "step": 8937 + }, + { + "epoch": 2.24, + "grad_norm": 5.430688858032227, + "learning_rate": 5.82503345973481e-06, + "logits/chosen": -0.5661087036132812, + "logits/rejected": -0.662078857421875, + "logps/chosen": -55.347206115722656, + "logps/rejected": -83.97274780273438, + "loss": 0.7272, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.254662036895752, + "rewards/margins": 6.6828436851501465, + "rewards/rejected": -3.4281816482543945, + "step": 8938 + }, + { + "epoch": 2.24, + "grad_norm": 3.369196891784668, + "learning_rate": 5.824258235857324e-06, + "logits/chosen": -0.4841653108596802, + "logits/rejected": -0.5698468089103699, + "logps/chosen": -65.46359252929688, + "logps/rejected": -91.296142578125, + "loss": 0.6852, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9581096172332764, + "rewards/margins": 6.172788619995117, + "rewards/rejected": -3.2146785259246826, + "step": 8939 + }, + { + "epoch": 2.24, + "grad_norm": 4.744142055511475, + "learning_rate": 5.82348299161154e-06, + "logits/chosen": -0.3865070343017578, + "logits/rejected": -0.4739788770675659, + "logps/chosen": -73.45040130615234, + "logps/rejected": -108.21182250976562, + "loss": 0.6639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9387621879577637, + "rewards/margins": 5.375892162322998, + "rewards/rejected": -2.4371299743652344, + "step": 8940 + }, + { + "epoch": 2.24, + "grad_norm": 5.0559000968933105, + "learning_rate": 5.8227077270166145e-06, + "logits/chosen": -0.49791136384010315, + "logits/rejected": -0.5470969676971436, + "logps/chosen": -47.28546905517578, + "logps/rejected": -93.85867309570312, + "loss": 0.6595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1621530055999756, + "rewards/margins": 5.526510238647461, + "rewards/rejected": -2.3643569946289062, + "step": 8941 + }, + { + "epoch": 2.24, + "grad_norm": 4.952399253845215, + "learning_rate": 5.821932442091706e-06, + "logits/chosen": -0.4171571135520935, + "logits/rejected": -0.5257734656333923, + "logps/chosen": -54.70414352416992, + "logps/rejected": -96.63733673095703, + "loss": 0.6192, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8871469497680664, + "rewards/margins": 5.662907600402832, + "rewards/rejected": -2.775761127471924, + "step": 8942 + }, + { + "epoch": 2.24, + "grad_norm": 7.43845796585083, + "learning_rate": 5.821157136855974e-06, + "logits/chosen": -0.44022053480148315, + "logits/rejected": -0.5493571758270264, + "logps/chosen": -62.250728607177734, + "logps/rejected": -95.09662628173828, + "loss": 0.8539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9950995445251465, + "rewards/margins": 5.15678071975708, + "rewards/rejected": -2.1616804599761963, + "step": 8943 + }, + { + "epoch": 2.24, + "grad_norm": 19.934356689453125, + "learning_rate": 5.820381811328575e-06, + "logits/chosen": -0.4869442582130432, + "logits/rejected": -0.6125264167785645, + "logps/chosen": -58.01277160644531, + "logps/rejected": -88.25554656982422, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.157078504562378, + "rewards/margins": 6.247273921966553, + "rewards/rejected": -3.0901951789855957, + "step": 8944 + }, + { + "epoch": 2.24, + "grad_norm": 14.424065589904785, + "learning_rate": 5.81960646552867e-06, + "logits/chosen": -0.5664486885070801, + "logits/rejected": -0.5891563892364502, + "logps/chosen": -58.18919372558594, + "logps/rejected": -107.25287628173828, + "loss": 0.7494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0312447547912598, + "rewards/margins": 5.434802055358887, + "rewards/rejected": -2.403557538986206, + "step": 8945 + }, + { + "epoch": 2.24, + "grad_norm": 4.678377151489258, + "learning_rate": 5.818831099475416e-06, + "logits/chosen": -0.47629329562187195, + "logits/rejected": -0.5608351230621338, + "logps/chosen": -52.21113967895508, + "logps/rejected": -90.80399322509766, + "loss": 0.6546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.86478328704834, + "rewards/margins": 6.4804887771606445, + "rewards/rejected": -3.6157052516937256, + "step": 8946 + }, + { + "epoch": 2.24, + "grad_norm": 2.234555721282959, + "learning_rate": 5.818055713187974e-06, + "logits/chosen": -0.5032080411911011, + "logits/rejected": -0.5821950435638428, + "logps/chosen": -67.48004913330078, + "logps/rejected": -87.11890411376953, + "loss": 0.6291, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3397955894470215, + "rewards/margins": 6.347603797912598, + "rewards/rejected": -3.007807970046997, + "step": 8947 + }, + { + "epoch": 2.24, + "grad_norm": 9.389564514160156, + "learning_rate": 5.817280306685507e-06, + "logits/chosen": -0.44348597526550293, + "logits/rejected": -0.5799791812896729, + "logps/chosen": -59.663124084472656, + "logps/rejected": -100.44954681396484, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.890775203704834, + "rewards/margins": 6.41865348815918, + "rewards/rejected": -3.527878522872925, + "step": 8948 + }, + { + "epoch": 2.24, + "grad_norm": 3.7495975494384766, + "learning_rate": 5.816504879987173e-06, + "logits/chosen": -0.535633385181427, + "logits/rejected": -0.5585198402404785, + "logps/chosen": -56.76905822753906, + "logps/rejected": -110.0254135131836, + "loss": 0.679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182973861694336, + "rewards/margins": 6.328989505767822, + "rewards/rejected": -3.1460156440734863, + "step": 8949 + }, + { + "epoch": 2.24, + "grad_norm": 4.857825756072998, + "learning_rate": 5.815729433112134e-06, + "logits/chosen": -0.46165332198143005, + "logits/rejected": -0.5368214249610901, + "logps/chosen": -51.16259002685547, + "logps/rejected": -83.95486450195312, + "loss": 0.7228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9695372581481934, + "rewards/margins": 5.3965373039245605, + "rewards/rejected": -2.427000045776367, + "step": 8950 + }, + { + "epoch": 2.24, + "grad_norm": 14.135955810546875, + "learning_rate": 5.814953966079555e-06, + "logits/chosen": -0.45121920108795166, + "logits/rejected": -0.5267105102539062, + "logps/chosen": -53.68191909790039, + "logps/rejected": -83.82395935058594, + "loss": 0.649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.026857852935791, + "rewards/margins": 5.379948616027832, + "rewards/rejected": -2.353090286254883, + "step": 8951 + }, + { + "epoch": 2.24, + "grad_norm": 2.0253708362579346, + "learning_rate": 5.814178478908596e-06, + "logits/chosen": -0.442377507686615, + "logits/rejected": -0.45914316177368164, + "logps/chosen": -47.65827560424805, + "logps/rejected": -111.36673736572266, + "loss": 0.5837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1238183975219727, + "rewards/margins": 5.7006449699401855, + "rewards/rejected": -2.5768260955810547, + "step": 8952 + }, + { + "epoch": 2.24, + "grad_norm": 8.671692848205566, + "learning_rate": 5.813402971618421e-06, + "logits/chosen": -0.5276992321014404, + "logits/rejected": -0.6042450666427612, + "logps/chosen": -59.370445251464844, + "logps/rejected": -108.88221740722656, + "loss": 0.7102, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.927182197570801, + "rewards/margins": 6.580667972564697, + "rewards/rejected": -3.6534852981567383, + "step": 8953 + }, + { + "epoch": 2.24, + "grad_norm": 3.548109292984009, + "learning_rate": 5.812627444228194e-06, + "logits/chosen": -0.5368077158927917, + "logits/rejected": -0.577865719795227, + "logps/chosen": -53.544830322265625, + "logps/rejected": -105.75775909423828, + "loss": 0.6141, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051053047180176, + "rewards/margins": 5.690548419952393, + "rewards/rejected": -2.6394951343536377, + "step": 8954 + }, + { + "epoch": 2.24, + "grad_norm": 7.051318645477295, + "learning_rate": 5.8118518967570766e-06, + "logits/chosen": -0.48504719138145447, + "logits/rejected": -0.5813792943954468, + "logps/chosen": -52.8828239440918, + "logps/rejected": -81.12934875488281, + "loss": 0.6413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.736809253692627, + "rewards/margins": 5.278651237487793, + "rewards/rejected": -2.541841745376587, + "step": 8955 + }, + { + "epoch": 2.24, + "grad_norm": 7.824638366699219, + "learning_rate": 5.811076329224238e-06, + "logits/chosen": -0.44172245264053345, + "logits/rejected": -0.5321150422096252, + "logps/chosen": -54.645137786865234, + "logps/rejected": -98.33111572265625, + "loss": 0.7152, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0356032848358154, + "rewards/margins": 6.175329208374023, + "rewards/rejected": -3.139726161956787, + "step": 8956 + }, + { + "epoch": 2.24, + "grad_norm": 6.474244594573975, + "learning_rate": 5.8103007416488375e-06, + "logits/chosen": -0.44676312804222107, + "logits/rejected": -0.5333142876625061, + "logps/chosen": -59.167640686035156, + "logps/rejected": -87.73561096191406, + "loss": 0.7512, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.14530086517334, + "rewards/margins": 6.259377479553223, + "rewards/rejected": -3.1140761375427246, + "step": 8957 + }, + { + "epoch": 2.24, + "grad_norm": 3.1538236141204834, + "learning_rate": 5.809525134050046e-06, + "logits/chosen": -0.5178855657577515, + "logits/rejected": -0.5995102524757385, + "logps/chosen": -71.04686737060547, + "logps/rejected": -103.54560852050781, + "loss": 0.6693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3249335289001465, + "rewards/margins": 7.221635341644287, + "rewards/rejected": -3.8967020511627197, + "step": 8958 + }, + { + "epoch": 2.24, + "grad_norm": 2.9903929233551025, + "learning_rate": 5.8087495064470265e-06, + "logits/chosen": -0.6139375567436218, + "logits/rejected": -0.6628700494766235, + "logps/chosen": -78.35115051269531, + "logps/rejected": -86.13140106201172, + "loss": 0.6557, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1870203018188477, + "rewards/margins": 6.2355170249938965, + "rewards/rejected": -3.048496723175049, + "step": 8959 + }, + { + "epoch": 2.24, + "grad_norm": 6.708479404449463, + "learning_rate": 5.807973858858947e-06, + "logits/chosen": -0.4622892737388611, + "logits/rejected": -0.5850188732147217, + "logps/chosen": -59.501861572265625, + "logps/rejected": -83.55525970458984, + "loss": 0.6515, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0268940925598145, + "rewards/margins": 6.2763519287109375, + "rewards/rejected": -3.249457359313965, + "step": 8960 + }, + { + "epoch": 2.24, + "grad_norm": 17.39206314086914, + "learning_rate": 5.807198191304975e-06, + "logits/chosen": -0.5209119319915771, + "logits/rejected": -0.6236717700958252, + "logps/chosen": -53.00973129272461, + "logps/rejected": -87.47850036621094, + "loss": 0.6401, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.135469913482666, + "rewards/margins": 6.0381035804748535, + "rewards/rejected": -2.9026334285736084, + "step": 8961 + }, + { + "epoch": 2.24, + "grad_norm": 17.62646484375, + "learning_rate": 5.806422503804274e-06, + "logits/chosen": -0.4646717607975006, + "logits/rejected": -0.5303077697753906, + "logps/chosen": -62.53358459472656, + "logps/rejected": -111.52421569824219, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7922213077545166, + "rewards/margins": 6.025180816650391, + "rewards/rejected": -3.232959270477295, + "step": 8962 + }, + { + "epoch": 2.24, + "grad_norm": 5.772811412811279, + "learning_rate": 5.805646796376019e-06, + "logits/chosen": -0.46168580651283264, + "logits/rejected": -0.5738582611083984, + "logps/chosen": -56.21848678588867, + "logps/rejected": -99.98675537109375, + "loss": 0.6892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7913572788238525, + "rewards/margins": 5.963739395141602, + "rewards/rejected": -3.17238187789917, + "step": 8963 + }, + { + "epoch": 2.24, + "grad_norm": 5.807385444641113, + "learning_rate": 5.804871069039372e-06, + "logits/chosen": -0.5112082958221436, + "logits/rejected": -0.5680173635482788, + "logps/chosen": -53.7195930480957, + "logps/rejected": -94.1065444946289, + "loss": 0.6892, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.792494058609009, + "rewards/margins": 5.107748985290527, + "rewards/rejected": -2.3152546882629395, + "step": 8964 + }, + { + "epoch": 2.24, + "grad_norm": 8.735093116760254, + "learning_rate": 5.804095321813505e-06, + "logits/chosen": -0.5109258890151978, + "logits/rejected": -0.6167916655540466, + "logps/chosen": -51.31432342529297, + "logps/rejected": -81.15630340576172, + "loss": 0.697, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.957777738571167, + "rewards/margins": 6.011773109436035, + "rewards/rejected": -3.0539958477020264, + "step": 8965 + }, + { + "epoch": 2.24, + "grad_norm": 6.962173938751221, + "learning_rate": 5.803319554717588e-06, + "logits/chosen": -0.48274898529052734, + "logits/rejected": -0.5974408388137817, + "logps/chosen": -68.22567749023438, + "logps/rejected": -88.73784637451172, + "loss": 0.7334, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.838827133178711, + "rewards/margins": 6.066268444061279, + "rewards/rejected": -3.22744083404541, + "step": 8966 + }, + { + "epoch": 2.24, + "grad_norm": 4.242734909057617, + "learning_rate": 5.802543767770791e-06, + "logits/chosen": -0.4360581338405609, + "logits/rejected": -0.503688395023346, + "logps/chosen": -51.29690933227539, + "logps/rejected": -101.88148498535156, + "loss": 0.6152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0953965187072754, + "rewards/margins": 6.382881164550781, + "rewards/rejected": -3.287484645843506, + "step": 8967 + }, + { + "epoch": 2.24, + "grad_norm": 6.883720397949219, + "learning_rate": 5.801767960992284e-06, + "logits/chosen": -0.5494335293769836, + "logits/rejected": -0.5987598896026611, + "logps/chosen": -50.56015396118164, + "logps/rejected": -87.85111999511719, + "loss": 0.7222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8202860355377197, + "rewards/margins": 5.231486797332764, + "rewards/rejected": -2.411200523376465, + "step": 8968 + }, + { + "epoch": 2.24, + "grad_norm": 3.3151369094848633, + "learning_rate": 5.800992134401237e-06, + "logits/chosen": -0.5089012980461121, + "logits/rejected": -0.5881007313728333, + "logps/chosen": -47.137718200683594, + "logps/rejected": -87.9212417602539, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0569779872894287, + "rewards/margins": 6.566102027893066, + "rewards/rejected": -3.5091233253479004, + "step": 8969 + }, + { + "epoch": 2.24, + "grad_norm": 6.878387928009033, + "learning_rate": 5.800216288016821e-06, + "logits/chosen": -0.4613284170627594, + "logits/rejected": -0.5368406176567078, + "logps/chosen": -62.635990142822266, + "logps/rejected": -93.31241607666016, + "loss": 0.7575, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.276026964187622, + "rewards/margins": 5.2550811767578125, + "rewards/rejected": -1.9790542125701904, + "step": 8970 + }, + { + "epoch": 2.24, + "grad_norm": 4.742170810699463, + "learning_rate": 5.799440421858211e-06, + "logits/chosen": -0.437175989151001, + "logits/rejected": -0.5345100164413452, + "logps/chosen": -53.00916290283203, + "logps/rejected": -80.89347839355469, + "loss": 0.6651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.084777355194092, + "rewards/margins": 5.451126575469971, + "rewards/rejected": -2.366349220275879, + "step": 8971 + }, + { + "epoch": 2.24, + "grad_norm": 5.71160364151001, + "learning_rate": 5.798664535944578e-06, + "logits/chosen": -0.4597293436527252, + "logits/rejected": -0.5803753733634949, + "logps/chosen": -64.48030090332031, + "logps/rejected": -83.32550048828125, + "loss": 0.6702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.110658884048462, + "rewards/margins": 6.080216884613037, + "rewards/rejected": -2.969557762145996, + "step": 8972 + }, + { + "epoch": 2.24, + "grad_norm": 6.451143741607666, + "learning_rate": 5.797888630295094e-06, + "logits/chosen": -0.5089201331138611, + "logits/rejected": -0.5080876350402832, + "logps/chosen": -52.330604553222656, + "logps/rejected": -101.69049072265625, + "loss": 0.8357, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1357603073120117, + "rewards/margins": 5.135855674743652, + "rewards/rejected": -2.0000948905944824, + "step": 8973 + }, + { + "epoch": 2.25, + "grad_norm": 45.20033645629883, + "learning_rate": 5.7971127049289345e-06, + "logits/chosen": -0.4293719530105591, + "logits/rejected": -0.5856256484985352, + "logps/chosen": -57.593196868896484, + "logps/rejected": -85.27155303955078, + "loss": 0.7326, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0925650596618652, + "rewards/margins": 6.8195624351501465, + "rewards/rejected": -3.7269973754882812, + "step": 8974 + }, + { + "epoch": 2.25, + "grad_norm": 8.946381568908691, + "learning_rate": 5.796336759865271e-06, + "logits/chosen": -0.5589032173156738, + "logits/rejected": -0.6002026796340942, + "logps/chosen": -53.859519958496094, + "logps/rejected": -99.5623779296875, + "loss": 0.6959, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7268283367156982, + "rewards/margins": 6.114099502563477, + "rewards/rejected": -3.387270927429199, + "step": 8975 + }, + { + "epoch": 2.25, + "grad_norm": 8.42745304107666, + "learning_rate": 5.795560795123279e-06, + "logits/chosen": -0.4617552161216736, + "logits/rejected": -0.5371873378753662, + "logps/chosen": -67.53105926513672, + "logps/rejected": -94.75321960449219, + "loss": 0.8343, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9936928749084473, + "rewards/margins": 5.8803510665893555, + "rewards/rejected": -2.886657953262329, + "step": 8976 + }, + { + "epoch": 2.25, + "grad_norm": 3.4465293884277344, + "learning_rate": 5.794784810722136e-06, + "logits/chosen": -0.5180834531784058, + "logits/rejected": -0.599946916103363, + "logps/chosen": -53.469512939453125, + "logps/rejected": -105.7279052734375, + "loss": 0.5846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.926316261291504, + "rewards/margins": 6.271458625793457, + "rewards/rejected": -3.345142364501953, + "step": 8977 + }, + { + "epoch": 2.25, + "grad_norm": 7.2298102378845215, + "learning_rate": 5.794008806681011e-06, + "logits/chosen": -0.41123875975608826, + "logits/rejected": -0.4999369978904724, + "logps/chosen": -62.33579635620117, + "logps/rejected": -91.39153289794922, + "loss": 0.6899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7729785442352295, + "rewards/margins": 5.074807167053223, + "rewards/rejected": -2.301828384399414, + "step": 8978 + }, + { + "epoch": 2.25, + "grad_norm": 4.907926082611084, + "learning_rate": 5.793232783019087e-06, + "logits/chosen": -0.4459625482559204, + "logits/rejected": -0.5226335525512695, + "logps/chosen": -47.83154296875, + "logps/rejected": -97.53726959228516, + "loss": 0.5903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.858673572540283, + "rewards/margins": 7.343688011169434, + "rewards/rejected": -4.485013961791992, + "step": 8979 + }, + { + "epoch": 2.25, + "grad_norm": 4.000715255737305, + "learning_rate": 5.792456739755536e-06, + "logits/chosen": -0.5382987260818481, + "logits/rejected": -0.617954671382904, + "logps/chosen": -50.37491226196289, + "logps/rejected": -93.34612274169922, + "loss": 0.6816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3777801990509033, + "rewards/margins": 5.6205549240112305, + "rewards/rejected": -2.24277400970459, + "step": 8980 + }, + { + "epoch": 2.25, + "grad_norm": 9.100439071655273, + "learning_rate": 5.791680676909536e-06, + "logits/chosen": -0.5503306984901428, + "logits/rejected": -0.5970382690429688, + "logps/chosen": -52.896671295166016, + "logps/rejected": -102.45137786865234, + "loss": 0.6386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9391367435455322, + "rewards/margins": 6.446356296539307, + "rewards/rejected": -3.5072195529937744, + "step": 8981 + }, + { + "epoch": 2.25, + "grad_norm": 5.352538585662842, + "learning_rate": 5.790904594500264e-06, + "logits/chosen": -0.5526056289672852, + "logits/rejected": -0.6371356844902039, + "logps/chosen": -49.34809494018555, + "logps/rejected": -84.05746459960938, + "loss": 0.6442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.047919750213623, + "rewards/margins": 6.30488395690918, + "rewards/rejected": -3.2569644451141357, + "step": 8982 + }, + { + "epoch": 2.25, + "grad_norm": 4.891163349151611, + "learning_rate": 5.790128492546901e-06, + "logits/chosen": -0.479329377412796, + "logits/rejected": -0.515095591545105, + "logps/chosen": -65.54899597167969, + "logps/rejected": -88.7025375366211, + "loss": 0.7425, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.196115255355835, + "rewards/margins": 4.8272528648376465, + "rewards/rejected": -1.631137728691101, + "step": 8983 + }, + { + "epoch": 2.25, + "grad_norm": 3.7124829292297363, + "learning_rate": 5.789352371068619e-06, + "logits/chosen": -0.44871753454208374, + "logits/rejected": -0.5384588241577148, + "logps/chosen": -61.11680221557617, + "logps/rejected": -97.95999145507812, + "loss": 0.7185, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.962989091873169, + "rewards/margins": 5.742433547973633, + "rewards/rejected": -2.7794442176818848, + "step": 8984 + }, + { + "epoch": 2.25, + "grad_norm": 7.3822221755981445, + "learning_rate": 5.788576230084602e-06, + "logits/chosen": -0.5315452814102173, + "logits/rejected": -0.5660103559494019, + "logps/chosen": -57.071083068847656, + "logps/rejected": -91.89629364013672, + "loss": 0.9378, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7820470333099365, + "rewards/margins": 4.569009780883789, + "rewards/rejected": -1.7869629859924316, + "step": 8985 + }, + { + "epoch": 2.25, + "grad_norm": 3.651895046234131, + "learning_rate": 5.787800069614028e-06, + "logits/chosen": -0.49545323848724365, + "logits/rejected": -0.6633520126342773, + "logps/chosen": -64.7333984375, + "logps/rejected": -86.44754028320312, + "loss": 0.6901, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0521435737609863, + "rewards/margins": 6.539800643920898, + "rewards/rejected": -3.487657308578491, + "step": 8986 + }, + { + "epoch": 2.25, + "grad_norm": 3.926239013671875, + "learning_rate": 5.787023889676074e-06, + "logits/chosen": -0.48136255145072937, + "logits/rejected": -0.5802165865898132, + "logps/chosen": -51.99496078491211, + "logps/rejected": -91.16207122802734, + "loss": 0.6132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4030096530914307, + "rewards/margins": 6.634678363800049, + "rewards/rejected": -3.231668472290039, + "step": 8987 + }, + { + "epoch": 2.25, + "grad_norm": 4.3191423416137695, + "learning_rate": 5.7862476902899256e-06, + "logits/chosen": -0.5145233869552612, + "logits/rejected": -0.6273061037063599, + "logps/chosen": -61.46147155761719, + "logps/rejected": -91.357666015625, + "loss": 0.6707, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9821553230285645, + "rewards/margins": 6.482815265655518, + "rewards/rejected": -3.5006601810455322, + "step": 8988 + }, + { + "epoch": 2.25, + "grad_norm": 7.524168014526367, + "learning_rate": 5.785471471474758e-06, + "logits/chosen": -0.5043563842773438, + "logits/rejected": -0.552743673324585, + "logps/chosen": -53.641387939453125, + "logps/rejected": -93.96145629882812, + "loss": 0.6857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.873239040374756, + "rewards/margins": 4.869271278381348, + "rewards/rejected": -1.9960322380065918, + "step": 8989 + }, + { + "epoch": 2.25, + "grad_norm": 8.293191909790039, + "learning_rate": 5.784695233249756e-06, + "logits/chosen": -0.46908169984817505, + "logits/rejected": -0.5887116193771362, + "logps/chosen": -54.91373062133789, + "logps/rejected": -111.07290649414062, + "loss": 0.7124, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8896937370300293, + "rewards/margins": 6.860138893127441, + "rewards/rejected": -3.970445156097412, + "step": 8990 + }, + { + "epoch": 2.25, + "grad_norm": 4.211553573608398, + "learning_rate": 5.7839189756341004e-06, + "logits/chosen": -0.5844773650169373, + "logits/rejected": -0.5893285274505615, + "logps/chosen": -43.7840690612793, + "logps/rejected": -104.55252838134766, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3154964447021484, + "rewards/margins": 5.667708873748779, + "rewards/rejected": -2.3522121906280518, + "step": 8991 + }, + { + "epoch": 2.25, + "grad_norm": 8.614014625549316, + "learning_rate": 5.783142698646973e-06, + "logits/chosen": -0.5341227650642395, + "logits/rejected": -0.5765683054924011, + "logps/chosen": -64.69027709960938, + "logps/rejected": -99.14910125732422, + "loss": 0.814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.974560260772705, + "rewards/margins": 5.390148162841797, + "rewards/rejected": -2.4155874252319336, + "step": 8992 + }, + { + "epoch": 2.25, + "grad_norm": 3.4339611530303955, + "learning_rate": 5.782366402307557e-06, + "logits/chosen": -0.5297369956970215, + "logits/rejected": -0.5651426911354065, + "logps/chosen": -55.48372268676758, + "logps/rejected": -103.78553771972656, + "loss": 0.6835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0245046615600586, + "rewards/margins": 5.959491729736328, + "rewards/rejected": -2.9349870681762695, + "step": 8993 + }, + { + "epoch": 2.25, + "grad_norm": 3.912081241607666, + "learning_rate": 5.781590086635034e-06, + "logits/chosen": -0.5253180861473083, + "logits/rejected": -0.6107928156852722, + "logps/chosen": -51.57078552246094, + "logps/rejected": -84.6855697631836, + "loss": 0.6948, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9973838329315186, + "rewards/margins": 5.717291831970215, + "rewards/rejected": -2.719907522201538, + "step": 8994 + }, + { + "epoch": 2.25, + "grad_norm": 5.913467884063721, + "learning_rate": 5.780813751648589e-06, + "logits/chosen": -0.5039508938789368, + "logits/rejected": -0.5433120131492615, + "logps/chosen": -53.3123664855957, + "logps/rejected": -100.8215560913086, + "loss": 0.6836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.779895067214966, + "rewards/margins": 6.504002571105957, + "rewards/rejected": -3.7241077423095703, + "step": 8995 + }, + { + "epoch": 2.25, + "grad_norm": 3.9242100715637207, + "learning_rate": 5.780037397367405e-06, + "logits/chosen": -0.4570397734642029, + "logits/rejected": -0.5011979937553406, + "logps/chosen": -45.538028717041016, + "logps/rejected": -86.04362487792969, + "loss": 0.6498, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1366982460021973, + "rewards/margins": 5.848803520202637, + "rewards/rejected": -2.7121057510375977, + "step": 8996 + }, + { + "epoch": 2.25, + "grad_norm": 3.1324753761291504, + "learning_rate": 5.7792610238106694e-06, + "logits/chosen": -0.5214188098907471, + "logits/rejected": -0.6475088000297546, + "logps/chosen": -48.286319732666016, + "logps/rejected": -90.13887023925781, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.910888910293579, + "rewards/margins": 6.927839279174805, + "rewards/rejected": -4.016950607299805, + "step": 8997 + }, + { + "epoch": 2.25, + "grad_norm": 20.413894653320312, + "learning_rate": 5.778484630997562e-06, + "logits/chosen": -0.5378797054290771, + "logits/rejected": -0.5999466180801392, + "logps/chosen": -59.306827545166016, + "logps/rejected": -123.80074310302734, + "loss": 0.7561, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3184280395507812, + "rewards/margins": 7.757326602935791, + "rewards/rejected": -4.438898086547852, + "step": 8998 + }, + { + "epoch": 2.25, + "grad_norm": 4.415816307067871, + "learning_rate": 5.777708218947273e-06, + "logits/chosen": -0.4578828811645508, + "logits/rejected": -0.5252822637557983, + "logps/chosen": -61.92168045043945, + "logps/rejected": -118.2289810180664, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8213446140289307, + "rewards/margins": 6.580771446228027, + "rewards/rejected": -3.7594268321990967, + "step": 8999 + }, + { + "epoch": 2.25, + "grad_norm": 9.007587432861328, + "learning_rate": 5.776931787678987e-06, + "logits/chosen": -0.4944486618041992, + "logits/rejected": -0.5377673506736755, + "logps/chosen": -53.20121765136719, + "logps/rejected": -92.40618896484375, + "loss": 0.8475, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8916752338409424, + "rewards/margins": 5.137656211853027, + "rewards/rejected": -2.245981216430664, + "step": 9000 + }, + { + "epoch": 2.25, + "grad_norm": 5.227017879486084, + "learning_rate": 5.7761553372118895e-06, + "logits/chosen": -0.47978124022483826, + "logits/rejected": -0.5322992205619812, + "logps/chosen": -47.26206970214844, + "logps/rejected": -86.70856475830078, + "loss": 0.6796, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.952087640762329, + "rewards/margins": 5.082767963409424, + "rewards/rejected": -2.1306800842285156, + "step": 9001 + }, + { + "epoch": 2.25, + "grad_norm": 2.793207883834839, + "learning_rate": 5.775378867565169e-06, + "logits/chosen": -0.47736895084381104, + "logits/rejected": -0.5486338138580322, + "logps/chosen": -56.99388885498047, + "logps/rejected": -94.92206573486328, + "loss": 0.5916, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0741569995880127, + "rewards/margins": 6.029445648193359, + "rewards/rejected": -2.955288887023926, + "step": 9002 + }, + { + "epoch": 2.25, + "grad_norm": 2.576864242553711, + "learning_rate": 5.774602378758011e-06, + "logits/chosen": -0.4367995858192444, + "logits/rejected": -0.5036523938179016, + "logps/chosen": -56.729393005371094, + "logps/rejected": -115.97592163085938, + "loss": 0.5783, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1034605503082275, + "rewards/margins": 7.437616348266602, + "rewards/rejected": -4.334155082702637, + "step": 9003 + }, + { + "epoch": 2.25, + "grad_norm": 4.457269191741943, + "learning_rate": 5.773825870809604e-06, + "logits/chosen": -0.5100318193435669, + "logits/rejected": -0.595171332359314, + "logps/chosen": -71.03257751464844, + "logps/rejected": -96.56468200683594, + "loss": 0.7294, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7153334617614746, + "rewards/margins": 5.820310115814209, + "rewards/rejected": -3.1049766540527344, + "step": 9004 + }, + { + "epoch": 2.25, + "grad_norm": 3.7769505977630615, + "learning_rate": 5.7730493437391375e-06, + "logits/chosen": -0.566516637802124, + "logits/rejected": -0.5734454393386841, + "logps/chosen": -43.10538864135742, + "logps/rejected": -103.335693359375, + "loss": 0.6137, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9355251789093018, + "rewards/margins": 5.747617244720459, + "rewards/rejected": -2.8120920658111572, + "step": 9005 + }, + { + "epoch": 2.25, + "grad_norm": 23.746952056884766, + "learning_rate": 5.7722727975658e-06, + "logits/chosen": -0.581851065158844, + "logits/rejected": -0.6578625440597534, + "logps/chosen": -54.631229400634766, + "logps/rejected": -105.5211410522461, + "loss": 0.7415, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.96936297416687, + "rewards/margins": 6.269625663757324, + "rewards/rejected": -3.300262451171875, + "step": 9006 + }, + { + "epoch": 2.25, + "grad_norm": 4.704607009887695, + "learning_rate": 5.771496232308778e-06, + "logits/chosen": -0.5108910799026489, + "logits/rejected": -0.6041701436042786, + "logps/chosen": -45.35545349121094, + "logps/rejected": -82.98003387451172, + "loss": 0.6288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.291431427001953, + "rewards/margins": 5.891928672790527, + "rewards/rejected": -2.600497245788574, + "step": 9007 + }, + { + "epoch": 2.25, + "grad_norm": 5.8770012855529785, + "learning_rate": 5.770719647987266e-06, + "logits/chosen": -0.5972912907600403, + "logits/rejected": -0.6316063404083252, + "logps/chosen": -41.02027130126953, + "logps/rejected": -112.34558868408203, + "loss": 0.6977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9009997844696045, + "rewards/margins": 6.895425319671631, + "rewards/rejected": -3.9944252967834473, + "step": 9008 + }, + { + "epoch": 2.25, + "grad_norm": 4.1254191398620605, + "learning_rate": 5.769943044620451e-06, + "logits/chosen": -0.47661060094833374, + "logits/rejected": -0.5294010043144226, + "logps/chosen": -59.07461929321289, + "logps/rejected": -115.12358856201172, + "loss": 0.6785, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.463674306869507, + "rewards/margins": 6.330474853515625, + "rewards/rejected": -2.866800308227539, + "step": 9009 + }, + { + "epoch": 2.25, + "grad_norm": 2.201601982116699, + "learning_rate": 5.769166422227523e-06, + "logits/chosen": -0.5398671627044678, + "logits/rejected": -0.6581915616989136, + "logps/chosen": -46.648956298828125, + "logps/rejected": -89.79194641113281, + "loss": 0.531, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8681230545043945, + "rewards/margins": 6.819845676422119, + "rewards/rejected": -3.9517226219177246, + "step": 9010 + }, + { + "epoch": 2.25, + "grad_norm": 3.291762590408325, + "learning_rate": 5.768389780827677e-06, + "logits/chosen": -0.55774986743927, + "logits/rejected": -0.6392631530761719, + "logps/chosen": -56.872467041015625, + "logps/rejected": -95.62501525878906, + "loss": 0.6207, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0251777172088623, + "rewards/margins": 6.9460906982421875, + "rewards/rejected": -3.9209132194519043, + "step": 9011 + }, + { + "epoch": 2.25, + "grad_norm": 3.0213890075683594, + "learning_rate": 5.767613120440101e-06, + "logits/chosen": -0.5571296215057373, + "logits/rejected": -0.645317792892456, + "logps/chosen": -59.139434814453125, + "logps/rejected": -94.17798614501953, + "loss": 0.5583, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1973252296447754, + "rewards/margins": 6.5344696044921875, + "rewards/rejected": -3.337144136428833, + "step": 9012 + }, + { + "epoch": 2.25, + "grad_norm": 4.9995880126953125, + "learning_rate": 5.7668364410839875e-06, + "logits/chosen": -0.4859007000923157, + "logits/rejected": -0.5865821242332458, + "logps/chosen": -60.76816940307617, + "logps/rejected": -75.38937377929688, + "loss": 0.6816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9598124027252197, + "rewards/margins": 5.345775127410889, + "rewards/rejected": -2.385962963104248, + "step": 9013 + }, + { + "epoch": 2.26, + "grad_norm": 5.772801399230957, + "learning_rate": 5.7660597427785304e-06, + "logits/chosen": -0.46871763467788696, + "logits/rejected": -0.5803781747817993, + "logps/chosen": -54.9578971862793, + "logps/rejected": -97.34385681152344, + "loss": 0.5973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.632899284362793, + "rewards/margins": 6.262923717498779, + "rewards/rejected": -3.6300244331359863, + "step": 9014 + }, + { + "epoch": 2.26, + "grad_norm": 5.4370222091674805, + "learning_rate": 5.765283025542923e-06, + "logits/chosen": -0.511322557926178, + "logits/rejected": -0.6181017756462097, + "logps/chosen": -53.27777862548828, + "logps/rejected": -90.74490356445312, + "loss": 0.569, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1037464141845703, + "rewards/margins": 6.438357830047607, + "rewards/rejected": -3.334611415863037, + "step": 9015 + }, + { + "epoch": 2.26, + "grad_norm": 5.8926310539245605, + "learning_rate": 5.764506289396356e-06, + "logits/chosen": -0.5264528393745422, + "logits/rejected": -0.5341505408287048, + "logps/chosen": -75.17461395263672, + "logps/rejected": -101.42182922363281, + "loss": 0.6385, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.093404769897461, + "rewards/margins": 5.847428798675537, + "rewards/rejected": -2.754024028778076, + "step": 9016 + }, + { + "epoch": 2.26, + "grad_norm": 4.414304256439209, + "learning_rate": 5.763729534358028e-06, + "logits/chosen": -0.5300865173339844, + "logits/rejected": -0.5877746343612671, + "logps/chosen": -49.431121826171875, + "logps/rejected": -88.86418151855469, + "loss": 0.6426, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.144674301147461, + "rewards/margins": 5.795931816101074, + "rewards/rejected": -2.6512575149536133, + "step": 9017 + }, + { + "epoch": 2.26, + "grad_norm": 3.4611475467681885, + "learning_rate": 5.7629527604471295e-06, + "logits/chosen": -0.507686972618103, + "logits/rejected": -0.6664084196090698, + "logps/chosen": -71.2839584350586, + "logps/rejected": -83.36956787109375, + "loss": 0.6394, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7653400897979736, + "rewards/margins": 5.890931129455566, + "rewards/rejected": -3.1255908012390137, + "step": 9018 + }, + { + "epoch": 2.26, + "grad_norm": 6.181783676147461, + "learning_rate": 5.762175967682857e-06, + "logits/chosen": -0.5661256313323975, + "logits/rejected": -0.6460944414138794, + "logps/chosen": -54.076480865478516, + "logps/rejected": -87.5413589477539, + "loss": 0.6406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.902827262878418, + "rewards/margins": 5.647931098937988, + "rewards/rejected": -2.7451038360595703, + "step": 9019 + }, + { + "epoch": 2.26, + "grad_norm": 5.026932716369629, + "learning_rate": 5.761399156084406e-06, + "logits/chosen": -0.49193477630615234, + "logits/rejected": -0.554287850856781, + "logps/chosen": -59.41965103149414, + "logps/rejected": -110.57530212402344, + "loss": 0.754, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1549105644226074, + "rewards/margins": 6.497409820556641, + "rewards/rejected": -3.342499256134033, + "step": 9020 + }, + { + "epoch": 2.26, + "grad_norm": 5.393560409545898, + "learning_rate": 5.760622325670971e-06, + "logits/chosen": -0.5133957266807556, + "logits/rejected": -0.5988364219665527, + "logps/chosen": -54.91621398925781, + "logps/rejected": -81.40962219238281, + "loss": 0.6171, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.26009202003479, + "rewards/margins": 6.250738143920898, + "rewards/rejected": -2.9906458854675293, + "step": 9021 + }, + { + "epoch": 2.26, + "grad_norm": 4.272851943969727, + "learning_rate": 5.75984547646175e-06, + "logits/chosen": -0.45524927973747253, + "logits/rejected": -0.5253164172172546, + "logps/chosen": -57.15228271484375, + "logps/rejected": -108.83208465576172, + "loss": 0.6545, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8634884357452393, + "rewards/margins": 7.039571285247803, + "rewards/rejected": -4.176082611083984, + "step": 9022 + }, + { + "epoch": 2.26, + "grad_norm": 3.315553665161133, + "learning_rate": 5.7590686084759394e-06, + "logits/chosen": -0.5307601690292358, + "logits/rejected": -0.5925837159156799, + "logps/chosen": -50.041526794433594, + "logps/rejected": -95.75984954833984, + "loss": 0.5501, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.066720962524414, + "rewards/margins": 5.994382381439209, + "rewards/rejected": -2.927661418914795, + "step": 9023 + }, + { + "epoch": 2.26, + "grad_norm": 15.054073333740234, + "learning_rate": 5.758291721732736e-06, + "logits/chosen": -0.4272129535675049, + "logits/rejected": -0.5020620822906494, + "logps/chosen": -48.669281005859375, + "logps/rejected": -87.9201431274414, + "loss": 0.66, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.29876708984375, + "rewards/margins": 6.057125091552734, + "rewards/rejected": -2.758357286453247, + "step": 9024 + }, + { + "epoch": 2.26, + "grad_norm": 13.27569580078125, + "learning_rate": 5.757514816251338e-06, + "logits/chosen": -0.46360236406326294, + "logits/rejected": -0.533145010471344, + "logps/chosen": -66.7424545288086, + "logps/rejected": -88.322265625, + "loss": 0.7586, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.095400333404541, + "rewards/margins": 5.227936267852783, + "rewards/rejected": -2.132535696029663, + "step": 9025 + }, + { + "epoch": 2.26, + "grad_norm": 2.5031397342681885, + "learning_rate": 5.756737892050942e-06, + "logits/chosen": -0.5070399641990662, + "logits/rejected": -0.5665119290351868, + "logps/chosen": -53.402809143066406, + "logps/rejected": -123.88328552246094, + "loss": 0.5415, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1164801120758057, + "rewards/margins": 7.034048557281494, + "rewards/rejected": -3.9175686836242676, + "step": 9026 + }, + { + "epoch": 2.26, + "grad_norm": 5.269262790679932, + "learning_rate": 5.755960949150748e-06, + "logits/chosen": -0.5347557067871094, + "logits/rejected": -0.6115031242370605, + "logps/chosen": -54.26959228515625, + "logps/rejected": -89.86944580078125, + "loss": 0.7717, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.045525074005127, + "rewards/margins": 5.49298620223999, + "rewards/rejected": -2.4474611282348633, + "step": 9027 + }, + { + "epoch": 2.26, + "grad_norm": 6.48829460144043, + "learning_rate": 5.755183987569956e-06, + "logits/chosen": -0.6141420602798462, + "logits/rejected": -0.7032448053359985, + "logps/chosen": -49.7310905456543, + "logps/rejected": -90.52127838134766, + "loss": 0.6752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7010698318481445, + "rewards/margins": 6.232285022735596, + "rewards/rejected": -3.531214475631714, + "step": 9028 + }, + { + "epoch": 2.26, + "grad_norm": 10.211956977844238, + "learning_rate": 5.754407007327764e-06, + "logits/chosen": -0.5664554834365845, + "logits/rejected": -0.6232088804244995, + "logps/chosen": -53.779258728027344, + "logps/rejected": -114.09378051757812, + "loss": 0.6628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2189979553222656, + "rewards/margins": 7.263948917388916, + "rewards/rejected": -4.04495096206665, + "step": 9029 + }, + { + "epoch": 2.26, + "grad_norm": 8.593168258666992, + "learning_rate": 5.753630008443371e-06, + "logits/chosen": -0.45281219482421875, + "logits/rejected": -0.5429803133010864, + "logps/chosen": -58.448638916015625, + "logps/rejected": -121.04554748535156, + "loss": 0.593, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.92826509475708, + "rewards/margins": 6.589123725891113, + "rewards/rejected": -3.6608591079711914, + "step": 9030 + }, + { + "epoch": 2.26, + "grad_norm": 8.638019561767578, + "learning_rate": 5.752852990935981e-06, + "logits/chosen": -0.49706077575683594, + "logits/rejected": -0.5290147662162781, + "logps/chosen": -54.32270812988281, + "logps/rejected": -114.93619537353516, + "loss": 0.7667, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9949803352355957, + "rewards/margins": 5.960275650024414, + "rewards/rejected": -2.9652955532073975, + "step": 9031 + }, + { + "epoch": 2.26, + "grad_norm": 4.812466144561768, + "learning_rate": 5.752075954824791e-06, + "logits/chosen": -0.4507461190223694, + "logits/rejected": -0.5443338751792908, + "logps/chosen": -50.10478210449219, + "logps/rejected": -90.03265380859375, + "loss": 0.6001, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.90511155128479, + "rewards/margins": 5.937798023223877, + "rewards/rejected": -3.032686710357666, + "step": 9032 + }, + { + "epoch": 2.26, + "grad_norm": 9.24267864227295, + "learning_rate": 5.751298900129006e-06, + "logits/chosen": -0.5068892240524292, + "logits/rejected": -0.6098353862762451, + "logps/chosen": -57.99134063720703, + "logps/rejected": -111.56439208984375, + "loss": 0.7708, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8349618911743164, + "rewards/margins": 5.826010704040527, + "rewards/rejected": -2.99104905128479, + "step": 9033 + }, + { + "epoch": 2.26, + "grad_norm": 4.740194320678711, + "learning_rate": 5.750521826867824e-06, + "logits/chosen": -0.5919514894485474, + "logits/rejected": -0.7225220203399658, + "logps/chosen": -65.41642761230469, + "logps/rejected": -83.43586730957031, + "loss": 0.7331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.998469829559326, + "rewards/margins": 6.152767181396484, + "rewards/rejected": -3.1542975902557373, + "step": 9034 + }, + { + "epoch": 2.26, + "grad_norm": 11.139730453491211, + "learning_rate": 5.7497447350604506e-06, + "logits/chosen": -0.504259467124939, + "logits/rejected": -0.6299695372581482, + "logps/chosen": -62.14213180541992, + "logps/rejected": -139.3365020751953, + "loss": 0.6781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.863584041595459, + "rewards/margins": 9.292139053344727, + "rewards/rejected": -6.428554058074951, + "step": 9035 + }, + { + "epoch": 2.26, + "grad_norm": 8.613652229309082, + "learning_rate": 5.7489676247260875e-06, + "logits/chosen": -0.5417090058326721, + "logits/rejected": -0.608863890171051, + "logps/chosen": -55.036338806152344, + "logps/rejected": -116.01591491699219, + "loss": 0.7733, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.946627140045166, + "rewards/margins": 6.786240100860596, + "rewards/rejected": -3.839613199234009, + "step": 9036 + }, + { + "epoch": 2.26, + "grad_norm": 5.241815567016602, + "learning_rate": 5.748190495883937e-06, + "logits/chosen": -0.46595582365989685, + "logits/rejected": -0.5697466135025024, + "logps/chosen": -63.003501892089844, + "logps/rejected": -88.95820617675781, + "loss": 0.6517, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.722470760345459, + "rewards/margins": 5.59946870803833, + "rewards/rejected": -2.876997947692871, + "step": 9037 + }, + { + "epoch": 2.26, + "grad_norm": 9.107157707214355, + "learning_rate": 5.747413348553204e-06, + "logits/chosen": -0.5413005948066711, + "logits/rejected": -0.548533022403717, + "logps/chosen": -48.03560256958008, + "logps/rejected": -96.21809387207031, + "loss": 0.7548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8570356369018555, + "rewards/margins": 4.971320152282715, + "rewards/rejected": -2.1142847537994385, + "step": 9038 + }, + { + "epoch": 2.26, + "grad_norm": 5.392341136932373, + "learning_rate": 5.746636182753091e-06, + "logits/chosen": -0.560535728931427, + "logits/rejected": -0.6934154033660889, + "logps/chosen": -67.11769104003906, + "logps/rejected": -102.55461883544922, + "loss": 0.6984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9786903858184814, + "rewards/margins": 7.714645862579346, + "rewards/rejected": -4.735955238342285, + "step": 9039 + }, + { + "epoch": 2.26, + "grad_norm": 8.571361541748047, + "learning_rate": 5.7458589985028056e-06, + "logits/chosen": -0.4685875177383423, + "logits/rejected": -0.47289684414863586, + "logps/chosen": -57.55116271972656, + "logps/rejected": -114.949462890625, + "loss": 0.7419, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9358184337615967, + "rewards/margins": 5.624948501586914, + "rewards/rejected": -2.6891303062438965, + "step": 9040 + }, + { + "epoch": 2.26, + "grad_norm": 5.070909023284912, + "learning_rate": 5.7450817958215495e-06, + "logits/chosen": -0.5408434867858887, + "logits/rejected": -0.6369282007217407, + "logps/chosen": -64.83248138427734, + "logps/rejected": -109.642578125, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4941864013671875, + "rewards/margins": 6.492973804473877, + "rewards/rejected": -3.9987876415252686, + "step": 9041 + }, + { + "epoch": 2.26, + "grad_norm": 9.569207191467285, + "learning_rate": 5.7443045747285306e-06, + "logits/chosen": -0.5136007070541382, + "logits/rejected": -0.5906752347946167, + "logps/chosen": -72.3897705078125, + "logps/rejected": -84.17401123046875, + "loss": 0.8193, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.796356678009033, + "rewards/margins": 4.528212547302246, + "rewards/rejected": -1.731856346130371, + "step": 9042 + }, + { + "epoch": 2.26, + "grad_norm": 2.991123676300049, + "learning_rate": 5.743527335242955e-06, + "logits/chosen": -0.4604541063308716, + "logits/rejected": -0.531955361366272, + "logps/chosen": -59.24835968017578, + "logps/rejected": -99.60858917236328, + "loss": 0.5372, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188894748687744, + "rewards/margins": 7.359374046325684, + "rewards/rejected": -4.170479774475098, + "step": 9043 + }, + { + "epoch": 2.26, + "grad_norm": 9.326003074645996, + "learning_rate": 5.7427500773840265e-06, + "logits/chosen": -0.5176255702972412, + "logits/rejected": -0.5954396724700928, + "logps/chosen": -49.23603057861328, + "logps/rejected": -91.0077896118164, + "loss": 0.5859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.154935359954834, + "rewards/margins": 6.362720966339111, + "rewards/rejected": -3.2077860832214355, + "step": 9044 + }, + { + "epoch": 2.26, + "grad_norm": 7.828938961029053, + "learning_rate": 5.741972801170954e-06, + "logits/chosen": -0.5032639503479004, + "logits/rejected": -0.5951957702636719, + "logps/chosen": -51.88134765625, + "logps/rejected": -96.04777526855469, + "loss": 0.646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.350569248199463, + "rewards/margins": 6.805807590484619, + "rewards/rejected": -3.4552385807037354, + "step": 9045 + }, + { + "epoch": 2.26, + "grad_norm": 12.591934204101562, + "learning_rate": 5.741195506622945e-06, + "logits/chosen": -0.42805585265159607, + "logits/rejected": -0.5783533453941345, + "logps/chosen": -61.519004821777344, + "logps/rejected": -91.72586059570312, + "loss": 0.8232, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6817097663879395, + "rewards/margins": 5.818770885467529, + "rewards/rejected": -3.1370608806610107, + "step": 9046 + }, + { + "epoch": 2.26, + "grad_norm": 4.666693210601807, + "learning_rate": 5.740418193759205e-06, + "logits/chosen": -0.5281215906143188, + "logits/rejected": -0.5824039578437805, + "logps/chosen": -51.7823600769043, + "logps/rejected": -106.6428451538086, + "loss": 0.6338, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0614964962005615, + "rewards/margins": 6.402718544006348, + "rewards/rejected": -3.341222047805786, + "step": 9047 + }, + { + "epoch": 2.26, + "grad_norm": 19.519325256347656, + "learning_rate": 5.739640862598945e-06, + "logits/chosen": -0.5433692932128906, + "logits/rejected": -0.588171124458313, + "logps/chosen": -53.82103729248047, + "logps/rejected": -109.9516830444336, + "loss": 0.639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1259655952453613, + "rewards/margins": 5.930310249328613, + "rewards/rejected": -2.804344415664673, + "step": 9048 + }, + { + "epoch": 2.26, + "grad_norm": 21.184219360351562, + "learning_rate": 5.738863513161374e-06, + "logits/chosen": -0.4753005802631378, + "logits/rejected": -0.5741032361984253, + "logps/chosen": -54.259979248046875, + "logps/rejected": -92.4060287475586, + "loss": 0.6889, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8357160091400146, + "rewards/margins": 5.205578804016113, + "rewards/rejected": -2.3698625564575195, + "step": 9049 + }, + { + "epoch": 2.26, + "grad_norm": 8.038236618041992, + "learning_rate": 5.738086145465698e-06, + "logits/chosen": -0.5156931281089783, + "logits/rejected": -0.5855948328971863, + "logps/chosen": -48.12472915649414, + "logps/rejected": -102.56700897216797, + "loss": 0.5944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.022883653640747, + "rewards/margins": 6.4021806716918945, + "rewards/rejected": -3.3792972564697266, + "step": 9050 + }, + { + "epoch": 2.26, + "grad_norm": 6.317175388336182, + "learning_rate": 5.737308759531128e-06, + "logits/chosen": -0.35913896560668945, + "logits/rejected": -0.39779728651046753, + "logps/chosen": -54.53378677368164, + "logps/rejected": -110.5229721069336, + "loss": 0.6746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.940354824066162, + "rewards/margins": 6.363132953643799, + "rewards/rejected": -3.4227776527404785, + "step": 9051 + }, + { + "epoch": 2.26, + "grad_norm": 7.550971508026123, + "learning_rate": 5.736531355376874e-06, + "logits/chosen": -0.5819925665855408, + "logits/rejected": -0.6658967733383179, + "logps/chosen": -56.65611267089844, + "logps/rejected": -85.87386322021484, + "loss": 0.7969, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.265369415283203, + "rewards/margins": 5.731327056884766, + "rewards/rejected": -2.4659574031829834, + "step": 9052 + }, + { + "epoch": 2.26, + "grad_norm": 1.7112380266189575, + "learning_rate": 5.735753933022148e-06, + "logits/chosen": -0.4921693503856659, + "logits/rejected": -0.6047642230987549, + "logps/chosen": -53.32200622558594, + "logps/rejected": -97.48839569091797, + "loss": 0.5826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9603514671325684, + "rewards/margins": 7.073429107666016, + "rewards/rejected": -4.1130781173706055, + "step": 9053 + }, + { + "epoch": 2.27, + "grad_norm": 5.267312526702881, + "learning_rate": 5.7349764924861605e-06, + "logits/chosen": -0.45104682445526123, + "logits/rejected": -0.5286318063735962, + "logps/chosen": -55.245609283447266, + "logps/rejected": -82.76651000976562, + "loss": 0.7603, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9128668308258057, + "rewards/margins": 5.006458759307861, + "rewards/rejected": -2.0935916900634766, + "step": 9054 + }, + { + "epoch": 2.27, + "grad_norm": 5.467076301574707, + "learning_rate": 5.73419903378812e-06, + "logits/chosen": -0.45356759428977966, + "logits/rejected": -0.5758669972419739, + "logps/chosen": -62.887718200683594, + "logps/rejected": -82.41861724853516, + "loss": 0.6012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1599605083465576, + "rewards/margins": 5.478552341461182, + "rewards/rejected": -2.318591833114624, + "step": 9055 + }, + { + "epoch": 2.27, + "grad_norm": 3.5668303966522217, + "learning_rate": 5.733421556947242e-06, + "logits/chosen": -0.47973352670669556, + "logits/rejected": -0.5748439431190491, + "logps/chosen": -53.0867805480957, + "logps/rejected": -93.28689575195312, + "loss": 0.6766, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.985748529434204, + "rewards/margins": 6.289150714874268, + "rewards/rejected": -3.3034021854400635, + "step": 9056 + }, + { + "epoch": 2.27, + "grad_norm": 2.343104600906372, + "learning_rate": 5.7326440619827365e-06, + "logits/chosen": -0.5204654932022095, + "logits/rejected": -0.6069662570953369, + "logps/chosen": -55.12175750732422, + "logps/rejected": -90.17169189453125, + "loss": 0.5907, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306589126586914, + "rewards/margins": 6.410233020782471, + "rewards/rejected": -3.1036436557769775, + "step": 9057 + }, + { + "epoch": 2.27, + "grad_norm": 3.5135719776153564, + "learning_rate": 5.731866548913817e-06, + "logits/chosen": -0.499032199382782, + "logits/rejected": -0.608838677406311, + "logps/chosen": -47.424739837646484, + "logps/rejected": -83.48346710205078, + "loss": 0.6537, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0254147052764893, + "rewards/margins": 5.931242942810059, + "rewards/rejected": -2.905827522277832, + "step": 9058 + }, + { + "epoch": 2.27, + "grad_norm": 2.7414286136627197, + "learning_rate": 5.731089017759697e-06, + "logits/chosen": -0.48202404379844666, + "logits/rejected": -0.5506166815757751, + "logps/chosen": -47.07696533203125, + "logps/rejected": -119.34241485595703, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3304333686828613, + "rewards/margins": 7.698622226715088, + "rewards/rejected": -4.368188858032227, + "step": 9059 + }, + { + "epoch": 2.27, + "grad_norm": 6.31104040145874, + "learning_rate": 5.7303114685395885e-06, + "logits/chosen": -0.4498896598815918, + "logits/rejected": -0.5190711617469788, + "logps/chosen": -61.628204345703125, + "logps/rejected": -115.96063995361328, + "loss": 0.6355, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.941904306411743, + "rewards/margins": 7.403634548187256, + "rewards/rejected": -4.461730003356934, + "step": 9060 + }, + { + "epoch": 2.27, + "grad_norm": 2.635420560836792, + "learning_rate": 5.729533901272708e-06, + "logits/chosen": -0.4408811330795288, + "logits/rejected": -0.518000066280365, + "logps/chosen": -60.68666076660156, + "logps/rejected": -115.31206512451172, + "loss": 0.5359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.41867995262146, + "rewards/margins": 6.829036712646484, + "rewards/rejected": -3.4103567600250244, + "step": 9061 + }, + { + "epoch": 2.27, + "grad_norm": 2.8265295028686523, + "learning_rate": 5.728756315978269e-06, + "logits/chosen": -0.509811282157898, + "logits/rejected": -0.596731424331665, + "logps/chosen": -46.352256774902344, + "logps/rejected": -84.74932861328125, + "loss": 0.5572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.073275089263916, + "rewards/margins": 5.86560583114624, + "rewards/rejected": -2.792330503463745, + "step": 9062 + }, + { + "epoch": 2.27, + "grad_norm": 7.447554111480713, + "learning_rate": 5.7279787126754875e-06, + "logits/chosen": -0.48908573389053345, + "logits/rejected": -0.5729862451553345, + "logps/chosen": -51.61226272583008, + "logps/rejected": -109.82978820800781, + "loss": 0.6307, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0231282711029053, + "rewards/margins": 7.6933722496032715, + "rewards/rejected": -4.670245170593262, + "step": 9063 + }, + { + "epoch": 2.27, + "grad_norm": 3.5223333835601807, + "learning_rate": 5.7272010913835744e-06, + "logits/chosen": -0.48614582419395447, + "logits/rejected": -0.5474463105201721, + "logps/chosen": -49.537933349609375, + "logps/rejected": -92.97314453125, + "loss": 0.6495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0709047317504883, + "rewards/margins": 5.407270431518555, + "rewards/rejected": -2.336365222930908, + "step": 9064 + }, + { + "epoch": 2.27, + "grad_norm": 4.773117542266846, + "learning_rate": 5.726423452121751e-06, + "logits/chosen": -0.5335914492607117, + "logits/rejected": -0.5915363430976868, + "logps/chosen": -52.44125747680664, + "logps/rejected": -102.71158599853516, + "loss": 0.6383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000443458557129, + "rewards/margins": 5.906863689422607, + "rewards/rejected": -2.906421184539795, + "step": 9065 + }, + { + "epoch": 2.27, + "grad_norm": 3.775531053543091, + "learning_rate": 5.725645794909231e-06, + "logits/chosen": -0.4874310791492462, + "logits/rejected": -0.5725685954093933, + "logps/chosen": -62.6479377746582, + "logps/rejected": -89.42314910888672, + "loss": 0.757, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1575229167938232, + "rewards/margins": 5.161733627319336, + "rewards/rejected": -2.004210948944092, + "step": 9066 + }, + { + "epoch": 2.27, + "grad_norm": 4.362593173980713, + "learning_rate": 5.724868119765231e-06, + "logits/chosen": -0.4299619495868683, + "logits/rejected": -0.5654545426368713, + "logps/chosen": -59.24800109863281, + "logps/rejected": -91.42522430419922, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0607542991638184, + "rewards/margins": 6.173666954040527, + "rewards/rejected": -3.112912654876709, + "step": 9067 + }, + { + "epoch": 2.27, + "grad_norm": 8.806936264038086, + "learning_rate": 5.7240904267089706e-06, + "logits/chosen": -0.4609070122241974, + "logits/rejected": -0.6058630347251892, + "logps/chosen": -59.97345733642578, + "logps/rejected": -89.07991790771484, + "loss": 0.6899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9181084632873535, + "rewards/margins": 5.783121109008789, + "rewards/rejected": -2.8650124073028564, + "step": 9068 + }, + { + "epoch": 2.27, + "grad_norm": 4.508885860443115, + "learning_rate": 5.723312715759664e-06, + "logits/chosen": -0.49375250935554504, + "logits/rejected": -0.5991849303245544, + "logps/chosen": -52.96059036254883, + "logps/rejected": -89.89141845703125, + "loss": 0.6464, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.227534770965576, + "rewards/margins": 5.630836009979248, + "rewards/rejected": -2.403301239013672, + "step": 9069 + }, + { + "epoch": 2.27, + "grad_norm": 4.39337158203125, + "learning_rate": 5.722534986936531e-06, + "logits/chosen": -0.48979565501213074, + "logits/rejected": -0.5485786199569702, + "logps/chosen": -56.312076568603516, + "logps/rejected": -80.41464233398438, + "loss": 0.6732, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4570090770721436, + "rewards/margins": 4.835874557495117, + "rewards/rejected": -1.3788659572601318, + "step": 9070 + }, + { + "epoch": 2.27, + "grad_norm": 8.919647216796875, + "learning_rate": 5.72175724025879e-06, + "logits/chosen": -0.39173969626426697, + "logits/rejected": -0.4811984896659851, + "logps/chosen": -57.44900894165039, + "logps/rejected": -91.1226577758789, + "loss": 0.794, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7151713371276855, + "rewards/margins": 4.398748874664307, + "rewards/rejected": -1.6835777759552002, + "step": 9071 + }, + { + "epoch": 2.27, + "grad_norm": 16.660655975341797, + "learning_rate": 5.72097947574566e-06, + "logits/chosen": -0.5322203040122986, + "logits/rejected": -0.5914245247840881, + "logps/chosen": -50.22138977050781, + "logps/rejected": -89.0306625366211, + "loss": 0.7855, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7637569904327393, + "rewards/margins": 5.596475601196289, + "rewards/rejected": -2.832718849182129, + "step": 9072 + }, + { + "epoch": 2.27, + "grad_norm": 3.61543345451355, + "learning_rate": 5.72020169341636e-06, + "logits/chosen": -0.580371618270874, + "logits/rejected": -0.6205503344535828, + "logps/chosen": -50.915992736816406, + "logps/rejected": -85.95549011230469, + "loss": 0.6748, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0243213176727295, + "rewards/margins": 5.312012195587158, + "rewards/rejected": -2.287691354751587, + "step": 9073 + }, + { + "epoch": 2.27, + "grad_norm": 3.5301551818847656, + "learning_rate": 5.719423893290111e-06, + "logits/chosen": -0.47165927290916443, + "logits/rejected": -0.5498814582824707, + "logps/chosen": -48.436920166015625, + "logps/rejected": -92.56672668457031, + "loss": 0.6119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.836642026901245, + "rewards/margins": 6.124639511108398, + "rewards/rejected": -3.287997007369995, + "step": 9074 + }, + { + "epoch": 2.27, + "grad_norm": 6.34430456161499, + "learning_rate": 5.718646075386132e-06, + "logits/chosen": -0.4054983854293823, + "logits/rejected": -0.4589606821537018, + "logps/chosen": -64.62321472167969, + "logps/rejected": -90.90516662597656, + "loss": 0.7427, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.72348690032959, + "rewards/margins": 4.859923362731934, + "rewards/rejected": -2.1364364624023438, + "step": 9075 + }, + { + "epoch": 2.27, + "grad_norm": 8.777897834777832, + "learning_rate": 5.717868239723643e-06, + "logits/chosen": -0.4626319408416748, + "logits/rejected": -0.5305390357971191, + "logps/chosen": -51.805198669433594, + "logps/rejected": -80.70689392089844, + "loss": 0.6732, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.898287296295166, + "rewards/margins": 4.5750885009765625, + "rewards/rejected": -1.6768014430999756, + "step": 9076 + }, + { + "epoch": 2.27, + "grad_norm": 4.960422992706299, + "learning_rate": 5.717090386321869e-06, + "logits/chosen": -0.4737655520439148, + "logits/rejected": -0.597968578338623, + "logps/chosen": -57.44316101074219, + "logps/rejected": -92.81556701660156, + "loss": 0.6776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8755149841308594, + "rewards/margins": 6.461255073547363, + "rewards/rejected": -3.585740566253662, + "step": 9077 + }, + { + "epoch": 2.27, + "grad_norm": 4.472777366638184, + "learning_rate": 5.716312515200026e-06, + "logits/chosen": -0.539743959903717, + "logits/rejected": -0.6237080097198486, + "logps/chosen": -47.75723648071289, + "logps/rejected": -79.67469024658203, + "loss": 0.6055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9345529079437256, + "rewards/margins": 5.481042385101318, + "rewards/rejected": -2.5464894771575928, + "step": 9078 + }, + { + "epoch": 2.27, + "grad_norm": 3.373234272003174, + "learning_rate": 5.715534626377339e-06, + "logits/chosen": -0.50142902135849, + "logits/rejected": -0.5607977509498596, + "logps/chosen": -51.61960220336914, + "logps/rejected": -98.29954528808594, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095021963119507, + "rewards/margins": 6.586945056915283, + "rewards/rejected": -3.4919230937957764, + "step": 9079 + }, + { + "epoch": 2.27, + "grad_norm": 3.637295722961426, + "learning_rate": 5.714756719873031e-06, + "logits/chosen": -0.4735778272151947, + "logits/rejected": -0.5588989853858948, + "logps/chosen": -59.295406341552734, + "logps/rejected": -107.09793853759766, + "loss": 0.6235, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1739463806152344, + "rewards/margins": 6.405481338500977, + "rewards/rejected": -3.2315354347229004, + "step": 9080 + }, + { + "epoch": 2.27, + "grad_norm": 5.383039951324463, + "learning_rate": 5.713978795706324e-06, + "logits/chosen": -0.43110668659210205, + "logits/rejected": -0.5099324584007263, + "logps/chosen": -55.16697692871094, + "logps/rejected": -85.79718017578125, + "loss": 0.7507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.920804977416992, + "rewards/margins": 5.061098575592041, + "rewards/rejected": -2.140293598175049, + "step": 9081 + }, + { + "epoch": 2.27, + "grad_norm": 7.446007251739502, + "learning_rate": 5.713200853896442e-06, + "logits/chosen": -0.4462459683418274, + "logits/rejected": -0.5007603168487549, + "logps/chosen": -47.91963195800781, + "logps/rejected": -100.44901275634766, + "loss": 0.6952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.001185655593872, + "rewards/margins": 5.885338306427002, + "rewards/rejected": -2.884152889251709, + "step": 9082 + }, + { + "epoch": 2.27, + "grad_norm": 5.483006477355957, + "learning_rate": 5.7124228944626096e-06, + "logits/chosen": -0.4537718892097473, + "logits/rejected": -0.6020411252975464, + "logps/chosen": -55.15485763549805, + "logps/rejected": -83.39378356933594, + "loss": 0.6201, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.179065227508545, + "rewards/margins": 6.852138996124268, + "rewards/rejected": -3.6730735301971436, + "step": 9083 + }, + { + "epoch": 2.27, + "grad_norm": 7.471411228179932, + "learning_rate": 5.7116449174240474e-06, + "logits/chosen": -0.5833799242973328, + "logits/rejected": -0.6350436210632324, + "logps/chosen": -50.396018981933594, + "logps/rejected": -86.97833251953125, + "loss": 0.7483, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.955634593963623, + "rewards/margins": 5.020549774169922, + "rewards/rejected": -2.064915180206299, + "step": 9084 + }, + { + "epoch": 2.27, + "grad_norm": 6.275088310241699, + "learning_rate": 5.7108669227999836e-06, + "logits/chosen": -0.5191959142684937, + "logits/rejected": -0.6084102392196655, + "logps/chosen": -58.95033645629883, + "logps/rejected": -82.0267333984375, + "loss": 0.6904, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0167593955993652, + "rewards/margins": 6.014096736907959, + "rewards/rejected": -2.9973373413085938, + "step": 9085 + }, + { + "epoch": 2.27, + "grad_norm": 9.590812683105469, + "learning_rate": 5.710088910609642e-06, + "logits/chosen": -0.5456258654594421, + "logits/rejected": -0.6483867764472961, + "logps/chosen": -48.52591323852539, + "logps/rejected": -98.54023742675781, + "loss": 0.614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1459128856658936, + "rewards/margins": 6.302980899810791, + "rewards/rejected": -3.1570677757263184, + "step": 9086 + }, + { + "epoch": 2.27, + "grad_norm": 2.9382565021514893, + "learning_rate": 5.709310880872247e-06, + "logits/chosen": -0.5314967632293701, + "logits/rejected": -0.6221638321876526, + "logps/chosen": -52.53366470336914, + "logps/rejected": -90.96878051757812, + "loss": 0.6683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.068702220916748, + "rewards/margins": 5.8700361251831055, + "rewards/rejected": -2.8013339042663574, + "step": 9087 + }, + { + "epoch": 2.27, + "grad_norm": 4.650652885437012, + "learning_rate": 5.708532833607027e-06, + "logits/chosen": -0.3954054117202759, + "logits/rejected": -0.5169193148612976, + "logps/chosen": -54.02444076538086, + "logps/rejected": -83.56327056884766, + "loss": 0.6419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.766172170639038, + "rewards/margins": 5.247891902923584, + "rewards/rejected": -2.481719970703125, + "step": 9088 + }, + { + "epoch": 2.27, + "grad_norm": 4.643280029296875, + "learning_rate": 5.707754768833206e-06, + "logits/chosen": -0.6121101379394531, + "logits/rejected": -0.6710845232009888, + "logps/chosen": -48.14056396484375, + "logps/rejected": -103.58753204345703, + "loss": 0.6854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8770813941955566, + "rewards/margins": 5.8841776847839355, + "rewards/rejected": -3.007096290588379, + "step": 9089 + }, + { + "epoch": 2.27, + "grad_norm": 6.4116106033325195, + "learning_rate": 5.706976686570012e-06, + "logits/chosen": -0.4489230513572693, + "logits/rejected": -0.577795684337616, + "logps/chosen": -59.28275680541992, + "logps/rejected": -104.87705993652344, + "loss": 0.6137, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.934169292449951, + "rewards/margins": 6.403054237365723, + "rewards/rejected": -3.4688847064971924, + "step": 9090 + }, + { + "epoch": 2.27, + "grad_norm": 3.833817720413208, + "learning_rate": 5.706198586836673e-06, + "logits/chosen": -0.4725242257118225, + "logits/rejected": -0.5813802480697632, + "logps/chosen": -59.23823547363281, + "logps/rejected": -95.23307800292969, + "loss": 0.6689, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0158724784851074, + "rewards/margins": 5.841683387756348, + "rewards/rejected": -2.8258113861083984, + "step": 9091 + }, + { + "epoch": 2.27, + "grad_norm": 5.124733924865723, + "learning_rate": 5.705420469652415e-06, + "logits/chosen": -0.5494863986968994, + "logits/rejected": -0.6595662832260132, + "logps/chosen": -53.29051971435547, + "logps/rejected": -103.56299591064453, + "loss": 0.696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.100184440612793, + "rewards/margins": 6.349645614624023, + "rewards/rejected": -3.2494614124298096, + "step": 9092 + }, + { + "epoch": 2.27, + "grad_norm": 8.048408508300781, + "learning_rate": 5.704642335036468e-06, + "logits/chosen": -0.5484987497329712, + "logits/rejected": -0.6026650667190552, + "logps/chosen": -55.65314483642578, + "logps/rejected": -92.33609008789062, + "loss": 0.7724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3230063915252686, + "rewards/margins": 5.623857498168945, + "rewards/rejected": -2.300851345062256, + "step": 9093 + }, + { + "epoch": 2.28, + "grad_norm": 3.9621095657348633, + "learning_rate": 5.703864183008058e-06, + "logits/chosen": -0.5500271320343018, + "logits/rejected": -0.6359885931015015, + "logps/chosen": -58.86477279663086, + "logps/rejected": -95.38655853271484, + "loss": 0.685, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7414746284484863, + "rewards/margins": 6.329751491546631, + "rewards/rejected": -3.5882773399353027, + "step": 9094 + }, + { + "epoch": 2.28, + "grad_norm": 3.3190014362335205, + "learning_rate": 5.703086013586417e-06, + "logits/chosen": -0.5038511753082275, + "logits/rejected": -0.5511009693145752, + "logps/chosen": -54.912803649902344, + "logps/rejected": -122.56829071044922, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1669528484344482, + "rewards/margins": 6.82670259475708, + "rewards/rejected": -3.659749984741211, + "step": 9095 + }, + { + "epoch": 2.28, + "grad_norm": 6.2347893714904785, + "learning_rate": 5.7023078267907695e-06, + "logits/chosen": -0.4784507751464844, + "logits/rejected": -0.5305960774421692, + "logps/chosen": -52.03760528564453, + "logps/rejected": -93.2869644165039, + "loss": 0.7152, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.993096113204956, + "rewards/margins": 5.418208122253418, + "rewards/rejected": -2.425112247467041, + "step": 9096 + }, + { + "epoch": 2.28, + "grad_norm": 4.421745777130127, + "learning_rate": 5.7015296226403515e-06, + "logits/chosen": -0.4827348589897156, + "logits/rejected": -0.5927388072013855, + "logps/chosen": -56.30376052856445, + "logps/rejected": -92.6701889038086, + "loss": 0.5778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0074095726013184, + "rewards/margins": 6.8517069816589355, + "rewards/rejected": -3.844297170639038, + "step": 9097 + }, + { + "epoch": 2.28, + "grad_norm": 4.139936447143555, + "learning_rate": 5.7007514011543894e-06, + "logits/chosen": -0.45728030800819397, + "logits/rejected": -0.5223771333694458, + "logps/chosen": -49.683868408203125, + "logps/rejected": -99.89495086669922, + "loss": 0.6208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1109516620635986, + "rewards/margins": 5.91760778427124, + "rewards/rejected": -2.8066558837890625, + "step": 9098 + }, + { + "epoch": 2.28, + "grad_norm": 9.490972518920898, + "learning_rate": 5.6999731623521145e-06, + "logits/chosen": -0.39758914709091187, + "logits/rejected": -0.4860566258430481, + "logps/chosen": -50.78891372680664, + "logps/rejected": -101.6572265625, + "loss": 0.6317, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3652424812316895, + "rewards/margins": 5.634910583496094, + "rewards/rejected": -2.2696683406829834, + "step": 9099 + }, + { + "epoch": 2.28, + "grad_norm": 4.4906325340271, + "learning_rate": 5.699194906252761e-06, + "logits/chosen": -0.47815102338790894, + "logits/rejected": -0.5410321950912476, + "logps/chosen": -63.36065673828125, + "logps/rejected": -89.77430725097656, + "loss": 0.7773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0310921669006348, + "rewards/margins": 5.482353210449219, + "rewards/rejected": -2.451261043548584, + "step": 9100 + }, + { + "epoch": 2.28, + "grad_norm": 7.7550859451293945, + "learning_rate": 5.698416632875553e-06, + "logits/chosen": -0.49197492003440857, + "logits/rejected": -0.53875333070755, + "logps/chosen": -57.130393981933594, + "logps/rejected": -85.44304656982422, + "loss": 0.7634, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.764167547225952, + "rewards/margins": 4.981904983520508, + "rewards/rejected": -2.2177369594573975, + "step": 9101 + }, + { + "epoch": 2.28, + "grad_norm": 7.259522914886475, + "learning_rate": 5.6976383422397305e-06, + "logits/chosen": -0.4891151487827301, + "logits/rejected": -0.5997409820556641, + "logps/chosen": -54.830074310302734, + "logps/rejected": -88.15156555175781, + "loss": 0.6035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1127312183380127, + "rewards/margins": 6.035162925720215, + "rewards/rejected": -2.922431707382202, + "step": 9102 + }, + { + "epoch": 2.28, + "grad_norm": 5.248466491699219, + "learning_rate": 5.696860034364522e-06, + "logits/chosen": -0.526806652545929, + "logits/rejected": -0.5704126358032227, + "logps/chosen": -57.59735107421875, + "logps/rejected": -107.80551147460938, + "loss": 0.6649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.849292278289795, + "rewards/margins": 7.049434661865234, + "rewards/rejected": -4.200141906738281, + "step": 9103 + }, + { + "epoch": 2.28, + "grad_norm": 2.316049337387085, + "learning_rate": 5.69608170926916e-06, + "logits/chosen": -0.4785800576210022, + "logits/rejected": -0.5919417142868042, + "logps/chosen": -57.148319244384766, + "logps/rejected": -84.34046173095703, + "loss": 0.5927, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.040947437286377, + "rewards/margins": 6.466681003570557, + "rewards/rejected": -3.4257333278656006, + "step": 9104 + }, + { + "epoch": 2.28, + "grad_norm": 4.397719860076904, + "learning_rate": 5.695303366972878e-06, + "logits/chosen": -0.5469933748245239, + "logits/rejected": -0.6372524499893188, + "logps/chosen": -62.246456146240234, + "logps/rejected": -102.0614013671875, + "loss": 0.679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2622532844543457, + "rewards/margins": 6.597580432891846, + "rewards/rejected": -3.3353271484375, + "step": 9105 + }, + { + "epoch": 2.28, + "grad_norm": 5.067841053009033, + "learning_rate": 5.694525007494913e-06, + "logits/chosen": -0.5754258036613464, + "logits/rejected": -0.6289839148521423, + "logps/chosen": -53.705963134765625, + "logps/rejected": -101.627197265625, + "loss": 0.6561, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6691439151763916, + "rewards/margins": 5.694338321685791, + "rewards/rejected": -3.025193929672241, + "step": 9106 + }, + { + "epoch": 2.28, + "grad_norm": 4.305809497833252, + "learning_rate": 5.693746630854493e-06, + "logits/chosen": -0.5003746747970581, + "logits/rejected": -0.5783082246780396, + "logps/chosen": -54.12367248535156, + "logps/rejected": -100.32505798339844, + "loss": 0.6555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9114174842834473, + "rewards/margins": 6.158924102783203, + "rewards/rejected": -3.2475063800811768, + "step": 9107 + }, + { + "epoch": 2.28, + "grad_norm": 10.307290077209473, + "learning_rate": 5.692968237070858e-06, + "logits/chosen": -0.4893072545528412, + "logits/rejected": -0.5449612736701965, + "logps/chosen": -53.59593200683594, + "logps/rejected": -99.79194641113281, + "loss": 0.6981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9213907718658447, + "rewards/margins": 5.6658806800842285, + "rewards/rejected": -2.744490385055542, + "step": 9108 + }, + { + "epoch": 2.28, + "grad_norm": 9.810307502746582, + "learning_rate": 5.6921898261632415e-06, + "logits/chosen": -0.4373190701007843, + "logits/rejected": -0.5241376161575317, + "logps/chosen": -62.01232147216797, + "logps/rejected": -90.72369384765625, + "loss": 0.812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7491559982299805, + "rewards/margins": 5.115589618682861, + "rewards/rejected": -2.3664331436157227, + "step": 9109 + }, + { + "epoch": 2.28, + "grad_norm": 2.2472951412200928, + "learning_rate": 5.6914113981508755e-06, + "logits/chosen": -0.5338153839111328, + "logits/rejected": -0.6608244180679321, + "logps/chosen": -62.314029693603516, + "logps/rejected": -90.0594253540039, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0323641300201416, + "rewards/margins": 6.559126853942871, + "rewards/rejected": -3.5267624855041504, + "step": 9110 + }, + { + "epoch": 2.28, + "grad_norm": 4.173068046569824, + "learning_rate": 5.690632953053001e-06, + "logits/chosen": -0.528687596321106, + "logits/rejected": -0.632203221321106, + "logps/chosen": -55.97264862060547, + "logps/rejected": -93.79583740234375, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.950857639312744, + "rewards/margins": 6.546878337860107, + "rewards/rejected": -3.5960206985473633, + "step": 9111 + }, + { + "epoch": 2.28, + "grad_norm": 5.787030220031738, + "learning_rate": 5.6898544908888495e-06, + "logits/chosen": -0.5105345845222473, + "logits/rejected": -0.5557658672332764, + "logps/chosen": -74.7925033569336, + "logps/rejected": -103.05770111083984, + "loss": 0.7549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.958955764770508, + "rewards/margins": 5.845524311065674, + "rewards/rejected": -2.886569023132324, + "step": 9112 + }, + { + "epoch": 2.28, + "grad_norm": 9.049877166748047, + "learning_rate": 5.689076011677661e-06, + "logits/chosen": -0.4590335190296173, + "logits/rejected": -0.5529285669326782, + "logps/chosen": -61.50313186645508, + "logps/rejected": -108.63255310058594, + "loss": 0.7997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.714052677154541, + "rewards/margins": 7.484098434448242, + "rewards/rejected": -4.770046234130859, + "step": 9113 + }, + { + "epoch": 2.28, + "grad_norm": 2.408092737197876, + "learning_rate": 5.688297515438671e-06, + "logits/chosen": -0.5034282207489014, + "logits/rejected": -0.5544413924217224, + "logps/chosen": -52.99598693847656, + "logps/rejected": -101.10215759277344, + "loss": 0.5981, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0562286376953125, + "rewards/margins": 5.514715194702148, + "rewards/rejected": -2.458486795425415, + "step": 9114 + }, + { + "epoch": 2.28, + "grad_norm": 4.697786331176758, + "learning_rate": 5.687519002191117e-06, + "logits/chosen": -0.5516341328620911, + "logits/rejected": -0.6537994146347046, + "logps/chosen": -48.494850158691406, + "logps/rejected": -88.4078598022461, + "loss": 0.5937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.070901870727539, + "rewards/margins": 6.043850898742676, + "rewards/rejected": -2.9729490280151367, + "step": 9115 + }, + { + "epoch": 2.28, + "grad_norm": 6.48952579498291, + "learning_rate": 5.686740471954239e-06, + "logits/chosen": -0.5893812775611877, + "logits/rejected": -0.6595505475997925, + "logps/chosen": -48.484432220458984, + "logps/rejected": -109.55851745605469, + "loss": 0.6578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3107917308807373, + "rewards/margins": 6.634050369262695, + "rewards/rejected": -3.3232581615448, + "step": 9116 + }, + { + "epoch": 2.28, + "grad_norm": 5.402587413787842, + "learning_rate": 5.685961924747271e-06, + "logits/chosen": -0.5165862441062927, + "logits/rejected": -0.5712624192237854, + "logps/chosen": -70.83592224121094, + "logps/rejected": -103.9147720336914, + "loss": 0.7647, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4903199672698975, + "rewards/margins": 5.3883562088012695, + "rewards/rejected": -2.898036241531372, + "step": 9117 + }, + { + "epoch": 2.28, + "grad_norm": 24.701929092407227, + "learning_rate": 5.6851833605894545e-06, + "logits/chosen": -0.5234283208847046, + "logits/rejected": -0.6059310436248779, + "logps/chosen": -58.81462097167969, + "logps/rejected": -90.45284271240234, + "loss": 0.8167, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6302993297576904, + "rewards/margins": 5.761260032653809, + "rewards/rejected": -3.130960464477539, + "step": 9118 + }, + { + "epoch": 2.28, + "grad_norm": 6.6399126052856445, + "learning_rate": 5.684404779500029e-06, + "logits/chosen": -0.4281548261642456, + "logits/rejected": -0.5398096442222595, + "logps/chosen": -53.22207260131836, + "logps/rejected": -96.70177459716797, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.936715841293335, + "rewards/margins": 5.886396884918213, + "rewards/rejected": -2.949680805206299, + "step": 9119 + }, + { + "epoch": 2.28, + "grad_norm": 6.272231101989746, + "learning_rate": 5.683626181498234e-06, + "logits/chosen": -0.5756782293319702, + "logits/rejected": -0.6503809690475464, + "logps/chosen": -50.36990737915039, + "logps/rejected": -82.81035614013672, + "loss": 0.7052, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1609301567077637, + "rewards/margins": 5.201092720031738, + "rewards/rejected": -2.0401623249053955, + "step": 9120 + }, + { + "epoch": 2.28, + "grad_norm": 5.528844833374023, + "learning_rate": 5.6828475666033085e-06, + "logits/chosen": -0.5019922256469727, + "logits/rejected": -0.5341001152992249, + "logps/chosen": -53.22583770751953, + "logps/rejected": -111.02935791015625, + "loss": 0.6307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.305405616760254, + "rewards/margins": 5.762413501739502, + "rewards/rejected": -2.45700740814209, + "step": 9121 + }, + { + "epoch": 2.28, + "grad_norm": 18.036903381347656, + "learning_rate": 5.682068934834494e-06, + "logits/chosen": -0.48544418811798096, + "logits/rejected": -0.619728684425354, + "logps/chosen": -59.895912170410156, + "logps/rejected": -91.61712646484375, + "loss": 0.746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.753688097000122, + "rewards/margins": 6.6350274085998535, + "rewards/rejected": -3.8813390731811523, + "step": 9122 + }, + { + "epoch": 2.28, + "grad_norm": 7.557684421539307, + "learning_rate": 5.681290286211029e-06, + "logits/chosen": -0.48428046703338623, + "logits/rejected": -0.5642520189285278, + "logps/chosen": -51.987640380859375, + "logps/rejected": -91.82513427734375, + "loss": 0.5689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.804654836654663, + "rewards/margins": 5.654959678649902, + "rewards/rejected": -2.85030460357666, + "step": 9123 + }, + { + "epoch": 2.28, + "grad_norm": 5.806033134460449, + "learning_rate": 5.680511620752157e-06, + "logits/chosen": -0.5450239181518555, + "logits/rejected": -0.617392897605896, + "logps/chosen": -54.101402282714844, + "logps/rejected": -96.38363647460938, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763700008392334, + "rewards/margins": 5.350151538848877, + "rewards/rejected": -2.586452007293701, + "step": 9124 + }, + { + "epoch": 2.28, + "grad_norm": 12.412259101867676, + "learning_rate": 5.679732938477121e-06, + "logits/chosen": -0.5169512033462524, + "logits/rejected": -0.6050271987915039, + "logps/chosen": -67.9012451171875, + "logps/rejected": -88.48858642578125, + "loss": 0.7291, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8041281700134277, + "rewards/margins": 4.867399215698242, + "rewards/rejected": -2.0632705688476562, + "step": 9125 + }, + { + "epoch": 2.28, + "grad_norm": 8.062024116516113, + "learning_rate": 5.678954239405159e-06, + "logits/chosen": -0.5071004629135132, + "logits/rejected": -0.5822847485542297, + "logps/chosen": -46.37738800048828, + "logps/rejected": -91.65188598632812, + "loss": 0.6622, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1419029235839844, + "rewards/margins": 6.6713104248046875, + "rewards/rejected": -3.529406785964966, + "step": 9126 + }, + { + "epoch": 2.28, + "grad_norm": 5.966008186340332, + "learning_rate": 5.678175523555515e-06, + "logits/chosen": -0.4310656785964966, + "logits/rejected": -0.5227019190788269, + "logps/chosen": -58.6947021484375, + "logps/rejected": -88.3009262084961, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.103178024291992, + "rewards/margins": 6.021427631378174, + "rewards/rejected": -2.91825008392334, + "step": 9127 + }, + { + "epoch": 2.28, + "grad_norm": 5.8037943840026855, + "learning_rate": 5.677396790947434e-06, + "logits/chosen": -0.505492091178894, + "logits/rejected": -0.6289486885070801, + "logps/chosen": -57.479183197021484, + "logps/rejected": -105.04598999023438, + "loss": 0.7001, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.684567451477051, + "rewards/margins": 5.643584251403809, + "rewards/rejected": -2.959016799926758, + "step": 9128 + }, + { + "epoch": 2.28, + "grad_norm": 5.743059158325195, + "learning_rate": 5.676618041600157e-06, + "logits/chosen": -0.4748518466949463, + "logits/rejected": -0.5506492257118225, + "logps/chosen": -56.64225769042969, + "logps/rejected": -96.87919616699219, + "loss": 0.6246, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9158873558044434, + "rewards/margins": 5.704553604125977, + "rewards/rejected": -2.7886664867401123, + "step": 9129 + }, + { + "epoch": 2.28, + "grad_norm": 5.414534568786621, + "learning_rate": 5.675839275532927e-06, + "logits/chosen": -0.44627517461776733, + "logits/rejected": -0.5562326908111572, + "logps/chosen": -55.67888641357422, + "logps/rejected": -89.0696792602539, + "loss": 0.6567, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9594175815582275, + "rewards/margins": 6.417257308959961, + "rewards/rejected": -3.4578399658203125, + "step": 9130 + }, + { + "epoch": 2.28, + "grad_norm": 4.551272392272949, + "learning_rate": 5.675060492764993e-06, + "logits/chosen": -0.4872884452342987, + "logits/rejected": -0.5277060270309448, + "logps/chosen": -50.073822021484375, + "logps/rejected": -93.23786926269531, + "loss": 0.638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1753759384155273, + "rewards/margins": 5.2317118644714355, + "rewards/rejected": -2.056335926055908, + "step": 9131 + }, + { + "epoch": 2.28, + "grad_norm": 3.610475540161133, + "learning_rate": 5.674281693315594e-06, + "logits/chosen": -0.5084093809127808, + "logits/rejected": -0.5945286154747009, + "logps/chosen": -53.34050369262695, + "logps/rejected": -92.05066680908203, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4286792278289795, + "rewards/margins": 6.399293422698975, + "rewards/rejected": -2.970614194869995, + "step": 9132 + }, + { + "epoch": 2.28, + "grad_norm": 4.6378374099731445, + "learning_rate": 5.673502877203978e-06, + "logits/chosen": -0.48988932371139526, + "logits/rejected": -0.5420144200325012, + "logps/chosen": -50.70415115356445, + "logps/rejected": -89.15789794921875, + "loss": 0.7371, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.314213752746582, + "rewards/margins": 5.403076648712158, + "rewards/rejected": -2.0888631343841553, + "step": 9133 + }, + { + "epoch": 2.29, + "grad_norm": 1.8662816286087036, + "learning_rate": 5.67272404444939e-06, + "logits/chosen": -0.573363184928894, + "logits/rejected": -0.6562028527259827, + "logps/chosen": -49.437965393066406, + "logps/rejected": -84.0829086303711, + "loss": 0.6286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0666160583496094, + "rewards/margins": 6.013054370880127, + "rewards/rejected": -2.9464380741119385, + "step": 9134 + }, + { + "epoch": 2.29, + "grad_norm": 4.381516456604004, + "learning_rate": 5.671945195071075e-06, + "logits/chosen": -0.534938633441925, + "logits/rejected": -0.6172246336936951, + "logps/chosen": -55.41731262207031, + "logps/rejected": -87.0936508178711, + "loss": 0.6669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.909102201461792, + "rewards/margins": 6.2570624351501465, + "rewards/rejected": -3.3479602336883545, + "step": 9135 + }, + { + "epoch": 2.29, + "grad_norm": 3.0618271827697754, + "learning_rate": 5.671166329088278e-06, + "logits/chosen": -0.5199017524719238, + "logits/rejected": -0.5463385581970215, + "logps/chosen": -52.13743209838867, + "logps/rejected": -104.61007690429688, + "loss": 0.6581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1679413318634033, + "rewards/margins": 5.8157267570495605, + "rewards/rejected": -2.647785186767578, + "step": 9136 + }, + { + "epoch": 2.29, + "grad_norm": 3.671665668487549, + "learning_rate": 5.670387446520248e-06, + "logits/chosen": -0.46046149730682373, + "logits/rejected": -0.5220041275024414, + "logps/chosen": -48.15797805786133, + "logps/rejected": -85.57347106933594, + "loss": 0.6966, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.013335704803467, + "rewards/margins": 5.652549743652344, + "rewards/rejected": -2.639213800430298, + "step": 9137 + }, + { + "epoch": 2.29, + "grad_norm": 4.154393672943115, + "learning_rate": 5.669608547386233e-06, + "logits/chosen": -0.5145843625068665, + "logits/rejected": -0.645395815372467, + "logps/chosen": -56.63932418823242, + "logps/rejected": -80.49256896972656, + "loss": 0.6449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2780468463897705, + "rewards/margins": 6.057196617126465, + "rewards/rejected": -2.7791495323181152, + "step": 9138 + }, + { + "epoch": 2.29, + "grad_norm": 3.9200055599212646, + "learning_rate": 5.6688296317054745e-06, + "logits/chosen": -0.5388223528862, + "logits/rejected": -0.5958200693130493, + "logps/chosen": -55.858131408691406, + "logps/rejected": -109.50648498535156, + "loss": 0.6643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051098108291626, + "rewards/margins": 6.551590442657471, + "rewards/rejected": -3.5004920959472656, + "step": 9139 + }, + { + "epoch": 2.29, + "grad_norm": 6.31536865234375, + "learning_rate": 5.668050699497229e-06, + "logits/chosen": -0.4314672648906708, + "logits/rejected": -0.5117413997650146, + "logps/chosen": -55.4589729309082, + "logps/rejected": -90.2042465209961, + "loss": 0.7082, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7868833541870117, + "rewards/margins": 4.7143120765686035, + "rewards/rejected": -1.9274282455444336, + "step": 9140 + }, + { + "epoch": 2.29, + "grad_norm": 12.090485572814941, + "learning_rate": 5.6672717507807375e-06, + "logits/chosen": -0.4088982939720154, + "logits/rejected": -0.4561176300048828, + "logps/chosen": -54.242210388183594, + "logps/rejected": -106.16523742675781, + "loss": 0.7408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8534340858459473, + "rewards/margins": 5.859294414520264, + "rewards/rejected": -3.0058603286743164, + "step": 9141 + }, + { + "epoch": 2.29, + "grad_norm": 13.3565673828125, + "learning_rate": 5.666492785575251e-06, + "logits/chosen": -0.4957607388496399, + "logits/rejected": -0.5397617816925049, + "logps/chosen": -54.63694763183594, + "logps/rejected": -109.81523895263672, + "loss": 0.7593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6668474674224854, + "rewards/margins": 5.29914665222168, + "rewards/rejected": -2.6322996616363525, + "step": 9142 + }, + { + "epoch": 2.29, + "grad_norm": 14.099859237670898, + "learning_rate": 5.66571380390002e-06, + "logits/chosen": -0.5410454273223877, + "logits/rejected": -0.610554039478302, + "logps/chosen": -66.93447875976562, + "logps/rejected": -89.8760986328125, + "loss": 0.8081, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.88427734375, + "rewards/margins": 5.988669395446777, + "rewards/rejected": -3.1043922901153564, + "step": 9143 + }, + { + "epoch": 2.29, + "grad_norm": 9.604846954345703, + "learning_rate": 5.664934805774293e-06, + "logits/chosen": -0.4713103175163269, + "logits/rejected": -0.5378715991973877, + "logps/chosen": -57.22847366333008, + "logps/rejected": -102.41431427001953, + "loss": 0.7384, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9617016315460205, + "rewards/margins": 5.190577983856201, + "rewards/rejected": -2.2288761138916016, + "step": 9144 + }, + { + "epoch": 2.29, + "grad_norm": 5.258252143859863, + "learning_rate": 5.664155791217318e-06, + "logits/chosen": -0.4773835241794586, + "logits/rejected": -0.5731011629104614, + "logps/chosen": -54.14284133911133, + "logps/rejected": -108.613037109375, + "loss": 0.6918, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.82369065284729, + "rewards/margins": 6.269386291503906, + "rewards/rejected": -3.4456958770751953, + "step": 9145 + }, + { + "epoch": 2.29, + "grad_norm": 8.241548538208008, + "learning_rate": 5.663376760248348e-06, + "logits/chosen": -0.40774935483932495, + "logits/rejected": -0.5058150291442871, + "logps/chosen": -66.56478118896484, + "logps/rejected": -112.15840148925781, + "loss": 0.6994, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1186447143554688, + "rewards/margins": 7.015481472015381, + "rewards/rejected": -3.896836757659912, + "step": 9146 + }, + { + "epoch": 2.29, + "grad_norm": 3.39162540435791, + "learning_rate": 5.6625977128866325e-06, + "logits/chosen": -0.4464028775691986, + "logits/rejected": -0.5517823696136475, + "logps/chosen": -55.67108154296875, + "logps/rejected": -107.61502075195312, + "loss": 0.6861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9347143173217773, + "rewards/margins": 6.527772903442383, + "rewards/rejected": -3.5930583477020264, + "step": 9147 + }, + { + "epoch": 2.29, + "grad_norm": 10.286566734313965, + "learning_rate": 5.661818649151422e-06, + "logits/chosen": -0.5430500507354736, + "logits/rejected": -0.6048232913017273, + "logps/chosen": -70.31068420410156, + "logps/rejected": -114.86865234375, + "loss": 0.8523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.549318790435791, + "rewards/margins": 6.899710178375244, + "rewards/rejected": -4.350391387939453, + "step": 9148 + }, + { + "epoch": 2.29, + "grad_norm": 6.867060661315918, + "learning_rate": 5.661039569061973e-06, + "logits/chosen": -0.46309322118759155, + "logits/rejected": -0.5420218110084534, + "logps/chosen": -60.63713073730469, + "logps/rejected": -111.48544311523438, + "loss": 0.7468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0982747077941895, + "rewards/margins": 6.337937355041504, + "rewards/rejected": -3.2396624088287354, + "step": 9149 + }, + { + "epoch": 2.29, + "grad_norm": 8.168249130249023, + "learning_rate": 5.660260472637529e-06, + "logits/chosen": -0.5035961866378784, + "logits/rejected": -0.5929789543151855, + "logps/chosen": -57.590938568115234, + "logps/rejected": -94.55519104003906, + "loss": 0.7262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0429186820983887, + "rewards/margins": 5.559401512145996, + "rewards/rejected": -2.5164835453033447, + "step": 9150 + }, + { + "epoch": 2.29, + "grad_norm": 3.370142698287964, + "learning_rate": 5.659481359897346e-06, + "logits/chosen": -0.5016002058982849, + "logits/rejected": -0.6005058884620667, + "logps/chosen": -58.95933532714844, + "logps/rejected": -91.252685546875, + "loss": 0.6807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0799448490142822, + "rewards/margins": 5.788922309875488, + "rewards/rejected": -2.708977460861206, + "step": 9151 + }, + { + "epoch": 2.29, + "grad_norm": 11.124163627624512, + "learning_rate": 5.65870223086068e-06, + "logits/chosen": -0.4953635334968567, + "logits/rejected": -0.5140032172203064, + "logps/chosen": -54.45463943481445, + "logps/rejected": -113.2563705444336, + "loss": 0.711, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9707562923431396, + "rewards/margins": 5.89453125, + "rewards/rejected": -2.9237749576568604, + "step": 9152 + }, + { + "epoch": 2.29, + "grad_norm": 4.006335258483887, + "learning_rate": 5.657923085546778e-06, + "logits/chosen": -0.459614098072052, + "logits/rejected": -0.5757334232330322, + "logps/chosen": -48.411048889160156, + "logps/rejected": -115.30307006835938, + "loss": 0.519, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114806890487671, + "rewards/margins": 7.515923500061035, + "rewards/rejected": -4.401115894317627, + "step": 9153 + }, + { + "epoch": 2.29, + "grad_norm": 3.977174997329712, + "learning_rate": 5.657143923974899e-06, + "logits/chosen": -0.39503365755081177, + "logits/rejected": -0.4590897858142853, + "logps/chosen": -64.6047134399414, + "logps/rejected": -107.74664306640625, + "loss": 0.7239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.323411464691162, + "rewards/margins": 7.231243133544922, + "rewards/rejected": -3.9078311920166016, + "step": 9154 + }, + { + "epoch": 2.29, + "grad_norm": 15.516617774963379, + "learning_rate": 5.656364746164294e-06, + "logits/chosen": -0.5307008624076843, + "logits/rejected": -0.6375695466995239, + "logps/chosen": -62.597923278808594, + "logps/rejected": -102.46746826171875, + "loss": 0.6254, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.038365364074707, + "rewards/margins": 6.6035003662109375, + "rewards/rejected": -3.5651350021362305, + "step": 9155 + }, + { + "epoch": 2.29, + "grad_norm": 9.044127464294434, + "learning_rate": 5.655585552134218e-06, + "logits/chosen": -0.45178458094596863, + "logits/rejected": -0.5297962427139282, + "logps/chosen": -62.30374526977539, + "logps/rejected": -84.38428497314453, + "loss": 0.7314, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9678235054016113, + "rewards/margins": 4.798260688781738, + "rewards/rejected": -1.8304373025894165, + "step": 9156 + }, + { + "epoch": 2.29, + "grad_norm": 6.739795684814453, + "learning_rate": 5.654806341903927e-06, + "logits/chosen": -0.5273644924163818, + "logits/rejected": -0.6117324233055115, + "logps/chosen": -66.48554992675781, + "logps/rejected": -89.59072875976562, + "loss": 0.7591, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8396048545837402, + "rewards/margins": 5.583905220031738, + "rewards/rejected": -2.7443008422851562, + "step": 9157 + }, + { + "epoch": 2.29, + "grad_norm": 6.496920108795166, + "learning_rate": 5.654027115492673e-06, + "logits/chosen": -0.47546136379241943, + "logits/rejected": -0.5939603447914124, + "logps/chosen": -57.04190444946289, + "logps/rejected": -83.67831420898438, + "loss": 0.7025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.034956932067871, + "rewards/margins": 6.264770984649658, + "rewards/rejected": -3.229814052581787, + "step": 9158 + }, + { + "epoch": 2.29, + "grad_norm": 10.058090209960938, + "learning_rate": 5.653247872919714e-06, + "logits/chosen": -0.4953737258911133, + "logits/rejected": -0.5359509587287903, + "logps/chosen": -60.24073791503906, + "logps/rejected": -100.69769287109375, + "loss": 0.8345, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.807290554046631, + "rewards/margins": 5.988852500915527, + "rewards/rejected": -3.1815619468688965, + "step": 9159 + }, + { + "epoch": 2.29, + "grad_norm": 4.051291465759277, + "learning_rate": 5.652468614204305e-06, + "logits/chosen": -0.5404173731803894, + "logits/rejected": -0.6139032244682312, + "logps/chosen": -53.383609771728516, + "logps/rejected": -96.66792297363281, + "loss": 0.6005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9183337688446045, + "rewards/margins": 6.518160820007324, + "rewards/rejected": -3.5998265743255615, + "step": 9160 + }, + { + "epoch": 2.29, + "grad_norm": 5.6886420249938965, + "learning_rate": 5.651689339365702e-06, + "logits/chosen": -0.45224177837371826, + "logits/rejected": -0.5132946372032166, + "logps/chosen": -66.72265625, + "logps/rejected": -82.06322479248047, + "loss": 0.7483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.886690855026245, + "rewards/margins": 4.453252792358398, + "rewards/rejected": -1.5665618181228638, + "step": 9161 + }, + { + "epoch": 2.29, + "grad_norm": 12.561027526855469, + "learning_rate": 5.650910048423164e-06, + "logits/chosen": -0.47553449869155884, + "logits/rejected": -0.5253983736038208, + "logps/chosen": -57.031253814697266, + "logps/rejected": -97.8384780883789, + "loss": 0.7342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9954943656921387, + "rewards/margins": 5.423718452453613, + "rewards/rejected": -2.428224563598633, + "step": 9162 + }, + { + "epoch": 2.29, + "grad_norm": 4.6975884437561035, + "learning_rate": 5.650130741395946e-06, + "logits/chosen": -0.525867223739624, + "logits/rejected": -0.6128253936767578, + "logps/chosen": -43.039207458496094, + "logps/rejected": -108.40302276611328, + "loss": 0.585, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0309154987335205, + "rewards/margins": 7.7283172607421875, + "rewards/rejected": -4.697401523590088, + "step": 9163 + }, + { + "epoch": 2.29, + "grad_norm": 5.237983226776123, + "learning_rate": 5.649351418303305e-06, + "logits/chosen": -0.5364336967468262, + "logits/rejected": -0.5756915807723999, + "logps/chosen": -56.46244812011719, + "logps/rejected": -105.99201965332031, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0447561740875244, + "rewards/margins": 6.903383255004883, + "rewards/rejected": -3.8586266040802, + "step": 9164 + }, + { + "epoch": 2.29, + "grad_norm": 5.599919319152832, + "learning_rate": 5.648572079164499e-06, + "logits/chosen": -0.5782816410064697, + "logits/rejected": -0.6037266254425049, + "logps/chosen": -42.05680465698242, + "logps/rejected": -96.76261138916016, + "loss": 0.6521, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087601661682129, + "rewards/margins": 5.603941917419434, + "rewards/rejected": -2.516340732574463, + "step": 9165 + }, + { + "epoch": 2.29, + "grad_norm": 4.664135932922363, + "learning_rate": 5.647792723998789e-06, + "logits/chosen": -0.49119535088539124, + "logits/rejected": -0.5935904383659363, + "logps/chosen": -61.314910888671875, + "logps/rejected": -104.82804107666016, + "loss": 0.6425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9628403186798096, + "rewards/margins": 7.147250652313232, + "rewards/rejected": -4.184410572052002, + "step": 9166 + }, + { + "epoch": 2.29, + "grad_norm": 6.225367546081543, + "learning_rate": 5.64701335282543e-06, + "logits/chosen": -0.5058077573776245, + "logits/rejected": -0.5510267019271851, + "logps/chosen": -43.420413970947266, + "logps/rejected": -96.23237609863281, + "loss": 0.5916, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.953871250152588, + "rewards/margins": 6.290669918060303, + "rewards/rejected": -3.336798906326294, + "step": 9167 + }, + { + "epoch": 2.29, + "grad_norm": 4.854351997375488, + "learning_rate": 5.646233965663685e-06, + "logits/chosen": -0.5442261695861816, + "logits/rejected": -0.6339946985244751, + "logps/chosen": -60.86119079589844, + "logps/rejected": -98.97220611572266, + "loss": 0.7622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.400299549102783, + "rewards/margins": 5.4412455558776855, + "rewards/rejected": -2.0409464836120605, + "step": 9168 + }, + { + "epoch": 2.29, + "grad_norm": 8.775702476501465, + "learning_rate": 5.64545456253281e-06, + "logits/chosen": -0.42454758286476135, + "logits/rejected": -0.5149153470993042, + "logps/chosen": -58.799163818359375, + "logps/rejected": -97.71220397949219, + "loss": 0.6096, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1432747840881348, + "rewards/margins": 6.277302265167236, + "rewards/rejected": -3.1340274810791016, + "step": 9169 + }, + { + "epoch": 2.29, + "grad_norm": 16.04994010925293, + "learning_rate": 5.644675143452065e-06, + "logits/chosen": -0.48377928137779236, + "logits/rejected": -0.5787739753723145, + "logps/chosen": -59.960262298583984, + "logps/rejected": -90.91123962402344, + "loss": 0.7555, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.034198522567749, + "rewards/margins": 5.859321117401123, + "rewards/rejected": -2.8251230716705322, + "step": 9170 + }, + { + "epoch": 2.29, + "grad_norm": 4.307497978210449, + "learning_rate": 5.643895708440713e-06, + "logits/chosen": -0.4496486783027649, + "logits/rejected": -0.5479884743690491, + "logps/chosen": -52.862266540527344, + "logps/rejected": -99.58385467529297, + "loss": 0.6953, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8686792850494385, + "rewards/margins": 6.454735279083252, + "rewards/rejected": -3.5860559940338135, + "step": 9171 + }, + { + "epoch": 2.29, + "grad_norm": 7.364035606384277, + "learning_rate": 5.643116257518013e-06, + "logits/chosen": -0.37843775749206543, + "logits/rejected": -0.45894408226013184, + "logps/chosen": -65.02495574951172, + "logps/rejected": -115.28010559082031, + "loss": 0.7436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.694653034210205, + "rewards/margins": 5.987612724304199, + "rewards/rejected": -3.292959213256836, + "step": 9172 + }, + { + "epoch": 2.29, + "grad_norm": 3.8182318210601807, + "learning_rate": 5.642336790703225e-06, + "logits/chosen": -0.4547889828681946, + "logits/rejected": -0.5498131513595581, + "logps/chosen": -59.32861328125, + "logps/rejected": -91.8329086303711, + "loss": 0.6429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4723448753356934, + "rewards/margins": 6.508211612701416, + "rewards/rejected": -3.0358662605285645, + "step": 9173 + }, + { + "epoch": 2.3, + "grad_norm": 4.56858491897583, + "learning_rate": 5.641557308015614e-06, + "logits/chosen": -0.4422755241394043, + "logits/rejected": -0.5635557174682617, + "logps/chosen": -59.07261276245117, + "logps/rejected": -93.60694885253906, + "loss": 0.7113, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7944955825805664, + "rewards/margins": 6.288480281829834, + "rewards/rejected": -3.4939844608306885, + "step": 9174 + }, + { + "epoch": 2.3, + "grad_norm": 7.257884979248047, + "learning_rate": 5.640777809474439e-06, + "logits/chosen": -0.47278282046318054, + "logits/rejected": -0.5415166616439819, + "logps/chosen": -64.30389404296875, + "logps/rejected": -98.68306732177734, + "loss": 0.777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.974226951599121, + "rewards/margins": 4.894819259643555, + "rewards/rejected": -1.9205924272537231, + "step": 9175 + }, + { + "epoch": 2.3, + "grad_norm": 9.28867244720459, + "learning_rate": 5.639998295098961e-06, + "logits/chosen": -0.503166675567627, + "logits/rejected": -0.5977368354797363, + "logps/chosen": -49.79018020629883, + "logps/rejected": -112.7909164428711, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2216856479644775, + "rewards/margins": 7.75797700881958, + "rewards/rejected": -4.536291122436523, + "step": 9176 + }, + { + "epoch": 2.3, + "grad_norm": 13.150675773620605, + "learning_rate": 5.639218764908446e-06, + "logits/chosen": -0.4882630407810211, + "logits/rejected": -0.5603383779525757, + "logps/chosen": -65.37115478515625, + "logps/rejected": -94.55693817138672, + "loss": 0.9693, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.696826457977295, + "rewards/margins": 5.361191749572754, + "rewards/rejected": -2.664365291595459, + "step": 9177 + }, + { + "epoch": 2.3, + "grad_norm": 4.651392936706543, + "learning_rate": 5.6384392189221544e-06, + "logits/chosen": -0.511786162853241, + "logits/rejected": -0.5443544387817383, + "logps/chosen": -56.775482177734375, + "logps/rejected": -113.85553741455078, + "loss": 0.641, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.232703685760498, + "rewards/margins": 7.456184387207031, + "rewards/rejected": -4.223480224609375, + "step": 9178 + }, + { + "epoch": 2.3, + "grad_norm": 16.555622100830078, + "learning_rate": 5.637659657159351e-06, + "logits/chosen": -0.41952866315841675, + "logits/rejected": -0.46256589889526367, + "logps/chosen": -67.45870208740234, + "logps/rejected": -104.10079193115234, + "loss": 0.7185, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3171839714050293, + "rewards/margins": 5.3646135330200195, + "rewards/rejected": -2.047429084777832, + "step": 9179 + }, + { + "epoch": 2.3, + "grad_norm": 6.951105117797852, + "learning_rate": 5.636880079639299e-06, + "logits/chosen": -0.5293366312980652, + "logits/rejected": -0.6122285723686218, + "logps/chosen": -50.49753189086914, + "logps/rejected": -80.59430694580078, + "loss": 0.7861, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.202916383743286, + "rewards/margins": 4.906935691833496, + "rewards/rejected": -1.704019546508789, + "step": 9180 + }, + { + "epoch": 2.3, + "grad_norm": 7.809295654296875, + "learning_rate": 5.636100486381265e-06, + "logits/chosen": -0.49370697140693665, + "logits/rejected": -0.6095936298370361, + "logps/chosen": -46.98512649536133, + "logps/rejected": -81.88117218017578, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.270376682281494, + "rewards/margins": 5.4558000564575195, + "rewards/rejected": -2.1854233741760254, + "step": 9181 + }, + { + "epoch": 2.3, + "grad_norm": 3.8312737941741943, + "learning_rate": 5.6353208774045085e-06, + "logits/chosen": -0.4782988727092743, + "logits/rejected": -0.5648248195648193, + "logps/chosen": -57.537261962890625, + "logps/rejected": -96.75028991699219, + "loss": 0.6251, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1727874279022217, + "rewards/margins": 5.634267807006836, + "rewards/rejected": -2.4614803791046143, + "step": 9182 + }, + { + "epoch": 2.3, + "grad_norm": 8.734572410583496, + "learning_rate": 5.634541252728299e-06, + "logits/chosen": -0.5669786334037781, + "logits/rejected": -0.6172584891319275, + "logps/chosen": -46.78231430053711, + "logps/rejected": -104.07917022705078, + "loss": 0.5727, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.136828660964966, + "rewards/margins": 6.513150215148926, + "rewards/rejected": -3.376321792602539, + "step": 9183 + }, + { + "epoch": 2.3, + "grad_norm": 4.889770984649658, + "learning_rate": 5.633761612371899e-06, + "logits/chosen": -0.5186023712158203, + "logits/rejected": -0.6195513606071472, + "logps/chosen": -59.35624694824219, + "logps/rejected": -90.42857360839844, + "loss": 0.6211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.133225440979004, + "rewards/margins": 5.916970252990723, + "rewards/rejected": -2.7837445735931396, + "step": 9184 + }, + { + "epoch": 2.3, + "grad_norm": 7.482025623321533, + "learning_rate": 5.632981956354576e-06, + "logits/chosen": -0.5519683361053467, + "logits/rejected": -0.6278171539306641, + "logps/chosen": -62.67912673950195, + "logps/rejected": -83.76068115234375, + "loss": 0.7399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1965534687042236, + "rewards/margins": 5.096076011657715, + "rewards/rejected": -1.8995230197906494, + "step": 9185 + }, + { + "epoch": 2.3, + "grad_norm": 4.323368549346924, + "learning_rate": 5.632202284695597e-06, + "logits/chosen": -0.4811534881591797, + "logits/rejected": -0.5535445213317871, + "logps/chosen": -61.71440124511719, + "logps/rejected": -98.56553649902344, + "loss": 0.7058, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.220536708831787, + "rewards/margins": 5.793243885040283, + "rewards/rejected": -2.572706937789917, + "step": 9186 + }, + { + "epoch": 2.3, + "grad_norm": 5.475390911102295, + "learning_rate": 5.631422597414226e-06, + "logits/chosen": -0.4986443519592285, + "logits/rejected": -0.6209498643875122, + "logps/chosen": -54.627952575683594, + "logps/rejected": -106.8380355834961, + "loss": 0.5617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.957634925842285, + "rewards/margins": 6.223848342895508, + "rewards/rejected": -3.2662129402160645, + "step": 9187 + }, + { + "epoch": 2.3, + "grad_norm": 2.8757503032684326, + "learning_rate": 5.63064289452973e-06, + "logits/chosen": -0.4832199513912201, + "logits/rejected": -0.5473623275756836, + "logps/chosen": -52.913848876953125, + "logps/rejected": -106.2624282836914, + "loss": 0.6648, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.243232011795044, + "rewards/margins": 6.724325180053711, + "rewards/rejected": -3.481092929840088, + "step": 9188 + }, + { + "epoch": 2.3, + "grad_norm": 5.7194719314575195, + "learning_rate": 5.629863176061379e-06, + "logits/chosen": -0.5594578981399536, + "logits/rejected": -0.6518686413764954, + "logps/chosen": -60.032833099365234, + "logps/rejected": -87.84855651855469, + "loss": 0.8452, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2670421600341797, + "rewards/margins": 5.520430088043213, + "rewards/rejected": -2.253387928009033, + "step": 9189 + }, + { + "epoch": 2.3, + "grad_norm": 20.176963806152344, + "learning_rate": 5.6290834420284365e-06, + "logits/chosen": -0.4826069176197052, + "logits/rejected": -0.5743672847747803, + "logps/chosen": -50.500526428222656, + "logps/rejected": -94.9080810546875, + "loss": 0.6885, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8769118785858154, + "rewards/margins": 5.9369730949401855, + "rewards/rejected": -3.06006121635437, + "step": 9190 + }, + { + "epoch": 2.3, + "grad_norm": 9.067537307739258, + "learning_rate": 5.628303692450175e-06, + "logits/chosen": -0.5101779103279114, + "logits/rejected": -0.5746707916259766, + "logps/chosen": -42.10557556152344, + "logps/rejected": -92.93788146972656, + "loss": 0.6606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9243884086608887, + "rewards/margins": 5.593687534332275, + "rewards/rejected": -2.6692988872528076, + "step": 9191 + }, + { + "epoch": 2.3, + "grad_norm": 10.110991477966309, + "learning_rate": 5.627523927345861e-06, + "logits/chosen": -0.43870145082473755, + "logits/rejected": -0.5886706709861755, + "logps/chosen": -49.49803924560547, + "logps/rejected": -101.57357025146484, + "loss": 0.6948, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9083609580993652, + "rewards/margins": 6.274635314941406, + "rewards/rejected": -3.36627459526062, + "step": 9192 + }, + { + "epoch": 2.3, + "grad_norm": 2.836613416671753, + "learning_rate": 5.626744146734763e-06, + "logits/chosen": -0.39282503724098206, + "logits/rejected": -0.5172122716903687, + "logps/chosen": -55.0981559753418, + "logps/rejected": -85.88687896728516, + "loss": 0.5843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8184661865234375, + "rewards/margins": 6.285820484161377, + "rewards/rejected": -3.4673542976379395, + "step": 9193 + }, + { + "epoch": 2.3, + "grad_norm": 4.587347984313965, + "learning_rate": 5.62596435063615e-06, + "logits/chosen": -0.49391239881515503, + "logits/rejected": -0.5606880784034729, + "logps/chosen": -62.18037414550781, + "logps/rejected": -95.89470672607422, + "loss": 0.7229, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0122575759887695, + "rewards/margins": 5.916353702545166, + "rewards/rejected": -2.9040958881378174, + "step": 9194 + }, + { + "epoch": 2.3, + "grad_norm": 2.9413645267486572, + "learning_rate": 5.625184539069294e-06, + "logits/chosen": -0.509515643119812, + "logits/rejected": -0.6158809661865234, + "logps/chosen": -56.81515884399414, + "logps/rejected": -100.78125, + "loss": 0.6361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9507837295532227, + "rewards/margins": 6.901351451873779, + "rewards/rejected": -3.9505677223205566, + "step": 9195 + }, + { + "epoch": 2.3, + "grad_norm": 14.218938827514648, + "learning_rate": 5.62440471205346e-06, + "logits/chosen": -0.4996413588523865, + "logits/rejected": -0.565007746219635, + "logps/chosen": -51.76112747192383, + "logps/rejected": -101.72742462158203, + "loss": 0.7127, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.822251558303833, + "rewards/margins": 6.121498107910156, + "rewards/rejected": -3.2992472648620605, + "step": 9196 + }, + { + "epoch": 2.3, + "grad_norm": 2.9398810863494873, + "learning_rate": 5.623624869607923e-06, + "logits/chosen": -0.44856733083724976, + "logits/rejected": -0.5362179279327393, + "logps/chosen": -54.355247497558594, + "logps/rejected": -102.95557403564453, + "loss": 0.6582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.90755558013916, + "rewards/margins": 6.285531044006348, + "rewards/rejected": -3.3779754638671875, + "step": 9197 + }, + { + "epoch": 2.3, + "grad_norm": 5.511178493499756, + "learning_rate": 5.622845011751954e-06, + "logits/chosen": -0.4867292046546936, + "logits/rejected": -0.5760269165039062, + "logps/chosen": -51.77488708496094, + "logps/rejected": -85.62886810302734, + "loss": 0.7458, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8300631046295166, + "rewards/margins": 6.046624183654785, + "rewards/rejected": -3.2165610790252686, + "step": 9198 + }, + { + "epoch": 2.3, + "grad_norm": 22.621610641479492, + "learning_rate": 5.622065138504818e-06, + "logits/chosen": -0.49634116888046265, + "logits/rejected": -0.6005798578262329, + "logps/chosen": -63.00505447387695, + "logps/rejected": -86.42948913574219, + "loss": 0.7786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.810471773147583, + "rewards/margins": 5.180499076843262, + "rewards/rejected": -2.3700270652770996, + "step": 9199 + }, + { + "epoch": 2.3, + "grad_norm": 4.6912641525268555, + "learning_rate": 5.621285249885795e-06, + "logits/chosen": -0.38071155548095703, + "logits/rejected": -0.47332602739334106, + "logps/chosen": -57.7038459777832, + "logps/rejected": -90.36181640625, + "loss": 0.5591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026592254638672, + "rewards/margins": 5.875936985015869, + "rewards/rejected": -2.849344491958618, + "step": 9200 + }, + { + "epoch": 2.3, + "grad_norm": 4.231072902679443, + "learning_rate": 5.62050534591415e-06, + "logits/chosen": -0.45274871587753296, + "logits/rejected": -0.5304252505302429, + "logps/chosen": -52.887474060058594, + "logps/rejected": -83.07361602783203, + "loss": 0.6832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1256237030029297, + "rewards/margins": 5.727047920227051, + "rewards/rejected": -2.6014244556427, + "step": 9201 + }, + { + "epoch": 2.3, + "grad_norm": 4.370391368865967, + "learning_rate": 5.6197254266091584e-06, + "logits/chosen": -0.44332823157310486, + "logits/rejected": -0.5264630913734436, + "logps/chosen": -65.89685821533203, + "logps/rejected": -90.64749145507812, + "loss": 0.6533, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.081716537475586, + "rewards/margins": 5.0592451095581055, + "rewards/rejected": -1.9775288105010986, + "step": 9202 + }, + { + "epoch": 2.3, + "grad_norm": 17.273414611816406, + "learning_rate": 5.618945491990092e-06, + "logits/chosen": -0.4611760079860687, + "logits/rejected": -0.5479170083999634, + "logps/chosen": -61.128387451171875, + "logps/rejected": -99.64545440673828, + "loss": 0.7101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.062842845916748, + "rewards/margins": 5.962137699127197, + "rewards/rejected": -2.899294853210449, + "step": 9203 + }, + { + "epoch": 2.3, + "grad_norm": 4.690375804901123, + "learning_rate": 5.618165542076226e-06, + "logits/chosen": -0.5738059282302856, + "logits/rejected": -0.6666611433029175, + "logps/chosen": -55.12702941894531, + "logps/rejected": -86.78271484375, + "loss": 0.729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.920156955718994, + "rewards/margins": 5.2457170486450195, + "rewards/rejected": -2.3255600929260254, + "step": 9204 + }, + { + "epoch": 2.3, + "grad_norm": 4.455809593200684, + "learning_rate": 5.617385576886829e-06, + "logits/chosen": -0.5877995491027832, + "logits/rejected": -0.6782130002975464, + "logps/chosen": -56.19417953491211, + "logps/rejected": -87.3947525024414, + "loss": 0.6812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0342798233032227, + "rewards/margins": 6.240367889404297, + "rewards/rejected": -3.206087827682495, + "step": 9205 + }, + { + "epoch": 2.3, + "grad_norm": 3.341830015182495, + "learning_rate": 5.6166055964411815e-06, + "logits/chosen": -0.4727616012096405, + "logits/rejected": -0.5517866611480713, + "logps/chosen": -47.19671630859375, + "logps/rejected": -90.9731674194336, + "loss": 0.6257, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0899784564971924, + "rewards/margins": 6.914494514465332, + "rewards/rejected": -3.8245160579681396, + "step": 9206 + }, + { + "epoch": 2.3, + "grad_norm": 4.717611789703369, + "learning_rate": 5.615825600758551e-06, + "logits/chosen": -0.4279753565788269, + "logits/rejected": -0.5838468074798584, + "logps/chosen": -55.79376983642578, + "logps/rejected": -101.29008483886719, + "loss": 0.6164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.122025489807129, + "rewards/margins": 7.537910461425781, + "rewards/rejected": -4.415884494781494, + "step": 9207 + }, + { + "epoch": 2.3, + "grad_norm": 4.225505828857422, + "learning_rate": 5.615045589858216e-06, + "logits/chosen": -0.516318142414093, + "logits/rejected": -0.585432231426239, + "logps/chosen": -45.35070037841797, + "logps/rejected": -86.07637023925781, + "loss": 0.6912, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0253994464874268, + "rewards/margins": 5.833984851837158, + "rewards/rejected": -2.8085854053497314, + "step": 9208 + }, + { + "epoch": 2.3, + "grad_norm": 3.2021172046661377, + "learning_rate": 5.61426556375945e-06, + "logits/chosen": -0.4965158700942993, + "logits/rejected": -0.551638126373291, + "logps/chosen": -50.04665756225586, + "logps/rejected": -107.5569839477539, + "loss": 0.5479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.048325538635254, + "rewards/margins": 6.860491752624512, + "rewards/rejected": -3.8121652603149414, + "step": 9209 + }, + { + "epoch": 2.3, + "grad_norm": 1.8708430528640747, + "learning_rate": 5.6134855224815276e-06, + "logits/chosen": -0.4957035183906555, + "logits/rejected": -0.604560911655426, + "logps/chosen": -46.524234771728516, + "logps/rejected": -99.57597351074219, + "loss": 0.5375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.173135280609131, + "rewards/margins": 7.419300079345703, + "rewards/rejected": -4.2461652755737305, + "step": 9210 + }, + { + "epoch": 2.3, + "grad_norm": 6.090712070465088, + "learning_rate": 5.612705466043728e-06, + "logits/chosen": -0.5013512969017029, + "logits/rejected": -0.5635157227516174, + "logps/chosen": -56.0845947265625, + "logps/rejected": -95.08675384521484, + "loss": 0.73, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.027576446533203, + "rewards/margins": 6.173804759979248, + "rewards/rejected": -3.146228551864624, + "step": 9211 + }, + { + "epoch": 2.3, + "grad_norm": 4.09005069732666, + "learning_rate": 5.611925394465322e-06, + "logits/chosen": -0.5238121747970581, + "logits/rejected": -0.5968433022499084, + "logps/chosen": -40.237667083740234, + "logps/rejected": -87.21202087402344, + "loss": 0.6228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2560200691223145, + "rewards/margins": 5.809112071990967, + "rewards/rejected": -2.5530920028686523, + "step": 9212 + }, + { + "epoch": 2.3, + "grad_norm": 26.58403778076172, + "learning_rate": 5.6111453077655895e-06, + "logits/chosen": -0.49410614371299744, + "logits/rejected": -0.5767189860343933, + "logps/chosen": -60.8338623046875, + "logps/rejected": -106.97245788574219, + "loss": 0.9276, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7652676105499268, + "rewards/margins": 6.337264060974121, + "rewards/rejected": -3.5719966888427734, + "step": 9213 + }, + { + "epoch": 2.31, + "grad_norm": 5.484489917755127, + "learning_rate": 5.610365205963806e-06, + "logits/chosen": -0.5447820425033569, + "logits/rejected": -0.5644993782043457, + "logps/chosen": -53.44356155395508, + "logps/rejected": -103.97330474853516, + "loss": 0.6986, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.635826587677002, + "rewards/margins": 5.5080156326293945, + "rewards/rejected": -2.8721888065338135, + "step": 9214 + }, + { + "epoch": 2.31, + "grad_norm": 7.378498077392578, + "learning_rate": 5.60958508907925e-06, + "logits/chosen": -0.48283442854881287, + "logits/rejected": -0.5518273115158081, + "logps/chosen": -57.40342712402344, + "logps/rejected": -77.5613021850586, + "loss": 0.777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8852453231811523, + "rewards/margins": 4.582751750946045, + "rewards/rejected": -1.6975070238113403, + "step": 9215 + }, + { + "epoch": 2.31, + "grad_norm": 19.27385711669922, + "learning_rate": 5.6088049571311974e-06, + "logits/chosen": -0.522119402885437, + "logits/rejected": -0.5817279815673828, + "logps/chosen": -51.1817626953125, + "logps/rejected": -107.2016830444336, + "loss": 0.7243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.077928066253662, + "rewards/margins": 6.647993564605713, + "rewards/rejected": -3.5700652599334717, + "step": 9216 + }, + { + "epoch": 2.31, + "grad_norm": 3.8930470943450928, + "learning_rate": 5.6080248101389265e-06, + "logits/chosen": -0.4774283170700073, + "logits/rejected": -0.5432083010673523, + "logps/chosen": -46.92057800292969, + "logps/rejected": -83.1719741821289, + "loss": 0.6274, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2092196941375732, + "rewards/margins": 5.255083084106445, + "rewards/rejected": -2.045863389968872, + "step": 9217 + }, + { + "epoch": 2.31, + "grad_norm": 3.9602372646331787, + "learning_rate": 5.607244648121718e-06, + "logits/chosen": -0.48211774230003357, + "logits/rejected": -0.5449373722076416, + "logps/chosen": -61.35379409790039, + "logps/rejected": -90.00650787353516, + "loss": 0.7092, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.29130482673645, + "rewards/margins": 5.296584129333496, + "rewards/rejected": -2.005279779434204, + "step": 9218 + }, + { + "epoch": 2.31, + "grad_norm": 3.3242876529693604, + "learning_rate": 5.6064644710988445e-06, + "logits/chosen": -0.4911883771419525, + "logits/rejected": -0.6518876552581787, + "logps/chosen": -57.042293548583984, + "logps/rejected": -90.465087890625, + "loss": 0.5937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.87616229057312, + "rewards/margins": 6.919773101806641, + "rewards/rejected": -4.043610572814941, + "step": 9219 + }, + { + "epoch": 2.31, + "grad_norm": 4.132014751434326, + "learning_rate": 5.605684279089593e-06, + "logits/chosen": -0.5111272931098938, + "logits/rejected": -0.5719774961471558, + "logps/chosen": -44.76970672607422, + "logps/rejected": -88.41190338134766, + "loss": 0.6008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.827676773071289, + "rewards/margins": 6.312277793884277, + "rewards/rejected": -3.484600782394409, + "step": 9220 + }, + { + "epoch": 2.31, + "grad_norm": 4.021463394165039, + "learning_rate": 5.604904072113236e-06, + "logits/chosen": -0.466126412153244, + "logits/rejected": -0.5581203103065491, + "logps/chosen": -50.85272216796875, + "logps/rejected": -80.54818725585938, + "loss": 0.5573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0793752670288086, + "rewards/margins": 5.449809551239014, + "rewards/rejected": -2.370434522628784, + "step": 9221 + }, + { + "epoch": 2.31, + "grad_norm": 4.0984625816345215, + "learning_rate": 5.604123850189056e-06, + "logits/chosen": -0.5333131551742554, + "logits/rejected": -0.6075852513313293, + "logps/chosen": -59.042457580566406, + "logps/rejected": -103.77220153808594, + "loss": 0.5742, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6971960067749023, + "rewards/margins": 6.703538417816162, + "rewards/rejected": -4.00634241104126, + "step": 9222 + }, + { + "epoch": 2.31, + "grad_norm": 2.91903018951416, + "learning_rate": 5.603343613336335e-06, + "logits/chosen": -0.4337160885334015, + "logits/rejected": -0.5545162558555603, + "logps/chosen": -62.12644958496094, + "logps/rejected": -96.03204345703125, + "loss": 0.6464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1926326751708984, + "rewards/margins": 7.2860541343688965, + "rewards/rejected": -4.093421459197998, + "step": 9223 + }, + { + "epoch": 2.31, + "grad_norm": 3.6522791385650635, + "learning_rate": 5.60256336157435e-06, + "logits/chosen": -0.5282532572746277, + "logits/rejected": -0.5490696430206299, + "logps/chosen": -61.016510009765625, + "logps/rejected": -102.75823211669922, + "loss": 0.6756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095123529434204, + "rewards/margins": 6.520317077636719, + "rewards/rejected": -3.425194025039673, + "step": 9224 + }, + { + "epoch": 2.31, + "grad_norm": 5.964228630065918, + "learning_rate": 5.601783094922384e-06, + "logits/chosen": -0.5215573310852051, + "logits/rejected": -0.5887273550033569, + "logps/chosen": -53.11912536621094, + "logps/rejected": -111.84461975097656, + "loss": 0.7284, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1237659454345703, + "rewards/margins": 6.501852512359619, + "rewards/rejected": -3.378087043762207, + "step": 9225 + }, + { + "epoch": 2.31, + "grad_norm": 12.64633846282959, + "learning_rate": 5.6010028133997176e-06, + "logits/chosen": -0.5403521656990051, + "logits/rejected": -0.6332677006721497, + "logps/chosen": -70.5240249633789, + "logps/rejected": -87.60515594482422, + "loss": 0.8972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.935462474822998, + "rewards/margins": 5.557916641235352, + "rewards/rejected": -2.6224536895751953, + "step": 9226 + }, + { + "epoch": 2.31, + "grad_norm": 8.985297203063965, + "learning_rate": 5.600222517025632e-06, + "logits/chosen": -0.4728560447692871, + "logits/rejected": -0.587639331817627, + "logps/chosen": -51.864078521728516, + "logps/rejected": -83.6647720336914, + "loss": 0.6656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.611995220184326, + "rewards/margins": 5.811054706573486, + "rewards/rejected": -3.199059247970581, + "step": 9227 + }, + { + "epoch": 2.31, + "grad_norm": 3.6737935543060303, + "learning_rate": 5.5994422058194085e-06, + "logits/chosen": -0.620936393737793, + "logits/rejected": -0.6612263917922974, + "logps/chosen": -52.91027069091797, + "logps/rejected": -116.2162094116211, + "loss": 0.6242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.200162172317505, + "rewards/margins": 6.791635513305664, + "rewards/rejected": -3.59147310256958, + "step": 9228 + }, + { + "epoch": 2.31, + "grad_norm": 4.1366071701049805, + "learning_rate": 5.598661879800333e-06, + "logits/chosen": -0.5331448316574097, + "logits/rejected": -0.6324283480644226, + "logps/chosen": -59.714385986328125, + "logps/rejected": -92.62455749511719, + "loss": 0.7368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7270076274871826, + "rewards/margins": 6.52430534362793, + "rewards/rejected": -3.797297716140747, + "step": 9229 + }, + { + "epoch": 2.31, + "grad_norm": 2.7281241416931152, + "learning_rate": 5.5978815389876836e-06, + "logits/chosen": -0.4990522265434265, + "logits/rejected": -0.5882344245910645, + "logps/chosen": -61.63149642944336, + "logps/rejected": -89.68505859375, + "loss": 0.6452, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.277219295501709, + "rewards/margins": 5.971113204956055, + "rewards/rejected": -2.693894386291504, + "step": 9230 + }, + { + "epoch": 2.31, + "grad_norm": 3.2169413566589355, + "learning_rate": 5.597101183400747e-06, + "logits/chosen": -0.4288514256477356, + "logits/rejected": -0.5773270130157471, + "logps/chosen": -65.93539428710938, + "logps/rejected": -91.16011047363281, + "loss": 0.6535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.405398368835449, + "rewards/margins": 7.834873676300049, + "rewards/rejected": -4.429474830627441, + "step": 9231 + }, + { + "epoch": 2.31, + "grad_norm": 4.05625057220459, + "learning_rate": 5.596320813058805e-06, + "logits/chosen": -0.47889411449432373, + "logits/rejected": -0.5330752730369568, + "logps/chosen": -49.28395080566406, + "logps/rejected": -95.88223266601562, + "loss": 0.6082, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.083688497543335, + "rewards/margins": 5.574501037597656, + "rewards/rejected": -2.4908127784729004, + "step": 9232 + }, + { + "epoch": 2.31, + "grad_norm": 3.515872001647949, + "learning_rate": 5.59554042798114e-06, + "logits/chosen": -0.4727865159511566, + "logits/rejected": -0.5666211247444153, + "logps/chosen": -60.582763671875, + "logps/rejected": -93.55984497070312, + "loss": 0.6188, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.260763168334961, + "rewards/margins": 6.42355489730835, + "rewards/rejected": -3.1627917289733887, + "step": 9233 + }, + { + "epoch": 2.31, + "grad_norm": 3.5632715225219727, + "learning_rate": 5.59476002818704e-06, + "logits/chosen": -0.4579828381538391, + "logits/rejected": -0.5762662887573242, + "logps/chosen": -66.25543212890625, + "logps/rejected": -88.95446014404297, + "loss": 0.6659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9604625701904297, + "rewards/margins": 6.202880859375, + "rewards/rejected": -3.242417812347412, + "step": 9234 + }, + { + "epoch": 2.31, + "grad_norm": 11.595890998840332, + "learning_rate": 5.593979613695785e-06, + "logits/chosen": -0.44563165307044983, + "logits/rejected": -0.4895078241825104, + "logps/chosen": -57.95077133178711, + "logps/rejected": -114.39439392089844, + "loss": 0.7028, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.988586664199829, + "rewards/margins": 4.7078752517700195, + "rewards/rejected": -1.7192888259887695, + "step": 9235 + }, + { + "epoch": 2.31, + "grad_norm": 12.336939811706543, + "learning_rate": 5.593199184526662e-06, + "logits/chosen": -0.4755999743938446, + "logits/rejected": -0.6091942191123962, + "logps/chosen": -54.91486740112305, + "logps/rejected": -88.89772033691406, + "loss": 0.6762, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7027366161346436, + "rewards/margins": 6.060061931610107, + "rewards/rejected": -3.357326030731201, + "step": 9236 + }, + { + "epoch": 2.31, + "grad_norm": 28.029102325439453, + "learning_rate": 5.592418740698956e-06, + "logits/chosen": -0.43186458945274353, + "logits/rejected": -0.5055002570152283, + "logps/chosen": -56.128604888916016, + "logps/rejected": -98.32829284667969, + "loss": 0.7529, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7473268508911133, + "rewards/margins": 5.012388229370117, + "rewards/rejected": -2.265061378479004, + "step": 9237 + }, + { + "epoch": 2.31, + "grad_norm": 6.785985469818115, + "learning_rate": 5.591638282231955e-06, + "logits/chosen": -0.4908014237880707, + "logits/rejected": -0.5607237219810486, + "logps/chosen": -66.81612396240234, + "logps/rejected": -99.3116455078125, + "loss": 0.6824, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.32421612739563, + "rewards/margins": 5.853888988494873, + "rewards/rejected": -2.5296733379364014, + "step": 9238 + }, + { + "epoch": 2.31, + "grad_norm": 6.582868576049805, + "learning_rate": 5.590857809144939e-06, + "logits/chosen": -0.47409307956695557, + "logits/rejected": -0.5331633687019348, + "logps/chosen": -47.32593536376953, + "logps/rejected": -97.96410369873047, + "loss": 0.6451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9370980262756348, + "rewards/margins": 5.714670658111572, + "rewards/rejected": -2.7775728702545166, + "step": 9239 + }, + { + "epoch": 2.31, + "grad_norm": 13.961917877197266, + "learning_rate": 5.5900773214572016e-06, + "logits/chosen": -0.509681761264801, + "logits/rejected": -0.6470546126365662, + "logps/chosen": -54.708160400390625, + "logps/rejected": -76.63713836669922, + "loss": 0.8894, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.816084623336792, + "rewards/margins": 6.362375259399414, + "rewards/rejected": -3.546290636062622, + "step": 9240 + }, + { + "epoch": 2.31, + "grad_norm": 7.215301990509033, + "learning_rate": 5.589296819188025e-06, + "logits/chosen": -0.5177391171455383, + "logits/rejected": -0.5591391921043396, + "logps/chosen": -48.7112922668457, + "logps/rejected": -104.0699462890625, + "loss": 0.7795, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1732177734375, + "rewards/margins": 5.3875627517700195, + "rewards/rejected": -2.2143447399139404, + "step": 9241 + }, + { + "epoch": 2.31, + "grad_norm": 13.837907791137695, + "learning_rate": 5.588516302356696e-06, + "logits/chosen": -0.5219137072563171, + "logits/rejected": -0.5668047666549683, + "logps/chosen": -63.78887176513672, + "logps/rejected": -106.13703155517578, + "loss": 0.8325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8516111373901367, + "rewards/margins": 5.845556735992432, + "rewards/rejected": -2.9939463138580322, + "step": 9242 + }, + { + "epoch": 2.31, + "grad_norm": 5.663329124450684, + "learning_rate": 5.587735770982504e-06, + "logits/chosen": -0.42523160576820374, + "logits/rejected": -0.476161390542984, + "logps/chosen": -47.51455307006836, + "logps/rejected": -111.50060272216797, + "loss": 0.5561, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.953338861465454, + "rewards/margins": 6.44637393951416, + "rewards/rejected": -3.4930357933044434, + "step": 9243 + }, + { + "epoch": 2.31, + "grad_norm": 8.087377548217773, + "learning_rate": 5.586955225084736e-06, + "logits/chosen": -0.5121335387229919, + "logits/rejected": -0.5552566051483154, + "logps/chosen": -49.8973274230957, + "logps/rejected": -80.57894134521484, + "loss": 0.7558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8038272857666016, + "rewards/margins": 4.852484226226807, + "rewards/rejected": -2.048656702041626, + "step": 9244 + }, + { + "epoch": 2.31, + "grad_norm": 4.485918045043945, + "learning_rate": 5.58617466468268e-06, + "logits/chosen": -0.5380367636680603, + "logits/rejected": -0.6028016805648804, + "logps/chosen": -50.4136962890625, + "logps/rejected": -83.01058959960938, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0326833724975586, + "rewards/margins": 5.115490436553955, + "rewards/rejected": -2.0828070640563965, + "step": 9245 + }, + { + "epoch": 2.31, + "grad_norm": 6.045861721038818, + "learning_rate": 5.585394089795625e-06, + "logits/chosen": -0.5134218335151672, + "logits/rejected": -0.6162134408950806, + "logps/chosen": -59.211669921875, + "logps/rejected": -92.33673858642578, + "loss": 0.6476, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9391160011291504, + "rewards/margins": 5.644255638122559, + "rewards/rejected": -2.705139636993408, + "step": 9246 + }, + { + "epoch": 2.31, + "grad_norm": 4.631124973297119, + "learning_rate": 5.5846135004428594e-06, + "logits/chosen": -0.45450904965400696, + "logits/rejected": -0.5292590260505676, + "logps/chosen": -59.63365173339844, + "logps/rejected": -86.06983947753906, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.061835527420044, + "rewards/margins": 5.618679523468018, + "rewards/rejected": -2.5568432807922363, + "step": 9247 + }, + { + "epoch": 2.31, + "grad_norm": 6.248523235321045, + "learning_rate": 5.583832896643672e-06, + "logits/chosen": -0.47747308015823364, + "logits/rejected": -0.5593156218528748, + "logps/chosen": -62.769142150878906, + "logps/rejected": -78.37843322753906, + "loss": 0.7377, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.120051145553589, + "rewards/margins": 4.642289161682129, + "rewards/rejected": -1.52223801612854, + "step": 9248 + }, + { + "epoch": 2.31, + "grad_norm": 8.832842826843262, + "learning_rate": 5.5830522784173525e-06, + "logits/chosen": -0.4741264283657074, + "logits/rejected": -0.556663453578949, + "logps/chosen": -47.77744674682617, + "logps/rejected": -92.67819213867188, + "loss": 0.7245, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8387858867645264, + "rewards/margins": 5.973868370056152, + "rewards/rejected": -3.1350820064544678, + "step": 9249 + }, + { + "epoch": 2.31, + "grad_norm": 3.7675161361694336, + "learning_rate": 5.582271645783191e-06, + "logits/chosen": -0.4959765076637268, + "logits/rejected": -0.5478701591491699, + "logps/chosen": -62.16094207763672, + "logps/rejected": -103.3403549194336, + "loss": 0.7138, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0245485305786133, + "rewards/margins": 5.888863563537598, + "rewards/rejected": -2.8643150329589844, + "step": 9250 + }, + { + "epoch": 2.31, + "grad_norm": 5.836945533752441, + "learning_rate": 5.581490998760477e-06, + "logits/chosen": -0.41015854477882385, + "logits/rejected": -0.5248076319694519, + "logps/chosen": -58.01551818847656, + "logps/rejected": -98.90059661865234, + "loss": 0.654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9581239223480225, + "rewards/margins": 6.445956230163574, + "rewards/rejected": -3.4878323078155518, + "step": 9251 + }, + { + "epoch": 2.31, + "grad_norm": 5.490739822387695, + "learning_rate": 5.5807103373685045e-06, + "logits/chosen": -0.48136693239212036, + "logits/rejected": -0.5695011615753174, + "logps/chosen": -56.85406494140625, + "logps/rejected": -95.38023376464844, + "loss": 0.7189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.790834903717041, + "rewards/margins": 6.00929594039917, + "rewards/rejected": -3.21846079826355, + "step": 9252 + }, + { + "epoch": 2.31, + "grad_norm": 5.133723735809326, + "learning_rate": 5.57992966162656e-06, + "logits/chosen": -0.5339764356613159, + "logits/rejected": -0.6250596046447754, + "logps/chosen": -67.60650634765625, + "logps/rejected": -100.88824462890625, + "loss": 0.7026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.215890645980835, + "rewards/margins": 6.484127998352051, + "rewards/rejected": -3.2682371139526367, + "step": 9253 + }, + { + "epoch": 2.32, + "grad_norm": 5.319944858551025, + "learning_rate": 5.579148971553937e-06, + "logits/chosen": -0.44601768255233765, + "logits/rejected": -0.5531125068664551, + "logps/chosen": -64.0250244140625, + "logps/rejected": -84.97614288330078, + "loss": 0.7804, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841951608657837, + "rewards/margins": 4.161252498626709, + "rewards/rejected": -1.3193004131317139, + "step": 9254 + }, + { + "epoch": 2.32, + "grad_norm": 2.8998377323150635, + "learning_rate": 5.578368267169927e-06, + "logits/chosen": -0.5316888689994812, + "logits/rejected": -0.5968966484069824, + "logps/chosen": -58.943580627441406, + "logps/rejected": -110.35321044921875, + "loss": 0.6567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0609850883483887, + "rewards/margins": 7.6599225997924805, + "rewards/rejected": -4.598937511444092, + "step": 9255 + }, + { + "epoch": 2.32, + "grad_norm": 4.458705902099609, + "learning_rate": 5.577587548493821e-06, + "logits/chosen": -0.4904504418373108, + "logits/rejected": -0.6448656320571899, + "logps/chosen": -51.25405502319336, + "logps/rejected": -80.85181427001953, + "loss": 0.6401, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0477404594421387, + "rewards/margins": 6.5640974044799805, + "rewards/rejected": -3.516356945037842, + "step": 9256 + }, + { + "epoch": 2.32, + "grad_norm": 6.811651229858398, + "learning_rate": 5.576806815544914e-06, + "logits/chosen": -0.5764759182929993, + "logits/rejected": -0.6764672994613647, + "logps/chosen": -53.806949615478516, + "logps/rejected": -78.4450454711914, + "loss": 0.7676, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.028202533721924, + "rewards/margins": 5.379444122314453, + "rewards/rejected": -2.3512415885925293, + "step": 9257 + }, + { + "epoch": 2.32, + "grad_norm": 6.245397090911865, + "learning_rate": 5.576026068342496e-06, + "logits/chosen": -0.4727925956249237, + "logits/rejected": -0.5492581725120544, + "logps/chosen": -57.23141098022461, + "logps/rejected": -90.05723571777344, + "loss": 0.7025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9238243103027344, + "rewards/margins": 5.644744873046875, + "rewards/rejected": -2.720921039581299, + "step": 9258 + }, + { + "epoch": 2.32, + "grad_norm": 4.561863899230957, + "learning_rate": 5.575245306905861e-06, + "logits/chosen": -0.5212164521217346, + "logits/rejected": -0.6246553659439087, + "logps/chosen": -59.98749923706055, + "logps/rejected": -87.54449462890625, + "loss": 0.736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8621275424957275, + "rewards/margins": 6.2757568359375, + "rewards/rejected": -3.4136290550231934, + "step": 9259 + }, + { + "epoch": 2.32, + "grad_norm": 7.285458564758301, + "learning_rate": 5.574464531254303e-06, + "logits/chosen": -0.49644213914871216, + "logits/rejected": -0.5853275656700134, + "logps/chosen": -48.96772003173828, + "logps/rejected": -93.96717071533203, + "loss": 0.8142, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.824387550354004, + "rewards/margins": 5.45277738571167, + "rewards/rejected": -2.628389835357666, + "step": 9260 + }, + { + "epoch": 2.32, + "grad_norm": 26.147104263305664, + "learning_rate": 5.573683741407117e-06, + "logits/chosen": -0.5388362407684326, + "logits/rejected": -0.6460298299789429, + "logps/chosen": -62.08839416503906, + "logps/rejected": -80.8686294555664, + "loss": 0.7424, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9889261722564697, + "rewards/margins": 5.531521320343018, + "rewards/rejected": -2.542595148086548, + "step": 9261 + }, + { + "epoch": 2.32, + "grad_norm": 8.473006248474121, + "learning_rate": 5.572902937383593e-06, + "logits/chosen": -0.5570973753929138, + "logits/rejected": -0.6777404546737671, + "logps/chosen": -67.94779205322266, + "logps/rejected": -91.53657531738281, + "loss": 0.6989, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8994414806365967, + "rewards/margins": 6.655255317687988, + "rewards/rejected": -3.7558133602142334, + "step": 9262 + }, + { + "epoch": 2.32, + "grad_norm": 4.03140115737915, + "learning_rate": 5.572122119203029e-06, + "logits/chosen": -0.42436662316322327, + "logits/rejected": -0.503631055355072, + "logps/chosen": -62.795135498046875, + "logps/rejected": -83.73914337158203, + "loss": 0.6963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.039233684539795, + "rewards/margins": 5.9417009353637695, + "rewards/rejected": -2.902467727661133, + "step": 9263 + }, + { + "epoch": 2.32, + "grad_norm": 3.6142807006835938, + "learning_rate": 5.57134128688472e-06, + "logits/chosen": -0.5163291692733765, + "logits/rejected": -0.5770633816719055, + "logps/chosen": -39.63957595825195, + "logps/rejected": -98.741455078125, + "loss": 0.6439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0840325355529785, + "rewards/margins": 5.801018714904785, + "rewards/rejected": -2.7169861793518066, + "step": 9264 + }, + { + "epoch": 2.32, + "grad_norm": 5.360105991363525, + "learning_rate": 5.570560440447959e-06, + "logits/chosen": -0.4614165425300598, + "logits/rejected": -0.5787503719329834, + "logps/chosen": -67.8659896850586, + "logps/rejected": -79.56756591796875, + "loss": 0.7841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.984005928039551, + "rewards/margins": 4.9739885330200195, + "rewards/rejected": -1.9899827241897583, + "step": 9265 + }, + { + "epoch": 2.32, + "grad_norm": 1.8755903244018555, + "learning_rate": 5.569779579912042e-06, + "logits/chosen": -0.4886786937713623, + "logits/rejected": -0.5809876918792725, + "logps/chosen": -50.73099899291992, + "logps/rejected": -117.38896179199219, + "loss": 0.5386, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1838583946228027, + "rewards/margins": 8.069473266601562, + "rewards/rejected": -4.88561487197876, + "step": 9266 + }, + { + "epoch": 2.32, + "grad_norm": 4.216855049133301, + "learning_rate": 5.568998705296267e-06, + "logits/chosen": -0.43362122774124146, + "logits/rejected": -0.5237084627151489, + "logps/chosen": -58.83197784423828, + "logps/rejected": -73.24832916259766, + "loss": 0.6662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1108486652374268, + "rewards/margins": 5.421400547027588, + "rewards/rejected": -2.310551881790161, + "step": 9267 + }, + { + "epoch": 2.32, + "grad_norm": 7.851985931396484, + "learning_rate": 5.568217816619927e-06, + "logits/chosen": -0.4778226912021637, + "logits/rejected": -0.5379367470741272, + "logps/chosen": -46.31150817871094, + "logps/rejected": -103.8128662109375, + "loss": 0.6842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.982191562652588, + "rewards/margins": 6.351809501647949, + "rewards/rejected": -3.3696177005767822, + "step": 9268 + }, + { + "epoch": 2.32, + "grad_norm": 6.439434051513672, + "learning_rate": 5.567436913902322e-06, + "logits/chosen": -0.4873160123825073, + "logits/rejected": -0.5316738486289978, + "logps/chosen": -63.38874053955078, + "logps/rejected": -83.87374877929688, + "loss": 0.846, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8529508113861084, + "rewards/margins": 4.085306167602539, + "rewards/rejected": -1.2323554754257202, + "step": 9269 + }, + { + "epoch": 2.32, + "grad_norm": 6.109607696533203, + "learning_rate": 5.566655997162746e-06, + "logits/chosen": -0.40339094400405884, + "logits/rejected": -0.49828851222991943, + "logps/chosen": -63.71569061279297, + "logps/rejected": -83.78975677490234, + "loss": 0.73, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.925929307937622, + "rewards/margins": 5.143124103546143, + "rewards/rejected": -2.2171945571899414, + "step": 9270 + }, + { + "epoch": 2.32, + "grad_norm": 4.349989891052246, + "learning_rate": 5.565875066420499e-06, + "logits/chosen": -0.45253825187683105, + "logits/rejected": -0.5798053741455078, + "logps/chosen": -76.85363006591797, + "logps/rejected": -104.09317779541016, + "loss": 0.7393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8766987323760986, + "rewards/margins": 5.789734840393066, + "rewards/rejected": -2.9130358695983887, + "step": 9271 + }, + { + "epoch": 2.32, + "grad_norm": 2.655014991760254, + "learning_rate": 5.565094121694876e-06, + "logits/chosen": -0.4723518192768097, + "logits/rejected": -0.5878298282623291, + "logps/chosen": -55.32839584350586, + "logps/rejected": -99.7302474975586, + "loss": 0.5355, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.791912794113159, + "rewards/margins": 6.405433654785156, + "rewards/rejected": -3.613520383834839, + "step": 9272 + }, + { + "epoch": 2.32, + "grad_norm": 2.2122128009796143, + "learning_rate": 5.564313163005177e-06, + "logits/chosen": -0.5069525241851807, + "logits/rejected": -0.5938959121704102, + "logps/chosen": -50.636390686035156, + "logps/rejected": -112.49652862548828, + "loss": 0.5291, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7986607551574707, + "rewards/margins": 7.507437705993652, + "rewards/rejected": -4.70877742767334, + "step": 9273 + }, + { + "epoch": 2.32, + "grad_norm": 5.4683942794799805, + "learning_rate": 5.563532190370698e-06, + "logits/chosen": -0.45441508293151855, + "logits/rejected": -0.5942503809928894, + "logps/chosen": -65.10939025878906, + "logps/rejected": -90.0596694946289, + "loss": 0.6409, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9956579208374023, + "rewards/margins": 5.376708030700684, + "rewards/rejected": -2.381049871444702, + "step": 9274 + }, + { + "epoch": 2.32, + "grad_norm": 7.056657314300537, + "learning_rate": 5.562751203810742e-06, + "logits/chosen": -0.41968250274658203, + "logits/rejected": -0.4702215790748596, + "logps/chosen": -52.20708084106445, + "logps/rejected": -106.4074935913086, + "loss": 0.6491, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.99153470993042, + "rewards/margins": 5.207244873046875, + "rewards/rejected": -2.215709924697876, + "step": 9275 + }, + { + "epoch": 2.32, + "grad_norm": 4.844661712646484, + "learning_rate": 5.5619702033446025e-06, + "logits/chosen": -0.5077160000801086, + "logits/rejected": -0.5767000913619995, + "logps/chosen": -50.19352722167969, + "logps/rejected": -85.13117980957031, + "loss": 0.6815, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0351145267486572, + "rewards/margins": 4.935863494873047, + "rewards/rejected": -1.9007489681243896, + "step": 9276 + }, + { + "epoch": 2.32, + "grad_norm": 3.4551987648010254, + "learning_rate": 5.561189188991584e-06, + "logits/chosen": -0.5502595901489258, + "logits/rejected": -0.6252986192703247, + "logps/chosen": -47.38410568237305, + "logps/rejected": -97.14350891113281, + "loss": 0.6155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.432736873626709, + "rewards/margins": 5.362401008605957, + "rewards/rejected": -1.9296644926071167, + "step": 9277 + }, + { + "epoch": 2.32, + "grad_norm": 8.183022499084473, + "learning_rate": 5.560408160770981e-06, + "logits/chosen": -0.5026085376739502, + "logits/rejected": -0.5965661406517029, + "logps/chosen": -44.55119705200195, + "logps/rejected": -85.47496032714844, + "loss": 0.6697, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.878109931945801, + "rewards/margins": 6.0190958976745605, + "rewards/rejected": -3.140986204147339, + "step": 9278 + }, + { + "epoch": 2.32, + "grad_norm": 7.6861748695373535, + "learning_rate": 5.5596271187020976e-06, + "logits/chosen": -0.3945023715496063, + "logits/rejected": -0.4878569543361664, + "logps/chosen": -52.027984619140625, + "logps/rejected": -91.93740844726562, + "loss": 0.6497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0793190002441406, + "rewards/margins": 5.561405658721924, + "rewards/rejected": -2.482086658477783, + "step": 9279 + }, + { + "epoch": 2.32, + "grad_norm": 4.129692554473877, + "learning_rate": 5.558846062804233e-06, + "logits/chosen": -0.5255532264709473, + "logits/rejected": -0.592686653137207, + "logps/chosen": -45.69011688232422, + "logps/rejected": -96.8487777709961, + "loss": 0.6305, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.158673048019409, + "rewards/margins": 6.168431282043457, + "rewards/rejected": -3.0097579956054688, + "step": 9280 + }, + { + "epoch": 2.32, + "grad_norm": 4.87126350402832, + "learning_rate": 5.558064993096688e-06, + "logits/chosen": -0.45077136158943176, + "logits/rejected": -0.543004035949707, + "logps/chosen": -53.83582305908203, + "logps/rejected": -88.97590637207031, + "loss": 0.6271, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9681010246276855, + "rewards/margins": 6.061071395874023, + "rewards/rejected": -3.092970371246338, + "step": 9281 + }, + { + "epoch": 2.32, + "grad_norm": 4.1100850105285645, + "learning_rate": 5.557283909598763e-06, + "logits/chosen": -0.4486555755138397, + "logits/rejected": -0.5294712781906128, + "logps/chosen": -66.21395874023438, + "logps/rejected": -81.58747863769531, + "loss": 0.6978, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.053575277328491, + "rewards/margins": 5.0342230796813965, + "rewards/rejected": -1.9806475639343262, + "step": 9282 + }, + { + "epoch": 2.32, + "grad_norm": 7.086760520935059, + "learning_rate": 5.556502812329761e-06, + "logits/chosen": -0.5465511679649353, + "logits/rejected": -0.6743064522743225, + "logps/chosen": -55.07918930053711, + "logps/rejected": -80.8289566040039, + "loss": 0.6721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9703214168548584, + "rewards/margins": 5.053488731384277, + "rewards/rejected": -2.0831668376922607, + "step": 9283 + }, + { + "epoch": 2.32, + "grad_norm": 3.321763753890991, + "learning_rate": 5.555721701308981e-06, + "logits/chosen": -0.4167061448097229, + "logits/rejected": -0.5084193348884583, + "logps/chosen": -54.15660858154297, + "logps/rejected": -97.558349609375, + "loss": 0.5691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5744361877441406, + "rewards/margins": 5.365383625030518, + "rewards/rejected": -1.7909469604492188, + "step": 9284 + }, + { + "epoch": 2.32, + "grad_norm": 16.096731185913086, + "learning_rate": 5.554940576555726e-06, + "logits/chosen": -0.47681474685668945, + "logits/rejected": -0.5856136083602905, + "logps/chosen": -69.14847564697266, + "logps/rejected": -96.6485595703125, + "loss": 0.8415, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0406577587127686, + "rewards/margins": 4.971446514129639, + "rewards/rejected": -1.9307879209518433, + "step": 9285 + }, + { + "epoch": 2.32, + "grad_norm": 6.0761942863464355, + "learning_rate": 5.5541594380893016e-06, + "logits/chosen": -0.5205848217010498, + "logits/rejected": -0.5429360270500183, + "logps/chosen": -49.84355163574219, + "logps/rejected": -102.98985290527344, + "loss": 0.6956, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3780317306518555, + "rewards/margins": 6.068061828613281, + "rewards/rejected": -2.690030574798584, + "step": 9286 + }, + { + "epoch": 2.32, + "grad_norm": 3.075502872467041, + "learning_rate": 5.553378285929006e-06, + "logits/chosen": -0.5048529505729675, + "logits/rejected": -0.5687845945358276, + "logps/chosen": -49.95098114013672, + "logps/rejected": -86.53126525878906, + "loss": 0.6432, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.884793281555176, + "rewards/margins": 5.646698951721191, + "rewards/rejected": -2.7619054317474365, + "step": 9287 + }, + { + "epoch": 2.32, + "grad_norm": 8.42895221710205, + "learning_rate": 5.552597120094146e-06, + "logits/chosen": -0.5216196775436401, + "logits/rejected": -0.555535078048706, + "logps/chosen": -53.35073471069336, + "logps/rejected": -98.65601348876953, + "loss": 0.8439, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0659008026123047, + "rewards/margins": 5.412261009216309, + "rewards/rejected": -2.346360206604004, + "step": 9288 + }, + { + "epoch": 2.32, + "grad_norm": 3.4982750415802, + "learning_rate": 5.551815940604023e-06, + "logits/chosen": -0.4525505304336548, + "logits/rejected": -0.5454063415527344, + "logps/chosen": -60.187660217285156, + "logps/rejected": -88.32278442382812, + "loss": 0.6391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.146754026412964, + "rewards/margins": 5.838457107543945, + "rewards/rejected": -2.6917028427124023, + "step": 9289 + }, + { + "epoch": 2.32, + "grad_norm": 2.4660232067108154, + "learning_rate": 5.55103474747794e-06, + "logits/chosen": -0.5107339024543762, + "logits/rejected": -0.6414081454277039, + "logps/chosen": -55.060447692871094, + "logps/rejected": -82.47638702392578, + "loss": 0.606, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.978649139404297, + "rewards/margins": 6.294814586639404, + "rewards/rejected": -3.3161654472351074, + "step": 9290 + }, + { + "epoch": 2.32, + "grad_norm": 8.558728218078613, + "learning_rate": 5.550253540735202e-06, + "logits/chosen": -0.4490140974521637, + "logits/rejected": -0.4930330514907837, + "logps/chosen": -45.16359329223633, + "logps/rejected": -84.4275131225586, + "loss": 0.7706, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0149052143096924, + "rewards/margins": 4.894560813903809, + "rewards/rejected": -1.8796557188034058, + "step": 9291 + }, + { + "epoch": 2.32, + "grad_norm": 4.472157955169678, + "learning_rate": 5.549472320395115e-06, + "logits/chosen": -0.5165523886680603, + "logits/rejected": -0.5463243722915649, + "logps/chosen": -48.722129821777344, + "logps/rejected": -93.48057556152344, + "loss": 0.6178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0110137462615967, + "rewards/margins": 5.374432563781738, + "rewards/rejected": -2.3634190559387207, + "step": 9292 + }, + { + "epoch": 2.32, + "grad_norm": 3.725878953933716, + "learning_rate": 5.548691086476982e-06, + "logits/chosen": -0.5283565521240234, + "logits/rejected": -0.6519858241081238, + "logps/chosen": -59.121185302734375, + "logps/rejected": -93.91207885742188, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119858503341675, + "rewards/margins": 6.700852394104004, + "rewards/rejected": -3.580994129180908, + "step": 9293 + }, + { + "epoch": 2.33, + "grad_norm": 6.723439693450928, + "learning_rate": 5.547909839000108e-06, + "logits/chosen": -0.6092162728309631, + "logits/rejected": -0.6412554979324341, + "logps/chosen": -50.77073669433594, + "logps/rejected": -96.83129119873047, + "loss": 0.7224, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0784173011779785, + "rewards/margins": 4.5696821212768555, + "rewards/rejected": -1.4912651777267456, + "step": 9294 + }, + { + "epoch": 2.33, + "grad_norm": 2.957509756088257, + "learning_rate": 5.547128577983801e-06, + "logits/chosen": -0.46706482768058777, + "logits/rejected": -0.630785346031189, + "logps/chosen": -52.553524017333984, + "logps/rejected": -70.28436279296875, + "loss": 0.5635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2187376022338867, + "rewards/margins": 6.475883483886719, + "rewards/rejected": -3.257145643234253, + "step": 9295 + }, + { + "epoch": 2.33, + "grad_norm": 4.704648017883301, + "learning_rate": 5.546347303447364e-06, + "logits/chosen": -0.4178065359592438, + "logits/rejected": -0.5290071368217468, + "logps/chosen": -61.367916107177734, + "logps/rejected": -82.8276596069336, + "loss": 0.6427, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.178279161453247, + "rewards/margins": 5.67882776260376, + "rewards/rejected": -2.5005483627319336, + "step": 9296 + }, + { + "epoch": 2.33, + "grad_norm": 4.264470100402832, + "learning_rate": 5.545566015410103e-06, + "logits/chosen": -0.5637503862380981, + "logits/rejected": -0.6639187932014465, + "logps/chosen": -63.237911224365234, + "logps/rejected": -89.367919921875, + "loss": 0.6354, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0680291652679443, + "rewards/margins": 6.090435028076172, + "rewards/rejected": -3.0224063396453857, + "step": 9297 + }, + { + "epoch": 2.33, + "grad_norm": 4.750699996948242, + "learning_rate": 5.544784713891327e-06, + "logits/chosen": -0.45008715987205505, + "logits/rejected": -0.598659873008728, + "logps/chosen": -53.75257873535156, + "logps/rejected": -78.80680847167969, + "loss": 0.6128, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0486724376678467, + "rewards/margins": 5.9183173179626465, + "rewards/rejected": -2.8696448802948, + "step": 9298 + }, + { + "epoch": 2.33, + "grad_norm": 5.8732380867004395, + "learning_rate": 5.544003398910339e-06, + "logits/chosen": -0.45132941007614136, + "logits/rejected": -0.5199977159500122, + "logps/chosen": -56.959136962890625, + "logps/rejected": -85.28466796875, + "loss": 0.6493, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1893370151519775, + "rewards/margins": 5.242326259613037, + "rewards/rejected": -2.0529890060424805, + "step": 9299 + }, + { + "epoch": 2.33, + "grad_norm": 4.57706356048584, + "learning_rate": 5.543222070486451e-06, + "logits/chosen": -0.5036159157752991, + "logits/rejected": -0.5563799142837524, + "logps/chosen": -62.80656433105469, + "logps/rejected": -96.30706787109375, + "loss": 0.7155, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1151070594787598, + "rewards/margins": 5.242890357971191, + "rewards/rejected": -2.1277832984924316, + "step": 9300 + }, + { + "epoch": 2.33, + "grad_norm": 3.4843437671661377, + "learning_rate": 5.542440728638967e-06, + "logits/chosen": -0.49094560742378235, + "logits/rejected": -0.5913879871368408, + "logps/chosen": -45.36589431762695, + "logps/rejected": -84.98641967773438, + "loss": 0.6314, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.266416549682617, + "rewards/margins": 6.311285018920898, + "rewards/rejected": -3.044868230819702, + "step": 9301 + }, + { + "epoch": 2.33, + "grad_norm": 4.866394519805908, + "learning_rate": 5.5416593733871946e-06, + "logits/chosen": -0.5176255702972412, + "logits/rejected": -0.6283408403396606, + "logps/chosen": -57.6343879699707, + "logps/rejected": -85.2020492553711, + "loss": 0.7459, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7938385009765625, + "rewards/margins": 5.620630741119385, + "rewards/rejected": -2.826792001724243, + "step": 9302 + }, + { + "epoch": 2.33, + "grad_norm": 4.088376522064209, + "learning_rate": 5.540878004750444e-06, + "logits/chosen": -0.5057833194732666, + "logits/rejected": -0.629238486289978, + "logps/chosen": -60.513755798339844, + "logps/rejected": -96.0587387084961, + "loss": 0.6285, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.843982219696045, + "rewards/margins": 7.409987449645996, + "rewards/rejected": -4.566005229949951, + "step": 9303 + }, + { + "epoch": 2.33, + "grad_norm": 3.793104410171509, + "learning_rate": 5.540096622748024e-06, + "logits/chosen": -0.47192567586898804, + "logits/rejected": -0.583106517791748, + "logps/chosen": -57.01380920410156, + "logps/rejected": -99.82523345947266, + "loss": 0.6466, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0424094200134277, + "rewards/margins": 6.251343250274658, + "rewards/rejected": -3.2089340686798096, + "step": 9304 + }, + { + "epoch": 2.33, + "grad_norm": 3.611335039138794, + "learning_rate": 5.539315227399239e-06, + "logits/chosen": -0.46732524037361145, + "logits/rejected": -0.5708589553833008, + "logps/chosen": -55.597042083740234, + "logps/rejected": -91.50196838378906, + "loss": 0.6393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2025258541107178, + "rewards/margins": 6.2046990394592285, + "rewards/rejected": -3.002173900604248, + "step": 9305 + }, + { + "epoch": 2.33, + "grad_norm": 3.7621867656707764, + "learning_rate": 5.538533818723403e-06, + "logits/chosen": -0.4752354025840759, + "logits/rejected": -0.548116147518158, + "logps/chosen": -69.33167266845703, + "logps/rejected": -103.7784423828125, + "loss": 0.6467, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7334988117218018, + "rewards/margins": 5.491222858428955, + "rewards/rejected": -2.7577242851257324, + "step": 9306 + }, + { + "epoch": 2.33, + "grad_norm": 7.75734281539917, + "learning_rate": 5.537752396739824e-06, + "logits/chosen": -0.505335807800293, + "logits/rejected": -0.586948812007904, + "logps/chosen": -62.20257568359375, + "logps/rejected": -98.29171752929688, + "loss": 0.6794, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.321554183959961, + "rewards/margins": 6.153848171234131, + "rewards/rejected": -2.832294225692749, + "step": 9307 + }, + { + "epoch": 2.33, + "grad_norm": 6.1111531257629395, + "learning_rate": 5.536970961467808e-06, + "logits/chosen": -0.4421282112598419, + "logits/rejected": -0.5346415638923645, + "logps/chosen": -67.5103530883789, + "logps/rejected": -111.1191177368164, + "loss": 0.7258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0337352752685547, + "rewards/margins": 6.50010871887207, + "rewards/rejected": -3.4663736820220947, + "step": 9308 + }, + { + "epoch": 2.33, + "grad_norm": 6.649420738220215, + "learning_rate": 5.5361895129266715e-06, + "logits/chosen": -0.527997612953186, + "logits/rejected": -0.667544960975647, + "logps/chosen": -65.68964385986328, + "logps/rejected": -107.82355499267578, + "loss": 0.6657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.024834632873535, + "rewards/margins": 7.919985771179199, + "rewards/rejected": -4.895151615142822, + "step": 9309 + }, + { + "epoch": 2.33, + "grad_norm": 5.811972141265869, + "learning_rate": 5.535408051135721e-06, + "logits/chosen": -0.4621165990829468, + "logits/rejected": -0.5575824975967407, + "logps/chosen": -64.80677032470703, + "logps/rejected": -108.91183471679688, + "loss": 0.6797, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7776365280151367, + "rewards/margins": 6.077690601348877, + "rewards/rejected": -3.3000545501708984, + "step": 9310 + }, + { + "epoch": 2.33, + "grad_norm": 7.325202941894531, + "learning_rate": 5.534626576114268e-06, + "logits/chosen": -0.5240542888641357, + "logits/rejected": -0.6359257698059082, + "logps/chosen": -64.93120574951172, + "logps/rejected": -106.84165954589844, + "loss": 0.6518, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.837663412094116, + "rewards/margins": 7.003143310546875, + "rewards/rejected": -4.16547966003418, + "step": 9311 + }, + { + "epoch": 2.33, + "grad_norm": 32.525291442871094, + "learning_rate": 5.533845087881623e-06, + "logits/chosen": -0.5432733297348022, + "logits/rejected": -0.6017197966575623, + "logps/chosen": -46.698787689208984, + "logps/rejected": -84.67866516113281, + "loss": 0.8667, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.62410831451416, + "rewards/margins": 5.379056453704834, + "rewards/rejected": -2.7549474239349365, + "step": 9312 + }, + { + "epoch": 2.33, + "grad_norm": 4.014915466308594, + "learning_rate": 5.533063586457099e-06, + "logits/chosen": -0.4748421311378479, + "logits/rejected": -0.5744590759277344, + "logps/chosen": -74.58277893066406, + "logps/rejected": -95.97247314453125, + "loss": 0.6282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9106802940368652, + "rewards/margins": 5.707259654998779, + "rewards/rejected": -2.796579599380493, + "step": 9313 + }, + { + "epoch": 2.33, + "grad_norm": 8.57692813873291, + "learning_rate": 5.532282071860006e-06, + "logits/chosen": -0.4849817752838135, + "logits/rejected": -0.5623146891593933, + "logps/chosen": -56.3557243347168, + "logps/rejected": -89.68981170654297, + "loss": 0.8403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.092372179031372, + "rewards/margins": 5.209823131561279, + "rewards/rejected": -2.1174511909484863, + "step": 9314 + }, + { + "epoch": 2.33, + "grad_norm": 6.285534858703613, + "learning_rate": 5.531500544109658e-06, + "logits/chosen": -0.4936281144618988, + "logits/rejected": -0.5369871258735657, + "logps/chosen": -47.850341796875, + "logps/rejected": -87.91767883300781, + "loss": 0.5855, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9895548820495605, + "rewards/margins": 5.152862548828125, + "rewards/rejected": -2.1633071899414062, + "step": 9315 + }, + { + "epoch": 2.33, + "grad_norm": 4.038135051727295, + "learning_rate": 5.530719003225366e-06, + "logits/chosen": -0.5008901357650757, + "logits/rejected": -0.5406811237335205, + "logps/chosen": -63.241912841796875, + "logps/rejected": -132.5032196044922, + "loss": 0.7088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.697869300842285, + "rewards/margins": 6.930764198303223, + "rewards/rejected": -4.232895851135254, + "step": 9316 + }, + { + "epoch": 2.33, + "grad_norm": 5.8426432609558105, + "learning_rate": 5.5299374492264415e-06, + "logits/chosen": -0.5048327445983887, + "logits/rejected": -0.5792697072029114, + "logps/chosen": -49.361751556396484, + "logps/rejected": -98.74982452392578, + "loss": 0.6868, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8768248558044434, + "rewards/margins": 5.094963073730469, + "rewards/rejected": -2.218137741088867, + "step": 9317 + }, + { + "epoch": 2.33, + "grad_norm": 3.763546943664551, + "learning_rate": 5.529155882132201e-06, + "logits/chosen": -0.4366464912891388, + "logits/rejected": -0.5352697372436523, + "logps/chosen": -55.62737274169922, + "logps/rejected": -91.49556732177734, + "loss": 0.5797, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.149374008178711, + "rewards/margins": 6.265902519226074, + "rewards/rejected": -3.116528272628784, + "step": 9318 + }, + { + "epoch": 2.33, + "grad_norm": 4.588510036468506, + "learning_rate": 5.528374301961955e-06, + "logits/chosen": -0.5211023092269897, + "logits/rejected": -0.6170865297317505, + "logps/chosen": -64.80559539794922, + "logps/rejected": -105.67079162597656, + "loss": 0.6522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8130943775177, + "rewards/margins": 7.452636241912842, + "rewards/rejected": -4.639541149139404, + "step": 9319 + }, + { + "epoch": 2.33, + "grad_norm": 5.528532028198242, + "learning_rate": 5.527592708735017e-06, + "logits/chosen": -0.5349398851394653, + "logits/rejected": -0.5679143667221069, + "logps/chosen": -41.469730377197266, + "logps/rejected": -96.34681701660156, + "loss": 0.6647, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4758496284484863, + "rewards/margins": 5.3363542556762695, + "rewards/rejected": -1.860504388809204, + "step": 9320 + }, + { + "epoch": 2.33, + "grad_norm": 6.888350963592529, + "learning_rate": 5.526811102470703e-06, + "logits/chosen": -0.44644853472709656, + "logits/rejected": -0.6047747135162354, + "logps/chosen": -69.90445709228516, + "logps/rejected": -73.15505981445312, + "loss": 0.6996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.924760341644287, + "rewards/margins": 5.350778102874756, + "rewards/rejected": -2.4260177612304688, + "step": 9321 + }, + { + "epoch": 2.33, + "grad_norm": 7.742109298706055, + "learning_rate": 5.5260294831883255e-06, + "logits/chosen": -0.5360689163208008, + "logits/rejected": -0.6255092620849609, + "logps/chosen": -50.11939239501953, + "logps/rejected": -83.83684539794922, + "loss": 0.6623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.048746347427368, + "rewards/margins": 5.268784523010254, + "rewards/rejected": -2.220038414001465, + "step": 9322 + }, + { + "epoch": 2.33, + "grad_norm": 3.6239140033721924, + "learning_rate": 5.525247850907202e-06, + "logits/chosen": -0.4707588851451874, + "logits/rejected": -0.6089605093002319, + "logps/chosen": -59.58246994018555, + "logps/rejected": -80.12041473388672, + "loss": 0.6619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0054099559783936, + "rewards/margins": 5.341078758239746, + "rewards/rejected": -2.33566951751709, + "step": 9323 + }, + { + "epoch": 2.33, + "grad_norm": 2.981590747833252, + "learning_rate": 5.524466205646644e-06, + "logits/chosen": -0.4682624042034149, + "logits/rejected": -0.5569751262664795, + "logps/chosen": -48.46420669555664, + "logps/rejected": -91.54009246826172, + "loss": 0.5593, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4557723999023438, + "rewards/margins": 6.632628440856934, + "rewards/rejected": -3.1768569946289062, + "step": 9324 + }, + { + "epoch": 2.33, + "grad_norm": 4.863194465637207, + "learning_rate": 5.523684547425967e-06, + "logits/chosen": -0.5275928378105164, + "logits/rejected": -0.6042118072509766, + "logps/chosen": -51.7735595703125, + "logps/rejected": -85.10476684570312, + "loss": 0.6711, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990511894226074, + "rewards/margins": 4.955196380615234, + "rewards/rejected": -1.9646849632263184, + "step": 9325 + }, + { + "epoch": 2.33, + "grad_norm": 2.0556726455688477, + "learning_rate": 5.5229028762644885e-06, + "logits/chosen": -0.5389411449432373, + "logits/rejected": -0.6480597257614136, + "logps/chosen": -52.209129333496094, + "logps/rejected": -102.88922119140625, + "loss": 0.5839, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.022040367126465, + "rewards/margins": 7.2041215896606445, + "rewards/rejected": -4.18208122253418, + "step": 9326 + }, + { + "epoch": 2.33, + "grad_norm": 5.671397686004639, + "learning_rate": 5.522121192181526e-06, + "logits/chosen": -0.4647560715675354, + "logits/rejected": -0.5495908260345459, + "logps/chosen": -57.813907623291016, + "logps/rejected": -96.50318908691406, + "loss": 0.6972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7893872261047363, + "rewards/margins": 5.765976905822754, + "rewards/rejected": -2.9765899181365967, + "step": 9327 + }, + { + "epoch": 2.33, + "grad_norm": 6.541374206542969, + "learning_rate": 5.52133949519639e-06, + "logits/chosen": -0.4550294876098633, + "logits/rejected": -0.5187544822692871, + "logps/chosen": -54.586997985839844, + "logps/rejected": -95.71562957763672, + "loss": 0.7476, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8727023601531982, + "rewards/margins": 4.346689701080322, + "rewards/rejected": -1.4739866256713867, + "step": 9328 + }, + { + "epoch": 2.33, + "grad_norm": 3.6477394104003906, + "learning_rate": 5.520557785328403e-06, + "logits/chosen": -0.45393574237823486, + "logits/rejected": -0.5995360016822815, + "logps/chosen": -61.64430236816406, + "logps/rejected": -92.00253295898438, + "loss": 0.6231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3693058490753174, + "rewards/margins": 6.741008758544922, + "rewards/rejected": -3.3717031478881836, + "step": 9329 + }, + { + "epoch": 2.33, + "grad_norm": 7.395702362060547, + "learning_rate": 5.5197760625968786e-06, + "logits/chosen": -0.533550500869751, + "logits/rejected": -0.6200070381164551, + "logps/chosen": -57.43147659301758, + "logps/rejected": -87.19092559814453, + "loss": 0.6657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0204076766967773, + "rewards/margins": 5.419760704040527, + "rewards/rejected": -2.399352788925171, + "step": 9330 + }, + { + "epoch": 2.33, + "grad_norm": 5.458432197570801, + "learning_rate": 5.518994327021135e-06, + "logits/chosen": -0.5187531113624573, + "logits/rejected": -0.5051922798156738, + "logps/chosen": -46.161476135253906, + "logps/rejected": -104.51461029052734, + "loss": 0.7264, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8030784130096436, + "rewards/margins": 4.439976692199707, + "rewards/rejected": -1.6368986368179321, + "step": 9331 + }, + { + "epoch": 2.33, + "grad_norm": 7.7584075927734375, + "learning_rate": 5.51821257862049e-06, + "logits/chosen": -0.47552698850631714, + "logits/rejected": -0.5779699087142944, + "logps/chosen": -70.58787536621094, + "logps/rejected": -98.33624267578125, + "loss": 0.6607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9001474380493164, + "rewards/margins": 6.863914489746094, + "rewards/rejected": -3.9637668132781982, + "step": 9332 + }, + { + "epoch": 2.33, + "grad_norm": 11.113232612609863, + "learning_rate": 5.517430817414261e-06, + "logits/chosen": -0.5388075709342957, + "logits/rejected": -0.6046193838119507, + "logps/chosen": -42.44643783569336, + "logps/rejected": -81.31814575195312, + "loss": 0.8093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.968162775039673, + "rewards/margins": 5.310262203216553, + "rewards/rejected": -2.342099189758301, + "step": 9333 + }, + { + "epoch": 2.34, + "grad_norm": 8.788026809692383, + "learning_rate": 5.516649043421765e-06, + "logits/chosen": -0.48305052518844604, + "logits/rejected": -0.6160004734992981, + "logps/chosen": -52.283424377441406, + "logps/rejected": -76.34461212158203, + "loss": 0.5927, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9919843673706055, + "rewards/margins": 5.780452251434326, + "rewards/rejected": -2.7884681224823, + "step": 9334 + }, + { + "epoch": 2.34, + "grad_norm": 3.644754648208618, + "learning_rate": 5.515867256662322e-06, + "logits/chosen": -0.5644695162773132, + "logits/rejected": -0.6807836294174194, + "logps/chosen": -48.548583984375, + "logps/rejected": -77.47956085205078, + "loss": 0.6369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290632963180542, + "rewards/margins": 6.549381256103516, + "rewards/rejected": -3.2587480545043945, + "step": 9335 + }, + { + "epoch": 2.34, + "grad_norm": 4.494802474975586, + "learning_rate": 5.51508545715525e-06, + "logits/chosen": -0.5235827565193176, + "logits/rejected": -0.5242281556129456, + "logps/chosen": -73.51158142089844, + "logps/rejected": -113.73402404785156, + "loss": 0.7363, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.018298387527466, + "rewards/margins": 5.987866401672363, + "rewards/rejected": -2.9695684909820557, + "step": 9336 + }, + { + "epoch": 2.34, + "grad_norm": 3.2734718322753906, + "learning_rate": 5.514303644919869e-06, + "logits/chosen": -0.4830615520477295, + "logits/rejected": -0.6065546870231628, + "logps/chosen": -70.78450012207031, + "logps/rejected": -92.1773681640625, + "loss": 0.5732, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.352231979370117, + "rewards/margins": 6.000787258148193, + "rewards/rejected": -2.648555278778076, + "step": 9337 + }, + { + "epoch": 2.34, + "grad_norm": 3.1023263931274414, + "learning_rate": 5.513521819975499e-06, + "logits/chosen": -0.53689044713974, + "logits/rejected": -0.6178029179573059, + "logps/chosen": -55.890380859375, + "logps/rejected": -79.44602966308594, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7974908351898193, + "rewards/margins": 4.5841522216796875, + "rewards/rejected": -1.786661982536316, + "step": 9338 + }, + { + "epoch": 2.34, + "grad_norm": 6.171288013458252, + "learning_rate": 5.512739982341458e-06, + "logits/chosen": -0.49371176958084106, + "logits/rejected": -0.5830314755439758, + "logps/chosen": -52.4693717956543, + "logps/rejected": -86.05859375, + "loss": 0.6842, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8287830352783203, + "rewards/margins": 5.614231109619141, + "rewards/rejected": -2.785447597503662, + "step": 9339 + }, + { + "epoch": 2.34, + "grad_norm": 3.7079155445098877, + "learning_rate": 5.511958132037065e-06, + "logits/chosen": -0.4883333146572113, + "logits/rejected": -0.5945520997047424, + "logps/chosen": -57.7764892578125, + "logps/rejected": -86.0142593383789, + "loss": 0.6256, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.184800624847412, + "rewards/margins": 6.431432247161865, + "rewards/rejected": -3.2466323375701904, + "step": 9340 + }, + { + "epoch": 2.34, + "grad_norm": 4.280060768127441, + "learning_rate": 5.511176269081645e-06, + "logits/chosen": -0.46556028723716736, + "logits/rejected": -0.5013675689697266, + "logps/chosen": -54.40272903442383, + "logps/rejected": -91.26823425292969, + "loss": 0.7148, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2969160079956055, + "rewards/margins": 5.199613571166992, + "rewards/rejected": -1.9026973247528076, + "step": 9341 + }, + { + "epoch": 2.34, + "grad_norm": 3.114924430847168, + "learning_rate": 5.510394393494512e-06, + "logits/chosen": -0.4753802418708801, + "logits/rejected": -0.5551682710647583, + "logps/chosen": -54.52287673950195, + "logps/rejected": -92.24778747558594, + "loss": 0.6268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3339731693267822, + "rewards/margins": 5.601921081542969, + "rewards/rejected": -2.2679479122161865, + "step": 9342 + }, + { + "epoch": 2.34, + "grad_norm": 24.029544830322266, + "learning_rate": 5.509612505294994e-06, + "logits/chosen": -0.5594063401222229, + "logits/rejected": -0.6177205443382263, + "logps/chosen": -51.08122253417969, + "logps/rejected": -105.55121612548828, + "loss": 0.6963, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0338516235351562, + "rewards/margins": 5.662318229675293, + "rewards/rejected": -2.6284661293029785, + "step": 9343 + }, + { + "epoch": 2.34, + "grad_norm": 3.4086005687713623, + "learning_rate": 5.508830604502408e-06, + "logits/chosen": -0.5172315835952759, + "logits/rejected": -0.6145428419113159, + "logps/chosen": -50.626220703125, + "logps/rejected": -94.22518157958984, + "loss": 0.5924, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948359489440918, + "rewards/margins": 6.374207496643066, + "rewards/rejected": -3.4258482456207275, + "step": 9344 + }, + { + "epoch": 2.34, + "grad_norm": 3.668605089187622, + "learning_rate": 5.508048691136075e-06, + "logits/chosen": -0.4953364431858063, + "logits/rejected": -0.6155526638031006, + "logps/chosen": -62.83860397338867, + "logps/rejected": -85.9528579711914, + "loss": 0.6467, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.255352258682251, + "rewards/margins": 6.585314750671387, + "rewards/rejected": -3.329962730407715, + "step": 9345 + }, + { + "epoch": 2.34, + "grad_norm": 6.181486129760742, + "learning_rate": 5.50726676521532e-06, + "logits/chosen": -0.45809245109558105, + "logits/rejected": -0.5425229668617249, + "logps/chosen": -52.915367126464844, + "logps/rejected": -96.28983306884766, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9679043292999268, + "rewards/margins": 5.858341693878174, + "rewards/rejected": -2.890437364578247, + "step": 9346 + }, + { + "epoch": 2.34, + "grad_norm": 3.6897122859954834, + "learning_rate": 5.506484826759464e-06, + "logits/chosen": -0.570475697517395, + "logits/rejected": -0.6716097593307495, + "logps/chosen": -65.88577270507812, + "logps/rejected": -85.00640869140625, + "loss": 0.6757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.95975661277771, + "rewards/margins": 5.965592861175537, + "rewards/rejected": -3.005836009979248, + "step": 9347 + }, + { + "epoch": 2.34, + "grad_norm": 18.32112693786621, + "learning_rate": 5.505702875787829e-06, + "logits/chosen": -0.47609955072402954, + "logits/rejected": -0.5655220746994019, + "logps/chosen": -58.42184829711914, + "logps/rejected": -101.55717468261719, + "loss": 0.6574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.077096462249756, + "rewards/margins": 6.656377792358398, + "rewards/rejected": -3.5792808532714844, + "step": 9348 + }, + { + "epoch": 2.34, + "grad_norm": 6.343172550201416, + "learning_rate": 5.504920912319736e-06, + "logits/chosen": -0.526580810546875, + "logits/rejected": -0.6240674257278442, + "logps/chosen": -53.14575958251953, + "logps/rejected": -90.38900756835938, + "loss": 0.7488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6842188835144043, + "rewards/margins": 5.1924052238464355, + "rewards/rejected": -2.5081868171691895, + "step": 9349 + }, + { + "epoch": 2.34, + "grad_norm": 3.364229679107666, + "learning_rate": 5.504138936374513e-06, + "logits/chosen": -0.45884740352630615, + "logits/rejected": -0.5373945832252502, + "logps/chosen": -55.311988830566406, + "logps/rejected": -93.86918640136719, + "loss": 0.7041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108858108520508, + "rewards/margins": 5.6166229248046875, + "rewards/rejected": -2.5077648162841797, + "step": 9350 + }, + { + "epoch": 2.34, + "grad_norm": 4.033659934997559, + "learning_rate": 5.503356947971478e-06, + "logits/chosen": -0.5364840030670166, + "logits/rejected": -0.5607690215110779, + "logps/chosen": -47.722408294677734, + "logps/rejected": -107.68289184570312, + "loss": 0.6504, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.901660680770874, + "rewards/margins": 5.810005187988281, + "rewards/rejected": -2.9083445072174072, + "step": 9351 + }, + { + "epoch": 2.34, + "grad_norm": 4.214015960693359, + "learning_rate": 5.50257494712996e-06, + "logits/chosen": -0.5296277403831482, + "logits/rejected": -0.5931211113929749, + "logps/chosen": -72.21300506591797, + "logps/rejected": -105.58253479003906, + "loss": 0.7437, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8170347213745117, + "rewards/margins": 6.098186492919922, + "rewards/rejected": -3.2811520099639893, + "step": 9352 + }, + { + "epoch": 2.34, + "grad_norm": 2.413381338119507, + "learning_rate": 5.501792933869279e-06, + "logits/chosen": -0.5118503570556641, + "logits/rejected": -0.5970216393470764, + "logps/chosen": -55.570743560791016, + "logps/rejected": -103.9766845703125, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118894338607788, + "rewards/margins": 7.287114143371582, + "rewards/rejected": -4.168219089508057, + "step": 9353 + }, + { + "epoch": 2.34, + "grad_norm": 10.374755859375, + "learning_rate": 5.50101090820876e-06, + "logits/chosen": -0.5422027707099915, + "logits/rejected": -0.6629384160041809, + "logps/chosen": -57.402610778808594, + "logps/rejected": -79.89686584472656, + "loss": 0.6388, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8227343559265137, + "rewards/margins": 5.719951629638672, + "rewards/rejected": -2.897216796875, + "step": 9354 + }, + { + "epoch": 2.34, + "grad_norm": 8.180351257324219, + "learning_rate": 5.5002288701677305e-06, + "logits/chosen": -0.5376277565956116, + "logits/rejected": -0.6160452365875244, + "logps/chosen": -44.47893142700195, + "logps/rejected": -80.1434326171875, + "loss": 0.6917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9700331687927246, + "rewards/margins": 5.641430854797363, + "rewards/rejected": -2.6713979244232178, + "step": 9355 + }, + { + "epoch": 2.34, + "grad_norm": 6.080971717834473, + "learning_rate": 5.4994468197655136e-06, + "logits/chosen": -0.5793458223342896, + "logits/rejected": -0.6686256527900696, + "logps/chosen": -54.730987548828125, + "logps/rejected": -94.64360809326172, + "loss": 0.6935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.859680652618408, + "rewards/margins": 6.7235307693481445, + "rewards/rejected": -3.8638503551483154, + "step": 9356 + }, + { + "epoch": 2.34, + "grad_norm": 23.709823608398438, + "learning_rate": 5.498664757021434e-06, + "logits/chosen": -0.513194739818573, + "logits/rejected": -0.5730128884315491, + "logps/chosen": -53.17211151123047, + "logps/rejected": -81.79186248779297, + "loss": 0.7472, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0003817081451416, + "rewards/margins": 5.7666096687316895, + "rewards/rejected": -2.766227960586548, + "step": 9357 + }, + { + "epoch": 2.34, + "grad_norm": 6.935032844543457, + "learning_rate": 5.497882681954817e-06, + "logits/chosen": -0.5918378233909607, + "logits/rejected": -0.648520827293396, + "logps/chosen": -56.733299255371094, + "logps/rejected": -101.57844543457031, + "loss": 0.7143, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9715843200683594, + "rewards/margins": 5.406213283538818, + "rewards/rejected": -2.434628486633301, + "step": 9358 + }, + { + "epoch": 2.34, + "grad_norm": 6.140251636505127, + "learning_rate": 5.4971005945849894e-06, + "logits/chosen": -0.49849891662597656, + "logits/rejected": -0.5942568182945251, + "logps/chosen": -47.727394104003906, + "logps/rejected": -114.56364440917969, + "loss": 0.555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.332925319671631, + "rewards/margins": 7.669620513916016, + "rewards/rejected": -4.336695194244385, + "step": 9359 + }, + { + "epoch": 2.34, + "grad_norm": 6.49520206451416, + "learning_rate": 5.496318494931278e-06, + "logits/chosen": -0.4781920313835144, + "logits/rejected": -0.6032871007919312, + "logps/chosen": -57.20071792602539, + "logps/rejected": -104.95868682861328, + "loss": 0.6048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8569083213806152, + "rewards/margins": 8.184738159179688, + "rewards/rejected": -5.3278303146362305, + "step": 9360 + }, + { + "epoch": 2.34, + "grad_norm": 7.983177185058594, + "learning_rate": 5.495536383013009e-06, + "logits/chosen": -0.5108377933502197, + "logits/rejected": -0.610763430595398, + "logps/chosen": -56.86392593383789, + "logps/rejected": -97.21907806396484, + "loss": 0.5977, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8040220737457275, + "rewards/margins": 6.842739105224609, + "rewards/rejected": -4.038716793060303, + "step": 9361 + }, + { + "epoch": 2.34, + "grad_norm": 41.96364974975586, + "learning_rate": 5.494754258849508e-06, + "logits/chosen": -0.5669230818748474, + "logits/rejected": -0.6183497309684753, + "logps/chosen": -72.52993774414062, + "logps/rejected": -108.81492614746094, + "loss": 0.7505, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.21134614944458, + "rewards/margins": 6.444089412689209, + "rewards/rejected": -3.232743263244629, + "step": 9362 + }, + { + "epoch": 2.34, + "grad_norm": 7.450290679931641, + "learning_rate": 5.493972122460104e-06, + "logits/chosen": -0.5525422096252441, + "logits/rejected": -0.6490508317947388, + "logps/chosen": -47.61609649658203, + "logps/rejected": -102.19070434570312, + "loss": 0.6646, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.885183811187744, + "rewards/margins": 6.660114288330078, + "rewards/rejected": -3.774930477142334, + "step": 9363 + }, + { + "epoch": 2.34, + "grad_norm": 6.059629917144775, + "learning_rate": 5.493189973864124e-06, + "logits/chosen": -0.4621310234069824, + "logits/rejected": -0.5903480648994446, + "logps/chosen": -63.758888244628906, + "logps/rejected": -84.2534408569336, + "loss": 0.6883, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1804490089416504, + "rewards/margins": 5.938166618347168, + "rewards/rejected": -2.7577168941497803, + "step": 9364 + }, + { + "epoch": 2.34, + "grad_norm": 6.994843482971191, + "learning_rate": 5.492407813080892e-06, + "logits/chosen": -0.45559045672416687, + "logits/rejected": -0.5326603055000305, + "logps/chosen": -57.11375045776367, + "logps/rejected": -87.97224426269531, + "loss": 0.6457, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5569236278533936, + "rewards/margins": 5.781774520874023, + "rewards/rejected": -3.224851131439209, + "step": 9365 + }, + { + "epoch": 2.34, + "grad_norm": 6.100209712982178, + "learning_rate": 5.491625640129743e-06, + "logits/chosen": -0.45343875885009766, + "logits/rejected": -0.5445049405097961, + "logps/chosen": -56.24641418457031, + "logps/rejected": -97.69107055664062, + "loss": 0.6252, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9804930686950684, + "rewards/margins": 6.203958988189697, + "rewards/rejected": -3.2234654426574707, + "step": 9366 + }, + { + "epoch": 2.34, + "grad_norm": 8.071011543273926, + "learning_rate": 5.490843455029999e-06, + "logits/chosen": -0.5272114872932434, + "logits/rejected": -0.6032297611236572, + "logps/chosen": -56.19548034667969, + "logps/rejected": -95.80719757080078, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8580594062805176, + "rewards/margins": 6.423355579376221, + "rewards/rejected": -3.565296173095703, + "step": 9367 + }, + { + "epoch": 2.34, + "grad_norm": 5.764888286590576, + "learning_rate": 5.490061257800991e-06, + "logits/chosen": -0.5784690380096436, + "logits/rejected": -0.6335129737854004, + "logps/chosen": -62.67741394042969, + "logps/rejected": -108.42455291748047, + "loss": 0.7266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0785467624664307, + "rewards/margins": 6.33480167388916, + "rewards/rejected": -3.2562551498413086, + "step": 9368 + }, + { + "epoch": 2.34, + "grad_norm": 5.174933433532715, + "learning_rate": 5.489279048462049e-06, + "logits/chosen": -0.5350493788719177, + "logits/rejected": -0.6204015612602234, + "logps/chosen": -52.247337341308594, + "logps/rejected": -92.28673553466797, + "loss": 0.6194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0290098190307617, + "rewards/margins": 5.869570732116699, + "rewards/rejected": -2.8405613899230957, + "step": 9369 + }, + { + "epoch": 2.34, + "grad_norm": 3.7423624992370605, + "learning_rate": 5.488496827032504e-06, + "logits/chosen": -0.5069433450698853, + "logits/rejected": -0.5545135140419006, + "logps/chosen": -56.346229553222656, + "logps/rejected": -111.48326110839844, + "loss": 0.6359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.289022922515869, + "rewards/margins": 6.611917972564697, + "rewards/rejected": -3.3228955268859863, + "step": 9370 + }, + { + "epoch": 2.34, + "grad_norm": 16.2872371673584, + "learning_rate": 5.487714593531679e-06, + "logits/chosen": -0.4855937361717224, + "logits/rejected": -0.5987023711204529, + "logps/chosen": -51.310001373291016, + "logps/rejected": -100.59700012207031, + "loss": 0.5755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0018582344055176, + "rewards/margins": 6.535713195800781, + "rewards/rejected": -3.5338547229766846, + "step": 9371 + }, + { + "epoch": 2.34, + "grad_norm": 14.95549201965332, + "learning_rate": 5.486932347978908e-06, + "logits/chosen": -0.4597495198249817, + "logits/rejected": -0.5722510814666748, + "logps/chosen": -63.467323303222656, + "logps/rejected": -99.18450164794922, + "loss": 0.8329, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8519749641418457, + "rewards/margins": 5.726283550262451, + "rewards/rejected": -2.8743081092834473, + "step": 9372 + }, + { + "epoch": 2.34, + "grad_norm": 6.655081748962402, + "learning_rate": 5.486150090393523e-06, + "logits/chosen": -0.4588753581047058, + "logits/rejected": -0.5337738990783691, + "logps/chosen": -48.89267349243164, + "logps/rejected": -95.74907684326172, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.137073040008545, + "rewards/margins": 6.423454284667969, + "rewards/rejected": -3.2863810062408447, + "step": 9373 + }, + { + "epoch": 2.35, + "grad_norm": 8.358901023864746, + "learning_rate": 5.485367820794849e-06, + "logits/chosen": -0.6333749294281006, + "logits/rejected": -0.6768291592597961, + "logps/chosen": -48.06695556640625, + "logps/rejected": -91.52090454101562, + "loss": 0.7845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.633031129837036, + "rewards/margins": 4.798724174499512, + "rewards/rejected": -2.1656930446624756, + "step": 9374 + }, + { + "epoch": 2.35, + "grad_norm": 9.142221450805664, + "learning_rate": 5.484585539202223e-06, + "logits/chosen": -0.4580642580986023, + "logits/rejected": -0.5578272938728333, + "logps/chosen": -64.29132080078125, + "logps/rejected": -89.62710571289062, + "loss": 0.9668, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8799946308135986, + "rewards/margins": 5.333981990814209, + "rewards/rejected": -2.4539871215820312, + "step": 9375 + }, + { + "epoch": 2.35, + "grad_norm": 2.809798240661621, + "learning_rate": 5.483803245634971e-06, + "logits/chosen": -0.5300845503807068, + "logits/rejected": -0.6998003721237183, + "logps/chosen": -70.38782501220703, + "logps/rejected": -90.17142486572266, + "loss": 0.6249, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.708535671234131, + "rewards/margins": 6.927563190460205, + "rewards/rejected": -4.219028472900391, + "step": 9376 + }, + { + "epoch": 2.35, + "grad_norm": 4.405095100402832, + "learning_rate": 5.483020940112427e-06, + "logits/chosen": -0.5698481798171997, + "logits/rejected": -0.6622419357299805, + "logps/chosen": -51.31962203979492, + "logps/rejected": -79.61968231201172, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9182069301605225, + "rewards/margins": 6.175330638885498, + "rewards/rejected": -3.2571237087249756, + "step": 9377 + }, + { + "epoch": 2.35, + "grad_norm": 3.413224220275879, + "learning_rate": 5.482238622653921e-06, + "logits/chosen": -0.5094245672225952, + "logits/rejected": -0.5899205803871155, + "logps/chosen": -56.74308776855469, + "logps/rejected": -88.8012924194336, + "loss": 0.7522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2355823516845703, + "rewards/margins": 6.07661771774292, + "rewards/rejected": -2.8410353660583496, + "step": 9378 + }, + { + "epoch": 2.35, + "grad_norm": 3.9256346225738525, + "learning_rate": 5.481456293278785e-06, + "logits/chosen": -0.5167214274406433, + "logits/rejected": -0.5585448741912842, + "logps/chosen": -45.29322814941406, + "logps/rejected": -86.5306396484375, + "loss": 0.6796, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4954662322998047, + "rewards/margins": 5.56458044052124, + "rewards/rejected": -2.0691144466400146, + "step": 9379 + }, + { + "epoch": 2.35, + "grad_norm": 4.651712417602539, + "learning_rate": 5.480673952006355e-06, + "logits/chosen": -0.44530802965164185, + "logits/rejected": -0.5298313498497009, + "logps/chosen": -56.540157318115234, + "logps/rejected": -83.90122985839844, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9037234783172607, + "rewards/margins": 6.047830104827881, + "rewards/rejected": -3.144106388092041, + "step": 9380 + }, + { + "epoch": 2.35, + "grad_norm": 3.991565704345703, + "learning_rate": 5.479891598855957e-06, + "logits/chosen": -0.5664825439453125, + "logits/rejected": -0.6328680515289307, + "logps/chosen": -54.56085205078125, + "logps/rejected": -91.7224349975586, + "loss": 0.7605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0441389083862305, + "rewards/margins": 5.805177688598633, + "rewards/rejected": -2.7610390186309814, + "step": 9381 + }, + { + "epoch": 2.35, + "grad_norm": 6.731088161468506, + "learning_rate": 5.4791092338469285e-06, + "logits/chosen": -0.40736067295074463, + "logits/rejected": -0.4557446241378784, + "logps/chosen": -50.49762725830078, + "logps/rejected": -110.51652526855469, + "loss": 0.7497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.952310800552368, + "rewards/margins": 5.496364593505859, + "rewards/rejected": -2.544053792953491, + "step": 9382 + }, + { + "epoch": 2.35, + "grad_norm": 3.6197221279144287, + "learning_rate": 5.478326856998602e-06, + "logits/chosen": -0.5140541791915894, + "logits/rejected": -0.5739404559135437, + "logps/chosen": -49.530460357666016, + "logps/rejected": -99.0681381225586, + "loss": 0.6158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8588571548461914, + "rewards/margins": 6.6825385093688965, + "rewards/rejected": -3.8236823081970215, + "step": 9383 + }, + { + "epoch": 2.35, + "grad_norm": 9.49448299407959, + "learning_rate": 5.47754446833031e-06, + "logits/chosen": -0.508776843547821, + "logits/rejected": -0.5817469954490662, + "logps/chosen": -57.82575225830078, + "logps/rejected": -90.01181030273438, + "loss": 0.8138, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.542428731918335, + "rewards/margins": 5.250507831573486, + "rewards/rejected": -2.7080798149108887, + "step": 9384 + }, + { + "epoch": 2.35, + "grad_norm": 7.2812323570251465, + "learning_rate": 5.476762067861385e-06, + "logits/chosen": -0.5143764615058899, + "logits/rejected": -0.538780152797699, + "logps/chosen": -44.82851791381836, + "logps/rejected": -103.82167053222656, + "loss": 0.7007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.881791591644287, + "rewards/margins": 5.554912567138672, + "rewards/rejected": -2.673121213912964, + "step": 9385 + }, + { + "epoch": 2.35, + "grad_norm": 15.525755882263184, + "learning_rate": 5.475979655611163e-06, + "logits/chosen": -0.4623372554779053, + "logits/rejected": -0.5442081689834595, + "logps/chosen": -58.11327362060547, + "logps/rejected": -91.56099700927734, + "loss": 0.7405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7271006107330322, + "rewards/margins": 5.459409713745117, + "rewards/rejected": -2.732308864593506, + "step": 9386 + }, + { + "epoch": 2.35, + "grad_norm": 4.2499775886535645, + "learning_rate": 5.475197231598978e-06, + "logits/chosen": -0.5304945707321167, + "logits/rejected": -0.6283648610115051, + "logps/chosen": -56.386409759521484, + "logps/rejected": -83.59474182128906, + "loss": 0.658, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1920418739318848, + "rewards/margins": 6.282026290893555, + "rewards/rejected": -3.089984655380249, + "step": 9387 + }, + { + "epoch": 2.35, + "grad_norm": 6.151022434234619, + "learning_rate": 5.474414795844163e-06, + "logits/chosen": -0.49860838055610657, + "logits/rejected": -0.6146707534790039, + "logps/chosen": -53.01927185058594, + "logps/rejected": -92.57420349121094, + "loss": 0.6802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3018178939819336, + "rewards/margins": 6.313602924346924, + "rewards/rejected": -3.0117852687835693, + "step": 9388 + }, + { + "epoch": 2.35, + "grad_norm": 6.031075954437256, + "learning_rate": 5.473632348366055e-06, + "logits/chosen": -0.5487972497940063, + "logits/rejected": -0.6009643077850342, + "logps/chosen": -51.75399398803711, + "logps/rejected": -100.62501525878906, + "loss": 0.7132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9931159019470215, + "rewards/margins": 5.49796199798584, + "rewards/rejected": -2.5048460960388184, + "step": 9389 + }, + { + "epoch": 2.35, + "grad_norm": 3.985224723815918, + "learning_rate": 5.472849889183986e-06, + "logits/chosen": -0.6586858034133911, + "logits/rejected": -0.7272068858146667, + "logps/chosen": -42.705345153808594, + "logps/rejected": -90.35562896728516, + "loss": 0.5348, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0667033195495605, + "rewards/margins": 6.5500006675720215, + "rewards/rejected": -3.483297348022461, + "step": 9390 + }, + { + "epoch": 2.35, + "grad_norm": 5.069839000701904, + "learning_rate": 5.472067418317294e-06, + "logits/chosen": -0.51667320728302, + "logits/rejected": -0.6175505518913269, + "logps/chosen": -51.84497833251953, + "logps/rejected": -89.42710876464844, + "loss": 0.6659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8918423652648926, + "rewards/margins": 5.634828567504883, + "rewards/rejected": -2.7429866790771484, + "step": 9391 + }, + { + "epoch": 2.35, + "grad_norm": 3.348463296890259, + "learning_rate": 5.4712849357853144e-06, + "logits/chosen": -0.5179711580276489, + "logits/rejected": -0.6305626630783081, + "logps/chosen": -47.63477325439453, + "logps/rejected": -67.79390716552734, + "loss": 0.5883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2147865295410156, + "rewards/margins": 5.664385795593262, + "rewards/rejected": -2.449599504470825, + "step": 9392 + }, + { + "epoch": 2.35, + "grad_norm": 4.2889084815979, + "learning_rate": 5.470502441607384e-06, + "logits/chosen": -0.4039343595504761, + "logits/rejected": -0.5044816732406616, + "logps/chosen": -62.40486145019531, + "logps/rejected": -98.93643188476562, + "loss": 0.623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8966169357299805, + "rewards/margins": 6.261216640472412, + "rewards/rejected": -3.3645997047424316, + "step": 9393 + }, + { + "epoch": 2.35, + "grad_norm": 3.415562391281128, + "learning_rate": 5.469719935802834e-06, + "logits/chosen": -0.6248032450675964, + "logits/rejected": -0.694005012512207, + "logps/chosen": -43.74759292602539, + "logps/rejected": -81.64449310302734, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8860220909118652, + "rewards/margins": 5.8156843185424805, + "rewards/rejected": -2.9296622276306152, + "step": 9394 + }, + { + "epoch": 2.35, + "grad_norm": 4.95786714553833, + "learning_rate": 5.4689374183910085e-06, + "logits/chosen": -0.5431916117668152, + "logits/rejected": -0.6147328019142151, + "logps/chosen": -49.12237548828125, + "logps/rejected": -85.1897201538086, + "loss": 0.6111, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1657302379608154, + "rewards/margins": 5.763070106506348, + "rewards/rejected": -2.597339630126953, + "step": 9395 + }, + { + "epoch": 2.35, + "grad_norm": 21.525604248046875, + "learning_rate": 5.468154889391239e-06, + "logits/chosen": -0.46980932354927063, + "logits/rejected": -0.54079669713974, + "logps/chosen": -63.503536224365234, + "logps/rejected": -91.91586303710938, + "loss": 0.8538, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.86405086517334, + "rewards/margins": 4.957912921905518, + "rewards/rejected": -2.0938620567321777, + "step": 9396 + }, + { + "epoch": 2.35, + "grad_norm": 17.265962600708008, + "learning_rate": 5.467372348822865e-06, + "logits/chosen": -0.441739022731781, + "logits/rejected": -0.544371485710144, + "logps/chosen": -61.43778991699219, + "logps/rejected": -95.39653778076172, + "loss": 0.7406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0700769424438477, + "rewards/margins": 6.709172248840332, + "rewards/rejected": -3.6390957832336426, + "step": 9397 + }, + { + "epoch": 2.35, + "grad_norm": 5.47228479385376, + "learning_rate": 5.466589796705223e-06, + "logits/chosen": -0.5052036643028259, + "logits/rejected": -0.5607865452766418, + "logps/chosen": -53.92290496826172, + "logps/rejected": -101.3172378540039, + "loss": 0.7111, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.914423942565918, + "rewards/margins": 6.395052909851074, + "rewards/rejected": -3.480628252029419, + "step": 9398 + }, + { + "epoch": 2.35, + "grad_norm": 8.717361450195312, + "learning_rate": 5.4658072330576505e-06, + "logits/chosen": -0.4770427644252777, + "logits/rejected": -0.6133627891540527, + "logps/chosen": -57.18334197998047, + "logps/rejected": -90.04833984375, + "loss": 0.7814, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.0222585201263428, + "rewards/margins": 5.715452671051025, + "rewards/rejected": -2.6931939125061035, + "step": 9399 + }, + { + "epoch": 2.35, + "grad_norm": 8.490346908569336, + "learning_rate": 5.465024657899486e-06, + "logits/chosen": -0.5764623284339905, + "logits/rejected": -0.6755728125572205, + "logps/chosen": -58.04859924316406, + "logps/rejected": -78.55044555664062, + "loss": 0.6836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032942295074463, + "rewards/margins": 4.968044281005859, + "rewards/rejected": -1.9351017475128174, + "step": 9400 + }, + { + "epoch": 2.35, + "grad_norm": 3.479872703552246, + "learning_rate": 5.464242071250067e-06, + "logits/chosen": -0.4933781325817108, + "logits/rejected": -0.5686286687850952, + "logps/chosen": -44.86442947387695, + "logps/rejected": -100.95405578613281, + "loss": 0.5445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.999093770980835, + "rewards/margins": 6.73454475402832, + "rewards/rejected": -3.7354509830474854, + "step": 9401 + }, + { + "epoch": 2.35, + "grad_norm": 4.24899959564209, + "learning_rate": 5.463459473128733e-06, + "logits/chosen": -0.5448157787322998, + "logits/rejected": -0.5894986987113953, + "logps/chosen": -44.0878791809082, + "logps/rejected": -97.23050689697266, + "loss": 0.7005, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.132349729537964, + "rewards/margins": 4.803236961364746, + "rewards/rejected": -1.6708874702453613, + "step": 9402 + }, + { + "epoch": 2.35, + "grad_norm": 3.84627628326416, + "learning_rate": 5.4626768635548225e-06, + "logits/chosen": -0.43504202365875244, + "logits/rejected": -0.5702185034751892, + "logps/chosen": -62.2835693359375, + "logps/rejected": -102.27568054199219, + "loss": 0.5778, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.703407049179077, + "rewards/margins": 6.615390300750732, + "rewards/rejected": -3.9119834899902344, + "step": 9403 + }, + { + "epoch": 2.35, + "grad_norm": 6.950927257537842, + "learning_rate": 5.461894242547675e-06, + "logits/chosen": -0.5846801996231079, + "logits/rejected": -0.5950179696083069, + "logps/chosen": -55.52088928222656, + "logps/rejected": -114.1602554321289, + "loss": 0.7199, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.922548294067383, + "rewards/margins": 5.851649761199951, + "rewards/rejected": -2.9291014671325684, + "step": 9404 + }, + { + "epoch": 2.35, + "grad_norm": 4.7774505615234375, + "learning_rate": 5.4611116101266285e-06, + "logits/chosen": -0.5786288976669312, + "logits/rejected": -0.645543098449707, + "logps/chosen": -47.164527893066406, + "logps/rejected": -83.09798431396484, + "loss": 0.6507, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0317752361297607, + "rewards/margins": 5.771259784698486, + "rewards/rejected": -2.7394843101501465, + "step": 9405 + }, + { + "epoch": 2.35, + "grad_norm": 3.6740448474884033, + "learning_rate": 5.460328966311024e-06, + "logits/chosen": -0.5161569118499756, + "logits/rejected": -0.593355655670166, + "logps/chosen": -59.17653274536133, + "logps/rejected": -102.28590393066406, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.003190517425537, + "rewards/margins": 6.683916091918945, + "rewards/rejected": -3.6807258129119873, + "step": 9406 + }, + { + "epoch": 2.35, + "grad_norm": 6.833048343658447, + "learning_rate": 5.4595463111202016e-06, + "logits/chosen": -0.5489720106124878, + "logits/rejected": -0.6178721785545349, + "logps/chosen": -52.665218353271484, + "logps/rejected": -111.5204086303711, + "loss": 0.5542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9625442028045654, + "rewards/margins": 6.087093830108643, + "rewards/rejected": -3.1245501041412354, + "step": 9407 + }, + { + "epoch": 2.35, + "grad_norm": 8.419499397277832, + "learning_rate": 5.458763644573501e-06, + "logits/chosen": -0.506171464920044, + "logits/rejected": -0.5812203288078308, + "logps/chosen": -53.08572006225586, + "logps/rejected": -93.90999603271484, + "loss": 0.6931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.147775650024414, + "rewards/margins": 6.479922294616699, + "rewards/rejected": -3.332146406173706, + "step": 9408 + }, + { + "epoch": 2.35, + "grad_norm": 3.7686779499053955, + "learning_rate": 5.457980966690263e-06, + "logits/chosen": -0.531363844871521, + "logits/rejected": -0.629121720790863, + "logps/chosen": -53.50635528564453, + "logps/rejected": -96.2603759765625, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1187868118286133, + "rewards/margins": 6.397467613220215, + "rewards/rejected": -3.2786810398101807, + "step": 9409 + }, + { + "epoch": 2.35, + "grad_norm": 11.637099266052246, + "learning_rate": 5.457198277489827e-06, + "logits/chosen": -0.5218091011047363, + "logits/rejected": -0.6127067804336548, + "logps/chosen": -49.51874923706055, + "logps/rejected": -84.00054931640625, + "loss": 0.6119, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8574059009552, + "rewards/margins": 5.903600692749023, + "rewards/rejected": -3.0461950302124023, + "step": 9410 + }, + { + "epoch": 2.35, + "grad_norm": 8.671859741210938, + "learning_rate": 5.4564155769915365e-06, + "logits/chosen": -0.5217595100402832, + "logits/rejected": -0.5356345176696777, + "logps/chosen": -53.504615783691406, + "logps/rejected": -102.46128845214844, + "loss": 0.6513, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143944501876831, + "rewards/margins": 4.848435401916504, + "rewards/rejected": -1.7044910192489624, + "step": 9411 + }, + { + "epoch": 2.35, + "grad_norm": 3.7549924850463867, + "learning_rate": 5.455632865214731e-06, + "logits/chosen": -0.548551082611084, + "logits/rejected": -0.6417139172554016, + "logps/chosen": -53.62434387207031, + "logps/rejected": -87.56246948242188, + "loss": 0.6416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.288257360458374, + "rewards/margins": 5.703886985778809, + "rewards/rejected": -2.4156298637390137, + "step": 9412 + }, + { + "epoch": 2.35, + "grad_norm": 4.235039234161377, + "learning_rate": 5.4548501421787534e-06, + "logits/chosen": -0.4640007019042969, + "logits/rejected": -0.5300223231315613, + "logps/chosen": -54.621826171875, + "logps/rejected": -91.9093017578125, + "loss": 0.682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0989255905151367, + "rewards/margins": 5.34653377532959, + "rewards/rejected": -2.247608184814453, + "step": 9413 + }, + { + "epoch": 2.36, + "grad_norm": 7.535680294036865, + "learning_rate": 5.454067407902944e-06, + "logits/chosen": -0.5432090163230896, + "logits/rejected": -0.5969170928001404, + "logps/chosen": -59.97880554199219, + "logps/rejected": -96.2864761352539, + "loss": 0.7656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6987838745117188, + "rewards/margins": 4.867671966552734, + "rewards/rejected": -2.168888568878174, + "step": 9414 + }, + { + "epoch": 2.36, + "grad_norm": 3.5879242420196533, + "learning_rate": 5.453284662406646e-06, + "logits/chosen": -0.4787689745426178, + "logits/rejected": -0.6129556894302368, + "logps/chosen": -58.061580657958984, + "logps/rejected": -90.20616912841797, + "loss": 0.6077, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037466049194336, + "rewards/margins": 7.03875207901001, + "rewards/rejected": -4.001285076141357, + "step": 9415 + }, + { + "epoch": 2.36, + "grad_norm": 5.670753002166748, + "learning_rate": 5.452501905709203e-06, + "logits/chosen": -0.43835675716400146, + "logits/rejected": -0.5545352697372437, + "logps/chosen": -49.844032287597656, + "logps/rejected": -86.65999603271484, + "loss": 0.6021, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9817471504211426, + "rewards/margins": 5.4807329177856445, + "rewards/rejected": -2.4989852905273438, + "step": 9416 + }, + { + "epoch": 2.36, + "grad_norm": 3.918475389480591, + "learning_rate": 5.451719137829955e-06, + "logits/chosen": -0.5748341083526611, + "logits/rejected": -0.650399923324585, + "logps/chosen": -50.8443489074707, + "logps/rejected": -91.349365234375, + "loss": 0.6775, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9414210319519043, + "rewards/margins": 5.404611587524414, + "rewards/rejected": -2.4631900787353516, + "step": 9417 + }, + { + "epoch": 2.36, + "grad_norm": 5.814828872680664, + "learning_rate": 5.450936358788248e-06, + "logits/chosen": -0.5815531015396118, + "logits/rejected": -0.6293348670005798, + "logps/chosen": -42.99090576171875, + "logps/rejected": -99.03759002685547, + "loss": 0.6854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9050629138946533, + "rewards/margins": 6.446496963500977, + "rewards/rejected": -3.5414350032806396, + "step": 9418 + }, + { + "epoch": 2.36, + "grad_norm": 4.004345893859863, + "learning_rate": 5.450153568603424e-06, + "logits/chosen": -0.5447781085968018, + "logits/rejected": -0.5970187187194824, + "logps/chosen": -53.04185104370117, + "logps/rejected": -104.58338928222656, + "loss": 0.6909, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0817394256591797, + "rewards/margins": 6.6521196365356445, + "rewards/rejected": -3.570380210876465, + "step": 9419 + }, + { + "epoch": 2.36, + "grad_norm": 4.192820072174072, + "learning_rate": 5.449370767294826e-06, + "logits/chosen": -0.5749384760856628, + "logits/rejected": -0.6307665109634399, + "logps/chosen": -46.662635803222656, + "logps/rejected": -95.11183166503906, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.788297176361084, + "rewards/margins": 5.857969284057617, + "rewards/rejected": -3.069671630859375, + "step": 9420 + }, + { + "epoch": 2.36, + "grad_norm": 11.013652801513672, + "learning_rate": 5.4485879548817985e-06, + "logits/chosen": -0.4822719097137451, + "logits/rejected": -0.6010599136352539, + "logps/chosen": -63.34941101074219, + "logps/rejected": -92.46453857421875, + "loss": 0.6714, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7096939086914062, + "rewards/margins": 6.099503993988037, + "rewards/rejected": -3.3898096084594727, + "step": 9421 + }, + { + "epoch": 2.36, + "grad_norm": 3.9504501819610596, + "learning_rate": 5.4478051313836854e-06, + "logits/chosen": -0.542289674282074, + "logits/rejected": -0.6441203355789185, + "logps/chosen": -53.56991958618164, + "logps/rejected": -81.2708969116211, + "loss": 0.6501, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.025571346282959, + "rewards/margins": 6.056609153747559, + "rewards/rejected": -3.0310378074645996, + "step": 9422 + }, + { + "epoch": 2.36, + "grad_norm": 15.308694839477539, + "learning_rate": 5.447022296819831e-06, + "logits/chosen": -0.47487926483154297, + "logits/rejected": -0.5656408071517944, + "logps/chosen": -46.60708999633789, + "logps/rejected": -83.80477142333984, + "loss": 0.6962, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.542872905731201, + "rewards/margins": 6.32063627243042, + "rewards/rejected": -3.7777631282806396, + "step": 9423 + }, + { + "epoch": 2.36, + "grad_norm": 7.012260913848877, + "learning_rate": 5.446239451209581e-06, + "logits/chosen": -0.5061501860618591, + "logits/rejected": -0.5452027320861816, + "logps/chosen": -56.514976501464844, + "logps/rejected": -126.23637390136719, + "loss": 0.671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.040808916091919, + "rewards/margins": 7.324341773986816, + "rewards/rejected": -4.28353214263916, + "step": 9424 + }, + { + "epoch": 2.36, + "grad_norm": 6.926992416381836, + "learning_rate": 5.445456594572278e-06, + "logits/chosen": -0.5581778883934021, + "logits/rejected": -0.627348780632019, + "logps/chosen": -52.47166061401367, + "logps/rejected": -88.00605010986328, + "loss": 0.7127, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2616424560546875, + "rewards/margins": 5.50174617767334, + "rewards/rejected": -2.2401041984558105, + "step": 9425 + }, + { + "epoch": 2.36, + "grad_norm": 5.027368068695068, + "learning_rate": 5.4446737269272685e-06, + "logits/chosen": -0.4973651170730591, + "logits/rejected": -0.5956178307533264, + "logps/chosen": -50.488136291503906, + "logps/rejected": -97.43101501464844, + "loss": 0.5778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205397605895996, + "rewards/margins": 6.36409854888916, + "rewards/rejected": -3.158701181411743, + "step": 9426 + }, + { + "epoch": 2.36, + "grad_norm": 6.152937889099121, + "learning_rate": 5.443890848293901e-06, + "logits/chosen": -0.5289318561553955, + "logits/rejected": -0.6276228427886963, + "logps/chosen": -61.45943832397461, + "logps/rejected": -99.54798889160156, + "loss": 0.6378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.847332000732422, + "rewards/margins": 6.5543670654296875, + "rewards/rejected": -3.7070345878601074, + "step": 9427 + }, + { + "epoch": 2.36, + "grad_norm": 4.302847862243652, + "learning_rate": 5.443107958691517e-06, + "logits/chosen": -0.5260041356086731, + "logits/rejected": -0.6085835695266724, + "logps/chosen": -45.23155212402344, + "logps/rejected": -87.93524169921875, + "loss": 0.5893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3357350826263428, + "rewards/margins": 6.251183032989502, + "rewards/rejected": -2.9154484272003174, + "step": 9428 + }, + { + "epoch": 2.36, + "grad_norm": 4.60800838470459, + "learning_rate": 5.442325058139463e-06, + "logits/chosen": -0.5025537014007568, + "logits/rejected": -0.5921187400817871, + "logps/chosen": -60.417236328125, + "logps/rejected": -100.14733123779297, + "loss": 0.6846, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0425662994384766, + "rewards/margins": 5.440347671508789, + "rewards/rejected": -2.3977811336517334, + "step": 9429 + }, + { + "epoch": 2.36, + "grad_norm": 9.374146461486816, + "learning_rate": 5.441542146657088e-06, + "logits/chosen": -0.4946940839290619, + "logits/rejected": -0.5333736538887024, + "logps/chosen": -64.1772232055664, + "logps/rejected": -109.2091293334961, + "loss": 0.8516, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8617000579833984, + "rewards/margins": 5.761163711547852, + "rewards/rejected": -2.8994641304016113, + "step": 9430 + }, + { + "epoch": 2.36, + "grad_norm": 6.512482643127441, + "learning_rate": 5.4407592242637355e-06, + "logits/chosen": -0.4351823031902313, + "logits/rejected": -0.5475064516067505, + "logps/chosen": -55.60017395019531, + "logps/rejected": -86.92850494384766, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6495113372802734, + "rewards/margins": 5.6485490798950195, + "rewards/rejected": -2.999037504196167, + "step": 9431 + }, + { + "epoch": 2.36, + "grad_norm": 4.9232258796691895, + "learning_rate": 5.439976290978755e-06, + "logits/chosen": -0.4930132329463959, + "logits/rejected": -0.5470304489135742, + "logps/chosen": -57.66359329223633, + "logps/rejected": -102.62916564941406, + "loss": 0.6622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.146317958831787, + "rewards/margins": 6.420919418334961, + "rewards/rejected": -3.274601697921753, + "step": 9432 + }, + { + "epoch": 2.36, + "grad_norm": 5.951364517211914, + "learning_rate": 5.43919334682149e-06, + "logits/chosen": -0.5141157507896423, + "logits/rejected": -0.5749547481536865, + "logps/chosen": -44.99470520019531, + "logps/rejected": -99.72544860839844, + "loss": 0.566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7263166904449463, + "rewards/margins": 6.390042781829834, + "rewards/rejected": -3.663726329803467, + "step": 9433 + }, + { + "epoch": 2.36, + "grad_norm": 4.008440017700195, + "learning_rate": 5.438410391811293e-06, + "logits/chosen": -0.5782041549682617, + "logits/rejected": -0.627487063407898, + "logps/chosen": -55.76598358154297, + "logps/rejected": -123.58665466308594, + "loss": 0.6645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0089571475982666, + "rewards/margins": 7.422414302825928, + "rewards/rejected": -4.413456916809082, + "step": 9434 + }, + { + "epoch": 2.36, + "grad_norm": 5.479777812957764, + "learning_rate": 5.437627425967507e-06, + "logits/chosen": -0.5231137275695801, + "logits/rejected": -0.6040524840354919, + "logps/chosen": -75.1175537109375, + "logps/rejected": -96.31294250488281, + "loss": 0.7551, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.677497386932373, + "rewards/margins": 5.838111400604248, + "rewards/rejected": -3.160614013671875, + "step": 9435 + }, + { + "epoch": 2.36, + "grad_norm": 4.321670055389404, + "learning_rate": 5.436844449309484e-06, + "logits/chosen": -0.49024003744125366, + "logits/rejected": -0.5889815092086792, + "logps/chosen": -54.38275909423828, + "logps/rejected": -104.67243194580078, + "loss": 0.734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2032246589660645, + "rewards/margins": 7.956882476806641, + "rewards/rejected": -4.753657341003418, + "step": 9436 + }, + { + "epoch": 2.36, + "grad_norm": 10.300190925598145, + "learning_rate": 5.436061461856569e-06, + "logits/chosen": -0.4770483076572418, + "logits/rejected": -0.5380201935768127, + "logps/chosen": -55.628639221191406, + "logps/rejected": -116.23680877685547, + "loss": 0.6857, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8166961669921875, + "rewards/margins": 6.257391929626465, + "rewards/rejected": -3.4406962394714355, + "step": 9437 + }, + { + "epoch": 2.36, + "grad_norm": 14.986383438110352, + "learning_rate": 5.435278463628111e-06, + "logits/chosen": -0.5532569885253906, + "logits/rejected": -0.6510782241821289, + "logps/chosen": -60.2264518737793, + "logps/rejected": -82.01537322998047, + "loss": 0.7185, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.021771192550659, + "rewards/margins": 6.306274890899658, + "rewards/rejected": -3.2845041751861572, + "step": 9438 + }, + { + "epoch": 2.36, + "grad_norm": 2.505115270614624, + "learning_rate": 5.434495454643459e-06, + "logits/chosen": -0.47100207209587097, + "logits/rejected": -0.5793522000312805, + "logps/chosen": -49.68497848510742, + "logps/rejected": -117.79912567138672, + "loss": 0.5736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3630430698394775, + "rewards/margins": 8.83964729309082, + "rewards/rejected": -5.476604461669922, + "step": 9439 + }, + { + "epoch": 2.36, + "grad_norm": 9.881478309631348, + "learning_rate": 5.4337124349219625e-06, + "logits/chosen": -0.5635621547698975, + "logits/rejected": -0.6209660768508911, + "logps/chosen": -58.24211883544922, + "logps/rejected": -104.310791015625, + "loss": 0.7518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8543708324432373, + "rewards/margins": 5.973905563354492, + "rewards/rejected": -3.1195342540740967, + "step": 9440 + }, + { + "epoch": 2.36, + "grad_norm": 4.805971622467041, + "learning_rate": 5.432929404482972e-06, + "logits/chosen": -0.5302810668945312, + "logits/rejected": -0.6291038393974304, + "logps/chosen": -65.64359283447266, + "logps/rejected": -89.55067443847656, + "loss": 0.7574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.721816062927246, + "rewards/margins": 5.734613418579102, + "rewards/rejected": -3.0127973556518555, + "step": 9441 + }, + { + "epoch": 2.36, + "grad_norm": 5.391746520996094, + "learning_rate": 5.432146363345832e-06, + "logits/chosen": -0.5345723032951355, + "logits/rejected": -0.5690653920173645, + "logps/chosen": -62.72917175292969, + "logps/rejected": -108.3044204711914, + "loss": 0.7446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8565773963928223, + "rewards/margins": 6.570187568664551, + "rewards/rejected": -3.7136101722717285, + "step": 9442 + }, + { + "epoch": 2.36, + "grad_norm": 3.4303321838378906, + "learning_rate": 5.431363311529898e-06, + "logits/chosen": -0.5072506666183472, + "logits/rejected": -0.5992922782897949, + "logps/chosen": -65.5709228515625, + "logps/rejected": -102.5714340209961, + "loss": 0.6097, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100796937942505, + "rewards/margins": 6.963100433349609, + "rewards/rejected": -3.8623037338256836, + "step": 9443 + }, + { + "epoch": 2.36, + "grad_norm": 9.453158378601074, + "learning_rate": 5.4305802490545176e-06, + "logits/chosen": -0.5104684829711914, + "logits/rejected": -0.5471515655517578, + "logps/chosen": -53.553375244140625, + "logps/rejected": -90.47816467285156, + "loss": 0.6896, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1769750118255615, + "rewards/margins": 5.751026153564453, + "rewards/rejected": -2.5740509033203125, + "step": 9444 + }, + { + "epoch": 2.36, + "grad_norm": 2.7597239017486572, + "learning_rate": 5.42979717593904e-06, + "logits/chosen": -0.36540487408638, + "logits/rejected": -0.45463067293167114, + "logps/chosen": -59.271881103515625, + "logps/rejected": -102.31205749511719, + "loss": 0.5628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081705093383789, + "rewards/margins": 6.5773115158081055, + "rewards/rejected": -3.4956064224243164, + "step": 9445 + }, + { + "epoch": 2.36, + "grad_norm": 11.720927238464355, + "learning_rate": 5.4290140922028186e-06, + "logits/chosen": -0.5250157713890076, + "logits/rejected": -0.5979937314987183, + "logps/chosen": -53.412864685058594, + "logps/rejected": -101.76679992675781, + "loss": 0.7469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.965359926223755, + "rewards/margins": 6.164173126220703, + "rewards/rejected": -3.1988131999969482, + "step": 9446 + }, + { + "epoch": 2.36, + "grad_norm": 4.622225761413574, + "learning_rate": 5.428230997865202e-06, + "logits/chosen": -0.4570348560810089, + "logits/rejected": -0.5441726446151733, + "logps/chosen": -70.77201080322266, + "logps/rejected": -107.8587646484375, + "loss": 0.7265, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.008272886276245, + "rewards/margins": 7.681006908416748, + "rewards/rejected": -4.672733783721924, + "step": 9447 + }, + { + "epoch": 2.36, + "grad_norm": 5.485498428344727, + "learning_rate": 5.4274478929455425e-06, + "logits/chosen": -0.4408547282218933, + "logits/rejected": -0.5708618760108948, + "logps/chosen": -63.729732513427734, + "logps/rejected": -88.40864562988281, + "loss": 0.635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9139442443847656, + "rewards/margins": 5.8981451988220215, + "rewards/rejected": -2.984201431274414, + "step": 9448 + }, + { + "epoch": 2.36, + "grad_norm": 7.277076721191406, + "learning_rate": 5.426664777463191e-06, + "logits/chosen": -0.4984429180622101, + "logits/rejected": -0.5813199877738953, + "logps/chosen": -64.2891845703125, + "logps/rejected": -111.55805969238281, + "loss": 0.7629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7876603603363037, + "rewards/margins": 6.680589199066162, + "rewards/rejected": -3.8929290771484375, + "step": 9449 + }, + { + "epoch": 2.36, + "grad_norm": 6.211165428161621, + "learning_rate": 5.425881651437499e-06, + "logits/chosen": -0.47368350625038147, + "logits/rejected": -0.5207709670066833, + "logps/chosen": -61.62446594238281, + "logps/rejected": -102.01081848144531, + "loss": 0.736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2924375534057617, + "rewards/margins": 5.06124210357666, + "rewards/rejected": -1.7688043117523193, + "step": 9450 + }, + { + "epoch": 2.36, + "grad_norm": 4.1958441734313965, + "learning_rate": 5.425098514887816e-06, + "logits/chosen": -0.4950115382671356, + "logits/rejected": -0.577116847038269, + "logps/chosen": -64.96762084960938, + "logps/rejected": -95.80036163330078, + "loss": 0.8161, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.857076406478882, + "rewards/margins": 5.42974853515625, + "rewards/rejected": -2.572671890258789, + "step": 9451 + }, + { + "epoch": 2.36, + "grad_norm": 4.3446173667907715, + "learning_rate": 5.424315367833499e-06, + "logits/chosen": -0.5344765782356262, + "logits/rejected": -0.6190773248672485, + "logps/chosen": -66.07588958740234, + "logps/rejected": -111.3711929321289, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.090714931488037, + "rewards/margins": 6.730626106262207, + "rewards/rejected": -3.6399106979370117, + "step": 9452 + }, + { + "epoch": 2.36, + "grad_norm": 6.177444934844971, + "learning_rate": 5.423532210293898e-06, + "logits/chosen": -0.503491222858429, + "logits/rejected": -0.5828429460525513, + "logps/chosen": -58.590415954589844, + "logps/rejected": -94.35941314697266, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.065202236175537, + "rewards/margins": 6.059111595153809, + "rewards/rejected": -2.9939091205596924, + "step": 9453 + }, + { + "epoch": 2.37, + "grad_norm": 16.598047256469727, + "learning_rate": 5.422749042288364e-06, + "logits/chosen": -0.4868738651275635, + "logits/rejected": -0.5975626707077026, + "logps/chosen": -51.383628845214844, + "logps/rejected": -79.66483306884766, + "loss": 0.6416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1928329467773438, + "rewards/margins": 6.449290752410889, + "rewards/rejected": -3.256458282470703, + "step": 9454 + }, + { + "epoch": 2.37, + "grad_norm": 8.515032768249512, + "learning_rate": 5.421965863836254e-06, + "logits/chosen": -0.5020791888237, + "logits/rejected": -0.6184104084968567, + "logps/chosen": -55.93017578125, + "logps/rejected": -109.6739501953125, + "loss": 0.6371, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9496443271636963, + "rewards/margins": 6.886946678161621, + "rewards/rejected": -3.937302589416504, + "step": 9455 + }, + { + "epoch": 2.37, + "grad_norm": 10.855274200439453, + "learning_rate": 5.421182674956916e-06, + "logits/chosen": -0.4783838391304016, + "logits/rejected": -0.5370619893074036, + "logps/chosen": -45.71177673339844, + "logps/rejected": -97.33079528808594, + "loss": 0.774, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.147151231765747, + "rewards/margins": 4.924838542938232, + "rewards/rejected": -1.7776871919631958, + "step": 9456 + }, + { + "epoch": 2.37, + "grad_norm": 9.177787780761719, + "learning_rate": 5.420399475669708e-06, + "logits/chosen": -0.46764835715293884, + "logits/rejected": -0.5630992650985718, + "logps/chosen": -49.518585205078125, + "logps/rejected": -82.50498962402344, + "loss": 0.6222, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.141016960144043, + "rewards/margins": 5.248055458068848, + "rewards/rejected": -2.1070384979248047, + "step": 9457 + }, + { + "epoch": 2.37, + "grad_norm": 8.062326431274414, + "learning_rate": 5.419616265993981e-06, + "logits/chosen": -0.495360791683197, + "logits/rejected": -0.575823187828064, + "logps/chosen": -56.41590118408203, + "logps/rejected": -82.21778869628906, + "loss": 0.7071, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0883495807647705, + "rewards/margins": 4.7886834144592285, + "rewards/rejected": -1.700333833694458, + "step": 9458 + }, + { + "epoch": 2.37, + "grad_norm": 5.27436637878418, + "learning_rate": 5.41883304594909e-06, + "logits/chosen": -0.4324468672275543, + "logits/rejected": -0.489897757768631, + "logps/chosen": -53.04413986206055, + "logps/rejected": -97.5101318359375, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.999300956726074, + "rewards/margins": 5.6901631355285645, + "rewards/rejected": -2.6908621788024902, + "step": 9459 + }, + { + "epoch": 2.37, + "grad_norm": 5.958606243133545, + "learning_rate": 5.418049815554388e-06, + "logits/chosen": -0.5251166820526123, + "logits/rejected": -0.6021093726158142, + "logps/chosen": -57.07590866088867, + "logps/rejected": -96.65603637695312, + "loss": 0.703, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8655688762664795, + "rewards/margins": 6.095951080322266, + "rewards/rejected": -3.230381965637207, + "step": 9460 + }, + { + "epoch": 2.37, + "grad_norm": 11.289613723754883, + "learning_rate": 5.4172665748292316e-06, + "logits/chosen": -0.4549812376499176, + "logits/rejected": -0.5787438154220581, + "logps/chosen": -59.66180419921875, + "logps/rejected": -84.6057357788086, + "loss": 0.7063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.148881196975708, + "rewards/margins": 5.890500545501709, + "rewards/rejected": -2.74161958694458, + "step": 9461 + }, + { + "epoch": 2.37, + "grad_norm": 12.496213912963867, + "learning_rate": 5.416483323792974e-06, + "logits/chosen": -0.5429480671882629, + "logits/rejected": -0.6182089447975159, + "logps/chosen": -52.53403854370117, + "logps/rejected": -95.65962219238281, + "loss": 0.6973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.048326015472412, + "rewards/margins": 5.784103870391846, + "rewards/rejected": -2.7357778549194336, + "step": 9462 + }, + { + "epoch": 2.37, + "grad_norm": 6.706470966339111, + "learning_rate": 5.41570006246497e-06, + "logits/chosen": -0.5054286122322083, + "logits/rejected": -0.5805928111076355, + "logps/chosen": -50.041690826416016, + "logps/rejected": -97.843017578125, + "loss": 0.5912, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.019270420074463, + "rewards/margins": 6.719564437866211, + "rewards/rejected": -3.70029354095459, + "step": 9463 + }, + { + "epoch": 2.37, + "grad_norm": 6.613290309906006, + "learning_rate": 5.414916790864577e-06, + "logits/chosen": -0.5182117223739624, + "logits/rejected": -0.5873713493347168, + "logps/chosen": -49.582706451416016, + "logps/rejected": -87.4921875, + "loss": 0.7924, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9732460975646973, + "rewards/margins": 4.933655738830566, + "rewards/rejected": -1.9604097604751587, + "step": 9464 + }, + { + "epoch": 2.37, + "grad_norm": 7.711246967315674, + "learning_rate": 5.414133509011147e-06, + "logits/chosen": -0.5233675241470337, + "logits/rejected": -0.6352273225784302, + "logps/chosen": -69.95750427246094, + "logps/rejected": -100.68145751953125, + "loss": 0.69, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.664680004119873, + "rewards/margins": 6.134210586547852, + "rewards/rejected": -3.4695310592651367, + "step": 9465 + }, + { + "epoch": 2.37, + "grad_norm": 13.311690330505371, + "learning_rate": 5.413350216924037e-06, + "logits/chosen": -0.5071035623550415, + "logits/rejected": -0.5992978811264038, + "logps/chosen": -62.742820739746094, + "logps/rejected": -99.25822448730469, + "loss": 0.7817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000972032546997, + "rewards/margins": 5.9707231521606445, + "rewards/rejected": -2.9697513580322266, + "step": 9466 + }, + { + "epoch": 2.37, + "grad_norm": 4.449967384338379, + "learning_rate": 5.4125669146226045e-06, + "logits/chosen": -0.45501580834388733, + "logits/rejected": -0.5467809438705444, + "logps/chosen": -66.29170989990234, + "logps/rejected": -87.81787872314453, + "loss": 0.6517, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.058012008666992, + "rewards/margins": 5.683794975280762, + "rewards/rejected": -2.6257829666137695, + "step": 9467 + }, + { + "epoch": 2.37, + "grad_norm": 6.1273040771484375, + "learning_rate": 5.4117836021262045e-06, + "logits/chosen": -0.49061378836631775, + "logits/rejected": -0.5705649256706238, + "logps/chosen": -56.30929183959961, + "logps/rejected": -85.93766021728516, + "loss": 0.7466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7726504802703857, + "rewards/margins": 5.451271057128906, + "rewards/rejected": -2.6786210536956787, + "step": 9468 + }, + { + "epoch": 2.37, + "grad_norm": 9.066911697387695, + "learning_rate": 5.411000279454194e-06, + "logits/chosen": -0.5075156688690186, + "logits/rejected": -0.6375589966773987, + "logps/chosen": -67.52224731445312, + "logps/rejected": -93.74978637695312, + "loss": 0.6859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.052917957305908, + "rewards/margins": 6.2040863037109375, + "rewards/rejected": -3.15116810798645, + "step": 9469 + }, + { + "epoch": 2.37, + "grad_norm": 5.982620716094971, + "learning_rate": 5.41021694662593e-06, + "logits/chosen": -0.4421531856060028, + "logits/rejected": -0.518397331237793, + "logps/chosen": -59.813419342041016, + "logps/rejected": -109.156982421875, + "loss": 0.7749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.884106159210205, + "rewards/margins": 6.264049053192139, + "rewards/rejected": -3.379943370819092, + "step": 9470 + }, + { + "epoch": 2.37, + "grad_norm": 14.248968124389648, + "learning_rate": 5.409433603660767e-06, + "logits/chosen": -0.47395598888397217, + "logits/rejected": -0.5036721229553223, + "logps/chosen": -56.0280647277832, + "logps/rejected": -112.76837158203125, + "loss": 0.6969, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8483240604400635, + "rewards/margins": 4.623142242431641, + "rewards/rejected": -1.7748180627822876, + "step": 9471 + }, + { + "epoch": 2.37, + "grad_norm": 3.260948896408081, + "learning_rate": 5.408650250578065e-06, + "logits/chosen": -0.46284282207489014, + "logits/rejected": -0.6419241428375244, + "logps/chosen": -60.86880874633789, + "logps/rejected": -90.80572509765625, + "loss": 0.548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0337705612182617, + "rewards/margins": 7.360341548919678, + "rewards/rejected": -4.326570987701416, + "step": 9472 + }, + { + "epoch": 2.37, + "grad_norm": 8.893064498901367, + "learning_rate": 5.407866887397183e-06, + "logits/chosen": -0.5429808497428894, + "logits/rejected": -0.5849182605743408, + "logps/chosen": -49.64052963256836, + "logps/rejected": -81.46722412109375, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.888906240463257, + "rewards/margins": 4.936054706573486, + "rewards/rejected": -2.0471479892730713, + "step": 9473 + }, + { + "epoch": 2.37, + "grad_norm": 8.347360610961914, + "learning_rate": 5.407083514137473e-06, + "logits/chosen": -0.41736358404159546, + "logits/rejected": -0.4615606665611267, + "logps/chosen": -60.89722442626953, + "logps/rejected": -94.26962280273438, + "loss": 0.8019, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.986252546310425, + "rewards/margins": 4.735132217407227, + "rewards/rejected": -1.748879313468933, + "step": 9474 + }, + { + "epoch": 2.37, + "grad_norm": 13.249333381652832, + "learning_rate": 5.406300130818299e-06, + "logits/chosen": -0.4155019223690033, + "logits/rejected": -0.5451693534851074, + "logps/chosen": -60.29901885986328, + "logps/rejected": -80.47457885742188, + "loss": 0.6929, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1265618801116943, + "rewards/margins": 5.671485900878906, + "rewards/rejected": -2.544924020767212, + "step": 9475 + }, + { + "epoch": 2.37, + "grad_norm": 3.016767978668213, + "learning_rate": 5.405516737459014e-06, + "logits/chosen": -0.5061739087104797, + "logits/rejected": -0.5706778168678284, + "logps/chosen": -53.07301712036133, + "logps/rejected": -104.9375, + "loss": 0.6638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.051558017730713, + "rewards/margins": 6.742828369140625, + "rewards/rejected": -3.691270112991333, + "step": 9476 + }, + { + "epoch": 2.37, + "grad_norm": 5.514341354370117, + "learning_rate": 5.404733334078981e-06, + "logits/chosen": -0.4875819981098175, + "logits/rejected": -0.5625842809677124, + "logps/chosen": -53.216304779052734, + "logps/rejected": -102.97779846191406, + "loss": 0.6507, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.932718276977539, + "rewards/margins": 6.079089164733887, + "rewards/rejected": -3.146371364593506, + "step": 9477 + }, + { + "epoch": 2.37, + "grad_norm": 7.372520446777344, + "learning_rate": 5.403949920697557e-06, + "logits/chosen": -0.5813536643981934, + "logits/rejected": -0.6632686257362366, + "logps/chosen": -52.446868896484375, + "logps/rejected": -97.20869445800781, + "loss": 0.6638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.30643892288208, + "rewards/margins": 6.123856544494629, + "rewards/rejected": -2.817417860031128, + "step": 9478 + }, + { + "epoch": 2.37, + "grad_norm": 6.140549182891846, + "learning_rate": 5.4031664973341e-06, + "logits/chosen": -0.5622507333755493, + "logits/rejected": -0.6312598586082458, + "logps/chosen": -56.238067626953125, + "logps/rejected": -80.98162078857422, + "loss": 0.7792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.084181785583496, + "rewards/margins": 4.4773406982421875, + "rewards/rejected": -1.3931591510772705, + "step": 9479 + }, + { + "epoch": 2.37, + "grad_norm": 3.7809202671051025, + "learning_rate": 5.40238306400797e-06, + "logits/chosen": -0.5157589316368103, + "logits/rejected": -0.6184058785438538, + "logps/chosen": -63.09221649169922, + "logps/rejected": -100.36112976074219, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.060133695602417, + "rewards/margins": 6.354328155517578, + "rewards/rejected": -3.2941949367523193, + "step": 9480 + }, + { + "epoch": 2.37, + "grad_norm": 4.871410846710205, + "learning_rate": 5.4015996207385255e-06, + "logits/chosen": -0.4159398376941681, + "logits/rejected": -0.5679450035095215, + "logps/chosen": -54.565696716308594, + "logps/rejected": -76.01396179199219, + "loss": 0.6123, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.234774112701416, + "rewards/margins": 5.6527228355407715, + "rewards/rejected": -2.4179489612579346, + "step": 9481 + }, + { + "epoch": 2.37, + "grad_norm": 4.373543739318848, + "learning_rate": 5.40081616754513e-06, + "logits/chosen": -0.4851950705051422, + "logits/rejected": -0.5660018920898438, + "logps/chosen": -56.285255432128906, + "logps/rejected": -97.35464477539062, + "loss": 0.6784, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8294520378112793, + "rewards/margins": 5.77772331237793, + "rewards/rejected": -2.948270559310913, + "step": 9482 + }, + { + "epoch": 2.37, + "grad_norm": 6.280808448791504, + "learning_rate": 5.400032704447138e-06, + "logits/chosen": -0.4973313808441162, + "logits/rejected": -0.4943046569824219, + "logps/chosen": -69.33663940429688, + "logps/rejected": -120.63662719726562, + "loss": 0.8069, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1646134853363037, + "rewards/margins": 5.587933540344238, + "rewards/rejected": -2.4233200550079346, + "step": 9483 + }, + { + "epoch": 2.37, + "grad_norm": 3.9607388973236084, + "learning_rate": 5.3992492314639135e-06, + "logits/chosen": -0.5510401725769043, + "logits/rejected": -0.5705602169036865, + "logps/chosen": -49.664634704589844, + "logps/rejected": -104.31998443603516, + "loss": 0.6673, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9601943492889404, + "rewards/margins": 6.097165107727051, + "rewards/rejected": -3.1369705200195312, + "step": 9484 + }, + { + "epoch": 2.37, + "grad_norm": 6.3810954093933105, + "learning_rate": 5.398465748614815e-06, + "logits/chosen": -0.43255114555358887, + "logits/rejected": -0.5668894052505493, + "logps/chosen": -50.5692253112793, + "logps/rejected": -75.57838439941406, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9973442554473877, + "rewards/margins": 6.1538496017456055, + "rewards/rejected": -3.1565053462982178, + "step": 9485 + }, + { + "epoch": 2.37, + "grad_norm": 6.031062602996826, + "learning_rate": 5.397682255919206e-06, + "logits/chosen": -0.5061053037643433, + "logits/rejected": -0.5585232973098755, + "logps/chosen": -68.63543701171875, + "logps/rejected": -110.05012512207031, + "loss": 0.738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.786809206008911, + "rewards/margins": 4.901193618774414, + "rewards/rejected": -2.114384174346924, + "step": 9486 + }, + { + "epoch": 2.37, + "grad_norm": 6.748647689819336, + "learning_rate": 5.3968987533964444e-06, + "logits/chosen": -0.5108922719955444, + "logits/rejected": -0.571067750453949, + "logps/chosen": -61.575138092041016, + "logps/rejected": -123.20600128173828, + "loss": 0.6619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.821937084197998, + "rewards/margins": 6.927364349365234, + "rewards/rejected": -4.105427265167236, + "step": 9487 + }, + { + "epoch": 2.37, + "grad_norm": 4.445261478424072, + "learning_rate": 5.396115241065891e-06, + "logits/chosen": -0.4984228312969208, + "logits/rejected": -0.5469816327095032, + "logps/chosen": -54.18983840942383, + "logps/rejected": -105.66165924072266, + "loss": 0.6295, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.935567617416382, + "rewards/margins": 5.793076515197754, + "rewards/rejected": -2.857509136199951, + "step": 9488 + }, + { + "epoch": 2.37, + "grad_norm": 10.66529655456543, + "learning_rate": 5.39533171894691e-06, + "logits/chosen": -0.4356725811958313, + "logits/rejected": -0.5744184255599976, + "logps/chosen": -71.76506042480469, + "logps/rejected": -104.32688903808594, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.091480255126953, + "rewards/margins": 6.478280544281006, + "rewards/rejected": -3.386800765991211, + "step": 9489 + }, + { + "epoch": 2.37, + "grad_norm": 8.756230354309082, + "learning_rate": 5.394548187058861e-06, + "logits/chosen": -0.4758273661136627, + "logits/rejected": -0.5654668807983398, + "logps/chosen": -47.00845718383789, + "logps/rejected": -87.07413482666016, + "loss": 0.6401, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7683463096618652, + "rewards/margins": 5.224742889404297, + "rewards/rejected": -2.4563968181610107, + "step": 9490 + }, + { + "epoch": 2.37, + "grad_norm": 7.637080192565918, + "learning_rate": 5.393764645421108e-06, + "logits/chosen": -0.5183370113372803, + "logits/rejected": -0.6070829629898071, + "logps/chosen": -61.10505676269531, + "logps/rejected": -91.8531723022461, + "loss": 0.6821, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.160421371459961, + "rewards/margins": 5.412592887878418, + "rewards/rejected": -2.252171277999878, + "step": 9491 + }, + { + "epoch": 2.37, + "grad_norm": 15.84042739868164, + "learning_rate": 5.39298109405301e-06, + "logits/chosen": -0.519000768661499, + "logits/rejected": -0.5631871223449707, + "logps/chosen": -50.66510009765625, + "logps/rejected": -103.52513122558594, + "loss": 0.7422, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.672935724258423, + "rewards/margins": 5.154393196105957, + "rewards/rejected": -2.4814577102661133, + "step": 9492 + }, + { + "epoch": 2.37, + "grad_norm": 5.708798408508301, + "learning_rate": 5.392197532973934e-06, + "logits/chosen": -0.4240855872631073, + "logits/rejected": -0.5548935532569885, + "logps/chosen": -60.32533264160156, + "logps/rejected": -117.58425903320312, + "loss": 0.6481, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6517088413238525, + "rewards/margins": 8.501200675964355, + "rewards/rejected": -5.849493026733398, + "step": 9493 + }, + { + "epoch": 2.38, + "grad_norm": 5.7965874671936035, + "learning_rate": 5.391413962203236e-06, + "logits/chosen": -0.48012879490852356, + "logits/rejected": -0.4912492632865906, + "logps/chosen": -61.40630340576172, + "logps/rejected": -121.7158203125, + "loss": 0.7275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.737187385559082, + "rewards/margins": 5.996100902557373, + "rewards/rejected": -3.25891375541687, + "step": 9494 + }, + { + "epoch": 2.38, + "grad_norm": 3.5212390422821045, + "learning_rate": 5.3906303817602854e-06, + "logits/chosen": -0.4651956856250763, + "logits/rejected": -0.5533726215362549, + "logps/chosen": -68.20645141601562, + "logps/rejected": -93.78633117675781, + "loss": 0.7309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9548745155334473, + "rewards/margins": 5.537501811981201, + "rewards/rejected": -2.582627773284912, + "step": 9495 + }, + { + "epoch": 2.38, + "grad_norm": 3.7622947692871094, + "learning_rate": 5.389846791664444e-06, + "logits/chosen": -0.5257320404052734, + "logits/rejected": -0.5900895595550537, + "logps/chosen": -47.20936584472656, + "logps/rejected": -84.02193450927734, + "loss": 0.5627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9484527111053467, + "rewards/margins": 5.863551139831543, + "rewards/rejected": -2.9150986671447754, + "step": 9496 + }, + { + "epoch": 2.38, + "grad_norm": 2.9880473613739014, + "learning_rate": 5.389063191935069e-06, + "logits/chosen": -0.40772053599357605, + "logits/rejected": -0.5166631937026978, + "logps/chosen": -58.68865203857422, + "logps/rejected": -102.48330688476562, + "loss": 0.5725, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.742004156112671, + "rewards/margins": 6.682894706726074, + "rewards/rejected": -3.940890073776245, + "step": 9497 + }, + { + "epoch": 2.38, + "grad_norm": 8.217710494995117, + "learning_rate": 5.388279582591533e-06, + "logits/chosen": -0.5363872647285461, + "logits/rejected": -0.6236156821250916, + "logps/chosen": -47.91135787963867, + "logps/rejected": -73.70022583007812, + "loss": 0.712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9005634784698486, + "rewards/margins": 5.517185211181641, + "rewards/rejected": -2.616621732711792, + "step": 9498 + }, + { + "epoch": 2.38, + "grad_norm": 9.79252815246582, + "learning_rate": 5.387495963653193e-06, + "logits/chosen": -0.49321192502975464, + "logits/rejected": -0.5806539058685303, + "logps/chosen": -71.49907684326172, + "logps/rejected": -95.86576843261719, + "loss": 0.6888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0843329429626465, + "rewards/margins": 5.941195487976074, + "rewards/rejected": -2.856863021850586, + "step": 9499 + }, + { + "epoch": 2.38, + "grad_norm": 8.264817237854004, + "learning_rate": 5.3867123351394165e-06, + "logits/chosen": -0.4983854293823242, + "logits/rejected": -0.5868830680847168, + "logps/chosen": -52.2027587890625, + "logps/rejected": -93.95704650878906, + "loss": 0.7576, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983203887939453, + "rewards/margins": 5.774590015411377, + "rewards/rejected": -2.791386842727661, + "step": 9500 + }, + { + "epoch": 2.38, + "grad_norm": 10.696630477905273, + "learning_rate": 5.385928697069567e-06, + "logits/chosen": -0.4983436167240143, + "logits/rejected": -0.5806511044502258, + "logps/chosen": -55.0306396484375, + "logps/rejected": -93.22438049316406, + "loss": 0.8289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2365715503692627, + "rewards/margins": 5.845163345336914, + "rewards/rejected": -2.6085922718048096, + "step": 9501 + }, + { + "epoch": 2.38, + "grad_norm": 4.138008117675781, + "learning_rate": 5.385145049463008e-06, + "logits/chosen": -0.5328981280326843, + "logits/rejected": -0.6340559124946594, + "logps/chosen": -47.265621185302734, + "logps/rejected": -91.86319732666016, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3314132690429688, + "rewards/margins": 6.9227190017700195, + "rewards/rejected": -3.5913054943084717, + "step": 9502 + }, + { + "epoch": 2.38, + "grad_norm": 4.216465473175049, + "learning_rate": 5.384361392339107e-06, + "logits/chosen": -0.5400848388671875, + "logits/rejected": -0.5772174000740051, + "logps/chosen": -58.144527435302734, + "logps/rejected": -108.21806335449219, + "loss": 0.666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1197924613952637, + "rewards/margins": 5.44328498840332, + "rewards/rejected": -2.3234927654266357, + "step": 9503 + }, + { + "epoch": 2.38, + "grad_norm": 11.286258697509766, + "learning_rate": 5.383577725717225e-06, + "logits/chosen": -0.47738325595855713, + "logits/rejected": -0.5918999314308167, + "logps/chosen": -58.468780517578125, + "logps/rejected": -100.13619232177734, + "loss": 0.6204, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9080605506896973, + "rewards/margins": 5.172163009643555, + "rewards/rejected": -2.264101982116699, + "step": 9504 + }, + { + "epoch": 2.38, + "grad_norm": 2.2280802726745605, + "learning_rate": 5.382794049616731e-06, + "logits/chosen": -0.5050336122512817, + "logits/rejected": -0.6535135507583618, + "logps/chosen": -46.146728515625, + "logps/rejected": -82.32483673095703, + "loss": 0.5423, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8994388580322266, + "rewards/margins": 7.259523391723633, + "rewards/rejected": -4.3600850105285645, + "step": 9505 + }, + { + "epoch": 2.38, + "grad_norm": 16.512432098388672, + "learning_rate": 5.382010364056988e-06, + "logits/chosen": -0.5243879556655884, + "logits/rejected": -0.5963325500488281, + "logps/chosen": -59.771217346191406, + "logps/rejected": -80.76673889160156, + "loss": 0.7283, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7866110801696777, + "rewards/margins": 4.899314880371094, + "rewards/rejected": -2.112703800201416, + "step": 9506 + }, + { + "epoch": 2.38, + "grad_norm": 8.597916603088379, + "learning_rate": 5.381226669057363e-06, + "logits/chosen": -0.4424384832382202, + "logits/rejected": -0.4938466250896454, + "logps/chosen": -52.49671173095703, + "logps/rejected": -96.00691986083984, + "loss": 0.5865, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0408647060394287, + "rewards/margins": 5.526268005371094, + "rewards/rejected": -2.4854040145874023, + "step": 9507 + }, + { + "epoch": 2.38, + "grad_norm": 3.9915590286254883, + "learning_rate": 5.380442964637221e-06, + "logits/chosen": -0.5952235460281372, + "logits/rejected": -0.6611349582672119, + "logps/chosen": -59.422508239746094, + "logps/rejected": -90.74736785888672, + "loss": 0.6854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085162401199341, + "rewards/margins": 5.881157875061035, + "rewards/rejected": -2.7959954738616943, + "step": 9508 + }, + { + "epoch": 2.38, + "grad_norm": 8.833250045776367, + "learning_rate": 5.37965925081593e-06, + "logits/chosen": -0.517924427986145, + "logits/rejected": -0.4968864321708679, + "logps/chosen": -56.05997848510742, + "logps/rejected": -104.49150085449219, + "loss": 0.7674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0887370109558105, + "rewards/margins": 4.492128372192383, + "rewards/rejected": -1.403390884399414, + "step": 9509 + }, + { + "epoch": 2.38, + "grad_norm": 6.139267921447754, + "learning_rate": 5.378875527612854e-06, + "logits/chosen": -0.5529807806015015, + "logits/rejected": -0.6772156357765198, + "logps/chosen": -50.87835693359375, + "logps/rejected": -90.87959289550781, + "loss": 0.5975, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9466922283172607, + "rewards/margins": 6.669625282287598, + "rewards/rejected": -3.722933292388916, + "step": 9510 + }, + { + "epoch": 2.38, + "grad_norm": 4.8471856117248535, + "learning_rate": 5.3780917950473595e-06, + "logits/chosen": -0.49507206678390503, + "logits/rejected": -0.5058952569961548, + "logps/chosen": -46.19502258300781, + "logps/rejected": -90.42999267578125, + "loss": 0.7007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8693246841430664, + "rewards/margins": 4.728358268737793, + "rewards/rejected": -1.859033226966858, + "step": 9511 + }, + { + "epoch": 2.38, + "grad_norm": 6.165402412414551, + "learning_rate": 5.3773080531388165e-06, + "logits/chosen": -0.5762638449668884, + "logits/rejected": -0.630550742149353, + "logps/chosen": -53.92582702636719, + "logps/rejected": -113.63158416748047, + "loss": 0.5287, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0221877098083496, + "rewards/margins": 6.955265045166016, + "rewards/rejected": -3.933077335357666, + "step": 9512 + }, + { + "epoch": 2.38, + "grad_norm": 2.5712451934814453, + "learning_rate": 5.37652430190659e-06, + "logits/chosen": -0.4089639484882355, + "logits/rejected": -0.4880951941013336, + "logps/chosen": -55.80876922607422, + "logps/rejected": -116.30778503417969, + "loss": 0.5827, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8456640243530273, + "rewards/margins": 6.383020401000977, + "rewards/rejected": -3.537355899810791, + "step": 9513 + }, + { + "epoch": 2.38, + "grad_norm": 2.9696998596191406, + "learning_rate": 5.375740541370047e-06, + "logits/chosen": -0.5025590658187866, + "logits/rejected": -0.5417460203170776, + "logps/chosen": -54.75263214111328, + "logps/rejected": -118.47314453125, + "loss": 0.6273, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9987828731536865, + "rewards/margins": 6.005847454071045, + "rewards/rejected": -3.0070648193359375, + "step": 9514 + }, + { + "epoch": 2.38, + "grad_norm": 5.185606479644775, + "learning_rate": 5.374956771548555e-06, + "logits/chosen": -0.540289580821991, + "logits/rejected": -0.6589105725288391, + "logps/chosen": -50.250553131103516, + "logps/rejected": -94.88600158691406, + "loss": 0.5651, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8361804485321045, + "rewards/margins": 6.887789726257324, + "rewards/rejected": -4.051609039306641, + "step": 9515 + }, + { + "epoch": 2.38, + "grad_norm": 4.392710208892822, + "learning_rate": 5.3741729924614835e-06, + "logits/chosen": -0.3950454592704773, + "logits/rejected": -0.4568154215812683, + "logps/chosen": -59.76244354248047, + "logps/rejected": -92.38124084472656, + "loss": 0.6772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3583836555480957, + "rewards/margins": 5.201147556304932, + "rewards/rejected": -1.8427644968032837, + "step": 9516 + }, + { + "epoch": 2.38, + "grad_norm": 9.1426420211792, + "learning_rate": 5.373389204128199e-06, + "logits/chosen": -0.5573486685752869, + "logits/rejected": -0.6813604831695557, + "logps/chosen": -51.17721176147461, + "logps/rejected": -71.43777465820312, + "loss": 0.7271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0197434425354004, + "rewards/margins": 5.644908428192139, + "rewards/rejected": -2.625164747238159, + "step": 9517 + }, + { + "epoch": 2.38, + "grad_norm": 5.729944705963135, + "learning_rate": 5.37260540656807e-06, + "logits/chosen": -0.5624133944511414, + "logits/rejected": -0.6337865591049194, + "logps/chosen": -45.570465087890625, + "logps/rejected": -90.955078125, + "loss": 0.5779, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1841306686401367, + "rewards/margins": 6.036894798278809, + "rewards/rejected": -2.85276460647583, + "step": 9518 + }, + { + "epoch": 2.38, + "grad_norm": 10.660642623901367, + "learning_rate": 5.371821599800465e-06, + "logits/chosen": -0.5679048299789429, + "logits/rejected": -0.613842785358429, + "logps/chosen": -55.94969940185547, + "logps/rejected": -105.35183715820312, + "loss": 0.6449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9970309734344482, + "rewards/margins": 5.575056076049805, + "rewards/rejected": -2.5780248641967773, + "step": 9519 + }, + { + "epoch": 2.38, + "grad_norm": 6.081701755523682, + "learning_rate": 5.371037783844752e-06, + "logits/chosen": -0.5433754324913025, + "logits/rejected": -0.5837801694869995, + "logps/chosen": -50.119712829589844, + "logps/rejected": -107.6521224975586, + "loss": 0.6307, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.07893705368042, + "rewards/margins": 6.527185440063477, + "rewards/rejected": -3.4482486248016357, + "step": 9520 + }, + { + "epoch": 2.38, + "grad_norm": 11.964682579040527, + "learning_rate": 5.3702539587203025e-06, + "logits/chosen": -0.4495195746421814, + "logits/rejected": -0.5462597608566284, + "logps/chosen": -62.0131950378418, + "logps/rejected": -79.8056640625, + "loss": 0.747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.749573230743408, + "rewards/margins": 4.525016784667969, + "rewards/rejected": -1.7754435539245605, + "step": 9521 + }, + { + "epoch": 2.38, + "grad_norm": 9.198742866516113, + "learning_rate": 5.369470124446483e-06, + "logits/chosen": -0.5609266757965088, + "logits/rejected": -0.6401897668838501, + "logps/chosen": -43.5793571472168, + "logps/rejected": -94.93492126464844, + "loss": 0.65, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9210681915283203, + "rewards/margins": 6.478916645050049, + "rewards/rejected": -3.5578479766845703, + "step": 9522 + }, + { + "epoch": 2.38, + "grad_norm": 5.666138648986816, + "learning_rate": 5.368686281042665e-06, + "logits/chosen": -0.4910030961036682, + "logits/rejected": -0.5807654857635498, + "logps/chosen": -50.80511474609375, + "logps/rejected": -102.05165100097656, + "loss": 0.7284, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7782914638519287, + "rewards/margins": 5.709005832672119, + "rewards/rejected": -2.9307146072387695, + "step": 9523 + }, + { + "epoch": 2.38, + "grad_norm": 4.039312839508057, + "learning_rate": 5.367902428528215e-06, + "logits/chosen": -0.4984557330608368, + "logits/rejected": -0.609745979309082, + "logps/chosen": -52.71035385131836, + "logps/rejected": -87.32089233398438, + "loss": 0.632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.937215805053711, + "rewards/margins": 6.000932216644287, + "rewards/rejected": -3.063716411590576, + "step": 9524 + }, + { + "epoch": 2.38, + "grad_norm": 4.0650315284729, + "learning_rate": 5.367118566922506e-06, + "logits/chosen": -0.5089114904403687, + "logits/rejected": -0.5285791158676147, + "logps/chosen": -40.8480339050293, + "logps/rejected": -98.43083190917969, + "loss": 0.5862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3411426544189453, + "rewards/margins": 5.5373430252075195, + "rewards/rejected": -2.1962006092071533, + "step": 9525 + }, + { + "epoch": 2.38, + "grad_norm": 3.322441339492798, + "learning_rate": 5.366334696244904e-06, + "logits/chosen": -0.531934380531311, + "logits/rejected": -0.5990844964981079, + "logps/chosen": -48.24108123779297, + "logps/rejected": -97.6654281616211, + "loss": 0.5963, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.022143602371216, + "rewards/margins": 6.466954231262207, + "rewards/rejected": -3.444810628890991, + "step": 9526 + }, + { + "epoch": 2.38, + "grad_norm": 4.517195224761963, + "learning_rate": 5.365550816514785e-06, + "logits/chosen": -0.5924793481826782, + "logits/rejected": -0.6842336654663086, + "logps/chosen": -54.916168212890625, + "logps/rejected": -95.46044921875, + "loss": 0.6807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9257726669311523, + "rewards/margins": 6.0005950927734375, + "rewards/rejected": -3.074822425842285, + "step": 9527 + }, + { + "epoch": 2.38, + "grad_norm": 4.966346263885498, + "learning_rate": 5.364766927751515e-06, + "logits/chosen": -0.4421539008617401, + "logits/rejected": -0.5593520998954773, + "logps/chosen": -59.98170471191406, + "logps/rejected": -82.30928039550781, + "loss": 0.7004, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9402942657470703, + "rewards/margins": 5.660190105438232, + "rewards/rejected": -2.719895601272583, + "step": 9528 + }, + { + "epoch": 2.38, + "grad_norm": 6.614142894744873, + "learning_rate": 5.3639830299744665e-06, + "logits/chosen": -0.5030256509780884, + "logits/rejected": -0.6124283075332642, + "logps/chosen": -62.764183044433594, + "logps/rejected": -112.1283187866211, + "loss": 0.6135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.949361801147461, + "rewards/margins": 6.336313724517822, + "rewards/rejected": -3.386951446533203, + "step": 9529 + }, + { + "epoch": 2.38, + "grad_norm": 6.136362075805664, + "learning_rate": 5.363199123203011e-06, + "logits/chosen": -0.5290120244026184, + "logits/rejected": -0.5803617238998413, + "logps/chosen": -55.16840362548828, + "logps/rejected": -102.69232177734375, + "loss": 0.7757, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1299118995666504, + "rewards/margins": 4.825125694274902, + "rewards/rejected": -1.695213794708252, + "step": 9530 + }, + { + "epoch": 2.38, + "grad_norm": 4.984914302825928, + "learning_rate": 5.362415207456518e-06, + "logits/chosen": -0.4572940468788147, + "logits/rejected": -0.48606085777282715, + "logps/chosen": -54.398319244384766, + "logps/rejected": -106.2614517211914, + "loss": 0.6598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2471938133239746, + "rewards/margins": 6.050795555114746, + "rewards/rejected": -2.8036019802093506, + "step": 9531 + }, + { + "epoch": 2.38, + "grad_norm": 5.591263771057129, + "learning_rate": 5.36163128275436e-06, + "logits/chosen": -0.44213899970054626, + "logits/rejected": -0.5784648060798645, + "logps/chosen": -72.39217376708984, + "logps/rejected": -87.9404525756836, + "loss": 0.722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.608267068862915, + "rewards/margins": 5.250829696655273, + "rewards/rejected": -2.6425623893737793, + "step": 9532 + }, + { + "epoch": 2.38, + "grad_norm": 7.139904499053955, + "learning_rate": 5.360847349115909e-06, + "logits/chosen": -0.6084036827087402, + "logits/rejected": -0.6313693523406982, + "logps/chosen": -51.539730072021484, + "logps/rejected": -89.34600830078125, + "loss": 0.9463, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8106207847595215, + "rewards/margins": 4.1014509201049805, + "rewards/rejected": -1.2908296585083008, + "step": 9533 + }, + { + "epoch": 2.39, + "grad_norm": 4.831020355224609, + "learning_rate": 5.360063406560535e-06, + "logits/chosen": -0.5060148239135742, + "logits/rejected": -0.5610736012458801, + "logps/chosen": -53.52324295043945, + "logps/rejected": -95.17051696777344, + "loss": 0.7534, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.008328914642334, + "rewards/margins": 5.706496715545654, + "rewards/rejected": -2.698167324066162, + "step": 9534 + }, + { + "epoch": 2.39, + "grad_norm": 4.09173583984375, + "learning_rate": 5.359279455107611e-06, + "logits/chosen": -0.45293018221855164, + "logits/rejected": -0.5555213689804077, + "logps/chosen": -52.50404357910156, + "logps/rejected": -86.58348846435547, + "loss": 0.5749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085728168487549, + "rewards/margins": 6.284356117248535, + "rewards/rejected": -3.1986279487609863, + "step": 9535 + }, + { + "epoch": 2.39, + "grad_norm": 6.477184295654297, + "learning_rate": 5.358495494776511e-06, + "logits/chosen": -0.5895476937294006, + "logits/rejected": -0.6997362971305847, + "logps/chosen": -46.472503662109375, + "logps/rejected": -92.1941909790039, + "loss": 0.6021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8264987468719482, + "rewards/margins": 5.894759178161621, + "rewards/rejected": -3.068260669708252, + "step": 9536 + }, + { + "epoch": 2.39, + "grad_norm": 3.804180860519409, + "learning_rate": 5.357711525586606e-06, + "logits/chosen": -0.5049890279769897, + "logits/rejected": -0.6044938564300537, + "logps/chosen": -45.26551818847656, + "logps/rejected": -96.16773223876953, + "loss": 0.5918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.991614580154419, + "rewards/margins": 6.685888290405273, + "rewards/rejected": -3.694272994995117, + "step": 9537 + }, + { + "epoch": 2.39, + "grad_norm": 6.26161527633667, + "learning_rate": 5.356927547557267e-06, + "logits/chosen": -0.4492523670196533, + "logits/rejected": -0.5700156092643738, + "logps/chosen": -63.6510124206543, + "logps/rejected": -86.58722686767578, + "loss": 0.6827, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3014721870422363, + "rewards/margins": 5.971925258636475, + "rewards/rejected": -2.67045259475708, + "step": 9538 + }, + { + "epoch": 2.39, + "grad_norm": 5.117246627807617, + "learning_rate": 5.35614356070787e-06, + "logits/chosen": -0.5879234671592712, + "logits/rejected": -0.6488630175590515, + "logps/chosen": -50.18316650390625, + "logps/rejected": -97.33792114257812, + "loss": 0.7349, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.103126049041748, + "rewards/margins": 6.008790493011475, + "rewards/rejected": -2.9056639671325684, + "step": 9539 + }, + { + "epoch": 2.39, + "grad_norm": 3.4992504119873047, + "learning_rate": 5.3553595650577865e-06, + "logits/chosen": -0.44722816348075867, + "logits/rejected": -0.5530444383621216, + "logps/chosen": -52.51338577270508, + "logps/rejected": -96.47872161865234, + "loss": 0.5977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7929816246032715, + "rewards/margins": 6.362412929534912, + "rewards/rejected": -3.5694313049316406, + "step": 9540 + }, + { + "epoch": 2.39, + "grad_norm": 20.02120590209961, + "learning_rate": 5.354575560626391e-06, + "logits/chosen": -0.5232986211776733, + "logits/rejected": -0.5642365217208862, + "logps/chosen": -53.251190185546875, + "logps/rejected": -109.64674377441406, + "loss": 0.8162, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.742439031600952, + "rewards/margins": 6.181105136871338, + "rewards/rejected": -3.4386656284332275, + "step": 9541 + }, + { + "epoch": 2.39, + "grad_norm": 4.612454891204834, + "learning_rate": 5.353791547433055e-06, + "logits/chosen": -0.5448139905929565, + "logits/rejected": -0.6134563088417053, + "logps/chosen": -49.566383361816406, + "logps/rejected": -128.46131896972656, + "loss": 0.6065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0124764442443848, + "rewards/margins": 7.839561939239502, + "rewards/rejected": -4.827085494995117, + "step": 9542 + }, + { + "epoch": 2.39, + "grad_norm": 17.384479522705078, + "learning_rate": 5.353007525497154e-06, + "logits/chosen": -0.5348113179206848, + "logits/rejected": -0.629492461681366, + "logps/chosen": -56.9432258605957, + "logps/rejected": -94.11710357666016, + "loss": 0.9194, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.659275770187378, + "rewards/margins": 6.532819747924805, + "rewards/rejected": -3.8735439777374268, + "step": 9543 + }, + { + "epoch": 2.39, + "grad_norm": 8.74260425567627, + "learning_rate": 5.352223494838061e-06, + "logits/chosen": -0.464108943939209, + "logits/rejected": -0.5759608149528503, + "logps/chosen": -62.1324462890625, + "logps/rejected": -96.26107025146484, + "loss": 0.76, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9777939319610596, + "rewards/margins": 7.025510787963867, + "rewards/rejected": -4.0477166175842285, + "step": 9544 + }, + { + "epoch": 2.39, + "grad_norm": 4.224147796630859, + "learning_rate": 5.351439455475151e-06, + "logits/chosen": -0.551908552646637, + "logits/rejected": -0.5966259837150574, + "logps/chosen": -50.11418533325195, + "logps/rejected": -101.30986022949219, + "loss": 0.6598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1676478385925293, + "rewards/margins": 5.844485282897949, + "rewards/rejected": -2.67683744430542, + "step": 9545 + }, + { + "epoch": 2.39, + "grad_norm": 3.541740655899048, + "learning_rate": 5.350655407427799e-06, + "logits/chosen": -0.5190722346305847, + "logits/rejected": -0.563076376914978, + "logps/chosen": -46.01304626464844, + "logps/rejected": -110.66447448730469, + "loss": 0.6399, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.04132342338562, + "rewards/margins": 5.67207670211792, + "rewards/rejected": -2.6307528018951416, + "step": 9546 + }, + { + "epoch": 2.39, + "grad_norm": 3.6390380859375, + "learning_rate": 5.3498713507153785e-06, + "logits/chosen": -0.5815977454185486, + "logits/rejected": -0.6706773042678833, + "logps/chosen": -47.33959197998047, + "logps/rejected": -96.4992904663086, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2144813537597656, + "rewards/margins": 6.808767795562744, + "rewards/rejected": -3.5942864418029785, + "step": 9547 + }, + { + "epoch": 2.39, + "grad_norm": 4.222569942474365, + "learning_rate": 5.349087285357264e-06, + "logits/chosen": -0.5317266583442688, + "logits/rejected": -0.6105004549026489, + "logps/chosen": -53.29290771484375, + "logps/rejected": -99.41246795654297, + "loss": 0.6301, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.982591152191162, + "rewards/margins": 6.064962863922119, + "rewards/rejected": -3.082371711730957, + "step": 9548 + }, + { + "epoch": 2.39, + "grad_norm": 4.253244876861572, + "learning_rate": 5.348303211372832e-06, + "logits/chosen": -0.39558207988739014, + "logits/rejected": -0.5104995965957642, + "logps/chosen": -61.78904724121094, + "logps/rejected": -86.68465423583984, + "loss": 0.7037, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0006086826324463, + "rewards/margins": 5.762477397918701, + "rewards/rejected": -2.761868953704834, + "step": 9549 + }, + { + "epoch": 2.39, + "grad_norm": 3.396833658218384, + "learning_rate": 5.347519128781459e-06, + "logits/chosen": -0.5211239457130432, + "logits/rejected": -0.6015782356262207, + "logps/chosen": -49.98664474487305, + "logps/rejected": -103.83386993408203, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8792073726654053, + "rewards/margins": 6.76237154006958, + "rewards/rejected": -3.883164167404175, + "step": 9550 + }, + { + "epoch": 2.39, + "grad_norm": 5.859853267669678, + "learning_rate": 5.3467350376025165e-06, + "logits/chosen": -0.5468556880950928, + "logits/rejected": -0.5812965631484985, + "logps/chosen": -62.03199768066406, + "logps/rejected": -112.90103149414062, + "loss": 0.7227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.828580141067505, + "rewards/margins": 5.978204727172852, + "rewards/rejected": -3.149624824523926, + "step": 9551 + }, + { + "epoch": 2.39, + "grad_norm": 10.950965881347656, + "learning_rate": 5.345950937855382e-06, + "logits/chosen": -0.5319154262542725, + "logits/rejected": -0.6226996183395386, + "logps/chosen": -68.10368347167969, + "logps/rejected": -87.16073608398438, + "loss": 0.7157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.741830348968506, + "rewards/margins": 5.804906845092773, + "rewards/rejected": -3.0630767345428467, + "step": 9552 + }, + { + "epoch": 2.39, + "grad_norm": 6.891960620880127, + "learning_rate": 5.345166829559434e-06, + "logits/chosen": -0.4474525451660156, + "logits/rejected": -0.5098469257354736, + "logps/chosen": -73.51026916503906, + "logps/rejected": -104.5272216796875, + "loss": 0.7695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0090065002441406, + "rewards/margins": 5.781680583953857, + "rewards/rejected": -2.7726738452911377, + "step": 9553 + }, + { + "epoch": 2.39, + "grad_norm": 4.211101055145264, + "learning_rate": 5.344382712734044e-06, + "logits/chosen": -0.5124598741531372, + "logits/rejected": -0.5165854692459106, + "logps/chosen": -52.6464958190918, + "logps/rejected": -116.16266632080078, + "loss": 0.7001, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2225160598754883, + "rewards/margins": 6.330418586730957, + "rewards/rejected": -3.1079022884368896, + "step": 9554 + }, + { + "epoch": 2.39, + "grad_norm": 4.004486083984375, + "learning_rate": 5.3435985873985926e-06, + "logits/chosen": -0.5315021276473999, + "logits/rejected": -0.6035498976707458, + "logps/chosen": -61.12080383300781, + "logps/rejected": -106.44168853759766, + "loss": 0.6862, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0027194023132324, + "rewards/margins": 5.923521995544434, + "rewards/rejected": -2.920802354812622, + "step": 9555 + }, + { + "epoch": 2.39, + "grad_norm": 5.82535982131958, + "learning_rate": 5.342814453572455e-06, + "logits/chosen": -0.44186389446258545, + "logits/rejected": -0.4858906865119934, + "logps/chosen": -59.42969512939453, + "logps/rejected": -121.21229553222656, + "loss": 0.7475, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0971949100494385, + "rewards/margins": 5.589744567871094, + "rewards/rejected": -2.4925496578216553, + "step": 9556 + }, + { + "epoch": 2.39, + "grad_norm": 14.842070579528809, + "learning_rate": 5.342030311275006e-06, + "logits/chosen": -0.47117745876312256, + "logits/rejected": -0.5701664686203003, + "logps/chosen": -53.15347671508789, + "logps/rejected": -80.77925109863281, + "loss": 0.723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.95546817779541, + "rewards/margins": 5.094168663024902, + "rewards/rejected": -2.138700246810913, + "step": 9557 + }, + { + "epoch": 2.39, + "grad_norm": 8.690017700195312, + "learning_rate": 5.341246160525625e-06, + "logits/chosen": -0.5034120678901672, + "logits/rejected": -0.5845786333084106, + "logps/chosen": -48.51803207397461, + "logps/rejected": -91.709716796875, + "loss": 0.7598, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9513540267944336, + "rewards/margins": 5.5039262771606445, + "rewards/rejected": -2.55257248878479, + "step": 9558 + }, + { + "epoch": 2.39, + "grad_norm": 8.519604682922363, + "learning_rate": 5.340462001343689e-06, + "logits/chosen": -0.4387126863002777, + "logits/rejected": -0.47599703073501587, + "logps/chosen": -56.16940689086914, + "logps/rejected": -110.80074310302734, + "loss": 0.6188, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0286061763763428, + "rewards/margins": 5.950860023498535, + "rewards/rejected": -2.9222540855407715, + "step": 9559 + }, + { + "epoch": 2.39, + "grad_norm": 3.0325369834899902, + "learning_rate": 5.339677833748573e-06, + "logits/chosen": -0.5776438117027283, + "logits/rejected": -0.6224538087844849, + "logps/chosen": -45.5595703125, + "logps/rejected": -100.92206573486328, + "loss": 0.6132, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1457462310791016, + "rewards/margins": 6.100555896759033, + "rewards/rejected": -2.9548099040985107, + "step": 9560 + }, + { + "epoch": 2.39, + "grad_norm": 5.159363746643066, + "learning_rate": 5.338893657759658e-06, + "logits/chosen": -0.5320504307746887, + "logits/rejected": -0.6094040870666504, + "logps/chosen": -57.74870300292969, + "logps/rejected": -94.52289581298828, + "loss": 0.6604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3532931804656982, + "rewards/margins": 6.179169654846191, + "rewards/rejected": -2.8258771896362305, + "step": 9561 + }, + { + "epoch": 2.39, + "grad_norm": 5.385663032531738, + "learning_rate": 5.338109473396321e-06, + "logits/chosen": -0.5184066891670227, + "logits/rejected": -0.5653469562530518, + "logps/chosen": -56.035545349121094, + "logps/rejected": -96.39376831054688, + "loss": 0.6995, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.926171064376831, + "rewards/margins": 6.014768600463867, + "rewards/rejected": -3.088597536087036, + "step": 9562 + }, + { + "epoch": 2.39, + "grad_norm": 5.581878662109375, + "learning_rate": 5.337325280677937e-06, + "logits/chosen": -0.5213368535041809, + "logits/rejected": -0.6130322813987732, + "logps/chosen": -67.84663391113281, + "logps/rejected": -82.23918151855469, + "loss": 0.7026, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9274327754974365, + "rewards/margins": 5.631872177124023, + "rewards/rejected": -2.704439401626587, + "step": 9563 + }, + { + "epoch": 2.39, + "grad_norm": 4.846169948577881, + "learning_rate": 5.336541079623888e-06, + "logits/chosen": -0.5353772640228271, + "logits/rejected": -0.5728867053985596, + "logps/chosen": -48.64472961425781, + "logps/rejected": -101.38799285888672, + "loss": 0.6978, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8730990886688232, + "rewards/margins": 5.937520980834961, + "rewards/rejected": -3.0644214153289795, + "step": 9564 + }, + { + "epoch": 2.39, + "grad_norm": 7.018143177032471, + "learning_rate": 5.335756870253551e-06, + "logits/chosen": -0.5578560829162598, + "logits/rejected": -0.6296103000640869, + "logps/chosen": -58.32941436767578, + "logps/rejected": -98.34611511230469, + "loss": 0.7108, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0863609313964844, + "rewards/margins": 6.233039855957031, + "rewards/rejected": -3.146679401397705, + "step": 9565 + }, + { + "epoch": 2.39, + "grad_norm": 12.306593894958496, + "learning_rate": 5.334972652586304e-06, + "logits/chosen": -0.4720842242240906, + "logits/rejected": -0.5908939838409424, + "logps/chosen": -59.472572326660156, + "logps/rejected": -90.33827209472656, + "loss": 1.0165, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7608914375305176, + "rewards/margins": 5.831619739532471, + "rewards/rejected": -3.0707285404205322, + "step": 9566 + }, + { + "epoch": 2.39, + "grad_norm": 3.7021121978759766, + "learning_rate": 5.3341884266415265e-06, + "logits/chosen": -0.5151684284210205, + "logits/rejected": -0.650007426738739, + "logps/chosen": -63.20928192138672, + "logps/rejected": -90.98919677734375, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9567065238952637, + "rewards/margins": 6.2221174240112305, + "rewards/rejected": -3.265410900115967, + "step": 9567 + }, + { + "epoch": 2.39, + "grad_norm": 4.453731060028076, + "learning_rate": 5.333404192438599e-06, + "logits/chosen": -0.5332988500595093, + "logits/rejected": -0.6105560064315796, + "logps/chosen": -56.88408279418945, + "logps/rejected": -91.94715881347656, + "loss": 0.7461, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8127079010009766, + "rewards/margins": 5.133421421051025, + "rewards/rejected": -2.320714235305786, + "step": 9568 + }, + { + "epoch": 2.39, + "grad_norm": 24.48269271850586, + "learning_rate": 5.3326199499968975e-06, + "logits/chosen": -0.47132062911987305, + "logits/rejected": -0.5864236354827881, + "logps/chosen": -58.47906494140625, + "logps/rejected": -103.5520248413086, + "loss": 0.6986, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0221595764160156, + "rewards/margins": 5.9170989990234375, + "rewards/rejected": -2.8949391841888428, + "step": 9569 + }, + { + "epoch": 2.39, + "grad_norm": 9.169466972351074, + "learning_rate": 5.331835699335803e-06, + "logits/chosen": -0.5111619830131531, + "logits/rejected": -0.6001767516136169, + "logps/chosen": -55.501976013183594, + "logps/rejected": -94.2564468383789, + "loss": 0.6824, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9765281677246094, + "rewards/margins": 6.333572864532471, + "rewards/rejected": -3.3570444583892822, + "step": 9570 + }, + { + "epoch": 2.39, + "grad_norm": 5.808875560760498, + "learning_rate": 5.331051440474697e-06, + "logits/chosen": -0.540195107460022, + "logits/rejected": -0.6243562698364258, + "logps/chosen": -66.96211242675781, + "logps/rejected": -107.1070327758789, + "loss": 0.6901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.169046640396118, + "rewards/margins": 7.2948222160339355, + "rewards/rejected": -4.125775337219238, + "step": 9571 + }, + { + "epoch": 2.39, + "grad_norm": 3.0625159740448, + "learning_rate": 5.330267173432957e-06, + "logits/chosen": -0.4685254395008087, + "logits/rejected": -0.569239616394043, + "logps/chosen": -66.72648620605469, + "logps/rejected": -87.49798583984375, + "loss": 0.665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9558379650115967, + "rewards/margins": 6.888331413269043, + "rewards/rejected": -3.9324934482574463, + "step": 9572 + }, + { + "epoch": 2.39, + "grad_norm": 7.705096244812012, + "learning_rate": 5.329482898229964e-06, + "logits/chosen": -0.5289090871810913, + "logits/rejected": -0.5746772289276123, + "logps/chosen": -49.274681091308594, + "logps/rejected": -95.95053100585938, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9156248569488525, + "rewards/margins": 5.743784427642822, + "rewards/rejected": -2.828159809112549, + "step": 9573 + }, + { + "epoch": 2.4, + "grad_norm": 10.499653816223145, + "learning_rate": 5.328698614885098e-06, + "logits/chosen": -0.4922800660133362, + "logits/rejected": -0.5072323083877563, + "logps/chosen": -57.603492736816406, + "logps/rejected": -107.94200134277344, + "loss": 0.8223, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.25595760345459, + "rewards/margins": 5.182811737060547, + "rewards/rejected": -1.926853895187378, + "step": 9574 + }, + { + "epoch": 2.4, + "grad_norm": 4.249330520629883, + "learning_rate": 5.327914323417739e-06, + "logits/chosen": -0.5367364287376404, + "logits/rejected": -0.6102805733680725, + "logps/chosen": -48.49458312988281, + "logps/rejected": -90.47518920898438, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1575355529785156, + "rewards/margins": 6.4629034996032715, + "rewards/rejected": -3.305368185043335, + "step": 9575 + }, + { + "epoch": 2.4, + "grad_norm": 8.799766540527344, + "learning_rate": 5.327130023847269e-06, + "logits/chosen": -0.4338414669036865, + "logits/rejected": -0.5175581574440002, + "logps/chosen": -51.50045394897461, + "logps/rejected": -82.4095230102539, + "loss": 0.6519, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3110666275024414, + "rewards/margins": 5.110424041748047, + "rewards/rejected": -1.7993574142456055, + "step": 9576 + }, + { + "epoch": 2.4, + "grad_norm": 7.804141044616699, + "learning_rate": 5.326345716193068e-06, + "logits/chosen": -0.4893190860748291, + "logits/rejected": -0.5797609090805054, + "logps/chosen": -71.0479736328125, + "logps/rejected": -96.30384063720703, + "loss": 0.7161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.814056396484375, + "rewards/margins": 5.508708953857422, + "rewards/rejected": -2.694653034210205, + "step": 9577 + }, + { + "epoch": 2.4, + "grad_norm": 3.1861727237701416, + "learning_rate": 5.325561400474518e-06, + "logits/chosen": -0.517142117023468, + "logits/rejected": -0.5939494371414185, + "logps/chosen": -55.111515045166016, + "logps/rejected": -99.77326202392578, + "loss": 0.6629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0123655796051025, + "rewards/margins": 6.606807708740234, + "rewards/rejected": -3.594442844390869, + "step": 9578 + }, + { + "epoch": 2.4, + "grad_norm": 5.152138710021973, + "learning_rate": 5.324777076710998e-06, + "logits/chosen": -0.45782536268234253, + "logits/rejected": -0.5131281614303589, + "logps/chosen": -55.729007720947266, + "logps/rejected": -104.03109741210938, + "loss": 0.6997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.81321120262146, + "rewards/margins": 6.133982181549072, + "rewards/rejected": -3.3207712173461914, + "step": 9579 + }, + { + "epoch": 2.4, + "grad_norm": 6.886508941650391, + "learning_rate": 5.323992744921892e-06, + "logits/chosen": -0.4290313720703125, + "logits/rejected": -0.5369517207145691, + "logps/chosen": -59.95159149169922, + "logps/rejected": -87.88917541503906, + "loss": 0.7159, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.545053720474243, + "rewards/margins": 5.245471954345703, + "rewards/rejected": -2.70041823387146, + "step": 9580 + }, + { + "epoch": 2.4, + "grad_norm": 4.921201705932617, + "learning_rate": 5.32320840512658e-06, + "logits/chosen": -0.4956091046333313, + "logits/rejected": -0.5725815296173096, + "logps/chosen": -53.688148498535156, + "logps/rejected": -96.53286743164062, + "loss": 0.7239, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9838149547576904, + "rewards/margins": 5.810606956481934, + "rewards/rejected": -2.8267924785614014, + "step": 9581 + }, + { + "epoch": 2.4, + "grad_norm": 6.311140060424805, + "learning_rate": 5.322424057344446e-06, + "logits/chosen": -0.46132364869117737, + "logits/rejected": -0.5532798171043396, + "logps/chosen": -51.878173828125, + "logps/rejected": -108.15550994873047, + "loss": 0.6373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9870822429656982, + "rewards/margins": 6.708791732788086, + "rewards/rejected": -3.7217092514038086, + "step": 9582 + }, + { + "epoch": 2.4, + "grad_norm": 5.921715259552002, + "learning_rate": 5.321639701594869e-06, + "logits/chosen": -0.49592214822769165, + "logits/rejected": -0.6119226217269897, + "logps/chosen": -58.12866973876953, + "logps/rejected": -90.14874267578125, + "loss": 0.7619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8814101219177246, + "rewards/margins": 6.204946517944336, + "rewards/rejected": -3.3235363960266113, + "step": 9583 + }, + { + "epoch": 2.4, + "grad_norm": 24.477880477905273, + "learning_rate": 5.3208553378972335e-06, + "logits/chosen": -0.5101114511489868, + "logits/rejected": -0.5526236891746521, + "logps/chosen": -66.00433349609375, + "logps/rejected": -95.02467346191406, + "loss": 0.8016, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.821561574935913, + "rewards/margins": 4.61123514175415, + "rewards/rejected": -1.7896738052368164, + "step": 9584 + }, + { + "epoch": 2.4, + "grad_norm": 5.758352279663086, + "learning_rate": 5.320070966270923e-06, + "logits/chosen": -0.49480485916137695, + "logits/rejected": -0.6301829218864441, + "logps/chosen": -51.27098846435547, + "logps/rejected": -85.0357894897461, + "loss": 0.6826, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011941909790039, + "rewards/margins": 6.214635848999023, + "rewards/rejected": -3.2026944160461426, + "step": 9585 + }, + { + "epoch": 2.4, + "grad_norm": 4.935934066772461, + "learning_rate": 5.319286586735315e-06, + "logits/chosen": -0.5339176654815674, + "logits/rejected": -0.5681995153427124, + "logps/chosen": -54.229801177978516, + "logps/rejected": -104.57930755615234, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.11391282081604, + "rewards/margins": 5.548280239105225, + "rewards/rejected": -2.4343676567077637, + "step": 9586 + }, + { + "epoch": 2.4, + "grad_norm": 16.630647659301758, + "learning_rate": 5.318502199309797e-06, + "logits/chosen": -0.5186740756034851, + "logits/rejected": -0.6169061660766602, + "logps/chosen": -68.63947296142578, + "logps/rejected": -92.69786071777344, + "loss": 0.8806, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.686367988586426, + "rewards/margins": 5.163795471191406, + "rewards/rejected": -2.4774279594421387, + "step": 9587 + }, + { + "epoch": 2.4, + "grad_norm": 17.65862464904785, + "learning_rate": 5.317717804013752e-06, + "logits/chosen": -0.41394391655921936, + "logits/rejected": -0.4820956289768219, + "logps/chosen": -68.12785339355469, + "logps/rejected": -102.91665649414062, + "loss": 0.7958, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.767646551132202, + "rewards/margins": 5.266648769378662, + "rewards/rejected": -2.4990017414093018, + "step": 9588 + }, + { + "epoch": 2.4, + "grad_norm": 9.467284202575684, + "learning_rate": 5.316933400866562e-06, + "logits/chosen": -0.4436069428920746, + "logits/rejected": -0.5510700941085815, + "logps/chosen": -70.11148071289062, + "logps/rejected": -103.27153015136719, + "loss": 0.7178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.735023021697998, + "rewards/margins": 5.784058094024658, + "rewards/rejected": -3.04903507232666, + "step": 9589 + }, + { + "epoch": 2.4, + "grad_norm": 5.051344871520996, + "learning_rate": 5.31614898988761e-06, + "logits/chosen": -0.4995463788509369, + "logits/rejected": -0.5571906566619873, + "logps/chosen": -53.02326965332031, + "logps/rejected": -95.21546173095703, + "loss": 0.7016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.860900402069092, + "rewards/margins": 5.545973777770996, + "rewards/rejected": -2.6850738525390625, + "step": 9590 + }, + { + "epoch": 2.4, + "grad_norm": 6.684058666229248, + "learning_rate": 5.315364571096281e-06, + "logits/chosen": -0.469307541847229, + "logits/rejected": -0.5908229351043701, + "logps/chosen": -60.0494270324707, + "logps/rejected": -95.01295471191406, + "loss": 0.7904, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.604691982269287, + "rewards/margins": 5.915947437286377, + "rewards/rejected": -3.31125545501709, + "step": 9591 + }, + { + "epoch": 2.4, + "grad_norm": 7.6364545822143555, + "learning_rate": 5.314580144511956e-06, + "logits/chosen": -0.525044322013855, + "logits/rejected": -0.6165362000465393, + "logps/chosen": -49.76393508911133, + "logps/rejected": -88.25320434570312, + "loss": 0.5629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1624741554260254, + "rewards/margins": 5.862844944000244, + "rewards/rejected": -2.700371265411377, + "step": 9592 + }, + { + "epoch": 2.4, + "grad_norm": 7.4619975090026855, + "learning_rate": 5.3137957101540225e-06, + "logits/chosen": -0.42018574476242065, + "logits/rejected": -0.5531121492385864, + "logps/chosen": -51.702117919921875, + "logps/rejected": -93.52447509765625, + "loss": 0.5901, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.162508964538574, + "rewards/margins": 7.173728942871094, + "rewards/rejected": -4.0112199783325195, + "step": 9593 + }, + { + "epoch": 2.4, + "grad_norm": 9.241837501525879, + "learning_rate": 5.313011268041863e-06, + "logits/chosen": -0.4966525435447693, + "logits/rejected": -0.5705239176750183, + "logps/chosen": -60.39345932006836, + "logps/rejected": -102.23228454589844, + "loss": 0.7792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0158512592315674, + "rewards/margins": 5.880827903747559, + "rewards/rejected": -2.8649768829345703, + "step": 9594 + }, + { + "epoch": 2.4, + "grad_norm": 4.608242034912109, + "learning_rate": 5.312226818194862e-06, + "logits/chosen": -0.4288335144519806, + "logits/rejected": -0.558989405632019, + "logps/chosen": -51.1198844909668, + "logps/rejected": -87.05311584472656, + "loss": 0.6171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2303457260131836, + "rewards/margins": 6.825860977172852, + "rewards/rejected": -3.5955147743225098, + "step": 9595 + }, + { + "epoch": 2.4, + "grad_norm": 5.607171535491943, + "learning_rate": 5.311442360632405e-06, + "logits/chosen": -0.5479636788368225, + "logits/rejected": -0.5715051889419556, + "logps/chosen": -48.41650390625, + "logps/rejected": -99.26532745361328, + "loss": 0.7194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049971342086792, + "rewards/margins": 5.539000034332275, + "rewards/rejected": -2.4890284538269043, + "step": 9596 + }, + { + "epoch": 2.4, + "grad_norm": 3.646879196166992, + "learning_rate": 5.310657895373875e-06, + "logits/chosen": -0.502352774143219, + "logits/rejected": -0.5862138867378235, + "logps/chosen": -50.32244110107422, + "logps/rejected": -96.25298309326172, + "loss": 0.5673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.30802583694458, + "rewards/margins": 6.250223159790039, + "rewards/rejected": -2.942197322845459, + "step": 9597 + }, + { + "epoch": 2.4, + "grad_norm": 2.391853094100952, + "learning_rate": 5.309873422438658e-06, + "logits/chosen": -0.4864083230495453, + "logits/rejected": -0.6300915479660034, + "logps/chosen": -54.43877029418945, + "logps/rejected": -74.46087646484375, + "loss": 0.6579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.40008544921875, + "rewards/margins": 6.235387802124023, + "rewards/rejected": -2.8353028297424316, + "step": 9598 + }, + { + "epoch": 2.4, + "grad_norm": 4.0486578941345215, + "learning_rate": 5.30908894184614e-06, + "logits/chosen": -0.5211690068244934, + "logits/rejected": -0.5499728322029114, + "logps/chosen": -48.196346282958984, + "logps/rejected": -100.31961822509766, + "loss": 0.5654, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2834248542785645, + "rewards/margins": 5.99522590637207, + "rewards/rejected": -2.711801052093506, + "step": 9599 + }, + { + "epoch": 2.4, + "grad_norm": 7.390840530395508, + "learning_rate": 5.3083044536157045e-06, + "logits/chosen": -0.5385340452194214, + "logits/rejected": -0.6180335283279419, + "logps/chosen": -47.9007453918457, + "logps/rejected": -87.05776977539062, + "loss": 0.7662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1431803703308105, + "rewards/margins": 5.698829650878906, + "rewards/rejected": -2.5556492805480957, + "step": 9600 + }, + { + "epoch": 2.4, + "grad_norm": 5.444338798522949, + "learning_rate": 5.307519957766739e-06, + "logits/chosen": -0.46608489751815796, + "logits/rejected": -0.558185338973999, + "logps/chosen": -72.31991577148438, + "logps/rejected": -111.40369415283203, + "loss": 0.7253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9638519287109375, + "rewards/margins": 6.359567642211914, + "rewards/rejected": -3.3957152366638184, + "step": 9601 + }, + { + "epoch": 2.4, + "grad_norm": 5.3946709632873535, + "learning_rate": 5.306735454318628e-06, + "logits/chosen": -0.5242631435394287, + "logits/rejected": -0.5561649799346924, + "logps/chosen": -58.85493469238281, + "logps/rejected": -104.86858367919922, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115664005279541, + "rewards/margins": 5.577757358551025, + "rewards/rejected": -2.462092876434326, + "step": 9602 + }, + { + "epoch": 2.4, + "grad_norm": 4.387210845947266, + "learning_rate": 5.305950943290758e-06, + "logits/chosen": -0.5468490123748779, + "logits/rejected": -0.6053099036216736, + "logps/chosen": -61.72251892089844, + "logps/rejected": -83.641357421875, + "loss": 0.8236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8951876163482666, + "rewards/margins": 4.265994071960449, + "rewards/rejected": -1.3708065748214722, + "step": 9603 + }, + { + "epoch": 2.4, + "grad_norm": 4.973824501037598, + "learning_rate": 5.305166424702514e-06, + "logits/chosen": -0.4219764471054077, + "logits/rejected": -0.4847381114959717, + "logps/chosen": -59.576690673828125, + "logps/rejected": -86.3780517578125, + "loss": 0.6605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1796183586120605, + "rewards/margins": 5.004659175872803, + "rewards/rejected": -1.8250409364700317, + "step": 9604 + }, + { + "epoch": 2.4, + "grad_norm": 2.35693097114563, + "learning_rate": 5.304381898573284e-06, + "logits/chosen": -0.5033371448516846, + "logits/rejected": -0.5694519877433777, + "logps/chosen": -52.33088684082031, + "logps/rejected": -116.60409545898438, + "loss": 0.6215, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1219000816345215, + "rewards/margins": 7.305262565612793, + "rewards/rejected": -4.1833624839782715, + "step": 9605 + }, + { + "epoch": 2.4, + "grad_norm": 3.678720235824585, + "learning_rate": 5.303597364922452e-06, + "logits/chosen": -0.4922119081020355, + "logits/rejected": -0.5773136019706726, + "logps/chosen": -47.520572662353516, + "logps/rejected": -87.8891372680664, + "loss": 0.5887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.249211549758911, + "rewards/margins": 5.671886444091797, + "rewards/rejected": -2.422675371170044, + "step": 9606 + }, + { + "epoch": 2.4, + "grad_norm": 11.174978256225586, + "learning_rate": 5.302812823769407e-06, + "logits/chosen": -0.4623635411262512, + "logits/rejected": -0.5679595470428467, + "logps/chosen": -63.911903381347656, + "logps/rejected": -109.99665832519531, + "loss": 0.668, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7791314125061035, + "rewards/margins": 5.375839710235596, + "rewards/rejected": -2.5967085361480713, + "step": 9607 + }, + { + "epoch": 2.4, + "grad_norm": 3.740264892578125, + "learning_rate": 5.302028275133535e-06, + "logits/chosen": -0.5748777389526367, + "logits/rejected": -0.6802673935890198, + "logps/chosen": -59.61752700805664, + "logps/rejected": -102.2271957397461, + "loss": 0.6151, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.294349193572998, + "rewards/margins": 7.194701194763184, + "rewards/rejected": -3.9003517627716064, + "step": 9608 + }, + { + "epoch": 2.4, + "grad_norm": 3.9935359954833984, + "learning_rate": 5.301243719034223e-06, + "logits/chosen": -0.4501185715198517, + "logits/rejected": -0.5810422897338867, + "logps/chosen": -59.78851318359375, + "logps/rejected": -75.45879364013672, + "loss": 0.6717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904963254928589, + "rewards/margins": 4.644412517547607, + "rewards/rejected": -1.7394487857818604, + "step": 9609 + }, + { + "epoch": 2.4, + "grad_norm": 4.737534046173096, + "learning_rate": 5.3004591554908595e-06, + "logits/chosen": -0.4645230770111084, + "logits/rejected": -0.5586496591567993, + "logps/chosen": -52.266170501708984, + "logps/rejected": -93.21346282958984, + "loss": 0.6797, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.944394588470459, + "rewards/margins": 5.3393330574035645, + "rewards/rejected": -2.3949382305145264, + "step": 9610 + }, + { + "epoch": 2.4, + "grad_norm": 8.411789894104004, + "learning_rate": 5.299674584522829e-06, + "logits/chosen": -0.47963947057724, + "logits/rejected": -0.5897607803344727, + "logps/chosen": -50.15817642211914, + "logps/rejected": -72.0352554321289, + "loss": 0.7366, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0557472705841064, + "rewards/margins": 5.093820095062256, + "rewards/rejected": -2.0380730628967285, + "step": 9611 + }, + { + "epoch": 2.4, + "grad_norm": 15.57554817199707, + "learning_rate": 5.2988900061495205e-06, + "logits/chosen": -0.5035073161125183, + "logits/rejected": -0.5357998609542847, + "logps/chosen": -80.70491027832031, + "logps/rejected": -100.60758209228516, + "loss": 0.866, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0388379096984863, + "rewards/margins": 4.608758926391602, + "rewards/rejected": -1.5699207782745361, + "step": 9612 + }, + { + "epoch": 2.4, + "grad_norm": 2.990018844604492, + "learning_rate": 5.298105420390324e-06, + "logits/chosen": -0.5388551950454712, + "logits/rejected": -0.5967593789100647, + "logps/chosen": -49.95896530151367, + "logps/rejected": -93.54927825927734, + "loss": 0.6506, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0593090057373047, + "rewards/margins": 5.71856164932251, + "rewards/rejected": -2.659252166748047, + "step": 9613 + }, + { + "epoch": 2.41, + "grad_norm": 5.840296268463135, + "learning_rate": 5.297320827264624e-06, + "logits/chosen": -0.5151926875114441, + "logits/rejected": -0.6046713590621948, + "logps/chosen": -57.444549560546875, + "logps/rejected": -105.19044494628906, + "loss": 0.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9642281532287598, + "rewards/margins": 5.557342529296875, + "rewards/rejected": -2.593114137649536, + "step": 9614 + }, + { + "epoch": 2.41, + "grad_norm": 4.546699523925781, + "learning_rate": 5.29653622679181e-06, + "logits/chosen": -0.4809165894985199, + "logits/rejected": -0.589006245136261, + "logps/chosen": -57.822227478027344, + "logps/rejected": -101.78321075439453, + "loss": 0.6979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0577211380004883, + "rewards/margins": 6.272490978240967, + "rewards/rejected": -3.2147700786590576, + "step": 9615 + }, + { + "epoch": 2.41, + "grad_norm": 4.108274936676025, + "learning_rate": 5.2957516189912715e-06, + "logits/chosen": -0.4352262616157532, + "logits/rejected": -0.4883255958557129, + "logps/chosen": -48.26792907714844, + "logps/rejected": -99.24732208251953, + "loss": 0.558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164156198501587, + "rewards/margins": 5.95395040512085, + "rewards/rejected": -2.789794445037842, + "step": 9616 + }, + { + "epoch": 2.41, + "grad_norm": 2.4737164974212646, + "learning_rate": 5.294967003882395e-06, + "logits/chosen": -0.4609020948410034, + "logits/rejected": -0.4614437222480774, + "logps/chosen": -51.540164947509766, + "logps/rejected": -113.13526153564453, + "loss": 0.6315, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8847339153289795, + "rewards/margins": 6.304736137390137, + "rewards/rejected": -3.4200026988983154, + "step": 9617 + }, + { + "epoch": 2.41, + "grad_norm": 7.667257785797119, + "learning_rate": 5.29418238148457e-06, + "logits/chosen": -0.4605826735496521, + "logits/rejected": -0.5429337620735168, + "logps/chosen": -51.868404388427734, + "logps/rejected": -83.67088317871094, + "loss": 0.6908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.78320050239563, + "rewards/margins": 5.023004531860352, + "rewards/rejected": -2.2398030757904053, + "step": 9618 + }, + { + "epoch": 2.41, + "grad_norm": 4.4652605056762695, + "learning_rate": 5.293397751817188e-06, + "logits/chosen": -0.5102393627166748, + "logits/rejected": -0.539435863494873, + "logps/chosen": -45.76441955566406, + "logps/rejected": -103.36561584472656, + "loss": 0.6749, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.105112314224243, + "rewards/margins": 5.243584632873535, + "rewards/rejected": -2.138472318649292, + "step": 9619 + }, + { + "epoch": 2.41, + "grad_norm": 3.583777904510498, + "learning_rate": 5.292613114899631e-06, + "logits/chosen": -0.3891516923904419, + "logits/rejected": -0.4894716143608093, + "logps/chosen": -67.44171142578125, + "logps/rejected": -88.97222900390625, + "loss": 0.6753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.202820062637329, + "rewards/margins": 5.712863922119141, + "rewards/rejected": -2.5100433826446533, + "step": 9620 + }, + { + "epoch": 2.41, + "grad_norm": 3.162137985229492, + "learning_rate": 5.291828470751295e-06, + "logits/chosen": -0.4858649671077728, + "logits/rejected": -0.5846189856529236, + "logps/chosen": -56.51121520996094, + "logps/rejected": -100.06864929199219, + "loss": 0.6078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0319268703460693, + "rewards/margins": 6.877860069274902, + "rewards/rejected": -3.8459339141845703, + "step": 9621 + }, + { + "epoch": 2.41, + "grad_norm": 7.336938381195068, + "learning_rate": 5.291043819391568e-06, + "logits/chosen": -0.4800965189933777, + "logits/rejected": -0.5617931485176086, + "logps/chosen": -55.35060119628906, + "logps/rejected": -102.8731460571289, + "loss": 0.7785, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.368802309036255, + "rewards/margins": 5.577757358551025, + "rewards/rejected": -2.2089552879333496, + "step": 9622 + }, + { + "epoch": 2.41, + "grad_norm": 5.638926029205322, + "learning_rate": 5.290259160839838e-06, + "logits/chosen": -0.5757828950881958, + "logits/rejected": -0.5540964007377625, + "logps/chosen": -42.791099548339844, + "logps/rejected": -108.698486328125, + "loss": 0.6009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.322904109954834, + "rewards/margins": 6.349782943725586, + "rewards/rejected": -3.0268781185150146, + "step": 9623 + }, + { + "epoch": 2.41, + "grad_norm": 10.989593505859375, + "learning_rate": 5.289474495115494e-06, + "logits/chosen": -0.569617509841919, + "logits/rejected": -0.6384721994400024, + "logps/chosen": -52.57713317871094, + "logps/rejected": -104.87589263916016, + "loss": 0.6771, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8757855892181396, + "rewards/margins": 6.1338582038879395, + "rewards/rejected": -3.258072853088379, + "step": 9624 + }, + { + "epoch": 2.41, + "grad_norm": 8.296910285949707, + "learning_rate": 5.28868982223793e-06, + "logits/chosen": -0.4777306020259857, + "logits/rejected": -0.5632364153862, + "logps/chosen": -55.63108825683594, + "logps/rejected": -100.73403930664062, + "loss": 0.7406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8402249813079834, + "rewards/margins": 6.050131797790527, + "rewards/rejected": -3.209906816482544, + "step": 9625 + }, + { + "epoch": 2.41, + "grad_norm": 5.543106555938721, + "learning_rate": 5.2879051422265305e-06, + "logits/chosen": -0.5294805765151978, + "logits/rejected": -0.5857332348823547, + "logps/chosen": -48.7819709777832, + "logps/rejected": -102.63520812988281, + "loss": 0.6539, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8820090293884277, + "rewards/margins": 5.863626003265381, + "rewards/rejected": -2.9816172122955322, + "step": 9626 + }, + { + "epoch": 2.41, + "grad_norm": 6.289992809295654, + "learning_rate": 5.287120455100692e-06, + "logits/chosen": -0.5757400393486023, + "logits/rejected": -0.620379626750946, + "logps/chosen": -59.50996398925781, + "logps/rejected": -84.78900909423828, + "loss": 0.8147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.596649169921875, + "rewards/margins": 4.240972518920898, + "rewards/rejected": -1.6443235874176025, + "step": 9627 + }, + { + "epoch": 2.41, + "grad_norm": 4.712606906890869, + "learning_rate": 5.286335760879799e-06, + "logits/chosen": -0.5172215700149536, + "logits/rejected": -0.6199570894241333, + "logps/chosen": -53.72088623046875, + "logps/rejected": -102.5924301147461, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.852572441101074, + "rewards/margins": 7.2784576416015625, + "rewards/rejected": -4.425884246826172, + "step": 9628 + }, + { + "epoch": 2.41, + "grad_norm": 18.138734817504883, + "learning_rate": 5.285551059583245e-06, + "logits/chosen": -0.5266283750534058, + "logits/rejected": -0.5842800140380859, + "logps/chosen": -63.90653610229492, + "logps/rejected": -87.49822998046875, + "loss": 0.6935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9775989055633545, + "rewards/margins": 5.275928974151611, + "rewards/rejected": -2.298330068588257, + "step": 9629 + }, + { + "epoch": 2.41, + "grad_norm": 5.32060432434082, + "learning_rate": 5.284766351230423e-06, + "logits/chosen": -0.5439013242721558, + "logits/rejected": -0.579591691493988, + "logps/chosen": -63.12590789794922, + "logps/rejected": -120.23441314697266, + "loss": 0.7187, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6476504802703857, + "rewards/margins": 5.480863571166992, + "rewards/rejected": -2.8332133293151855, + "step": 9630 + }, + { + "epoch": 2.41, + "grad_norm": 4.060593605041504, + "learning_rate": 5.283981635840719e-06, + "logits/chosen": -0.41140639781951904, + "logits/rejected": -0.5170117616653442, + "logps/chosen": -63.05601501464844, + "logps/rejected": -97.80965423583984, + "loss": 0.6501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1353812217712402, + "rewards/margins": 5.926032066345215, + "rewards/rejected": -2.7906503677368164, + "step": 9631 + }, + { + "epoch": 2.41, + "grad_norm": 3.848179340362549, + "learning_rate": 5.283196913433527e-06, + "logits/chosen": -0.5066298246383667, + "logits/rejected": -0.5553148984909058, + "logps/chosen": -52.381385803222656, + "logps/rejected": -103.3316421508789, + "loss": 0.667, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7673401832580566, + "rewards/margins": 5.583272457122803, + "rewards/rejected": -2.815932035446167, + "step": 9632 + }, + { + "epoch": 2.41, + "grad_norm": 5.940128326416016, + "learning_rate": 5.282412184028238e-06, + "logits/chosen": -0.5027183890342712, + "logits/rejected": -0.5440773367881775, + "logps/chosen": -51.93190002441406, + "logps/rejected": -91.06040954589844, + "loss": 0.7963, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.813286304473877, + "rewards/margins": 5.407027721405029, + "rewards/rejected": -2.593740940093994, + "step": 9633 + }, + { + "epoch": 2.41, + "grad_norm": 8.339659690856934, + "learning_rate": 5.281627447644245e-06, + "logits/chosen": -0.5249623656272888, + "logits/rejected": -0.5790674686431885, + "logps/chosen": -50.288631439208984, + "logps/rejected": -98.39961242675781, + "loss": 0.7431, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7180726528167725, + "rewards/margins": 5.176628112792969, + "rewards/rejected": -2.4585559368133545, + "step": 9634 + }, + { + "epoch": 2.41, + "grad_norm": 7.448155879974365, + "learning_rate": 5.280842704300937e-06, + "logits/chosen": -0.5315860509872437, + "logits/rejected": -0.6149882078170776, + "logps/chosen": -59.63543701171875, + "logps/rejected": -97.31098937988281, + "loss": 0.7957, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8875744342803955, + "rewards/margins": 5.708419322967529, + "rewards/rejected": -2.820845365524292, + "step": 9635 + }, + { + "epoch": 2.41, + "grad_norm": 10.48373794555664, + "learning_rate": 5.2800579540177074e-06, + "logits/chosen": -0.5260763764381409, + "logits/rejected": -0.6256383061408997, + "logps/chosen": -59.642520904541016, + "logps/rejected": -95.36270141601562, + "loss": 0.7073, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0162172317504883, + "rewards/margins": 6.188364028930664, + "rewards/rejected": -3.172146797180176, + "step": 9636 + }, + { + "epoch": 2.41, + "grad_norm": 3.3010401725769043, + "learning_rate": 5.279273196813948e-06, + "logits/chosen": -0.5912380218505859, + "logits/rejected": -0.7056734561920166, + "logps/chosen": -56.23725128173828, + "logps/rejected": -95.22727966308594, + "loss": 0.5944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.128998041152954, + "rewards/margins": 7.2165303230285645, + "rewards/rejected": -4.087532997131348, + "step": 9637 + }, + { + "epoch": 2.41, + "grad_norm": 5.480834007263184, + "learning_rate": 5.27848843270905e-06, + "logits/chosen": -0.562400221824646, + "logits/rejected": -0.6626667380332947, + "logps/chosen": -59.86048126220703, + "logps/rejected": -94.201904296875, + "loss": 0.7268, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7619802951812744, + "rewards/margins": 5.756230354309082, + "rewards/rejected": -2.9942498207092285, + "step": 9638 + }, + { + "epoch": 2.41, + "grad_norm": 6.900951385498047, + "learning_rate": 5.277703661722409e-06, + "logits/chosen": -0.5455008149147034, + "logits/rejected": -0.6809020042419434, + "logps/chosen": -60.216346740722656, + "logps/rejected": -92.92494201660156, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0539937019348145, + "rewards/margins": 6.210609436035156, + "rewards/rejected": -3.1566152572631836, + "step": 9639 + }, + { + "epoch": 2.41, + "grad_norm": 10.489784240722656, + "learning_rate": 5.276918883873412e-06, + "logits/chosen": -0.5691028833389282, + "logits/rejected": -0.6654354929924011, + "logps/chosen": -59.57534408569336, + "logps/rejected": -88.7780532836914, + "loss": 0.757, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5489814281463623, + "rewards/margins": 4.957049369812012, + "rewards/rejected": -2.4080677032470703, + "step": 9640 + }, + { + "epoch": 2.41, + "grad_norm": 4.2714924812316895, + "learning_rate": 5.276134099181457e-06, + "logits/chosen": -0.5464394092559814, + "logits/rejected": -0.5618955492973328, + "logps/chosen": -56.20527648925781, + "logps/rejected": -92.48614501953125, + "loss": 0.6389, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139988899230957, + "rewards/margins": 5.446788787841797, + "rewards/rejected": -2.3067996501922607, + "step": 9641 + }, + { + "epoch": 2.41, + "grad_norm": 6.054879188537598, + "learning_rate": 5.2753493076659334e-06, + "logits/chosen": -0.49767783284187317, + "logits/rejected": -0.5894474387168884, + "logps/chosen": -63.90911102294922, + "logps/rejected": -87.68184661865234, + "loss": 0.7353, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9406092166900635, + "rewards/margins": 5.287907600402832, + "rewards/rejected": -2.3472981452941895, + "step": 9642 + }, + { + "epoch": 2.41, + "grad_norm": 14.228825569152832, + "learning_rate": 5.274564509346236e-06, + "logits/chosen": -0.520621120929718, + "logits/rejected": -0.641899049282074, + "logps/chosen": -56.300514221191406, + "logps/rejected": -90.66368103027344, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176964282989502, + "rewards/margins": 6.325079917907715, + "rewards/rejected": -3.1481151580810547, + "step": 9643 + }, + { + "epoch": 2.41, + "grad_norm": 3.021787405014038, + "learning_rate": 5.273779704241758e-06, + "logits/chosen": -0.4910582900047302, + "logits/rejected": -0.4939083456993103, + "logps/chosen": -47.5361442565918, + "logps/rejected": -117.642333984375, + "loss": 0.5455, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2490904331207275, + "rewards/margins": 7.0302205085754395, + "rewards/rejected": -3.781129837036133, + "step": 9644 + }, + { + "epoch": 2.41, + "grad_norm": 6.464069366455078, + "learning_rate": 5.272994892371892e-06, + "logits/chosen": -0.4808962345123291, + "logits/rejected": -0.5417720079421997, + "logps/chosen": -61.958499908447266, + "logps/rejected": -91.61604309082031, + "loss": 0.7717, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.98294734954834, + "rewards/margins": 5.42457914352417, + "rewards/rejected": -2.441631555557251, + "step": 9645 + }, + { + "epoch": 2.41, + "grad_norm": 5.028491020202637, + "learning_rate": 5.272210073756031e-06, + "logits/chosen": -0.4631854295730591, + "logits/rejected": -0.5920884609222412, + "logps/chosen": -68.52471160888672, + "logps/rejected": -97.24844360351562, + "loss": 0.6708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9099884033203125, + "rewards/margins": 6.553028106689453, + "rewards/rejected": -3.6430399417877197, + "step": 9646 + }, + { + "epoch": 2.41, + "grad_norm": 6.6013383865356445, + "learning_rate": 5.271425248413571e-06, + "logits/chosen": -0.5043661594390869, + "logits/rejected": -0.53681480884552, + "logps/chosen": -55.319854736328125, + "logps/rejected": -101.68658447265625, + "loss": 0.7904, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.781813383102417, + "rewards/margins": 5.7736310958862305, + "rewards/rejected": -2.9918177127838135, + "step": 9647 + }, + { + "epoch": 2.41, + "grad_norm": 5.536857604980469, + "learning_rate": 5.270640416363905e-06, + "logits/chosen": -0.577781081199646, + "logits/rejected": -0.6282421946525574, + "logps/chosen": -62.02375793457031, + "logps/rejected": -111.35070037841797, + "loss": 0.7484, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.059875726699829, + "rewards/margins": 7.145318984985352, + "rewards/rejected": -4.085443496704102, + "step": 9648 + }, + { + "epoch": 2.41, + "grad_norm": 3.651315689086914, + "learning_rate": 5.2698555776264235e-06, + "logits/chosen": -0.4747329354286194, + "logits/rejected": -0.6036978363990784, + "logps/chosen": -68.89533996582031, + "logps/rejected": -103.5639419555664, + "loss": 0.643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.795278549194336, + "rewards/margins": 6.9562602043151855, + "rewards/rejected": -4.16098165512085, + "step": 9649 + }, + { + "epoch": 2.41, + "grad_norm": 5.039443492889404, + "learning_rate": 5.269070732220526e-06, + "logits/chosen": -0.5105704069137573, + "logits/rejected": -0.5783286094665527, + "logps/chosen": -64.95848846435547, + "logps/rejected": -101.04849243164062, + "loss": 0.6837, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9512906074523926, + "rewards/margins": 5.32621955871582, + "rewards/rejected": -2.374929428100586, + "step": 9650 + }, + { + "epoch": 2.41, + "grad_norm": 15.871721267700195, + "learning_rate": 5.268285880165605e-06, + "logits/chosen": -0.5148852467536926, + "logits/rejected": -0.5677759647369385, + "logps/chosen": -61.961875915527344, + "logps/rejected": -114.4632568359375, + "loss": 0.643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8292665481567383, + "rewards/margins": 5.746427536010742, + "rewards/rejected": -2.917161226272583, + "step": 9651 + }, + { + "epoch": 2.41, + "grad_norm": 22.88749122619629, + "learning_rate": 5.267501021481052e-06, + "logits/chosen": -0.5652968883514404, + "logits/rejected": -0.6487417221069336, + "logps/chosen": -51.567203521728516, + "logps/rejected": -84.67508697509766, + "loss": 0.7775, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1424546241760254, + "rewards/margins": 5.686142921447754, + "rewards/rejected": -2.5436878204345703, + "step": 9652 + }, + { + "epoch": 2.41, + "grad_norm": 8.855537414550781, + "learning_rate": 5.266716156186266e-06, + "logits/chosen": -0.5361695289611816, + "logits/rejected": -0.542413592338562, + "logps/chosen": -53.085609436035156, + "logps/rejected": -106.33993530273438, + "loss": 0.6636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.917336940765381, + "rewards/margins": 5.6004767417907715, + "rewards/rejected": -2.6831400394439697, + "step": 9653 + }, + { + "epoch": 2.42, + "grad_norm": 12.501424789428711, + "learning_rate": 5.265931284300639e-06, + "logits/chosen": -0.5114904642105103, + "logits/rejected": -0.5408039689064026, + "logps/chosen": -55.69085693359375, + "logps/rejected": -109.64553833007812, + "loss": 0.758, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.030932903289795, + "rewards/margins": 6.069492340087891, + "rewards/rejected": -3.0385591983795166, + "step": 9654 + }, + { + "epoch": 2.42, + "grad_norm": 11.42767333984375, + "learning_rate": 5.265146405843569e-06, + "logits/chosen": -0.5559324622154236, + "logits/rejected": -0.6054049730300903, + "logps/chosen": -51.376060485839844, + "logps/rejected": -108.25515747070312, + "loss": 0.7491, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2980310916900635, + "rewards/margins": 6.332882404327393, + "rewards/rejected": -3.034851312637329, + "step": 9655 + }, + { + "epoch": 2.42, + "grad_norm": 19.696571350097656, + "learning_rate": 5.264361520834447e-06, + "logits/chosen": -0.5434324145317078, + "logits/rejected": -0.6132379174232483, + "logps/chosen": -63.01737976074219, + "logps/rejected": -99.60667419433594, + "loss": 0.6728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.557232141494751, + "rewards/margins": 6.4377336502075195, + "rewards/rejected": -3.8805012702941895, + "step": 9656 + }, + { + "epoch": 2.42, + "grad_norm": 5.807017803192139, + "learning_rate": 5.2635766292926725e-06, + "logits/chosen": -0.5038303136825562, + "logits/rejected": -0.5753933787345886, + "logps/chosen": -51.96046829223633, + "logps/rejected": -110.05841827392578, + "loss": 0.7242, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8228955268859863, + "rewards/margins": 6.515729904174805, + "rewards/rejected": -3.6928348541259766, + "step": 9657 + }, + { + "epoch": 2.42, + "grad_norm": 6.339654445648193, + "learning_rate": 5.262791731237637e-06, + "logits/chosen": -0.5299459099769592, + "logits/rejected": -0.6005347967147827, + "logps/chosen": -59.326290130615234, + "logps/rejected": -100.96903991699219, + "loss": 0.7527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0666725635528564, + "rewards/margins": 6.58057975769043, + "rewards/rejected": -3.5139071941375732, + "step": 9658 + }, + { + "epoch": 2.42, + "grad_norm": 5.172359466552734, + "learning_rate": 5.26200682668874e-06, + "logits/chosen": -0.3956185579299927, + "logits/rejected": -0.45753219723701477, + "logps/chosen": -67.63785552978516, + "logps/rejected": -109.61151123046875, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8343052864074707, + "rewards/margins": 5.41247034072876, + "rewards/rejected": -2.578165292739868, + "step": 9659 + }, + { + "epoch": 2.42, + "grad_norm": 5.122105598449707, + "learning_rate": 5.261221915665375e-06, + "logits/chosen": -0.40874385833740234, + "logits/rejected": -0.45627620816230774, + "logps/chosen": -82.9662094116211, + "logps/rejected": -96.72760772705078, + "loss": 0.775, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9187777042388916, + "rewards/margins": 5.524198055267334, + "rewards/rejected": -2.6054205894470215, + "step": 9660 + }, + { + "epoch": 2.42, + "grad_norm": 6.550150394439697, + "learning_rate": 5.2604369981869396e-06, + "logits/chosen": -0.5164527893066406, + "logits/rejected": -0.5271666049957275, + "logps/chosen": -47.47626495361328, + "logps/rejected": -97.89836883544922, + "loss": 0.7491, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6784865856170654, + "rewards/margins": 5.4534382820129395, + "rewards/rejected": -2.774951696395874, + "step": 9661 + }, + { + "epoch": 2.42, + "grad_norm": 5.358822822570801, + "learning_rate": 5.2596520742728295e-06, + "logits/chosen": -0.5529707074165344, + "logits/rejected": -0.6101694107055664, + "logps/chosen": -52.15878677368164, + "logps/rejected": -87.35612487792969, + "loss": 0.7295, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0456738471984863, + "rewards/margins": 5.4894819259643555, + "rewards/rejected": -2.443808078765869, + "step": 9662 + }, + { + "epoch": 2.42, + "grad_norm": 12.061979293823242, + "learning_rate": 5.258867143942437e-06, + "logits/chosen": -0.5585845112800598, + "logits/rejected": -0.5838624835014343, + "logps/chosen": -56.534122467041016, + "logps/rejected": -115.1772232055664, + "loss": 0.737, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2276418209075928, + "rewards/margins": 6.280033111572266, + "rewards/rejected": -3.052391529083252, + "step": 9663 + }, + { + "epoch": 2.42, + "grad_norm": 9.815714836120605, + "learning_rate": 5.258082207215166e-06, + "logits/chosen": -0.4926661550998688, + "logits/rejected": -0.5425030589103699, + "logps/chosen": -55.039039611816406, + "logps/rejected": -94.45306396484375, + "loss": 0.7669, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.064032793045044, + "rewards/margins": 4.9941582679748535, + "rewards/rejected": -1.9301254749298096, + "step": 9664 + }, + { + "epoch": 2.42, + "grad_norm": 5.057228088378906, + "learning_rate": 5.257297264110407e-06, + "logits/chosen": -0.4290132224559784, + "logits/rejected": -0.5153483152389526, + "logps/chosen": -56.001102447509766, + "logps/rejected": -82.04920959472656, + "loss": 0.6787, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.077310562133789, + "rewards/margins": 5.439932823181152, + "rewards/rejected": -2.362621784210205, + "step": 9665 + }, + { + "epoch": 2.42, + "grad_norm": 4.152174472808838, + "learning_rate": 5.256512314647559e-06, + "logits/chosen": -0.5505064725875854, + "logits/rejected": -0.5876748561859131, + "logps/chosen": -51.31962203979492, + "logps/rejected": -88.88018035888672, + "loss": 0.6987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0194733142852783, + "rewards/margins": 5.408937454223633, + "rewards/rejected": -2.3894636631011963, + "step": 9666 + }, + { + "epoch": 2.42, + "grad_norm": 7.386268615722656, + "learning_rate": 5.255727358846019e-06, + "logits/chosen": -0.45403608679771423, + "logits/rejected": -0.5511526465415955, + "logps/chosen": -79.7935791015625, + "logps/rejected": -91.23078918457031, + "loss": 0.7808, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.023010015487671, + "rewards/margins": 5.42091178894043, + "rewards/rejected": -2.3979015350341797, + "step": 9667 + }, + { + "epoch": 2.42, + "grad_norm": 4.223948001861572, + "learning_rate": 5.254942396725186e-06, + "logits/chosen": -0.4950419068336487, + "logits/rejected": -0.5298836827278137, + "logps/chosen": -60.400962829589844, + "logps/rejected": -118.19406127929688, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0705504417419434, + "rewards/margins": 6.174978256225586, + "rewards/rejected": -3.1044280529022217, + "step": 9668 + }, + { + "epoch": 2.42, + "grad_norm": 4.042781352996826, + "learning_rate": 5.254157428304453e-06, + "logits/chosen": -0.44632381200790405, + "logits/rejected": -0.5472968816757202, + "logps/chosen": -62.55029296875, + "logps/rejected": -86.79417419433594, + "loss": 0.7369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0605578422546387, + "rewards/margins": 5.530989646911621, + "rewards/rejected": -2.4704320430755615, + "step": 9669 + }, + { + "epoch": 2.42, + "grad_norm": 3.489337205886841, + "learning_rate": 5.253372453603219e-06, + "logits/chosen": -0.550910234451294, + "logits/rejected": -0.5831515192985535, + "logps/chosen": -47.04791259765625, + "logps/rejected": -92.87297058105469, + "loss": 0.6732, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.046335458755493, + "rewards/margins": 5.573220252990723, + "rewards/rejected": -2.5268850326538086, + "step": 9670 + }, + { + "epoch": 2.42, + "grad_norm": 5.741044521331787, + "learning_rate": 5.252587472640884e-06, + "logits/chosen": -0.4422210156917572, + "logits/rejected": -0.5244122743606567, + "logps/chosen": -71.11449432373047, + "logps/rejected": -88.7589111328125, + "loss": 0.7558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6022045612335205, + "rewards/margins": 5.510591506958008, + "rewards/rejected": -2.9083869457244873, + "step": 9671 + }, + { + "epoch": 2.42, + "grad_norm": 7.021687984466553, + "learning_rate": 5.251802485436842e-06, + "logits/chosen": -0.5564576983451843, + "logits/rejected": -0.6239791512489319, + "logps/chosen": -56.54724884033203, + "logps/rejected": -98.30194091796875, + "loss": 0.6319, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9082486629486084, + "rewards/margins": 5.629047870635986, + "rewards/rejected": -2.720798969268799, + "step": 9672 + }, + { + "epoch": 2.42, + "grad_norm": 4.710987091064453, + "learning_rate": 5.251017492010495e-06, + "logits/chosen": -0.42492902278900146, + "logits/rejected": -0.5188309550285339, + "logps/chosen": -57.484859466552734, + "logps/rejected": -83.8176040649414, + "loss": 0.6824, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.93489933013916, + "rewards/margins": 6.1081461906433105, + "rewards/rejected": -3.173246383666992, + "step": 9673 + }, + { + "epoch": 2.42, + "grad_norm": 4.599882125854492, + "learning_rate": 5.250232492381237e-06, + "logits/chosen": -0.543175220489502, + "logits/rejected": -0.5626814365386963, + "logps/chosen": -57.769710540771484, + "logps/rejected": -89.66696166992188, + "loss": 0.7429, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0116915702819824, + "rewards/margins": 4.993736743927002, + "rewards/rejected": -1.9820454120635986, + "step": 9674 + }, + { + "epoch": 2.42, + "grad_norm": 3.1787047386169434, + "learning_rate": 5.249447486568469e-06, + "logits/chosen": -0.4869841933250427, + "logits/rejected": -0.5210207104682922, + "logps/chosen": -54.86817932128906, + "logps/rejected": -101.99231719970703, + "loss": 0.6797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.142521858215332, + "rewards/margins": 5.193477630615234, + "rewards/rejected": -2.0509564876556396, + "step": 9675 + }, + { + "epoch": 2.42, + "grad_norm": 6.966814041137695, + "learning_rate": 5.248662474591588e-06, + "logits/chosen": -0.4990062117576599, + "logits/rejected": -0.6310594081878662, + "logps/chosen": -65.83541870117188, + "logps/rejected": -74.18498229980469, + "loss": 0.6605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1959404945373535, + "rewards/margins": 5.846190929412842, + "rewards/rejected": -2.650250196456909, + "step": 9676 + }, + { + "epoch": 2.42, + "grad_norm": 2.913646936416626, + "learning_rate": 5.247877456469992e-06, + "logits/chosen": -0.4188530445098877, + "logits/rejected": -0.5437304973602295, + "logps/chosen": -70.95709228515625, + "logps/rejected": -113.21821594238281, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.024618625640869, + "rewards/margins": 6.295736312866211, + "rewards/rejected": -3.2711181640625, + "step": 9677 + }, + { + "epoch": 2.42, + "grad_norm": 5.715662956237793, + "learning_rate": 5.247092432223081e-06, + "logits/chosen": -0.425824373960495, + "logits/rejected": -0.5230069160461426, + "logps/chosen": -61.54297637939453, + "logps/rejected": -91.10951232910156, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7473084926605225, + "rewards/margins": 5.961905479431152, + "rewards/rejected": -3.214596748352051, + "step": 9678 + }, + { + "epoch": 2.42, + "grad_norm": 6.4831414222717285, + "learning_rate": 5.246307401870253e-06, + "logits/chosen": -0.5440998077392578, + "logits/rejected": -0.6356906294822693, + "logps/chosen": -43.562286376953125, + "logps/rejected": -89.32485961914062, + "loss": 0.6098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9380717277526855, + "rewards/margins": 5.924991607666016, + "rewards/rejected": -2.986920118331909, + "step": 9679 + }, + { + "epoch": 2.42, + "grad_norm": 6.9655985832214355, + "learning_rate": 5.245522365430907e-06, + "logits/chosen": -0.589504599571228, + "logits/rejected": -0.6250979900360107, + "logps/chosen": -53.24519729614258, + "logps/rejected": -90.90225219726562, + "loss": 0.8087, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8254733085632324, + "rewards/margins": 4.512827396392822, + "rewards/rejected": -1.6873539686203003, + "step": 9680 + }, + { + "epoch": 2.42, + "grad_norm": 4.224679946899414, + "learning_rate": 5.244737322924443e-06, + "logits/chosen": -0.5479572415351868, + "logits/rejected": -0.577666163444519, + "logps/chosen": -45.505157470703125, + "logps/rejected": -101.84688568115234, + "loss": 0.7055, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2700095176696777, + "rewards/margins": 6.065098285675049, + "rewards/rejected": -2.795088291168213, + "step": 9681 + }, + { + "epoch": 2.42, + "grad_norm": 8.398341178894043, + "learning_rate": 5.24395227437026e-06, + "logits/chosen": -0.49248021841049194, + "logits/rejected": -0.5405929088592529, + "logps/chosen": -55.551422119140625, + "logps/rejected": -105.84522247314453, + "loss": 0.7553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.018862724304199, + "rewards/margins": 6.127811908721924, + "rewards/rejected": -3.108949661254883, + "step": 9682 + }, + { + "epoch": 2.42, + "grad_norm": 12.092424392700195, + "learning_rate": 5.243167219787755e-06, + "logits/chosen": -0.5426995754241943, + "logits/rejected": -0.5796908736228943, + "logps/chosen": -66.04682922363281, + "logps/rejected": -99.24275207519531, + "loss": 0.8611, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.030707359313965, + "rewards/margins": 5.766395092010498, + "rewards/rejected": -2.7356879711151123, + "step": 9683 + }, + { + "epoch": 2.42, + "grad_norm": 11.787640571594238, + "learning_rate": 5.24238215919633e-06, + "logits/chosen": -0.4695088565349579, + "logits/rejected": -0.545721709728241, + "logps/chosen": -57.67628479003906, + "logps/rejected": -102.29693603515625, + "loss": 0.7691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9557623863220215, + "rewards/margins": 5.960660934448242, + "rewards/rejected": -3.0048983097076416, + "step": 9684 + }, + { + "epoch": 2.42, + "grad_norm": 4.559625625610352, + "learning_rate": 5.241597092615385e-06, + "logits/chosen": -0.5163739919662476, + "logits/rejected": -0.5682525038719177, + "logps/chosen": -50.538814544677734, + "logps/rejected": -88.64060974121094, + "loss": 0.7233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.807805299758911, + "rewards/margins": 5.622740745544434, + "rewards/rejected": -2.8149349689483643, + "step": 9685 + }, + { + "epoch": 2.42, + "grad_norm": 4.21942663192749, + "learning_rate": 5.240812020064317e-06, + "logits/chosen": -0.5373870134353638, + "logits/rejected": -0.5867627859115601, + "logps/chosen": -54.2335205078125, + "logps/rejected": -98.91572570800781, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.194519281387329, + "rewards/margins": 5.172146797180176, + "rewards/rejected": -1.977626919746399, + "step": 9686 + }, + { + "epoch": 2.42, + "grad_norm": 3.7816550731658936, + "learning_rate": 5.240026941562529e-06, + "logits/chosen": -0.5669936537742615, + "logits/rejected": -0.6785550117492676, + "logps/chosen": -49.113773345947266, + "logps/rejected": -104.23506164550781, + "loss": 0.6145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1810808181762695, + "rewards/margins": 7.6597466468811035, + "rewards/rejected": -4.478665351867676, + "step": 9687 + }, + { + "epoch": 2.42, + "grad_norm": 5.065694332122803, + "learning_rate": 5.23924185712942e-06, + "logits/chosen": -0.58327317237854, + "logits/rejected": -0.6447809338569641, + "logps/chosen": -57.70112609863281, + "logps/rejected": -105.8842544555664, + "loss": 0.7083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.895082473754883, + "rewards/margins": 6.204423904418945, + "rewards/rejected": -3.3093416690826416, + "step": 9688 + }, + { + "epoch": 2.42, + "grad_norm": 16.259437561035156, + "learning_rate": 5.23845676678439e-06, + "logits/chosen": -0.546454131603241, + "logits/rejected": -0.6342325806617737, + "logps/chosen": -49.73567199707031, + "logps/rejected": -88.49718475341797, + "loss": 0.6551, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0565547943115234, + "rewards/margins": 6.014865875244141, + "rewards/rejected": -2.958310127258301, + "step": 9689 + }, + { + "epoch": 2.42, + "grad_norm": 4.5969462394714355, + "learning_rate": 5.237671670546839e-06, + "logits/chosen": -0.557424783706665, + "logits/rejected": -0.6568438410758972, + "logps/chosen": -52.91655349731445, + "logps/rejected": -90.25559997558594, + "loss": 0.5647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.411973237991333, + "rewards/margins": 5.973979949951172, + "rewards/rejected": -2.562006950378418, + "step": 9690 + }, + { + "epoch": 2.42, + "grad_norm": 6.132176399230957, + "learning_rate": 5.2368865684361705e-06, + "logits/chosen": -0.5675622820854187, + "logits/rejected": -0.6443638205528259, + "logps/chosen": -49.96022033691406, + "logps/rejected": -93.82731628417969, + "loss": 0.6671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0428919792175293, + "rewards/margins": 6.447420120239258, + "rewards/rejected": -3.4045283794403076, + "step": 9691 + }, + { + "epoch": 2.42, + "grad_norm": 3.5013558864593506, + "learning_rate": 5.236101460471781e-06, + "logits/chosen": -0.5453351736068726, + "logits/rejected": -0.6224992871284485, + "logps/chosen": -61.245849609375, + "logps/rejected": -100.48622131347656, + "loss": 0.6315, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.907564878463745, + "rewards/margins": 6.488687992095947, + "rewards/rejected": -3.581122398376465, + "step": 9692 + }, + { + "epoch": 2.42, + "grad_norm": 2.852322578430176, + "learning_rate": 5.235316346673073e-06, + "logits/chosen": -0.5102962255477905, + "logits/rejected": -0.5711106657981873, + "logps/chosen": -45.69066619873047, + "logps/rejected": -94.18940734863281, + "loss": 0.5862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085221529006958, + "rewards/margins": 6.628149032592773, + "rewards/rejected": -3.5429275035858154, + "step": 9693 + }, + { + "epoch": 2.43, + "grad_norm": 4.271043300628662, + "learning_rate": 5.234531227059449e-06, + "logits/chosen": -0.5053786039352417, + "logits/rejected": -0.524481475353241, + "logps/chosen": -52.79605484008789, + "logps/rejected": -124.07420349121094, + "loss": 0.6622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2971863746643066, + "rewards/margins": 7.161914348602295, + "rewards/rejected": -3.8647282123565674, + "step": 9694 + }, + { + "epoch": 2.43, + "grad_norm": 13.695417404174805, + "learning_rate": 5.233746101650308e-06, + "logits/chosen": -0.48730599880218506, + "logits/rejected": -0.5019190311431885, + "logps/chosen": -53.032737731933594, + "logps/rejected": -129.13330078125, + "loss": 0.6794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0658531188964844, + "rewards/margins": 7.460720062255859, + "rewards/rejected": -4.394866466522217, + "step": 9695 + }, + { + "epoch": 2.43, + "grad_norm": 4.265527248382568, + "learning_rate": 5.232960970465053e-06, + "logits/chosen": -0.544175386428833, + "logits/rejected": -0.588162899017334, + "logps/chosen": -56.37665557861328, + "logps/rejected": -108.47637939453125, + "loss": 0.6939, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8749654293060303, + "rewards/margins": 6.426041603088379, + "rewards/rejected": -3.5510764122009277, + "step": 9696 + }, + { + "epoch": 2.43, + "grad_norm": 10.136622428894043, + "learning_rate": 5.232175833523083e-06, + "logits/chosen": -0.5165257453918457, + "logits/rejected": -0.5762124061584473, + "logps/chosen": -52.64104461669922, + "logps/rejected": -96.08494567871094, + "loss": 0.6666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0241169929504395, + "rewards/margins": 5.612912654876709, + "rewards/rejected": -2.5887956619262695, + "step": 9697 + }, + { + "epoch": 2.43, + "grad_norm": 7.232222557067871, + "learning_rate": 5.231390690843803e-06, + "logits/chosen": -0.5260968208312988, + "logits/rejected": -0.6442788243293762, + "logps/chosen": -54.263797760009766, + "logps/rejected": -94.03384399414062, + "loss": 0.6425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.273935556411743, + "rewards/margins": 7.092926979064941, + "rewards/rejected": -3.818991184234619, + "step": 9698 + }, + { + "epoch": 2.43, + "grad_norm": 3.1363260746002197, + "learning_rate": 5.230605542446612e-06, + "logits/chosen": -0.5106559991836548, + "logits/rejected": -0.5858757495880127, + "logps/chosen": -61.54014587402344, + "logps/rejected": -99.65286254882812, + "loss": 0.6807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2877197265625, + "rewards/margins": 5.778560161590576, + "rewards/rejected": -2.4908406734466553, + "step": 9699 + }, + { + "epoch": 2.43, + "grad_norm": 6.228036880493164, + "learning_rate": 5.229820388350913e-06, + "logits/chosen": -0.48236989974975586, + "logits/rejected": -0.5875406861305237, + "logps/chosen": -53.715633392333984, + "logps/rejected": -74.9660415649414, + "loss": 0.6205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.00077748298645, + "rewards/margins": 5.367582321166992, + "rewards/rejected": -2.366804838180542, + "step": 9700 + }, + { + "epoch": 2.43, + "grad_norm": 5.004096508026123, + "learning_rate": 5.229035228576108e-06, + "logits/chosen": -0.5108515024185181, + "logits/rejected": -0.5652517676353455, + "logps/chosen": -55.221248626708984, + "logps/rejected": -115.05421447753906, + "loss": 0.6457, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9499573707580566, + "rewards/margins": 7.075850009918213, + "rewards/rejected": -4.1258931159973145, + "step": 9701 + }, + { + "epoch": 2.43, + "grad_norm": 38.13581085205078, + "learning_rate": 5.228250063141599e-06, + "logits/chosen": -0.5695440769195557, + "logits/rejected": -0.5699690580368042, + "logps/chosen": -60.216163635253906, + "logps/rejected": -116.66407012939453, + "loss": 0.775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.820324659347534, + "rewards/margins": 5.668684005737305, + "rewards/rejected": -2.8483595848083496, + "step": 9702 + }, + { + "epoch": 2.43, + "grad_norm": 7.671724319458008, + "learning_rate": 5.227464892066788e-06, + "logits/chosen": -0.5644313097000122, + "logits/rejected": -0.6408121585845947, + "logps/chosen": -54.12124252319336, + "logps/rejected": -98.33004760742188, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.922565460205078, + "rewards/margins": 6.475039482116699, + "rewards/rejected": -3.552473545074463, + "step": 9703 + }, + { + "epoch": 2.43, + "grad_norm": 31.101947784423828, + "learning_rate": 5.226679715371077e-06, + "logits/chosen": -0.4626438617706299, + "logits/rejected": -0.530299186706543, + "logps/chosen": -56.77683639526367, + "logps/rejected": -87.98817443847656, + "loss": 0.6868, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8553996086120605, + "rewards/margins": 5.96373987197876, + "rewards/rejected": -3.108340263366699, + "step": 9704 + }, + { + "epoch": 2.43, + "grad_norm": 2.1318838596343994, + "learning_rate": 5.225894533073871e-06, + "logits/chosen": -0.629235565662384, + "logits/rejected": -0.7261197566986084, + "logps/chosen": -50.240203857421875, + "logps/rejected": -90.38683319091797, + "loss": 0.5946, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.079580783843994, + "rewards/margins": 7.039206027984619, + "rewards/rejected": -3.959625720977783, + "step": 9705 + }, + { + "epoch": 2.43, + "grad_norm": 13.61019229888916, + "learning_rate": 5.225109345194569e-06, + "logits/chosen": -0.5437025427818298, + "logits/rejected": -0.6142574548721313, + "logps/chosen": -45.383541107177734, + "logps/rejected": -82.70372772216797, + "loss": 0.69, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.78475284576416, + "rewards/margins": 5.317347526550293, + "rewards/rejected": -2.532594680786133, + "step": 9706 + }, + { + "epoch": 2.43, + "grad_norm": 6.347746849060059, + "learning_rate": 5.224324151752575e-06, + "logits/chosen": -0.5212507247924805, + "logits/rejected": -0.6059587001800537, + "logps/chosen": -57.185672760009766, + "logps/rejected": -87.35696411132812, + "loss": 0.7319, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8745038509368896, + "rewards/margins": 4.026838779449463, + "rewards/rejected": -1.1523351669311523, + "step": 9707 + }, + { + "epoch": 2.43, + "grad_norm": 5.648055076599121, + "learning_rate": 5.223538952767294e-06, + "logits/chosen": -0.4814040958881378, + "logits/rejected": -0.5537830591201782, + "logps/chosen": -63.04795455932617, + "logps/rejected": -105.19844818115234, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.924363136291504, + "rewards/margins": 6.780807971954346, + "rewards/rejected": -3.8564453125, + "step": 9708 + }, + { + "epoch": 2.43, + "grad_norm": 6.66942024230957, + "learning_rate": 5.222753748258127e-06, + "logits/chosen": -0.49799078702926636, + "logits/rejected": -0.508611798286438, + "logps/chosen": -50.878662109375, + "logps/rejected": -118.74404907226562, + "loss": 0.6227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3122167587280273, + "rewards/margins": 6.80106258392334, + "rewards/rejected": -3.4888460636138916, + "step": 9709 + }, + { + "epoch": 2.43, + "grad_norm": 5.17341947555542, + "learning_rate": 5.221968538244479e-06, + "logits/chosen": -0.4492035508155823, + "logits/rejected": -0.5823288559913635, + "logps/chosen": -60.11599349975586, + "logps/rejected": -90.37886047363281, + "loss": 0.7169, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.95061993598938, + "rewards/margins": 5.594146728515625, + "rewards/rejected": -2.643526554107666, + "step": 9710 + }, + { + "epoch": 2.43, + "grad_norm": 6.556915760040283, + "learning_rate": 5.221183322745752e-06, + "logits/chosen": -0.5316631197929382, + "logits/rejected": -0.6334466934204102, + "logps/chosen": -57.41135025024414, + "logps/rejected": -91.39082336425781, + "loss": 0.6542, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1431047916412354, + "rewards/margins": 5.792037487030029, + "rewards/rejected": -2.648932695388794, + "step": 9711 + }, + { + "epoch": 2.43, + "grad_norm": 6.6820387840271, + "learning_rate": 5.220398101781349e-06, + "logits/chosen": -0.5643036961555481, + "logits/rejected": -0.6238920092582703, + "logps/chosen": -55.78688430786133, + "logps/rejected": -100.64492797851562, + "loss": 0.7583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9608092308044434, + "rewards/margins": 5.07951021194458, + "rewards/rejected": -2.1187002658843994, + "step": 9712 + }, + { + "epoch": 2.43, + "grad_norm": 3.753169298171997, + "learning_rate": 5.2196128753706745e-06, + "logits/chosen": -0.4407179653644562, + "logits/rejected": -0.562849760055542, + "logps/chosen": -62.4612922668457, + "logps/rejected": -87.00674438476562, + "loss": 0.6164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2756357192993164, + "rewards/margins": 6.348224639892578, + "rewards/rejected": -3.07258939743042, + "step": 9713 + }, + { + "epoch": 2.43, + "grad_norm": 4.471967697143555, + "learning_rate": 5.218827643533133e-06, + "logits/chosen": -0.524824321269989, + "logits/rejected": -0.6350629329681396, + "logps/chosen": -57.29320526123047, + "logps/rejected": -88.0583267211914, + "loss": 0.6645, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.051270008087158, + "rewards/margins": 5.630290508270264, + "rewards/rejected": -2.579019784927368, + "step": 9714 + }, + { + "epoch": 2.43, + "grad_norm": 6.846478462219238, + "learning_rate": 5.2180424062881265e-06, + "logits/chosen": -0.4204777479171753, + "logits/rejected": -0.5050552487373352, + "logps/chosen": -54.36827850341797, + "logps/rejected": -87.37593078613281, + "loss": 0.6466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4088902473449707, + "rewards/margins": 5.65896463394165, + "rewards/rejected": -2.2500739097595215, + "step": 9715 + }, + { + "epoch": 2.43, + "grad_norm": 4.593559265136719, + "learning_rate": 5.217257163655062e-06, + "logits/chosen": -0.4853969216346741, + "logits/rejected": -0.5389629602432251, + "logps/chosen": -57.645896911621094, + "logps/rejected": -90.43836975097656, + "loss": 0.6209, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.400120735168457, + "rewards/margins": 6.067842960357666, + "rewards/rejected": -2.667722702026367, + "step": 9716 + }, + { + "epoch": 2.43, + "grad_norm": 7.354694366455078, + "learning_rate": 5.216471915653341e-06, + "logits/chosen": -0.5553368330001831, + "logits/rejected": -0.6591261625289917, + "logps/chosen": -61.71479415893555, + "logps/rejected": -94.4472885131836, + "loss": 0.7972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.041863441467285, + "rewards/margins": 5.220214366912842, + "rewards/rejected": -2.1783506870269775, + "step": 9717 + }, + { + "epoch": 2.43, + "grad_norm": 4.965556621551514, + "learning_rate": 5.215686662302368e-06, + "logits/chosen": -0.536319375038147, + "logits/rejected": -0.6122481822967529, + "logps/chosen": -57.84154510498047, + "logps/rejected": -101.9339370727539, + "loss": 0.7211, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9079408645629883, + "rewards/margins": 6.416499137878418, + "rewards/rejected": -3.508558511734009, + "step": 9718 + }, + { + "epoch": 2.43, + "grad_norm": 3.7344698905944824, + "learning_rate": 5.214901403621551e-06, + "logits/chosen": -0.6185022592544556, + "logits/rejected": -0.6952151656150818, + "logps/chosen": -47.49701690673828, + "logps/rejected": -90.9870834350586, + "loss": 0.6211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.294830083847046, + "rewards/margins": 6.692177772521973, + "rewards/rejected": -3.3973476886749268, + "step": 9719 + }, + { + "epoch": 2.43, + "grad_norm": 4.5884857177734375, + "learning_rate": 5.214116139630288e-06, + "logits/chosen": -0.4389793872833252, + "logits/rejected": -0.5332316756248474, + "logps/chosen": -53.35939407348633, + "logps/rejected": -99.54048919677734, + "loss": 0.6463, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.69726824760437, + "rewards/margins": 5.2110419273376465, + "rewards/rejected": -2.513773202896118, + "step": 9720 + }, + { + "epoch": 2.43, + "grad_norm": 7.350624084472656, + "learning_rate": 5.213330870347989e-06, + "logits/chosen": -0.47818440198898315, + "logits/rejected": -0.534264087677002, + "logps/chosen": -52.459693908691406, + "logps/rejected": -92.0441665649414, + "loss": 0.6316, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.419222593307495, + "rewards/margins": 5.633703231811523, + "rewards/rejected": -2.2144813537597656, + "step": 9721 + }, + { + "epoch": 2.43, + "grad_norm": 6.068338394165039, + "learning_rate": 5.212545595794058e-06, + "logits/chosen": -0.5062796473503113, + "logits/rejected": -0.5955349206924438, + "logps/chosen": -43.6562385559082, + "logps/rejected": -82.9031753540039, + "loss": 0.5789, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.863893508911133, + "rewards/margins": 5.617321014404297, + "rewards/rejected": -2.753427267074585, + "step": 9722 + }, + { + "epoch": 2.43, + "grad_norm": 3.8644258975982666, + "learning_rate": 5.211760315987898e-06, + "logits/chosen": -0.6022872924804688, + "logits/rejected": -0.6537929177284241, + "logps/chosen": -51.227577209472656, + "logps/rejected": -98.1727523803711, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5574703216552734, + "rewards/margins": 6.604560852050781, + "rewards/rejected": -3.0470898151397705, + "step": 9723 + }, + { + "epoch": 2.43, + "grad_norm": 4.88829231262207, + "learning_rate": 5.210975030948916e-06, + "logits/chosen": -0.6017434000968933, + "logits/rejected": -0.6928600072860718, + "logps/chosen": -49.098480224609375, + "logps/rejected": -80.74528503417969, + "loss": 0.677, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.857447862625122, + "rewards/margins": 5.798352241516113, + "rewards/rejected": -2.9409048557281494, + "step": 9724 + }, + { + "epoch": 2.43, + "grad_norm": 4.422869682312012, + "learning_rate": 5.210189740696517e-06, + "logits/chosen": -0.5110276937484741, + "logits/rejected": -0.5674888491630554, + "logps/chosen": -46.202796936035156, + "logps/rejected": -119.53788757324219, + "loss": 0.6256, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.865208387374878, + "rewards/margins": 6.96270227432251, + "rewards/rejected": -4.097494125366211, + "step": 9725 + }, + { + "epoch": 2.43, + "grad_norm": 4.8476996421813965, + "learning_rate": 5.209404445250105e-06, + "logits/chosen": -0.5887481570243835, + "logits/rejected": -0.6799862384796143, + "logps/chosen": -44.451087951660156, + "logps/rejected": -88.7706069946289, + "loss": 0.6326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9834656715393066, + "rewards/margins": 5.102666854858398, + "rewards/rejected": -2.11920166015625, + "step": 9726 + }, + { + "epoch": 2.43, + "grad_norm": 5.930055618286133, + "learning_rate": 5.208619144629087e-06, + "logits/chosen": -0.5503285527229309, + "logits/rejected": -0.6440850496292114, + "logps/chosen": -65.889404296875, + "logps/rejected": -94.58413696289062, + "loss": 0.7202, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.792940378189087, + "rewards/margins": 6.055121421813965, + "rewards/rejected": -3.262181520462036, + "step": 9727 + }, + { + "epoch": 2.43, + "grad_norm": 9.084423065185547, + "learning_rate": 5.207833838852868e-06, + "logits/chosen": -0.5113925933837891, + "logits/rejected": -0.5813068747520447, + "logps/chosen": -68.29299926757812, + "logps/rejected": -116.76637268066406, + "loss": 0.6791, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.626689910888672, + "rewards/margins": 6.902795314788818, + "rewards/rejected": -4.276105880737305, + "step": 9728 + }, + { + "epoch": 2.43, + "grad_norm": 5.015713691711426, + "learning_rate": 5.2070485279408535e-06, + "logits/chosen": -0.4573175311088562, + "logits/rejected": -0.5879695415496826, + "logps/chosen": -73.03919219970703, + "logps/rejected": -90.1152572631836, + "loss": 0.6583, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.399930953979492, + "rewards/margins": 5.335495948791504, + "rewards/rejected": -2.9355649948120117, + "step": 9729 + }, + { + "epoch": 2.43, + "grad_norm": 5.909056186676025, + "learning_rate": 5.20626321191245e-06, + "logits/chosen": -0.5171079635620117, + "logits/rejected": -0.5513205528259277, + "logps/chosen": -55.45945739746094, + "logps/rejected": -104.12179565429688, + "loss": 0.766, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904087543487549, + "rewards/margins": 5.074000835418701, + "rewards/rejected": -2.169912815093994, + "step": 9730 + }, + { + "epoch": 2.43, + "grad_norm": 9.75105094909668, + "learning_rate": 5.205477890787063e-06, + "logits/chosen": -0.575707197189331, + "logits/rejected": -0.613525390625, + "logps/chosen": -54.64554977416992, + "logps/rejected": -104.09065246582031, + "loss": 0.831, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8455543518066406, + "rewards/margins": 5.11728048324585, + "rewards/rejected": -2.271726369857788, + "step": 9731 + }, + { + "epoch": 2.43, + "grad_norm": 10.97441291809082, + "learning_rate": 5.204692564584099e-06, + "logits/chosen": -0.5304774641990662, + "logits/rejected": -0.6548876166343689, + "logps/chosen": -51.74109649658203, + "logps/rejected": -67.55207824707031, + "loss": 0.7039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0962884426116943, + "rewards/margins": 6.037428379058838, + "rewards/rejected": -2.941140651702881, + "step": 9732 + }, + { + "epoch": 2.43, + "grad_norm": 4.999787330627441, + "learning_rate": 5.203907233322963e-06, + "logits/chosen": -0.5464276075363159, + "logits/rejected": -0.6167023181915283, + "logps/chosen": -52.94195556640625, + "logps/rejected": -104.90254211425781, + "loss": 0.5758, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1283020973205566, + "rewards/margins": 6.983904838562012, + "rewards/rejected": -3.8556032180786133, + "step": 9733 + }, + { + "epoch": 2.44, + "grad_norm": 11.163308143615723, + "learning_rate": 5.203121897023064e-06, + "logits/chosen": -0.5552050471305847, + "logits/rejected": -0.615993320941925, + "logps/chosen": -53.48820114135742, + "logps/rejected": -107.78240203857422, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1255626678466797, + "rewards/margins": 6.546215057373047, + "rewards/rejected": -3.420652389526367, + "step": 9734 + }, + { + "epoch": 2.44, + "grad_norm": 4.499764919281006, + "learning_rate": 5.202336555703806e-06, + "logits/chosen": -0.5214238166809082, + "logits/rejected": -0.5907201170921326, + "logps/chosen": -52.06774139404297, + "logps/rejected": -100.99069213867188, + "loss": 0.6813, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0839898586273193, + "rewards/margins": 6.314286708831787, + "rewards/rejected": -3.230297327041626, + "step": 9735 + }, + { + "epoch": 2.44, + "grad_norm": 6.469022750854492, + "learning_rate": 5.201551209384597e-06, + "logits/chosen": -0.5393497943878174, + "logits/rejected": -0.669570803642273, + "logps/chosen": -62.63595199584961, + "logps/rejected": -91.64497375488281, + "loss": 0.6712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.778282642364502, + "rewards/margins": 5.942885875701904, + "rewards/rejected": -3.1646034717559814, + "step": 9736 + }, + { + "epoch": 2.44, + "grad_norm": 3.323702335357666, + "learning_rate": 5.200765858084843e-06, + "logits/chosen": -0.5898317098617554, + "logits/rejected": -0.6679443717002869, + "logps/chosen": -63.89474868774414, + "logps/rejected": -95.25402069091797, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.63212513923645, + "rewards/margins": 6.734719753265381, + "rewards/rejected": -4.102593898773193, + "step": 9737 + }, + { + "epoch": 2.44, + "grad_norm": 5.374808311462402, + "learning_rate": 5.199980501823949e-06, + "logits/chosen": -0.46610158681869507, + "logits/rejected": -0.5228604674339294, + "logps/chosen": -51.05387878417969, + "logps/rejected": -90.73200988769531, + "loss": 0.6794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.977512836456299, + "rewards/margins": 4.654447078704834, + "rewards/rejected": -1.6769344806671143, + "step": 9738 + }, + { + "epoch": 2.44, + "grad_norm": 5.441516876220703, + "learning_rate": 5.199195140621327e-06, + "logits/chosen": -0.5141956806182861, + "logits/rejected": -0.5637194514274597, + "logps/chosen": -48.86106491088867, + "logps/rejected": -110.3598861694336, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7447092533111572, + "rewards/margins": 6.395660877227783, + "rewards/rejected": -3.650951862335205, + "step": 9739 + }, + { + "epoch": 2.44, + "grad_norm": 5.178539276123047, + "learning_rate": 5.19840977449638e-06, + "logits/chosen": -0.5171482563018799, + "logits/rejected": -0.5885752439498901, + "logps/chosen": -51.671058654785156, + "logps/rejected": -96.7982177734375, + "loss": 0.7206, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.816251277923584, + "rewards/margins": 4.9103899002075195, + "rewards/rejected": -2.0941390991210938, + "step": 9740 + }, + { + "epoch": 2.44, + "grad_norm": 3.096322774887085, + "learning_rate": 5.1976244034685165e-06, + "logits/chosen": -0.5924715399742126, + "logits/rejected": -0.6710178256034851, + "logps/chosen": -50.31010818481445, + "logps/rejected": -102.72406768798828, + "loss": 0.622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2603423595428467, + "rewards/margins": 6.876274108886719, + "rewards/rejected": -3.615931510925293, + "step": 9741 + }, + { + "epoch": 2.44, + "grad_norm": 3.1944351196289062, + "learning_rate": 5.1968390275571446e-06, + "logits/chosen": -0.5275503993034363, + "logits/rejected": -0.5924580693244934, + "logps/chosen": -51.0311164855957, + "logps/rejected": -109.8364486694336, + "loss": 0.623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0053038597106934, + "rewards/margins": 6.4356255531311035, + "rewards/rejected": -3.430321455001831, + "step": 9742 + }, + { + "epoch": 2.44, + "grad_norm": 6.4336957931518555, + "learning_rate": 5.19605364678167e-06, + "logits/chosen": -0.4566406309604645, + "logits/rejected": -0.5365264415740967, + "logps/chosen": -51.105796813964844, + "logps/rejected": -107.90143585205078, + "loss": 0.6467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1959872245788574, + "rewards/margins": 6.9289374351501465, + "rewards/rejected": -3.732950448989868, + "step": 9743 + }, + { + "epoch": 2.44, + "grad_norm": 7.179769515991211, + "learning_rate": 5.195268261161501e-06, + "logits/chosen": -0.5668691396713257, + "logits/rejected": -0.6205560564994812, + "logps/chosen": -58.0062370300293, + "logps/rejected": -86.37945556640625, + "loss": 0.9633, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9043798446655273, + "rewards/margins": 4.706854820251465, + "rewards/rejected": -1.8024746179580688, + "step": 9744 + }, + { + "epoch": 2.44, + "grad_norm": 8.870088577270508, + "learning_rate": 5.194482870716047e-06, + "logits/chosen": -0.4752305746078491, + "logits/rejected": -0.5512794256210327, + "logps/chosen": -67.8389892578125, + "logps/rejected": -97.9191665649414, + "loss": 0.7574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7999720573425293, + "rewards/margins": 5.615421295166016, + "rewards/rejected": -2.815448760986328, + "step": 9745 + }, + { + "epoch": 2.44, + "grad_norm": 4.129340171813965, + "learning_rate": 5.193697475464713e-06, + "logits/chosen": -0.43889129161834717, + "logits/rejected": -0.5363756418228149, + "logps/chosen": -62.67441177368164, + "logps/rejected": -88.6297836303711, + "loss": 0.651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0615975856781006, + "rewards/margins": 5.97047758102417, + "rewards/rejected": -2.9088797569274902, + "step": 9746 + }, + { + "epoch": 2.44, + "grad_norm": 7.043310165405273, + "learning_rate": 5.192912075426908e-06, + "logits/chosen": -0.4921639561653137, + "logits/rejected": -0.5544753074645996, + "logps/chosen": -53.38487243652344, + "logps/rejected": -101.66746520996094, + "loss": 0.6709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9813246726989746, + "rewards/margins": 5.955739498138428, + "rewards/rejected": -2.9744150638580322, + "step": 9747 + }, + { + "epoch": 2.44, + "grad_norm": 6.824847221374512, + "learning_rate": 5.192126670622042e-06, + "logits/chosen": -0.5025030374526978, + "logits/rejected": -0.6374559998512268, + "logps/chosen": -63.188690185546875, + "logps/rejected": -98.39096069335938, + "loss": 0.6797, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0297608375549316, + "rewards/margins": 5.863890171051025, + "rewards/rejected": -2.8341290950775146, + "step": 9748 + }, + { + "epoch": 2.44, + "grad_norm": 10.37076187133789, + "learning_rate": 5.191341261069521e-06, + "logits/chosen": -0.5343762636184692, + "logits/rejected": -0.6343126893043518, + "logps/chosen": -60.870853424072266, + "logps/rejected": -84.38928985595703, + "loss": 0.5649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.937462329864502, + "rewards/margins": 6.7151198387146, + "rewards/rejected": -3.7776570320129395, + "step": 9749 + }, + { + "epoch": 2.44, + "grad_norm": 4.547053813934326, + "learning_rate": 5.190555846788754e-06, + "logits/chosen": -0.5245451927185059, + "logits/rejected": -0.583927571773529, + "logps/chosen": -52.0157585144043, + "logps/rejected": -106.24359130859375, + "loss": 0.6112, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.866514205932617, + "rewards/margins": 5.692835330963135, + "rewards/rejected": -2.8263211250305176, + "step": 9750 + }, + { + "epoch": 2.44, + "grad_norm": 6.075235366821289, + "learning_rate": 5.189770427799149e-06, + "logits/chosen": -0.6853613257408142, + "logits/rejected": -0.7478563189506531, + "logps/chosen": -42.99766159057617, + "logps/rejected": -90.91249084472656, + "loss": 0.546, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.019951581954956, + "rewards/margins": 6.059764385223389, + "rewards/rejected": -3.039813280105591, + "step": 9751 + }, + { + "epoch": 2.44, + "grad_norm": 3.9397222995758057, + "learning_rate": 5.188985004120114e-06, + "logits/chosen": -0.49551379680633545, + "logits/rejected": -0.6049473285675049, + "logps/chosen": -62.298736572265625, + "logps/rejected": -84.61418914794922, + "loss": 0.6778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.724069595336914, + "rewards/margins": 5.290760517120361, + "rewards/rejected": -2.5666909217834473, + "step": 9752 + }, + { + "epoch": 2.44, + "grad_norm": 5.39784049987793, + "learning_rate": 5.1881995757710604e-06, + "logits/chosen": -0.5553082823753357, + "logits/rejected": -0.6383732557296753, + "logps/chosen": -58.01042556762695, + "logps/rejected": -79.88763427734375, + "loss": 0.7588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.949652910232544, + "rewards/margins": 5.645819187164307, + "rewards/rejected": -2.696166515350342, + "step": 9753 + }, + { + "epoch": 2.44, + "grad_norm": 14.360363006591797, + "learning_rate": 5.187414142771394e-06, + "logits/chosen": -0.46211373805999756, + "logits/rejected": -0.532156229019165, + "logps/chosen": -61.483482360839844, + "logps/rejected": -87.84373474121094, + "loss": 0.7566, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.680054187774658, + "rewards/margins": 5.3366923332214355, + "rewards/rejected": -2.6566381454467773, + "step": 9754 + }, + { + "epoch": 2.44, + "grad_norm": 5.177253246307373, + "learning_rate": 5.186628705140525e-06, + "logits/chosen": -0.43630439043045044, + "logits/rejected": -0.49788373708724976, + "logps/chosen": -54.016998291015625, + "logps/rejected": -91.08699035644531, + "loss": 0.6417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4832630157470703, + "rewards/margins": 6.162464618682861, + "rewards/rejected": -2.67920184135437, + "step": 9755 + }, + { + "epoch": 2.44, + "grad_norm": 11.365882873535156, + "learning_rate": 5.185843262897862e-06, + "logits/chosen": -0.5059875249862671, + "logits/rejected": -0.5977869033813477, + "logps/chosen": -55.705413818359375, + "logps/rejected": -88.4449462890625, + "loss": 0.6607, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.025632381439209, + "rewards/margins": 6.005291938781738, + "rewards/rejected": -2.9796600341796875, + "step": 9756 + }, + { + "epoch": 2.44, + "grad_norm": 23.702857971191406, + "learning_rate": 5.185057816062815e-06, + "logits/chosen": -0.44770416617393494, + "logits/rejected": -0.5230625867843628, + "logps/chosen": -56.549232482910156, + "logps/rejected": -84.82221984863281, + "loss": 0.7806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.820981979370117, + "rewards/margins": 4.792313575744629, + "rewards/rejected": -1.971331238746643, + "step": 9757 + }, + { + "epoch": 2.44, + "grad_norm": 6.258830547332764, + "learning_rate": 5.184272364654792e-06, + "logits/chosen": -0.5194594860076904, + "logits/rejected": -0.5655211210250854, + "logps/chosen": -54.376708984375, + "logps/rejected": -107.17639923095703, + "loss": 0.6826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7641184329986572, + "rewards/margins": 5.413835525512695, + "rewards/rejected": -2.649717330932617, + "step": 9758 + }, + { + "epoch": 2.44, + "grad_norm": 6.121893405914307, + "learning_rate": 5.183486908693202e-06, + "logits/chosen": -0.4835684299468994, + "logits/rejected": -0.5309960842132568, + "logps/chosen": -53.24787139892578, + "logps/rejected": -112.82230377197266, + "loss": 0.618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074920654296875, + "rewards/margins": 5.980528354644775, + "rewards/rejected": -2.9056079387664795, + "step": 9759 + }, + { + "epoch": 2.44, + "grad_norm": 2.682356595993042, + "learning_rate": 5.182701448197458e-06, + "logits/chosen": -0.4337875247001648, + "logits/rejected": -0.524803638458252, + "logps/chosen": -46.993927001953125, + "logps/rejected": -89.26187133789062, + "loss": 0.5226, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.166276454925537, + "rewards/margins": 6.251639366149902, + "rewards/rejected": -3.085362672805786, + "step": 9760 + }, + { + "epoch": 2.44, + "grad_norm": 2.999072790145874, + "learning_rate": 5.181915983186963e-06, + "logits/chosen": -0.4405686855316162, + "logits/rejected": -0.5331912040710449, + "logps/chosen": -61.69448471069336, + "logps/rejected": -91.34672546386719, + "loss": 0.6164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2176966667175293, + "rewards/margins": 6.4984283447265625, + "rewards/rejected": -3.2807319164276123, + "step": 9761 + }, + { + "epoch": 2.44, + "grad_norm": 9.955389976501465, + "learning_rate": 5.181130513681134e-06, + "logits/chosen": -0.46655964851379395, + "logits/rejected": -0.5678492188453674, + "logps/chosen": -50.76606750488281, + "logps/rejected": -89.77397155761719, + "loss": 0.6795, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9480550289154053, + "rewards/margins": 5.754264831542969, + "rewards/rejected": -2.8062095642089844, + "step": 9762 + }, + { + "epoch": 2.44, + "grad_norm": 3.6001179218292236, + "learning_rate": 5.180345039699376e-06, + "logits/chosen": -0.5459538698196411, + "logits/rejected": -0.5199012756347656, + "logps/chosen": -44.902381896972656, + "logps/rejected": -113.42401123046875, + "loss": 0.609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4123473167419434, + "rewards/margins": 6.519781112670898, + "rewards/rejected": -3.1074342727661133, + "step": 9763 + }, + { + "epoch": 2.44, + "grad_norm": 9.192693710327148, + "learning_rate": 5.1795595612611e-06, + "logits/chosen": -0.5169231295585632, + "logits/rejected": -0.5457452535629272, + "logps/chosen": -62.55365753173828, + "logps/rejected": -124.78448486328125, + "loss": 0.6778, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.336695432662964, + "rewards/margins": 5.8141961097717285, + "rewards/rejected": -2.4775004386901855, + "step": 9764 + }, + { + "epoch": 2.44, + "grad_norm": 5.702316761016846, + "learning_rate": 5.1787740783857164e-06, + "logits/chosen": -0.5132785439491272, + "logits/rejected": -0.6168556809425354, + "logps/chosen": -57.03837203979492, + "logps/rejected": -77.30291748046875, + "loss": 0.6807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8801534175872803, + "rewards/margins": 4.565483093261719, + "rewards/rejected": -1.6853296756744385, + "step": 9765 + }, + { + "epoch": 2.44, + "grad_norm": 3.48785138130188, + "learning_rate": 5.177988591092637e-06, + "logits/chosen": -0.575448215007782, + "logits/rejected": -0.6830474734306335, + "logps/chosen": -53.945289611816406, + "logps/rejected": -106.2826919555664, + "loss": 0.5574, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1859183311462402, + "rewards/margins": 7.666667461395264, + "rewards/rejected": -4.480749130249023, + "step": 9766 + }, + { + "epoch": 2.44, + "grad_norm": 3.9449338912963867, + "learning_rate": 5.177203099401268e-06, + "logits/chosen": -0.5545761585235596, + "logits/rejected": -0.650080144405365, + "logps/chosen": -43.82720947265625, + "logps/rejected": -83.9241943359375, + "loss": 0.5989, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.155989408493042, + "rewards/margins": 7.017461776733398, + "rewards/rejected": -3.8614721298217773, + "step": 9767 + }, + { + "epoch": 2.44, + "grad_norm": 5.190894603729248, + "learning_rate": 5.176417603331022e-06, + "logits/chosen": -0.4850226640701294, + "logits/rejected": -0.5675832629203796, + "logps/chosen": -50.92626953125, + "logps/rejected": -99.57882690429688, + "loss": 0.5815, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.969374179840088, + "rewards/margins": 6.400805473327637, + "rewards/rejected": -3.431431531906128, + "step": 9768 + }, + { + "epoch": 2.44, + "grad_norm": 4.515087604522705, + "learning_rate": 5.175632102901309e-06, + "logits/chosen": -0.4939316213130951, + "logits/rejected": -0.5494381189346313, + "logps/chosen": -51.28858184814453, + "logps/rejected": -96.1123275756836, + "loss": 0.666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0879640579223633, + "rewards/margins": 6.405271053314209, + "rewards/rejected": -3.317307472229004, + "step": 9769 + }, + { + "epoch": 2.44, + "grad_norm": 8.585007667541504, + "learning_rate": 5.174846598131541e-06, + "logits/chosen": -0.5191761255264282, + "logits/rejected": -0.6191376447677612, + "logps/chosen": -51.47358703613281, + "logps/rejected": -81.17514038085938, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0819013118743896, + "rewards/margins": 5.800008296966553, + "rewards/rejected": -2.718106746673584, + "step": 9770 + }, + { + "epoch": 2.44, + "grad_norm": 3.642807960510254, + "learning_rate": 5.174061089041128e-06, + "logits/chosen": -0.4968510866165161, + "logits/rejected": -0.5538263320922852, + "logps/chosen": -55.26844787597656, + "logps/rejected": -91.86900329589844, + "loss": 0.6627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.097276449203491, + "rewards/margins": 5.650630950927734, + "rewards/rejected": -2.5533547401428223, + "step": 9771 + }, + { + "epoch": 2.44, + "grad_norm": 9.382004737854004, + "learning_rate": 5.173275575649479e-06, + "logits/chosen": -0.5233580470085144, + "logits/rejected": -0.6412111520767212, + "logps/chosen": -62.529869079589844, + "logps/rejected": -105.147705078125, + "loss": 0.703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6946048736572266, + "rewards/margins": 6.647173881530762, + "rewards/rejected": -3.952568531036377, + "step": 9772 + }, + { + "epoch": 2.44, + "grad_norm": 4.966001510620117, + "learning_rate": 5.172490057976007e-06, + "logits/chosen": -0.5310259461402893, + "logits/rejected": -0.5656560659408569, + "logps/chosen": -63.01808166503906, + "logps/rejected": -103.27188873291016, + "loss": 0.7554, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1993913650512695, + "rewards/margins": 4.965228080749512, + "rewards/rejected": -1.7658371925354004, + "step": 9773 + }, + { + "epoch": 2.45, + "grad_norm": 5.8933634757995605, + "learning_rate": 5.171704536040122e-06, + "logits/chosen": -0.452056884765625, + "logits/rejected": -0.5665347576141357, + "logps/chosen": -63.03644943237305, + "logps/rejected": -87.21997833251953, + "loss": 0.7065, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9828615188598633, + "rewards/margins": 5.946645736694336, + "rewards/rejected": -2.9637835025787354, + "step": 9774 + }, + { + "epoch": 2.45, + "grad_norm": 14.699124336242676, + "learning_rate": 5.170919009861235e-06, + "logits/chosen": -0.5863142609596252, + "logits/rejected": -0.6938890218734741, + "logps/chosen": -68.30948638916016, + "logps/rejected": -86.52359008789062, + "loss": 0.7226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0769121646881104, + "rewards/margins": 5.49429178237915, + "rewards/rejected": -2.41737961769104, + "step": 9775 + }, + { + "epoch": 2.45, + "grad_norm": 5.352419853210449, + "learning_rate": 5.170133479458759e-06, + "logits/chosen": -0.5092192888259888, + "logits/rejected": -0.6081124544143677, + "logps/chosen": -65.2887954711914, + "logps/rejected": -92.16769409179688, + "loss": 0.7604, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.065291404724121, + "rewards/margins": 5.429566860198975, + "rewards/rejected": -2.3642754554748535, + "step": 9776 + }, + { + "epoch": 2.45, + "grad_norm": 9.385909080505371, + "learning_rate": 5.1693479448521025e-06, + "logits/chosen": -0.5580573678016663, + "logits/rejected": -0.6104182600975037, + "logps/chosen": -52.4461669921875, + "logps/rejected": -105.8156967163086, + "loss": 0.6441, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0315604209899902, + "rewards/margins": 6.1545729637146, + "rewards/rejected": -3.1230123043060303, + "step": 9777 + }, + { + "epoch": 2.45, + "grad_norm": 7.099945068359375, + "learning_rate": 5.168562406060677e-06, + "logits/chosen": -0.5375137329101562, + "logits/rejected": -0.5667328238487244, + "logps/chosen": -43.89301300048828, + "logps/rejected": -118.85794067382812, + "loss": 0.645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0897183418273926, + "rewards/margins": 7.087480545043945, + "rewards/rejected": -3.9977617263793945, + "step": 9778 + }, + { + "epoch": 2.45, + "grad_norm": 4.7761430740356445, + "learning_rate": 5.167776863103897e-06, + "logits/chosen": -0.5008334517478943, + "logits/rejected": -0.5713819861412048, + "logps/chosen": -54.13252258300781, + "logps/rejected": -101.90057373046875, + "loss": 0.6332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8504269123077393, + "rewards/margins": 6.458123683929443, + "rewards/rejected": -3.6076972484588623, + "step": 9779 + }, + { + "epoch": 2.45, + "grad_norm": 8.953614234924316, + "learning_rate": 5.166991316001173e-06, + "logits/chosen": -0.45644310116767883, + "logits/rejected": -0.5028911828994751, + "logps/chosen": -53.488338470458984, + "logps/rejected": -107.63201141357422, + "loss": 0.7319, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8887557983398438, + "rewards/margins": 5.334517478942871, + "rewards/rejected": -2.4457616806030273, + "step": 9780 + }, + { + "epoch": 2.45, + "grad_norm": 22.432111740112305, + "learning_rate": 5.166205764771913e-06, + "logits/chosen": -0.5227912664413452, + "logits/rejected": -0.658303439617157, + "logps/chosen": -56.254295349121094, + "logps/rejected": -100.96641540527344, + "loss": 0.5701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.082799196243286, + "rewards/margins": 7.029314994812012, + "rewards/rejected": -3.9465157985687256, + "step": 9781 + }, + { + "epoch": 2.45, + "grad_norm": 16.078414916992188, + "learning_rate": 5.165420209435535e-06, + "logits/chosen": -0.6019880771636963, + "logits/rejected": -0.6624024510383606, + "logps/chosen": -51.79082489013672, + "logps/rejected": -89.432373046875, + "loss": 0.7292, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.883781909942627, + "rewards/margins": 5.166128635406494, + "rewards/rejected": -2.2823469638824463, + "step": 9782 + }, + { + "epoch": 2.45, + "grad_norm": 13.981295585632324, + "learning_rate": 5.164634650011445e-06, + "logits/chosen": -0.5243028402328491, + "logits/rejected": -0.5804510116577148, + "logps/chosen": -54.55224609375, + "logps/rejected": -105.17495727539062, + "loss": 0.6496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.049454927444458, + "rewards/margins": 6.276324272155762, + "rewards/rejected": -3.2268693447113037, + "step": 9783 + }, + { + "epoch": 2.45, + "grad_norm": 6.583042621612549, + "learning_rate": 5.163849086519058e-06, + "logits/chosen": -0.50380539894104, + "logits/rejected": -0.5638337731361389, + "logps/chosen": -57.71207809448242, + "logps/rejected": -98.2406005859375, + "loss": 0.7759, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1630187034606934, + "rewards/margins": 6.000463008880615, + "rewards/rejected": -2.837444305419922, + "step": 9784 + }, + { + "epoch": 2.45, + "grad_norm": 12.15407943725586, + "learning_rate": 5.1630635189777875e-06, + "logits/chosen": -0.5445394515991211, + "logits/rejected": -0.5980770587921143, + "logps/chosen": -49.53864669799805, + "logps/rejected": -109.07154846191406, + "loss": 0.7378, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.437023162841797, + "rewards/margins": 6.373582363128662, + "rewards/rejected": -3.9365596771240234, + "step": 9785 + }, + { + "epoch": 2.45, + "grad_norm": 7.7113776206970215, + "learning_rate": 5.162277947407044e-06, + "logits/chosen": -0.5351917743682861, + "logits/rejected": -0.5904943346977234, + "logps/chosen": -50.95790100097656, + "logps/rejected": -91.01043701171875, + "loss": 0.713, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7278566360473633, + "rewards/margins": 6.1031975746154785, + "rewards/rejected": -3.3753416538238525, + "step": 9786 + }, + { + "epoch": 2.45, + "grad_norm": 10.524919509887695, + "learning_rate": 5.1614923718262375e-06, + "logits/chosen": -0.5323589444160461, + "logits/rejected": -0.6057479381561279, + "logps/chosen": -66.19589233398438, + "logps/rejected": -108.00503540039062, + "loss": 0.7024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114302635192871, + "rewards/margins": 6.57382345199585, + "rewards/rejected": -3.4595205783843994, + "step": 9787 + }, + { + "epoch": 2.45, + "grad_norm": 3.167776107788086, + "learning_rate": 5.160706792254785e-06, + "logits/chosen": -0.5520710945129395, + "logits/rejected": -0.6306663751602173, + "logps/chosen": -56.30030822753906, + "logps/rejected": -103.2144546508789, + "loss": 0.7015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.759253740310669, + "rewards/margins": 6.835245132446289, + "rewards/rejected": -4.075991153717041, + "step": 9788 + }, + { + "epoch": 2.45, + "grad_norm": 13.779044151306152, + "learning_rate": 5.159921208712096e-06, + "logits/chosen": -0.47340357303619385, + "logits/rejected": -0.5769141912460327, + "logps/chosen": -59.251529693603516, + "logps/rejected": -91.02051544189453, + "loss": 0.7474, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.827488422393799, + "rewards/margins": 5.2773942947387695, + "rewards/rejected": -2.4499051570892334, + "step": 9789 + }, + { + "epoch": 2.45, + "grad_norm": 5.5404229164123535, + "learning_rate": 5.159135621217582e-06, + "logits/chosen": -0.5099161863327026, + "logits/rejected": -0.6059601902961731, + "logps/chosen": -54.41948318481445, + "logps/rejected": -106.23184967041016, + "loss": 0.7055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8621363639831543, + "rewards/margins": 5.650800704956055, + "rewards/rejected": -2.7886645793914795, + "step": 9790 + }, + { + "epoch": 2.45, + "grad_norm": 4.156740188598633, + "learning_rate": 5.158350029790661e-06, + "logits/chosen": -0.4555796682834625, + "logits/rejected": -0.5884484648704529, + "logps/chosen": -63.06306838989258, + "logps/rejected": -90.95101928710938, + "loss": 0.5792, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2634944915771484, + "rewards/margins": 6.358426094055176, + "rewards/rejected": -3.0949316024780273, + "step": 9791 + }, + { + "epoch": 2.45, + "grad_norm": 4.6778082847595215, + "learning_rate": 5.15756443445074e-06, + "logits/chosen": -0.5150952339172363, + "logits/rejected": -0.5592620372772217, + "logps/chosen": -55.53447341918945, + "logps/rejected": -97.28614807128906, + "loss": 0.7135, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8930211067199707, + "rewards/margins": 5.882224082946777, + "rewards/rejected": -2.9892029762268066, + "step": 9792 + }, + { + "epoch": 2.45, + "grad_norm": 3.022021532058716, + "learning_rate": 5.156778835217234e-06, + "logits/chosen": -0.527407169342041, + "logits/rejected": -0.5732275247573853, + "logps/chosen": -47.75657653808594, + "logps/rejected": -93.80722045898438, + "loss": 0.6104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.919257879257202, + "rewards/margins": 5.994950771331787, + "rewards/rejected": -3.075692892074585, + "step": 9793 + }, + { + "epoch": 2.45, + "grad_norm": 3.2728140354156494, + "learning_rate": 5.155993232109558e-06, + "logits/chosen": -0.5720763206481934, + "logits/rejected": -0.6386270523071289, + "logps/chosen": -49.28059768676758, + "logps/rejected": -91.64161682128906, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9192276000976562, + "rewards/margins": 6.271461009979248, + "rewards/rejected": -3.3522329330444336, + "step": 9794 + }, + { + "epoch": 2.45, + "grad_norm": 3.058199405670166, + "learning_rate": 5.155207625147121e-06, + "logits/chosen": -0.49288105964660645, + "logits/rejected": -0.6083916425704956, + "logps/chosen": -50.497657775878906, + "logps/rejected": -92.26327514648438, + "loss": 0.6013, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.917724847793579, + "rewards/margins": 6.942545413970947, + "rewards/rejected": -4.024820327758789, + "step": 9795 + }, + { + "epoch": 2.45, + "grad_norm": 7.811363697052002, + "learning_rate": 5.154422014349341e-06, + "logits/chosen": -0.5578123331069946, + "logits/rejected": -0.6058433055877686, + "logps/chosen": -65.70661163330078, + "logps/rejected": -105.69017028808594, + "loss": 0.8747, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.713639259338379, + "rewards/margins": 6.361475944519043, + "rewards/rejected": -3.647836208343506, + "step": 9796 + }, + { + "epoch": 2.45, + "grad_norm": 17.368515014648438, + "learning_rate": 5.1536363997356275e-06, + "logits/chosen": -0.470813512802124, + "logits/rejected": -0.547935426235199, + "logps/chosen": -53.79129409790039, + "logps/rejected": -88.19877624511719, + "loss": 0.7944, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7326724529266357, + "rewards/margins": 5.292568206787109, + "rewards/rejected": -2.5598955154418945, + "step": 9797 + }, + { + "epoch": 2.45, + "grad_norm": 2.933384656906128, + "learning_rate": 5.152850781325395e-06, + "logits/chosen": -0.5740156173706055, + "logits/rejected": -0.6127587556838989, + "logps/chosen": -54.6658935546875, + "logps/rejected": -106.55902862548828, + "loss": 0.5858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4385297298431396, + "rewards/margins": 6.549281597137451, + "rewards/rejected": -3.1107518672943115, + "step": 9798 + }, + { + "epoch": 2.45, + "grad_norm": 4.58714485168457, + "learning_rate": 5.152065159138058e-06, + "logits/chosen": -0.458015501499176, + "logits/rejected": -0.5493975877761841, + "logps/chosen": -53.97516632080078, + "logps/rejected": -100.07109069824219, + "loss": 0.5733, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.965423107147217, + "rewards/margins": 6.576193332672119, + "rewards/rejected": -3.6107702255249023, + "step": 9799 + }, + { + "epoch": 2.45, + "grad_norm": 12.834979057312012, + "learning_rate": 5.151279533193027e-06, + "logits/chosen": -0.4740365743637085, + "logits/rejected": -0.5401605367660522, + "logps/chosen": -69.71918487548828, + "logps/rejected": -98.8205337524414, + "loss": 0.8386, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.694859504699707, + "rewards/margins": 5.783177375793457, + "rewards/rejected": -3.088318347930908, + "step": 9800 + }, + { + "epoch": 2.45, + "grad_norm": 3.616718053817749, + "learning_rate": 5.15049390350972e-06, + "logits/chosen": -0.5315867066383362, + "logits/rejected": -0.6171379089355469, + "logps/chosen": -54.701690673828125, + "logps/rejected": -91.95124053955078, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0772695541381836, + "rewards/margins": 6.273005485534668, + "rewards/rejected": -3.1957361698150635, + "step": 9801 + }, + { + "epoch": 2.45, + "grad_norm": 9.962986946105957, + "learning_rate": 5.149708270107548e-06, + "logits/chosen": -0.43354347348213196, + "logits/rejected": -0.5751350522041321, + "logps/chosen": -78.74454498291016, + "logps/rejected": -88.27796173095703, + "loss": 0.7758, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1232171058654785, + "rewards/margins": 6.338005542755127, + "rewards/rejected": -3.2147886753082275, + "step": 9802 + }, + { + "epoch": 2.45, + "grad_norm": 10.765440940856934, + "learning_rate": 5.148922633005926e-06, + "logits/chosen": -0.5348760485649109, + "logits/rejected": -0.5699267983436584, + "logps/chosen": -57.019386291503906, + "logps/rejected": -87.02881622314453, + "loss": 0.7312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2771153450012207, + "rewards/margins": 5.517584323883057, + "rewards/rejected": -2.240468740463257, + "step": 9803 + }, + { + "epoch": 2.45, + "grad_norm": 21.659881591796875, + "learning_rate": 5.148136992224265e-06, + "logits/chosen": -0.48787206411361694, + "logits/rejected": -0.579461395740509, + "logps/chosen": -65.78105163574219, + "logps/rejected": -91.05864715576172, + "loss": 0.7702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4332447052001953, + "rewards/margins": 4.867824077606201, + "rewards/rejected": -2.434579849243164, + "step": 9804 + }, + { + "epoch": 2.45, + "grad_norm": 6.199990749359131, + "learning_rate": 5.147351347781985e-06, + "logits/chosen": -0.5053051710128784, + "logits/rejected": -0.6340805292129517, + "logps/chosen": -55.45965576171875, + "logps/rejected": -89.6153335571289, + "loss": 0.652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.024792432785034, + "rewards/margins": 7.638335227966309, + "rewards/rejected": -4.613542079925537, + "step": 9805 + }, + { + "epoch": 2.45, + "grad_norm": 4.479370594024658, + "learning_rate": 5.146565699698494e-06, + "logits/chosen": -0.4209284782409668, + "logits/rejected": -0.5126458406448364, + "logps/chosen": -62.02409362792969, + "logps/rejected": -100.16670989990234, + "loss": 0.6757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4461536407470703, + "rewards/margins": 5.27298641204834, + "rewards/rejected": -2.8268327713012695, + "step": 9806 + }, + { + "epoch": 2.45, + "grad_norm": 4.772902011871338, + "learning_rate": 5.145780047993209e-06, + "logits/chosen": -0.5651816129684448, + "logits/rejected": -0.631956934928894, + "logps/chosen": -44.46648406982422, + "logps/rejected": -99.90515899658203, + "loss": 0.6658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2726070880889893, + "rewards/margins": 6.530195236206055, + "rewards/rejected": -3.2575883865356445, + "step": 9807 + }, + { + "epoch": 2.45, + "grad_norm": 7.952615737915039, + "learning_rate": 5.144994392685545e-06, + "logits/chosen": -0.5997805595397949, + "logits/rejected": -0.623397707939148, + "logps/chosen": -52.2053337097168, + "logps/rejected": -95.37345886230469, + "loss": 0.8594, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.847851037979126, + "rewards/margins": 5.258636951446533, + "rewards/rejected": -2.4107861518859863, + "step": 9808 + }, + { + "epoch": 2.45, + "grad_norm": 17.229572296142578, + "learning_rate": 5.144208733794913e-06, + "logits/chosen": -0.5440512299537659, + "logits/rejected": -0.6279036402702332, + "logps/chosen": -63.00291061401367, + "logps/rejected": -98.8707504272461, + "loss": 0.8049, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9661600589752197, + "rewards/margins": 6.488192558288574, + "rewards/rejected": -3.5220322608947754, + "step": 9809 + }, + { + "epoch": 2.45, + "grad_norm": 8.38965129852295, + "learning_rate": 5.143423071340732e-06, + "logits/chosen": -0.5110872983932495, + "logits/rejected": -0.5855695009231567, + "logps/chosen": -61.27330780029297, + "logps/rejected": -102.74900817871094, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.470797300338745, + "rewards/margins": 6.108554363250732, + "rewards/rejected": -2.637756824493408, + "step": 9810 + }, + { + "epoch": 2.45, + "grad_norm": 6.0391621589660645, + "learning_rate": 5.142637405342414e-06, + "logits/chosen": -0.4666259288787842, + "logits/rejected": -0.6008578538894653, + "logps/chosen": -55.92283630371094, + "logps/rejected": -87.98426818847656, + "loss": 0.708, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.001028537750244, + "rewards/margins": 5.487887859344482, + "rewards/rejected": -2.4868595600128174, + "step": 9811 + }, + { + "epoch": 2.45, + "grad_norm": 1.9686026573181152, + "learning_rate": 5.141851735819372e-06, + "logits/chosen": -0.5452638268470764, + "logits/rejected": -0.6491213440895081, + "logps/chosen": -39.94976043701172, + "logps/rejected": -84.43875122070312, + "loss": 0.5268, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2253897190093994, + "rewards/margins": 6.712153434753418, + "rewards/rejected": -3.4867639541625977, + "step": 9812 + }, + { + "epoch": 2.45, + "grad_norm": 8.32991886138916, + "learning_rate": 5.141066062791024e-06, + "logits/chosen": -0.5068773627281189, + "logits/rejected": -0.6090160608291626, + "logps/chosen": -54.34800720214844, + "logps/rejected": -93.5818862915039, + "loss": 0.6277, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.27359676361084, + "rewards/margins": 6.837737083435059, + "rewards/rejected": -3.5641403198242188, + "step": 9813 + }, + { + "epoch": 2.46, + "grad_norm": 5.492316722869873, + "learning_rate": 5.1402803862767845e-06, + "logits/chosen": -0.5680179595947266, + "logits/rejected": -0.6035827994346619, + "logps/chosen": -46.94900131225586, + "logps/rejected": -86.7170181274414, + "loss": 0.6801, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0820789337158203, + "rewards/margins": 5.5926666259765625, + "rewards/rejected": -2.510587215423584, + "step": 9814 + }, + { + "epoch": 2.46, + "grad_norm": 3.1964025497436523, + "learning_rate": 5.139494706296067e-06, + "logits/chosen": -0.4757433235645294, + "logits/rejected": -0.5629247426986694, + "logps/chosen": -58.11717224121094, + "logps/rejected": -91.2578125, + "loss": 0.6233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.350501298904419, + "rewards/margins": 6.424354553222656, + "rewards/rejected": -3.0738537311553955, + "step": 9815 + }, + { + "epoch": 2.46, + "grad_norm": 4.266732692718506, + "learning_rate": 5.138709022868287e-06, + "logits/chosen": -0.5036579370498657, + "logits/rejected": -0.5534005165100098, + "logps/chosen": -40.28413009643555, + "logps/rejected": -88.34832763671875, + "loss": 0.5338, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.22691011428833, + "rewards/margins": 5.6006693840026855, + "rewards/rejected": -2.3737597465515137, + "step": 9816 + }, + { + "epoch": 2.46, + "grad_norm": 10.062617301940918, + "learning_rate": 5.13792333601286e-06, + "logits/chosen": -0.4896085858345032, + "logits/rejected": -0.6047579646110535, + "logps/chosen": -61.251312255859375, + "logps/rejected": -83.60970306396484, + "loss": 0.7816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1991920471191406, + "rewards/margins": 6.313790321350098, + "rewards/rejected": -3.114597797393799, + "step": 9817 + }, + { + "epoch": 2.46, + "grad_norm": 23.53714942932129, + "learning_rate": 5.1371376457491985e-06, + "logits/chosen": -0.4834788739681244, + "logits/rejected": -0.5319139957427979, + "logps/chosen": -64.81792449951172, + "logps/rejected": -101.66145324707031, + "loss": 0.79, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.92049241065979, + "rewards/margins": 5.585710048675537, + "rewards/rejected": -2.665217876434326, + "step": 9818 + }, + { + "epoch": 2.46, + "grad_norm": 4.189737319946289, + "learning_rate": 5.136351952096722e-06, + "logits/chosen": -0.49030983448028564, + "logits/rejected": -0.5460638403892517, + "logps/chosen": -50.710304260253906, + "logps/rejected": -100.02259063720703, + "loss": 0.5401, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2254414558410645, + "rewards/margins": 6.7764482498168945, + "rewards/rejected": -3.55100679397583, + "step": 9819 + }, + { + "epoch": 2.46, + "grad_norm": 4.520187854766846, + "learning_rate": 5.135566255074843e-06, + "logits/chosen": -0.5237646102905273, + "logits/rejected": -0.6240760684013367, + "logps/chosen": -51.29726028442383, + "logps/rejected": -87.81340789794922, + "loss": 0.7017, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.053384780883789, + "rewards/margins": 5.823925495147705, + "rewards/rejected": -2.770540952682495, + "step": 9820 + }, + { + "epoch": 2.46, + "grad_norm": 4.222034931182861, + "learning_rate": 5.134780554702977e-06, + "logits/chosen": -0.4417637884616852, + "logits/rejected": -0.5048893094062805, + "logps/chosen": -50.76807403564453, + "logps/rejected": -88.14022827148438, + "loss": 0.5986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.110776901245117, + "rewards/margins": 5.798311233520508, + "rewards/rejected": -2.6875343322753906, + "step": 9821 + }, + { + "epoch": 2.46, + "grad_norm": 7.312430381774902, + "learning_rate": 5.133994851000541e-06, + "logits/chosen": -0.4934368133544922, + "logits/rejected": -0.5517069697380066, + "logps/chosen": -56.30103302001953, + "logps/rejected": -102.43534851074219, + "loss": 0.6493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6444251537323, + "rewards/margins": 6.6154656410217285, + "rewards/rejected": -3.971040964126587, + "step": 9822 + }, + { + "epoch": 2.46, + "grad_norm": 9.184329986572266, + "learning_rate": 5.13320914398695e-06, + "logits/chosen": -0.5301535725593567, + "logits/rejected": -0.6238702535629272, + "logps/chosen": -58.02772903442383, + "logps/rejected": -95.669189453125, + "loss": 0.6437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.042081832885742, + "rewards/margins": 6.774904727935791, + "rewards/rejected": -3.7328224182128906, + "step": 9823 + }, + { + "epoch": 2.46, + "grad_norm": 2.651486396789551, + "learning_rate": 5.132423433681619e-06, + "logits/chosen": -0.46031683683395386, + "logits/rejected": -0.5433461666107178, + "logps/chosen": -59.48766326904297, + "logps/rejected": -106.3861083984375, + "loss": 0.6746, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9910802841186523, + "rewards/margins": 6.335593223571777, + "rewards/rejected": -3.344513177871704, + "step": 9824 + }, + { + "epoch": 2.46, + "grad_norm": 9.385984420776367, + "learning_rate": 5.131637720103964e-06, + "logits/chosen": -0.4947313666343689, + "logits/rejected": -0.52967369556427, + "logps/chosen": -57.129547119140625, + "logps/rejected": -119.49093627929688, + "loss": 0.7952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.94175386428833, + "rewards/margins": 6.275917053222656, + "rewards/rejected": -3.334162712097168, + "step": 9825 + }, + { + "epoch": 2.46, + "grad_norm": 3.1129143238067627, + "learning_rate": 5.130852003273401e-06, + "logits/chosen": -0.516727864742279, + "logits/rejected": -0.5706490278244019, + "logps/chosen": -52.87815475463867, + "logps/rejected": -104.96562194824219, + "loss": 0.5686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0802230834960938, + "rewards/margins": 6.429537773132324, + "rewards/rejected": -3.3493144512176514, + "step": 9826 + }, + { + "epoch": 2.46, + "grad_norm": 26.947471618652344, + "learning_rate": 5.130066283209345e-06, + "logits/chosen": -0.5406138896942139, + "logits/rejected": -0.5958677530288696, + "logps/chosen": -64.71338653564453, + "logps/rejected": -91.70606231689453, + "loss": 0.9853, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.654930353164673, + "rewards/margins": 4.576272964477539, + "rewards/rejected": -1.9213424921035767, + "step": 9827 + }, + { + "epoch": 2.46, + "grad_norm": 4.8222126960754395, + "learning_rate": 5.129280559931215e-06, + "logits/chosen": -0.5530513525009155, + "logits/rejected": -0.6385720372200012, + "logps/chosen": -67.693603515625, + "logps/rejected": -64.10619354248047, + "loss": 0.7446, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8426673412323, + "rewards/margins": 5.159905433654785, + "rewards/rejected": -2.3172380924224854, + "step": 9828 + }, + { + "epoch": 2.46, + "grad_norm": 4.373491287231445, + "learning_rate": 5.128494833458423e-06, + "logits/chosen": -0.4781200587749481, + "logits/rejected": -0.6220375895500183, + "logps/chosen": -68.64273834228516, + "logps/rejected": -78.00054931640625, + "loss": 0.702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.027393102645874, + "rewards/margins": 6.010652542114258, + "rewards/rejected": -2.983259677886963, + "step": 9829 + }, + { + "epoch": 2.46, + "grad_norm": 5.818283557891846, + "learning_rate": 5.127709103810387e-06, + "logits/chosen": -0.5121455192565918, + "logits/rejected": -0.5515111088752747, + "logps/chosen": -56.347354888916016, + "logps/rejected": -118.64246368408203, + "loss": 0.6662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.14732027053833, + "rewards/margins": 6.254518508911133, + "rewards/rejected": -3.1071975231170654, + "step": 9830 + }, + { + "epoch": 2.46, + "grad_norm": 6.413827896118164, + "learning_rate": 5.126923371006524e-06, + "logits/chosen": -0.5029315948486328, + "logits/rejected": -0.6059670448303223, + "logps/chosen": -59.72826385498047, + "logps/rejected": -114.5440902709961, + "loss": 0.6516, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7783336639404297, + "rewards/margins": 7.083422660827637, + "rewards/rejected": -4.305088996887207, + "step": 9831 + }, + { + "epoch": 2.46, + "grad_norm": 2.8909502029418945, + "learning_rate": 5.126137635066248e-06, + "logits/chosen": -0.5376558899879456, + "logits/rejected": -0.6151818633079529, + "logps/chosen": -53.50996398925781, + "logps/rejected": -106.25518035888672, + "loss": 0.6338, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1092581748962402, + "rewards/margins": 6.136762619018555, + "rewards/rejected": -3.0275042057037354, + "step": 9832 + }, + { + "epoch": 2.46, + "grad_norm": 30.8645076751709, + "learning_rate": 5.125351896008979e-06, + "logits/chosen": -0.5130515098571777, + "logits/rejected": -0.5731015205383301, + "logps/chosen": -57.57373046875, + "logps/rejected": -90.65606689453125, + "loss": 0.7943, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0438499450683594, + "rewards/margins": 5.496953964233398, + "rewards/rejected": -2.453104257583618, + "step": 9833 + }, + { + "epoch": 2.46, + "grad_norm": 3.978823184967041, + "learning_rate": 5.124566153854131e-06, + "logits/chosen": -0.4979335069656372, + "logits/rejected": -0.5448089838027954, + "logps/chosen": -50.351402282714844, + "logps/rejected": -107.40238189697266, + "loss": 0.6796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8200063705444336, + "rewards/margins": 5.817906856536865, + "rewards/rejected": -2.9979002475738525, + "step": 9834 + }, + { + "epoch": 2.46, + "grad_norm": 5.892277240753174, + "learning_rate": 5.123780408621118e-06, + "logits/chosen": -0.4921632409095764, + "logits/rejected": -0.5698888301849365, + "logps/chosen": -56.054168701171875, + "logps/rejected": -90.73409271240234, + "loss": 0.6665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983133554458618, + "rewards/margins": 5.160706520080566, + "rewards/rejected": -2.1775732040405273, + "step": 9835 + }, + { + "epoch": 2.46, + "grad_norm": 4.600202560424805, + "learning_rate": 5.1229946603293624e-06, + "logits/chosen": -0.5074984431266785, + "logits/rejected": -0.6096850633621216, + "logps/chosen": -55.48910140991211, + "logps/rejected": -90.73780059814453, + "loss": 0.6642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0048139095306396, + "rewards/margins": 6.882536888122559, + "rewards/rejected": -3.877723217010498, + "step": 9836 + }, + { + "epoch": 2.46, + "grad_norm": 9.865104675292969, + "learning_rate": 5.122208908998277e-06, + "logits/chosen": -0.5123104453086853, + "logits/rejected": -0.5637698173522949, + "logps/chosen": -53.18289566040039, + "logps/rejected": -94.76683044433594, + "loss": 0.6543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.024663209915161, + "rewards/margins": 5.764354705810547, + "rewards/rejected": -2.7396912574768066, + "step": 9837 + }, + { + "epoch": 2.46, + "grad_norm": 3.5046637058258057, + "learning_rate": 5.121423154647279e-06, + "logits/chosen": -0.4226033389568329, + "logits/rejected": -0.5514490604400635, + "logps/chosen": -65.30431365966797, + "logps/rejected": -79.54663848876953, + "loss": 0.6542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.975402355194092, + "rewards/margins": 5.89134407043457, + "rewards/rejected": -2.9159414768218994, + "step": 9838 + }, + { + "epoch": 2.46, + "grad_norm": 4.952018737792969, + "learning_rate": 5.120637397295787e-06, + "logits/chosen": -0.5588834881782532, + "logits/rejected": -0.6374218463897705, + "logps/chosen": -60.37221145629883, + "logps/rejected": -87.54489135742188, + "loss": 0.6851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4558870792388916, + "rewards/margins": 6.1259589195251465, + "rewards/rejected": -2.670071601867676, + "step": 9839 + }, + { + "epoch": 2.46, + "grad_norm": 4.173508167266846, + "learning_rate": 5.119851636963216e-06, + "logits/chosen": -0.4075292944908142, + "logits/rejected": -0.489740252494812, + "logps/chosen": -66.06513977050781, + "logps/rejected": -105.2305908203125, + "loss": 0.6488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2349853515625, + "rewards/margins": 6.266850471496582, + "rewards/rejected": -3.031865119934082, + "step": 9840 + }, + { + "epoch": 2.46, + "grad_norm": 5.143890380859375, + "learning_rate": 5.119065873668982e-06, + "logits/chosen": -0.48124662041664124, + "logits/rejected": -0.5667073726654053, + "logps/chosen": -68.69144439697266, + "logps/rejected": -90.48339080810547, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7113442420959473, + "rewards/margins": 5.893460273742676, + "rewards/rejected": -3.1821157932281494, + "step": 9841 + }, + { + "epoch": 2.46, + "grad_norm": 7.456787109375, + "learning_rate": 5.1182801074325074e-06, + "logits/chosen": -0.4712197780609131, + "logits/rejected": -0.5360118746757507, + "logps/chosen": -56.57468795776367, + "logps/rejected": -98.3814468383789, + "loss": 0.6722, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8353683948516846, + "rewards/margins": 6.122352600097656, + "rewards/rejected": -3.2869842052459717, + "step": 9842 + }, + { + "epoch": 2.46, + "grad_norm": 3.3854753971099854, + "learning_rate": 5.117494338273202e-06, + "logits/chosen": -0.46380653977394104, + "logits/rejected": -0.6220624446868896, + "logps/chosen": -63.63249969482422, + "logps/rejected": -76.4648208618164, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0797526836395264, + "rewards/margins": 6.131447792053223, + "rewards/rejected": -3.0516955852508545, + "step": 9843 + }, + { + "epoch": 2.46, + "grad_norm": 3.8978054523468018, + "learning_rate": 5.116708566210487e-06, + "logits/chosen": -0.5332507491111755, + "logits/rejected": -0.6019828915596008, + "logps/chosen": -50.974342346191406, + "logps/rejected": -115.39041900634766, + "loss": 0.6184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.168867826461792, + "rewards/margins": 6.334359169006348, + "rewards/rejected": -3.1654911041259766, + "step": 9844 + }, + { + "epoch": 2.46, + "grad_norm": 17.889991760253906, + "learning_rate": 5.115922791263781e-06, + "logits/chosen": -0.5272778272628784, + "logits/rejected": -0.5817962288856506, + "logps/chosen": -54.0496826171875, + "logps/rejected": -109.50900268554688, + "loss": 0.6982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.304840564727783, + "rewards/margins": 5.8880228996276855, + "rewards/rejected": -2.5831825733184814, + "step": 9845 + }, + { + "epoch": 2.46, + "grad_norm": 2.509730815887451, + "learning_rate": 5.115137013452499e-06, + "logits/chosen": -0.5082980394363403, + "logits/rejected": -0.5754032731056213, + "logps/chosen": -54.026241302490234, + "logps/rejected": -112.75141906738281, + "loss": 0.6306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.14095401763916, + "rewards/margins": 7.41672420501709, + "rewards/rejected": -4.275770664215088, + "step": 9846 + }, + { + "epoch": 2.46, + "grad_norm": 3.659496545791626, + "learning_rate": 5.114351232796057e-06, + "logits/chosen": -0.5111483931541443, + "logits/rejected": -0.5923126935958862, + "logps/chosen": -54.18230438232422, + "logps/rejected": -93.59199523925781, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.152684211730957, + "rewards/margins": 6.062769889831543, + "rewards/rejected": -2.910085678100586, + "step": 9847 + }, + { + "epoch": 2.46, + "grad_norm": 12.543841361999512, + "learning_rate": 5.1135654493138765e-06, + "logits/chosen": -0.48028138279914856, + "logits/rejected": -0.5910324454307556, + "logps/chosen": -58.98811340332031, + "logps/rejected": -84.98926544189453, + "loss": 0.8428, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.144054889678955, + "rewards/margins": 4.666526794433594, + "rewards/rejected": -1.522472620010376, + "step": 9848 + }, + { + "epoch": 2.46, + "grad_norm": 7.047362327575684, + "learning_rate": 5.112779663025372e-06, + "logits/chosen": -0.5278233885765076, + "logits/rejected": -0.6515532732009888, + "logps/chosen": -55.1685676574707, + "logps/rejected": -104.91704559326172, + "loss": 0.554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.003857135772705, + "rewards/margins": 7.3868937492370605, + "rewards/rejected": -4.383037567138672, + "step": 9849 + }, + { + "epoch": 2.46, + "grad_norm": 2.7741806507110596, + "learning_rate": 5.111993873949962e-06, + "logits/chosen": -0.5708388090133667, + "logits/rejected": -0.6288462281227112, + "logps/chosen": -38.14296340942383, + "logps/rejected": -104.52850341796875, + "loss": 0.5272, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182781457901001, + "rewards/margins": 7.404721736907959, + "rewards/rejected": -4.221940040588379, + "step": 9850 + }, + { + "epoch": 2.46, + "grad_norm": 59.8424072265625, + "learning_rate": 5.1112080821070656e-06, + "logits/chosen": -0.5112500190734863, + "logits/rejected": -0.6261447072029114, + "logps/chosen": -57.95112228393555, + "logps/rejected": -99.61033630371094, + "loss": 0.7349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8410072326660156, + "rewards/margins": 6.438982009887695, + "rewards/rejected": -3.5979740619659424, + "step": 9851 + }, + { + "epoch": 2.46, + "grad_norm": 5.437526226043701, + "learning_rate": 5.110422287516098e-06, + "logits/chosen": -0.482114315032959, + "logits/rejected": -0.5410124659538269, + "logps/chosen": -50.86151123046875, + "logps/rejected": -80.53102111816406, + "loss": 0.6335, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.185831069946289, + "rewards/margins": 6.490490913391113, + "rewards/rejected": -3.304659366607666, + "step": 9852 + }, + { + "epoch": 2.46, + "grad_norm": 7.798551559448242, + "learning_rate": 5.109636490196478e-06, + "logits/chosen": -0.5068817734718323, + "logits/rejected": -0.5546351075172424, + "logps/chosen": -52.7929573059082, + "logps/rejected": -104.74347686767578, + "loss": 0.6389, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.080615758895874, + "rewards/margins": 6.1763014793396, + "rewards/rejected": -3.095686197280884, + "step": 9853 + }, + { + "epoch": 2.47, + "grad_norm": 5.043104648590088, + "learning_rate": 5.1088506901676236e-06, + "logits/chosen": -0.5953770279884338, + "logits/rejected": -0.6272101402282715, + "logps/chosen": -55.76254653930664, + "logps/rejected": -112.23139953613281, + "loss": 0.8483, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.036757469177246, + "rewards/margins": 5.586148262023926, + "rewards/rejected": -2.5493905544281006, + "step": 9854 + }, + { + "epoch": 2.47, + "grad_norm": 4.069929599761963, + "learning_rate": 5.108064887448953e-06, + "logits/chosen": -0.5057004690170288, + "logits/rejected": -0.568709671497345, + "logps/chosen": -52.65283203125, + "logps/rejected": -103.09248352050781, + "loss": 0.6235, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0080108642578125, + "rewards/margins": 6.354180812835693, + "rewards/rejected": -3.346169948577881, + "step": 9855 + }, + { + "epoch": 2.47, + "grad_norm": 9.268025398254395, + "learning_rate": 5.107279082059885e-06, + "logits/chosen": -0.5685868263244629, + "logits/rejected": -0.6715226769447327, + "logps/chosen": -54.61186218261719, + "logps/rejected": -77.64006042480469, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.160158157348633, + "rewards/margins": 5.678295612335205, + "rewards/rejected": -2.5181374549865723, + "step": 9856 + }, + { + "epoch": 2.47, + "grad_norm": 4.206342697143555, + "learning_rate": 5.106493274019837e-06, + "logits/chosen": -0.5306814908981323, + "logits/rejected": -0.6049709916114807, + "logps/chosen": -54.96276092529297, + "logps/rejected": -85.64674377441406, + "loss": 0.6107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0950958728790283, + "rewards/margins": 6.002453327178955, + "rewards/rejected": -2.9073572158813477, + "step": 9857 + }, + { + "epoch": 2.47, + "grad_norm": 6.844726085662842, + "learning_rate": 5.105707463348226e-06, + "logits/chosen": -0.42989620566368103, + "logits/rejected": -0.5448527336120605, + "logps/chosen": -59.41670227050781, + "logps/rejected": -96.53782653808594, + "loss": 0.8042, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0961666107177734, + "rewards/margins": 5.970061779022217, + "rewards/rejected": -2.8738949298858643, + "step": 9858 + }, + { + "epoch": 2.47, + "grad_norm": 9.580513954162598, + "learning_rate": 5.104921650064471e-06, + "logits/chosen": -0.5458335876464844, + "logits/rejected": -0.6302202343940735, + "logps/chosen": -47.73862075805664, + "logps/rejected": -79.18511199951172, + "loss": 0.6364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.075133800506592, + "rewards/margins": 5.771849632263184, + "rewards/rejected": -2.696715831756592, + "step": 9859 + }, + { + "epoch": 2.47, + "grad_norm": 1.980710506439209, + "learning_rate": 5.104135834187991e-06, + "logits/chosen": -0.5303440093994141, + "logits/rejected": -0.570583701133728, + "logps/chosen": -54.33922576904297, + "logps/rejected": -111.76724243164062, + "loss": 0.6076, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.09425687789917, + "rewards/margins": 7.201876640319824, + "rewards/rejected": -4.1076202392578125, + "step": 9860 + }, + { + "epoch": 2.47, + "grad_norm": 5.835821151733398, + "learning_rate": 5.1033500157382025e-06, + "logits/chosen": -0.5401593446731567, + "logits/rejected": -0.6313093900680542, + "logps/chosen": -58.596046447753906, + "logps/rejected": -91.29903411865234, + "loss": 0.7461, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.899962902069092, + "rewards/margins": 6.080375671386719, + "rewards/rejected": -3.180412530899048, + "step": 9861 + }, + { + "epoch": 2.47, + "grad_norm": 10.988834381103516, + "learning_rate": 5.102564194734527e-06, + "logits/chosen": -0.43283623456954956, + "logits/rejected": -0.5176147222518921, + "logps/chosen": -69.80533599853516, + "logps/rejected": -87.3124008178711, + "loss": 0.7979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9254815578460693, + "rewards/margins": 5.71573543548584, + "rewards/rejected": -2.7902536392211914, + "step": 9862 + }, + { + "epoch": 2.47, + "grad_norm": 8.769224166870117, + "learning_rate": 5.10177837119638e-06, + "logits/chosen": -0.5377998948097229, + "logits/rejected": -0.6247825026512146, + "logps/chosen": -57.336002349853516, + "logps/rejected": -86.87667083740234, + "loss": 0.6923, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.626554250717163, + "rewards/margins": 6.041669845581055, + "rewards/rejected": -3.415116310119629, + "step": 9863 + }, + { + "epoch": 2.47, + "grad_norm": 8.390393257141113, + "learning_rate": 5.100992545143182e-06, + "logits/chosen": -0.5044116973876953, + "logits/rejected": -0.6019493341445923, + "logps/chosen": -47.73216247558594, + "logps/rejected": -83.86175537109375, + "loss": 0.6104, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7405340671539307, + "rewards/margins": 5.518853664398193, + "rewards/rejected": -2.778319835662842, + "step": 9864 + }, + { + "epoch": 2.47, + "grad_norm": 7.620011329650879, + "learning_rate": 5.1002067165943504e-06, + "logits/chosen": -0.5236481428146362, + "logits/rejected": -0.5691859722137451, + "logps/chosen": -51.44266128540039, + "logps/rejected": -100.37948608398438, + "loss": 0.6112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161952495574951, + "rewards/margins": 5.921877861022949, + "rewards/rejected": -2.7599258422851562, + "step": 9865 + }, + { + "epoch": 2.47, + "grad_norm": 6.001368522644043, + "learning_rate": 5.099420885569304e-06, + "logits/chosen": -0.5029554963111877, + "logits/rejected": -0.5836881995201111, + "logps/chosen": -60.111305236816406, + "logps/rejected": -100.05853271484375, + "loss": 0.6538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.235057830810547, + "rewards/margins": 5.738241195678711, + "rewards/rejected": -2.5031826496124268, + "step": 9866 + }, + { + "epoch": 2.47, + "grad_norm": 4.8048553466796875, + "learning_rate": 5.0986350520874615e-06, + "logits/chosen": -0.44538477063179016, + "logits/rejected": -0.5823232531547546, + "logps/chosen": -60.800315856933594, + "logps/rejected": -103.9255599975586, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0832200050354004, + "rewards/margins": 7.205860614776611, + "rewards/rejected": -4.122640609741211, + "step": 9867 + }, + { + "epoch": 2.47, + "grad_norm": 5.35378360748291, + "learning_rate": 5.097849216168242e-06, + "logits/chosen": -0.5347321629524231, + "logits/rejected": -0.5975298881530762, + "logps/chosen": -52.55091094970703, + "logps/rejected": -93.51802825927734, + "loss": 0.6773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7323293685913086, + "rewards/margins": 5.990628242492676, + "rewards/rejected": -3.258298635482788, + "step": 9868 + }, + { + "epoch": 2.47, + "grad_norm": 10.857926368713379, + "learning_rate": 5.097063377831067e-06, + "logits/chosen": -0.515929639339447, + "logits/rejected": -0.6152353286743164, + "logps/chosen": -67.39419555664062, + "logps/rejected": -99.931396484375, + "loss": 0.7252, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.899332046508789, + "rewards/margins": 5.542333126068115, + "rewards/rejected": -2.643001079559326, + "step": 9869 + }, + { + "epoch": 2.47, + "grad_norm": 5.647562503814697, + "learning_rate": 5.096277537095348e-06, + "logits/chosen": -0.5636652708053589, + "logits/rejected": -0.6438634395599365, + "logps/chosen": -55.73431396484375, + "logps/rejected": -84.33615112304688, + "loss": 0.7135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.104701280593872, + "rewards/margins": 4.793924331665039, + "rewards/rejected": -1.6892231702804565, + "step": 9870 + }, + { + "epoch": 2.47, + "grad_norm": 6.551692962646484, + "learning_rate": 5.095491693980512e-06, + "logits/chosen": -0.550450325012207, + "logits/rejected": -0.618873655796051, + "logps/chosen": -53.63057327270508, + "logps/rejected": -94.9344482421875, + "loss": 0.7338, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.116885185241699, + "rewards/margins": 6.413565635681152, + "rewards/rejected": -3.296680212020874, + "step": 9871 + }, + { + "epoch": 2.47, + "grad_norm": 7.318517208099365, + "learning_rate": 5.094705848505973e-06, + "logits/chosen": -0.5174527764320374, + "logits/rejected": -0.6187355518341064, + "logps/chosen": -47.358001708984375, + "logps/rejected": -96.33721923828125, + "loss": 0.5254, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.267289400100708, + "rewards/margins": 7.429845809936523, + "rewards/rejected": -4.162556171417236, + "step": 9872 + }, + { + "epoch": 2.47, + "grad_norm": 6.798942565917969, + "learning_rate": 5.093920000691153e-06, + "logits/chosen": -0.44815701246261597, + "logits/rejected": -0.5584921836853027, + "logps/chosen": -51.614715576171875, + "logps/rejected": -84.81770324707031, + "loss": 0.6377, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.954623222351074, + "rewards/margins": 6.419490337371826, + "rewards/rejected": -3.464867115020752, + "step": 9873 + }, + { + "epoch": 2.47, + "grad_norm": 3.489838123321533, + "learning_rate": 5.09313415055547e-06, + "logits/chosen": -0.5227677226066589, + "logits/rejected": -0.5553514361381531, + "logps/chosen": -55.25975036621094, + "logps/rejected": -104.01578521728516, + "loss": 0.6517, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2173726558685303, + "rewards/margins": 6.286831855773926, + "rewards/rejected": -3.0694594383239746, + "step": 9874 + }, + { + "epoch": 2.47, + "grad_norm": 3.3038032054901123, + "learning_rate": 5.092348298118342e-06, + "logits/chosen": -0.5213624238967896, + "logits/rejected": -0.5830715894699097, + "logps/chosen": -55.18242645263672, + "logps/rejected": -121.03494262695312, + "loss": 0.6512, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032841682434082, + "rewards/margins": 7.619589328765869, + "rewards/rejected": -4.586747646331787, + "step": 9875 + }, + { + "epoch": 2.47, + "grad_norm": 11.909255981445312, + "learning_rate": 5.09156244339919e-06, + "logits/chosen": -0.525722086429596, + "logits/rejected": -0.6162046790122986, + "logps/chosen": -57.703712463378906, + "logps/rejected": -96.6766357421875, + "loss": 0.8367, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.405933380126953, + "rewards/margins": 5.681750297546387, + "rewards/rejected": -3.275817394256592, + "step": 9876 + }, + { + "epoch": 2.47, + "grad_norm": 4.139497756958008, + "learning_rate": 5.090776586417432e-06, + "logits/chosen": -0.4901133179664612, + "logits/rejected": -0.5480721592903137, + "logps/chosen": -55.835105895996094, + "logps/rejected": -107.44480895996094, + "loss": 0.6092, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8632686138153076, + "rewards/margins": 6.44199275970459, + "rewards/rejected": -3.578723430633545, + "step": 9877 + }, + { + "epoch": 2.47, + "grad_norm": 12.079116821289062, + "learning_rate": 5.089990727192488e-06, + "logits/chosen": -0.5975655317306519, + "logits/rejected": -0.6771693825721741, + "logps/chosen": -46.06660461425781, + "logps/rejected": -89.22864532470703, + "loss": 0.6863, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.818615436553955, + "rewards/margins": 6.192956924438477, + "rewards/rejected": -3.374340772628784, + "step": 9878 + }, + { + "epoch": 2.47, + "grad_norm": 9.616809844970703, + "learning_rate": 5.089204865743778e-06, + "logits/chosen": -0.4921385645866394, + "logits/rejected": -0.6144832968711853, + "logps/chosen": -56.55552673339844, + "logps/rejected": -96.40670776367188, + "loss": 0.7293, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.094784736633301, + "rewards/margins": 7.379826068878174, + "rewards/rejected": -4.285041332244873, + "step": 9879 + }, + { + "epoch": 2.47, + "grad_norm": 8.967227935791016, + "learning_rate": 5.088419002090721e-06, + "logits/chosen": -0.5041810274124146, + "logits/rejected": -0.6037821173667908, + "logps/chosen": -62.305782318115234, + "logps/rejected": -109.28330993652344, + "loss": 0.6669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.182978868484497, + "rewards/margins": 7.00468635559082, + "rewards/rejected": -3.821707010269165, + "step": 9880 + }, + { + "epoch": 2.47, + "grad_norm": 10.948183059692383, + "learning_rate": 5.0876331362527344e-06, + "logits/chosen": -0.5486512184143066, + "logits/rejected": -0.6460214257240295, + "logps/chosen": -52.09893798828125, + "logps/rejected": -97.4759521484375, + "loss": 0.7176, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9559473991394043, + "rewards/margins": 5.710205078125, + "rewards/rejected": -2.754258155822754, + "step": 9881 + }, + { + "epoch": 2.47, + "grad_norm": 7.716388702392578, + "learning_rate": 5.086847268249242e-06, + "logits/chosen": -0.49241307377815247, + "logits/rejected": -0.5685976147651672, + "logps/chosen": -53.02822494506836, + "logps/rejected": -100.08975982666016, + "loss": 0.8165, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.857631206512451, + "rewards/margins": 5.708744049072266, + "rewards/rejected": -2.8511126041412354, + "step": 9882 + }, + { + "epoch": 2.47, + "grad_norm": 9.736678123474121, + "learning_rate": 5.086061398099662e-06, + "logits/chosen": -0.5089147686958313, + "logits/rejected": -0.5906612873077393, + "logps/chosen": -66.37943267822266, + "logps/rejected": -126.65354919433594, + "loss": 0.6443, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9614880084991455, + "rewards/margins": 7.447474002838135, + "rewards/rejected": -4.48598575592041, + "step": 9883 + }, + { + "epoch": 2.47, + "grad_norm": 12.196528434753418, + "learning_rate": 5.085275525823411e-06, + "logits/chosen": -0.5040743350982666, + "logits/rejected": -0.578493595123291, + "logps/chosen": -56.94489288330078, + "logps/rejected": -94.8941650390625, + "loss": 0.6331, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1088109016418457, + "rewards/margins": 5.412596702575684, + "rewards/rejected": -2.303785562515259, + "step": 9884 + }, + { + "epoch": 2.47, + "grad_norm": 49.52263641357422, + "learning_rate": 5.0844896514399125e-06, + "logits/chosen": -0.44947370886802673, + "logits/rejected": -0.5416244268417358, + "logps/chosen": -59.60823059082031, + "logps/rejected": -90.00241088867188, + "loss": 0.9262, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.43434739112854, + "rewards/margins": 4.938440799713135, + "rewards/rejected": -2.504093647003174, + "step": 9885 + }, + { + "epoch": 2.47, + "grad_norm": 4.60452127456665, + "learning_rate": 5.083703774968584e-06, + "logits/chosen": -0.49141547083854675, + "logits/rejected": -0.533503532409668, + "logps/chosen": -51.642311096191406, + "logps/rejected": -97.06187438964844, + "loss": 0.6391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1629791259765625, + "rewards/margins": 5.600430965423584, + "rewards/rejected": -2.4374520778656006, + "step": 9886 + }, + { + "epoch": 2.47, + "grad_norm": 36.89310836791992, + "learning_rate": 5.0829178964288475e-06, + "logits/chosen": -0.5138669013977051, + "logits/rejected": -0.5413388013839722, + "logps/chosen": -58.10005187988281, + "logps/rejected": -104.39535522460938, + "loss": 0.8877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8528645038604736, + "rewards/margins": 5.7522687911987305, + "rewards/rejected": -2.899404525756836, + "step": 9887 + }, + { + "epoch": 2.47, + "grad_norm": 7.1477227210998535, + "learning_rate": 5.08213201584012e-06, + "logits/chosen": -0.4103201627731323, + "logits/rejected": -0.5311175584793091, + "logps/chosen": -60.419151306152344, + "logps/rejected": -102.81927490234375, + "loss": 0.6431, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.030331611633301, + "rewards/margins": 6.425714015960693, + "rewards/rejected": -3.3953824043273926, + "step": 9888 + }, + { + "epoch": 2.47, + "grad_norm": 4.211315631866455, + "learning_rate": 5.081346133221824e-06, + "logits/chosen": -0.47696465253829956, + "logits/rejected": -0.6093963980674744, + "logps/chosen": -60.749176025390625, + "logps/rejected": -90.73408508300781, + "loss": 0.6883, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8117282390594482, + "rewards/margins": 6.946981430053711, + "rewards/rejected": -4.135253429412842, + "step": 9889 + }, + { + "epoch": 2.47, + "grad_norm": 10.657737731933594, + "learning_rate": 5.080560248593378e-06, + "logits/chosen": -0.5999264121055603, + "logits/rejected": -0.6053603291511536, + "logps/chosen": -60.63230895996094, + "logps/rejected": -103.41363525390625, + "loss": 0.7136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7794508934020996, + "rewards/margins": 4.963674545288086, + "rewards/rejected": -2.1842234134674072, + "step": 9890 + }, + { + "epoch": 2.47, + "grad_norm": 11.938860893249512, + "learning_rate": 5.079774361974203e-06, + "logits/chosen": -0.5846018195152283, + "logits/rejected": -0.7040255069732666, + "logps/chosen": -75.12264251708984, + "logps/rejected": -89.44896697998047, + "loss": 0.7803, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.801952362060547, + "rewards/margins": 6.319230079650879, + "rewards/rejected": -3.5172781944274902, + "step": 9891 + }, + { + "epoch": 2.47, + "grad_norm": 9.831405639648438, + "learning_rate": 5.078988473383718e-06, + "logits/chosen": -0.4771616458892822, + "logits/rejected": -0.5409339070320129, + "logps/chosen": -53.9275016784668, + "logps/rejected": -98.23696899414062, + "loss": 0.689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9685444831848145, + "rewards/margins": 6.070716857910156, + "rewards/rejected": -3.1021721363067627, + "step": 9892 + }, + { + "epoch": 2.47, + "grad_norm": 3.7220633029937744, + "learning_rate": 5.078202582841344e-06, + "logits/chosen": -0.5376982688903809, + "logits/rejected": -0.6055693030357361, + "logps/chosen": -50.15116500854492, + "logps/rejected": -101.50154876708984, + "loss": 0.6303, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188920021057129, + "rewards/margins": 6.308229923248291, + "rewards/rejected": -3.119309663772583, + "step": 9893 + }, + { + "epoch": 2.48, + "grad_norm": 4.155606746673584, + "learning_rate": 5.077416690366502e-06, + "logits/chosen": -0.4900660514831543, + "logits/rejected": -0.5949746966362, + "logps/chosen": -62.22284698486328, + "logps/rejected": -92.43211364746094, + "loss": 0.7432, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.897193670272827, + "rewards/margins": 6.315885543823242, + "rewards/rejected": -3.418691396713257, + "step": 9894 + }, + { + "epoch": 2.48, + "grad_norm": 5.52959680557251, + "learning_rate": 5.076630795978611e-06, + "logits/chosen": -0.5232124924659729, + "logits/rejected": -0.6043077707290649, + "logps/chosen": -54.55596160888672, + "logps/rejected": -116.76351165771484, + "loss": 0.6796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.682676076889038, + "rewards/margins": 6.759517669677734, + "rewards/rejected": -4.076841354370117, + "step": 9895 + }, + { + "epoch": 2.48, + "grad_norm": 5.605751991271973, + "learning_rate": 5.07584489969709e-06, + "logits/chosen": -0.5705655813217163, + "logits/rejected": -0.6203250885009766, + "logps/chosen": -47.97722625732422, + "logps/rejected": -105.79426574707031, + "loss": 0.6596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8227434158325195, + "rewards/margins": 6.471580982208252, + "rewards/rejected": -3.648837089538574, + "step": 9896 + }, + { + "epoch": 2.48, + "grad_norm": 3.034085750579834, + "learning_rate": 5.075059001541361e-06, + "logits/chosen": -0.5545209050178528, + "logits/rejected": -0.6020262241363525, + "logps/chosen": -46.74338150024414, + "logps/rejected": -99.80657958984375, + "loss": 0.5478, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.080833911895752, + "rewards/margins": 7.001962184906006, + "rewards/rejected": -3.921128749847412, + "step": 9897 + }, + { + "epoch": 2.48, + "grad_norm": 13.341440200805664, + "learning_rate": 5.074273101530845e-06, + "logits/chosen": -0.4958694279193878, + "logits/rejected": -0.611958384513855, + "logps/chosen": -60.179935455322266, + "logps/rejected": -91.03623962402344, + "loss": 0.673, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2551817893981934, + "rewards/margins": 6.139401435852051, + "rewards/rejected": -2.8842201232910156, + "step": 9898 + }, + { + "epoch": 2.48, + "grad_norm": 3.454946994781494, + "learning_rate": 5.073487199684963e-06, + "logits/chosen": -0.5682698488235474, + "logits/rejected": -0.668384850025177, + "logps/chosen": -49.48017120361328, + "logps/rejected": -90.44149017333984, + "loss": 0.6171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0211455821990967, + "rewards/margins": 6.945246696472168, + "rewards/rejected": -3.924100637435913, + "step": 9899 + }, + { + "epoch": 2.48, + "grad_norm": 5.012363433837891, + "learning_rate": 5.0727012960231335e-06, + "logits/chosen": -0.5517746806144714, + "logits/rejected": -0.6013049483299255, + "logps/chosen": -52.22663879394531, + "logps/rejected": -91.17388916015625, + "loss": 0.6976, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.917294979095459, + "rewards/margins": 6.02025842666626, + "rewards/rejected": -3.102963447570801, + "step": 9900 + }, + { + "epoch": 2.48, + "grad_norm": 9.12009334564209, + "learning_rate": 5.071915390564777e-06, + "logits/chosen": -0.4744631052017212, + "logits/rejected": -0.5966620445251465, + "logps/chosen": -60.42100524902344, + "logps/rejected": -94.60619354248047, + "loss": 0.6343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0866947174072266, + "rewards/margins": 6.8048415184021, + "rewards/rejected": -3.718146800994873, + "step": 9901 + }, + { + "epoch": 2.48, + "grad_norm": 4.391355514526367, + "learning_rate": 5.0711294833293145e-06, + "logits/chosen": -0.4788025915622711, + "logits/rejected": -0.5410872101783752, + "logps/chosen": -68.02254486083984, + "logps/rejected": -97.97518920898438, + "loss": 0.6745, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9033901691436768, + "rewards/margins": 5.598665237426758, + "rewards/rejected": -2.69527530670166, + "step": 9902 + }, + { + "epoch": 2.48, + "grad_norm": 5.77194356918335, + "learning_rate": 5.070343574336167e-06, + "logits/chosen": -0.46806612610816956, + "logits/rejected": -0.5528044104576111, + "logps/chosen": -66.52093505859375, + "logps/rejected": -89.09003448486328, + "loss": 0.6985, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.552414655685425, + "rewards/margins": 5.887452602386475, + "rewards/rejected": -2.33503794670105, + "step": 9903 + }, + { + "epoch": 2.48, + "grad_norm": 6.0254716873168945, + "learning_rate": 5.0695576636047535e-06, + "logits/chosen": -0.5607589483261108, + "logits/rejected": -0.614335834980011, + "logps/chosen": -52.30582046508789, + "logps/rejected": -107.42843627929688, + "loss": 0.7432, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2109341621398926, + "rewards/margins": 4.964052200317383, + "rewards/rejected": -1.7531181573867798, + "step": 9904 + }, + { + "epoch": 2.48, + "grad_norm": 10.880108833312988, + "learning_rate": 5.0687717511545e-06, + "logits/chosen": -0.4724486470222473, + "logits/rejected": -0.6161121726036072, + "logps/chosen": -70.3214340209961, + "logps/rejected": -98.10017395019531, + "loss": 0.6391, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.020193099975586, + "rewards/margins": 7.185014724731445, + "rewards/rejected": -4.164822101593018, + "step": 9905 + }, + { + "epoch": 2.48, + "grad_norm": 3.1246609687805176, + "learning_rate": 5.067985837004819e-06, + "logits/chosen": -0.47568464279174805, + "logits/rejected": -0.5573743581771851, + "logps/chosen": -55.98058319091797, + "logps/rejected": -88.2132797241211, + "loss": 0.6761, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.312135696411133, + "rewards/margins": 6.293771266937256, + "rewards/rejected": -2.981635570526123, + "step": 9906 + }, + { + "epoch": 2.48, + "grad_norm": 4.072267532348633, + "learning_rate": 5.067199921175139e-06, + "logits/chosen": -0.5283653736114502, + "logits/rejected": -0.5881375670433044, + "logps/chosen": -50.652008056640625, + "logps/rejected": -102.70132446289062, + "loss": 0.6474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.266357183456421, + "rewards/margins": 7.042991638183594, + "rewards/rejected": -3.7766342163085938, + "step": 9907 + }, + { + "epoch": 2.48, + "grad_norm": 4.854830265045166, + "learning_rate": 5.0664140036848765e-06, + "logits/chosen": -0.5631444454193115, + "logits/rejected": -0.612949788570404, + "logps/chosen": -56.71022033691406, + "logps/rejected": -101.1337890625, + "loss": 0.7715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2727553844451904, + "rewards/margins": 5.881113529205322, + "rewards/rejected": -2.6083579063415527, + "step": 9908 + }, + { + "epoch": 2.48, + "grad_norm": 3.563055992126465, + "learning_rate": 5.065628084553453e-06, + "logits/chosen": -0.4964844584465027, + "logits/rejected": -0.5659321546554565, + "logps/chosen": -55.48546600341797, + "logps/rejected": -95.37931823730469, + "loss": 0.6658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1019647121429443, + "rewards/margins": 5.8137078285217285, + "rewards/rejected": -2.7117433547973633, + "step": 9909 + }, + { + "epoch": 2.48, + "grad_norm": 44.21131134033203, + "learning_rate": 5.06484216380029e-06, + "logits/chosen": -0.5234218835830688, + "logits/rejected": -0.6470577120780945, + "logps/chosen": -63.220394134521484, + "logps/rejected": -95.5109634399414, + "loss": 0.7935, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.874345064163208, + "rewards/margins": 5.776249408721924, + "rewards/rejected": -2.9019041061401367, + "step": 9910 + }, + { + "epoch": 2.48, + "grad_norm": 5.403100967407227, + "learning_rate": 5.064056241444807e-06, + "logits/chosen": -0.5047513246536255, + "logits/rejected": -0.6292712688446045, + "logps/chosen": -64.36274719238281, + "logps/rejected": -98.14331817626953, + "loss": 0.7103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1365716457366943, + "rewards/margins": 6.588029384613037, + "rewards/rejected": -3.451457977294922, + "step": 9911 + }, + { + "epoch": 2.48, + "grad_norm": 8.150824546813965, + "learning_rate": 5.0632703175064275e-06, + "logits/chosen": -0.46598339080810547, + "logits/rejected": -0.5489368438720703, + "logps/chosen": -63.16374206542969, + "logps/rejected": -92.75358581542969, + "loss": 0.6324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2656424045562744, + "rewards/margins": 5.692756175994873, + "rewards/rejected": -2.4271135330200195, + "step": 9912 + }, + { + "epoch": 2.48, + "grad_norm": 6.189955234527588, + "learning_rate": 5.062484392004569e-06, + "logits/chosen": -0.5074693560600281, + "logits/rejected": -0.5427509546279907, + "logps/chosen": -68.23822784423828, + "logps/rejected": -104.48974609375, + "loss": 0.8342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9570255279541016, + "rewards/margins": 5.603992462158203, + "rewards/rejected": -2.6469664573669434, + "step": 9913 + }, + { + "epoch": 2.48, + "grad_norm": 6.169958591461182, + "learning_rate": 5.061698464958659e-06, + "logits/chosen": -0.49890315532684326, + "logits/rejected": -0.6088680028915405, + "logps/chosen": -46.82443618774414, + "logps/rejected": -96.8603744506836, + "loss": 0.6341, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912874221801758, + "rewards/margins": 6.562992095947266, + "rewards/rejected": -3.650118112564087, + "step": 9914 + }, + { + "epoch": 2.48, + "grad_norm": 2.9126265048980713, + "learning_rate": 5.06091253638811e-06, + "logits/chosen": -0.5071808695793152, + "logits/rejected": -0.6103482842445374, + "logps/chosen": -57.57147216796875, + "logps/rejected": -98.24126434326172, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.875885486602783, + "rewards/margins": 6.783082962036133, + "rewards/rejected": -3.907197952270508, + "step": 9915 + }, + { + "epoch": 2.48, + "grad_norm": 6.118388652801514, + "learning_rate": 5.06012660631235e-06, + "logits/chosen": -0.49944740533828735, + "logits/rejected": -0.5751817226409912, + "logps/chosen": -57.43521499633789, + "logps/rejected": -100.64469909667969, + "loss": 0.7327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8616650104522705, + "rewards/margins": 5.688577651977539, + "rewards/rejected": -2.8269124031066895, + "step": 9916 + }, + { + "epoch": 2.48, + "grad_norm": 7.269924640655518, + "learning_rate": 5.059340674750796e-06, + "logits/chosen": -0.5013711452484131, + "logits/rejected": -0.5872637033462524, + "logps/chosen": -53.04075622558594, + "logps/rejected": -105.09008026123047, + "loss": 0.6973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8742825984954834, + "rewards/margins": 5.53732442855835, + "rewards/rejected": -2.663041591644287, + "step": 9917 + }, + { + "epoch": 2.48, + "grad_norm": 1.8653720617294312, + "learning_rate": 5.058554741722873e-06, + "logits/chosen": -0.5816823244094849, + "logits/rejected": -0.6233210563659668, + "logps/chosen": -43.24755859375, + "logps/rejected": -113.08665466308594, + "loss": 0.5611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0347299575805664, + "rewards/margins": 7.218810081481934, + "rewards/rejected": -4.184079647064209, + "step": 9918 + }, + { + "epoch": 2.48, + "grad_norm": 3.1269028186798096, + "learning_rate": 5.057768807247998e-06, + "logits/chosen": -0.46091407537460327, + "logits/rejected": -0.5688917636871338, + "logps/chosen": -51.992820739746094, + "logps/rejected": -98.9365005493164, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1855506896972656, + "rewards/margins": 6.498256683349609, + "rewards/rejected": -3.3127059936523438, + "step": 9919 + }, + { + "epoch": 2.48, + "grad_norm": 6.69579553604126, + "learning_rate": 5.056982871345595e-06, + "logits/chosen": -0.5160905122756958, + "logits/rejected": -0.570246160030365, + "logps/chosen": -45.2601203918457, + "logps/rejected": -87.85528564453125, + "loss": 0.6893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931220531463623, + "rewards/margins": 5.4419989585876465, + "rewards/rejected": -2.5107784271240234, + "step": 9920 + }, + { + "epoch": 2.48, + "grad_norm": 22.601062774658203, + "learning_rate": 5.056196934035083e-06, + "logits/chosen": -0.4879390001296997, + "logits/rejected": -0.575884222984314, + "logps/chosen": -64.35195922851562, + "logps/rejected": -87.14228820800781, + "loss": 0.7109, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930213212966919, + "rewards/margins": 5.261786460876465, + "rewards/rejected": -2.331573009490967, + "step": 9921 + }, + { + "epoch": 2.48, + "grad_norm": 7.461819171905518, + "learning_rate": 5.055410995335888e-06, + "logits/chosen": -0.5043792724609375, + "logits/rejected": -0.5822290778160095, + "logps/chosen": -55.25466537475586, + "logps/rejected": -95.57928466796875, + "loss": 0.6741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037931203842163, + "rewards/margins": 5.900859832763672, + "rewards/rejected": -2.862928867340088, + "step": 9922 + }, + { + "epoch": 2.48, + "grad_norm": 4.3680949211120605, + "learning_rate": 5.054625055267427e-06, + "logits/chosen": -0.5617736577987671, + "logits/rejected": -0.6956782937049866, + "logps/chosen": -50.512107849121094, + "logps/rejected": -108.31942749023438, + "loss": 0.6268, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.056246280670166, + "rewards/margins": 8.227296829223633, + "rewards/rejected": -5.171051025390625, + "step": 9923 + }, + { + "epoch": 2.48, + "grad_norm": 7.604948997497559, + "learning_rate": 5.053839113849123e-06, + "logits/chosen": -0.4880260229110718, + "logits/rejected": -0.5470876097679138, + "logps/chosen": -45.951053619384766, + "logps/rejected": -93.56123352050781, + "loss": 0.6114, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9108948707580566, + "rewards/margins": 5.570810794830322, + "rewards/rejected": -2.6599159240722656, + "step": 9924 + }, + { + "epoch": 2.48, + "grad_norm": 6.292874813079834, + "learning_rate": 5.053053171100396e-06, + "logits/chosen": -0.6682203412055969, + "logits/rejected": -0.7036912441253662, + "logps/chosen": -65.4980239868164, + "logps/rejected": -116.32501220703125, + "loss": 0.8925, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1457223892211914, + "rewards/margins": 5.647772789001465, + "rewards/rejected": -2.5020503997802734, + "step": 9925 + }, + { + "epoch": 2.48, + "grad_norm": 8.160676956176758, + "learning_rate": 5.052267227040672e-06, + "logits/chosen": -0.49945271015167236, + "logits/rejected": -0.5436750650405884, + "logps/chosen": -54.096256256103516, + "logps/rejected": -102.59329986572266, + "loss": 0.6293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.104752779006958, + "rewards/margins": 6.370800971984863, + "rewards/rejected": -3.2660484313964844, + "step": 9926 + }, + { + "epoch": 2.48, + "grad_norm": 10.979525566101074, + "learning_rate": 5.0514812816893645e-06, + "logits/chosen": -0.520117998123169, + "logits/rejected": -0.597872257232666, + "logps/chosen": -63.60404968261719, + "logps/rejected": -99.57843780517578, + "loss": 0.7037, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.768376350402832, + "rewards/margins": 5.478881359100342, + "rewards/rejected": -2.710505723953247, + "step": 9927 + }, + { + "epoch": 2.48, + "grad_norm": 3.3118748664855957, + "learning_rate": 5.050695335065903e-06, + "logits/chosen": -0.47481703758239746, + "logits/rejected": -0.6019483208656311, + "logps/chosen": -65.37164306640625, + "logps/rejected": -88.2021255493164, + "loss": 0.6372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.029370069503784, + "rewards/margins": 6.420281887054443, + "rewards/rejected": -3.390911102294922, + "step": 9928 + }, + { + "epoch": 2.48, + "grad_norm": 3.221254348754883, + "learning_rate": 5.049909387189704e-06, + "logits/chosen": -0.5229554772377014, + "logits/rejected": -0.5484369397163391, + "logps/chosen": -55.5804443359375, + "logps/rejected": -101.45887756347656, + "loss": 0.7091, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.592703104019165, + "rewards/margins": 6.909426689147949, + "rewards/rejected": -3.316723585128784, + "step": 9929 + }, + { + "epoch": 2.48, + "grad_norm": 7.0501556396484375, + "learning_rate": 5.049123438080192e-06, + "logits/chosen": -0.4737180173397064, + "logits/rejected": -0.5614757537841797, + "logps/chosen": -54.15342330932617, + "logps/rejected": -107.91246032714844, + "loss": 0.6459, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.264580488204956, + "rewards/margins": 7.518576622009277, + "rewards/rejected": -4.2539963722229, + "step": 9930 + }, + { + "epoch": 2.48, + "grad_norm": 8.30109691619873, + "learning_rate": 5.048337487756789e-06, + "logits/chosen": -0.48546645045280457, + "logits/rejected": -0.5673990249633789, + "logps/chosen": -59.33795928955078, + "logps/rejected": -90.18363952636719, + "loss": 0.7132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.992375373840332, + "rewards/margins": 5.967075347900391, + "rewards/rejected": -2.9746997356414795, + "step": 9931 + }, + { + "epoch": 2.48, + "grad_norm": 3.995922565460205, + "learning_rate": 5.047551536238913e-06, + "logits/chosen": -0.4759754240512848, + "logits/rejected": -0.5582839846611023, + "logps/chosen": -63.22484588623047, + "logps/rejected": -95.76649475097656, + "loss": 0.7048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1780028343200684, + "rewards/margins": 6.108541011810303, + "rewards/rejected": -2.9305384159088135, + "step": 9932 + }, + { + "epoch": 2.48, + "grad_norm": 2.6142375469207764, + "learning_rate": 5.04676558354599e-06, + "logits/chosen": -0.5306975841522217, + "logits/rejected": -0.6159011125564575, + "logps/chosen": -49.19706344604492, + "logps/rejected": -114.51338195800781, + "loss": 0.5525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.909346580505371, + "rewards/margins": 7.552412033081055, + "rewards/rejected": -4.643064975738525, + "step": 9933 + }, + { + "epoch": 2.49, + "grad_norm": 13.243413925170898, + "learning_rate": 5.045979629697437e-06, + "logits/chosen": -0.5548423528671265, + "logits/rejected": -0.6247603297233582, + "logps/chosen": -51.30255126953125, + "logps/rejected": -99.05366516113281, + "loss": 0.8801, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2742576599121094, + "rewards/margins": 6.5459465980529785, + "rewards/rejected": -3.2716891765594482, + "step": 9934 + }, + { + "epoch": 2.49, + "grad_norm": 5.454047679901123, + "learning_rate": 5.045193674712681e-06, + "logits/chosen": -0.523229718208313, + "logits/rejected": -0.6655873656272888, + "logps/chosen": -70.98033142089844, + "logps/rejected": -87.7220687866211, + "loss": 0.6999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.710008382797241, + "rewards/margins": 6.402525424957275, + "rewards/rejected": -3.692516326904297, + "step": 9935 + }, + { + "epoch": 2.49, + "grad_norm": 4.4205217361450195, + "learning_rate": 5.044407718611138e-06, + "logits/chosen": -0.4964042901992798, + "logits/rejected": -0.5702394843101501, + "logps/chosen": -56.98030090332031, + "logps/rejected": -92.06790924072266, + "loss": 0.7427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.151521682739258, + "rewards/margins": 6.182190418243408, + "rewards/rejected": -3.0306684970855713, + "step": 9936 + }, + { + "epoch": 2.49, + "grad_norm": 3.556995153427124, + "learning_rate": 5.043621761412237e-06, + "logits/chosen": -0.5383146405220032, + "logits/rejected": -0.5852924585342407, + "logps/chosen": -57.03857421875, + "logps/rejected": -113.42445373535156, + "loss": 0.6147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.815809965133667, + "rewards/margins": 6.884543418884277, + "rewards/rejected": -4.0687336921691895, + "step": 9937 + }, + { + "epoch": 2.49, + "grad_norm": 6.914399147033691, + "learning_rate": 5.042835803135394e-06, + "logits/chosen": -0.4868640601634979, + "logits/rejected": -0.5245382785797119, + "logps/chosen": -56.14746856689453, + "logps/rejected": -98.63829803466797, + "loss": 0.6944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0579302310943604, + "rewards/margins": 5.190269947052002, + "rewards/rejected": -2.1323392391204834, + "step": 9938 + }, + { + "epoch": 2.49, + "grad_norm": 3.6489145755767822, + "learning_rate": 5.042049843800032e-06, + "logits/chosen": -0.5678636431694031, + "logits/rejected": -0.7141135931015015, + "logps/chosen": -57.55976486206055, + "logps/rejected": -100.77338409423828, + "loss": 0.6049, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.166097640991211, + "rewards/margins": 7.251901149749756, + "rewards/rejected": -4.085803508758545, + "step": 9939 + }, + { + "epoch": 2.49, + "grad_norm": 5.994480609893799, + "learning_rate": 5.0412638834255755e-06, + "logits/chosen": -0.5801281332969666, + "logits/rejected": -0.6425538659095764, + "logps/chosen": -45.41291046142578, + "logps/rejected": -85.91656494140625, + "loss": 0.6902, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.310307025909424, + "rewards/margins": 6.218575477600098, + "rewards/rejected": -2.908268451690674, + "step": 9940 + }, + { + "epoch": 2.49, + "grad_norm": 3.7972400188446045, + "learning_rate": 5.040477922031442e-06, + "logits/chosen": -0.4900292158126831, + "logits/rejected": -0.6046504378318787, + "logps/chosen": -56.985538482666016, + "logps/rejected": -79.28956604003906, + "loss": 0.634, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0043981075286865, + "rewards/margins": 5.402172088623047, + "rewards/rejected": -2.3977742195129395, + "step": 9941 + }, + { + "epoch": 2.49, + "grad_norm": 8.426961898803711, + "learning_rate": 5.039691959637059e-06, + "logits/chosen": -0.5180442333221436, + "logits/rejected": -0.5765323638916016, + "logps/chosen": -63.67572784423828, + "logps/rejected": -99.49427032470703, + "loss": 0.8002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.645232915878296, + "rewards/margins": 5.794693470001221, + "rewards/rejected": -3.149460554122925, + "step": 9942 + }, + { + "epoch": 2.49, + "grad_norm": 5.5506815910339355, + "learning_rate": 5.038905996261844e-06, + "logits/chosen": -0.497941255569458, + "logits/rejected": -0.5985060334205627, + "logps/chosen": -52.651397705078125, + "logps/rejected": -99.07659149169922, + "loss": 0.6651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.24800443649292, + "rewards/margins": 6.1686811447143555, + "rewards/rejected": -2.9206769466400146, + "step": 9943 + }, + { + "epoch": 2.49, + "grad_norm": 8.345155715942383, + "learning_rate": 5.03812003192522e-06, + "logits/chosen": -0.53180992603302, + "logits/rejected": -0.6344022750854492, + "logps/chosen": -69.80911254882812, + "logps/rejected": -104.9041748046875, + "loss": 0.7474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8345565795898438, + "rewards/margins": 6.246496200561523, + "rewards/rejected": -3.4119396209716797, + "step": 9944 + }, + { + "epoch": 2.49, + "grad_norm": 5.111241817474365, + "learning_rate": 5.0373340666466094e-06, + "logits/chosen": -0.5406596064567566, + "logits/rejected": -0.6241007447242737, + "logps/chosen": -50.34202194213867, + "logps/rejected": -97.96307373046875, + "loss": 0.7586, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.007610321044922, + "rewards/margins": 5.813053607940674, + "rewards/rejected": -2.80544376373291, + "step": 9945 + }, + { + "epoch": 2.49, + "grad_norm": 4.052443504333496, + "learning_rate": 5.0365481004454355e-06, + "logits/chosen": -0.6422321200370789, + "logits/rejected": -0.69390469789505, + "logps/chosen": -52.45370864868164, + "logps/rejected": -90.81547546386719, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0532355308532715, + "rewards/margins": 4.8881754875183105, + "rewards/rejected": -1.8349401950836182, + "step": 9946 + }, + { + "epoch": 2.49, + "grad_norm": 4.287330150604248, + "learning_rate": 5.035762133341117e-06, + "logits/chosen": -0.5020601749420166, + "logits/rejected": -0.5706774592399597, + "logps/chosen": -53.86539077758789, + "logps/rejected": -88.08544921875, + "loss": 0.7172, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0897207260131836, + "rewards/margins": 5.324731826782227, + "rewards/rejected": -2.235011100769043, + "step": 9947 + }, + { + "epoch": 2.49, + "grad_norm": 4.752102851867676, + "learning_rate": 5.0349761653530805e-06, + "logits/chosen": -0.4810866117477417, + "logits/rejected": -0.5445871949195862, + "logps/chosen": -44.829132080078125, + "logps/rejected": -95.50968933105469, + "loss": 0.6223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.090301752090454, + "rewards/margins": 6.140963554382324, + "rewards/rejected": -3.050661563873291, + "step": 9948 + }, + { + "epoch": 2.49, + "grad_norm": 4.7883076667785645, + "learning_rate": 5.034190196500746e-06, + "logits/chosen": -0.40306782722473145, + "logits/rejected": -0.4982774257659912, + "logps/chosen": -63.99677658081055, + "logps/rejected": -98.3701171875, + "loss": 0.6895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.731013298034668, + "rewards/margins": 6.2526116371154785, + "rewards/rejected": -3.521599054336548, + "step": 9949 + }, + { + "epoch": 2.49, + "grad_norm": 4.5670366287231445, + "learning_rate": 5.0334042268035325e-06, + "logits/chosen": -0.5233654975891113, + "logits/rejected": -0.6102611422538757, + "logps/chosen": -52.893577575683594, + "logps/rejected": -85.51397705078125, + "loss": 0.6387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1464946269989014, + "rewards/margins": 6.087772846221924, + "rewards/rejected": -2.9412784576416016, + "step": 9950 + }, + { + "epoch": 2.49, + "grad_norm": 10.82695198059082, + "learning_rate": 5.032618256280868e-06, + "logits/chosen": -0.5128594636917114, + "logits/rejected": -0.6494263410568237, + "logps/chosen": -67.26605224609375, + "logps/rejected": -99.12816619873047, + "loss": 0.7861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6740732192993164, + "rewards/margins": 5.865623950958252, + "rewards/rejected": -3.1915504932403564, + "step": 9951 + }, + { + "epoch": 2.49, + "grad_norm": 15.990494728088379, + "learning_rate": 5.03183228495217e-06, + "logits/chosen": -0.5040500164031982, + "logits/rejected": -0.5948399305343628, + "logps/chosen": -57.08073806762695, + "logps/rejected": -89.69562530517578, + "loss": 0.7542, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7824950218200684, + "rewards/margins": 5.400766849517822, + "rewards/rejected": -2.6182708740234375, + "step": 9952 + }, + { + "epoch": 2.49, + "grad_norm": 10.74129867553711, + "learning_rate": 5.031046312836863e-06, + "logits/chosen": -0.45268967747688293, + "logits/rejected": -0.5813227295875549, + "logps/chosen": -64.78805541992188, + "logps/rejected": -86.64431762695312, + "loss": 0.9107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7539143562316895, + "rewards/margins": 6.412770748138428, + "rewards/rejected": -3.658855438232422, + "step": 9953 + }, + { + "epoch": 2.49, + "grad_norm": 5.818580627441406, + "learning_rate": 5.030260339954368e-06, + "logits/chosen": -0.4931385815143585, + "logits/rejected": -0.5815232396125793, + "logps/chosen": -53.7762565612793, + "logps/rejected": -90.55728912353516, + "loss": 0.7165, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.99603533744812, + "rewards/margins": 5.364224433898926, + "rewards/rejected": -2.3681888580322266, + "step": 9954 + }, + { + "epoch": 2.49, + "grad_norm": 3.9758825302124023, + "learning_rate": 5.029474366324109e-06, + "logits/chosen": -0.47167256474494934, + "logits/rejected": -0.5805287957191467, + "logps/chosen": -59.22160720825195, + "logps/rejected": -89.16830444335938, + "loss": 0.6517, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.203409194946289, + "rewards/margins": 5.477074146270752, + "rewards/rejected": -2.273665189743042, + "step": 9955 + }, + { + "epoch": 2.49, + "grad_norm": 18.734193801879883, + "learning_rate": 5.028688391965506e-06, + "logits/chosen": -0.5521910786628723, + "logits/rejected": -0.5969495177268982, + "logps/chosen": -69.66686248779297, + "logps/rejected": -87.72398376464844, + "loss": 0.8889, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.136962413787842, + "rewards/margins": 5.162515640258789, + "rewards/rejected": -2.025552749633789, + "step": 9956 + }, + { + "epoch": 2.49, + "grad_norm": 12.062352180480957, + "learning_rate": 5.027902416897982e-06, + "logits/chosen": -0.5220229625701904, + "logits/rejected": -0.5950286388397217, + "logps/chosen": -54.34611511230469, + "logps/rejected": -95.79362487792969, + "loss": 0.7698, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4924886226654053, + "rewards/margins": 5.939920425415039, + "rewards/rejected": -2.447432279586792, + "step": 9957 + }, + { + "epoch": 2.49, + "grad_norm": 5.9398908615112305, + "learning_rate": 5.02711644114096e-06, + "logits/chosen": -0.5215312838554382, + "logits/rejected": -0.5562810301780701, + "logps/chosen": -48.19417190551758, + "logps/rejected": -94.05035400390625, + "loss": 0.7093, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0694031715393066, + "rewards/margins": 5.192404270172119, + "rewards/rejected": -2.1230010986328125, + "step": 9958 + }, + { + "epoch": 2.49, + "grad_norm": 9.583549499511719, + "learning_rate": 5.0263304647138625e-06, + "logits/chosen": -0.6195237636566162, + "logits/rejected": -0.7059094905853271, + "logps/chosen": -50.15165710449219, + "logps/rejected": -89.7497787475586, + "loss": 0.6199, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.70468807220459, + "rewards/margins": 5.614643096923828, + "rewards/rejected": -2.9099555015563965, + "step": 9959 + }, + { + "epoch": 2.49, + "grad_norm": 14.225614547729492, + "learning_rate": 5.025544487636111e-06, + "logits/chosen": -0.43739384412765503, + "logits/rejected": -0.5001079440116882, + "logps/chosen": -66.30783081054688, + "logps/rejected": -88.24406433105469, + "loss": 0.7724, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.901937961578369, + "rewards/margins": 5.016152381896973, + "rewards/rejected": -2.1142146587371826, + "step": 9960 + }, + { + "epoch": 2.49, + "grad_norm": 2.4997453689575195, + "learning_rate": 5.024758509927128e-06, + "logits/chosen": -0.5120643377304077, + "logits/rejected": -0.5880483984947205, + "logps/chosen": -61.34614944458008, + "logps/rejected": -85.10662841796875, + "loss": 0.6678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9916882514953613, + "rewards/margins": 5.616658687591553, + "rewards/rejected": -2.6249704360961914, + "step": 9961 + }, + { + "epoch": 2.49, + "grad_norm": 3.9432101249694824, + "learning_rate": 5.023972531606335e-06, + "logits/chosen": -0.49682068824768066, + "logits/rejected": -0.5671882629394531, + "logps/chosen": -60.10041046142578, + "logps/rejected": -95.47428894042969, + "loss": 0.6942, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.927715301513672, + "rewards/margins": 5.9011688232421875, + "rewards/rejected": -2.973453998565674, + "step": 9962 + }, + { + "epoch": 2.49, + "grad_norm": 5.343127727508545, + "learning_rate": 5.023186552693156e-06, + "logits/chosen": -0.5625184774398804, + "logits/rejected": -0.6721022129058838, + "logps/chosen": -56.85099792480469, + "logps/rejected": -87.01409912109375, + "loss": 0.7191, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8317770957946777, + "rewards/margins": 6.134355545043945, + "rewards/rejected": -3.3025786876678467, + "step": 9963 + }, + { + "epoch": 2.49, + "grad_norm": 3.8416588306427, + "learning_rate": 5.022400573207012e-06, + "logits/chosen": -0.430441290140152, + "logits/rejected": -0.47230327129364014, + "logps/chosen": -58.5246467590332, + "logps/rejected": -95.30583953857422, + "loss": 0.6557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9349515438079834, + "rewards/margins": 5.824361801147461, + "rewards/rejected": -2.8894102573394775, + "step": 9964 + }, + { + "epoch": 2.49, + "grad_norm": 4.134758949279785, + "learning_rate": 5.021614593167327e-06, + "logits/chosen": -0.4951600432395935, + "logits/rejected": -0.5804124474525452, + "logps/chosen": -58.340232849121094, + "logps/rejected": -94.46321105957031, + "loss": 0.702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2748475074768066, + "rewards/margins": 6.456258296966553, + "rewards/rejected": -3.181410789489746, + "step": 9965 + }, + { + "epoch": 2.49, + "grad_norm": 8.975605964660645, + "learning_rate": 5.020828612593522e-06, + "logits/chosen": -0.4436110258102417, + "logits/rejected": -0.4861387014389038, + "logps/chosen": -52.960044860839844, + "logps/rejected": -96.32202911376953, + "loss": 0.5816, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4297397136688232, + "rewards/margins": 5.4738664627075195, + "rewards/rejected": -2.044126510620117, + "step": 9966 + }, + { + "epoch": 2.49, + "grad_norm": 4.73235559463501, + "learning_rate": 5.02004263150502e-06, + "logits/chosen": -0.5086413025856018, + "logits/rejected": -0.6079556345939636, + "logps/chosen": -61.89107894897461, + "logps/rejected": -100.28521728515625, + "loss": 0.7326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.214076042175293, + "rewards/margins": 6.9156951904296875, + "rewards/rejected": -3.7016186714172363, + "step": 9967 + }, + { + "epoch": 2.49, + "grad_norm": 10.262158393859863, + "learning_rate": 5.0192566499212425e-06, + "logits/chosen": -0.5698376297950745, + "logits/rejected": -0.6298074126243591, + "logps/chosen": -50.962520599365234, + "logps/rejected": -87.68407440185547, + "loss": 0.652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.703195571899414, + "rewards/margins": 5.492284774780273, + "rewards/rejected": -2.7890889644622803, + "step": 9968 + }, + { + "epoch": 2.49, + "grad_norm": 2.99898099899292, + "learning_rate": 5.0184706678616145e-06, + "logits/chosen": -0.49260345101356506, + "logits/rejected": -0.6078908443450928, + "logps/chosen": -49.8562126159668, + "logps/rejected": -98.58708190917969, + "loss": 0.6388, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2365574836730957, + "rewards/margins": 6.928873062133789, + "rewards/rejected": -3.6923162937164307, + "step": 9969 + }, + { + "epoch": 2.49, + "grad_norm": 4.899782657623291, + "learning_rate": 5.017684685345554e-06, + "logits/chosen": -0.5633296370506287, + "logits/rejected": -0.6316484212875366, + "logps/chosen": -57.52778625488281, + "logps/rejected": -111.78790283203125, + "loss": 0.6468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9962520599365234, + "rewards/margins": 6.924488067626953, + "rewards/rejected": -3.928236246109009, + "step": 9970 + }, + { + "epoch": 2.49, + "grad_norm": 7.1513671875, + "learning_rate": 5.016898702392489e-06, + "logits/chosen": -0.553117036819458, + "logits/rejected": -0.6238802671432495, + "logps/chosen": -51.74937438964844, + "logps/rejected": -102.45925903320312, + "loss": 0.6906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.960139274597168, + "rewards/margins": 6.861637592315674, + "rewards/rejected": -3.901498794555664, + "step": 9971 + }, + { + "epoch": 2.49, + "grad_norm": 11.391541481018066, + "learning_rate": 5.016112719021838e-06, + "logits/chosen": -0.4433530867099762, + "logits/rejected": -0.5029730200767517, + "logps/chosen": -51.73860168457031, + "logps/rejected": -95.08836364746094, + "loss": 0.6582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9929003715515137, + "rewards/margins": 5.496738910675049, + "rewards/rejected": -2.503838300704956, + "step": 9972 + }, + { + "epoch": 2.49, + "grad_norm": 3.9733688831329346, + "learning_rate": 5.015326735253023e-06, + "logits/chosen": -0.5527222156524658, + "logits/rejected": -0.5930885672569275, + "logps/chosen": -46.372650146484375, + "logps/rejected": -95.27080535888672, + "loss": 0.5528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1680614948272705, + "rewards/margins": 6.191910743713379, + "rewards/rejected": -3.0238490104675293, + "step": 9973 + }, + { + "epoch": 2.5, + "grad_norm": 6.444521903991699, + "learning_rate": 5.01454075110547e-06, + "logits/chosen": -0.4291098713874817, + "logits/rejected": -0.5092942118644714, + "logps/chosen": -54.05370330810547, + "logps/rejected": -90.7440185546875, + "loss": 0.7481, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9160585403442383, + "rewards/margins": 5.091375350952148, + "rewards/rejected": -2.17531681060791, + "step": 9974 + }, + { + "epoch": 2.5, + "grad_norm": 10.760553359985352, + "learning_rate": 5.013754766598599e-06, + "logits/chosen": -0.48161470890045166, + "logits/rejected": -0.5721619725227356, + "logps/chosen": -59.509159088134766, + "logps/rejected": -109.68733978271484, + "loss": 0.7478, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1606123447418213, + "rewards/margins": 7.519707679748535, + "rewards/rejected": -4.359095096588135, + "step": 9975 + }, + { + "epoch": 2.5, + "grad_norm": 5.905452251434326, + "learning_rate": 5.012968781751833e-06, + "logits/chosen": -0.5127623081207275, + "logits/rejected": -0.5184154510498047, + "logps/chosen": -45.883155822753906, + "logps/rejected": -112.9415512084961, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.089254379272461, + "rewards/margins": 5.570950984954834, + "rewards/rejected": -2.481696367263794, + "step": 9976 + }, + { + "epoch": 2.5, + "grad_norm": 6.8923845291137695, + "learning_rate": 5.012182796584594e-06, + "logits/chosen": -0.4146207571029663, + "logits/rejected": -0.5386788249015808, + "logps/chosen": -70.48741912841797, + "logps/rejected": -91.26200866699219, + "loss": 0.6831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.73787784576416, + "rewards/margins": 6.297995567321777, + "rewards/rejected": -3.5601179599761963, + "step": 9977 + }, + { + "epoch": 2.5, + "grad_norm": 5.9296393394470215, + "learning_rate": 5.011396811116306e-06, + "logits/chosen": -0.498838871717453, + "logits/rejected": -0.5792304277420044, + "logps/chosen": -65.4321517944336, + "logps/rejected": -106.38382720947266, + "loss": 0.734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.820408344268799, + "rewards/margins": 5.992559432983398, + "rewards/rejected": -3.1721508502960205, + "step": 9978 + }, + { + "epoch": 2.5, + "grad_norm": 5.023716449737549, + "learning_rate": 5.010610825366389e-06, + "logits/chosen": -0.538734495639801, + "logits/rejected": -0.6174986958503723, + "logps/chosen": -71.71075439453125, + "logps/rejected": -103.93607330322266, + "loss": 0.6906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.003859758377075, + "rewards/margins": 7.197821617126465, + "rewards/rejected": -4.193961143493652, + "step": 9979 + }, + { + "epoch": 2.5, + "grad_norm": 4.069208145141602, + "learning_rate": 5.009824839354269e-06, + "logits/chosen": -0.47440505027770996, + "logits/rejected": -0.5618312358856201, + "logps/chosen": -48.633827209472656, + "logps/rejected": -109.35112762451172, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0857975482940674, + "rewards/margins": 6.749396800994873, + "rewards/rejected": -3.6635992527008057, + "step": 9980 + }, + { + "epoch": 2.5, + "grad_norm": 5.4820942878723145, + "learning_rate": 5.009038853099367e-06, + "logits/chosen": -0.4797135889530182, + "logits/rejected": -0.5547008514404297, + "logps/chosen": -65.13995361328125, + "logps/rejected": -89.6808090209961, + "loss": 0.7318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0262351036071777, + "rewards/margins": 4.773406982421875, + "rewards/rejected": -1.7471715211868286, + "step": 9981 + }, + { + "epoch": 2.5, + "grad_norm": 14.225240707397461, + "learning_rate": 5.008252866621104e-06, + "logits/chosen": -0.5231440663337708, + "logits/rejected": -0.6423802375793457, + "logps/chosen": -52.788307189941406, + "logps/rejected": -96.76111602783203, + "loss": 0.632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904715061187744, + "rewards/margins": 6.098632335662842, + "rewards/rejected": -3.1939175128936768, + "step": 9982 + }, + { + "epoch": 2.5, + "grad_norm": 6.719607353210449, + "learning_rate": 5.007466879938906e-06, + "logits/chosen": -0.5467572212219238, + "logits/rejected": -0.6105621457099915, + "logps/chosen": -55.409034729003906, + "logps/rejected": -107.85162353515625, + "loss": 0.7795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0773069858551025, + "rewards/margins": 6.958109378814697, + "rewards/rejected": -3.8808021545410156, + "step": 9983 + }, + { + "epoch": 2.5, + "grad_norm": 9.636809349060059, + "learning_rate": 5.006680893072191e-06, + "logits/chosen": -0.4987402558326721, + "logits/rejected": -0.5984847545623779, + "logps/chosen": -45.54737854003906, + "logps/rejected": -87.51234436035156, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.076564311981201, + "rewards/margins": 6.56759786605835, + "rewards/rejected": -3.4910335540771484, + "step": 9984 + }, + { + "epoch": 2.5, + "grad_norm": 8.264249801635742, + "learning_rate": 5.0058949060403846e-06, + "logits/chosen": -0.5612590312957764, + "logits/rejected": -0.6208563446998596, + "logps/chosen": -62.239112854003906, + "logps/rejected": -94.5052261352539, + "loss": 0.7564, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.096299886703491, + "rewards/margins": 5.263317108154297, + "rewards/rejected": -2.1670172214508057, + "step": 9985 + }, + { + "epoch": 2.5, + "grad_norm": 5.793172359466553, + "learning_rate": 5.00510891886291e-06, + "logits/chosen": -0.47034066915512085, + "logits/rejected": -0.561882734298706, + "logps/chosen": -50.78322982788086, + "logps/rejected": -94.90940856933594, + "loss": 0.6485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1691505908966064, + "rewards/margins": 6.050469875335693, + "rewards/rejected": -2.881319284439087, + "step": 9986 + }, + { + "epoch": 2.5, + "grad_norm": 6.773842811584473, + "learning_rate": 5.004322931559188e-06, + "logits/chosen": -0.5272620320320129, + "logits/rejected": -0.5943605899810791, + "logps/chosen": -61.7197265625, + "logps/rejected": -90.16175079345703, + "loss": 0.84, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078444719314575, + "rewards/margins": 5.277439117431641, + "rewards/rejected": -2.1989943981170654, + "step": 9987 + }, + { + "epoch": 2.5, + "grad_norm": 2.7933573722839355, + "learning_rate": 5.0035369441486405e-06, + "logits/chosen": -0.5192229747772217, + "logits/rejected": -0.6431342959403992, + "logps/chosen": -61.44883728027344, + "logps/rejected": -86.83442687988281, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.095285654067993, + "rewards/margins": 6.883058071136475, + "rewards/rejected": -3.7877721786499023, + "step": 9988 + }, + { + "epoch": 2.5, + "grad_norm": 7.789211750030518, + "learning_rate": 5.002750956650693e-06, + "logits/chosen": -0.42128992080688477, + "logits/rejected": -0.5052081346511841, + "logps/chosen": -65.26242065429688, + "logps/rejected": -93.9012451171875, + "loss": 0.7321, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.22918963432312, + "rewards/margins": 5.227425575256348, + "rewards/rejected": -1.998236060142517, + "step": 9989 + }, + { + "epoch": 2.5, + "grad_norm": 8.01758098602295, + "learning_rate": 5.001964969084766e-06, + "logits/chosen": -0.536074697971344, + "logits/rejected": -0.6285066604614258, + "logps/chosen": -55.42715835571289, + "logps/rejected": -93.2733383178711, + "loss": 0.6776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9014365673065186, + "rewards/margins": 6.357349872589111, + "rewards/rejected": -3.4559130668640137, + "step": 9990 + }, + { + "epoch": 2.5, + "grad_norm": 6.135647773742676, + "learning_rate": 5.001178981470282e-06, + "logits/chosen": -0.5021394491195679, + "logits/rejected": -0.610465407371521, + "logps/chosen": -66.79318237304688, + "logps/rejected": -94.63632202148438, + "loss": 0.7694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7065072059631348, + "rewards/margins": 5.067603588104248, + "rewards/rejected": -2.3610963821411133, + "step": 9991 + }, + { + "epoch": 2.5, + "grad_norm": 4.107243537902832, + "learning_rate": 5.000392993826666e-06, + "logits/chosen": -0.4691101908683777, + "logits/rejected": -0.541634202003479, + "logps/chosen": -54.36176300048828, + "logps/rejected": -98.66354370117188, + "loss": 0.6707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.216491937637329, + "rewards/margins": 6.406421661376953, + "rewards/rejected": -3.1899304389953613, + "step": 9992 + }, + { + "epoch": 2.5, + "grad_norm": 4.338942527770996, + "learning_rate": 4.9996070061733366e-06, + "logits/chosen": -0.4634179472923279, + "logits/rejected": -0.5685580372810364, + "logps/chosen": -53.92847442626953, + "logps/rejected": -100.16258239746094, + "loss": 0.6075, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.067074775695801, + "rewards/margins": 6.413965225219727, + "rewards/rejected": -3.346890926361084, + "step": 9993 + }, + { + "epoch": 2.5, + "grad_norm": 2.2984495162963867, + "learning_rate": 4.998821018529719e-06, + "logits/chosen": -0.532579243183136, + "logits/rejected": -0.5937392115592957, + "logps/chosen": -58.94574737548828, + "logps/rejected": -96.8292236328125, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.131453514099121, + "rewards/margins": 5.955408096313477, + "rewards/rejected": -2.8239541053771973, + "step": 9994 + }, + { + "epoch": 2.5, + "grad_norm": 7.50449275970459, + "learning_rate": 4.998035030915236e-06, + "logits/chosen": -0.45583057403564453, + "logits/rejected": -0.5123947262763977, + "logps/chosen": -53.929203033447266, + "logps/rejected": -100.09635162353516, + "loss": 0.6766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1756067276000977, + "rewards/margins": 5.861438751220703, + "rewards/rejected": -2.6858325004577637, + "step": 9995 + }, + { + "epoch": 2.5, + "grad_norm": 2.7924718856811523, + "learning_rate": 4.9972490433493066e-06, + "logits/chosen": -0.5132495760917664, + "logits/rejected": -0.6110363006591797, + "logps/chosen": -45.12212371826172, + "logps/rejected": -83.13945770263672, + "loss": 0.589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9238123893737793, + "rewards/margins": 5.818482875823975, + "rewards/rejected": -2.8946709632873535, + "step": 9996 + }, + { + "epoch": 2.5, + "grad_norm": 8.054238319396973, + "learning_rate": 4.99646305585136e-06, + "logits/chosen": -0.49685177206993103, + "logits/rejected": -0.6162617802619934, + "logps/chosen": -58.25715637207031, + "logps/rejected": -88.80314636230469, + "loss": 0.6558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0337820053100586, + "rewards/margins": 6.31778621673584, + "rewards/rejected": -3.2840042114257812, + "step": 9997 + }, + { + "epoch": 2.5, + "grad_norm": 5.329650402069092, + "learning_rate": 4.995677068440814e-06, + "logits/chosen": -0.46649569272994995, + "logits/rejected": -0.5395090579986572, + "logps/chosen": -47.74700164794922, + "logps/rejected": -86.81233978271484, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1550133228302, + "rewards/margins": 5.596870422363281, + "rewards/rejected": -2.441856622695923, + "step": 9998 + }, + { + "epoch": 2.5, + "grad_norm": 3.862365961074829, + "learning_rate": 4.99489108113709e-06, + "logits/chosen": -0.5499914288520813, + "logits/rejected": -0.6786074638366699, + "logps/chosen": -52.956417083740234, + "logps/rejected": -91.26958465576172, + "loss": 0.5981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1672797203063965, + "rewards/margins": 7.157904148101807, + "rewards/rejected": -3.9906249046325684, + "step": 9999 + }, + { + "epoch": 2.5, + "grad_norm": 5.506633758544922, + "learning_rate": 4.994105093959617e-06, + "logits/chosen": -0.5290117859840393, + "logits/rejected": -0.5941410660743713, + "logps/chosen": -58.012794494628906, + "logps/rejected": -113.38951873779297, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8945670127868652, + "rewards/margins": 7.641791820526123, + "rewards/rejected": -4.747224807739258, + "step": 10000 + }, + { + "epoch": 2.5, + "grad_norm": 7.186264991760254, + "learning_rate": 4.99331910692781e-06, + "logits/chosen": -0.5278317928314209, + "logits/rejected": -0.6090831756591797, + "logps/chosen": -57.14491653442383, + "logps/rejected": -92.43987274169922, + "loss": 0.6623, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9287338256835938, + "rewards/margins": 6.143069267272949, + "rewards/rejected": -3.214334726333618, + "step": 10001 + }, + { + "epoch": 2.5, + "grad_norm": 6.428077220916748, + "learning_rate": 4.9925331200610956e-06, + "logits/chosen": -0.540634036064148, + "logits/rejected": -0.5408998727798462, + "logps/chosen": -45.12469482421875, + "logps/rejected": -104.99041748046875, + "loss": 0.7557, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7806084156036377, + "rewards/margins": 4.386871337890625, + "rewards/rejected": -1.6062630414962769, + "step": 10002 + }, + { + "epoch": 2.5, + "grad_norm": 6.289060592651367, + "learning_rate": 4.991747133378896e-06, + "logits/chosen": -0.5218534469604492, + "logits/rejected": -0.613910973072052, + "logps/chosen": -56.81919860839844, + "logps/rejected": -92.29943084716797, + "loss": 0.7014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.996441602706909, + "rewards/margins": 5.123889923095703, + "rewards/rejected": -2.127448320388794, + "step": 10003 + }, + { + "epoch": 2.5, + "grad_norm": 17.052505493164062, + "learning_rate": 4.9909611469006345e-06, + "logits/chosen": -0.5279893279075623, + "logits/rejected": -0.5903924703598022, + "logps/chosen": -52.19662857055664, + "logps/rejected": -103.89299774169922, + "loss": 0.756, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9119584560394287, + "rewards/margins": 7.182013511657715, + "rewards/rejected": -4.270054817199707, + "step": 10004 + }, + { + "epoch": 2.5, + "grad_norm": 8.441319465637207, + "learning_rate": 4.990175160645732e-06, + "logits/chosen": -0.34611839056015015, + "logits/rejected": -0.4334855079650879, + "logps/chosen": -70.54103088378906, + "logps/rejected": -117.82866668701172, + "loss": 0.6932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.207617998123169, + "rewards/margins": 6.577830791473389, + "rewards/rejected": -3.370213270187378, + "step": 10005 + }, + { + "epoch": 2.5, + "grad_norm": 4.198389053344727, + "learning_rate": 4.989389174633611e-06, + "logits/chosen": -0.5327169895172119, + "logits/rejected": -0.6062443852424622, + "logps/chosen": -45.323055267333984, + "logps/rejected": -98.1188735961914, + "loss": 0.7147, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.068448305130005, + "rewards/margins": 7.13001823425293, + "rewards/rejected": -4.061568260192871, + "step": 10006 + }, + { + "epoch": 2.5, + "grad_norm": 3.4252493381500244, + "learning_rate": 4.988603188883696e-06, + "logits/chosen": -0.524297297000885, + "logits/rejected": -0.5912719368934631, + "logps/chosen": -48.60706329345703, + "logps/rejected": -88.33068084716797, + "loss": 0.5559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.028130531311035, + "rewards/margins": 5.978460311889648, + "rewards/rejected": -2.9503304958343506, + "step": 10007 + }, + { + "epoch": 2.5, + "grad_norm": 11.332322120666504, + "learning_rate": 4.9878172034154076e-06, + "logits/chosen": -0.4550435543060303, + "logits/rejected": -0.5690671801567078, + "logps/chosen": -60.314151763916016, + "logps/rejected": -85.16861724853516, + "loss": 0.7038, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.881951332092285, + "rewards/margins": 5.909703731536865, + "rewards/rejected": -3.0277528762817383, + "step": 10008 + }, + { + "epoch": 2.5, + "grad_norm": 12.15938949584961, + "learning_rate": 4.987031218248169e-06, + "logits/chosen": -0.5001132488250732, + "logits/rejected": -0.5137295126914978, + "logps/chosen": -50.16725158691406, + "logps/rejected": -111.15975189208984, + "loss": 0.7334, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.261164903640747, + "rewards/margins": 5.27208948135376, + "rewards/rejected": -2.0109241008758545, + "step": 10009 + }, + { + "epoch": 2.5, + "grad_norm": 3.3155219554901123, + "learning_rate": 4.986245233401403e-06, + "logits/chosen": -0.5924746990203857, + "logits/rejected": -0.6940714120864868, + "logps/chosen": -50.42715072631836, + "logps/rejected": -77.41686248779297, + "loss": 0.6328, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.042168140411377, + "rewards/margins": 7.083884239196777, + "rewards/rejected": -4.041715621948242, + "step": 10010 + }, + { + "epoch": 2.5, + "grad_norm": 3.8677361011505127, + "learning_rate": 4.985459248894532e-06, + "logits/chosen": -0.5665766000747681, + "logits/rejected": -0.6355730891227722, + "logps/chosen": -71.31588745117188, + "logps/rejected": -105.20870971679688, + "loss": 0.6624, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3436105251312256, + "rewards/margins": 6.894644737243652, + "rewards/rejected": -3.551034450531006, + "step": 10011 + }, + { + "epoch": 2.5, + "grad_norm": 4.87780237197876, + "learning_rate": 4.984673264746978e-06, + "logits/chosen": -0.48821625113487244, + "logits/rejected": -0.5356301665306091, + "logps/chosen": -49.57379913330078, + "logps/rejected": -115.33622741699219, + "loss": 0.6985, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.317798137664795, + "rewards/margins": 6.314459800720215, + "rewards/rejected": -2.99666166305542, + "step": 10012 + }, + { + "epoch": 2.5, + "grad_norm": 2.418985366821289, + "learning_rate": 4.983887280978163e-06, + "logits/chosen": -0.5636751651763916, + "logits/rejected": -0.6727991104125977, + "logps/chosen": -49.232765197753906, + "logps/rejected": -85.45326232910156, + "loss": 0.6318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.600574016571045, + "rewards/margins": 7.241475582122803, + "rewards/rejected": -3.640901565551758, + "step": 10013 + }, + { + "epoch": 2.51, + "grad_norm": 6.269966125488281, + "learning_rate": 4.983101297607514e-06, + "logits/chosen": -0.619094729423523, + "logits/rejected": -0.6820322871208191, + "logps/chosen": -46.633392333984375, + "logps/rejected": -86.74201965332031, + "loss": 0.7416, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.119229316711426, + "rewards/margins": 6.010527610778809, + "rewards/rejected": -2.89129900932312, + "step": 10014 + }, + { + "epoch": 2.51, + "grad_norm": 3.467702865600586, + "learning_rate": 4.982315314654447e-06, + "logits/chosen": -0.540529191493988, + "logits/rejected": -0.6014240980148315, + "logps/chosen": -68.26461791992188, + "logps/rejected": -110.02458953857422, + "loss": 0.7201, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9874064922332764, + "rewards/margins": 6.464550495147705, + "rewards/rejected": -3.4771437644958496, + "step": 10015 + }, + { + "epoch": 2.51, + "grad_norm": 4.049666881561279, + "learning_rate": 4.981529332138387e-06, + "logits/chosen": -0.5449245572090149, + "logits/rejected": -0.6495158076286316, + "logps/chosen": -53.36812210083008, + "logps/rejected": -94.92312622070312, + "loss": 0.5912, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.228536367416382, + "rewards/margins": 6.8923516273498535, + "rewards/rejected": -3.6638145446777344, + "step": 10016 + }, + { + "epoch": 2.51, + "grad_norm": 5.2651848793029785, + "learning_rate": 4.9807433500787575e-06, + "logits/chosen": -0.6038945317268372, + "logits/rejected": -0.6785827279090881, + "logps/chosen": -54.7568359375, + "logps/rejected": -100.49444580078125, + "loss": 0.6364, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1625800132751465, + "rewards/margins": 6.666637420654297, + "rewards/rejected": -3.5040574073791504, + "step": 10017 + }, + { + "epoch": 2.51, + "grad_norm": 7.994479179382324, + "learning_rate": 4.979957368494982e-06, + "logits/chosen": -0.5963380336761475, + "logits/rejected": -0.6853633522987366, + "logps/chosen": -52.76435852050781, + "logps/rejected": -83.68925476074219, + "loss": 0.6519, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.790921449661255, + "rewards/margins": 5.370329856872559, + "rewards/rejected": -2.579409122467041, + "step": 10018 + }, + { + "epoch": 2.51, + "grad_norm": 5.468685626983643, + "learning_rate": 4.979171387406479e-06, + "logits/chosen": -0.5114585161209106, + "logits/rejected": -0.6001255512237549, + "logps/chosen": -59.86259078979492, + "logps/rejected": -103.7572250366211, + "loss": 0.6772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6827988624572754, + "rewards/margins": 6.589345455169678, + "rewards/rejected": -3.9065468311309814, + "step": 10019 + }, + { + "epoch": 2.51, + "grad_norm": 2.880514144897461, + "learning_rate": 4.9783854068326735e-06, + "logits/chosen": -0.5440176725387573, + "logits/rejected": -0.5732845067977905, + "logps/chosen": -59.01689147949219, + "logps/rejected": -110.35956573486328, + "loss": 0.6739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1671366691589355, + "rewards/margins": 6.944843292236328, + "rewards/rejected": -3.7777068614959717, + "step": 10020 + }, + { + "epoch": 2.51, + "grad_norm": 6.586736679077148, + "learning_rate": 4.977599426792989e-06, + "logits/chosen": -0.444622665643692, + "logits/rejected": -0.5807691216468811, + "logps/chosen": -80.42562103271484, + "logps/rejected": -97.43070220947266, + "loss": 0.6772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.534606456756592, + "rewards/margins": 6.20027494430542, + "rewards/rejected": -3.6656689643859863, + "step": 10021 + }, + { + "epoch": 2.51, + "grad_norm": 12.698559761047363, + "learning_rate": 4.976813447306846e-06, + "logits/chosen": -0.5337961316108704, + "logits/rejected": -0.6106530427932739, + "logps/chosen": -52.72499465942383, + "logps/rejected": -117.79609680175781, + "loss": 0.5964, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2040817737579346, + "rewards/margins": 7.204708576202393, + "rewards/rejected": -4.000627517700195, + "step": 10022 + }, + { + "epoch": 2.51, + "grad_norm": 4.843943119049072, + "learning_rate": 4.976027468393667e-06, + "logits/chosen": -0.46451622247695923, + "logits/rejected": -0.5109857320785522, + "logps/chosen": -61.445552825927734, + "logps/rejected": -114.74897766113281, + "loss": 0.7163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.999100923538208, + "rewards/margins": 5.817684173583984, + "rewards/rejected": -2.8185839653015137, + "step": 10023 + }, + { + "epoch": 2.51, + "grad_norm": 4.530150413513184, + "learning_rate": 4.975241490072874e-06, + "logits/chosen": -0.5322701334953308, + "logits/rejected": -0.5708449482917786, + "logps/chosen": -47.496341705322266, + "logps/rejected": -108.44718170166016, + "loss": 0.6249, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0229296684265137, + "rewards/margins": 5.6533355712890625, + "rewards/rejected": -2.630406618118286, + "step": 10024 + }, + { + "epoch": 2.51, + "grad_norm": 3.682643175125122, + "learning_rate": 4.974455512363891e-06, + "logits/chosen": -0.5190527439117432, + "logits/rejected": -0.619779109954834, + "logps/chosen": -55.517635345458984, + "logps/rejected": -96.75140380859375, + "loss": 0.5478, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.256882667541504, + "rewards/margins": 7.054045677185059, + "rewards/rejected": -3.7971625328063965, + "step": 10025 + }, + { + "epoch": 2.51, + "grad_norm": 10.498847007751465, + "learning_rate": 4.97366953528614e-06, + "logits/chosen": -0.4617748558521271, + "logits/rejected": -0.5328274369239807, + "logps/chosen": -64.5546875, + "logps/rejected": -92.78718566894531, + "loss": 0.8252, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8993985652923584, + "rewards/margins": 4.918915271759033, + "rewards/rejected": -2.019516944885254, + "step": 10026 + }, + { + "epoch": 2.51, + "grad_norm": 17.33133888244629, + "learning_rate": 4.9728835588590415e-06, + "logits/chosen": -0.5000994801521301, + "logits/rejected": -0.5632126331329346, + "logps/chosen": -56.865753173828125, + "logps/rejected": -87.80966186523438, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.950706720352173, + "rewards/margins": 5.294763088226318, + "rewards/rejected": -2.3440563678741455, + "step": 10027 + }, + { + "epoch": 2.51, + "grad_norm": 5.442707061767578, + "learning_rate": 4.972097583102019e-06, + "logits/chosen": -0.5560799837112427, + "logits/rejected": -0.6629768013954163, + "logps/chosen": -63.436954498291016, + "logps/rejected": -88.51947021484375, + "loss": 0.7194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9134202003479004, + "rewards/margins": 6.297792911529541, + "rewards/rejected": -3.3843724727630615, + "step": 10028 + }, + { + "epoch": 2.51, + "grad_norm": 6.49004602432251, + "learning_rate": 4.971311608034497e-06, + "logits/chosen": -0.5610834956169128, + "logits/rejected": -0.5850108861923218, + "logps/chosen": -53.69889831542969, + "logps/rejected": -105.65617370605469, + "loss": 0.6471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.504070520401001, + "rewards/margins": 5.820372581481934, + "rewards/rejected": -2.3163022994995117, + "step": 10029 + }, + { + "epoch": 2.51, + "grad_norm": 4.9965081214904785, + "learning_rate": 4.970525633675893e-06, + "logits/chosen": -0.4223547577857971, + "logits/rejected": -0.5908686518669128, + "logps/chosen": -60.420928955078125, + "logps/rejected": -103.0344467163086, + "loss": 0.6199, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9382410049438477, + "rewards/margins": 6.421520709991455, + "rewards/rejected": -3.4832797050476074, + "step": 10030 + }, + { + "epoch": 2.51, + "grad_norm": 7.545833587646484, + "learning_rate": 4.9697396600456325e-06, + "logits/chosen": -0.4424613118171692, + "logits/rejected": -0.539879560470581, + "logps/chosen": -59.29561233520508, + "logps/rejected": -96.69965362548828, + "loss": 0.6871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.19016695022583, + "rewards/margins": 5.897518157958984, + "rewards/rejected": -2.707350730895996, + "step": 10031 + }, + { + "epoch": 2.51, + "grad_norm": 5.106191635131836, + "learning_rate": 4.96895368716314e-06, + "logits/chosen": -0.5133422017097473, + "logits/rejected": -0.6259678602218628, + "logps/chosen": -60.36259460449219, + "logps/rejected": -103.04006958007812, + "loss": 0.6584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.075117349624634, + "rewards/margins": 7.618633270263672, + "rewards/rejected": -4.543516159057617, + "step": 10032 + }, + { + "epoch": 2.51, + "grad_norm": 5.184979438781738, + "learning_rate": 4.968167715047831e-06, + "logits/chosen": -0.6134209632873535, + "logits/rejected": -0.7021658420562744, + "logps/chosen": -50.39836120605469, + "logps/rejected": -89.2680435180664, + "loss": 0.7773, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.222959041595459, + "rewards/margins": 6.095378875732422, + "rewards/rejected": -2.872420072555542, + "step": 10033 + }, + { + "epoch": 2.51, + "grad_norm": 3.9787325859069824, + "learning_rate": 4.967381743719133e-06, + "logits/chosen": -0.5770935416221619, + "logits/rejected": -0.6441116333007812, + "logps/chosen": -50.49382019042969, + "logps/rejected": -106.40792846679688, + "loss": 0.6396, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0382556915283203, + "rewards/margins": 6.952733039855957, + "rewards/rejected": -3.9144771099090576, + "step": 10034 + }, + { + "epoch": 2.51, + "grad_norm": 4.32419490814209, + "learning_rate": 4.966595773196467e-06, + "logits/chosen": -0.4352177381515503, + "logits/rejected": -0.5503359436988831, + "logps/chosen": -54.50810623168945, + "logps/rejected": -103.31875610351562, + "loss": 0.5952, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0987141132354736, + "rewards/margins": 7.779218673706055, + "rewards/rejected": -4.68050479888916, + "step": 10035 + }, + { + "epoch": 2.51, + "grad_norm": 3.3675198554992676, + "learning_rate": 4.9658098034992565e-06, + "logits/chosen": -0.5848957896232605, + "logits/rejected": -0.6143200993537903, + "logps/chosen": -60.283512115478516, + "logps/rejected": -108.68956756591797, + "loss": 0.7195, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.121211290359497, + "rewards/margins": 5.942087650299072, + "rewards/rejected": -2.8208765983581543, + "step": 10036 + }, + { + "epoch": 2.51, + "grad_norm": 4.624424457550049, + "learning_rate": 4.965023834646921e-06, + "logits/chosen": -0.48327597975730896, + "logits/rejected": -0.5588363409042358, + "logps/chosen": -58.25566864013672, + "logps/rejected": -111.2577133178711, + "loss": 0.6298, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8729708194732666, + "rewards/margins": 6.383479595184326, + "rewards/rejected": -3.5105090141296387, + "step": 10037 + }, + { + "epoch": 2.51, + "grad_norm": 4.953720569610596, + "learning_rate": 4.964237866658883e-06, + "logits/chosen": -0.5206180810928345, + "logits/rejected": -0.5439029932022095, + "logps/chosen": -52.764469146728516, + "logps/rejected": -107.58601379394531, + "loss": 0.6143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.988173484802246, + "rewards/margins": 6.601743698120117, + "rewards/rejected": -3.6135706901550293, + "step": 10038 + }, + { + "epoch": 2.51, + "grad_norm": 3.939128875732422, + "learning_rate": 4.963451899554567e-06, + "logits/chosen": -0.5128016471862793, + "logits/rejected": -0.6129406094551086, + "logps/chosen": -58.99275207519531, + "logps/rejected": -112.33830261230469, + "loss": 0.7369, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7184395790100098, + "rewards/margins": 5.656036376953125, + "rewards/rejected": -2.937596559524536, + "step": 10039 + }, + { + "epoch": 2.51, + "grad_norm": 16.962158203125, + "learning_rate": 4.962665933353392e-06, + "logits/chosen": -0.5724551677703857, + "logits/rejected": -0.6417285203933716, + "logps/chosen": -49.61970901489258, + "logps/rejected": -88.46444702148438, + "loss": 0.7406, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8931469917297363, + "rewards/margins": 5.418319225311279, + "rewards/rejected": -2.525172233581543, + "step": 10040 + }, + { + "epoch": 2.51, + "grad_norm": 8.113471984863281, + "learning_rate": 4.961879968074782e-06, + "logits/chosen": -0.4977233409881592, + "logits/rejected": -0.5530933141708374, + "logps/chosen": -49.64868927001953, + "logps/rejected": -92.3448486328125, + "loss": 0.6613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1942920684814453, + "rewards/margins": 5.988979339599609, + "rewards/rejected": -2.7946863174438477, + "step": 10041 + }, + { + "epoch": 2.51, + "grad_norm": 6.91703987121582, + "learning_rate": 4.961094003738156e-06, + "logits/chosen": -0.546967625617981, + "logits/rejected": -0.6401699185371399, + "logps/chosen": -58.571685791015625, + "logps/rejected": -109.79083251953125, + "loss": 0.7069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0495550632476807, + "rewards/margins": 6.342385292053223, + "rewards/rejected": -3.292830228805542, + "step": 10042 + }, + { + "epoch": 2.51, + "grad_norm": 1.9816985130310059, + "learning_rate": 4.9603080403629436e-06, + "logits/chosen": -0.5916615724563599, + "logits/rejected": -0.6741402745246887, + "logps/chosen": -58.023677825927734, + "logps/rejected": -111.47908782958984, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9813711643218994, + "rewards/margins": 7.639379501342773, + "rewards/rejected": -4.658007621765137, + "step": 10043 + }, + { + "epoch": 2.51, + "grad_norm": 10.249648094177246, + "learning_rate": 4.9595220779685585e-06, + "logits/chosen": -0.4399191737174988, + "logits/rejected": -0.5051661729812622, + "logps/chosen": -57.283546447753906, + "logps/rejected": -77.40512084960938, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.846804141998291, + "rewards/margins": 5.23818826675415, + "rewards/rejected": -2.3913838863372803, + "step": 10044 + }, + { + "epoch": 2.51, + "grad_norm": 5.216222763061523, + "learning_rate": 4.958736116574426e-06, + "logits/chosen": -0.4342661499977112, + "logits/rejected": -0.5127667188644409, + "logps/chosen": -50.901153564453125, + "logps/rejected": -94.07612609863281, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.164771556854248, + "rewards/margins": 5.96347713470459, + "rewards/rejected": -2.7987053394317627, + "step": 10045 + }, + { + "epoch": 2.51, + "grad_norm": 3.114217519760132, + "learning_rate": 4.957950156199967e-06, + "logits/chosen": -0.4776703119277954, + "logits/rejected": -0.5209518671035767, + "logps/chosen": -60.27378845214844, + "logps/rejected": -96.86258697509766, + "loss": 0.676, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026524543762207, + "rewards/margins": 5.557989120483398, + "rewards/rejected": -2.5314645767211914, + "step": 10046 + }, + { + "epoch": 2.51, + "grad_norm": 4.3296942710876465, + "learning_rate": 4.957164196864608e-06, + "logits/chosen": -0.4842000901699066, + "logits/rejected": -0.5881121754646301, + "logps/chosen": -52.543949127197266, + "logps/rejected": -87.3292465209961, + "loss": 0.5854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074321746826172, + "rewards/margins": 6.760424613952637, + "rewards/rejected": -3.686103582382202, + "step": 10047 + }, + { + "epoch": 2.51, + "grad_norm": 1.9715973138809204, + "learning_rate": 4.956378238587764e-06, + "logits/chosen": -0.5384637117385864, + "logits/rejected": -0.5985947847366333, + "logps/chosen": -60.626556396484375, + "logps/rejected": -128.10440063476562, + "loss": 0.6054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9441630840301514, + "rewards/margins": 7.4863386154174805, + "rewards/rejected": -4.54217529296875, + "step": 10048 + }, + { + "epoch": 2.51, + "grad_norm": 6.055789470672607, + "learning_rate": 4.955592281388862e-06, + "logits/chosen": -0.429674357175827, + "logits/rejected": -0.46459853649139404, + "logps/chosen": -69.24243927001953, + "logps/rejected": -118.2586441040039, + "loss": 0.7339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7803170680999756, + "rewards/margins": 6.663223743438721, + "rewards/rejected": -3.8829071521759033, + "step": 10049 + }, + { + "epoch": 2.51, + "grad_norm": 15.256077766418457, + "learning_rate": 4.9548063252873216e-06, + "logits/chosen": -0.5088474154472351, + "logits/rejected": -0.593979001045227, + "logps/chosen": -66.14665222167969, + "logps/rejected": -125.71025085449219, + "loss": 0.7305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1286306381225586, + "rewards/margins": 7.785414218902588, + "rewards/rejected": -4.656783103942871, + "step": 10050 + }, + { + "epoch": 2.51, + "grad_norm": 4.330202102661133, + "learning_rate": 4.954020370302564e-06, + "logits/chosen": -0.49604272842407227, + "logits/rejected": -0.6109161376953125, + "logps/chosen": -64.17261505126953, + "logps/rejected": -94.94904327392578, + "loss": 0.6568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.829041004180908, + "rewards/margins": 6.584268569946289, + "rewards/rejected": -3.755227565765381, + "step": 10051 + }, + { + "epoch": 2.51, + "grad_norm": 7.890018939971924, + "learning_rate": 4.953234416454013e-06, + "logits/chosen": -0.522210419178009, + "logits/rejected": -0.6354591250419617, + "logps/chosen": -49.26234436035156, + "logps/rejected": -93.88309478759766, + "loss": 0.5952, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.918604850769043, + "rewards/margins": 6.81729793548584, + "rewards/rejected": -3.8986926078796387, + "step": 10052 + }, + { + "epoch": 2.51, + "grad_norm": 7.453836441040039, + "learning_rate": 4.952448463761088e-06, + "logits/chosen": -0.5748424530029297, + "logits/rejected": -0.6795713901519775, + "logps/chosen": -55.01376724243164, + "logps/rejected": -101.25409698486328, + "loss": 0.6223, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85722279548645, + "rewards/margins": 6.905928134918213, + "rewards/rejected": -4.048705101013184, + "step": 10053 + }, + { + "epoch": 2.52, + "grad_norm": 3.25362491607666, + "learning_rate": 4.9516625122432136e-06, + "logits/chosen": -0.5172934532165527, + "logits/rejected": -0.6117962598800659, + "logps/chosen": -55.297080993652344, + "logps/rejected": -102.24221801757812, + "loss": 0.6535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2976040840148926, + "rewards/margins": 6.596402645111084, + "rewards/rejected": -3.2987990379333496, + "step": 10054 + }, + { + "epoch": 2.52, + "grad_norm": 6.4376115798950195, + "learning_rate": 4.950876561919809e-06, + "logits/chosen": -0.5073503255844116, + "logits/rejected": -0.6088980436325073, + "logps/chosen": -53.78208541870117, + "logps/rejected": -88.89100646972656, + "loss": 0.7173, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.773998260498047, + "rewards/margins": 5.916120529174805, + "rewards/rejected": -3.142122268676758, + "step": 10055 + }, + { + "epoch": 2.52, + "grad_norm": 11.19313907623291, + "learning_rate": 4.950090612810295e-06, + "logits/chosen": -0.46071264147758484, + "logits/rejected": -0.5694482326507568, + "logps/chosen": -58.9702033996582, + "logps/rejected": -91.9825210571289, + "loss": 0.7211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1719846725463867, + "rewards/margins": 6.5104475021362305, + "rewards/rejected": -3.3384628295898438, + "step": 10056 + }, + { + "epoch": 2.52, + "grad_norm": 4.71150541305542, + "learning_rate": 4.9493046649341e-06, + "logits/chosen": -0.5338435173034668, + "logits/rejected": -0.6589163541793823, + "logps/chosen": -58.526763916015625, + "logps/rejected": -82.78093719482422, + "loss": 0.6968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1704797744750977, + "rewards/margins": 6.436931133270264, + "rewards/rejected": -3.266451120376587, + "step": 10057 + }, + { + "epoch": 2.52, + "grad_norm": 3.365954875946045, + "learning_rate": 4.948518718310637e-06, + "logits/chosen": -0.5408494472503662, + "logits/rejected": -0.6005735397338867, + "logps/chosen": -51.06572723388672, + "logps/rejected": -100.61573028564453, + "loss": 0.5912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0837972164154053, + "rewards/margins": 6.749090194702148, + "rewards/rejected": -3.6652941703796387, + "step": 10058 + }, + { + "epoch": 2.52, + "grad_norm": 9.042224884033203, + "learning_rate": 4.947732772959331e-06, + "logits/chosen": -0.47059568762779236, + "logits/rejected": -0.4998393952846527, + "logps/chosen": -55.10518264770508, + "logps/rejected": -100.2880859375, + "loss": 0.67, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.137650966644287, + "rewards/margins": 5.637931823730469, + "rewards/rejected": -2.5002801418304443, + "step": 10059 + }, + { + "epoch": 2.52, + "grad_norm": 6.175361633300781, + "learning_rate": 4.946946828899604e-06, + "logits/chosen": -0.47388505935668945, + "logits/rejected": -0.5318973660469055, + "logps/chosen": -51.88704299926758, + "logps/rejected": -96.93507385253906, + "loss": 0.6338, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8506267070770264, + "rewards/margins": 6.974173545837402, + "rewards/rejected": -4.123546123504639, + "step": 10060 + }, + { + "epoch": 2.52, + "grad_norm": 6.010364055633545, + "learning_rate": 4.9461608861508785e-06, + "logits/chosen": -0.5956276655197144, + "logits/rejected": -0.6783527135848999, + "logps/chosen": -44.37710952758789, + "logps/rejected": -83.06626892089844, + "loss": 0.6788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1007161140441895, + "rewards/margins": 5.859405994415283, + "rewards/rejected": -2.758690118789673, + "step": 10061 + }, + { + "epoch": 2.52, + "grad_norm": 5.206772327423096, + "learning_rate": 4.945374944732574e-06, + "logits/chosen": -0.5760040283203125, + "logits/rejected": -0.6826396584510803, + "logps/chosen": -59.82878875732422, + "logps/rejected": -117.66915130615234, + "loss": 0.6757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0283288955688477, + "rewards/margins": 7.384339809417725, + "rewards/rejected": -4.356010437011719, + "step": 10062 + }, + { + "epoch": 2.52, + "grad_norm": 11.89306640625, + "learning_rate": 4.944589004664113e-06, + "logits/chosen": -0.5546293258666992, + "logits/rejected": -0.6517205238342285, + "logps/chosen": -56.61281204223633, + "logps/rejected": -82.49727630615234, + "loss": 0.6677, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7890586853027344, + "rewards/margins": 5.308308124542236, + "rewards/rejected": -2.5192489624023438, + "step": 10063 + }, + { + "epoch": 2.52, + "grad_norm": 4.924042224884033, + "learning_rate": 4.9438030659649175e-06, + "logits/chosen": -0.5649890899658203, + "logits/rejected": -0.6131007075309753, + "logps/chosen": -51.796051025390625, + "logps/rejected": -105.94998168945312, + "loss": 0.6932, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9179434776306152, + "rewards/margins": 6.111322402954102, + "rewards/rejected": -3.1933794021606445, + "step": 10064 + }, + { + "epoch": 2.52, + "grad_norm": 3.381561040878296, + "learning_rate": 4.943017128654407e-06, + "logits/chosen": -0.5235710144042969, + "logits/rejected": -0.6261413097381592, + "logps/chosen": -52.100799560546875, + "logps/rejected": -80.40934753417969, + "loss": 0.5715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2257533073425293, + "rewards/margins": 5.401212692260742, + "rewards/rejected": -2.1754589080810547, + "step": 10065 + }, + { + "epoch": 2.52, + "grad_norm": 4.802178382873535, + "learning_rate": 4.942231192752003e-06, + "logits/chosen": -0.4647183418273926, + "logits/rejected": -0.5645421743392944, + "logps/chosen": -61.456764221191406, + "logps/rejected": -100.49441528320312, + "loss": 0.6277, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2703747749328613, + "rewards/margins": 6.963406085968018, + "rewards/rejected": -3.6930317878723145, + "step": 10066 + }, + { + "epoch": 2.52, + "grad_norm": 3.535857915878296, + "learning_rate": 4.941445258277129e-06, + "logits/chosen": -0.5367610454559326, + "logits/rejected": -0.6124506592750549, + "logps/chosen": -50.99787139892578, + "logps/rejected": -95.28278350830078, + "loss": 0.592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.458146810531616, + "rewards/margins": 6.864190578460693, + "rewards/rejected": -3.4060447216033936, + "step": 10067 + }, + { + "epoch": 2.52, + "grad_norm": 1.9634813070297241, + "learning_rate": 4.940659325249205e-06, + "logits/chosen": -0.4943806529045105, + "logits/rejected": -0.5914024114608765, + "logps/chosen": -55.36956787109375, + "logps/rejected": -91.88191223144531, + "loss": 0.5249, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.261974334716797, + "rewards/margins": 6.547148704528809, + "rewards/rejected": -3.2851738929748535, + "step": 10068 + }, + { + "epoch": 2.52, + "grad_norm": 5.358365535736084, + "learning_rate": 4.939873393687652e-06, + "logits/chosen": -0.4957181215286255, + "logits/rejected": -0.5573300719261169, + "logps/chosen": -64.45125579833984, + "logps/rejected": -109.31580352783203, + "loss": 0.6899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1324872970581055, + "rewards/margins": 6.207214832305908, + "rewards/rejected": -3.0747270584106445, + "step": 10069 + }, + { + "epoch": 2.52, + "grad_norm": 3.400280237197876, + "learning_rate": 4.9390874636118905e-06, + "logits/chosen": -0.48142459988594055, + "logits/rejected": -0.5770173072814941, + "logps/chosen": -55.992881774902344, + "logps/rejected": -79.7237548828125, + "loss": 0.6617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2449758052825928, + "rewards/margins": 5.642374038696289, + "rewards/rejected": -2.397397756576538, + "step": 10070 + }, + { + "epoch": 2.52, + "grad_norm": 8.646626472473145, + "learning_rate": 4.9383015350413445e-06, + "logits/chosen": -0.5007542967796326, + "logits/rejected": -0.5922425985336304, + "logps/chosen": -56.308048248291016, + "logps/rejected": -87.49803161621094, + "loss": 0.6089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.005204916000366, + "rewards/margins": 4.86098575592041, + "rewards/rejected": -1.855780839920044, + "step": 10071 + }, + { + "epoch": 2.52, + "grad_norm": 1.5732783079147339, + "learning_rate": 4.9375156079954316e-06, + "logits/chosen": -0.5171670317649841, + "logits/rejected": -0.6446291208267212, + "logps/chosen": -55.91053009033203, + "logps/rejected": -86.23616027832031, + "loss": 0.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4021401405334473, + "rewards/margins": 7.345005035400391, + "rewards/rejected": -3.9428653717041016, + "step": 10072 + }, + { + "epoch": 2.52, + "grad_norm": 5.035252094268799, + "learning_rate": 4.936729682493573e-06, + "logits/chosen": -0.5264461636543274, + "logits/rejected": -0.6299619078636169, + "logps/chosen": -49.50746154785156, + "logps/rejected": -91.51412963867188, + "loss": 0.6051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9444875717163086, + "rewards/margins": 6.059891223907471, + "rewards/rejected": -3.115403413772583, + "step": 10073 + }, + { + "epoch": 2.52, + "grad_norm": 6.097705364227295, + "learning_rate": 4.935943758555193e-06, + "logits/chosen": -0.49525752663612366, + "logits/rejected": -0.5931117534637451, + "logps/chosen": -61.97250747680664, + "logps/rejected": -103.87745666503906, + "loss": 0.7611, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.663102865219116, + "rewards/margins": 6.756528854370117, + "rewards/rejected": -4.093425750732422, + "step": 10074 + }, + { + "epoch": 2.52, + "grad_norm": 5.207289695739746, + "learning_rate": 4.935157836199713e-06, + "logits/chosen": -0.48943889141082764, + "logits/rejected": -0.5818743705749512, + "logps/chosen": -48.510772705078125, + "logps/rejected": -90.56443786621094, + "loss": 0.5625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1740641593933105, + "rewards/margins": 6.304023265838623, + "rewards/rejected": -3.129958391189575, + "step": 10075 + }, + { + "epoch": 2.52, + "grad_norm": 3.4713094234466553, + "learning_rate": 4.934371915446549e-06, + "logits/chosen": -0.5031044483184814, + "logits/rejected": -0.5804650783538818, + "logps/chosen": -52.62158203125, + "logps/rejected": -81.8719482421875, + "loss": 0.6491, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.085831642150879, + "rewards/margins": 5.450852870941162, + "rewards/rejected": -2.365021228790283, + "step": 10076 + }, + { + "epoch": 2.52, + "grad_norm": 4.005307197570801, + "learning_rate": 4.933585996315125e-06, + "logits/chosen": -0.5108126401901245, + "logits/rejected": -0.5859152674674988, + "logps/chosen": -44.67063522338867, + "logps/rejected": -96.83314514160156, + "loss": 0.5259, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0382680892944336, + "rewards/margins": 6.636260986328125, + "rewards/rejected": -3.5979928970336914, + "step": 10077 + }, + { + "epoch": 2.52, + "grad_norm": 4.370841979980469, + "learning_rate": 4.932800078824862e-06, + "logits/chosen": -0.4850911498069763, + "logits/rejected": -0.5859196186065674, + "logps/chosen": -48.79711151123047, + "logps/rejected": -96.65438079833984, + "loss": 0.6583, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.638122320175171, + "rewards/margins": 6.072981357574463, + "rewards/rejected": -3.434858560562134, + "step": 10078 + }, + { + "epoch": 2.52, + "grad_norm": 3.2578542232513428, + "learning_rate": 4.9320141629951814e-06, + "logits/chosen": -0.4468806982040405, + "logits/rejected": -0.5596420168876648, + "logps/chosen": -57.78410339355469, + "logps/rejected": -101.35633850097656, + "loss": 0.6502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9974372386932373, + "rewards/margins": 7.329696178436279, + "rewards/rejected": -4.332258701324463, + "step": 10079 + }, + { + "epoch": 2.52, + "grad_norm": 3.375934362411499, + "learning_rate": 4.931228248845502e-06, + "logits/chosen": -0.5701014995574951, + "logits/rejected": -0.6417256593704224, + "logps/chosen": -56.6801872253418, + "logps/rejected": -131.31747436523438, + "loss": 0.6619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0528879165649414, + "rewards/margins": 8.087349891662598, + "rewards/rejected": -5.034461498260498, + "step": 10080 + }, + { + "epoch": 2.52, + "grad_norm": 3.4733986854553223, + "learning_rate": 4.930442336395246e-06, + "logits/chosen": -0.5148126482963562, + "logits/rejected": -0.6107396483421326, + "logps/chosen": -63.03672790527344, + "logps/rejected": -108.23892211914062, + "loss": 0.5994, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0148231983184814, + "rewards/margins": 6.79078483581543, + "rewards/rejected": -3.775960922241211, + "step": 10081 + }, + { + "epoch": 2.52, + "grad_norm": 9.629944801330566, + "learning_rate": 4.929656425663835e-06, + "logits/chosen": -0.4376547336578369, + "logits/rejected": -0.5631483197212219, + "logps/chosen": -59.31620407104492, + "logps/rejected": -99.45166015625, + "loss": 0.6593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8906428813934326, + "rewards/margins": 6.418500900268555, + "rewards/rejected": -3.527857780456543, + "step": 10082 + }, + { + "epoch": 2.52, + "grad_norm": 10.28451156616211, + "learning_rate": 4.928870516670687e-06, + "logits/chosen": -0.46431559324264526, + "logits/rejected": -0.5291174054145813, + "logps/chosen": -58.49828338623047, + "logps/rejected": -101.27491760253906, + "loss": 0.7084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1187376976013184, + "rewards/margins": 5.586536407470703, + "rewards/rejected": -2.4677977561950684, + "step": 10083 + }, + { + "epoch": 2.52, + "grad_norm": 8.746649742126465, + "learning_rate": 4.928084609435225e-06, + "logits/chosen": -0.5323044657707214, + "logits/rejected": -0.5992777943611145, + "logps/chosen": -55.851810455322266, + "logps/rejected": -102.86701965332031, + "loss": 0.6476, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0135865211486816, + "rewards/margins": 7.194819450378418, + "rewards/rejected": -4.181232452392578, + "step": 10084 + }, + { + "epoch": 2.52, + "grad_norm": 6.732459545135498, + "learning_rate": 4.927298703976867e-06, + "logits/chosen": -0.5221095085144043, + "logits/rejected": -0.5237890481948853, + "logps/chosen": -55.13638687133789, + "logps/rejected": -113.51253509521484, + "loss": 0.8348, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6954057216644287, + "rewards/margins": 6.207312107086182, + "rewards/rejected": -3.511906623840332, + "step": 10085 + }, + { + "epoch": 2.52, + "grad_norm": 6.462888717651367, + "learning_rate": 4.9265128003150395e-06, + "logits/chosen": -0.5360585451126099, + "logits/rejected": -0.6624030470848083, + "logps/chosen": -60.958648681640625, + "logps/rejected": -90.01333618164062, + "loss": 0.6269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.076960802078247, + "rewards/margins": 7.289247989654541, + "rewards/rejected": -4.212287425994873, + "step": 10086 + }, + { + "epoch": 2.52, + "grad_norm": 17.332029342651367, + "learning_rate": 4.9257268984691555e-06, + "logits/chosen": -0.4896937906742096, + "logits/rejected": -0.5614891052246094, + "logps/chosen": -60.23340606689453, + "logps/rejected": -89.98723602294922, + "loss": 0.7654, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8975167274475098, + "rewards/margins": 4.714296340942383, + "rewards/rejected": -1.816779613494873, + "step": 10087 + }, + { + "epoch": 2.52, + "grad_norm": 7.014595031738281, + "learning_rate": 4.924940998458639e-06, + "logits/chosen": -0.5014331936836243, + "logits/rejected": -0.6212624907493591, + "logps/chosen": -61.04446029663086, + "logps/rejected": -88.19419860839844, + "loss": 0.803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.21097469329834, + "rewards/margins": 6.505568504333496, + "rewards/rejected": -3.2945940494537354, + "step": 10088 + }, + { + "epoch": 2.52, + "grad_norm": 4.186141014099121, + "learning_rate": 4.924155100302912e-06, + "logits/chosen": -0.5146941542625427, + "logits/rejected": -0.5229028463363647, + "logps/chosen": -59.07858657836914, + "logps/rejected": -103.3747787475586, + "loss": 0.6932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0246427059173584, + "rewards/margins": 5.167222023010254, + "rewards/rejected": -2.1425788402557373, + "step": 10089 + }, + { + "epoch": 2.52, + "grad_norm": 4.564287185668945, + "learning_rate": 4.923369204021391e-06, + "logits/chosen": -0.5054713487625122, + "logits/rejected": -0.5763501524925232, + "logps/chosen": -61.52793502807617, + "logps/rejected": -113.01758575439453, + "loss": 0.7463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8256053924560547, + "rewards/margins": 6.309446811676025, + "rewards/rejected": -3.4838409423828125, + "step": 10090 + }, + { + "epoch": 2.52, + "grad_norm": 10.325687408447266, + "learning_rate": 4.922583309633499e-06, + "logits/chosen": -0.5451252460479736, + "logits/rejected": -0.6309614777565002, + "logps/chosen": -57.349220275878906, + "logps/rejected": -108.65898132324219, + "loss": 0.728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0700206756591797, + "rewards/margins": 6.9698872566223145, + "rewards/rejected": -3.8998661041259766, + "step": 10091 + }, + { + "epoch": 2.52, + "grad_norm": 8.461114883422852, + "learning_rate": 4.921797417158656e-06, + "logits/chosen": -0.5701602101325989, + "logits/rejected": -0.6379092931747437, + "logps/chosen": -44.95612716674805, + "logps/rejected": -80.89270782470703, + "loss": 0.6386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0429344177246094, + "rewards/margins": 5.015423774719238, + "rewards/rejected": -1.9724886417388916, + "step": 10092 + }, + { + "epoch": 2.52, + "grad_norm": 7.644598960876465, + "learning_rate": 4.921011526616283e-06, + "logits/chosen": -0.45990967750549316, + "logits/rejected": -0.5363309979438782, + "logps/chosen": -55.63998794555664, + "logps/rejected": -77.76721954345703, + "loss": 0.7197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.064732313156128, + "rewards/margins": 4.896811008453369, + "rewards/rejected": -1.8320786952972412, + "step": 10093 + }, + { + "epoch": 2.53, + "grad_norm": 4.565515041351318, + "learning_rate": 4.920225638025798e-06, + "logits/chosen": -0.48099663853645325, + "logits/rejected": -0.6100800037384033, + "logps/chosen": -48.11940383911133, + "logps/rejected": -85.3707275390625, + "loss": 0.5898, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114046335220337, + "rewards/margins": 6.482789993286133, + "rewards/rejected": -3.368743658065796, + "step": 10094 + }, + { + "epoch": 2.53, + "grad_norm": 3.7625391483306885, + "learning_rate": 4.919439751406623e-06, + "logits/chosen": -0.524986982345581, + "logits/rejected": -0.5814130306243896, + "logps/chosen": -55.57168197631836, + "logps/rejected": -98.47341918945312, + "loss": 0.5986, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.042253017425537, + "rewards/margins": 6.230772018432617, + "rewards/rejected": -3.18851900100708, + "step": 10095 + }, + { + "epoch": 2.53, + "grad_norm": 4.464743137359619, + "learning_rate": 4.918653866778178e-06, + "logits/chosen": -0.5042915940284729, + "logits/rejected": -0.5600047707557678, + "logps/chosen": -62.458099365234375, + "logps/rejected": -98.56327056884766, + "loss": 0.7224, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.837996006011963, + "rewards/margins": 6.402676105499268, + "rewards/rejected": -3.5646800994873047, + "step": 10096 + }, + { + "epoch": 2.53, + "grad_norm": 5.927774429321289, + "learning_rate": 4.9178679841598816e-06, + "logits/chosen": -0.5973557233810425, + "logits/rejected": -0.6050805449485779, + "logps/chosen": -51.919132232666016, + "logps/rejected": -111.07298278808594, + "loss": 0.6646, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.754953145980835, + "rewards/margins": 6.8548383712768555, + "rewards/rejected": -4.0998854637146, + "step": 10097 + }, + { + "epoch": 2.53, + "grad_norm": 3.2189531326293945, + "learning_rate": 4.917082103571155e-06, + "logits/chosen": -0.574042558670044, + "logits/rejected": -0.6466261744499207, + "logps/chosen": -60.20566940307617, + "logps/rejected": -96.36429595947266, + "loss": 0.7041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.978196859359741, + "rewards/margins": 5.655077934265137, + "rewards/rejected": -2.6768813133239746, + "step": 10098 + }, + { + "epoch": 2.53, + "grad_norm": 3.7840447425842285, + "learning_rate": 4.916296225031417e-06, + "logits/chosen": -0.4824962019920349, + "logits/rejected": -0.5579968690872192, + "logps/chosen": -63.917747497558594, + "logps/rejected": -99.70259857177734, + "loss": 0.6207, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0763893127441406, + "rewards/margins": 6.012722492218018, + "rewards/rejected": -2.936333417892456, + "step": 10099 + }, + { + "epoch": 2.53, + "grad_norm": 4.482023239135742, + "learning_rate": 4.91551034856009e-06, + "logits/chosen": -0.5035940408706665, + "logits/rejected": -0.5450993180274963, + "logps/chosen": -60.044158935546875, + "logps/rejected": -94.89706420898438, + "loss": 0.7323, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1789801120758057, + "rewards/margins": 5.526466369628906, + "rewards/rejected": -2.3474862575531006, + "step": 10100 + }, + { + "epoch": 2.53, + "grad_norm": 8.65873908996582, + "learning_rate": 4.9147244741765905e-06, + "logits/chosen": -0.5194948315620422, + "logits/rejected": -0.6173866391181946, + "logps/chosen": -51.46931457519531, + "logps/rejected": -79.50402069091797, + "loss": 0.6949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8647475242614746, + "rewards/margins": 4.818381309509277, + "rewards/rejected": -1.953634262084961, + "step": 10101 + }, + { + "epoch": 2.53, + "grad_norm": 7.77446174621582, + "learning_rate": 4.9139386019003395e-06, + "logits/chosen": -0.571729838848114, + "logits/rejected": -0.5912819504737854, + "logps/chosen": -81.15292358398438, + "logps/rejected": -110.94969940185547, + "loss": 0.6729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1645255088806152, + "rewards/margins": 7.32557487487793, + "rewards/rejected": -4.161048889160156, + "step": 10102 + }, + { + "epoch": 2.53, + "grad_norm": 4.757099151611328, + "learning_rate": 4.913152731750758e-06, + "logits/chosen": -0.5383508801460266, + "logits/rejected": -0.5980035662651062, + "logps/chosen": -49.785030364990234, + "logps/rejected": -88.20112609863281, + "loss": 0.6815, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.113236427307129, + "rewards/margins": 6.051620006561279, + "rewards/rejected": -2.9383833408355713, + "step": 10103 + }, + { + "epoch": 2.53, + "grad_norm": 8.281819343566895, + "learning_rate": 4.912366863747266e-06, + "logits/chosen": -0.5525842308998108, + "logits/rejected": -0.6227372288703918, + "logps/chosen": -52.36386489868164, + "logps/rejected": -97.24227142333984, + "loss": 0.7135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9144785404205322, + "rewards/margins": 5.700498580932617, + "rewards/rejected": -2.786020278930664, + "step": 10104 + }, + { + "epoch": 2.53, + "grad_norm": 5.100689888000488, + "learning_rate": 4.911580997909281e-06, + "logits/chosen": -0.5413023829460144, + "logits/rejected": -0.5993421077728271, + "logps/chosen": -55.572265625, + "logps/rejected": -98.53266906738281, + "loss": 0.6396, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.304105281829834, + "rewards/margins": 6.715750217437744, + "rewards/rejected": -3.41164493560791, + "step": 10105 + }, + { + "epoch": 2.53, + "grad_norm": 11.169321060180664, + "learning_rate": 4.910795134256223e-06, + "logits/chosen": -0.5589455366134644, + "logits/rejected": -0.6224665641784668, + "logps/chosen": -47.249263763427734, + "logps/rejected": -85.3766098022461, + "loss": 0.6278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0142483711242676, + "rewards/margins": 4.740259170532227, + "rewards/rejected": -1.7260106801986694, + "step": 10106 + }, + { + "epoch": 2.53, + "grad_norm": 17.636245727539062, + "learning_rate": 4.910009272807514e-06, + "logits/chosen": -0.4608765244483948, + "logits/rejected": -0.47712424397468567, + "logps/chosen": -60.371849060058594, + "logps/rejected": -118.32701110839844, + "loss": 0.7348, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8146705627441406, + "rewards/margins": 5.7448530197143555, + "rewards/rejected": -2.930182456970215, + "step": 10107 + }, + { + "epoch": 2.53, + "grad_norm": 8.12121295928955, + "learning_rate": 4.909223413582569e-06, + "logits/chosen": -0.5007390975952148, + "logits/rejected": -0.600911557674408, + "logps/chosen": -58.15447998046875, + "logps/rejected": -100.50947570800781, + "loss": 0.6406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3122825622558594, + "rewards/margins": 6.619283676147461, + "rewards/rejected": -3.3070011138916016, + "step": 10108 + }, + { + "epoch": 2.53, + "grad_norm": 10.788934707641602, + "learning_rate": 4.908437556600812e-06, + "logits/chosen": -0.437238484621048, + "logits/rejected": -0.5381673574447632, + "logps/chosen": -64.7356185913086, + "logps/rejected": -85.23878479003906, + "loss": 0.7089, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9800589084625244, + "rewards/margins": 5.403517723083496, + "rewards/rejected": -2.42345929145813, + "step": 10109 + }, + { + "epoch": 2.53, + "grad_norm": 4.20548152923584, + "learning_rate": 4.907651701881659e-06, + "logits/chosen": -0.5125609636306763, + "logits/rejected": -0.5817893147468567, + "logps/chosen": -57.516326904296875, + "logps/rejected": -102.0480728149414, + "loss": 0.6443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.985764503479004, + "rewards/margins": 5.441826343536377, + "rewards/rejected": -2.456062078475952, + "step": 10110 + }, + { + "epoch": 2.53, + "grad_norm": 11.616185188293457, + "learning_rate": 4.906865849444533e-06, + "logits/chosen": -0.5739634037017822, + "logits/rejected": -0.6425953507423401, + "logps/chosen": -57.64699935913086, + "logps/rejected": -115.30043029785156, + "loss": 0.6557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2317395210266113, + "rewards/margins": 7.170722961425781, + "rewards/rejected": -3.938983201980591, + "step": 10111 + }, + { + "epoch": 2.53, + "grad_norm": 7.533827304840088, + "learning_rate": 4.906079999308849e-06, + "logits/chosen": -0.5253579616546631, + "logits/rejected": -0.685470461845398, + "logps/chosen": -83.10093688964844, + "logps/rejected": -92.90977478027344, + "loss": 0.7649, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3706436157226562, + "rewards/margins": 5.392075538635254, + "rewards/rejected": -3.0214321613311768, + "step": 10112 + }, + { + "epoch": 2.53, + "grad_norm": 4.7322564125061035, + "learning_rate": 4.905294151494028e-06, + "logits/chosen": -0.5195637941360474, + "logits/rejected": -0.6319390535354614, + "logps/chosen": -53.717437744140625, + "logps/rejected": -97.62339782714844, + "loss": 0.5671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0493695735931396, + "rewards/margins": 6.800768852233887, + "rewards/rejected": -3.7513985633850098, + "step": 10113 + }, + { + "epoch": 2.53, + "grad_norm": 5.487103462219238, + "learning_rate": 4.904508306019491e-06, + "logits/chosen": -0.5793517827987671, + "logits/rejected": -0.6218640804290771, + "logps/chosen": -56.150238037109375, + "logps/rejected": -94.95132446289062, + "loss": 0.6744, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.09895658493042, + "rewards/margins": 5.690859317779541, + "rewards/rejected": -2.591902494430542, + "step": 10114 + }, + { + "epoch": 2.53, + "grad_norm": 6.037837028503418, + "learning_rate": 4.903722462904653e-06, + "logits/chosen": -0.38135379552841187, + "logits/rejected": -0.5387231707572937, + "logps/chosen": -58.655208587646484, + "logps/rejected": -84.3910903930664, + "loss": 0.5843, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.885878324508667, + "rewards/margins": 6.181278705596924, + "rewards/rejected": -3.295400857925415, + "step": 10115 + }, + { + "epoch": 2.53, + "grad_norm": 7.927608013153076, + "learning_rate": 4.902936622168936e-06, + "logits/chosen": -0.46334120631217957, + "logits/rejected": -0.6067928075790405, + "logps/chosen": -69.53972625732422, + "logps/rejected": -88.71688079833984, + "loss": 0.6571, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.793804883956909, + "rewards/margins": 6.699300765991211, + "rewards/rejected": -3.9054954051971436, + "step": 10116 + }, + { + "epoch": 2.53, + "grad_norm": 3.1591293811798096, + "learning_rate": 4.902150783831758e-06, + "logits/chosen": -0.5619444847106934, + "logits/rejected": -0.6260427236557007, + "logps/chosen": -47.76394271850586, + "logps/rejected": -93.70809173583984, + "loss": 0.6475, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2958500385284424, + "rewards/margins": 6.6483612060546875, + "rewards/rejected": -3.352510929107666, + "step": 10117 + }, + { + "epoch": 2.53, + "grad_norm": 5.933968544006348, + "learning_rate": 4.901364947912541e-06, + "logits/chosen": -0.5419089198112488, + "logits/rejected": -0.6419941782951355, + "logps/chosen": -60.27738571166992, + "logps/rejected": -88.63582611083984, + "loss": 0.6776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.831411838531494, + "rewards/margins": 6.371367931365967, + "rewards/rejected": -3.5399560928344727, + "step": 10118 + }, + { + "epoch": 2.53, + "grad_norm": 9.041975021362305, + "learning_rate": 4.900579114430697e-06, + "logits/chosen": -0.5524495840072632, + "logits/rejected": -0.639826774597168, + "logps/chosen": -52.99396514892578, + "logps/rejected": -81.51311492919922, + "loss": 0.7086, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8002233505249023, + "rewards/margins": 5.566462516784668, + "rewards/rejected": -2.7662391662597656, + "step": 10119 + }, + { + "epoch": 2.53, + "grad_norm": 6.490211009979248, + "learning_rate": 4.89979328340565e-06, + "logits/chosen": -0.479844868183136, + "logits/rejected": -0.5249885320663452, + "logps/chosen": -52.230804443359375, + "logps/rejected": -103.50457763671875, + "loss": 0.7507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.860053062438965, + "rewards/margins": 5.911617755889893, + "rewards/rejected": -3.0515646934509277, + "step": 10120 + }, + { + "epoch": 2.53, + "grad_norm": 3.0472161769866943, + "learning_rate": 4.89900745485682e-06, + "logits/chosen": -0.45974522829055786, + "logits/rejected": -0.5495116710662842, + "logps/chosen": -58.9277229309082, + "logps/rejected": -105.87977600097656, + "loss": 0.5806, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.04256534576416, + "rewards/margins": 7.203406810760498, + "rewards/rejected": -4.160841464996338, + "step": 10121 + }, + { + "epoch": 2.53, + "grad_norm": 8.244943618774414, + "learning_rate": 4.898221628803621e-06, + "logits/chosen": -0.5205110311508179, + "logits/rejected": -0.6074267625808716, + "logps/chosen": -47.85035705566406, + "logps/rejected": -83.62747192382812, + "loss": 0.6466, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8409693241119385, + "rewards/margins": 5.413259506225586, + "rewards/rejected": -2.5722908973693848, + "step": 10122 + }, + { + "epoch": 2.53, + "grad_norm": 10.50571060180664, + "learning_rate": 4.897435805265474e-06, + "logits/chosen": -0.5286023020744324, + "logits/rejected": -0.5360701084136963, + "logps/chosen": -55.19255447387695, + "logps/rejected": -97.96481323242188, + "loss": 0.7642, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9042611122131348, + "rewards/margins": 4.650058269500732, + "rewards/rejected": -1.7457975149154663, + "step": 10123 + }, + { + "epoch": 2.53, + "grad_norm": 28.272367477416992, + "learning_rate": 4.8966499842617974e-06, + "logits/chosen": -0.5673316121101379, + "logits/rejected": -0.6522471308708191, + "logps/chosen": -55.94720458984375, + "logps/rejected": -100.59921264648438, + "loss": 0.655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1297223567962646, + "rewards/margins": 7.356626987457275, + "rewards/rejected": -4.22690486907959, + "step": 10124 + }, + { + "epoch": 2.53, + "grad_norm": 2.807874917984009, + "learning_rate": 4.895864165812011e-06, + "logits/chosen": -0.6212025880813599, + "logits/rejected": -0.6756442189216614, + "logps/chosen": -48.44761657714844, + "logps/rejected": -105.97026824951172, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.651705503463745, + "rewards/margins": 6.744543552398682, + "rewards/rejected": -4.092838287353516, + "step": 10125 + }, + { + "epoch": 2.53, + "grad_norm": 9.732050895690918, + "learning_rate": 4.895078349935531e-06, + "logits/chosen": -0.4771249294281006, + "logits/rejected": -0.5157503485679626, + "logps/chosen": -55.55305862426758, + "logps/rejected": -98.15385437011719, + "loss": 0.7957, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1437275409698486, + "rewards/margins": 5.856003761291504, + "rewards/rejected": -2.7122766971588135, + "step": 10126 + }, + { + "epoch": 2.53, + "grad_norm": 4.327150821685791, + "learning_rate": 4.894292536651776e-06, + "logits/chosen": -0.5331487655639648, + "logits/rejected": -0.5967727899551392, + "logps/chosen": -57.805816650390625, + "logps/rejected": -96.9727783203125, + "loss": 0.6861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.260589838027954, + "rewards/margins": 5.860117435455322, + "rewards/rejected": -2.5995278358459473, + "step": 10127 + }, + { + "epoch": 2.53, + "grad_norm": 3.660501718521118, + "learning_rate": 4.893506725980163e-06, + "logits/chosen": -0.5344306230545044, + "logits/rejected": -0.6080026626586914, + "logps/chosen": -48.82776641845703, + "logps/rejected": -98.27800750732422, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.005537986755371, + "rewards/margins": 7.145376682281494, + "rewards/rejected": -4.139838695526123, + "step": 10128 + }, + { + "epoch": 2.53, + "grad_norm": 1.1235429048538208, + "learning_rate": 4.892720917940117e-06, + "logits/chosen": -0.5386306643486023, + "logits/rejected": -0.6423438191413879, + "logps/chosen": -53.89212417602539, + "logps/rejected": -109.35118103027344, + "loss": 0.5393, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1516878604888916, + "rewards/margins": 9.02783489227295, + "rewards/rejected": -5.876147270202637, + "step": 10129 + }, + { + "epoch": 2.53, + "grad_norm": 9.488357543945312, + "learning_rate": 4.8919351125510475e-06, + "logits/chosen": -0.6121295690536499, + "logits/rejected": -0.7014674544334412, + "logps/chosen": -58.499534606933594, + "logps/rejected": -99.25679016113281, + "loss": 0.6312, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.16298770904541, + "rewards/margins": 6.613657474517822, + "rewards/rejected": -3.4506702423095703, + "step": 10130 + }, + { + "epoch": 2.53, + "grad_norm": 7.7355828285217285, + "learning_rate": 4.891149309832376e-06, + "logits/chosen": -0.5198985934257507, + "logits/rejected": -0.5737089514732361, + "logps/chosen": -59.618560791015625, + "logps/rejected": -99.61016082763672, + "loss": 0.7472, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6691694259643555, + "rewards/margins": 5.914421558380127, + "rewards/rejected": -3.2452516555786133, + "step": 10131 + }, + { + "epoch": 2.53, + "grad_norm": 4.45898962020874, + "learning_rate": 4.8903635098035246e-06, + "logits/chosen": -0.6169314980506897, + "logits/rejected": -0.7271488904953003, + "logps/chosen": -45.803619384765625, + "logps/rejected": -85.9047622680664, + "loss": 0.6489, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.036998748779297, + "rewards/margins": 6.581101417541504, + "rewards/rejected": -3.544102668762207, + "step": 10132 + }, + { + "epoch": 2.53, + "grad_norm": 3.5372509956359863, + "learning_rate": 4.889577712483903e-06, + "logits/chosen": -0.489655077457428, + "logits/rejected": -0.5970315337181091, + "logps/chosen": -48.26026153564453, + "logps/rejected": -84.17627716064453, + "loss": 0.6145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.030634641647339, + "rewards/margins": 6.2525739669799805, + "rewards/rejected": -3.2219395637512207, + "step": 10133 + }, + { + "epoch": 2.54, + "grad_norm": 2.2064523696899414, + "learning_rate": 4.888791917892936e-06, + "logits/chosen": -0.511589527130127, + "logits/rejected": -0.5967675447463989, + "logps/chosen": -40.701263427734375, + "logps/rejected": -106.16070556640625, + "loss": 0.5116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.460207223892212, + "rewards/margins": 7.861385345458984, + "rewards/rejected": -4.401178359985352, + "step": 10134 + }, + { + "epoch": 2.54, + "grad_norm": 5.715933799743652, + "learning_rate": 4.8880061260500385e-06, + "logits/chosen": -0.5185753107070923, + "logits/rejected": -0.5642825961112976, + "logps/chosen": -49.951622009277344, + "logps/rejected": -95.66239929199219, + "loss": 0.6308, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.721859931945801, + "rewards/margins": 4.858445167541504, + "rewards/rejected": -2.136584997177124, + "step": 10135 + }, + { + "epoch": 2.54, + "grad_norm": 5.097565174102783, + "learning_rate": 4.88722033697463e-06, + "logits/chosen": -0.5340834259986877, + "logits/rejected": -0.6089639663696289, + "logps/chosen": -52.25193405151367, + "logps/rejected": -106.18344116210938, + "loss": 0.6748, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8008456230163574, + "rewards/margins": 6.146109104156494, + "rewards/rejected": -3.3452632427215576, + "step": 10136 + }, + { + "epoch": 2.54, + "grad_norm": 3.32783579826355, + "learning_rate": 4.886434550686125e-06, + "logits/chosen": -0.5273832082748413, + "logits/rejected": -0.6183090209960938, + "logps/chosen": -51.29096221923828, + "logps/rejected": -98.84939575195312, + "loss": 0.6155, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5127005577087402, + "rewards/margins": 7.104567527770996, + "rewards/rejected": -3.591866970062256, + "step": 10137 + }, + { + "epoch": 2.54, + "grad_norm": 4.5475897789001465, + "learning_rate": 4.885648767203943e-06, + "logits/chosen": -0.4966960847377777, + "logits/rejected": -0.5347119569778442, + "logps/chosen": -54.14474868774414, + "logps/rejected": -92.37968444824219, + "loss": 0.6781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.229434013366699, + "rewards/margins": 5.123513698577881, + "rewards/rejected": -1.8940794467926025, + "step": 10138 + }, + { + "epoch": 2.54, + "grad_norm": 10.55848503112793, + "learning_rate": 4.884862986547503e-06, + "logits/chosen": -0.5133619904518127, + "logits/rejected": -0.49355196952819824, + "logps/chosen": -50.026065826416016, + "logps/rejected": -123.39705657958984, + "loss": 0.7474, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.095283269882202, + "rewards/margins": 4.8835673332214355, + "rewards/rejected": -1.7882835865020752, + "step": 10139 + }, + { + "epoch": 2.54, + "grad_norm": 6.163671970367432, + "learning_rate": 4.884077208736221e-06, + "logits/chosen": -0.49997514486312866, + "logits/rejected": -0.5932536125183105, + "logps/chosen": -55.99540710449219, + "logps/rejected": -106.583984375, + "loss": 0.7943, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.906068801879883, + "rewards/margins": 6.558361053466797, + "rewards/rejected": -3.6522929668426514, + "step": 10140 + }, + { + "epoch": 2.54, + "grad_norm": 6.620160102844238, + "learning_rate": 4.8832914337895135e-06, + "logits/chosen": -0.4893966615200043, + "logits/rejected": -0.590772271156311, + "logps/chosen": -53.86626434326172, + "logps/rejected": -92.88987731933594, + "loss": 0.652, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1685211658477783, + "rewards/margins": 5.915639877319336, + "rewards/rejected": -2.7471182346343994, + "step": 10141 + }, + { + "epoch": 2.54, + "grad_norm": 6.42720365524292, + "learning_rate": 4.882505661726799e-06, + "logits/chosen": -0.6515724658966064, + "logits/rejected": -0.7195900082588196, + "logps/chosen": -41.48606491088867, + "logps/rejected": -102.81492614746094, + "loss": 0.6243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037970781326294, + "rewards/margins": 6.797472953796387, + "rewards/rejected": -3.75950288772583, + "step": 10142 + }, + { + "epoch": 2.54, + "grad_norm": 17.551992416381836, + "learning_rate": 4.881719892567496e-06, + "logits/chosen": -0.5641886591911316, + "logits/rejected": -0.5965094566345215, + "logps/chosen": -54.83269500732422, + "logps/rejected": -95.81632995605469, + "loss": 0.9108, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.78096342086792, + "rewards/margins": 5.330702781677246, + "rewards/rejected": -2.549739360809326, + "step": 10143 + }, + { + "epoch": 2.54, + "grad_norm": 4.03688383102417, + "learning_rate": 4.880934126331018e-06, + "logits/chosen": -0.4437198340892792, + "logits/rejected": -0.5085147023200989, + "logps/chosen": -62.14579391479492, + "logps/rejected": -101.02290344238281, + "loss": 0.6853, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3318495750427246, + "rewards/margins": 5.672278881072998, + "rewards/rejected": -2.3404297828674316, + "step": 10144 + }, + { + "epoch": 2.54, + "grad_norm": 24.821346282958984, + "learning_rate": 4.880148363036784e-06, + "logits/chosen": -0.4536018669605255, + "logits/rejected": -0.5688010454177856, + "logps/chosen": -58.33689880371094, + "logps/rejected": -97.84384155273438, + "loss": 0.7247, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7740375995635986, + "rewards/margins": 6.255711555480957, + "rewards/rejected": -3.4816744327545166, + "step": 10145 + }, + { + "epoch": 2.54, + "grad_norm": 5.702116966247559, + "learning_rate": 4.8793626027042155e-06, + "logits/chosen": -0.5608129501342773, + "logits/rejected": -0.5827853083610535, + "logps/chosen": -59.850677490234375, + "logps/rejected": -103.39459228515625, + "loss": 0.6851, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.76179838180542, + "rewards/margins": 5.378992557525635, + "rewards/rejected": -2.6171936988830566, + "step": 10146 + }, + { + "epoch": 2.54, + "grad_norm": 3.8146297931671143, + "learning_rate": 4.878576845352723e-06, + "logits/chosen": -0.608129620552063, + "logits/rejected": -0.6843010783195496, + "logps/chosen": -62.782379150390625, + "logps/rejected": -86.49781036376953, + "loss": 0.6204, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273454189300537, + "rewards/margins": 6.163985729217529, + "rewards/rejected": -2.8905324935913086, + "step": 10147 + }, + { + "epoch": 2.54, + "grad_norm": 6.171902656555176, + "learning_rate": 4.877791091001724e-06, + "logits/chosen": -0.4855155944824219, + "logits/rejected": -0.5582950115203857, + "logps/chosen": -69.90470886230469, + "logps/rejected": -105.5669937133789, + "loss": 0.7725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.621615171432495, + "rewards/margins": 6.2073516845703125, + "rewards/rejected": -3.5857365131378174, + "step": 10148 + }, + { + "epoch": 2.54, + "grad_norm": 7.342153549194336, + "learning_rate": 4.8770053396706375e-06, + "logits/chosen": -0.593082845211029, + "logits/rejected": -0.689743161201477, + "logps/chosen": -66.27411651611328, + "logps/rejected": -118.11610412597656, + "loss": 0.7389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8646936416625977, + "rewards/margins": 6.673642158508301, + "rewards/rejected": -3.808948040008545, + "step": 10149 + }, + { + "epoch": 2.54, + "grad_norm": 7.871535778045654, + "learning_rate": 4.8762195913788825e-06, + "logits/chosen": -0.5018806457519531, + "logits/rejected": -0.6011412143707275, + "logps/chosen": -59.085975646972656, + "logps/rejected": -90.34031677246094, + "loss": 0.5877, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121337413787842, + "rewards/margins": 6.545523643493652, + "rewards/rejected": -3.4241859912872314, + "step": 10150 + }, + { + "epoch": 2.54, + "grad_norm": 10.596741676330566, + "learning_rate": 4.875433846145872e-06, + "logits/chosen": -0.5314316153526306, + "logits/rejected": -0.6460900902748108, + "logps/chosen": -53.33324432373047, + "logps/rejected": -90.00517272949219, + "loss": 0.6604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0545270442962646, + "rewards/margins": 6.445405006408691, + "rewards/rejected": -3.390878200531006, + "step": 10151 + }, + { + "epoch": 2.54, + "grad_norm": 20.139978408813477, + "learning_rate": 4.874648103991023e-06, + "logits/chosen": -0.560387372970581, + "logits/rejected": -0.6848024725914001, + "logps/chosen": -60.329002380371094, + "logps/rejected": -100.42765808105469, + "loss": 0.6356, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8918328285217285, + "rewards/margins": 6.724038124084473, + "rewards/rejected": -3.832205295562744, + "step": 10152 + }, + { + "epoch": 2.54, + "grad_norm": 4.421146392822266, + "learning_rate": 4.8738623649337525e-06, + "logits/chosen": -0.5270195603370667, + "logits/rejected": -0.5979369878768921, + "logps/chosen": -57.06825637817383, + "logps/rejected": -107.69598388671875, + "loss": 0.6822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0824835300445557, + "rewards/margins": 6.362415790557861, + "rewards/rejected": -3.2799322605133057, + "step": 10153 + }, + { + "epoch": 2.54, + "grad_norm": 4.984781742095947, + "learning_rate": 4.8730766289934785e-06, + "logits/chosen": -0.49710869789123535, + "logits/rejected": -0.5726408362388611, + "logps/chosen": -49.1530876159668, + "logps/rejected": -91.915771484375, + "loss": 0.5576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2124266624450684, + "rewards/margins": 6.277605056762695, + "rewards/rejected": -3.065178632736206, + "step": 10154 + }, + { + "epoch": 2.54, + "grad_norm": 3.327855110168457, + "learning_rate": 4.872290896189615e-06, + "logits/chosen": -0.5425292253494263, + "logits/rejected": -0.6381556391716003, + "logps/chosen": -59.403846740722656, + "logps/rejected": -86.62620544433594, + "loss": 0.6803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.411085367202759, + "rewards/margins": 6.712520599365234, + "rewards/rejected": -3.3014354705810547, + "step": 10155 + }, + { + "epoch": 2.54, + "grad_norm": 4.220470905303955, + "learning_rate": 4.871505166541579e-06, + "logits/chosen": -0.48551398515701294, + "logits/rejected": -0.5532958507537842, + "logps/chosen": -61.435855865478516, + "logps/rejected": -104.29332733154297, + "loss": 0.6293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8370394706726074, + "rewards/margins": 6.845379829406738, + "rewards/rejected": -4.008340835571289, + "step": 10156 + }, + { + "epoch": 2.54, + "grad_norm": 6.4015326499938965, + "learning_rate": 4.8707194400687876e-06, + "logits/chosen": -0.5008417367935181, + "logits/rejected": -0.5616147518157959, + "logps/chosen": -53.660003662109375, + "logps/rejected": -109.29505157470703, + "loss": 0.6184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.831875801086426, + "rewards/margins": 6.735182285308838, + "rewards/rejected": -3.903306484222412, + "step": 10157 + }, + { + "epoch": 2.54, + "grad_norm": 5.204062461853027, + "learning_rate": 4.869933716790657e-06, + "logits/chosen": -0.4639661908149719, + "logits/rejected": -0.5594263076782227, + "logps/chosen": -49.665565490722656, + "logps/rejected": -86.42472839355469, + "loss": 0.5718, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.318127155303955, + "rewards/margins": 6.351996421813965, + "rewards/rejected": -3.0338690280914307, + "step": 10158 + }, + { + "epoch": 2.54, + "grad_norm": 3.852349281311035, + "learning_rate": 4.869147996726601e-06, + "logits/chosen": -0.6152793765068054, + "logits/rejected": -0.6683546304702759, + "logps/chosen": -41.66489028930664, + "logps/rejected": -89.06539916992188, + "loss": 0.6371, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0906057357788086, + "rewards/margins": 5.813218116760254, + "rewards/rejected": -2.722612142562866, + "step": 10159 + }, + { + "epoch": 2.54, + "grad_norm": 4.725888729095459, + "learning_rate": 4.868362279896036e-06, + "logits/chosen": -0.5039064884185791, + "logits/rejected": -0.6186167001724243, + "logps/chosen": -59.012516021728516, + "logps/rejected": -89.51646423339844, + "loss": 0.7318, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.90499210357666, + "rewards/margins": 5.649297714233398, + "rewards/rejected": -2.7443056106567383, + "step": 10160 + }, + { + "epoch": 2.54, + "grad_norm": 5.390497207641602, + "learning_rate": 4.867576566318383e-06, + "logits/chosen": -0.5551906824111938, + "logits/rejected": -0.6142221689224243, + "logps/chosen": -54.87242126464844, + "logps/rejected": -97.79887390136719, + "loss": 0.6987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.035548686981201, + "rewards/margins": 6.883457660675049, + "rewards/rejected": -3.847909450531006, + "step": 10161 + }, + { + "epoch": 2.54, + "grad_norm": 5.084793567657471, + "learning_rate": 4.8667908560130515e-06, + "logits/chosen": -0.4990648925304413, + "logits/rejected": -0.5380234122276306, + "logps/chosen": -49.70621871948242, + "logps/rejected": -114.04237365722656, + "loss": 0.6538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.048579454421997, + "rewards/margins": 6.678717613220215, + "rewards/rejected": -3.630138874053955, + "step": 10162 + }, + { + "epoch": 2.54, + "grad_norm": 7.88667631149292, + "learning_rate": 4.866005148999459e-06, + "logits/chosen": -0.44753527641296387, + "logits/rejected": -0.5764217376708984, + "logps/chosen": -52.60773468017578, + "logps/rejected": -91.69281768798828, + "loss": 0.6551, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0653295516967773, + "rewards/margins": 6.211077690124512, + "rewards/rejected": -3.1457486152648926, + "step": 10163 + }, + { + "epoch": 2.54, + "grad_norm": 4.224672317504883, + "learning_rate": 4.8652194452970245e-06, + "logits/chosen": -0.5342539548873901, + "logits/rejected": -0.5888819694519043, + "logps/chosen": -45.246002197265625, + "logps/rejected": -94.58287811279297, + "loss": 0.5544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3010480403900146, + "rewards/margins": 5.436063766479492, + "rewards/rejected": -2.1350159645080566, + "step": 10164 + }, + { + "epoch": 2.54, + "grad_norm": 6.320306301116943, + "learning_rate": 4.864433744925158e-06, + "logits/chosen": -0.48069509863853455, + "logits/rejected": -0.565695583820343, + "logps/chosen": -54.297264099121094, + "logps/rejected": -102.54474639892578, + "loss": 0.6788, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9966742992401123, + "rewards/margins": 5.949212551116943, + "rewards/rejected": -2.9525387287139893, + "step": 10165 + }, + { + "epoch": 2.54, + "grad_norm": 4.217679977416992, + "learning_rate": 4.863648047903279e-06, + "logits/chosen": -0.5231373310089111, + "logits/rejected": -0.6550618410110474, + "logps/chosen": -70.68724822998047, + "logps/rejected": -86.97346496582031, + "loss": 0.6791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9547698497772217, + "rewards/margins": 6.522237777709961, + "rewards/rejected": -3.5674681663513184, + "step": 10166 + }, + { + "epoch": 2.54, + "grad_norm": 4.2195868492126465, + "learning_rate": 4.862862354250802e-06, + "logits/chosen": -0.4836987853050232, + "logits/rejected": -0.5538613796234131, + "logps/chosen": -48.65040588378906, + "logps/rejected": -85.43701171875, + "loss": 0.5808, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.691291570663452, + "rewards/margins": 5.211833477020264, + "rewards/rejected": -2.5205421447753906, + "step": 10167 + }, + { + "epoch": 2.54, + "grad_norm": 5.398910999298096, + "learning_rate": 4.8620766639871425e-06, + "logits/chosen": -0.5085835456848145, + "logits/rejected": -0.5829291343688965, + "logps/chosen": -52.615421295166016, + "logps/rejected": -103.81565856933594, + "loss": 0.6503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.093566656112671, + "rewards/margins": 5.465038776397705, + "rewards/rejected": -2.3714728355407715, + "step": 10168 + }, + { + "epoch": 2.54, + "grad_norm": 4.102850914001465, + "learning_rate": 4.861290977131715e-06, + "logits/chosen": -0.5510385632514954, + "logits/rejected": -0.6377891302108765, + "logps/chosen": -50.473976135253906, + "logps/rejected": -101.24623107910156, + "loss": 0.6646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8366539478302, + "rewards/margins": 6.077695369720459, + "rewards/rejected": -3.241041421890259, + "step": 10169 + }, + { + "epoch": 2.54, + "grad_norm": 10.125988960266113, + "learning_rate": 4.860505293703934e-06, + "logits/chosen": -0.39871615171432495, + "logits/rejected": -0.5495100617408752, + "logps/chosen": -55.93301773071289, + "logps/rejected": -73.40750122070312, + "loss": 0.6925, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6571571826934814, + "rewards/margins": 4.923020362854004, + "rewards/rejected": -2.2658629417419434, + "step": 10170 + }, + { + "epoch": 2.54, + "grad_norm": 6.962005615234375, + "learning_rate": 4.859719613723217e-06, + "logits/chosen": -0.5247513055801392, + "logits/rejected": -0.5794727802276611, + "logps/chosen": -47.16032791137695, + "logps/rejected": -90.55595397949219, + "loss": 0.6977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9750123023986816, + "rewards/margins": 5.73516845703125, + "rewards/rejected": -2.76015567779541, + "step": 10171 + }, + { + "epoch": 2.54, + "grad_norm": 4.248904705047607, + "learning_rate": 4.858933937208977e-06, + "logits/chosen": -0.46513909101486206, + "logits/rejected": -0.5446493625640869, + "logps/chosen": -46.188419342041016, + "logps/rejected": -79.75868225097656, + "loss": 0.6179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.862595319747925, + "rewards/margins": 4.822370529174805, + "rewards/rejected": -1.9597744941711426, + "step": 10172 + }, + { + "epoch": 2.54, + "grad_norm": 6.623319149017334, + "learning_rate": 4.858148264180629e-06, + "logits/chosen": -0.564017117023468, + "logits/rejected": -0.5759940147399902, + "logps/chosen": -45.347862243652344, + "logps/rejected": -103.65904998779297, + "loss": 0.6852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.019622802734375, + "rewards/margins": 5.029448509216309, + "rewards/rejected": -2.009826183319092, + "step": 10173 + }, + { + "epoch": 2.55, + "grad_norm": 5.484215259552002, + "learning_rate": 4.857362594657587e-06, + "logits/chosen": -0.4819062650203705, + "logits/rejected": -0.5454676151275635, + "logps/chosen": -64.47525787353516, + "logps/rejected": -101.92159271240234, + "loss": 0.8061, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7484536170959473, + "rewards/margins": 5.230037212371826, + "rewards/rejected": -2.481583833694458, + "step": 10174 + }, + { + "epoch": 2.55, + "grad_norm": 4.028243064880371, + "learning_rate": 4.856576928659271e-06, + "logits/chosen": -0.49842533469200134, + "logits/rejected": -0.609079122543335, + "logps/chosen": -59.62815856933594, + "logps/rejected": -90.9432373046875, + "loss": 0.6212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8846495151519775, + "rewards/margins": 6.485165119171143, + "rewards/rejected": -3.600515842437744, + "step": 10175 + }, + { + "epoch": 2.55, + "grad_norm": 45.06917953491211, + "learning_rate": 4.8557912662050875e-06, + "logits/chosen": -0.5385334491729736, + "logits/rejected": -0.628383219242096, + "logps/chosen": -78.03614044189453, + "logps/rejected": -104.54557800292969, + "loss": 0.8353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.36655855178833, + "rewards/margins": 6.40994930267334, + "rewards/rejected": -3.0433907508850098, + "step": 10176 + }, + { + "epoch": 2.55, + "grad_norm": 5.588070392608643, + "learning_rate": 4.8550056073144565e-06, + "logits/chosen": -0.4965053200721741, + "logits/rejected": -0.5479722023010254, + "logps/chosen": -55.69097900390625, + "logps/rejected": -109.34620666503906, + "loss": 0.7711, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.309020757675171, + "rewards/margins": 6.884593963623047, + "rewards/rejected": -3.575573205947876, + "step": 10177 + }, + { + "epoch": 2.55, + "grad_norm": 14.016880989074707, + "learning_rate": 4.854219952006794e-06, + "logits/chosen": -0.5232424139976501, + "logits/rejected": -0.5576819777488708, + "logps/chosen": -68.65077209472656, + "logps/rejected": -104.14323425292969, + "loss": 0.8935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7960479259490967, + "rewards/margins": 5.787991523742676, + "rewards/rejected": -2.991943359375, + "step": 10178 + }, + { + "epoch": 2.55, + "grad_norm": 3.3128414154052734, + "learning_rate": 4.853434300301508e-06, + "logits/chosen": -0.5059213638305664, + "logits/rejected": -0.5393147468566895, + "logps/chosen": -46.62139129638672, + "logps/rejected": -95.06920623779297, + "loss": 0.5951, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25915789604187, + "rewards/margins": 5.5720109939575195, + "rewards/rejected": -2.3128533363342285, + "step": 10179 + }, + { + "epoch": 2.55, + "grad_norm": 3.4256622791290283, + "learning_rate": 4.852648652218017e-06, + "logits/chosen": -0.48668476939201355, + "logits/rejected": -0.5498330593109131, + "logps/chosen": -50.60578155517578, + "logps/rejected": -113.27145385742188, + "loss": 0.5931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.186230182647705, + "rewards/margins": 7.013553619384766, + "rewards/rejected": -3.8273236751556396, + "step": 10180 + }, + { + "epoch": 2.55, + "grad_norm": 5.252811908721924, + "learning_rate": 4.851863007775735e-06, + "logits/chosen": -0.514922022819519, + "logits/rejected": -0.6135846376419067, + "logps/chosen": -54.34122085571289, + "logps/rejected": -80.58171844482422, + "loss": 0.6992, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9067375659942627, + "rewards/margins": 5.225619792938232, + "rewards/rejected": -2.318882465362549, + "step": 10181 + }, + { + "epoch": 2.55, + "grad_norm": 4.411864757537842, + "learning_rate": 4.8510773669940765e-06, + "logits/chosen": -0.4965040683746338, + "logits/rejected": -0.6263070702552795, + "logps/chosen": -65.95015716552734, + "logps/rejected": -86.27259826660156, + "loss": 0.6378, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.101754903793335, + "rewards/margins": 5.917255401611328, + "rewards/rejected": -2.815500259399414, + "step": 10182 + }, + { + "epoch": 2.55, + "grad_norm": 13.306241035461426, + "learning_rate": 4.850291729892453e-06, + "logits/chosen": -0.42793309688568115, + "logits/rejected": -0.5152987837791443, + "logps/chosen": -60.978919982910156, + "logps/rejected": -115.28080749511719, + "loss": 0.8144, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6343047618865967, + "rewards/margins": 5.465935707092285, + "rewards/rejected": -2.8316304683685303, + "step": 10183 + }, + { + "epoch": 2.55, + "grad_norm": 7.025801658630371, + "learning_rate": 4.849506096490281e-06, + "logits/chosen": -0.4645090699195862, + "logits/rejected": -0.5320562720298767, + "logps/chosen": -51.226131439208984, + "logps/rejected": -107.52523040771484, + "loss": 0.6268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2744460105895996, + "rewards/margins": 6.865425109863281, + "rewards/rejected": -3.5909781455993652, + "step": 10184 + }, + { + "epoch": 2.55, + "grad_norm": 7.40404748916626, + "learning_rate": 4.8487204668069735e-06, + "logits/chosen": -0.5561639666557312, + "logits/rejected": -0.6433893442153931, + "logps/chosen": -57.895877838134766, + "logps/rejected": -90.82864379882812, + "loss": 0.7272, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1919422149658203, + "rewards/margins": 6.20837926864624, + "rewards/rejected": -3.016436815261841, + "step": 10185 + }, + { + "epoch": 2.55, + "grad_norm": 4.966202735900879, + "learning_rate": 4.847934840861945e-06, + "logits/chosen": -0.5426890254020691, + "logits/rejected": -0.6675031781196594, + "logps/chosen": -63.555763244628906, + "logps/rejected": -89.30879974365234, + "loss": 0.6867, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1935696601867676, + "rewards/margins": 5.796072006225586, + "rewards/rejected": -2.6025023460388184, + "step": 10186 + }, + { + "epoch": 2.55, + "grad_norm": 1.787015676498413, + "learning_rate": 4.847149218674607e-06, + "logits/chosen": -0.49467796087265015, + "logits/rejected": -0.6231334805488586, + "logps/chosen": -53.791446685791016, + "logps/rejected": -102.84294891357422, + "loss": 0.5217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9536263942718506, + "rewards/margins": 7.21265172958374, + "rewards/rejected": -4.259025573730469, + "step": 10187 + }, + { + "epoch": 2.55, + "grad_norm": 7.747024059295654, + "learning_rate": 4.846363600264373e-06, + "logits/chosen": -0.5553210973739624, + "logits/rejected": -0.6751476526260376, + "logps/chosen": -50.559425354003906, + "logps/rejected": -102.19882202148438, + "loss": 0.7412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0309810638427734, + "rewards/margins": 6.745183944702148, + "rewards/rejected": -3.714202642440796, + "step": 10188 + }, + { + "epoch": 2.55, + "grad_norm": 9.14340877532959, + "learning_rate": 4.845577985650662e-06, + "logits/chosen": -0.46071937680244446, + "logits/rejected": -0.56069415807724, + "logps/chosen": -49.909908294677734, + "logps/rejected": -83.41259002685547, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.214019298553467, + "rewards/margins": 4.884830474853516, + "rewards/rejected": -1.6708111763000488, + "step": 10189 + }, + { + "epoch": 2.55, + "grad_norm": 4.419879913330078, + "learning_rate": 4.84479237485288e-06, + "logits/chosen": -0.5099876523017883, + "logits/rejected": -0.6505590081214905, + "logps/chosen": -53.139060974121094, + "logps/rejected": -99.16021728515625, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0104737281799316, + "rewards/margins": 7.132246017456055, + "rewards/rejected": -4.121772766113281, + "step": 10190 + }, + { + "epoch": 2.55, + "grad_norm": 7.26828145980835, + "learning_rate": 4.844006767890444e-06, + "logits/chosen": -0.47026368975639343, + "logits/rejected": -0.5928747057914734, + "logps/chosen": -60.92308807373047, + "logps/rejected": -94.64938354492188, + "loss": 0.6361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.358567237854004, + "rewards/margins": 6.9605326652526855, + "rewards/rejected": -3.6019651889801025, + "step": 10191 + }, + { + "epoch": 2.55, + "grad_norm": 5.880340099334717, + "learning_rate": 4.843221164782766e-06, + "logits/chosen": -0.5189535617828369, + "logits/rejected": -0.5327341556549072, + "logps/chosen": -54.84075164794922, + "logps/rejected": -114.44020080566406, + "loss": 0.711, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.892076015472412, + "rewards/margins": 5.452138900756836, + "rewards/rejected": -2.560062885284424, + "step": 10192 + }, + { + "epoch": 2.55, + "grad_norm": 12.48983097076416, + "learning_rate": 4.842435565549262e-06, + "logits/chosen": -0.5020207762718201, + "logits/rejected": -0.5375683903694153, + "logps/chosen": -56.4929084777832, + "logps/rejected": -109.91610717773438, + "loss": 0.8127, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.739386558532715, + "rewards/margins": 5.415905952453613, + "rewards/rejected": -2.6765189170837402, + "step": 10193 + }, + { + "epoch": 2.55, + "grad_norm": 4.121589183807373, + "learning_rate": 4.841649970209341e-06, + "logits/chosen": -0.5294763445854187, + "logits/rejected": -0.5896034240722656, + "logps/chosen": -46.03659439086914, + "logps/rejected": -95.3181381225586, + "loss": 0.6641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.513056755065918, + "rewards/margins": 6.366563320159912, + "rewards/rejected": -2.853506565093994, + "step": 10194 + }, + { + "epoch": 2.55, + "grad_norm": 6.129052639007568, + "learning_rate": 4.840864378782418e-06, + "logits/chosen": -0.49499350786209106, + "logits/rejected": -0.5570871829986572, + "logps/chosen": -51.90901184082031, + "logps/rejected": -100.76866149902344, + "loss": 0.7411, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1974565982818604, + "rewards/margins": 6.635268688201904, + "rewards/rejected": -3.437812089920044, + "step": 10195 + }, + { + "epoch": 2.55, + "grad_norm": 7.047422885894775, + "learning_rate": 4.840078791287907e-06, + "logits/chosen": -0.4625319838523865, + "logits/rejected": -0.5427899360656738, + "logps/chosen": -46.39863586425781, + "logps/rejected": -83.48474884033203, + "loss": 0.6239, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8329503536224365, + "rewards/margins": 5.391149520874023, + "rewards/rejected": -2.558199405670166, + "step": 10196 + }, + { + "epoch": 2.55, + "grad_norm": 6.398101329803467, + "learning_rate": 4.839293207745217e-06, + "logits/chosen": -0.5379860401153564, + "logits/rejected": -0.6014443635940552, + "logps/chosen": -54.135921478271484, + "logps/rejected": -98.51348876953125, + "loss": 0.6874, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0540294647216797, + "rewards/margins": 6.319240570068359, + "rewards/rejected": -3.265211343765259, + "step": 10197 + }, + { + "epoch": 2.55, + "grad_norm": 3.518157720565796, + "learning_rate": 4.838507628173763e-06, + "logits/chosen": -0.4853779077529907, + "logits/rejected": -0.5675702095031738, + "logps/chosen": -53.94260025024414, + "logps/rejected": -88.4246826171875, + "loss": 0.5858, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.113262891769409, + "rewards/margins": 5.912391662597656, + "rewards/rejected": -2.799128770828247, + "step": 10198 + }, + { + "epoch": 2.55, + "grad_norm": 2.775467872619629, + "learning_rate": 4.837722052592958e-06, + "logits/chosen": -0.38909128308296204, + "logits/rejected": -0.4738885164260864, + "logps/chosen": -53.6860466003418, + "logps/rejected": -110.80904388427734, + "loss": 0.5292, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.80216121673584, + "rewards/margins": 6.6873979568481445, + "rewards/rejected": -3.885237216949463, + "step": 10199 + }, + { + "epoch": 2.55, + "grad_norm": 5.127522945404053, + "learning_rate": 4.836936481022215e-06, + "logits/chosen": -0.5260167121887207, + "logits/rejected": -0.6407032608985901, + "logps/chosen": -56.03197479248047, + "logps/rejected": -97.9074478149414, + "loss": 0.652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.767946243286133, + "rewards/margins": 6.725898265838623, + "rewards/rejected": -3.9579524993896484, + "step": 10200 + }, + { + "epoch": 2.55, + "grad_norm": 4.108701229095459, + "learning_rate": 4.8361509134809424e-06, + "logits/chosen": -0.47741517424583435, + "logits/rejected": -0.5459005236625671, + "logps/chosen": -53.608463287353516, + "logps/rejected": -83.35433959960938, + "loss": 0.7057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.977182388305664, + "rewards/margins": 5.148473262786865, + "rewards/rejected": -2.1712911128997803, + "step": 10201 + }, + { + "epoch": 2.55, + "grad_norm": 13.650273323059082, + "learning_rate": 4.835365349988556e-06, + "logits/chosen": -0.48522520065307617, + "logits/rejected": -0.5403168201446533, + "logps/chosen": -49.78714370727539, + "logps/rejected": -109.85276794433594, + "loss": 0.7702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9240517616271973, + "rewards/margins": 6.751889228820801, + "rewards/rejected": -3.8278372287750244, + "step": 10202 + }, + { + "epoch": 2.55, + "grad_norm": 3.466481924057007, + "learning_rate": 4.834579790564468e-06, + "logits/chosen": -0.45634227991104126, + "logits/rejected": -0.48508715629577637, + "logps/chosen": -47.09090805053711, + "logps/rejected": -91.1736068725586, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.284050226211548, + "rewards/margins": 4.817176818847656, + "rewards/rejected": -1.5331265926361084, + "step": 10203 + }, + { + "epoch": 2.55, + "grad_norm": 13.73631763458252, + "learning_rate": 4.833794235228089e-06, + "logits/chosen": -0.48518118262290955, + "logits/rejected": -0.5498578548431396, + "logps/chosen": -55.3491096496582, + "logps/rejected": -82.33407592773438, + "loss": 0.7116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1124422550201416, + "rewards/margins": 5.096170902252197, + "rewards/rejected": -1.9837291240692139, + "step": 10204 + }, + { + "epoch": 2.55, + "grad_norm": 6.545392036437988, + "learning_rate": 4.8330086839988284e-06, + "logits/chosen": -0.5461868047714233, + "logits/rejected": -0.6186643242835999, + "logps/chosen": -47.697147369384766, + "logps/rejected": -94.30074310302734, + "loss": 0.7468, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.901157855987549, + "rewards/margins": 5.603696346282959, + "rewards/rejected": -2.70253849029541, + "step": 10205 + }, + { + "epoch": 2.55, + "grad_norm": 5.386034965515137, + "learning_rate": 4.832223136896103e-06, + "logits/chosen": -0.4495389759540558, + "logits/rejected": -0.5355788469314575, + "logps/chosen": -58.49673080444336, + "logps/rejected": -88.70640563964844, + "loss": 0.7154, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.933184862136841, + "rewards/margins": 4.775294780731201, + "rewards/rejected": -1.8421101570129395, + "step": 10206 + }, + { + "epoch": 2.55, + "grad_norm": 12.2954683303833, + "learning_rate": 4.831437593939324e-06, + "logits/chosen": -0.45639145374298096, + "logits/rejected": -0.5481155514717102, + "logps/chosen": -57.77339553833008, + "logps/rejected": -96.5577163696289, + "loss": 0.7716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.018503189086914, + "rewards/margins": 6.087192058563232, + "rewards/rejected": -3.0686895847320557, + "step": 10207 + }, + { + "epoch": 2.55, + "grad_norm": 7.5150651931762695, + "learning_rate": 4.830652055147899e-06, + "logits/chosen": -0.43278631567955017, + "logits/rejected": -0.5284582376480103, + "logps/chosen": -64.91636657714844, + "logps/rejected": -90.19243621826172, + "loss": 0.896, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8666775226593018, + "rewards/margins": 5.303211688995361, + "rewards/rejected": -2.4365346431732178, + "step": 10208 + }, + { + "epoch": 2.55, + "grad_norm": 4.401941299438477, + "learning_rate": 4.829866520541243e-06, + "logits/chosen": -0.5293564200401306, + "logits/rejected": -0.6615837812423706, + "logps/chosen": -48.7916259765625, + "logps/rejected": -97.33988952636719, + "loss": 0.5669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1838386058807373, + "rewards/margins": 7.065206527709961, + "rewards/rejected": -3.8813679218292236, + "step": 10209 + }, + { + "epoch": 2.55, + "grad_norm": 15.21113395690918, + "learning_rate": 4.829080990138765e-06, + "logits/chosen": -0.5186861753463745, + "logits/rejected": -0.5764992833137512, + "logps/chosen": -58.86896896362305, + "logps/rejected": -95.1689682006836, + "loss": 0.7823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7382357120513916, + "rewards/margins": 5.443319320678711, + "rewards/rejected": -2.7050833702087402, + "step": 10210 + }, + { + "epoch": 2.55, + "grad_norm": 6.1313605308532715, + "learning_rate": 4.828295463959879e-06, + "logits/chosen": -0.5652084350585938, + "logits/rejected": -0.6494905948638916, + "logps/chosen": -43.57626724243164, + "logps/rejected": -108.8672866821289, + "loss": 0.6078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.058898448944092, + "rewards/margins": 6.324542045593262, + "rewards/rejected": -3.2656431198120117, + "step": 10211 + }, + { + "epoch": 2.55, + "grad_norm": 5.623495578765869, + "learning_rate": 4.827509942023994e-06, + "logits/chosen": -0.4660297632217407, + "logits/rejected": -0.515701413154602, + "logps/chosen": -58.47246170043945, + "logps/rejected": -101.60321044921875, + "loss": 0.7759, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9293994903564453, + "rewards/margins": 4.871585845947266, + "rewards/rejected": -1.9421865940093994, + "step": 10212 + }, + { + "epoch": 2.55, + "grad_norm": 2.476928949356079, + "learning_rate": 4.826724424350522e-06, + "logits/chosen": -0.44835638999938965, + "logits/rejected": -0.6228024363517761, + "logps/chosen": -53.25991439819336, + "logps/rejected": -94.39616394042969, + "loss": 0.5358, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.100402593612671, + "rewards/margins": 7.9522705078125, + "rewards/rejected": -4.85186767578125, + "step": 10213 + }, + { + "epoch": 2.56, + "grad_norm": 5.752492427825928, + "learning_rate": 4.825938910958874e-06, + "logits/chosen": -0.48905670642852783, + "logits/rejected": -0.5971673727035522, + "logps/chosen": -52.45607376098633, + "logps/rejected": -98.45182800292969, + "loss": 0.6488, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.048682928085327, + "rewards/margins": 6.817787170410156, + "rewards/rejected": -3.76910400390625, + "step": 10214 + }, + { + "epoch": 2.56, + "grad_norm": 3.3221898078918457, + "learning_rate": 4.82515340186846e-06, + "logits/chosen": -0.5070742964744568, + "logits/rejected": -0.5795482993125916, + "logps/chosen": -54.33757781982422, + "logps/rejected": -89.11392974853516, + "loss": 0.6514, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.798065662384033, + "rewards/margins": 5.617832660675049, + "rewards/rejected": -2.8197665214538574, + "step": 10215 + }, + { + "epoch": 2.56, + "grad_norm": 4.450613021850586, + "learning_rate": 4.824367897098692e-06, + "logits/chosen": -0.5065339803695679, + "logits/rejected": -0.5954481959342957, + "logps/chosen": -66.97755432128906, + "logps/rejected": -104.17462158203125, + "loss": 0.7717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7893621921539307, + "rewards/margins": 5.803442001342773, + "rewards/rejected": -3.0140798091888428, + "step": 10216 + }, + { + "epoch": 2.56, + "grad_norm": 5.518383026123047, + "learning_rate": 4.823582396668978e-06, + "logits/chosen": -0.43889832496643066, + "logits/rejected": -0.5357151627540588, + "logps/chosen": -55.380149841308594, + "logps/rejected": -96.4681625366211, + "loss": 0.5839, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8604683876037598, + "rewards/margins": 6.454470157623291, + "rewards/rejected": -3.594001531600952, + "step": 10217 + }, + { + "epoch": 2.56, + "grad_norm": 7.849263668060303, + "learning_rate": 4.822796900598735e-06, + "logits/chosen": -0.4155184030532837, + "logits/rejected": -0.503318727016449, + "logps/chosen": -53.2650146484375, + "logps/rejected": -95.76591491699219, + "loss": 0.6431, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.023996591567993, + "rewards/margins": 5.966644287109375, + "rewards/rejected": -2.942647695541382, + "step": 10218 + }, + { + "epoch": 2.56, + "grad_norm": 6.001240253448486, + "learning_rate": 4.822011408907365e-06, + "logits/chosen": -0.46399977803230286, + "logits/rejected": -0.5802633166313171, + "logps/chosen": -55.14494705200195, + "logps/rejected": -89.04920196533203, + "loss": 0.7264, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0042903423309326, + "rewards/margins": 5.7536516189575195, + "rewards/rejected": -2.749361038208008, + "step": 10219 + }, + { + "epoch": 2.56, + "grad_norm": 4.317863941192627, + "learning_rate": 4.8212259216142835e-06, + "logits/chosen": -0.4906291365623474, + "logits/rejected": -0.6223202347755432, + "logps/chosen": -52.642982482910156, + "logps/rejected": -84.01046752929688, + "loss": 0.7239, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.00557017326355, + "rewards/margins": 6.446672439575195, + "rewards/rejected": -3.4411017894744873, + "step": 10220 + }, + { + "epoch": 2.56, + "grad_norm": 5.570959568023682, + "learning_rate": 4.820440438738902e-06, + "logits/chosen": -0.4927363991737366, + "logits/rejected": -0.563837468624115, + "logps/chosen": -54.73875045776367, + "logps/rejected": -99.44918823242188, + "loss": 0.739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3314292430877686, + "rewards/margins": 6.5483856201171875, + "rewards/rejected": -3.216956853866577, + "step": 10221 + }, + { + "epoch": 2.56, + "grad_norm": 5.001797199249268, + "learning_rate": 4.819654960300625e-06, + "logits/chosen": -0.5220495462417603, + "logits/rejected": -0.6031755805015564, + "logps/chosen": -59.991737365722656, + "logps/rejected": -90.52947235107422, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3070731163024902, + "rewards/margins": 6.118398189544678, + "rewards/rejected": -2.8113248348236084, + "step": 10222 + }, + { + "epoch": 2.56, + "grad_norm": 25.52832794189453, + "learning_rate": 4.818869486318867e-06, + "logits/chosen": -0.47053271532058716, + "logits/rejected": -0.5526304841041565, + "logps/chosen": -55.237693786621094, + "logps/rejected": -90.8392333984375, + "loss": 0.7236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9630300998687744, + "rewards/margins": 4.965982913970947, + "rewards/rejected": -2.0029525756835938, + "step": 10223 + }, + { + "epoch": 2.56, + "grad_norm": 8.383684158325195, + "learning_rate": 4.8180840168130366e-06, + "logits/chosen": -0.5326130390167236, + "logits/rejected": -0.6157697439193726, + "logps/chosen": -54.86016845703125, + "logps/rejected": -100.88787078857422, + "loss": 0.7185, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.90910005569458, + "rewards/margins": 6.405332088470459, + "rewards/rejected": -3.496232271194458, + "step": 10224 + }, + { + "epoch": 2.56, + "grad_norm": 9.739679336547852, + "learning_rate": 4.817298551802544e-06, + "logits/chosen": -0.5536218285560608, + "logits/rejected": -0.6332136988639832, + "logps/chosen": -46.75145721435547, + "logps/rejected": -98.58528137207031, + "loss": 0.6044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2498321533203125, + "rewards/margins": 6.762314796447754, + "rewards/rejected": -3.512482166290283, + "step": 10225 + }, + { + "epoch": 2.56, + "grad_norm": 2.4768006801605225, + "learning_rate": 4.816513091306798e-06, + "logits/chosen": -0.5166967511177063, + "logits/rejected": -0.6135469079017639, + "logps/chosen": -54.40369415283203, + "logps/rejected": -105.20955657958984, + "loss": 0.5492, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.309863567352295, + "rewards/margins": 6.471871376037598, + "rewards/rejected": -3.1620073318481445, + "step": 10226 + }, + { + "epoch": 2.56, + "grad_norm": 5.866385459899902, + "learning_rate": 4.815727635345209e-06, + "logits/chosen": -0.45728689432144165, + "logits/rejected": -0.5158876180648804, + "logps/chosen": -57.56340789794922, + "logps/rejected": -93.78649139404297, + "loss": 0.7908, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.165820360183716, + "rewards/margins": 5.971423149108887, + "rewards/rejected": -2.80560302734375, + "step": 10227 + }, + { + "epoch": 2.56, + "grad_norm": 9.834312438964844, + "learning_rate": 4.814942183937187e-06, + "logits/chosen": -0.44744619727134705, + "logits/rejected": -0.5557920336723328, + "logps/chosen": -54.39560317993164, + "logps/rejected": -87.14790344238281, + "loss": 0.6503, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.156085729598999, + "rewards/margins": 6.5284576416015625, + "rewards/rejected": -3.3723719120025635, + "step": 10228 + }, + { + "epoch": 2.56, + "grad_norm": 4.964892387390137, + "learning_rate": 4.814156737102139e-06, + "logits/chosen": -0.46800529956817627, + "logits/rejected": -0.6029844880104065, + "logps/chosen": -62.399024963378906, + "logps/rejected": -86.14295196533203, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2485954761505127, + "rewards/margins": 6.453708648681641, + "rewards/rejected": -3.205112934112549, + "step": 10229 + }, + { + "epoch": 2.56, + "grad_norm": 8.044158935546875, + "learning_rate": 4.8133712948594765e-06, + "logits/chosen": -0.5569052696228027, + "logits/rejected": -0.6346913576126099, + "logps/chosen": -61.0851936340332, + "logps/rejected": -104.33419799804688, + "loss": 0.6877, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8009090423583984, + "rewards/margins": 6.746844291687012, + "rewards/rejected": -3.945934772491455, + "step": 10230 + }, + { + "epoch": 2.56, + "grad_norm": 4.8149518966674805, + "learning_rate": 4.812585857228606e-06, + "logits/chosen": -0.45532530546188354, + "logits/rejected": -0.5618990659713745, + "logps/chosen": -52.378787994384766, + "logps/rejected": -97.5338134765625, + "loss": 0.6743, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1334455013275146, + "rewards/margins": 6.165725231170654, + "rewards/rejected": -3.0322794914245605, + "step": 10231 + }, + { + "epoch": 2.56, + "grad_norm": 10.730972290039062, + "learning_rate": 4.811800424228942e-06, + "logits/chosen": -0.3842451274394989, + "logits/rejected": -0.438148558139801, + "logps/chosen": -47.68697738647461, + "logps/rejected": -100.642578125, + "loss": 0.6657, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.904067277908325, + "rewards/margins": 5.266880035400391, + "rewards/rejected": -2.3628125190734863, + "step": 10232 + }, + { + "epoch": 2.56, + "grad_norm": 5.297936916351318, + "learning_rate": 4.811014995879887e-06, + "logits/chosen": -0.4129956066608429, + "logits/rejected": -0.4940643906593323, + "logps/chosen": -46.898704528808594, + "logps/rejected": -108.66790008544922, + "loss": 0.5451, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0110015869140625, + "rewards/margins": 6.661252021789551, + "rewards/rejected": -3.6502504348754883, + "step": 10233 + }, + { + "epoch": 2.56, + "grad_norm": 4.300922393798828, + "learning_rate": 4.810229572200852e-06, + "logits/chosen": -0.5987284183502197, + "logits/rejected": -0.6710848808288574, + "logps/chosen": -57.84392547607422, + "logps/rejected": -86.85374450683594, + "loss": 0.6553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9950573444366455, + "rewards/margins": 6.415407180786133, + "rewards/rejected": -3.4203500747680664, + "step": 10234 + }, + { + "epoch": 2.56, + "grad_norm": 3.7509655952453613, + "learning_rate": 4.809444153211247e-06, + "logits/chosen": -0.4965077042579651, + "logits/rejected": -0.5910016298294067, + "logps/chosen": -52.07245635986328, + "logps/rejected": -105.37277221679688, + "loss": 0.5315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.096175193786621, + "rewards/margins": 7.386301040649414, + "rewards/rejected": -4.290126323699951, + "step": 10235 + }, + { + "epoch": 2.56, + "grad_norm": 2.406034469604492, + "learning_rate": 4.80865873893048e-06, + "logits/chosen": -0.49763989448547363, + "logits/rejected": -0.6064357757568359, + "logps/chosen": -55.49724578857422, + "logps/rejected": -92.39794921875, + "loss": 0.5531, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9013609886169434, + "rewards/margins": 6.176375865936279, + "rewards/rejected": -3.275015115737915, + "step": 10236 + }, + { + "epoch": 2.56, + "grad_norm": 14.014803886413574, + "learning_rate": 4.807873329377959e-06, + "logits/chosen": -0.47246819734573364, + "logits/rejected": -0.5182666182518005, + "logps/chosen": -54.647727966308594, + "logps/rejected": -100.64044952392578, + "loss": 0.6025, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.345667600631714, + "rewards/margins": 5.620900630950928, + "rewards/rejected": -2.275233030319214, + "step": 10237 + }, + { + "epoch": 2.56, + "grad_norm": 6.096248626708984, + "learning_rate": 4.807087924573092e-06, + "logits/chosen": -0.4671284556388855, + "logits/rejected": -0.5237656831741333, + "logps/chosen": -59.018959045410156, + "logps/rejected": -89.55927276611328, + "loss": 0.8145, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.03112530708313, + "rewards/margins": 4.952775955200195, + "rewards/rejected": -1.9216505289077759, + "step": 10238 + }, + { + "epoch": 2.56, + "grad_norm": 3.3350796699523926, + "learning_rate": 4.806302524535289e-06, + "logits/chosen": -0.5330364108085632, + "logits/rejected": -0.6000330448150635, + "logps/chosen": -46.871009826660156, + "logps/rejected": -99.10275268554688, + "loss": 0.6315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4616496562957764, + "rewards/margins": 6.02137565612793, + "rewards/rejected": -2.5597262382507324, + "step": 10239 + }, + { + "epoch": 2.56, + "grad_norm": 8.348567962646484, + "learning_rate": 4.805517129283956e-06, + "logits/chosen": -0.541897177696228, + "logits/rejected": -0.672458827495575, + "logps/chosen": -60.22344207763672, + "logps/rejected": -100.72268676757812, + "loss": 0.6719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.107599973678589, + "rewards/margins": 6.8982930183410645, + "rewards/rejected": -3.7906928062438965, + "step": 10240 + }, + { + "epoch": 2.56, + "grad_norm": 3.7711243629455566, + "learning_rate": 4.8047317388385004e-06, + "logits/chosen": -0.425235390663147, + "logits/rejected": -0.5689429044723511, + "logps/chosen": -58.62672424316406, + "logps/rejected": -88.64686584472656, + "loss": 0.5491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.110836982727051, + "rewards/margins": 6.6048970222473145, + "rewards/rejected": -3.4940600395202637, + "step": 10241 + }, + { + "epoch": 2.56, + "grad_norm": 19.356149673461914, + "learning_rate": 4.803946353218331e-06, + "logits/chosen": -0.5452091693878174, + "logits/rejected": -0.6523026823997498, + "logps/chosen": -47.25259017944336, + "logps/rejected": -94.3891830444336, + "loss": 0.5748, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8089656829833984, + "rewards/margins": 6.354701519012451, + "rewards/rejected": -3.545736074447632, + "step": 10242 + }, + { + "epoch": 2.56, + "grad_norm": 6.351850986480713, + "learning_rate": 4.803160972442858e-06, + "logits/chosen": -0.5556403398513794, + "logits/rejected": -0.6803576946258545, + "logps/chosen": -61.02199172973633, + "logps/rejected": -89.99662017822266, + "loss": 0.7421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0527021884918213, + "rewards/margins": 5.875307559967041, + "rewards/rejected": -2.822605609893799, + "step": 10243 + }, + { + "epoch": 2.56, + "grad_norm": 7.046515941619873, + "learning_rate": 4.802375596531485e-06, + "logits/chosen": -0.4709526598453522, + "logits/rejected": -0.5399569869041443, + "logps/chosen": -57.399410247802734, + "logps/rejected": -102.42520904541016, + "loss": 0.6464, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9349184036254883, + "rewards/margins": 6.424136161804199, + "rewards/rejected": -3.489217758178711, + "step": 10244 + }, + { + "epoch": 2.56, + "grad_norm": 5.015069007873535, + "learning_rate": 4.8015902255036216e-06, + "logits/chosen": -0.5306023359298706, + "logits/rejected": -0.5972210168838501, + "logps/chosen": -60.329498291015625, + "logps/rejected": -106.26253509521484, + "loss": 0.6317, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1602976322174072, + "rewards/margins": 6.585667610168457, + "rewards/rejected": -3.425370216369629, + "step": 10245 + }, + { + "epoch": 2.56, + "grad_norm": 2.608982563018799, + "learning_rate": 4.800804859378675e-06, + "logits/chosen": -0.5086904764175415, + "logits/rejected": -0.6100436449050903, + "logps/chosen": -57.12856674194336, + "logps/rejected": -97.1005630493164, + "loss": 0.622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1798288822174072, + "rewards/margins": 6.246593475341797, + "rewards/rejected": -3.0667645931243896, + "step": 10246 + }, + { + "epoch": 2.56, + "grad_norm": 2.4343550205230713, + "learning_rate": 4.8000194981760516e-06, + "logits/chosen": -0.5227934122085571, + "logits/rejected": -0.6325802803039551, + "logps/chosen": -59.17631149291992, + "logps/rejected": -87.78807067871094, + "loss": 0.6012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.977137804031372, + "rewards/margins": 6.602188587188721, + "rewards/rejected": -3.6250503063201904, + "step": 10247 + }, + { + "epoch": 2.56, + "grad_norm": 2.6628360748291016, + "learning_rate": 4.799234141915159e-06, + "logits/chosen": -0.4065455496311188, + "logits/rejected": -0.5286297798156738, + "logps/chosen": -66.85820007324219, + "logps/rejected": -98.05501556396484, + "loss": 0.6024, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.107642412185669, + "rewards/margins": 6.766103744506836, + "rewards/rejected": -3.658461332321167, + "step": 10248 + }, + { + "epoch": 2.56, + "grad_norm": 4.780600547790527, + "learning_rate": 4.798448790615404e-06, + "logits/chosen": -0.5502200722694397, + "logits/rejected": -0.6507378220558167, + "logps/chosen": -60.138404846191406, + "logps/rejected": -83.10759735107422, + "loss": 0.6394, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2197136878967285, + "rewards/margins": 5.8664116859436035, + "rewards/rejected": -2.646697759628296, + "step": 10249 + }, + { + "epoch": 2.56, + "grad_norm": 5.269693374633789, + "learning_rate": 4.797663444296197e-06, + "logits/chosen": -0.5004825592041016, + "logits/rejected": -0.5613378286361694, + "logps/chosen": -54.49496841430664, + "logps/rejected": -97.74250793457031, + "loss": 0.6934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0792055130004883, + "rewards/margins": 6.653186798095703, + "rewards/rejected": -3.5739810466766357, + "step": 10250 + }, + { + "epoch": 2.56, + "grad_norm": 3.4701802730560303, + "learning_rate": 4.7968781029769375e-06, + "logits/chosen": -0.5065345764160156, + "logits/rejected": -0.6149123907089233, + "logps/chosen": -53.59387969970703, + "logps/rejected": -89.62516021728516, + "loss": 0.6526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.160367965698242, + "rewards/margins": 5.770971775054932, + "rewards/rejected": -2.6106040477752686, + "step": 10251 + }, + { + "epoch": 2.56, + "grad_norm": 3.146996259689331, + "learning_rate": 4.796092766677038e-06, + "logits/chosen": -0.4690227210521698, + "logits/rejected": -0.5501947999000549, + "logps/chosen": -50.765159606933594, + "logps/rejected": -88.17188262939453, + "loss": 0.6112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2968995571136475, + "rewards/margins": 6.5384650230407715, + "rewards/rejected": -3.2415647506713867, + "step": 10252 + }, + { + "epoch": 2.56, + "grad_norm": 9.834288597106934, + "learning_rate": 4.795307435415903e-06, + "logits/chosen": -0.6084293127059937, + "logits/rejected": -0.6893726587295532, + "logps/chosen": -53.51832580566406, + "logps/rejected": -108.59880065917969, + "loss": 0.6644, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.050966501235962, + "rewards/margins": 7.261521816253662, + "rewards/rejected": -4.2105560302734375, + "step": 10253 + }, + { + "epoch": 2.57, + "grad_norm": 5.416876792907715, + "learning_rate": 4.794522109212938e-06, + "logits/chosen": -0.543364405632019, + "logits/rejected": -0.5787519216537476, + "logps/chosen": -45.80509948730469, + "logps/rejected": -111.98054504394531, + "loss": 0.6404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0094103813171387, + "rewards/margins": 6.040378093719482, + "rewards/rejected": -3.030967950820923, + "step": 10254 + }, + { + "epoch": 2.57, + "grad_norm": 3.380906105041504, + "learning_rate": 4.7937367880875514e-06, + "logits/chosen": -0.4641701877117157, + "logits/rejected": -0.4845110774040222, + "logps/chosen": -47.6731071472168, + "logps/rejected": -112.68052673339844, + "loss": 0.5668, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1003990173339844, + "rewards/margins": 6.193932056427002, + "rewards/rejected": -3.0935332775115967, + "step": 10255 + }, + { + "epoch": 2.57, + "grad_norm": 9.829222679138184, + "learning_rate": 4.792951472059147e-06, + "logits/chosen": -0.5062052011489868, + "logits/rejected": -0.5802306532859802, + "logps/chosen": -47.82483673095703, + "logps/rejected": -83.48820495605469, + "loss": 0.7133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9930641651153564, + "rewards/margins": 5.44347620010376, + "rewards/rejected": -2.4504120349884033, + "step": 10256 + }, + { + "epoch": 2.57, + "grad_norm": 3.6839776039123535, + "learning_rate": 4.792166161147134e-06, + "logits/chosen": -0.5238555669784546, + "logits/rejected": -0.6046426892280579, + "logps/chosen": -45.9099006652832, + "logps/rejected": -90.20439147949219, + "loss": 0.6068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9706802368164062, + "rewards/margins": 5.988348960876465, + "rewards/rejected": -3.017669439315796, + "step": 10257 + }, + { + "epoch": 2.57, + "grad_norm": 6.537755966186523, + "learning_rate": 4.791380855370915e-06, + "logits/chosen": -0.5696313977241516, + "logits/rejected": -0.6503483057022095, + "logps/chosen": -53.48780059814453, + "logps/rejected": -89.28518676757812, + "loss": 0.7171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0879087448120117, + "rewards/margins": 6.721089839935303, + "rewards/rejected": -3.6331801414489746, + "step": 10258 + }, + { + "epoch": 2.57, + "grad_norm": 3.851666212081909, + "learning_rate": 4.790595554749896e-06, + "logits/chosen": -0.5259629487991333, + "logits/rejected": -0.6023848056793213, + "logps/chosen": -51.479766845703125, + "logps/rejected": -90.97396087646484, + "loss": 0.6534, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8313639163970947, + "rewards/margins": 5.67910623550415, + "rewards/rejected": -2.8477425575256348, + "step": 10259 + }, + { + "epoch": 2.57, + "grad_norm": 6.560166358947754, + "learning_rate": 4.789810259303484e-06, + "logits/chosen": -0.6030372381210327, + "logits/rejected": -0.6793452501296997, + "logps/chosen": -52.59233474731445, + "logps/rejected": -109.993896484375, + "loss": 0.7519, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3373608589172363, + "rewards/margins": 6.829410552978516, + "rewards/rejected": -3.4920501708984375, + "step": 10260 + }, + { + "epoch": 2.57, + "grad_norm": 5.408576011657715, + "learning_rate": 4.789024969051086e-06, + "logits/chosen": -0.5112717151641846, + "logits/rejected": -0.6062557101249695, + "logps/chosen": -50.05303955078125, + "logps/rejected": -99.15509033203125, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.201763153076172, + "rewards/margins": 6.215946197509766, + "rewards/rejected": -3.0141825675964355, + "step": 10261 + }, + { + "epoch": 2.57, + "grad_norm": 4.0876054763793945, + "learning_rate": 4.788239684012104e-06, + "logits/chosen": -0.4859623610973358, + "logits/rejected": -0.5593616962432861, + "logps/chosen": -59.548095703125, + "logps/rejected": -114.24772644042969, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8414862155914307, + "rewards/margins": 7.125544548034668, + "rewards/rejected": -4.2840576171875, + "step": 10262 + }, + { + "epoch": 2.57, + "grad_norm": 8.067859649658203, + "learning_rate": 4.787454404205943e-06, + "logits/chosen": -0.49363774061203003, + "logits/rejected": -0.5832268595695496, + "logps/chosen": -53.73436737060547, + "logps/rejected": -101.30843353271484, + "loss": 0.5975, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.975677013397217, + "rewards/margins": 6.304071426391602, + "rewards/rejected": -3.328394889831543, + "step": 10263 + }, + { + "epoch": 2.57, + "grad_norm": 4.963359832763672, + "learning_rate": 4.786669129652013e-06, + "logits/chosen": -0.5170150399208069, + "logits/rejected": -0.6026299595832825, + "logps/chosen": -45.53150177001953, + "logps/rejected": -104.98494720458984, + "loss": 0.5913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9496893882751465, + "rewards/margins": 7.033436298370361, + "rewards/rejected": -4.083746910095215, + "step": 10264 + }, + { + "epoch": 2.57, + "grad_norm": 4.4994096755981445, + "learning_rate": 4.785883860369713e-06, + "logits/chosen": -0.5161585807800293, + "logits/rejected": -0.5969791412353516, + "logps/chosen": -45.295589447021484, + "logps/rejected": -112.18836975097656, + "loss": 0.6136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.238515853881836, + "rewards/margins": 7.307814598083496, + "rewards/rejected": -4.069298267364502, + "step": 10265 + }, + { + "epoch": 2.57, + "grad_norm": 17.043312072753906, + "learning_rate": 4.7850985963784515e-06, + "logits/chosen": -0.4837404489517212, + "logits/rejected": -0.5289621353149414, + "logps/chosen": -43.078025817871094, + "logps/rejected": -103.5404052734375, + "loss": 0.6722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8751680850982666, + "rewards/margins": 6.480374813079834, + "rewards/rejected": -3.6052069664001465, + "step": 10266 + }, + { + "epoch": 2.57, + "grad_norm": 10.820978164672852, + "learning_rate": 4.784313337697632e-06, + "logits/chosen": -0.47346392273902893, + "logits/rejected": -0.5605821013450623, + "logps/chosen": -52.28124237060547, + "logps/rejected": -87.3740005493164, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5226621627807617, + "rewards/margins": 5.483766555786133, + "rewards/rejected": -2.961104393005371, + "step": 10267 + }, + { + "epoch": 2.57, + "grad_norm": 14.165694236755371, + "learning_rate": 4.783528084346661e-06, + "logits/chosen": -0.5832128524780273, + "logits/rejected": -0.6613295078277588, + "logps/chosen": -45.337318420410156, + "logps/rejected": -98.47443389892578, + "loss": 0.7706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6379501819610596, + "rewards/margins": 5.552602291107178, + "rewards/rejected": -2.914651870727539, + "step": 10268 + }, + { + "epoch": 2.57, + "grad_norm": 7.241593360900879, + "learning_rate": 4.782742836344939e-06, + "logits/chosen": -0.6525945067405701, + "logits/rejected": -0.6419017314910889, + "logps/chosen": -51.78651428222656, + "logps/rejected": -108.23450469970703, + "loss": 0.8245, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.966672420501709, + "rewards/margins": 5.297795295715332, + "rewards/rejected": -2.3311233520507812, + "step": 10269 + }, + { + "epoch": 2.57, + "grad_norm": 3.87394380569458, + "learning_rate": 4.781957593711874e-06, + "logits/chosen": -0.5207818746566772, + "logits/rejected": -0.5718688368797302, + "logps/chosen": -48.19933319091797, + "logps/rejected": -109.20433044433594, + "loss": 0.6373, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3278565406799316, + "rewards/margins": 7.8913726806640625, + "rewards/rejected": -4.563516616821289, + "step": 10270 + }, + { + "epoch": 2.57, + "grad_norm": 4.402026176452637, + "learning_rate": 4.781172356466869e-06, + "logits/chosen": -0.5293412804603577, + "logits/rejected": -0.599711537361145, + "logps/chosen": -58.198116302490234, + "logps/rejected": -99.60914611816406, + "loss": 0.7436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2507946491241455, + "rewards/margins": 6.344855308532715, + "rewards/rejected": -3.0940604209899902, + "step": 10271 + }, + { + "epoch": 2.57, + "grad_norm": 3.2476160526275635, + "learning_rate": 4.780387124629327e-06, + "logits/chosen": -0.5712916851043701, + "logits/rejected": -0.6101463437080383, + "logps/chosen": -45.5455322265625, + "logps/rejected": -104.962646484375, + "loss": 0.657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.007392168045044, + "rewards/margins": 6.165408134460449, + "rewards/rejected": -3.1580162048339844, + "step": 10272 + }, + { + "epoch": 2.57, + "grad_norm": 10.22811222076416, + "learning_rate": 4.779601898218653e-06, + "logits/chosen": -0.5334895253181458, + "logits/rejected": -0.6034260988235474, + "logps/chosen": -63.2494010925293, + "logps/rejected": -93.02092742919922, + "loss": 0.9691, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.556734561920166, + "rewards/margins": 5.57842493057251, + "rewards/rejected": -3.0216903686523438, + "step": 10273 + }, + { + "epoch": 2.57, + "grad_norm": 10.478768348693848, + "learning_rate": 4.778816677254249e-06, + "logits/chosen": -0.549187421798706, + "logits/rejected": -0.6323596239089966, + "logps/chosen": -59.11766815185547, + "logps/rejected": -96.43846130371094, + "loss": 0.7501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.71819806098938, + "rewards/margins": 5.3675994873046875, + "rewards/rejected": -2.6494014263153076, + "step": 10274 + }, + { + "epoch": 2.57, + "grad_norm": 4.000488758087158, + "learning_rate": 4.778031461755523e-06, + "logits/chosen": -0.5505651235580444, + "logits/rejected": -0.6330541968345642, + "logps/chosen": -53.020198822021484, + "logps/rejected": -93.46058654785156, + "loss": 0.5775, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1766717433929443, + "rewards/margins": 6.606569290161133, + "rewards/rejected": -3.4298975467681885, + "step": 10275 + }, + { + "epoch": 2.57, + "grad_norm": 7.287330627441406, + "learning_rate": 4.777246251741875e-06, + "logits/chosen": -0.5253458619117737, + "logits/rejected": -0.6231063008308411, + "logps/chosen": -49.06126403808594, + "logps/rejected": -93.92626190185547, + "loss": 0.6279, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.103670597076416, + "rewards/margins": 5.441829204559326, + "rewards/rejected": -2.338158130645752, + "step": 10276 + }, + { + "epoch": 2.57, + "grad_norm": 2.404618978500366, + "learning_rate": 4.776461047232707e-06, + "logits/chosen": -0.55244380235672, + "logits/rejected": -0.5862825512886047, + "logps/chosen": -66.30835723876953, + "logps/rejected": -121.4195327758789, + "loss": 0.6663, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1767024993896484, + "rewards/margins": 7.3106184005737305, + "rewards/rejected": -4.133915424346924, + "step": 10277 + }, + { + "epoch": 2.57, + "grad_norm": 14.049461364746094, + "learning_rate": 4.775675848247427e-06, + "logits/chosen": -0.4767635464668274, + "logits/rejected": -0.5349775552749634, + "logps/chosen": -55.15666961669922, + "logps/rejected": -113.92869567871094, + "loss": 0.8232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7985312938690186, + "rewards/margins": 6.9235734939575195, + "rewards/rejected": -4.125042915344238, + "step": 10278 + }, + { + "epoch": 2.57, + "grad_norm": 11.078858375549316, + "learning_rate": 4.774890654805433e-06, + "logits/chosen": -0.4796277582645416, + "logits/rejected": -0.5716686844825745, + "logps/chosen": -51.613983154296875, + "logps/rejected": -84.5539779663086, + "loss": 0.6847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.935115337371826, + "rewards/margins": 5.13883113861084, + "rewards/rejected": -2.2037158012390137, + "step": 10279 + }, + { + "epoch": 2.57, + "grad_norm": 5.554602146148682, + "learning_rate": 4.774105466926131e-06, + "logits/chosen": -0.44049298763275146, + "logits/rejected": -0.5241047739982605, + "logps/chosen": -58.91078567504883, + "logps/rejected": -84.23309326171875, + "loss": 0.6361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.174801826477051, + "rewards/margins": 5.8673529624938965, + "rewards/rejected": -2.6925506591796875, + "step": 10280 + }, + { + "epoch": 2.57, + "grad_norm": 6.343365669250488, + "learning_rate": 4.773320284628924e-06, + "logits/chosen": -0.6060620546340942, + "logits/rejected": -0.6540932655334473, + "logps/chosen": -51.50676345825195, + "logps/rejected": -93.1565933227539, + "loss": 0.625, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.039979934692383, + "rewards/margins": 5.959596157073975, + "rewards/rejected": -2.919616460800171, + "step": 10281 + }, + { + "epoch": 2.57, + "grad_norm": 4.112285137176514, + "learning_rate": 4.7725351079332135e-06, + "logits/chosen": -0.5201786756515503, + "logits/rejected": -0.5864309668540955, + "logps/chosen": -50.84682083129883, + "logps/rejected": -121.80550384521484, + "loss": 0.6437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.010617256164551, + "rewards/margins": 6.253381729125977, + "rewards/rejected": -3.242764472961426, + "step": 10282 + }, + { + "epoch": 2.57, + "grad_norm": 6.009342670440674, + "learning_rate": 4.771749936858402e-06, + "logits/chosen": -0.4576835036277771, + "logits/rejected": -0.5067869424819946, + "logps/chosen": -50.66606903076172, + "logps/rejected": -89.59488677978516, + "loss": 0.7495, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.988231897354126, + "rewards/margins": 4.780947208404541, + "rewards/rejected": -1.7927155494689941, + "step": 10283 + }, + { + "epoch": 2.57, + "grad_norm": 12.241095542907715, + "learning_rate": 4.7709647714238925e-06, + "logits/chosen": -0.5083077549934387, + "logits/rejected": -0.602216362953186, + "logps/chosen": -58.99528121948242, + "logps/rejected": -102.42196655273438, + "loss": 0.752, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2374303340911865, + "rewards/margins": 6.124521255493164, + "rewards/rejected": -2.8870911598205566, + "step": 10284 + }, + { + "epoch": 2.57, + "grad_norm": 5.394942760467529, + "learning_rate": 4.7701796116490885e-06, + "logits/chosen": -0.48705172538757324, + "logits/rejected": -0.5789768695831299, + "logps/chosen": -55.680213928222656, + "logps/rejected": -98.38714599609375, + "loss": 0.6179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7362475395202637, + "rewards/margins": 6.4920654296875, + "rewards/rejected": -3.7558178901672363, + "step": 10285 + }, + { + "epoch": 2.57, + "grad_norm": 9.561604499816895, + "learning_rate": 4.76939445755339e-06, + "logits/chosen": -0.49969202280044556, + "logits/rejected": -0.5950953960418701, + "logps/chosen": -62.412784576416016, + "logps/rejected": -98.76952362060547, + "loss": 0.7596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9208226203918457, + "rewards/margins": 5.753776550292969, + "rewards/rejected": -2.832953691482544, + "step": 10286 + }, + { + "epoch": 2.57, + "grad_norm": 6.0101165771484375, + "learning_rate": 4.768609309156199e-06, + "logits/chosen": -0.5348281264305115, + "logits/rejected": -0.608370840549469, + "logps/chosen": -53.18196487426758, + "logps/rejected": -105.94182586669922, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.64768648147583, + "rewards/margins": 6.038732051849365, + "rewards/rejected": -3.391044855117798, + "step": 10287 + }, + { + "epoch": 2.57, + "grad_norm": 5.688398361206055, + "learning_rate": 4.767824166476918e-06, + "logits/chosen": -0.4675283432006836, + "logits/rejected": -0.5352882146835327, + "logps/chosen": -63.14599609375, + "logps/rejected": -88.54092407226562, + "loss": 0.7283, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.906708002090454, + "rewards/margins": 5.401224136352539, + "rewards/rejected": -2.494515895843506, + "step": 10288 + }, + { + "epoch": 2.57, + "grad_norm": 5.267609119415283, + "learning_rate": 4.76703902953495e-06, + "logits/chosen": -0.39457061886787415, + "logits/rejected": -0.5259622931480408, + "logps/chosen": -65.71543884277344, + "logps/rejected": -118.93805694580078, + "loss": 0.6536, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0135459899902344, + "rewards/margins": 7.216119289398193, + "rewards/rejected": -4.202573299407959, + "step": 10289 + }, + { + "epoch": 2.57, + "grad_norm": 4.269800662994385, + "learning_rate": 4.766253898349694e-06, + "logits/chosen": -0.49047625064849854, + "logits/rejected": -0.5628980398178101, + "logps/chosen": -57.29564666748047, + "logps/rejected": -114.569580078125, + "loss": 0.6428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2056705951690674, + "rewards/margins": 6.749941349029541, + "rewards/rejected": -3.5442707538604736, + "step": 10290 + }, + { + "epoch": 2.57, + "grad_norm": 3.8328683376312256, + "learning_rate": 4.765468772940552e-06, + "logits/chosen": -0.4946295917034149, + "logits/rejected": -0.553267776966095, + "logps/chosen": -55.07411193847656, + "logps/rejected": -94.4554443359375, + "loss": 0.6287, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.757272958755493, + "rewards/margins": 5.4379143714904785, + "rewards/rejected": -2.6806416511535645, + "step": 10291 + }, + { + "epoch": 2.57, + "grad_norm": 4.730795860290527, + "learning_rate": 4.764683653326928e-06, + "logits/chosen": -0.5438011884689331, + "logits/rejected": -0.6198583841323853, + "logps/chosen": -53.42214584350586, + "logps/rejected": -100.35591125488281, + "loss": 0.644, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1032662391662598, + "rewards/margins": 6.730325698852539, + "rewards/rejected": -3.6270599365234375, + "step": 10292 + }, + { + "epoch": 2.57, + "grad_norm": 4.15499210357666, + "learning_rate": 4.763898539528223e-06, + "logits/chosen": -0.5355070233345032, + "logits/rejected": -0.5848070383071899, + "logps/chosen": -49.46895217895508, + "logps/rejected": -114.70419311523438, + "loss": 0.6201, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8998186588287354, + "rewards/margins": 6.703442573547363, + "rewards/rejected": -3.803623914718628, + "step": 10293 + }, + { + "epoch": 2.58, + "grad_norm": 4.066052436828613, + "learning_rate": 4.763113431563832e-06, + "logits/chosen": -0.48463714122772217, + "logits/rejected": -0.5345385670661926, + "logps/chosen": -48.03382110595703, + "logps/rejected": -105.89811706542969, + "loss": 0.6566, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.995664596557617, + "rewards/margins": 6.595943927764893, + "rewards/rejected": -3.6002793312072754, + "step": 10294 + }, + { + "epoch": 2.58, + "grad_norm": 6.249892711639404, + "learning_rate": 4.762328329453162e-06, + "logits/chosen": -0.5507751107215881, + "logits/rejected": -0.5851712822914124, + "logps/chosen": -47.1329345703125, + "logps/rejected": -88.44473266601562, + "loss": 0.733, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.054011821746826, + "rewards/margins": 5.096561908721924, + "rewards/rejected": -2.0425500869750977, + "step": 10295 + }, + { + "epoch": 2.58, + "grad_norm": 6.4457783699035645, + "learning_rate": 4.761543233215612e-06, + "logits/chosen": -0.49920302629470825, + "logits/rejected": -0.5861674547195435, + "logps/chosen": -54.879180908203125, + "logps/rejected": -94.52272033691406, + "loss": 0.6997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9221949577331543, + "rewards/margins": 5.665034294128418, + "rewards/rejected": -2.7428390979766846, + "step": 10296 + }, + { + "epoch": 2.58, + "grad_norm": 4.743097305297852, + "learning_rate": 4.7607581428705815e-06, + "logits/chosen": -0.5251166820526123, + "logits/rejected": -0.6467692255973816, + "logps/chosen": -55.055477142333984, + "logps/rejected": -86.05048370361328, + "loss": 0.6187, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.918470859527588, + "rewards/margins": 6.410102367401123, + "rewards/rejected": -3.491631507873535, + "step": 10297 + }, + { + "epoch": 2.58, + "grad_norm": 6.322535514831543, + "learning_rate": 4.759973058437473e-06, + "logits/chosen": -0.49965405464172363, + "logits/rejected": -0.577991247177124, + "logps/chosen": -60.445152282714844, + "logps/rejected": -103.44145965576172, + "loss": 0.7139, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.898993968963623, + "rewards/margins": 6.659991264343262, + "rewards/rejected": -3.7609975337982178, + "step": 10298 + }, + { + "epoch": 2.58, + "grad_norm": 5.33754825592041, + "learning_rate": 4.759187979935684e-06, + "logits/chosen": -0.47934776544570923, + "logits/rejected": -0.6064466238021851, + "logps/chosen": -56.768646240234375, + "logps/rejected": -84.1443862915039, + "loss": 0.6501, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0599365234375, + "rewards/margins": 6.724078178405762, + "rewards/rejected": -3.66414213180542, + "step": 10299 + }, + { + "epoch": 2.58, + "grad_norm": 4.068924427032471, + "learning_rate": 4.758402907384618e-06, + "logits/chosen": -0.5145232677459717, + "logits/rejected": -0.6368184685707092, + "logps/chosen": -61.095096588134766, + "logps/rejected": -89.86750030517578, + "loss": 0.6304, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.168403387069702, + "rewards/margins": 6.602327346801758, + "rewards/rejected": -3.433924674987793, + "step": 10300 + }, + { + "epoch": 2.58, + "grad_norm": 1.9063215255737305, + "learning_rate": 4.757617840803672e-06, + "logits/chosen": -0.5136862993240356, + "logits/rejected": -0.6388113498687744, + "logps/chosen": -69.34149169921875, + "logps/rejected": -97.55118560791016, + "loss": 0.6001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3596060276031494, + "rewards/margins": 7.429920196533203, + "rewards/rejected": -4.070314884185791, + "step": 10301 + }, + { + "epoch": 2.58, + "grad_norm": 5.041460037231445, + "learning_rate": 4.756832780212247e-06, + "logits/chosen": -0.5241873264312744, + "logits/rejected": -0.6079310178756714, + "logps/chosen": -44.85786819458008, + "logps/rejected": -87.16314697265625, + "loss": 0.5776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.462764263153076, + "rewards/margins": 6.261335849761963, + "rewards/rejected": -2.798572301864624, + "step": 10302 + }, + { + "epoch": 2.58, + "grad_norm": 9.003833770751953, + "learning_rate": 4.756047725629744e-06, + "logits/chosen": -0.47035956382751465, + "logits/rejected": -0.48716792464256287, + "logps/chosen": -58.291595458984375, + "logps/rejected": -98.38151550292969, + "loss": 1.0306, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.933450698852539, + "rewards/margins": 4.576324462890625, + "rewards/rejected": -1.6428738832473755, + "step": 10303 + }, + { + "epoch": 2.58, + "grad_norm": 5.3338117599487305, + "learning_rate": 4.755262677075559e-06, + "logits/chosen": -0.5300810933113098, + "logits/rejected": -0.5982933640480042, + "logps/chosen": -57.42163848876953, + "logps/rejected": -89.88288879394531, + "loss": 0.7827, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.984848976135254, + "rewards/margins": 5.286506652832031, + "rewards/rejected": -2.301657199859619, + "step": 10304 + }, + { + "epoch": 2.58, + "grad_norm": 6.075064659118652, + "learning_rate": 4.754477634569094e-06, + "logits/chosen": -0.5068169236183167, + "logits/rejected": -0.5814027786254883, + "logps/chosen": -56.57651138305664, + "logps/rejected": -109.87152099609375, + "loss": 0.7076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1434175968170166, + "rewards/margins": 7.292959213256836, + "rewards/rejected": -4.149541854858398, + "step": 10305 + }, + { + "epoch": 2.58, + "grad_norm": 7.496376991271973, + "learning_rate": 4.753692598129747e-06, + "logits/chosen": -0.5772878527641296, + "logits/rejected": -0.6647746562957764, + "logps/chosen": -61.44493103027344, + "logps/rejected": -99.93289184570312, + "loss": 0.7453, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1868414878845215, + "rewards/margins": 6.3408098220825195, + "rewards/rejected": -3.153968572616577, + "step": 10306 + }, + { + "epoch": 2.58, + "grad_norm": 2.225055456161499, + "learning_rate": 4.752907567776921e-06, + "logits/chosen": -0.6020587682723999, + "logits/rejected": -0.68731290102005, + "logps/chosen": -46.40596008300781, + "logps/rejected": -102.0256576538086, + "loss": 0.595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.102015495300293, + "rewards/margins": 6.505859375, + "rewards/rejected": -3.403843402862549, + "step": 10307 + }, + { + "epoch": 2.58, + "grad_norm": 3.425185203552246, + "learning_rate": 4.752122543530009e-06, + "logits/chosen": -0.4529561400413513, + "logits/rejected": -0.5443748831748962, + "logps/chosen": -61.81676483154297, + "logps/rejected": -113.27545928955078, + "loss": 0.5621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1194374561309814, + "rewards/margins": 7.017451286315918, + "rewards/rejected": -3.8980140686035156, + "step": 10308 + }, + { + "epoch": 2.58, + "grad_norm": 2.968716859817505, + "learning_rate": 4.751337525408413e-06, + "logits/chosen": -0.5210052728652954, + "logits/rejected": -0.5881626605987549, + "logps/chosen": -51.0028076171875, + "logps/rejected": -99.44076538085938, + "loss": 0.6557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1406548023223877, + "rewards/margins": 6.001362323760986, + "rewards/rejected": -2.8607072830200195, + "step": 10309 + }, + { + "epoch": 2.58, + "grad_norm": 3.8281004428863525, + "learning_rate": 4.750552513431534e-06, + "logits/chosen": -0.531599223613739, + "logits/rejected": -0.6444970965385437, + "logps/chosen": -60.89738464355469, + "logps/rejected": -88.90863037109375, + "loss": 0.7178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9691340923309326, + "rewards/margins": 6.343688011169434, + "rewards/rejected": -3.374553680419922, + "step": 10310 + }, + { + "epoch": 2.58, + "grad_norm": 5.191071033477783, + "learning_rate": 4.749767507618764e-06, + "logits/chosen": -0.5128529071807861, + "logits/rejected": -0.5918357372283936, + "logps/chosen": -57.408294677734375, + "logps/rejected": -92.43065643310547, + "loss": 0.7734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.914886713027954, + "rewards/margins": 5.963954925537109, + "rewards/rejected": -3.0490682125091553, + "step": 10311 + }, + { + "epoch": 2.58, + "grad_norm": 5.768508434295654, + "learning_rate": 4.748982507989507e-06, + "logits/chosen": -0.45278578996658325, + "logits/rejected": -0.5073575973510742, + "logps/chosen": -60.46094512939453, + "logps/rejected": -97.7583236694336, + "loss": 0.741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1118736267089844, + "rewards/margins": 5.806930065155029, + "rewards/rejected": -2.695056438446045, + "step": 10312 + }, + { + "epoch": 2.58, + "grad_norm": 7.268344402313232, + "learning_rate": 4.748197514563159e-06, + "logits/chosen": -0.5148289799690247, + "logits/rejected": -0.6096825003623962, + "logps/chosen": -50.46283721923828, + "logps/rejected": -104.135986328125, + "loss": 0.6882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.998134136199951, + "rewards/margins": 6.583679676055908, + "rewards/rejected": -3.585545539855957, + "step": 10313 + }, + { + "epoch": 2.58, + "grad_norm": 5.6404619216918945, + "learning_rate": 4.747412527359118e-06, + "logits/chosen": -0.4017801582813263, + "logits/rejected": -0.4677514433860779, + "logps/chosen": -56.88966369628906, + "logps/rejected": -94.32886505126953, + "loss": 0.7532, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.049903392791748, + "rewards/margins": 4.880192756652832, + "rewards/rejected": -1.8302900791168213, + "step": 10314 + }, + { + "epoch": 2.58, + "grad_norm": 3.495912551879883, + "learning_rate": 4.746627546396783e-06, + "logits/chosen": -0.5852124094963074, + "logits/rejected": -0.6674439311027527, + "logps/chosen": -41.256221771240234, + "logps/rejected": -99.869384765625, + "loss": 0.5851, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.099050998687744, + "rewards/margins": 7.601541042327881, + "rewards/rejected": -4.502490043640137, + "step": 10315 + }, + { + "epoch": 2.58, + "grad_norm": 4.695819854736328, + "learning_rate": 4.7458425716955495e-06, + "logits/chosen": -0.5446197986602783, + "logits/rejected": -0.6296533346176147, + "logps/chosen": -61.22627258300781, + "logps/rejected": -85.047119140625, + "loss": 0.6562, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.98032808303833, + "rewards/margins": 5.008375644683838, + "rewards/rejected": -2.0280470848083496, + "step": 10316 + }, + { + "epoch": 2.58, + "grad_norm": 11.634550094604492, + "learning_rate": 4.745057603274817e-06, + "logits/chosen": -0.39089131355285645, + "logits/rejected": -0.4989790916442871, + "logps/chosen": -51.83790588378906, + "logps/rejected": -84.94627380371094, + "loss": 0.6286, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0585832595825195, + "rewards/margins": 5.782671928405762, + "rewards/rejected": -2.7240891456604004, + "step": 10317 + }, + { + "epoch": 2.58, + "grad_norm": 4.745438575744629, + "learning_rate": 4.744272641153983e-06, + "logits/chosen": -0.5277655124664307, + "logits/rejected": -0.6131779551506042, + "logps/chosen": -60.65711212158203, + "logps/rejected": -94.62271881103516, + "loss": 0.6944, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.842918872833252, + "rewards/margins": 6.084347248077393, + "rewards/rejected": -3.2414283752441406, + "step": 10318 + }, + { + "epoch": 2.58, + "grad_norm": 3.2142837047576904, + "learning_rate": 4.7434876853524425e-06, + "logits/chosen": -0.5144175291061401, + "logits/rejected": -0.5802764892578125, + "logps/chosen": -70.99090576171875, + "logps/rejected": -101.16382598876953, + "loss": 0.7264, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.405907392501831, + "rewards/margins": 7.097468376159668, + "rewards/rejected": -3.691560745239258, + "step": 10319 + }, + { + "epoch": 2.58, + "grad_norm": 5.689807891845703, + "learning_rate": 4.742702735889594e-06, + "logits/chosen": -0.528099000453949, + "logits/rejected": -0.6024217009544373, + "logps/chosen": -54.10069274902344, + "logps/rejected": -95.38946533203125, + "loss": 0.6266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2672488689422607, + "rewards/margins": 5.582590579986572, + "rewards/rejected": -2.3153421878814697, + "step": 10320 + }, + { + "epoch": 2.58, + "grad_norm": 3.174741744995117, + "learning_rate": 4.741917792784837e-06, + "logits/chosen": -0.47519931197166443, + "logits/rejected": -0.5131486654281616, + "logps/chosen": -63.21131896972656, + "logps/rejected": -114.43954467773438, + "loss": 0.6217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.12138032913208, + "rewards/margins": 5.905436038970947, + "rewards/rejected": -2.784055471420288, + "step": 10321 + }, + { + "epoch": 2.58, + "grad_norm": 5.616085052490234, + "learning_rate": 4.7411328560575635e-06, + "logits/chosen": -0.5104984045028687, + "logits/rejected": -0.5974884629249573, + "logps/chosen": -55.3748893737793, + "logps/rejected": -93.73063659667969, + "loss": 0.7066, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.759687900543213, + "rewards/margins": 5.965764045715332, + "rewards/rejected": -3.206076145172119, + "step": 10322 + }, + { + "epoch": 2.58, + "grad_norm": 6.404204845428467, + "learning_rate": 4.740347925727172e-06, + "logits/chosen": -0.5419244766235352, + "logits/rejected": -0.6316972374916077, + "logps/chosen": -45.97224426269531, + "logps/rejected": -87.39818572998047, + "loss": 0.5957, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0595929622650146, + "rewards/margins": 6.562704086303711, + "rewards/rejected": -3.5031113624572754, + "step": 10323 + }, + { + "epoch": 2.58, + "grad_norm": 6.471529483795166, + "learning_rate": 4.73956300181306e-06, + "logits/chosen": -0.4861716032028198, + "logits/rejected": -0.5793617963790894, + "logps/chosen": -53.85096740722656, + "logps/rejected": -95.04808044433594, + "loss": 0.6064, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.160964012145996, + "rewards/margins": 5.940522193908691, + "rewards/rejected": -2.7795584201812744, + "step": 10324 + }, + { + "epoch": 2.58, + "grad_norm": 11.42125129699707, + "learning_rate": 4.738778084334625e-06, + "logits/chosen": -0.5396676659584045, + "logits/rejected": -0.633368730545044, + "logps/chosen": -59.209232330322266, + "logps/rejected": -85.65518188476562, + "loss": 0.9079, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.799271821975708, + "rewards/margins": 5.608529567718506, + "rewards/rejected": -2.8092575073242188, + "step": 10325 + }, + { + "epoch": 2.58, + "grad_norm": 4.787158966064453, + "learning_rate": 4.737993173311261e-06, + "logits/chosen": -0.5073928236961365, + "logits/rejected": -0.6245648264884949, + "logps/chosen": -55.87664794921875, + "logps/rejected": -77.26429748535156, + "loss": 0.6946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9846978187561035, + "rewards/margins": 5.126350402832031, + "rewards/rejected": -2.1416521072387695, + "step": 10326 + }, + { + "epoch": 2.58, + "grad_norm": 2.7116847038269043, + "learning_rate": 4.737208268762364e-06, + "logits/chosen": -0.44071221351623535, + "logits/rejected": -0.5465958118438721, + "logps/chosen": -50.36589050292969, + "logps/rejected": -93.87349700927734, + "loss": 0.5377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1977133750915527, + "rewards/margins": 6.8774309158325195, + "rewards/rejected": -3.679718017578125, + "step": 10327 + }, + { + "epoch": 2.58, + "grad_norm": 5.871499061584473, + "learning_rate": 4.73642337070733e-06, + "logits/chosen": -0.45164644718170166, + "logits/rejected": -0.5703858137130737, + "logps/chosen": -58.76801681518555, + "logps/rejected": -106.19784545898438, + "loss": 0.6867, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9087629318237305, + "rewards/margins": 6.885769844055176, + "rewards/rejected": -3.9770069122314453, + "step": 10328 + }, + { + "epoch": 2.58, + "grad_norm": 8.46413803100586, + "learning_rate": 4.735638479165554e-06, + "logits/chosen": -0.44032251834869385, + "logits/rejected": -0.5343479514122009, + "logps/chosen": -59.846763610839844, + "logps/rejected": -86.07927703857422, + "loss": 0.6917, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5379559993743896, + "rewards/margins": 6.307885646820068, + "rewards/rejected": -3.7699296474456787, + "step": 10329 + }, + { + "epoch": 2.58, + "grad_norm": 5.057631015777588, + "learning_rate": 4.734853594156434e-06, + "logits/chosen": -0.4200778603553772, + "logits/rejected": -0.4804415702819824, + "logps/chosen": -60.02370834350586, + "logps/rejected": -91.31303405761719, + "loss": 0.6564, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.845881700515747, + "rewards/margins": 5.207207679748535, + "rewards/rejected": -2.361326217651367, + "step": 10330 + }, + { + "epoch": 2.58, + "grad_norm": 6.608206272125244, + "learning_rate": 4.734068715699362e-06, + "logits/chosen": -0.5620952844619751, + "logits/rejected": -0.6691449880599976, + "logps/chosen": -54.0869026184082, + "logps/rejected": -78.90232849121094, + "loss": 0.7704, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9471359252929688, + "rewards/margins": 5.473174095153809, + "rewards/rejected": -2.5260379314422607, + "step": 10331 + }, + { + "epoch": 2.58, + "grad_norm": 2.8924965858459473, + "learning_rate": 4.733283843813736e-06, + "logits/chosen": -0.5493297576904297, + "logits/rejected": -0.6339088082313538, + "logps/chosen": -55.10151672363281, + "logps/rejected": -89.91876220703125, + "loss": 0.6171, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9164481163024902, + "rewards/margins": 6.530256271362305, + "rewards/rejected": -3.613807439804077, + "step": 10332 + }, + { + "epoch": 2.58, + "grad_norm": 5.0551605224609375, + "learning_rate": 4.7324989785189495e-06, + "logits/chosen": -0.4775075614452362, + "logits/rejected": -0.49225670099258423, + "logps/chosen": -52.99944305419922, + "logps/rejected": -120.41439819335938, + "loss": 0.6443, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9056429862976074, + "rewards/margins": 5.853077411651611, + "rewards/rejected": -2.947434902191162, + "step": 10333 + }, + { + "epoch": 2.59, + "grad_norm": 4.319678783416748, + "learning_rate": 4.731714119834397e-06, + "logits/chosen": -0.4994503855705261, + "logits/rejected": -0.6139677166938782, + "logps/chosen": -52.44605255126953, + "logps/rejected": -81.96607208251953, + "loss": 0.6141, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6851229667663574, + "rewards/margins": 5.833982944488525, + "rewards/rejected": -3.148859977722168, + "step": 10334 + }, + { + "epoch": 2.59, + "grad_norm": 4.543271064758301, + "learning_rate": 4.730929267779476e-06, + "logits/chosen": -0.5394991040229797, + "logits/rejected": -0.6310797929763794, + "logps/chosen": -49.19043731689453, + "logps/rejected": -90.810546875, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0695993900299072, + "rewards/margins": 6.259633541107178, + "rewards/rejected": -3.1900336742401123, + "step": 10335 + }, + { + "epoch": 2.59, + "grad_norm": 4.486553192138672, + "learning_rate": 4.730144422373578e-06, + "logits/chosen": -0.4896552860736847, + "logits/rejected": -0.5737242698669434, + "logps/chosen": -48.991539001464844, + "logps/rejected": -101.46104431152344, + "loss": 0.6032, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.256459951400757, + "rewards/margins": 7.300456523895264, + "rewards/rejected": -4.043996334075928, + "step": 10336 + }, + { + "epoch": 2.59, + "grad_norm": 6.608023166656494, + "learning_rate": 4.729359583636098e-06, + "logits/chosen": -0.5498002767562866, + "logits/rejected": -0.5906602144241333, + "logps/chosen": -45.404205322265625, + "logps/rejected": -95.74688720703125, + "loss": 0.5857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8790292739868164, + "rewards/margins": 5.907054424285889, + "rewards/rejected": -3.0280253887176514, + "step": 10337 + }, + { + "epoch": 2.59, + "grad_norm": 6.22788143157959, + "learning_rate": 4.728574751586429e-06, + "logits/chosen": -0.5170773863792419, + "logits/rejected": -0.5793758630752563, + "logps/chosen": -50.93512725830078, + "logps/rejected": -98.5689468383789, + "loss": 0.7939, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0569825172424316, + "rewards/margins": 4.156047344207764, + "rewards/rejected": -1.0990649461746216, + "step": 10338 + }, + { + "epoch": 2.59, + "grad_norm": 4.323389530181885, + "learning_rate": 4.72778992624397e-06, + "logits/chosen": -0.5547512769699097, + "logits/rejected": -0.6497500538825989, + "logps/chosen": -56.1132698059082, + "logps/rejected": -95.92277526855469, + "loss": 0.6623, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1100361347198486, + "rewards/margins": 6.382911205291748, + "rewards/rejected": -3.2728750705718994, + "step": 10339 + }, + { + "epoch": 2.59, + "grad_norm": 3.3160996437072754, + "learning_rate": 4.7270051076281095e-06, + "logits/chosen": -0.5030998587608337, + "logits/rejected": -0.5365388989448547, + "logps/chosen": -50.07677459716797, + "logps/rejected": -104.40288543701172, + "loss": 0.5996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1833436489105225, + "rewards/margins": 6.7855353355407715, + "rewards/rejected": -3.602191925048828, + "step": 10340 + }, + { + "epoch": 2.59, + "grad_norm": 9.029017448425293, + "learning_rate": 4.726220295758243e-06, + "logits/chosen": -0.5398955345153809, + "logits/rejected": -0.6537412405014038, + "logps/chosen": -63.753807067871094, + "logps/rejected": -96.32715606689453, + "loss": 0.7579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1615653038024902, + "rewards/margins": 6.464467525482178, + "rewards/rejected": -3.30290150642395, + "step": 10341 + }, + { + "epoch": 2.59, + "grad_norm": 3.871079206466675, + "learning_rate": 4.725435490653765e-06, + "logits/chosen": -0.4619983434677124, + "logits/rejected": -0.5052129030227661, + "logps/chosen": -57.24839401245117, + "logps/rejected": -99.14275360107422, + "loss": 0.6617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0133488178253174, + "rewards/margins": 4.9050726890563965, + "rewards/rejected": -1.891723871231079, + "step": 10342 + }, + { + "epoch": 2.59, + "grad_norm": 4.133667469024658, + "learning_rate": 4.724650692334068e-06, + "logits/chosen": -0.4860248863697052, + "logits/rejected": -0.5529654026031494, + "logps/chosen": -49.80998229980469, + "logps/rejected": -88.25493621826172, + "loss": 0.5808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2168095111846924, + "rewards/margins": 5.896541118621826, + "rewards/rejected": -2.6797313690185547, + "step": 10343 + }, + { + "epoch": 2.59, + "grad_norm": 4.46819543838501, + "learning_rate": 4.723865900818545e-06, + "logits/chosen": -0.5778586864471436, + "logits/rejected": -0.6601797342300415, + "logps/chosen": -51.122169494628906, + "logps/rejected": -91.51945495605469, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.120755434036255, + "rewards/margins": 5.914137840270996, + "rewards/rejected": -2.7933828830718994, + "step": 10344 + }, + { + "epoch": 2.59, + "grad_norm": 4.317911624908447, + "learning_rate": 4.7230811161265885e-06, + "logits/chosen": -0.5564848780632019, + "logits/rejected": -0.6023440361022949, + "logps/chosen": -53.43199157714844, + "logps/rejected": -115.44965362548828, + "loss": 0.6551, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.142418622970581, + "rewards/margins": 7.027146816253662, + "rewards/rejected": -3.884727954864502, + "step": 10345 + }, + { + "epoch": 2.59, + "grad_norm": 8.533004760742188, + "learning_rate": 4.7222963382775944e-06, + "logits/chosen": -0.5544722676277161, + "logits/rejected": -0.62661212682724, + "logps/chosen": -67.68944549560547, + "logps/rejected": -81.67332458496094, + "loss": 0.7585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8678786754608154, + "rewards/margins": 3.9118874073028564, + "rewards/rejected": -1.0440083742141724, + "step": 10346 + }, + { + "epoch": 2.59, + "grad_norm": 3.613117218017578, + "learning_rate": 4.721511567290951e-06, + "logits/chosen": -0.4623504877090454, + "logits/rejected": -0.5767480134963989, + "logps/chosen": -61.64079284667969, + "logps/rejected": -100.97831726074219, + "loss": 0.6607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7672369480133057, + "rewards/margins": 6.376897811889648, + "rewards/rejected": -3.6096606254577637, + "step": 10347 + }, + { + "epoch": 2.59, + "grad_norm": 6.124184608459473, + "learning_rate": 4.720726803186054e-06, + "logits/chosen": -0.5284598469734192, + "logits/rejected": -0.6571922898292542, + "logps/chosen": -47.077877044677734, + "logps/rejected": -109.6059341430664, + "loss": 0.5866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.985337257385254, + "rewards/margins": 7.986843585968018, + "rewards/rejected": -5.001506328582764, + "step": 10348 + }, + { + "epoch": 2.59, + "grad_norm": 2.6950807571411133, + "learning_rate": 4.719942045982293e-06, + "logits/chosen": -0.5684983134269714, + "logits/rejected": -0.6343029141426086, + "logps/chosen": -61.157958984375, + "logps/rejected": -102.6285171508789, + "loss": 0.5817, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2754030227661133, + "rewards/margins": 7.27308988571167, + "rewards/rejected": -3.9976868629455566, + "step": 10349 + }, + { + "epoch": 2.59, + "grad_norm": 4.518805027008057, + "learning_rate": 4.719157295699065e-06, + "logits/chosen": -0.53062504529953, + "logits/rejected": -0.6038622856140137, + "logps/chosen": -47.71881866455078, + "logps/rejected": -100.88846588134766, + "loss": 0.6054, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.063213586807251, + "rewards/margins": 6.636432647705078, + "rewards/rejected": -3.573218584060669, + "step": 10350 + }, + { + "epoch": 2.59, + "grad_norm": 5.314666271209717, + "learning_rate": 4.718372552355756e-06, + "logits/chosen": -0.5180282592773438, + "logits/rejected": -0.5800869464874268, + "logps/chosen": -47.986209869384766, + "logps/rejected": -85.90900421142578, + "loss": 0.6391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.077702760696411, + "rewards/margins": 4.956524848937988, + "rewards/rejected": -1.878822684288025, + "step": 10351 + }, + { + "epoch": 2.59, + "grad_norm": 5.875401973724365, + "learning_rate": 4.717587815971762e-06, + "logits/chosen": -0.48555561900138855, + "logits/rejected": -0.5586423277854919, + "logps/chosen": -58.627235412597656, + "logps/rejected": -119.5271224975586, + "loss": 0.748, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.875277280807495, + "rewards/margins": 7.448241233825684, + "rewards/rejected": -4.572963714599609, + "step": 10352 + }, + { + "epoch": 2.59, + "grad_norm": 3.2884457111358643, + "learning_rate": 4.716803086566475e-06, + "logits/chosen": -0.3767821490764618, + "logits/rejected": -0.4894578456878662, + "logps/chosen": -70.21160888671875, + "logps/rejected": -112.9209213256836, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7590994834899902, + "rewards/margins": 6.551088809967041, + "rewards/rejected": -3.791990041732788, + "step": 10353 + }, + { + "epoch": 2.59, + "grad_norm": 6.870447158813477, + "learning_rate": 4.716018364159282e-06, + "logits/chosen": -0.5230193138122559, + "logits/rejected": -0.6199474334716797, + "logps/chosen": -61.33589172363281, + "logps/rejected": -103.39908599853516, + "loss": 0.7964, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0138444900512695, + "rewards/margins": 7.238627910614014, + "rewards/rejected": -4.224783897399902, + "step": 10354 + }, + { + "epoch": 2.59, + "grad_norm": 19.50660514831543, + "learning_rate": 4.71523364876958e-06, + "logits/chosen": -0.5154016613960266, + "logits/rejected": -0.5917529463768005, + "logps/chosen": -68.88024139404297, + "logps/rejected": -107.21879577636719, + "loss": 0.8784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.14266300201416, + "rewards/margins": 6.5378828048706055, + "rewards/rejected": -3.3952198028564453, + "step": 10355 + }, + { + "epoch": 2.59, + "grad_norm": 6.442637920379639, + "learning_rate": 4.714448940416756e-06, + "logits/chosen": -0.4608847498893738, + "logits/rejected": -0.529749870300293, + "logps/chosen": -52.85009765625, + "logps/rejected": -100.49115753173828, + "loss": 0.7786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858295440673828, + "rewards/margins": 5.9234418869018555, + "rewards/rejected": -3.0651462078094482, + "step": 10356 + }, + { + "epoch": 2.59, + "grad_norm": 3.2139596939086914, + "learning_rate": 4.713664239120202e-06, + "logits/chosen": -0.5079267621040344, + "logits/rejected": -0.5728168487548828, + "logps/chosen": -50.34575271606445, + "logps/rejected": -93.78314208984375, + "loss": 0.6142, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0235390663146973, + "rewards/margins": 6.211025238037109, + "rewards/rejected": -3.187486171722412, + "step": 10357 + }, + { + "epoch": 2.59, + "grad_norm": 7.342255592346191, + "learning_rate": 4.712879544899311e-06, + "logits/chosen": -0.45040321350097656, + "logits/rejected": -0.505918025970459, + "logps/chosen": -57.980587005615234, + "logps/rejected": -100.77379608154297, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.464181423187256, + "rewards/margins": 6.277429580688477, + "rewards/rejected": -2.813248872756958, + "step": 10358 + }, + { + "epoch": 2.59, + "grad_norm": 15.37413501739502, + "learning_rate": 4.7120948577734694e-06, + "logits/chosen": -0.48357126116752625, + "logits/rejected": -0.5445383787155151, + "logps/chosen": -53.34451675415039, + "logps/rejected": -104.33405303955078, + "loss": 0.7545, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.693523406982422, + "rewards/margins": 5.220225811004639, + "rewards/rejected": -2.5267019271850586, + "step": 10359 + }, + { + "epoch": 2.59, + "grad_norm": 11.506025314331055, + "learning_rate": 4.711310177762072e-06, + "logits/chosen": -0.5083938837051392, + "logits/rejected": -0.60220867395401, + "logps/chosen": -67.44236755371094, + "logps/rejected": -92.64486694335938, + "loss": 0.6567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6294939517974854, + "rewards/margins": 5.972523212432861, + "rewards/rejected": -3.343029499053955, + "step": 10360 + }, + { + "epoch": 2.59, + "grad_norm": 3.947028875350952, + "learning_rate": 4.710525504884507e-06, + "logits/chosen": -0.42394885420799255, + "logits/rejected": -0.5673930644989014, + "logps/chosen": -72.31398010253906, + "logps/rejected": -102.14773559570312, + "loss": 0.6799, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0721895694732666, + "rewards/margins": 7.35726261138916, + "rewards/rejected": -4.2850728034973145, + "step": 10361 + }, + { + "epoch": 2.59, + "grad_norm": 3.039381980895996, + "learning_rate": 4.709740839160164e-06, + "logits/chosen": -0.5681300163269043, + "logits/rejected": -0.6605128049850464, + "logps/chosen": -57.45810317993164, + "logps/rejected": -102.45860290527344, + "loss": 0.5582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051982879638672, + "rewards/margins": 6.899478435516357, + "rewards/rejected": -3.8474960327148438, + "step": 10362 + }, + { + "epoch": 2.59, + "grad_norm": 7.926583290100098, + "learning_rate": 4.708956180608432e-06, + "logits/chosen": -0.5406527519226074, + "logits/rejected": -0.6043651103973389, + "logps/chosen": -57.22486114501953, + "logps/rejected": -109.35692596435547, + "loss": 0.7949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5980076789855957, + "rewards/margins": 5.023223400115967, + "rewards/rejected": -2.42521595954895, + "step": 10363 + }, + { + "epoch": 2.59, + "grad_norm": 5.165571212768555, + "learning_rate": 4.708171529248706e-06, + "logits/chosen": -0.45218873023986816, + "logits/rejected": -0.5108489990234375, + "logps/chosen": -51.191795349121094, + "logps/rejected": -97.85847473144531, + "loss": 0.5883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.978121757507324, + "rewards/margins": 6.71445894241333, + "rewards/rejected": -3.736337661743164, + "step": 10364 + }, + { + "epoch": 2.59, + "grad_norm": 4.692627906799316, + "learning_rate": 4.70738688510037e-06, + "logits/chosen": -0.526204526424408, + "logits/rejected": -0.6419272422790527, + "logps/chosen": -69.41387176513672, + "logps/rejected": -100.52337646484375, + "loss": 0.6635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9746296405792236, + "rewards/margins": 6.529820919036865, + "rewards/rejected": -3.5551910400390625, + "step": 10365 + }, + { + "epoch": 2.59, + "grad_norm": 3.7732133865356445, + "learning_rate": 4.706602248182815e-06, + "logits/chosen": -0.4627615511417389, + "logits/rejected": -0.5932607650756836, + "logps/chosen": -52.546661376953125, + "logps/rejected": -85.28507232666016, + "loss": 0.575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7506561279296875, + "rewards/margins": 6.2165632247924805, + "rewards/rejected": -3.465907096862793, + "step": 10366 + }, + { + "epoch": 2.59, + "grad_norm": 3.1117191314697266, + "learning_rate": 4.70581761851543e-06, + "logits/chosen": -0.4653448760509491, + "logits/rejected": -0.6033725738525391, + "logps/chosen": -62.6976318359375, + "logps/rejected": -103.08700561523438, + "loss": 0.706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1666970252990723, + "rewards/margins": 7.776035308837891, + "rewards/rejected": -4.609338760375977, + "step": 10367 + }, + { + "epoch": 2.59, + "grad_norm": 6.696470260620117, + "learning_rate": 4.705032996117606e-06, + "logits/chosen": -0.4632665812969208, + "logits/rejected": -0.5860981941223145, + "logps/chosen": -64.66476440429688, + "logps/rejected": -105.58067321777344, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.162294387817383, + "rewards/margins": 7.453583717346191, + "rewards/rejected": -4.291289329528809, + "step": 10368 + }, + { + "epoch": 2.59, + "grad_norm": 2.8356292247772217, + "learning_rate": 4.704248381008729e-06, + "logits/chosen": -0.4313378930091858, + "logits/rejected": -0.5611752867698669, + "logps/chosen": -67.64534759521484, + "logps/rejected": -94.7445068359375, + "loss": 0.6308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8683223724365234, + "rewards/margins": 5.913719177246094, + "rewards/rejected": -3.045396566390991, + "step": 10369 + }, + { + "epoch": 2.59, + "grad_norm": 8.54123306274414, + "learning_rate": 4.70346377320819e-06, + "logits/chosen": -0.5046126246452332, + "logits/rejected": -0.6182578802108765, + "logps/chosen": -59.12953186035156, + "logps/rejected": -92.66939544677734, + "loss": 0.8276, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.204568862915039, + "rewards/margins": 6.060346603393555, + "rewards/rejected": -2.8557777404785156, + "step": 10370 + }, + { + "epoch": 2.59, + "grad_norm": 5.45065975189209, + "learning_rate": 4.702679172735378e-06, + "logits/chosen": -0.49259063601493835, + "logits/rejected": -0.5554101467132568, + "logps/chosen": -54.36445617675781, + "logps/rejected": -88.10708618164062, + "loss": 0.641, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.122255802154541, + "rewards/margins": 5.057409763336182, + "rewards/rejected": -1.9351534843444824, + "step": 10371 + }, + { + "epoch": 2.59, + "grad_norm": 3.3507039546966553, + "learning_rate": 4.701894579609678e-06, + "logits/chosen": -0.5615096092224121, + "logits/rejected": -0.6928802728652954, + "logps/chosen": -50.543052673339844, + "logps/rejected": -105.23243713378906, + "loss": 0.6164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3301401138305664, + "rewards/margins": 8.043692588806152, + "rewards/rejected": -4.713552474975586, + "step": 10372 + }, + { + "epoch": 2.59, + "grad_norm": 3.260883331298828, + "learning_rate": 4.70110999385048e-06, + "logits/chosen": -0.48512977361679077, + "logits/rejected": -0.5216642618179321, + "logps/chosen": -51.603694915771484, + "logps/rejected": -104.84796905517578, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2052369117736816, + "rewards/margins": 6.23281192779541, + "rewards/rejected": -3.0275745391845703, + "step": 10373 + }, + { + "epoch": 2.6, + "grad_norm": 5.383920669555664, + "learning_rate": 4.700325415477173e-06, + "logits/chosen": -0.49228280782699585, + "logits/rejected": -0.5242195129394531, + "logps/chosen": -62.87424087524414, + "logps/rejected": -96.7139663696289, + "loss": 0.6868, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1058104038238525, + "rewards/margins": 6.434323310852051, + "rewards/rejected": -3.328512668609619, + "step": 10374 + }, + { + "epoch": 2.6, + "grad_norm": 5.401259422302246, + "learning_rate": 4.699540844509143e-06, + "logits/chosen": -0.49033278226852417, + "logits/rejected": -0.6065973043441772, + "logps/chosen": -62.26383972167969, + "logps/rejected": -91.90884399414062, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.877323627471924, + "rewards/margins": 5.220892906188965, + "rewards/rejected": -2.343569755554199, + "step": 10375 + }, + { + "epoch": 2.6, + "grad_norm": 6.085350036621094, + "learning_rate": 4.698756280965778e-06, + "logits/chosen": -0.5092767477035522, + "logits/rejected": -0.5713680386543274, + "logps/chosen": -71.34426879882812, + "logps/rejected": -120.14012145996094, + "loss": 0.7584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.757204532623291, + "rewards/margins": 8.052075386047363, + "rewards/rejected": -5.2948713302612305, + "step": 10376 + }, + { + "epoch": 2.6, + "grad_norm": 5.019702434539795, + "learning_rate": 4.6979717248664665e-06, + "logits/chosen": -0.49446094036102295, + "logits/rejected": -0.5440678000450134, + "logps/chosen": -56.145606994628906, + "logps/rejected": -103.82533264160156, + "loss": 0.707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1189663410186768, + "rewards/margins": 5.755765914916992, + "rewards/rejected": -2.6367993354797363, + "step": 10377 + }, + { + "epoch": 2.6, + "grad_norm": 7.274574279785156, + "learning_rate": 4.697187176230595e-06, + "logits/chosen": -0.467154324054718, + "logits/rejected": -0.5325412154197693, + "logps/chosen": -67.36848449707031, + "logps/rejected": -105.74396514892578, + "loss": 0.7114, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5801095962524414, + "rewards/margins": 5.864138603210449, + "rewards/rejected": -3.2840287685394287, + "step": 10378 + }, + { + "epoch": 2.6, + "grad_norm": 16.795320510864258, + "learning_rate": 4.69640263507755e-06, + "logits/chosen": -0.4626045823097229, + "logits/rejected": -0.5049264430999756, + "logps/chosen": -58.943519592285156, + "logps/rejected": -103.00455474853516, + "loss": 0.6749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6754446029663086, + "rewards/margins": 5.558954238891602, + "rewards/rejected": -2.883509635925293, + "step": 10379 + }, + { + "epoch": 2.6, + "grad_norm": 21.27239227294922, + "learning_rate": 4.695618101426717e-06, + "logits/chosen": -0.5334334969520569, + "logits/rejected": -0.6324164867401123, + "logps/chosen": -60.72549819946289, + "logps/rejected": -99.48362731933594, + "loss": 0.798, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9311165809631348, + "rewards/margins": 6.156204700469971, + "rewards/rejected": -3.225087881088257, + "step": 10380 + }, + { + "epoch": 2.6, + "grad_norm": 10.54753303527832, + "learning_rate": 4.694833575297486e-06, + "logits/chosen": -0.4473223090171814, + "logits/rejected": -0.5308138132095337, + "logps/chosen": -61.77909851074219, + "logps/rejected": -113.15325927734375, + "loss": 0.699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9352927207946777, + "rewards/margins": 6.562178611755371, + "rewards/rejected": -3.626885414123535, + "step": 10381 + }, + { + "epoch": 2.6, + "grad_norm": 22.80854034423828, + "learning_rate": 4.694049056709245e-06, + "logits/chosen": -0.4834389388561249, + "logits/rejected": -0.558508574962616, + "logps/chosen": -61.712345123291016, + "logps/rejected": -91.7939682006836, + "loss": 0.7172, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.641862630844116, + "rewards/margins": 5.434408187866211, + "rewards/rejected": -2.792545795440674, + "step": 10382 + }, + { + "epoch": 2.6, + "grad_norm": 55.71795654296875, + "learning_rate": 4.693264545681373e-06, + "logits/chosen": -0.4638536274433136, + "logits/rejected": -0.49387043714523315, + "logps/chosen": -61.32929229736328, + "logps/rejected": -107.3718032836914, + "loss": 0.7992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.798372745513916, + "rewards/margins": 5.530354022979736, + "rewards/rejected": -2.7319815158843994, + "step": 10383 + }, + { + "epoch": 2.6, + "grad_norm": 2.2473981380462646, + "learning_rate": 4.692480042233262e-06, + "logits/chosen": -0.517780065536499, + "logits/rejected": -0.6260848045349121, + "logps/chosen": -53.06733703613281, + "logps/rejected": -104.61508178710938, + "loss": 0.5788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1588120460510254, + "rewards/margins": 7.493047714233398, + "rewards/rejected": -4.334235191345215, + "step": 10384 + }, + { + "epoch": 2.6, + "grad_norm": 5.742724418640137, + "learning_rate": 4.691695546384296e-06, + "logits/chosen": -0.5195581912994385, + "logits/rejected": -0.6401944756507874, + "logps/chosen": -72.1949234008789, + "logps/rejected": -96.05928802490234, + "loss": 0.8022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.086390495300293, + "rewards/margins": 5.755520343780518, + "rewards/rejected": -2.669130325317383, + "step": 10385 + }, + { + "epoch": 2.6, + "grad_norm": 13.00905990600586, + "learning_rate": 4.690911058153861e-06, + "logits/chosen": -0.5400968790054321, + "logits/rejected": -0.6071959137916565, + "logps/chosen": -56.35927200317383, + "logps/rejected": -93.99097442626953, + "loss": 0.9545, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.879814386367798, + "rewards/margins": 4.7119526863098145, + "rewards/rejected": -1.8321386575698853, + "step": 10386 + }, + { + "epoch": 2.6, + "grad_norm": 1.5683366060256958, + "learning_rate": 4.690126577561343e-06, + "logits/chosen": -0.5506889820098877, + "logits/rejected": -0.6555747389793396, + "logps/chosen": -61.1953010559082, + "logps/rejected": -114.83650207519531, + "loss": 0.5861, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1962759494781494, + "rewards/margins": 8.562605857849121, + "rewards/rejected": -5.366328716278076, + "step": 10387 + }, + { + "epoch": 2.6, + "grad_norm": 8.068208694458008, + "learning_rate": 4.6893421046261255e-06, + "logits/chosen": -0.5016676187515259, + "logits/rejected": -0.5803737044334412, + "logps/chosen": -63.04039001464844, + "logps/rejected": -101.89056396484375, + "loss": 0.7909, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.580676555633545, + "rewards/margins": 5.938767433166504, + "rewards/rejected": -3.358090877532959, + "step": 10388 + }, + { + "epoch": 2.6, + "grad_norm": 3.487095832824707, + "learning_rate": 4.6885576393675975e-06, + "logits/chosen": -0.5686226487159729, + "logits/rejected": -0.6598352789878845, + "logps/chosen": -59.63420867919922, + "logps/rejected": -96.34976196289062, + "loss": 0.6811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0472452640533447, + "rewards/margins": 6.860760688781738, + "rewards/rejected": -3.8135156631469727, + "step": 10389 + }, + { + "epoch": 2.6, + "grad_norm": 15.775117874145508, + "learning_rate": 4.687773181805139e-06, + "logits/chosen": -0.4899188280105591, + "logits/rejected": -0.5672860741615295, + "logps/chosen": -68.73089599609375, + "logps/rejected": -103.15189361572266, + "loss": 0.7222, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.560351848602295, + "rewards/margins": 6.279240131378174, + "rewards/rejected": -3.718888759613037, + "step": 10390 + }, + { + "epoch": 2.6, + "grad_norm": 3.2496118545532227, + "learning_rate": 4.686988731958138e-06, + "logits/chosen": -0.5149432420730591, + "logits/rejected": -0.6119112372398376, + "logps/chosen": -47.031368255615234, + "logps/rejected": -94.1395034790039, + "loss": 0.6069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.002354145050049, + "rewards/margins": 6.3401641845703125, + "rewards/rejected": -3.3378095626831055, + "step": 10391 + }, + { + "epoch": 2.6, + "grad_norm": 5.948096752166748, + "learning_rate": 4.68620428984598e-06, + "logits/chosen": -0.5610727071762085, + "logits/rejected": -0.6580661535263062, + "logps/chosen": -51.344566345214844, + "logps/rejected": -94.04584503173828, + "loss": 0.6533, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7418179512023926, + "rewards/margins": 6.34535551071167, + "rewards/rejected": -3.6035375595092773, + "step": 10392 + }, + { + "epoch": 2.6, + "grad_norm": 6.283392906188965, + "learning_rate": 4.685419855488046e-06, + "logits/chosen": -0.5037617683410645, + "logits/rejected": -0.5918025970458984, + "logps/chosen": -54.27214050292969, + "logps/rejected": -90.10904693603516, + "loss": 0.737, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0095784664154053, + "rewards/margins": 5.507904052734375, + "rewards/rejected": -2.4983253479003906, + "step": 10393 + }, + { + "epoch": 2.6, + "grad_norm": 3.922316312789917, + "learning_rate": 4.684635428903721e-06, + "logits/chosen": -0.5958169102668762, + "logits/rejected": -0.6762125492095947, + "logps/chosen": -59.08348083496094, + "logps/rejected": -99.57063293457031, + "loss": 0.6633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8686561584472656, + "rewards/margins": 6.797668933868408, + "rewards/rejected": -3.9290127754211426, + "step": 10394 + }, + { + "epoch": 2.6, + "grad_norm": 19.622163772583008, + "learning_rate": 4.683851010112391e-06, + "logits/chosen": -0.48950597643852234, + "logits/rejected": -0.5606191754341125, + "logps/chosen": -47.546485900878906, + "logps/rejected": -104.95681762695312, + "loss": 0.6298, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9865589141845703, + "rewards/margins": 7.189595699310303, + "rewards/rejected": -4.203036785125732, + "step": 10395 + }, + { + "epoch": 2.6, + "grad_norm": 4.8957295417785645, + "learning_rate": 4.683066599133441e-06, + "logits/chosen": -0.443692684173584, + "logits/rejected": -0.5287272930145264, + "logps/chosen": -59.09603500366211, + "logps/rejected": -112.05746459960938, + "loss": 0.5694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.82684063911438, + "rewards/margins": 6.935279846191406, + "rewards/rejected": -4.1084394454956055, + "step": 10396 + }, + { + "epoch": 2.6, + "grad_norm": 8.731809616088867, + "learning_rate": 4.6822821959862495e-06, + "logits/chosen": -0.5197606682777405, + "logits/rejected": -0.5275907516479492, + "logps/chosen": -54.01102828979492, + "logps/rejected": -109.28799438476562, + "loss": 0.7689, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.770531415939331, + "rewards/margins": 4.9352641105651855, + "rewards/rejected": -2.1647324562072754, + "step": 10397 + }, + { + "epoch": 2.6, + "grad_norm": 5.238671779632568, + "learning_rate": 4.681497800690204e-06, + "logits/chosen": -0.45454755425453186, + "logits/rejected": -0.5233157277107239, + "logps/chosen": -65.8897705078125, + "logps/rejected": -99.11908721923828, + "loss": 0.7855, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8711743354797363, + "rewards/margins": 5.498715877532959, + "rewards/rejected": -2.62754225730896, + "step": 10398 + }, + { + "epoch": 2.6, + "grad_norm": 13.901473045349121, + "learning_rate": 4.680713413264686e-06, + "logits/chosen": -0.5510724782943726, + "logits/rejected": -0.5755619406700134, + "logps/chosen": -91.9116439819336, + "logps/rejected": -94.7774429321289, + "loss": 0.7125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7952322959899902, + "rewards/margins": 5.9286603927612305, + "rewards/rejected": -3.1334280967712402, + "step": 10399 + }, + { + "epoch": 2.6, + "grad_norm": 4.648486137390137, + "learning_rate": 4.679929033729081e-06, + "logits/chosen": -0.42885661125183105, + "logits/rejected": -0.493460088968277, + "logps/chosen": -53.772422790527344, + "logps/rejected": -97.02220916748047, + "loss": 0.7042, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0276808738708496, + "rewards/margins": 5.186058044433594, + "rewards/rejected": -2.158376693725586, + "step": 10400 + }, + { + "epoch": 2.6, + "grad_norm": 4.101669788360596, + "learning_rate": 4.679144662102768e-06, + "logits/chosen": -0.5002644062042236, + "logits/rejected": -0.6083507537841797, + "logps/chosen": -48.79096603393555, + "logps/rejected": -85.89920043945312, + "loss": 0.5749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.876493453979492, + "rewards/margins": 4.987006187438965, + "rewards/rejected": -2.1105129718780518, + "step": 10401 + }, + { + "epoch": 2.6, + "grad_norm": 4.0057759284973145, + "learning_rate": 4.678360298405132e-06, + "logits/chosen": -0.4347914457321167, + "logits/rejected": -0.5214937925338745, + "logps/chosen": -57.14097595214844, + "logps/rejected": -105.33891296386719, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3563904762268066, + "rewards/margins": 6.543288707733154, + "rewards/rejected": -3.1868982315063477, + "step": 10402 + }, + { + "epoch": 2.6, + "grad_norm": 7.265990257263184, + "learning_rate": 4.677575942655557e-06, + "logits/chosen": -0.4941614270210266, + "logits/rejected": -0.6071938276290894, + "logps/chosen": -62.5814094543457, + "logps/rejected": -96.58598327636719, + "loss": 0.6839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8601770401000977, + "rewards/margins": 5.4535956382751465, + "rewards/rejected": -2.5934181213378906, + "step": 10403 + }, + { + "epoch": 2.6, + "grad_norm": 4.035065174102783, + "learning_rate": 4.676791594873421e-06, + "logits/chosen": -0.4450458586215973, + "logits/rejected": -0.5402776598930359, + "logps/chosen": -59.4573974609375, + "logps/rejected": -80.18206024169922, + "loss": 0.6488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8960745334625244, + "rewards/margins": 5.290042877197266, + "rewards/rejected": -2.3939685821533203, + "step": 10404 + }, + { + "epoch": 2.6, + "grad_norm": 4.0986127853393555, + "learning_rate": 4.6760072550781095e-06, + "logits/chosen": -0.4169798493385315, + "logits/rejected": -0.4661194682121277, + "logps/chosen": -52.21800994873047, + "logps/rejected": -101.69461059570312, + "loss": 0.66, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3581414222717285, + "rewards/margins": 5.69169282913208, + "rewards/rejected": -2.3335516452789307, + "step": 10405 + }, + { + "epoch": 2.6, + "grad_norm": 4.307903289794922, + "learning_rate": 4.675222923289002e-06, + "logits/chosen": -0.4477001428604126, + "logits/rejected": -0.5350819230079651, + "logps/chosen": -53.03778076171875, + "logps/rejected": -97.0202865600586, + "loss": 0.5252, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.864901542663574, + "rewards/margins": 6.677572250366211, + "rewards/rejected": -3.8126702308654785, + "step": 10406 + }, + { + "epoch": 2.6, + "grad_norm": 10.312337875366211, + "learning_rate": 4.6744385995254844e-06, + "logits/chosen": -0.4964976906776428, + "logits/rejected": -0.4892471134662628, + "logps/chosen": -60.63307571411133, + "logps/rejected": -110.04032135009766, + "loss": 0.769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0789809226989746, + "rewards/margins": 4.9553961753845215, + "rewards/rejected": -1.8764153718948364, + "step": 10407 + }, + { + "epoch": 2.6, + "grad_norm": 17.187789916992188, + "learning_rate": 4.673654283806934e-06, + "logits/chosen": -0.5878936052322388, + "logits/rejected": -0.6293320059776306, + "logps/chosen": -56.13689041137695, + "logps/rejected": -93.16085815429688, + "loss": 0.7874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.892192840576172, + "rewards/margins": 5.513265609741211, + "rewards/rejected": -2.6210732460021973, + "step": 10408 + }, + { + "epoch": 2.6, + "grad_norm": 7.799008846282959, + "learning_rate": 4.672869976152731e-06, + "logits/chosen": -0.4211675822734833, + "logits/rejected": -0.523396909236908, + "logps/chosen": -58.26125717163086, + "logps/rejected": -105.06658935546875, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2011561393737793, + "rewards/margins": 6.72944974899292, + "rewards/rejected": -3.5282931327819824, + "step": 10409 + }, + { + "epoch": 2.6, + "grad_norm": 9.485746383666992, + "learning_rate": 4.672085676582263e-06, + "logits/chosen": -0.5656708478927612, + "logits/rejected": -0.6724189519882202, + "logps/chosen": -64.8721694946289, + "logps/rejected": -101.98146057128906, + "loss": 0.7208, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.908961772918701, + "rewards/margins": 6.310741424560547, + "rewards/rejected": -3.401779890060425, + "step": 10410 + }, + { + "epoch": 2.6, + "grad_norm": 4.538535118103027, + "learning_rate": 4.671301385114903e-06, + "logits/chosen": -0.46779611706733704, + "logits/rejected": -0.59503173828125, + "logps/chosen": -65.00603485107422, + "logps/rejected": -104.53681945800781, + "loss": 0.6082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.029210090637207, + "rewards/margins": 7.114073276519775, + "rewards/rejected": -4.08486270904541, + "step": 10411 + }, + { + "epoch": 2.6, + "grad_norm": 5.321934223175049, + "learning_rate": 4.670517101770037e-06, + "logits/chosen": -0.49842190742492676, + "logits/rejected": -0.5822443962097168, + "logps/chosen": -51.77951431274414, + "logps/rejected": -112.89353942871094, + "loss": 0.5459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.732036590576172, + "rewards/margins": 6.43176794052124, + "rewards/rejected": -3.6997313499450684, + "step": 10412 + }, + { + "epoch": 2.6, + "grad_norm": 3.6035356521606445, + "learning_rate": 4.6697328265670435e-06, + "logits/chosen": -0.42532461881637573, + "logits/rejected": -0.5882508754730225, + "logps/chosen": -69.01680755615234, + "logps/rejected": -100.18021392822266, + "loss": 0.7656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103562831878662, + "rewards/margins": 6.877505302429199, + "rewards/rejected": -3.7739429473876953, + "step": 10413 + }, + { + "epoch": 2.61, + "grad_norm": 5.191025257110596, + "learning_rate": 4.668948559525304e-06, + "logits/chosen": -0.47598540782928467, + "logits/rejected": -0.5657097101211548, + "logps/chosen": -48.65895462036133, + "logps/rejected": -97.5477523803711, + "loss": 0.7463, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8301186561584473, + "rewards/margins": 5.733242034912109, + "rewards/rejected": -2.9031238555908203, + "step": 10414 + }, + { + "epoch": 2.61, + "grad_norm": 8.960258483886719, + "learning_rate": 4.668164300664198e-06, + "logits/chosen": -0.5330092906951904, + "logits/rejected": -0.6290227770805359, + "logps/chosen": -52.234886169433594, + "logps/rejected": -83.8666763305664, + "loss": 0.772, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.922745704650879, + "rewards/margins": 5.652405738830566, + "rewards/rejected": -2.7296600341796875, + "step": 10415 + }, + { + "epoch": 2.61, + "grad_norm": 2.663113832473755, + "learning_rate": 4.667380050003103e-06, + "logits/chosen": -0.4509585499763489, + "logits/rejected": -0.5920877456665039, + "logps/chosen": -55.17040252685547, + "logps/rejected": -104.27643585205078, + "loss": 0.5728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1796035766601562, + "rewards/margins": 7.970696449279785, + "rewards/rejected": -4.791092872619629, + "step": 10416 + }, + { + "epoch": 2.61, + "grad_norm": 9.299456596374512, + "learning_rate": 4.666595807561403e-06, + "logits/chosen": -0.43628284335136414, + "logits/rejected": -0.45963308215141296, + "logps/chosen": -54.45133972167969, + "logps/rejected": -90.94664764404297, + "loss": 0.7172, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7101497650146484, + "rewards/margins": 4.693704605102539, + "rewards/rejected": -1.9835546016693115, + "step": 10417 + }, + { + "epoch": 2.61, + "grad_norm": 5.242029666900635, + "learning_rate": 4.665811573358474e-06, + "logits/chosen": -0.3804391324520111, + "logits/rejected": -0.4318179190158844, + "logps/chosen": -62.11772155761719, + "logps/rejected": -116.22120666503906, + "loss": 0.7121, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0413904190063477, + "rewards/margins": 6.016453742980957, + "rewards/rejected": -2.9750633239746094, + "step": 10418 + }, + { + "epoch": 2.61, + "grad_norm": 6.157301425933838, + "learning_rate": 4.665027347413697e-06, + "logits/chosen": -0.47823700308799744, + "logits/rejected": -0.489923894405365, + "logps/chosen": -56.83279800415039, + "logps/rejected": -110.39604187011719, + "loss": 0.8424, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8958215713500977, + "rewards/margins": 5.175053596496582, + "rewards/rejected": -2.2792317867279053, + "step": 10419 + }, + { + "epoch": 2.61, + "grad_norm": 4.868755340576172, + "learning_rate": 4.6642431297464506e-06, + "logits/chosen": -0.5895138382911682, + "logits/rejected": -0.6953588724136353, + "logps/chosen": -59.646331787109375, + "logps/rejected": -84.86320495605469, + "loss": 0.7211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9730000495910645, + "rewards/margins": 6.040869235992432, + "rewards/rejected": -3.0678696632385254, + "step": 10420 + }, + { + "epoch": 2.61, + "grad_norm": 21.80140495300293, + "learning_rate": 4.6634589203761135e-06, + "logits/chosen": -0.44729503989219666, + "logits/rejected": -0.5683820247650146, + "logps/chosen": -69.46141052246094, + "logps/rejected": -84.2924575805664, + "loss": 0.8057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.864271879196167, + "rewards/margins": 5.536251544952393, + "rewards/rejected": -2.6719794273376465, + "step": 10421 + }, + { + "epoch": 2.61, + "grad_norm": 6.738330364227295, + "learning_rate": 4.6626747193220645e-06, + "logits/chosen": -0.4783362150192261, + "logits/rejected": -0.5621175765991211, + "logps/chosen": -54.90585708618164, + "logps/rejected": -103.75468444824219, + "loss": 0.6412, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8669397830963135, + "rewards/margins": 6.697755336761475, + "rewards/rejected": -3.830815315246582, + "step": 10422 + }, + { + "epoch": 2.61, + "grad_norm": 7.0093994140625, + "learning_rate": 4.66189052660368e-06, + "logits/chosen": -0.5285218358039856, + "logits/rejected": -0.5774636268615723, + "logps/chosen": -53.95050811767578, + "logps/rejected": -95.74217224121094, + "loss": 0.7387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9721062183380127, + "rewards/margins": 5.650565147399902, + "rewards/rejected": -2.6784591674804688, + "step": 10423 + }, + { + "epoch": 2.61, + "grad_norm": 7.9929399490356445, + "learning_rate": 4.661106342240342e-06, + "logits/chosen": -0.49430277943611145, + "logits/rejected": -0.6053736209869385, + "logps/chosen": -57.279483795166016, + "logps/rejected": -101.18024444580078, + "loss": 0.6968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0242786407470703, + "rewards/margins": 6.532042980194092, + "rewards/rejected": -3.5077648162841797, + "step": 10424 + }, + { + "epoch": 2.61, + "grad_norm": 6.154139995574951, + "learning_rate": 4.660322166251428e-06, + "logits/chosen": -0.5381937026977539, + "logits/rejected": -0.6381602883338928, + "logps/chosen": -52.55149459838867, + "logps/rejected": -85.69641876220703, + "loss": 0.7845, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8980798721313477, + "rewards/margins": 5.676514148712158, + "rewards/rejected": -2.778434991836548, + "step": 10425 + }, + { + "epoch": 2.61, + "grad_norm": 2.912649154663086, + "learning_rate": 4.6595379986563125e-06, + "logits/chosen": -0.44835537672042847, + "logits/rejected": -0.5338197946548462, + "logps/chosen": -58.94921875, + "logps/rejected": -109.69670867919922, + "loss": 0.6022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.054226875305176, + "rewards/margins": 7.036839485168457, + "rewards/rejected": -3.9826126098632812, + "step": 10426 + }, + { + "epoch": 2.61, + "grad_norm": 8.220682144165039, + "learning_rate": 4.658753839474376e-06, + "logits/chosen": -0.5390872955322266, + "logits/rejected": -0.605014979839325, + "logps/chosen": -58.7305793762207, + "logps/rejected": -95.42357635498047, + "loss": 0.6717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055657148361206, + "rewards/margins": 6.019908428192139, + "rewards/rejected": -2.9642508029937744, + "step": 10427 + }, + { + "epoch": 2.61, + "grad_norm": 5.435070514678955, + "learning_rate": 4.657969688724995e-06, + "logits/chosen": -0.5333911180496216, + "logits/rejected": -0.6455179452896118, + "logps/chosen": -57.88020706176758, + "logps/rejected": -97.39168548583984, + "loss": 0.6435, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019613742828369, + "rewards/margins": 6.412460803985596, + "rewards/rejected": -3.3928465843200684, + "step": 10428 + }, + { + "epoch": 2.61, + "grad_norm": 3.61767840385437, + "learning_rate": 4.6571855464275474e-06, + "logits/chosen": -0.42406120896339417, + "logits/rejected": -0.5074414014816284, + "logps/chosen": -52.135398864746094, + "logps/rejected": -98.00039672851562, + "loss": 0.6543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1980226039886475, + "rewards/margins": 5.991965293884277, + "rewards/rejected": -2.793942451477051, + "step": 10429 + }, + { + "epoch": 2.61, + "grad_norm": 3.670924663543701, + "learning_rate": 4.656401412601408e-06, + "logits/chosen": -0.5140818953514099, + "logits/rejected": -0.564857006072998, + "logps/chosen": -53.47502136230469, + "logps/rejected": -99.97593688964844, + "loss": 0.6977, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.049285650253296, + "rewards/margins": 6.460169792175293, + "rewards/rejected": -3.410883903503418, + "step": 10430 + }, + { + "epoch": 2.61, + "grad_norm": 2.54887318611145, + "learning_rate": 4.655617287265957e-06, + "logits/chosen": -0.5995227098464966, + "logits/rejected": -0.6868985891342163, + "logps/chosen": -48.03593063354492, + "logps/rejected": -87.33700561523438, + "loss": 0.5477, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182185173034668, + "rewards/margins": 6.170660018920898, + "rewards/rejected": -2.9884748458862305, + "step": 10431 + }, + { + "epoch": 2.61, + "grad_norm": 6.680759429931641, + "learning_rate": 4.654833170440569e-06, + "logits/chosen": -0.5220984816551208, + "logits/rejected": -0.5601218342781067, + "logps/chosen": -53.05718231201172, + "logps/rejected": -97.61273193359375, + "loss": 0.6554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0103633403778076, + "rewards/margins": 5.414786338806152, + "rewards/rejected": -2.404423236846924, + "step": 10432 + }, + { + "epoch": 2.61, + "grad_norm": 8.21401596069336, + "learning_rate": 4.654049062144619e-06, + "logits/chosen": -0.5539131164550781, + "logits/rejected": -0.6481455564498901, + "logps/chosen": -57.26069641113281, + "logps/rejected": -88.40522766113281, + "loss": 0.7772, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.928372621536255, + "rewards/margins": 5.399844646453857, + "rewards/rejected": -2.4714717864990234, + "step": 10433 + }, + { + "epoch": 2.61, + "grad_norm": 2.9795286655426025, + "learning_rate": 4.653264962397485e-06, + "logits/chosen": -0.3673129975795746, + "logits/rejected": -0.4449608623981476, + "logps/chosen": -58.20677185058594, + "logps/rejected": -108.18623352050781, + "loss": 0.6417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1009347438812256, + "rewards/margins": 6.991255283355713, + "rewards/rejected": -3.8903205394744873, + "step": 10434 + }, + { + "epoch": 2.61, + "grad_norm": 6.205501556396484, + "learning_rate": 4.652480871218544e-06, + "logits/chosen": -0.4137866795063019, + "logits/rejected": -0.5701758861541748, + "logps/chosen": -60.15984344482422, + "logps/rejected": -85.95014190673828, + "loss": 0.7196, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9470601081848145, + "rewards/margins": 5.7148919105529785, + "rewards/rejected": -2.767831563949585, + "step": 10435 + }, + { + "epoch": 2.61, + "grad_norm": 7.691421985626221, + "learning_rate": 4.651696788627169e-06, + "logits/chosen": -0.40096786618232727, + "logits/rejected": -0.46668997406959534, + "logps/chosen": -62.595680236816406, + "logps/rejected": -113.01417541503906, + "loss": 0.8368, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1208972930908203, + "rewards/margins": 6.2277960777282715, + "rewards/rejected": -3.1068992614746094, + "step": 10436 + }, + { + "epoch": 2.61, + "grad_norm": 3.37677001953125, + "learning_rate": 4.650912714642737e-06, + "logits/chosen": -0.528335452079773, + "logits/rejected": -0.5784974098205566, + "logps/chosen": -53.159942626953125, + "logps/rejected": -104.38658905029297, + "loss": 0.6384, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3804149627685547, + "rewards/margins": 6.616173267364502, + "rewards/rejected": -3.235758066177368, + "step": 10437 + }, + { + "epoch": 2.61, + "grad_norm": 2.8476481437683105, + "learning_rate": 4.650128649284622e-06, + "logits/chosen": -0.533687174320221, + "logits/rejected": -0.6822231411933899, + "logps/chosen": -62.09688949584961, + "logps/rejected": -83.33948516845703, + "loss": 0.6668, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9045040607452393, + "rewards/margins": 7.05642032623291, + "rewards/rejected": -4.15191650390625, + "step": 10438 + }, + { + "epoch": 2.61, + "grad_norm": 9.72491455078125, + "learning_rate": 4.6493445925722034e-06, + "logits/chosen": -0.5698914527893066, + "logits/rejected": -0.6376971006393433, + "logps/chosen": -69.49622344970703, + "logps/rejected": -95.57074737548828, + "loss": 0.8095, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.860133647918701, + "rewards/margins": 6.104423999786377, + "rewards/rejected": -3.244290351867676, + "step": 10439 + }, + { + "epoch": 2.61, + "grad_norm": 7.144850730895996, + "learning_rate": 4.64856054452485e-06, + "logits/chosen": -0.4243195950984955, + "logits/rejected": -0.5288779735565186, + "logps/chosen": -53.25180435180664, + "logps/rejected": -86.8415298461914, + "loss": 0.6678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0252630710601807, + "rewards/margins": 5.836762428283691, + "rewards/rejected": -2.811499834060669, + "step": 10440 + }, + { + "epoch": 2.61, + "grad_norm": 4.71938419342041, + "learning_rate": 4.64777650516194e-06, + "logits/chosen": -0.4707150459289551, + "logits/rejected": -0.5526297688484192, + "logps/chosen": -50.8214225769043, + "logps/rejected": -95.8256607055664, + "loss": 0.7154, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.855599880218506, + "rewards/margins": 5.90166711807251, + "rewards/rejected": -3.046067237854004, + "step": 10441 + }, + { + "epoch": 2.61, + "grad_norm": 3.2393300533294678, + "learning_rate": 4.64699247450285e-06, + "logits/chosen": -0.5162363648414612, + "logits/rejected": -0.597175657749176, + "logps/chosen": -63.006248474121094, + "logps/rejected": -93.651123046875, + "loss": 0.6775, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.125739097595215, + "rewards/margins": 7.169188499450684, + "rewards/rejected": -4.0434489250183105, + "step": 10442 + }, + { + "epoch": 2.61, + "grad_norm": 10.46807861328125, + "learning_rate": 4.646208452566947e-06, + "logits/chosen": -0.4561668932437897, + "logits/rejected": -0.5641224980354309, + "logps/chosen": -59.753639221191406, + "logps/rejected": -100.12644958496094, + "loss": 0.6874, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.368507146835327, + "rewards/margins": 7.134034156799316, + "rewards/rejected": -3.7655274868011475, + "step": 10443 + }, + { + "epoch": 2.61, + "grad_norm": 4.828649044036865, + "learning_rate": 4.645424439373612e-06, + "logits/chosen": -0.5159705877304077, + "logits/rejected": -0.5874466300010681, + "logps/chosen": -57.884864807128906, + "logps/rejected": -85.12452697753906, + "loss": 0.6706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.92459774017334, + "rewards/margins": 5.486905574798584, + "rewards/rejected": -2.562307834625244, + "step": 10444 + }, + { + "epoch": 2.61, + "grad_norm": 8.407069206237793, + "learning_rate": 4.644640434942214e-06, + "logits/chosen": -0.43157529830932617, + "logits/rejected": -0.4989033341407776, + "logps/chosen": -61.151309967041016, + "logps/rejected": -89.59272003173828, + "loss": 0.8137, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8988263607025146, + "rewards/margins": 4.890728950500488, + "rewards/rejected": -1.9919025897979736, + "step": 10445 + }, + { + "epoch": 2.61, + "grad_norm": 4.52707052230835, + "learning_rate": 4.643856439292132e-06, + "logits/chosen": -0.5616226196289062, + "logits/rejected": -0.6346163749694824, + "logps/chosen": -45.365943908691406, + "logps/rejected": -89.76136779785156, + "loss": 0.5936, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.180180788040161, + "rewards/margins": 6.358005523681641, + "rewards/rejected": -3.1778252124786377, + "step": 10446 + }, + { + "epoch": 2.61, + "grad_norm": 7.020995616912842, + "learning_rate": 4.643072452442735e-06, + "logits/chosen": -0.46485793590545654, + "logits/rejected": -0.5371399521827698, + "logps/chosen": -61.893455505371094, + "logps/rejected": -101.09135437011719, + "loss": 0.7879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.3852264881134033, + "rewards/margins": 5.622507572174072, + "rewards/rejected": -3.237281560897827, + "step": 10447 + }, + { + "epoch": 2.61, + "grad_norm": 3.9040353298187256, + "learning_rate": 4.642288474413396e-06, + "logits/chosen": -0.5033886432647705, + "logits/rejected": -0.6078166365623474, + "logps/chosen": -47.99509811401367, + "logps/rejected": -103.28591918945312, + "loss": 0.6256, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140979766845703, + "rewards/margins": 7.123807430267334, + "rewards/rejected": -3.9828286170959473, + "step": 10448 + }, + { + "epoch": 2.61, + "grad_norm": 18.66283416748047, + "learning_rate": 4.641504505223489e-06, + "logits/chosen": -0.45956575870513916, + "logits/rejected": -0.5378237366676331, + "logps/chosen": -71.74996948242188, + "logps/rejected": -80.72763061523438, + "loss": 0.9523, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5200724601745605, + "rewards/margins": 4.531750679016113, + "rewards/rejected": -2.0116782188415527, + "step": 10449 + }, + { + "epoch": 2.61, + "grad_norm": 8.754739761352539, + "learning_rate": 4.64072054489239e-06, + "logits/chosen": -0.4774293303489685, + "logits/rejected": -0.5247196555137634, + "logps/chosen": -53.09790802001953, + "logps/rejected": -100.20658111572266, + "loss": 0.7062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9506356716156006, + "rewards/margins": 5.980817794799805, + "rewards/rejected": -3.0301826000213623, + "step": 10450 + }, + { + "epoch": 2.61, + "grad_norm": 4.789848327636719, + "learning_rate": 4.639936593439467e-06, + "logits/chosen": -0.43961086869239807, + "logits/rejected": -0.564511239528656, + "logps/chosen": -53.44395065307617, + "logps/rejected": -100.05235290527344, + "loss": 0.6715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.095691680908203, + "rewards/margins": 7.623796463012695, + "rewards/rejected": -4.528104782104492, + "step": 10451 + }, + { + "epoch": 2.61, + "grad_norm": 7.923612594604492, + "learning_rate": 4.639152650884092e-06, + "logits/chosen": -0.39355963468551636, + "logits/rejected": -0.550250768661499, + "logps/chosen": -58.04417419433594, + "logps/rejected": -100.140380859375, + "loss": 0.6184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8793082237243652, + "rewards/margins": 7.327436447143555, + "rewards/rejected": -4.448128700256348, + "step": 10452 + }, + { + "epoch": 2.61, + "grad_norm": 4.276617527008057, + "learning_rate": 4.638368717245642e-06, + "logits/chosen": -0.501946210861206, + "logits/rejected": -0.6363707780838013, + "logps/chosen": -61.53296661376953, + "logps/rejected": -73.80268859863281, + "loss": 0.6185, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0675857067108154, + "rewards/margins": 5.557485580444336, + "rewards/rejected": -2.4898998737335205, + "step": 10453 + }, + { + "epoch": 2.62, + "grad_norm": 4.115222454071045, + "learning_rate": 4.637584792543483e-06, + "logits/chosen": -0.43156135082244873, + "logits/rejected": -0.5228148698806763, + "logps/chosen": -47.50007629394531, + "logps/rejected": -97.663330078125, + "loss": 0.6261, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1173925399780273, + "rewards/margins": 6.976284503936768, + "rewards/rejected": -3.8588924407958984, + "step": 10454 + }, + { + "epoch": 2.62, + "grad_norm": 14.063347816467285, + "learning_rate": 4.63680087679699e-06, + "logits/chosen": -0.4769759178161621, + "logits/rejected": -0.47113722562789917, + "logps/chosen": -66.44574737548828, + "logps/rejected": -117.5740737915039, + "loss": 0.8517, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.011117458343506, + "rewards/margins": 5.008364677429199, + "rewards/rejected": -1.9972469806671143, + "step": 10455 + }, + { + "epoch": 2.62, + "grad_norm": 2.606839895248413, + "learning_rate": 4.6360169700255335e-06, + "logits/chosen": -0.5040436387062073, + "logits/rejected": -0.6345126032829285, + "logps/chosen": -48.9892463684082, + "logps/rejected": -87.60580444335938, + "loss": 0.5102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0182042121887207, + "rewards/margins": 6.322519302368164, + "rewards/rejected": -3.3043150901794434, + "step": 10456 + }, + { + "epoch": 2.62, + "grad_norm": 6.564678192138672, + "learning_rate": 4.635233072248486e-06, + "logits/chosen": -0.46544352173805237, + "logits/rejected": -0.5310320854187012, + "logps/chosen": -60.538570404052734, + "logps/rejected": -100.02507781982422, + "loss": 0.7431, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.226349115371704, + "rewards/margins": 5.652021408081055, + "rewards/rejected": -2.4256722927093506, + "step": 10457 + }, + { + "epoch": 2.62, + "grad_norm": 5.918905735015869, + "learning_rate": 4.634449183485217e-06, + "logits/chosen": -0.5065647959709167, + "logits/rejected": -0.5729526281356812, + "logps/chosen": -55.126686096191406, + "logps/rejected": -96.4161376953125, + "loss": 0.6357, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2439634799957275, + "rewards/margins": 6.609035968780518, + "rewards/rejected": -3.365072727203369, + "step": 10458 + }, + { + "epoch": 2.62, + "grad_norm": 44.54043960571289, + "learning_rate": 4.633665303755096e-06, + "logits/chosen": -0.4831625521183014, + "logits/rejected": -0.5110598206520081, + "logps/chosen": -59.100215911865234, + "logps/rejected": -99.7525634765625, + "loss": 0.7513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.010392189025879, + "rewards/margins": 5.442498683929443, + "rewards/rejected": -2.4321064949035645, + "step": 10459 + }, + { + "epoch": 2.62, + "grad_norm": 4.58211612701416, + "learning_rate": 4.632881433077498e-06, + "logits/chosen": -0.5859217047691345, + "logits/rejected": -0.6556406021118164, + "logps/chosen": -50.16226577758789, + "logps/rejected": -93.51860809326172, + "loss": 0.7329, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3711719512939453, + "rewards/margins": 6.551515102386475, + "rewards/rejected": -3.1803431510925293, + "step": 10460 + }, + { + "epoch": 2.62, + "grad_norm": 3.9679534435272217, + "learning_rate": 4.632097571471787e-06, + "logits/chosen": -0.4871389865875244, + "logits/rejected": -0.5566525459289551, + "logps/chosen": -46.45285415649414, + "logps/rejected": -95.77629089355469, + "loss": 0.5967, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1680190563201904, + "rewards/margins": 5.728522777557373, + "rewards/rejected": -2.5605037212371826, + "step": 10461 + }, + { + "epoch": 2.62, + "grad_norm": 5.0551910400390625, + "learning_rate": 4.631313718957338e-06, + "logits/chosen": -0.49558666348457336, + "logits/rejected": -0.5626745820045471, + "logps/chosen": -54.59542465209961, + "logps/rejected": -107.87481689453125, + "loss": 0.7029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.11670184135437, + "rewards/margins": 6.835549831390381, + "rewards/rejected": -3.7188477516174316, + "step": 10462 + }, + { + "epoch": 2.62, + "grad_norm": 7.726607322692871, + "learning_rate": 4.630529875553517e-06, + "logits/chosen": -0.5959989428520203, + "logits/rejected": -0.6442892551422119, + "logps/chosen": -48.14787673950195, + "logps/rejected": -84.5824966430664, + "loss": 0.7131, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.031306266784668, + "rewards/margins": 4.865942478179932, + "rewards/rejected": -1.8346366882324219, + "step": 10463 + }, + { + "epoch": 2.62, + "grad_norm": 7.217918872833252, + "learning_rate": 4.629746041279699e-06, + "logits/chosen": -0.4431588649749756, + "logits/rejected": -0.5041528940200806, + "logps/chosen": -57.38039779663086, + "logps/rejected": -109.60740661621094, + "loss": 0.7496, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0396859645843506, + "rewards/margins": 4.77333402633667, + "rewards/rejected": -1.7336475849151611, + "step": 10464 + }, + { + "epoch": 2.62, + "grad_norm": 8.451318740844727, + "learning_rate": 4.628962216155249e-06, + "logits/chosen": -0.4784257113933563, + "logits/rejected": -0.5304868221282959, + "logps/chosen": -59.313011169433594, + "logps/rejected": -98.64039611816406, + "loss": 0.7246, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9477717876434326, + "rewards/margins": 6.258289813995361, + "rewards/rejected": -3.310518264770508, + "step": 10465 + }, + { + "epoch": 2.62, + "grad_norm": 2.8028523921966553, + "learning_rate": 4.628178400199535e-06, + "logits/chosen": -0.5276444554328918, + "logits/rejected": -0.6372864842414856, + "logps/chosen": -46.44744873046875, + "logps/rejected": -96.07356262207031, + "loss": 0.5571, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.147679328918457, + "rewards/margins": 7.163234710693359, + "rewards/rejected": -4.015554904937744, + "step": 10466 + }, + { + "epoch": 2.62, + "grad_norm": 7.462740421295166, + "learning_rate": 4.627394593431933e-06, + "logits/chosen": -0.4953274428844452, + "logits/rejected": -0.5699219107627869, + "logps/chosen": -56.407779693603516, + "logps/rejected": -114.54721069335938, + "loss": 0.6738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037126302719116, + "rewards/margins": 7.61571741104126, + "rewards/rejected": -4.5785908699035645, + "step": 10467 + }, + { + "epoch": 2.62, + "grad_norm": 3.9377429485321045, + "learning_rate": 4.626610795871803e-06, + "logits/chosen": -0.5414901971817017, + "logits/rejected": -0.6136960983276367, + "logps/chosen": -51.87855529785156, + "logps/rejected": -99.85492706298828, + "loss": 0.6819, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.19057035446167, + "rewards/margins": 6.813735008239746, + "rewards/rejected": -3.6231651306152344, + "step": 10468 + }, + { + "epoch": 2.62, + "grad_norm": 6.064854145050049, + "learning_rate": 4.625827007538517e-06, + "logits/chosen": -0.5722187161445618, + "logits/rejected": -0.6507976055145264, + "logps/chosen": -48.055850982666016, + "logps/rejected": -103.59039306640625, + "loss": 0.6526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.935551643371582, + "rewards/margins": 7.01982307434082, + "rewards/rejected": -4.0842719078063965, + "step": 10469 + }, + { + "epoch": 2.62, + "grad_norm": 8.512045860290527, + "learning_rate": 4.625043228451445e-06, + "logits/chosen": -0.47157445549964905, + "logits/rejected": -0.5501624345779419, + "logps/chosen": -56.41172790527344, + "logps/rejected": -107.4549789428711, + "loss": 0.7319, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.896493673324585, + "rewards/margins": 7.449219226837158, + "rewards/rejected": -4.552725791931152, + "step": 10470 + }, + { + "epoch": 2.62, + "grad_norm": 4.5490031242370605, + "learning_rate": 4.624259458629955e-06, + "logits/chosen": -0.45305222272872925, + "logits/rejected": -0.4435611963272095, + "logps/chosen": -63.49541091918945, + "logps/rejected": -101.99116516113281, + "loss": 0.7494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.04306697845459, + "rewards/margins": 5.729888439178467, + "rewards/rejected": -2.686821460723877, + "step": 10471 + }, + { + "epoch": 2.62, + "grad_norm": 5.477583885192871, + "learning_rate": 4.623475698093411e-06, + "logits/chosen": -0.5181599259376526, + "logits/rejected": -0.5893363356590271, + "logps/chosen": -53.600738525390625, + "logps/rejected": -98.44380950927734, + "loss": 0.7441, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8889589309692383, + "rewards/margins": 5.645458221435547, + "rewards/rejected": -2.7564988136291504, + "step": 10472 + }, + { + "epoch": 2.62, + "grad_norm": 7.332916259765625, + "learning_rate": 4.622691946861184e-06, + "logits/chosen": -0.4031210243701935, + "logits/rejected": -0.48374080657958984, + "logps/chosen": -56.674217224121094, + "logps/rejected": -101.96638488769531, + "loss": 0.612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9224328994750977, + "rewards/margins": 6.576776027679443, + "rewards/rejected": -3.6543426513671875, + "step": 10473 + }, + { + "epoch": 2.62, + "grad_norm": 6.26862907409668, + "learning_rate": 4.621908204952641e-06, + "logits/chosen": -0.4784737229347229, + "logits/rejected": -0.6057716608047485, + "logps/chosen": -52.012386322021484, + "logps/rejected": -83.1617660522461, + "loss": 0.6998, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0942418575286865, + "rewards/margins": 6.5893659591674805, + "rewards/rejected": -3.495123863220215, + "step": 10474 + }, + { + "epoch": 2.62, + "grad_norm": 10.69991397857666, + "learning_rate": 4.621124472387149e-06, + "logits/chosen": -0.44355902075767517, + "logits/rejected": -0.5330217480659485, + "logps/chosen": -59.84599304199219, + "logps/rejected": -90.16204071044922, + "loss": 0.687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0656509399414062, + "rewards/margins": 6.223187446594238, + "rewards/rejected": -3.157536745071411, + "step": 10475 + }, + { + "epoch": 2.62, + "grad_norm": 3.7025113105773926, + "learning_rate": 4.620340749184072e-06, + "logits/chosen": -0.5056228041648865, + "logits/rejected": -0.5614907741546631, + "logps/chosen": -48.73756408691406, + "logps/rejected": -90.82667541503906, + "loss": 0.6671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.936115264892578, + "rewards/margins": 5.8507208824157715, + "rewards/rejected": -2.9146058559417725, + "step": 10476 + }, + { + "epoch": 2.62, + "grad_norm": 5.885085105895996, + "learning_rate": 4.61955703536278e-06, + "logits/chosen": -0.5217785239219666, + "logits/rejected": -0.6064084768295288, + "logps/chosen": -61.837493896484375, + "logps/rejected": -82.42233276367188, + "loss": 0.6891, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1620640754699707, + "rewards/margins": 5.504676342010498, + "rewards/rejected": -2.3426125049591064, + "step": 10477 + }, + { + "epoch": 2.62, + "grad_norm": 4.012826442718506, + "learning_rate": 4.6187733309426394e-06, + "logits/chosen": -0.4376620352268219, + "logits/rejected": -0.6107295751571655, + "logps/chosen": -62.70488357543945, + "logps/rejected": -85.5577163696289, + "loss": 0.6474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.228949785232544, + "rewards/margins": 7.080562591552734, + "rewards/rejected": -3.8516132831573486, + "step": 10478 + }, + { + "epoch": 2.62, + "grad_norm": 6.311089992523193, + "learning_rate": 4.617989635943014e-06, + "logits/chosen": -0.5678024291992188, + "logits/rejected": -0.6403845548629761, + "logps/chosen": -62.18815612792969, + "logps/rejected": -123.01986694335938, + "loss": 0.9836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.007349967956543, + "rewards/margins": 6.502719879150391, + "rewards/rejected": -3.4953689575195312, + "step": 10479 + }, + { + "epoch": 2.62, + "grad_norm": 7.435939311981201, + "learning_rate": 4.617205950383271e-06, + "logits/chosen": -0.4324755072593689, + "logits/rejected": -0.526940643787384, + "logps/chosen": -57.263179779052734, + "logps/rejected": -99.6506576538086, + "loss": 0.7312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7484538555145264, + "rewards/margins": 5.6495561599731445, + "rewards/rejected": -2.901101589202881, + "step": 10480 + }, + { + "epoch": 2.62, + "grad_norm": 3.6783533096313477, + "learning_rate": 4.616422274282775e-06, + "logits/chosen": -0.4725499153137207, + "logits/rejected": -0.51285320520401, + "logps/chosen": -46.195892333984375, + "logps/rejected": -107.44055938720703, + "loss": 0.565, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.320211410522461, + "rewards/margins": 5.7937774658203125, + "rewards/rejected": -2.4735655784606934, + "step": 10481 + }, + { + "epoch": 2.62, + "grad_norm": 4.71372127532959, + "learning_rate": 4.615638607660896e-06, + "logits/chosen": -0.48877114057540894, + "logits/rejected": -0.5463888645172119, + "logps/chosen": -45.44791793823242, + "logps/rejected": -99.92936706542969, + "loss": 0.5623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.902238607406616, + "rewards/margins": 5.462554931640625, + "rewards/rejected": -2.560316562652588, + "step": 10482 + }, + { + "epoch": 2.62, + "grad_norm": 5.8049235343933105, + "learning_rate": 4.6148549505369925e-06, + "logits/chosen": -0.5041359066963196, + "logits/rejected": -0.5542322993278503, + "logps/chosen": -44.262481689453125, + "logps/rejected": -99.39889526367188, + "loss": 0.6416, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.985743522644043, + "rewards/margins": 5.419355869293213, + "rewards/rejected": -2.433612108230591, + "step": 10483 + }, + { + "epoch": 2.62, + "grad_norm": 4.89064359664917, + "learning_rate": 4.614071302930434e-06, + "logits/chosen": -0.48717692494392395, + "logits/rejected": -0.5435415506362915, + "logps/chosen": -52.89794921875, + "logps/rejected": -103.2828369140625, + "loss": 0.7563, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.777656316757202, + "rewards/margins": 6.62251091003418, + "rewards/rejected": -2.8448550701141357, + "step": 10484 + }, + { + "epoch": 2.62, + "grad_norm": 5.723993301391602, + "learning_rate": 4.613287664860586e-06, + "logits/chosen": -0.4411728084087372, + "logits/rejected": -0.5826890468597412, + "logps/chosen": -72.21751403808594, + "logps/rejected": -96.4952392578125, + "loss": 0.6815, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.547041893005371, + "rewards/margins": 7.196202278137207, + "rewards/rejected": -3.6491611003875732, + "step": 10485 + }, + { + "epoch": 2.62, + "grad_norm": 18.24620819091797, + "learning_rate": 4.6125040363468084e-06, + "logits/chosen": -0.5198673009872437, + "logits/rejected": -0.5224866271018982, + "logps/chosen": -55.92979431152344, + "logps/rejected": -106.99607849121094, + "loss": 0.9154, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.895970344543457, + "rewards/margins": 4.780909538269043, + "rewards/rejected": -1.8849396705627441, + "step": 10486 + }, + { + "epoch": 2.62, + "grad_norm": 19.360450744628906, + "learning_rate": 4.611720417408469e-06, + "logits/chosen": -0.522315502166748, + "logits/rejected": -0.5631390810012817, + "logps/chosen": -51.19089889526367, + "logps/rejected": -106.23296356201172, + "loss": 0.6422, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8703086376190186, + "rewards/margins": 6.0951995849609375, + "rewards/rejected": -3.22489070892334, + "step": 10487 + }, + { + "epoch": 2.62, + "grad_norm": 7.721559524536133, + "learning_rate": 4.610936808064931e-06, + "logits/chosen": -0.512166440486908, + "logits/rejected": -0.5689435005187988, + "logps/chosen": -50.67206573486328, + "logps/rejected": -91.03652954101562, + "loss": 0.75, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.37300181388855, + "rewards/margins": 5.766017436981201, + "rewards/rejected": -2.3930158615112305, + "step": 10488 + }, + { + "epoch": 2.62, + "grad_norm": 10.397045135498047, + "learning_rate": 4.61015320833556e-06, + "logits/chosen": -0.5201159119606018, + "logits/rejected": -0.5857020616531372, + "logps/chosen": -57.93757247924805, + "logps/rejected": -106.33277893066406, + "loss": 0.7424, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9971184730529785, + "rewards/margins": 6.524350166320801, + "rewards/rejected": -3.5272319316864014, + "step": 10489 + }, + { + "epoch": 2.62, + "grad_norm": 3.4329679012298584, + "learning_rate": 4.609369618239715e-06, + "logits/chosen": -0.5136548280715942, + "logits/rejected": -0.5825106501579285, + "logps/chosen": -50.05687713623047, + "logps/rejected": -92.86671447753906, + "loss": 0.612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.141390085220337, + "rewards/margins": 5.9471330642700195, + "rewards/rejected": -2.8057432174682617, + "step": 10490 + }, + { + "epoch": 2.62, + "grad_norm": 6.431369304656982, + "learning_rate": 4.6085860377967635e-06, + "logits/chosen": -0.4907243847846985, + "logits/rejected": -0.5582533478736877, + "logps/chosen": -51.82284164428711, + "logps/rejected": -90.0168685913086, + "loss": 0.7193, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.80012845993042, + "rewards/margins": 5.5063090324401855, + "rewards/rejected": -2.7061808109283447, + "step": 10491 + }, + { + "epoch": 2.62, + "grad_norm": 3.114576816558838, + "learning_rate": 4.607802467026069e-06, + "logits/chosen": -0.5146842002868652, + "logits/rejected": -0.6421182751655579, + "logps/chosen": -56.30579376220703, + "logps/rejected": -78.41270446777344, + "loss": 0.6387, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.092355966567993, + "rewards/margins": 7.322481155395508, + "rewards/rejected": -4.230125427246094, + "step": 10492 + }, + { + "epoch": 2.62, + "grad_norm": 3.970400094985962, + "learning_rate": 4.607018905946991e-06, + "logits/chosen": -0.4597487151622772, + "logits/rejected": -0.615519642829895, + "logps/chosen": -59.045745849609375, + "logps/rejected": -98.92681121826172, + "loss": 0.6006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990926742553711, + "rewards/margins": 7.1625823974609375, + "rewards/rejected": -4.171655654907227, + "step": 10493 + }, + { + "epoch": 2.63, + "grad_norm": 4.084622859954834, + "learning_rate": 4.606235354578894e-06, + "logits/chosen": -0.5339305996894836, + "logits/rejected": -0.5854577422142029, + "logps/chosen": -51.4847526550293, + "logps/rejected": -93.91496276855469, + "loss": 0.6323, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.203178882598877, + "rewards/margins": 5.325151443481445, + "rewards/rejected": -2.1219725608825684, + "step": 10494 + }, + { + "epoch": 2.63, + "grad_norm": 4.965746879577637, + "learning_rate": 4.605451812941139e-06, + "logits/chosen": -0.45563918352127075, + "logits/rejected": -0.5577487349510193, + "logps/chosen": -55.94566345214844, + "logps/rejected": -105.43351745605469, + "loss": 0.5538, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0655598640441895, + "rewards/margins": 7.838022232055664, + "rewards/rejected": -4.772462368011475, + "step": 10495 + }, + { + "epoch": 2.63, + "grad_norm": 7.325497627258301, + "learning_rate": 4.604668281053093e-06, + "logits/chosen": -0.48315921425819397, + "logits/rejected": -0.5245909094810486, + "logps/chosen": -52.37175369262695, + "logps/rejected": -88.17657470703125, + "loss": 0.7526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.242405414581299, + "rewards/margins": 4.4369683265686035, + "rewards/rejected": -1.194562554359436, + "step": 10496 + }, + { + "epoch": 2.63, + "grad_norm": 4.248112678527832, + "learning_rate": 4.60388475893411e-06, + "logits/chosen": -0.4885077476501465, + "logits/rejected": -0.5613072514533997, + "logps/chosen": -54.391761779785156, + "logps/rejected": -90.35269165039062, + "loss": 0.7114, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.907708168029785, + "rewards/margins": 5.852694511413574, + "rewards/rejected": -2.944985866546631, + "step": 10497 + }, + { + "epoch": 2.63, + "grad_norm": 6.203943252563477, + "learning_rate": 4.603101246603557e-06, + "logits/chosen": -0.4797916114330292, + "logits/rejected": -0.585620105266571, + "logps/chosen": -56.52307891845703, + "logps/rejected": -80.25721740722656, + "loss": 0.6844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1769652366638184, + "rewards/margins": 5.325802326202393, + "rewards/rejected": -2.148837089538574, + "step": 10498 + }, + { + "epoch": 2.63, + "grad_norm": 4.191835880279541, + "learning_rate": 4.602317744080798e-06, + "logits/chosen": -0.4844440519809723, + "logits/rejected": -0.5358771681785583, + "logps/chosen": -46.40264892578125, + "logps/rejected": -110.67262268066406, + "loss": 0.6835, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0642893314361572, + "rewards/margins": 5.722090721130371, + "rewards/rejected": -2.6578009128570557, + "step": 10499 + }, + { + "epoch": 2.63, + "grad_norm": 2.929295063018799, + "learning_rate": 4.6015342513851854e-06, + "logits/chosen": -0.4378260374069214, + "logits/rejected": -0.5216454267501831, + "logps/chosen": -43.4456787109375, + "logps/rejected": -87.66352081298828, + "loss": 0.5046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.429879665374756, + "rewards/margins": 6.60567569732666, + "rewards/rejected": -3.175795793533325, + "step": 10500 + }, + { + "epoch": 2.63, + "grad_norm": 22.99934959411621, + "learning_rate": 4.600750768536087e-06, + "logits/chosen": -0.4799400269985199, + "logits/rejected": -0.558613657951355, + "logps/chosen": -60.29264450073242, + "logps/rejected": -112.49671173095703, + "loss": 0.7192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.735905408859253, + "rewards/margins": 6.854259490966797, + "rewards/rejected": -4.118353843688965, + "step": 10501 + }, + { + "epoch": 2.63, + "grad_norm": 9.761139869689941, + "learning_rate": 4.599967295552863e-06, + "logits/chosen": -0.4662923216819763, + "logits/rejected": -0.5695208311080933, + "logps/chosen": -51.515716552734375, + "logps/rejected": -107.85696411132812, + "loss": 0.6304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0039916038513184, + "rewards/margins": 6.061972141265869, + "rewards/rejected": -3.0579802989959717, + "step": 10502 + }, + { + "epoch": 2.63, + "grad_norm": 3.9253921508789062, + "learning_rate": 4.5991838324548725e-06, + "logits/chosen": -0.48522213101387024, + "logits/rejected": -0.6073017120361328, + "logps/chosen": -49.885623931884766, + "logps/rejected": -81.37611389160156, + "loss": 0.6108, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.042632818222046, + "rewards/margins": 6.3843913078308105, + "rewards/rejected": -3.3417582511901855, + "step": 10503 + }, + { + "epoch": 2.63, + "grad_norm": 3.9273808002471924, + "learning_rate": 4.598400379261475e-06, + "logits/chosen": -0.5088239908218384, + "logits/rejected": -0.5673599243164062, + "logps/chosen": -49.10567855834961, + "logps/rejected": -102.41468811035156, + "loss": 0.58, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0373167991638184, + "rewards/margins": 6.3349103927612305, + "rewards/rejected": -3.297593116760254, + "step": 10504 + }, + { + "epoch": 2.63, + "grad_norm": 5.697566509246826, + "learning_rate": 4.597616935992032e-06, + "logits/chosen": -0.512232780456543, + "logits/rejected": -0.6132594347000122, + "logps/chosen": -54.07099151611328, + "logps/rejected": -85.93140411376953, + "loss": 0.6701, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1315743923187256, + "rewards/margins": 5.885655403137207, + "rewards/rejected": -2.7540810108184814, + "step": 10505 + }, + { + "epoch": 2.63, + "grad_norm": 5.361230850219727, + "learning_rate": 4.596833502665901e-06, + "logits/chosen": -0.45622095465660095, + "logits/rejected": -0.523553729057312, + "logps/chosen": -62.905029296875, + "logps/rejected": -97.94652557373047, + "loss": 0.7792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.795191526412964, + "rewards/margins": 5.491235733032227, + "rewards/rejected": -2.6960437297821045, + "step": 10506 + }, + { + "epoch": 2.63, + "grad_norm": 7.075145244598389, + "learning_rate": 4.596050079302446e-06, + "logits/chosen": -0.4308136999607086, + "logits/rejected": -0.5404446125030518, + "logps/chosen": -57.56637191772461, + "logps/rejected": -94.77205657958984, + "loss": 0.6434, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.935406446456909, + "rewards/margins": 6.4462480545043945, + "rewards/rejected": -3.5108413696289062, + "step": 10507 + }, + { + "epoch": 2.63, + "grad_norm": 3.310293674468994, + "learning_rate": 4.595266665921021e-06, + "logits/chosen": -0.49082159996032715, + "logits/rejected": -0.5222117900848389, + "logps/chosen": -46.89675521850586, + "logps/rejected": -112.4343490600586, + "loss": 0.6255, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.610334873199463, + "rewards/margins": 7.330695152282715, + "rewards/rejected": -3.720360040664673, + "step": 10508 + }, + { + "epoch": 2.63, + "grad_norm": 7.271595001220703, + "learning_rate": 4.594483262540986e-06, + "logits/chosen": -0.47114476561546326, + "logits/rejected": -0.5099624395370483, + "logps/chosen": -50.620399475097656, + "logps/rejected": -86.29701232910156, + "loss": 0.785, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.224527359008789, + "rewards/margins": 4.76921272277832, + "rewards/rejected": -1.5446851253509521, + "step": 10509 + }, + { + "epoch": 2.63, + "grad_norm": 3.3481574058532715, + "learning_rate": 4.593699869181704e-06, + "logits/chosen": -0.5492857694625854, + "logits/rejected": -0.6336460113525391, + "logps/chosen": -44.39684295654297, + "logps/rejected": -95.5082015991211, + "loss": 0.6063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1904678344726562, + "rewards/margins": 6.912790298461914, + "rewards/rejected": -3.7223219871520996, + "step": 10510 + }, + { + "epoch": 2.63, + "grad_norm": 5.388125896453857, + "learning_rate": 4.592916485862529e-06, + "logits/chosen": -0.5884349346160889, + "logits/rejected": -0.6212623119354248, + "logps/chosen": -52.50757598876953, + "logps/rejected": -98.44874572753906, + "loss": 0.7948, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0353147983551025, + "rewards/margins": 5.720501899719238, + "rewards/rejected": -2.6851866245269775, + "step": 10511 + }, + { + "epoch": 2.63, + "grad_norm": 3.8202826976776123, + "learning_rate": 4.592133112602818e-06, + "logits/chosen": -0.5090900659561157, + "logits/rejected": -0.5796196460723877, + "logps/chosen": -47.73924255371094, + "logps/rejected": -93.61149597167969, + "loss": 0.6827, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.002574920654297, + "rewards/margins": 6.462407112121582, + "rewards/rejected": -3.459832191467285, + "step": 10512 + }, + { + "epoch": 2.63, + "grad_norm": 4.477668285369873, + "learning_rate": 4.5913497494219346e-06, + "logits/chosen": -0.4936244487762451, + "logits/rejected": -0.5738080143928528, + "logps/chosen": -53.794517517089844, + "logps/rejected": -89.51751708984375, + "loss": 0.6125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9341611862182617, + "rewards/margins": 5.839786052703857, + "rewards/rejected": -2.9056246280670166, + "step": 10513 + }, + { + "epoch": 2.63, + "grad_norm": 13.622175216674805, + "learning_rate": 4.5905663963392335e-06, + "logits/chosen": -0.4993334412574768, + "logits/rejected": -0.5342193841934204, + "logps/chosen": -44.78057861328125, + "logps/rejected": -100.77119445800781, + "loss": 0.7072, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.001436710357666, + "rewards/margins": 6.03074312210083, + "rewards/rejected": -3.029306411743164, + "step": 10514 + }, + { + "epoch": 2.63, + "grad_norm": 6.647373199462891, + "learning_rate": 4.589783053374072e-06, + "logits/chosen": -0.47377318143844604, + "logits/rejected": -0.5041089057922363, + "logps/chosen": -47.332252502441406, + "logps/rejected": -102.91044616699219, + "loss": 0.7541, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0086522102355957, + "rewards/margins": 4.9549994468688965, + "rewards/rejected": -1.9463474750518799, + "step": 10515 + }, + { + "epoch": 2.63, + "grad_norm": 11.850947380065918, + "learning_rate": 4.588999720545806e-06, + "logits/chosen": -0.47257858514785767, + "logits/rejected": -0.5522888898849487, + "logps/chosen": -57.1317253112793, + "logps/rejected": -97.15071105957031, + "loss": 0.6486, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9093027114868164, + "rewards/margins": 6.337100028991699, + "rewards/rejected": -3.427797555923462, + "step": 10516 + }, + { + "epoch": 2.63, + "grad_norm": 17.996986389160156, + "learning_rate": 4.588216397873797e-06, + "logits/chosen": -0.5624269843101501, + "logits/rejected": -0.6192190051078796, + "logps/chosen": -50.5185546875, + "logps/rejected": -90.33425903320312, + "loss": 0.88, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0111050605773926, + "rewards/margins": 5.694729804992676, + "rewards/rejected": -2.683624744415283, + "step": 10517 + }, + { + "epoch": 2.63, + "grad_norm": 7.7854461669921875, + "learning_rate": 4.587433085377397e-06, + "logits/chosen": -0.45359548926353455, + "logits/rejected": -0.516747772693634, + "logps/chosen": -64.56499481201172, + "logps/rejected": -92.63762664794922, + "loss": 0.7928, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8156473636627197, + "rewards/margins": 5.517351150512695, + "rewards/rejected": -2.7017035484313965, + "step": 10518 + }, + { + "epoch": 2.63, + "grad_norm": 6.093291282653809, + "learning_rate": 4.586649783075964e-06, + "logits/chosen": -0.40782028436660767, + "logits/rejected": -0.517534613609314, + "logps/chosen": -58.465667724609375, + "logps/rejected": -96.11575317382812, + "loss": 0.7653, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.853545665740967, + "rewards/margins": 5.663436412811279, + "rewards/rejected": -2.8098912239074707, + "step": 10519 + }, + { + "epoch": 2.63, + "grad_norm": 3.5108282566070557, + "learning_rate": 4.585866490988855e-06, + "logits/chosen": -0.41418755054473877, + "logits/rejected": -0.5250252485275269, + "logps/chosen": -56.47289276123047, + "logps/rejected": -102.17610931396484, + "loss": 0.5689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.192239284515381, + "rewards/margins": 6.684533596038818, + "rewards/rejected": -3.4922945499420166, + "step": 10520 + }, + { + "epoch": 2.63, + "grad_norm": 3.524717330932617, + "learning_rate": 4.585083209135426e-06, + "logits/chosen": -0.5140609741210938, + "logits/rejected": -0.5341402888298035, + "logps/chosen": -57.54692840576172, + "logps/rejected": -113.82157897949219, + "loss": 0.695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0816073417663574, + "rewards/margins": 6.773552894592285, + "rewards/rejected": -3.6919450759887695, + "step": 10521 + }, + { + "epoch": 2.63, + "grad_norm": 7.5920891761779785, + "learning_rate": 4.584299937535031e-06, + "logits/chosen": -0.4818919003009796, + "logits/rejected": -0.5664147138595581, + "logps/chosen": -59.978694915771484, + "logps/rejected": -104.00108337402344, + "loss": 0.6183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9901418685913086, + "rewards/margins": 6.4682793617248535, + "rewards/rejected": -3.478137254714966, + "step": 10522 + }, + { + "epoch": 2.63, + "grad_norm": 1.944089412689209, + "learning_rate": 4.583516676207028e-06, + "logits/chosen": -0.49024319648742676, + "logits/rejected": -0.6322023272514343, + "logps/chosen": -65.35322570800781, + "logps/rejected": -103.73993682861328, + "loss": 0.555, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8526735305786133, + "rewards/margins": 7.772468090057373, + "rewards/rejected": -4.919794082641602, + "step": 10523 + }, + { + "epoch": 2.63, + "grad_norm": 25.664575576782227, + "learning_rate": 4.582733425170771e-06, + "logits/chosen": -0.47741368412971497, + "logits/rejected": -0.5629292130470276, + "logps/chosen": -54.716346740722656, + "logps/rejected": -94.00592803955078, + "loss": 0.8082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.520923614501953, + "rewards/margins": 5.609663963317871, + "rewards/rejected": -3.088740110397339, + "step": 10524 + }, + { + "epoch": 2.63, + "grad_norm": 6.709239959716797, + "learning_rate": 4.581950184445614e-06, + "logits/chosen": -0.5731201767921448, + "logits/rejected": -0.6174225807189941, + "logps/chosen": -49.6557731628418, + "logps/rejected": -100.68206787109375, + "loss": 0.677, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0160343647003174, + "rewards/margins": 5.498020172119141, + "rewards/rejected": -2.4819862842559814, + "step": 10525 + }, + { + "epoch": 2.63, + "grad_norm": 2.4808011054992676, + "learning_rate": 4.5811669540509105e-06, + "logits/chosen": -0.5160750150680542, + "logits/rejected": -0.6205848455429077, + "logps/chosen": -43.74298095703125, + "logps/rejected": -89.8392562866211, + "loss": 0.5651, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1993982791900635, + "rewards/margins": 6.418183326721191, + "rewards/rejected": -3.218785047531128, + "step": 10526 + }, + { + "epoch": 2.63, + "grad_norm": 13.285357475280762, + "learning_rate": 4.58038373400602e-06, + "logits/chosen": -0.4117812216281891, + "logits/rejected": -0.5229240655899048, + "logps/chosen": -59.23239517211914, + "logps/rejected": -99.58746337890625, + "loss": 0.7224, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9322738647460938, + "rewards/margins": 6.31419038772583, + "rewards/rejected": -3.381916046142578, + "step": 10527 + }, + { + "epoch": 2.63, + "grad_norm": 9.951833724975586, + "learning_rate": 4.5796005243302945e-06, + "logits/chosen": -0.5286710262298584, + "logits/rejected": -0.5950770378112793, + "logps/chosen": -58.24105453491211, + "logps/rejected": -103.53968811035156, + "loss": 0.6868, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6208982467651367, + "rewards/margins": 6.148102760314941, + "rewards/rejected": -3.527205228805542, + "step": 10528 + }, + { + "epoch": 2.63, + "grad_norm": 6.545805931091309, + "learning_rate": 4.5788173250430845e-06, + "logits/chosen": -0.46374014019966125, + "logits/rejected": -0.5684946775436401, + "logps/chosen": -48.39139175415039, + "logps/rejected": -113.9386978149414, + "loss": 0.7142, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.97352933883667, + "rewards/margins": 6.893923282623291, + "rewards/rejected": -3.9203941822052, + "step": 10529 + }, + { + "epoch": 2.63, + "grad_norm": 3.165313482284546, + "learning_rate": 4.578034136163748e-06, + "logits/chosen": -0.471437007188797, + "logits/rejected": -0.6034620404243469, + "logps/chosen": -62.087242126464844, + "logps/rejected": -97.16888427734375, + "loss": 0.5774, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.788226366043091, + "rewards/margins": 7.5139689445495605, + "rewards/rejected": -4.725742816925049, + "step": 10530 + }, + { + "epoch": 2.63, + "grad_norm": 3.665008306503296, + "learning_rate": 4.577250957711636e-06, + "logits/chosen": -0.5347025394439697, + "logits/rejected": -0.555759847164154, + "logps/chosen": -50.07112121582031, + "logps/rejected": -109.73695373535156, + "loss": 0.6003, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0527091026306152, + "rewards/margins": 6.000964641571045, + "rewards/rejected": -2.948256015777588, + "step": 10531 + }, + { + "epoch": 2.63, + "grad_norm": 7.783342361450195, + "learning_rate": 4.576467789706104e-06, + "logits/chosen": -0.47506970167160034, + "logits/rejected": -0.5375891327857971, + "logps/chosen": -56.402793884277344, + "logps/rejected": -111.62501525878906, + "loss": 0.7968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.377009153366089, + "rewards/margins": 5.401211738586426, + "rewards/rejected": -2.024203300476074, + "step": 10532 + }, + { + "epoch": 2.63, + "grad_norm": 3.818141460418701, + "learning_rate": 4.5756846321665015e-06, + "logits/chosen": -0.4701518714427948, + "logits/rejected": -0.539627730846405, + "logps/chosen": -48.49712371826172, + "logps/rejected": -93.29661560058594, + "loss": 0.6405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.329648017883301, + "rewards/margins": 5.969359397888184, + "rewards/rejected": -2.6397109031677246, + "step": 10533 + }, + { + "epoch": 2.64, + "grad_norm": 6.20610237121582, + "learning_rate": 4.574901485112185e-06, + "logits/chosen": -0.4240380823612213, + "logits/rejected": -0.5175818204879761, + "logps/chosen": -59.96555709838867, + "logps/rejected": -100.18328857421875, + "loss": 0.614, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.816711902618408, + "rewards/margins": 6.090213775634766, + "rewards/rejected": -3.273501396179199, + "step": 10534 + }, + { + "epoch": 2.64, + "grad_norm": 4.28627347946167, + "learning_rate": 4.5741183485625044e-06, + "logits/chosen": -0.4927437901496887, + "logits/rejected": -0.6053398847579956, + "logps/chosen": -70.64529418945312, + "logps/rejected": -88.23301696777344, + "loss": 0.6981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.943110227584839, + "rewards/margins": 5.681527137756348, + "rewards/rejected": -2.7384166717529297, + "step": 10535 + }, + { + "epoch": 2.64, + "grad_norm": 2.8733887672424316, + "learning_rate": 4.573335222536811e-06, + "logits/chosen": -0.4604502320289612, + "logits/rejected": -0.5745641589164734, + "logps/chosen": -64.36151885986328, + "logps/rejected": -80.4374008178711, + "loss": 0.6421, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0724990367889404, + "rewards/margins": 6.315630912780762, + "rewards/rejected": -3.243131637573242, + "step": 10536 + }, + { + "epoch": 2.64, + "grad_norm": 3.3871400356292725, + "learning_rate": 4.572552107054458e-06, + "logits/chosen": -0.4425937831401825, + "logits/rejected": -0.5318138599395752, + "logps/chosen": -55.49811935424805, + "logps/rejected": -89.71515655517578, + "loss": 0.5916, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.312098979949951, + "rewards/margins": 5.78400182723999, + "rewards/rejected": -2.4719033241271973, + "step": 10537 + }, + { + "epoch": 2.64, + "grad_norm": 5.687597751617432, + "learning_rate": 4.571769002134798e-06, + "logits/chosen": -0.4226769804954529, + "logits/rejected": -0.5381475687026978, + "logps/chosen": -56.56864929199219, + "logps/rejected": -83.08924865722656, + "loss": 0.7119, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.917802572250366, + "rewards/margins": 5.837321758270264, + "rewards/rejected": -2.9195191860198975, + "step": 10538 + }, + { + "epoch": 2.64, + "grad_norm": 5.177141189575195, + "learning_rate": 4.570985907797183e-06, + "logits/chosen": -0.4031553864479065, + "logits/rejected": -0.4658246636390686, + "logps/chosen": -58.153350830078125, + "logps/rejected": -114.54356384277344, + "loss": 0.799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.109471082687378, + "rewards/margins": 5.5676798820495605, + "rewards/rejected": -2.4582090377807617, + "step": 10539 + }, + { + "epoch": 2.64, + "grad_norm": 11.326264381408691, + "learning_rate": 4.570202824060961e-06, + "logits/chosen": -0.43620723485946655, + "logits/rejected": -0.5649623870849609, + "logps/chosen": -64.0159683227539, + "logps/rejected": -99.13299560546875, + "loss": 0.7175, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9599006175994873, + "rewards/margins": 6.678736686706543, + "rewards/rejected": -3.7188363075256348, + "step": 10540 + }, + { + "epoch": 2.64, + "grad_norm": 7.155141353607178, + "learning_rate": 4.569419750945483e-06, + "logits/chosen": -0.47480514645576477, + "logits/rejected": -0.5554443001747131, + "logps/chosen": -49.95478439331055, + "logps/rejected": -99.56498718261719, + "loss": 0.6709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.102299690246582, + "rewards/margins": 6.5224199295043945, + "rewards/rejected": -3.4201204776763916, + "step": 10541 + }, + { + "epoch": 2.64, + "grad_norm": 14.886515617370605, + "learning_rate": 4.568636688470105e-06, + "logits/chosen": -0.49863389134407043, + "logits/rejected": -0.5635229349136353, + "logps/chosen": -57.687381744384766, + "logps/rejected": -100.47576904296875, + "loss": 0.7059, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8326780796051025, + "rewards/margins": 5.055387496948242, + "rewards/rejected": -2.2227094173431396, + "step": 10542 + }, + { + "epoch": 2.64, + "grad_norm": 2.609811782836914, + "learning_rate": 4.5678536366541685e-06, + "logits/chosen": -0.4509584307670593, + "logits/rejected": -0.5193732976913452, + "logps/chosen": -51.707279205322266, + "logps/rejected": -98.6654052734375, + "loss": 0.5834, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.045532703399658, + "rewards/margins": 6.078803062438965, + "rewards/rejected": -3.0332698822021484, + "step": 10543 + }, + { + "epoch": 2.64, + "grad_norm": 22.87324333190918, + "learning_rate": 4.567070595517031e-06, + "logits/chosen": -0.5090531706809998, + "logits/rejected": -0.5286175012588501, + "logps/chosen": -56.36064910888672, + "logps/rejected": -99.01437377929688, + "loss": 0.7931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.615692615509033, + "rewards/margins": 5.5542402267456055, + "rewards/rejected": -2.9385476112365723, + "step": 10544 + }, + { + "epoch": 2.64, + "grad_norm": 7.749320983886719, + "learning_rate": 4.566287565078038e-06, + "logits/chosen": -0.40438729524612427, + "logits/rejected": -0.5247249603271484, + "logps/chosen": -58.429141998291016, + "logps/rejected": -94.57925415039062, + "loss": 0.6973, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8676841259002686, + "rewards/margins": 5.137697696685791, + "rewards/rejected": -2.2700135707855225, + "step": 10545 + }, + { + "epoch": 2.64, + "grad_norm": 12.383708000183105, + "learning_rate": 4.565504545356543e-06, + "logits/chosen": -0.46237385272979736, + "logits/rejected": -0.5104255676269531, + "logps/chosen": -52.66188049316406, + "logps/rejected": -93.88469696044922, + "loss": 0.7025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.546710252761841, + "rewards/margins": 5.535763740539551, + "rewards/rejected": -2.989053726196289, + "step": 10546 + }, + { + "epoch": 2.64, + "grad_norm": 2.8273048400878906, + "learning_rate": 4.5647215363718905e-06, + "logits/chosen": -0.5763722658157349, + "logits/rejected": -0.6648638248443604, + "logps/chosen": -51.42137908935547, + "logps/rejected": -97.97952270507812, + "loss": 0.5885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3006417751312256, + "rewards/margins": 6.806097984313965, + "rewards/rejected": -3.505455493927002, + "step": 10547 + }, + { + "epoch": 2.64, + "grad_norm": 4.112672805786133, + "learning_rate": 4.563938538143433e-06, + "logits/chosen": -0.5790450572967529, + "logits/rejected": -0.6740944385528564, + "logps/chosen": -55.5568962097168, + "logps/rejected": -108.7782211303711, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.220397472381592, + "rewards/margins": 6.0386128425598145, + "rewards/rejected": -2.818215847015381, + "step": 10548 + }, + { + "epoch": 2.64, + "grad_norm": 7.6240434646606445, + "learning_rate": 4.563155550690519e-06, + "logits/chosen": -0.4387083649635315, + "logits/rejected": -0.5217311978340149, + "logps/chosen": -53.74199676513672, + "logps/rejected": -107.98392486572266, + "loss": 0.7429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9257349967956543, + "rewards/margins": 6.605754852294922, + "rewards/rejected": -3.6800198554992676, + "step": 10549 + }, + { + "epoch": 2.64, + "grad_norm": 5.206517696380615, + "learning_rate": 4.5623725740324934e-06, + "logits/chosen": -0.43875670433044434, + "logits/rejected": -0.5044105052947998, + "logps/chosen": -44.517364501953125, + "logps/rejected": -98.13518524169922, + "loss": 0.6357, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.161494731903076, + "rewards/margins": 6.347450256347656, + "rewards/rejected": -3.18595552444458, + "step": 10550 + }, + { + "epoch": 2.64, + "grad_norm": 11.289283752441406, + "learning_rate": 4.561589608188709e-06, + "logits/chosen": -0.44996505975723267, + "logits/rejected": -0.5510072708129883, + "logps/chosen": -57.58129119873047, + "logps/rejected": -90.92694854736328, + "loss": 0.8536, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9338912963867188, + "rewards/margins": 5.850531578063965, + "rewards/rejected": -2.916640281677246, + "step": 10551 + }, + { + "epoch": 2.64, + "grad_norm": 9.827101707458496, + "learning_rate": 4.560806653178509e-06, + "logits/chosen": -0.45901286602020264, + "logits/rejected": -0.5539959073066711, + "logps/chosen": -60.263858795166016, + "logps/rejected": -94.10884857177734, + "loss": 0.7393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7511544227600098, + "rewards/margins": 5.804748058319092, + "rewards/rejected": -3.053593397140503, + "step": 10552 + }, + { + "epoch": 2.64, + "grad_norm": 4.107783794403076, + "learning_rate": 4.560023709021248e-06, + "logits/chosen": -0.4690907597541809, + "logits/rejected": -0.5549360513687134, + "logps/chosen": -55.17527770996094, + "logps/rejected": -93.38961029052734, + "loss": 0.7116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3878092765808105, + "rewards/margins": 6.380743503570557, + "rewards/rejected": -2.992933750152588, + "step": 10553 + }, + { + "epoch": 2.64, + "grad_norm": 3.3426640033721924, + "learning_rate": 4.559240775736267e-06, + "logits/chosen": -0.4383949041366577, + "logits/rejected": -0.5151529908180237, + "logps/chosen": -52.312835693359375, + "logps/rejected": -83.90093231201172, + "loss": 0.5884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.337559223175049, + "rewards/margins": 5.125794410705566, + "rewards/rejected": -1.788234829902649, + "step": 10554 + }, + { + "epoch": 2.64, + "grad_norm": 8.982943534851074, + "learning_rate": 4.558457853342913e-06, + "logits/chosen": -0.40334123373031616, + "logits/rejected": -0.48928526043891907, + "logps/chosen": -67.90362548828125, + "logps/rejected": -107.53663635253906, + "loss": 0.7119, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.985485553741455, + "rewards/margins": 6.689604759216309, + "rewards/rejected": -3.7041192054748535, + "step": 10555 + }, + { + "epoch": 2.64, + "grad_norm": 3.059025287628174, + "learning_rate": 4.557674941860537e-06, + "logits/chosen": -0.4856053590774536, + "logits/rejected": -0.5862410664558411, + "logps/chosen": -57.07614517211914, + "logps/rejected": -88.22026062011719, + "loss": 0.6422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.08955717086792, + "rewards/margins": 6.560012340545654, + "rewards/rejected": -3.4704554080963135, + "step": 10556 + }, + { + "epoch": 2.64, + "grad_norm": 3.3315300941467285, + "learning_rate": 4.556892041308485e-06, + "logits/chosen": -0.4605919122695923, + "logits/rejected": -0.5114201307296753, + "logps/chosen": -51.03330612182617, + "logps/rejected": -99.5992431640625, + "loss": 0.5737, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1792986392974854, + "rewards/margins": 6.671568393707275, + "rewards/rejected": -3.492269277572632, + "step": 10557 + }, + { + "epoch": 2.64, + "grad_norm": 15.773545265197754, + "learning_rate": 4.5561091517061e-06, + "logits/chosen": -0.4309249520301819, + "logits/rejected": -0.4697408080101013, + "logps/chosen": -53.664310455322266, + "logps/rejected": -114.99177551269531, + "loss": 0.7136, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1193487644195557, + "rewards/margins": 6.734713077545166, + "rewards/rejected": -3.6153640747070312, + "step": 10558 + }, + { + "epoch": 2.64, + "grad_norm": 3.0420615673065186, + "learning_rate": 4.555326273072731e-06, + "logits/chosen": -0.5611563920974731, + "logits/rejected": -0.5994807481765747, + "logps/chosen": -51.189476013183594, + "logps/rejected": -103.39702606201172, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.259270668029785, + "rewards/margins": 6.507993698120117, + "rewards/rejected": -3.248723268508911, + "step": 10559 + }, + { + "epoch": 2.64, + "grad_norm": 5.495584964752197, + "learning_rate": 4.554543405427724e-06, + "logits/chosen": -0.4823164641857147, + "logits/rejected": -0.5617482662200928, + "logps/chosen": -58.07625198364258, + "logps/rejected": -95.65889739990234, + "loss": 0.6777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.96555495262146, + "rewards/margins": 6.284183502197266, + "rewards/rejected": -3.318629264831543, + "step": 10560 + }, + { + "epoch": 2.64, + "grad_norm": 5.363163471221924, + "learning_rate": 4.5537605487904215e-06, + "logits/chosen": -0.5311877131462097, + "logits/rejected": -0.5606521368026733, + "logps/chosen": -57.550132751464844, + "logps/rejected": -113.69136047363281, + "loss": 0.7126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.666905403137207, + "rewards/margins": 6.379740238189697, + "rewards/rejected": -3.712834358215332, + "step": 10561 + }, + { + "epoch": 2.64, + "grad_norm": 10.450366973876953, + "learning_rate": 4.55297770318017e-06, + "logits/chosen": -0.47084811329841614, + "logits/rejected": -0.547133207321167, + "logps/chosen": -65.2528076171875, + "logps/rejected": -109.4643783569336, + "loss": 0.7174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.573551893234253, + "rewards/margins": 5.975069522857666, + "rewards/rejected": -3.401517868041992, + "step": 10562 + }, + { + "epoch": 2.64, + "grad_norm": 2.8167049884796143, + "learning_rate": 4.552194868616316e-06, + "logits/chosen": -0.4863100051879883, + "logits/rejected": -0.5253118276596069, + "logps/chosen": -53.787715911865234, + "logps/rejected": -101.98722839355469, + "loss": 0.7365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.243563652038574, + "rewards/margins": 6.524176120758057, + "rewards/rejected": -3.2806129455566406, + "step": 10563 + }, + { + "epoch": 2.64, + "grad_norm": 4.378298759460449, + "learning_rate": 4.551412045118204e-06, + "logits/chosen": -0.5055834054946899, + "logits/rejected": -0.5989927053451538, + "logps/chosen": -51.36024856567383, + "logps/rejected": -115.83589172363281, + "loss": 0.602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.138923406600952, + "rewards/margins": 6.46861457824707, + "rewards/rejected": -3.3296914100646973, + "step": 10564 + }, + { + "epoch": 2.64, + "grad_norm": 5.7542405128479, + "learning_rate": 4.550629232705176e-06, + "logits/chosen": -0.4474128782749176, + "logits/rejected": -0.562289297580719, + "logps/chosen": -74.39009094238281, + "logps/rejected": -94.073974609375, + "loss": 0.6968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1819560527801514, + "rewards/margins": 6.651498317718506, + "rewards/rejected": -3.4695427417755127, + "step": 10565 + }, + { + "epoch": 2.64, + "grad_norm": 6.088616847991943, + "learning_rate": 4.549846431396578e-06, + "logits/chosen": -0.4058215022087097, + "logits/rejected": -0.4538065195083618, + "logps/chosen": -56.970794677734375, + "logps/rejected": -103.6313705444336, + "loss": 0.619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2254698276519775, + "rewards/margins": 5.900551795959473, + "rewards/rejected": -2.675081729888916, + "step": 10566 + }, + { + "epoch": 2.64, + "grad_norm": 6.667294025421143, + "learning_rate": 4.549063641211754e-06, + "logits/chosen": -0.5955188870429993, + "logits/rejected": -0.6585607528686523, + "logps/chosen": -49.79493713378906, + "logps/rejected": -101.6106948852539, + "loss": 0.6891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.943565845489502, + "rewards/margins": 6.242205619812012, + "rewards/rejected": -3.2986395359039307, + "step": 10567 + }, + { + "epoch": 2.64, + "grad_norm": 3.2708187103271484, + "learning_rate": 4.5482808621700465e-06, + "logits/chosen": -0.5097631812095642, + "logits/rejected": -0.6047447919845581, + "logps/chosen": -53.397117614746094, + "logps/rejected": -99.1840591430664, + "loss": 0.6618, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0123682022094727, + "rewards/margins": 7.03673791885376, + "rewards/rejected": -4.024370193481445, + "step": 10568 + }, + { + "epoch": 2.64, + "grad_norm": 16.95258903503418, + "learning_rate": 4.5474980942907984e-06, + "logits/chosen": -0.5122551918029785, + "logits/rejected": -0.5680825114250183, + "logps/chosen": -51.59081268310547, + "logps/rejected": -89.67826843261719, + "loss": 0.7307, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.938588857650757, + "rewards/margins": 5.116968154907227, + "rewards/rejected": -2.1783792972564697, + "step": 10569 + }, + { + "epoch": 2.64, + "grad_norm": 20.9179744720459, + "learning_rate": 4.546715337593354e-06, + "logits/chosen": -0.5022661089897156, + "logits/rejected": -0.5425970554351807, + "logps/chosen": -54.4917106628418, + "logps/rejected": -97.51679992675781, + "loss": 0.8481, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7758240699768066, + "rewards/margins": 5.299393177032471, + "rewards/rejected": -2.523568868637085, + "step": 10570 + }, + { + "epoch": 2.64, + "grad_norm": 3.853360176086426, + "learning_rate": 4.5459325920970586e-06, + "logits/chosen": -0.41750872135162354, + "logits/rejected": -0.4744100272655487, + "logps/chosen": -65.45122528076172, + "logps/rejected": -97.33922576904297, + "loss": 0.7048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1168839931488037, + "rewards/margins": 5.7905120849609375, + "rewards/rejected": -2.6736278533935547, + "step": 10571 + }, + { + "epoch": 2.64, + "grad_norm": 4.8979172706604, + "learning_rate": 4.545149857821248e-06, + "logits/chosen": -0.4842800498008728, + "logits/rejected": -0.5799021124839783, + "logps/chosen": -54.41250991821289, + "logps/rejected": -95.80126190185547, + "loss": 0.703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6692326068878174, + "rewards/margins": 7.202218055725098, + "rewards/rejected": -4.532985210418701, + "step": 10572 + }, + { + "epoch": 2.65, + "grad_norm": 6.9369072914123535, + "learning_rate": 4.54436713478527e-06, + "logits/chosen": -0.5702840685844421, + "logits/rejected": -0.5674440860748291, + "logps/chosen": -53.07331085205078, + "logps/rejected": -106.03099060058594, + "loss": 0.7159, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0475821495056152, + "rewards/margins": 5.354331016540527, + "rewards/rejected": -2.306748867034912, + "step": 10573 + }, + { + "epoch": 2.65, + "grad_norm": 6.355814456939697, + "learning_rate": 4.543584423008466e-06, + "logits/chosen": -0.4244271218776703, + "logits/rejected": -0.4981972873210907, + "logps/chosen": -62.502742767333984, + "logps/rejected": -106.79029846191406, + "loss": 0.7051, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8748795986175537, + "rewards/margins": 6.7801079750061035, + "rewards/rejected": -3.90522837638855, + "step": 10574 + }, + { + "epoch": 2.65, + "grad_norm": 5.912476062774658, + "learning_rate": 4.542801722510175e-06, + "logits/chosen": -0.5678689479827881, + "logits/rejected": -0.6276028156280518, + "logps/chosen": -56.57456970214844, + "logps/rejected": -81.86824035644531, + "loss": 0.7524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0934839248657227, + "rewards/margins": 5.388123989105225, + "rewards/rejected": -2.2946407794952393, + "step": 10575 + }, + { + "epoch": 2.65, + "grad_norm": 5.167147636413574, + "learning_rate": 4.542019033309739e-06, + "logits/chosen": -0.43958428502082825, + "logits/rejected": -0.5200287103652954, + "logps/chosen": -50.194740295410156, + "logps/rejected": -84.92234802246094, + "loss": 0.6437, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2073516845703125, + "rewards/margins": 5.192842960357666, + "rewards/rejected": -1.9854915142059326, + "step": 10576 + }, + { + "epoch": 2.65, + "grad_norm": 7.416764259338379, + "learning_rate": 4.5412363554265e-06, + "logits/chosen": -0.4394356310367584, + "logits/rejected": -0.5004766583442688, + "logps/chosen": -57.553253173828125, + "logps/rejected": -117.52041625976562, + "loss": 0.618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0707972049713135, + "rewards/margins": 6.693987846374512, + "rewards/rejected": -3.623190402984619, + "step": 10577 + }, + { + "epoch": 2.65, + "grad_norm": 16.17082977294922, + "learning_rate": 4.5404536888798e-06, + "logits/chosen": -0.4685702323913574, + "logits/rejected": -0.5872271656990051, + "logps/chosen": -57.869651794433594, + "logps/rejected": -117.60509490966797, + "loss": 0.7281, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.981612205505371, + "rewards/margins": 7.217140197753906, + "rewards/rejected": -4.235527515411377, + "step": 10578 + }, + { + "epoch": 2.65, + "grad_norm": 4.789523601531982, + "learning_rate": 4.539671033688977e-06, + "logits/chosen": -0.5470346212387085, + "logits/rejected": -0.6353803277015686, + "logps/chosen": -51.112300872802734, + "logps/rejected": -105.66828155517578, + "loss": 0.632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.855681896209717, + "rewards/margins": 7.251064300537109, + "rewards/rejected": -4.395382881164551, + "step": 10579 + }, + { + "epoch": 2.65, + "grad_norm": 3.752631902694702, + "learning_rate": 4.538888389873372e-06, + "logits/chosen": -0.47904375195503235, + "logits/rejected": -0.6150698661804199, + "logps/chosen": -67.72576904296875, + "logps/rejected": -109.43113708496094, + "loss": 0.7028, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3190860748291016, + "rewards/margins": 6.850131511688232, + "rewards/rejected": -3.5310451984405518, + "step": 10580 + }, + { + "epoch": 2.65, + "grad_norm": 14.627710342407227, + "learning_rate": 4.538105757452326e-06, + "logits/chosen": -0.5450696349143982, + "logits/rejected": -0.6358756422996521, + "logps/chosen": -44.243080139160156, + "logps/rejected": -98.82203674316406, + "loss": 0.5639, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.838240146636963, + "rewards/margins": 6.474188804626465, + "rewards/rejected": -3.635948896408081, + "step": 10581 + }, + { + "epoch": 2.65, + "grad_norm": 34.45304489135742, + "learning_rate": 4.53732313644518e-06, + "logits/chosen": -0.538267970085144, + "logits/rejected": -0.6038752198219299, + "logps/chosen": -52.10153579711914, + "logps/rejected": -110.98028564453125, + "loss": 0.764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9563698768615723, + "rewards/margins": 6.676884174346924, + "rewards/rejected": -3.7205142974853516, + "step": 10582 + }, + { + "epoch": 2.65, + "grad_norm": 6.534718036651611, + "learning_rate": 4.5365405268712695e-06, + "logits/chosen": -0.494322806596756, + "logits/rejected": -0.5952500104904175, + "logps/chosen": -54.74120330810547, + "logps/rejected": -89.86734008789062, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.814467191696167, + "rewards/margins": 5.369138717651367, + "rewards/rejected": -2.5546715259552, + "step": 10583 + }, + { + "epoch": 2.65, + "grad_norm": 6.3857526779174805, + "learning_rate": 4.535757928749934e-06, + "logits/chosen": -0.5204935073852539, + "logits/rejected": -0.6357452273368835, + "logps/chosen": -62.509361267089844, + "logps/rejected": -92.46134948730469, + "loss": 0.6981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.944464683532715, + "rewards/margins": 6.2113356590271, + "rewards/rejected": -3.266871452331543, + "step": 10584 + }, + { + "epoch": 2.65, + "grad_norm": 3.2132833003997803, + "learning_rate": 4.534975342100517e-06, + "logits/chosen": -0.44598913192749023, + "logits/rejected": -0.5240596532821655, + "logps/chosen": -46.402706146240234, + "logps/rejected": -105.88031768798828, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.390181303024292, + "rewards/margins": 6.800385475158691, + "rewards/rejected": -3.4102044105529785, + "step": 10585 + }, + { + "epoch": 2.65, + "grad_norm": 10.035026550292969, + "learning_rate": 4.534192766942351e-06, + "logits/chosen": -0.5417477488517761, + "logits/rejected": -0.6522137522697449, + "logps/chosen": -58.10663604736328, + "logps/rejected": -86.42146301269531, + "loss": 0.7791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.868367910385132, + "rewards/margins": 6.919469833374023, + "rewards/rejected": -4.0511016845703125, + "step": 10586 + }, + { + "epoch": 2.65, + "grad_norm": 6.570184707641602, + "learning_rate": 4.533410203294779e-06, + "logits/chosen": -0.402094304561615, + "logits/rejected": -0.47606807947158813, + "logps/chosen": -58.659156799316406, + "logps/rejected": -101.03369140625, + "loss": 0.6891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0069656372070312, + "rewards/margins": 6.018204689025879, + "rewards/rejected": -3.0112392902374268, + "step": 10587 + }, + { + "epoch": 2.65, + "grad_norm": 3.7712817192077637, + "learning_rate": 4.532627651177136e-06, + "logits/chosen": -0.48990505933761597, + "logits/rejected": -0.5709452033042908, + "logps/chosen": -52.62677001953125, + "logps/rejected": -121.57437133789062, + "loss": 0.6583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2175488471984863, + "rewards/margins": 7.426336765289307, + "rewards/rejected": -4.208786964416504, + "step": 10588 + }, + { + "epoch": 2.65, + "grad_norm": 6.216098308563232, + "learning_rate": 4.5318451106087625e-06, + "logits/chosen": -0.40927231311798096, + "logits/rejected": -0.4954575002193451, + "logps/chosen": -57.84636688232422, + "logps/rejected": -98.05167388916016, + "loss": 0.6662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.052398920059204, + "rewards/margins": 5.486941814422607, + "rewards/rejected": -2.434542655944824, + "step": 10589 + }, + { + "epoch": 2.65, + "grad_norm": 3.472113847732544, + "learning_rate": 4.531062581608992e-06, + "logits/chosen": -0.5182504653930664, + "logits/rejected": -0.5966603755950928, + "logps/chosen": -51.86431884765625, + "logps/rejected": -99.02337646484375, + "loss": 0.6523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946261167526245, + "rewards/margins": 6.4099860191345215, + "rewards/rejected": -3.4637248516082764, + "step": 10590 + }, + { + "epoch": 2.65, + "grad_norm": 9.27515697479248, + "learning_rate": 4.530280064197166e-06, + "logits/chosen": -0.5325335264205933, + "logits/rejected": -0.5555931329727173, + "logps/chosen": -53.25824737548828, + "logps/rejected": -102.45612335205078, + "loss": 0.8539, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.75173282623291, + "rewards/margins": 4.524594306945801, + "rewards/rejected": -1.7728618383407593, + "step": 10591 + }, + { + "epoch": 2.65, + "grad_norm": 14.4767427444458, + "learning_rate": 4.5294975583926195e-06, + "logits/chosen": -0.5037326812744141, + "logits/rejected": -0.5571799278259277, + "logps/chosen": -46.631710052490234, + "logps/rejected": -86.30059814453125, + "loss": 0.7495, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.248798131942749, + "rewards/margins": 4.714126110076904, + "rewards/rejected": -1.4653279781341553, + "step": 10592 + }, + { + "epoch": 2.65, + "grad_norm": 6.434416770935059, + "learning_rate": 4.528715064214687e-06, + "logits/chosen": -0.4876753091812134, + "logits/rejected": -0.6037272810935974, + "logps/chosen": -57.56022644042969, + "logps/rejected": -93.65388488769531, + "loss": 0.7239, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.810319185256958, + "rewards/margins": 5.435714244842529, + "rewards/rejected": -2.6253952980041504, + "step": 10593 + }, + { + "epoch": 2.65, + "grad_norm": 3.801898717880249, + "learning_rate": 4.527932581682707e-06, + "logits/chosen": -0.5369168519973755, + "logits/rejected": -0.5897334814071655, + "logps/chosen": -57.302879333496094, + "logps/rejected": -112.13721466064453, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0026497840881348, + "rewards/margins": 5.506145000457764, + "rewards/rejected": -2.5034947395324707, + "step": 10594 + }, + { + "epoch": 2.65, + "grad_norm": 3.471576452255249, + "learning_rate": 4.5271501108160156e-06, + "logits/chosen": -0.3680131137371063, + "logits/rejected": -0.4599287211894989, + "logps/chosen": -48.50299072265625, + "logps/rejected": -105.77450561523438, + "loss": 0.6819, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.81545352935791, + "rewards/margins": 6.838104248046875, + "rewards/rejected": -4.022650718688965, + "step": 10595 + }, + { + "epoch": 2.65, + "grad_norm": 14.005392074584961, + "learning_rate": 4.5263676516339475e-06, + "logits/chosen": -0.4047352373600006, + "logits/rejected": -0.5187109112739563, + "logps/chosen": -60.20396041870117, + "logps/rejected": -80.5509262084961, + "loss": 0.6894, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.604398488998413, + "rewards/margins": 4.705408096313477, + "rewards/rejected": -2.1010096073150635, + "step": 10596 + }, + { + "epoch": 2.65, + "grad_norm": 9.829363822937012, + "learning_rate": 4.525585204155839e-06, + "logits/chosen": -0.487775981426239, + "logits/rejected": -0.611700177192688, + "logps/chosen": -71.06621551513672, + "logps/rejected": -101.18614196777344, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.711519718170166, + "rewards/margins": 6.2825541496276855, + "rewards/rejected": -3.5710346698760986, + "step": 10597 + }, + { + "epoch": 2.65, + "grad_norm": 6.4986090660095215, + "learning_rate": 4.524802768401023e-06, + "logits/chosen": -0.3962729871273041, + "logits/rejected": -0.525497555732727, + "logps/chosen": -64.1243896484375, + "logps/rejected": -92.10530090332031, + "loss": 0.6834, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3548781871795654, + "rewards/margins": 6.123982906341553, + "rewards/rejected": -2.769104480743408, + "step": 10598 + }, + { + "epoch": 2.65, + "grad_norm": 11.101576805114746, + "learning_rate": 4.524020344388839e-06, + "logits/chosen": -0.4220677614212036, + "logits/rejected": -0.4792305529117584, + "logps/chosen": -53.65415573120117, + "logps/rejected": -102.23440551757812, + "loss": 0.7512, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.976468563079834, + "rewards/margins": 5.397233009338379, + "rewards/rejected": -2.4207639694213867, + "step": 10599 + }, + { + "epoch": 2.65, + "grad_norm": 3.850316286087036, + "learning_rate": 4.523237932138616e-06, + "logits/chosen": -0.5255801677703857, + "logits/rejected": -0.6453493237495422, + "logps/chosen": -54.04547119140625, + "logps/rejected": -95.66253662109375, + "loss": 0.5791, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.441842794418335, + "rewards/margins": 6.615138530731201, + "rewards/rejected": -3.1732959747314453, + "step": 10600 + }, + { + "epoch": 2.65, + "grad_norm": 3.261542797088623, + "learning_rate": 4.522455531669691e-06, + "logits/chosen": -0.46086829900741577, + "logits/rejected": -0.5532190799713135, + "logps/chosen": -63.88746643066406, + "logps/rejected": -96.68623352050781, + "loss": 0.6536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.245648145675659, + "rewards/margins": 6.137611389160156, + "rewards/rejected": -2.891963481903076, + "step": 10601 + }, + { + "epoch": 2.65, + "grad_norm": 8.427083969116211, + "learning_rate": 4.521673143001398e-06, + "logits/chosen": -0.5180816650390625, + "logits/rejected": -0.5595705509185791, + "logps/chosen": -41.078125, + "logps/rejected": -82.09666442871094, + "loss": 0.7171, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2272419929504395, + "rewards/margins": 4.873046398162842, + "rewards/rejected": -1.645804762840271, + "step": 10602 + }, + { + "epoch": 2.65, + "grad_norm": 3.507080316543579, + "learning_rate": 4.520890766153072e-06, + "logits/chosen": -0.541353702545166, + "logits/rejected": -0.5432428121566772, + "logps/chosen": -51.32598876953125, + "logps/rejected": -106.12643432617188, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177711009979248, + "rewards/margins": 6.462619304656982, + "rewards/rejected": -3.284907579421997, + "step": 10603 + }, + { + "epoch": 2.65, + "grad_norm": 7.097860813140869, + "learning_rate": 4.5201084011440435e-06, + "logits/chosen": -0.492867112159729, + "logits/rejected": -0.5860580205917358, + "logps/chosen": -60.47522735595703, + "logps/rejected": -88.63793182373047, + "loss": 0.7152, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0634589195251465, + "rewards/margins": 4.835775852203369, + "rewards/rejected": -1.772316813468933, + "step": 10604 + }, + { + "epoch": 2.65, + "grad_norm": 5.533419132232666, + "learning_rate": 4.519326047993647e-06, + "logits/chosen": -0.49002042412757874, + "logits/rejected": -0.5670742988586426, + "logps/chosen": -60.96673583984375, + "logps/rejected": -105.16656494140625, + "loss": 0.7425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1308743953704834, + "rewards/margins": 6.01289701461792, + "rewards/rejected": -2.8820223808288574, + "step": 10605 + }, + { + "epoch": 2.65, + "grad_norm": 3.9814555644989014, + "learning_rate": 4.5185437067212155e-06, + "logits/chosen": -0.5817915201187134, + "logits/rejected": -0.694257378578186, + "logps/chosen": -49.099220275878906, + "logps/rejected": -85.44131469726562, + "loss": 0.5928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2628731727600098, + "rewards/margins": 6.731413841247559, + "rewards/rejected": -3.4685401916503906, + "step": 10606 + }, + { + "epoch": 2.65, + "grad_norm": 3.4524872303009033, + "learning_rate": 4.517761377346081e-06, + "logits/chosen": -0.5019091367721558, + "logits/rejected": -0.6030678153038025, + "logps/chosen": -55.71209716796875, + "logps/rejected": -89.96669006347656, + "loss": 0.5856, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9130702018737793, + "rewards/margins": 6.679388046264648, + "rewards/rejected": -3.76631760597229, + "step": 10607 + }, + { + "epoch": 2.65, + "grad_norm": 7.463126182556152, + "learning_rate": 4.516979059887575e-06, + "logits/chosen": -0.49285072088241577, + "logits/rejected": -0.63834148645401, + "logps/chosen": -58.437313079833984, + "logps/rejected": -75.91960144042969, + "loss": 0.8211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5502102375030518, + "rewards/margins": 5.669981956481934, + "rewards/rejected": -3.1197726726531982, + "step": 10608 + }, + { + "epoch": 2.65, + "grad_norm": 10.226956367492676, + "learning_rate": 4.5161967543650305e-06, + "logits/chosen": -0.3994503617286682, + "logits/rejected": -0.49038392305374146, + "logps/chosen": -77.65303802490234, + "logps/rejected": -86.2566909790039, + "loss": 0.7987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1884267330169678, + "rewards/margins": 5.770273685455322, + "rewards/rejected": -2.5818467140197754, + "step": 10609 + }, + { + "epoch": 2.65, + "grad_norm": 5.49338436126709, + "learning_rate": 4.51541446079778e-06, + "logits/chosen": -0.5196832418441772, + "logits/rejected": -0.578558087348938, + "logps/chosen": -60.87944793701172, + "logps/rejected": -95.21625518798828, + "loss": 0.6872, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1649184226989746, + "rewards/margins": 6.599720478057861, + "rewards/rejected": -3.4348018169403076, + "step": 10610 + }, + { + "epoch": 2.65, + "grad_norm": 16.814294815063477, + "learning_rate": 4.514632179205152e-06, + "logits/chosen": -0.43135231733322144, + "logits/rejected": -0.4896821975708008, + "logps/chosen": -50.15382385253906, + "logps/rejected": -100.20265197753906, + "loss": 0.8589, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7728607654571533, + "rewards/margins": 5.77626895904541, + "rewards/rejected": -3.003408432006836, + "step": 10611 + }, + { + "epoch": 2.65, + "grad_norm": 22.583303451538086, + "learning_rate": 4.5138499096064785e-06, + "logits/chosen": -0.4497112035751343, + "logits/rejected": -0.4984021484851837, + "logps/chosen": -59.872581481933594, + "logps/rejected": -105.08354187011719, + "loss": 0.704, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1317708492279053, + "rewards/margins": 6.284534931182861, + "rewards/rejected": -3.152764081954956, + "step": 10612 + }, + { + "epoch": 2.66, + "grad_norm": 5.173786163330078, + "learning_rate": 4.513067652021092e-06, + "logits/chosen": -0.4680425524711609, + "logits/rejected": -0.5587737560272217, + "logps/chosen": -58.4105224609375, + "logps/rejected": -80.9066390991211, + "loss": 0.7755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.280409336090088, + "rewards/margins": 6.057799339294434, + "rewards/rejected": -2.7773897647857666, + "step": 10613 + }, + { + "epoch": 2.66, + "grad_norm": 4.565174579620361, + "learning_rate": 4.5122854064683234e-06, + "logits/chosen": -0.48894035816192627, + "logits/rejected": -0.5482638478279114, + "logps/chosen": -50.925140380859375, + "logps/rejected": -116.01423645019531, + "loss": 0.6575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.360750913619995, + "rewards/margins": 7.303768634796143, + "rewards/rejected": -3.9430177211761475, + "step": 10614 + }, + { + "epoch": 2.66, + "grad_norm": 13.426924705505371, + "learning_rate": 4.511503172967499e-06, + "logits/chosen": -0.4048502445220947, + "logits/rejected": -0.5237541198730469, + "logps/chosen": -65.04295349121094, + "logps/rejected": -93.72805786132812, + "loss": 0.7756, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.991950511932373, + "rewards/margins": 5.0695905685424805, + "rewards/rejected": -2.0776400566101074, + "step": 10615 + }, + { + "epoch": 2.66, + "grad_norm": 6.0797834396362305, + "learning_rate": 4.510720951537951e-06, + "logits/chosen": -0.46729186177253723, + "logits/rejected": -0.5399273037910461, + "logps/chosen": -58.308773040771484, + "logps/rejected": -84.47091674804688, + "loss": 0.6847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.093247175216675, + "rewards/margins": 5.066123008728027, + "rewards/rejected": -1.9728755950927734, + "step": 10616 + }, + { + "epoch": 2.66, + "grad_norm": 4.7771806716918945, + "learning_rate": 4.5099387421990095e-06, + "logits/chosen": -0.5058498382568359, + "logits/rejected": -0.6425700783729553, + "logps/chosen": -61.92765808105469, + "logps/rejected": -114.2715072631836, + "loss": 0.5977, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.102038860321045, + "rewards/margins": 7.60536527633667, + "rewards/rejected": -4.503325939178467, + "step": 10617 + }, + { + "epoch": 2.66, + "grad_norm": 3.161925792694092, + "learning_rate": 4.5091565449700025e-06, + "logits/chosen": -0.49170753359794617, + "logits/rejected": -0.5335691571235657, + "logps/chosen": -50.656005859375, + "logps/rejected": -127.88890075683594, + "loss": 0.5903, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2390341758728027, + "rewards/margins": 7.730524063110352, + "rewards/rejected": -4.491489410400391, + "step": 10618 + }, + { + "epoch": 2.66, + "grad_norm": 5.691586017608643, + "learning_rate": 4.508374359870259e-06, + "logits/chosen": -0.45624345541000366, + "logits/rejected": -0.5254852771759033, + "logps/chosen": -57.820045471191406, + "logps/rejected": -95.17900085449219, + "loss": 0.7512, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.172081232070923, + "rewards/margins": 6.200173854827881, + "rewards/rejected": -3.028092384338379, + "step": 10619 + }, + { + "epoch": 2.66, + "grad_norm": 7.144211769104004, + "learning_rate": 4.507592186919108e-06, + "logits/chosen": -0.4670025110244751, + "logits/rejected": -0.5750027298927307, + "logps/chosen": -57.80360412597656, + "logps/rejected": -102.30921173095703, + "loss": 0.6882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4268412590026855, + "rewards/margins": 7.180034160614014, + "rewards/rejected": -3.753192901611328, + "step": 10620 + }, + { + "epoch": 2.66, + "grad_norm": 3.6358439922332764, + "learning_rate": 4.506810026135879e-06, + "logits/chosen": -0.5434926152229309, + "logits/rejected": -0.6365634202957153, + "logps/chosen": -65.75933074951172, + "logps/rejected": -100.95280456542969, + "loss": 0.785, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9677278995513916, + "rewards/margins": 5.782286643981934, + "rewards/rejected": -2.814558506011963, + "step": 10621 + }, + { + "epoch": 2.66, + "grad_norm": 4.953577041625977, + "learning_rate": 4.506027877539898e-06, + "logits/chosen": -0.517241358757019, + "logits/rejected": -0.5535188913345337, + "logps/chosen": -58.75283432006836, + "logps/rejected": -96.98713684082031, + "loss": 0.6867, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9306023120880127, + "rewards/margins": 5.005947589874268, + "rewards/rejected": -2.075345993041992, + "step": 10622 + }, + { + "epoch": 2.66, + "grad_norm": 3.041161060333252, + "learning_rate": 4.505245741150493e-06, + "logits/chosen": -0.5119720697402954, + "logits/rejected": -0.614919900894165, + "logps/chosen": -55.816734313964844, + "logps/rejected": -101.53816223144531, + "loss": 0.5817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2876923084259033, + "rewards/margins": 6.10842227935791, + "rewards/rejected": -2.820730209350586, + "step": 10623 + }, + { + "epoch": 2.66, + "grad_norm": 3.7727179527282715, + "learning_rate": 4.504463616986993e-06, + "logits/chosen": -0.39654645323753357, + "logits/rejected": -0.4986300468444824, + "logps/chosen": -51.714630126953125, + "logps/rejected": -92.84917449951172, + "loss": 0.705, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3563742637634277, + "rewards/margins": 6.004023551940918, + "rewards/rejected": -2.6476492881774902, + "step": 10624 + }, + { + "epoch": 2.66, + "grad_norm": 4.556802749633789, + "learning_rate": 4.503681505068724e-06, + "logits/chosen": -0.4524122476577759, + "logits/rejected": -0.5175867676734924, + "logps/chosen": -73.17389678955078, + "logps/rejected": -100.30121612548828, + "loss": 0.746, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0634660720825195, + "rewards/margins": 5.07949686050415, + "rewards/rejected": -2.0160303115844727, + "step": 10625 + }, + { + "epoch": 2.66, + "grad_norm": 7.049708843231201, + "learning_rate": 4.502899405415012e-06, + "logits/chosen": -0.5337925553321838, + "logits/rejected": -0.6241784691810608, + "logps/chosen": -50.747169494628906, + "logps/rejected": -91.50275421142578, + "loss": 0.6607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1626839637756348, + "rewards/margins": 6.677077770233154, + "rewards/rejected": -3.5143935680389404, + "step": 10626 + }, + { + "epoch": 2.66, + "grad_norm": 4.777403354644775, + "learning_rate": 4.502117318045184e-06, + "logits/chosen": -0.3951460123062134, + "logits/rejected": -0.4748821556568146, + "logps/chosen": -59.63282775878906, + "logps/rejected": -92.51237487792969, + "loss": 0.6554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.266873359680176, + "rewards/margins": 5.943120956420898, + "rewards/rejected": -2.6762473583221436, + "step": 10627 + }, + { + "epoch": 2.66, + "grad_norm": 5.227560997009277, + "learning_rate": 4.50133524297857e-06, + "logits/chosen": -0.4822675585746765, + "logits/rejected": -0.5441854596138, + "logps/chosen": -55.503631591796875, + "logps/rejected": -94.2015380859375, + "loss": 0.7622, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2190451622009277, + "rewards/margins": 5.649246692657471, + "rewards/rejected": -2.430201530456543, + "step": 10628 + }, + { + "epoch": 2.66, + "grad_norm": 3.3356125354766846, + "learning_rate": 4.500553180234488e-06, + "logits/chosen": -0.5107567310333252, + "logits/rejected": -0.6329865455627441, + "logps/chosen": -58.47466278076172, + "logps/rejected": -85.38346099853516, + "loss": 0.5993, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3877639770507812, + "rewards/margins": 6.215558052062988, + "rewards/rejected": -2.827794313430786, + "step": 10629 + }, + { + "epoch": 2.66, + "grad_norm": 6.327144145965576, + "learning_rate": 4.49977112983227e-06, + "logits/chosen": -0.4390278458595276, + "logits/rejected": -0.5277676582336426, + "logps/chosen": -56.399051666259766, + "logps/rejected": -88.03340911865234, + "loss": 0.6799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2439441680908203, + "rewards/margins": 5.170761585235596, + "rewards/rejected": -1.9268176555633545, + "step": 10630 + }, + { + "epoch": 2.66, + "grad_norm": 3.3876020908355713, + "learning_rate": 4.498989091791241e-06, + "logits/chosen": -0.5347182154655457, + "logits/rejected": -0.6221423745155334, + "logps/chosen": -40.15262985229492, + "logps/rejected": -103.7866439819336, + "loss": 0.5123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.250359058380127, + "rewards/margins": 7.0220136642456055, + "rewards/rejected": -3.7716548442840576, + "step": 10631 + }, + { + "epoch": 2.66, + "grad_norm": 5.4295525550842285, + "learning_rate": 4.498207066130722e-06, + "logits/chosen": -0.459266722202301, + "logits/rejected": -0.5708909034729004, + "logps/chosen": -57.639251708984375, + "logps/rejected": -105.06411743164062, + "loss": 0.6405, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.465346336364746, + "rewards/margins": 7.634509086608887, + "rewards/rejected": -4.169162750244141, + "step": 10632 + }, + { + "epoch": 2.66, + "grad_norm": 5.938688278198242, + "learning_rate": 4.497425052870042e-06, + "logits/chosen": -0.4493201971054077, + "logits/rejected": -0.5754234790802002, + "logps/chosen": -61.14037322998047, + "logps/rejected": -84.81416320800781, + "loss": 0.7721, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1316916942596436, + "rewards/margins": 5.030659198760986, + "rewards/rejected": -1.8989677429199219, + "step": 10633 + }, + { + "epoch": 2.66, + "grad_norm": 7.294982433319092, + "learning_rate": 4.496643052028522e-06, + "logits/chosen": -0.4074538052082062, + "logits/rejected": -0.5285423398017883, + "logps/chosen": -57.13964080810547, + "logps/rejected": -103.88346862792969, + "loss": 0.643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9503591060638428, + "rewards/margins": 6.814574718475342, + "rewards/rejected": -3.8642160892486572, + "step": 10634 + }, + { + "epoch": 2.66, + "grad_norm": 5.8397698402404785, + "learning_rate": 4.495861063625489e-06, + "logits/chosen": -0.524448812007904, + "logits/rejected": -0.5696876049041748, + "logps/chosen": -56.60566711425781, + "logps/rejected": -92.30834197998047, + "loss": 0.6828, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0495669841766357, + "rewards/margins": 6.3319478034973145, + "rewards/rejected": -3.2823808193206787, + "step": 10635 + }, + { + "epoch": 2.66, + "grad_norm": 6.97679328918457, + "learning_rate": 4.495079087680265e-06, + "logits/chosen": -0.4782337546348572, + "logits/rejected": -0.5620056986808777, + "logps/chosen": -56.207252502441406, + "logps/rejected": -99.59800720214844, + "loss": 0.5806, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1823604106903076, + "rewards/margins": 6.860122203826904, + "rewards/rejected": -3.6777610778808594, + "step": 10636 + }, + { + "epoch": 2.66, + "grad_norm": 3.1815900802612305, + "learning_rate": 4.494297124212174e-06, + "logits/chosen": -0.4988917112350464, + "logits/rejected": -0.5562529563903809, + "logps/chosen": -58.98988342285156, + "logps/rejected": -103.98312377929688, + "loss": 0.6747, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.715064525604248, + "rewards/margins": 6.5066118240356445, + "rewards/rejected": -3.7915472984313965, + "step": 10637 + }, + { + "epoch": 2.66, + "grad_norm": 5.473250389099121, + "learning_rate": 4.493515173240537e-06, + "logits/chosen": -0.4317586421966553, + "logits/rejected": -0.48325759172439575, + "logps/chosen": -54.005401611328125, + "logps/rejected": -94.82991027832031, + "loss": 0.6132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019827127456665, + "rewards/margins": 5.291507244110107, + "rewards/rejected": -2.2716808319091797, + "step": 10638 + }, + { + "epoch": 2.66, + "grad_norm": 2.858815908432007, + "learning_rate": 4.492733234784682e-06, + "logits/chosen": -0.5512633323669434, + "logits/rejected": -0.6116360425949097, + "logps/chosen": -43.90351486206055, + "logps/rejected": -97.41324615478516, + "loss": 0.515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8513479232788086, + "rewards/margins": 6.082991123199463, + "rewards/rejected": -3.231642961502075, + "step": 10639 + }, + { + "epoch": 2.66, + "grad_norm": 5.327878475189209, + "learning_rate": 4.491951308863926e-06, + "logits/chosen": -0.4750462770462036, + "logits/rejected": -0.5957152247428894, + "logps/chosen": -60.67100143432617, + "logps/rejected": -87.22897338867188, + "loss": 0.6922, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0032083988189697, + "rewards/margins": 5.965048313140869, + "rewards/rejected": -2.9618396759033203, + "step": 10640 + }, + { + "epoch": 2.66, + "grad_norm": 4.448053359985352, + "learning_rate": 4.491169395497593e-06, + "logits/chosen": -0.4622977077960968, + "logits/rejected": -0.530825138092041, + "logps/chosen": -65.54243469238281, + "logps/rejected": -91.17558288574219, + "loss": 0.7336, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.220369815826416, + "rewards/margins": 5.891612529754639, + "rewards/rejected": -2.6712427139282227, + "step": 10641 + }, + { + "epoch": 2.66, + "grad_norm": 4.621025562286377, + "learning_rate": 4.490387494705008e-06, + "logits/chosen": -0.524683952331543, + "logits/rejected": -0.6307456493377686, + "logps/chosen": -49.75439453125, + "logps/rejected": -81.671630859375, + "loss": 0.6255, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0531961917877197, + "rewards/margins": 5.306792259216309, + "rewards/rejected": -2.253596544265747, + "step": 10642 + }, + { + "epoch": 2.66, + "grad_norm": 7.050386905670166, + "learning_rate": 4.4896056065054885e-06, + "logits/chosen": -0.48537471890449524, + "logits/rejected": -0.5344082713127136, + "logps/chosen": -56.15798568725586, + "logps/rejected": -91.12051391601562, + "loss": 0.6565, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.08253812789917, + "rewards/margins": 4.767279624938965, + "rewards/rejected": -1.6847418546676636, + "step": 10643 + }, + { + "epoch": 2.66, + "grad_norm": 5.64715576171875, + "learning_rate": 4.488823730918357e-06, + "logits/chosen": -0.5432158708572388, + "logits/rejected": -0.6144170761108398, + "logps/chosen": -59.88348388671875, + "logps/rejected": -96.22364807128906, + "loss": 0.8618, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1647818088531494, + "rewards/margins": 5.7539849281311035, + "rewards/rejected": -2.589203357696533, + "step": 10644 + }, + { + "epoch": 2.66, + "grad_norm": 2.861435651779175, + "learning_rate": 4.488041867962935e-06, + "logits/chosen": -0.513931155204773, + "logits/rejected": -0.6044875383377075, + "logps/chosen": -48.72685241699219, + "logps/rejected": -83.32601165771484, + "loss": 0.6086, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0303120613098145, + "rewards/margins": 6.0624260902404785, + "rewards/rejected": -3.032114028930664, + "step": 10645 + }, + { + "epoch": 2.66, + "grad_norm": 2.739778757095337, + "learning_rate": 4.487260017658544e-06, + "logits/chosen": -0.4503275156021118, + "logits/rejected": -0.5614018440246582, + "logps/chosen": -57.508079528808594, + "logps/rejected": -96.11175537109375, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2519800662994385, + "rewards/margins": 5.705233097076416, + "rewards/rejected": -2.4532532691955566, + "step": 10646 + }, + { + "epoch": 2.66, + "grad_norm": 3.358829975128174, + "learning_rate": 4.486478180024502e-06, + "logits/chosen": -0.4268765449523926, + "logits/rejected": -0.5186798572540283, + "logps/chosen": -63.33186340332031, + "logps/rejected": -98.24077606201172, + "loss": 0.6444, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3537588119506836, + "rewards/margins": 7.118010997772217, + "rewards/rejected": -3.764252185821533, + "step": 10647 + }, + { + "epoch": 2.66, + "grad_norm": 6.072933197021484, + "learning_rate": 4.4856963550801315e-06, + "logits/chosen": -0.49123266339302063, + "logits/rejected": -0.5563334226608276, + "logps/chosen": -55.018131256103516, + "logps/rejected": -100.97714233398438, + "loss": 0.7192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9693069458007812, + "rewards/margins": 5.989381790161133, + "rewards/rejected": -3.0200750827789307, + "step": 10648 + }, + { + "epoch": 2.66, + "grad_norm": 3.6761410236358643, + "learning_rate": 4.484914542844751e-06, + "logits/chosen": -0.5232580900192261, + "logits/rejected": -0.6164005398750305, + "logps/chosen": -44.61604690551758, + "logps/rejected": -88.35956573486328, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.060448169708252, + "rewards/margins": 6.493474006652832, + "rewards/rejected": -3.4330263137817383, + "step": 10649 + }, + { + "epoch": 2.66, + "grad_norm": 4.501733303070068, + "learning_rate": 4.48413274333768e-06, + "logits/chosen": -0.5564794540405273, + "logits/rejected": -0.6058194041252136, + "logps/chosen": -38.968421936035156, + "logps/rejected": -82.79769134521484, + "loss": 0.5758, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.284605026245117, + "rewards/margins": 5.081049919128418, + "rewards/rejected": -1.796445369720459, + "step": 10650 + }, + { + "epoch": 2.66, + "grad_norm": 5.427728176116943, + "learning_rate": 4.483350956578236e-06, + "logits/chosen": -0.4629290699958801, + "logits/rejected": -0.5830560326576233, + "logps/chosen": -61.07133102416992, + "logps/rejected": -98.43728637695312, + "loss": 0.6806, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9704649448394775, + "rewards/margins": 6.17554235458374, + "rewards/rejected": -3.2050766944885254, + "step": 10651 + }, + { + "epoch": 2.66, + "grad_norm": 2.9486935138702393, + "learning_rate": 4.4825691825857405e-06, + "logits/chosen": -0.5517049431800842, + "logits/rejected": -0.6598438620567322, + "logps/chosen": -59.76000213623047, + "logps/rejected": -97.79533386230469, + "loss": 0.6119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8500869274139404, + "rewards/margins": 6.2280755043029785, + "rewards/rejected": -3.377988338470459, + "step": 10652 + }, + { + "epoch": 2.67, + "grad_norm": 6.919676780700684, + "learning_rate": 4.481787421379512e-06, + "logits/chosen": -0.43693187832832336, + "logits/rejected": -0.49683719873428345, + "logps/chosen": -57.57765197753906, + "logps/rejected": -100.33058166503906, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.022096633911133, + "rewards/margins": 5.55396032333374, + "rewards/rejected": -2.5318636894226074, + "step": 10653 + }, + { + "epoch": 2.67, + "grad_norm": 7.9199538230896, + "learning_rate": 4.481005672978866e-06, + "logits/chosen": -0.4567387104034424, + "logits/rejected": -0.5266662240028381, + "logps/chosen": -56.43901443481445, + "logps/rejected": -84.11189270019531, + "loss": 0.6892, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1495349407196045, + "rewards/margins": 5.137810707092285, + "rewards/rejected": -1.988275408744812, + "step": 10654 + }, + { + "epoch": 2.67, + "grad_norm": 4.721612453460693, + "learning_rate": 4.480223937403122e-06, + "logits/chosen": -0.5340379476547241, + "logits/rejected": -0.5640928149223328, + "logps/chosen": -54.003482818603516, + "logps/rejected": -103.0046615600586, + "loss": 0.8094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0562949180603027, + "rewards/margins": 5.914614677429199, + "rewards/rejected": -2.858319044113159, + "step": 10655 + }, + { + "epoch": 2.67, + "grad_norm": 6.550652503967285, + "learning_rate": 4.4794422146716e-06, + "logits/chosen": -0.4748523533344269, + "logits/rejected": -0.5201504230499268, + "logps/chosen": -54.139041900634766, + "logps/rejected": -109.03685760498047, + "loss": 0.7504, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9446396827697754, + "rewards/margins": 6.166476249694824, + "rewards/rejected": -3.221837043762207, + "step": 10656 + }, + { + "epoch": 2.67, + "grad_norm": 3.8401854038238525, + "learning_rate": 4.478660504803611e-06, + "logits/chosen": -0.5350345373153687, + "logits/rejected": -0.6406258940696716, + "logps/chosen": -47.67643356323242, + "logps/rejected": -90.26307678222656, + "loss": 0.6263, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.13997220993042, + "rewards/margins": 6.85462760925293, + "rewards/rejected": -3.714655637741089, + "step": 10657 + }, + { + "epoch": 2.67, + "grad_norm": 9.593404769897461, + "learning_rate": 4.477878807818476e-06, + "logits/chosen": -0.47933781147003174, + "logits/rejected": -0.5524814128875732, + "logps/chosen": -61.394630432128906, + "logps/rejected": -95.47521209716797, + "loss": 0.8802, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.036919116973877, + "rewards/margins": 5.301615238189697, + "rewards/rejected": -2.2646961212158203, + "step": 10658 + }, + { + "epoch": 2.67, + "grad_norm": 9.12768268585205, + "learning_rate": 4.477097123735511e-06, + "logits/chosen": -0.49390172958374023, + "logits/rejected": -0.6105100512504578, + "logps/chosen": -60.03274154663086, + "logps/rejected": -120.53337097167969, + "loss": 0.6913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8564376831054688, + "rewards/margins": 6.534120559692383, + "rewards/rejected": -3.677682876586914, + "step": 10659 + }, + { + "epoch": 2.67, + "grad_norm": 4.773848056793213, + "learning_rate": 4.4763154525740355e-06, + "logits/chosen": -0.4836016297340393, + "logits/rejected": -0.6201629638671875, + "logps/chosen": -61.11546325683594, + "logps/rejected": -96.12876892089844, + "loss": 0.6372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1922030448913574, + "rewards/margins": 7.58194637298584, + "rewards/rejected": -4.389743804931641, + "step": 10660 + }, + { + "epoch": 2.67, + "grad_norm": 6.2403435707092285, + "learning_rate": 4.475533794353359e-06, + "logits/chosen": -0.4990782141685486, + "logits/rejected": -0.5758692622184753, + "logps/chosen": -63.351646423339844, + "logps/rejected": -101.21087646484375, + "loss": 0.8016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.865647554397583, + "rewards/margins": 5.968274116516113, + "rewards/rejected": -3.102626323699951, + "step": 10661 + }, + { + "epoch": 2.67, + "grad_norm": 4.144500732421875, + "learning_rate": 4.4747521490928e-06, + "logits/chosen": -0.506176769733429, + "logits/rejected": -0.579293429851532, + "logps/chosen": -55.73646545410156, + "logps/rejected": -97.32332611083984, + "loss": 0.6524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2901344299316406, + "rewards/margins": 6.532740592956543, + "rewards/rejected": -3.242605447769165, + "step": 10662 + }, + { + "epoch": 2.67, + "grad_norm": 7.096433162689209, + "learning_rate": 4.4739705168116745e-06, + "logits/chosen": -0.5251536965370178, + "logits/rejected": -0.5565524101257324, + "logps/chosen": -51.33557891845703, + "logps/rejected": -102.57675170898438, + "loss": 0.6382, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9601011276245117, + "rewards/margins": 6.208963394165039, + "rewards/rejected": -3.2488620281219482, + "step": 10663 + }, + { + "epoch": 2.67, + "grad_norm": 7.758874893188477, + "learning_rate": 4.473188897529299e-06, + "logits/chosen": -0.5466614365577698, + "logits/rejected": -0.5783034563064575, + "logps/chosen": -47.290069580078125, + "logps/rejected": -86.38934326171875, + "loss": 0.7346, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.67147159576416, + "rewards/margins": 4.669918060302734, + "rewards/rejected": -1.9984461069107056, + "step": 10664 + }, + { + "epoch": 2.67, + "grad_norm": 5.356542587280273, + "learning_rate": 4.472407291264984e-06, + "logits/chosen": -0.5124698877334595, + "logits/rejected": -0.6152794361114502, + "logps/chosen": -58.99951171875, + "logps/rejected": -104.30365753173828, + "loss": 0.648, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.058657169342041, + "rewards/margins": 6.929616928100586, + "rewards/rejected": -3.8709588050842285, + "step": 10665 + }, + { + "epoch": 2.67, + "grad_norm": 21.63722038269043, + "learning_rate": 4.4716256980380465e-06, + "logits/chosen": -0.5683763027191162, + "logits/rejected": -0.6605522632598877, + "logps/chosen": -56.989341735839844, + "logps/rejected": -106.64151000976562, + "loss": 0.7156, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1714084148406982, + "rewards/margins": 7.371285915374756, + "rewards/rejected": -4.199878215789795, + "step": 10666 + }, + { + "epoch": 2.67, + "grad_norm": 4.906063556671143, + "learning_rate": 4.470844117867802e-06, + "logits/chosen": -0.5447546243667603, + "logits/rejected": -0.6543691754341125, + "logps/chosen": -58.42594528198242, + "logps/rejected": -89.08180236816406, + "loss": 0.6097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295790672302246, + "rewards/margins": 6.816580295562744, + "rewards/rejected": -3.520789623260498, + "step": 10667 + }, + { + "epoch": 2.67, + "grad_norm": 8.871281623840332, + "learning_rate": 4.470062550773559e-06, + "logits/chosen": -0.49931374192237854, + "logits/rejected": -0.5784716606140137, + "logps/chosen": -50.3828239440918, + "logps/rejected": -94.33243560791016, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.695533037185669, + "rewards/margins": 6.9783854484558105, + "rewards/rejected": -4.2828521728515625, + "step": 10668 + }, + { + "epoch": 2.67, + "grad_norm": 9.7298583984375, + "learning_rate": 4.469280996774636e-06, + "logits/chosen": -0.4359039068222046, + "logits/rejected": -0.5108054876327515, + "logps/chosen": -71.78842163085938, + "logps/rejected": -85.67247772216797, + "loss": 0.8307, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.126474380493164, + "rewards/margins": 5.399014949798584, + "rewards/rejected": -2.2725412845611572, + "step": 10669 + }, + { + "epoch": 2.67, + "grad_norm": 6.453808307647705, + "learning_rate": 4.4684994558903425e-06, + "logits/chosen": -0.5056049823760986, + "logits/rejected": -0.5826295614242554, + "logps/chosen": -47.421142578125, + "logps/rejected": -98.42141723632812, + "loss": 0.6785, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6450986862182617, + "rewards/margins": 5.884200096130371, + "rewards/rejected": -3.2391014099121094, + "step": 10670 + }, + { + "epoch": 2.67, + "grad_norm": 14.66735553741455, + "learning_rate": 4.467717928139995e-06, + "logits/chosen": -0.45041418075561523, + "logits/rejected": -0.559364378452301, + "logps/chosen": -74.5078125, + "logps/rejected": -97.94913482666016, + "loss": 0.968, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.57707142829895, + "rewards/margins": 5.313647270202637, + "rewards/rejected": -2.7365760803222656, + "step": 10671 + }, + { + "epoch": 2.67, + "grad_norm": 9.5716552734375, + "learning_rate": 4.466936413542902e-06, + "logits/chosen": -0.5134513974189758, + "logits/rejected": -0.6146246790885925, + "logps/chosen": -45.476619720458984, + "logps/rejected": -101.66436004638672, + "loss": 0.6145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9513585567474365, + "rewards/margins": 7.151966094970703, + "rewards/rejected": -4.2006072998046875, + "step": 10672 + }, + { + "epoch": 2.67, + "grad_norm": 29.414470672607422, + "learning_rate": 4.4661549121183775e-06, + "logits/chosen": -0.4563177227973938, + "logits/rejected": -0.5374330282211304, + "logps/chosen": -62.07592010498047, + "logps/rejected": -103.37718963623047, + "loss": 0.7778, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1126511096954346, + "rewards/margins": 6.4808807373046875, + "rewards/rejected": -3.3682303428649902, + "step": 10673 + }, + { + "epoch": 2.67, + "grad_norm": 6.64208984375, + "learning_rate": 4.465373423885735e-06, + "logits/chosen": -0.49621516466140747, + "logits/rejected": -0.5477389693260193, + "logps/chosen": -47.78679656982422, + "logps/rejected": -110.18421936035156, + "loss": 0.5918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.227564573287964, + "rewards/margins": 7.497950553894043, + "rewards/rejected": -4.270385265350342, + "step": 10674 + }, + { + "epoch": 2.67, + "grad_norm": 9.671412467956543, + "learning_rate": 4.464591948864281e-06, + "logits/chosen": -0.5686596632003784, + "logits/rejected": -0.6503444314002991, + "logps/chosen": -59.95454025268555, + "logps/rejected": -83.442626953125, + "loss": 0.8491, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7365915775299072, + "rewards/margins": 6.084744930267334, + "rewards/rejected": -3.348154067993164, + "step": 10675 + }, + { + "epoch": 2.67, + "grad_norm": 8.864327430725098, + "learning_rate": 4.463810487073329e-06, + "logits/chosen": -0.48512518405914307, + "logits/rejected": -0.5876352787017822, + "logps/chosen": -60.859004974365234, + "logps/rejected": -91.31371307373047, + "loss": 0.7839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.660364866256714, + "rewards/margins": 6.815473556518555, + "rewards/rejected": -4.15510892868042, + "step": 10676 + }, + { + "epoch": 2.67, + "grad_norm": 20.737503051757812, + "learning_rate": 4.4630290385321925e-06, + "logits/chosen": -0.47262096405029297, + "logits/rejected": -0.5270082950592041, + "logps/chosen": -51.83845520019531, + "logps/rejected": -110.5013427734375, + "loss": 0.7717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.437474250793457, + "rewards/margins": 6.168209075927734, + "rewards/rejected": -3.730734348297119, + "step": 10677 + }, + { + "epoch": 2.67, + "grad_norm": 2.7448201179504395, + "learning_rate": 4.462247603260179e-06, + "logits/chosen": -0.5282960534095764, + "logits/rejected": -0.5744855999946594, + "logps/chosen": -50.24110794067383, + "logps/rejected": -96.37565612792969, + "loss": 0.6077, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.350808620452881, + "rewards/margins": 6.615488052368164, + "rewards/rejected": -3.264679193496704, + "step": 10678 + }, + { + "epoch": 2.67, + "grad_norm": 13.554720878601074, + "learning_rate": 4.461466181276599e-06, + "logits/chosen": -0.4626608192920685, + "logits/rejected": -0.5615793466567993, + "logps/chosen": -61.69206237792969, + "logps/rejected": -97.4647216796875, + "loss": 0.8203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0437545776367188, + "rewards/margins": 4.867588043212891, + "rewards/rejected": -1.8238338232040405, + "step": 10679 + }, + { + "epoch": 2.67, + "grad_norm": 5.423125267028809, + "learning_rate": 4.460684772600762e-06, + "logits/chosen": -0.5236948728561401, + "logits/rejected": -0.6180038452148438, + "logps/chosen": -58.463783264160156, + "logps/rejected": -88.6475830078125, + "loss": 0.6542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5674710273742676, + "rewards/margins": 5.215449333190918, + "rewards/rejected": -2.6479787826538086, + "step": 10680 + }, + { + "epoch": 2.67, + "grad_norm": 12.460237503051758, + "learning_rate": 4.45990337725198e-06, + "logits/chosen": -0.4253773093223572, + "logits/rejected": -0.5198321342468262, + "logps/chosen": -65.72901153564453, + "logps/rejected": -105.62212371826172, + "loss": 0.7517, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2938108444213867, + "rewards/margins": 6.3969197273254395, + "rewards/rejected": -3.1031086444854736, + "step": 10681 + }, + { + "epoch": 2.67, + "grad_norm": 10.226719856262207, + "learning_rate": 4.4591219952495576e-06, + "logits/chosen": -0.5267311334609985, + "logits/rejected": -0.5801485776901245, + "logps/chosen": -54.423553466796875, + "logps/rejected": -91.54953002929688, + "loss": 0.7904, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.811350107192993, + "rewards/margins": 6.152048587799072, + "rewards/rejected": -3.340698719024658, + "step": 10682 + }, + { + "epoch": 2.67, + "grad_norm": 12.825657844543457, + "learning_rate": 4.458340626612806e-06, + "logits/chosen": -0.49872902035713196, + "logits/rejected": -0.5685654282569885, + "logps/chosen": -57.31599426269531, + "logps/rejected": -100.05848693847656, + "loss": 0.7439, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8567254543304443, + "rewards/margins": 6.084288597106934, + "rewards/rejected": -3.227562427520752, + "step": 10683 + }, + { + "epoch": 2.67, + "grad_norm": 8.352049827575684, + "learning_rate": 4.457559271361033e-06, + "logits/chosen": -0.4556436240673065, + "logits/rejected": -0.5696742534637451, + "logps/chosen": -64.67155456542969, + "logps/rejected": -76.805908203125, + "loss": 0.753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0819058418273926, + "rewards/margins": 5.427720069885254, + "rewards/rejected": -2.3458144664764404, + "step": 10684 + }, + { + "epoch": 2.67, + "grad_norm": 32.658119201660156, + "learning_rate": 4.456777929513551e-06, + "logits/chosen": -0.4040527939796448, + "logits/rejected": -0.46339333057403564, + "logps/chosen": -58.23297119140625, + "logps/rejected": -92.31468200683594, + "loss": 0.857, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5536699295043945, + "rewards/margins": 5.839871883392334, + "rewards/rejected": -3.2862021923065186, + "step": 10685 + }, + { + "epoch": 2.67, + "grad_norm": 8.842137336730957, + "learning_rate": 4.455996601089662e-06, + "logits/chosen": -0.4708973169326782, + "logits/rejected": -0.5718859434127808, + "logps/chosen": -55.4109001159668, + "logps/rejected": -89.75845336914062, + "loss": 0.6103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9458441734313965, + "rewards/margins": 6.568480491638184, + "rewards/rejected": -3.622636079788208, + "step": 10686 + }, + { + "epoch": 2.67, + "grad_norm": 5.2724528312683105, + "learning_rate": 4.455215286108674e-06, + "logits/chosen": -0.49237921833992004, + "logits/rejected": -0.5500696301460266, + "logps/chosen": -58.812164306640625, + "logps/rejected": -98.4420166015625, + "loss": 0.676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2544400691986084, + "rewards/margins": 5.907217025756836, + "rewards/rejected": -2.6527771949768066, + "step": 10687 + }, + { + "epoch": 2.67, + "grad_norm": 8.274125099182129, + "learning_rate": 4.454433984589897e-06, + "logits/chosen": -0.4514926075935364, + "logits/rejected": -0.5781229734420776, + "logps/chosen": -65.47071838378906, + "logps/rejected": -85.09320068359375, + "loss": 0.6708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.709420680999756, + "rewards/margins": 6.12069034576416, + "rewards/rejected": -3.4112701416015625, + "step": 10688 + }, + { + "epoch": 2.67, + "grad_norm": 8.307044982910156, + "learning_rate": 4.453652696552638e-06, + "logits/chosen": -0.48200905323028564, + "logits/rejected": -0.5522934198379517, + "logps/chosen": -56.50376892089844, + "logps/rejected": -79.76762390136719, + "loss": 0.7539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.705895185470581, + "rewards/margins": 4.3607306480407715, + "rewards/rejected": -1.6548352241516113, + "step": 10689 + }, + { + "epoch": 2.67, + "grad_norm": 3.443751811981201, + "learning_rate": 4.4528714220162e-06, + "logits/chosen": -0.4695207476615906, + "logits/rejected": -0.5528976917266846, + "logps/chosen": -57.021446228027344, + "logps/rejected": -101.40364074707031, + "loss": 0.6404, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1450726985931396, + "rewards/margins": 6.80799674987793, + "rewards/rejected": -3.662923812866211, + "step": 10690 + }, + { + "epoch": 2.67, + "grad_norm": 4.912691593170166, + "learning_rate": 4.452090160999892e-06, + "logits/chosen": -0.5375760197639465, + "logits/rejected": -0.5828026533126831, + "logps/chosen": -50.40125274658203, + "logps/rejected": -89.65497589111328, + "loss": 0.6397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.834641456604004, + "rewards/margins": 4.517985820770264, + "rewards/rejected": -1.6833447217941284, + "step": 10691 + }, + { + "epoch": 2.67, + "grad_norm": 3.513117790222168, + "learning_rate": 4.45130891352302e-06, + "logits/chosen": -0.4829096794128418, + "logits/rejected": -0.5561105608940125, + "logps/chosen": -59.24333953857422, + "logps/rejected": -105.56160736083984, + "loss": 0.5999, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25044322013855, + "rewards/margins": 6.250402450561523, + "rewards/rejected": -2.9999592304229736, + "step": 10692 + }, + { + "epoch": 2.68, + "grad_norm": 3.2482097148895264, + "learning_rate": 4.450527679604887e-06, + "logits/chosen": -0.5730711817741394, + "logits/rejected": -0.6628411412239075, + "logps/chosen": -48.703834533691406, + "logps/rejected": -81.60881042480469, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7890877723693848, + "rewards/margins": 5.29078483581543, + "rewards/rejected": -2.501697301864624, + "step": 10693 + }, + { + "epoch": 2.68, + "grad_norm": 3.626178503036499, + "learning_rate": 4.449746459264799e-06, + "logits/chosen": -0.5520534515380859, + "logits/rejected": -0.6208717823028564, + "logps/chosen": -45.16899871826172, + "logps/rejected": -100.53179931640625, + "loss": 0.6315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9595274925231934, + "rewards/margins": 6.740971088409424, + "rewards/rejected": -3.7814435958862305, + "step": 10694 + }, + { + "epoch": 2.68, + "grad_norm": 3.5522778034210205, + "learning_rate": 4.448965252522062e-06, + "logits/chosen": -0.542949378490448, + "logits/rejected": -0.6564422845840454, + "logps/chosen": -51.69873809814453, + "logps/rejected": -90.0177001953125, + "loss": 0.6914, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.927777051925659, + "rewards/margins": 6.166147232055664, + "rewards/rejected": -3.238370180130005, + "step": 10695 + }, + { + "epoch": 2.68, + "grad_norm": 3.42101788520813, + "learning_rate": 4.44818405939598e-06, + "logits/chosen": -0.5065771341323853, + "logits/rejected": -0.5749718546867371, + "logps/chosen": -64.27530670166016, + "logps/rejected": -94.90777587890625, + "loss": 0.6681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8757402896881104, + "rewards/margins": 6.581779479980469, + "rewards/rejected": -3.7060389518737793, + "step": 10696 + }, + { + "epoch": 2.68, + "grad_norm": 4.2465715408325195, + "learning_rate": 4.447402879905857e-06, + "logits/chosen": -0.4022294282913208, + "logits/rejected": -0.5352710485458374, + "logps/chosen": -59.503814697265625, + "logps/rejected": -85.14418029785156, + "loss": 0.6551, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8760900497436523, + "rewards/margins": 6.580437660217285, + "rewards/rejected": -3.704347610473633, + "step": 10697 + }, + { + "epoch": 2.68, + "grad_norm": 2.5190348625183105, + "learning_rate": 4.446621714070995e-06, + "logits/chosen": -0.425433874130249, + "logits/rejected": -0.5201847553253174, + "logps/chosen": -46.73722839355469, + "logps/rejected": -115.11207580566406, + "loss": 0.5159, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2951273918151855, + "rewards/margins": 7.884126663208008, + "rewards/rejected": -4.588998794555664, + "step": 10698 + }, + { + "epoch": 2.68, + "grad_norm": 7.195003986358643, + "learning_rate": 4.445840561910702e-06, + "logits/chosen": -0.49696090817451477, + "logits/rejected": -0.6266400218009949, + "logps/chosen": -51.49306869506836, + "logps/rejected": -91.08460235595703, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.097841262817383, + "rewards/margins": 6.0689263343811035, + "rewards/rejected": -2.9710850715637207, + "step": 10699 + }, + { + "epoch": 2.68, + "grad_norm": 4.162420272827148, + "learning_rate": 4.4450594234442754e-06, + "logits/chosen": -0.5474416613578796, + "logits/rejected": -0.6402460336685181, + "logps/chosen": -58.453548431396484, + "logps/rejected": -108.43107604980469, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9815444946289062, + "rewards/margins": 6.619317054748535, + "rewards/rejected": -3.63777232170105, + "step": 10700 + }, + { + "epoch": 2.68, + "grad_norm": 7.709001541137695, + "learning_rate": 4.44427829869102e-06, + "logits/chosen": -0.5726541876792908, + "logits/rejected": -0.6836606860160828, + "logps/chosen": -55.88957977294922, + "logps/rejected": -98.52726745605469, + "loss": 0.6323, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0405569076538086, + "rewards/margins": 6.981597423553467, + "rewards/rejected": -3.941040515899658, + "step": 10701 + }, + { + "epoch": 2.68, + "grad_norm": 2.996298313140869, + "learning_rate": 4.44349718767024e-06, + "logits/chosen": -0.48021602630615234, + "logits/rejected": -0.5801606178283691, + "logps/chosen": -58.009849548339844, + "logps/rejected": -98.21424102783203, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1091976165771484, + "rewards/margins": 5.493055820465088, + "rewards/rejected": -2.3838582038879395, + "step": 10702 + }, + { + "epoch": 2.68, + "grad_norm": 6.49738073348999, + "learning_rate": 4.442716090401239e-06, + "logits/chosen": -0.5033810138702393, + "logits/rejected": -0.57454514503479, + "logps/chosen": -60.54518127441406, + "logps/rejected": -106.27406311035156, + "loss": 0.7461, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.056582450866699, + "rewards/margins": 6.210787296295166, + "rewards/rejected": -3.1542046070098877, + "step": 10703 + }, + { + "epoch": 2.68, + "grad_norm": 13.36612606048584, + "learning_rate": 4.441935006903313e-06, + "logits/chosen": -0.532096266746521, + "logits/rejected": -0.5874069929122925, + "logps/chosen": -51.93951416015625, + "logps/rejected": -86.272705078125, + "loss": 0.8235, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.774723768234253, + "rewards/margins": 5.131091594696045, + "rewards/rejected": -2.35636830329895, + "step": 10704 + }, + { + "epoch": 2.68, + "grad_norm": 13.680954933166504, + "learning_rate": 4.441153937195767e-06, + "logits/chosen": -0.47012150287628174, + "logits/rejected": -0.5953769683837891, + "logps/chosen": -78.76683807373047, + "logps/rejected": -84.59022521972656, + "loss": 0.7705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0049068927764893, + "rewards/margins": 5.570079803466797, + "rewards/rejected": -2.5651729106903076, + "step": 10705 + }, + { + "epoch": 2.68, + "grad_norm": 4.102762222290039, + "learning_rate": 4.440372881297903e-06, + "logits/chosen": -0.5270721316337585, + "logits/rejected": -0.5856660008430481, + "logps/chosen": -62.22864532470703, + "logps/rejected": -94.22721862792969, + "loss": 0.7348, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.78399658203125, + "rewards/margins": 5.6338090896606445, + "rewards/rejected": -2.8498125076293945, + "step": 10706 + }, + { + "epoch": 2.68, + "grad_norm": 7.5826005935668945, + "learning_rate": 4.43959183922902e-06, + "logits/chosen": -0.4937746822834015, + "logits/rejected": -0.5821029543876648, + "logps/chosen": -57.743614196777344, + "logps/rejected": -96.21400451660156, + "loss": 0.7809, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9867849349975586, + "rewards/margins": 6.042561054229736, + "rewards/rejected": -3.0557761192321777, + "step": 10707 + }, + { + "epoch": 2.68, + "grad_norm": 7.945165157318115, + "learning_rate": 4.438810811008418e-06, + "logits/chosen": -0.5382393598556519, + "logits/rejected": -0.6457958817481995, + "logps/chosen": -57.722618103027344, + "logps/rejected": -91.46946716308594, + "loss": 0.7639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.894590377807617, + "rewards/margins": 5.976917266845703, + "rewards/rejected": -3.082326889038086, + "step": 10708 + }, + { + "epoch": 2.68, + "grad_norm": 4.419515132904053, + "learning_rate": 4.438029796655398e-06, + "logits/chosen": -0.48833706974983215, + "logits/rejected": -0.6080436110496521, + "logps/chosen": -50.5946044921875, + "logps/rejected": -89.23338317871094, + "loss": 0.547, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9458882808685303, + "rewards/margins": 5.7285308837890625, + "rewards/rejected": -2.7826426029205322, + "step": 10709 + }, + { + "epoch": 2.68, + "grad_norm": 5.225142002105713, + "learning_rate": 4.43724879618926e-06, + "logits/chosen": -0.41502445936203003, + "logits/rejected": -0.5275495052337646, + "logps/chosen": -59.325687408447266, + "logps/rejected": -95.14212799072266, + "loss": 0.7306, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6098732948303223, + "rewards/margins": 5.958948135375977, + "rewards/rejected": -3.3490753173828125, + "step": 10710 + }, + { + "epoch": 2.68, + "grad_norm": 8.583453178405762, + "learning_rate": 4.4364678096293025e-06, + "logits/chosen": -0.5196127891540527, + "logits/rejected": -0.6076562404632568, + "logps/chosen": -42.15191650390625, + "logps/rejected": -96.21907043457031, + "loss": 0.6782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.195246696472168, + "rewards/margins": 5.377872467041016, + "rewards/rejected": -2.1826257705688477, + "step": 10711 + }, + { + "epoch": 2.68, + "grad_norm": 13.1234769821167, + "learning_rate": 4.435686836994825e-06, + "logits/chosen": -0.454153448343277, + "logits/rejected": -0.5519839525222778, + "logps/chosen": -52.46111297607422, + "logps/rejected": -91.05384063720703, + "loss": 0.6771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.042417526245117, + "rewards/margins": 5.377751350402832, + "rewards/rejected": -2.3353335857391357, + "step": 10712 + }, + { + "epoch": 2.68, + "grad_norm": 4.7776055335998535, + "learning_rate": 4.434905878305123e-06, + "logits/chosen": -0.5517187118530273, + "logits/rejected": -0.625580906867981, + "logps/chosen": -57.91344451904297, + "logps/rejected": -84.45455169677734, + "loss": 0.5646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9594385623931885, + "rewards/margins": 5.826138019561768, + "rewards/rejected": -2.866699457168579, + "step": 10713 + }, + { + "epoch": 2.68, + "grad_norm": 10.316317558288574, + "learning_rate": 4.434124933579504e-06, + "logits/chosen": -0.44830429553985596, + "logits/rejected": -0.5032563805580139, + "logps/chosen": -74.92232513427734, + "logps/rejected": -104.91156005859375, + "loss": 0.8124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6234562397003174, + "rewards/margins": 4.674756050109863, + "rewards/rejected": -2.051300048828125, + "step": 10714 + }, + { + "epoch": 2.68, + "grad_norm": 9.476237297058105, + "learning_rate": 4.433344002837255e-06, + "logits/chosen": -0.46821945905685425, + "logits/rejected": -0.5454042553901672, + "logps/chosen": -46.38874435424805, + "logps/rejected": -79.6779556274414, + "loss": 0.8123, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7538914680480957, + "rewards/margins": 4.949148178100586, + "rewards/rejected": -2.1952569484710693, + "step": 10715 + }, + { + "epoch": 2.68, + "grad_norm": 6.809206962585449, + "learning_rate": 4.432563086097679e-06, + "logits/chosen": -0.48520126938819885, + "logits/rejected": -0.5568777322769165, + "logps/chosen": -63.18235778808594, + "logps/rejected": -101.13973999023438, + "loss": 0.6488, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0034215450286865, + "rewards/margins": 6.477728366851807, + "rewards/rejected": -3.474306344985962, + "step": 10716 + }, + { + "epoch": 2.68, + "grad_norm": 7.1526689529418945, + "learning_rate": 4.4317821833800755e-06, + "logits/chosen": -0.521435022354126, + "logits/rejected": -0.6073278188705444, + "logps/chosen": -55.739784240722656, + "logps/rejected": -84.32799530029297, + "loss": 0.6793, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7761783599853516, + "rewards/margins": 5.31029748916626, + "rewards/rejected": -2.5341198444366455, + "step": 10717 + }, + { + "epoch": 2.68, + "grad_norm": 11.955368995666504, + "learning_rate": 4.431001294703735e-06, + "logits/chosen": -0.4781796336174011, + "logits/rejected": -0.5744649767875671, + "logps/chosen": -53.491573333740234, + "logps/rejected": -101.34819793701172, + "loss": 0.6737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1337814331054688, + "rewards/margins": 7.743724346160889, + "rewards/rejected": -4.60994291305542, + "step": 10718 + }, + { + "epoch": 2.68, + "grad_norm": 7.442778587341309, + "learning_rate": 4.430220420087959e-06, + "logits/chosen": -0.5717213749885559, + "logits/rejected": -0.6367897987365723, + "logps/chosen": -46.94956588745117, + "logps/rejected": -102.40577697753906, + "loss": 0.6858, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6444718837738037, + "rewards/margins": 5.884833812713623, + "rewards/rejected": -3.2403616905212402, + "step": 10719 + }, + { + "epoch": 2.68, + "grad_norm": 3.679748058319092, + "learning_rate": 4.429439559552042e-06, + "logits/chosen": -0.5622700452804565, + "logits/rejected": -0.6485112309455872, + "logps/chosen": -53.829952239990234, + "logps/rejected": -110.96626281738281, + "loss": 0.6065, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.024397611618042, + "rewards/margins": 7.149105072021484, + "rewards/rejected": -4.124707221984863, + "step": 10720 + }, + { + "epoch": 2.68, + "grad_norm": 8.100614547729492, + "learning_rate": 4.428658713115282e-06, + "logits/chosen": -0.49885398149490356, + "logits/rejected": -0.5627530813217163, + "logps/chosen": -60.627838134765625, + "logps/rejected": -86.99090576171875, + "loss": 0.6694, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6466574668884277, + "rewards/margins": 5.321269989013672, + "rewards/rejected": -2.674612522125244, + "step": 10721 + }, + { + "epoch": 2.68, + "grad_norm": 2.573561906814575, + "learning_rate": 4.427877880796972e-06, + "logits/chosen": -0.534923255443573, + "logits/rejected": -0.6359275579452515, + "logps/chosen": -51.34015655517578, + "logps/rejected": -119.82249450683594, + "loss": 0.6125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9602596759796143, + "rewards/margins": 7.637731552124023, + "rewards/rejected": -4.677472114562988, + "step": 10722 + }, + { + "epoch": 2.68, + "grad_norm": 16.880348205566406, + "learning_rate": 4.427097062616408e-06, + "logits/chosen": -0.5354723930358887, + "logits/rejected": -0.6059349775314331, + "logps/chosen": -57.30253219604492, + "logps/rejected": -101.61673736572266, + "loss": 0.8489, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.455453634262085, + "rewards/margins": 5.256771087646484, + "rewards/rejected": -2.801316976547241, + "step": 10723 + }, + { + "epoch": 2.68, + "grad_norm": 3.9114084243774414, + "learning_rate": 4.4263162585928856e-06, + "logits/chosen": -0.4356151223182678, + "logits/rejected": -0.5998239517211914, + "logps/chosen": -59.713409423828125, + "logps/rejected": -91.42703247070312, + "loss": 0.6045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.022608757019043, + "rewards/margins": 6.977344989776611, + "rewards/rejected": -3.9547364711761475, + "step": 10724 + }, + { + "epoch": 2.68, + "grad_norm": 22.640729904174805, + "learning_rate": 4.425535468745698e-06, + "logits/chosen": -0.554657518863678, + "logits/rejected": -0.6738144159317017, + "logps/chosen": -60.45613098144531, + "logps/rejected": -121.5587158203125, + "loss": 0.7523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.697814702987671, + "rewards/margins": 7.168654441833496, + "rewards/rejected": -4.470839977264404, + "step": 10725 + }, + { + "epoch": 2.68, + "grad_norm": 8.812172889709473, + "learning_rate": 4.4247546930941395e-06, + "logits/chosen": -0.5015922784805298, + "logits/rejected": -0.5909358263015747, + "logps/chosen": -62.36707305908203, + "logps/rejected": -91.32807922363281, + "loss": 0.9375, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1059648990631104, + "rewards/margins": 4.838863849639893, + "rewards/rejected": -1.7328988313674927, + "step": 10726 + }, + { + "epoch": 2.68, + "grad_norm": 3.991201639175415, + "learning_rate": 4.423973931657504e-06, + "logits/chosen": -0.5397286415100098, + "logits/rejected": -0.6057631373405457, + "logps/chosen": -57.20553970336914, + "logps/rejected": -83.86944580078125, + "loss": 0.7521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4578306674957275, + "rewards/margins": 6.208358287811279, + "rewards/rejected": -2.750527858734131, + "step": 10727 + }, + { + "epoch": 2.68, + "grad_norm": 8.69676685333252, + "learning_rate": 4.423193184455089e-06, + "logits/chosen": -0.4467872083187103, + "logits/rejected": -0.49958235025405884, + "logps/chosen": -69.24451446533203, + "logps/rejected": -103.6748275756836, + "loss": 0.847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1146087646484375, + "rewards/margins": 4.816393852233887, + "rewards/rejected": -1.7017849683761597, + "step": 10728 + }, + { + "epoch": 2.68, + "grad_norm": 10.017906188964844, + "learning_rate": 4.422412451506181e-06, + "logits/chosen": -0.46730560064315796, + "logits/rejected": -0.5313816070556641, + "logps/chosen": -51.88452911376953, + "logps/rejected": -89.66920471191406, + "loss": 0.6959, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9939379692077637, + "rewards/margins": 4.66065788269043, + "rewards/rejected": -1.666719913482666, + "step": 10729 + }, + { + "epoch": 2.68, + "grad_norm": 2.939373254776001, + "learning_rate": 4.421631732830074e-06, + "logits/chosen": -0.47586789727211, + "logits/rejected": -0.5374020338058472, + "logps/chosen": -45.10580825805664, + "logps/rejected": -119.9271011352539, + "loss": 0.5468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.956800699234009, + "rewards/margins": 7.87001371383667, + "rewards/rejected": -4.913212776184082, + "step": 10730 + }, + { + "epoch": 2.68, + "grad_norm": 5.430081844329834, + "learning_rate": 4.420851028446066e-06, + "logits/chosen": -0.42599788308143616, + "logits/rejected": -0.482730507850647, + "logps/chosen": -52.61943435668945, + "logps/rejected": -90.21714782714844, + "loss": 0.6652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0792157649993896, + "rewards/margins": 5.689461708068848, + "rewards/rejected": -2.610246181488037, + "step": 10731 + }, + { + "epoch": 2.68, + "grad_norm": 4.196244716644287, + "learning_rate": 4.420070338373442e-06, + "logits/chosen": -0.40929099917411804, + "logits/rejected": -0.5297377705574036, + "logps/chosen": -53.56727600097656, + "logps/rejected": -82.27198028564453, + "loss": 0.6343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.294949531555176, + "rewards/margins": 6.6452860832214355, + "rewards/rejected": -3.3503365516662598, + "step": 10732 + }, + { + "epoch": 2.69, + "grad_norm": 7.356419086456299, + "learning_rate": 4.419289662631497e-06, + "logits/chosen": -0.5005130171775818, + "logits/rejected": -0.573891282081604, + "logps/chosen": -57.967918395996094, + "logps/rejected": -87.1364974975586, + "loss": 0.7513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.303799629211426, + "rewards/margins": 5.857162952423096, + "rewards/rejected": -2.5533628463745117, + "step": 10733 + }, + { + "epoch": 2.69, + "grad_norm": 4.199844837188721, + "learning_rate": 4.418509001239522e-06, + "logits/chosen": -0.5699015855789185, + "logits/rejected": -0.6664183735847473, + "logps/chosen": -54.95140838623047, + "logps/rejected": -111.53645324707031, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8047378063201904, + "rewards/margins": 7.4506072998046875, + "rewards/rejected": -4.645869255065918, + "step": 10734 + }, + { + "epoch": 2.69, + "grad_norm": 3.937828779220581, + "learning_rate": 4.417728354216811e-06, + "logits/chosen": -0.43362003564834595, + "logits/rejected": -0.5987632870674133, + "logps/chosen": -72.96858215332031, + "logps/rejected": -84.42726135253906, + "loss": 0.6657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.861966609954834, + "rewards/margins": 5.591888427734375, + "rewards/rejected": -2.7299208641052246, + "step": 10735 + }, + { + "epoch": 2.69, + "grad_norm": 7.912510871887207, + "learning_rate": 4.416947721582649e-06, + "logits/chosen": -0.440021276473999, + "logits/rejected": -0.5446513295173645, + "logps/chosen": -62.035606384277344, + "logps/rejected": -88.5506591796875, + "loss": 0.6892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.159876585006714, + "rewards/margins": 6.384056091308594, + "rewards/rejected": -3.224179983139038, + "step": 10736 + }, + { + "epoch": 2.69, + "grad_norm": 6.571726322174072, + "learning_rate": 4.41616710335633e-06, + "logits/chosen": -0.5056972503662109, + "logits/rejected": -0.5832885503768921, + "logps/chosen": -53.3883056640625, + "logps/rejected": -90.3743667602539, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.414276123046875, + "rewards/margins": 6.557348728179932, + "rewards/rejected": -3.1430728435516357, + "step": 10737 + }, + { + "epoch": 2.69, + "grad_norm": 7.871256351470947, + "learning_rate": 4.415386499557143e-06, + "logits/chosen": -0.48193877935409546, + "logits/rejected": -0.5411355495452881, + "logps/chosen": -50.462730407714844, + "logps/rejected": -100.31471252441406, + "loss": 0.6314, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0349583625793457, + "rewards/margins": 5.9070353507995605, + "rewards/rejected": -2.872076988220215, + "step": 10738 + }, + { + "epoch": 2.69, + "grad_norm": 5.100319862365723, + "learning_rate": 4.414605910204377e-06, + "logits/chosen": -0.5630105137825012, + "logits/rejected": -0.6110786199569702, + "logps/chosen": -54.899105072021484, + "logps/rejected": -100.12950134277344, + "loss": 0.7278, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.954725980758667, + "rewards/margins": 5.6586174964904785, + "rewards/rejected": -2.7038917541503906, + "step": 10739 + }, + { + "epoch": 2.69, + "grad_norm": 8.43586254119873, + "learning_rate": 4.413825335317321e-06, + "logits/chosen": -0.5328311920166016, + "logits/rejected": -0.6087741851806641, + "logps/chosen": -54.05213165283203, + "logps/rejected": -98.61947631835938, + "loss": 0.6531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0574209690093994, + "rewards/margins": 6.34255313873291, + "rewards/rejected": -3.2851321697235107, + "step": 10740 + }, + { + "epoch": 2.69, + "grad_norm": 5.534794807434082, + "learning_rate": 4.413044774915265e-06, + "logits/chosen": -0.5019564032554626, + "logits/rejected": -0.6302219033241272, + "logps/chosen": -50.807899475097656, + "logps/rejected": -87.73249816894531, + "loss": 0.5944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.17246675491333, + "rewards/margins": 7.241622447967529, + "rewards/rejected": -4.069156169891357, + "step": 10741 + }, + { + "epoch": 2.69, + "grad_norm": 5.877401351928711, + "learning_rate": 4.412264229017498e-06, + "logits/chosen": -0.5039404630661011, + "logits/rejected": -0.5507364273071289, + "logps/chosen": -54.76357650756836, + "logps/rejected": -99.16122436523438, + "loss": 0.6586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9870662689208984, + "rewards/margins": 5.298852920532227, + "rewards/rejected": -2.311786651611328, + "step": 10742 + }, + { + "epoch": 2.69, + "grad_norm": 5.281763553619385, + "learning_rate": 4.411483697643306e-06, + "logits/chosen": -0.47052890062332153, + "logits/rejected": -0.5660229921340942, + "logps/chosen": -59.16532897949219, + "logps/rejected": -95.05235290527344, + "loss": 0.6394, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1178011894226074, + "rewards/margins": 6.180084228515625, + "rewards/rejected": -3.0622832775115967, + "step": 10743 + }, + { + "epoch": 2.69, + "grad_norm": 6.197811603546143, + "learning_rate": 4.410703180811976e-06, + "logits/chosen": -0.5503681898117065, + "logits/rejected": -0.5543085336685181, + "logps/chosen": -57.55963897705078, + "logps/rejected": -107.72880554199219, + "loss": 0.6619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.908456325531006, + "rewards/margins": 6.080683708190918, + "rewards/rejected": -3.172226905822754, + "step": 10744 + }, + { + "epoch": 2.69, + "grad_norm": 3.771604537963867, + "learning_rate": 4.409922678542799e-06, + "logits/chosen": -0.5378645062446594, + "logits/rejected": -0.6255056858062744, + "logps/chosen": -53.61598205566406, + "logps/rejected": -100.89099884033203, + "loss": 0.6569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9565463066101074, + "rewards/margins": 7.22678279876709, + "rewards/rejected": -4.270236492156982, + "step": 10745 + }, + { + "epoch": 2.69, + "grad_norm": 9.3748140335083, + "learning_rate": 4.409142190855063e-06, + "logits/chosen": -0.49586260318756104, + "logits/rejected": -0.6233660578727722, + "logps/chosen": -68.8589096069336, + "logps/rejected": -97.6702651977539, + "loss": 0.6322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980668067932129, + "rewards/margins": 6.542518138885498, + "rewards/rejected": -3.5618503093719482, + "step": 10746 + }, + { + "epoch": 2.69, + "grad_norm": 7.700911521911621, + "learning_rate": 4.408361717768047e-06, + "logits/chosen": -0.5024660229682922, + "logits/rejected": -0.5962576866149902, + "logps/chosen": -61.097591400146484, + "logps/rejected": -114.41679382324219, + "loss": 0.7043, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4232306480407715, + "rewards/margins": 7.156208038330078, + "rewards/rejected": -3.7329769134521484, + "step": 10747 + }, + { + "epoch": 2.69, + "grad_norm": 5.139981269836426, + "learning_rate": 4.407581259301044e-06, + "logits/chosen": -0.4048205316066742, + "logits/rejected": -0.5361396670341492, + "logps/chosen": -54.363525390625, + "logps/rejected": -95.25096893310547, + "loss": 0.5457, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8525595664978027, + "rewards/margins": 6.451117038726807, + "rewards/rejected": -3.5985569953918457, + "step": 10748 + }, + { + "epoch": 2.69, + "grad_norm": 3.5827760696411133, + "learning_rate": 4.40680081547334e-06, + "logits/chosen": -0.38783764839172363, + "logits/rejected": -0.5082223415374756, + "logps/chosen": -78.13707733154297, + "logps/rejected": -92.0604248046875, + "loss": 0.5955, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9693686962127686, + "rewards/margins": 6.159140110015869, + "rewards/rejected": -3.189771890640259, + "step": 10749 + }, + { + "epoch": 2.69, + "grad_norm": 2.800109386444092, + "learning_rate": 4.406020386304217e-06, + "logits/chosen": -0.4409095048904419, + "logits/rejected": -0.5343021154403687, + "logps/chosen": -54.54494094848633, + "logps/rejected": -120.41236114501953, + "loss": 0.5354, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.053558111190796, + "rewards/margins": 7.761889934539795, + "rewards/rejected": -4.708331108093262, + "step": 10750 + }, + { + "epoch": 2.69, + "grad_norm": 12.676660537719727, + "learning_rate": 4.405239971812963e-06, + "logits/chosen": -0.5233600735664368, + "logits/rejected": -0.5583181977272034, + "logps/chosen": -56.416969299316406, + "logps/rejected": -93.53704833984375, + "loss": 0.6411, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9764881134033203, + "rewards/margins": 5.334209442138672, + "rewards/rejected": -2.3577208518981934, + "step": 10751 + }, + { + "epoch": 2.69, + "grad_norm": 6.120203971862793, + "learning_rate": 4.40445957201886e-06, + "logits/chosen": -0.5632969737052917, + "logits/rejected": -0.6721740961074829, + "logps/chosen": -56.392845153808594, + "logps/rejected": -85.6837158203125, + "loss": 0.6985, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.028733730316162, + "rewards/margins": 5.92730712890625, + "rewards/rejected": -2.898573637008667, + "step": 10752 + }, + { + "epoch": 2.69, + "grad_norm": 5.978456497192383, + "learning_rate": 4.403679186941197e-06, + "logits/chosen": -0.4924844801425934, + "logits/rejected": -0.5600507259368896, + "logps/chosen": -61.68488311767578, + "logps/rejected": -93.93465423583984, + "loss": 0.6281, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2217912673950195, + "rewards/margins": 6.083673477172852, + "rewards/rejected": -2.861882209777832, + "step": 10753 + }, + { + "epoch": 2.69, + "grad_norm": 4.471810817718506, + "learning_rate": 4.402898816599254e-06, + "logits/chosen": -0.45668455958366394, + "logits/rejected": -0.6175462007522583, + "logps/chosen": -66.73379516601562, + "logps/rejected": -83.82564544677734, + "loss": 0.7134, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.5766987800598145, + "rewards/margins": 5.621618270874023, + "rewards/rejected": -3.0449185371398926, + "step": 10754 + }, + { + "epoch": 2.69, + "grad_norm": 18.740983963012695, + "learning_rate": 4.402118461012317e-06, + "logits/chosen": -0.5284808278083801, + "logits/rejected": -0.582717776298523, + "logps/chosen": -57.546607971191406, + "logps/rejected": -119.15422058105469, + "loss": 0.7295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.840144157409668, + "rewards/margins": 7.300162315368652, + "rewards/rejected": -4.460017681121826, + "step": 10755 + }, + { + "epoch": 2.69, + "grad_norm": 5.56843900680542, + "learning_rate": 4.401338120199669e-06, + "logits/chosen": -0.5121458768844604, + "logits/rejected": -0.6489874720573425, + "logps/chosen": -63.297523498535156, + "logps/rejected": -81.14678192138672, + "loss": 0.7036, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1201906204223633, + "rewards/margins": 6.008652687072754, + "rewards/rejected": -2.888462543487549, + "step": 10756 + }, + { + "epoch": 2.69, + "grad_norm": 2.687932252883911, + "learning_rate": 4.400557794180592e-06, + "logits/chosen": -0.5274035930633545, + "logits/rejected": -0.5836741924285889, + "logps/chosen": -50.47638702392578, + "logps/rejected": -109.23828125, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.265233039855957, + "rewards/margins": 6.823007583618164, + "rewards/rejected": -3.557774305343628, + "step": 10757 + }, + { + "epoch": 2.69, + "grad_norm": 3.9528720378875732, + "learning_rate": 4.39977748297437e-06, + "logits/chosen": -0.5233059525489807, + "logits/rejected": -0.5852881073951721, + "logps/chosen": -43.715511322021484, + "logps/rejected": -102.85843658447266, + "loss": 0.5778, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.076979160308838, + "rewards/margins": 5.93438720703125, + "rewards/rejected": -2.857408285140991, + "step": 10758 + }, + { + "epoch": 2.69, + "grad_norm": 16.94822883605957, + "learning_rate": 4.398997186600283e-06, + "logits/chosen": -0.4536043405532837, + "logits/rejected": -0.5361742377281189, + "logps/chosen": -65.46078491210938, + "logps/rejected": -106.87512969970703, + "loss": 0.7012, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.678562879562378, + "rewards/margins": 5.3901872634887695, + "rewards/rejected": -2.7116246223449707, + "step": 10759 + }, + { + "epoch": 2.69, + "grad_norm": 5.65221643447876, + "learning_rate": 4.398216905077618e-06, + "logits/chosen": -0.5192899107933044, + "logits/rejected": -0.5476551651954651, + "logps/chosen": -60.93741989135742, + "logps/rejected": -111.56062316894531, + "loss": 0.7779, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.924111843109131, + "rewards/margins": 5.131815433502197, + "rewards/rejected": -2.2077035903930664, + "step": 10760 + }, + { + "epoch": 2.69, + "grad_norm": 9.347797393798828, + "learning_rate": 4.397436638425652e-06, + "logits/chosen": -0.5370972752571106, + "logits/rejected": -0.5646253228187561, + "logps/chosen": -55.97239685058594, + "logps/rejected": -101.93991088867188, + "loss": 0.7784, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9409291744232178, + "rewards/margins": 5.248796463012695, + "rewards/rejected": -2.3078668117523193, + "step": 10761 + }, + { + "epoch": 2.69, + "grad_norm": 5.884679317474365, + "learning_rate": 4.396656386663666e-06, + "logits/chosen": -0.42644697427749634, + "logits/rejected": -0.5130728483200073, + "logps/chosen": -58.468257904052734, + "logps/rejected": -90.66169738769531, + "loss": 0.7294, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.797457218170166, + "rewards/margins": 5.385885715484619, + "rewards/rejected": -2.588428497314453, + "step": 10762 + }, + { + "epoch": 2.69, + "grad_norm": 7.162813663482666, + "learning_rate": 4.395876149810946e-06, + "logits/chosen": -0.5171371698379517, + "logits/rejected": -0.5830702185630798, + "logps/chosen": -52.932071685791016, + "logps/rejected": -100.4708480834961, + "loss": 0.6791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2269697189331055, + "rewards/margins": 5.498912811279297, + "rewards/rejected": -2.271942615509033, + "step": 10763 + }, + { + "epoch": 2.69, + "grad_norm": 5.987552642822266, + "learning_rate": 4.395095927886765e-06, + "logits/chosen": -0.5552635192871094, + "logits/rejected": -0.6325236558914185, + "logps/chosen": -59.60258102416992, + "logps/rejected": -93.477783203125, + "loss": 0.6099, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0639963150024414, + "rewards/margins": 6.344551086425781, + "rewards/rejected": -3.280555009841919, + "step": 10764 + }, + { + "epoch": 2.69, + "grad_norm": 7.7319464683532715, + "learning_rate": 4.394315720910409e-06, + "logits/chosen": -0.5094401240348816, + "logits/rejected": -0.6031439900398254, + "logps/chosen": -56.57123947143555, + "logps/rejected": -85.62960815429688, + "loss": 0.6492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9165186882019043, + "rewards/margins": 5.271852970123291, + "rewards/rejected": -2.3553340435028076, + "step": 10765 + }, + { + "epoch": 2.69, + "grad_norm": 4.998089790344238, + "learning_rate": 4.393535528901155e-06, + "logits/chosen": -0.5412579774856567, + "logits/rejected": -0.6858240365982056, + "logps/chosen": -47.750267028808594, + "logps/rejected": -100.96076965332031, + "loss": 0.6084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.721686363220215, + "rewards/margins": 7.257684230804443, + "rewards/rejected": -4.53599739074707, + "step": 10766 + }, + { + "epoch": 2.69, + "grad_norm": 6.51626443862915, + "learning_rate": 4.392755351878284e-06, + "logits/chosen": -0.6045634746551514, + "logits/rejected": -0.6698054075241089, + "logps/chosen": -61.180908203125, + "logps/rejected": -94.64484405517578, + "loss": 0.721, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2299656867980957, + "rewards/margins": 6.506278038024902, + "rewards/rejected": -3.2763123512268066, + "step": 10767 + }, + { + "epoch": 2.69, + "grad_norm": 6.3834943771362305, + "learning_rate": 4.391975189861074e-06, + "logits/chosen": -0.533094048500061, + "logits/rejected": -0.5990186929702759, + "logps/chosen": -68.23033142089844, + "logps/rejected": -100.6959228515625, + "loss": 0.7344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.924588441848755, + "rewards/margins": 6.555739879608154, + "rewards/rejected": -3.6311521530151367, + "step": 10768 + }, + { + "epoch": 2.69, + "grad_norm": 3.342994213104248, + "learning_rate": 4.391195042868803e-06, + "logits/chosen": -0.5344618558883667, + "logits/rejected": -0.5604619383811951, + "logps/chosen": -50.18580627441406, + "logps/rejected": -90.01202392578125, + "loss": 0.6426, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7087082862854004, + "rewards/margins": 4.764965534210205, + "rewards/rejected": -2.056257486343384, + "step": 10769 + }, + { + "epoch": 2.69, + "grad_norm": 3.264704465866089, + "learning_rate": 4.3904149109207515e-06, + "logits/chosen": -0.440990686416626, + "logits/rejected": -0.5941208004951477, + "logps/chosen": -64.63703155517578, + "logps/rejected": -85.36178588867188, + "loss": 0.6494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.164477825164795, + "rewards/margins": 5.903242588043213, + "rewards/rejected": -2.738765239715576, + "step": 10770 + }, + { + "epoch": 2.69, + "grad_norm": 4.163995265960693, + "learning_rate": 4.3896347940361955e-06, + "logits/chosen": -0.44601762294769287, + "logits/rejected": -0.5318670868873596, + "logps/chosen": -58.73908615112305, + "logps/rejected": -98.14773559570312, + "loss": 0.7031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.013956308364868, + "rewards/margins": 6.747617721557617, + "rewards/rejected": -3.73366117477417, + "step": 10771 + }, + { + "epoch": 2.69, + "grad_norm": 3.5626211166381836, + "learning_rate": 4.388854692234412e-06, + "logits/chosen": -0.47889333963394165, + "logits/rejected": -0.580617368221283, + "logps/chosen": -55.573062896728516, + "logps/rejected": -99.09564208984375, + "loss": 0.699, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2514808177948, + "rewards/margins": 6.68782377243042, + "rewards/rejected": -3.436343193054199, + "step": 10772 + }, + { + "epoch": 2.7, + "grad_norm": 7.145478248596191, + "learning_rate": 4.388074605534678e-06, + "logits/chosen": -0.4678977131843567, + "logits/rejected": -0.5474658012390137, + "logps/chosen": -51.89478302001953, + "logps/rejected": -101.92138671875, + "loss": 0.7701, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.850693464279175, + "rewards/margins": 6.5653886795043945, + "rewards/rejected": -3.714695453643799, + "step": 10773 + }, + { + "epoch": 2.7, + "grad_norm": 3.8885068893432617, + "learning_rate": 4.3872945339562755e-06, + "logits/chosen": -0.5523059964179993, + "logits/rejected": -0.6665685176849365, + "logps/chosen": -70.24585723876953, + "logps/rejected": -79.51387023925781, + "loss": 0.671, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00264310836792, + "rewards/margins": 6.572916507720947, + "rewards/rejected": -3.5702738761901855, + "step": 10774 + }, + { + "epoch": 2.7, + "grad_norm": 2.4059157371520996, + "learning_rate": 4.386514477518474e-06, + "logits/chosen": -0.5347212553024292, + "logits/rejected": -0.6025144457817078, + "logps/chosen": -56.116859436035156, + "logps/rejected": -99.5943603515625, + "loss": 0.6512, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.188436985015869, + "rewards/margins": 7.684266567230225, + "rewards/rejected": -4.495829105377197, + "step": 10775 + }, + { + "epoch": 2.7, + "grad_norm": 4.1088385581970215, + "learning_rate": 4.385734436240552e-06, + "logits/chosen": -0.5312398672103882, + "logits/rejected": -0.6151978373527527, + "logps/chosen": -44.57610321044922, + "logps/rejected": -92.15787506103516, + "loss": 0.5848, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.400209665298462, + "rewards/margins": 7.055392265319824, + "rewards/rejected": -3.655182123184204, + "step": 10776 + }, + { + "epoch": 2.7, + "grad_norm": 7.605061054229736, + "learning_rate": 4.384954410141785e-06, + "logits/chosen": -0.5269583463668823, + "logits/rejected": -0.6190662384033203, + "logps/chosen": -49.84355926513672, + "logps/rejected": -91.76714324951172, + "loss": 0.6109, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.083531379699707, + "rewards/margins": 6.010447025299072, + "rewards/rejected": -2.926915168762207, + "step": 10777 + }, + { + "epoch": 2.7, + "grad_norm": 9.39154052734375, + "learning_rate": 4.3841743992414504e-06, + "logits/chosen": -0.45348605513572693, + "logits/rejected": -0.5010668039321899, + "logps/chosen": -58.60771179199219, + "logps/rejected": -97.38218688964844, + "loss": 0.6476, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161125421524048, + "rewards/margins": 5.698672771453857, + "rewards/rejected": -2.5375475883483887, + "step": 10778 + }, + { + "epoch": 2.7, + "grad_norm": 6.290308475494385, + "learning_rate": 4.38339440355882e-06, + "logits/chosen": -0.5661147236824036, + "logits/rejected": -0.6396619081497192, + "logps/chosen": -58.09777069091797, + "logps/rejected": -103.51455688476562, + "loss": 0.6872, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0422685146331787, + "rewards/margins": 6.506289482116699, + "rewards/rejected": -3.464021921157837, + "step": 10779 + }, + { + "epoch": 2.7, + "grad_norm": 4.132315635681152, + "learning_rate": 4.382614423113171e-06, + "logits/chosen": -0.40430253744125366, + "logits/rejected": -0.5334241986274719, + "logps/chosen": -62.487064361572266, + "logps/rejected": -90.98171997070312, + "loss": 0.6247, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8397531509399414, + "rewards/margins": 5.817190647125244, + "rewards/rejected": -2.977437734603882, + "step": 10780 + }, + { + "epoch": 2.7, + "grad_norm": 6.171016216278076, + "learning_rate": 4.381834457923777e-06, + "logits/chosen": -0.5678331851959229, + "logits/rejected": -0.6381940841674805, + "logps/chosen": -36.89106369018555, + "logps/rejected": -84.04090881347656, + "loss": 0.6321, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.121269941329956, + "rewards/margins": 6.435079574584961, + "rewards/rejected": -3.313809871673584, + "step": 10781 + }, + { + "epoch": 2.7, + "grad_norm": 4.412426471710205, + "learning_rate": 4.381054508009909e-06, + "logits/chosen": -0.48142650723457336, + "logits/rejected": -0.6145836710929871, + "logps/chosen": -57.90876007080078, + "logps/rejected": -90.28459930419922, + "loss": 0.6344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.068742036819458, + "rewards/margins": 6.2541022300720215, + "rewards/rejected": -3.1853599548339844, + "step": 10782 + }, + { + "epoch": 2.7, + "grad_norm": 8.828171730041504, + "learning_rate": 4.380274573390843e-06, + "logits/chosen": -0.5261226892471313, + "logits/rejected": -0.6198353171348572, + "logps/chosen": -57.3275146484375, + "logps/rejected": -103.51953125, + "loss": 0.6726, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.704951763153076, + "rewards/margins": 6.82982063293457, + "rewards/rejected": -4.124867916107178, + "step": 10783 + }, + { + "epoch": 2.7, + "grad_norm": 8.914742469787598, + "learning_rate": 4.3794946540858515e-06, + "logits/chosen": -0.4811445474624634, + "logits/rejected": -0.5196259021759033, + "logps/chosen": -61.94914245605469, + "logps/rejected": -88.22257995605469, + "loss": 0.7465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.204043388366699, + "rewards/margins": 4.739723205566406, + "rewards/rejected": -1.535679817199707, + "step": 10784 + }, + { + "epoch": 2.7, + "grad_norm": 11.459863662719727, + "learning_rate": 4.378714750114208e-06, + "logits/chosen": -0.5165126323699951, + "logits/rejected": -0.6345260739326477, + "logps/chosen": -65.94226837158203, + "logps/rejected": -75.85874938964844, + "loss": 0.6681, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8853890895843506, + "rewards/margins": 5.87162971496582, + "rewards/rejected": -2.986240863800049, + "step": 10785 + }, + { + "epoch": 2.7, + "grad_norm": 4.778590679168701, + "learning_rate": 4.3779348614951824e-06, + "logits/chosen": -0.528049111366272, + "logits/rejected": -0.5829175114631653, + "logps/chosen": -41.72911834716797, + "logps/rejected": -98.01639556884766, + "loss": 0.555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.265993356704712, + "rewards/margins": 5.87012243270874, + "rewards/rejected": -2.604128837585449, + "step": 10786 + }, + { + "epoch": 2.7, + "grad_norm": 5.004448413848877, + "learning_rate": 4.377154988248048e-06, + "logits/chosen": -0.4330289363861084, + "logits/rejected": -0.5233867168426514, + "logps/chosen": -51.09813690185547, + "logps/rejected": -83.75215911865234, + "loss": 0.6651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.177175998687744, + "rewards/margins": 6.316208839416504, + "rewards/rejected": -3.139033317565918, + "step": 10787 + }, + { + "epoch": 2.7, + "grad_norm": 11.894387245178223, + "learning_rate": 4.376375130392079e-06, + "logits/chosen": -0.47984778881073, + "logits/rejected": -0.6373424530029297, + "logps/chosen": -58.14249038696289, + "logps/rejected": -90.51768493652344, + "loss": 0.7998, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9818315505981445, + "rewards/margins": 6.339663982391357, + "rewards/rejected": -3.357832670211792, + "step": 10788 + }, + { + "epoch": 2.7, + "grad_norm": 12.143016815185547, + "learning_rate": 4.375595287946542e-06, + "logits/chosen": -0.492938756942749, + "logits/rejected": -0.5693603754043579, + "logps/chosen": -70.5298843383789, + "logps/rejected": -100.63422393798828, + "loss": 0.8603, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.651700496673584, + "rewards/margins": 5.516180992126465, + "rewards/rejected": -2.864480495452881, + "step": 10789 + }, + { + "epoch": 2.7, + "grad_norm": 4.09018611907959, + "learning_rate": 4.374815460930708e-06, + "logits/chosen": -0.48448917269706726, + "logits/rejected": -0.5842980742454529, + "logps/chosen": -64.337158203125, + "logps/rejected": -112.20736694335938, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0753862857818604, + "rewards/margins": 7.442173004150391, + "rewards/rejected": -4.366786479949951, + "step": 10790 + }, + { + "epoch": 2.7, + "grad_norm": 2.581650495529175, + "learning_rate": 4.374035649363851e-06, + "logits/chosen": -0.5113038420677185, + "logits/rejected": -0.6104397773742676, + "logps/chosen": -57.482810974121094, + "logps/rejected": -107.65823364257812, + "loss": 0.6223, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.095155954360962, + "rewards/margins": 7.873530864715576, + "rewards/rejected": -4.778374671936035, + "step": 10791 + }, + { + "epoch": 2.7, + "grad_norm": 5.156230926513672, + "learning_rate": 4.373255853265239e-06, + "logits/chosen": -0.4815334975719452, + "logits/rejected": -0.5479024648666382, + "logps/chosen": -51.72478485107422, + "logps/rejected": -93.21752166748047, + "loss": 0.6312, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.731860637664795, + "rewards/margins": 5.924950122833252, + "rewards/rejected": -3.193089485168457, + "step": 10792 + }, + { + "epoch": 2.7, + "grad_norm": 6.580723762512207, + "learning_rate": 4.37247607265414e-06, + "logits/chosen": -0.564179539680481, + "logits/rejected": -0.6128650903701782, + "logps/chosen": -48.35038757324219, + "logps/rejected": -111.4859848022461, + "loss": 0.6203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9640164375305176, + "rewards/margins": 6.9635515213012695, + "rewards/rejected": -3.999534845352173, + "step": 10793 + }, + { + "epoch": 2.7, + "grad_norm": 3.6931586265563965, + "learning_rate": 4.3716963075498254e-06, + "logits/chosen": -0.5641958117485046, + "logits/rejected": -0.6579810380935669, + "logps/chosen": -57.015602111816406, + "logps/rejected": -86.70716857910156, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.447580337524414, + "rewards/margins": 6.790399551391602, + "rewards/rejected": -3.342819929122925, + "step": 10794 + }, + { + "epoch": 2.7, + "grad_norm": 6.592141628265381, + "learning_rate": 4.3709165579715635e-06, + "logits/chosen": -0.4994223713874817, + "logits/rejected": -0.5862893462181091, + "logps/chosen": -57.30292510986328, + "logps/rejected": -93.85331726074219, + "loss": 0.6638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3068811893463135, + "rewards/margins": 6.3750152587890625, + "rewards/rejected": -3.06813383102417, + "step": 10795 + }, + { + "epoch": 2.7, + "grad_norm": 9.657609939575195, + "learning_rate": 4.370136823938624e-06, + "logits/chosen": -0.6211181282997131, + "logits/rejected": -0.708490252494812, + "logps/chosen": -57.94227600097656, + "logps/rejected": -101.00935363769531, + "loss": 0.6895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.986645221710205, + "rewards/margins": 6.599620819091797, + "rewards/rejected": -3.6129751205444336, + "step": 10796 + }, + { + "epoch": 2.7, + "grad_norm": 6.291775226593018, + "learning_rate": 4.369357105470271e-06, + "logits/chosen": -0.5542371273040771, + "logits/rejected": -0.6417750120162964, + "logps/chosen": -48.04960632324219, + "logps/rejected": -91.80903625488281, + "loss": 0.6817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4889461994171143, + "rewards/margins": 6.028130531311035, + "rewards/rejected": -2.539184331893921, + "step": 10797 + }, + { + "epoch": 2.7, + "grad_norm": 6.044656276702881, + "learning_rate": 4.368577402585776e-06, + "logits/chosen": -0.4779120683670044, + "logits/rejected": -0.5929750204086304, + "logps/chosen": -58.73833465576172, + "logps/rejected": -96.62507629394531, + "loss": 0.6215, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1594009399414062, + "rewards/margins": 6.87156867980957, + "rewards/rejected": -3.7121670246124268, + "step": 10798 + }, + { + "epoch": 2.7, + "grad_norm": 5.758243083953857, + "learning_rate": 4.367797715304405e-06, + "logits/chosen": -0.47842440009117126, + "logits/rejected": -0.5399619340896606, + "logps/chosen": -58.04888153076172, + "logps/rejected": -96.25481414794922, + "loss": 0.7107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8282546997070312, + "rewards/margins": 5.629306793212891, + "rewards/rejected": -2.8010520935058594, + "step": 10799 + }, + { + "epoch": 2.7, + "grad_norm": 4.938932418823242, + "learning_rate": 4.367018043645425e-06, + "logits/chosen": -0.5254589319229126, + "logits/rejected": -0.5551069378852844, + "logps/chosen": -48.807373046875, + "logps/rejected": -122.11028289794922, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0021681785583496, + "rewards/margins": 6.181668758392334, + "rewards/rejected": -3.1795005798339844, + "step": 10800 + }, + { + "epoch": 2.7, + "grad_norm": 12.95773696899414, + "learning_rate": 4.366238387628102e-06, + "logits/chosen": -0.4576230049133301, + "logits/rejected": -0.5336450338363647, + "logps/chosen": -59.05081558227539, + "logps/rejected": -117.34790802001953, + "loss": 0.6943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.28959584236145, + "rewards/margins": 6.700538158416748, + "rewards/rejected": -3.410942554473877, + "step": 10801 + }, + { + "epoch": 2.7, + "grad_norm": 9.832595825195312, + "learning_rate": 4.365458747271701e-06, + "logits/chosen": -0.5168726444244385, + "logits/rejected": -0.5457988977432251, + "logps/chosen": -52.3543586730957, + "logps/rejected": -97.42533874511719, + "loss": 0.7232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.144124984741211, + "rewards/margins": 4.924534797668457, + "rewards/rejected": -1.7804092168807983, + "step": 10802 + }, + { + "epoch": 2.7, + "grad_norm": 14.632991790771484, + "learning_rate": 4.364679122595493e-06, + "logits/chosen": -0.4653797149658203, + "logits/rejected": -0.5078050494194031, + "logps/chosen": -61.74577713012695, + "logps/rejected": -94.32498168945312, + "loss": 0.8082, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9519309997558594, + "rewards/margins": 5.225515365600586, + "rewards/rejected": -2.273585319519043, + "step": 10803 + }, + { + "epoch": 2.7, + "grad_norm": 3.415806770324707, + "learning_rate": 4.3638995136187365e-06, + "logits/chosen": -0.5110235810279846, + "logits/rejected": -0.6219694018363953, + "logps/chosen": -64.41535949707031, + "logps/rejected": -94.1349868774414, + "loss": 0.592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2010602951049805, + "rewards/margins": 6.249813079833984, + "rewards/rejected": -3.0487523078918457, + "step": 10804 + }, + { + "epoch": 2.7, + "grad_norm": 11.832733154296875, + "learning_rate": 4.3631199203607006e-06, + "logits/chosen": -0.5818255543708801, + "logits/rejected": -0.6023250818252563, + "logps/chosen": -47.1473388671875, + "logps/rejected": -106.26261138916016, + "loss": 0.756, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.055324077606201, + "rewards/margins": 5.584428787231445, + "rewards/rejected": -2.529104709625244, + "step": 10805 + }, + { + "epoch": 2.7, + "grad_norm": 4.014306545257568, + "learning_rate": 4.362340342840651e-06, + "logits/chosen": -0.5247578620910645, + "logits/rejected": -0.5692604184150696, + "logps/chosen": -49.72237014770508, + "logps/rejected": -93.07284545898438, + "loss": 0.6289, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037670850753784, + "rewards/margins": 5.483668804168701, + "rewards/rejected": -2.445997714996338, + "step": 10806 + }, + { + "epoch": 2.7, + "grad_norm": 5.926201820373535, + "learning_rate": 4.361560781077846e-06, + "logits/chosen": -0.5600929260253906, + "logits/rejected": -0.6187025904655457, + "logps/chosen": -49.67262649536133, + "logps/rejected": -97.80819702148438, + "loss": 0.6851, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6827945709228516, + "rewards/margins": 6.071899890899658, + "rewards/rejected": -3.3891050815582275, + "step": 10807 + }, + { + "epoch": 2.7, + "grad_norm": 5.541230201721191, + "learning_rate": 4.360781235091555e-06, + "logits/chosen": -0.513157069683075, + "logits/rejected": -0.536334216594696, + "logps/chosen": -49.07365417480469, + "logps/rejected": -140.79000854492188, + "loss": 0.5771, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0722999572753906, + "rewards/margins": 6.878276824951172, + "rewards/rejected": -3.805976152420044, + "step": 10808 + }, + { + "epoch": 2.7, + "grad_norm": 5.912709712982178, + "learning_rate": 4.360001704901039e-06, + "logits/chosen": -0.5149406790733337, + "logits/rejected": -0.5651485919952393, + "logps/chosen": -57.64081954956055, + "logps/rejected": -103.05704498291016, + "loss": 0.609, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.191047430038452, + "rewards/margins": 6.596796035766602, + "rewards/rejected": -3.405748128890991, + "step": 10809 + }, + { + "epoch": 2.7, + "grad_norm": 7.146153450012207, + "learning_rate": 4.3592221905255636e-06, + "logits/chosen": -0.4686211049556732, + "logits/rejected": -0.5567234754562378, + "logps/chosen": -61.94886779785156, + "logps/rejected": -102.21282196044922, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0062289237976074, + "rewards/margins": 6.331272125244141, + "rewards/rejected": -3.3250436782836914, + "step": 10810 + }, + { + "epoch": 2.7, + "grad_norm": 13.030618667602539, + "learning_rate": 4.358442691984387e-06, + "logits/chosen": -0.6029355525970459, + "logits/rejected": -0.6565638184547424, + "logps/chosen": -52.260555267333984, + "logps/rejected": -94.7147445678711, + "loss": 0.7208, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0274484157562256, + "rewards/margins": 6.039816856384277, + "rewards/rejected": -3.0123682022094727, + "step": 10811 + }, + { + "epoch": 2.7, + "grad_norm": 16.81454849243164, + "learning_rate": 4.357663209296775e-06, + "logits/chosen": -0.6151025295257568, + "logits/rejected": -0.6976485252380371, + "logps/chosen": -52.04460144042969, + "logps/rejected": -89.56033325195312, + "loss": 0.6626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.842393398284912, + "rewards/margins": 6.348639488220215, + "rewards/rejected": -3.5062456130981445, + "step": 10812 + }, + { + "epoch": 2.71, + "grad_norm": 4.8543596267700195, + "learning_rate": 4.356883742481989e-06, + "logits/chosen": -0.4599536657333374, + "logits/rejected": -0.5628839731216431, + "logps/chosen": -61.077186584472656, + "logps/rejected": -88.912109375, + "loss": 0.7098, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2709834575653076, + "rewards/margins": 5.768884658813477, + "rewards/rejected": -2.497901678085327, + "step": 10813 + }, + { + "epoch": 2.71, + "grad_norm": 2.6935839653015137, + "learning_rate": 4.356104291559289e-06, + "logits/chosen": -0.5543427467346191, + "logits/rejected": -0.6315164566040039, + "logps/chosen": -47.13473892211914, + "logps/rejected": -115.02583312988281, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2030749320983887, + "rewards/margins": 8.185998916625977, + "rewards/rejected": -4.982923984527588, + "step": 10814 + }, + { + "epoch": 2.71, + "grad_norm": 5.3735432624816895, + "learning_rate": 4.355324856547936e-06, + "logits/chosen": -0.5598440766334534, + "logits/rejected": -0.6704810857772827, + "logps/chosen": -63.2396240234375, + "logps/rejected": -90.0085678100586, + "loss": 0.7447, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.872041940689087, + "rewards/margins": 6.468202590942383, + "rewards/rejected": -3.596160888671875, + "step": 10815 + }, + { + "epoch": 2.71, + "grad_norm": 6.028149604797363, + "learning_rate": 4.354545437467191e-06, + "logits/chosen": -0.539106547832489, + "logits/rejected": -0.677133321762085, + "logps/chosen": -59.48643493652344, + "logps/rejected": -99.99913787841797, + "loss": 0.7996, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8055248260498047, + "rewards/margins": 6.917208671569824, + "rewards/rejected": -4.111683368682861, + "step": 10816 + }, + { + "epoch": 2.71, + "grad_norm": 5.623252868652344, + "learning_rate": 4.353766034336318e-06, + "logits/chosen": -0.5066890120506287, + "logits/rejected": -0.564088761806488, + "logps/chosen": -54.14219284057617, + "logps/rejected": -96.9776840209961, + "loss": 0.6494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1885006427764893, + "rewards/margins": 5.337083339691162, + "rewards/rejected": -2.148582696914673, + "step": 10817 + }, + { + "epoch": 2.71, + "grad_norm": 5.365602970123291, + "learning_rate": 4.352986647174572e-06, + "logits/chosen": -0.47242501378059387, + "logits/rejected": -0.5281197428703308, + "logps/chosen": -49.12092971801758, + "logps/rejected": -106.34944152832031, + "loss": 0.5859, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.925856590270996, + "rewards/margins": 6.428604602813721, + "rewards/rejected": -3.5027477741241455, + "step": 10818 + }, + { + "epoch": 2.71, + "grad_norm": 5.547578811645508, + "learning_rate": 4.352207276001212e-06, + "logits/chosen": -0.428435742855072, + "logits/rejected": -0.5298427939414978, + "logps/chosen": -59.378028869628906, + "logps/rejected": -90.25826263427734, + "loss": 0.6726, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020723819732666, + "rewards/margins": 5.013004779815674, + "rewards/rejected": -1.9922815561294556, + "step": 10819 + }, + { + "epoch": 2.71, + "grad_norm": 2.663573741912842, + "learning_rate": 4.351427920835501e-06, + "logits/chosen": -0.42077699303627014, + "logits/rejected": -0.5718845725059509, + "logps/chosen": -85.20877075195312, + "logps/rejected": -93.77253723144531, + "loss": 0.5848, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0546770095825195, + "rewards/margins": 6.67085075378418, + "rewards/rejected": -3.616173028945923, + "step": 10820 + }, + { + "epoch": 2.71, + "grad_norm": 3.285367488861084, + "learning_rate": 4.350648581696697e-06, + "logits/chosen": -0.5441375970840454, + "logits/rejected": -0.5967394113540649, + "logps/chosen": -56.092227935791016, + "logps/rejected": -110.87428283691406, + "loss": 0.6763, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.242119073867798, + "rewards/margins": 6.65118408203125, + "rewards/rejected": -3.4090652465820312, + "step": 10821 + }, + { + "epoch": 2.71, + "grad_norm": 3.362982749938965, + "learning_rate": 4.349869258604056e-06, + "logits/chosen": -0.525450587272644, + "logits/rejected": -0.6453731656074524, + "logps/chosen": -59.40846252441406, + "logps/rejected": -99.37004089355469, + "loss": 0.6005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037240505218506, + "rewards/margins": 6.803457736968994, + "rewards/rejected": -3.7662174701690674, + "step": 10822 + }, + { + "epoch": 2.71, + "grad_norm": 13.919655799865723, + "learning_rate": 4.349089951576837e-06, + "logits/chosen": -0.49859803915023804, + "logits/rejected": -0.54420006275177, + "logps/chosen": -57.692420959472656, + "logps/rejected": -98.64934539794922, + "loss": 0.7457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8768932819366455, + "rewards/margins": 5.032224655151367, + "rewards/rejected": -2.155331611633301, + "step": 10823 + }, + { + "epoch": 2.71, + "grad_norm": 5.59543514251709, + "learning_rate": 4.3483106606342986e-06, + "logits/chosen": -0.5024978518486023, + "logits/rejected": -0.523532509803772, + "logps/chosen": -45.197940826416016, + "logps/rejected": -106.11898040771484, + "loss": 0.6648, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0014991760253906, + "rewards/margins": 5.9111409187316895, + "rewards/rejected": -2.909641742706299, + "step": 10824 + }, + { + "epoch": 2.71, + "grad_norm": 25.491275787353516, + "learning_rate": 4.347531385795696e-06, + "logits/chosen": -0.5282980799674988, + "logits/rejected": -0.557582437992096, + "logps/chosen": -52.825958251953125, + "logps/rejected": -115.93016815185547, + "loss": 0.8403, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.128124713897705, + "rewards/margins": 6.23588752746582, + "rewards/rejected": -3.1077628135681152, + "step": 10825 + }, + { + "epoch": 2.71, + "grad_norm": 6.234562873840332, + "learning_rate": 4.346752127080287e-06, + "logits/chosen": -0.5115587115287781, + "logits/rejected": -0.636145830154419, + "logps/chosen": -52.17719650268555, + "logps/rejected": -92.476318359375, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2404377460479736, + "rewards/margins": 7.137472152709961, + "rewards/rejected": -3.8970346450805664, + "step": 10826 + }, + { + "epoch": 2.71, + "grad_norm": 15.383500099182129, + "learning_rate": 4.345972884507328e-06, + "logits/chosen": -0.5422998070716858, + "logits/rejected": -0.590851902961731, + "logps/chosen": -46.050758361816406, + "logps/rejected": -103.2806396484375, + "loss": 0.6071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1325325965881348, + "rewards/margins": 6.01817512512207, + "rewards/rejected": -2.8856427669525146, + "step": 10827 + }, + { + "epoch": 2.71, + "grad_norm": 5.845578670501709, + "learning_rate": 4.3451936580960756e-06, + "logits/chosen": -0.513769268989563, + "logits/rejected": -0.6111736297607422, + "logps/chosen": -55.48247528076172, + "logps/rejected": -89.99887084960938, + "loss": 0.6803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9073708057403564, + "rewards/margins": 6.040867328643799, + "rewards/rejected": -3.1334965229034424, + "step": 10828 + }, + { + "epoch": 2.71, + "grad_norm": 5.448396682739258, + "learning_rate": 4.344414447865783e-06, + "logits/chosen": -0.5188077092170715, + "logits/rejected": -0.560274064540863, + "logps/chosen": -62.40983581542969, + "logps/rejected": -109.23860931396484, + "loss": 0.8367, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.22430682182312, + "rewards/margins": 5.019089698791504, + "rewards/rejected": -1.7947828769683838, + "step": 10829 + }, + { + "epoch": 2.71, + "grad_norm": 4.523688793182373, + "learning_rate": 4.343635253835706e-06, + "logits/chosen": -0.5310108661651611, + "logits/rejected": -0.6187117099761963, + "logps/chosen": -61.18239212036133, + "logps/rejected": -100.16809844970703, + "loss": 0.6204, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8515782356262207, + "rewards/margins": 6.482024669647217, + "rewards/rejected": -3.630446434020996, + "step": 10830 + }, + { + "epoch": 2.71, + "grad_norm": 6.6319780349731445, + "learning_rate": 4.3428560760251025e-06, + "logits/chosen": -0.5425129532814026, + "logits/rejected": -0.6024162769317627, + "logps/chosen": -45.337425231933594, + "logps/rejected": -100.9022445678711, + "loss": 0.6045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1908257007598877, + "rewards/margins": 6.9043192863464355, + "rewards/rejected": -3.7134926319122314, + "step": 10831 + }, + { + "epoch": 2.71, + "grad_norm": 9.824993133544922, + "learning_rate": 4.342076914453223e-06, + "logits/chosen": -0.49459904432296753, + "logits/rejected": -0.537949800491333, + "logps/chosen": -59.546424865722656, + "logps/rejected": -86.24940490722656, + "loss": 0.6985, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9720911979675293, + "rewards/margins": 5.086550712585449, + "rewards/rejected": -2.114459991455078, + "step": 10832 + }, + { + "epoch": 2.71, + "grad_norm": 17.50423240661621, + "learning_rate": 4.341297769139322e-06, + "logits/chosen": -0.5141263008117676, + "logits/rejected": -0.5312561988830566, + "logps/chosen": -51.743770599365234, + "logps/rejected": -110.28056335449219, + "loss": 0.7063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.278027057647705, + "rewards/margins": 6.297332286834717, + "rewards/rejected": -3.0193052291870117, + "step": 10833 + }, + { + "epoch": 2.71, + "grad_norm": 6.565397262573242, + "learning_rate": 4.340518640102654e-06, + "logits/chosen": -0.5873901844024658, + "logits/rejected": -0.6239223480224609, + "logps/chosen": -50.98457336425781, + "logps/rejected": -90.19607543945312, + "loss": 0.7022, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.101632595062256, + "rewards/margins": 5.107048988342285, + "rewards/rejected": -2.0054168701171875, + "step": 10834 + }, + { + "epoch": 2.71, + "grad_norm": 7.395171642303467, + "learning_rate": 4.339739527362473e-06, + "logits/chosen": -0.5154999494552612, + "logits/rejected": -0.5934717059135437, + "logps/chosen": -56.66975021362305, + "logps/rejected": -85.53173828125, + "loss": 0.6518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2574949264526367, + "rewards/margins": 6.105362892150879, + "rewards/rejected": -2.847867965698242, + "step": 10835 + }, + { + "epoch": 2.71, + "grad_norm": 4.872171878814697, + "learning_rate": 4.33896043093803e-06, + "logits/chosen": -0.5945479273796082, + "logits/rejected": -0.6698305606842041, + "logps/chosen": -53.72361755371094, + "logps/rejected": -103.60529327392578, + "loss": 0.6526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.093477725982666, + "rewards/margins": 7.5322265625, + "rewards/rejected": -4.438749313354492, + "step": 10836 + }, + { + "epoch": 2.71, + "grad_norm": 7.056951522827148, + "learning_rate": 4.338181350848577e-06, + "logits/chosen": -0.5414762496948242, + "logits/rejected": -0.6268430948257446, + "logps/chosen": -55.7989501953125, + "logps/rejected": -98.75704193115234, + "loss": 0.6741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.923800468444824, + "rewards/margins": 6.216743469238281, + "rewards/rejected": -3.292942762374878, + "step": 10837 + }, + { + "epoch": 2.71, + "grad_norm": 6.885749340057373, + "learning_rate": 4.337402287113368e-06, + "logits/chosen": -0.5132173299789429, + "logits/rejected": -0.5887748599052429, + "logps/chosen": -48.92462158203125, + "logps/rejected": -98.58197784423828, + "loss": 0.632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0465598106384277, + "rewards/margins": 6.894747734069824, + "rewards/rejected": -3.8481879234313965, + "step": 10838 + }, + { + "epoch": 2.71, + "grad_norm": 6.604552268981934, + "learning_rate": 4.336623239751653e-06, + "logits/chosen": -0.4838903546333313, + "logits/rejected": -0.561833381652832, + "logps/chosen": -68.61408233642578, + "logps/rejected": -98.52713775634766, + "loss": 0.7497, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9074831008911133, + "rewards/margins": 6.003208160400391, + "rewards/rejected": -3.095724582672119, + "step": 10839 + }, + { + "epoch": 2.71, + "grad_norm": 6.385030746459961, + "learning_rate": 4.335844208782683e-06, + "logits/chosen": -0.541602611541748, + "logits/rejected": -0.6233257055282593, + "logps/chosen": -58.306060791015625, + "logps/rejected": -94.0477523803711, + "loss": 0.6747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.947779417037964, + "rewards/margins": 6.66996955871582, + "rewards/rejected": -3.7221901416778564, + "step": 10840 + }, + { + "epoch": 2.71, + "grad_norm": 3.189079523086548, + "learning_rate": 4.335065194225709e-06, + "logits/chosen": -0.5256507396697998, + "logits/rejected": -0.5930648446083069, + "logps/chosen": -51.082252502441406, + "logps/rejected": -107.50675964355469, + "loss": 0.5372, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1871771812438965, + "rewards/margins": 6.557515621185303, + "rewards/rejected": -3.3703384399414062, + "step": 10841 + }, + { + "epoch": 2.71, + "grad_norm": 2.546851873397827, + "learning_rate": 4.3342861960999826e-06, + "logits/chosen": -0.4583125114440918, + "logits/rejected": -0.617652177810669, + "logps/chosen": -56.056488037109375, + "logps/rejected": -94.9050064086914, + "loss": 0.5493, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9438352584838867, + "rewards/margins": 7.424405097961426, + "rewards/rejected": -4.4805707931518555, + "step": 10842 + }, + { + "epoch": 2.71, + "grad_norm": 25.91136360168457, + "learning_rate": 4.33350721442475e-06, + "logits/chosen": -0.5179505944252014, + "logits/rejected": -0.5634484887123108, + "logps/chosen": -61.10376739501953, + "logps/rejected": -88.89791870117188, + "loss": 0.8945, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.966486930847168, + "rewards/margins": 5.0960001945495605, + "rewards/rejected": -2.1295127868652344, + "step": 10843 + }, + { + "epoch": 2.71, + "grad_norm": 5.285205364227295, + "learning_rate": 4.332728249219265e-06, + "logits/chosen": -0.5780567526817322, + "logits/rejected": -0.6384740471839905, + "logps/chosen": -56.53696823120117, + "logps/rejected": -110.85800170898438, + "loss": 0.7723, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1139378547668457, + "rewards/margins": 6.934017658233643, + "rewards/rejected": -3.8200793266296387, + "step": 10844 + }, + { + "epoch": 2.71, + "grad_norm": 3.2641654014587402, + "learning_rate": 4.331949300502775e-06, + "logits/chosen": -0.5248177647590637, + "logits/rejected": -0.6222224235534668, + "logps/chosen": -52.17291259765625, + "logps/rejected": -102.544189453125, + "loss": 0.5735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3537614345550537, + "rewards/margins": 7.3077287673950195, + "rewards/rejected": -3.953967571258545, + "step": 10845 + }, + { + "epoch": 2.71, + "grad_norm": 2.1807057857513428, + "learning_rate": 4.331170368294526e-06, + "logits/chosen": -0.492770791053772, + "logits/rejected": -0.5934979319572449, + "logps/chosen": -56.98966598510742, + "logps/rejected": -108.19113159179688, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.47255539894104, + "rewards/margins": 7.719480991363525, + "rewards/rejected": -4.246925354003906, + "step": 10846 + }, + { + "epoch": 2.71, + "grad_norm": 10.395157814025879, + "learning_rate": 4.330391452613769e-06, + "logits/chosen": -0.5216147303581238, + "logits/rejected": -0.5835546255111694, + "logps/chosen": -55.950077056884766, + "logps/rejected": -96.63153839111328, + "loss": 0.7712, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2932028770446777, + "rewards/margins": 5.698331356048584, + "rewards/rejected": -2.405128240585327, + "step": 10847 + }, + { + "epoch": 2.71, + "grad_norm": 9.952310562133789, + "learning_rate": 4.329612553479752e-06, + "logits/chosen": -0.5457695126533508, + "logits/rejected": -0.63570237159729, + "logps/chosen": -45.676998138427734, + "logps/rejected": -93.26113891601562, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.05228328704834, + "rewards/margins": 7.175206661224365, + "rewards/rejected": -4.122923374176025, + "step": 10848 + }, + { + "epoch": 2.71, + "grad_norm": 5.101437091827393, + "learning_rate": 4.3288336709117246e-06, + "logits/chosen": -0.40529513359069824, + "logits/rejected": -0.4844227135181427, + "logps/chosen": -52.5632438659668, + "logps/rejected": -101.66002655029297, + "loss": 0.628, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1212587356567383, + "rewards/margins": 6.007342338562012, + "rewards/rejected": -2.8860838413238525, + "step": 10849 + }, + { + "epoch": 2.71, + "grad_norm": 5.271615505218506, + "learning_rate": 4.3280548049289275e-06, + "logits/chosen": -0.5345568060874939, + "logits/rejected": -0.6324110627174377, + "logps/chosen": -49.68745803833008, + "logps/rejected": -99.0919418334961, + "loss": 0.5555, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2464749813079834, + "rewards/margins": 6.740973472595215, + "rewards/rejected": -3.4944987297058105, + "step": 10850 + }, + { + "epoch": 2.71, + "grad_norm": 4.113086223602295, + "learning_rate": 4.327275955550611e-06, + "logits/chosen": -0.49017632007598877, + "logits/rejected": -0.6017484664916992, + "logps/chosen": -68.41910552978516, + "logps/rejected": -99.63883972167969, + "loss": 0.6997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.111095428466797, + "rewards/margins": 6.294304847717285, + "rewards/rejected": -3.18320894241333, + "step": 10851 + }, + { + "epoch": 2.71, + "grad_norm": 15.868038177490234, + "learning_rate": 4.3264971227960224e-06, + "logits/chosen": -0.4833561182022095, + "logits/rejected": -0.54693603515625, + "logps/chosen": -51.095516204833984, + "logps/rejected": -100.31714630126953, + "loss": 0.7704, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1187806129455566, + "rewards/margins": 5.652814865112305, + "rewards/rejected": -2.534034013748169, + "step": 10852 + }, + { + "epoch": 2.72, + "grad_norm": 6.7447943687438965, + "learning_rate": 4.325718306684407e-06, + "logits/chosen": -0.527187705039978, + "logits/rejected": -0.5921116471290588, + "logps/chosen": -52.00438690185547, + "logps/rejected": -98.72944641113281, + "loss": 0.6933, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.78305721282959, + "rewards/margins": 6.367673873901367, + "rewards/rejected": -3.5846166610717773, + "step": 10853 + }, + { + "epoch": 2.72, + "grad_norm": 4.880516529083252, + "learning_rate": 4.324939507235009e-06, + "logits/chosen": -0.5586333274841309, + "logits/rejected": -0.6446065902709961, + "logps/chosen": -52.09406661987305, + "logps/rejected": -93.06344604492188, + "loss": 0.711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.123284339904785, + "rewards/margins": 5.694634914398193, + "rewards/rejected": -2.571350336074829, + "step": 10854 + }, + { + "epoch": 2.72, + "grad_norm": 7.781393527984619, + "learning_rate": 4.324160724467073e-06, + "logits/chosen": -0.4912761449813843, + "logits/rejected": -0.5594210028648376, + "logps/chosen": -65.78852844238281, + "logps/rejected": -94.23548126220703, + "loss": 0.7404, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.810360908508301, + "rewards/margins": 5.849307060241699, + "rewards/rejected": -3.0389466285705566, + "step": 10855 + }, + { + "epoch": 2.72, + "grad_norm": 4.952090740203857, + "learning_rate": 4.323381958399845e-06, + "logits/chosen": -0.5308742523193359, + "logits/rejected": -0.6036881804466248, + "logps/chosen": -51.92864990234375, + "logps/rejected": -83.19685363769531, + "loss": 0.7113, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1379308700561523, + "rewards/margins": 5.187321186065674, + "rewards/rejected": -2.0493900775909424, + "step": 10856 + }, + { + "epoch": 2.72, + "grad_norm": 5.161538124084473, + "learning_rate": 4.3226032090525686e-06, + "logits/chosen": -0.591029703617096, + "logits/rejected": -0.6631896495819092, + "logps/chosen": -52.70418167114258, + "logps/rejected": -96.98338317871094, + "loss": 0.6744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8470470905303955, + "rewards/margins": 5.554278373718262, + "rewards/rejected": -2.7072315216064453, + "step": 10857 + }, + { + "epoch": 2.72, + "grad_norm": 3.7664999961853027, + "learning_rate": 4.321824476444486e-06, + "logits/chosen": -0.45273301005363464, + "logits/rejected": -0.5501806735992432, + "logps/chosen": -53.91128158569336, + "logps/rejected": -75.16226196289062, + "loss": 0.691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.21838641166687, + "rewards/margins": 5.669505596160889, + "rewards/rejected": -2.4511189460754395, + "step": 10858 + }, + { + "epoch": 2.72, + "grad_norm": 4.115355968475342, + "learning_rate": 4.321045760594842e-06, + "logits/chosen": -0.499986469745636, + "logits/rejected": -0.6086125373840332, + "logps/chosen": -60.983741760253906, + "logps/rejected": -96.06124114990234, + "loss": 0.6109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4757626056671143, + "rewards/margins": 5.814935684204102, + "rewards/rejected": -3.339172601699829, + "step": 10859 + }, + { + "epoch": 2.72, + "grad_norm": 5.371307849884033, + "learning_rate": 4.3202670615228825e-06, + "logits/chosen": -0.4622259736061096, + "logits/rejected": -0.5776903629302979, + "logps/chosen": -57.02178192138672, + "logps/rejected": -103.73274230957031, + "loss": 0.6572, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3251750469207764, + "rewards/margins": 7.399163246154785, + "rewards/rejected": -4.073988914489746, + "step": 10860 + }, + { + "epoch": 2.72, + "grad_norm": 2.0166947841644287, + "learning_rate": 4.319488379247845e-06, + "logits/chosen": -0.5036531686782837, + "logits/rejected": -0.6066033840179443, + "logps/chosen": -59.94923400878906, + "logps/rejected": -104.40663146972656, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2554492950439453, + "rewards/margins": 7.446130752563477, + "rewards/rejected": -4.190681457519531, + "step": 10861 + }, + { + "epoch": 2.72, + "grad_norm": 6.941702842712402, + "learning_rate": 4.318709713788971e-06, + "logits/chosen": -0.5451228618621826, + "logits/rejected": -0.6262532472610474, + "logps/chosen": -61.65687942504883, + "logps/rejected": -94.27730560302734, + "loss": 0.6952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0063493251800537, + "rewards/margins": 5.3331379890441895, + "rewards/rejected": -2.326788902282715, + "step": 10862 + }, + { + "epoch": 2.72, + "grad_norm": 2.981004238128662, + "learning_rate": 4.3179310651655095e-06, + "logits/chosen": -0.43407949805259705, + "logits/rejected": -0.5023768544197083, + "logps/chosen": -60.85426712036133, + "logps/rejected": -98.68289947509766, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.132704973220825, + "rewards/margins": 6.072243690490723, + "rewards/rejected": -2.9395391941070557, + "step": 10863 + }, + { + "epoch": 2.72, + "grad_norm": 5.484091758728027, + "learning_rate": 4.317152433396692e-06, + "logits/chosen": -0.5561231374740601, + "logits/rejected": -0.62674480676651, + "logps/chosen": -59.33631134033203, + "logps/rejected": -131.849853515625, + "loss": 0.6596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9225857257843018, + "rewards/margins": 8.098512649536133, + "rewards/rejected": -5.17592716217041, + "step": 10864 + }, + { + "epoch": 2.72, + "grad_norm": 8.449443817138672, + "learning_rate": 4.316373818501767e-06, + "logits/chosen": -0.5141518712043762, + "logits/rejected": -0.6181809902191162, + "logps/chosen": -62.38662338256836, + "logps/rejected": -102.42948913574219, + "loss": 0.7148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8494620323181152, + "rewards/margins": 6.068661212921143, + "rewards/rejected": -3.2191989421844482, + "step": 10865 + }, + { + "epoch": 2.72, + "grad_norm": 6.02760648727417, + "learning_rate": 4.315595220499971e-06, + "logits/chosen": -0.5287755727767944, + "logits/rejected": -0.5851536393165588, + "logps/chosen": -45.326087951660156, + "logps/rejected": -97.43372344970703, + "loss": 0.619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2579288482666016, + "rewards/margins": 6.3874969482421875, + "rewards/rejected": -3.129568099975586, + "step": 10866 + }, + { + "epoch": 2.72, + "grad_norm": 5.821000099182129, + "learning_rate": 4.314816639410546e-06, + "logits/chosen": -0.5055209398269653, + "logits/rejected": -0.596057116985321, + "logps/chosen": -47.5435905456543, + "logps/rejected": -94.34381103515625, + "loss": 0.7613, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.49855375289917, + "rewards/margins": 6.091203689575195, + "rewards/rejected": -2.592649221420288, + "step": 10867 + }, + { + "epoch": 2.72, + "grad_norm": 6.708637714385986, + "learning_rate": 4.314038075252731e-06, + "logits/chosen": -0.45266643166542053, + "logits/rejected": -0.5241730213165283, + "logps/chosen": -47.366180419921875, + "logps/rejected": -89.64757537841797, + "loss": 0.5504, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2648119926452637, + "rewards/margins": 6.032027244567871, + "rewards/rejected": -2.7672150135040283, + "step": 10868 + }, + { + "epoch": 2.72, + "grad_norm": 2.4641947746276855, + "learning_rate": 4.313259528045763e-06, + "logits/chosen": -0.49515771865844727, + "logits/rejected": -0.5561919808387756, + "logps/chosen": -46.917503356933594, + "logps/rejected": -103.11445617675781, + "loss": 0.5434, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.365466833114624, + "rewards/margins": 6.945583820343018, + "rewards/rejected": -3.5801172256469727, + "step": 10869 + }, + { + "epoch": 2.72, + "grad_norm": 5.5941853523254395, + "learning_rate": 4.3124809978088845e-06, + "logits/chosen": -0.539522647857666, + "logits/rejected": -0.6375621557235718, + "logps/chosen": -57.98543167114258, + "logps/rejected": -97.27754211425781, + "loss": 0.7879, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.14740252494812, + "rewards/margins": 6.741062164306641, + "rewards/rejected": -3.5936601161956787, + "step": 10870 + }, + { + "epoch": 2.72, + "grad_norm": 4.0391693115234375, + "learning_rate": 4.311702484561331e-06, + "logits/chosen": -0.4737294316291809, + "logits/rejected": -0.5162551403045654, + "logps/chosen": -49.06718063354492, + "logps/rejected": -111.66989135742188, + "loss": 0.5345, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0071189403533936, + "rewards/margins": 7.318361282348633, + "rewards/rejected": -4.311242580413818, + "step": 10871 + }, + { + "epoch": 2.72, + "grad_norm": 4.5308637619018555, + "learning_rate": 4.310923988322341e-06, + "logits/chosen": -0.610363781452179, + "logits/rejected": -0.6367435455322266, + "logps/chosen": -37.795631408691406, + "logps/rejected": -100.3330307006836, + "loss": 0.6373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.009666681289673, + "rewards/margins": 6.263996601104736, + "rewards/rejected": -3.254330635070801, + "step": 10872 + }, + { + "epoch": 2.72, + "grad_norm": 4.7270917892456055, + "learning_rate": 4.310145509111151e-06, + "logits/chosen": -0.5162039995193481, + "logits/rejected": -0.5712510347366333, + "logps/chosen": -64.84457397460938, + "logps/rejected": -113.52424621582031, + "loss": 0.626, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7769203186035156, + "rewards/margins": 6.567025661468506, + "rewards/rejected": -3.790105104446411, + "step": 10873 + }, + { + "epoch": 2.72, + "grad_norm": 17.09236717224121, + "learning_rate": 4.309367046947001e-06, + "logits/chosen": -0.5301554203033447, + "logits/rejected": -0.6038639545440674, + "logps/chosen": -43.91108322143555, + "logps/rejected": -77.04196166992188, + "loss": 0.7797, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.785282850265503, + "rewards/margins": 4.903658866882324, + "rewards/rejected": -2.118375778198242, + "step": 10874 + }, + { + "epoch": 2.72, + "grad_norm": 4.175058364868164, + "learning_rate": 4.308588601849126e-06, + "logits/chosen": -0.43653663992881775, + "logits/rejected": -0.5106688737869263, + "logps/chosen": -61.001136779785156, + "logps/rejected": -111.06851196289062, + "loss": 0.688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.348851203918457, + "rewards/margins": 6.570354461669922, + "rewards/rejected": -3.2215030193328857, + "step": 10875 + }, + { + "epoch": 2.72, + "grad_norm": 5.97450065612793, + "learning_rate": 4.30781017383676e-06, + "logits/chosen": -0.5054275393486023, + "logits/rejected": -0.5478734970092773, + "logps/chosen": -65.8825454711914, + "logps/rejected": -111.47065734863281, + "loss": 0.6773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0491464138031006, + "rewards/margins": 5.783478260040283, + "rewards/rejected": -2.7343316078186035, + "step": 10876 + }, + { + "epoch": 2.72, + "grad_norm": 5.0617170333862305, + "learning_rate": 4.307031762929142e-06, + "logits/chosen": -0.5221770405769348, + "logits/rejected": -0.5726190805435181, + "logps/chosen": -56.081153869628906, + "logps/rejected": -92.06727600097656, + "loss": 0.6642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.267241954803467, + "rewards/margins": 5.637794494628906, + "rewards/rejected": -2.3705527782440186, + "step": 10877 + }, + { + "epoch": 2.72, + "grad_norm": 4.548876762390137, + "learning_rate": 4.306253369145508e-06, + "logits/chosen": -0.5354107618331909, + "logits/rejected": -0.5715705156326294, + "logps/chosen": -47.77745056152344, + "logps/rejected": -106.18636322021484, + "loss": 0.6345, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.226675510406494, + "rewards/margins": 6.093109130859375, + "rewards/rejected": -2.8664333820343018, + "step": 10878 + }, + { + "epoch": 2.72, + "grad_norm": 7.902095317840576, + "learning_rate": 4.305474992505089e-06, + "logits/chosen": -0.5708065032958984, + "logits/rejected": -0.5730910301208496, + "logps/chosen": -44.00471115112305, + "logps/rejected": -114.22578430175781, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085820198059082, + "rewards/margins": 5.6816792488098145, + "rewards/rejected": -2.5958595275878906, + "step": 10879 + }, + { + "epoch": 2.72, + "grad_norm": 1.7925156354904175, + "learning_rate": 4.304696633027122e-06, + "logits/chosen": -0.49718600511550903, + "logits/rejected": -0.5699384212493896, + "logps/chosen": -47.77545166015625, + "logps/rejected": -122.16105651855469, + "loss": 0.5307, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273744821548462, + "rewards/margins": 8.449732780456543, + "rewards/rejected": -5.175987720489502, + "step": 10880 + }, + { + "epoch": 2.72, + "grad_norm": 3.3674299716949463, + "learning_rate": 4.303918290730842e-06, + "logits/chosen": -0.5687122344970703, + "logits/rejected": -0.6349267363548279, + "logps/chosen": -55.02047348022461, + "logps/rejected": -95.95305633544922, + "loss": 0.6053, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8200254440307617, + "rewards/margins": 6.284900665283203, + "rewards/rejected": -3.4648756980895996, + "step": 10881 + }, + { + "epoch": 2.72, + "grad_norm": 7.766141414642334, + "learning_rate": 4.30313996563548e-06, + "logits/chosen": -0.48962587118148804, + "logits/rejected": -0.5661267042160034, + "logps/chosen": -51.0306396484375, + "logps/rejected": -95.79530334472656, + "loss": 0.7309, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.762716054916382, + "rewards/margins": 5.901421070098877, + "rewards/rejected": -3.138705015182495, + "step": 10882 + }, + { + "epoch": 2.72, + "grad_norm": 3.864004611968994, + "learning_rate": 4.30236165776027e-06, + "logits/chosen": -0.543674647808075, + "logits/rejected": -0.6397874355316162, + "logps/chosen": -56.430320739746094, + "logps/rejected": -88.21119689941406, + "loss": 0.6438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.126290798187256, + "rewards/margins": 6.426453113555908, + "rewards/rejected": -3.3001620769500732, + "step": 10883 + }, + { + "epoch": 2.72, + "grad_norm": 7.931826591491699, + "learning_rate": 4.301583367124447e-06, + "logits/chosen": -0.47140103578567505, + "logits/rejected": -0.5023356676101685, + "logps/chosen": -60.8163948059082, + "logps/rejected": -102.87956237792969, + "loss": 0.8601, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.951878547668457, + "rewards/margins": 5.595930099487305, + "rewards/rejected": -2.6440517902374268, + "step": 10884 + }, + { + "epoch": 2.72, + "grad_norm": 2.156633138656616, + "learning_rate": 4.3008050937472424e-06, + "logits/chosen": -0.5006280541419983, + "logits/rejected": -0.6057790517807007, + "logps/chosen": -64.99425506591797, + "logps/rejected": -102.51256561279297, + "loss": 0.5822, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0629730224609375, + "rewards/margins": 7.600221157073975, + "rewards/rejected": -4.537247657775879, + "step": 10885 + }, + { + "epoch": 2.72, + "grad_norm": 4.709620475769043, + "learning_rate": 4.300026837647886e-06, + "logits/chosen": -0.4724188446998596, + "logits/rejected": -0.5207056999206543, + "logps/chosen": -48.8586311340332, + "logps/rejected": -107.55873107910156, + "loss": 0.5958, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0603747367858887, + "rewards/margins": 7.084911823272705, + "rewards/rejected": -4.024537086486816, + "step": 10886 + }, + { + "epoch": 2.72, + "grad_norm": 9.115052223205566, + "learning_rate": 4.299248598845611e-06, + "logits/chosen": -0.48532596230506897, + "logits/rejected": -0.5413531064987183, + "logps/chosen": -54.29144287109375, + "logps/rejected": -96.81144714355469, + "loss": 0.6604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.97654390335083, + "rewards/margins": 6.479101657867432, + "rewards/rejected": -3.502558469772339, + "step": 10887 + }, + { + "epoch": 2.72, + "grad_norm": 5.882084846496582, + "learning_rate": 4.29847037735965e-06, + "logits/chosen": -0.5630913972854614, + "logits/rejected": -0.6376203894615173, + "logps/chosen": -43.75245666503906, + "logps/rejected": -90.72607421875, + "loss": 0.5518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0546352863311768, + "rewards/margins": 7.1070380210876465, + "rewards/rejected": -4.052402496337891, + "step": 10888 + }, + { + "epoch": 2.72, + "grad_norm": 3.4948408603668213, + "learning_rate": 4.297692173209231e-06, + "logits/chosen": -0.4511309862136841, + "logits/rejected": -0.5605349540710449, + "logps/chosen": -63.533226013183594, + "logps/rejected": -91.89697265625, + "loss": 0.6942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.413107395172119, + "rewards/margins": 6.319768905639648, + "rewards/rejected": -2.90666127204895, + "step": 10889 + }, + { + "epoch": 2.72, + "grad_norm": 2.88962459564209, + "learning_rate": 4.296913986413585e-06, + "logits/chosen": -0.5248222351074219, + "logits/rejected": -0.5821676850318909, + "logps/chosen": -69.41967010498047, + "logps/rejected": -91.41725158691406, + "loss": 0.6672, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5047879219055176, + "rewards/margins": 5.797441482543945, + "rewards/rejected": -2.2926533222198486, + "step": 10890 + }, + { + "epoch": 2.72, + "grad_norm": 4.227969169616699, + "learning_rate": 4.296135816991942e-06, + "logits/chosen": -0.4928733706474304, + "logits/rejected": -0.5434167385101318, + "logps/chosen": -60.151710510253906, + "logps/rejected": -90.82119750976562, + "loss": 0.6605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9134280681610107, + "rewards/margins": 5.893324851989746, + "rewards/rejected": -2.9798972606658936, + "step": 10891 + }, + { + "epoch": 2.72, + "grad_norm": 4.725782871246338, + "learning_rate": 4.295357664963536e-06, + "logits/chosen": -0.5805960297584534, + "logits/rejected": -0.6715194582939148, + "logps/chosen": -67.75845336914062, + "logps/rejected": -91.99842071533203, + "loss": 0.6812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.579965353012085, + "rewards/margins": 5.920140266418457, + "rewards/rejected": -3.340174913406372, + "step": 10892 + }, + { + "epoch": 2.73, + "grad_norm": 5.311522483825684, + "learning_rate": 4.2945795303475855e-06, + "logits/chosen": -0.5127170085906982, + "logits/rejected": -0.6417137384414673, + "logps/chosen": -60.77004623413086, + "logps/rejected": -106.50286865234375, + "loss": 0.7198, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9551432132720947, + "rewards/margins": 7.452391624450684, + "rewards/rejected": -4.497248649597168, + "step": 10893 + }, + { + "epoch": 2.73, + "grad_norm": 5.1341729164123535, + "learning_rate": 4.293801413163328e-06, + "logits/chosen": -0.47333455085754395, + "logits/rejected": -0.5184616446495056, + "logps/chosen": -58.691505432128906, + "logps/rejected": -100.4552001953125, + "loss": 0.7203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2499403953552246, + "rewards/margins": 6.71175479888916, + "rewards/rejected": -3.4618146419525146, + "step": 10894 + }, + { + "epoch": 2.73, + "grad_norm": 5.466676235198975, + "learning_rate": 4.293023313429989e-06, + "logits/chosen": -0.43639421463012695, + "logits/rejected": -0.4974971115589142, + "logps/chosen": -43.97414779663086, + "logps/rejected": -74.28965759277344, + "loss": 0.6432, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.883601665496826, + "rewards/margins": 4.173116683959961, + "rewards/rejected": -1.2895147800445557, + "step": 10895 + }, + { + "epoch": 2.73, + "grad_norm": 9.532180786132812, + "learning_rate": 4.292245231166795e-06, + "logits/chosen": -0.4923143982887268, + "logits/rejected": -0.5837650895118713, + "logps/chosen": -57.244014739990234, + "logps/rejected": -94.45484161376953, + "loss": 0.8757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9447245597839355, + "rewards/margins": 5.831122398376465, + "rewards/rejected": -2.8863978385925293, + "step": 10896 + }, + { + "epoch": 2.73, + "grad_norm": 3.165296792984009, + "learning_rate": 4.291467166392975e-06, + "logits/chosen": -0.5136785507202148, + "logits/rejected": -0.5588906407356262, + "logps/chosen": -56.41115188598633, + "logps/rejected": -105.38460540771484, + "loss": 0.602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3559134006500244, + "rewards/margins": 7.3650054931640625, + "rewards/rejected": -4.009092330932617, + "step": 10897 + }, + { + "epoch": 2.73, + "grad_norm": 4.806278705596924, + "learning_rate": 4.290689119127754e-06, + "logits/chosen": -0.5187087059020996, + "logits/rejected": -0.6049183011054993, + "logps/chosen": -47.9194450378418, + "logps/rejected": -91.3037109375, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.001622200012207, + "rewards/margins": 6.2712531089782715, + "rewards/rejected": -3.2696309089660645, + "step": 10898 + }, + { + "epoch": 2.73, + "grad_norm": 3.121063709259033, + "learning_rate": 4.28991108939036e-06, + "logits/chosen": -0.461743026971817, + "logits/rejected": -0.5486330986022949, + "logps/chosen": -69.53815460205078, + "logps/rejected": -107.89482879638672, + "loss": 0.6347, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085541009902954, + "rewards/margins": 6.725123882293701, + "rewards/rejected": -3.639582633972168, + "step": 10899 + }, + { + "epoch": 2.73, + "grad_norm": 8.056909561157227, + "learning_rate": 4.289133077200018e-06, + "logits/chosen": -0.5840323567390442, + "logits/rejected": -0.6846076250076294, + "logps/chosen": -48.956298828125, + "logps/rejected": -90.62183380126953, + "loss": 0.7704, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8859424591064453, + "rewards/margins": 5.220005989074707, + "rewards/rejected": -2.33406400680542, + "step": 10900 + }, + { + "epoch": 2.73, + "grad_norm": 3.8018319606781006, + "learning_rate": 4.288355082575954e-06, + "logits/chosen": -0.5331331491470337, + "logits/rejected": -0.5928720235824585, + "logps/chosen": -46.24399948120117, + "logps/rejected": -88.88040161132812, + "loss": 0.5561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077491044998169, + "rewards/margins": 5.945934772491455, + "rewards/rejected": -2.868443489074707, + "step": 10901 + }, + { + "epoch": 2.73, + "grad_norm": 4.064457893371582, + "learning_rate": 4.287577105537391e-06, + "logits/chosen": -0.5121036767959595, + "logits/rejected": -0.6398024559020996, + "logps/chosen": -58.90095138549805, + "logps/rejected": -97.24810028076172, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930082082748413, + "rewards/margins": 7.0253005027771, + "rewards/rejected": -4.095218658447266, + "step": 10902 + }, + { + "epoch": 2.73, + "grad_norm": 12.55565357208252, + "learning_rate": 4.286799146103559e-06, + "logits/chosen": -0.5409741401672363, + "logits/rejected": -0.6547989249229431, + "logps/chosen": -60.78804397583008, + "logps/rejected": -103.4348373413086, + "loss": 0.6868, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1337599754333496, + "rewards/margins": 7.91589879989624, + "rewards/rejected": -4.782138347625732, + "step": 10903 + }, + { + "epoch": 2.73, + "grad_norm": 3.176870346069336, + "learning_rate": 4.286021204293677e-06, + "logits/chosen": -0.6455546617507935, + "logits/rejected": -0.6907994747161865, + "logps/chosen": -49.59117126464844, + "logps/rejected": -125.28819274902344, + "loss": 0.5907, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1035828590393066, + "rewards/margins": 8.262150764465332, + "rewards/rejected": -5.158568382263184, + "step": 10904 + }, + { + "epoch": 2.73, + "grad_norm": 2.388103723526001, + "learning_rate": 4.285243280126969e-06, + "logits/chosen": -0.5651194453239441, + "logits/rejected": -0.6431130766868591, + "logps/chosen": -60.21262741088867, + "logps/rejected": -118.62923431396484, + "loss": 0.5564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.130427360534668, + "rewards/margins": 7.94840145111084, + "rewards/rejected": -4.817974090576172, + "step": 10905 + }, + { + "epoch": 2.73, + "grad_norm": 9.563666343688965, + "learning_rate": 4.284465373622663e-06, + "logits/chosen": -0.45462891459465027, + "logits/rejected": -0.48869094252586365, + "logps/chosen": -56.893043518066406, + "logps/rejected": -124.61089324951172, + "loss": 0.7318, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.57971453666687, + "rewards/margins": 6.607686996459961, + "rewards/rejected": -4.027972221374512, + "step": 10906 + }, + { + "epoch": 2.73, + "grad_norm": 4.276523590087891, + "learning_rate": 4.283687484799975e-06, + "logits/chosen": -0.5161991119384766, + "logits/rejected": -0.5589946508407593, + "logps/chosen": -42.79280471801758, + "logps/rejected": -100.67308044433594, + "loss": 0.6063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9316980838775635, + "rewards/margins": 6.702917575836182, + "rewards/rejected": -3.7712197303771973, + "step": 10907 + }, + { + "epoch": 2.73, + "grad_norm": 6.236926555633545, + "learning_rate": 4.282909613678133e-06, + "logits/chosen": -0.4970022439956665, + "logits/rejected": -0.5502281188964844, + "logps/chosen": -56.05421447753906, + "logps/rejected": -107.92384338378906, + "loss": 0.7779, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1856889724731445, + "rewards/margins": 5.649521827697754, + "rewards/rejected": -2.4638330936431885, + "step": 10908 + }, + { + "epoch": 2.73, + "grad_norm": 6.785843849182129, + "learning_rate": 4.282131760276357e-06, + "logits/chosen": -0.47811150550842285, + "logits/rejected": -0.5772649645805359, + "logps/chosen": -63.900115966796875, + "logps/rejected": -76.95950317382812, + "loss": 0.7721, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8650639057159424, + "rewards/margins": 4.762458801269531, + "rewards/rejected": -1.8973946571350098, + "step": 10909 + }, + { + "epoch": 2.73, + "grad_norm": 2.606429100036621, + "learning_rate": 4.2813539246138705e-06, + "logits/chosen": -0.6004436612129211, + "logits/rejected": -0.7231847047805786, + "logps/chosen": -66.68758392333984, + "logps/rejected": -101.72958374023438, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8631398677825928, + "rewards/margins": 7.499716758728027, + "rewards/rejected": -4.636577129364014, + "step": 10910 + }, + { + "epoch": 2.73, + "grad_norm": 9.070162773132324, + "learning_rate": 4.28057610670989e-06, + "logits/chosen": -0.5521126389503479, + "logits/rejected": -0.6157487630844116, + "logps/chosen": -64.26538848876953, + "logps/rejected": -115.056640625, + "loss": 0.6547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8918657302856445, + "rewards/margins": 6.598020553588867, + "rewards/rejected": -3.7061550617218018, + "step": 10911 + }, + { + "epoch": 2.73, + "grad_norm": 7.585307598114014, + "learning_rate": 4.279798306583641e-06, + "logits/chosen": -0.5251114368438721, + "logits/rejected": -0.6082912087440491, + "logps/chosen": -77.66313171386719, + "logps/rejected": -104.95574951171875, + "loss": 0.7517, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0743424892425537, + "rewards/margins": 7.574385643005371, + "rewards/rejected": -4.5000433921813965, + "step": 10912 + }, + { + "epoch": 2.73, + "grad_norm": 4.8607354164123535, + "learning_rate": 4.279020524254342e-06, + "logits/chosen": -0.4723917245864868, + "logits/rejected": -0.5617498159408569, + "logps/chosen": -61.51904296875, + "logps/rejected": -96.50856018066406, + "loss": 0.7303, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1736481189727783, + "rewards/margins": 5.852866172790527, + "rewards/rejected": -2.6792185306549072, + "step": 10913 + }, + { + "epoch": 2.73, + "grad_norm": 6.206093788146973, + "learning_rate": 4.278242759741212e-06, + "logits/chosen": -0.4792519807815552, + "logits/rejected": -0.5500975847244263, + "logps/chosen": -55.274681091308594, + "logps/rejected": -103.54988861083984, + "loss": 0.6046, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9640188217163086, + "rewards/margins": 6.917603969573975, + "rewards/rejected": -3.953585147857666, + "step": 10914 + }, + { + "epoch": 2.73, + "grad_norm": 10.59496021270752, + "learning_rate": 4.27746501306347e-06, + "logits/chosen": -0.4905169606208801, + "logits/rejected": -0.5969067811965942, + "logps/chosen": -61.09543991088867, + "logps/rejected": -99.92230224609375, + "loss": 0.6183, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8023264408111572, + "rewards/margins": 6.6662397384643555, + "rewards/rejected": -3.863913059234619, + "step": 10915 + }, + { + "epoch": 2.73, + "grad_norm": 7.0225653648376465, + "learning_rate": 4.276687284240337e-06, + "logits/chosen": -0.5242668390274048, + "logits/rejected": -0.5756904482841492, + "logps/chosen": -56.305763244628906, + "logps/rejected": -107.2039794921875, + "loss": 0.8673, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8134782314300537, + "rewards/margins": 5.297672271728516, + "rewards/rejected": -2.484194278717041, + "step": 10916 + }, + { + "epoch": 2.73, + "grad_norm": 18.7192325592041, + "learning_rate": 4.275909573291032e-06, + "logits/chosen": -0.5440912842750549, + "logits/rejected": -0.6410310864448547, + "logps/chosen": -55.784297943115234, + "logps/rejected": -98.80392456054688, + "loss": 0.6741, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.836505889892578, + "rewards/margins": 6.660787582397461, + "rewards/rejected": -3.824281692504883, + "step": 10917 + }, + { + "epoch": 2.73, + "grad_norm": 5.9379987716674805, + "learning_rate": 4.27513188023477e-06, + "logits/chosen": -0.4543638229370117, + "logits/rejected": -0.5845435261726379, + "logps/chosen": -77.23078155517578, + "logps/rejected": -101.90654754638672, + "loss": 0.7043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0593199729919434, + "rewards/margins": 6.734679222106934, + "rewards/rejected": -3.6753597259521484, + "step": 10918 + }, + { + "epoch": 2.73, + "grad_norm": 11.847990036010742, + "learning_rate": 4.27435420509077e-06, + "logits/chosen": -0.5633949041366577, + "logits/rejected": -0.6308979988098145, + "logps/chosen": -54.04350280761719, + "logps/rejected": -96.91838073730469, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.916008710861206, + "rewards/margins": 6.58279275894165, + "rewards/rejected": -3.6667838096618652, + "step": 10919 + }, + { + "epoch": 2.73, + "grad_norm": 11.23330307006836, + "learning_rate": 4.273576547878252e-06, + "logits/chosen": -0.5796645879745483, + "logits/rejected": -0.6385437846183777, + "logps/chosen": -71.65300750732422, + "logps/rejected": -105.23343658447266, + "loss": 0.6691, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1730966567993164, + "rewards/margins": 7.381023406982422, + "rewards/rejected": -4.207926273345947, + "step": 10920 + }, + { + "epoch": 2.73, + "grad_norm": 10.947078704833984, + "learning_rate": 4.272798908616427e-06, + "logits/chosen": -0.5170908570289612, + "logits/rejected": -0.6043215990066528, + "logps/chosen": -48.90104293823242, + "logps/rejected": -95.530029296875, + "loss": 0.6995, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.990814208984375, + "rewards/margins": 5.7031097412109375, + "rewards/rejected": -2.7122950553894043, + "step": 10921 + }, + { + "epoch": 2.73, + "grad_norm": 2.764749765396118, + "learning_rate": 4.272021287324515e-06, + "logits/chosen": -0.4617495536804199, + "logits/rejected": -0.5398826003074646, + "logps/chosen": -50.381927490234375, + "logps/rejected": -102.90206909179688, + "loss": 0.5697, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135676145553589, + "rewards/margins": 7.310731410980225, + "rewards/rejected": -4.175055503845215, + "step": 10922 + }, + { + "epoch": 2.73, + "grad_norm": 17.176424026489258, + "learning_rate": 4.271243684021731e-06, + "logits/chosen": -0.5358067154884338, + "logits/rejected": -0.5894137620925903, + "logps/chosen": -53.33937072753906, + "logps/rejected": -90.91493225097656, + "loss": 0.7138, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1075940132141113, + "rewards/margins": 5.176324367523193, + "rewards/rejected": -2.068730354309082, + "step": 10923 + }, + { + "epoch": 2.73, + "grad_norm": 2.412222146987915, + "learning_rate": 4.270466098727293e-06, + "logits/chosen": -0.6060225963592529, + "logits/rejected": -0.6532383561134338, + "logps/chosen": -46.82265853881836, + "logps/rejected": -101.32394409179688, + "loss": 0.5484, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0781078338623047, + "rewards/margins": 6.525124549865723, + "rewards/rejected": -3.447016477584839, + "step": 10924 + }, + { + "epoch": 2.73, + "grad_norm": 4.5494489669799805, + "learning_rate": 4.269688531460412e-06, + "logits/chosen": -0.5919545888900757, + "logits/rejected": -0.6444644331932068, + "logps/chosen": -49.81462097167969, + "logps/rejected": -94.7890625, + "loss": 0.6984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.068833112716675, + "rewards/margins": 6.103355407714844, + "rewards/rejected": -3.034522294998169, + "step": 10925 + }, + { + "epoch": 2.73, + "grad_norm": 3.366262435913086, + "learning_rate": 4.268910982240304e-06, + "logits/chosen": -0.6055694818496704, + "logits/rejected": -0.6777712106704712, + "logps/chosen": -48.604286193847656, + "logps/rejected": -94.35415649414062, + "loss": 0.6496, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8405961990356445, + "rewards/margins": 6.569530487060547, + "rewards/rejected": -3.728933811187744, + "step": 10926 + }, + { + "epoch": 2.73, + "grad_norm": 4.685418605804443, + "learning_rate": 4.268133451086184e-06, + "logits/chosen": -0.5393707156181335, + "logits/rejected": -0.6391487121582031, + "logps/chosen": -47.63788604736328, + "logps/rejected": -98.73680114746094, + "loss": 0.5525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290792942047119, + "rewards/margins": 7.143718719482422, + "rewards/rejected": -3.8529255390167236, + "step": 10927 + }, + { + "epoch": 2.73, + "grad_norm": 3.994063138961792, + "learning_rate": 4.267355938017265e-06, + "logits/chosen": -0.5291788578033447, + "logits/rejected": -0.5679299235343933, + "logps/chosen": -65.21495819091797, + "logps/rejected": -117.26675415039062, + "loss": 0.6763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.029365062713623, + "rewards/margins": 6.333393096923828, + "rewards/rejected": -3.304028034210205, + "step": 10928 + }, + { + "epoch": 2.73, + "grad_norm": 7.160322666168213, + "learning_rate": 4.26657844305276e-06, + "logits/chosen": -0.49652349948883057, + "logits/rejected": -0.6283435821533203, + "logps/chosen": -67.26524353027344, + "logps/rejected": -99.7823486328125, + "loss": 0.6335, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0825939178466797, + "rewards/margins": 6.332147598266602, + "rewards/rejected": -3.249553680419922, + "step": 10929 + }, + { + "epoch": 2.73, + "grad_norm": 12.83042049407959, + "learning_rate": 4.265800966211881e-06, + "logits/chosen": -0.5328825116157532, + "logits/rejected": -0.5844427943229675, + "logps/chosen": -55.101402282714844, + "logps/rejected": -99.54364013671875, + "loss": 0.6897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2554473876953125, + "rewards/margins": 6.183722972869873, + "rewards/rejected": -2.9282758235931396, + "step": 10930 + }, + { + "epoch": 2.73, + "grad_norm": 8.408456802368164, + "learning_rate": 4.265023507513842e-06, + "logits/chosen": -0.5691311955451965, + "logits/rejected": -0.6419194340705872, + "logps/chosen": -54.534515380859375, + "logps/rejected": -116.67001342773438, + "loss": 0.6546, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0279736518859863, + "rewards/margins": 7.263540744781494, + "rewards/rejected": -4.235567092895508, + "step": 10931 + }, + { + "epoch": 2.73, + "grad_norm": 12.02086353302002, + "learning_rate": 4.264246066977854e-06, + "logits/chosen": -0.4682251811027527, + "logits/rejected": -0.5420266389846802, + "logps/chosen": -52.247169494628906, + "logps/rejected": -109.20647430419922, + "loss": 0.6279, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8016724586486816, + "rewards/margins": 7.168472766876221, + "rewards/rejected": -4.366799831390381, + "step": 10932 + }, + { + "epoch": 2.74, + "grad_norm": 4.68123197555542, + "learning_rate": 4.263468644623127e-06, + "logits/chosen": -0.6043481826782227, + "logits/rejected": -0.6753635406494141, + "logps/chosen": -60.2269401550293, + "logps/rejected": -103.88599395751953, + "loss": 0.6869, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0776426792144775, + "rewards/margins": 6.555222988128662, + "rewards/rejected": -3.4775805473327637, + "step": 10933 + }, + { + "epoch": 2.74, + "grad_norm": 3.7426059246063232, + "learning_rate": 4.262691240468873e-06, + "logits/chosen": -0.5751951336860657, + "logits/rejected": -0.6599114537239075, + "logps/chosen": -54.00883102416992, + "logps/rejected": -94.7353515625, + "loss": 0.6354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2251782417297363, + "rewards/margins": 6.37487268447876, + "rewards/rejected": -3.1496942043304443, + "step": 10934 + }, + { + "epoch": 2.74, + "grad_norm": 6.864872455596924, + "learning_rate": 4.2619138545343044e-06, + "logits/chosen": -0.5310183763504028, + "logits/rejected": -0.5518196821212769, + "logps/chosen": -62.99330520629883, + "logps/rejected": -100.03591918945312, + "loss": 0.7697, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9002697467803955, + "rewards/margins": 5.3453240394592285, + "rewards/rejected": -2.445054292678833, + "step": 10935 + }, + { + "epoch": 2.74, + "grad_norm": 4.559340953826904, + "learning_rate": 4.261136486838628e-06, + "logits/chosen": -0.5263741612434387, + "logits/rejected": -0.6228985786437988, + "logps/chosen": -66.39881134033203, + "logps/rejected": -84.55479431152344, + "loss": 0.5689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049980878829956, + "rewards/margins": 6.506867408752441, + "rewards/rejected": -3.456885814666748, + "step": 10936 + }, + { + "epoch": 2.74, + "grad_norm": 3.6479275226593018, + "learning_rate": 4.260359137401055e-06, + "logits/chosen": -0.5365338325500488, + "logits/rejected": -0.5874968767166138, + "logps/chosen": -43.63460922241211, + "logps/rejected": -90.97042846679688, + "loss": 0.6109, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.303558588027954, + "rewards/margins": 6.290614604949951, + "rewards/rejected": -2.987055778503418, + "step": 10937 + }, + { + "epoch": 2.74, + "grad_norm": 3.1048858165740967, + "learning_rate": 4.259581806240796e-06, + "logits/chosen": -0.4995207190513611, + "logits/rejected": -0.5954611897468567, + "logps/chosen": -57.03473663330078, + "logps/rejected": -108.78445434570312, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1207473278045654, + "rewards/margins": 7.293160915374756, + "rewards/rejected": -4.172412872314453, + "step": 10938 + }, + { + "epoch": 2.74, + "grad_norm": 3.205301284790039, + "learning_rate": 4.258804493377057e-06, + "logits/chosen": -0.4625251591205597, + "logits/rejected": -0.5792717337608337, + "logps/chosen": -59.9825553894043, + "logps/rejected": -94.39370727539062, + "loss": 0.6335, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.045961618423462, + "rewards/margins": 6.975852012634277, + "rewards/rejected": -3.9298903942108154, + "step": 10939 + }, + { + "epoch": 2.74, + "grad_norm": 7.434484004974365, + "learning_rate": 4.2580271988290465e-06, + "logits/chosen": -0.5826743245124817, + "logits/rejected": -0.6380670070648193, + "logps/chosen": -55.757652282714844, + "logps/rejected": -92.53160858154297, + "loss": 0.7942, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0577404499053955, + "rewards/margins": 5.828066349029541, + "rewards/rejected": -2.7703256607055664, + "step": 10940 + }, + { + "epoch": 2.74, + "grad_norm": 3.454479694366455, + "learning_rate": 4.257249922615974e-06, + "logits/chosen": -0.4986051917076111, + "logits/rejected": -0.5907703042030334, + "logps/chosen": -55.427406311035156, + "logps/rejected": -100.3901596069336, + "loss": 0.6129, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1407012939453125, + "rewards/margins": 7.013686656951904, + "rewards/rejected": -3.872985601425171, + "step": 10941 + }, + { + "epoch": 2.74, + "grad_norm": 13.217399597167969, + "learning_rate": 4.256472664757047e-06, + "logits/chosen": -0.4580252766609192, + "logits/rejected": -0.516636073589325, + "logps/chosen": -57.54191589355469, + "logps/rejected": -102.52890014648438, + "loss": 0.6933, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8461108207702637, + "rewards/margins": 5.643360137939453, + "rewards/rejected": -2.7972493171691895, + "step": 10942 + }, + { + "epoch": 2.74, + "grad_norm": 5.386112689971924, + "learning_rate": 4.25569542527147e-06, + "logits/chosen": -0.5571585893630981, + "logits/rejected": -0.6028245687484741, + "logps/chosen": -51.65414810180664, + "logps/rejected": -96.68978881835938, + "loss": 0.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3695993423461914, + "rewards/margins": 5.4966511726379395, + "rewards/rejected": -2.127051591873169, + "step": 10943 + }, + { + "epoch": 2.74, + "grad_norm": 4.590933322906494, + "learning_rate": 4.254918204178451e-06, + "logits/chosen": -0.5312002897262573, + "logits/rejected": -0.5535575151443481, + "logps/chosen": -55.14820861816406, + "logps/rejected": -109.0650863647461, + "loss": 0.6627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3464221954345703, + "rewards/margins": 6.41420316696167, + "rewards/rejected": -3.0677809715270996, + "step": 10944 + }, + { + "epoch": 2.74, + "grad_norm": 6.591798782348633, + "learning_rate": 4.254141001497196e-06, + "logits/chosen": -0.5481095910072327, + "logits/rejected": -0.6265010237693787, + "logps/chosen": -52.450157165527344, + "logps/rejected": -105.15916442871094, + "loss": 0.6621, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.880326271057129, + "rewards/margins": 5.996763229370117, + "rewards/rejected": -3.1164369583129883, + "step": 10945 + }, + { + "epoch": 2.74, + "grad_norm": 7.479510307312012, + "learning_rate": 4.25336381724691e-06, + "logits/chosen": -0.4490078091621399, + "logits/rejected": -0.5385063886642456, + "logps/chosen": -62.75071716308594, + "logps/rejected": -116.17695617675781, + "loss": 0.7562, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9033660888671875, + "rewards/margins": 7.155487060546875, + "rewards/rejected": -4.2521209716796875, + "step": 10946 + }, + { + "epoch": 2.74, + "grad_norm": 6.312690258026123, + "learning_rate": 4.252586651446798e-06, + "logits/chosen": -0.5720573663711548, + "logits/rejected": -0.6415703296661377, + "logps/chosen": -54.2586669921875, + "logps/rejected": -88.22864532470703, + "loss": 0.7354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.165588140487671, + "rewards/margins": 5.62303352355957, + "rewards/rejected": -2.457444667816162, + "step": 10947 + }, + { + "epoch": 2.74, + "grad_norm": 20.157846450805664, + "learning_rate": 4.251809504116063e-06, + "logits/chosen": -0.5513760447502136, + "logits/rejected": -0.6103942394256592, + "logps/chosen": -56.76609802246094, + "logps/rejected": -93.48414611816406, + "loss": 0.8216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.196820020675659, + "rewards/margins": 5.696487903594971, + "rewards/rejected": -2.4996676445007324, + "step": 10948 + }, + { + "epoch": 2.74, + "grad_norm": 5.3722076416015625, + "learning_rate": 4.251032375273915e-06, + "logits/chosen": -0.5005755424499512, + "logits/rejected": -0.6313199996948242, + "logps/chosen": -63.41217803955078, + "logps/rejected": -108.59734344482422, + "loss": 0.6386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.297445774078369, + "rewards/margins": 7.323596477508545, + "rewards/rejected": -4.026149749755859, + "step": 10949 + }, + { + "epoch": 2.74, + "grad_norm": 7.967662334442139, + "learning_rate": 4.25025526493955e-06, + "logits/chosen": -0.542020320892334, + "logits/rejected": -0.5851682424545288, + "logps/chosen": -52.13338088989258, + "logps/rejected": -114.73184204101562, + "loss": 0.6775, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.64884352684021, + "rewards/margins": 6.002374649047852, + "rewards/rejected": -3.3535308837890625, + "step": 10950 + }, + { + "epoch": 2.74, + "grad_norm": 12.165937423706055, + "learning_rate": 4.249478173132177e-06, + "logits/chosen": -0.5278205275535583, + "logits/rejected": -0.6064996123313904, + "logps/chosen": -50.311912536621094, + "logps/rejected": -93.66027069091797, + "loss": 0.6926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.022097587585449, + "rewards/margins": 6.08614444732666, + "rewards/rejected": -3.06404709815979, + "step": 10951 + }, + { + "epoch": 2.74, + "grad_norm": 15.703665733337402, + "learning_rate": 4.2487010998709976e-06, + "logits/chosen": -0.5521363019943237, + "logits/rejected": -0.6153767108917236, + "logps/chosen": -53.163063049316406, + "logps/rejected": -93.33208465576172, + "loss": 0.7795, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.060296058654785, + "rewards/margins": 5.118984699249268, + "rewards/rejected": -2.0586886405944824, + "step": 10952 + }, + { + "epoch": 2.74, + "grad_norm": 3.799901008605957, + "learning_rate": 4.24792404517521e-06, + "logits/chosen": -0.5017585158348083, + "logits/rejected": -0.5730221271514893, + "logps/chosen": -59.346412658691406, + "logps/rejected": -87.06400299072266, + "loss": 0.7024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9705092906951904, + "rewards/margins": 5.2940874099731445, + "rewards/rejected": -2.3235788345336914, + "step": 10953 + }, + { + "epoch": 2.74, + "grad_norm": 10.274922370910645, + "learning_rate": 4.247147009064021e-06, + "logits/chosen": -0.510539174079895, + "logits/rejected": -0.5795639753341675, + "logps/chosen": -58.423030853271484, + "logps/rejected": -91.69390106201172, + "loss": 0.8219, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1054840087890625, + "rewards/margins": 5.983335018157959, + "rewards/rejected": -2.8778512477874756, + "step": 10954 + }, + { + "epoch": 2.74, + "grad_norm": 6.1550164222717285, + "learning_rate": 4.24636999155663e-06, + "logits/chosen": -0.4451790153980255, + "logits/rejected": -0.5736784934997559, + "logps/chosen": -48.325096130371094, + "logps/rejected": -91.40032196044922, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8431568145751953, + "rewards/margins": 7.282345771789551, + "rewards/rejected": -4.439188480377197, + "step": 10955 + }, + { + "epoch": 2.74, + "grad_norm": 2.467288017272949, + "learning_rate": 4.245592992672238e-06, + "logits/chosen": -0.5018786191940308, + "logits/rejected": -0.6250526309013367, + "logps/chosen": -60.23419189453125, + "logps/rejected": -94.75223541259766, + "loss": 0.5819, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.108682155609131, + "rewards/margins": 7.071905136108398, + "rewards/rejected": -3.9632227420806885, + "step": 10956 + }, + { + "epoch": 2.74, + "grad_norm": 13.010724067687988, + "learning_rate": 4.244816012430046e-06, + "logits/chosen": -0.5513765811920166, + "logits/rejected": -0.6373823881149292, + "logps/chosen": -51.160728454589844, + "logps/rejected": -85.20518493652344, + "loss": 0.7729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9750618934631348, + "rewards/margins": 6.174298286437988, + "rewards/rejected": -3.1992363929748535, + "step": 10957 + }, + { + "epoch": 2.74, + "grad_norm": 4.007813930511475, + "learning_rate": 4.2440390508492525e-06, + "logits/chosen": -0.5352197885513306, + "logits/rejected": -0.6327929496765137, + "logps/chosen": -50.33185958862305, + "logps/rejected": -95.37737274169922, + "loss": 0.664, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.180863857269287, + "rewards/margins": 6.852860450744629, + "rewards/rejected": -3.6719961166381836, + "step": 10958 + }, + { + "epoch": 2.74, + "grad_norm": 6.285894393920898, + "learning_rate": 4.2432621079490585e-06, + "logits/chosen": -0.5457481145858765, + "logits/rejected": -0.6221322417259216, + "logps/chosen": -55.78011703491211, + "logps/rejected": -104.98902893066406, + "loss": 0.6693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9146132469177246, + "rewards/margins": 6.993380546569824, + "rewards/rejected": -4.078767776489258, + "step": 10959 + }, + { + "epoch": 2.74, + "grad_norm": 7.189465522766113, + "learning_rate": 4.242485183748665e-06, + "logits/chosen": -0.49783599376678467, + "logits/rejected": -0.5733615756034851, + "logps/chosen": -62.900821685791016, + "logps/rejected": -108.01541900634766, + "loss": 0.6743, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.332021951675415, + "rewards/margins": 6.607333660125732, + "rewards/rejected": -3.2753117084503174, + "step": 10960 + }, + { + "epoch": 2.74, + "grad_norm": 3.524507999420166, + "learning_rate": 4.241708278267266e-06, + "logits/chosen": -0.5261242389678955, + "logits/rejected": -0.6245666742324829, + "logps/chosen": -53.05154037475586, + "logps/rejected": -91.31065368652344, + "loss": 0.6271, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1170132160186768, + "rewards/margins": 6.459234714508057, + "rewards/rejected": -3.342221975326538, + "step": 10961 + }, + { + "epoch": 2.74, + "grad_norm": 6.827823638916016, + "learning_rate": 4.2409313915240605e-06, + "logits/chosen": -0.5315955877304077, + "logits/rejected": -0.6207181215286255, + "logps/chosen": -50.66112518310547, + "logps/rejected": -90.76959991455078, + "loss": 0.7418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.968768358230591, + "rewards/margins": 5.691783428192139, + "rewards/rejected": -2.7230148315429688, + "step": 10962 + }, + { + "epoch": 2.74, + "grad_norm": 6.408209800720215, + "learning_rate": 4.240154523538252e-06, + "logits/chosen": -0.5708135962486267, + "logits/rejected": -0.6546311378479004, + "logps/chosen": -48.08275604248047, + "logps/rejected": -94.57575225830078, + "loss": 0.6938, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0728914737701416, + "rewards/margins": 6.841087341308594, + "rewards/rejected": -3.768195152282715, + "step": 10963 + }, + { + "epoch": 2.74, + "grad_norm": 6.828334331512451, + "learning_rate": 4.2393776743290305e-06, + "logits/chosen": -0.48012036085128784, + "logits/rejected": -0.5526576638221741, + "logps/chosen": -46.136199951171875, + "logps/rejected": -88.31969451904297, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.040693759918213, + "rewards/margins": 5.146471977233887, + "rewards/rejected": -2.1057779788970947, + "step": 10964 + }, + { + "epoch": 2.74, + "grad_norm": 6.482235431671143, + "learning_rate": 4.2386008439155945e-06, + "logits/chosen": -0.46343857049942017, + "logits/rejected": -0.5334183573722839, + "logps/chosen": -50.80896759033203, + "logps/rejected": -90.00459289550781, + "loss": 0.6192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.340048313140869, + "rewards/margins": 6.382813930511475, + "rewards/rejected": -3.0427656173706055, + "step": 10965 + }, + { + "epoch": 2.74, + "grad_norm": 11.787458419799805, + "learning_rate": 4.237824032317143e-06, + "logits/chosen": -0.5991343259811401, + "logits/rejected": -0.6488984823226929, + "logps/chosen": -48.452178955078125, + "logps/rejected": -87.87163543701172, + "loss": 0.8771, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1189286708831787, + "rewards/margins": 4.583538055419922, + "rewards/rejected": -1.4646095037460327, + "step": 10966 + }, + { + "epoch": 2.74, + "grad_norm": 6.595036506652832, + "learning_rate": 4.237047239552871e-06, + "logits/chosen": -0.5888640880584717, + "logits/rejected": -0.6361675262451172, + "logps/chosen": -49.31431579589844, + "logps/rejected": -124.7448501586914, + "loss": 0.6481, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.277151584625244, + "rewards/margins": 8.760597229003906, + "rewards/rejected": -5.483445644378662, + "step": 10967 + }, + { + "epoch": 2.74, + "grad_norm": 4.649130344390869, + "learning_rate": 4.236270465641973e-06, + "logits/chosen": -0.5670170783996582, + "logits/rejected": -0.6363732814788818, + "logps/chosen": -55.627159118652344, + "logps/rejected": -94.32583618164062, + "loss": 0.6828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0292916297912598, + "rewards/margins": 6.006675720214844, + "rewards/rejected": -2.977383852005005, + "step": 10968 + }, + { + "epoch": 2.74, + "grad_norm": 2.256917953491211, + "learning_rate": 4.235493710603645e-06, + "logits/chosen": -0.4901334345340729, + "logits/rejected": -0.5880675315856934, + "logps/chosen": -60.203773498535156, + "logps/rejected": -98.44989013671875, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2479166984558105, + "rewards/margins": 7.066612720489502, + "rewards/rejected": -3.8186962604522705, + "step": 10969 + }, + { + "epoch": 2.74, + "grad_norm": 5.645654201507568, + "learning_rate": 4.2347169744570796e-06, + "logits/chosen": -0.41066157817840576, + "logits/rejected": -0.5333665609359741, + "logps/chosen": -59.112060546875, + "logps/rejected": -85.44607543945312, + "loss": 0.6634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1978330612182617, + "rewards/margins": 5.510437965393066, + "rewards/rejected": -2.312605142593384, + "step": 10970 + }, + { + "epoch": 2.74, + "grad_norm": 6.3095011711120605, + "learning_rate": 4.233940257221471e-06, + "logits/chosen": -0.4862247109413147, + "logits/rejected": -0.5421447157859802, + "logps/chosen": -58.180728912353516, + "logps/rejected": -107.18399047851562, + "loss": 0.6767, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.180992603302002, + "rewards/margins": 6.0632781982421875, + "rewards/rejected": -2.8822855949401855, + "step": 10971 + }, + { + "epoch": 2.74, + "grad_norm": 12.416630744934082, + "learning_rate": 4.233163558916014e-06, + "logits/chosen": -0.5237582325935364, + "logits/rejected": -0.609792947769165, + "logps/chosen": -53.960975646972656, + "logps/rejected": -85.18285369873047, + "loss": 0.6514, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290149450302124, + "rewards/margins": 5.299903869628906, + "rewards/rejected": -2.009753704071045, + "step": 10972 + }, + { + "epoch": 2.75, + "grad_norm": 10.065473556518555, + "learning_rate": 4.232386879559901e-06, + "logits/chosen": -0.5373045802116394, + "logits/rejected": -0.6571645736694336, + "logps/chosen": -63.04347229003906, + "logps/rejected": -90.93240356445312, + "loss": 0.6627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1767287254333496, + "rewards/margins": 6.3575053215026855, + "rewards/rejected": -3.180776596069336, + "step": 10973 + }, + { + "epoch": 2.75, + "grad_norm": 4.279682636260986, + "learning_rate": 4.231610219172326e-06, + "logits/chosen": -0.6341107487678528, + "logits/rejected": -0.6816616654396057, + "logps/chosen": -43.208248138427734, + "logps/rejected": -117.9738998413086, + "loss": 0.6354, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2362942695617676, + "rewards/margins": 7.437556743621826, + "rewards/rejected": -4.201262950897217, + "step": 10974 + }, + { + "epoch": 2.75, + "grad_norm": 7.83835506439209, + "learning_rate": 4.230833577772478e-06, + "logits/chosen": -0.5294097661972046, + "logits/rejected": -0.5461583137512207, + "logps/chosen": -50.65983581542969, + "logps/rejected": -101.68710327148438, + "loss": 0.7205, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1221351623535156, + "rewards/margins": 5.267696380615234, + "rewards/rejected": -2.1455609798431396, + "step": 10975 + }, + { + "epoch": 2.75, + "grad_norm": 3.0139882564544678, + "learning_rate": 4.230056955379551e-06, + "logits/chosen": -0.6121261119842529, + "logits/rejected": -0.7531118392944336, + "logps/chosen": -44.97467803955078, + "logps/rejected": -81.0650634765625, + "loss": 0.5593, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.812976360321045, + "rewards/margins": 7.368697643280029, + "rewards/rejected": -4.555720806121826, + "step": 10976 + }, + { + "epoch": 2.75, + "grad_norm": 6.31751823425293, + "learning_rate": 4.229280352012737e-06, + "logits/chosen": -0.4903961718082428, + "logits/rejected": -0.6192196011543274, + "logps/chosen": -67.55133819580078, + "logps/rejected": -94.26435852050781, + "loss": 0.7654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0346832275390625, + "rewards/margins": 5.982454776763916, + "rewards/rejected": -2.9477720260620117, + "step": 10977 + }, + { + "epoch": 2.75, + "grad_norm": 6.336513042449951, + "learning_rate": 4.228503767691223e-06, + "logits/chosen": -0.5135058760643005, + "logits/rejected": -0.5537477135658264, + "logps/chosen": -59.81801223754883, + "logps/rejected": -98.70275115966797, + "loss": 0.7036, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0167908668518066, + "rewards/margins": 4.851800918579102, + "rewards/rejected": -1.8350099325180054, + "step": 10978 + }, + { + "epoch": 2.75, + "grad_norm": 6.013741970062256, + "learning_rate": 4.227727202434201e-06, + "logits/chosen": -0.6471843123435974, + "logits/rejected": -0.7045872211456299, + "logps/chosen": -47.73514938354492, + "logps/rejected": -91.57191467285156, + "loss": 0.7802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8643319606781006, + "rewards/margins": 5.170251846313477, + "rewards/rejected": -2.305919885635376, + "step": 10979 + }, + { + "epoch": 2.75, + "grad_norm": 7.5683441162109375, + "learning_rate": 4.226950656260863e-06, + "logits/chosen": -0.5319265127182007, + "logits/rejected": -0.5719045400619507, + "logps/chosen": -51.25510025024414, + "logps/rejected": -97.23758697509766, + "loss": 0.8217, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9614317417144775, + "rewards/margins": 5.6541595458984375, + "rewards/rejected": -2.692727565765381, + "step": 10980 + }, + { + "epoch": 2.75, + "grad_norm": 4.955077171325684, + "learning_rate": 4.226174129190398e-06, + "logits/chosen": -0.517084538936615, + "logits/rejected": -0.553873598575592, + "logps/chosen": -44.44594955444336, + "logps/rejected": -103.6962661743164, + "loss": 0.6335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.720489501953125, + "rewards/margins": 6.79005765914917, + "rewards/rejected": -4.069568634033203, + "step": 10981 + }, + { + "epoch": 2.75, + "grad_norm": 4.895555019378662, + "learning_rate": 4.22539762124199e-06, + "logits/chosen": -0.5726868510246277, + "logits/rejected": -0.620038628578186, + "logps/chosen": -50.93089294433594, + "logps/rejected": -92.76773834228516, + "loss": 0.6957, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.841350555419922, + "rewards/margins": 5.345557689666748, + "rewards/rejected": -2.504207134246826, + "step": 10982 + }, + { + "epoch": 2.75, + "grad_norm": 2.750070810317993, + "learning_rate": 4.224621132434832e-06, + "logits/chosen": -0.5410058498382568, + "logits/rejected": -0.6407613754272461, + "logps/chosen": -53.732051849365234, + "logps/rejected": -92.97544860839844, + "loss": 0.6059, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.521150588989258, + "rewards/margins": 6.3195037841796875, + "rewards/rejected": -2.7983531951904297, + "step": 10983 + }, + { + "epoch": 2.75, + "grad_norm": 3.930297613143921, + "learning_rate": 4.2238446627881105e-06, + "logits/chosen": -0.589531421661377, + "logits/rejected": -0.6488435864448547, + "logps/chosen": -47.187164306640625, + "logps/rejected": -81.81163787841797, + "loss": 0.6645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1428723335266113, + "rewards/margins": 5.360499858856201, + "rewards/rejected": -2.21762752532959, + "step": 10984 + }, + { + "epoch": 2.75, + "grad_norm": 5.4569597244262695, + "learning_rate": 4.223068212321014e-06, + "logits/chosen": -0.4866679310798645, + "logits/rejected": -0.5999857187271118, + "logps/chosen": -54.50060272216797, + "logps/rejected": -90.61076354980469, + "loss": 0.6604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.110821485519409, + "rewards/margins": 6.047119617462158, + "rewards/rejected": -2.93629789352417, + "step": 10985 + }, + { + "epoch": 2.75, + "grad_norm": 6.010521411895752, + "learning_rate": 4.222291781052728e-06, + "logits/chosen": -0.5422700047492981, + "logits/rejected": -0.6458204388618469, + "logps/chosen": -62.16901397705078, + "logps/rejected": -93.28803253173828, + "loss": 0.7251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1860930919647217, + "rewards/margins": 4.614831447601318, + "rewards/rejected": -1.4287382364273071, + "step": 10986 + }, + { + "epoch": 2.75, + "grad_norm": 7.848834037780762, + "learning_rate": 4.221515369002439e-06, + "logits/chosen": -0.43106338381767273, + "logits/rejected": -0.49670666456222534, + "logps/chosen": -53.733543395996094, + "logps/rejected": -88.80181884765625, + "loss": 0.6318, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.110607624053955, + "rewards/margins": 5.890298843383789, + "rewards/rejected": -2.779690742492676, + "step": 10987 + }, + { + "epoch": 2.75, + "grad_norm": 3.96185302734375, + "learning_rate": 4.220738976189334e-06, + "logits/chosen": -0.5213602185249329, + "logits/rejected": -0.5860841274261475, + "logps/chosen": -70.69021606445312, + "logps/rejected": -96.85368347167969, + "loss": 0.7144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8738741874694824, + "rewards/margins": 5.140933036804199, + "rewards/rejected": -2.267058849334717, + "step": 10988 + }, + { + "epoch": 2.75, + "grad_norm": 7.539275646209717, + "learning_rate": 4.219962602632596e-06, + "logits/chosen": -0.5491651892662048, + "logits/rejected": -0.6380183100700378, + "logps/chosen": -51.638919830322266, + "logps/rejected": -102.80326080322266, + "loss": 0.646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0667736530303955, + "rewards/margins": 5.431573390960693, + "rewards/rejected": -2.3647990226745605, + "step": 10989 + }, + { + "epoch": 2.75, + "grad_norm": 5.271752834320068, + "learning_rate": 4.219186248351413e-06, + "logits/chosen": -0.5703839659690857, + "logits/rejected": -0.6574323177337646, + "logps/chosen": -53.62480926513672, + "logps/rejected": -101.70594024658203, + "loss": 0.7966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.656863212585449, + "rewards/margins": 6.353885650634766, + "rewards/rejected": -3.6970226764678955, + "step": 10990 + }, + { + "epoch": 2.75, + "grad_norm": 3.0131943225860596, + "learning_rate": 4.218409913364966e-06, + "logits/chosen": -0.512563943862915, + "logits/rejected": -0.6286836862564087, + "logps/chosen": -50.6529655456543, + "logps/rejected": -97.18403625488281, + "loss": 0.5984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1126608848571777, + "rewards/margins": 6.758460998535156, + "rewards/rejected": -3.6458001136779785, + "step": 10991 + }, + { + "epoch": 2.75, + "grad_norm": 9.24626636505127, + "learning_rate": 4.217633597692446e-06, + "logits/chosen": -0.5295240879058838, + "logits/rejected": -0.6060687899589539, + "logps/chosen": -49.454002380371094, + "logps/rejected": -86.87042999267578, + "loss": 0.8029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.110879898071289, + "rewards/margins": 5.300396919250488, + "rewards/rejected": -2.189517021179199, + "step": 10992 + }, + { + "epoch": 2.75, + "grad_norm": 3.602790594100952, + "learning_rate": 4.216857301353029e-06, + "logits/chosen": -0.5365378260612488, + "logits/rejected": -0.5985228419303894, + "logps/chosen": -53.468505859375, + "logps/rejected": -89.61381530761719, + "loss": 0.6664, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1156435012817383, + "rewards/margins": 6.463124752044678, + "rewards/rejected": -3.347480535507202, + "step": 10993 + }, + { + "epoch": 2.75, + "grad_norm": 5.864558219909668, + "learning_rate": 4.2160810243659e-06, + "logits/chosen": -0.5497851967811584, + "logits/rejected": -0.6373779773712158, + "logps/chosen": -50.904415130615234, + "logps/rejected": -74.48673248291016, + "loss": 0.7161, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8241634368896484, + "rewards/margins": 5.440928936004639, + "rewards/rejected": -2.616765260696411, + "step": 10994 + }, + { + "epoch": 2.75, + "grad_norm": 17.647485733032227, + "learning_rate": 4.2153047667502466e-06, + "logits/chosen": -0.5134900808334351, + "logits/rejected": -0.5503340363502502, + "logps/chosen": -65.8447036743164, + "logps/rejected": -98.39321899414062, + "loss": 1.0387, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.682518720626831, + "rewards/margins": 5.5769124031066895, + "rewards/rejected": -2.8943934440612793, + "step": 10995 + }, + { + "epoch": 2.75, + "grad_norm": 4.084921360015869, + "learning_rate": 4.214528528525243e-06, + "logits/chosen": -0.5226327776908875, + "logits/rejected": -0.6173658967018127, + "logps/chosen": -58.74909210205078, + "logps/rejected": -88.8075180053711, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.291623115539551, + "rewards/margins": 6.250270843505859, + "rewards/rejected": -2.9586479663848877, + "step": 10996 + }, + { + "epoch": 2.75, + "grad_norm": 5.868018627166748, + "learning_rate": 4.213752309710076e-06, + "logits/chosen": -0.5152716636657715, + "logits/rejected": -0.5805076956748962, + "logps/chosen": -58.9766731262207, + "logps/rejected": -104.84693145751953, + "loss": 0.6983, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0324935913085938, + "rewards/margins": 6.201473712921143, + "rewards/rejected": -3.168980121612549, + "step": 10997 + }, + { + "epoch": 2.75, + "grad_norm": 5.405698776245117, + "learning_rate": 4.212976110323926e-06, + "logits/chosen": -0.457903653383255, + "logits/rejected": -0.5379513502120972, + "logps/chosen": -55.01521682739258, + "logps/rejected": -74.94364929199219, + "loss": 0.663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.394258737564087, + "rewards/margins": 5.212693691253662, + "rewards/rejected": -1.8184345960617065, + "step": 10998 + }, + { + "epoch": 2.75, + "grad_norm": 5.632958889007568, + "learning_rate": 4.212199930385975e-06, + "logits/chosen": -0.5355954170227051, + "logits/rejected": -0.6150894165039062, + "logps/chosen": -51.86919021606445, + "logps/rejected": -81.60765075683594, + "loss": 0.7272, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.908245801925659, + "rewards/margins": 5.366196632385254, + "rewards/rejected": -2.4579505920410156, + "step": 10999 + }, + { + "epoch": 2.75, + "grad_norm": 16.125944137573242, + "learning_rate": 4.211423769915399e-06, + "logits/chosen": -0.4772956371307373, + "logits/rejected": -0.5353575944900513, + "logps/chosen": -63.11962127685547, + "logps/rejected": -106.61581420898438, + "loss": 0.6853, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.218984603881836, + "rewards/margins": 6.337249755859375, + "rewards/rejected": -3.118265390396118, + "step": 11000 + }, + { + "epoch": 2.75, + "grad_norm": 4.803861618041992, + "learning_rate": 4.210647628931382e-06, + "logits/chosen": -0.4652997851371765, + "logits/rejected": -0.5405747890472412, + "logps/chosen": -53.193397521972656, + "logps/rejected": -93.52548217773438, + "loss": 0.6267, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.781562328338623, + "rewards/margins": 5.813309192657471, + "rewards/rejected": -3.0317468643188477, + "step": 11001 + }, + { + "epoch": 2.75, + "grad_norm": 7.402839660644531, + "learning_rate": 4.209871507453102e-06, + "logits/chosen": -0.4734313488006592, + "logits/rejected": -0.5662567019462585, + "logps/chosen": -51.12054443359375, + "logps/rejected": -80.4261703491211, + "loss": 0.7481, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8575632572174072, + "rewards/margins": 5.32905912399292, + "rewards/rejected": -2.471496105194092, + "step": 11002 + }, + { + "epoch": 2.75, + "grad_norm": 10.076444625854492, + "learning_rate": 4.209095405499737e-06, + "logits/chosen": -0.4712570309638977, + "logits/rejected": -0.5957778692245483, + "logps/chosen": -64.18931579589844, + "logps/rejected": -107.56016540527344, + "loss": 0.7683, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9213778972625732, + "rewards/margins": 6.488354206085205, + "rewards/rejected": -3.566976308822632, + "step": 11003 + }, + { + "epoch": 2.75, + "grad_norm": 2.9663448333740234, + "learning_rate": 4.208319323090465e-06, + "logits/chosen": -0.5121654272079468, + "logits/rejected": -0.6111778616905212, + "logps/chosen": -55.22343444824219, + "logps/rejected": -95.47438049316406, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1453347206115723, + "rewards/margins": 6.406381130218506, + "rewards/rejected": -3.261047124862671, + "step": 11004 + }, + { + "epoch": 2.75, + "grad_norm": 6.332666397094727, + "learning_rate": 4.2075432602444645e-06, + "logits/chosen": -0.5687296390533447, + "logits/rejected": -0.6123561859130859, + "logps/chosen": -45.429100036621094, + "logps/rejected": -110.20152282714844, + "loss": 0.633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2504935264587402, + "rewards/margins": 6.107583999633789, + "rewards/rejected": -2.857090711593628, + "step": 11005 + }, + { + "epoch": 2.75, + "grad_norm": 5.440766334533691, + "learning_rate": 4.206767216980916e-06, + "logits/chosen": -0.46330973505973816, + "logits/rejected": -0.5682005882263184, + "logps/chosen": -54.994510650634766, + "logps/rejected": -100.91320037841797, + "loss": 0.6135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1580464839935303, + "rewards/margins": 6.732799530029297, + "rewards/rejected": -3.5747532844543457, + "step": 11006 + }, + { + "epoch": 2.75, + "grad_norm": 7.286933898925781, + "learning_rate": 4.20599119331899e-06, + "logits/chosen": -0.547885537147522, + "logits/rejected": -0.5630814433097839, + "logps/chosen": -50.21715545654297, + "logps/rejected": -105.51199340820312, + "loss": 0.7492, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2663841247558594, + "rewards/margins": 6.373632431030273, + "rewards/rejected": -3.1072487831115723, + "step": 11007 + }, + { + "epoch": 2.75, + "grad_norm": 3.928527355194092, + "learning_rate": 4.205215189277866e-06, + "logits/chosen": -0.5208820700645447, + "logits/rejected": -0.5627670884132385, + "logps/chosen": -50.7376708984375, + "logps/rejected": -98.78681945800781, + "loss": 0.6242, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3749332427978516, + "rewards/margins": 6.622925758361816, + "rewards/rejected": -3.247992992401123, + "step": 11008 + }, + { + "epoch": 2.75, + "grad_norm": 10.234885215759277, + "learning_rate": 4.204439204876721e-06, + "logits/chosen": -0.5755884647369385, + "logits/rejected": -0.6037157773971558, + "logps/chosen": -59.124088287353516, + "logps/rejected": -105.97201538085938, + "loss": 0.7915, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9347238540649414, + "rewards/margins": 5.706700801849365, + "rewards/rejected": -2.7719767093658447, + "step": 11009 + }, + { + "epoch": 2.75, + "grad_norm": 5.188915252685547, + "learning_rate": 4.2036632401347305e-06, + "logits/chosen": -0.4893374443054199, + "logits/rejected": -0.6017840504646301, + "logps/chosen": -60.037532806396484, + "logps/rejected": -89.90730285644531, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.239603042602539, + "rewards/margins": 6.4658098220825195, + "rewards/rejected": -3.2262072563171387, + "step": 11010 + }, + { + "epoch": 2.75, + "grad_norm": 7.708982944488525, + "learning_rate": 4.202887295071067e-06, + "logits/chosen": -0.5907539129257202, + "logits/rejected": -0.68532395362854, + "logps/chosen": -57.69865036010742, + "logps/rejected": -99.87863159179688, + "loss": 0.6917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9633865356445312, + "rewards/margins": 6.421990394592285, + "rewards/rejected": -3.4586031436920166, + "step": 11011 + }, + { + "epoch": 2.75, + "grad_norm": 4.312065601348877, + "learning_rate": 4.202111369704907e-06, + "logits/chosen": -0.43129754066467285, + "logits/rejected": -0.5219051241874695, + "logps/chosen": -62.50568389892578, + "logps/rejected": -110.02055358886719, + "loss": 0.5669, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9996449947357178, + "rewards/margins": 7.239373207092285, + "rewards/rejected": -4.2397284507751465, + "step": 11012 + }, + { + "epoch": 2.76, + "grad_norm": 6.8417439460754395, + "learning_rate": 4.201335464055424e-06, + "logits/chosen": -0.566319465637207, + "logits/rejected": -0.6382001042366028, + "logps/chosen": -47.96930694580078, + "logps/rejected": -108.78892517089844, + "loss": 0.5361, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.294929265975952, + "rewards/margins": 7.671463966369629, + "rewards/rejected": -4.376534461975098, + "step": 11013 + }, + { + "epoch": 2.76, + "grad_norm": 9.176589965820312, + "learning_rate": 4.2005595781417905e-06, + "logits/chosen": -0.5504875183105469, + "logits/rejected": -0.6173229217529297, + "logps/chosen": -63.89316177368164, + "logps/rejected": -99.72602844238281, + "loss": 0.7889, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.047776222229004, + "rewards/margins": 6.7375969886779785, + "rewards/rejected": -3.6898207664489746, + "step": 11014 + }, + { + "epoch": 2.76, + "grad_norm": 6.671741485595703, + "learning_rate": 4.19978371198318e-06, + "logits/chosen": -0.5609673261642456, + "logits/rejected": -0.6408202648162842, + "logps/chosen": -56.656272888183594, + "logps/rejected": -103.65284729003906, + "loss": 0.6412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8536956310272217, + "rewards/margins": 6.63824462890625, + "rewards/rejected": -3.7845492362976074, + "step": 11015 + }, + { + "epoch": 2.76, + "grad_norm": 6.928475856781006, + "learning_rate": 4.199007865598765e-06, + "logits/chosen": -0.4866792559623718, + "logits/rejected": -0.5366073250770569, + "logps/chosen": -44.9396858215332, + "logps/rejected": -81.33692932128906, + "loss": 0.7302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3021931648254395, + "rewards/margins": 5.1507887840271, + "rewards/rejected": -1.848595142364502, + "step": 11016 + }, + { + "epoch": 2.76, + "grad_norm": 3.6990504264831543, + "learning_rate": 4.198232039007719e-06, + "logits/chosen": -0.43025749921798706, + "logits/rejected": -0.5503244400024414, + "logps/chosen": -54.48123550415039, + "logps/rejected": -87.58524322509766, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9473581314086914, + "rewards/margins": 5.968109130859375, + "rewards/rejected": -3.020750045776367, + "step": 11017 + }, + { + "epoch": 2.76, + "grad_norm": 12.817197799682617, + "learning_rate": 4.197456232229211e-06, + "logits/chosen": -0.5640833377838135, + "logits/rejected": -0.6418156623840332, + "logps/chosen": -57.49970245361328, + "logps/rejected": -122.26512145996094, + "loss": 0.6104, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.900238513946533, + "rewards/margins": 6.784265518188477, + "rewards/rejected": -3.8840267658233643, + "step": 11018 + }, + { + "epoch": 2.76, + "grad_norm": 2.491910696029663, + "learning_rate": 4.196680445282413e-06, + "logits/chosen": -0.4986223578453064, + "logits/rejected": -0.5997025966644287, + "logps/chosen": -58.383819580078125, + "logps/rejected": -107.38439178466797, + "loss": 0.5596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.978574752807617, + "rewards/margins": 6.846279144287109, + "rewards/rejected": -3.867704153060913, + "step": 11019 + }, + { + "epoch": 2.76, + "grad_norm": 4.601319789886475, + "learning_rate": 4.1959046781864965e-06, + "logits/chosen": -0.5190317630767822, + "logits/rejected": -0.5583429932594299, + "logps/chosen": -55.29897689819336, + "logps/rejected": -118.77926635742188, + "loss": 0.7135, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1334011554718018, + "rewards/margins": 6.183187007904053, + "rewards/rejected": -3.049785614013672, + "step": 11020 + }, + { + "epoch": 2.76, + "grad_norm": 7.317586421966553, + "learning_rate": 4.195128930960631e-06, + "logits/chosen": -0.46489211916923523, + "logits/rejected": -0.5670512914657593, + "logps/chosen": -58.44660949707031, + "logps/rejected": -91.37017822265625, + "loss": 0.7643, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.847010612487793, + "rewards/margins": 5.030431270599365, + "rewards/rejected": -2.1834206581115723, + "step": 11021 + }, + { + "epoch": 2.76, + "grad_norm": 7.704941749572754, + "learning_rate": 4.194353203623983e-06, + "logits/chosen": -0.5735874772071838, + "logits/rejected": -0.6512970924377441, + "logps/chosen": -48.03522491455078, + "logps/rejected": -100.62877655029297, + "loss": 0.5815, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1715877056121826, + "rewards/margins": 6.807806491851807, + "rewards/rejected": -3.636218547821045, + "step": 11022 + }, + { + "epoch": 2.76, + "grad_norm": 2.6583807468414307, + "learning_rate": 4.193577496195725e-06, + "logits/chosen": -0.5109602212905884, + "logits/rejected": -0.5938383936882019, + "logps/chosen": -60.42723083496094, + "logps/rejected": -103.64759826660156, + "loss": 0.6036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.974250078201294, + "rewards/margins": 6.7532124519348145, + "rewards/rejected": -3.7789623737335205, + "step": 11023 + }, + { + "epoch": 2.76, + "grad_norm": 8.871959686279297, + "learning_rate": 4.192801808695028e-06, + "logits/chosen": -0.5493030548095703, + "logits/rejected": -0.6376516819000244, + "logps/chosen": -58.45533752441406, + "logps/rejected": -101.08958435058594, + "loss": 0.6919, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8622167110443115, + "rewards/margins": 7.056540012359619, + "rewards/rejected": -4.1943230628967285, + "step": 11024 + }, + { + "epoch": 2.76, + "grad_norm": 21.01055145263672, + "learning_rate": 4.192026141141054e-06, + "logits/chosen": -0.45478355884552, + "logits/rejected": -0.5486252903938293, + "logps/chosen": -59.62200164794922, + "logps/rejected": -89.95063781738281, + "loss": 0.6841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0356128215789795, + "rewards/margins": 5.936376571655273, + "rewards/rejected": -2.900763511657715, + "step": 11025 + }, + { + "epoch": 2.76, + "grad_norm": 2.9742591381073, + "learning_rate": 4.191250493552974e-06, + "logits/chosen": -0.539514422416687, + "logits/rejected": -0.6448447704315186, + "logps/chosen": -54.05069351196289, + "logps/rejected": -98.20418548583984, + "loss": 0.5931, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.946945905685425, + "rewards/margins": 5.816979885101318, + "rewards/rejected": -2.8700339794158936, + "step": 11026 + }, + { + "epoch": 2.76, + "grad_norm": 2.948486328125, + "learning_rate": 4.190474865949956e-06, + "logits/chosen": -0.48082974553108215, + "logits/rejected": -0.5785634517669678, + "logps/chosen": -63.896240234375, + "logps/rejected": -111.74649047851562, + "loss": 0.5863, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0931594371795654, + "rewards/margins": 6.872724533081055, + "rewards/rejected": -3.7795653343200684, + "step": 11027 + }, + { + "epoch": 2.76, + "grad_norm": 8.109169960021973, + "learning_rate": 4.189699258351163e-06, + "logits/chosen": -0.5372695922851562, + "logits/rejected": -0.6071417331695557, + "logps/chosen": -60.650108337402344, + "logps/rejected": -116.25383758544922, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.082073211669922, + "rewards/margins": 6.41653299331665, + "rewards/rejected": -3.334459066390991, + "step": 11028 + }, + { + "epoch": 2.76, + "grad_norm": 3.3821840286254883, + "learning_rate": 4.188923670775764e-06, + "logits/chosen": -0.5102177262306213, + "logits/rejected": -0.5945369601249695, + "logps/chosen": -65.71878051757812, + "logps/rejected": -99.06334686279297, + "loss": 0.6769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7557806968688965, + "rewards/margins": 6.778237342834473, + "rewards/rejected": -4.022456169128418, + "step": 11029 + }, + { + "epoch": 2.76, + "grad_norm": 10.43210506439209, + "learning_rate": 4.188148103242924e-06, + "logits/chosen": -0.5269229412078857, + "logits/rejected": -0.5760747790336609, + "logps/chosen": -64.56275939941406, + "logps/rejected": -103.76068878173828, + "loss": 0.8201, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8237063884735107, + "rewards/margins": 5.909193515777588, + "rewards/rejected": -3.0854876041412354, + "step": 11030 + }, + { + "epoch": 2.76, + "grad_norm": 3.216841459274292, + "learning_rate": 4.187372555771808e-06, + "logits/chosen": -0.643959641456604, + "logits/rejected": -0.7233928442001343, + "logps/chosen": -50.94398880004883, + "logps/rejected": -113.78101348876953, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1701560020446777, + "rewards/margins": 8.00008773803711, + "rewards/rejected": -4.829931259155273, + "step": 11031 + }, + { + "epoch": 2.76, + "grad_norm": 4.730409622192383, + "learning_rate": 4.18659702838158e-06, + "logits/chosen": -0.41364604234695435, + "logits/rejected": -0.5089757442474365, + "logps/chosen": -56.23829650878906, + "logps/rejected": -98.76592254638672, + "loss": 0.6437, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.266577959060669, + "rewards/margins": 7.2121381759643555, + "rewards/rejected": -3.945561408996582, + "step": 11032 + }, + { + "epoch": 2.76, + "grad_norm": 21.522748947143555, + "learning_rate": 4.185821521091405e-06, + "logits/chosen": -0.5384299755096436, + "logits/rejected": -0.6188896894454956, + "logps/chosen": -54.192230224609375, + "logps/rejected": -87.7303466796875, + "loss": 0.6915, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7120254039764404, + "rewards/margins": 5.250833988189697, + "rewards/rejected": -2.538808822631836, + "step": 11033 + }, + { + "epoch": 2.76, + "grad_norm": 7.792855739593506, + "learning_rate": 4.185046033920445e-06, + "logits/chosen": -0.5683779716491699, + "logits/rejected": -0.6270706653594971, + "logps/chosen": -58.208526611328125, + "logps/rejected": -110.11334228515625, + "loss": 0.6441, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9724180698394775, + "rewards/margins": 7.037500381469727, + "rewards/rejected": -4.06508207321167, + "step": 11034 + }, + { + "epoch": 2.76, + "grad_norm": 18.578487396240234, + "learning_rate": 4.184270566887867e-06, + "logits/chosen": -0.46701550483703613, + "logits/rejected": -0.5790393948554993, + "logps/chosen": -63.30168151855469, + "logps/rejected": -88.48226928710938, + "loss": 0.8407, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.90510892868042, + "rewards/margins": 6.084005355834961, + "rewards/rejected": -3.178896427154541, + "step": 11035 + }, + { + "epoch": 2.76, + "grad_norm": 10.81169605255127, + "learning_rate": 4.183495120012829e-06, + "logits/chosen": -0.4519616961479187, + "logits/rejected": -0.5234280824661255, + "logps/chosen": -64.1590805053711, + "logps/rejected": -92.5046157836914, + "loss": 0.7541, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9232678413391113, + "rewards/margins": 6.146762847900391, + "rewards/rejected": -3.2234950065612793, + "step": 11036 + }, + { + "epoch": 2.76, + "grad_norm": 4.042428016662598, + "learning_rate": 4.182719693314494e-06, + "logits/chosen": -0.5953758358955383, + "logits/rejected": -0.6810888051986694, + "logps/chosen": -47.35554504394531, + "logps/rejected": -96.06369018554688, + "loss": 0.6169, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9213359355926514, + "rewards/margins": 6.714303016662598, + "rewards/rejected": -3.7929670810699463, + "step": 11037 + }, + { + "epoch": 2.76, + "grad_norm": 4.004310131072998, + "learning_rate": 4.181944286812028e-06, + "logits/chosen": -0.5175508856773376, + "logits/rejected": -0.6461141109466553, + "logps/chosen": -61.59250259399414, + "logps/rejected": -101.65977478027344, + "loss": 0.6174, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0601131916046143, + "rewards/margins": 6.2090983390808105, + "rewards/rejected": -3.1489853858947754, + "step": 11038 + }, + { + "epoch": 2.76, + "grad_norm": 4.363428115844727, + "learning_rate": 4.181168900524586e-06, + "logits/chosen": -0.5101826190948486, + "logits/rejected": -0.633263885974884, + "logps/chosen": -68.87328338623047, + "logps/rejected": -100.62698364257812, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3407304286956787, + "rewards/margins": 7.472670555114746, + "rewards/rejected": -4.131939888000488, + "step": 11039 + }, + { + "epoch": 2.76, + "grad_norm": 4.529674530029297, + "learning_rate": 4.180393534471332e-06, + "logits/chosen": -0.5458226203918457, + "logits/rejected": -0.6183333396911621, + "logps/chosen": -46.9692497253418, + "logps/rejected": -110.72979736328125, + "loss": 0.6229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1822240352630615, + "rewards/margins": 7.554532051086426, + "rewards/rejected": -4.372308254241943, + "step": 11040 + }, + { + "epoch": 2.76, + "grad_norm": 16.592437744140625, + "learning_rate": 4.179618188671425e-06, + "logits/chosen": -0.4870246946811676, + "logits/rejected": -0.5712119340896606, + "logps/chosen": -66.24649047851562, + "logps/rejected": -98.16728973388672, + "loss": 0.7908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6356704235076904, + "rewards/margins": 6.412381172180176, + "rewards/rejected": -3.7767112255096436, + "step": 11041 + }, + { + "epoch": 2.76, + "grad_norm": 23.52452278137207, + "learning_rate": 4.178842863144027e-06, + "logits/chosen": -0.5278167724609375, + "logits/rejected": -0.5927442312240601, + "logps/chosen": -59.17264938354492, + "logps/rejected": -105.11094665527344, + "loss": 0.8694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8791794776916504, + "rewards/margins": 6.21675968170166, + "rewards/rejected": -3.3375799655914307, + "step": 11042 + }, + { + "epoch": 2.76, + "grad_norm": 10.378101348876953, + "learning_rate": 4.178067557908294e-06, + "logits/chosen": -0.5960893630981445, + "logits/rejected": -0.6496821641921997, + "logps/chosen": -53.867042541503906, + "logps/rejected": -88.03121948242188, + "loss": 0.7289, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7172536849975586, + "rewards/margins": 6.134451389312744, + "rewards/rejected": -3.4171974658966064, + "step": 11043 + }, + { + "epoch": 2.76, + "grad_norm": 6.723011493682861, + "learning_rate": 4.177292272983386e-06, + "logits/chosen": -0.488640695810318, + "logits/rejected": -0.5684657096862793, + "logps/chosen": -58.65380859375, + "logps/rejected": -87.85517883300781, + "loss": 0.7486, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7608866691589355, + "rewards/margins": 5.114114761352539, + "rewards/rejected": -2.3532285690307617, + "step": 11044 + }, + { + "epoch": 2.76, + "grad_norm": 6.396402359008789, + "learning_rate": 4.1765170083884625e-06, + "logits/chosen": -0.5960091352462769, + "logits/rejected": -0.6449927687644958, + "logps/chosen": -50.172203063964844, + "logps/rejected": -104.171142578125, + "loss": 0.6616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9934563636779785, + "rewards/margins": 6.8405656814575195, + "rewards/rejected": -3.847108840942383, + "step": 11045 + }, + { + "epoch": 2.76, + "grad_norm": 5.81982946395874, + "learning_rate": 4.175741764142678e-06, + "logits/chosen": -0.5263093709945679, + "logits/rejected": -0.5810359120368958, + "logps/chosen": -60.57902526855469, + "logps/rejected": -110.64874267578125, + "loss": 0.6383, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1759450435638428, + "rewards/margins": 6.099842071533203, + "rewards/rejected": -2.9238970279693604, + "step": 11046 + }, + { + "epoch": 2.76, + "grad_norm": 17.090749740600586, + "learning_rate": 4.174966540265191e-06, + "logits/chosen": -0.4901137948036194, + "logits/rejected": -0.5750958323478699, + "logps/chosen": -59.15080261230469, + "logps/rejected": -83.33775329589844, + "loss": 0.8051, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.5896482467651367, + "rewards/margins": 5.0383100509643555, + "rewards/rejected": -2.4486618041992188, + "step": 11047 + }, + { + "epoch": 2.76, + "grad_norm": 5.738124847412109, + "learning_rate": 4.1741913367751585e-06, + "logits/chosen": -0.5545114278793335, + "logits/rejected": -0.6333839893341064, + "logps/chosen": -56.272701263427734, + "logps/rejected": -114.52291107177734, + "loss": 0.5887, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0322582721710205, + "rewards/margins": 6.915765762329102, + "rewards/rejected": -3.883507251739502, + "step": 11048 + }, + { + "epoch": 2.76, + "grad_norm": 3.21127986907959, + "learning_rate": 4.173416153691738e-06, + "logits/chosen": -0.49543508887290955, + "logits/rejected": -0.5707471966743469, + "logps/chosen": -60.70242691040039, + "logps/rejected": -96.9432373046875, + "loss": 0.6472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3970518112182617, + "rewards/margins": 6.375692844390869, + "rewards/rejected": -2.9786415100097656, + "step": 11049 + }, + { + "epoch": 2.76, + "grad_norm": 5.81753396987915, + "learning_rate": 4.172640991034082e-06, + "logits/chosen": -0.4328817129135132, + "logits/rejected": -0.540988564491272, + "logps/chosen": -52.18780517578125, + "logps/rejected": -88.32733154296875, + "loss": 0.68, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.891141891479492, + "rewards/margins": 5.703855991363525, + "rewards/rejected": -2.812713861465454, + "step": 11050 + }, + { + "epoch": 2.76, + "grad_norm": 9.98808765411377, + "learning_rate": 4.171865848821346e-06, + "logits/chosen": -0.5292576551437378, + "logits/rejected": -0.5720406770706177, + "logps/chosen": -56.30524444580078, + "logps/rejected": -99.29579162597656, + "loss": 0.6877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2223105430603027, + "rewards/margins": 6.6556077003479, + "rewards/rejected": -3.4332966804504395, + "step": 11051 + }, + { + "epoch": 2.76, + "grad_norm": 9.208589553833008, + "learning_rate": 4.1710907270726895e-06, + "logits/chosen": -0.4796753227710724, + "logits/rejected": -0.5808889269828796, + "logps/chosen": -57.527435302734375, + "logps/rejected": -108.44557189941406, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0882859230041504, + "rewards/margins": 6.860272407531738, + "rewards/rejected": -3.7719874382019043, + "step": 11052 + }, + { + "epoch": 2.77, + "grad_norm": 16.505386352539062, + "learning_rate": 4.17031562580726e-06, + "logits/chosen": -0.5084335803985596, + "logits/rejected": -0.6064547300338745, + "logps/chosen": -62.79214096069336, + "logps/rejected": -94.8172378540039, + "loss": 0.748, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9029223918914795, + "rewards/margins": 5.6032490730285645, + "rewards/rejected": -2.700326919555664, + "step": 11053 + }, + { + "epoch": 2.77, + "grad_norm": 11.497414588928223, + "learning_rate": 4.169540545044213e-06, + "logits/chosen": -0.4878259599208832, + "logits/rejected": -0.5645732283592224, + "logps/chosen": -48.36064910888672, + "logps/rejected": -101.17894744873047, + "loss": 0.7283, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.070941925048828, + "rewards/margins": 6.896090507507324, + "rewards/rejected": -3.825148105621338, + "step": 11054 + }, + { + "epoch": 2.77, + "grad_norm": 8.179841041564941, + "learning_rate": 4.168765484802703e-06, + "logits/chosen": -0.5627188086509705, + "logits/rejected": -0.6118814945220947, + "logps/chosen": -44.62459945678711, + "logps/rejected": -94.08335876464844, + "loss": 0.7627, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.004084825515747, + "rewards/margins": 5.020913600921631, + "rewards/rejected": -2.0168285369873047, + "step": 11055 + }, + { + "epoch": 2.77, + "grad_norm": 5.192295074462891, + "learning_rate": 4.167990445101883e-06, + "logits/chosen": -0.5068482756614685, + "logits/rejected": -0.585017204284668, + "logps/chosen": -51.793121337890625, + "logps/rejected": -89.1725082397461, + "loss": 0.6237, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0291147232055664, + "rewards/margins": 5.72796106338501, + "rewards/rejected": -2.6988468170166016, + "step": 11056 + }, + { + "epoch": 2.77, + "grad_norm": 3.9683988094329834, + "learning_rate": 4.167215425960905e-06, + "logits/chosen": -0.49643635749816895, + "logits/rejected": -0.6019467115402222, + "logps/chosen": -46.81139373779297, + "logps/rejected": -99.6034164428711, + "loss": 0.5697, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.256523370742798, + "rewards/margins": 6.135316848754883, + "rewards/rejected": -2.878793239593506, + "step": 11057 + }, + { + "epoch": 2.77, + "grad_norm": 4.877581596374512, + "learning_rate": 4.166440427398918e-06, + "logits/chosen": -0.5501148104667664, + "logits/rejected": -0.5961303114891052, + "logps/chosen": -47.051055908203125, + "logps/rejected": -116.36882019042969, + "loss": 0.6047, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.011176586151123, + "rewards/margins": 8.448213577270508, + "rewards/rejected": -5.437037467956543, + "step": 11058 + }, + { + "epoch": 2.77, + "grad_norm": 4.607442378997803, + "learning_rate": 4.165665449435076e-06, + "logits/chosen": -0.5461888909339905, + "logits/rejected": -0.6356882452964783, + "logps/chosen": -53.58008575439453, + "logps/rejected": -90.98456573486328, + "loss": 0.7099, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.299424409866333, + "rewards/margins": 6.528116226196289, + "rewards/rejected": -3.228692054748535, + "step": 11059 + }, + { + "epoch": 2.77, + "grad_norm": 10.573904037475586, + "learning_rate": 4.164890492088527e-06, + "logits/chosen": -0.4858502447605133, + "logits/rejected": -0.5415509343147278, + "logps/chosen": -59.17264175415039, + "logps/rejected": -97.26958465576172, + "loss": 0.7183, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2371108531951904, + "rewards/margins": 6.109686374664307, + "rewards/rejected": -2.8725759983062744, + "step": 11060 + }, + { + "epoch": 2.77, + "grad_norm": 8.212660789489746, + "learning_rate": 4.1641155553784216e-06, + "logits/chosen": -0.45811179280281067, + "logits/rejected": -0.5038154721260071, + "logps/chosen": -63.22309875488281, + "logps/rejected": -111.29666137695312, + "loss": 0.7349, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.048358917236328, + "rewards/margins": 5.853153228759766, + "rewards/rejected": -2.8047943115234375, + "step": 11061 + }, + { + "epoch": 2.77, + "grad_norm": 6.778418064117432, + "learning_rate": 4.1633406393239115e-06, + "logits/chosen": -0.4144529104232788, + "logits/rejected": -0.5723354816436768, + "logps/chosen": -74.64251708984375, + "logps/rejected": -94.81147766113281, + "loss": 0.6807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9061856269836426, + "rewards/margins": 6.803466796875, + "rewards/rejected": -3.897280693054199, + "step": 11062 + }, + { + "epoch": 2.77, + "grad_norm": 14.432004928588867, + "learning_rate": 4.162565743944143e-06, + "logits/chosen": -0.46502944827079773, + "logits/rejected": -0.5396711826324463, + "logps/chosen": -59.89695739746094, + "logps/rejected": -88.39586639404297, + "loss": 0.7442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8757777214050293, + "rewards/margins": 4.928609848022461, + "rewards/rejected": -2.0528318881988525, + "step": 11063 + }, + { + "epoch": 2.77, + "grad_norm": 4.528263568878174, + "learning_rate": 4.161790869258267e-06, + "logits/chosen": -0.499732106924057, + "logits/rejected": -0.5744332671165466, + "logps/chosen": -49.126277923583984, + "logps/rejected": -99.50891876220703, + "loss": 0.5865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8438775539398193, + "rewards/margins": 5.649712085723877, + "rewards/rejected": -2.8058345317840576, + "step": 11064 + }, + { + "epoch": 2.77, + "grad_norm": 4.442355632781982, + "learning_rate": 4.161016015285428e-06, + "logits/chosen": -0.49371567368507385, + "logits/rejected": -0.5866496562957764, + "logps/chosen": -57.1214485168457, + "logps/rejected": -98.53270721435547, + "loss": 0.6377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0324432849884033, + "rewards/margins": 6.432233810424805, + "rewards/rejected": -3.3997902870178223, + "step": 11065 + }, + { + "epoch": 2.77, + "grad_norm": 1.5750969648361206, + "learning_rate": 4.160241182044776e-06, + "logits/chosen": -0.5063789486885071, + "logits/rejected": -0.6298856139183044, + "logps/chosen": -49.98353576660156, + "logps/rejected": -111.62369537353516, + "loss": 0.5067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.900984287261963, + "rewards/margins": 7.948774337768555, + "rewards/rejected": -5.04779052734375, + "step": 11066 + }, + { + "epoch": 2.77, + "grad_norm": 8.468358993530273, + "learning_rate": 4.159466369555461e-06, + "logits/chosen": -0.434037983417511, + "logits/rejected": -0.5800541043281555, + "logps/chosen": -64.45899963378906, + "logps/rejected": -91.30823516845703, + "loss": 0.5816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0407392978668213, + "rewards/margins": 5.913529872894287, + "rewards/rejected": -2.872791290283203, + "step": 11067 + }, + { + "epoch": 2.77, + "grad_norm": 3.248793125152588, + "learning_rate": 4.158691577836622e-06, + "logits/chosen": -0.5360047221183777, + "logits/rejected": -0.6168947219848633, + "logps/chosen": -43.02848815917969, + "logps/rejected": -98.07466888427734, + "loss": 0.5453, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1801974773406982, + "rewards/margins": 7.172008991241455, + "rewards/rejected": -3.991811990737915, + "step": 11068 + }, + { + "epoch": 2.77, + "grad_norm": 3.363860607147217, + "learning_rate": 4.15791680690741e-06, + "logits/chosen": -0.4507485330104828, + "logits/rejected": -0.5088205933570862, + "logps/chosen": -49.932708740234375, + "logps/rejected": -98.18324279785156, + "loss": 0.5555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2691359519958496, + "rewards/margins": 6.293282985687256, + "rewards/rejected": -3.0241470336914062, + "step": 11069 + }, + { + "epoch": 2.77, + "grad_norm": 5.782909393310547, + "learning_rate": 4.157142056786971e-06, + "logits/chosen": -0.4739478528499603, + "logits/rejected": -0.5907841324806213, + "logps/chosen": -51.47553253173828, + "logps/rejected": -97.68562316894531, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0291237831115723, + "rewards/margins": 6.376267910003662, + "rewards/rejected": -3.347144365310669, + "step": 11070 + }, + { + "epoch": 2.77, + "grad_norm": 9.630440711975098, + "learning_rate": 4.156367327494447e-06, + "logits/chosen": -0.5037115216255188, + "logits/rejected": -0.5367443561553955, + "logps/chosen": -46.309173583984375, + "logps/rejected": -97.23721313476562, + "loss": 0.7381, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.105335235595703, + "rewards/margins": 6.065765857696533, + "rewards/rejected": -2.960430145263672, + "step": 11071 + }, + { + "epoch": 2.77, + "grad_norm": 17.016651153564453, + "learning_rate": 4.155592619048984e-06, + "logits/chosen": -0.42370709776878357, + "logits/rejected": -0.5178199410438538, + "logps/chosen": -60.00653839111328, + "logps/rejected": -108.31047058105469, + "loss": 0.7239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8536503314971924, + "rewards/margins": 6.760026931762695, + "rewards/rejected": -3.9063773155212402, + "step": 11072 + }, + { + "epoch": 2.77, + "grad_norm": 7.483956813812256, + "learning_rate": 4.154817931469725e-06, + "logits/chosen": -0.46889257431030273, + "logits/rejected": -0.562541127204895, + "logps/chosen": -59.7371711730957, + "logps/rejected": -105.01481628417969, + "loss": 0.7007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0594935417175293, + "rewards/margins": 6.05214786529541, + "rewards/rejected": -2.99265456199646, + "step": 11073 + }, + { + "epoch": 2.77, + "grad_norm": 9.605316162109375, + "learning_rate": 4.154043264775816e-06, + "logits/chosen": -0.4444780647754669, + "logits/rejected": -0.5846207737922668, + "logps/chosen": -59.434173583984375, + "logps/rejected": -87.41642761230469, + "loss": 0.6358, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1912341117858887, + "rewards/margins": 5.719308853149414, + "rewards/rejected": -2.5280747413635254, + "step": 11074 + }, + { + "epoch": 2.77, + "grad_norm": 5.499424457550049, + "learning_rate": 4.153268618986395e-06, + "logits/chosen": -0.5124608278274536, + "logits/rejected": -0.5850399136543274, + "logps/chosen": -52.25194549560547, + "logps/rejected": -90.94226837158203, + "loss": 0.6603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8370871543884277, + "rewards/margins": 5.287124156951904, + "rewards/rejected": -2.4500367641448975, + "step": 11075 + }, + { + "epoch": 2.77, + "grad_norm": 13.29870891571045, + "learning_rate": 4.152493994120608e-06, + "logits/chosen": -0.5071548223495483, + "logits/rejected": -0.6116772294044495, + "logps/chosen": -61.391685485839844, + "logps/rejected": -92.15304565429688, + "loss": 0.6773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7350850105285645, + "rewards/margins": 5.9322099685668945, + "rewards/rejected": -3.197125196456909, + "step": 11076 + }, + { + "epoch": 2.77, + "grad_norm": 11.227386474609375, + "learning_rate": 4.151719390197597e-06, + "logits/chosen": -0.5425685048103333, + "logits/rejected": -0.6405399441719055, + "logps/chosen": -56.90009307861328, + "logps/rejected": -112.45999908447266, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9405465126037598, + "rewards/margins": 7.092922210693359, + "rewards/rejected": -4.152376174926758, + "step": 11077 + }, + { + "epoch": 2.77, + "grad_norm": 5.555932998657227, + "learning_rate": 4.150944807236501e-06, + "logits/chosen": -0.5213810205459595, + "logits/rejected": -0.6174339056015015, + "logps/chosen": -64.29817199707031, + "logps/rejected": -98.4706802368164, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0458109378814697, + "rewards/margins": 6.628101348876953, + "rewards/rejected": -3.5822901725769043, + "step": 11078 + }, + { + "epoch": 2.77, + "grad_norm": 10.487739562988281, + "learning_rate": 4.150170245256461e-06, + "logits/chosen": -0.4916996955871582, + "logits/rejected": -0.5716513991355896, + "logps/chosen": -40.46549606323242, + "logps/rejected": -96.51466369628906, + "loss": 0.6247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0455000400543213, + "rewards/margins": 6.1688923835754395, + "rewards/rejected": -3.123392105102539, + "step": 11079 + }, + { + "epoch": 2.77, + "grad_norm": 2.068784236907959, + "learning_rate": 4.149395704276618e-06, + "logits/chosen": -0.5493952631950378, + "logits/rejected": -0.6597399711608887, + "logps/chosen": -52.7777099609375, + "logps/rejected": -111.66964721679688, + "loss": 0.5197, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.775324821472168, + "rewards/margins": 8.471426010131836, + "rewards/rejected": -5.696101188659668, + "step": 11080 + }, + { + "epoch": 2.77, + "grad_norm": 11.355070114135742, + "learning_rate": 4.148621184316115e-06, + "logits/chosen": -0.4669415056705475, + "logits/rejected": -0.5356771945953369, + "logps/chosen": -77.5277328491211, + "logps/rejected": -105.39826965332031, + "loss": 0.7425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.944282293319702, + "rewards/margins": 6.467891693115234, + "rewards/rejected": -3.5236096382141113, + "step": 11081 + }, + { + "epoch": 2.77, + "grad_norm": 3.9557788372039795, + "learning_rate": 4.147846685394085e-06, + "logits/chosen": -0.5024189949035645, + "logits/rejected": -0.5708829164505005, + "logps/chosen": -55.572235107421875, + "logps/rejected": -107.8539047241211, + "loss": 0.7265, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.967468500137329, + "rewards/margins": 6.774699687957764, + "rewards/rejected": -3.8072309494018555, + "step": 11082 + }, + { + "epoch": 2.77, + "grad_norm": 26.117618560791016, + "learning_rate": 4.147072207529671e-06, + "logits/chosen": -0.406719833612442, + "logits/rejected": -0.4913097023963928, + "logps/chosen": -57.242103576660156, + "logps/rejected": -107.56298828125, + "loss": 0.6942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.884488582611084, + "rewards/margins": 6.685031414031982, + "rewards/rejected": -3.8005423545837402, + "step": 11083 + }, + { + "epoch": 2.77, + "grad_norm": 21.74510955810547, + "learning_rate": 4.146297750742012e-06, + "logits/chosen": -0.542151153087616, + "logits/rejected": -0.5915747880935669, + "logps/chosen": -58.89592361450195, + "logps/rejected": -107.80258178710938, + "loss": 0.7774, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.968545436859131, + "rewards/margins": 5.602386951446533, + "rewards/rejected": -2.6338415145874023, + "step": 11084 + }, + { + "epoch": 2.77, + "grad_norm": 7.019643306732178, + "learning_rate": 4.1455233150502396e-06, + "logits/chosen": -0.5431769490242004, + "logits/rejected": -0.6388739347457886, + "logps/chosen": -51.24188995361328, + "logps/rejected": -98.38340759277344, + "loss": 0.6823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.212268590927124, + "rewards/margins": 6.717728614807129, + "rewards/rejected": -3.505459785461426, + "step": 11085 + }, + { + "epoch": 2.77, + "grad_norm": 7.171504974365234, + "learning_rate": 4.144748900473497e-06, + "logits/chosen": -0.4812132716178894, + "logits/rejected": -0.5347638726234436, + "logps/chosen": -53.777305603027344, + "logps/rejected": -99.52616119384766, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.300532817840576, + "rewards/margins": 6.23710298538208, + "rewards/rejected": -2.936569929122925, + "step": 11086 + }, + { + "epoch": 2.77, + "grad_norm": 23.51535987854004, + "learning_rate": 4.143974507030919e-06, + "logits/chosen": -0.4802319407463074, + "logits/rejected": -0.5491396188735962, + "logps/chosen": -55.332908630371094, + "logps/rejected": -98.63733673095703, + "loss": 0.6601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.757272720336914, + "rewards/margins": 5.634769916534424, + "rewards/rejected": -2.8774967193603516, + "step": 11087 + }, + { + "epoch": 2.77, + "grad_norm": 4.449436664581299, + "learning_rate": 4.143200134741641e-06, + "logits/chosen": -0.4077621400356293, + "logits/rejected": -0.5199205875396729, + "logps/chosen": -58.97886276245117, + "logps/rejected": -107.82472229003906, + "loss": 0.6176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9645004272460938, + "rewards/margins": 6.250065803527832, + "rewards/rejected": -3.285565137863159, + "step": 11088 + }, + { + "epoch": 2.77, + "grad_norm": 23.42204475402832, + "learning_rate": 4.142425783624799e-06, + "logits/chosen": -0.5854178071022034, + "logits/rejected": -0.6839178800582886, + "logps/chosen": -59.6040153503418, + "logps/rejected": -97.83646392822266, + "loss": 0.773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.61441707611084, + "rewards/margins": 5.4948649406433105, + "rewards/rejected": -2.8804476261138916, + "step": 11089 + }, + { + "epoch": 2.77, + "grad_norm": 5.76539421081543, + "learning_rate": 4.141651453699528e-06, + "logits/chosen": -0.4711669385433197, + "logits/rejected": -0.5408839583396912, + "logps/chosen": -57.355224609375, + "logps/rejected": -107.43189239501953, + "loss": 0.665, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9011974334716797, + "rewards/margins": 5.363284587860107, + "rewards/rejected": -2.462087631225586, + "step": 11090 + }, + { + "epoch": 2.77, + "grad_norm": 9.170683860778809, + "learning_rate": 4.140877144984962e-06, + "logits/chosen": -0.5932524800300598, + "logits/rejected": -0.6152521967887878, + "logps/chosen": -52.60425567626953, + "logps/rejected": -108.06941223144531, + "loss": 0.6891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2520220279693604, + "rewards/margins": 6.108700752258301, + "rewards/rejected": -2.8566787242889404, + "step": 11091 + }, + { + "epoch": 2.77, + "grad_norm": 6.0760884284973145, + "learning_rate": 4.140102857500237e-06, + "logits/chosen": -0.5113750100135803, + "logits/rejected": -0.572985827922821, + "logps/chosen": -58.68059158325195, + "logps/rejected": -118.20840454101562, + "loss": 0.7739, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.779588460922241, + "rewards/margins": 6.787215709686279, + "rewards/rejected": -4.007626533508301, + "step": 11092 + }, + { + "epoch": 2.78, + "grad_norm": 3.582040548324585, + "learning_rate": 4.139328591264484e-06, + "logits/chosen": -0.4745834767818451, + "logits/rejected": -0.5792679786682129, + "logps/chosen": -63.062278747558594, + "logps/rejected": -96.87073516845703, + "loss": 0.696, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0116097927093506, + "rewards/margins": 6.566954612731934, + "rewards/rejected": -3.555344820022583, + "step": 11093 + }, + { + "epoch": 2.78, + "grad_norm": 4.686629295349121, + "learning_rate": 4.138554346296837e-06, + "logits/chosen": -0.5628957152366638, + "logits/rejected": -0.6597787737846375, + "logps/chosen": -57.29944610595703, + "logps/rejected": -102.27864074707031, + "loss": 0.7526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.459472894668579, + "rewards/margins": 6.310055255889893, + "rewards/rejected": -3.8505828380584717, + "step": 11094 + }, + { + "epoch": 2.78, + "grad_norm": 2.364060401916504, + "learning_rate": 4.13778012261643e-06, + "logits/chosen": -0.4471505880355835, + "logits/rejected": -0.5447174310684204, + "logps/chosen": -58.999568939208984, + "logps/rejected": -117.7525405883789, + "loss": 0.5684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.685070753097534, + "rewards/margins": 7.22857141494751, + "rewards/rejected": -4.5435004234313965, + "step": 11095 + }, + { + "epoch": 2.78, + "grad_norm": 10.187644958496094, + "learning_rate": 4.137005920242392e-06, + "logits/chosen": -0.4210736155509949, + "logits/rejected": -0.5207445621490479, + "logps/chosen": -59.400386810302734, + "logps/rejected": -107.06575775146484, + "loss": 0.787, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.148942708969116, + "rewards/margins": 6.40228271484375, + "rewards/rejected": -3.2533397674560547, + "step": 11096 + }, + { + "epoch": 2.78, + "grad_norm": 2.7321314811706543, + "learning_rate": 4.136231739193853e-06, + "logits/chosen": -0.5410060882568359, + "logits/rejected": -0.5904619097709656, + "logps/chosen": -48.902793884277344, + "logps/rejected": -92.53857421875, + "loss": 0.5785, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.03403639793396, + "rewards/margins": 5.596304416656494, + "rewards/rejected": -2.562267303466797, + "step": 11097 + }, + { + "epoch": 2.78, + "grad_norm": 4.483874320983887, + "learning_rate": 4.135457579489949e-06, + "logits/chosen": -0.521074652671814, + "logits/rejected": -0.6257995367050171, + "logps/chosen": -55.832359313964844, + "logps/rejected": -89.1295166015625, + "loss": 0.6106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9612910747528076, + "rewards/margins": 5.467632293701172, + "rewards/rejected": -2.5063412189483643, + "step": 11098 + }, + { + "epoch": 2.78, + "grad_norm": 6.33563756942749, + "learning_rate": 4.134683441149809e-06, + "logits/chosen": -0.5422664880752563, + "logits/rejected": -0.5929919481277466, + "logps/chosen": -54.450286865234375, + "logps/rejected": -110.73291778564453, + "loss": 0.6784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9787983894348145, + "rewards/margins": 6.352260112762451, + "rewards/rejected": -3.3734612464904785, + "step": 11099 + }, + { + "epoch": 2.78, + "grad_norm": 9.686944961547852, + "learning_rate": 4.13390932419256e-06, + "logits/chosen": -0.5100474953651428, + "logits/rejected": -0.5706268548965454, + "logps/chosen": -55.20716857910156, + "logps/rejected": -95.27922058105469, + "loss": 0.6706, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.056504487991333, + "rewards/margins": 5.336982250213623, + "rewards/rejected": -2.280478000640869, + "step": 11100 + }, + { + "epoch": 2.78, + "grad_norm": 4.270945072174072, + "learning_rate": 4.133135228637333e-06, + "logits/chosen": -0.4591456651687622, + "logits/rejected": -0.5887190103530884, + "logps/chosen": -70.85935974121094, + "logps/rejected": -92.09098052978516, + "loss": 0.7279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225902557373047, + "rewards/margins": 5.38701057434082, + "rewards/rejected": -2.1611077785491943, + "step": 11101 + }, + { + "epoch": 2.78, + "grad_norm": 3.130739688873291, + "learning_rate": 4.132361154503257e-06, + "logits/chosen": -0.46849900484085083, + "logits/rejected": -0.5931298732757568, + "logps/chosen": -64.04664611816406, + "logps/rejected": -93.66431427001953, + "loss": 0.7248, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0214099884033203, + "rewards/margins": 6.869523048400879, + "rewards/rejected": -3.8481132984161377, + "step": 11102 + }, + { + "epoch": 2.78, + "grad_norm": 4.669011116027832, + "learning_rate": 4.1315871018094595e-06, + "logits/chosen": -0.4176812767982483, + "logits/rejected": -0.5474209189414978, + "logps/chosen": -66.60008239746094, + "logps/rejected": -104.25675201416016, + "loss": 0.6307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1613621711730957, + "rewards/margins": 5.841126918792725, + "rewards/rejected": -2.679764747619629, + "step": 11103 + }, + { + "epoch": 2.78, + "grad_norm": 3.4168968200683594, + "learning_rate": 4.130813070575068e-06, + "logits/chosen": -0.43181663751602173, + "logits/rejected": -0.5325861573219299, + "logps/chosen": -64.5241470336914, + "logps/rejected": -87.72160339355469, + "loss": 0.6536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1589221954345703, + "rewards/margins": 5.974083423614502, + "rewards/rejected": -2.8151607513427734, + "step": 11104 + }, + { + "epoch": 2.78, + "grad_norm": 3.7580018043518066, + "learning_rate": 4.13003906081921e-06, + "logits/chosen": -0.4833039343357086, + "logits/rejected": -0.6132739186286926, + "logps/chosen": -54.39504623413086, + "logps/rejected": -101.45659637451172, + "loss": 0.6731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.366116762161255, + "rewards/margins": 7.350255966186523, + "rewards/rejected": -3.9841389656066895, + "step": 11105 + }, + { + "epoch": 2.78, + "grad_norm": 4.717896461486816, + "learning_rate": 4.1292650725610136e-06, + "logits/chosen": -0.59906405210495, + "logits/rejected": -0.6968101263046265, + "logps/chosen": -55.43760681152344, + "logps/rejected": -74.68783569335938, + "loss": 0.6267, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2344202995300293, + "rewards/margins": 6.8857197761535645, + "rewards/rejected": -3.6512999534606934, + "step": 11106 + }, + { + "epoch": 2.78, + "grad_norm": 4.780033111572266, + "learning_rate": 4.128491105819602e-06, + "logits/chosen": -0.46308255195617676, + "logits/rejected": -0.5740730166435242, + "logps/chosen": -52.56079864501953, + "logps/rejected": -95.02970886230469, + "loss": 0.6622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.201676607131958, + "rewards/margins": 6.818543910980225, + "rewards/rejected": -3.6168668270111084, + "step": 11107 + }, + { + "epoch": 2.78, + "grad_norm": 6.096056938171387, + "learning_rate": 4.127717160614102e-06, + "logits/chosen": -0.5222018361091614, + "logits/rejected": -0.6419682502746582, + "logps/chosen": -58.031532287597656, + "logps/rejected": -92.06253051757812, + "loss": 0.7016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.966536521911621, + "rewards/margins": 6.9663920402526855, + "rewards/rejected": -3.9998559951782227, + "step": 11108 + }, + { + "epoch": 2.78, + "grad_norm": 5.274704933166504, + "learning_rate": 4.12694323696364e-06, + "logits/chosen": -0.5279766917228699, + "logits/rejected": -0.6092469096183777, + "logps/chosen": -50.80289840698242, + "logps/rejected": -102.5074234008789, + "loss": 0.6129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.963289737701416, + "rewards/margins": 7.174959182739258, + "rewards/rejected": -4.211669445037842, + "step": 11109 + }, + { + "epoch": 2.78, + "grad_norm": 3.959329843521118, + "learning_rate": 4.1261693348873385e-06, + "logits/chosen": -0.5412367582321167, + "logits/rejected": -0.6536731719970703, + "logps/chosen": -56.62013244628906, + "logps/rejected": -102.37244415283203, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8254101276397705, + "rewards/margins": 7.7739644050598145, + "rewards/rejected": -4.948554992675781, + "step": 11110 + }, + { + "epoch": 2.78, + "grad_norm": 10.648965835571289, + "learning_rate": 4.12539545440432e-06, + "logits/chosen": -0.5800411701202393, + "logits/rejected": -0.6811175346374512, + "logps/chosen": -51.31798553466797, + "logps/rejected": -90.46333312988281, + "loss": 0.6544, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858430862426758, + "rewards/margins": 6.547093391418457, + "rewards/rejected": -3.6886627674102783, + "step": 11111 + }, + { + "epoch": 2.78, + "grad_norm": 4.235867023468018, + "learning_rate": 4.124621595533713e-06, + "logits/chosen": -0.5171486735343933, + "logits/rejected": -0.5981313586235046, + "logps/chosen": -49.76564407348633, + "logps/rejected": -103.67359924316406, + "loss": 0.639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.185133457183838, + "rewards/margins": 6.294417858123779, + "rewards/rejected": -3.1092846393585205, + "step": 11112 + }, + { + "epoch": 2.78, + "grad_norm": 3.8783953189849854, + "learning_rate": 4.123847758294638e-06, + "logits/chosen": -0.5925548076629639, + "logits/rejected": -0.6014397144317627, + "logps/chosen": -73.48170471191406, + "logps/rejected": -115.53289794921875, + "loss": 0.6123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.299919843673706, + "rewards/margins": 7.345426082611084, + "rewards/rejected": -4.045506477355957, + "step": 11113 + }, + { + "epoch": 2.78, + "grad_norm": 8.291388511657715, + "learning_rate": 4.123073942706215e-06, + "logits/chosen": -0.5859795212745667, + "logits/rejected": -0.5977159738540649, + "logps/chosen": -49.479339599609375, + "logps/rejected": -119.16653442382812, + "loss": 0.8369, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.009716510772705, + "rewards/margins": 6.655711650848389, + "rewards/rejected": -3.6459949016571045, + "step": 11114 + }, + { + "epoch": 2.78, + "grad_norm": 3.8376097679138184, + "learning_rate": 4.122300148787567e-06, + "logits/chosen": -0.4878191351890564, + "logits/rejected": -0.5801804661750793, + "logps/chosen": -58.74972152709961, + "logps/rejected": -109.54048156738281, + "loss": 0.6178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.113990306854248, + "rewards/margins": 6.670339107513428, + "rewards/rejected": -3.556349277496338, + "step": 11115 + }, + { + "epoch": 2.78, + "grad_norm": 7.6118292808532715, + "learning_rate": 4.121526376557816e-06, + "logits/chosen": -0.46492722630500793, + "logits/rejected": -0.5839927196502686, + "logps/chosen": -48.359893798828125, + "logps/rejected": -95.96931457519531, + "loss": 0.7083, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0330967903137207, + "rewards/margins": 5.465388298034668, + "rewards/rejected": -2.4322915077209473, + "step": 11116 + }, + { + "epoch": 2.78, + "grad_norm": 5.1100335121154785, + "learning_rate": 4.120752626036086e-06, + "logits/chosen": -0.5592697858810425, + "logits/rejected": -0.6349554061889648, + "logps/chosen": -52.0025634765625, + "logps/rejected": -98.27789306640625, + "loss": 0.6248, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.196845293045044, + "rewards/margins": 6.8414435386657715, + "rewards/rejected": -3.644599199295044, + "step": 11117 + }, + { + "epoch": 2.78, + "grad_norm": 4.452618598937988, + "learning_rate": 4.11997889724149e-06, + "logits/chosen": -0.5789121985435486, + "logits/rejected": -0.6015158891677856, + "logps/chosen": -45.353050231933594, + "logps/rejected": -114.49101257324219, + "loss": 0.5692, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.002626419067383, + "rewards/margins": 7.681484222412109, + "rewards/rejected": -4.678857803344727, + "step": 11118 + }, + { + "epoch": 2.78, + "grad_norm": 5.259761810302734, + "learning_rate": 4.119205190193153e-06, + "logits/chosen": -0.47170189023017883, + "logits/rejected": -0.571477472782135, + "logps/chosen": -46.85987091064453, + "logps/rejected": -86.222900390625, + "loss": 0.553, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.127495288848877, + "rewards/margins": 5.950536251068115, + "rewards/rejected": -2.8230414390563965, + "step": 11119 + }, + { + "epoch": 2.78, + "grad_norm": 8.184696197509766, + "learning_rate": 4.118431504910194e-06, + "logits/chosen": -0.4465610980987549, + "logits/rejected": -0.5162089467048645, + "logps/chosen": -47.64236068725586, + "logps/rejected": -103.471923828125, + "loss": 0.6672, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.832815647125244, + "rewards/margins": 6.884993553161621, + "rewards/rejected": -4.052178382873535, + "step": 11120 + }, + { + "epoch": 2.78, + "grad_norm": 3.3214261531829834, + "learning_rate": 4.117657841411728e-06, + "logits/chosen": -0.43106067180633545, + "logits/rejected": -0.47784411907196045, + "logps/chosen": -67.49808502197266, + "logps/rejected": -117.55770874023438, + "loss": 0.639, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.141427516937256, + "rewards/margins": 6.438973426818848, + "rewards/rejected": -3.297546148300171, + "step": 11121 + }, + { + "epoch": 2.78, + "grad_norm": 13.020646095275879, + "learning_rate": 4.116884199716876e-06, + "logits/chosen": -0.5027565956115723, + "logits/rejected": -0.5635254383087158, + "logps/chosen": -58.42842102050781, + "logps/rejected": -85.74305725097656, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.961728096008301, + "rewards/margins": 4.81652307510376, + "rewards/rejected": -1.854795217514038, + "step": 11122 + }, + { + "epoch": 2.78, + "grad_norm": 3.4308342933654785, + "learning_rate": 4.116110579844753e-06, + "logits/chosen": -0.41619884967803955, + "logits/rejected": -0.5410589575767517, + "logps/chosen": -58.31882095336914, + "logps/rejected": -85.46468353271484, + "loss": 0.5654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.041832447052002, + "rewards/margins": 6.349577903747559, + "rewards/rejected": -3.3077456951141357, + "step": 11123 + }, + { + "epoch": 2.78, + "grad_norm": 14.28387451171875, + "learning_rate": 4.115336981814481e-06, + "logits/chosen": -0.5113462209701538, + "logits/rejected": -0.5728219151496887, + "logps/chosen": -58.6179084777832, + "logps/rejected": -96.6275863647461, + "loss": 0.5997, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.794680595397949, + "rewards/margins": 6.108381748199463, + "rewards/rejected": -3.313701629638672, + "step": 11124 + }, + { + "epoch": 2.78, + "grad_norm": 17.1967830657959, + "learning_rate": 4.11456340564517e-06, + "logits/chosen": -0.5375673770904541, + "logits/rejected": -0.623175323009491, + "logps/chosen": -57.93436813354492, + "logps/rejected": -112.31411743164062, + "loss": 0.6787, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5673890113830566, + "rewards/margins": 7.384678840637207, + "rewards/rejected": -4.817289352416992, + "step": 11125 + }, + { + "epoch": 2.78, + "grad_norm": 20.60387420654297, + "learning_rate": 4.113789851355941e-06, + "logits/chosen": -0.5143879652023315, + "logits/rejected": -0.5981829166412354, + "logps/chosen": -54.80766296386719, + "logps/rejected": -104.26762390136719, + "loss": 0.6228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8705310821533203, + "rewards/margins": 6.380685806274414, + "rewards/rejected": -3.510154962539673, + "step": 11126 + }, + { + "epoch": 2.78, + "grad_norm": 8.344696044921875, + "learning_rate": 4.113016318965909e-06, + "logits/chosen": -0.48708969354629517, + "logits/rejected": -0.5205847024917603, + "logps/chosen": -56.28569412231445, + "logps/rejected": -105.43570709228516, + "loss": 0.7543, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.059260129928589, + "rewards/margins": 6.837632656097412, + "rewards/rejected": -3.7783727645874023, + "step": 11127 + }, + { + "epoch": 2.78, + "grad_norm": 15.80705451965332, + "learning_rate": 4.112242808494183e-06, + "logits/chosen": -0.5329762697219849, + "logits/rejected": -0.5937716960906982, + "logps/chosen": -49.684505462646484, + "logps/rejected": -105.85971069335938, + "loss": 0.7666, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.06211256980896, + "rewards/margins": 6.520689964294434, + "rewards/rejected": -3.4585771560668945, + "step": 11128 + }, + { + "epoch": 2.78, + "grad_norm": 6.6699090003967285, + "learning_rate": 4.1114693199598846e-06, + "logits/chosen": -0.5184147357940674, + "logits/rejected": -0.5527390241622925, + "logps/chosen": -52.101348876953125, + "logps/rejected": -112.73489379882812, + "loss": 0.6188, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.819950580596924, + "rewards/margins": 6.455496788024902, + "rewards/rejected": -3.6355462074279785, + "step": 11129 + }, + { + "epoch": 2.78, + "grad_norm": 9.300077438354492, + "learning_rate": 4.110695853382123e-06, + "logits/chosen": -0.515203595161438, + "logits/rejected": -0.5873306393623352, + "logps/chosen": -51.201820373535156, + "logps/rejected": -92.45018005371094, + "loss": 0.7534, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6648292541503906, + "rewards/margins": 4.963597774505615, + "rewards/rejected": -2.2987685203552246, + "step": 11130 + }, + { + "epoch": 2.78, + "grad_norm": 7.656260967254639, + "learning_rate": 4.109922408780014e-06, + "logits/chosen": -0.47881484031677246, + "logits/rejected": -0.55047208070755, + "logps/chosen": -57.88142776489258, + "logps/rejected": -98.27161407470703, + "loss": 0.7761, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0410051345825195, + "rewards/margins": 6.050924777984619, + "rewards/rejected": -3.0099194049835205, + "step": 11131 + }, + { + "epoch": 2.78, + "grad_norm": 2.2581138610839844, + "learning_rate": 4.1091489861726675e-06, + "logits/chosen": -0.5275484323501587, + "logits/rejected": -0.6442018747329712, + "logps/chosen": -56.81208801269531, + "logps/rejected": -94.28934478759766, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3624606132507324, + "rewards/margins": 7.332866668701172, + "rewards/rejected": -3.9704055786132812, + "step": 11132 + }, + { + "epoch": 2.79, + "grad_norm": 16.98233413696289, + "learning_rate": 4.108375585579198e-06, + "logits/chosen": -0.45894545316696167, + "logits/rejected": -0.5071954727172852, + "logps/chosen": -48.460262298583984, + "logps/rejected": -93.44049835205078, + "loss": 0.7428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.877350330352783, + "rewards/margins": 5.323747158050537, + "rewards/rejected": -2.446397304534912, + "step": 11133 + }, + { + "epoch": 2.79, + "grad_norm": 13.783637046813965, + "learning_rate": 4.107602207018716e-06, + "logits/chosen": -0.439412385225296, + "logits/rejected": -0.4938145875930786, + "logps/chosen": -58.24817657470703, + "logps/rejected": -103.0683364868164, + "loss": 0.7736, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.875499725341797, + "rewards/margins": 6.00940465927124, + "rewards/rejected": -3.1339049339294434, + "step": 11134 + }, + { + "epoch": 2.79, + "grad_norm": 4.61620569229126, + "learning_rate": 4.106828850510333e-06, + "logits/chosen": -0.46092334389686584, + "logits/rejected": -0.5037460923194885, + "logps/chosen": -59.236785888671875, + "logps/rejected": -116.75572967529297, + "loss": 0.5809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.070091485977173, + "rewards/margins": 6.842260360717773, + "rewards/rejected": -3.7721686363220215, + "step": 11135 + }, + { + "epoch": 2.79, + "grad_norm": 4.974923610687256, + "learning_rate": 4.106055516073159e-06, + "logits/chosen": -0.536469578742981, + "logits/rejected": -0.614006757736206, + "logps/chosen": -57.19219207763672, + "logps/rejected": -115.32594299316406, + "loss": 0.6058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7722089290618896, + "rewards/margins": 8.09062671661377, + "rewards/rejected": -5.318417549133301, + "step": 11136 + }, + { + "epoch": 2.79, + "grad_norm": 8.646710395812988, + "learning_rate": 4.105282203726302e-06, + "logits/chosen": -0.49604812264442444, + "logits/rejected": -0.6110577583312988, + "logps/chosen": -66.87901306152344, + "logps/rejected": -124.57485961914062, + "loss": 0.7748, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1728057861328125, + "rewards/margins": 7.3415207862854, + "rewards/rejected": -4.168715000152588, + "step": 11137 + }, + { + "epoch": 2.79, + "grad_norm": 7.151507377624512, + "learning_rate": 4.104508913488877e-06, + "logits/chosen": -0.4670613408088684, + "logits/rejected": -0.483148992061615, + "logps/chosen": -51.97343826293945, + "logps/rejected": -113.59222412109375, + "loss": 0.6362, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1639294624328613, + "rewards/margins": 5.755232334136963, + "rewards/rejected": -2.5913028717041016, + "step": 11138 + }, + { + "epoch": 2.79, + "grad_norm": 3.603778839111328, + "learning_rate": 4.103735645379987e-06, + "logits/chosen": -0.6080885529518127, + "logits/rejected": -0.6766279935836792, + "logps/chosen": -47.01810836791992, + "logps/rejected": -90.98612213134766, + "loss": 0.551, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.326690196990967, + "rewards/margins": 6.550500869750977, + "rewards/rejected": -3.2238106727600098, + "step": 11139 + }, + { + "epoch": 2.79, + "grad_norm": 18.897014617919922, + "learning_rate": 4.102962399418742e-06, + "logits/chosen": -0.49262362718582153, + "logits/rejected": -0.5427353382110596, + "logps/chosen": -55.97290802001953, + "logps/rejected": -110.95846557617188, + "loss": 0.7822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011995315551758, + "rewards/margins": 6.087711334228516, + "rewards/rejected": -3.075716495513916, + "step": 11140 + }, + { + "epoch": 2.79, + "grad_norm": 2.4145121574401855, + "learning_rate": 4.10218917562425e-06, + "logits/chosen": -0.4354158639907837, + "logits/rejected": -0.5208746790885925, + "logps/chosen": -67.10820770263672, + "logps/rejected": -115.94351196289062, + "loss": 0.6711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3503592014312744, + "rewards/margins": 7.298815727233887, + "rewards/rejected": -3.948456048965454, + "step": 11141 + }, + { + "epoch": 2.79, + "grad_norm": 10.046238899230957, + "learning_rate": 4.101415974015621e-06, + "logits/chosen": -0.4488219618797302, + "logits/rejected": -0.5189083218574524, + "logps/chosen": -57.42559051513672, + "logps/rejected": -100.77996063232422, + "loss": 0.6795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5427048206329346, + "rewards/margins": 6.10559606552124, + "rewards/rejected": -3.562891960144043, + "step": 11142 + }, + { + "epoch": 2.79, + "grad_norm": 6.7597174644470215, + "learning_rate": 4.100642794611958e-06, + "logits/chosen": -0.4531170427799225, + "logits/rejected": -0.5507242679595947, + "logps/chosen": -62.63016891479492, + "logps/rejected": -90.01667785644531, + "loss": 0.6935, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.25724720954895, + "rewards/margins": 6.045741081237793, + "rewards/rejected": -2.788494110107422, + "step": 11143 + }, + { + "epoch": 2.79, + "grad_norm": 4.12955904006958, + "learning_rate": 4.099869637432367e-06, + "logits/chosen": -0.5953879952430725, + "logits/rejected": -0.7290332317352295, + "logps/chosen": -58.62665939331055, + "logps/rejected": -113.18357849121094, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.112210988998413, + "rewards/margins": 8.568814277648926, + "rewards/rejected": -5.456603050231934, + "step": 11144 + }, + { + "epoch": 2.79, + "grad_norm": 6.781009674072266, + "learning_rate": 4.099096502495957e-06, + "logits/chosen": -0.4884459376335144, + "logits/rejected": -0.6296952366828918, + "logps/chosen": -61.07535934448242, + "logps/rejected": -95.66427612304688, + "loss": 0.6757, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.632023334503174, + "rewards/margins": 7.117955207824707, + "rewards/rejected": -4.485932350158691, + "step": 11145 + }, + { + "epoch": 2.79, + "grad_norm": 3.292982578277588, + "learning_rate": 4.098323389821828e-06, + "logits/chosen": -0.4889453053474426, + "logits/rejected": -0.6009932160377502, + "logps/chosen": -52.13435363769531, + "logps/rejected": -102.77667236328125, + "loss": 0.5619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0205559730529785, + "rewards/margins": 6.934731483459473, + "rewards/rejected": -3.914175033569336, + "step": 11146 + }, + { + "epoch": 2.79, + "grad_norm": 2.7403783798217773, + "learning_rate": 4.097550299429088e-06, + "logits/chosen": -0.5744817852973938, + "logits/rejected": -0.6612153053283691, + "logps/chosen": -51.834651947021484, + "logps/rejected": -97.32662963867188, + "loss": 0.5823, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.266550064086914, + "rewards/margins": 6.767218112945557, + "rewards/rejected": -3.5006675720214844, + "step": 11147 + }, + { + "epoch": 2.79, + "grad_norm": 3.53098726272583, + "learning_rate": 4.09677723133684e-06, + "logits/chosen": -0.5080904364585876, + "logits/rejected": -0.5992340445518494, + "logps/chosen": -58.78358459472656, + "logps/rejected": -106.35293579101562, + "loss": 0.6353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2943851947784424, + "rewards/margins": 7.587930679321289, + "rewards/rejected": -4.293545722961426, + "step": 11148 + }, + { + "epoch": 2.79, + "grad_norm": 6.575754642486572, + "learning_rate": 4.096004185564189e-06, + "logits/chosen": -0.5182504653930664, + "logits/rejected": -0.6026648283004761, + "logps/chosen": -62.567176818847656, + "logps/rejected": -108.10459899902344, + "loss": 0.6636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9313056468963623, + "rewards/margins": 6.739116668701172, + "rewards/rejected": -3.8078110218048096, + "step": 11149 + }, + { + "epoch": 2.79, + "grad_norm": 3.7065815925598145, + "learning_rate": 4.095231162130234e-06, + "logits/chosen": -0.5119743943214417, + "logits/rejected": -0.5655819177627563, + "logps/chosen": -40.09557342529297, + "logps/rejected": -98.59568786621094, + "loss": 0.5472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049377202987671, + "rewards/margins": 6.893134117126465, + "rewards/rejected": -3.843756675720215, + "step": 11150 + }, + { + "epoch": 2.79, + "grad_norm": 7.697305202484131, + "learning_rate": 4.09445816105408e-06, + "logits/chosen": -0.51768958568573, + "logits/rejected": -0.6299737691879272, + "logps/chosen": -54.34421157836914, + "logps/rejected": -112.74691772460938, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0049381256103516, + "rewards/margins": 7.79006814956665, + "rewards/rejected": -4.785130023956299, + "step": 11151 + }, + { + "epoch": 2.79, + "grad_norm": 9.283730506896973, + "learning_rate": 4.093685182354829e-06, + "logits/chosen": -0.5331640243530273, + "logits/rejected": -0.6233721971511841, + "logps/chosen": -51.00019836425781, + "logps/rejected": -100.54971313476562, + "loss": 0.6061, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.094503879547119, + "rewards/margins": 7.26790714263916, + "rewards/rejected": -4.173403739929199, + "step": 11152 + }, + { + "epoch": 2.79, + "grad_norm": 5.405277729034424, + "learning_rate": 4.0929122260515796e-06, + "logits/chosen": -0.5083635449409485, + "logits/rejected": -0.5471572279930115, + "logps/chosen": -46.07538986206055, + "logps/rejected": -101.21870422363281, + "loss": 0.6413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9427578449249268, + "rewards/margins": 6.531031608581543, + "rewards/rejected": -3.588273763656616, + "step": 11153 + }, + { + "epoch": 2.79, + "grad_norm": 10.437742233276367, + "learning_rate": 4.092139292163433e-06, + "logits/chosen": -0.5167412161827087, + "logits/rejected": -0.5708963871002197, + "logps/chosen": -53.89143753051758, + "logps/rejected": -105.2246322631836, + "loss": 0.6052, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2755441665649414, + "rewards/margins": 6.600505828857422, + "rewards/rejected": -3.3249621391296387, + "step": 11154 + }, + { + "epoch": 2.79, + "grad_norm": 8.623710632324219, + "learning_rate": 4.091366380709492e-06, + "logits/chosen": -0.5474230051040649, + "logits/rejected": -0.6438323259353638, + "logps/chosen": -45.267478942871094, + "logps/rejected": -115.00701904296875, + "loss": 0.5464, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5680272579193115, + "rewards/margins": 8.326948165893555, + "rewards/rejected": -5.758920669555664, + "step": 11155 + }, + { + "epoch": 2.79, + "grad_norm": 4.659050464630127, + "learning_rate": 4.090593491708856e-06, + "logits/chosen": -0.48993736505508423, + "logits/rejected": -0.5525580644607544, + "logps/chosen": -52.095542907714844, + "logps/rejected": -107.95584106445312, + "loss": 0.6066, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0781755447387695, + "rewards/margins": 7.1711812019348145, + "rewards/rejected": -4.093005180358887, + "step": 11156 + }, + { + "epoch": 2.79, + "grad_norm": 6.32490348815918, + "learning_rate": 4.089820625180618e-06, + "logits/chosen": -0.4791909456253052, + "logits/rejected": -0.5798476934432983, + "logps/chosen": -49.73307800292969, + "logps/rejected": -93.87960052490234, + "loss": 0.6806, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.930140733718872, + "rewards/margins": 6.4650163650512695, + "rewards/rejected": -3.5348756313323975, + "step": 11157 + }, + { + "epoch": 2.79, + "grad_norm": 4.574207782745361, + "learning_rate": 4.089047781143882e-06, + "logits/chosen": -0.41735148429870605, + "logits/rejected": -0.5337307453155518, + "logps/chosen": -58.406253814697266, + "logps/rejected": -106.63783264160156, + "loss": 0.5553, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8830912113189697, + "rewards/margins": 7.330849647521973, + "rewards/rejected": -4.447757720947266, + "step": 11158 + }, + { + "epoch": 2.79, + "grad_norm": 3.675997018814087, + "learning_rate": 4.088274959617746e-06, + "logits/chosen": -0.4965059161186218, + "logits/rejected": -0.5622019171714783, + "logps/chosen": -59.53227233886719, + "logps/rejected": -101.41448211669922, + "loss": 0.6627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.834054470062256, + "rewards/margins": 6.305123805999756, + "rewards/rejected": -3.471069097518921, + "step": 11159 + }, + { + "epoch": 2.79, + "grad_norm": 4.187294960021973, + "learning_rate": 4.087502160621304e-06, + "logits/chosen": -0.4523750841617584, + "logits/rejected": -0.5670652389526367, + "logps/chosen": -44.233177185058594, + "logps/rejected": -80.80400848388672, + "loss": 0.5452, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.179664134979248, + "rewards/margins": 6.2027435302734375, + "rewards/rejected": -3.0230796337127686, + "step": 11160 + }, + { + "epoch": 2.79, + "grad_norm": 4.940463066101074, + "learning_rate": 4.0867293841736545e-06, + "logits/chosen": -0.5850878357887268, + "logits/rejected": -0.6255214810371399, + "logps/chosen": -46.76692199707031, + "logps/rejected": -111.90556335449219, + "loss": 0.5415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8323705196380615, + "rewards/margins": 7.3701677322387695, + "rewards/rejected": -4.537797927856445, + "step": 11161 + }, + { + "epoch": 2.79, + "grad_norm": 10.373186111450195, + "learning_rate": 4.085956630293892e-06, + "logits/chosen": -0.5221218466758728, + "logits/rejected": -0.5861181616783142, + "logps/chosen": -53.986907958984375, + "logps/rejected": -103.66778564453125, + "loss": 0.8013, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7930796146392822, + "rewards/margins": 6.638776779174805, + "rewards/rejected": -3.8456974029541016, + "step": 11162 + }, + { + "epoch": 2.79, + "grad_norm": 8.02280044555664, + "learning_rate": 4.085183899001116e-06, + "logits/chosen": -0.47769245505332947, + "logits/rejected": -0.5916550159454346, + "logps/chosen": -75.56571960449219, + "logps/rejected": -92.45684051513672, + "loss": 0.6822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0014986991882324, + "rewards/margins": 6.80971097946167, + "rewards/rejected": -3.8082122802734375, + "step": 11163 + }, + { + "epoch": 2.79, + "grad_norm": 5.5327863693237305, + "learning_rate": 4.0844111903144165e-06, + "logits/chosen": -0.5691630840301514, + "logits/rejected": -0.6790077090263367, + "logps/chosen": -54.331417083740234, + "logps/rejected": -89.47213745117188, + "loss": 0.7136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9873671531677246, + "rewards/margins": 6.601531505584717, + "rewards/rejected": -3.6141648292541504, + "step": 11164 + }, + { + "epoch": 2.79, + "grad_norm": 6.222782611846924, + "learning_rate": 4.083638504252891e-06, + "logits/chosen": -0.5382037162780762, + "logits/rejected": -0.6192041039466858, + "logps/chosen": -54.06629943847656, + "logps/rejected": -95.0136489868164, + "loss": 0.6963, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1777217388153076, + "rewards/margins": 6.441116809844971, + "rewards/rejected": -3.263395071029663, + "step": 11165 + }, + { + "epoch": 2.79, + "grad_norm": 6.0793633460998535, + "learning_rate": 4.0828658408356345e-06, + "logits/chosen": -0.4275479018688202, + "logits/rejected": -0.5404530763626099, + "logps/chosen": -58.26261901855469, + "logps/rejected": -85.31221008300781, + "loss": 0.6332, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9813365936279297, + "rewards/margins": 5.6606669425964355, + "rewards/rejected": -2.679330587387085, + "step": 11166 + }, + { + "epoch": 2.79, + "grad_norm": 8.221367835998535, + "learning_rate": 4.082093200081737e-06, + "logits/chosen": -0.5155853629112244, + "logits/rejected": -0.6054448485374451, + "logps/chosen": -55.60167694091797, + "logps/rejected": -85.79917907714844, + "loss": 0.6861, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3491926193237305, + "rewards/margins": 6.906670570373535, + "rewards/rejected": -3.5574774742126465, + "step": 11167 + }, + { + "epoch": 2.79, + "grad_norm": 2.808053493499756, + "learning_rate": 4.081320582010292e-06, + "logits/chosen": -0.5960158109664917, + "logits/rejected": -0.6670645475387573, + "logps/chosen": -57.42201614379883, + "logps/rejected": -104.13745880126953, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161608934402466, + "rewards/margins": 7.3381195068359375, + "rewards/rejected": -4.176511764526367, + "step": 11168 + }, + { + "epoch": 2.79, + "grad_norm": 5.9453277587890625, + "learning_rate": 4.080547986640393e-06, + "logits/chosen": -0.5013806223869324, + "logits/rejected": -0.6039576530456543, + "logps/chosen": -58.510353088378906, + "logps/rejected": -77.5291519165039, + "loss": 0.7494, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8863017559051514, + "rewards/margins": 5.774312973022461, + "rewards/rejected": -2.8880109786987305, + "step": 11169 + }, + { + "epoch": 2.79, + "grad_norm": 11.742086410522461, + "learning_rate": 4.0797754139911335e-06, + "logits/chosen": -0.5098124742507935, + "logits/rejected": -0.5609795451164246, + "logps/chosen": -54.206260681152344, + "logps/rejected": -97.4807357788086, + "loss": 0.7555, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7467894554138184, + "rewards/margins": 5.966028690338135, + "rewards/rejected": -3.2192394733428955, + "step": 11170 + }, + { + "epoch": 2.79, + "grad_norm": 8.660504341125488, + "learning_rate": 4.079002864081599e-06, + "logits/chosen": -0.47200340032577515, + "logits/rejected": -0.562158465385437, + "logps/chosen": -50.91253662109375, + "logps/rejected": -94.93655395507812, + "loss": 0.6174, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3604390621185303, + "rewards/margins": 7.669981002807617, + "rewards/rejected": -4.309541702270508, + "step": 11171 + }, + { + "epoch": 2.79, + "grad_norm": 6.559352397918701, + "learning_rate": 4.078230336930885e-06, + "logits/chosen": -0.43533754348754883, + "logits/rejected": -0.5397466421127319, + "logps/chosen": -59.41032409667969, + "logps/rejected": -105.21692657470703, + "loss": 0.6651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9969325065612793, + "rewards/margins": 6.915074348449707, + "rewards/rejected": -3.9181416034698486, + "step": 11172 + }, + { + "epoch": 2.8, + "grad_norm": 30.333454132080078, + "learning_rate": 4.077457832558079e-06, + "logits/chosen": -0.5336649417877197, + "logits/rejected": -0.6104767322540283, + "logps/chosen": -64.87435150146484, + "logps/rejected": -89.8807373046875, + "loss": 0.8946, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.073045492172241, + "rewards/margins": 5.854304790496826, + "rewards/rejected": -2.781259059906006, + "step": 11173 + }, + { + "epoch": 2.8, + "grad_norm": 12.59691333770752, + "learning_rate": 4.0766853509822735e-06, + "logits/chosen": -0.4893931448459625, + "logits/rejected": -0.5247519016265869, + "logps/chosen": -58.84177780151367, + "logps/rejected": -110.65038299560547, + "loss": 0.805, + "rewards/accuracies": 0.78125, + "rewards/chosen": 2.924656391143799, + "rewards/margins": 5.2291741371154785, + "rewards/rejected": -2.3045177459716797, + "step": 11174 + }, + { + "epoch": 2.8, + "grad_norm": 4.743666648864746, + "learning_rate": 4.075912892222554e-06, + "logits/chosen": -0.5684835910797119, + "logits/rejected": -0.6871280670166016, + "logps/chosen": -54.69083023071289, + "logps/rejected": -90.9616928100586, + "loss": 0.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3597004413604736, + "rewards/margins": 7.159573554992676, + "rewards/rejected": -3.799872875213623, + "step": 11175 + }, + { + "epoch": 2.8, + "grad_norm": 2.2298779487609863, + "learning_rate": 4.075140456298008e-06, + "logits/chosen": -0.5561189651489258, + "logits/rejected": -0.6063405275344849, + "logps/chosen": -52.50807189941406, + "logps/rejected": -112.80038452148438, + "loss": 0.6186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0109665393829346, + "rewards/margins": 7.383754730224609, + "rewards/rejected": -4.372787952423096, + "step": 11176 + }, + { + "epoch": 2.8, + "grad_norm": 4.754886150360107, + "learning_rate": 4.074368043227728e-06, + "logits/chosen": -0.5707845687866211, + "logits/rejected": -0.6324701309204102, + "logps/chosen": -57.78571319580078, + "logps/rejected": -102.956787109375, + "loss": 0.7027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0779311656951904, + "rewards/margins": 5.755924224853516, + "rewards/rejected": -2.6779932975769043, + "step": 11177 + }, + { + "epoch": 2.8, + "grad_norm": 6.096137046813965, + "learning_rate": 4.073595653030796e-06, + "logits/chosen": -0.4206165373325348, + "logits/rejected": -0.477784663438797, + "logps/chosen": -54.443450927734375, + "logps/rejected": -121.69248962402344, + "loss": 0.5828, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.165842056274414, + "rewards/margins": 7.324670791625977, + "rewards/rejected": -4.158828258514404, + "step": 11178 + }, + { + "epoch": 2.8, + "grad_norm": 2.9249844551086426, + "learning_rate": 4.072823285726301e-06, + "logits/chosen": -0.4823533892631531, + "logits/rejected": -0.5935676097869873, + "logps/chosen": -61.81703567504883, + "logps/rejected": -118.21990966796875, + "loss": 0.5962, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.540480136871338, + "rewards/margins": 8.426417350769043, + "rewards/rejected": -4.885937690734863, + "step": 11179 + }, + { + "epoch": 2.8, + "grad_norm": 15.742877006530762, + "learning_rate": 4.072050941333328e-06, + "logits/chosen": -0.5236374735832214, + "logits/rejected": -0.6227839589118958, + "logps/chosen": -49.93366241455078, + "logps/rejected": -91.763916015625, + "loss": 0.547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105808973312378, + "rewards/margins": 6.3563151359558105, + "rewards/rejected": -3.2505059242248535, + "step": 11180 + }, + { + "epoch": 2.8, + "grad_norm": 4.356021881103516, + "learning_rate": 4.071278619870967e-06, + "logits/chosen": -0.44902104139328003, + "logits/rejected": -0.5446211099624634, + "logps/chosen": -53.97854232788086, + "logps/rejected": -96.61911010742188, + "loss": 0.5713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0867156982421875, + "rewards/margins": 6.180983543395996, + "rewards/rejected": -3.0942676067352295, + "step": 11181 + }, + { + "epoch": 2.8, + "grad_norm": 12.450915336608887, + "learning_rate": 4.0705063213582956e-06, + "logits/chosen": -0.5453892350196838, + "logits/rejected": -0.650906503200531, + "logps/chosen": -47.19804763793945, + "logps/rejected": -103.98413848876953, + "loss": 0.528, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9633302688598633, + "rewards/margins": 7.595953941345215, + "rewards/rejected": -4.632623195648193, + "step": 11182 + }, + { + "epoch": 2.8, + "grad_norm": 6.713068962097168, + "learning_rate": 4.069734045814402e-06, + "logits/chosen": -0.5213909149169922, + "logits/rejected": -0.5893239378929138, + "logps/chosen": -75.84383392333984, + "logps/rejected": -101.5382308959961, + "loss": 0.8297, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9062881469726562, + "rewards/margins": 5.232010841369629, + "rewards/rejected": -2.3257229328155518, + "step": 11183 + }, + { + "epoch": 2.8, + "grad_norm": 2.5137858390808105, + "learning_rate": 4.0689617932583714e-06, + "logits/chosen": -0.5534595251083374, + "logits/rejected": -0.6656835079193115, + "logps/chosen": -54.632320404052734, + "logps/rejected": -104.66754150390625, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.289381265640259, + "rewards/margins": 7.4984331130981445, + "rewards/rejected": -4.209051609039307, + "step": 11184 + }, + { + "epoch": 2.8, + "grad_norm": 2.0467042922973633, + "learning_rate": 4.068189563709283e-06, + "logits/chosen": -0.48624706268310547, + "logits/rejected": -0.6129563450813293, + "logps/chosen": -48.719207763671875, + "logps/rejected": -105.7242202758789, + "loss": 0.5344, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3285861015319824, + "rewards/margins": 8.172867774963379, + "rewards/rejected": -4.8442816734313965, + "step": 11185 + }, + { + "epoch": 2.8, + "grad_norm": 5.309332847595215, + "learning_rate": 4.067417357186222e-06, + "logits/chosen": -0.49930813908576965, + "logits/rejected": -0.5854473114013672, + "logps/chosen": -51.86075210571289, + "logps/rejected": -88.74253845214844, + "loss": 0.6106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225677967071533, + "rewards/margins": 6.316864013671875, + "rewards/rejected": -3.091186046600342, + "step": 11186 + }, + { + "epoch": 2.8, + "grad_norm": 7.340982437133789, + "learning_rate": 4.06664517370827e-06, + "logits/chosen": -0.473091185092926, + "logits/rejected": -0.5542004704475403, + "logps/chosen": -50.20948791503906, + "logps/rejected": -81.5285873413086, + "loss": 0.6973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.117088556289673, + "rewards/margins": 5.512692928314209, + "rewards/rejected": -2.395604133605957, + "step": 11187 + }, + { + "epoch": 2.8, + "grad_norm": 3.4681155681610107, + "learning_rate": 4.0658730132945105e-06, + "logits/chosen": -0.5824946761131287, + "logits/rejected": -0.6658985018730164, + "logps/chosen": -59.286888122558594, + "logps/rejected": -91.99028778076172, + "loss": 0.6641, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8117141723632812, + "rewards/margins": 5.646778106689453, + "rewards/rejected": -2.8350634574890137, + "step": 11188 + }, + { + "epoch": 2.8, + "grad_norm": 5.96508264541626, + "learning_rate": 4.065100875964021e-06, + "logits/chosen": -0.44489386677742004, + "logits/rejected": -0.5450834631919861, + "logps/chosen": -67.861572265625, + "logps/rejected": -112.46018981933594, + "loss": 0.7654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.961050510406494, + "rewards/margins": 6.131338596343994, + "rewards/rejected": -3.1702890396118164, + "step": 11189 + }, + { + "epoch": 2.8, + "grad_norm": 2.9288575649261475, + "learning_rate": 4.0643287617358826e-06, + "logits/chosen": -0.45365357398986816, + "logits/rejected": -0.6071010828018188, + "logps/chosen": -50.127803802490234, + "logps/rejected": -85.94310760498047, + "loss": 0.554, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.842803716659546, + "rewards/margins": 6.0589399337768555, + "rewards/rejected": -3.2161362171173096, + "step": 11190 + }, + { + "epoch": 2.8, + "grad_norm": 3.9600391387939453, + "learning_rate": 4.063556670629177e-06, + "logits/chosen": -0.5201519727706909, + "logits/rejected": -0.5690447092056274, + "logps/chosen": -49.21342468261719, + "logps/rejected": -107.57579040527344, + "loss": 0.6584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065943956375122, + "rewards/margins": 6.034501552581787, + "rewards/rejected": -2.968557834625244, + "step": 11191 + }, + { + "epoch": 2.8, + "grad_norm": 8.193635940551758, + "learning_rate": 4.062784602662982e-06, + "logits/chosen": -0.5101983547210693, + "logits/rejected": -0.5883444547653198, + "logps/chosen": -59.854286193847656, + "logps/rejected": -106.86356353759766, + "loss": 0.6776, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9064149856567383, + "rewards/margins": 5.524305820465088, + "rewards/rejected": -2.617891311645508, + "step": 11192 + }, + { + "epoch": 2.8, + "grad_norm": 12.499273300170898, + "learning_rate": 4.062012557856376e-06, + "logits/chosen": -0.47873055934906006, + "logits/rejected": -0.5701776146888733, + "logps/chosen": -61.965919494628906, + "logps/rejected": -106.15815734863281, + "loss": 0.7184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5920159816741943, + "rewards/margins": 5.877661228179932, + "rewards/rejected": -3.285645008087158, + "step": 11193 + }, + { + "epoch": 2.8, + "grad_norm": 6.362910270690918, + "learning_rate": 4.061240536228437e-06, + "logits/chosen": -0.5989059805870056, + "logits/rejected": -0.6274212002754211, + "logps/chosen": -56.3438720703125, + "logps/rejected": -102.37815856933594, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2552878856658936, + "rewards/margins": 5.834795951843262, + "rewards/rejected": -2.5795083045959473, + "step": 11194 + }, + { + "epoch": 2.8, + "grad_norm": 2.950561285018921, + "learning_rate": 4.060468537798244e-06, + "logits/chosen": -0.38345998525619507, + "logits/rejected": -0.46895575523376465, + "logps/chosen": -62.680206298828125, + "logps/rejected": -113.11540222167969, + "loss": 0.6389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.288022518157959, + "rewards/margins": 7.597076892852783, + "rewards/rejected": -4.309054851531982, + "step": 11195 + }, + { + "epoch": 2.8, + "grad_norm": 4.867254257202148, + "learning_rate": 4.0596965625848715e-06, + "logits/chosen": -0.5106707215309143, + "logits/rejected": -0.5877187252044678, + "logps/chosen": -55.636505126953125, + "logps/rejected": -102.61797332763672, + "loss": 0.686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.774409294128418, + "rewards/margins": 6.2549238204956055, + "rewards/rejected": -3.4805140495300293, + "step": 11196 + }, + { + "epoch": 2.8, + "grad_norm": 6.696836948394775, + "learning_rate": 4.058924610607396e-06, + "logits/chosen": -0.517346978187561, + "logits/rejected": -0.6015301942825317, + "logps/chosen": -55.070945739746094, + "logps/rejected": -101.3431396484375, + "loss": 0.6352, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9088521003723145, + "rewards/margins": 6.338695526123047, + "rewards/rejected": -3.4298436641693115, + "step": 11197 + }, + { + "epoch": 2.8, + "grad_norm": 7.803403854370117, + "learning_rate": 4.058152681884896e-06, + "logits/chosen": -0.42837831377983093, + "logits/rejected": -0.5513677597045898, + "logps/chosen": -58.34056091308594, + "logps/rejected": -94.33965301513672, + "loss": 0.6171, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4137110710144043, + "rewards/margins": 6.183669090270996, + "rewards/rejected": -2.769958019256592, + "step": 11198 + }, + { + "epoch": 2.8, + "grad_norm": 4.183452129364014, + "learning_rate": 4.0573807764364465e-06, + "logits/chosen": -0.534730076789856, + "logits/rejected": -0.6118021607398987, + "logps/chosen": -58.211204528808594, + "logps/rejected": -110.78721618652344, + "loss": 0.6187, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9069645404815674, + "rewards/margins": 7.020627021789551, + "rewards/rejected": -4.1136627197265625, + "step": 11199 + }, + { + "epoch": 2.8, + "grad_norm": 3.9838788509368896, + "learning_rate": 4.056608894281118e-06, + "logits/chosen": -0.5424544811248779, + "logits/rejected": -0.6106880307197571, + "logps/chosen": -50.52672576904297, + "logps/rejected": -109.97821044921875, + "loss": 0.6324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9406561851501465, + "rewards/margins": 7.105889320373535, + "rewards/rejected": -4.165233135223389, + "step": 11200 + }, + { + "epoch": 2.8, + "grad_norm": 5.416768550872803, + "learning_rate": 4.055837035437988e-06, + "logits/chosen": -0.5137723088264465, + "logits/rejected": -0.5927166938781738, + "logps/chosen": -47.49585723876953, + "logps/rejected": -90.146240234375, + "loss": 0.6944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9524612426757812, + "rewards/margins": 5.718250274658203, + "rewards/rejected": -2.7657885551452637, + "step": 11201 + }, + { + "epoch": 2.8, + "grad_norm": 7.932840824127197, + "learning_rate": 4.05506519992613e-06, + "logits/chosen": -0.4647195041179657, + "logits/rejected": -0.5283014178276062, + "logps/chosen": -61.4848747253418, + "logps/rejected": -106.18585968017578, + "loss": 0.7946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.458383083343506, + "rewards/margins": 6.328546524047852, + "rewards/rejected": -3.8701627254486084, + "step": 11202 + }, + { + "epoch": 2.8, + "grad_norm": 4.7162885665893555, + "learning_rate": 4.054293387764615e-06, + "logits/chosen": -0.5451345443725586, + "logits/rejected": -0.5829387903213501, + "logps/chosen": -48.5413818359375, + "logps/rejected": -103.72147369384766, + "loss": 0.682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.93760347366333, + "rewards/margins": 5.825135231018066, + "rewards/rejected": -2.8875319957733154, + "step": 11203 + }, + { + "epoch": 2.8, + "grad_norm": 4.804733753204346, + "learning_rate": 4.053521598972517e-06, + "logits/chosen": -0.4481728672981262, + "logits/rejected": -0.5887027978897095, + "logps/chosen": -68.87288665771484, + "logps/rejected": -99.6729507446289, + "loss": 0.7062, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.035245418548584, + "rewards/margins": 6.700588703155518, + "rewards/rejected": -3.6653432846069336, + "step": 11204 + }, + { + "epoch": 2.8, + "grad_norm": 6.794405460357666, + "learning_rate": 4.0527498335689054e-06, + "logits/chosen": -0.4596870541572571, + "logits/rejected": -0.5476040244102478, + "logps/chosen": -59.19806671142578, + "logps/rejected": -98.91532897949219, + "loss": 0.6568, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9470033645629883, + "rewards/margins": 5.987969875335693, + "rewards/rejected": -3.040966510772705, + "step": 11205 + }, + { + "epoch": 2.8, + "grad_norm": 9.003409385681152, + "learning_rate": 4.051978091572855e-06, + "logits/chosen": -0.5189286470413208, + "logits/rejected": -0.5822150111198425, + "logps/chosen": -53.42256546020508, + "logps/rejected": -104.56617736816406, + "loss": 0.6864, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.923809051513672, + "rewards/margins": 6.578584671020508, + "rewards/rejected": -3.654775619506836, + "step": 11206 + }, + { + "epoch": 2.8, + "grad_norm": 14.884462356567383, + "learning_rate": 4.0512063730034334e-06, + "logits/chosen": -0.4862004220485687, + "logits/rejected": -0.48736241459846497, + "logps/chosen": -49.505245208740234, + "logps/rejected": -117.48709869384766, + "loss": 0.7119, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9867169857025146, + "rewards/margins": 5.843951225280762, + "rewards/rejected": -2.857234239578247, + "step": 11207 + }, + { + "epoch": 2.8, + "grad_norm": 14.300715446472168, + "learning_rate": 4.050434677879711e-06, + "logits/chosen": -0.47613921761512756, + "logits/rejected": -0.5773340463638306, + "logps/chosen": -59.49774169921875, + "logps/rejected": -106.2126693725586, + "loss": 0.7131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.975893497467041, + "rewards/margins": 6.526665687561035, + "rewards/rejected": -3.550772190093994, + "step": 11208 + }, + { + "epoch": 2.8, + "grad_norm": 4.377708911895752, + "learning_rate": 4.049663006220759e-06, + "logits/chosen": -0.4807825982570648, + "logits/rejected": -0.6175076961517334, + "logps/chosen": -63.42308044433594, + "logps/rejected": -88.1439208984375, + "loss": 0.7095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9458019733428955, + "rewards/margins": 6.136904716491699, + "rewards/rejected": -3.1911025047302246, + "step": 11209 + }, + { + "epoch": 2.8, + "grad_norm": 5.954390048980713, + "learning_rate": 4.048891358045644e-06, + "logits/chosen": -0.5155968070030212, + "logits/rejected": -0.5804852247238159, + "logps/chosen": -68.2344741821289, + "logps/rejected": -97.82234191894531, + "loss": 0.6915, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1270840167999268, + "rewards/margins": 5.953522682189941, + "rewards/rejected": -2.8264389038085938, + "step": 11210 + }, + { + "epoch": 2.8, + "grad_norm": 5.959066867828369, + "learning_rate": 4.048119733373435e-06, + "logits/chosen": -0.4864729642868042, + "logits/rejected": -0.6072952151298523, + "logps/chosen": -54.50820541381836, + "logps/rejected": -110.3521957397461, + "loss": 0.6777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6801722049713135, + "rewards/margins": 6.5703535079956055, + "rewards/rejected": -3.8901805877685547, + "step": 11211 + }, + { + "epoch": 2.8, + "grad_norm": 4.85117769241333, + "learning_rate": 4.047348132223199e-06, + "logits/chosen": -0.4598706364631653, + "logits/rejected": -0.5262142419815063, + "logps/chosen": -63.771385192871094, + "logps/rejected": -106.63660430908203, + "loss": 0.6283, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177018165588379, + "rewards/margins": 7.0754899978637695, + "rewards/rejected": -3.8984711170196533, + "step": 11212 + }, + { + "epoch": 2.81, + "grad_norm": 4.7041401863098145, + "learning_rate": 4.046576554614007e-06, + "logits/chosen": -0.49470794200897217, + "logits/rejected": -0.6358948945999146, + "logps/chosen": -70.07079315185547, + "logps/rejected": -98.18650817871094, + "loss": 0.7464, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.67382550239563, + "rewards/margins": 5.5934247970581055, + "rewards/rejected": -2.9195988178253174, + "step": 11213 + }, + { + "epoch": 2.81, + "grad_norm": 7.445594787597656, + "learning_rate": 4.04580500056492e-06, + "logits/chosen": -0.4138282239437103, + "logits/rejected": -0.5603510141372681, + "logps/chosen": -63.77266311645508, + "logps/rejected": -81.50116729736328, + "loss": 0.768, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2726686000823975, + "rewards/margins": 6.276737689971924, + "rewards/rejected": -3.004068374633789, + "step": 11214 + }, + { + "epoch": 2.81, + "grad_norm": 18.390470504760742, + "learning_rate": 4.0450334700950065e-06, + "logits/chosen": -0.553744912147522, + "logits/rejected": -0.6300208568572998, + "logps/chosen": -59.135135650634766, + "logps/rejected": -111.64916229248047, + "loss": 0.7092, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.683565139770508, + "rewards/margins": 5.893304824829102, + "rewards/rejected": -3.209739923477173, + "step": 11215 + }, + { + "epoch": 2.81, + "grad_norm": 7.201967716217041, + "learning_rate": 4.0442619632233335e-06, + "logits/chosen": -0.4959927201271057, + "logits/rejected": -0.5962240695953369, + "logps/chosen": -50.5442008972168, + "logps/rejected": -85.15081787109375, + "loss": 0.7357, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.204549789428711, + "rewards/margins": 6.015857696533203, + "rewards/rejected": -2.811307430267334, + "step": 11216 + }, + { + "epoch": 2.81, + "grad_norm": 3.3935036659240723, + "learning_rate": 4.043490479968962e-06, + "logits/chosen": -0.5000078082084656, + "logits/rejected": -0.5470256209373474, + "logps/chosen": -57.551517486572266, + "logps/rejected": -117.42131805419922, + "loss": 0.6328, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.290411949157715, + "rewards/margins": 6.771738529205322, + "rewards/rejected": -3.4813263416290283, + "step": 11217 + }, + { + "epoch": 2.81, + "grad_norm": 13.442329406738281, + "learning_rate": 4.042719020350958e-06, + "logits/chosen": -0.5169354677200317, + "logits/rejected": -0.6182833909988403, + "logps/chosen": -71.00225067138672, + "logps/rejected": -102.59028625488281, + "loss": 0.7317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9803285598754883, + "rewards/margins": 6.43766450881958, + "rewards/rejected": -3.457336187362671, + "step": 11218 + }, + { + "epoch": 2.81, + "grad_norm": 7.89543342590332, + "learning_rate": 4.041947584388386e-06, + "logits/chosen": -0.4681831896305084, + "logits/rejected": -0.5784388184547424, + "logps/chosen": -60.48515701293945, + "logps/rejected": -74.53557586669922, + "loss": 0.6765, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.020167350769043, + "rewards/margins": 5.169463634490967, + "rewards/rejected": -2.1492958068847656, + "step": 11219 + }, + { + "epoch": 2.81, + "grad_norm": 6.966615200042725, + "learning_rate": 4.041176172100309e-06, + "logits/chosen": -0.466768741607666, + "logits/rejected": -0.5305660367012024, + "logps/chosen": -52.758262634277344, + "logps/rejected": -103.19842529296875, + "loss": 0.6143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1909286975860596, + "rewards/margins": 6.108624458312988, + "rewards/rejected": -2.917695999145508, + "step": 11220 + }, + { + "epoch": 2.81, + "grad_norm": 2.524132490158081, + "learning_rate": 4.040404783505788e-06, + "logits/chosen": -0.4619871973991394, + "logits/rejected": -0.5944241881370544, + "logps/chosen": -68.52565002441406, + "logps/rejected": -99.3449935913086, + "loss": 0.6329, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9194300174713135, + "rewards/margins": 6.800989151000977, + "rewards/rejected": -3.881559371948242, + "step": 11221 + }, + { + "epoch": 2.81, + "grad_norm": 4.559057712554932, + "learning_rate": 4.039633418623885e-06, + "logits/chosen": -0.49683600664138794, + "logits/rejected": -0.5449391603469849, + "logps/chosen": -51.6670036315918, + "logps/rejected": -96.43681335449219, + "loss": 0.676, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.520303726196289, + "rewards/margins": 4.8811540603637695, + "rewards/rejected": -1.3608506917953491, + "step": 11222 + }, + { + "epoch": 2.81, + "grad_norm": 9.728898048400879, + "learning_rate": 4.03886207747366e-06, + "logits/chosen": -0.4553551971912384, + "logits/rejected": -0.5370523929595947, + "logps/chosen": -48.43117141723633, + "logps/rejected": -88.57968139648438, + "loss": 0.7506, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.862295627593994, + "rewards/margins": 5.863584518432617, + "rewards/rejected": -3.001288890838623, + "step": 11223 + }, + { + "epoch": 2.81, + "grad_norm": 7.303623199462891, + "learning_rate": 4.03809076007418e-06, + "logits/chosen": -0.5598583221435547, + "logits/rejected": -0.6328186988830566, + "logps/chosen": -47.52178955078125, + "logps/rejected": -110.05797576904297, + "loss": 0.5744, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.276411771774292, + "rewards/margins": 7.3509016036987305, + "rewards/rejected": -4.074490547180176, + "step": 11224 + }, + { + "epoch": 2.81, + "grad_norm": 11.954986572265625, + "learning_rate": 4.037319466444498e-06, + "logits/chosen": -0.5653343796730042, + "logits/rejected": -0.6222019791603088, + "logps/chosen": -56.549903869628906, + "logps/rejected": -106.3204116821289, + "loss": 0.7796, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9558637142181396, + "rewards/margins": 6.541451454162598, + "rewards/rejected": -3.585587978363037, + "step": 11225 + }, + { + "epoch": 2.81, + "grad_norm": 6.526849269866943, + "learning_rate": 4.036548196603674e-06, + "logits/chosen": -0.5125361680984497, + "logits/rejected": -0.5541332960128784, + "logps/chosen": -55.98027801513672, + "logps/rejected": -112.04837799072266, + "loss": 0.7399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.198707103729248, + "rewards/margins": 6.009211540222168, + "rewards/rejected": -2.810504913330078, + "step": 11226 + }, + { + "epoch": 2.81, + "grad_norm": 16.013639450073242, + "learning_rate": 4.035776950570772e-06, + "logits/chosen": -0.5311572551727295, + "logits/rejected": -0.6324431896209717, + "logps/chosen": -66.2999267578125, + "logps/rejected": -84.18709564208984, + "loss": 0.8684, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8116374015808105, + "rewards/margins": 4.626125812530518, + "rewards/rejected": -1.8144885301589966, + "step": 11227 + }, + { + "epoch": 2.81, + "grad_norm": 5.603330612182617, + "learning_rate": 4.035005728364845e-06, + "logits/chosen": -0.5255680084228516, + "logits/rejected": -0.542040228843689, + "logps/chosen": -43.3657341003418, + "logps/rejected": -99.65248107910156, + "loss": 0.6282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4073638916015625, + "rewards/margins": 5.762601852416992, + "rewards/rejected": -2.355238437652588, + "step": 11228 + }, + { + "epoch": 2.81, + "grad_norm": 8.719598770141602, + "learning_rate": 4.034234530004953e-06, + "logits/chosen": -0.4366059899330139, + "logits/rejected": -0.5452007055282593, + "logps/chosen": -73.07676696777344, + "logps/rejected": -85.93226623535156, + "loss": 0.8397, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.909930944442749, + "rewards/margins": 6.125931739807129, + "rewards/rejected": -3.216001033782959, + "step": 11229 + }, + { + "epoch": 2.81, + "grad_norm": 11.280681610107422, + "learning_rate": 4.033463355510154e-06, + "logits/chosen": -0.480110764503479, + "logits/rejected": -0.5446476936340332, + "logps/chosen": -62.289432525634766, + "logps/rejected": -94.39846801757812, + "loss": 1.0038, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.598006010055542, + "rewards/margins": 5.558978080749512, + "rewards/rejected": -2.9609720706939697, + "step": 11230 + }, + { + "epoch": 2.81, + "grad_norm": 5.124775409698486, + "learning_rate": 4.032692204899504e-06, + "logits/chosen": -0.4977967441082001, + "logits/rejected": -0.5623632073402405, + "logps/chosen": -51.88732147216797, + "logps/rejected": -87.68331909179688, + "loss": 0.8103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9444639682769775, + "rewards/margins": 4.613786697387695, + "rewards/rejected": -1.6693228483200073, + "step": 11231 + }, + { + "epoch": 2.81, + "grad_norm": 3.564138412475586, + "learning_rate": 4.031921078192057e-06, + "logits/chosen": -0.4585832953453064, + "logits/rejected": -0.504401683807373, + "logps/chosen": -54.21159362792969, + "logps/rejected": -92.51775360107422, + "loss": 0.5933, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0748448371887207, + "rewards/margins": 5.634737968444824, + "rewards/rejected": -2.5598931312561035, + "step": 11232 + }, + { + "epoch": 2.81, + "grad_norm": 6.599062919616699, + "learning_rate": 4.0311499754068705e-06, + "logits/chosen": -0.5237513184547424, + "logits/rejected": -0.6561400294303894, + "logps/chosen": -67.4292221069336, + "logps/rejected": -88.40911865234375, + "loss": 0.6666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158238649368286, + "rewards/margins": 5.569314956665039, + "rewards/rejected": -2.411076545715332, + "step": 11233 + }, + { + "epoch": 2.81, + "grad_norm": 6.974562168121338, + "learning_rate": 4.030378896562999e-06, + "logits/chosen": -0.42931362986564636, + "logits/rejected": -0.4759148061275482, + "logps/chosen": -47.36085510253906, + "logps/rejected": -93.53987121582031, + "loss": 0.6552, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1699769496917725, + "rewards/margins": 5.812806129455566, + "rewards/rejected": -2.642829418182373, + "step": 11234 + }, + { + "epoch": 2.81, + "grad_norm": 6.37152099609375, + "learning_rate": 4.029607841679496e-06, + "logits/chosen": -0.44658419489860535, + "logits/rejected": -0.5177966952323914, + "logps/chosen": -50.937191009521484, + "logps/rejected": -94.70997619628906, + "loss": 0.7158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9693069458007812, + "rewards/margins": 6.114775657653809, + "rewards/rejected": -3.145468235015869, + "step": 11235 + }, + { + "epoch": 2.81, + "grad_norm": 6.964526176452637, + "learning_rate": 4.028836810775415e-06, + "logits/chosen": -0.4811890125274658, + "logits/rejected": -0.5362399220466614, + "logps/chosen": -58.460426330566406, + "logps/rejected": -95.43878936767578, + "loss": 0.7105, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0184741020202637, + "rewards/margins": 5.380558013916016, + "rewards/rejected": -2.362084150314331, + "step": 11236 + }, + { + "epoch": 2.81, + "grad_norm": 8.247750282287598, + "learning_rate": 4.02806580386981e-06, + "logits/chosen": -0.4492233395576477, + "logits/rejected": -0.49910682439804077, + "logps/chosen": -54.6864013671875, + "logps/rejected": -102.68495178222656, + "loss": 0.7187, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2972733974456787, + "rewards/margins": 5.471513748168945, + "rewards/rejected": -2.1742405891418457, + "step": 11237 + }, + { + "epoch": 2.81, + "grad_norm": 4.590329170227051, + "learning_rate": 4.027294820981733e-06, + "logits/chosen": -0.5202740430831909, + "logits/rejected": -0.5892643928527832, + "logps/chosen": -44.93691635131836, + "logps/rejected": -102.23809814453125, + "loss": 0.5408, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.412254810333252, + "rewards/margins": 7.058880805969238, + "rewards/rejected": -3.6466257572174072, + "step": 11238 + }, + { + "epoch": 2.81, + "grad_norm": 7.083935737609863, + "learning_rate": 4.0265238621302346e-06, + "logits/chosen": -0.5116022825241089, + "logits/rejected": -0.5737404227256775, + "logps/chosen": -47.31000518798828, + "logps/rejected": -108.30715942382812, + "loss": 0.6219, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.02294659614563, + "rewards/margins": 6.328706741333008, + "rewards/rejected": -3.305759906768799, + "step": 11239 + }, + { + "epoch": 2.81, + "grad_norm": 2.3895998001098633, + "learning_rate": 4.025752927334367e-06, + "logits/chosen": -0.4360943138599396, + "logits/rejected": -0.5237698554992676, + "logps/chosen": -65.73307037353516, + "logps/rejected": -90.98530578613281, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0323703289031982, + "rewards/margins": 5.269782543182373, + "rewards/rejected": -2.2374119758605957, + "step": 11240 + }, + { + "epoch": 2.81, + "grad_norm": 6.402448654174805, + "learning_rate": 4.024982016613183e-06, + "logits/chosen": -0.45434123277664185, + "logits/rejected": -0.5423826575279236, + "logps/chosen": -52.70159912109375, + "logps/rejected": -82.70220947265625, + "loss": 0.7574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0320425033569336, + "rewards/margins": 5.249693870544434, + "rewards/rejected": -2.217651128768921, + "step": 11241 + }, + { + "epoch": 2.81, + "grad_norm": 4.829483985900879, + "learning_rate": 4.024211129985728e-06, + "logits/chosen": -0.4710080921649933, + "logits/rejected": -0.512385904788971, + "logps/chosen": -64.41661834716797, + "logps/rejected": -104.80947875976562, + "loss": 0.6609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.289083242416382, + "rewards/margins": 6.099399566650391, + "rewards/rejected": -2.810316562652588, + "step": 11242 + }, + { + "epoch": 2.81, + "grad_norm": 3.194263458251953, + "learning_rate": 4.023440267471054e-06, + "logits/chosen": -0.4807823896408081, + "logits/rejected": -0.5836737155914307, + "logps/chosen": -54.0664176940918, + "logps/rejected": -112.75069427490234, + "loss": 0.6985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0732357501983643, + "rewards/margins": 7.686798095703125, + "rewards/rejected": -4.613562107086182, + "step": 11243 + }, + { + "epoch": 2.81, + "grad_norm": 37.747398376464844, + "learning_rate": 4.02266942908821e-06, + "logits/chosen": -0.47138166427612305, + "logits/rejected": -0.5985546708106995, + "logps/chosen": -52.59844970703125, + "logps/rejected": -89.9912338256836, + "loss": 0.6974, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9318346977233887, + "rewards/margins": 6.405402183532715, + "rewards/rejected": -3.473567008972168, + "step": 11244 + }, + { + "epoch": 2.81, + "grad_norm": 3.9385406970977783, + "learning_rate": 4.021898614856244e-06, + "logits/chosen": -0.5326240062713623, + "logits/rejected": -0.6729291677474976, + "logps/chosen": -55.04997253417969, + "logps/rejected": -93.890380859375, + "loss": 0.6567, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.092085838317871, + "rewards/margins": 7.696457386016846, + "rewards/rejected": -4.604370594024658, + "step": 11245 + }, + { + "epoch": 2.81, + "grad_norm": 3.2702443599700928, + "learning_rate": 4.021127824794204e-06, + "logits/chosen": -0.5677574276924133, + "logits/rejected": -0.6318431496620178, + "logps/chosen": -48.68128204345703, + "logps/rejected": -97.57501220703125, + "loss": 0.6353, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.233783721923828, + "rewards/margins": 6.017097473144531, + "rewards/rejected": -2.7833139896392822, + "step": 11246 + }, + { + "epoch": 2.81, + "grad_norm": 8.646859169006348, + "learning_rate": 4.020357058921137e-06, + "logits/chosen": -0.49461671710014343, + "logits/rejected": -0.5977606773376465, + "logps/chosen": -60.34214782714844, + "logps/rejected": -86.56105041503906, + "loss": 0.7593, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.783250570297241, + "rewards/margins": 5.365285873413086, + "rewards/rejected": -2.5820353031158447, + "step": 11247 + }, + { + "epoch": 2.81, + "grad_norm": 15.939136505126953, + "learning_rate": 4.0195863172560875e-06, + "logits/chosen": -0.5075798034667969, + "logits/rejected": -0.5944029092788696, + "logps/chosen": -47.01374053955078, + "logps/rejected": -93.6122817993164, + "loss": 0.6853, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1247053146362305, + "rewards/margins": 6.961779594421387, + "rewards/rejected": -3.837074041366577, + "step": 11248 + }, + { + "epoch": 2.81, + "grad_norm": 3.8059678077697754, + "learning_rate": 4.018815599818105e-06, + "logits/chosen": -0.4410620927810669, + "logits/rejected": -0.4937712550163269, + "logps/chosen": -47.69649124145508, + "logps/rejected": -91.0996322631836, + "loss": 0.5844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.363799571990967, + "rewards/margins": 5.575314521789551, + "rewards/rejected": -2.211514949798584, + "step": 11249 + }, + { + "epoch": 2.81, + "grad_norm": 3.6442017555236816, + "learning_rate": 4.018044906626231e-06, + "logits/chosen": -0.47063660621643066, + "logits/rejected": -0.5715831518173218, + "logps/chosen": -61.094417572021484, + "logps/rejected": -106.65650177001953, + "loss": 0.625, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.93284273147583, + "rewards/margins": 6.8342766761779785, + "rewards/rejected": -3.9014341831207275, + "step": 11250 + }, + { + "epoch": 2.81, + "grad_norm": 36.16856384277344, + "learning_rate": 4.017274237699513e-06, + "logits/chosen": -0.4912872314453125, + "logits/rejected": -0.5533050298690796, + "logps/chosen": -55.67335510253906, + "logps/rejected": -114.02116394042969, + "loss": 0.6927, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2018444538116455, + "rewards/margins": 6.275277137756348, + "rewards/rejected": -3.073432683944702, + "step": 11251 + }, + { + "epoch": 2.81, + "grad_norm": 8.121295928955078, + "learning_rate": 4.016503593056994e-06, + "logits/chosen": -0.5006051659584045, + "logits/rejected": -0.5696510076522827, + "logps/chosen": -57.02618408203125, + "logps/rejected": -96.32470703125, + "loss": 0.7494, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.929605007171631, + "rewards/margins": 6.089474201202393, + "rewards/rejected": -3.1598691940307617, + "step": 11252 + }, + { + "epoch": 2.82, + "grad_norm": 4.982641696929932, + "learning_rate": 4.0157329727177165e-06, + "logits/chosen": -0.3717527687549591, + "logits/rejected": -0.48785048723220825, + "logps/chosen": -61.878196716308594, + "logps/rejected": -96.85322570800781, + "loss": 0.6216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0374393463134766, + "rewards/margins": 6.462772369384766, + "rewards/rejected": -3.425333023071289, + "step": 11253 + }, + { + "epoch": 2.82, + "grad_norm": 3.8257391452789307, + "learning_rate": 4.014962376700724e-06, + "logits/chosen": -0.4566745162010193, + "logits/rejected": -0.5716683268547058, + "logps/chosen": -68.74955749511719, + "logps/rejected": -118.84357452392578, + "loss": 0.6586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8664262294769287, + "rewards/margins": 7.5501837730407715, + "rewards/rejected": -4.683757781982422, + "step": 11254 + }, + { + "epoch": 2.82, + "grad_norm": 5.200094699859619, + "learning_rate": 4.014191805025057e-06, + "logits/chosen": -0.5237928032875061, + "logits/rejected": -0.6006156802177429, + "logps/chosen": -52.03225326538086, + "logps/rejected": -94.76759338378906, + "loss": 0.6707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2433242797851562, + "rewards/margins": 6.448512554168701, + "rewards/rejected": -3.205188274383545, + "step": 11255 + }, + { + "epoch": 2.82, + "grad_norm": 4.49805212020874, + "learning_rate": 4.0134212577097635e-06, + "logits/chosen": -0.5329394340515137, + "logits/rejected": -0.5717861652374268, + "logps/chosen": -49.083011627197266, + "logps/rejected": -92.14615631103516, + "loss": 0.6464, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0770339965820312, + "rewards/margins": 5.8438825607299805, + "rewards/rejected": -2.766848087310791, + "step": 11256 + }, + { + "epoch": 2.82, + "grad_norm": 4.339966297149658, + "learning_rate": 4.012650734773876e-06, + "logits/chosen": -0.5926152467727661, + "logits/rejected": -0.5921269059181213, + "logps/chosen": -71.78760528564453, + "logps/rejected": -119.49102020263672, + "loss": 0.5676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2387428283691406, + "rewards/margins": 7.534956455230713, + "rewards/rejected": -4.2962141036987305, + "step": 11257 + }, + { + "epoch": 2.82, + "grad_norm": 6.338780879974365, + "learning_rate": 4.011880236236441e-06, + "logits/chosen": -0.5266575217247009, + "logits/rejected": -0.5836617350578308, + "logps/chosen": -58.6566162109375, + "logps/rejected": -98.24581909179688, + "loss": 0.7969, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6732919216156006, + "rewards/margins": 5.616089820861816, + "rewards/rejected": -2.9427976608276367, + "step": 11258 + }, + { + "epoch": 2.82, + "grad_norm": 4.768364906311035, + "learning_rate": 4.0111097621164984e-06, + "logits/chosen": -0.4874158799648285, + "logits/rejected": -0.5544238686561584, + "logps/chosen": -46.10334777832031, + "logps/rejected": -95.64368438720703, + "loss": 0.5817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3448009490966797, + "rewards/margins": 6.7714433670043945, + "rewards/rejected": -3.4266419410705566, + "step": 11259 + }, + { + "epoch": 2.82, + "grad_norm": 14.33869743347168, + "learning_rate": 4.010339312433082e-06, + "logits/chosen": -0.40260958671569824, + "logits/rejected": -0.5058223009109497, + "logps/chosen": -59.495269775390625, + "logps/rejected": -107.93597412109375, + "loss": 0.6666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808887004852295, + "rewards/margins": 6.0041985511779785, + "rewards/rejected": -3.1953110694885254, + "step": 11260 + }, + { + "epoch": 2.82, + "grad_norm": 20.735624313354492, + "learning_rate": 4.009568887205234e-06, + "logits/chosen": -0.4992837905883789, + "logits/rejected": -0.6201531887054443, + "logps/chosen": -56.43980407714844, + "logps/rejected": -100.9002456665039, + "loss": 0.7152, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8779077529907227, + "rewards/margins": 6.944411754608154, + "rewards/rejected": -4.066503524780273, + "step": 11261 + }, + { + "epoch": 2.82, + "grad_norm": 7.86303186416626, + "learning_rate": 4.008798486451993e-06, + "logits/chosen": -0.4445803463459015, + "logits/rejected": -0.5384491086006165, + "logps/chosen": -52.3909912109375, + "logps/rejected": -94.46691131591797, + "loss": 0.6693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0647623538970947, + "rewards/margins": 6.546807765960693, + "rewards/rejected": -3.4820454120635986, + "step": 11262 + }, + { + "epoch": 2.82, + "grad_norm": 10.430723190307617, + "learning_rate": 4.008028110192398e-06, + "logits/chosen": -0.48178476095199585, + "logits/rejected": -0.5957912802696228, + "logps/chosen": -60.7886962890625, + "logps/rejected": -100.26545715332031, + "loss": 0.6386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1099233627319336, + "rewards/margins": 7.345912456512451, + "rewards/rejected": -4.235989093780518, + "step": 11263 + }, + { + "epoch": 2.82, + "grad_norm": 1.6240322589874268, + "learning_rate": 4.007257758445481e-06, + "logits/chosen": -0.5910707712173462, + "logits/rejected": -0.6668381690979004, + "logps/chosen": -53.44085693359375, + "logps/rejected": -99.75435638427734, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1639955043792725, + "rewards/margins": 6.395540714263916, + "rewards/rejected": -3.2315452098846436, + "step": 11264 + }, + { + "epoch": 2.82, + "grad_norm": 17.51512336730957, + "learning_rate": 4.006487431230281e-06, + "logits/chosen": -0.5107735395431519, + "logits/rejected": -0.6035186648368835, + "logps/chosen": -54.622772216796875, + "logps/rejected": -83.90167999267578, + "loss": 0.93, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.1285300254821777, + "rewards/margins": 4.907872200012207, + "rewards/rejected": -1.7793421745300293, + "step": 11265 + }, + { + "epoch": 2.82, + "grad_norm": 5.009686470031738, + "learning_rate": 4.005717128565834e-06, + "logits/chosen": -0.5120729207992554, + "logits/rejected": -0.6129730343818665, + "logps/chosen": -64.81031799316406, + "logps/rejected": -94.33247375488281, + "loss": 0.6713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2216198444366455, + "rewards/margins": 5.362882137298584, + "rewards/rejected": -2.1412622928619385, + "step": 11266 + }, + { + "epoch": 2.82, + "grad_norm": 44.9356689453125, + "learning_rate": 4.004946850471174e-06, + "logits/chosen": -0.522234320640564, + "logits/rejected": -0.5461863875389099, + "logps/chosen": -55.054935455322266, + "logps/rejected": -129.44300842285156, + "loss": 0.7827, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.976658821105957, + "rewards/margins": 5.832907676696777, + "rewards/rejected": -2.8562490940093994, + "step": 11267 + }, + { + "epoch": 2.82, + "grad_norm": 16.226022720336914, + "learning_rate": 4.0041765969653354e-06, + "logits/chosen": -0.48227840662002563, + "logits/rejected": -0.5439340472221375, + "logps/chosen": -58.35282516479492, + "logps/rejected": -82.17280578613281, + "loss": 0.8838, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0105745792388916, + "rewards/margins": 4.239109516143799, + "rewards/rejected": -1.2285351753234863, + "step": 11268 + }, + { + "epoch": 2.82, + "grad_norm": 6.085466384887695, + "learning_rate": 4.003406368067351e-06, + "logits/chosen": -0.4897021949291229, + "logits/rejected": -0.6037999987602234, + "logps/chosen": -52.94824981689453, + "logps/rejected": -81.58778381347656, + "loss": 0.6772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0545384883880615, + "rewards/margins": 6.3997578620910645, + "rewards/rejected": -3.345219850540161, + "step": 11269 + }, + { + "epoch": 2.82, + "grad_norm": 5.457298278808594, + "learning_rate": 4.002636163796259e-06, + "logits/chosen": -0.46787071228027344, + "logits/rejected": -0.5254014730453491, + "logps/chosen": -48.763893127441406, + "logps/rejected": -73.96798706054688, + "loss": 0.7291, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.329951286315918, + "rewards/margins": 4.844334602355957, + "rewards/rejected": -1.5143829584121704, + "step": 11270 + }, + { + "epoch": 2.82, + "grad_norm": 6.371075630187988, + "learning_rate": 4.0018659841710854e-06, + "logits/chosen": -0.4364335834980011, + "logits/rejected": -0.5467351675033569, + "logps/chosen": -60.635257720947266, + "logps/rejected": -97.34031677246094, + "loss": 0.6798, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8529019355773926, + "rewards/margins": 7.0537190437316895, + "rewards/rejected": -4.200817108154297, + "step": 11271 + }, + { + "epoch": 2.82, + "grad_norm": 3.3709702491760254, + "learning_rate": 4.001095829210864e-06, + "logits/chosen": -0.5383087396621704, + "logits/rejected": -0.5542851686477661, + "logps/chosen": -42.71981430053711, + "logps/rejected": -101.17725372314453, + "loss": 0.4947, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1328721046447754, + "rewards/margins": 6.662692070007324, + "rewards/rejected": -3.5298197269439697, + "step": 11272 + }, + { + "epoch": 2.82, + "grad_norm": 18.09337043762207, + "learning_rate": 4.000325698934631e-06, + "logits/chosen": -0.4632629454135895, + "logits/rejected": -0.5271994471549988, + "logps/chosen": -58.355281829833984, + "logps/rejected": -101.76129913330078, + "loss": 0.7267, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8521690368652344, + "rewards/margins": 6.185643196105957, + "rewards/rejected": -3.333475112915039, + "step": 11273 + }, + { + "epoch": 2.82, + "grad_norm": 9.156573295593262, + "learning_rate": 3.9995555933614094e-06, + "logits/chosen": -0.5187019109725952, + "logits/rejected": -0.6452658176422119, + "logps/chosen": -55.18303298950195, + "logps/rejected": -79.67967987060547, + "loss": 0.6389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.045114278793335, + "rewards/margins": 5.77384090423584, + "rewards/rejected": -2.728726625442505, + "step": 11274 + }, + { + "epoch": 2.82, + "grad_norm": 3.105614423751831, + "learning_rate": 3.9987855125102345e-06, + "logits/chosen": -0.4585985243320465, + "logits/rejected": -0.532864511013031, + "logps/chosen": -66.63750457763672, + "logps/rejected": -106.79011535644531, + "loss": 0.6805, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7326626777648926, + "rewards/margins": 5.781028747558594, + "rewards/rejected": -3.048366069793701, + "step": 11275 + }, + { + "epoch": 2.82, + "grad_norm": 3.471491575241089, + "learning_rate": 3.998015456400135e-06, + "logits/chosen": -0.4789571762084961, + "logits/rejected": -0.5809090733528137, + "logps/chosen": -55.89828109741211, + "logps/rejected": -106.700927734375, + "loss": 0.6264, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3673574924468994, + "rewards/margins": 7.409579277038574, + "rewards/rejected": -4.042221546173096, + "step": 11276 + }, + { + "epoch": 2.82, + "grad_norm": 5.38048791885376, + "learning_rate": 3.9972454250501405e-06, + "logits/chosen": -0.4354720711708069, + "logits/rejected": -0.5079960227012634, + "logps/chosen": -53.2122688293457, + "logps/rejected": -87.03713989257812, + "loss": 0.6233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0957040786743164, + "rewards/margins": 5.714559555053711, + "rewards/rejected": -2.6188559532165527, + "step": 11277 + }, + { + "epoch": 2.82, + "grad_norm": 18.62665367126465, + "learning_rate": 3.996475418479276e-06, + "logits/chosen": -0.5320255756378174, + "logits/rejected": -0.6380870938301086, + "logps/chosen": -61.32265853881836, + "logps/rejected": -91.34822082519531, + "loss": 0.6865, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9555015563964844, + "rewards/margins": 5.652675628662109, + "rewards/rejected": -2.697173833847046, + "step": 11278 + }, + { + "epoch": 2.82, + "grad_norm": 6.002009391784668, + "learning_rate": 3.9957054367065715e-06, + "logits/chosen": -0.47654372453689575, + "logits/rejected": -0.5657933950424194, + "logps/chosen": -54.89863204956055, + "logps/rejected": -97.169677734375, + "loss": 0.6436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.975940704345703, + "rewards/margins": 5.594021797180176, + "rewards/rejected": -2.6180808544158936, + "step": 11279 + }, + { + "epoch": 2.82, + "grad_norm": 5.170263290405273, + "learning_rate": 3.994935479751055e-06, + "logits/chosen": -0.5668827295303345, + "logits/rejected": -0.6669583320617676, + "logps/chosen": -53.54419708251953, + "logps/rejected": -85.9977798461914, + "loss": 0.6525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.106306552886963, + "rewards/margins": 5.846648216247559, + "rewards/rejected": -2.7403409481048584, + "step": 11280 + }, + { + "epoch": 2.82, + "grad_norm": 4.401147842407227, + "learning_rate": 3.994165547631752e-06, + "logits/chosen": -0.5579360723495483, + "logits/rejected": -0.622886061668396, + "logps/chosen": -45.193355560302734, + "logps/rejected": -90.47855377197266, + "loss": 0.6533, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9886064529418945, + "rewards/margins": 6.019001483917236, + "rewards/rejected": -3.0303955078125, + "step": 11281 + }, + { + "epoch": 2.82, + "grad_norm": 2.4315319061279297, + "learning_rate": 3.993395640367687e-06, + "logits/chosen": -0.47397682070732117, + "logits/rejected": -0.5199136734008789, + "logps/chosen": -42.17811584472656, + "logps/rejected": -94.65357971191406, + "loss": 0.5341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2785470485687256, + "rewards/margins": 6.922475337982178, + "rewards/rejected": -3.6439287662506104, + "step": 11282 + }, + { + "epoch": 2.82, + "grad_norm": 3.0987229347229004, + "learning_rate": 3.992625757977886e-06, + "logits/chosen": -0.47449973225593567, + "logits/rejected": -0.611663818359375, + "logps/chosen": -56.86619567871094, + "logps/rejected": -94.81452178955078, + "loss": 0.5841, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1820149421691895, + "rewards/margins": 7.320722579956055, + "rewards/rejected": -4.138707160949707, + "step": 11283 + }, + { + "epoch": 2.82, + "grad_norm": 6.00314998626709, + "learning_rate": 3.991855900481376e-06, + "logits/chosen": -0.5531376004219055, + "logits/rejected": -0.5841777920722961, + "logps/chosen": -52.4317512512207, + "logps/rejected": -111.32759094238281, + "loss": 0.6239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4276857376098633, + "rewards/margins": 7.208975315093994, + "rewards/rejected": -3.781289577484131, + "step": 11284 + }, + { + "epoch": 2.82, + "grad_norm": 11.126507759094238, + "learning_rate": 3.991086067897178e-06, + "logits/chosen": -0.4312618374824524, + "logits/rejected": -0.5368109941482544, + "logps/chosen": -59.043739318847656, + "logps/rejected": -87.14269256591797, + "loss": 0.7178, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9613943099975586, + "rewards/margins": 6.191300868988037, + "rewards/rejected": -3.2299067974090576, + "step": 11285 + }, + { + "epoch": 2.82, + "grad_norm": 6.820568561553955, + "learning_rate": 3.9903162602443155e-06, + "logits/chosen": -0.48771047592163086, + "logits/rejected": -0.5551737546920776, + "logps/chosen": -55.306800842285156, + "logps/rejected": -89.62493133544922, + "loss": 0.7509, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9784607887268066, + "rewards/margins": 5.613295555114746, + "rewards/rejected": -2.6348347663879395, + "step": 11286 + }, + { + "epoch": 2.82, + "grad_norm": 7.65933084487915, + "learning_rate": 3.989546477541813e-06, + "logits/chosen": -0.4822652339935303, + "logits/rejected": -0.5292118191719055, + "logps/chosen": -63.63994598388672, + "logps/rejected": -93.19825744628906, + "loss": 0.7133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9724650382995605, + "rewards/margins": 5.198341369628906, + "rewards/rejected": -2.2258763313293457, + "step": 11287 + }, + { + "epoch": 2.82, + "grad_norm": 35.14484786987305, + "learning_rate": 3.988776719808692e-06, + "logits/chosen": -0.595822274684906, + "logits/rejected": -0.6119210124015808, + "logps/chosen": -42.591148376464844, + "logps/rejected": -103.63424682617188, + "loss": 0.6312, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3350634574890137, + "rewards/margins": 6.780156135559082, + "rewards/rejected": -3.4450926780700684, + "step": 11288 + }, + { + "epoch": 2.82, + "grad_norm": 5.251272678375244, + "learning_rate": 3.988006987063973e-06, + "logits/chosen": -0.4565722346305847, + "logits/rejected": -0.5347056984901428, + "logps/chosen": -51.96285629272461, + "logps/rejected": -109.43426513671875, + "loss": 0.6037, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8528079986572266, + "rewards/margins": 6.009884834289551, + "rewards/rejected": -3.157076597213745, + "step": 11289 + }, + { + "epoch": 2.82, + "grad_norm": 6.732868194580078, + "learning_rate": 3.9872372793266785e-06, + "logits/chosen": -0.4916498064994812, + "logits/rejected": -0.5712964534759521, + "logps/chosen": -52.04829025268555, + "logps/rejected": -69.4300765991211, + "loss": 0.8439, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.69625186920166, + "rewards/margins": 4.183658599853516, + "rewards/rejected": -1.4874067306518555, + "step": 11290 + }, + { + "epoch": 2.82, + "grad_norm": 1.9333672523498535, + "learning_rate": 3.986467596615828e-06, + "logits/chosen": -0.523105263710022, + "logits/rejected": -0.6739141941070557, + "logps/chosen": -53.42372512817383, + "logps/rejected": -91.3665771484375, + "loss": 0.5508, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8370933532714844, + "rewards/margins": 7.273244857788086, + "rewards/rejected": -4.43615198135376, + "step": 11291 + }, + { + "epoch": 2.82, + "grad_norm": 4.05519962310791, + "learning_rate": 3.985697938950441e-06, + "logits/chosen": -0.47959864139556885, + "logits/rejected": -0.5779354572296143, + "logps/chosen": -61.74704360961914, + "logps/rejected": -120.56822204589844, + "loss": 0.6327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0236904621124268, + "rewards/margins": 6.806160926818848, + "rewards/rejected": -3.782470703125, + "step": 11292 + }, + { + "epoch": 2.83, + "grad_norm": 4.34934139251709, + "learning_rate": 3.984928306349537e-06, + "logits/chosen": -0.48686403036117554, + "logits/rejected": -0.5325142741203308, + "logps/chosen": -56.89503860473633, + "logps/rejected": -118.38391876220703, + "loss": 0.598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0907669067382812, + "rewards/margins": 6.823631763458252, + "rewards/rejected": -3.7328643798828125, + "step": 11293 + }, + { + "epoch": 2.83, + "grad_norm": 3.253838300704956, + "learning_rate": 3.984158698832134e-06, + "logits/chosen": -0.5125666260719299, + "logits/rejected": -0.5532578825950623, + "logps/chosen": -49.30116653442383, + "logps/rejected": -91.34905242919922, + "loss": 0.6187, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.463988780975342, + "rewards/margins": 5.68078088760376, + "rewards/rejected": -2.216792106628418, + "step": 11294 + }, + { + "epoch": 2.83, + "grad_norm": 3.0479352474212646, + "learning_rate": 3.983389116417251e-06, + "logits/chosen": -0.5023325085639954, + "logits/rejected": -0.5704787373542786, + "logps/chosen": -57.83341979980469, + "logps/rejected": -105.45035552978516, + "loss": 0.5769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.045729398727417, + "rewards/margins": 6.45914888381958, + "rewards/rejected": -3.413419485092163, + "step": 11295 + }, + { + "epoch": 2.83, + "grad_norm": 5.354089736938477, + "learning_rate": 3.982619559123903e-06, + "logits/chosen": -0.42787832021713257, + "logits/rejected": -0.5477384328842163, + "logps/chosen": -61.1096305847168, + "logps/rejected": -81.8277587890625, + "loss": 0.7127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9459025859832764, + "rewards/margins": 6.037838459014893, + "rewards/rejected": -3.091935873031616, + "step": 11296 + }, + { + "epoch": 2.83, + "grad_norm": 7.280076503753662, + "learning_rate": 3.981850026971108e-06, + "logits/chosen": -0.5686938762664795, + "logits/rejected": -0.6848539113998413, + "logps/chosen": -56.10826873779297, + "logps/rejected": -94.79386901855469, + "loss": 0.7444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.239441156387329, + "rewards/margins": 6.975584983825684, + "rewards/rejected": -3.73614501953125, + "step": 11297 + }, + { + "epoch": 2.83, + "grad_norm": 6.347419261932373, + "learning_rate": 3.981080519977883e-06, + "logits/chosen": -0.6072991490364075, + "logits/rejected": -0.6981865763664246, + "logps/chosen": -45.738853454589844, + "logps/rejected": -88.9931869506836, + "loss": 0.7435, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2787699699401855, + "rewards/margins": 6.666956424713135, + "rewards/rejected": -3.388186454772949, + "step": 11298 + }, + { + "epoch": 2.83, + "grad_norm": 5.317601203918457, + "learning_rate": 3.980311038163241e-06, + "logits/chosen": -0.5724877119064331, + "logits/rejected": -0.5676600337028503, + "logps/chosen": -45.754791259765625, + "logps/rejected": -114.87789916992188, + "loss": 0.6669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.896329641342163, + "rewards/margins": 6.0097198486328125, + "rewards/rejected": -3.113389730453491, + "step": 11299 + }, + { + "epoch": 2.83, + "grad_norm": 6.406678199768066, + "learning_rate": 3.979541581546197e-06, + "logits/chosen": -0.5316441655158997, + "logits/rejected": -0.5467901825904846, + "logps/chosen": -43.52080154418945, + "logps/rejected": -102.02510833740234, + "loss": 0.7499, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2730422019958496, + "rewards/margins": 6.803277492523193, + "rewards/rejected": -3.5302352905273438, + "step": 11300 + }, + { + "epoch": 2.83, + "grad_norm": 10.401116371154785, + "learning_rate": 3.978772150145767e-06, + "logits/chosen": -0.45362910628318787, + "logits/rejected": -0.5274717211723328, + "logps/chosen": -60.805267333984375, + "logps/rejected": -99.1872787475586, + "loss": 0.7531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.147942543029785, + "rewards/margins": 5.781889915466309, + "rewards/rejected": -2.6339478492736816, + "step": 11301 + }, + { + "epoch": 2.83, + "grad_norm": 8.110024452209473, + "learning_rate": 3.978002743980964e-06, + "logits/chosen": -0.5396846532821655, + "logits/rejected": -0.6116743683815002, + "logps/chosen": -56.19029998779297, + "logps/rejected": -94.30829620361328, + "loss": 0.7232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0740206241607666, + "rewards/margins": 5.375803470611572, + "rewards/rejected": -2.3017826080322266, + "step": 11302 + }, + { + "epoch": 2.83, + "grad_norm": 11.339753150939941, + "learning_rate": 3.977233363070799e-06, + "logits/chosen": -0.429395467042923, + "logits/rejected": -0.5235554575920105, + "logps/chosen": -60.58089065551758, + "logps/rejected": -95.50259399414062, + "loss": 0.6913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8178162574768066, + "rewards/margins": 6.011495590209961, + "rewards/rejected": -3.193678617477417, + "step": 11303 + }, + { + "epoch": 2.83, + "grad_norm": 3.454650402069092, + "learning_rate": 3.976464007434286e-06, + "logits/chosen": -0.5345398783683777, + "logits/rejected": -0.5943385362625122, + "logps/chosen": -53.17036819458008, + "logps/rejected": -91.06156921386719, + "loss": 0.6698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930302381515503, + "rewards/margins": 6.0273966789245605, + "rewards/rejected": -3.097094774246216, + "step": 11304 + }, + { + "epoch": 2.83, + "grad_norm": 6.426307201385498, + "learning_rate": 3.975694677090436e-06, + "logits/chosen": -0.470542848110199, + "logits/rejected": -0.5313845872879028, + "logps/chosen": -45.74999237060547, + "logps/rejected": -101.07103729248047, + "loss": 0.642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.256988763809204, + "rewards/margins": 6.603093147277832, + "rewards/rejected": -3.346104145050049, + "step": 11305 + }, + { + "epoch": 2.83, + "grad_norm": 4.505393981933594, + "learning_rate": 3.974925372058261e-06, + "logits/chosen": -0.5358371138572693, + "logits/rejected": -0.5724130868911743, + "logps/chosen": -52.50785827636719, + "logps/rejected": -92.36347198486328, + "loss": 0.6605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.769719123840332, + "rewards/margins": 5.588454246520996, + "rewards/rejected": -2.818735361099243, + "step": 11306 + }, + { + "epoch": 2.83, + "grad_norm": 3.93620228767395, + "learning_rate": 3.974156092356769e-06, + "logits/chosen": -0.5371235013008118, + "logits/rejected": -0.5921013951301575, + "logps/chosen": -45.547019958496094, + "logps/rejected": -99.46300506591797, + "loss": 0.6255, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9136743545532227, + "rewards/margins": 5.692168235778809, + "rewards/rejected": -2.778493642807007, + "step": 11307 + }, + { + "epoch": 2.83, + "grad_norm": 2.873843193054199, + "learning_rate": 3.973386838004971e-06, + "logits/chosen": -0.48226189613342285, + "logits/rejected": -0.6106132864952087, + "logps/chosen": -71.0982437133789, + "logps/rejected": -89.92062377929688, + "loss": 0.6383, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.838876247406006, + "rewards/margins": 6.38114595413208, + "rewards/rejected": -3.542269468307495, + "step": 11308 + }, + { + "epoch": 2.83, + "grad_norm": 12.165472984313965, + "learning_rate": 3.972617609021878e-06, + "logits/chosen": -0.4227830767631531, + "logits/rejected": -0.48375922441482544, + "logps/chosen": -48.86474609375, + "logps/rejected": -126.06997680664062, + "loss": 0.6718, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8747682571411133, + "rewards/margins": 6.460825443267822, + "rewards/rejected": -3.586056709289551, + "step": 11309 + }, + { + "epoch": 2.83, + "grad_norm": 3.252708673477173, + "learning_rate": 3.971848405426495e-06, + "logits/chosen": -0.479442834854126, + "logits/rejected": -0.5697870254516602, + "logps/chosen": -50.02898406982422, + "logps/rejected": -108.77424621582031, + "loss": 0.6305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.112205743789673, + "rewards/margins": 7.045638084411621, + "rewards/rejected": -3.9334330558776855, + "step": 11310 + }, + { + "epoch": 2.83, + "grad_norm": 5.1717658042907715, + "learning_rate": 3.971079227237832e-06, + "logits/chosen": -0.41542649269104004, + "logits/rejected": -0.4339815676212311, + "logps/chosen": -57.039772033691406, + "logps/rejected": -127.88594818115234, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.95930814743042, + "rewards/margins": 6.999826431274414, + "rewards/rejected": -4.040518283843994, + "step": 11311 + }, + { + "epoch": 2.83, + "grad_norm": 13.647204399108887, + "learning_rate": 3.970310074474894e-06, + "logits/chosen": -0.4929600954055786, + "logits/rejected": -0.6085641980171204, + "logps/chosen": -53.301490783691406, + "logps/rejected": -85.50601959228516, + "loss": 0.7703, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.175964117050171, + "rewards/margins": 6.571897983551025, + "rewards/rejected": -3.3959341049194336, + "step": 11312 + }, + { + "epoch": 2.83, + "grad_norm": 4.651698589324951, + "learning_rate": 3.969540947156693e-06, + "logits/chosen": -0.43770843744277954, + "logits/rejected": -0.49863123893737793, + "logps/chosen": -54.351341247558594, + "logps/rejected": -104.92277526855469, + "loss": 0.6593, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118084192276001, + "rewards/margins": 5.649692058563232, + "rewards/rejected": -2.5316081047058105, + "step": 11313 + }, + { + "epoch": 2.83, + "grad_norm": 3.555333375930786, + "learning_rate": 3.968771845302228e-06, + "logits/chosen": -0.5270692110061646, + "logits/rejected": -0.5798704624176025, + "logps/chosen": -51.91338348388672, + "logps/rejected": -95.98220825195312, + "loss": 0.5629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.216953754425049, + "rewards/margins": 7.052267074584961, + "rewards/rejected": -3.835313558578491, + "step": 11314 + }, + { + "epoch": 2.83, + "grad_norm": 5.367669582366943, + "learning_rate": 3.968002768930508e-06, + "logits/chosen": -0.42224907875061035, + "logits/rejected": -0.549224853515625, + "logps/chosen": -65.56138610839844, + "logps/rejected": -99.6027603149414, + "loss": 0.6203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1005821228027344, + "rewards/margins": 6.67538595199585, + "rewards/rejected": -3.574802875518799, + "step": 11315 + }, + { + "epoch": 2.83, + "grad_norm": 4.336030960083008, + "learning_rate": 3.9672337180605404e-06, + "logits/chosen": -0.5021321177482605, + "logits/rejected": -0.5735846757888794, + "logps/chosen": -52.970027923583984, + "logps/rejected": -104.7888412475586, + "loss": 0.6388, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4062893390655518, + "rewards/margins": 6.336042881011963, + "rewards/rejected": -2.929753541946411, + "step": 11316 + }, + { + "epoch": 2.83, + "grad_norm": 4.848924160003662, + "learning_rate": 3.9664646927113225e-06, + "logits/chosen": -0.5549358129501343, + "logits/rejected": -0.6222701668739319, + "logps/chosen": -45.95271301269531, + "logps/rejected": -93.5616683959961, + "loss": 0.6154, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2591865062713623, + "rewards/margins": 5.927420616149902, + "rewards/rejected": -2.66823410987854, + "step": 11317 + }, + { + "epoch": 2.83, + "grad_norm": 4.913609504699707, + "learning_rate": 3.965695692901862e-06, + "logits/chosen": -0.4100401997566223, + "logits/rejected": -0.47485285997390747, + "logps/chosen": -69.54846954345703, + "logps/rejected": -109.3357925415039, + "loss": 0.6907, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1749374866485596, + "rewards/margins": 6.166117191314697, + "rewards/rejected": -2.9911797046661377, + "step": 11318 + }, + { + "epoch": 2.83, + "grad_norm": 16.046770095825195, + "learning_rate": 3.964926718651161e-06, + "logits/chosen": -0.4360819458961487, + "logits/rejected": -0.5138317346572876, + "logps/chosen": -65.85737609863281, + "logps/rejected": -102.27317810058594, + "loss": 0.762, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.512819290161133, + "rewards/margins": 5.764286518096924, + "rewards/rejected": -3.251467227935791, + "step": 11319 + }, + { + "epoch": 2.83, + "grad_norm": 11.771151542663574, + "learning_rate": 3.964157769978224e-06, + "logits/chosen": -0.3770873248577118, + "logits/rejected": -0.48776349425315857, + "logps/chosen": -62.200748443603516, + "logps/rejected": -95.6627197265625, + "loss": 0.7242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.777876853942871, + "rewards/margins": 5.268711090087891, + "rewards/rejected": -2.4908339977264404, + "step": 11320 + }, + { + "epoch": 2.83, + "grad_norm": 5.823633670806885, + "learning_rate": 3.963388846902047e-06, + "logits/chosen": -0.6128722429275513, + "logits/rejected": -0.6699168682098389, + "logps/chosen": -53.405113220214844, + "logps/rejected": -87.91582489013672, + "loss": 0.6593, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8727059364318848, + "rewards/margins": 5.47711181640625, + "rewards/rejected": -2.6044061183929443, + "step": 11321 + }, + { + "epoch": 2.83, + "grad_norm": 4.052046775817871, + "learning_rate": 3.9626199494416364e-06, + "logits/chosen": -0.5298833250999451, + "logits/rejected": -0.6096640825271606, + "logps/chosen": -55.530555725097656, + "logps/rejected": -103.01243591308594, + "loss": 0.6112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0275964736938477, + "rewards/margins": 7.715169429779053, + "rewards/rejected": -4.687573432922363, + "step": 11322 + }, + { + "epoch": 2.83, + "grad_norm": 4.570409297943115, + "learning_rate": 3.96185107761599e-06, + "logits/chosen": -0.5149205327033997, + "logits/rejected": -0.5779849290847778, + "logps/chosen": -55.931602478027344, + "logps/rejected": -81.35430908203125, + "loss": 0.6847, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.12093186378479, + "rewards/margins": 5.350818634033203, + "rewards/rejected": -2.229886531829834, + "step": 11323 + }, + { + "epoch": 2.83, + "grad_norm": 3.5135018825531006, + "learning_rate": 3.9610822314441075e-06, + "logits/chosen": -0.48019343614578247, + "logits/rejected": -0.5832119584083557, + "logps/chosen": -57.66001510620117, + "logps/rejected": -85.93455505371094, + "loss": 0.6991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9470200538635254, + "rewards/margins": 5.88885498046875, + "rewards/rejected": -2.9418346881866455, + "step": 11324 + }, + { + "epoch": 2.83, + "grad_norm": 4.429009914398193, + "learning_rate": 3.960313410944988e-06, + "logits/chosen": -0.4909818470478058, + "logits/rejected": -0.6356730461120605, + "logps/chosen": -63.08620834350586, + "logps/rejected": -88.51496124267578, + "loss": 0.6491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.927316188812256, + "rewards/margins": 7.156650543212891, + "rewards/rejected": -4.229333877563477, + "step": 11325 + }, + { + "epoch": 2.83, + "grad_norm": 4.174520492553711, + "learning_rate": 3.959544616137629e-06, + "logits/chosen": -0.4341639280319214, + "logits/rejected": -0.5531433820724487, + "logps/chosen": -65.70333099365234, + "logps/rejected": -95.87574768066406, + "loss": 0.5895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.966256618499756, + "rewards/margins": 6.952835559844971, + "rewards/rejected": -3.986579179763794, + "step": 11326 + }, + { + "epoch": 2.83, + "grad_norm": 11.9329195022583, + "learning_rate": 3.958775847041031e-06, + "logits/chosen": -0.49878525733947754, + "logits/rejected": -0.5480088591575623, + "logps/chosen": -59.02107620239258, + "logps/rejected": -118.84088134765625, + "loss": 0.6942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.920102119445801, + "rewards/margins": 5.075499057769775, + "rewards/rejected": -2.1553964614868164, + "step": 11327 + }, + { + "epoch": 2.83, + "grad_norm": 15.331363677978516, + "learning_rate": 3.958007103674189e-06, + "logits/chosen": -0.48031339049339294, + "logits/rejected": -0.5805538892745972, + "logps/chosen": -55.10293197631836, + "logps/rejected": -100.38319396972656, + "loss": 0.684, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8814263343811035, + "rewards/margins": 6.88957405090332, + "rewards/rejected": -4.008147239685059, + "step": 11328 + }, + { + "epoch": 2.83, + "grad_norm": 3.7930116653442383, + "learning_rate": 3.957238386056098e-06, + "logits/chosen": -0.4985693097114563, + "logits/rejected": -0.5662990808486938, + "logps/chosen": -57.72998046875, + "logps/rejected": -100.92984008789062, + "loss": 0.6783, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0158960819244385, + "rewards/margins": 6.5283050537109375, + "rewards/rejected": -3.512409210205078, + "step": 11329 + }, + { + "epoch": 2.83, + "grad_norm": 4.380565643310547, + "learning_rate": 3.956469694205757e-06, + "logits/chosen": -0.5668972730636597, + "logits/rejected": -0.6717625260353088, + "logps/chosen": -44.16310119628906, + "logps/rejected": -79.51565551757812, + "loss": 0.6462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1955339908599854, + "rewards/margins": 6.101596832275391, + "rewards/rejected": -2.9060633182525635, + "step": 11330 + }, + { + "epoch": 2.83, + "grad_norm": 8.679605484008789, + "learning_rate": 3.955701028142161e-06, + "logits/chosen": -0.48964351415634155, + "logits/rejected": -0.5628840327262878, + "logps/chosen": -53.24433517456055, + "logps/rejected": -101.73208618164062, + "loss": 0.6639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.021576166152954, + "rewards/margins": 5.721127033233643, + "rewards/rejected": -2.6995506286621094, + "step": 11331 + }, + { + "epoch": 2.83, + "grad_norm": 9.059389114379883, + "learning_rate": 3.954932387884301e-06, + "logits/chosen": -0.47130686044692993, + "logits/rejected": -0.5507882833480835, + "logps/chosen": -52.369895935058594, + "logps/rejected": -102.44802856445312, + "loss": 0.6826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.112423896789551, + "rewards/margins": 5.996986389160156, + "rewards/rejected": -2.8845620155334473, + "step": 11332 + }, + { + "epoch": 2.84, + "grad_norm": 4.279228687286377, + "learning_rate": 3.954163773451174e-06, + "logits/chosen": -0.5642949938774109, + "logits/rejected": -0.6734470725059509, + "logps/chosen": -53.16820526123047, + "logps/rejected": -89.04596710205078, + "loss": 0.6102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.418020725250244, + "rewards/margins": 6.617910385131836, + "rewards/rejected": -3.199889659881592, + "step": 11333 + }, + { + "epoch": 2.84, + "grad_norm": 4.426114559173584, + "learning_rate": 3.9533951848617735e-06, + "logits/chosen": -0.48842546343803406, + "logits/rejected": -0.5710248947143555, + "logps/chosen": -46.633853912353516, + "logps/rejected": -97.94253540039062, + "loss": 0.6042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9815590381622314, + "rewards/margins": 6.73726224899292, + "rewards/rejected": -3.7557029724121094, + "step": 11334 + }, + { + "epoch": 2.84, + "grad_norm": 8.430460929870605, + "learning_rate": 3.952626622135091e-06, + "logits/chosen": -0.4792931377887726, + "logits/rejected": -0.6069399118423462, + "logps/chosen": -65.00606536865234, + "logps/rejected": -79.79679870605469, + "loss": 0.728, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2041091918945312, + "rewards/margins": 5.728744983673096, + "rewards/rejected": -2.5246365070343018, + "step": 11335 + }, + { + "epoch": 2.84, + "grad_norm": 7.241184711456299, + "learning_rate": 3.951858085290117e-06, + "logits/chosen": -0.43727728724479675, + "logits/rejected": -0.5216525793075562, + "logps/chosen": -68.60565185546875, + "logps/rejected": -92.36558532714844, + "loss": 0.7029, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0864064693450928, + "rewards/margins": 5.572109222412109, + "rewards/rejected": -2.4857025146484375, + "step": 11336 + }, + { + "epoch": 2.84, + "grad_norm": 6.267665386199951, + "learning_rate": 3.951089574345846e-06, + "logits/chosen": -0.5023175477981567, + "logits/rejected": -0.5791828632354736, + "logps/chosen": -57.225189208984375, + "logps/rejected": -112.81208801269531, + "loss": 0.7253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.222465753555298, + "rewards/margins": 5.880578517913818, + "rewards/rejected": -2.6581125259399414, + "step": 11337 + }, + { + "epoch": 2.84, + "grad_norm": 12.190101623535156, + "learning_rate": 3.950321089321268e-06, + "logits/chosen": -0.40837520360946655, + "logits/rejected": -0.5467291474342346, + "logps/chosen": -67.34980773925781, + "logps/rejected": -101.8030776977539, + "loss": 0.6142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.111616611480713, + "rewards/margins": 6.550107479095459, + "rewards/rejected": -3.438491106033325, + "step": 11338 + }, + { + "epoch": 2.84, + "grad_norm": 4.695420742034912, + "learning_rate": 3.9495526302353716e-06, + "logits/chosen": -0.429281085729599, + "logits/rejected": -0.544651210308075, + "logps/chosen": -46.97334671020508, + "logps/rejected": -95.01812744140625, + "loss": 0.5632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0152504444122314, + "rewards/margins": 6.339715480804443, + "rewards/rejected": -3.32446551322937, + "step": 11339 + }, + { + "epoch": 2.84, + "grad_norm": 4.275743007659912, + "learning_rate": 3.948784197107146e-06, + "logits/chosen": -0.579240620136261, + "logits/rejected": -0.6373029351234436, + "logps/chosen": -49.114044189453125, + "logps/rejected": -86.20999145507812, + "loss": 0.7196, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1498656272888184, + "rewards/margins": 6.053944110870361, + "rewards/rejected": -2.904078722000122, + "step": 11340 + }, + { + "epoch": 2.84, + "grad_norm": 6.804479122161865, + "learning_rate": 3.948015789955583e-06, + "logits/chosen": -0.44185128808021545, + "logits/rejected": -0.5391719937324524, + "logps/chosen": -57.28362274169922, + "logps/rejected": -94.82600402832031, + "loss": 0.6206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.024937868118286, + "rewards/margins": 6.224762916564941, + "rewards/rejected": -3.1998252868652344, + "step": 11341 + }, + { + "epoch": 2.84, + "grad_norm": 11.899590492248535, + "learning_rate": 3.947247408799668e-06, + "logits/chosen": -0.5286438465118408, + "logits/rejected": -0.5876626968383789, + "logps/chosen": -55.65681457519531, + "logps/rejected": -110.67503356933594, + "loss": 0.5921, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9126739501953125, + "rewards/margins": 6.960262298583984, + "rewards/rejected": -4.047588348388672, + "step": 11342 + }, + { + "epoch": 2.84, + "grad_norm": 5.902003288269043, + "learning_rate": 3.946479053658387e-06, + "logits/chosen": -0.49818307161331177, + "logits/rejected": -0.5802762508392334, + "logps/chosen": -62.52015686035156, + "logps/rejected": -90.32410430908203, + "loss": 0.7395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.147385358810425, + "rewards/margins": 6.201990127563477, + "rewards/rejected": -3.0546045303344727, + "step": 11343 + }, + { + "epoch": 2.84, + "grad_norm": 13.364702224731445, + "learning_rate": 3.945710724550731e-06, + "logits/chosen": -0.40597862005233765, + "logits/rejected": -0.4794747829437256, + "logps/chosen": -56.6792106628418, + "logps/rejected": -103.35855102539062, + "loss": 0.7485, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8833796977996826, + "rewards/margins": 5.855307579040527, + "rewards/rejected": -2.9719274044036865, + "step": 11344 + }, + { + "epoch": 2.84, + "grad_norm": 16.00780487060547, + "learning_rate": 3.944942421495687e-06, + "logits/chosen": -0.5121973156929016, + "logits/rejected": -0.6169007420539856, + "logps/chosen": -55.158348083496094, + "logps/rejected": -85.98717498779297, + "loss": 0.7846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9477803707122803, + "rewards/margins": 5.947044849395752, + "rewards/rejected": -2.999264717102051, + "step": 11345 + }, + { + "epoch": 2.84, + "grad_norm": 5.05226993560791, + "learning_rate": 3.9441741445122335e-06, + "logits/chosen": -0.43697208166122437, + "logits/rejected": -0.5013576149940491, + "logps/chosen": -70.98014068603516, + "logps/rejected": -83.29052734375, + "loss": 0.8414, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.467365026473999, + "rewards/margins": 4.844447612762451, + "rewards/rejected": -1.3770822286605835, + "step": 11346 + }, + { + "epoch": 2.84, + "grad_norm": 4.621273994445801, + "learning_rate": 3.943405893619362e-06, + "logits/chosen": -0.4420420229434967, + "logits/rejected": -0.566677987575531, + "logps/chosen": -52.43836212158203, + "logps/rejected": -85.66604614257812, + "loss": 0.5554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8851263523101807, + "rewards/margins": 5.410215854644775, + "rewards/rejected": -2.525089740753174, + "step": 11347 + }, + { + "epoch": 2.84, + "grad_norm": 3.57146954536438, + "learning_rate": 3.942637668836055e-06, + "logits/chosen": -0.5274672508239746, + "logits/rejected": -0.5814465284347534, + "logps/chosen": -50.04224395751953, + "logps/rejected": -81.17557525634766, + "loss": 0.676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1611826419830322, + "rewards/margins": 6.211163520812988, + "rewards/rejected": -3.049980878829956, + "step": 11348 + }, + { + "epoch": 2.84, + "grad_norm": 4.2438459396362305, + "learning_rate": 3.941869470181296e-06, + "logits/chosen": -0.4629557430744171, + "logits/rejected": -0.571595311164856, + "logps/chosen": -55.89621353149414, + "logps/rejected": -95.7193832397461, + "loss": 0.6662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9127659797668457, + "rewards/margins": 6.7248992919921875, + "rewards/rejected": -3.8121328353881836, + "step": 11349 + }, + { + "epoch": 2.84, + "grad_norm": 10.174543380737305, + "learning_rate": 3.941101297674067e-06, + "logits/chosen": -0.5058755874633789, + "logits/rejected": -0.5637106895446777, + "logps/chosen": -44.53445816040039, + "logps/rejected": -83.12275695800781, + "loss": 0.7125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.333059310913086, + "rewards/margins": 6.280048847198486, + "rewards/rejected": -2.9469900131225586, + "step": 11350 + }, + { + "epoch": 2.84, + "grad_norm": 5.177655220031738, + "learning_rate": 3.940333151333351e-06, + "logits/chosen": -0.44028031826019287, + "logits/rejected": -0.5263710618019104, + "logps/chosen": -48.13997268676758, + "logps/rejected": -86.54743957519531, + "loss": 0.6902, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1619338989257812, + "rewards/margins": 6.358552932739258, + "rewards/rejected": -3.1966190338134766, + "step": 11351 + }, + { + "epoch": 2.84, + "grad_norm": 9.636215209960938, + "learning_rate": 3.939565031178132e-06, + "logits/chosen": -0.5210464000701904, + "logits/rejected": -0.590383768081665, + "logps/chosen": -66.24615478515625, + "logps/rejected": -90.29776000976562, + "loss": 0.7779, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9147307872772217, + "rewards/margins": 5.836353302001953, + "rewards/rejected": -2.9216222763061523, + "step": 11352 + }, + { + "epoch": 2.84, + "grad_norm": 4.7867960929870605, + "learning_rate": 3.938796937227387e-06, + "logits/chosen": -0.4358351528644562, + "logits/rejected": -0.5087217688560486, + "logps/chosen": -56.83262252807617, + "logps/rejected": -92.89663696289062, + "loss": 0.6985, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.961995840072632, + "rewards/margins": 5.873256683349609, + "rewards/rejected": -2.9112610816955566, + "step": 11353 + }, + { + "epoch": 2.84, + "grad_norm": 28.827789306640625, + "learning_rate": 3.938028869500099e-06, + "logits/chosen": -0.5474162697792053, + "logits/rejected": -0.5753653049468994, + "logps/chosen": -47.854026794433594, + "logps/rejected": -104.23458862304688, + "loss": 0.8565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7849535942077637, + "rewards/margins": 5.051111698150635, + "rewards/rejected": -2.266158103942871, + "step": 11354 + }, + { + "epoch": 2.84, + "grad_norm": 5.612708568572998, + "learning_rate": 3.937260828015246e-06, + "logits/chosen": -0.4925151765346527, + "logits/rejected": -0.6012527346611023, + "logps/chosen": -47.22889709472656, + "logps/rejected": -93.76494598388672, + "loss": 0.613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0805180072784424, + "rewards/margins": 6.573016166687012, + "rewards/rejected": -3.4924979209899902, + "step": 11355 + }, + { + "epoch": 2.84, + "grad_norm": 3.869332790374756, + "learning_rate": 3.936492812791812e-06, + "logits/chosen": -0.4756149351596832, + "logits/rejected": -0.5684474110603333, + "logps/chosen": -55.768070220947266, + "logps/rejected": -88.54917907714844, + "loss": 0.6725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.209319591522217, + "rewards/margins": 5.737430095672607, + "rewards/rejected": -2.5281105041503906, + "step": 11356 + }, + { + "epoch": 2.84, + "grad_norm": 2.992182493209839, + "learning_rate": 3.935724823848768e-06, + "logits/chosen": -0.5020042061805725, + "logits/rejected": -0.5779234766960144, + "logps/chosen": -62.29296112060547, + "logps/rejected": -90.90460205078125, + "loss": 0.6495, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2054128646850586, + "rewards/margins": 7.187074184417725, + "rewards/rejected": -3.981661796569824, + "step": 11357 + }, + { + "epoch": 2.84, + "grad_norm": 11.19144058227539, + "learning_rate": 3.934956861205096e-06, + "logits/chosen": -0.5199747681617737, + "logits/rejected": -0.6283860802650452, + "logps/chosen": -51.731807708740234, + "logps/rejected": -104.26924896240234, + "loss": 0.7783, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.081530809402466, + "rewards/margins": 7.276863098144531, + "rewards/rejected": -4.195332050323486, + "step": 11358 + }, + { + "epoch": 2.84, + "grad_norm": 4.336803913116455, + "learning_rate": 3.9341889248797754e-06, + "logits/chosen": -0.484924852848053, + "logits/rejected": -0.5456217527389526, + "logps/chosen": -54.50960159301758, + "logps/rejected": -103.78827667236328, + "loss": 0.6754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3121178150177, + "rewards/margins": 5.846408367156982, + "rewards/rejected": -2.5342905521392822, + "step": 11359 + }, + { + "epoch": 2.84, + "grad_norm": 3.1632320880889893, + "learning_rate": 3.933421014891777e-06, + "logits/chosen": -0.46483466029167175, + "logits/rejected": -0.5211647152900696, + "logps/chosen": -54.608856201171875, + "logps/rejected": -121.07717895507812, + "loss": 0.6818, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9992096424102783, + "rewards/margins": 7.049656391143799, + "rewards/rejected": -4.050446033477783, + "step": 11360 + }, + { + "epoch": 2.84, + "grad_norm": 4.13823938369751, + "learning_rate": 3.932653131260081e-06, + "logits/chosen": -0.45422986149787903, + "logits/rejected": -0.5403279066085815, + "logps/chosen": -49.72224807739258, + "logps/rejected": -106.6169662475586, + "loss": 0.6185, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.138979911804199, + "rewards/margins": 6.133668899536133, + "rewards/rejected": -2.9946889877319336, + "step": 11361 + }, + { + "epoch": 2.84, + "grad_norm": 5.109488487243652, + "learning_rate": 3.931885274003661e-06, + "logits/chosen": -0.5234979391098022, + "logits/rejected": -0.6396430134773254, + "logps/chosen": -50.60243225097656, + "logps/rejected": -90.85060119628906, + "loss": 0.6465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1016111373901367, + "rewards/margins": 7.113818168640137, + "rewards/rejected": -4.012206554412842, + "step": 11362 + }, + { + "epoch": 2.84, + "grad_norm": 8.706485748291016, + "learning_rate": 3.931117443141494e-06, + "logits/chosen": -0.41279685497283936, + "logits/rejected": -0.5069501399993896, + "logps/chosen": -60.93968963623047, + "logps/rejected": -82.35784912109375, + "loss": 0.7071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.047633171081543, + "rewards/margins": 6.23223876953125, + "rewards/rejected": -3.184605598449707, + "step": 11363 + }, + { + "epoch": 2.84, + "grad_norm": 3.6971232891082764, + "learning_rate": 3.9303496386925495e-06, + "logits/chosen": -0.4968876242637634, + "logits/rejected": -0.6179327964782715, + "logps/chosen": -51.282684326171875, + "logps/rejected": -89.01852416992188, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2258756160736084, + "rewards/margins": 6.835784912109375, + "rewards/rejected": -3.6099088191986084, + "step": 11364 + }, + { + "epoch": 2.84, + "grad_norm": 2.7528090476989746, + "learning_rate": 3.929581860675804e-06, + "logits/chosen": -0.5011571049690247, + "logits/rejected": -0.5961048603057861, + "logps/chosen": -51.180564880371094, + "logps/rejected": -90.9850082397461, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.348707914352417, + "rewards/margins": 6.3150739669799805, + "rewards/rejected": -2.9663660526275635, + "step": 11365 + }, + { + "epoch": 2.84, + "grad_norm": 2.928812026977539, + "learning_rate": 3.92881410911023e-06, + "logits/chosen": -0.43755578994750977, + "logits/rejected": -0.5279003977775574, + "logps/chosen": -45.67197036743164, + "logps/rejected": -98.60942077636719, + "loss": 0.5323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.112119674682617, + "rewards/margins": 6.3920793533325195, + "rewards/rejected": -3.279958963394165, + "step": 11366 + }, + { + "epoch": 2.84, + "grad_norm": 3.946575880050659, + "learning_rate": 3.9280463840147985e-06, + "logits/chosen": -0.45834270119667053, + "logits/rejected": -0.6194097995758057, + "logps/chosen": -58.79749298095703, + "logps/rejected": -76.65837860107422, + "loss": 0.6285, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4232265949249268, + "rewards/margins": 6.273838043212891, + "rewards/rejected": -2.850611686706543, + "step": 11367 + }, + { + "epoch": 2.84, + "grad_norm": 6.8721089363098145, + "learning_rate": 3.92727868540848e-06, + "logits/chosen": -0.43487703800201416, + "logits/rejected": -0.47441232204437256, + "logps/chosen": -71.12285614013672, + "logps/rejected": -104.06310272216797, + "loss": 0.8161, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.656219244003296, + "rewards/margins": 4.862132549285889, + "rewards/rejected": -2.2059133052825928, + "step": 11368 + }, + { + "epoch": 2.84, + "grad_norm": 11.559921264648438, + "learning_rate": 3.926511013310247e-06, + "logits/chosen": -0.4384046792984009, + "logits/rejected": -0.533278226852417, + "logps/chosen": -58.668487548828125, + "logps/rejected": -96.88363647460938, + "loss": 0.6729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2747437953948975, + "rewards/margins": 5.359622478485107, + "rewards/rejected": -2.08487868309021, + "step": 11369 + }, + { + "epoch": 2.84, + "grad_norm": 7.19423770904541, + "learning_rate": 3.925743367739069e-06, + "logits/chosen": -0.5363772511482239, + "logits/rejected": -0.5892162322998047, + "logps/chosen": -59.91440963745117, + "logps/rejected": -110.68216705322266, + "loss": 0.7094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7366783618927, + "rewards/margins": 7.188664436340332, + "rewards/rejected": -4.451986312866211, + "step": 11370 + }, + { + "epoch": 2.84, + "grad_norm": 4.584683418273926, + "learning_rate": 3.924975748713914e-06, + "logits/chosen": -0.4652014672756195, + "logits/rejected": -0.5815710425376892, + "logps/chosen": -51.052825927734375, + "logps/rejected": -78.46885681152344, + "loss": 0.5774, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.192173480987549, + "rewards/margins": 6.035060882568359, + "rewards/rejected": -2.8428871631622314, + "step": 11371 + }, + { + "epoch": 2.84, + "grad_norm": 6.04462194442749, + "learning_rate": 3.924208156253752e-06, + "logits/chosen": -0.5105099678039551, + "logits/rejected": -0.5929269790649414, + "logps/chosen": -80.06828308105469, + "logps/rejected": -91.48593139648438, + "loss": 0.6467, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2121405601501465, + "rewards/margins": 6.608261585235596, + "rewards/rejected": -3.3961217403411865, + "step": 11372 + }, + { + "epoch": 2.85, + "grad_norm": 5.742345809936523, + "learning_rate": 3.923440590377553e-06, + "logits/chosen": -0.5254250168800354, + "logits/rejected": -0.6598705649375916, + "logps/chosen": -61.97782897949219, + "logps/rejected": -83.15728759765625, + "loss": 0.7534, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0141987800598145, + "rewards/margins": 6.6369781494140625, + "rewards/rejected": -3.622779369354248, + "step": 11373 + }, + { + "epoch": 2.85, + "grad_norm": 2.907273054122925, + "learning_rate": 3.922673051104279e-06, + "logits/chosen": -0.5086931586265564, + "logits/rejected": -0.5432729125022888, + "logps/chosen": -49.047943115234375, + "logps/rejected": -88.17973327636719, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.282860517501831, + "rewards/margins": 5.80911922454834, + "rewards/rejected": -2.5262584686279297, + "step": 11374 + }, + { + "epoch": 2.85, + "grad_norm": 5.531331539154053, + "learning_rate": 3.9219055384529e-06, + "logits/chosen": -0.47371023893356323, + "logits/rejected": -0.5641701221466064, + "logps/chosen": -46.50792694091797, + "logps/rejected": -92.74726104736328, + "loss": 0.6221, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.920419216156006, + "rewards/margins": 6.1714677810668945, + "rewards/rejected": -3.2510488033294678, + "step": 11375 + }, + { + "epoch": 2.85, + "grad_norm": 13.126941680908203, + "learning_rate": 3.921138052442382e-06, + "logits/chosen": -0.40963536500930786, + "logits/rejected": -0.5291786789894104, + "logps/chosen": -63.59152603149414, + "logps/rejected": -101.63021850585938, + "loss": 0.7259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6078405380249023, + "rewards/margins": 6.837539196014404, + "rewards/rejected": -4.22969913482666, + "step": 11376 + }, + { + "epoch": 2.85, + "grad_norm": 6.470991134643555, + "learning_rate": 3.920370593091692e-06, + "logits/chosen": -0.4615568518638611, + "logits/rejected": -0.5348856449127197, + "logps/chosen": -61.48720932006836, + "logps/rejected": -103.56717681884766, + "loss": 0.8318, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.021268129348755, + "rewards/margins": 5.342174530029297, + "rewards/rejected": -2.320906639099121, + "step": 11377 + }, + { + "epoch": 2.85, + "grad_norm": 11.750654220581055, + "learning_rate": 3.919603160419791e-06, + "logits/chosen": -0.4889484941959381, + "logits/rejected": -0.5983229279518127, + "logps/chosen": -50.50428771972656, + "logps/rejected": -95.72367858886719, + "loss": 0.6303, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.887190341949463, + "rewards/margins": 6.07985782623291, + "rewards/rejected": -3.192667245864868, + "step": 11378 + }, + { + "epoch": 2.85, + "grad_norm": 6.245121479034424, + "learning_rate": 3.9188357544456464e-06, + "logits/chosen": -0.4452117085456848, + "logits/rejected": -0.4868636429309845, + "logps/chosen": -61.282691955566406, + "logps/rejected": -109.85054779052734, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6822803020477295, + "rewards/margins": 6.281157493591309, + "rewards/rejected": -3.598877191543579, + "step": 11379 + }, + { + "epoch": 2.85, + "grad_norm": 3.673774242401123, + "learning_rate": 3.91806837518822e-06, + "logits/chosen": -0.5708181858062744, + "logits/rejected": -0.676019012928009, + "logps/chosen": -44.65718078613281, + "logps/rejected": -108.39883422851562, + "loss": 0.5778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.163595199584961, + "rewards/margins": 6.484006404876709, + "rewards/rejected": -3.320411443710327, + "step": 11380 + }, + { + "epoch": 2.85, + "grad_norm": 11.96932601928711, + "learning_rate": 3.9173010226664754e-06, + "logits/chosen": -0.49093419313430786, + "logits/rejected": -0.5683406591415405, + "logps/chosen": -56.20488739013672, + "logps/rejected": -115.26368713378906, + "loss": 0.6686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9159433841705322, + "rewards/margins": 6.253230571746826, + "rewards/rejected": -3.337287187576294, + "step": 11381 + }, + { + "epoch": 2.85, + "grad_norm": 6.589631080627441, + "learning_rate": 3.916533696899373e-06, + "logits/chosen": -0.5817909240722656, + "logits/rejected": -0.6466348171234131, + "logps/chosen": -48.418373107910156, + "logps/rejected": -101.12158203125, + "loss": 0.6963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8660383224487305, + "rewards/margins": 6.959177017211914, + "rewards/rejected": -4.093138217926025, + "step": 11382 + }, + { + "epoch": 2.85, + "grad_norm": 2.8780555725097656, + "learning_rate": 3.915766397905876e-06, + "logits/chosen": -0.5687098503112793, + "logits/rejected": -0.6257428526878357, + "logps/chosen": -69.30301666259766, + "logps/rejected": -109.7676010131836, + "loss": 0.6232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.329613447189331, + "rewards/margins": 5.794751167297363, + "rewards/rejected": -2.4651379585266113, + "step": 11383 + }, + { + "epoch": 2.85, + "grad_norm": 4.447767734527588, + "learning_rate": 3.914999125704945e-06, + "logits/chosen": -0.43090906739234924, + "logits/rejected": -0.49584639072418213, + "logps/chosen": -49.8510627746582, + "logps/rejected": -103.31399536132812, + "loss": 0.6527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0457754135131836, + "rewards/margins": 6.5602641105651855, + "rewards/rejected": -3.514488697052002, + "step": 11384 + }, + { + "epoch": 2.85, + "grad_norm": 12.845196723937988, + "learning_rate": 3.914231880315539e-06, + "logits/chosen": -0.4656165838241577, + "logits/rejected": -0.51481032371521, + "logps/chosen": -57.800933837890625, + "logps/rejected": -89.81742858886719, + "loss": 0.7353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9066309928894043, + "rewards/margins": 5.202226638793945, + "rewards/rejected": -2.295595645904541, + "step": 11385 + }, + { + "epoch": 2.85, + "grad_norm": 10.580880165100098, + "learning_rate": 3.913464661756618e-06, + "logits/chosen": -0.47146332263946533, + "logits/rejected": -0.6012341976165771, + "logps/chosen": -60.04592514038086, + "logps/rejected": -100.93037414550781, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8694918155670166, + "rewards/margins": 5.782113075256348, + "rewards/rejected": -2.91262149810791, + "step": 11386 + }, + { + "epoch": 2.85, + "grad_norm": 2.6745505332946777, + "learning_rate": 3.91269747004714e-06, + "logits/chosen": -0.5300145745277405, + "logits/rejected": -0.6363783478736877, + "logps/chosen": -45.356895446777344, + "logps/rejected": -109.1572036743164, + "loss": 0.5311, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1475331783294678, + "rewards/margins": 7.427661895751953, + "rewards/rejected": -4.280128479003906, + "step": 11387 + }, + { + "epoch": 2.85, + "grad_norm": 6.703486442565918, + "learning_rate": 3.911930305206066e-06, + "logits/chosen": -0.5214037895202637, + "logits/rejected": -0.5824411511421204, + "logps/chosen": -60.07011795043945, + "logps/rejected": -96.40996551513672, + "loss": 0.7223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.953016757965088, + "rewards/margins": 5.243583679199219, + "rewards/rejected": -2.29056715965271, + "step": 11388 + }, + { + "epoch": 2.85, + "grad_norm": 5.166757106781006, + "learning_rate": 3.9111631672523496e-06, + "logits/chosen": -0.49201810359954834, + "logits/rejected": -0.541954517364502, + "logps/chosen": -46.05959701538086, + "logps/rejected": -112.36276245117188, + "loss": 0.5584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.26015043258667, + "rewards/margins": 7.528771877288818, + "rewards/rejected": -4.268621444702148, + "step": 11389 + }, + { + "epoch": 2.85, + "grad_norm": 3.7628822326660156, + "learning_rate": 3.910396056204951e-06, + "logits/chosen": -0.5403482913970947, + "logits/rejected": -0.6722003221511841, + "logps/chosen": -57.867584228515625, + "logps/rejected": -86.94031524658203, + "loss": 0.6801, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0407445430755615, + "rewards/margins": 6.476750373840332, + "rewards/rejected": -3.436005115509033, + "step": 11390 + }, + { + "epoch": 2.85, + "grad_norm": 11.743118286132812, + "learning_rate": 3.909628972082826e-06, + "logits/chosen": -0.48738062381744385, + "logits/rejected": -0.5593751072883606, + "logps/chosen": -54.71607208251953, + "logps/rejected": -101.5648193359375, + "loss": 0.6574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.998546600341797, + "rewards/margins": 5.479104042053223, + "rewards/rejected": -2.480557680130005, + "step": 11391 + }, + { + "epoch": 2.85, + "grad_norm": 3.6781065464019775, + "learning_rate": 3.908861914904926e-06, + "logits/chosen": -0.46119073033332825, + "logits/rejected": -0.5942748785018921, + "logps/chosen": -67.12313842773438, + "logps/rejected": -103.67149353027344, + "loss": 0.5466, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9300637245178223, + "rewards/margins": 6.986545562744141, + "rewards/rejected": -4.056482315063477, + "step": 11392 + }, + { + "epoch": 2.85, + "grad_norm": 4.6721954345703125, + "learning_rate": 3.908094884690209e-06, + "logits/chosen": -0.4932370185852051, + "logits/rejected": -0.5885308384895325, + "logps/chosen": -54.38389587402344, + "logps/rejected": -103.18321228027344, + "loss": 0.5949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3014492988586426, + "rewards/margins": 6.059924125671387, + "rewards/rejected": -2.7584753036499023, + "step": 11393 + }, + { + "epoch": 2.85, + "grad_norm": 4.631969451904297, + "learning_rate": 3.9073278814576295e-06, + "logits/chosen": -0.4132678508758545, + "logits/rejected": -0.5371902585029602, + "logps/chosen": -46.36412048339844, + "logps/rejected": -89.05480194091797, + "loss": 0.6613, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0767722129821777, + "rewards/margins": 6.283096790313721, + "rewards/rejected": -3.206324577331543, + "step": 11394 + }, + { + "epoch": 2.85, + "grad_norm": 5.558708667755127, + "learning_rate": 3.906560905226141e-06, + "logits/chosen": -0.4836593270301819, + "logits/rejected": -0.5686662197113037, + "logps/chosen": -52.647708892822266, + "logps/rejected": -100.41949462890625, + "loss": 0.6189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1948044300079346, + "rewards/margins": 6.654760360717773, + "rewards/rejected": -3.459956169128418, + "step": 11395 + }, + { + "epoch": 2.85, + "grad_norm": 2.749788522720337, + "learning_rate": 3.905793956014695e-06, + "logits/chosen": -0.4768732190132141, + "logits/rejected": -0.5933531522750854, + "logps/chosen": -56.793548583984375, + "logps/rejected": -111.0665054321289, + "loss": 0.6139, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3758785724639893, + "rewards/margins": 8.031023979187012, + "rewards/rejected": -4.655145168304443, + "step": 11396 + }, + { + "epoch": 2.85, + "grad_norm": 5.881240367889404, + "learning_rate": 3.905027033842244e-06, + "logits/chosen": -0.5230304598808289, + "logits/rejected": -0.619175910949707, + "logps/chosen": -60.70099639892578, + "logps/rejected": -101.71762084960938, + "loss": 0.8265, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.074204206466675, + "rewards/margins": 6.1644463539123535, + "rewards/rejected": -3.0902421474456787, + "step": 11397 + }, + { + "epoch": 2.85, + "grad_norm": 5.48006010055542, + "learning_rate": 3.90426013872774e-06, + "logits/chosen": -0.50804203748703, + "logits/rejected": -0.6115676164627075, + "logps/chosen": -51.22373962402344, + "logps/rejected": -100.6330337524414, + "loss": 0.7041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.834425449371338, + "rewards/margins": 5.968976020812988, + "rewards/rejected": -3.1345505714416504, + "step": 11398 + }, + { + "epoch": 2.85, + "grad_norm": 7.0586957931518555, + "learning_rate": 3.903493270690133e-06, + "logits/chosen": -0.4862149953842163, + "logits/rejected": -0.613342821598053, + "logps/chosen": -61.145816802978516, + "logps/rejected": -98.57107543945312, + "loss": 0.7307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.041452169418335, + "rewards/margins": 6.363180160522461, + "rewards/rejected": -3.321728229522705, + "step": 11399 + }, + { + "epoch": 2.85, + "grad_norm": 7.171053886413574, + "learning_rate": 3.902726429748374e-06, + "logits/chosen": -0.4375801384449005, + "logits/rejected": -0.5561579465866089, + "logps/chosen": -52.859153747558594, + "logps/rejected": -80.2812271118164, + "loss": 0.5878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7441506385803223, + "rewards/margins": 5.458359718322754, + "rewards/rejected": -2.71420955657959, + "step": 11400 + }, + { + "epoch": 2.85, + "grad_norm": 6.817304611206055, + "learning_rate": 3.90195961592141e-06, + "logits/chosen": -0.5123466849327087, + "logits/rejected": -0.566989004611969, + "logps/chosen": -54.702247619628906, + "logps/rejected": -105.47946166992188, + "loss": 0.7476, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.120755672454834, + "rewards/margins": 6.15156364440918, + "rewards/rejected": -3.0308079719543457, + "step": 11401 + }, + { + "epoch": 2.85, + "grad_norm": 6.675551891326904, + "learning_rate": 3.901192829228196e-06, + "logits/chosen": -0.5952252149581909, + "logits/rejected": -0.635918378829956, + "logps/chosen": -55.905738830566406, + "logps/rejected": -92.48738098144531, + "loss": 0.6282, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6215972900390625, + "rewards/margins": 6.222954750061035, + "rewards/rejected": -2.6013572216033936, + "step": 11402 + }, + { + "epoch": 2.85, + "grad_norm": 3.8692688941955566, + "learning_rate": 3.9004260696876726e-06, + "logits/chosen": -0.45925211906433105, + "logits/rejected": -0.6068757772445679, + "logps/chosen": -62.32799530029297, + "logps/rejected": -95.89703369140625, + "loss": 0.5841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8665599822998047, + "rewards/margins": 7.120090007781982, + "rewards/rejected": -4.253530025482178, + "step": 11403 + }, + { + "epoch": 2.85, + "grad_norm": 8.285249710083008, + "learning_rate": 3.8996593373187905e-06, + "logits/chosen": -0.45403194427490234, + "logits/rejected": -0.5858698487281799, + "logps/chosen": -51.748573303222656, + "logps/rejected": -87.11650848388672, + "loss": 0.6254, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9113643169403076, + "rewards/margins": 6.889988422393799, + "rewards/rejected": -3.978624105453491, + "step": 11404 + }, + { + "epoch": 2.85, + "grad_norm": 3.526982069015503, + "learning_rate": 3.8988926321405e-06, + "logits/chosen": -0.4602561593055725, + "logits/rejected": -0.5043430328369141, + "logps/chosen": -52.85884094238281, + "logps/rejected": -100.29302978515625, + "loss": 0.5587, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.990454912185669, + "rewards/margins": 6.688170909881592, + "rewards/rejected": -3.697716236114502, + "step": 11405 + }, + { + "epoch": 2.85, + "grad_norm": 10.270230293273926, + "learning_rate": 3.89812595417174e-06, + "logits/chosen": -0.5381141901016235, + "logits/rejected": -0.616704523563385, + "logps/chosen": -54.85704803466797, + "logps/rejected": -97.58548736572266, + "loss": 0.71, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.798931121826172, + "rewards/margins": 5.988239765167236, + "rewards/rejected": -3.1893084049224854, + "step": 11406 + }, + { + "epoch": 2.85, + "grad_norm": 2.458401679992676, + "learning_rate": 3.897359303431461e-06, + "logits/chosen": -0.4951077699661255, + "logits/rejected": -0.6248626112937927, + "logps/chosen": -48.37671661376953, + "logps/rejected": -91.870361328125, + "loss": 0.5341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1987452507019043, + "rewards/margins": 6.241010665893555, + "rewards/rejected": -3.0422656536102295, + "step": 11407 + }, + { + "epoch": 2.85, + "grad_norm": 26.132408142089844, + "learning_rate": 3.896592679938606e-06, + "logits/chosen": -0.43098652362823486, + "logits/rejected": -0.48183655738830566, + "logps/chosen": -63.29051208496094, + "logps/rejected": -95.27086639404297, + "loss": 0.7081, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.088987112045288, + "rewards/margins": 5.445328712463379, + "rewards/rejected": -2.3563413619995117, + "step": 11408 + }, + { + "epoch": 2.85, + "grad_norm": 4.342888832092285, + "learning_rate": 3.895826083712121e-06, + "logits/chosen": -0.5024912357330322, + "logits/rejected": -0.5672192573547363, + "logps/chosen": -51.20963668823242, + "logps/rejected": -97.29605865478516, + "loss": 0.6285, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0231218338012695, + "rewards/margins": 5.81127405166626, + "rewards/rejected": -2.7881524562835693, + "step": 11409 + }, + { + "epoch": 2.85, + "grad_norm": 5.047169208526611, + "learning_rate": 3.895059514770947e-06, + "logits/chosen": -0.5799608826637268, + "logits/rejected": -0.6419969797134399, + "logps/chosen": -49.99476623535156, + "logps/rejected": -116.16787719726562, + "loss": 0.6105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9922313690185547, + "rewards/margins": 7.54841947555542, + "rewards/rejected": -4.556188583374023, + "step": 11410 + }, + { + "epoch": 2.85, + "grad_norm": 8.301301002502441, + "learning_rate": 3.894292973134028e-06, + "logits/chosen": -0.4808551073074341, + "logits/rejected": -0.5520843267440796, + "logps/chosen": -59.258514404296875, + "logps/rejected": -90.08859252929688, + "loss": 0.7106, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9379310607910156, + "rewards/margins": 5.379022121429443, + "rewards/rejected": -2.4410908222198486, + "step": 11411 + }, + { + "epoch": 2.85, + "grad_norm": 21.241207122802734, + "learning_rate": 3.893526458820304e-06, + "logits/chosen": -0.47815895080566406, + "logits/rejected": -0.5233322381973267, + "logps/chosen": -55.25760269165039, + "logps/rejected": -89.55197143554688, + "loss": 0.7984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.931321382522583, + "rewards/margins": 5.39195442199707, + "rewards/rejected": -2.4606330394744873, + "step": 11412 + }, + { + "epoch": 2.86, + "grad_norm": 4.392611503601074, + "learning_rate": 3.89275997184872e-06, + "logits/chosen": -0.4724552631378174, + "logits/rejected": -0.5737439393997192, + "logps/chosen": -51.195701599121094, + "logps/rejected": -98.54560852050781, + "loss": 0.6427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8793649673461914, + "rewards/margins": 6.670066833496094, + "rewards/rejected": -3.7907016277313232, + "step": 11413 + }, + { + "epoch": 2.86, + "grad_norm": 4.243833541870117, + "learning_rate": 3.891993512238215e-06, + "logits/chosen": -0.4836804270744324, + "logits/rejected": -0.5430686473846436, + "logps/chosen": -53.911224365234375, + "logps/rejected": -105.16231536865234, + "loss": 0.585, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9854249954223633, + "rewards/margins": 6.008697509765625, + "rewards/rejected": -3.023272752761841, + "step": 11414 + }, + { + "epoch": 2.86, + "grad_norm": 4.15604829788208, + "learning_rate": 3.891227080007726e-06, + "logits/chosen": -0.43974411487579346, + "logits/rejected": -0.5759814381599426, + "logps/chosen": -68.19654083251953, + "logps/rejected": -83.89844512939453, + "loss": 0.6631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015367269515991, + "rewards/margins": 6.390271186828613, + "rewards/rejected": -3.374903440475464, + "step": 11415 + }, + { + "epoch": 2.86, + "grad_norm": 15.145432472229004, + "learning_rate": 3.890460675176199e-06, + "logits/chosen": -0.47685784101486206, + "logits/rejected": -0.5611196160316467, + "logps/chosen": -59.283897399902344, + "logps/rejected": -101.68746948242188, + "loss": 0.6971, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0897529125213623, + "rewards/margins": 6.843374252319336, + "rewards/rejected": -3.7536213397979736, + "step": 11416 + }, + { + "epoch": 2.86, + "grad_norm": 2.8656551837921143, + "learning_rate": 3.889694297762567e-06, + "logits/chosen": -0.5105980634689331, + "logits/rejected": -0.603848397731781, + "logps/chosen": -52.697364807128906, + "logps/rejected": -95.1545181274414, + "loss": 0.5252, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1992549896240234, + "rewards/margins": 6.256653785705566, + "rewards/rejected": -3.057399272918701, + "step": 11417 + }, + { + "epoch": 2.86, + "grad_norm": 6.85163688659668, + "learning_rate": 3.888927947785769e-06, + "logits/chosen": -0.5274089574813843, + "logits/rejected": -0.5470410585403442, + "logps/chosen": -47.11955261230469, + "logps/rejected": -95.93742370605469, + "loss": 0.7057, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9991869926452637, + "rewards/margins": 5.438104629516602, + "rewards/rejected": -2.438917398452759, + "step": 11418 + }, + { + "epoch": 2.86, + "grad_norm": 5.216708183288574, + "learning_rate": 3.888161625264744e-06, + "logits/chosen": -0.5128729343414307, + "logits/rejected": -0.6221129298210144, + "logps/chosen": -60.013404846191406, + "logps/rejected": -91.61329650878906, + "loss": 0.7259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9234778881073, + "rewards/margins": 5.839534282684326, + "rewards/rejected": -2.9160566329956055, + "step": 11419 + }, + { + "epoch": 2.86, + "grad_norm": 18.5517520904541, + "learning_rate": 3.887395330218429e-06, + "logits/chosen": -0.3985334634780884, + "logits/rejected": -0.49375656247138977, + "logps/chosen": -51.79267501831055, + "logps/rejected": -85.05642700195312, + "loss": 0.6369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0501699447631836, + "rewards/margins": 5.497127056121826, + "rewards/rejected": -2.4469571113586426, + "step": 11420 + }, + { + "epoch": 2.86, + "grad_norm": 3.952895164489746, + "learning_rate": 3.886629062665757e-06, + "logits/chosen": -0.46551233530044556, + "logits/rejected": -0.5712648034095764, + "logps/chosen": -58.221500396728516, + "logps/rejected": -100.74629974365234, + "loss": 0.5619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1510586738586426, + "rewards/margins": 7.029320240020752, + "rewards/rejected": -3.878261089324951, + "step": 11421 + }, + { + "epoch": 2.86, + "grad_norm": 6.327371120452881, + "learning_rate": 3.885862822625666e-06, + "logits/chosen": -0.5067819952964783, + "logits/rejected": -0.6505639553070068, + "logps/chosen": -60.30613327026367, + "logps/rejected": -104.31742858886719, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9662392139434814, + "rewards/margins": 6.723947525024414, + "rewards/rejected": -3.7577085494995117, + "step": 11422 + }, + { + "epoch": 2.86, + "grad_norm": 32.33552169799805, + "learning_rate": 3.885096610117091e-06, + "logits/chosen": -0.4780505299568176, + "logits/rejected": -0.5957962274551392, + "logps/chosen": -67.42748260498047, + "logps/rejected": -96.87620544433594, + "loss": 0.717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.962646007537842, + "rewards/margins": 6.713674545288086, + "rewards/rejected": -3.751027822494507, + "step": 11423 + }, + { + "epoch": 2.86, + "grad_norm": 15.04167366027832, + "learning_rate": 3.884330425158963e-06, + "logits/chosen": -0.5382353663444519, + "logits/rejected": -0.6092482209205627, + "logps/chosen": -56.65089416503906, + "logps/rejected": -87.82856750488281, + "loss": 0.6456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.079455852508545, + "rewards/margins": 5.933987617492676, + "rewards/rejected": -2.854532241821289, + "step": 11424 + }, + { + "epoch": 2.86, + "grad_norm": 14.979000091552734, + "learning_rate": 3.883564267770216e-06, + "logits/chosen": -0.47558921575546265, + "logits/rejected": -0.5677886009216309, + "logps/chosen": -62.56123733520508, + "logps/rejected": -85.87079620361328, + "loss": 0.6721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2294931411743164, + "rewards/margins": 4.757049560546875, + "rewards/rejected": -1.5275564193725586, + "step": 11425 + }, + { + "epoch": 2.86, + "grad_norm": 5.659397125244141, + "learning_rate": 3.882798137969785e-06, + "logits/chosen": -0.4738900363445282, + "logits/rejected": -0.5381037592887878, + "logps/chosen": -54.48191833496094, + "logps/rejected": -83.4267578125, + "loss": 0.7199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.035339593887329, + "rewards/margins": 4.619419097900391, + "rewards/rejected": -1.5840790271759033, + "step": 11426 + }, + { + "epoch": 2.86, + "grad_norm": 4.412862300872803, + "learning_rate": 3.882032035776601e-06, + "logits/chosen": -0.47439044713974, + "logits/rejected": -0.5265341401100159, + "logps/chosen": -50.08345031738281, + "logps/rejected": -111.41866302490234, + "loss": 0.5656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2023823261260986, + "rewards/margins": 6.088127136230469, + "rewards/rejected": -2.885744571685791, + "step": 11427 + }, + { + "epoch": 2.86, + "grad_norm": 6.911512851715088, + "learning_rate": 3.8812659612095925e-06, + "logits/chosen": -0.3926762640476227, + "logits/rejected": -0.48576152324676514, + "logps/chosen": -49.690399169921875, + "logps/rejected": -91.59141540527344, + "loss": 0.6509, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.874053955078125, + "rewards/margins": 6.214027404785156, + "rewards/rejected": -3.3399734497070312, + "step": 11428 + }, + { + "epoch": 2.86, + "grad_norm": 5.564064025878906, + "learning_rate": 3.880499914287693e-06, + "logits/chosen": -0.39466074109077454, + "logits/rejected": -0.49282991886138916, + "logps/chosen": -68.27197265625, + "logps/rejected": -96.14701843261719, + "loss": 0.8459, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.877169370651245, + "rewards/margins": 5.332515716552734, + "rewards/rejected": -2.455345630645752, + "step": 11429 + }, + { + "epoch": 2.86, + "grad_norm": 5.528555393218994, + "learning_rate": 3.879733895029833e-06, + "logits/chosen": -0.4636877775192261, + "logits/rejected": -0.5635554194450378, + "logps/chosen": -53.7513542175293, + "logps/rejected": -84.97332763671875, + "loss": 0.7178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8609962463378906, + "rewards/margins": 4.943331718444824, + "rewards/rejected": -2.0823354721069336, + "step": 11430 + }, + { + "epoch": 2.86, + "grad_norm": 6.827998638153076, + "learning_rate": 3.878967903454937e-06, + "logits/chosen": -0.4952724575996399, + "logits/rejected": -0.5769211053848267, + "logps/chosen": -67.8053207397461, + "logps/rejected": -113.74363708496094, + "loss": 0.7883, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.808260202407837, + "rewards/margins": 6.2892255783081055, + "rewards/rejected": -3.4809656143188477, + "step": 11431 + }, + { + "epoch": 2.86, + "grad_norm": 10.444793701171875, + "learning_rate": 3.878201939581937e-06, + "logits/chosen": -0.5487743616104126, + "logits/rejected": -0.6303583383560181, + "logps/chosen": -60.3551025390625, + "logps/rejected": -100.90977478027344, + "loss": 0.639, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.819725513458252, + "rewards/margins": 6.345943450927734, + "rewards/rejected": -3.526217460632324, + "step": 11432 + }, + { + "epoch": 2.86, + "grad_norm": 4.028501987457275, + "learning_rate": 3.8774360034297606e-06, + "logits/chosen": -0.4695332646369934, + "logits/rejected": -0.5967085361480713, + "logps/chosen": -60.61846923828125, + "logps/rejected": -86.74063873291016, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3110034465789795, + "rewards/margins": 6.156929016113281, + "rewards/rejected": -2.845925807952881, + "step": 11433 + }, + { + "epoch": 2.86, + "grad_norm": 6.321132183074951, + "learning_rate": 3.876670095017337e-06, + "logits/chosen": -0.46894335746765137, + "logits/rejected": -0.5504414439201355, + "logps/chosen": -55.181480407714844, + "logps/rejected": -87.69866943359375, + "loss": 0.7271, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9075050354003906, + "rewards/margins": 5.160187244415283, + "rewards/rejected": -2.2526817321777344, + "step": 11434 + }, + { + "epoch": 2.86, + "grad_norm": 3.3913464546203613, + "learning_rate": 3.875904214363587e-06, + "logits/chosen": -0.4612838923931122, + "logits/rejected": -0.5262712240219116, + "logps/chosen": -52.54684066772461, + "logps/rejected": -96.04855346679688, + "loss": 0.6695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3064136505126953, + "rewards/margins": 7.3444437980651855, + "rewards/rejected": -4.038029670715332, + "step": 11435 + }, + { + "epoch": 2.86, + "grad_norm": 10.847397804260254, + "learning_rate": 3.8751383614874405e-06, + "logits/chosen": -0.5700140595436096, + "logits/rejected": -0.5994645357131958, + "logps/chosen": -56.51597213745117, + "logps/rejected": -98.23699951171875, + "loss": 0.7453, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0310864448547363, + "rewards/margins": 5.794329643249512, + "rewards/rejected": -2.7632431983947754, + "step": 11436 + }, + { + "epoch": 2.86, + "grad_norm": 3.638681650161743, + "learning_rate": 3.874372536407822e-06, + "logits/chosen": -0.4486370384693146, + "logits/rejected": -0.4715615510940552, + "logps/chosen": -57.63556671142578, + "logps/rejected": -117.09382629394531, + "loss": 0.6297, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8825395107269287, + "rewards/margins": 6.2118635177612305, + "rewards/rejected": -3.3293235301971436, + "step": 11437 + }, + { + "epoch": 2.86, + "grad_norm": 6.315662860870361, + "learning_rate": 3.8736067391436556e-06, + "logits/chosen": -0.45703786611557007, + "logits/rejected": -0.5143701434135437, + "logps/chosen": -59.32746124267578, + "logps/rejected": -107.05335998535156, + "loss": 0.6027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135338306427002, + "rewards/margins": 5.997542381286621, + "rewards/rejected": -2.8622045516967773, + "step": 11438 + }, + { + "epoch": 2.86, + "grad_norm": 2.0050644874572754, + "learning_rate": 3.872840969713864e-06, + "logits/chosen": -0.5599918961524963, + "logits/rejected": -0.6318085789680481, + "logps/chosen": -41.13855743408203, + "logps/rejected": -93.52489471435547, + "loss": 0.5075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5760350227355957, + "rewards/margins": 7.443282127380371, + "rewards/rejected": -3.867246627807617, + "step": 11439 + }, + { + "epoch": 2.86, + "grad_norm": 3.7590017318725586, + "learning_rate": 3.872075228137371e-06, + "logits/chosen": -0.4201408624649048, + "logits/rejected": -0.48245149850845337, + "logps/chosen": -52.88758087158203, + "logps/rejected": -95.23515319824219, + "loss": 0.6451, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135063648223877, + "rewards/margins": 6.379609107971191, + "rewards/rejected": -3.2445454597473145, + "step": 11440 + }, + { + "epoch": 2.86, + "grad_norm": 3.8328871726989746, + "learning_rate": 3.8713095144331e-06, + "logits/chosen": -0.5187010169029236, + "logits/rejected": -0.624140739440918, + "logps/chosen": -53.92596435546875, + "logps/rejected": -109.52914428710938, + "loss": 0.6081, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9201483726501465, + "rewards/margins": 7.231832027435303, + "rewards/rejected": -4.311683654785156, + "step": 11441 + }, + { + "epoch": 2.86, + "grad_norm": 5.024115085601807, + "learning_rate": 3.87054382861997e-06, + "logits/chosen": -0.5645933747291565, + "logits/rejected": -0.5855531096458435, + "logps/chosen": -52.76692199707031, + "logps/rejected": -101.02481842041016, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.082484722137451, + "rewards/margins": 5.836985111236572, + "rewards/rejected": -2.754499912261963, + "step": 11442 + }, + { + "epoch": 2.86, + "grad_norm": 9.508121490478516, + "learning_rate": 3.869778170716904e-06, + "logits/chosen": -0.49956372380256653, + "logits/rejected": -0.5469735264778137, + "logps/chosen": -67.22583770751953, + "logps/rejected": -97.3705062866211, + "loss": 0.8813, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6946067810058594, + "rewards/margins": 4.7497406005859375, + "rewards/rejected": -2.055133819580078, + "step": 11443 + }, + { + "epoch": 2.86, + "grad_norm": 10.693037033081055, + "learning_rate": 3.869012540742821e-06, + "logits/chosen": -0.542169988155365, + "logits/rejected": -0.587648868560791, + "logps/chosen": -48.48290252685547, + "logps/rejected": -108.14765167236328, + "loss": 0.7349, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0800392627716064, + "rewards/margins": 5.894368648529053, + "rewards/rejected": -2.814329147338867, + "step": 11444 + }, + { + "epoch": 2.86, + "grad_norm": 6.738425254821777, + "learning_rate": 3.868246938716643e-06, + "logits/chosen": -0.5030432939529419, + "logits/rejected": -0.5881394743919373, + "logps/chosen": -56.29789352416992, + "logps/rejected": -103.10948944091797, + "loss": 0.6574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.147778034210205, + "rewards/margins": 7.175662040710449, + "rewards/rejected": -4.027884006500244, + "step": 11445 + }, + { + "epoch": 2.86, + "grad_norm": 2.756957769393921, + "learning_rate": 3.867481364657285e-06, + "logits/chosen": -0.47550439834594727, + "logits/rejected": -0.5068752765655518, + "logps/chosen": -57.40576171875, + "logps/rejected": -102.23541259765625, + "loss": 0.6429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.079890012741089, + "rewards/margins": 6.088397026062012, + "rewards/rejected": -3.008507251739502, + "step": 11446 + }, + { + "epoch": 2.86, + "grad_norm": 5.548238754272461, + "learning_rate": 3.866715818583666e-06, + "logits/chosen": -0.6445094347000122, + "logits/rejected": -0.6918563842773438, + "logps/chosen": -44.933868408203125, + "logps/rejected": -93.09037780761719, + "loss": 0.7164, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.406188488006592, + "rewards/margins": 5.794927597045898, + "rewards/rejected": -2.3887388706207275, + "step": 11447 + }, + { + "epoch": 2.86, + "grad_norm": 5.192986965179443, + "learning_rate": 3.8659503005147085e-06, + "logits/chosen": -0.5584242939949036, + "logits/rejected": -0.6094402074813843, + "logps/chosen": -48.01175308227539, + "logps/rejected": -104.83220672607422, + "loss": 0.6946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.118415355682373, + "rewards/margins": 6.805387496948242, + "rewards/rejected": -3.6869726181030273, + "step": 11448 + }, + { + "epoch": 2.86, + "grad_norm": 3.747714042663574, + "learning_rate": 3.865184810469321e-06, + "logits/chosen": -0.4717927575111389, + "logits/rejected": -0.6083717346191406, + "logps/chosen": -55.02473068237305, + "logps/rejected": -90.87319946289062, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.059631586074829, + "rewards/margins": 6.892399311065674, + "rewards/rejected": -3.8327674865722656, + "step": 11449 + }, + { + "epoch": 2.86, + "grad_norm": 12.44308090209961, + "learning_rate": 3.864419348466426e-06, + "logits/chosen": -0.4768245220184326, + "logits/rejected": -0.5733050107955933, + "logps/chosen": -46.27250289916992, + "logps/rejected": -91.38105773925781, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1923046112060547, + "rewards/margins": 6.247092247009277, + "rewards/rejected": -3.0547871589660645, + "step": 11450 + }, + { + "epoch": 2.86, + "grad_norm": 4.303244590759277, + "learning_rate": 3.863653914524936e-06, + "logits/chosen": -0.4933919608592987, + "logits/rejected": -0.6316742300987244, + "logps/chosen": -59.433990478515625, + "logps/rejected": -85.34679412841797, + "loss": 0.6923, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.041335344314575, + "rewards/margins": 6.668514728546143, + "rewards/rejected": -3.6271791458129883, + "step": 11451 + }, + { + "epoch": 2.86, + "grad_norm": 11.965662002563477, + "learning_rate": 3.862888508663768e-06, + "logits/chosen": -0.4494556188583374, + "logits/rejected": -0.5885575413703918, + "logps/chosen": -56.01406478881836, + "logps/rejected": -89.79798126220703, + "loss": 0.6529, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7406632900238037, + "rewards/margins": 6.811525344848633, + "rewards/rejected": -4.070862770080566, + "step": 11452 + }, + { + "epoch": 2.87, + "grad_norm": 5.771514415740967, + "learning_rate": 3.862123130901833e-06, + "logits/chosen": -0.5806746482849121, + "logits/rejected": -0.6502945423126221, + "logps/chosen": -52.852195739746094, + "logps/rejected": -110.554931640625, + "loss": 0.6821, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.868039131164551, + "rewards/margins": 6.246607780456543, + "rewards/rejected": -3.378567934036255, + "step": 11453 + }, + { + "epoch": 2.87, + "grad_norm": 3.0005311965942383, + "learning_rate": 3.861357781258046e-06, + "logits/chosen": -0.4192792475223541, + "logits/rejected": -0.5300976037979126, + "logps/chosen": -52.661041259765625, + "logps/rejected": -114.9327163696289, + "loss": 0.5232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.227215051651001, + "rewards/margins": 6.829930305480957, + "rewards/rejected": -3.6027157306671143, + "step": 11454 + }, + { + "epoch": 2.87, + "grad_norm": 5.841943264007568, + "learning_rate": 3.8605924597513206e-06, + "logits/chosen": -0.505408763885498, + "logits/rejected": -0.5399551391601562, + "logps/chosen": -67.55256652832031, + "logps/rejected": -100.45628356933594, + "loss": 0.7614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2084336280822754, + "rewards/margins": 6.396153926849365, + "rewards/rejected": -3.18772029876709, + "step": 11455 + }, + { + "epoch": 2.87, + "grad_norm": 6.993703365325928, + "learning_rate": 3.859827166400566e-06, + "logits/chosen": -0.49036431312561035, + "logits/rejected": -0.5926103591918945, + "logps/chosen": -59.443504333496094, + "logps/rejected": -89.977294921875, + "loss": 0.7547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9504294395446777, + "rewards/margins": 5.586914539337158, + "rewards/rejected": -2.6364853382110596, + "step": 11456 + }, + { + "epoch": 2.87, + "grad_norm": 6.915070056915283, + "learning_rate": 3.859061901224695e-06, + "logits/chosen": -0.5186551809310913, + "logits/rejected": -0.6181612610816956, + "logps/chosen": -60.7420654296875, + "logps/rejected": -92.38951110839844, + "loss": 0.6667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.895899534225464, + "rewards/margins": 6.239047050476074, + "rewards/rejected": -3.3431477546691895, + "step": 11457 + }, + { + "epoch": 2.87, + "grad_norm": 11.456526756286621, + "learning_rate": 3.858296664242617e-06, + "logits/chosen": -0.5111028552055359, + "logits/rejected": -0.5739786028862, + "logps/chosen": -60.09867858886719, + "logps/rejected": -86.61058044433594, + "loss": 0.9868, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.4856069087982178, + "rewards/margins": 4.550337791442871, + "rewards/rejected": -2.0647308826446533, + "step": 11458 + }, + { + "epoch": 2.87, + "grad_norm": 12.893194198608398, + "learning_rate": 3.857531455473246e-06, + "logits/chosen": -0.511793851852417, + "logits/rejected": -0.6209431886672974, + "logps/chosen": -49.79617691040039, + "logps/rejected": -93.52764892578125, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8687496185302734, + "rewards/margins": 6.534092903137207, + "rewards/rejected": -3.6653430461883545, + "step": 11459 + }, + { + "epoch": 2.87, + "grad_norm": 3.5439300537109375, + "learning_rate": 3.856766274935485e-06, + "logits/chosen": -0.510391354560852, + "logits/rejected": -0.6121355891227722, + "logps/chosen": -56.31168746948242, + "logps/rejected": -91.82149505615234, + "loss": 0.5915, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.138150215148926, + "rewards/margins": 6.38094425201416, + "rewards/rejected": -3.2427947521209717, + "step": 11460 + }, + { + "epoch": 2.87, + "grad_norm": 4.271139621734619, + "learning_rate": 3.856001122648245e-06, + "logits/chosen": -0.5430091023445129, + "logits/rejected": -0.606873631477356, + "logps/chosen": -45.567108154296875, + "logps/rejected": -91.15628051757812, + "loss": 0.5541, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.929227352142334, + "rewards/margins": 6.343562126159668, + "rewards/rejected": -3.414334535598755, + "step": 11461 + }, + { + "epoch": 2.87, + "grad_norm": 3.5223348140716553, + "learning_rate": 3.855235998630436e-06, + "logits/chosen": -0.4426426589488983, + "logits/rejected": -0.5851763486862183, + "logps/chosen": -73.00679779052734, + "logps/rejected": -103.40646362304688, + "loss": 0.6302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1072826385498047, + "rewards/margins": 7.320676326751709, + "rewards/rejected": -4.213393688201904, + "step": 11462 + }, + { + "epoch": 2.87, + "grad_norm": 5.837108135223389, + "learning_rate": 3.8544709029009625e-06, + "logits/chosen": -0.4798249900341034, + "logits/rejected": -0.5309039950370789, + "logps/chosen": -64.18779754638672, + "logps/rejected": -131.38905334472656, + "loss": 0.7441, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7464630603790283, + "rewards/margins": 6.6913065910339355, + "rewards/rejected": -3.9448437690734863, + "step": 11463 + }, + { + "epoch": 2.87, + "grad_norm": 6.1406965255737305, + "learning_rate": 3.853705835478731e-06, + "logits/chosen": -0.5267657041549683, + "logits/rejected": -0.5993577241897583, + "logps/chosen": -68.05999755859375, + "logps/rejected": -109.24716186523438, + "loss": 0.7078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9057977199554443, + "rewards/margins": 6.881631851196289, + "rewards/rejected": -3.9758341312408447, + "step": 11464 + }, + { + "epoch": 2.87, + "grad_norm": 2.9407920837402344, + "learning_rate": 3.852940796382647e-06, + "logits/chosen": -0.5401239395141602, + "logits/rejected": -0.5976709723472595, + "logps/chosen": -42.60987854003906, + "logps/rejected": -96.05523681640625, + "loss": 0.5843, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.222135305404663, + "rewards/margins": 6.162650108337402, + "rewards/rejected": -2.9405157566070557, + "step": 11465 + }, + { + "epoch": 2.87, + "grad_norm": 4.505366802215576, + "learning_rate": 3.852175785631618e-06, + "logits/chosen": -0.45543575286865234, + "logits/rejected": -0.4984201192855835, + "logps/chosen": -55.84004592895508, + "logps/rejected": -102.39691162109375, + "loss": 0.6136, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121227741241455, + "rewards/margins": 5.086005210876465, + "rewards/rejected": -1.9647774696350098, + "step": 11466 + }, + { + "epoch": 2.87, + "grad_norm": 5.477740287780762, + "learning_rate": 3.851410803244546e-06, + "logits/chosen": -0.5635669231414795, + "logits/rejected": -0.6456747651100159, + "logps/chosen": -52.86548614501953, + "logps/rejected": -74.57064056396484, + "loss": 0.7022, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.187208414077759, + "rewards/margins": 5.883096694946289, + "rewards/rejected": -2.6958882808685303, + "step": 11467 + }, + { + "epoch": 2.87, + "grad_norm": 4.710931777954102, + "learning_rate": 3.8506458492403335e-06, + "logits/chosen": -0.5178441405296326, + "logits/rejected": -0.6119086742401123, + "logps/chosen": -58.769248962402344, + "logps/rejected": -97.97930908203125, + "loss": 0.724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0223748683929443, + "rewards/margins": 6.34174919128418, + "rewards/rejected": -3.319373607635498, + "step": 11468 + }, + { + "epoch": 2.87, + "grad_norm": 10.874760627746582, + "learning_rate": 3.849880923637884e-06, + "logits/chosen": -0.4725278317928314, + "logits/rejected": -0.5132171511650085, + "logps/chosen": -59.40352249145508, + "logps/rejected": -113.65977478027344, + "loss": 0.7689, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7487359046936035, + "rewards/margins": 5.909071922302246, + "rewards/rejected": -3.1603360176086426, + "step": 11469 + }, + { + "epoch": 2.87, + "grad_norm": 8.01964282989502, + "learning_rate": 3.849116026456102e-06, + "logits/chosen": -0.5930458903312683, + "logits/rejected": -0.6446352601051331, + "logps/chosen": -52.71604537963867, + "logps/rejected": -98.29934692382812, + "loss": 0.7885, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.067070722579956, + "rewards/margins": 6.344627857208252, + "rewards/rejected": -3.277557134628296, + "step": 11470 + }, + { + "epoch": 2.87, + "grad_norm": 5.897876739501953, + "learning_rate": 3.848351157713886e-06, + "logits/chosen": -0.4075796902179718, + "logits/rejected": -0.49847862124443054, + "logps/chosen": -62.616539001464844, + "logps/rejected": -90.40526580810547, + "loss": 0.6397, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.952439546585083, + "rewards/margins": 5.2808403968811035, + "rewards/rejected": -2.3284010887145996, + "step": 11471 + }, + { + "epoch": 2.87, + "grad_norm": 12.148422241210938, + "learning_rate": 3.847586317430137e-06, + "logits/chosen": -0.5493096113204956, + "logits/rejected": -0.6507456302642822, + "logps/chosen": -47.400203704833984, + "logps/rejected": -98.21316528320312, + "loss": 0.5676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.135572910308838, + "rewards/margins": 7.1686248779296875, + "rewards/rejected": -4.033051490783691, + "step": 11472 + }, + { + "epoch": 2.87, + "grad_norm": 8.434976577758789, + "learning_rate": 3.846821505623758e-06, + "logits/chosen": -0.4177802503108978, + "logits/rejected": -0.5090728402137756, + "logps/chosen": -55.962738037109375, + "logps/rejected": -107.16481018066406, + "loss": 0.7613, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.908759355545044, + "rewards/margins": 5.0112833976745605, + "rewards/rejected": -2.1025238037109375, + "step": 11473 + }, + { + "epoch": 2.87, + "grad_norm": 18.17827606201172, + "learning_rate": 3.846056722313646e-06, + "logits/chosen": -0.46947938203811646, + "logits/rejected": -0.601371705532074, + "logps/chosen": -62.8070182800293, + "logps/rejected": -90.62903594970703, + "loss": 0.7365, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8914003372192383, + "rewards/margins": 6.223733901977539, + "rewards/rejected": -3.332333564758301, + "step": 11474 + }, + { + "epoch": 2.87, + "grad_norm": 16.43943977355957, + "learning_rate": 3.845291967518697e-06, + "logits/chosen": -0.4876089096069336, + "logits/rejected": -0.5814202427864075, + "logps/chosen": -65.65052795410156, + "logps/rejected": -101.8863525390625, + "loss": 0.8287, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4196951389312744, + "rewards/margins": 5.717833518981934, + "rewards/rejected": -3.2981386184692383, + "step": 11475 + }, + { + "epoch": 2.87, + "grad_norm": 10.403515815734863, + "learning_rate": 3.844527241257813e-06, + "logits/chosen": -0.5503209829330444, + "logits/rejected": -0.6595793962478638, + "logps/chosen": -67.26806640625, + "logps/rejected": -102.4646987915039, + "loss": 0.7424, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.865724563598633, + "rewards/margins": 6.256479740142822, + "rewards/rejected": -3.3907558917999268, + "step": 11476 + }, + { + "epoch": 2.87, + "grad_norm": 12.164146423339844, + "learning_rate": 3.843762543549893e-06, + "logits/chosen": -0.5060082674026489, + "logits/rejected": -0.5942509174346924, + "logps/chosen": -53.1497802734375, + "logps/rejected": -100.46749114990234, + "loss": 0.6519, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8727970123291016, + "rewards/margins": 6.091429710388184, + "rewards/rejected": -3.218632221221924, + "step": 11477 + }, + { + "epoch": 2.87, + "grad_norm": 14.19522476196289, + "learning_rate": 3.842997874413826e-06, + "logits/chosen": -0.49214449524879456, + "logits/rejected": -0.6134816408157349, + "logps/chosen": -61.227455139160156, + "logps/rejected": -127.53356170654297, + "loss": 0.6542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.618387222290039, + "rewards/margins": 7.135515213012695, + "rewards/rejected": -4.517127990722656, + "step": 11478 + }, + { + "epoch": 2.87, + "grad_norm": 10.386394500732422, + "learning_rate": 3.8422332338685145e-06, + "logits/chosen": -0.5410614013671875, + "logits/rejected": -0.6392385363578796, + "logps/chosen": -55.26099395751953, + "logps/rejected": -95.59911346435547, + "loss": 0.7112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0363757610321045, + "rewards/margins": 6.9700164794921875, + "rewards/rejected": -3.933640718460083, + "step": 11479 + }, + { + "epoch": 2.87, + "grad_norm": 8.690930366516113, + "learning_rate": 3.841468621932851e-06, + "logits/chosen": -0.41480183601379395, + "logits/rejected": -0.5372968316078186, + "logps/chosen": -63.57250213623047, + "logps/rejected": -107.31671142578125, + "loss": 0.7554, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.86440372467041, + "rewards/margins": 5.99176025390625, + "rewards/rejected": -3.1273562908172607, + "step": 11480 + }, + { + "epoch": 2.87, + "grad_norm": 4.447145462036133, + "learning_rate": 3.84070403862573e-06, + "logits/chosen": -0.5257160663604736, + "logits/rejected": -0.643068790435791, + "logps/chosen": -49.36311340332031, + "logps/rejected": -104.22187805175781, + "loss": 0.5872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9555282592773438, + "rewards/margins": 6.572620868682861, + "rewards/rejected": -3.617091655731201, + "step": 11481 + }, + { + "epoch": 2.87, + "grad_norm": 8.151973724365234, + "learning_rate": 3.839939483966045e-06, + "logits/chosen": -0.49551308155059814, + "logits/rejected": -0.5704015493392944, + "logps/chosen": -69.92719268798828, + "logps/rejected": -81.86807250976562, + "loss": 0.8803, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.010885000228882, + "rewards/margins": 4.622232437133789, + "rewards/rejected": -1.6113471984863281, + "step": 11482 + }, + { + "epoch": 2.87, + "grad_norm": 5.81229305267334, + "learning_rate": 3.839174957972689e-06, + "logits/chosen": -0.463506817817688, + "logits/rejected": -0.5603882074356079, + "logps/chosen": -70.17118835449219, + "logps/rejected": -97.92745971679688, + "loss": 0.7222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.847778081893921, + "rewards/margins": 5.626402854919434, + "rewards/rejected": -2.778625011444092, + "step": 11483 + }, + { + "epoch": 2.87, + "grad_norm": 1.9377684593200684, + "learning_rate": 3.838410460664556e-06, + "logits/chosen": -0.45691239833831787, + "logits/rejected": -0.6231911182403564, + "logps/chosen": -61.36351013183594, + "logps/rejected": -108.74329376220703, + "loss": 0.5822, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0126430988311768, + "rewards/margins": 8.160210609436035, + "rewards/rejected": -5.147567272186279, + "step": 11484 + }, + { + "epoch": 2.87, + "grad_norm": 4.842472076416016, + "learning_rate": 3.837645992060534e-06, + "logits/chosen": -0.5303416848182678, + "logits/rejected": -0.5925925970077515, + "logps/chosen": -57.728057861328125, + "logps/rejected": -93.72422790527344, + "loss": 0.6784, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.956690788269043, + "rewards/margins": 6.190466403961182, + "rewards/rejected": -3.2337753772735596, + "step": 11485 + }, + { + "epoch": 2.87, + "grad_norm": 6.104277610778809, + "learning_rate": 3.836881552179516e-06, + "logits/chosen": -0.5678420662879944, + "logits/rejected": -0.6694945096969604, + "logps/chosen": -59.424503326416016, + "logps/rejected": -110.0235824584961, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150913715362549, + "rewards/margins": 7.514647960662842, + "rewards/rejected": -4.363734722137451, + "step": 11486 + }, + { + "epoch": 2.87, + "grad_norm": 3.602891206741333, + "learning_rate": 3.836117141040392e-06, + "logits/chosen": -0.5322726964950562, + "logits/rejected": -0.5851316452026367, + "logps/chosen": -52.91268539428711, + "logps/rejected": -121.17330932617188, + "loss": 0.6754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.19413685798645, + "rewards/margins": 6.996002197265625, + "rewards/rejected": -3.8018648624420166, + "step": 11487 + }, + { + "epoch": 2.87, + "grad_norm": 6.632505893707275, + "learning_rate": 3.835352758662054e-06, + "logits/chosen": -0.5200921297073364, + "logits/rejected": -0.5710427761077881, + "logps/chosen": -65.24565887451172, + "logps/rejected": -88.83727264404297, + "loss": 0.7927, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.926955223083496, + "rewards/margins": 4.611099720001221, + "rewards/rejected": -1.684144139289856, + "step": 11488 + }, + { + "epoch": 2.87, + "grad_norm": 3.986599922180176, + "learning_rate": 3.834588405063387e-06, + "logits/chosen": -0.4721129536628723, + "logits/rejected": -0.5513504147529602, + "logps/chosen": -55.42778778076172, + "logps/rejected": -102.15994262695312, + "loss": 0.6474, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9551055431365967, + "rewards/margins": 6.143014430999756, + "rewards/rejected": -3.1879096031188965, + "step": 11489 + }, + { + "epoch": 2.87, + "grad_norm": 2.6528358459472656, + "learning_rate": 3.833824080263278e-06, + "logits/chosen": -0.5618420839309692, + "logits/rejected": -0.6360342502593994, + "logps/chosen": -45.69929885864258, + "logps/rejected": -92.654296875, + "loss": 0.5789, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0922982692718506, + "rewards/margins": 6.849672317504883, + "rewards/rejected": -3.7573742866516113, + "step": 11490 + }, + { + "epoch": 2.87, + "grad_norm": 19.907838821411133, + "learning_rate": 3.833059784280619e-06, + "logits/chosen": -0.43261444568634033, + "logits/rejected": -0.523361325263977, + "logps/chosen": -58.938411712646484, + "logps/rejected": -102.98786926269531, + "loss": 0.7295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0853989124298096, + "rewards/margins": 6.633176803588867, + "rewards/rejected": -3.5477781295776367, + "step": 11491 + }, + { + "epoch": 2.87, + "grad_norm": 2.4232795238494873, + "learning_rate": 3.8322955171342914e-06, + "logits/chosen": -0.5489833950996399, + "logits/rejected": -0.6927019953727722, + "logps/chosen": -65.4372787475586, + "logps/rejected": -83.56063079833984, + "loss": 0.6746, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.199256420135498, + "rewards/margins": 6.807193756103516, + "rewards/rejected": -3.6079368591308594, + "step": 11492 + }, + { + "epoch": 2.88, + "grad_norm": 9.041125297546387, + "learning_rate": 3.831531278843185e-06, + "logits/chosen": -0.4884372055530548, + "logits/rejected": -0.5899190306663513, + "logps/chosen": -56.59938049316406, + "logps/rejected": -83.06914520263672, + "loss": 0.8002, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.128370761871338, + "rewards/margins": 5.401688575744629, + "rewards/rejected": -2.273317575454712, + "step": 11493 + }, + { + "epoch": 2.88, + "grad_norm": 2.665372371673584, + "learning_rate": 3.8307670694261835e-06, + "logits/chosen": -0.5001563429832458, + "logits/rejected": -0.631668210029602, + "logps/chosen": -54.3190803527832, + "logps/rejected": -101.76981353759766, + "loss": 0.5883, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0873334407806396, + "rewards/margins": 7.280534267425537, + "rewards/rejected": -4.193201065063477, + "step": 11494 + }, + { + "epoch": 2.88, + "grad_norm": 6.833026885986328, + "learning_rate": 3.8300028889021715e-06, + "logits/chosen": -0.4860103726387024, + "logits/rejected": -0.5267602205276489, + "logps/chosen": -53.42792510986328, + "logps/rejected": -97.29901885986328, + "loss": 0.6825, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1974892616271973, + "rewards/margins": 4.909966945648193, + "rewards/rejected": -1.7124779224395752, + "step": 11495 + }, + { + "epoch": 2.88, + "grad_norm": 4.20217227935791, + "learning_rate": 3.8292387372900325e-06, + "logits/chosen": -0.5013195276260376, + "logits/rejected": -0.5790071487426758, + "logps/chosen": -64.58796691894531, + "logps/rejected": -96.59126281738281, + "loss": 0.7248, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.097790241241455, + "rewards/margins": 6.247150421142578, + "rewards/rejected": -3.149359941482544, + "step": 11496 + }, + { + "epoch": 2.88, + "grad_norm": 14.237295150756836, + "learning_rate": 3.828474614608649e-06, + "logits/chosen": -0.526477038860321, + "logits/rejected": -0.5772566795349121, + "logps/chosen": -54.37482452392578, + "logps/rejected": -109.730712890625, + "loss": 0.7347, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.054572582244873, + "rewards/margins": 6.204331398010254, + "rewards/rejected": -3.1497578620910645, + "step": 11497 + }, + { + "epoch": 2.88, + "grad_norm": 5.442988395690918, + "learning_rate": 3.827710520876906e-06, + "logits/chosen": -0.538902223110199, + "logits/rejected": -0.6060647368431091, + "logps/chosen": -44.940040588378906, + "logps/rejected": -96.67174530029297, + "loss": 0.6142, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.088845729827881, + "rewards/margins": 5.853156089782715, + "rewards/rejected": -2.764310121536255, + "step": 11498 + }, + { + "epoch": 2.88, + "grad_norm": 10.908120155334473, + "learning_rate": 3.826946456113681e-06, + "logits/chosen": -0.459949254989624, + "logits/rejected": -0.5246837735176086, + "logps/chosen": -51.759918212890625, + "logps/rejected": -94.28396606445312, + "loss": 0.6956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9441139698028564, + "rewards/margins": 6.288118839263916, + "rewards/rejected": -3.3440051078796387, + "step": 11499 + }, + { + "epoch": 2.88, + "grad_norm": 5.431602478027344, + "learning_rate": 3.826182420337856e-06, + "logits/chosen": -0.528037428855896, + "logits/rejected": -0.6505592465400696, + "logps/chosen": -51.56126403808594, + "logps/rejected": -93.3548583984375, + "loss": 0.6506, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9254109859466553, + "rewards/margins": 6.686439514160156, + "rewards/rejected": -3.761028528213501, + "step": 11500 + }, + { + "epoch": 2.88, + "grad_norm": 6.587138652801514, + "learning_rate": 3.825418413568312e-06, + "logits/chosen": -0.564376711845398, + "logits/rejected": -0.5980596542358398, + "logps/chosen": -47.2164306640625, + "logps/rejected": -103.16606140136719, + "loss": 0.7574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0387730598449707, + "rewards/margins": 6.154412269592285, + "rewards/rejected": -3.1156389713287354, + "step": 11501 + }, + { + "epoch": 2.88, + "grad_norm": 2.9686787128448486, + "learning_rate": 3.824654435823931e-06, + "logits/chosen": -0.5409038066864014, + "logits/rejected": -0.5847895741462708, + "logps/chosen": -62.179725646972656, + "logps/rejected": -111.69932556152344, + "loss": 0.6857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.962664842605591, + "rewards/margins": 6.7531232833862305, + "rewards/rejected": -3.7904582023620605, + "step": 11502 + }, + { + "epoch": 2.88, + "grad_norm": 6.93815803527832, + "learning_rate": 3.823890487123587e-06, + "logits/chosen": -0.4838791489601135, + "logits/rejected": -0.5632182359695435, + "logps/chosen": -57.93375015258789, + "logps/rejected": -92.73839569091797, + "loss": 0.7402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8270819187164307, + "rewards/margins": 5.288549423217773, + "rewards/rejected": -2.461467742919922, + "step": 11503 + }, + { + "epoch": 2.88, + "grad_norm": 6.299243450164795, + "learning_rate": 3.82312656748616e-06, + "logits/chosen": -0.428249329328537, + "logits/rejected": -0.49257612228393555, + "logps/chosen": -61.57730484008789, + "logps/rejected": -105.61283874511719, + "loss": 0.748, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.880206346511841, + "rewards/margins": 6.150631904602051, + "rewards/rejected": -3.2704248428344727, + "step": 11504 + }, + { + "epoch": 2.88, + "grad_norm": 7.052130222320557, + "learning_rate": 3.822362676930529e-06, + "logits/chosen": -0.44187191128730774, + "logits/rejected": -0.5147168636322021, + "logps/chosen": -47.248138427734375, + "logps/rejected": -98.362060546875, + "loss": 0.5759, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2319154739379883, + "rewards/margins": 6.9723219871521, + "rewards/rejected": -3.7404065132141113, + "step": 11505 + }, + { + "epoch": 2.88, + "grad_norm": 6.6037163734436035, + "learning_rate": 3.821598815475566e-06, + "logits/chosen": -0.5094354748725891, + "logits/rejected": -0.5762192010879517, + "logps/chosen": -49.01836395263672, + "logps/rejected": -81.55276489257812, + "loss": 0.7242, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0946946144104004, + "rewards/margins": 5.111761093139648, + "rewards/rejected": -2.017066478729248, + "step": 11506 + }, + { + "epoch": 2.88, + "grad_norm": 4.820046424865723, + "learning_rate": 3.820834983140151e-06, + "logits/chosen": -0.5200104117393494, + "logits/rejected": -0.5903933644294739, + "logps/chosen": -66.58888244628906, + "logps/rejected": -94.47784423828125, + "loss": 0.6964, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2886173725128174, + "rewards/margins": 6.393985271453857, + "rewards/rejected": -3.10536789894104, + "step": 11507 + }, + { + "epoch": 2.88, + "grad_norm": 3.960702896118164, + "learning_rate": 3.8200711799431574e-06, + "logits/chosen": -0.4885428249835968, + "logits/rejected": -0.5761207342147827, + "logps/chosen": -51.362613677978516, + "logps/rejected": -99.10579681396484, + "loss": 0.6429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0734403133392334, + "rewards/margins": 6.6887030601501465, + "rewards/rejected": -3.615262031555176, + "step": 11508 + }, + { + "epoch": 2.88, + "grad_norm": 17.469289779663086, + "learning_rate": 3.819307405903462e-06, + "logits/chosen": -0.49387627840042114, + "logits/rejected": -0.5965039730072021, + "logps/chosen": -59.76800537109375, + "logps/rejected": -78.13714599609375, + "loss": 0.8133, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8649139404296875, + "rewards/margins": 5.503377914428711, + "rewards/rejected": -2.6384639739990234, + "step": 11509 + }, + { + "epoch": 2.88, + "grad_norm": 8.313770294189453, + "learning_rate": 3.818543661039935e-06, + "logits/chosen": -0.5512955188751221, + "logits/rejected": -0.6365311145782471, + "logps/chosen": -47.042869567871094, + "logps/rejected": -86.90707397460938, + "loss": 0.7132, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1382839679718018, + "rewards/margins": 5.263874053955078, + "rewards/rejected": -2.1255900859832764, + "step": 11510 + }, + { + "epoch": 2.88, + "grad_norm": 11.304158210754395, + "learning_rate": 3.817779945371451e-06, + "logits/chosen": -0.43849530816078186, + "logits/rejected": -0.5207003355026245, + "logps/chosen": -59.075340270996094, + "logps/rejected": -96.6902084350586, + "loss": 0.7535, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1098709106445312, + "rewards/margins": 5.5796709060668945, + "rewards/rejected": -2.469799280166626, + "step": 11511 + }, + { + "epoch": 2.88, + "grad_norm": 9.749222755432129, + "learning_rate": 3.817016258916882e-06, + "logits/chosen": -0.4755234122276306, + "logits/rejected": -0.6118932366371155, + "logps/chosen": -78.22692108154297, + "logps/rejected": -96.48584747314453, + "loss": 0.796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8295342922210693, + "rewards/margins": 6.00750732421875, + "rewards/rejected": -3.1779730319976807, + "step": 11512 + }, + { + "epoch": 2.88, + "grad_norm": 12.536246299743652, + "learning_rate": 3.816252601695099e-06, + "logits/chosen": -0.5411178469657898, + "logits/rejected": -0.6343311071395874, + "logps/chosen": -57.22868728637695, + "logps/rejected": -101.27374267578125, + "loss": 0.6655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.78869366645813, + "rewards/margins": 6.193134784698486, + "rewards/rejected": -3.4044408798217773, + "step": 11513 + }, + { + "epoch": 2.88, + "grad_norm": 9.001025199890137, + "learning_rate": 3.815488973724974e-06, + "logits/chosen": -0.5386425256729126, + "logits/rejected": -0.6293300986289978, + "logps/chosen": -62.67666244506836, + "logps/rejected": -90.92048645019531, + "loss": 0.7849, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9915173053741455, + "rewards/margins": 4.975915431976318, + "rewards/rejected": -1.9843981266021729, + "step": 11514 + }, + { + "epoch": 2.88, + "grad_norm": 25.372941970825195, + "learning_rate": 3.814725375025376e-06, + "logits/chosen": -0.5727414488792419, + "logits/rejected": -0.6424903273582458, + "logps/chosen": -46.26726150512695, + "logps/rejected": -98.57184600830078, + "loss": 0.6824, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.819674491882324, + "rewards/margins": 6.404568195343018, + "rewards/rejected": -3.5848937034606934, + "step": 11515 + }, + { + "epoch": 2.88, + "grad_norm": 4.743191719055176, + "learning_rate": 3.813961805615175e-06, + "logits/chosen": -0.4568968117237091, + "logits/rejected": -0.519316554069519, + "logps/chosen": -69.30435180664062, + "logps/rejected": -95.72342681884766, + "loss": 0.782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.749894142150879, + "rewards/margins": 5.177372455596924, + "rewards/rejected": -2.4274778366088867, + "step": 11516 + }, + { + "epoch": 2.88, + "grad_norm": 4.691710472106934, + "learning_rate": 3.813198265513239e-06, + "logits/chosen": -0.513462245464325, + "logits/rejected": -0.6183063387870789, + "logps/chosen": -61.411407470703125, + "logps/rejected": -105.39910125732422, + "loss": 0.6786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.773339033126831, + "rewards/margins": 6.544047832489014, + "rewards/rejected": -3.7707087993621826, + "step": 11517 + }, + { + "epoch": 2.88, + "grad_norm": 4.397342681884766, + "learning_rate": 3.812434754738434e-06, + "logits/chosen": -0.37854304909706116, + "logits/rejected": -0.5288994908332825, + "logps/chosen": -76.12516784667969, + "logps/rejected": -94.7745590209961, + "loss": 0.6697, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.805924892425537, + "rewards/margins": 5.845908164978027, + "rewards/rejected": -3.039983034133911, + "step": 11518 + }, + { + "epoch": 2.88, + "grad_norm": 2.4353082180023193, + "learning_rate": 3.8116712733096318e-06, + "logits/chosen": -0.588986337184906, + "logits/rejected": -0.628598690032959, + "logps/chosen": -46.58046340942383, + "logps/rejected": -137.2987823486328, + "loss": 0.5701, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.883880138397217, + "rewards/margins": 8.429834365844727, + "rewards/rejected": -5.545953750610352, + "step": 11519 + }, + { + "epoch": 2.88, + "grad_norm": 9.811034202575684, + "learning_rate": 3.810907821245698e-06, + "logits/chosen": -0.48091694712638855, + "logits/rejected": -0.6183592081069946, + "logps/chosen": -65.33260345458984, + "logps/rejected": -91.53097534179688, + "loss": 0.677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0245940685272217, + "rewards/margins": 7.011425018310547, + "rewards/rejected": -3.9868311882019043, + "step": 11520 + }, + { + "epoch": 2.88, + "grad_norm": 4.955931186676025, + "learning_rate": 3.810144398565494e-06, + "logits/chosen": -0.488765150308609, + "logits/rejected": -0.5708125829696655, + "logps/chosen": -56.59476089477539, + "logps/rejected": -101.09036254882812, + "loss": 0.634, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.366710901260376, + "rewards/margins": 7.082828521728516, + "rewards/rejected": -3.7161173820495605, + "step": 11521 + }, + { + "epoch": 2.88, + "grad_norm": 7.5146074295043945, + "learning_rate": 3.8093810052878878e-06, + "logits/chosen": -0.47475293278694153, + "logits/rejected": -0.54740971326828, + "logps/chosen": -60.14163589477539, + "logps/rejected": -103.6214370727539, + "loss": 0.7513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9317610263824463, + "rewards/margins": 6.103312969207764, + "rewards/rejected": -3.1715521812438965, + "step": 11522 + }, + { + "epoch": 2.88, + "grad_norm": 4.742349624633789, + "learning_rate": 3.8086176414317447e-06, + "logits/chosen": -0.46253418922424316, + "logits/rejected": -0.5219016671180725, + "logps/chosen": -52.562217712402344, + "logps/rejected": -97.65773010253906, + "loss": 0.6185, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.156404495239258, + "rewards/margins": 5.998353004455566, + "rewards/rejected": -2.8419487476348877, + "step": 11523 + }, + { + "epoch": 2.88, + "grad_norm": 5.270615577697754, + "learning_rate": 3.807854307015926e-06, + "logits/chosen": -0.5510582327842712, + "logits/rejected": -0.6228624582290649, + "logps/chosen": -49.621673583984375, + "logps/rejected": -89.19523620605469, + "loss": 0.5763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9354138374328613, + "rewards/margins": 6.0153889656066895, + "rewards/rejected": -3.0799753665924072, + "step": 11524 + }, + { + "epoch": 2.88, + "grad_norm": 6.464890480041504, + "learning_rate": 3.8070910020592954e-06, + "logits/chosen": -0.49155256152153015, + "logits/rejected": -0.5253428816795349, + "logps/chosen": -55.756263732910156, + "logps/rejected": -120.30691528320312, + "loss": 0.5811, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.80203914642334, + "rewards/margins": 7.257547855377197, + "rewards/rejected": -4.455509185791016, + "step": 11525 + }, + { + "epoch": 2.88, + "grad_norm": 10.356363296508789, + "learning_rate": 3.8063277265807146e-06, + "logits/chosen": -0.5083725452423096, + "logits/rejected": -0.6148438453674316, + "logps/chosen": -53.61012268066406, + "logps/rejected": -91.07064056396484, + "loss": 0.6145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0392723083496094, + "rewards/margins": 6.43381404876709, + "rewards/rejected": -3.3945415019989014, + "step": 11526 + }, + { + "epoch": 2.88, + "grad_norm": 4.987064838409424, + "learning_rate": 3.8055644805990466e-06, + "logits/chosen": -0.48266085982322693, + "logits/rejected": -0.598983645439148, + "logps/chosen": -59.560935974121094, + "logps/rejected": -95.48343658447266, + "loss": 0.6148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6780292987823486, + "rewards/margins": 6.261835098266602, + "rewards/rejected": -3.5838065147399902, + "step": 11527 + }, + { + "epoch": 2.88, + "grad_norm": 8.45083999633789, + "learning_rate": 3.80480126413315e-06, + "logits/chosen": -0.5510088801383972, + "logits/rejected": -0.6098104119300842, + "logps/chosen": -65.69109344482422, + "logps/rejected": -93.70933532714844, + "loss": 0.7148, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5352535247802734, + "rewards/margins": 5.260181903839111, + "rewards/rejected": -2.724928855895996, + "step": 11528 + }, + { + "epoch": 2.88, + "grad_norm": 4.566201210021973, + "learning_rate": 3.8040380772018847e-06, + "logits/chosen": -0.4364320933818817, + "logits/rejected": -0.518277108669281, + "logps/chosen": -61.62104797363281, + "logps/rejected": -102.10810852050781, + "loss": 0.6979, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2732412815093994, + "rewards/margins": 7.075181007385254, + "rewards/rejected": -3.8019397258758545, + "step": 11529 + }, + { + "epoch": 2.88, + "grad_norm": 5.435880184173584, + "learning_rate": 3.8032749198241125e-06, + "logits/chosen": -0.5481350421905518, + "logits/rejected": -0.6551912426948547, + "logps/chosen": -45.300601959228516, + "logps/rejected": -103.78233337402344, + "loss": 0.6322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5980477333068848, + "rewards/margins": 7.468995094299316, + "rewards/rejected": -4.870947360992432, + "step": 11530 + }, + { + "epoch": 2.88, + "grad_norm": 6.258362293243408, + "learning_rate": 3.802511792018688e-06, + "logits/chosen": -0.502336859703064, + "logits/rejected": -0.5786517858505249, + "logps/chosen": -44.01374816894531, + "logps/rejected": -104.88138580322266, + "loss": 0.6213, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.883073568344116, + "rewards/margins": 6.97493839263916, + "rewards/rejected": -4.091865062713623, + "step": 11531 + }, + { + "epoch": 2.88, + "grad_norm": 5.5578999519348145, + "learning_rate": 3.801748693804472e-06, + "logits/chosen": -0.5381835103034973, + "logits/rejected": -0.6191980242729187, + "logps/chosen": -55.84813690185547, + "logps/rejected": -101.13002014160156, + "loss": 0.7056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9682087898254395, + "rewards/margins": 6.3096723556518555, + "rewards/rejected": -3.341463804244995, + "step": 11532 + }, + { + "epoch": 2.89, + "grad_norm": 6.360084533691406, + "learning_rate": 3.8009856252003186e-06, + "logits/chosen": -0.49112647771835327, + "logits/rejected": -0.542201578617096, + "logps/chosen": -58.529296875, + "logps/rejected": -106.40296936035156, + "loss": 0.678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.839425563812256, + "rewards/margins": 4.831630706787109, + "rewards/rejected": -1.9922051429748535, + "step": 11533 + }, + { + "epoch": 2.89, + "grad_norm": 4.969311714172363, + "learning_rate": 3.8002225862250885e-06, + "logits/chosen": -0.4784773290157318, + "logits/rejected": -0.6094537377357483, + "logps/chosen": -55.05291748046875, + "logps/rejected": -90.90316009521484, + "loss": 0.516, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.177980661392212, + "rewards/margins": 6.958417892456055, + "rewards/rejected": -3.7804367542266846, + "step": 11534 + }, + { + "epoch": 2.89, + "grad_norm": 7.075658321380615, + "learning_rate": 3.799459576897632e-06, + "logits/chosen": -0.5324934720993042, + "logits/rejected": -0.6023534536361694, + "logps/chosen": -52.59162521362305, + "logps/rejected": -106.76136779785156, + "loss": 0.6417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8858675956726074, + "rewards/margins": 5.977133750915527, + "rewards/rejected": -3.091266393661499, + "step": 11535 + }, + { + "epoch": 2.89, + "grad_norm": 2.832646131515503, + "learning_rate": 3.7986965972368067e-06, + "logits/chosen": -0.46808385848999023, + "logits/rejected": -0.6190592050552368, + "logps/chosen": -68.12887573242188, + "logps/rejected": -105.06513214111328, + "loss": 0.5992, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1850414276123047, + "rewards/margins": 7.579207420349121, + "rewards/rejected": -4.3941650390625, + "step": 11536 + }, + { + "epoch": 2.89, + "grad_norm": 3.849486827850342, + "learning_rate": 3.7979336472614688e-06, + "logits/chosen": -0.44187548756599426, + "logits/rejected": -0.5277664065361023, + "logps/chosen": -58.11335754394531, + "logps/rejected": -102.40166473388672, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.163309097290039, + "rewards/margins": 6.806557655334473, + "rewards/rejected": -3.6432485580444336, + "step": 11537 + }, + { + "epoch": 2.89, + "grad_norm": 8.799805641174316, + "learning_rate": 3.7971707269904654e-06, + "logits/chosen": -0.6071103811264038, + "logits/rejected": -0.6271345019340515, + "logps/chosen": -46.62248229980469, + "logps/rejected": -109.91570281982422, + "loss": 0.8282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0072574615478516, + "rewards/margins": 5.738379955291748, + "rewards/rejected": -2.7311220169067383, + "step": 11538 + }, + { + "epoch": 2.89, + "grad_norm": 5.531414985656738, + "learning_rate": 3.7964078364426537e-06, + "logits/chosen": -0.3948938250541687, + "logits/rejected": -0.4847443699836731, + "logps/chosen": -57.247047424316406, + "logps/rejected": -106.32249450683594, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176187038421631, + "rewards/margins": 7.747593879699707, + "rewards/rejected": -4.571407318115234, + "step": 11539 + }, + { + "epoch": 2.89, + "grad_norm": 5.280520915985107, + "learning_rate": 3.7956449756368853e-06, + "logits/chosen": -0.5424314737319946, + "logits/rejected": -0.6267787218093872, + "logps/chosen": -51.18678665161133, + "logps/rejected": -99.74018859863281, + "loss": 0.612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8457047939300537, + "rewards/margins": 6.210047721862793, + "rewards/rejected": -3.3643429279327393, + "step": 11540 + }, + { + "epoch": 2.89, + "grad_norm": 4.872520923614502, + "learning_rate": 3.7948821445920115e-06, + "logits/chosen": -0.516158938407898, + "logits/rejected": -0.6332985758781433, + "logps/chosen": -47.335548400878906, + "logps/rejected": -86.38212585449219, + "loss": 0.5728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225475311279297, + "rewards/margins": 7.0882062911987305, + "rewards/rejected": -3.8627307415008545, + "step": 11541 + }, + { + "epoch": 2.89, + "grad_norm": 4.874145030975342, + "learning_rate": 3.7941193433268804e-06, + "logits/chosen": -0.5576531887054443, + "logits/rejected": -0.6385490894317627, + "logps/chosen": -52.3541145324707, + "logps/rejected": -96.19769287109375, + "loss": 0.6308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1190855503082275, + "rewards/margins": 6.363504886627197, + "rewards/rejected": -3.2444193363189697, + "step": 11542 + }, + { + "epoch": 2.89, + "grad_norm": 5.035838603973389, + "learning_rate": 3.793356571860343e-06, + "logits/chosen": -0.5539849400520325, + "logits/rejected": -0.6115246415138245, + "logps/chosen": -57.38736343383789, + "logps/rejected": -115.87906646728516, + "loss": 0.689, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7693209648132324, + "rewards/margins": 6.674464702606201, + "rewards/rejected": -3.9051437377929688, + "step": 11543 + }, + { + "epoch": 2.89, + "grad_norm": 2.3486416339874268, + "learning_rate": 3.792593830211248e-06, + "logits/chosen": -0.4920773208141327, + "logits/rejected": -0.5017861723899841, + "logps/chosen": -49.78729248046875, + "logps/rejected": -107.2281723022461, + "loss": 0.608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9406776428222656, + "rewards/margins": 6.286685466766357, + "rewards/rejected": -3.346008539199829, + "step": 11544 + }, + { + "epoch": 2.89, + "grad_norm": 6.033702850341797, + "learning_rate": 3.7918311183984446e-06, + "logits/chosen": -0.5258274078369141, + "logits/rejected": -0.6142019033432007, + "logps/chosen": -45.88618087768555, + "logps/rejected": -85.6804428100586, + "loss": 0.6126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1334683895111084, + "rewards/margins": 6.110379219055176, + "rewards/rejected": -2.9769110679626465, + "step": 11545 + }, + { + "epoch": 2.89, + "grad_norm": 4.491169452667236, + "learning_rate": 3.791068436440779e-06, + "logits/chosen": -0.5874879956245422, + "logits/rejected": -0.6247855424880981, + "logps/chosen": -46.91952133178711, + "logps/rejected": -114.27031707763672, + "loss": 0.6988, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0638539791107178, + "rewards/margins": 7.333214282989502, + "rewards/rejected": -4.269360065460205, + "step": 11546 + }, + { + "epoch": 2.89, + "grad_norm": 5.334766864776611, + "learning_rate": 3.7903057843570966e-06, + "logits/chosen": -0.4625217020511627, + "logits/rejected": -0.594791829586029, + "logps/chosen": -68.20185089111328, + "logps/rejected": -107.25235748291016, + "loss": 0.7291, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.170621156692505, + "rewards/margins": 7.388562202453613, + "rewards/rejected": -4.2179412841796875, + "step": 11547 + }, + { + "epoch": 2.89, + "grad_norm": 4.441423416137695, + "learning_rate": 3.789543162166248e-06, + "logits/chosen": -0.5070512890815735, + "logits/rejected": -0.5771399140357971, + "logps/chosen": -64.15792083740234, + "logps/rejected": -105.56148529052734, + "loss": 0.64, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7563488483428955, + "rewards/margins": 5.542567729949951, + "rewards/rejected": -2.7862188816070557, + "step": 11548 + }, + { + "epoch": 2.89, + "grad_norm": 5.49372673034668, + "learning_rate": 3.788780569887074e-06, + "logits/chosen": -0.509185791015625, + "logits/rejected": -0.5931869149208069, + "logps/chosen": -51.57158660888672, + "logps/rejected": -106.52704620361328, + "loss": 0.6887, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2356908321380615, + "rewards/margins": 7.090421676635742, + "rewards/rejected": -3.854731321334839, + "step": 11549 + }, + { + "epoch": 2.89, + "grad_norm": 4.440592288970947, + "learning_rate": 3.788018007538419e-06, + "logits/chosen": -0.4586780369281769, + "logits/rejected": -0.6033412218093872, + "logps/chosen": -61.213897705078125, + "logps/rejected": -97.0390396118164, + "loss": 0.6486, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8273732662200928, + "rewards/margins": 6.55328369140625, + "rewards/rejected": -3.725910186767578, + "step": 11550 + }, + { + "epoch": 2.89, + "grad_norm": 6.476389408111572, + "learning_rate": 3.7872554751391286e-06, + "logits/chosen": -0.4913666248321533, + "logits/rejected": -0.4916536808013916, + "logps/chosen": -55.60430145263672, + "logps/rejected": -110.29716491699219, + "loss": 0.6772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.954179525375366, + "rewards/margins": 5.819375514984131, + "rewards/rejected": -2.8651962280273438, + "step": 11551 + }, + { + "epoch": 2.89, + "grad_norm": 7.602178573608398, + "learning_rate": 3.7864929727080467e-06, + "logits/chosen": -0.4060911238193512, + "logits/rejected": -0.5221438407897949, + "logps/chosen": -62.150657653808594, + "logps/rejected": -85.83753967285156, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.611298084259033, + "rewards/margins": 5.97158670425415, + "rewards/rejected": -3.3602890968322754, + "step": 11552 + }, + { + "epoch": 2.89, + "grad_norm": 6.220170021057129, + "learning_rate": 3.785730500264014e-06, + "logits/chosen": -0.5352992415428162, + "logits/rejected": -0.5896703600883484, + "logps/chosen": -47.39326477050781, + "logps/rejected": -103.81526947021484, + "loss": 0.5993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1584372520446777, + "rewards/margins": 6.218235015869141, + "rewards/rejected": -3.059797763824463, + "step": 11553 + }, + { + "epoch": 2.89, + "grad_norm": 8.286932945251465, + "learning_rate": 3.7849680578258713e-06, + "logits/chosen": -0.4524366855621338, + "logits/rejected": -0.5456070899963379, + "logps/chosen": -63.057533264160156, + "logps/rejected": -102.06734466552734, + "loss": 0.6602, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8286054134368896, + "rewards/margins": 6.167129993438721, + "rewards/rejected": -3.338524341583252, + "step": 11554 + }, + { + "epoch": 2.89, + "grad_norm": 6.407358169555664, + "learning_rate": 3.7842056454124616e-06, + "logits/chosen": -0.5738865733146667, + "logits/rejected": -0.6619484424591064, + "logps/chosen": -68.11058807373047, + "logps/rejected": -113.69173431396484, + "loss": 0.7614, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.259394884109497, + "rewards/margins": 6.050422191619873, + "rewards/rejected": -2.791027307510376, + "step": 11555 + }, + { + "epoch": 2.89, + "grad_norm": 5.054941654205322, + "learning_rate": 3.7834432630426223e-06, + "logits/chosen": -0.572284996509552, + "logits/rejected": -0.6747490763664246, + "logps/chosen": -49.4069938659668, + "logps/rejected": -87.2392578125, + "loss": 0.6307, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1425082683563232, + "rewards/margins": 6.428107261657715, + "rewards/rejected": -3.2855982780456543, + "step": 11556 + }, + { + "epoch": 2.89, + "grad_norm": 4.415579795837402, + "learning_rate": 3.7826809107351943e-06, + "logits/chosen": -0.53879314661026, + "logits/rejected": -0.599360466003418, + "logps/chosen": -53.52082443237305, + "logps/rejected": -94.9823226928711, + "loss": 0.6515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0676193237304688, + "rewards/margins": 5.6556315422058105, + "rewards/rejected": -2.588012218475342, + "step": 11557 + }, + { + "epoch": 2.89, + "grad_norm": 7.964451789855957, + "learning_rate": 3.7819185885090154e-06, + "logits/chosen": -0.5336946249008179, + "logits/rejected": -0.5889701843261719, + "logps/chosen": -47.526329040527344, + "logps/rejected": -89.16050720214844, + "loss": 0.7067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7023966312408447, + "rewards/margins": 4.9741692543029785, + "rewards/rejected": -2.2717723846435547, + "step": 11558 + }, + { + "epoch": 2.89, + "grad_norm": 9.20401668548584, + "learning_rate": 3.781156296382925e-06, + "logits/chosen": -0.6117416620254517, + "logits/rejected": -0.6384386420249939, + "logps/chosen": -42.10254669189453, + "logps/rejected": -91.7443618774414, + "loss": 0.703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8563292026519775, + "rewards/margins": 5.791069984436035, + "rewards/rejected": -2.9347410202026367, + "step": 11559 + }, + { + "epoch": 2.89, + "grad_norm": 3.5238940715789795, + "learning_rate": 3.780394034375758e-06, + "logits/chosen": -0.575027346611023, + "logits/rejected": -0.6163427233695984, + "logps/chosen": -53.1658935546875, + "logps/rejected": -105.44654083251953, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9704434871673584, + "rewards/margins": 6.399172782897949, + "rewards/rejected": -3.4287290573120117, + "step": 11560 + }, + { + "epoch": 2.89, + "grad_norm": 4.4615631103515625, + "learning_rate": 3.7796318025063505e-06, + "logits/chosen": -0.5421044826507568, + "logits/rejected": -0.6284582018852234, + "logps/chosen": -56.280860900878906, + "logps/rejected": -78.44285583496094, + "loss": 0.6604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.233798027038574, + "rewards/margins": 5.358887672424316, + "rewards/rejected": -2.1250898838043213, + "step": 11561 + }, + { + "epoch": 2.89, + "grad_norm": 8.926506996154785, + "learning_rate": 3.778869600793542e-06, + "logits/chosen": -0.425821453332901, + "logits/rejected": -0.48994389176368713, + "logps/chosen": -57.79621124267578, + "logps/rejected": -101.81378936767578, + "loss": 0.8421, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.060626983642578, + "rewards/margins": 4.996374130249023, + "rewards/rejected": -1.9357469081878662, + "step": 11562 + }, + { + "epoch": 2.89, + "grad_norm": 8.582362174987793, + "learning_rate": 3.778107429256163e-06, + "logits/chosen": -0.4803035855293274, + "logits/rejected": -0.5861499309539795, + "logps/chosen": -54.75475311279297, + "logps/rejected": -102.97518920898438, + "loss": 0.6829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8320975303649902, + "rewards/margins": 6.328050136566162, + "rewards/rejected": -3.495952844619751, + "step": 11563 + }, + { + "epoch": 2.89, + "grad_norm": 2.1847000122070312, + "learning_rate": 3.777345287913048e-06, + "logits/chosen": -0.493613064289093, + "logits/rejected": -0.600749135017395, + "logps/chosen": -49.989463806152344, + "logps/rejected": -90.7365493774414, + "loss": 0.5747, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.228518009185791, + "rewards/margins": 7.147851943969727, + "rewards/rejected": -3.9193332195281982, + "step": 11564 + }, + { + "epoch": 2.89, + "grad_norm": 3.851454496383667, + "learning_rate": 3.7765831767830323e-06, + "logits/chosen": -0.5858592987060547, + "logits/rejected": -0.6769083738327026, + "logps/chosen": -53.992919921875, + "logps/rejected": -97.25447082519531, + "loss": 0.621, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.147247552871704, + "rewards/margins": 6.980856418609619, + "rewards/rejected": -3.833609104156494, + "step": 11565 + }, + { + "epoch": 2.89, + "grad_norm": 14.023763656616211, + "learning_rate": 3.7758210958849486e-06, + "logits/chosen": -0.5032285451889038, + "logits/rejected": -0.5709937810897827, + "logps/chosen": -53.14555358886719, + "logps/rejected": -102.97161865234375, + "loss": 0.7132, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.716149091720581, + "rewards/margins": 6.541284561157227, + "rewards/rejected": -3.8251352310180664, + "step": 11566 + }, + { + "epoch": 2.89, + "grad_norm": 3.661966323852539, + "learning_rate": 3.7750590452376267e-06, + "logits/chosen": -0.5076620578765869, + "logits/rejected": -0.6327601671218872, + "logps/chosen": -51.27386474609375, + "logps/rejected": -91.20114135742188, + "loss": 0.5595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.047412633895874, + "rewards/margins": 6.739875793457031, + "rewards/rejected": -3.6924633979797363, + "step": 11567 + }, + { + "epoch": 2.89, + "grad_norm": 9.339089393615723, + "learning_rate": 3.7742970248598988e-06, + "logits/chosen": -0.5181646943092346, + "logits/rejected": -0.6156370639801025, + "logps/chosen": -53.106204986572266, + "logps/rejected": -84.54295349121094, + "loss": 0.7212, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.940929651260376, + "rewards/margins": 5.93798828125, + "rewards/rejected": -2.9970591068267822, + "step": 11568 + }, + { + "epoch": 2.89, + "grad_norm": 4.332186222076416, + "learning_rate": 3.7735350347705948e-06, + "logits/chosen": -0.5257283449172974, + "logits/rejected": -0.545479416847229, + "logps/chosen": -55.39445495605469, + "logps/rejected": -103.68170166015625, + "loss": 0.6676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0893020629882812, + "rewards/margins": 6.519952774047852, + "rewards/rejected": -3.4306507110595703, + "step": 11569 + }, + { + "epoch": 2.89, + "grad_norm": 9.515874862670898, + "learning_rate": 3.772773074988546e-06, + "logits/chosen": -0.4370090365409851, + "logits/rejected": -0.5092769265174866, + "logps/chosen": -60.93280029296875, + "logps/rejected": -90.20652770996094, + "loss": 0.6568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.971982955932617, + "rewards/margins": 5.736037731170654, + "rewards/rejected": -2.764054536819458, + "step": 11570 + }, + { + "epoch": 2.89, + "grad_norm": 3.486793041229248, + "learning_rate": 3.7720111455325785e-06, + "logits/chosen": -0.4684409201145172, + "logits/rejected": -0.5312707424163818, + "logps/chosen": -49.926734924316406, + "logps/rejected": -109.05280303955078, + "loss": 0.5885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.039280891418457, + "rewards/margins": 7.171270847320557, + "rewards/rejected": -4.1319899559021, + "step": 11571 + }, + { + "epoch": 2.89, + "grad_norm": 5.759896755218506, + "learning_rate": 3.7712492464215223e-06, + "logits/chosen": -0.5681651830673218, + "logits/rejected": -0.6610862612724304, + "logps/chosen": -56.01929473876953, + "logps/rejected": -91.9889907836914, + "loss": 0.6586, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8722023963928223, + "rewards/margins": 5.954504489898682, + "rewards/rejected": -3.082301616668701, + "step": 11572 + }, + { + "epoch": 2.9, + "grad_norm": 11.48517894744873, + "learning_rate": 3.7704873776742048e-06, + "logits/chosen": -0.4444301724433899, + "logits/rejected": -0.46829020977020264, + "logps/chosen": -64.09200286865234, + "logps/rejected": -99.41654968261719, + "loss": 0.7851, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.996124505996704, + "rewards/margins": 5.281525611877441, + "rewards/rejected": -2.285400390625, + "step": 11573 + }, + { + "epoch": 2.9, + "grad_norm": 6.5290703773498535, + "learning_rate": 3.7697255393094504e-06, + "logits/chosen": -0.4577590823173523, + "logits/rejected": -0.5562580227851868, + "logps/chosen": -53.36294174194336, + "logps/rejected": -109.33676147460938, + "loss": 0.593, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4418158531188965, + "rewards/margins": 6.3978776931762695, + "rewards/rejected": -2.956061363220215, + "step": 11574 + }, + { + "epoch": 2.9, + "grad_norm": 8.877068519592285, + "learning_rate": 3.7689637313460876e-06, + "logits/chosen": -0.5150924921035767, + "logits/rejected": -0.6232721209526062, + "logps/chosen": -58.82743835449219, + "logps/rejected": -84.09960174560547, + "loss": 0.7638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9575092792510986, + "rewards/margins": 4.746760368347168, + "rewards/rejected": -1.7892513275146484, + "step": 11575 + }, + { + "epoch": 2.9, + "grad_norm": 3.2290303707122803, + "learning_rate": 3.7682019538029385e-06, + "logits/chosen": -0.4698628783226013, + "logits/rejected": -0.5296436548233032, + "logps/chosen": -55.2750129699707, + "logps/rejected": -107.62669372558594, + "loss": 0.6895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.400322437286377, + "rewards/margins": 5.769507884979248, + "rewards/rejected": -2.369184970855713, + "step": 11576 + }, + { + "epoch": 2.9, + "grad_norm": 6.603373050689697, + "learning_rate": 3.767440206698833e-06, + "logits/chosen": -0.48274701833724976, + "logits/rejected": -0.5766782164573669, + "logps/chosen": -48.26699447631836, + "logps/rejected": -99.88601684570312, + "loss": 0.6525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.008026599884033, + "rewards/margins": 6.297987937927246, + "rewards/rejected": -3.289961814880371, + "step": 11577 + }, + { + "epoch": 2.9, + "grad_norm": 3.389148473739624, + "learning_rate": 3.7666784900525876e-06, + "logits/chosen": -0.5137302875518799, + "logits/rejected": -0.6399864554405212, + "logps/chosen": -56.56488800048828, + "logps/rejected": -90.85343170166016, + "loss": 0.6666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1274003982543945, + "rewards/margins": 6.726840972900391, + "rewards/rejected": -3.5994410514831543, + "step": 11578 + }, + { + "epoch": 2.9, + "grad_norm": 5.141503810882568, + "learning_rate": 3.7659168038830296e-06, + "logits/chosen": -0.46459633111953735, + "logits/rejected": -0.5334751009941101, + "logps/chosen": -58.76116943359375, + "logps/rejected": -114.09660339355469, + "loss": 0.6799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6758034229278564, + "rewards/margins": 6.501704216003418, + "rewards/rejected": -3.8259007930755615, + "step": 11579 + }, + { + "epoch": 2.9, + "grad_norm": 4.685301303863525, + "learning_rate": 3.765155148208982e-06, + "logits/chosen": -0.5570852756500244, + "logits/rejected": -0.6510698795318604, + "logps/chosen": -55.980567932128906, + "logps/rejected": -95.94025421142578, + "loss": 0.6594, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2285852432250977, + "rewards/margins": 7.538001537322998, + "rewards/rejected": -4.309416770935059, + "step": 11580 + }, + { + "epoch": 2.9, + "grad_norm": 5.553313255310059, + "learning_rate": 3.764393523049261e-06, + "logits/chosen": -0.4800184965133667, + "logits/rejected": -0.571821928024292, + "logps/chosen": -55.69365310668945, + "logps/rejected": -109.0545654296875, + "loss": 0.7255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.356614351272583, + "rewards/margins": 7.181005001068115, + "rewards/rejected": -3.824390411376953, + "step": 11581 + }, + { + "epoch": 2.9, + "grad_norm": 6.047578811645508, + "learning_rate": 3.763631928422692e-06, + "logits/chosen": -0.4645875096321106, + "logits/rejected": -0.5530929565429688, + "logps/chosen": -59.002777099609375, + "logps/rejected": -89.88814544677734, + "loss": 0.7201, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.190164566040039, + "rewards/margins": 6.229307174682617, + "rewards/rejected": -3.039142370223999, + "step": 11582 + }, + { + "epoch": 2.9, + "grad_norm": 3.500333070755005, + "learning_rate": 3.7628703643480925e-06, + "logits/chosen": -0.5474214553833008, + "logits/rejected": -0.6197088956832886, + "logps/chosen": -52.357666015625, + "logps/rejected": -102.977783203125, + "loss": 0.5763, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.16644024848938, + "rewards/margins": 6.868766784667969, + "rewards/rejected": -3.702326536178589, + "step": 11583 + }, + { + "epoch": 2.9, + "grad_norm": 7.097850322723389, + "learning_rate": 3.762108830844283e-06, + "logits/chosen": -0.49145179986953735, + "logits/rejected": -0.5908981561660767, + "logps/chosen": -52.577415466308594, + "logps/rejected": -111.72990417480469, + "loss": 0.7935, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.752359390258789, + "rewards/margins": 6.4274797439575195, + "rewards/rejected": -3.6751205921173096, + "step": 11584 + }, + { + "epoch": 2.9, + "grad_norm": 4.107580661773682, + "learning_rate": 3.7613473279300804e-06, + "logits/chosen": -0.615720272064209, + "logits/rejected": -0.6940099000930786, + "logps/chosen": -49.812313079833984, + "logps/rejected": -104.29547119140625, + "loss": 0.7227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1443023681640625, + "rewards/margins": 6.2627854347229, + "rewards/rejected": -3.118483304977417, + "step": 11585 + }, + { + "epoch": 2.9, + "grad_norm": 3.6499290466308594, + "learning_rate": 3.7605858556243023e-06, + "logits/chosen": -0.5073152184486389, + "logits/rejected": -0.5962600111961365, + "logps/chosen": -56.87175750732422, + "logps/rejected": -92.2696533203125, + "loss": 0.6859, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0529043674468994, + "rewards/margins": 6.956618309020996, + "rewards/rejected": -3.9037139415740967, + "step": 11586 + }, + { + "epoch": 2.9, + "grad_norm": 3.2185473442077637, + "learning_rate": 3.7598244139457673e-06, + "logits/chosen": -0.5552709102630615, + "logits/rejected": -0.6160604953765869, + "logps/chosen": -49.034122467041016, + "logps/rejected": -100.02356719970703, + "loss": 0.6004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8302090167999268, + "rewards/margins": 6.777917861938477, + "rewards/rejected": -3.9477086067199707, + "step": 11587 + }, + { + "epoch": 2.9, + "grad_norm": 4.260781764984131, + "learning_rate": 3.759063002913289e-06, + "logits/chosen": -0.45337241888046265, + "logits/rejected": -0.534176766872406, + "logps/chosen": -43.803958892822266, + "logps/rejected": -99.61909484863281, + "loss": 0.549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.145153045654297, + "rewards/margins": 7.298679828643799, + "rewards/rejected": -4.153526782989502, + "step": 11588 + }, + { + "epoch": 2.9, + "grad_norm": 11.81229305267334, + "learning_rate": 3.7583016225456843e-06, + "logits/chosen": -0.5194349884986877, + "logits/rejected": -0.6309381723403931, + "logps/chosen": -56.07067108154297, + "logps/rejected": -79.39419555664062, + "loss": 0.6208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.224710702896118, + "rewards/margins": 6.203496932983398, + "rewards/rejected": -2.9787864685058594, + "step": 11589 + }, + { + "epoch": 2.9, + "grad_norm": 4.384899616241455, + "learning_rate": 3.7575402728617662e-06, + "logits/chosen": -0.5144815444946289, + "logits/rejected": -0.5897123217582703, + "logps/chosen": -55.13116455078125, + "logps/rejected": -96.70450592041016, + "loss": 0.6708, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.215886116027832, + "rewards/margins": 6.587096214294434, + "rewards/rejected": -3.3712105751037598, + "step": 11590 + }, + { + "epoch": 2.9, + "grad_norm": 8.690421104431152, + "learning_rate": 3.756778953880352e-06, + "logits/chosen": -0.5490987300872803, + "logits/rejected": -0.6977412700653076, + "logps/chosen": -61.56476593017578, + "logps/rejected": -110.77737426757812, + "loss": 0.6916, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.891627550125122, + "rewards/margins": 8.207043647766113, + "rewards/rejected": -5.315415859222412, + "step": 11591 + }, + { + "epoch": 2.9, + "grad_norm": 10.25366497039795, + "learning_rate": 3.75601766562025e-06, + "logits/chosen": -0.4833759665489197, + "logits/rejected": -0.5860708355903625, + "logps/chosen": -55.349308013916016, + "logps/rejected": -90.2940444946289, + "loss": 0.6252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.093255043029785, + "rewards/margins": 5.945228099822998, + "rewards/rejected": -2.8519725799560547, + "step": 11592 + }, + { + "epoch": 2.9, + "grad_norm": 14.6975679397583, + "learning_rate": 3.755256408100274e-06, + "logits/chosen": -0.46686851978302, + "logits/rejected": -0.5224733948707581, + "logps/chosen": -53.78438186645508, + "logps/rejected": -99.19821166992188, + "loss": 0.7078, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.807664632797241, + "rewards/margins": 5.393242359161377, + "rewards/rejected": -2.5855770111083984, + "step": 11593 + }, + { + "epoch": 2.9, + "grad_norm": 15.264330863952637, + "learning_rate": 3.754495181339237e-06, + "logits/chosen": -0.515536367893219, + "logits/rejected": -0.6012784838676453, + "logps/chosen": -50.26976013183594, + "logps/rejected": -98.44942474365234, + "loss": 0.6236, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.983320713043213, + "rewards/margins": 6.76923942565918, + "rewards/rejected": -3.7859179973602295, + "step": 11594 + }, + { + "epoch": 2.9, + "grad_norm": 6.636056423187256, + "learning_rate": 3.753733985355951e-06, + "logits/chosen": -0.4387078583240509, + "logits/rejected": -0.523433268070221, + "logps/chosen": -62.17296600341797, + "logps/rejected": -88.64801788330078, + "loss": 0.702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0012195110321045, + "rewards/margins": 5.246077537536621, + "rewards/rejected": -2.2448580265045166, + "step": 11595 + }, + { + "epoch": 2.9, + "grad_norm": 21.11243438720703, + "learning_rate": 3.752972820169221e-06, + "logits/chosen": -0.5206969976425171, + "logits/rejected": -0.6191917657852173, + "logps/chosen": -56.09626007080078, + "logps/rejected": -94.05901336669922, + "loss": 0.7786, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5234785079956055, + "rewards/margins": 6.563669204711914, + "rewards/rejected": -4.04019021987915, + "step": 11596 + }, + { + "epoch": 2.9, + "grad_norm": 40.406978607177734, + "learning_rate": 3.7522116857978603e-06, + "logits/chosen": -0.5315157771110535, + "logits/rejected": -0.5903836488723755, + "logps/chosen": -46.36754608154297, + "logps/rejected": -103.47674560546875, + "loss": 0.82, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6456382274627686, + "rewards/margins": 5.854197025299072, + "rewards/rejected": -3.2085587978363037, + "step": 11597 + }, + { + "epoch": 2.9, + "grad_norm": 4.497354507446289, + "learning_rate": 3.7514505822606766e-06, + "logits/chosen": -0.5418223142623901, + "logits/rejected": -0.6295081973075867, + "logps/chosen": -49.40599060058594, + "logps/rejected": -94.27320861816406, + "loss": 0.7503, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4189510345458984, + "rewards/margins": 6.507898807525635, + "rewards/rejected": -3.0889484882354736, + "step": 11598 + }, + { + "epoch": 2.9, + "grad_norm": 10.875503540039062, + "learning_rate": 3.750689509576476e-06, + "logits/chosen": -0.5988970398902893, + "logits/rejected": -0.6476348042488098, + "logps/chosen": -46.677032470703125, + "logps/rejected": -111.11376953125, + "loss": 0.5501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8989627361297607, + "rewards/margins": 6.581854820251465, + "rewards/rejected": -3.682892084121704, + "step": 11599 + }, + { + "epoch": 2.9, + "grad_norm": 5.15380859375, + "learning_rate": 3.7499284677640664e-06, + "logits/chosen": -0.5404768586158752, + "logits/rejected": -0.6022676825523376, + "logps/chosen": -63.62053680419922, + "logps/rejected": -107.4398193359375, + "loss": 0.6754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1714260578155518, + "rewards/margins": 7.187497138977051, + "rewards/rejected": -4.01607084274292, + "step": 11600 + }, + { + "epoch": 2.9, + "grad_norm": 13.725627899169922, + "learning_rate": 3.749167456842254e-06, + "logits/chosen": -0.505198061466217, + "logits/rejected": -0.5387616157531738, + "logps/chosen": -49.31524658203125, + "logps/rejected": -102.63536834716797, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.722851514816284, + "rewards/margins": 5.720434188842773, + "rewards/rejected": -2.99758243560791, + "step": 11601 + }, + { + "epoch": 2.9, + "grad_norm": 8.715935707092285, + "learning_rate": 3.7484064768298456e-06, + "logits/chosen": -0.6043171286582947, + "logits/rejected": -0.6771134734153748, + "logps/chosen": -65.42472839355469, + "logps/rejected": -112.03038024902344, + "loss": 0.6616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0882019996643066, + "rewards/margins": 7.371594429016113, + "rewards/rejected": -4.283392906188965, + "step": 11602 + }, + { + "epoch": 2.9, + "grad_norm": 3.2858147621154785, + "learning_rate": 3.7476455277456427e-06, + "logits/chosen": -0.4955301284790039, + "logits/rejected": -0.5666355490684509, + "logps/chosen": -55.750999450683594, + "logps/rejected": -114.02777862548828, + "loss": 0.6036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9997568130493164, + "rewards/margins": 7.491262435913086, + "rewards/rejected": -4.4915056228637695, + "step": 11603 + }, + { + "epoch": 2.9, + "grad_norm": 5.959083557128906, + "learning_rate": 3.746884609608451e-06, + "logits/chosen": -0.5122668743133545, + "logits/rejected": -0.6088774800300598, + "logps/chosen": -62.96904754638672, + "logps/rejected": -97.83323669433594, + "loss": 0.6835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8395657539367676, + "rewards/margins": 7.009191036224365, + "rewards/rejected": -4.169625282287598, + "step": 11604 + }, + { + "epoch": 2.9, + "grad_norm": 7.582810401916504, + "learning_rate": 3.746123722437075e-06, + "logits/chosen": -0.5571885108947754, + "logits/rejected": -0.5812348127365112, + "logps/chosen": -50.320674896240234, + "logps/rejected": -113.65281677246094, + "loss": 0.6554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.906794786453247, + "rewards/margins": 6.223677158355713, + "rewards/rejected": -3.3168821334838867, + "step": 11605 + }, + { + "epoch": 2.9, + "grad_norm": 5.811727523803711, + "learning_rate": 3.745362866250314e-06, + "logits/chosen": -0.43265312910079956, + "logits/rejected": -0.5136067867279053, + "logps/chosen": -56.73051452636719, + "logps/rejected": -105.39278411865234, + "loss": 0.6302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1239283084869385, + "rewards/margins": 7.155783653259277, + "rewards/rejected": -4.03185510635376, + "step": 11606 + }, + { + "epoch": 2.9, + "grad_norm": 3.2775838375091553, + "learning_rate": 3.7446020410669704e-06, + "logits/chosen": -0.47167256474494934, + "logits/rejected": -0.5463833212852478, + "logps/chosen": -51.8510627746582, + "logps/rejected": -106.30293273925781, + "loss": 0.521, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0819554328918457, + "rewards/margins": 6.745417594909668, + "rewards/rejected": -3.6634626388549805, + "step": 11607 + }, + { + "epoch": 2.9, + "grad_norm": 9.255633354187012, + "learning_rate": 3.7438412469058466e-06, + "logits/chosen": -0.5075395107269287, + "logits/rejected": -0.5721080899238586, + "logps/chosen": -69.8866958618164, + "logps/rejected": -89.70317840576172, + "loss": 0.8952, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.718082904815674, + "rewards/margins": 5.980467796325684, + "rewards/rejected": -3.2623848915100098, + "step": 11608 + }, + { + "epoch": 2.9, + "grad_norm": 5.961507797241211, + "learning_rate": 3.7430804837857436e-06, + "logits/chosen": -0.5018718838691711, + "logits/rejected": -0.5817002058029175, + "logps/chosen": -59.141456604003906, + "logps/rejected": -88.30419921875, + "loss": 0.7347, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.823390483856201, + "rewards/margins": 5.389326572418213, + "rewards/rejected": -2.5659360885620117, + "step": 11609 + }, + { + "epoch": 2.9, + "grad_norm": 5.318781852722168, + "learning_rate": 3.742319751725456e-06, + "logits/chosen": -0.5090576410293579, + "logits/rejected": -0.6311924457550049, + "logps/chosen": -52.42805862426758, + "logps/rejected": -102.67546081542969, + "loss": 0.577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9329047203063965, + "rewards/margins": 6.31596040725708, + "rewards/rejected": -3.3830552101135254, + "step": 11610 + }, + { + "epoch": 2.9, + "grad_norm": 13.471773147583008, + "learning_rate": 3.741559050743786e-06, + "logits/chosen": -0.46080663800239563, + "logits/rejected": -0.5700534582138062, + "logps/chosen": -60.132144927978516, + "logps/rejected": -108.25817108154297, + "loss": 0.5912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.309349536895752, + "rewards/margins": 7.09879732131958, + "rewards/rejected": -3.7894487380981445, + "step": 11611 + }, + { + "epoch": 2.9, + "grad_norm": 9.101065635681152, + "learning_rate": 3.7407983808595312e-06, + "logits/chosen": -0.5401337742805481, + "logits/rejected": -0.5927324891090393, + "logps/chosen": -52.14244842529297, + "logps/rejected": -113.92047882080078, + "loss": 0.8316, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9841270446777344, + "rewards/margins": 5.664158344268799, + "rewards/rejected": -2.6800315380096436, + "step": 11612 + }, + { + "epoch": 2.91, + "grad_norm": 6.862268447875977, + "learning_rate": 3.740037742091487e-06, + "logits/chosen": -0.5045685768127441, + "logits/rejected": -0.5485039353370667, + "logps/chosen": -60.91344451904297, + "logps/rejected": -133.66429138183594, + "loss": 0.6364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1494622230529785, + "rewards/margins": 7.172050476074219, + "rewards/rejected": -4.022587776184082, + "step": 11613 + }, + { + "epoch": 2.91, + "grad_norm": 7.989374160766602, + "learning_rate": 3.73927713445845e-06, + "logits/chosen": -0.5706403255462646, + "logits/rejected": -0.5666933059692383, + "logps/chosen": -51.33525848388672, + "logps/rejected": -108.55929565429688, + "loss": 0.7815, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0100631713867188, + "rewards/margins": 5.609123229980469, + "rewards/rejected": -2.59906005859375, + "step": 11614 + }, + { + "epoch": 2.91, + "grad_norm": 8.007821083068848, + "learning_rate": 3.738516557979216e-06, + "logits/chosen": -0.5185920000076294, + "logits/rejected": -0.5885169506072998, + "logps/chosen": -63.931453704833984, + "logps/rejected": -93.02860260009766, + "loss": 0.8289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.96166729927063, + "rewards/margins": 5.387750625610352, + "rewards/rejected": -2.4260830879211426, + "step": 11615 + }, + { + "epoch": 2.91, + "grad_norm": 3.2053635120391846, + "learning_rate": 3.737756012672581e-06, + "logits/chosen": -0.551540732383728, + "logits/rejected": -0.6201800107955933, + "logps/chosen": -44.373313903808594, + "logps/rejected": -104.44087219238281, + "loss": 0.6388, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.945633888244629, + "rewards/margins": 7.241477012634277, + "rewards/rejected": -4.295842170715332, + "step": 11616 + }, + { + "epoch": 2.91, + "grad_norm": 7.955833435058594, + "learning_rate": 3.7369954985573358e-06, + "logits/chosen": -0.4830349385738373, + "logits/rejected": -0.5545250773429871, + "logps/chosen": -62.854469299316406, + "logps/rejected": -98.86115264892578, + "loss": 0.6314, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1790385246276855, + "rewards/margins": 6.525234222412109, + "rewards/rejected": -3.3461952209472656, + "step": 11617 + }, + { + "epoch": 2.91, + "grad_norm": 6.402656555175781, + "learning_rate": 3.736235015652276e-06, + "logits/chosen": -0.40873396396636963, + "logits/rejected": -0.4118844270706177, + "logps/chosen": -53.29993438720703, + "logps/rejected": -99.16010284423828, + "loss": 0.6633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1825735569000244, + "rewards/margins": 4.769772052764893, + "rewards/rejected": -1.5871986150741577, + "step": 11618 + }, + { + "epoch": 2.91, + "grad_norm": 5.609379291534424, + "learning_rate": 3.7354745639761934e-06, + "logits/chosen": -0.5106902122497559, + "logits/rejected": -0.5147470235824585, + "logps/chosen": -50.956565856933594, + "logps/rejected": -123.39830780029297, + "loss": 0.585, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8325443267822266, + "rewards/margins": 7.15989351272583, + "rewards/rejected": -4.32735013961792, + "step": 11619 + }, + { + "epoch": 2.91, + "grad_norm": 5.572770595550537, + "learning_rate": 3.734714143547879e-06, + "logits/chosen": -0.5490928888320923, + "logits/rejected": -0.5846350789070129, + "logps/chosen": -47.143985748291016, + "logps/rejected": -104.31684875488281, + "loss": 0.612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.075805425643921, + "rewards/margins": 5.424304008483887, + "rewards/rejected": -2.348498582839966, + "step": 11620 + }, + { + "epoch": 2.91, + "grad_norm": 5.956000804901123, + "learning_rate": 3.7339537543861224e-06, + "logits/chosen": -0.5076302289962769, + "logits/rejected": -0.6123272180557251, + "logps/chosen": -56.28693389892578, + "logps/rejected": -106.25045776367188, + "loss": 0.5866, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1118857860565186, + "rewards/margins": 7.677770137786865, + "rewards/rejected": -4.565884590148926, + "step": 11621 + }, + { + "epoch": 2.91, + "grad_norm": 3.022256374359131, + "learning_rate": 3.7331933965097168e-06, + "logits/chosen": -0.4537001848220825, + "logits/rejected": -0.6007797718048096, + "logps/chosen": -50.29044723510742, + "logps/rejected": -95.69232940673828, + "loss": 0.5173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.304969549179077, + "rewards/margins": 7.231212139129639, + "rewards/rejected": -3.9262423515319824, + "step": 11622 + }, + { + "epoch": 2.91, + "grad_norm": 5.381992340087891, + "learning_rate": 3.7324330699374516e-06, + "logits/chosen": -0.6926372647285461, + "logits/rejected": -0.7222640514373779, + "logps/chosen": -65.22900390625, + "logps/rejected": -102.51932525634766, + "loss": 0.7456, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.065323829650879, + "rewards/margins": 6.1673431396484375, + "rewards/rejected": -3.1020193099975586, + "step": 11623 + }, + { + "epoch": 2.91, + "grad_norm": 5.745694160461426, + "learning_rate": 3.7316727746881104e-06, + "logits/chosen": -0.47454482316970825, + "logits/rejected": -0.5338616371154785, + "logps/chosen": -55.49421691894531, + "logps/rejected": -98.79887390136719, + "loss": 0.7168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0849151611328125, + "rewards/margins": 6.056519985198975, + "rewards/rejected": -2.9716053009033203, + "step": 11624 + }, + { + "epoch": 2.91, + "grad_norm": 4.603720188140869, + "learning_rate": 3.7309125107804845e-06, + "logits/chosen": -0.4905293881893158, + "logits/rejected": -0.5366736650466919, + "logps/chosen": -57.26959991455078, + "logps/rejected": -104.45816040039062, + "loss": 0.66, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6361069679260254, + "rewards/margins": 5.6528801918029785, + "rewards/rejected": -3.0167737007141113, + "step": 11625 + }, + { + "epoch": 2.91, + "grad_norm": 12.212913513183594, + "learning_rate": 3.730152278233361e-06, + "logits/chosen": -0.4457226097583771, + "logits/rejected": -0.5897260308265686, + "logps/chosen": -59.49811553955078, + "logps/rejected": -91.92589569091797, + "loss": 0.7106, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.710012435913086, + "rewards/margins": 7.358951568603516, + "rewards/rejected": -4.6489386558532715, + "step": 11626 + }, + { + "epoch": 2.91, + "grad_norm": 16.82923698425293, + "learning_rate": 3.7293920770655262e-06, + "logits/chosen": -0.5033870339393616, + "logits/rejected": -0.5928769111633301, + "logps/chosen": -55.636966705322266, + "logps/rejected": -95.2998275756836, + "loss": 0.7521, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.198312282562256, + "rewards/margins": 6.701653480529785, + "rewards/rejected": -3.5033411979675293, + "step": 11627 + }, + { + "epoch": 2.91, + "grad_norm": 10.005452156066895, + "learning_rate": 3.728631907295763e-06, + "logits/chosen": -0.5699340105056763, + "logits/rejected": -0.6190710067749023, + "logps/chosen": -49.952056884765625, + "logps/rejected": -106.41714477539062, + "loss": 0.6382, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9948039054870605, + "rewards/margins": 8.120804786682129, + "rewards/rejected": -5.126000881195068, + "step": 11628 + }, + { + "epoch": 2.91, + "grad_norm": 2.913007974624634, + "learning_rate": 3.727871768942859e-06, + "logits/chosen": -0.5795224905014038, + "logits/rejected": -0.6511962413787842, + "logps/chosen": -56.671653747558594, + "logps/rejected": -90.46984100341797, + "loss": 0.616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0006537437438965, + "rewards/margins": 7.440813064575195, + "rewards/rejected": -4.440159320831299, + "step": 11629 + }, + { + "epoch": 2.91, + "grad_norm": 4.491119384765625, + "learning_rate": 3.727111662025597e-06, + "logits/chosen": -0.4493476450443268, + "logits/rejected": -0.5267453193664551, + "logps/chosen": -54.707942962646484, + "logps/rejected": -122.65772247314453, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1849703788757324, + "rewards/margins": 7.806007385253906, + "rewards/rejected": -4.621036529541016, + "step": 11630 + }, + { + "epoch": 2.91, + "grad_norm": 4.061071395874023, + "learning_rate": 3.7263515865627585e-06, + "logits/chosen": -0.5702672004699707, + "logits/rejected": -0.6804764270782471, + "logps/chosen": -52.181827545166016, + "logps/rejected": -96.97647094726562, + "loss": 0.5816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9294917583465576, + "rewards/margins": 7.043826580047607, + "rewards/rejected": -4.114334583282471, + "step": 11631 + }, + { + "epoch": 2.91, + "grad_norm": 7.027633190155029, + "learning_rate": 3.725591542573127e-06, + "logits/chosen": -0.49713006615638733, + "logits/rejected": -0.531311571598053, + "logps/chosen": -48.116661071777344, + "logps/rejected": -102.83727264404297, + "loss": 0.7076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1606640815734863, + "rewards/margins": 6.713990688323975, + "rewards/rejected": -3.5533266067504883, + "step": 11632 + }, + { + "epoch": 2.91, + "grad_norm": 2.623478412628174, + "learning_rate": 3.7248315300754835e-06, + "logits/chosen": -0.5251452922821045, + "logits/rejected": -0.6217650175094604, + "logps/chosen": -50.03783416748047, + "logps/rejected": -82.25637817382812, + "loss": 0.5494, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1524384021759033, + "rewards/margins": 6.38419246673584, + "rewards/rejected": -3.2317545413970947, + "step": 11633 + }, + { + "epoch": 2.91, + "grad_norm": 4.064264297485352, + "learning_rate": 3.724071549088612e-06, + "logits/chosen": -0.527798593044281, + "logits/rejected": -0.6211754679679871, + "logps/chosen": -57.660614013671875, + "logps/rejected": -90.21746063232422, + "loss": 0.6034, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.371516227722168, + "rewards/margins": 6.525036811828613, + "rewards/rejected": -3.1535205841064453, + "step": 11634 + }, + { + "epoch": 2.91, + "grad_norm": 4.019850730895996, + "learning_rate": 3.7233115996312876e-06, + "logits/chosen": -0.5736824870109558, + "logits/rejected": -0.7006960511207581, + "logps/chosen": -46.081138610839844, + "logps/rejected": -86.42815399169922, + "loss": 0.6554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3261842727661133, + "rewards/margins": 7.519193649291992, + "rewards/rejected": -4.193009853363037, + "step": 11635 + }, + { + "epoch": 2.91, + "grad_norm": 3.1620333194732666, + "learning_rate": 3.7225516817222905e-06, + "logits/chosen": -0.4884793758392334, + "logits/rejected": -0.5797459483146667, + "logps/chosen": -64.81922912597656, + "logps/rejected": -76.86174774169922, + "loss": 0.7114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3563613891601562, + "rewards/margins": 5.474736213684082, + "rewards/rejected": -2.118375301361084, + "step": 11636 + }, + { + "epoch": 2.91, + "grad_norm": 4.399899005889893, + "learning_rate": 3.721791795380403e-06, + "logits/chosen": -0.46166661381721497, + "logits/rejected": -0.5440841317176819, + "logps/chosen": -53.526763916015625, + "logps/rejected": -106.8198013305664, + "loss": 0.5908, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.516087770462036, + "rewards/margins": 7.356228828430176, + "rewards/rejected": -3.8401408195495605, + "step": 11637 + }, + { + "epoch": 2.91, + "grad_norm": 5.602583885192871, + "learning_rate": 3.7210319406243965e-06, + "logits/chosen": -0.513850748538971, + "logits/rejected": -0.5517637133598328, + "logps/chosen": -58.9402961730957, + "logps/rejected": -101.83601379394531, + "loss": 0.8069, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.109056234359741, + "rewards/margins": 5.336116313934326, + "rewards/rejected": -2.2270593643188477, + "step": 11638 + }, + { + "epoch": 2.91, + "grad_norm": 3.352353096008301, + "learning_rate": 3.720272117473053e-06, + "logits/chosen": -0.43711262941360474, + "logits/rejected": -0.5276704430580139, + "logps/chosen": -58.58245086669922, + "logps/rejected": -102.56243896484375, + "loss": 0.6533, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.889698028564453, + "rewards/margins": 7.069883346557617, + "rewards/rejected": -4.180185794830322, + "step": 11639 + }, + { + "epoch": 2.91, + "grad_norm": 3.78568172454834, + "learning_rate": 3.719512325945146e-06, + "logits/chosen": -0.47615575790405273, + "logits/rejected": -0.5742508172988892, + "logps/chosen": -56.08761978149414, + "logps/rejected": -92.65312194824219, + "loss": 0.6485, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.983875274658203, + "rewards/margins": 6.008133411407471, + "rewards/rejected": -3.0242581367492676, + "step": 11640 + }, + { + "epoch": 2.91, + "grad_norm": 4.941430568695068, + "learning_rate": 3.7187525660594524e-06, + "logits/chosen": -0.5018547773361206, + "logits/rejected": -0.5934823155403137, + "logps/chosen": -53.35867691040039, + "logps/rejected": -79.28685760498047, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5184273719787598, + "rewards/margins": 6.386094570159912, + "rewards/rejected": -2.8676671981811523, + "step": 11641 + }, + { + "epoch": 2.91, + "grad_norm": 3.0074710845947266, + "learning_rate": 3.717992837834745e-06, + "logits/chosen": -0.5202139616012573, + "logits/rejected": -0.6030983924865723, + "logps/chosen": -42.19569396972656, + "logps/rejected": -88.193115234375, + "loss": 0.5567, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3007428646087646, + "rewards/margins": 7.268961429595947, + "rewards/rejected": -3.9682185649871826, + "step": 11642 + }, + { + "epoch": 2.91, + "grad_norm": 7.152562618255615, + "learning_rate": 3.7172331412897984e-06, + "logits/chosen": -0.4494594633579254, + "logits/rejected": -0.5275872945785522, + "logps/chosen": -69.2810287475586, + "logps/rejected": -110.97547912597656, + "loss": 0.7854, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.770803451538086, + "rewards/margins": 5.742434978485107, + "rewards/rejected": -2.9716320037841797, + "step": 11643 + }, + { + "epoch": 2.91, + "grad_norm": 3.845417022705078, + "learning_rate": 3.7164734764433856e-06, + "logits/chosen": -0.5076930522918701, + "logits/rejected": -0.559894859790802, + "logps/chosen": -47.72864532470703, + "logps/rejected": -107.90943908691406, + "loss": 0.6974, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.20092511177063, + "rewards/margins": 6.531139373779297, + "rewards/rejected": -3.330214262008667, + "step": 11644 + }, + { + "epoch": 2.91, + "grad_norm": 5.463112831115723, + "learning_rate": 3.7157138433142782e-06, + "logits/chosen": -0.4443332850933075, + "logits/rejected": -0.4663785398006439, + "logps/chosen": -55.838623046875, + "logps/rejected": -111.75638580322266, + "loss": 0.6592, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.126786708831787, + "rewards/margins": 6.445841312408447, + "rewards/rejected": -3.31905460357666, + "step": 11645 + }, + { + "epoch": 2.91, + "grad_norm": 7.291932582855225, + "learning_rate": 3.714954241921247e-06, + "logits/chosen": -0.49314823746681213, + "logits/rejected": -0.576498806476593, + "logps/chosen": -56.83281326293945, + "logps/rejected": -89.68555450439453, + "loss": 0.8007, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9623961448669434, + "rewards/margins": 5.737511157989502, + "rewards/rejected": -2.7751145362854004, + "step": 11646 + }, + { + "epoch": 2.91, + "grad_norm": 5.52947473526001, + "learning_rate": 3.7141946722830637e-06, + "logits/chosen": -0.5353742837905884, + "logits/rejected": -0.6416875123977661, + "logps/chosen": -62.55900573730469, + "logps/rejected": -94.52935028076172, + "loss": 0.7421, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.05076265335083, + "rewards/margins": 5.329597473144531, + "rewards/rejected": -2.2788355350494385, + "step": 11647 + }, + { + "epoch": 2.91, + "grad_norm": 2.7522408962249756, + "learning_rate": 3.7134351344184987e-06, + "logits/chosen": -0.523531436920166, + "logits/rejected": -0.5987913012504578, + "logps/chosen": -46.75705337524414, + "logps/rejected": -100.67132568359375, + "loss": 0.5378, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3947055339813232, + "rewards/margins": 7.0661234855651855, + "rewards/rejected": -3.6714181900024414, + "step": 11648 + }, + { + "epoch": 2.91, + "grad_norm": 2.7598419189453125, + "learning_rate": 3.712675628346319e-06, + "logits/chosen": -0.4743192791938782, + "logits/rejected": -0.523063063621521, + "logps/chosen": -61.739906311035156, + "logps/rejected": -101.48260498046875, + "loss": 0.6468, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0407214164733887, + "rewards/margins": 7.215511798858643, + "rewards/rejected": -4.174790382385254, + "step": 11649 + }, + { + "epoch": 2.91, + "grad_norm": 5.432828426361084, + "learning_rate": 3.7119161540852923e-06, + "logits/chosen": -0.5435014963150024, + "logits/rejected": -0.6422114968299866, + "logps/chosen": -55.80195236206055, + "logps/rejected": -103.3292007446289, + "loss": 0.6679, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9742441177368164, + "rewards/margins": 7.28416109085083, + "rewards/rejected": -4.309917449951172, + "step": 11650 + }, + { + "epoch": 2.91, + "grad_norm": 7.866123199462891, + "learning_rate": 3.7111567116541893e-06, + "logits/chosen": -0.5218468308448792, + "logits/rejected": -0.6177184581756592, + "logps/chosen": -45.325294494628906, + "logps/rejected": -101.88629150390625, + "loss": 0.6664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.051675319671631, + "rewards/margins": 6.434053421020508, + "rewards/rejected": -3.3823773860931396, + "step": 11651 + }, + { + "epoch": 2.91, + "grad_norm": 9.462967872619629, + "learning_rate": 3.7103973010717762e-06, + "logits/chosen": -0.521948516368866, + "logits/rejected": -0.5198276042938232, + "logps/chosen": -42.68233108520508, + "logps/rejected": -97.31808471679688, + "loss": 0.6339, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3388266563415527, + "rewards/margins": 5.6707987785339355, + "rewards/rejected": -2.331972599029541, + "step": 11652 + }, + { + "epoch": 2.92, + "grad_norm": 20.402816772460938, + "learning_rate": 3.7096379223568145e-06, + "logits/chosen": -0.46010226011276245, + "logits/rejected": -0.5536181926727295, + "logps/chosen": -58.89572525024414, + "logps/rejected": -95.27195739746094, + "loss": 0.7887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8359012603759766, + "rewards/margins": 6.472952842712402, + "rewards/rejected": -3.6370513439178467, + "step": 11653 + }, + { + "epoch": 2.92, + "grad_norm": 9.900230407714844, + "learning_rate": 3.708878575528074e-06, + "logits/chosen": -0.48642563819885254, + "logits/rejected": -0.5554053783416748, + "logps/chosen": -47.68288803100586, + "logps/rejected": -92.97177124023438, + "loss": 0.5995, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0678861141204834, + "rewards/margins": 6.20645809173584, + "rewards/rejected": -3.1385719776153564, + "step": 11654 + }, + { + "epoch": 2.92, + "grad_norm": 4.422605514526367, + "learning_rate": 3.708119260604317e-06, + "logits/chosen": -0.4760037660598755, + "logits/rejected": -0.5774813890457153, + "logps/chosen": -68.50753021240234, + "logps/rejected": -91.81758117675781, + "loss": 0.7417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.225914239883423, + "rewards/margins": 5.793926239013672, + "rewards/rejected": -2.56801176071167, + "step": 11655 + }, + { + "epoch": 2.92, + "grad_norm": 8.60987377166748, + "learning_rate": 3.7073599776043066e-06, + "logits/chosen": -0.45708540081977844, + "logits/rejected": -0.5356073379516602, + "logps/chosen": -81.50641632080078, + "logps/rejected": -97.79931640625, + "loss": 0.815, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.120002269744873, + "rewards/margins": 6.668389320373535, + "rewards/rejected": -3.5483877658843994, + "step": 11656 + }, + { + "epoch": 2.92, + "grad_norm": 12.893641471862793, + "learning_rate": 3.706600726546807e-06, + "logits/chosen": -0.48209986090660095, + "logits/rejected": -0.5669745206832886, + "logps/chosen": -53.80340576171875, + "logps/rejected": -95.56122589111328, + "loss": 0.7551, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9602999687194824, + "rewards/margins": 6.249530792236328, + "rewards/rejected": -3.2892305850982666, + "step": 11657 + }, + { + "epoch": 2.92, + "grad_norm": 3.546962022781372, + "learning_rate": 3.7058415074505784e-06, + "logits/chosen": -0.4581821858882904, + "logits/rejected": -0.5679146647453308, + "logps/chosen": -55.59013366699219, + "logps/rejected": -87.359619140625, + "loss": 0.6495, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.391561508178711, + "rewards/margins": 6.500986099243164, + "rewards/rejected": -3.1094248294830322, + "step": 11658 + }, + { + "epoch": 2.92, + "grad_norm": 8.736010551452637, + "learning_rate": 3.7050823203343833e-06, + "logits/chosen": -0.4771823585033417, + "logits/rejected": -0.5789148807525635, + "logps/chosen": -60.260616302490234, + "logps/rejected": -100.15357208251953, + "loss": 0.7951, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8649494647979736, + "rewards/margins": 6.27261209487915, + "rewards/rejected": -3.4076626300811768, + "step": 11659 + }, + { + "epoch": 2.92, + "grad_norm": 5.359426021575928, + "learning_rate": 3.704323165216981e-06, + "logits/chosen": -0.5426322221755981, + "logits/rejected": -0.5980061292648315, + "logps/chosen": -57.30205535888672, + "logps/rejected": -108.94122314453125, + "loss": 0.6638, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.909701108932495, + "rewards/margins": 5.716926097869873, + "rewards/rejected": -2.807224988937378, + "step": 11660 + }, + { + "epoch": 2.92, + "grad_norm": 7.245604515075684, + "learning_rate": 3.7035640421171313e-06, + "logits/chosen": -0.49421587586402893, + "logits/rejected": -0.5450667142868042, + "logps/chosen": -49.742462158203125, + "logps/rejected": -107.56786346435547, + "loss": 0.588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9376091957092285, + "rewards/margins": 6.068732261657715, + "rewards/rejected": -3.131122589111328, + "step": 11661 + }, + { + "epoch": 2.92, + "grad_norm": 4.5107421875, + "learning_rate": 3.702804951053594e-06, + "logits/chosen": -0.4841982126235962, + "logits/rejected": -0.6136780381202698, + "logps/chosen": -50.79008865356445, + "logps/rejected": -87.11685943603516, + "loss": 0.6136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.868523359298706, + "rewards/margins": 7.015410423278809, + "rewards/rejected": -4.146887302398682, + "step": 11662 + }, + { + "epoch": 2.92, + "grad_norm": 7.937435150146484, + "learning_rate": 3.702045892045125e-06, + "logits/chosen": -0.4697261452674866, + "logits/rejected": -0.5291492938995361, + "logps/chosen": -45.999244689941406, + "logps/rejected": -97.29981994628906, + "loss": 0.6594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8243746757507324, + "rewards/margins": 5.77554988861084, + "rewards/rejected": -2.951174736022949, + "step": 11663 + }, + { + "epoch": 2.92, + "grad_norm": 10.940935134887695, + "learning_rate": 3.701286865110483e-06, + "logits/chosen": -0.5562728047370911, + "logits/rejected": -0.6231447458267212, + "logps/chosen": -48.03395080566406, + "logps/rejected": -102.58360290527344, + "loss": 0.6782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3896231651306152, + "rewards/margins": 5.965923309326172, + "rewards/rejected": -2.5763003826141357, + "step": 11664 + }, + { + "epoch": 2.92, + "grad_norm": 7.684734344482422, + "learning_rate": 3.7005278702684223e-06, + "logits/chosen": -0.5371318459510803, + "logits/rejected": -0.6197993755340576, + "logps/chosen": -47.722320556640625, + "logps/rejected": -93.70619201660156, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1594033241271973, + "rewards/margins": 5.716968536376953, + "rewards/rejected": -2.5575647354125977, + "step": 11665 + }, + { + "epoch": 2.92, + "grad_norm": 18.426597595214844, + "learning_rate": 3.6997689075377034e-06, + "logits/chosen": -0.48430192470550537, + "logits/rejected": -0.527050793170929, + "logps/chosen": -58.84400939941406, + "logps/rejected": -98.11968994140625, + "loss": 0.7311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5681450366973877, + "rewards/margins": 5.488689422607422, + "rewards/rejected": -2.920544147491455, + "step": 11666 + }, + { + "epoch": 2.92, + "grad_norm": 7.0710954666137695, + "learning_rate": 3.6990099769370745e-06, + "logits/chosen": -0.5714728832244873, + "logits/rejected": -0.634924054145813, + "logps/chosen": -52.00377655029297, + "logps/rejected": -103.50344848632812, + "loss": 0.6172, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.917341470718384, + "rewards/margins": 6.241528511047363, + "rewards/rejected": -3.3241870403289795, + "step": 11667 + }, + { + "epoch": 2.92, + "grad_norm": 17.107389450073242, + "learning_rate": 3.6982510784852944e-06, + "logits/chosen": -0.48045864701271057, + "logits/rejected": -0.5595337152481079, + "logps/chosen": -57.64072799682617, + "logps/rejected": -105.61820983886719, + "loss": 0.7805, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8155386447906494, + "rewards/margins": 7.356640338897705, + "rewards/rejected": -4.541102409362793, + "step": 11668 + }, + { + "epoch": 2.92, + "grad_norm": 15.358662605285645, + "learning_rate": 3.697492212201117e-06, + "logits/chosen": -0.440330445766449, + "logits/rejected": -0.5205036401748657, + "logps/chosen": -61.57769775390625, + "logps/rejected": -101.51156616210938, + "loss": 0.811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.898965835571289, + "rewards/margins": 6.333945274353027, + "rewards/rejected": -3.43497896194458, + "step": 11669 + }, + { + "epoch": 2.92, + "grad_norm": 5.554605484008789, + "learning_rate": 3.6967333781032887e-06, + "logits/chosen": -0.5160413384437561, + "logits/rejected": -0.6470654606819153, + "logps/chosen": -62.828006744384766, + "logps/rejected": -103.06243133544922, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.329166889190674, + "rewards/margins": 8.012300491333008, + "rewards/rejected": -4.683133602142334, + "step": 11670 + }, + { + "epoch": 2.92, + "grad_norm": 7.200848579406738, + "learning_rate": 3.6959745762105664e-06, + "logits/chosen": -0.4636462330818176, + "logits/rejected": -0.5856196880340576, + "logps/chosen": -65.20451354980469, + "logps/rejected": -95.60476684570312, + "loss": 0.6809, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2347445487976074, + "rewards/margins": 6.564764022827148, + "rewards/rejected": -3.33001971244812, + "step": 11671 + }, + { + "epoch": 2.92, + "grad_norm": 17.032550811767578, + "learning_rate": 3.695215806541699e-06, + "logits/chosen": -0.4762270450592041, + "logits/rejected": -0.5592795610427856, + "logps/chosen": -53.25895690917969, + "logps/rejected": -94.57624053955078, + "loss": 0.6946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9838619232177734, + "rewards/margins": 6.19258975982666, + "rewards/rejected": -3.208728313446045, + "step": 11672 + }, + { + "epoch": 2.92, + "grad_norm": 1.9392051696777344, + "learning_rate": 3.694457069115438e-06, + "logits/chosen": -0.4488460421562195, + "logits/rejected": -0.5317833423614502, + "logps/chosen": -55.55507278442383, + "logps/rejected": -94.31587219238281, + "loss": 0.5546, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3754940032958984, + "rewards/margins": 7.143309116363525, + "rewards/rejected": -3.767815351486206, + "step": 11673 + }, + { + "epoch": 2.92, + "grad_norm": 10.660181999206543, + "learning_rate": 3.69369836395053e-06, + "logits/chosen": -0.3993220925331116, + "logits/rejected": -0.5311316251754761, + "logps/chosen": -55.82185745239258, + "logps/rejected": -109.9747314453125, + "loss": 0.6109, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9387149810791016, + "rewards/margins": 6.455117702484131, + "rewards/rejected": -3.51640248298645, + "step": 11674 + }, + { + "epoch": 2.92, + "grad_norm": 5.753843307495117, + "learning_rate": 3.6929396910657254e-06, + "logits/chosen": -0.516444206237793, + "logits/rejected": -0.568408727645874, + "logps/chosen": -62.99061584472656, + "logps/rejected": -113.23667907714844, + "loss": 0.7081, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.905109405517578, + "rewards/margins": 6.157205581665039, + "rewards/rejected": -3.252096176147461, + "step": 11675 + }, + { + "epoch": 2.92, + "grad_norm": 5.448991775512695, + "learning_rate": 3.6921810504797705e-06, + "logits/chosen": -0.5293759107589722, + "logits/rejected": -0.5883544087409973, + "logps/chosen": -54.832176208496094, + "logps/rejected": -121.93848419189453, + "loss": 0.6281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.260125160217285, + "rewards/margins": 6.645802021026611, + "rewards/rejected": -3.385676145553589, + "step": 11676 + }, + { + "epoch": 2.92, + "grad_norm": 12.773418426513672, + "learning_rate": 3.6914224422114154e-06, + "logits/chosen": -0.4968031346797943, + "logits/rejected": -0.5890902280807495, + "logps/chosen": -56.802127838134766, + "logps/rejected": -108.44928741455078, + "loss": 0.6705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5910098552703857, + "rewards/margins": 6.406970024108887, + "rewards/rejected": -3.815959930419922, + "step": 11677 + }, + { + "epoch": 2.92, + "grad_norm": 3.898787498474121, + "learning_rate": 3.6906638662794015e-06, + "logits/chosen": -0.5158854126930237, + "logits/rejected": -0.5987831354141235, + "logps/chosen": -57.11924362182617, + "logps/rejected": -97.58566284179688, + "loss": 0.7008, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2345991134643555, + "rewards/margins": 6.569716930389404, + "rewards/rejected": -3.3351173400878906, + "step": 11678 + }, + { + "epoch": 2.92, + "grad_norm": 3.6980361938476562, + "learning_rate": 3.6899053227024754e-06, + "logits/chosen": -0.46765679121017456, + "logits/rejected": -0.5188353657722473, + "logps/chosen": -55.04294967651367, + "logps/rejected": -104.98607635498047, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989527702331543, + "rewards/margins": 6.304673194885254, + "rewards/rejected": -3.315145492553711, + "step": 11679 + }, + { + "epoch": 2.92, + "grad_norm": 13.765950202941895, + "learning_rate": 3.6891468114993857e-06, + "logits/chosen": -0.46895459294319153, + "logits/rejected": -0.534551203250885, + "logps/chosen": -70.47412872314453, + "logps/rejected": -113.10607147216797, + "loss": 0.9479, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.417834758758545, + "rewards/margins": 5.675876140594482, + "rewards/rejected": -3.2580413818359375, + "step": 11680 + }, + { + "epoch": 2.92, + "grad_norm": 10.057761192321777, + "learning_rate": 3.688388332688869e-06, + "logits/chosen": -0.45047515630722046, + "logits/rejected": -0.5307789444923401, + "logps/chosen": -63.640323638916016, + "logps/rejected": -106.60279846191406, + "loss": 0.6604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6821484565734863, + "rewards/margins": 5.614234924316406, + "rewards/rejected": -2.93208646774292, + "step": 11681 + }, + { + "epoch": 2.92, + "grad_norm": 5.351722240447998, + "learning_rate": 3.6876298862896733e-06, + "logits/chosen": -0.4346984922885895, + "logits/rejected": -0.5317856073379517, + "logps/chosen": -57.888450622558594, + "logps/rejected": -98.04732513427734, + "loss": 0.5991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7262885570526123, + "rewards/margins": 5.803538799285889, + "rewards/rejected": -3.0772504806518555, + "step": 11682 + }, + { + "epoch": 2.92, + "grad_norm": 3.358147144317627, + "learning_rate": 3.6868714723205377e-06, + "logits/chosen": -0.4400608539581299, + "logits/rejected": -0.5363575220108032, + "logps/chosen": -50.19648742675781, + "logps/rejected": -112.93690490722656, + "loss": 0.5893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.192124605178833, + "rewards/margins": 7.539468288421631, + "rewards/rejected": -4.3473429679870605, + "step": 11683 + }, + { + "epoch": 2.92, + "grad_norm": 15.951237678527832, + "learning_rate": 3.686113090800207e-06, + "logits/chosen": -0.4782464802265167, + "logits/rejected": -0.5393239259719849, + "logps/chosen": -59.60689163208008, + "logps/rejected": -100.69444274902344, + "loss": 0.8215, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.692892551422119, + "rewards/margins": 5.869900226593018, + "rewards/rejected": -3.1770076751708984, + "step": 11684 + }, + { + "epoch": 2.92, + "grad_norm": 7.201351165771484, + "learning_rate": 3.685354741747418e-06, + "logits/chosen": -0.43291687965393066, + "logits/rejected": -0.5090380907058716, + "logps/chosen": -58.30076217651367, + "logps/rejected": -96.77437591552734, + "loss": 0.5913, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9964005947113037, + "rewards/margins": 6.593428611755371, + "rewards/rejected": -3.5970277786254883, + "step": 11685 + }, + { + "epoch": 2.92, + "grad_norm": 7.022405624389648, + "learning_rate": 3.6845964251809113e-06, + "logits/chosen": -0.4559336006641388, + "logits/rejected": -0.5800613164901733, + "logps/chosen": -60.3397102355957, + "logps/rejected": -91.33464813232422, + "loss": 0.7614, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.95949649810791, + "rewards/margins": 6.1463117599487305, + "rewards/rejected": -3.1868152618408203, + "step": 11686 + }, + { + "epoch": 2.92, + "grad_norm": 4.211556911468506, + "learning_rate": 3.6838381411194274e-06, + "logits/chosen": -0.4958692193031311, + "logits/rejected": -0.6499635577201843, + "logps/chosen": -56.381832122802734, + "logps/rejected": -103.18159484863281, + "loss": 0.5933, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9618992805480957, + "rewards/margins": 8.231816291809082, + "rewards/rejected": -5.269916534423828, + "step": 11687 + }, + { + "epoch": 2.92, + "grad_norm": 17.25528907775879, + "learning_rate": 3.683079889581702e-06, + "logits/chosen": -0.47573763132095337, + "logits/rejected": -0.6087914109230042, + "logps/chosen": -50.64645004272461, + "logps/rejected": -99.22046661376953, + "loss": 0.5627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.090376377105713, + "rewards/margins": 6.397407531738281, + "rewards/rejected": -3.3070313930511475, + "step": 11688 + }, + { + "epoch": 2.92, + "grad_norm": 6.226195335388184, + "learning_rate": 3.682321670586473e-06, + "logits/chosen": -0.5150241255760193, + "logits/rejected": -0.6169048547744751, + "logps/chosen": -56.049320220947266, + "logps/rejected": -91.6020278930664, + "loss": 0.5932, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1201653480529785, + "rewards/margins": 6.717163562774658, + "rewards/rejected": -3.5969977378845215, + "step": 11689 + }, + { + "epoch": 2.92, + "grad_norm": 18.938018798828125, + "learning_rate": 3.681563484152477e-06, + "logits/chosen": -0.4993962049484253, + "logits/rejected": -0.590217649936676, + "logps/chosen": -57.97003936767578, + "logps/rejected": -102.85205841064453, + "loss": 0.7942, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8242881298065186, + "rewards/margins": 6.47318172454834, + "rewards/rejected": -3.648893117904663, + "step": 11690 + }, + { + "epoch": 2.92, + "grad_norm": 2.528805732727051, + "learning_rate": 3.680805330298451e-06, + "logits/chosen": -0.41492757201194763, + "logits/rejected": -0.5474304556846619, + "logps/chosen": -53.26435470581055, + "logps/rejected": -84.6180648803711, + "loss": 0.5297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.407947063446045, + "rewards/margins": 7.245490074157715, + "rewards/rejected": -3.8375425338745117, + "step": 11691 + }, + { + "epoch": 2.92, + "grad_norm": 4.749764442443848, + "learning_rate": 3.680047209043127e-06, + "logits/chosen": -0.43889766931533813, + "logits/rejected": -0.4901507496833801, + "logps/chosen": -51.00309371948242, + "logps/rejected": -95.09001159667969, + "loss": 0.687, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9086694717407227, + "rewards/margins": 5.86225700378418, + "rewards/rejected": -2.953587770462036, + "step": 11692 + }, + { + "epoch": 2.93, + "grad_norm": 6.293358325958252, + "learning_rate": 3.67928912040524e-06, + "logits/chosen": -0.4388176202774048, + "logits/rejected": -0.5277136564254761, + "logps/chosen": -71.55831909179688, + "logps/rejected": -106.38224792480469, + "loss": 0.7374, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.738881826400757, + "rewards/margins": 5.993711471557617, + "rewards/rejected": -3.2548294067382812, + "step": 11693 + }, + { + "epoch": 2.93, + "grad_norm": 3.4581656455993652, + "learning_rate": 3.6785310644035265e-06, + "logits/chosen": -0.460674524307251, + "logits/rejected": -0.5254380702972412, + "logps/chosen": -44.460296630859375, + "logps/rejected": -106.81526947021484, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0719563961029053, + "rewards/margins": 7.263411045074463, + "rewards/rejected": -4.191454887390137, + "step": 11694 + }, + { + "epoch": 2.93, + "grad_norm": 7.797329425811768, + "learning_rate": 3.6777730410567136e-06, + "logits/chosen": -0.44322794675827026, + "logits/rejected": -0.5262369513511658, + "logps/chosen": -61.780982971191406, + "logps/rejected": -107.90934753417969, + "loss": 0.6876, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3191628456115723, + "rewards/margins": 6.59951639175415, + "rewards/rejected": -3.280353546142578, + "step": 11695 + }, + { + "epoch": 2.93, + "grad_norm": 3.7638583183288574, + "learning_rate": 3.6770150503835343e-06, + "logits/chosen": -0.5135014057159424, + "logits/rejected": -0.6146275401115417, + "logps/chosen": -46.12184524536133, + "logps/rejected": -92.94608306884766, + "loss": 0.6298, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1711645126342773, + "rewards/margins": 6.454362392425537, + "rewards/rejected": -3.2831978797912598, + "step": 11696 + }, + { + "epoch": 2.93, + "grad_norm": 13.43498420715332, + "learning_rate": 3.676257092402722e-06, + "logits/chosen": -0.5326529145240784, + "logits/rejected": -0.6616952419281006, + "logps/chosen": -61.41967010498047, + "logps/rejected": -85.9544677734375, + "loss": 0.8654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8956284523010254, + "rewards/margins": 6.145392417907715, + "rewards/rejected": -3.2497644424438477, + "step": 11697 + }, + { + "epoch": 2.93, + "grad_norm": 11.957730293273926, + "learning_rate": 3.675499167133005e-06, + "logits/chosen": -0.4777643084526062, + "logits/rejected": -0.5077961683273315, + "logps/chosen": -63.02028274536133, + "logps/rejected": -114.99519348144531, + "loss": 0.843, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.5930919647216797, + "rewards/margins": 4.8246283531188965, + "rewards/rejected": -2.2315359115600586, + "step": 11698 + }, + { + "epoch": 2.93, + "grad_norm": 2.4256107807159424, + "learning_rate": 3.674741274593112e-06, + "logits/chosen": -0.4894196391105652, + "logits/rejected": -0.5689600110054016, + "logps/chosen": -45.500404357910156, + "logps/rejected": -88.90985870361328, + "loss": 0.555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.357022285461426, + "rewards/margins": 5.9419755935668945, + "rewards/rejected": -2.5849533081054688, + "step": 11699 + }, + { + "epoch": 2.93, + "grad_norm": 3.9869773387908936, + "learning_rate": 3.673983414801772e-06, + "logits/chosen": -0.49311813712120056, + "logits/rejected": -0.5842167139053345, + "logps/chosen": -51.04705810546875, + "logps/rejected": -88.32054138183594, + "loss": 0.5832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1583216190338135, + "rewards/margins": 6.802258491516113, + "rewards/rejected": -3.6439368724823, + "step": 11700 + }, + { + "epoch": 2.93, + "grad_norm": 8.21367073059082, + "learning_rate": 3.6732255877777114e-06, + "logits/chosen": -0.4677085280418396, + "logits/rejected": -0.525833785533905, + "logps/chosen": -50.942935943603516, + "logps/rejected": -105.87500762939453, + "loss": 0.674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0147998332977295, + "rewards/margins": 6.074764251708984, + "rewards/rejected": -3.0599639415740967, + "step": 11701 + }, + { + "epoch": 2.93, + "grad_norm": 6.2052764892578125, + "learning_rate": 3.672467793539659e-06, + "logits/chosen": -0.4930311143398285, + "logits/rejected": -0.6066855192184448, + "logps/chosen": -55.944488525390625, + "logps/rejected": -88.80774688720703, + "loss": 0.6635, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8392343521118164, + "rewards/margins": 5.923747539520264, + "rewards/rejected": -3.0845134258270264, + "step": 11702 + }, + { + "epoch": 2.93, + "grad_norm": 7.303165435791016, + "learning_rate": 3.671710032106338e-06, + "logits/chosen": -0.468721866607666, + "logits/rejected": -0.5263007879257202, + "logps/chosen": -63.64923095703125, + "logps/rejected": -98.15071105957031, + "loss": 0.6911, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2702486515045166, + "rewards/margins": 5.745412826538086, + "rewards/rejected": -2.4751639366149902, + "step": 11703 + }, + { + "epoch": 2.93, + "grad_norm": 2.413529396057129, + "learning_rate": 3.6709523034964756e-06, + "logits/chosen": -0.47008490562438965, + "logits/rejected": -0.5107003450393677, + "logps/chosen": -59.52692413330078, + "logps/rejected": -117.64781188964844, + "loss": 0.5823, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3878231048583984, + "rewards/margins": 6.712018013000488, + "rewards/rejected": -3.3241946697235107, + "step": 11704 + }, + { + "epoch": 2.93, + "grad_norm": 19.71312141418457, + "learning_rate": 3.6701946077287965e-06, + "logits/chosen": -0.4811527729034424, + "logits/rejected": -0.5219366550445557, + "logps/chosen": -50.656368255615234, + "logps/rejected": -100.09140014648438, + "loss": 0.8645, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6005685329437256, + "rewards/margins": 5.379795551300049, + "rewards/rejected": -2.779226779937744, + "step": 11705 + }, + { + "epoch": 2.93, + "grad_norm": 7.280776023864746, + "learning_rate": 3.669436944822022e-06, + "logits/chosen": -0.38338515162467957, + "logits/rejected": -0.5417678952217102, + "logps/chosen": -69.66048431396484, + "logps/rejected": -95.67398071289062, + "loss": 0.6578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7163267135620117, + "rewards/margins": 6.777675151824951, + "rewards/rejected": -4.0613484382629395, + "step": 11706 + }, + { + "epoch": 2.93, + "grad_norm": 5.786664009094238, + "learning_rate": 3.668679314794875e-06, + "logits/chosen": -0.4727154076099396, + "logits/rejected": -0.5379228591918945, + "logps/chosen": -54.72041702270508, + "logps/rejected": -90.5372314453125, + "loss": 0.6912, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.10394024848938, + "rewards/margins": 6.505727767944336, + "rewards/rejected": -3.401787042617798, + "step": 11707 + }, + { + "epoch": 2.93, + "grad_norm": 6.623659610748291, + "learning_rate": 3.667921717666078e-06, + "logits/chosen": -0.4664698839187622, + "logits/rejected": -0.5480167269706726, + "logps/chosen": -48.480072021484375, + "logps/rejected": -102.95803833007812, + "loss": 0.6502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1128859519958496, + "rewards/margins": 6.374061107635498, + "rewards/rejected": -3.2611751556396484, + "step": 11708 + }, + { + "epoch": 2.93, + "grad_norm": 6.4574360847473145, + "learning_rate": 3.6671641534543558e-06, + "logits/chosen": -0.5008800625801086, + "logits/rejected": -0.5945053100585938, + "logps/chosen": -54.70811080932617, + "logps/rejected": -101.11808013916016, + "loss": 0.6924, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9770946502685547, + "rewards/margins": 6.5906877517700195, + "rewards/rejected": -3.613593339920044, + "step": 11709 + }, + { + "epoch": 2.93, + "grad_norm": 4.200397968292236, + "learning_rate": 3.666406622178421e-06, + "logits/chosen": -0.4511711001396179, + "logits/rejected": -0.5527623295783997, + "logps/chosen": -53.85557174682617, + "logps/rejected": -105.18830871582031, + "loss": 0.5469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.73974609375, + "rewards/margins": 7.233564853668213, + "rewards/rejected": -4.493818759918213, + "step": 11710 + }, + { + "epoch": 2.93, + "grad_norm": 4.346320629119873, + "learning_rate": 3.665649123856998e-06, + "logits/chosen": -0.4870210886001587, + "logits/rejected": -0.5822834372520447, + "logps/chosen": -54.3155403137207, + "logps/rejected": -82.51094055175781, + "loss": 0.6278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085519313812256, + "rewards/margins": 5.694502353668213, + "rewards/rejected": -2.608982801437378, + "step": 11711 + }, + { + "epoch": 2.93, + "grad_norm": 4.684399604797363, + "learning_rate": 3.6648916585088066e-06, + "logits/chosen": -0.38834112882614136, + "logits/rejected": -0.4099346995353699, + "logps/chosen": -55.761436462402344, + "logps/rejected": -102.0496597290039, + "loss": 0.6444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.409278392791748, + "rewards/margins": 5.195616245269775, + "rewards/rejected": -1.7863373756408691, + "step": 11712 + }, + { + "epoch": 2.93, + "grad_norm": 3.3173131942749023, + "learning_rate": 3.6641342261525604e-06, + "logits/chosen": -0.4923263192176819, + "logits/rejected": -0.5886238217353821, + "logps/chosen": -48.637046813964844, + "logps/rejected": -92.69032287597656, + "loss": 0.6209, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9907968044281006, + "rewards/margins": 6.083831787109375, + "rewards/rejected": -3.0930352210998535, + "step": 11713 + }, + { + "epoch": 2.93, + "grad_norm": 3.201647996902466, + "learning_rate": 3.663376826806978e-06, + "logits/chosen": -0.458545058965683, + "logits/rejected": -0.5270987749099731, + "logps/chosen": -51.22452163696289, + "logps/rejected": -92.67591857910156, + "loss": 0.6285, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4208881855010986, + "rewards/margins": 5.64755916595459, + "rewards/rejected": -2.2266712188720703, + "step": 11714 + }, + { + "epoch": 2.93, + "grad_norm": 3.6318576335906982, + "learning_rate": 3.6626194604907766e-06, + "logits/chosen": -0.45274460315704346, + "logits/rejected": -0.5061867237091064, + "logps/chosen": -53.12925338745117, + "logps/rejected": -103.78934478759766, + "loss": 0.5576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.315065622329712, + "rewards/margins": 6.496175289154053, + "rewards/rejected": -3.1811091899871826, + "step": 11715 + }, + { + "epoch": 2.93, + "grad_norm": 7.9719977378845215, + "learning_rate": 3.661862127222672e-06, + "logits/chosen": -0.463735431432724, + "logits/rejected": -0.5381723642349243, + "logps/chosen": -51.22868347167969, + "logps/rejected": -109.40494537353516, + "loss": 0.6143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.773897171020508, + "rewards/margins": 5.7099151611328125, + "rewards/rejected": -2.936018705368042, + "step": 11716 + }, + { + "epoch": 2.93, + "grad_norm": 16.185138702392578, + "learning_rate": 3.661104827021376e-06, + "logits/chosen": -0.46643540263175964, + "logits/rejected": -0.533877968788147, + "logps/chosen": -55.99227523803711, + "logps/rejected": -101.53475952148438, + "loss": 0.6524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0471959114074707, + "rewards/margins": 5.963162422180176, + "rewards/rejected": -2.915966510772705, + "step": 11717 + }, + { + "epoch": 2.93, + "grad_norm": 5.577203750610352, + "learning_rate": 3.660347559905604e-06, + "logits/chosen": -0.4931679964065552, + "logits/rejected": -0.650189220905304, + "logps/chosen": -70.94715881347656, + "logps/rejected": -82.17427062988281, + "loss": 0.7573, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4536852836608887, + "rewards/margins": 6.2109222412109375, + "rewards/rejected": -2.7572367191314697, + "step": 11718 + }, + { + "epoch": 2.93, + "grad_norm": 4.693493843078613, + "learning_rate": 3.65959032589407e-06, + "logits/chosen": -0.4435218572616577, + "logits/rejected": -0.5225928425788879, + "logps/chosen": -52.70254898071289, + "logps/rejected": -100.453857421875, + "loss": 0.6363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9479191303253174, + "rewards/margins": 6.8946990966796875, + "rewards/rejected": -3.9467790126800537, + "step": 11719 + }, + { + "epoch": 2.93, + "grad_norm": 11.03332233428955, + "learning_rate": 3.658833125005484e-06, + "logits/chosen": -0.5076566934585571, + "logits/rejected": -0.6369069814682007, + "logps/chosen": -55.655967712402344, + "logps/rejected": -90.74246978759766, + "loss": 0.7423, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2586522102355957, + "rewards/margins": 6.119302749633789, + "rewards/rejected": -2.8606507778167725, + "step": 11720 + }, + { + "epoch": 2.93, + "grad_norm": 4.371381759643555, + "learning_rate": 3.6580759572585573e-06, + "logits/chosen": -0.4833909273147583, + "logits/rejected": -0.6160876154899597, + "logps/chosen": -64.59656524658203, + "logps/rejected": -89.60377502441406, + "loss": 0.6924, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8551688194274902, + "rewards/margins": 5.67987585067749, + "rewards/rejected": -2.824706554412842, + "step": 11721 + }, + { + "epoch": 2.93, + "grad_norm": 4.208666801452637, + "learning_rate": 3.6573188226720004e-06, + "logits/chosen": -0.45373186469078064, + "logits/rejected": -0.5318788290023804, + "logps/chosen": -43.53279113769531, + "logps/rejected": -102.76690673828125, + "loss": 0.5592, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2187647819519043, + "rewards/margins": 6.496835708618164, + "rewards/rejected": -3.2780709266662598, + "step": 11722 + }, + { + "epoch": 2.93, + "grad_norm": 3.88177752494812, + "learning_rate": 3.6565617212645265e-06, + "logits/chosen": -0.5358771681785583, + "logits/rejected": -0.6308133006095886, + "logps/chosen": -51.75339126586914, + "logps/rejected": -113.88348388671875, + "loss": 0.5487, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0869102478027344, + "rewards/margins": 7.5543012619018555, + "rewards/rejected": -4.4673895835876465, + "step": 11723 + }, + { + "epoch": 2.93, + "grad_norm": 1.741214394569397, + "learning_rate": 3.655804653054839e-06, + "logits/chosen": -0.43247494101524353, + "logits/rejected": -0.5813413858413696, + "logps/chosen": -51.48614501953125, + "logps/rejected": -85.06019592285156, + "loss": 0.5344, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0808708667755127, + "rewards/margins": 7.0893778800964355, + "rewards/rejected": -4.008507251739502, + "step": 11724 + }, + { + "epoch": 2.93, + "grad_norm": 4.020658493041992, + "learning_rate": 3.655047618061648e-06, + "logits/chosen": -0.46759364008903503, + "logits/rejected": -0.6009487509727478, + "logps/chosen": -61.02235412597656, + "logps/rejected": -93.1649169921875, + "loss": 0.6349, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8115358352661133, + "rewards/margins": 6.143957138061523, + "rewards/rejected": -3.3324215412139893, + "step": 11725 + }, + { + "epoch": 2.93, + "grad_norm": 2.567281484603882, + "learning_rate": 3.6542906163036636e-06, + "logits/chosen": -0.48077479004859924, + "logits/rejected": -0.5549013614654541, + "logps/chosen": -41.966583251953125, + "logps/rejected": -101.30953979492188, + "loss": 0.5142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3377015590667725, + "rewards/margins": 7.011256217956543, + "rewards/rejected": -3.6735544204711914, + "step": 11726 + }, + { + "epoch": 2.93, + "grad_norm": 12.176057815551758, + "learning_rate": 3.6535336477995864e-06, + "logits/chosen": -0.46546274423599243, + "logits/rejected": -0.5595364570617676, + "logps/chosen": -61.29841613769531, + "logps/rejected": -94.34400177001953, + "loss": 0.7611, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.928457736968994, + "rewards/margins": 6.76994514465332, + "rewards/rejected": -3.8414878845214844, + "step": 11727 + }, + { + "epoch": 2.93, + "grad_norm": 8.631853103637695, + "learning_rate": 3.6527767125681255e-06, + "logits/chosen": -0.4930482506752014, + "logits/rejected": -0.5836203694343567, + "logps/chosen": -52.78144836425781, + "logps/rejected": -88.04736328125, + "loss": 0.6553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3722214698791504, + "rewards/margins": 5.526936054229736, + "rewards/rejected": -2.154714584350586, + "step": 11728 + }, + { + "epoch": 2.93, + "grad_norm": 18.004148483276367, + "learning_rate": 3.652019810627986e-06, + "logits/chosen": -0.5690516233444214, + "logits/rejected": -0.6836086511611938, + "logps/chosen": -50.06439971923828, + "logps/rejected": -88.80148315429688, + "loss": 0.7182, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5200154781341553, + "rewards/margins": 6.550182342529297, + "rewards/rejected": -4.0301666259765625, + "step": 11729 + }, + { + "epoch": 2.93, + "grad_norm": 5.74094820022583, + "learning_rate": 3.6512629419978707e-06, + "logits/chosen": -0.4975508451461792, + "logits/rejected": -0.5947408676147461, + "logps/chosen": -64.83506774902344, + "logps/rejected": -97.71282958984375, + "loss": 0.6181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.025015115737915, + "rewards/margins": 6.4408721923828125, + "rewards/rejected": -3.41585636138916, + "step": 11730 + }, + { + "epoch": 2.93, + "grad_norm": 7.197531223297119, + "learning_rate": 3.650506106696482e-06, + "logits/chosen": -0.5015904307365417, + "logits/rejected": -0.5972563624382019, + "logps/chosen": -52.4937858581543, + "logps/rejected": -100.34479522705078, + "loss": 0.6735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.112121105194092, + "rewards/margins": 7.530149936676025, + "rewards/rejected": -4.418028354644775, + "step": 11731 + }, + { + "epoch": 2.93, + "grad_norm": 3.327430248260498, + "learning_rate": 3.649749304742523e-06, + "logits/chosen": -0.4569999575614929, + "logits/rejected": -0.5464923977851868, + "logps/chosen": -54.19514465332031, + "logps/rejected": -100.57317352294922, + "loss": 0.5141, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1456172466278076, + "rewards/margins": 6.41727876663208, + "rewards/rejected": -3.2716612815856934, + "step": 11732 + }, + { + "epoch": 2.94, + "grad_norm": 6.047255516052246, + "learning_rate": 3.648992536154695e-06, + "logits/chosen": -0.5315481424331665, + "logits/rejected": -0.6101245880126953, + "logps/chosen": -48.88434982299805, + "logps/rejected": -100.63811492919922, + "loss": 0.6482, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8617658615112305, + "rewards/margins": 6.936201572418213, + "rewards/rejected": -4.074435710906982, + "step": 11733 + }, + { + "epoch": 2.94, + "grad_norm": 26.43478775024414, + "learning_rate": 3.6482358009516995e-06, + "logits/chosen": -0.47395768761634827, + "logits/rejected": -0.5552101731300354, + "logps/chosen": -51.07157516479492, + "logps/rejected": -94.90180969238281, + "loss": 0.7311, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.02728533744812, + "rewards/margins": 6.451455593109131, + "rewards/rejected": -3.424170732498169, + "step": 11734 + }, + { + "epoch": 2.94, + "grad_norm": 11.354137420654297, + "learning_rate": 3.6474790991522333e-06, + "logits/chosen": -0.4812803864479065, + "logits/rejected": -0.5278360247612, + "logps/chosen": -47.03266906738281, + "logps/rejected": -103.10890197753906, + "loss": 0.5844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4307215213775635, + "rewards/margins": 7.003335952758789, + "rewards/rejected": -3.572614908218384, + "step": 11735 + }, + { + "epoch": 2.94, + "grad_norm": 6.4720778465271, + "learning_rate": 3.646722430774997e-06, + "logits/chosen": -0.5211215615272522, + "logits/rejected": -0.6198546886444092, + "logps/chosen": -51.02018737792969, + "logps/rejected": -90.90200805664062, + "loss": 0.6598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031228542327881, + "rewards/margins": 6.227983474731445, + "rewards/rejected": -3.1967549324035645, + "step": 11736 + }, + { + "epoch": 2.94, + "grad_norm": 14.850797653198242, + "learning_rate": 3.645965795838692e-06, + "logits/chosen": -0.5330935716629028, + "logits/rejected": -0.6285043954849243, + "logps/chosen": -54.243446350097656, + "logps/rejected": -89.75397491455078, + "loss": 0.6931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.199516773223877, + "rewards/margins": 6.088930130004883, + "rewards/rejected": -2.889413356781006, + "step": 11737 + }, + { + "epoch": 2.94, + "grad_norm": 14.818500518798828, + "learning_rate": 3.6452091943620095e-06, + "logits/chosen": -0.503915548324585, + "logits/rejected": -0.5915250182151794, + "logps/chosen": -58.31561279296875, + "logps/rejected": -107.98130798339844, + "loss": 0.6956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.981943130493164, + "rewards/margins": 7.651941299438477, + "rewards/rejected": -4.669997692108154, + "step": 11738 + }, + { + "epoch": 2.94, + "grad_norm": 11.697234153747559, + "learning_rate": 3.644452626363648e-06, + "logits/chosen": -0.40655839443206787, + "logits/rejected": -0.5364609360694885, + "logps/chosen": -70.95182037353516, + "logps/rejected": -91.24197387695312, + "loss": 0.9074, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.621903419494629, + "rewards/margins": 5.712807655334473, + "rewards/rejected": -3.0909039974212646, + "step": 11739 + }, + { + "epoch": 2.94, + "grad_norm": 25.861631393432617, + "learning_rate": 3.6436960918623056e-06, + "logits/chosen": -0.4877771735191345, + "logits/rejected": -0.5506237745285034, + "logps/chosen": -53.35902786254883, + "logps/rejected": -108.72471618652344, + "loss": 0.8841, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9306674003601074, + "rewards/margins": 6.633626461029053, + "rewards/rejected": -3.702958822250366, + "step": 11740 + }, + { + "epoch": 2.94, + "grad_norm": 8.915190696716309, + "learning_rate": 3.6429395908766763e-06, + "logits/chosen": -0.4921998977661133, + "logits/rejected": -0.577084481716156, + "logps/chosen": -57.01506042480469, + "logps/rejected": -83.89129638671875, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1967222690582275, + "rewards/margins": 6.004758834838867, + "rewards/rejected": -2.8080363273620605, + "step": 11741 + }, + { + "epoch": 2.94, + "grad_norm": 9.39976692199707, + "learning_rate": 3.642183123425452e-06, + "logits/chosen": -0.48226264119148254, + "logits/rejected": -0.5198299884796143, + "logps/chosen": -60.321441650390625, + "logps/rejected": -108.32807922363281, + "loss": 0.6597, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.17366361618042, + "rewards/margins": 6.183376312255859, + "rewards/rejected": -3.0097126960754395, + "step": 11742 + }, + { + "epoch": 2.94, + "grad_norm": 12.737215995788574, + "learning_rate": 3.6414266895273264e-06, + "logits/chosen": -0.4968233108520508, + "logits/rejected": -0.5648149251937866, + "logps/chosen": -52.56306457519531, + "logps/rejected": -80.78616333007812, + "loss": 0.6978, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.177595615386963, + "rewards/margins": 5.029869079589844, + "rewards/rejected": -1.8522741794586182, + "step": 11743 + }, + { + "epoch": 2.94, + "grad_norm": 3.5751514434814453, + "learning_rate": 3.6406702892009936e-06, + "logits/chosen": -0.4245372414588928, + "logits/rejected": -0.5385138392448425, + "logps/chosen": -60.9826545715332, + "logps/rejected": -98.8790283203125, + "loss": 0.6677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9868478775024414, + "rewards/margins": 6.738544464111328, + "rewards/rejected": -3.7516963481903076, + "step": 11744 + }, + { + "epoch": 2.94, + "grad_norm": 5.256040573120117, + "learning_rate": 3.6399139224651435e-06, + "logits/chosen": -0.5177908539772034, + "logits/rejected": -0.6202362775802612, + "logps/chosen": -50.709449768066406, + "logps/rejected": -94.2232666015625, + "loss": 0.6438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.868821144104004, + "rewards/margins": 7.1634602546691895, + "rewards/rejected": -4.2946391105651855, + "step": 11745 + }, + { + "epoch": 2.94, + "grad_norm": 10.19808578491211, + "learning_rate": 3.6391575893384668e-06, + "logits/chosen": -0.46689391136169434, + "logits/rejected": -0.5346847176551819, + "logps/chosen": -56.4981803894043, + "logps/rejected": -99.22190856933594, + "loss": 0.6479, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.218255043029785, + "rewards/margins": 6.0590925216674805, + "rewards/rejected": -2.8408379554748535, + "step": 11746 + }, + { + "epoch": 2.94, + "grad_norm": 2.6207799911499023, + "learning_rate": 3.6384012898396524e-06, + "logits/chosen": -0.4921315908432007, + "logits/rejected": -0.5797409415245056, + "logps/chosen": -49.548336029052734, + "logps/rejected": -110.28872680664062, + "loss": 0.5971, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4095191955566406, + "rewards/margins": 7.833898067474365, + "rewards/rejected": -4.424378395080566, + "step": 11747 + }, + { + "epoch": 2.94, + "grad_norm": 4.143277168273926, + "learning_rate": 3.6376450239873927e-06, + "logits/chosen": -0.47511026263237, + "logits/rejected": -0.5355110764503479, + "logps/chosen": -50.28318405151367, + "logps/rejected": -110.26628875732422, + "loss": 0.6586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114919424057007, + "rewards/margins": 5.769491672515869, + "rewards/rejected": -2.654572010040283, + "step": 11748 + }, + { + "epoch": 2.94, + "grad_norm": 3.3755009174346924, + "learning_rate": 3.636888791800372e-06, + "logits/chosen": -0.49539464712142944, + "logits/rejected": -0.5119394659996033, + "logps/chosen": -45.644798278808594, + "logps/rejected": -96.75325775146484, + "loss": 0.5779, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.361356258392334, + "rewards/margins": 5.909933090209961, + "rewards/rejected": -2.5485761165618896, + "step": 11749 + }, + { + "epoch": 2.94, + "grad_norm": 15.431297302246094, + "learning_rate": 3.636132593297279e-06, + "logits/chosen": -0.5563032627105713, + "logits/rejected": -0.6340553760528564, + "logps/chosen": -51.19295120239258, + "logps/rejected": -74.84777069091797, + "loss": 0.7699, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0946738719940186, + "rewards/margins": 5.266472816467285, + "rewards/rejected": -2.1717984676361084, + "step": 11750 + }, + { + "epoch": 2.94, + "grad_norm": 3.14121413230896, + "learning_rate": 3.6353764284968018e-06, + "logits/chosen": -0.5165770053863525, + "logits/rejected": -0.6423704624176025, + "logps/chosen": -59.94155502319336, + "logps/rejected": -111.66317749023438, + "loss": 0.6812, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.940284013748169, + "rewards/margins": 6.7455034255981445, + "rewards/rejected": -3.805219888687134, + "step": 11751 + }, + { + "epoch": 2.94, + "grad_norm": 6.548205852508545, + "learning_rate": 3.634620297417623e-06, + "logits/chosen": -0.43750137090682983, + "logits/rejected": -0.5032941699028015, + "logps/chosen": -48.1218376159668, + "logps/rejected": -104.74044799804688, + "loss": 0.5764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2389895915985107, + "rewards/margins": 7.370625972747803, + "rewards/rejected": -4.131635665893555, + "step": 11752 + }, + { + "epoch": 2.94, + "grad_norm": 3.2448880672454834, + "learning_rate": 3.6338642000784287e-06, + "logits/chosen": -0.43630075454711914, + "logits/rejected": -0.5736517310142517, + "logps/chosen": -70.14592742919922, + "logps/rejected": -101.11421966552734, + "loss": 0.6177, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.12522029876709, + "rewards/margins": 7.369649887084961, + "rewards/rejected": -4.244429588317871, + "step": 11753 + }, + { + "epoch": 2.94, + "grad_norm": 7.835289001464844, + "learning_rate": 3.633108136497903e-06, + "logits/chosen": -0.579474687576294, + "logits/rejected": -0.5528228282928467, + "logps/chosen": -48.089332580566406, + "logps/rejected": -102.57106018066406, + "loss": 0.7326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078523635864258, + "rewards/margins": 5.660549640655518, + "rewards/rejected": -2.5820260047912598, + "step": 11754 + }, + { + "epoch": 2.94, + "grad_norm": 7.732363224029541, + "learning_rate": 3.632352106694732e-06, + "logits/chosen": -0.43493539094924927, + "logits/rejected": -0.548305332660675, + "logps/chosen": -52.0692253112793, + "logps/rejected": -85.27682495117188, + "loss": 0.6507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9808969497680664, + "rewards/margins": 5.34061861038208, + "rewards/rejected": -2.3597216606140137, + "step": 11755 + }, + { + "epoch": 2.94, + "grad_norm": 25.091909408569336, + "learning_rate": 3.631596110687592e-06, + "logits/chosen": -0.4596581757068634, + "logits/rejected": -0.5734624862670898, + "logps/chosen": -59.61808776855469, + "logps/rejected": -84.19577026367188, + "loss": 0.7596, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9613289833068848, + "rewards/margins": 6.3689374923706055, + "rewards/rejected": -3.4076087474823, + "step": 11756 + }, + { + "epoch": 2.94, + "grad_norm": 2.5842132568359375, + "learning_rate": 3.6308401484951682e-06, + "logits/chosen": -0.524296224117279, + "logits/rejected": -0.6187814474105835, + "logps/chosen": -51.97948455810547, + "logps/rejected": -109.47150421142578, + "loss": 0.64, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3142812252044678, + "rewards/margins": 7.360271453857422, + "rewards/rejected": -4.045990467071533, + "step": 11757 + }, + { + "epoch": 2.94, + "grad_norm": 3.2965362071990967, + "learning_rate": 3.6300842201361408e-06, + "logits/chosen": -0.4931849539279938, + "logits/rejected": -0.5841381549835205, + "logps/chosen": -62.85110855102539, + "logps/rejected": -101.9228744506836, + "loss": 0.6661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8617703914642334, + "rewards/margins": 6.664786338806152, + "rewards/rejected": -3.803016424179077, + "step": 11758 + }, + { + "epoch": 2.94, + "grad_norm": 10.255128860473633, + "learning_rate": 3.6293283256291908e-06, + "logits/chosen": -0.5142220854759216, + "logits/rejected": -0.5906760692596436, + "logps/chosen": -51.68611145019531, + "logps/rejected": -105.85407257080078, + "loss": 0.7303, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7109878063201904, + "rewards/margins": 5.711967468261719, + "rewards/rejected": -3.0009799003601074, + "step": 11759 + }, + { + "epoch": 2.94, + "grad_norm": 13.74532699584961, + "learning_rate": 3.6285724649929944e-06, + "logits/chosen": -0.4795583486557007, + "logits/rejected": -0.5358856320381165, + "logps/chosen": -64.85413360595703, + "logps/rejected": -97.86700439453125, + "loss": 0.7576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7222213745117188, + "rewards/margins": 5.324658393859863, + "rewards/rejected": -2.6024367809295654, + "step": 11760 + }, + { + "epoch": 2.94, + "grad_norm": 5.426124572753906, + "learning_rate": 3.627816638246231e-06, + "logits/chosen": -0.45169979333877563, + "logits/rejected": -0.5338175296783447, + "logps/chosen": -62.099082946777344, + "logps/rejected": -89.31279754638672, + "loss": 0.6852, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.961704969406128, + "rewards/margins": 5.352856636047363, + "rewards/rejected": -2.3911519050598145, + "step": 11761 + }, + { + "epoch": 2.94, + "grad_norm": 10.629544258117676, + "learning_rate": 3.6270608454075797e-06, + "logits/chosen": -0.5545381307601929, + "logits/rejected": -0.5983439087867737, + "logps/chosen": -58.971458435058594, + "logps/rejected": -110.03910827636719, + "loss": 0.738, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.941643714904785, + "rewards/margins": 6.170084476470947, + "rewards/rejected": -3.228440999984741, + "step": 11762 + }, + { + "epoch": 2.94, + "grad_norm": 11.302629470825195, + "learning_rate": 3.626305086495713e-06, + "logits/chosen": -0.4411616325378418, + "logits/rejected": -0.5709605813026428, + "logps/chosen": -70.92656707763672, + "logps/rejected": -86.15139770507812, + "loss": 0.7456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.524252414703369, + "rewards/margins": 4.455562591552734, + "rewards/rejected": -1.9313099384307861, + "step": 11763 + }, + { + "epoch": 2.94, + "grad_norm": 6.3367085456848145, + "learning_rate": 3.6255493615293097e-06, + "logits/chosen": -0.5947359800338745, + "logits/rejected": -0.6262730956077576, + "logps/chosen": -59.581321716308594, + "logps/rejected": -104.9319076538086, + "loss": 0.7553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.030003786087036, + "rewards/margins": 5.570869445800781, + "rewards/rejected": -2.540865182876587, + "step": 11764 + }, + { + "epoch": 2.94, + "grad_norm": 6.43660831451416, + "learning_rate": 3.6247936705270427e-06, + "logits/chosen": -0.47500625252723694, + "logits/rejected": -0.5339869260787964, + "logps/chosen": -70.43587493896484, + "logps/rejected": -110.50641632080078, + "loss": 0.6658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6818902492523193, + "rewards/margins": 6.106055736541748, + "rewards/rejected": -3.424165725708008, + "step": 11765 + }, + { + "epoch": 2.94, + "grad_norm": 11.716222763061523, + "learning_rate": 3.6240380135075892e-06, + "logits/chosen": -0.543583869934082, + "logits/rejected": -0.6333275437355042, + "logps/chosen": -58.2862548828125, + "logps/rejected": -89.92809295654297, + "loss": 0.655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.879282236099243, + "rewards/margins": 6.280343532562256, + "rewards/rejected": -3.401061773300171, + "step": 11766 + }, + { + "epoch": 2.94, + "grad_norm": 10.710941314697266, + "learning_rate": 3.623282390489618e-06, + "logits/chosen": -0.5502915382385254, + "logits/rejected": -0.6276278495788574, + "logps/chosen": -52.25796127319336, + "logps/rejected": -103.29615783691406, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9208555221557617, + "rewards/margins": 6.5842790603637695, + "rewards/rejected": -3.663423776626587, + "step": 11767 + }, + { + "epoch": 2.94, + "grad_norm": 10.038874626159668, + "learning_rate": 3.6225268014918023e-06, + "logits/chosen": -0.5442000031471252, + "logits/rejected": -0.6619818806648254, + "logps/chosen": -53.753726959228516, + "logps/rejected": -90.60459899902344, + "loss": 0.6202, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.900907039642334, + "rewards/margins": 7.402599334716797, + "rewards/rejected": -4.501692295074463, + "step": 11768 + }, + { + "epoch": 2.94, + "grad_norm": 7.814568519592285, + "learning_rate": 3.621771246532818e-06, + "logits/chosen": -0.4862247109413147, + "logits/rejected": -0.5307925343513489, + "logps/chosen": -54.16783142089844, + "logps/rejected": -123.43215942382812, + "loss": 0.6657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2309305667877197, + "rewards/margins": 7.78421688079834, + "rewards/rejected": -4.553285598754883, + "step": 11769 + }, + { + "epoch": 2.94, + "grad_norm": 4.1383795738220215, + "learning_rate": 3.6210157256313287e-06, + "logits/chosen": -0.46426382660865784, + "logits/rejected": -0.5383532047271729, + "logps/chosen": -56.60890579223633, + "logps/rejected": -101.6572265625, + "loss": 0.6004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.143420457839966, + "rewards/margins": 6.543649196624756, + "rewards/rejected": -3.4002292156219482, + "step": 11770 + }, + { + "epoch": 2.94, + "grad_norm": 10.243009567260742, + "learning_rate": 3.620260238806009e-06, + "logits/chosen": -0.46925899386405945, + "logits/rejected": -0.5478420257568359, + "logps/chosen": -64.22991943359375, + "logps/rejected": -99.09683990478516, + "loss": 0.8891, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.95816969871521, + "rewards/margins": 5.596969127655029, + "rewards/rejected": -2.6387994289398193, + "step": 11771 + }, + { + "epoch": 2.94, + "grad_norm": 15.398260116577148, + "learning_rate": 3.619504786075526e-06, + "logits/chosen": -0.48437735438346863, + "logits/rejected": -0.5940176248550415, + "logps/chosen": -57.809017181396484, + "logps/rejected": -91.86839294433594, + "loss": 0.7095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9227561950683594, + "rewards/margins": 5.895167350769043, + "rewards/rejected": -2.9724111557006836, + "step": 11772 + }, + { + "epoch": 2.95, + "grad_norm": 36.13539505004883, + "learning_rate": 3.6187493674585487e-06, + "logits/chosen": -0.46181535720825195, + "logits/rejected": -0.5263193249702454, + "logps/chosen": -55.178707122802734, + "logps/rejected": -106.24903869628906, + "loss": 0.6787, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7921342849731445, + "rewards/margins": 7.085801124572754, + "rewards/rejected": -4.293666839599609, + "step": 11773 + }, + { + "epoch": 2.95, + "grad_norm": 2.67865252494812, + "learning_rate": 3.617993982973743e-06, + "logits/chosen": -0.4802214503288269, + "logits/rejected": -0.5954115390777588, + "logps/chosen": -60.43247985839844, + "logps/rejected": -93.7772445678711, + "loss": 0.5387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.07613468170166, + "rewards/margins": 7.049551010131836, + "rewards/rejected": -3.973416328430176, + "step": 11774 + }, + { + "epoch": 2.95, + "grad_norm": 24.500038146972656, + "learning_rate": 3.617238632639777e-06, + "logits/chosen": -0.5747420787811279, + "logits/rejected": -0.602250874042511, + "logps/chosen": -58.16706085205078, + "logps/rejected": -103.95870971679688, + "loss": 0.8072, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9872076511383057, + "rewards/margins": 5.77830171585083, + "rewards/rejected": -2.7910943031311035, + "step": 11775 + }, + { + "epoch": 2.95, + "grad_norm": 40.47633361816406, + "learning_rate": 3.6164833164753143e-06, + "logits/chosen": -0.46095749735832214, + "logits/rejected": -0.5394070148468018, + "logps/chosen": -56.413307189941406, + "logps/rejected": -112.46392059326172, + "loss": 0.7252, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0617117881774902, + "rewards/margins": 5.330611228942871, + "rewards/rejected": -2.2688989639282227, + "step": 11776 + }, + { + "epoch": 2.95, + "grad_norm": 6.6550798416137695, + "learning_rate": 3.6157280344990208e-06, + "logits/chosen": -0.5004379749298096, + "logits/rejected": -0.6169770956039429, + "logps/chosen": -50.53377914428711, + "logps/rejected": -82.309814453125, + "loss": 0.6007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2116217613220215, + "rewards/margins": 6.718412399291992, + "rewards/rejected": -3.5067903995513916, + "step": 11777 + }, + { + "epoch": 2.95, + "grad_norm": 4.909216403961182, + "learning_rate": 3.614972786729559e-06, + "logits/chosen": -0.440437912940979, + "logits/rejected": -0.5429604053497314, + "logps/chosen": -55.45404052734375, + "logps/rejected": -113.41004943847656, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.433582305908203, + "rewards/margins": 7.811499118804932, + "rewards/rejected": -4.3779168128967285, + "step": 11778 + }, + { + "epoch": 2.95, + "grad_norm": 7.739800453186035, + "learning_rate": 3.614217573185594e-06, + "logits/chosen": -0.46165597438812256, + "logits/rejected": -0.5572918057441711, + "logps/chosen": -48.46659851074219, + "logps/rejected": -106.2272720336914, + "loss": 0.6977, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.065647602081299, + "rewards/margins": 7.421415328979492, + "rewards/rejected": -4.355767250061035, + "step": 11779 + }, + { + "epoch": 2.95, + "grad_norm": 5.706618309020996, + "learning_rate": 3.6134623938857873e-06, + "logits/chosen": -0.4248853623867035, + "logits/rejected": -0.44612669944763184, + "logps/chosen": -56.76643371582031, + "logps/rejected": -104.90989685058594, + "loss": 0.7522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8222808837890625, + "rewards/margins": 5.678217887878418, + "rewards/rejected": -2.8559372425079346, + "step": 11780 + }, + { + "epoch": 2.95, + "grad_norm": 4.675431251525879, + "learning_rate": 3.612707248848798e-06, + "logits/chosen": -0.5201742053031921, + "logits/rejected": -0.6220152378082275, + "logps/chosen": -56.41318130493164, + "logps/rejected": -87.562255859375, + "loss": 0.6464, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9084057807922363, + "rewards/margins": 6.0877461433410645, + "rewards/rejected": -3.17933988571167, + "step": 11781 + }, + { + "epoch": 2.95, + "grad_norm": 4.170663833618164, + "learning_rate": 3.6119521380932877e-06, + "logits/chosen": -0.5791335701942444, + "logits/rejected": -0.6446360349655151, + "logps/chosen": -48.00688934326172, + "logps/rejected": -97.84469604492188, + "loss": 0.6629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8885040283203125, + "rewards/margins": 6.401456832885742, + "rewards/rejected": -3.5129525661468506, + "step": 11782 + }, + { + "epoch": 2.95, + "grad_norm": 4.488561630249023, + "learning_rate": 3.611197061637917e-06, + "logits/chosen": -0.49920427799224854, + "logits/rejected": -0.622346043586731, + "logps/chosen": -49.55727767944336, + "logps/rejected": -96.70037841796875, + "loss": 0.6645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3436880111694336, + "rewards/margins": 7.108048915863037, + "rewards/rejected": -3.7643604278564453, + "step": 11783 + }, + { + "epoch": 2.95, + "grad_norm": 4.862210750579834, + "learning_rate": 3.6104420195013468e-06, + "logits/chosen": -0.45285776257514954, + "logits/rejected": -0.5255171656608582, + "logps/chosen": -41.158203125, + "logps/rejected": -80.1269302368164, + "loss": 0.5486, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9145724773406982, + "rewards/margins": 5.811694622039795, + "rewards/rejected": -2.897122383117676, + "step": 11784 + }, + { + "epoch": 2.95, + "grad_norm": 4.1494598388671875, + "learning_rate": 3.6096870117022288e-06, + "logits/chosen": -0.4326798915863037, + "logits/rejected": -0.5442878007888794, + "logps/chosen": -50.992889404296875, + "logps/rejected": -95.89288330078125, + "loss": 0.5625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1007590293884277, + "rewards/margins": 7.053746223449707, + "rewards/rejected": -3.9529869556427, + "step": 11785 + }, + { + "epoch": 2.95, + "grad_norm": 7.8528265953063965, + "learning_rate": 3.6089320382592245e-06, + "logits/chosen": -0.46625959873199463, + "logits/rejected": -0.5690242052078247, + "logps/chosen": -57.069602966308594, + "logps/rejected": -102.56437683105469, + "loss": 0.6802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.962256908416748, + "rewards/margins": 6.425213813781738, + "rewards/rejected": -3.4629569053649902, + "step": 11786 + }, + { + "epoch": 2.95, + "grad_norm": 5.940094470977783, + "learning_rate": 3.6081770991909903e-06, + "logits/chosen": -0.46767234802246094, + "logits/rejected": -0.5537983179092407, + "logps/chosen": -46.047340393066406, + "logps/rejected": -104.33533477783203, + "loss": 0.6437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8600804805755615, + "rewards/margins": 7.702596187591553, + "rewards/rejected": -4.842515468597412, + "step": 11787 + }, + { + "epoch": 2.95, + "grad_norm": 5.155792236328125, + "learning_rate": 3.6074221945161796e-06, + "logits/chosen": -0.5470560789108276, + "logits/rejected": -0.5991092920303345, + "logps/chosen": -55.44147872924805, + "logps/rejected": -101.52070617675781, + "loss": 0.6169, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3093466758728027, + "rewards/margins": 7.040220260620117, + "rewards/rejected": -3.7308735847473145, + "step": 11788 + }, + { + "epoch": 2.95, + "grad_norm": 15.103896141052246, + "learning_rate": 3.6066673242534477e-06, + "logits/chosen": -0.5165925025939941, + "logits/rejected": -0.5629562139511108, + "logps/chosen": -54.630714416503906, + "logps/rejected": -104.15019989013672, + "loss": 0.7668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8618390560150146, + "rewards/margins": 6.696269989013672, + "rewards/rejected": -3.834430694580078, + "step": 11789 + }, + { + "epoch": 2.95, + "grad_norm": 3.292710542678833, + "learning_rate": 3.605912488421448e-06, + "logits/chosen": -0.4524911344051361, + "logits/rejected": -0.5498236417770386, + "logps/chosen": -67.74191284179688, + "logps/rejected": -101.9798812866211, + "loss": 0.6286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0133135318756104, + "rewards/margins": 6.905207633972168, + "rewards/rejected": -3.8918938636779785, + "step": 11790 + }, + { + "epoch": 2.95, + "grad_norm": 6.087648868560791, + "learning_rate": 3.6051576870388345e-06, + "logits/chosen": -0.41474711894989014, + "logits/rejected": -0.45383375883102417, + "logps/chosen": -45.79996871948242, + "logps/rejected": -97.49614715576172, + "loss": 0.6659, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2878077030181885, + "rewards/margins": 4.41700553894043, + "rewards/rejected": -1.1291978359222412, + "step": 11791 + }, + { + "epoch": 2.95, + "grad_norm": 4.888400554656982, + "learning_rate": 3.604402920124257e-06, + "logits/chosen": -0.48480910062789917, + "logits/rejected": -0.5696582794189453, + "logps/chosen": -50.48279571533203, + "logps/rejected": -101.28623962402344, + "loss": 0.6358, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9822802543640137, + "rewards/margins": 5.755692481994629, + "rewards/rejected": -2.7734122276306152, + "step": 11792 + }, + { + "epoch": 2.95, + "grad_norm": 3.2560672760009766, + "learning_rate": 3.603648187696368e-06, + "logits/chosen": -0.5331637263298035, + "logits/rejected": -0.6305686831474304, + "logps/chosen": -50.74272918701172, + "logps/rejected": -83.30865478515625, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.08223819732666, + "rewards/margins": 5.1844635009765625, + "rewards/rejected": -2.102224826812744, + "step": 11793 + }, + { + "epoch": 2.95, + "grad_norm": 3.853034496307373, + "learning_rate": 3.6028934897738186e-06, + "logits/chosen": -0.4946504533290863, + "logits/rejected": -0.5672620534896851, + "logps/chosen": -56.41980743408203, + "logps/rejected": -98.50440979003906, + "loss": 0.619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.125739097595215, + "rewards/margins": 7.068357467651367, + "rewards/rejected": -3.942617654800415, + "step": 11794 + }, + { + "epoch": 2.95, + "grad_norm": 8.975934028625488, + "learning_rate": 3.6021388263752566e-06, + "logits/chosen": -0.4987829327583313, + "logits/rejected": -0.5321637392044067, + "logps/chosen": -52.45115661621094, + "logps/rejected": -96.99964904785156, + "loss": 0.7096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9781086444854736, + "rewards/margins": 6.108162879943848, + "rewards/rejected": -3.130054473876953, + "step": 11795 + }, + { + "epoch": 2.95, + "grad_norm": 4.188623428344727, + "learning_rate": 3.6013841975193287e-06, + "logits/chosen": -0.394553542137146, + "logits/rejected": -0.5097793340682983, + "logps/chosen": -52.96885299682617, + "logps/rejected": -85.6904525756836, + "loss": 0.5701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980499505996704, + "rewards/margins": 6.388372421264648, + "rewards/rejected": -3.4078726768493652, + "step": 11796 + }, + { + "epoch": 2.95, + "grad_norm": 6.220348834991455, + "learning_rate": 3.6006296032246867e-06, + "logits/chosen": -0.530264675617218, + "logits/rejected": -0.5839620232582092, + "logps/chosen": -43.89639663696289, + "logps/rejected": -91.06153869628906, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2001516819000244, + "rewards/margins": 5.501802444458008, + "rewards/rejected": -2.3016510009765625, + "step": 11797 + }, + { + "epoch": 2.95, + "grad_norm": 6.106019973754883, + "learning_rate": 3.5998750435099785e-06, + "logits/chosen": -0.49489879608154297, + "logits/rejected": -0.5317338705062866, + "logps/chosen": -46.49168395996094, + "logps/rejected": -97.81172180175781, + "loss": 0.6723, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3387093544006348, + "rewards/margins": 5.560461521148682, + "rewards/rejected": -2.2217519283294678, + "step": 11798 + }, + { + "epoch": 2.95, + "grad_norm": 16.031667709350586, + "learning_rate": 3.599120518393843e-06, + "logits/chosen": -0.4427681267261505, + "logits/rejected": -0.5641737580299377, + "logps/chosen": -70.75091552734375, + "logps/rejected": -81.59324645996094, + "loss": 0.8092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.792635440826416, + "rewards/margins": 5.42968225479126, + "rewards/rejected": -2.6370468139648438, + "step": 11799 + }, + { + "epoch": 2.95, + "grad_norm": 2.914416790008545, + "learning_rate": 3.598366027894931e-06, + "logits/chosen": -0.49097058176994324, + "logits/rejected": -0.5336048007011414, + "logps/chosen": -43.95823669433594, + "logps/rejected": -95.88490295410156, + "loss": 0.4765, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8902177810668945, + "rewards/margins": 5.99439811706543, + "rewards/rejected": -3.104180097579956, + "step": 11800 + }, + { + "epoch": 2.95, + "grad_norm": 2.457404136657715, + "learning_rate": 3.597611572031886e-06, + "logits/chosen": -0.504143238067627, + "logits/rejected": -0.5797739624977112, + "logps/chosen": -43.65723419189453, + "logps/rejected": -95.33233642578125, + "loss": 0.5413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.043494701385498, + "rewards/margins": 7.0174102783203125, + "rewards/rejected": -3.9739160537719727, + "step": 11801 + }, + { + "epoch": 2.95, + "grad_norm": 5.424077033996582, + "learning_rate": 3.5968571508233495e-06, + "logits/chosen": -0.5129356384277344, + "logits/rejected": -0.659787654876709, + "logps/chosen": -60.41498565673828, + "logps/rejected": -92.5391845703125, + "loss": 0.7206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.829111099243164, + "rewards/margins": 6.271513938903809, + "rewards/rejected": -3.4424033164978027, + "step": 11802 + }, + { + "epoch": 2.95, + "grad_norm": 6.093687534332275, + "learning_rate": 3.5961027642879654e-06, + "logits/chosen": -0.4977831244468689, + "logits/rejected": -0.5870978236198425, + "logps/chosen": -51.04422378540039, + "logps/rejected": -106.20720672607422, + "loss": 0.4991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.19909930229187, + "rewards/margins": 7.062741279602051, + "rewards/rejected": -3.8636417388916016, + "step": 11803 + }, + { + "epoch": 2.95, + "grad_norm": 4.726801872253418, + "learning_rate": 3.5953484124443755e-06, + "logits/chosen": -0.406892329454422, + "logits/rejected": -0.4837602376937866, + "logps/chosen": -54.19699478149414, + "logps/rejected": -90.95714569091797, + "loss": 0.6521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.239126443862915, + "rewards/margins": 5.413115501403809, + "rewards/rejected": -2.1739885807037354, + "step": 11804 + }, + { + "epoch": 2.95, + "grad_norm": 3.597010612487793, + "learning_rate": 3.5945940953112212e-06, + "logits/chosen": -0.4118451178073883, + "logits/rejected": -0.5330575108528137, + "logps/chosen": -56.99250411987305, + "logps/rejected": -83.58432006835938, + "loss": 0.657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2405009269714355, + "rewards/margins": 6.098814964294434, + "rewards/rejected": -2.858314275741577, + "step": 11805 + }, + { + "epoch": 2.95, + "grad_norm": 19.98672103881836, + "learning_rate": 3.593839812907141e-06, + "logits/chosen": -0.47259747982025146, + "logits/rejected": -0.5653954744338989, + "logps/chosen": -51.33428955078125, + "logps/rejected": -88.48786163330078, + "loss": 0.7267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0607166290283203, + "rewards/margins": 5.950603485107422, + "rewards/rejected": -2.8898868560791016, + "step": 11806 + }, + { + "epoch": 2.95, + "grad_norm": 5.391867160797119, + "learning_rate": 3.5930855652507747e-06, + "logits/chosen": -0.5912266373634338, + "logits/rejected": -0.6769319772720337, + "logps/chosen": -43.81623840332031, + "logps/rejected": -97.6646499633789, + "loss": 0.5333, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.173027992248535, + "rewards/margins": 6.842181205749512, + "rewards/rejected": -3.6691529750823975, + "step": 11807 + }, + { + "epoch": 2.95, + "grad_norm": 7.350398540496826, + "learning_rate": 3.592331352360759e-06, + "logits/chosen": -0.5796641111373901, + "logits/rejected": -0.6561462879180908, + "logps/chosen": -59.47579574584961, + "logps/rejected": -101.66004943847656, + "loss": 0.6234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3829050064086914, + "rewards/margins": 7.153041839599609, + "rewards/rejected": -3.770137310028076, + "step": 11808 + }, + { + "epoch": 2.95, + "grad_norm": 8.367411613464355, + "learning_rate": 3.591577174255736e-06, + "logits/chosen": -0.5526660084724426, + "logits/rejected": -0.5870174169540405, + "logps/chosen": -53.18629837036133, + "logps/rejected": -110.6395492553711, + "loss": 0.6869, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7428741455078125, + "rewards/margins": 6.10579252243042, + "rewards/rejected": -3.3629183769226074, + "step": 11809 + }, + { + "epoch": 2.95, + "grad_norm": 5.62495231628418, + "learning_rate": 3.590823030954338e-06, + "logits/chosen": -0.4379279613494873, + "logits/rejected": -0.4961967170238495, + "logps/chosen": -62.87482452392578, + "logps/rejected": -114.82708740234375, + "loss": 0.6841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9754083156585693, + "rewards/margins": 5.797191619873047, + "rewards/rejected": -2.8217830657958984, + "step": 11810 + }, + { + "epoch": 2.95, + "grad_norm": 7.229991912841797, + "learning_rate": 3.5900689224751993e-06, + "logits/chosen": -0.5108452439308167, + "logits/rejected": -0.5939797163009644, + "logps/chosen": -51.07920837402344, + "logps/rejected": -92.68883514404297, + "loss": 0.5969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6982874870300293, + "rewards/margins": 6.394169330596924, + "rewards/rejected": -3.6958818435668945, + "step": 11811 + }, + { + "epoch": 2.95, + "grad_norm": 2.5441701412200928, + "learning_rate": 3.589314848836961e-06, + "logits/chosen": -0.3673850893974304, + "logits/rejected": -0.5119174718856812, + "logps/chosen": -61.504737854003906, + "logps/rejected": -95.1798095703125, + "loss": 0.5712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4032232761383057, + "rewards/margins": 7.330635070800781, + "rewards/rejected": -3.9274113178253174, + "step": 11812 + }, + { + "epoch": 2.96, + "grad_norm": 6.8177809715271, + "learning_rate": 3.58856081005825e-06, + "logits/chosen": -0.4899010956287384, + "logits/rejected": -0.5610292553901672, + "logps/chosen": -60.82745361328125, + "logps/rejected": -96.94732666015625, + "loss": 0.6965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0656280517578125, + "rewards/margins": 5.347349166870117, + "rewards/rejected": -2.281721830368042, + "step": 11813 + }, + { + "epoch": 2.96, + "grad_norm": 15.928549766540527, + "learning_rate": 3.587806806157704e-06, + "logits/chosen": -0.5035815834999084, + "logits/rejected": -0.5479437112808228, + "logps/chosen": -45.544612884521484, + "logps/rejected": -87.68167877197266, + "loss": 0.6426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.181243419647217, + "rewards/margins": 6.015565395355225, + "rewards/rejected": -2.834321975708008, + "step": 11814 + }, + { + "epoch": 2.96, + "grad_norm": 9.002290725708008, + "learning_rate": 3.5870528371539536e-06, + "logits/chosen": -0.5391170382499695, + "logits/rejected": -0.6335370540618896, + "logps/chosen": -52.473731994628906, + "logps/rejected": -82.90254974365234, + "loss": 0.7977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.288925886154175, + "rewards/margins": 5.71075439453125, + "rewards/rejected": -2.421828269958496, + "step": 11815 + }, + { + "epoch": 2.96, + "grad_norm": 14.676897048950195, + "learning_rate": 3.586298903065632e-06, + "logits/chosen": -0.4775984287261963, + "logits/rejected": -0.4872337579727173, + "logps/chosen": -59.41350173950195, + "logps/rejected": -132.59878540039062, + "loss": 0.6992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8102223873138428, + "rewards/margins": 7.1077117919921875, + "rewards/rejected": -4.297490119934082, + "step": 11816 + }, + { + "epoch": 2.96, + "grad_norm": 215.81529235839844, + "learning_rate": 3.5855450039113665e-06, + "logits/chosen": -0.43395811319351196, + "logits/rejected": -0.5645464658737183, + "logps/chosen": -62.75041961669922, + "logps/rejected": -91.01612854003906, + "loss": 0.67, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1276655197143555, + "rewards/margins": 6.152207374572754, + "rewards/rejected": -3.0245418548583984, + "step": 11817 + }, + { + "epoch": 2.96, + "grad_norm": 4.419847011566162, + "learning_rate": 3.584791139709789e-06, + "logits/chosen": -0.5289412140846252, + "logits/rejected": -0.6539394855499268, + "logps/chosen": -55.67116928100586, + "logps/rejected": -105.92315673828125, + "loss": 0.5788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2067127227783203, + "rewards/margins": 8.164134979248047, + "rewards/rejected": -4.957422256469727, + "step": 11818 + }, + { + "epoch": 2.96, + "grad_norm": 4.175830841064453, + "learning_rate": 3.5840373104795286e-06, + "logits/chosen": -0.38837918639183044, + "logits/rejected": -0.46583548188209534, + "logps/chosen": -64.1304702758789, + "logps/rejected": -94.64091491699219, + "loss": 0.698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8041086196899414, + "rewards/margins": 5.492896556854248, + "rewards/rejected": -2.6887879371643066, + "step": 11819 + }, + { + "epoch": 2.96, + "grad_norm": 9.581307411193848, + "learning_rate": 3.583283516239211e-06, + "logits/chosen": -0.5425688624382019, + "logits/rejected": -0.6369462013244629, + "logps/chosen": -48.57601547241211, + "logps/rejected": -85.839111328125, + "loss": 0.7021, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0584638118743896, + "rewards/margins": 5.770236968994141, + "rewards/rejected": -2.7117738723754883, + "step": 11820 + }, + { + "epoch": 2.96, + "grad_norm": 11.850014686584473, + "learning_rate": 3.5825297570074647e-06, + "logits/chosen": -0.4033554792404175, + "logits/rejected": -0.5274758338928223, + "logps/chosen": -55.5822868347168, + "logps/rejected": -85.70600891113281, + "loss": 0.5854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8788301944732666, + "rewards/margins": 6.212377548217773, + "rewards/rejected": -3.333547592163086, + "step": 11821 + }, + { + "epoch": 2.96, + "grad_norm": 5.668248653411865, + "learning_rate": 3.581776032802916e-06, + "logits/chosen": -0.6149471998214722, + "logits/rejected": -0.5824747085571289, + "logps/chosen": -50.42335510253906, + "logps/rejected": -133.71287536621094, + "loss": 0.6992, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.362574815750122, + "rewards/margins": 5.799304962158203, + "rewards/rejected": -2.4367308616638184, + "step": 11822 + }, + { + "epoch": 2.96, + "grad_norm": 10.454048156738281, + "learning_rate": 3.5810223436441905e-06, + "logits/chosen": -0.5177453756332397, + "logits/rejected": -0.6179944276809692, + "logps/chosen": -60.689849853515625, + "logps/rejected": -103.81983947753906, + "loss": 0.7297, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994500160217285, + "rewards/margins": 6.186652183532715, + "rewards/rejected": -3.192152261734009, + "step": 11823 + }, + { + "epoch": 2.96, + "grad_norm": 2.97487473487854, + "learning_rate": 3.5802686895499117e-06, + "logits/chosen": -0.5894032716751099, + "logits/rejected": -0.6037642359733582, + "logps/chosen": -41.63555908203125, + "logps/rejected": -111.94145965576172, + "loss": 0.5872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2064757347106934, + "rewards/margins": 7.04577112197876, + "rewards/rejected": -3.8392951488494873, + "step": 11824 + }, + { + "epoch": 2.96, + "grad_norm": 5.447925567626953, + "learning_rate": 3.579515070538703e-06, + "logits/chosen": -0.5105350017547607, + "logits/rejected": -0.6308363676071167, + "logps/chosen": -59.327423095703125, + "logps/rejected": -73.17279052734375, + "loss": 0.6014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0878851413726807, + "rewards/margins": 6.190700531005859, + "rewards/rejected": -3.102816343307495, + "step": 11825 + }, + { + "epoch": 2.96, + "grad_norm": 3.3832180500030518, + "learning_rate": 3.57876148662919e-06, + "logits/chosen": -0.4809155762195587, + "logits/rejected": -0.5690253973007202, + "logps/chosen": -49.218788146972656, + "logps/rejected": -87.597412109375, + "loss": 0.5947, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.506964683532715, + "rewards/margins": 6.4861040115356445, + "rewards/rejected": -2.979139804840088, + "step": 11826 + }, + { + "epoch": 2.96, + "grad_norm": 3.0255837440490723, + "learning_rate": 3.5780079378399905e-06, + "logits/chosen": -0.4981367886066437, + "logits/rejected": -0.5683534145355225, + "logps/chosen": -49.21742248535156, + "logps/rejected": -101.74994659423828, + "loss": 0.5752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.013110876083374, + "rewards/margins": 6.653269290924072, + "rewards/rejected": -3.640158176422119, + "step": 11827 + }, + { + "epoch": 2.96, + "grad_norm": 16.245948791503906, + "learning_rate": 3.577254424189727e-06, + "logits/chosen": -0.45691120624542236, + "logits/rejected": -0.5131204724311829, + "logps/chosen": -67.0037841796875, + "logps/rejected": -98.9283218383789, + "loss": 0.8363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5640249252319336, + "rewards/margins": 5.101574897766113, + "rewards/rejected": -2.5375499725341797, + "step": 11828 + }, + { + "epoch": 2.96, + "grad_norm": 9.25971794128418, + "learning_rate": 3.57650094569702e-06, + "logits/chosen": -0.5530748963356018, + "logits/rejected": -0.64129638671875, + "logps/chosen": -53.11440658569336, + "logps/rejected": -84.83891296386719, + "loss": 0.6549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8888561725616455, + "rewards/margins": 6.773486137390137, + "rewards/rejected": -3.8846302032470703, + "step": 11829 + }, + { + "epoch": 2.96, + "grad_norm": 22.30521011352539, + "learning_rate": 3.5757475023804907e-06, + "logits/chosen": -0.5459322333335876, + "logits/rejected": -0.6054481863975525, + "logps/chosen": -57.415977478027344, + "logps/rejected": -100.8815689086914, + "loss": 0.7713, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.747682571411133, + "rewards/margins": 5.8017377853393555, + "rewards/rejected": -3.0540544986724854, + "step": 11830 + }, + { + "epoch": 2.96, + "grad_norm": 5.296454429626465, + "learning_rate": 3.574994094258754e-06, + "logits/chosen": -0.4315313696861267, + "logits/rejected": -0.5274907350540161, + "logps/chosen": -60.888580322265625, + "logps/rejected": -96.15574645996094, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.405062198638916, + "rewards/margins": 6.322428226470947, + "rewards/rejected": -2.9173662662506104, + "step": 11831 + }, + { + "epoch": 2.96, + "grad_norm": 4.568473815917969, + "learning_rate": 3.574240721350429e-06, + "logits/chosen": -0.48950690031051636, + "logits/rejected": -0.6541188359260559, + "logps/chosen": -63.076927185058594, + "logps/rejected": -85.67074584960938, + "loss": 0.6814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.251723527908325, + "rewards/margins": 6.902751445770264, + "rewards/rejected": -3.6510281562805176, + "step": 11832 + }, + { + "epoch": 2.96, + "grad_norm": 8.363709449768066, + "learning_rate": 3.5734873836741334e-06, + "logits/chosen": -0.4949570298194885, + "logits/rejected": -0.5714269876480103, + "logps/chosen": -66.86375427246094, + "logps/rejected": -104.74398803710938, + "loss": 0.75, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965360164642334, + "rewards/margins": 5.732150077819824, + "rewards/rejected": -2.766789436340332, + "step": 11833 + }, + { + "epoch": 2.96, + "grad_norm": 4.4069132804870605, + "learning_rate": 3.5727340812484805e-06, + "logits/chosen": -0.510671079158783, + "logits/rejected": -0.5815399885177612, + "logps/chosen": -55.03607177734375, + "logps/rejected": -114.9975814819336, + "loss": 0.5553, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8932082653045654, + "rewards/margins": 6.6826982498168945, + "rewards/rejected": -3.789489269256592, + "step": 11834 + }, + { + "epoch": 2.96, + "grad_norm": 4.530121326446533, + "learning_rate": 3.571980814092087e-06, + "logits/chosen": -0.44969040155410767, + "logits/rejected": -0.5371125936508179, + "logps/chosen": -67.86785888671875, + "logps/rejected": -89.79618835449219, + "loss": 0.6915, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.033496618270874, + "rewards/margins": 5.05342960357666, + "rewards/rejected": -2.019932508468628, + "step": 11835 + }, + { + "epoch": 2.96, + "grad_norm": 2.418077230453491, + "learning_rate": 3.5712275822235664e-06, + "logits/chosen": -0.4984353184700012, + "logits/rejected": -0.5911304950714111, + "logps/chosen": -45.66836929321289, + "logps/rejected": -100.91230010986328, + "loss": 0.554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3357017040252686, + "rewards/margins": 7.50070858001709, + "rewards/rejected": -4.1650071144104, + "step": 11836 + }, + { + "epoch": 2.96, + "grad_norm": 11.095237731933594, + "learning_rate": 3.5704743856615333e-06, + "logits/chosen": -0.5758576989173889, + "logits/rejected": -0.6032707095146179, + "logps/chosen": -52.12784957885742, + "logps/rejected": -116.57701110839844, + "loss": 0.7656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1690850257873535, + "rewards/margins": 6.5610833168029785, + "rewards/rejected": -3.391998767852783, + "step": 11837 + }, + { + "epoch": 2.96, + "grad_norm": 3.1411542892456055, + "learning_rate": 3.569721224424597e-06, + "logits/chosen": -0.5125565528869629, + "logits/rejected": -0.5993481278419495, + "logps/chosen": -59.18593215942383, + "logps/rejected": -83.04261779785156, + "loss": 0.6093, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.009650230407715, + "rewards/margins": 6.037044525146484, + "rewards/rejected": -3.0273940563201904, + "step": 11838 + }, + { + "epoch": 2.96, + "grad_norm": 6.643232822418213, + "learning_rate": 3.568968098531369e-06, + "logits/chosen": -0.4544309377670288, + "logits/rejected": -0.5272300243377686, + "logps/chosen": -54.35828399658203, + "logps/rejected": -108.36746215820312, + "loss": 0.6116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.070687770843506, + "rewards/margins": 6.936219215393066, + "rewards/rejected": -3.8655314445495605, + "step": 11839 + }, + { + "epoch": 2.96, + "grad_norm": 6.666083335876465, + "learning_rate": 3.5682150080004635e-06, + "logits/chosen": -0.46323227882385254, + "logits/rejected": -0.5214040279388428, + "logps/chosen": -59.53763961791992, + "logps/rejected": -95.44815063476562, + "loss": 0.8343, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8330438137054443, + "rewards/margins": 6.235624313354492, + "rewards/rejected": -3.4025797843933105, + "step": 11840 + }, + { + "epoch": 2.96, + "grad_norm": 3.5626325607299805, + "learning_rate": 3.56746195285049e-06, + "logits/chosen": -0.5013006925582886, + "logits/rejected": -0.5761173367500305, + "logps/chosen": -54.76826477050781, + "logps/rejected": -98.00572204589844, + "loss": 0.6312, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0834434032440186, + "rewards/margins": 6.617012977600098, + "rewards/rejected": -3.5335693359375, + "step": 11841 + }, + { + "epoch": 2.96, + "grad_norm": 6.096637725830078, + "learning_rate": 3.566708933100052e-06, + "logits/chosen": -0.4909713864326477, + "logits/rejected": -0.5791061520576477, + "logps/chosen": -60.49168395996094, + "logps/rejected": -89.08293914794922, + "loss": 0.7609, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1937711238861084, + "rewards/margins": 5.861946105957031, + "rewards/rejected": -2.6681747436523438, + "step": 11842 + }, + { + "epoch": 2.96, + "grad_norm": 15.652628898620605, + "learning_rate": 3.565955948767762e-06, + "logits/chosen": -0.45044171810150146, + "logits/rejected": -0.5477653741836548, + "logps/chosen": -70.21669006347656, + "logps/rejected": -103.06269836425781, + "loss": 0.8359, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6747684478759766, + "rewards/margins": 6.410525798797607, + "rewards/rejected": -3.73575758934021, + "step": 11843 + }, + { + "epoch": 2.96, + "grad_norm": 3.629228115081787, + "learning_rate": 3.5652029998722264e-06, + "logits/chosen": -0.5448383092880249, + "logits/rejected": -0.6342288851737976, + "logps/chosen": -58.202484130859375, + "logps/rejected": -91.66023254394531, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295973062515259, + "rewards/margins": 6.534321308135986, + "rewards/rejected": -3.2383480072021484, + "step": 11844 + }, + { + "epoch": 2.96, + "grad_norm": 3.7747342586517334, + "learning_rate": 3.5644500864320502e-06, + "logits/chosen": -0.5338225960731506, + "logits/rejected": -0.6275597810745239, + "logps/chosen": -47.206275939941406, + "logps/rejected": -95.76448822021484, + "loss": 0.5414, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2722482681274414, + "rewards/margins": 6.667324066162109, + "rewards/rejected": -3.395075559616089, + "step": 11845 + }, + { + "epoch": 2.96, + "grad_norm": 5.376933574676514, + "learning_rate": 3.5636972084658396e-06, + "logits/chosen": -0.4843200445175171, + "logits/rejected": -0.6117613315582275, + "logps/chosen": -81.06285095214844, + "logps/rejected": -92.28873443603516, + "loss": 0.7412, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065953254699707, + "rewards/margins": 6.303723335266113, + "rewards/rejected": -3.237769603729248, + "step": 11846 + }, + { + "epoch": 2.96, + "grad_norm": 2.770282030105591, + "learning_rate": 3.562944365992198e-06, + "logits/chosen": -0.5229261517524719, + "logits/rejected": -0.5651642084121704, + "logps/chosen": -51.24915313720703, + "logps/rejected": -98.74861907958984, + "loss": 0.6221, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2789769172668457, + "rewards/margins": 6.640800476074219, + "rewards/rejected": -3.361823558807373, + "step": 11847 + }, + { + "epoch": 2.96, + "grad_norm": 43.948795318603516, + "learning_rate": 3.562191559029731e-06, + "logits/chosen": -0.4806239604949951, + "logits/rejected": -0.5396096110343933, + "logps/chosen": -54.79547119140625, + "logps/rejected": -94.81109619140625, + "loss": 0.8198, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.696547508239746, + "rewards/margins": 5.008556842803955, + "rewards/rejected": -2.312009572982788, + "step": 11848 + }, + { + "epoch": 2.96, + "grad_norm": 7.104272842407227, + "learning_rate": 3.5614387875970385e-06, + "logits/chosen": -0.47989988327026367, + "logits/rejected": -0.5464959740638733, + "logps/chosen": -57.78428268432617, + "logps/rejected": -98.01274108886719, + "loss": 0.65, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.474518299102783, + "rewards/margins": 6.099212646484375, + "rewards/rejected": -2.624694585800171, + "step": 11849 + }, + { + "epoch": 2.96, + "grad_norm": 7.180952548980713, + "learning_rate": 3.560686051712724e-06, + "logits/chosen": -0.5214409828186035, + "logits/rejected": -0.6214460730552673, + "logps/chosen": -59.83544158935547, + "logps/rejected": -99.50772857666016, + "loss": 0.6836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1438217163085938, + "rewards/margins": 6.102046966552734, + "rewards/rejected": -2.9582252502441406, + "step": 11850 + }, + { + "epoch": 2.96, + "grad_norm": 3.414165735244751, + "learning_rate": 3.5599333513953894e-06, + "logits/chosen": -0.4897148609161377, + "logits/rejected": -0.6279639005661011, + "logps/chosen": -63.048866271972656, + "logps/rejected": -126.44998168945312, + "loss": 0.5989, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3381688594818115, + "rewards/margins": 8.173065185546875, + "rewards/rejected": -4.834895610809326, + "step": 11851 + }, + { + "epoch": 2.96, + "grad_norm": 4.486513614654541, + "learning_rate": 3.559180686663632e-06, + "logits/chosen": -0.48452794551849365, + "logits/rejected": -0.5362840294837952, + "logps/chosen": -57.26087188720703, + "logps/rejected": -118.97745513916016, + "loss": 0.6363, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.179194927215576, + "rewards/margins": 6.7269062995910645, + "rewards/rejected": -3.5477113723754883, + "step": 11852 + }, + { + "epoch": 2.97, + "grad_norm": 6.280115604400635, + "learning_rate": 3.558428057536052e-06, + "logits/chosen": -0.4572828412055969, + "logits/rejected": -0.5143870115280151, + "logps/chosen": -54.57493591308594, + "logps/rejected": -89.88114166259766, + "loss": 0.6662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1657776832580566, + "rewards/margins": 6.210277557373047, + "rewards/rejected": -3.0445003509521484, + "step": 11853 + }, + { + "epoch": 2.97, + "grad_norm": 21.201326370239258, + "learning_rate": 3.5576754640312477e-06, + "logits/chosen": -0.5993061661720276, + "logits/rejected": -0.6434972286224365, + "logps/chosen": -47.96403884887695, + "logps/rejected": -101.91972351074219, + "loss": 0.8187, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7970590591430664, + "rewards/margins": 5.805504322052002, + "rewards/rejected": -3.0084455013275146, + "step": 11854 + }, + { + "epoch": 2.97, + "grad_norm": 7.66944694519043, + "learning_rate": 3.5569229061678197e-06, + "logits/chosen": -0.4940229058265686, + "logits/rejected": -0.6041756272315979, + "logps/chosen": -54.49934768676758, + "logps/rejected": -83.88066101074219, + "loss": 0.617, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1450114250183105, + "rewards/margins": 6.111920356750488, + "rewards/rejected": -2.9669086933135986, + "step": 11855 + }, + { + "epoch": 2.97, + "grad_norm": 5.328159809112549, + "learning_rate": 3.5561703839643578e-06, + "logits/chosen": -0.5197798013687134, + "logits/rejected": -0.6304929256439209, + "logps/chosen": -55.844581604003906, + "logps/rejected": -99.974609375, + "loss": 0.5695, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.127331256866455, + "rewards/margins": 7.584561347961426, + "rewards/rejected": -4.4572296142578125, + "step": 11856 + }, + { + "epoch": 2.97, + "grad_norm": 6.207993030548096, + "learning_rate": 3.555417897439464e-06, + "logits/chosen": -0.49553048610687256, + "logits/rejected": -0.5524632930755615, + "logps/chosen": -63.65496826171875, + "logps/rejected": -101.41757202148438, + "loss": 0.6869, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.006500720977783, + "rewards/margins": 7.256310939788818, + "rewards/rejected": -4.249810218811035, + "step": 11857 + }, + { + "epoch": 2.97, + "grad_norm": 4.958606243133545, + "learning_rate": 3.5546654466117313e-06, + "logits/chosen": -0.5392318964004517, + "logits/rejected": -0.6286082863807678, + "logps/chosen": -47.17067337036133, + "logps/rejected": -89.66253662109375, + "loss": 0.5497, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0252134799957275, + "rewards/margins": 6.503501892089844, + "rewards/rejected": -3.478288412094116, + "step": 11858 + }, + { + "epoch": 2.97, + "grad_norm": 3.9143903255462646, + "learning_rate": 3.5539130314997498e-06, + "logits/chosen": -0.5851863622665405, + "logits/rejected": -0.6155096888542175, + "logps/chosen": -46.71788024902344, + "logps/rejected": -121.09403991699219, + "loss": 0.5825, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.193509101867676, + "rewards/margins": 7.534139156341553, + "rewards/rejected": -4.340629577636719, + "step": 11859 + }, + { + "epoch": 2.97, + "grad_norm": 3.414803981781006, + "learning_rate": 3.553160652122117e-06, + "logits/chosen": -0.46555930376052856, + "logits/rejected": -0.6076200008392334, + "logps/chosen": -61.90275573730469, + "logps/rejected": -92.8971176147461, + "loss": 0.5731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9523768424987793, + "rewards/margins": 6.8690595626831055, + "rewards/rejected": -3.9166829586029053, + "step": 11860 + }, + { + "epoch": 2.97, + "grad_norm": 5.518362045288086, + "learning_rate": 3.5524083084974228e-06, + "logits/chosen": -0.5055981874465942, + "logits/rejected": -0.5866082310676575, + "logps/chosen": -48.760623931884766, + "logps/rejected": -96.09770965576172, + "loss": 0.5908, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.056546211242676, + "rewards/margins": 5.930577754974365, + "rewards/rejected": -2.874032497406006, + "step": 11861 + }, + { + "epoch": 2.97, + "grad_norm": 2.7324416637420654, + "learning_rate": 3.5516560006442603e-06, + "logits/chosen": -0.5259214043617249, + "logits/rejected": -0.5583004951477051, + "logps/chosen": -49.99399948120117, + "logps/rejected": -108.51932525634766, + "loss": 0.5979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.694985866546631, + "rewards/margins": 5.994457244873047, + "rewards/rejected": -3.299471139907837, + "step": 11862 + }, + { + "epoch": 2.97, + "grad_norm": 9.378697395324707, + "learning_rate": 3.5509037285812176e-06, + "logits/chosen": -0.5182293653488159, + "logits/rejected": -0.5979101061820984, + "logps/chosen": -73.62222290039062, + "logps/rejected": -88.0582504272461, + "loss": 0.8677, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.268193244934082, + "rewards/margins": 4.952770233154297, + "rewards/rejected": -1.6845769882202148, + "step": 11863 + }, + { + "epoch": 2.97, + "grad_norm": 2.550882339477539, + "learning_rate": 3.5501514923268854e-06, + "logits/chosen": -0.5019274353981018, + "logits/rejected": -0.6208165884017944, + "logps/chosen": -47.87559127807617, + "logps/rejected": -91.45296478271484, + "loss": 0.5653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.060150384902954, + "rewards/margins": 7.402768135070801, + "rewards/rejected": -4.342617511749268, + "step": 11864 + }, + { + "epoch": 2.97, + "grad_norm": 15.750718116760254, + "learning_rate": 3.549399291899851e-06, + "logits/chosen": -0.4157624840736389, + "logits/rejected": -0.5029710531234741, + "logps/chosen": -58.29829406738281, + "logps/rejected": -101.34862518310547, + "loss": 0.6664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8994300365448, + "rewards/margins": 5.788470268249512, + "rewards/rejected": -2.889040470123291, + "step": 11865 + }, + { + "epoch": 2.97, + "grad_norm": 7.845514297485352, + "learning_rate": 3.548647127318705e-06, + "logits/chosen": -0.5254533290863037, + "logits/rejected": -0.6696944236755371, + "logps/chosen": -52.68329620361328, + "logps/rejected": -101.59494018554688, + "loss": 0.6352, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089306116104126, + "rewards/margins": 7.368575572967529, + "rewards/rejected": -4.279268741607666, + "step": 11866 + }, + { + "epoch": 2.97, + "grad_norm": 6.440074443817139, + "learning_rate": 3.5478949986020306e-06, + "logits/chosen": -0.510249674320221, + "logits/rejected": -0.5577298402786255, + "logps/chosen": -59.57679748535156, + "logps/rejected": -96.14151763916016, + "loss": 0.6231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1228199005126953, + "rewards/margins": 5.523632049560547, + "rewards/rejected": -2.4008121490478516, + "step": 11867 + }, + { + "epoch": 2.97, + "grad_norm": 3.968747615814209, + "learning_rate": 3.5471429057684147e-06, + "logits/chosen": -0.5290865898132324, + "logits/rejected": -0.5975220799446106, + "logps/chosen": -53.418338775634766, + "logps/rejected": -96.01023864746094, + "loss": 0.6221, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9803972244262695, + "rewards/margins": 5.775983810424805, + "rewards/rejected": -2.795586585998535, + "step": 11868 + }, + { + "epoch": 2.97, + "grad_norm": 11.11329460144043, + "learning_rate": 3.546390848836446e-06, + "logits/chosen": -0.4652080833911896, + "logits/rejected": -0.5492902398109436, + "logps/chosen": -69.10899353027344, + "logps/rejected": -90.37467193603516, + "loss": 0.8283, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0039708614349365, + "rewards/margins": 4.725505352020264, + "rewards/rejected": -1.721534013748169, + "step": 11869 + }, + { + "epoch": 2.97, + "grad_norm": 6.007941246032715, + "learning_rate": 3.5456388278247034e-06, + "logits/chosen": -0.48957711458206177, + "logits/rejected": -0.6475348472595215, + "logps/chosen": -82.14458465576172, + "logps/rejected": -87.16173553466797, + "loss": 0.8368, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1086843013763428, + "rewards/margins": 6.2110137939453125, + "rewards/rejected": -3.1023294925689697, + "step": 11870 + }, + { + "epoch": 2.97, + "grad_norm": 5.805503845214844, + "learning_rate": 3.5448868427517714e-06, + "logits/chosen": -0.5427933931350708, + "logits/rejected": -0.6375777721405029, + "logps/chosen": -41.1763801574707, + "logps/rejected": -81.41067504882812, + "loss": 0.5733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0419368743896484, + "rewards/margins": 6.252195358276367, + "rewards/rejected": -3.2102580070495605, + "step": 11871 + }, + { + "epoch": 2.97, + "grad_norm": 4.277935981750488, + "learning_rate": 3.5441348936362335e-06, + "logits/chosen": -0.5123983025550842, + "logits/rejected": -0.5974452495574951, + "logps/chosen": -54.19219207763672, + "logps/rejected": -93.29743957519531, + "loss": 0.7102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2520947456359863, + "rewards/margins": 6.088848114013672, + "rewards/rejected": -2.8367526531219482, + "step": 11872 + }, + { + "epoch": 2.97, + "grad_norm": 23.87326431274414, + "learning_rate": 3.5433829804966725e-06, + "logits/chosen": -0.501301646232605, + "logits/rejected": -0.5285645127296448, + "logps/chosen": -48.79338073730469, + "logps/rejected": -111.93305969238281, + "loss": 0.7716, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9749019145965576, + "rewards/margins": 5.890796661376953, + "rewards/rejected": -2.9158947467803955, + "step": 11873 + }, + { + "epoch": 2.97, + "grad_norm": 4.6103129386901855, + "learning_rate": 3.5426311033516665e-06, + "logits/chosen": -0.4809475243091583, + "logits/rejected": -0.48098084330558777, + "logps/chosen": -63.2703857421875, + "logps/rejected": -110.51441955566406, + "loss": 0.6633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0769453048706055, + "rewards/margins": 5.209967613220215, + "rewards/rejected": -2.133021831512451, + "step": 11874 + }, + { + "epoch": 2.97, + "grad_norm": 4.6372480392456055, + "learning_rate": 3.541879262219796e-06, + "logits/chosen": -0.5549198389053345, + "logits/rejected": -0.66462242603302, + "logps/chosen": -52.416622161865234, + "logps/rejected": -91.34768676757812, + "loss": 0.632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.343538284301758, + "rewards/margins": 7.061298370361328, + "rewards/rejected": -3.7177600860595703, + "step": 11875 + }, + { + "epoch": 2.97, + "grad_norm": 2.776015520095825, + "learning_rate": 3.5411274571196407e-06, + "logits/chosen": -0.5482279062271118, + "logits/rejected": -0.5942654609680176, + "logps/chosen": -56.43598175048828, + "logps/rejected": -108.26864624023438, + "loss": 0.6131, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0667998790740967, + "rewards/margins": 6.909277439117432, + "rewards/rejected": -3.842477798461914, + "step": 11876 + }, + { + "epoch": 2.97, + "grad_norm": 3.5364322662353516, + "learning_rate": 3.5403756880697764e-06, + "logits/chosen": -0.44767698645591736, + "logits/rejected": -0.5655839443206787, + "logps/chosen": -48.883811950683594, + "logps/rejected": -90.17679595947266, + "loss": 0.5798, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3487250804901123, + "rewards/margins": 6.318601131439209, + "rewards/rejected": -2.9698758125305176, + "step": 11877 + }, + { + "epoch": 2.97, + "grad_norm": 2.340244770050049, + "learning_rate": 3.539623955088782e-06, + "logits/chosen": -0.4250245988368988, + "logits/rejected": -0.5180445313453674, + "logps/chosen": -62.3476448059082, + "logps/rejected": -111.83013916015625, + "loss": 0.5861, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3272478580474854, + "rewards/margins": 7.584561824798584, + "rewards/rejected": -4.257314205169678, + "step": 11878 + }, + { + "epoch": 2.97, + "grad_norm": 13.18359661102295, + "learning_rate": 3.5388722581952327e-06, + "logits/chosen": -0.5301094055175781, + "logits/rejected": -0.6074982285499573, + "logps/chosen": -47.983280181884766, + "logps/rejected": -107.10186767578125, + "loss": 0.5615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0703964233398438, + "rewards/margins": 6.401993274688721, + "rewards/rejected": -3.3315975666046143, + "step": 11879 + }, + { + "epoch": 2.97, + "grad_norm": 5.29888916015625, + "learning_rate": 3.538120597407705e-06, + "logits/chosen": -0.5633256435394287, + "logits/rejected": -0.5931299328804016, + "logps/chosen": -52.15459060668945, + "logps/rejected": -108.27295684814453, + "loss": 0.5907, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.073378801345825, + "rewards/margins": 5.937292098999023, + "rewards/rejected": -2.8639137744903564, + "step": 11880 + }, + { + "epoch": 2.97, + "grad_norm": 2.9050254821777344, + "learning_rate": 3.5373689727447714e-06, + "logits/chosen": -0.4656137526035309, + "logits/rejected": -0.5838418006896973, + "logps/chosen": -57.726959228515625, + "logps/rejected": -91.37344360351562, + "loss": 0.6403, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8201935291290283, + "rewards/margins": 6.230807304382324, + "rewards/rejected": -3.410614013671875, + "step": 11881 + }, + { + "epoch": 2.97, + "grad_norm": 3.1419975757598877, + "learning_rate": 3.5366173842250064e-06, + "logits/chosen": -0.42209550738334656, + "logits/rejected": -0.5241506695747375, + "logps/chosen": -59.487548828125, + "logps/rejected": -93.52854919433594, + "loss": 0.622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.10648250579834, + "rewards/margins": 6.023936748504639, + "rewards/rejected": -2.9174540042877197, + "step": 11882 + }, + { + "epoch": 2.97, + "grad_norm": 3.4344706535339355, + "learning_rate": 3.5358658318669824e-06, + "logits/chosen": -0.44598454236984253, + "logits/rejected": -0.5367749929428101, + "logps/chosen": -72.78901672363281, + "logps/rejected": -106.35102844238281, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.868778944015503, + "rewards/margins": 6.467695236206055, + "rewards/rejected": -3.5989155769348145, + "step": 11883 + }, + { + "epoch": 2.97, + "grad_norm": 4.054080009460449, + "learning_rate": 3.5351143156892707e-06, + "logits/chosen": -0.5284652709960938, + "logits/rejected": -0.6383720636367798, + "logps/chosen": -50.61582946777344, + "logps/rejected": -80.44915771484375, + "loss": 0.6438, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.349412202835083, + "rewards/margins": 7.289965629577637, + "rewards/rejected": -3.9405531883239746, + "step": 11884 + }, + { + "epoch": 2.97, + "grad_norm": 7.6080803871154785, + "learning_rate": 3.5343628357104416e-06, + "logits/chosen": -0.5843261480331421, + "logits/rejected": -0.6440313458442688, + "logps/chosen": -59.71540069580078, + "logps/rejected": -102.67020416259766, + "loss": 0.8398, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.909014940261841, + "rewards/margins": 5.104867458343506, + "rewards/rejected": -2.195852518081665, + "step": 11885 + }, + { + "epoch": 2.97, + "grad_norm": 4.660405158996582, + "learning_rate": 3.533611391949066e-06, + "logits/chosen": -0.464292973279953, + "logits/rejected": -0.5747494697570801, + "logps/chosen": -51.434539794921875, + "logps/rejected": -83.70729064941406, + "loss": 0.5827, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168260335922241, + "rewards/margins": 6.4987382888793945, + "rewards/rejected": -3.330477714538574, + "step": 11886 + }, + { + "epoch": 2.97, + "grad_norm": 5.6442952156066895, + "learning_rate": 3.5328599844237153e-06, + "logits/chosen": -0.4611184000968933, + "logits/rejected": -0.5117812752723694, + "logps/chosen": -44.85894775390625, + "logps/rejected": -81.0468521118164, + "loss": 0.6844, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.003582000732422, + "rewards/margins": 5.127188682556152, + "rewards/rejected": -2.1236066818237305, + "step": 11887 + }, + { + "epoch": 2.97, + "grad_norm": 11.340367317199707, + "learning_rate": 3.532108613152952e-06, + "logits/chosen": -0.38055241107940674, + "logits/rejected": -0.47177597880363464, + "logps/chosen": -61.25181198120117, + "logps/rejected": -92.46196746826172, + "loss": 0.7491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03263521194458, + "rewards/margins": 5.503932952880859, + "rewards/rejected": -2.4712977409362793, + "step": 11888 + }, + { + "epoch": 2.97, + "grad_norm": 5.307669162750244, + "learning_rate": 3.5313572781553467e-06, + "logits/chosen": -0.45612603425979614, + "logits/rejected": -0.5904541015625, + "logps/chosen": -57.8622932434082, + "logps/rejected": -85.11776733398438, + "loss": 0.6151, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9904897212982178, + "rewards/margins": 6.119234085083008, + "rewards/rejected": -3.1287450790405273, + "step": 11889 + }, + { + "epoch": 2.97, + "grad_norm": 5.4590253829956055, + "learning_rate": 3.5306059794494652e-06, + "logits/chosen": -0.6199865937232971, + "logits/rejected": -0.7442057132720947, + "logps/chosen": -56.88568115234375, + "logps/rejected": -80.74026489257812, + "loss": 0.6689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0432403087615967, + "rewards/margins": 5.9136176109313965, + "rewards/rejected": -2.8703773021698, + "step": 11890 + }, + { + "epoch": 2.97, + "grad_norm": 6.234589576721191, + "learning_rate": 3.5298547170538745e-06, + "logits/chosen": -0.44102147221565247, + "logits/rejected": -0.4922883212566376, + "logps/chosen": -59.830718994140625, + "logps/rejected": -107.75448608398438, + "loss": 0.748, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9008004665374756, + "rewards/margins": 5.325708866119385, + "rewards/rejected": -2.424907922744751, + "step": 11891 + }, + { + "epoch": 2.97, + "grad_norm": 7.7495341300964355, + "learning_rate": 3.529103490987136e-06, + "logits/chosen": -0.4623802602291107, + "logits/rejected": -0.5776463747024536, + "logps/chosen": -69.78617095947266, + "logps/rejected": -89.83131408691406, + "loss": 0.8725, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7616703510284424, + "rewards/margins": 5.320814609527588, + "rewards/rejected": -2.5591442584991455, + "step": 11892 + }, + { + "epoch": 2.98, + "grad_norm": 5.749563217163086, + "learning_rate": 3.5283523012678145e-06, + "logits/chosen": -0.5069801807403564, + "logits/rejected": -0.6143983006477356, + "logps/chosen": -57.70081329345703, + "logps/rejected": -97.8040771484375, + "loss": 0.7542, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8440847396850586, + "rewards/margins": 5.967051029205322, + "rewards/rejected": -3.1229660511016846, + "step": 11893 + }, + { + "epoch": 2.98, + "grad_norm": 6.502685070037842, + "learning_rate": 3.5276011479144745e-06, + "logits/chosen": -0.4172827899456024, + "logits/rejected": -0.5543620586395264, + "logps/chosen": -53.5437126159668, + "logps/rejected": -84.13451385498047, + "loss": 0.5518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068037271499634, + "rewards/margins": 6.548050403594971, + "rewards/rejected": -3.480013370513916, + "step": 11894 + }, + { + "epoch": 2.98, + "grad_norm": 5.339160919189453, + "learning_rate": 3.5268500309456744e-06, + "logits/chosen": -0.48757535219192505, + "logits/rejected": -0.5605144500732422, + "logps/chosen": -60.71765899658203, + "logps/rejected": -115.01681518554688, + "loss": 0.6741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6071510314941406, + "rewards/margins": 6.480172157287598, + "rewards/rejected": -3.8730216026306152, + "step": 11895 + }, + { + "epoch": 2.98, + "grad_norm": 4.809534072875977, + "learning_rate": 3.526098950379978e-06, + "logits/chosen": -0.4767959415912628, + "logits/rejected": -0.5831259489059448, + "logps/chosen": -56.02900695800781, + "logps/rejected": -80.86082458496094, + "loss": 0.6507, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1653144359588623, + "rewards/margins": 5.6592206954956055, + "rewards/rejected": -2.493906259536743, + "step": 11896 + }, + { + "epoch": 2.98, + "grad_norm": 4.545932292938232, + "learning_rate": 3.5253479062359426e-06, + "logits/chosen": -0.4811934530735016, + "logits/rejected": -0.586651623249054, + "logps/chosen": -49.577091217041016, + "logps/rejected": -108.13545989990234, + "loss": 0.6499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095764636993408, + "rewards/margins": 6.264318466186523, + "rewards/rejected": -3.1685538291931152, + "step": 11897 + }, + { + "epoch": 2.98, + "grad_norm": 4.538883686065674, + "learning_rate": 3.524596898532132e-06, + "logits/chosen": -0.4960348606109619, + "logits/rejected": -0.5603613257408142, + "logps/chosen": -52.48331069946289, + "logps/rejected": -91.00755310058594, + "loss": 0.6692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1462953090667725, + "rewards/margins": 6.526595115661621, + "rewards/rejected": -3.3802995681762695, + "step": 11898 + }, + { + "epoch": 2.98, + "grad_norm": 4.139518737792969, + "learning_rate": 3.5238459272870983e-06, + "logits/chosen": -0.5004203915596008, + "logits/rejected": -0.5600032210350037, + "logps/chosen": -48.778045654296875, + "logps/rejected": -109.46249389648438, + "loss": 0.6076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2364494800567627, + "rewards/margins": 6.003098487854004, + "rewards/rejected": -2.7666492462158203, + "step": 11899 + }, + { + "epoch": 2.98, + "grad_norm": 5.700623989105225, + "learning_rate": 3.5230949925194034e-06, + "logits/chosen": -0.4923620820045471, + "logits/rejected": -0.5771569013595581, + "logps/chosen": -63.871665954589844, + "logps/rejected": -102.16768646240234, + "loss": 0.6531, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.100844383239746, + "rewards/margins": 6.38862419128418, + "rewards/rejected": -3.2877800464630127, + "step": 11900 + }, + { + "epoch": 2.98, + "grad_norm": 3.4244840145111084, + "learning_rate": 3.522344094247604e-06, + "logits/chosen": -0.5475953817367554, + "logits/rejected": -0.6331706643104553, + "logps/chosen": -45.58769226074219, + "logps/rejected": -106.27928161621094, + "loss": 0.5601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2170944213867188, + "rewards/margins": 7.203693389892578, + "rewards/rejected": -3.986598491668701, + "step": 11901 + }, + { + "epoch": 2.98, + "grad_norm": 3.923048734664917, + "learning_rate": 3.5215932324902503e-06, + "logits/chosen": -0.5174800157546997, + "logits/rejected": -0.6151465773582458, + "logps/chosen": -59.97080993652344, + "logps/rejected": -93.24441528320312, + "loss": 0.6329, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7985236644744873, + "rewards/margins": 6.527945518493652, + "rewards/rejected": -3.729421615600586, + "step": 11902 + }, + { + "epoch": 2.98, + "grad_norm": 3.4775073528289795, + "learning_rate": 3.520842407265901e-06, + "logits/chosen": -0.4955452084541321, + "logits/rejected": -0.6314411759376526, + "logps/chosen": -60.08125305175781, + "logps/rejected": -90.34921264648438, + "loss": 0.6366, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3041131496429443, + "rewards/margins": 5.846342086791992, + "rewards/rejected": -2.5422286987304688, + "step": 11903 + }, + { + "epoch": 2.98, + "grad_norm": 7.7587080001831055, + "learning_rate": 3.52009161859311e-06, + "logits/chosen": -0.5284953713417053, + "logits/rejected": -0.6024607419967651, + "logps/chosen": -56.261409759521484, + "logps/rejected": -81.57601928710938, + "loss": 0.7352, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1550841331481934, + "rewards/margins": 5.621890068054199, + "rewards/rejected": -2.466805934906006, + "step": 11904 + }, + { + "epoch": 2.98, + "grad_norm": 6.199102878570557, + "learning_rate": 3.5193408664904292e-06, + "logits/chosen": -0.46246594190597534, + "logits/rejected": -0.5703389644622803, + "logps/chosen": -52.93197250366211, + "logps/rejected": -88.31851196289062, + "loss": 0.6375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1847312450408936, + "rewards/margins": 6.483471393585205, + "rewards/rejected": -3.2987399101257324, + "step": 11905 + }, + { + "epoch": 2.98, + "grad_norm": 4.775441646575928, + "learning_rate": 3.518590150976411e-06, + "logits/chosen": -0.5575212836265564, + "logits/rejected": -0.6408077478408813, + "logps/chosen": -64.88728332519531, + "logps/rejected": -95.09719848632812, + "loss": 0.6493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8599982261657715, + "rewards/margins": 5.970859527587891, + "rewards/rejected": -3.110861301422119, + "step": 11906 + }, + { + "epoch": 2.98, + "grad_norm": 6.114700794219971, + "learning_rate": 3.5178394720696046e-06, + "logits/chosen": -0.5173467993736267, + "logits/rejected": -0.5423834323883057, + "logps/chosen": -54.75373840332031, + "logps/rejected": -94.44695281982422, + "loss": 0.6844, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0867223739624023, + "rewards/margins": 4.972663879394531, + "rewards/rejected": -1.885941982269287, + "step": 11907 + }, + { + "epoch": 2.98, + "grad_norm": 9.915926933288574, + "learning_rate": 3.517088829788563e-06, + "logits/chosen": -0.477555513381958, + "logits/rejected": -0.5690212845802307, + "logps/chosen": -60.1760139465332, + "logps/rejected": -113.09346008300781, + "loss": 0.7308, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.824326276779175, + "rewards/margins": 6.466325759887695, + "rewards/rejected": -3.642000198364258, + "step": 11908 + }, + { + "epoch": 2.98, + "grad_norm": 6.048985004425049, + "learning_rate": 3.5163382241518318e-06, + "logits/chosen": -0.5203932523727417, + "logits/rejected": -0.6233614683151245, + "logps/chosen": -50.58687973022461, + "logps/rejected": -96.1008529663086, + "loss": 0.6813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.269362688064575, + "rewards/margins": 6.171050071716309, + "rewards/rejected": -2.9016873836517334, + "step": 11909 + }, + { + "epoch": 2.98, + "grad_norm": 5.514932632446289, + "learning_rate": 3.515587655177962e-06, + "logits/chosen": -0.5186977386474609, + "logits/rejected": -0.6034486889839172, + "logps/chosen": -62.97803497314453, + "logps/rejected": -79.46814727783203, + "loss": 0.7666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1108970642089844, + "rewards/margins": 4.495150089263916, + "rewards/rejected": -1.3842527866363525, + "step": 11910 + }, + { + "epoch": 2.98, + "grad_norm": 5.6178507804870605, + "learning_rate": 3.514837122885499e-06, + "logits/chosen": -0.5155627727508545, + "logits/rejected": -0.5987704992294312, + "logps/chosen": -49.95022964477539, + "logps/rejected": -104.56012725830078, + "loss": 0.5882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.013021469116211, + "rewards/margins": 7.267204284667969, + "rewards/rejected": -4.2541823387146, + "step": 11911 + }, + { + "epoch": 2.98, + "grad_norm": 6.648347854614258, + "learning_rate": 3.514086627292993e-06, + "logits/chosen": -0.47728967666625977, + "logits/rejected": -0.5388490557670593, + "logps/chosen": -56.88057327270508, + "logps/rejected": -97.69181823730469, + "loss": 0.7539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.256075620651245, + "rewards/margins": 5.821142673492432, + "rewards/rejected": -2.5650672912597656, + "step": 11912 + }, + { + "epoch": 2.98, + "grad_norm": 5.644441604614258, + "learning_rate": 3.513336168418986e-06, + "logits/chosen": -0.6208866834640503, + "logits/rejected": -0.6682140827178955, + "logps/chosen": -48.652828216552734, + "logps/rejected": -98.5787124633789, + "loss": 0.6548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.072603225708008, + "rewards/margins": 6.185646057128906, + "rewards/rejected": -3.1130428314208984, + "step": 11913 + }, + { + "epoch": 2.98, + "grad_norm": 5.720218181610107, + "learning_rate": 3.512585746282021e-06, + "logits/chosen": -0.4992406964302063, + "logits/rejected": -0.5130606889724731, + "logps/chosen": -58.9575080871582, + "logps/rejected": -119.03126525878906, + "loss": 0.6931, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.474350929260254, + "rewards/margins": 7.142176628112793, + "rewards/rejected": -3.66782546043396, + "step": 11914 + }, + { + "epoch": 2.98, + "grad_norm": 4.0418524742126465, + "learning_rate": 3.5118353609006463e-06, + "logits/chosen": -0.5523913502693176, + "logits/rejected": -0.6018780469894409, + "logps/chosen": -44.649932861328125, + "logps/rejected": -93.12542724609375, + "loss": 0.5843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9205617904663086, + "rewards/margins": 6.638063430786133, + "rewards/rejected": -3.717501163482666, + "step": 11915 + }, + { + "epoch": 2.98, + "grad_norm": 3.6257951259613037, + "learning_rate": 3.5110850122934037e-06, + "logits/chosen": -0.4898376166820526, + "logits/rejected": -0.5772414207458496, + "logps/chosen": -54.57358932495117, + "logps/rejected": -105.56370544433594, + "loss": 0.7078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055013418197632, + "rewards/margins": 7.179135799407959, + "rewards/rejected": -4.124122619628906, + "step": 11916 + }, + { + "epoch": 2.98, + "grad_norm": 15.248454093933105, + "learning_rate": 3.510334700478833e-06, + "logits/chosen": -0.5285106897354126, + "logits/rejected": -0.6843109726905823, + "logps/chosen": -59.743927001953125, + "logps/rejected": -84.82698059082031, + "loss": 0.7338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8935885429382324, + "rewards/margins": 6.314803600311279, + "rewards/rejected": -3.4212148189544678, + "step": 11917 + }, + { + "epoch": 2.98, + "grad_norm": 3.803731679916382, + "learning_rate": 3.5095844254754764e-06, + "logits/chosen": -0.5247658491134644, + "logits/rejected": -0.5948097705841064, + "logps/chosen": -42.28902053833008, + "logps/rejected": -97.72933197021484, + "loss": 0.5808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3612401485443115, + "rewards/margins": 7.031613826751709, + "rewards/rejected": -3.6703734397888184, + "step": 11918 + }, + { + "epoch": 2.98, + "grad_norm": 4.355676174163818, + "learning_rate": 3.508834187301875e-06, + "logits/chosen": -0.4805259108543396, + "logits/rejected": -0.5459847450256348, + "logps/chosen": -45.228904724121094, + "logps/rejected": -86.33109283447266, + "loss": 0.6423, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0145609378814697, + "rewards/margins": 5.478569030761719, + "rewards/rejected": -2.464008092880249, + "step": 11919 + }, + { + "epoch": 2.98, + "grad_norm": 4.088496685028076, + "learning_rate": 3.508083985976565e-06, + "logits/chosen": -0.5043427348136902, + "logits/rejected": -0.5629724264144897, + "logps/chosen": -59.827972412109375, + "logps/rejected": -94.24580383300781, + "loss": 0.6323, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.104064702987671, + "rewards/margins": 5.506539344787598, + "rewards/rejected": -2.4024744033813477, + "step": 11920 + }, + { + "epoch": 2.98, + "grad_norm": 3.6951088905334473, + "learning_rate": 3.5073338215180873e-06, + "logits/chosen": -0.5721012353897095, + "logits/rejected": -0.684383749961853, + "logps/chosen": -61.520965576171875, + "logps/rejected": -84.83744812011719, + "loss": 0.6357, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.997556686401367, + "rewards/margins": 6.5382256507873535, + "rewards/rejected": -3.540668487548828, + "step": 11921 + }, + { + "epoch": 2.98, + "grad_norm": 2.5182371139526367, + "learning_rate": 3.506583693944979e-06, + "logits/chosen": -0.5232518911361694, + "logits/rejected": -0.5864789485931396, + "logps/chosen": -42.71565246582031, + "logps/rejected": -97.18479919433594, + "loss": 0.5602, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.998375177383423, + "rewards/margins": 7.691081523895264, + "rewards/rejected": -4.69270658493042, + "step": 11922 + }, + { + "epoch": 2.98, + "grad_norm": 9.760411262512207, + "learning_rate": 3.5058336032757766e-06, + "logits/chosen": -0.4911808967590332, + "logits/rejected": -0.5187733173370361, + "logps/chosen": -58.46031951904297, + "logps/rejected": -111.38725280761719, + "loss": 0.738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2452385425567627, + "rewards/margins": 5.670233726501465, + "rewards/rejected": -2.4249954223632812, + "step": 11923 + }, + { + "epoch": 2.98, + "grad_norm": 3.8514976501464844, + "learning_rate": 3.5050835495290136e-06, + "logits/chosen": -0.4451076090335846, + "logits/rejected": -0.5360339879989624, + "logps/chosen": -53.366539001464844, + "logps/rejected": -94.89813232421875, + "loss": 0.6158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0408201217651367, + "rewards/margins": 5.674829006195068, + "rewards/rejected": -2.6340088844299316, + "step": 11924 + }, + { + "epoch": 2.98, + "grad_norm": 4.775284290313721, + "learning_rate": 3.504333532723227e-06, + "logits/chosen": -0.5025060772895813, + "logits/rejected": -0.6042205095291138, + "logps/chosen": -57.657894134521484, + "logps/rejected": -83.86198425292969, + "loss": 0.7308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7902660369873047, + "rewards/margins": 5.317284107208252, + "rewards/rejected": -2.5270180702209473, + "step": 11925 + }, + { + "epoch": 2.98, + "grad_norm": 10.659598350524902, + "learning_rate": 3.50358355287695e-06, + "logits/chosen": -0.5691087245941162, + "logits/rejected": -0.6244596242904663, + "logps/chosen": -51.44355773925781, + "logps/rejected": -95.43160247802734, + "loss": 0.7056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.756838321685791, + "rewards/margins": 5.643786430358887, + "rewards/rejected": -2.8869478702545166, + "step": 11926 + }, + { + "epoch": 2.98, + "grad_norm": 8.84681224822998, + "learning_rate": 3.5028336100087146e-06, + "logits/chosen": -0.432925283908844, + "logits/rejected": -0.5228450894355774, + "logps/chosen": -65.52111053466797, + "logps/rejected": -114.58989715576172, + "loss": 0.7909, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9875435829162598, + "rewards/margins": 6.859622478485107, + "rewards/rejected": -3.8720788955688477, + "step": 11927 + }, + { + "epoch": 2.98, + "grad_norm": 4.449512004852295, + "learning_rate": 3.5020837041370525e-06, + "logits/chosen": -0.5247032046318054, + "logits/rejected": -0.6354228258132935, + "logps/chosen": -58.09858703613281, + "logps/rejected": -96.25780487060547, + "loss": 0.6903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.884294271469116, + "rewards/margins": 6.149626731872559, + "rewards/rejected": -3.2653324604034424, + "step": 11928 + }, + { + "epoch": 2.98, + "grad_norm": 6.085699081420898, + "learning_rate": 3.5013338352804952e-06, + "logits/chosen": -0.479172945022583, + "logits/rejected": -0.5623667240142822, + "logps/chosen": -60.550819396972656, + "logps/rejected": -97.79173278808594, + "loss": 0.6731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.361206531524658, + "rewards/margins": 6.365942001342773, + "rewards/rejected": -3.004735231399536, + "step": 11929 + }, + { + "epoch": 2.98, + "grad_norm": 6.940337657928467, + "learning_rate": 3.5005840034575753e-06, + "logits/chosen": -0.4995160400867462, + "logits/rejected": -0.5752756595611572, + "logps/chosen": -55.01525115966797, + "logps/rejected": -90.01750946044922, + "loss": 0.6804, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.991206407546997, + "rewards/margins": 5.878214359283447, + "rewards/rejected": -2.887007713317871, + "step": 11930 + }, + { + "epoch": 2.98, + "grad_norm": 4.229207515716553, + "learning_rate": 3.4998342086868174e-06, + "logits/chosen": -0.4348607659339905, + "logits/rejected": -0.598077118396759, + "logps/chosen": -66.87007141113281, + "logps/rejected": -87.61563110351562, + "loss": 0.6747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.611969232559204, + "rewards/margins": 6.931118011474609, + "rewards/rejected": -3.3191492557525635, + "step": 11931 + }, + { + "epoch": 2.98, + "grad_norm": 19.63558578491211, + "learning_rate": 3.4990844509867517e-06, + "logits/chosen": -0.43546387553215027, + "logits/rejected": -0.5384697914123535, + "logps/chosen": -55.96527099609375, + "logps/rejected": -81.77299499511719, + "loss": 0.6503, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.856316089630127, + "rewards/margins": 5.606229782104492, + "rewards/rejected": -2.749912977218628, + "step": 11932 + }, + { + "epoch": 2.99, + "grad_norm": 13.82100772857666, + "learning_rate": 3.4983347303759073e-06, + "logits/chosen": -0.5626451969146729, + "logits/rejected": -0.6659320592880249, + "logps/chosen": -57.833099365234375, + "logps/rejected": -97.0426025390625, + "loss": 0.7552, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.90116024017334, + "rewards/margins": 6.4855852127075195, + "rewards/rejected": -3.584425210952759, + "step": 11933 + }, + { + "epoch": 2.99, + "grad_norm": 1.9411652088165283, + "learning_rate": 3.4975850468728077e-06, + "logits/chosen": -0.5914088487625122, + "logits/rejected": -0.6240783333778381, + "logps/chosen": -51.394691467285156, + "logps/rejected": -120.1253890991211, + "loss": 0.5409, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1949832439422607, + "rewards/margins": 7.20351505279541, + "rewards/rejected": -4.0085320472717285, + "step": 11934 + }, + { + "epoch": 2.99, + "grad_norm": 4.111469268798828, + "learning_rate": 3.4968354004959804e-06, + "logits/chosen": -0.5160630941390991, + "logits/rejected": -0.6832612752914429, + "logps/chosen": -56.56947326660156, + "logps/rejected": -95.771484375, + "loss": 0.5877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.086806297302246, + "rewards/margins": 7.381054878234863, + "rewards/rejected": -4.294248104095459, + "step": 11935 + }, + { + "epoch": 2.99, + "grad_norm": 5.389636516571045, + "learning_rate": 3.496085791263948e-06, + "logits/chosen": -0.5687566995620728, + "logits/rejected": -0.6252655386924744, + "logps/chosen": -61.684940338134766, + "logps/rejected": -90.95265197753906, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.205179452896118, + "rewards/margins": 5.937564849853516, + "rewards/rejected": -2.7323856353759766, + "step": 11936 + }, + { + "epoch": 2.99, + "grad_norm": 3.2549662590026855, + "learning_rate": 3.495336219195237e-06, + "logits/chosen": -0.4509649872779846, + "logits/rejected": -0.5566542148590088, + "logps/chosen": -50.784576416015625, + "logps/rejected": -109.57368469238281, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.240565299987793, + "rewards/margins": 6.385157108306885, + "rewards/rejected": -3.144592046737671, + "step": 11937 + }, + { + "epoch": 2.99, + "grad_norm": 5.543984413146973, + "learning_rate": 3.4945866843083674e-06, + "logits/chosen": -0.48009154200553894, + "logits/rejected": -0.6125965118408203, + "logps/chosen": -64.24935913085938, + "logps/rejected": -87.31016540527344, + "loss": 0.6317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.249678134918213, + "rewards/margins": 6.727222919464111, + "rewards/rejected": -3.4775447845458984, + "step": 11938 + }, + { + "epoch": 2.99, + "grad_norm": 9.307259559631348, + "learning_rate": 3.493837186621862e-06, + "logits/chosen": -0.5451968312263489, + "logits/rejected": -0.603769063949585, + "logps/chosen": -48.81397247314453, + "logps/rejected": -95.67395782470703, + "loss": 0.7225, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0241713523864746, + "rewards/margins": 5.826268196105957, + "rewards/rejected": -2.8020966053009033, + "step": 11939 + }, + { + "epoch": 2.99, + "grad_norm": 2.4832122325897217, + "learning_rate": 3.493087726154243e-06, + "logits/chosen": -0.5782954692840576, + "logits/rejected": -0.6762862801551819, + "logps/chosen": -44.674049377441406, + "logps/rejected": -85.87315368652344, + "loss": 0.5689, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00500226020813, + "rewards/margins": 6.158633708953857, + "rewards/rejected": -3.1536309719085693, + "step": 11940 + }, + { + "epoch": 2.99, + "grad_norm": 5.100445747375488, + "learning_rate": 3.4923383029240276e-06, + "logits/chosen": -0.47620126605033875, + "logits/rejected": -0.5411655902862549, + "logps/chosen": -47.462013244628906, + "logps/rejected": -90.71387481689453, + "loss": 0.6128, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1034955978393555, + "rewards/margins": 5.676718711853027, + "rewards/rejected": -2.5732221603393555, + "step": 11941 + }, + { + "epoch": 2.99, + "grad_norm": 8.241829872131348, + "learning_rate": 3.4915889169497363e-06, + "logits/chosen": -0.5105603933334351, + "logits/rejected": -0.582511842250824, + "logps/chosen": -52.75603485107422, + "logps/rejected": -78.06396484375, + "loss": 0.7786, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9991297721862793, + "rewards/margins": 4.8455305099487305, + "rewards/rejected": -1.846400260925293, + "step": 11942 + }, + { + "epoch": 2.99, + "grad_norm": 3.747823715209961, + "learning_rate": 3.4908395682498862e-06, + "logits/chosen": -0.5966640710830688, + "logits/rejected": -0.6551378965377808, + "logps/chosen": -54.25901794433594, + "logps/rejected": -92.60045623779297, + "loss": 0.7014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2402071952819824, + "rewards/margins": 6.199163436889648, + "rewards/rejected": -2.958956241607666, + "step": 11943 + }, + { + "epoch": 2.99, + "grad_norm": 10.388456344604492, + "learning_rate": 3.4900902568429994e-06, + "logits/chosen": -0.5186759233474731, + "logits/rejected": -0.5966249704360962, + "logps/chosen": -56.42627716064453, + "logps/rejected": -107.91732788085938, + "loss": 0.7739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.304964542388916, + "rewards/margins": 6.519519329071045, + "rewards/rejected": -3.214554786682129, + "step": 11944 + }, + { + "epoch": 2.99, + "grad_norm": 5.2362871170043945, + "learning_rate": 3.4893409827475845e-06, + "logits/chosen": -0.46406492590904236, + "logits/rejected": -0.5772413611412048, + "logps/chosen": -68.28363037109375, + "logps/rejected": -89.02445220947266, + "loss": 0.7584, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0272369384765625, + "rewards/margins": 5.627936363220215, + "rewards/rejected": -2.6006991863250732, + "step": 11945 + }, + { + "epoch": 2.99, + "grad_norm": 7.609086036682129, + "learning_rate": 3.488591745982162e-06, + "logits/chosen": -0.4034111499786377, + "logits/rejected": -0.49822354316711426, + "logps/chosen": -70.32151794433594, + "logps/rejected": -94.33031463623047, + "loss": 0.7591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2025821208953857, + "rewards/margins": 5.255733489990234, + "rewards/rejected": -2.0531511306762695, + "step": 11946 + }, + { + "epoch": 2.99, + "grad_norm": 4.957435131072998, + "learning_rate": 3.4878425465652453e-06, + "logits/chosen": -0.4619307518005371, + "logits/rejected": -0.538903534412384, + "logps/chosen": -54.614723205566406, + "logps/rejected": -89.78817749023438, + "loss": 0.7758, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9982197284698486, + "rewards/margins": 6.257053375244141, + "rewards/rejected": -3.258833169937134, + "step": 11947 + }, + { + "epoch": 2.99, + "grad_norm": 6.316638946533203, + "learning_rate": 3.4870933845153486e-06, + "logits/chosen": -0.5224906802177429, + "logits/rejected": -0.6158009767532349, + "logps/chosen": -43.8482780456543, + "logps/rejected": -85.1614990234375, + "loss": 0.5763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6919960975646973, + "rewards/margins": 5.358501434326172, + "rewards/rejected": -2.6665053367614746, + "step": 11948 + }, + { + "epoch": 2.99, + "grad_norm": 3.2636873722076416, + "learning_rate": 3.4863442598509828e-06, + "logits/chosen": -0.46947792172431946, + "logits/rejected": -0.612270176410675, + "logps/chosen": -66.61746215820312, + "logps/rejected": -90.73527526855469, + "loss": 0.6355, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.786440372467041, + "rewards/margins": 6.970986366271973, + "rewards/rejected": -4.184545993804932, + "step": 11949 + }, + { + "epoch": 2.99, + "grad_norm": 10.175992965698242, + "learning_rate": 3.48559517259066e-06, + "logits/chosen": -0.4624096155166626, + "logits/rejected": -0.5561349987983704, + "logps/chosen": -61.112022399902344, + "logps/rejected": -111.58391571044922, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00978946685791, + "rewards/margins": 7.07191801071167, + "rewards/rejected": -4.06212854385376, + "step": 11950 + }, + { + "epoch": 2.99, + "grad_norm": 7.5448994636535645, + "learning_rate": 3.4848461227528923e-06, + "logits/chosen": -0.41900530457496643, + "logits/rejected": -0.47913289070129395, + "logps/chosen": -60.24982833862305, + "logps/rejected": -84.54751586914062, + "loss": 0.8411, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1401400566101074, + "rewards/margins": 4.187215805053711, + "rewards/rejected": -1.0470762252807617, + "step": 11951 + }, + { + "epoch": 2.99, + "grad_norm": 4.794485569000244, + "learning_rate": 3.4840971103561876e-06, + "logits/chosen": -0.469899445772171, + "logits/rejected": -0.5733197927474976, + "logps/chosen": -72.5837631225586, + "logps/rejected": -97.75692749023438, + "loss": 0.7529, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0317041873931885, + "rewards/margins": 5.9923095703125, + "rewards/rejected": -2.960604429244995, + "step": 11952 + }, + { + "epoch": 2.99, + "grad_norm": 5.24991512298584, + "learning_rate": 3.4833481354190556e-06, + "logits/chosen": -0.4475501775741577, + "logits/rejected": -0.503448486328125, + "logps/chosen": -51.69285202026367, + "logps/rejected": -104.55254364013672, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1966755390167236, + "rewards/margins": 5.977821350097656, + "rewards/rejected": -2.7811455726623535, + "step": 11953 + }, + { + "epoch": 2.99, + "grad_norm": 12.514028549194336, + "learning_rate": 3.4825991979600033e-06, + "logits/chosen": -0.5243479013442993, + "logits/rejected": -0.6611044406890869, + "logps/chosen": -60.784461975097656, + "logps/rejected": -109.13471984863281, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.115415573120117, + "rewards/margins": 6.605874061584473, + "rewards/rejected": -3.4904584884643555, + "step": 11954 + }, + { + "epoch": 2.99, + "grad_norm": 6.194401264190674, + "learning_rate": 3.4818502979975423e-06, + "logits/chosen": -0.5551449060440063, + "logits/rejected": -0.6713718175888062, + "logps/chosen": -53.91765594482422, + "logps/rejected": -99.64625549316406, + "loss": 0.5928, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4288442134857178, + "rewards/margins": 6.278633117675781, + "rewards/rejected": -2.8497886657714844, + "step": 11955 + }, + { + "epoch": 2.99, + "grad_norm": 8.445322036743164, + "learning_rate": 3.4811014355501727e-06, + "logits/chosen": -0.5161905288696289, + "logits/rejected": -0.6683309078216553, + "logps/chosen": -68.49500274658203, + "logps/rejected": -84.99214172363281, + "loss": 0.7104, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9523630142211914, + "rewards/margins": 5.701693534851074, + "rewards/rejected": -2.749330997467041, + "step": 11956 + }, + { + "epoch": 2.99, + "grad_norm": 14.513254165649414, + "learning_rate": 3.4803526106364013e-06, + "logits/chosen": -0.5586884617805481, + "logits/rejected": -0.5734034180641174, + "logps/chosen": -50.031768798828125, + "logps/rejected": -97.20855712890625, + "loss": 0.7577, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3769350051879883, + "rewards/margins": 4.681397914886475, + "rewards/rejected": -1.3044629096984863, + "step": 11957 + }, + { + "epoch": 2.99, + "grad_norm": 6.221525192260742, + "learning_rate": 3.4796038232747365e-06, + "logits/chosen": -0.47145581245422363, + "logits/rejected": -0.5613430142402649, + "logps/chosen": -61.45044708251953, + "logps/rejected": -92.66612243652344, + "loss": 0.7352, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1220319271087646, + "rewards/margins": 5.746390342712402, + "rewards/rejected": -2.6243584156036377, + "step": 11958 + }, + { + "epoch": 2.99, + "grad_norm": 9.941762924194336, + "learning_rate": 3.4788550734836747e-06, + "logits/chosen": -0.5155466794967651, + "logits/rejected": -0.5984460711479187, + "logps/chosen": -66.85635375976562, + "logps/rejected": -109.17167663574219, + "loss": 0.7631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8363096714019775, + "rewards/margins": 5.955909729003906, + "rewards/rejected": -3.119600296020508, + "step": 11959 + }, + { + "epoch": 2.99, + "grad_norm": 7.337680816650391, + "learning_rate": 3.4781063612817235e-06, + "logits/chosen": -0.542083740234375, + "logits/rejected": -0.6197217106819153, + "logps/chosen": -52.382755279541016, + "logps/rejected": -106.71815490722656, + "loss": 0.7385, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6433424949645996, + "rewards/margins": 6.886539459228516, + "rewards/rejected": -4.243196964263916, + "step": 11960 + }, + { + "epoch": 2.99, + "grad_norm": 4.386979579925537, + "learning_rate": 3.4773576866873825e-06, + "logits/chosen": -0.4778691232204437, + "logits/rejected": -0.565578818321228, + "logps/chosen": -52.26925277709961, + "logps/rejected": -103.74752807617188, + "loss": 0.6306, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385409116744995, + "rewards/margins": 6.862857818603516, + "rewards/rejected": -3.4774489402770996, + "step": 11961 + }, + { + "epoch": 2.99, + "grad_norm": 12.000866889953613, + "learning_rate": 3.476609049719154e-06, + "logits/chosen": -0.5941945910453796, + "logits/rejected": -0.6997760534286499, + "logps/chosen": -50.06403350830078, + "logps/rejected": -97.6018295288086, + "loss": 0.7171, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.87107253074646, + "rewards/margins": 6.641162872314453, + "rewards/rejected": -3.7700905799865723, + "step": 11962 + }, + { + "epoch": 2.99, + "grad_norm": 2.4512205123901367, + "learning_rate": 3.475860450395535e-06, + "logits/chosen": -0.516732931137085, + "logits/rejected": -0.6092071533203125, + "logps/chosen": -46.98649978637695, + "logps/rejected": -123.0016098022461, + "loss": 0.5696, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4304721355438232, + "rewards/margins": 8.754467010498047, + "rewards/rejected": -5.323995113372803, + "step": 11963 + }, + { + "epoch": 2.99, + "grad_norm": 6.985219478607178, + "learning_rate": 3.4751118887350255e-06, + "logits/chosen": -0.5525826215744019, + "logits/rejected": -0.6196252107620239, + "logps/chosen": -39.86631774902344, + "logps/rejected": -76.70710754394531, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.947157621383667, + "rewards/margins": 4.724730014801025, + "rewards/rejected": -1.7775721549987793, + "step": 11964 + }, + { + "epoch": 2.99, + "grad_norm": 3.153308868408203, + "learning_rate": 3.474363364756125e-06, + "logits/chosen": -0.4285913109779358, + "logits/rejected": -0.5333315134048462, + "logps/chosen": -48.094173431396484, + "logps/rejected": -100.23592376708984, + "loss": 0.5907, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.940676689147949, + "rewards/margins": 6.649735450744629, + "rewards/rejected": -3.709059000015259, + "step": 11965 + }, + { + "epoch": 2.99, + "grad_norm": 2.8698832988739014, + "learning_rate": 3.473614878477326e-06, + "logits/chosen": -0.54412442445755, + "logits/rejected": -0.6551894545555115, + "logps/chosen": -54.487274169921875, + "logps/rejected": -101.26924896240234, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.211663246154785, + "rewards/margins": 6.853920936584473, + "rewards/rejected": -3.64225697517395, + "step": 11966 + }, + { + "epoch": 2.99, + "grad_norm": 4.003058433532715, + "learning_rate": 3.472866429917128e-06, + "logits/chosen": -0.4494086503982544, + "logits/rejected": -0.5505744218826294, + "logps/chosen": -92.65615844726562, + "logps/rejected": -125.19583892822266, + "loss": 0.7848, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0551440715789795, + "rewards/margins": 7.156774997711182, + "rewards/rejected": -4.101630687713623, + "step": 11967 + }, + { + "epoch": 2.99, + "grad_norm": 4.83530855178833, + "learning_rate": 3.4721180190940245e-06, + "logits/chosen": -0.4562302827835083, + "logits/rejected": -0.5031053423881531, + "logps/chosen": -43.46791076660156, + "logps/rejected": -102.76678466796875, + "loss": 0.5751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2690482139587402, + "rewards/margins": 5.956942081451416, + "rewards/rejected": -2.687894344329834, + "step": 11968 + }, + { + "epoch": 2.99, + "grad_norm": 11.048384666442871, + "learning_rate": 3.471369646026511e-06, + "logits/chosen": -0.5019846558570862, + "logits/rejected": -0.6197884678840637, + "logps/chosen": -60.86084747314453, + "logps/rejected": -117.93048095703125, + "loss": 0.8702, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.709501266479492, + "rewards/margins": 5.972797393798828, + "rewards/rejected": -3.263296365737915, + "step": 11969 + }, + { + "epoch": 2.99, + "grad_norm": 4.266377925872803, + "learning_rate": 3.470621310733078e-06, + "logits/chosen": -0.6408615112304688, + "logits/rejected": -0.6776153445243835, + "logps/chosen": -50.63811492919922, + "logps/rejected": -110.68670654296875, + "loss": 0.6411, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9524731636047363, + "rewards/margins": 7.034258842468262, + "rewards/rejected": -4.081785678863525, + "step": 11970 + }, + { + "epoch": 2.99, + "grad_norm": 16.12000274658203, + "learning_rate": 3.469873013232219e-06, + "logits/chosen": -0.5090793967247009, + "logits/rejected": -0.611146092414856, + "logps/chosen": -53.44431686401367, + "logps/rejected": -90.24342346191406, + "loss": 0.705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8044939041137695, + "rewards/margins": 5.568140983581543, + "rewards/rejected": -2.7636466026306152, + "step": 11971 + }, + { + "epoch": 2.99, + "grad_norm": 5.199954509735107, + "learning_rate": 3.4691247535424254e-06, + "logits/chosen": -0.5472140312194824, + "logits/rejected": -0.6264735460281372, + "logps/chosen": -59.20284652709961, + "logps/rejected": -93.37020874023438, + "loss": 0.7543, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.044294834136963, + "rewards/margins": 5.872092247009277, + "rewards/rejected": -2.8277971744537354, + "step": 11972 + }, + { + "epoch": 3.0, + "grad_norm": 8.503995895385742, + "learning_rate": 3.46837653168219e-06, + "logits/chosen": -0.5225685834884644, + "logits/rejected": -0.6531633734703064, + "logps/chosen": -50.626258850097656, + "logps/rejected": -89.00657653808594, + "loss": 0.6251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.869539737701416, + "rewards/margins": 6.783424377441406, + "rewards/rejected": -3.9138851165771484, + "step": 11973 + }, + { + "epoch": 3.0, + "grad_norm": 7.246974468231201, + "learning_rate": 3.4676283476699968e-06, + "logits/chosen": -0.48389750719070435, + "logits/rejected": -0.5568604469299316, + "logps/chosen": -71.66909790039062, + "logps/rejected": -96.4370346069336, + "loss": 0.7985, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.023850917816162, + "rewards/margins": 6.2071661949157715, + "rewards/rejected": -3.1833152770996094, + "step": 11974 + }, + { + "epoch": 3.0, + "grad_norm": 6.140787601470947, + "learning_rate": 3.4668802015243374e-06, + "logits/chosen": -0.4769209325313568, + "logits/rejected": -0.5663347244262695, + "logps/chosen": -57.19293975830078, + "logps/rejected": -104.31834411621094, + "loss": 0.6731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1299214363098145, + "rewards/margins": 6.386584281921387, + "rewards/rejected": -3.256662130355835, + "step": 11975 + }, + { + "epoch": 3.0, + "grad_norm": 4.16580057144165, + "learning_rate": 3.4661320932637e-06, + "logits/chosen": -0.5722052454948425, + "logits/rejected": -0.6529633402824402, + "logps/chosen": -50.128379821777344, + "logps/rejected": -109.04566955566406, + "loss": 0.6422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.365276336669922, + "rewards/margins": 7.188986301422119, + "rewards/rejected": -3.8237099647521973, + "step": 11976 + }, + { + "epoch": 3.0, + "grad_norm": 4.613558292388916, + "learning_rate": 3.4653840229065694e-06, + "logits/chosen": -0.48473483324050903, + "logits/rejected": -0.504414975643158, + "logps/chosen": -55.27898406982422, + "logps/rejected": -120.29855346679688, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188094139099121, + "rewards/margins": 6.9762372970581055, + "rewards/rejected": -3.788142442703247, + "step": 11977 + }, + { + "epoch": 3.0, + "grad_norm": 8.024874687194824, + "learning_rate": 3.464635990471432e-06, + "logits/chosen": -0.45857205986976624, + "logits/rejected": -0.5995764136314392, + "logps/chosen": -65.45474243164062, + "logps/rejected": -80.04354858398438, + "loss": 0.7031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2076759338378906, + "rewards/margins": 6.339414596557617, + "rewards/rejected": -3.1317384243011475, + "step": 11978 + }, + { + "epoch": 3.0, + "grad_norm": 7.184777736663818, + "learning_rate": 3.463887995976772e-06, + "logits/chosen": -0.4918891489505768, + "logits/rejected": -0.6650871634483337, + "logps/chosen": -51.59370422363281, + "logps/rejected": -91.4592514038086, + "loss": 0.6088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0219075679779053, + "rewards/margins": 7.3629655838012695, + "rewards/rejected": -4.341057300567627, + "step": 11979 + }, + { + "epoch": 3.0, + "grad_norm": 3.9864425659179688, + "learning_rate": 3.4631400394410746e-06, + "logits/chosen": -0.5631216764450073, + "logits/rejected": -0.6475462913513184, + "logps/chosen": -51.26063919067383, + "logps/rejected": -99.4453125, + "loss": 0.55, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1455228328704834, + "rewards/margins": 7.589423656463623, + "rewards/rejected": -4.4439005851745605, + "step": 11980 + }, + { + "epoch": 3.0, + "grad_norm": 5.423631191253662, + "learning_rate": 3.4623921208828204e-06, + "logits/chosen": -0.5327025055885315, + "logits/rejected": -0.5987966656684875, + "logps/chosen": -46.96849060058594, + "logps/rejected": -107.5208511352539, + "loss": 0.5947, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.009884834289551, + "rewards/margins": 7.616350173950195, + "rewards/rejected": -4.606465816497803, + "step": 11981 + }, + { + "epoch": 3.0, + "grad_norm": 3.580900192260742, + "learning_rate": 3.461644240320493e-06, + "logits/chosen": -0.5008796453475952, + "logits/rejected": -0.5478032231330872, + "logps/chosen": -51.985748291015625, + "logps/rejected": -102.30732727050781, + "loss": 0.6517, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1330111026763916, + "rewards/margins": 5.297246932983398, + "rewards/rejected": -2.1642355918884277, + "step": 11982 + }, + { + "epoch": 3.0, + "grad_norm": 5.876757621765137, + "learning_rate": 3.460896397772573e-06, + "logits/chosen": -0.5742634534835815, + "logits/rejected": -0.6355602741241455, + "logps/chosen": -48.93506622314453, + "logps/rejected": -99.16797637939453, + "loss": 0.6621, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.003911018371582, + "rewards/margins": 5.805181503295898, + "rewards/rejected": -2.8012702465057373, + "step": 11983 + }, + { + "epoch": 3.0, + "grad_norm": 8.037423133850098, + "learning_rate": 3.4601485932575396e-06, + "logits/chosen": -0.5270553827285767, + "logits/rejected": -0.6218427419662476, + "logps/chosen": -60.23284149169922, + "logps/rejected": -92.51298522949219, + "loss": 0.8116, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.6830861568450928, + "rewards/margins": 5.55880880355835, + "rewards/rejected": -2.875723123550415, + "step": 11984 + }, + { + "epoch": 3.0, + "grad_norm": 4.2330098152160645, + "learning_rate": 3.4594008267938716e-06, + "logits/chosen": -0.49724769592285156, + "logits/rejected": -0.6027093529701233, + "logps/chosen": -62.64087677001953, + "logps/rejected": -91.19495391845703, + "loss": 0.5906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0860865116119385, + "rewards/margins": 6.520495414733887, + "rewards/rejected": -3.4344093799591064, + "step": 11985 + }, + { + "epoch": 3.0, + "grad_norm": 9.688511848449707, + "learning_rate": 3.4586530984000472e-06, + "logits/chosen": -0.4814290702342987, + "logits/rejected": -0.5415067076683044, + "logps/chosen": -49.53290557861328, + "logps/rejected": -91.64395904541016, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1361167430877686, + "rewards/margins": 5.426779747009277, + "rewards/rejected": -2.290663003921509, + "step": 11986 + }, + { + "epoch": 3.0, + "grad_norm": 8.325819969177246, + "learning_rate": 3.457905408094547e-06, + "logits/chosen": -0.5108321309089661, + "logits/rejected": -0.591392993927002, + "logps/chosen": -60.55558776855469, + "logps/rejected": -101.71419525146484, + "loss": 0.7233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.754673480987549, + "rewards/margins": 6.832950592041016, + "rewards/rejected": -4.078276634216309, + "step": 11987 + }, + { + "epoch": 3.0, + "grad_norm": 3.8983287811279297, + "learning_rate": 3.457157755895842e-06, + "logits/chosen": -0.4582030177116394, + "logits/rejected": -0.5592435002326965, + "logps/chosen": -63.06230926513672, + "logps/rejected": -112.16284942626953, + "loss": 0.641, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.820565700531006, + "rewards/margins": 6.884171962738037, + "rewards/rejected": -4.063606262207031, + "step": 11988 + }, + { + "epoch": 3.0, + "grad_norm": 7.824241638183594, + "learning_rate": 3.4564101418224104e-06, + "logits/chosen": -0.5011683702468872, + "logits/rejected": -0.5839461088180542, + "logps/chosen": -58.362144470214844, + "logps/rejected": -106.60777282714844, + "loss": 0.6817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.682490110397339, + "rewards/margins": 6.540230751037598, + "rewards/rejected": -3.8577401638031006, + "step": 11989 + }, + { + "epoch": 3.0, + "grad_norm": 15.783780097961426, + "learning_rate": 3.4556625658927277e-06, + "logits/chosen": -0.45869070291519165, + "logits/rejected": -0.5403732061386108, + "logps/chosen": -59.21726989746094, + "logps/rejected": -108.5936279296875, + "loss": 0.767, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.724442958831787, + "rewards/margins": 5.888360023498535, + "rewards/rejected": -3.1639182567596436, + "step": 11990 + }, + { + "epoch": 3.0, + "grad_norm": 6.203650951385498, + "learning_rate": 3.4549150281252635e-06, + "logits/chosen": -0.6223424673080444, + "logits/rejected": -0.6343888640403748, + "logps/chosen": -76.49851989746094, + "logps/rejected": -86.47779846191406, + "loss": 0.6574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03147292137146, + "rewards/margins": 5.663622856140137, + "rewards/rejected": -2.632150173187256, + "step": 11991 + }, + { + "epoch": 3.0, + "grad_norm": 10.468365669250488, + "learning_rate": 3.454167528538493e-06, + "logits/chosen": -0.4600341320037842, + "logits/rejected": -0.5408562421798706, + "logps/chosen": -66.34964752197266, + "logps/rejected": -93.61341857910156, + "loss": 0.6126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0104575157165527, + "rewards/margins": 5.9902663230896, + "rewards/rejected": -2.979809045791626, + "step": 11992 + }, + { + "epoch": 3.0, + "grad_norm": 3.383474349975586, + "learning_rate": 3.4534200671508865e-06, + "logits/chosen": -0.4755430519580841, + "logits/rejected": -0.5849837064743042, + "logps/chosen": -51.65842056274414, + "logps/rejected": -103.78111267089844, + "loss": 0.56, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1909282207489014, + "rewards/margins": 6.942297458648682, + "rewards/rejected": -3.751368761062622, + "step": 11993 + }, + { + "epoch": 3.0, + "grad_norm": 3.7931270599365234, + "learning_rate": 3.452672643980917e-06, + "logits/chosen": -0.4917072653770447, + "logits/rejected": -0.567311704158783, + "logps/chosen": -52.891441345214844, + "logps/rejected": -97.9801254272461, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.389627456665039, + "rewards/margins": 6.0063090324401855, + "rewards/rejected": -2.6166810989379883, + "step": 11994 + }, + { + "epoch": 3.0, + "grad_norm": 3.934312343597412, + "learning_rate": 3.4519252590470507e-06, + "logits/chosen": -0.4448865056037903, + "logits/rejected": -0.5402888655662537, + "logps/chosen": -51.892311096191406, + "logps/rejected": -115.37810516357422, + "loss": 0.6208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019228219985962, + "rewards/margins": 7.316464900970459, + "rewards/rejected": -4.297236442565918, + "step": 11995 + }, + { + "epoch": 3.0, + "grad_norm": 4.4279890060424805, + "learning_rate": 3.4511779123677584e-06, + "logits/chosen": -0.4371192157268524, + "logits/rejected": -0.5003480315208435, + "logps/chosen": -51.403385162353516, + "logps/rejected": -107.90708923339844, + "loss": 0.5999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.894955635070801, + "rewards/margins": 6.426280975341797, + "rewards/rejected": -3.531325578689575, + "step": 11996 + }, + { + "epoch": 3.0, + "grad_norm": 5.993869304656982, + "learning_rate": 3.4504306039615075e-06, + "logits/chosen": -0.5490013360977173, + "logits/rejected": -0.5996429920196533, + "logps/chosen": -40.070068359375, + "logps/rejected": -101.68363952636719, + "loss": 0.7325, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.4002585411071777, + "rewards/margins": 5.423678398132324, + "rewards/rejected": -2.0234193801879883, + "step": 11997 + }, + { + "epoch": 3.0, + "grad_norm": 2.779118537902832, + "learning_rate": 3.449683333846765e-06, + "logits/chosen": -0.48531609773635864, + "logits/rejected": -0.6212190389633179, + "logps/chosen": -58.974945068359375, + "logps/rejected": -96.24224090576172, + "loss": 0.5817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.32438325881958, + "rewards/margins": 6.800941467285156, + "rewards/rejected": -3.4765586853027344, + "step": 11998 + }, + { + "epoch": 3.0, + "grad_norm": 4.496356964111328, + "learning_rate": 3.4489361020419955e-06, + "logits/chosen": -0.48753729462623596, + "logits/rejected": -0.5908676981925964, + "logps/chosen": -61.720272064208984, + "logps/rejected": -85.78375244140625, + "loss": 0.577, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2826831340789795, + "rewards/margins": 6.294268608093262, + "rewards/rejected": -3.011585235595703, + "step": 11999 + }, + { + "epoch": 3.0, + "grad_norm": 3.65572452545166, + "learning_rate": 3.4481889085656633e-06, + "logits/chosen": -0.4730534255504608, + "logits/rejected": -0.5452291369438171, + "logps/chosen": -54.753726959228516, + "logps/rejected": -115.57198333740234, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2727854251861572, + "rewards/margins": 7.396478652954102, + "rewards/rejected": -4.123693466186523, + "step": 12000 + }, + { + "epoch": 3.0, + "grad_norm": 17.895841598510742, + "learning_rate": 3.447441753436237e-06, + "logits/chosen": -0.45992207527160645, + "logits/rejected": -0.5542000532150269, + "logps/chosen": -64.59992980957031, + "logps/rejected": -102.8243408203125, + "loss": 0.6194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.888659715652466, + "rewards/margins": 6.124166965484619, + "rewards/rejected": -3.2355074882507324, + "step": 12001 + }, + { + "epoch": 3.0, + "grad_norm": 3.6767117977142334, + "learning_rate": 3.4466946366721744e-06, + "logits/chosen": -0.5697183012962341, + "logits/rejected": -0.6065483093261719, + "logps/chosen": -56.6496696472168, + "logps/rejected": -108.32422637939453, + "loss": 0.6618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0550689697265625, + "rewards/margins": 6.056639671325684, + "rewards/rejected": -3.001570224761963, + "step": 12002 + }, + { + "epoch": 3.0, + "grad_norm": 6.570983409881592, + "learning_rate": 3.4459475582919386e-06, + "logits/chosen": -0.550408124923706, + "logits/rejected": -0.5845030546188354, + "logps/chosen": -48.97325134277344, + "logps/rejected": -109.53598022460938, + "loss": 0.6604, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.957672357559204, + "rewards/margins": 6.467869281768799, + "rewards/rejected": -3.5101966857910156, + "step": 12003 + }, + { + "epoch": 3.0, + "grad_norm": 6.847461223602295, + "learning_rate": 3.4452005183139926e-06, + "logits/chosen": -0.49281787872314453, + "logits/rejected": -0.6095677614212036, + "logps/chosen": -43.88399124145508, + "logps/rejected": -99.20307922363281, + "loss": 0.5503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0551180839538574, + "rewards/margins": 7.304177284240723, + "rewards/rejected": -4.249058723449707, + "step": 12004 + }, + { + "epoch": 3.0, + "grad_norm": 2.527604579925537, + "learning_rate": 3.444453516756796e-06, + "logits/chosen": -0.5248300433158875, + "logits/rejected": -0.6641733646392822, + "logps/chosen": -66.1446762084961, + "logps/rejected": -102.87016296386719, + "loss": 0.5724, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.404066801071167, + "rewards/margins": 8.238221168518066, + "rewards/rejected": -4.834153175354004, + "step": 12005 + }, + { + "epoch": 3.0, + "grad_norm": 10.899751663208008, + "learning_rate": 3.443706553638807e-06, + "logits/chosen": -0.5445331335067749, + "logits/rejected": -0.6185799837112427, + "logps/chosen": -60.99113082885742, + "logps/rejected": -131.50103759765625, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1384358406066895, + "rewards/margins": 8.026182174682617, + "rewards/rejected": -4.887746334075928, + "step": 12006 + }, + { + "epoch": 3.0, + "grad_norm": 26.306835174560547, + "learning_rate": 3.442959628978485e-06, + "logits/chosen": -0.6161745190620422, + "logits/rejected": -0.7165992259979248, + "logps/chosen": -48.903053283691406, + "logps/rejected": -87.71519470214844, + "loss": 0.7032, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.971040725708008, + "rewards/margins": 7.083840847015381, + "rewards/rejected": -4.112800121307373, + "step": 12007 + }, + { + "epoch": 3.0, + "grad_norm": 4.559444904327393, + "learning_rate": 3.442212742794287e-06, + "logits/chosen": -0.4682343900203705, + "logits/rejected": -0.5748656392097473, + "logps/chosen": -57.746829986572266, + "logps/rejected": -96.53199768066406, + "loss": 0.6927, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0424747467041016, + "rewards/margins": 6.50154972076416, + "rewards/rejected": -3.4590744972229004, + "step": 12008 + }, + { + "epoch": 3.0, + "grad_norm": 5.332392692565918, + "learning_rate": 3.441465895104668e-06, + "logits/chosen": -0.5624078512191772, + "logits/rejected": -0.6169191598892212, + "logps/chosen": -45.94046401977539, + "logps/rejected": -105.57998657226562, + "loss": 0.6199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3696157932281494, + "rewards/margins": 7.549833297729492, + "rewards/rejected": -4.180217266082764, + "step": 12009 + }, + { + "epoch": 3.0, + "grad_norm": 4.840854644775391, + "learning_rate": 3.4407190859280847e-06, + "logits/chosen": -0.4999268054962158, + "logits/rejected": -0.5820608139038086, + "logps/chosen": -61.787132263183594, + "logps/rejected": -111.80227661132812, + "loss": 0.7218, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9523634910583496, + "rewards/margins": 6.841952800750732, + "rewards/rejected": -3.889589548110962, + "step": 12010 + }, + { + "epoch": 3.0, + "grad_norm": 5.879753112792969, + "learning_rate": 3.4399723152829913e-06, + "logits/chosen": -0.49837303161621094, + "logits/rejected": -0.6126959919929504, + "logps/chosen": -47.9999885559082, + "logps/rejected": -92.33537292480469, + "loss": 0.5574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4060940742492676, + "rewards/margins": 7.152223110198975, + "rewards/rejected": -3.746129035949707, + "step": 12011 + }, + { + "epoch": 3.0, + "grad_norm": 8.863992691040039, + "learning_rate": 3.4392255831878425e-06, + "logits/chosen": -0.5097622871398926, + "logits/rejected": -0.5529319643974304, + "logps/chosen": -64.21424865722656, + "logps/rejected": -98.54427337646484, + "loss": 0.8004, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.632047653198242, + "rewards/margins": 5.337101459503174, + "rewards/rejected": -2.7050535678863525, + "step": 12012 + }, + { + "epoch": 3.01, + "grad_norm": 7.623290538787842, + "learning_rate": 3.438478889661088e-06, + "logits/chosen": -0.549627423286438, + "logits/rejected": -0.6171693205833435, + "logps/chosen": -57.90239715576172, + "logps/rejected": -105.58514404296875, + "loss": 0.7307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.803886890411377, + "rewards/margins": 6.329277038574219, + "rewards/rejected": -3.5253896713256836, + "step": 12013 + }, + { + "epoch": 3.01, + "grad_norm": 9.035504341125488, + "learning_rate": 3.4377322347211808e-06, + "logits/chosen": -0.4243317246437073, + "logits/rejected": -0.4711560606956482, + "logps/chosen": -56.49526596069336, + "logps/rejected": -115.45972442626953, + "loss": 0.752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.113518238067627, + "rewards/margins": 6.242362976074219, + "rewards/rejected": -3.1288442611694336, + "step": 12014 + }, + { + "epoch": 3.01, + "grad_norm": 4.819398880004883, + "learning_rate": 3.436985618386575e-06, + "logits/chosen": -0.5154001712799072, + "logits/rejected": -0.6185154318809509, + "logps/chosen": -55.33393859863281, + "logps/rejected": -89.7853775024414, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2777020931243896, + "rewards/margins": 6.299981594085693, + "rewards/rejected": -3.022279739379883, + "step": 12015 + }, + { + "epoch": 3.01, + "grad_norm": 4.209695339202881, + "learning_rate": 3.436239040675714e-06, + "logits/chosen": -0.6160998940467834, + "logits/rejected": -0.659028172492981, + "logps/chosen": -43.76266860961914, + "logps/rejected": -87.68291473388672, + "loss": 0.6654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.25933575630188, + "rewards/margins": 5.929283142089844, + "rewards/rejected": -2.669947385787964, + "step": 12016 + }, + { + "epoch": 3.01, + "grad_norm": 5.900004863739014, + "learning_rate": 3.435492501607049e-06, + "logits/chosen": -0.5441063046455383, + "logits/rejected": -0.6234589219093323, + "logps/chosen": -59.25497817993164, + "logps/rejected": -93.31001281738281, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0292036533355713, + "rewards/margins": 6.507781028747559, + "rewards/rejected": -3.4785776138305664, + "step": 12017 + }, + { + "epoch": 3.01, + "grad_norm": 3.471158027648926, + "learning_rate": 3.4347460011990297e-06, + "logits/chosen": -0.529159426689148, + "logits/rejected": -0.6432126760482788, + "logps/chosen": -58.10835266113281, + "logps/rejected": -89.42611694335938, + "loss": 0.5448, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0053625106811523, + "rewards/margins": 5.879615783691406, + "rewards/rejected": -2.8742542266845703, + "step": 12018 + }, + { + "epoch": 3.01, + "grad_norm": 10.66499137878418, + "learning_rate": 3.4339995394701022e-06, + "logits/chosen": -0.4742729365825653, + "logits/rejected": -0.5804271101951599, + "logps/chosen": -73.16759490966797, + "logps/rejected": -103.141357421875, + "loss": 0.8176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.863129138946533, + "rewards/margins": 5.946041107177734, + "rewards/rejected": -3.082911491394043, + "step": 12019 + }, + { + "epoch": 3.01, + "grad_norm": 4.3867411613464355, + "learning_rate": 3.4332531164387105e-06, + "logits/chosen": -0.5166563987731934, + "logits/rejected": -0.5935600996017456, + "logps/chosen": -62.06196594238281, + "logps/rejected": -90.50647735595703, + "loss": 0.6818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.896319627761841, + "rewards/margins": 5.417613506317139, + "rewards/rejected": -2.521293878555298, + "step": 12020 + }, + { + "epoch": 3.01, + "grad_norm": 6.4533820152282715, + "learning_rate": 3.4325067321233013e-06, + "logits/chosen": -0.4824289083480835, + "logits/rejected": -0.5508604645729065, + "logps/chosen": -49.95911407470703, + "logps/rejected": -94.59764099121094, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.587690830230713, + "rewards/margins": 6.228060722351074, + "rewards/rejected": -3.6403698921203613, + "step": 12021 + }, + { + "epoch": 3.01, + "grad_norm": 4.933352470397949, + "learning_rate": 3.4317603865423175e-06, + "logits/chosen": -0.45531904697418213, + "logits/rejected": -0.5492667555809021, + "logps/chosen": -77.91563415527344, + "logps/rejected": -96.19967651367188, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0522279739379883, + "rewards/margins": 5.785513401031494, + "rewards/rejected": -2.733285427093506, + "step": 12022 + }, + { + "epoch": 3.01, + "grad_norm": 3.1386077404022217, + "learning_rate": 3.4310140797142044e-06, + "logits/chosen": -0.5086160898208618, + "logits/rejected": -0.6030310392379761, + "logps/chosen": -48.25986862182617, + "logps/rejected": -96.51235961914062, + "loss": 0.5895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4484031200408936, + "rewards/margins": 6.117094993591309, + "rewards/rejected": -2.668691635131836, + "step": 12023 + }, + { + "epoch": 3.01, + "grad_norm": 3.558201313018799, + "learning_rate": 3.4302678116574007e-06, + "logits/chosen": -0.5116309523582458, + "logits/rejected": -0.5956785678863525, + "logps/chosen": -51.446353912353516, + "logps/rejected": -98.74604034423828, + "loss": 0.6278, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.928341865539551, + "rewards/margins": 6.125627040863037, + "rewards/rejected": -3.1972856521606445, + "step": 12024 + }, + { + "epoch": 3.01, + "grad_norm": 8.661516189575195, + "learning_rate": 3.4295215823903493e-06, + "logits/chosen": -0.5065680146217346, + "logits/rejected": -0.5910389423370361, + "logps/chosen": -68.00884246826172, + "logps/rejected": -87.36756134033203, + "loss": 0.7252, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2662243843078613, + "rewards/margins": 5.781827449798584, + "rewards/rejected": -2.5156028270721436, + "step": 12025 + }, + { + "epoch": 3.01, + "grad_norm": 12.348688125610352, + "learning_rate": 3.428775391931491e-06, + "logits/chosen": -0.5146370530128479, + "logits/rejected": -0.6377067565917969, + "logps/chosen": -53.01419448852539, + "logps/rejected": -93.04478454589844, + "loss": 0.7799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7643227577209473, + "rewards/margins": 5.713401794433594, + "rewards/rejected": -2.9490795135498047, + "step": 12026 + }, + { + "epoch": 3.01, + "grad_norm": 4.493387699127197, + "learning_rate": 3.4280292402992633e-06, + "logits/chosen": -0.5706056356430054, + "logits/rejected": -0.6227385401725769, + "logps/chosen": -53.84455871582031, + "logps/rejected": -90.73617553710938, + "loss": 0.6428, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3597490787506104, + "rewards/margins": 5.536487579345703, + "rewards/rejected": -2.176738739013672, + "step": 12027 + }, + { + "epoch": 3.01, + "grad_norm": 5.272102355957031, + "learning_rate": 3.4272831275121044e-06, + "logits/chosen": -0.5379557609558105, + "logits/rejected": -0.6351054310798645, + "logps/chosen": -58.48584747314453, + "logps/rejected": -121.82007598876953, + "loss": 0.6647, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.974766731262207, + "rewards/margins": 7.539729595184326, + "rewards/rejected": -4.5649638175964355, + "step": 12028 + }, + { + "epoch": 3.01, + "grad_norm": 6.201269149780273, + "learning_rate": 3.4265370535884513e-06, + "logits/chosen": -0.5148965120315552, + "logits/rejected": -0.5939940214157104, + "logps/chosen": -51.95608901977539, + "logps/rejected": -94.33392333984375, + "loss": 0.6364, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3721699714660645, + "rewards/margins": 5.988809585571289, + "rewards/rejected": -2.616640090942383, + "step": 12029 + }, + { + "epoch": 3.01, + "grad_norm": 3.4684934616088867, + "learning_rate": 3.4257910185467446e-06, + "logits/chosen": -0.5084196925163269, + "logits/rejected": -0.5975164175033569, + "logps/chosen": -54.72914123535156, + "logps/rejected": -98.83914947509766, + "loss": 0.5752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.940230369567871, + "rewards/margins": 6.0822272300720215, + "rewards/rejected": -3.1419970989227295, + "step": 12030 + }, + { + "epoch": 3.01, + "grad_norm": 3.0090415477752686, + "learning_rate": 3.425045022405413e-06, + "logits/chosen": -0.47889724373817444, + "logits/rejected": -0.5841681361198425, + "logps/chosen": -47.40264129638672, + "logps/rejected": -91.88191223144531, + "loss": 0.5522, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1935689449310303, + "rewards/margins": 6.4628682136535645, + "rewards/rejected": -3.269299030303955, + "step": 12031 + }, + { + "epoch": 3.01, + "grad_norm": 8.36255931854248, + "learning_rate": 3.424299065182895e-06, + "logits/chosen": -0.458453893661499, + "logits/rejected": -0.5962000489234924, + "logps/chosen": -56.875144958496094, + "logps/rejected": -91.78530883789062, + "loss": 0.6257, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8796958923339844, + "rewards/margins": 6.367099761962891, + "rewards/rejected": -3.4874041080474854, + "step": 12032 + }, + { + "epoch": 3.01, + "grad_norm": 3.0729804039001465, + "learning_rate": 3.4235531468976256e-06, + "logits/chosen": -0.5717888474464417, + "logits/rejected": -0.6552993655204773, + "logps/chosen": -46.643096923828125, + "logps/rejected": -83.70066833496094, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2477948665618896, + "rewards/margins": 6.049328327178955, + "rewards/rejected": -2.8015332221984863, + "step": 12033 + }, + { + "epoch": 3.01, + "grad_norm": 5.491885662078857, + "learning_rate": 3.422807267568031e-06, + "logits/chosen": -0.4589725732803345, + "logits/rejected": -0.5583892464637756, + "logps/chosen": -68.93204498291016, + "logps/rejected": -84.19679260253906, + "loss": 0.7055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8110508918762207, + "rewards/margins": 5.527324199676514, + "rewards/rejected": -2.716273069381714, + "step": 12034 + }, + { + "epoch": 3.01, + "grad_norm": 6.43869161605835, + "learning_rate": 3.4220614272125474e-06, + "logits/chosen": -0.5166999697685242, + "logits/rejected": -0.5888429880142212, + "logps/chosen": -49.96080780029297, + "logps/rejected": -87.30960083007812, + "loss": 0.7138, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.038245677947998, + "rewards/margins": 5.298452377319336, + "rewards/rejected": -2.260206699371338, + "step": 12035 + }, + { + "epoch": 3.01, + "grad_norm": 4.991374969482422, + "learning_rate": 3.421315625849605e-06, + "logits/chosen": -0.4333952069282532, + "logits/rejected": -0.5245277285575867, + "logps/chosen": -53.3431510925293, + "logps/rejected": -84.57189178466797, + "loss": 0.6587, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1699092388153076, + "rewards/margins": 6.640748977661133, + "rewards/rejected": -3.470839500427246, + "step": 12036 + }, + { + "epoch": 3.01, + "grad_norm": 1.6273705959320068, + "learning_rate": 3.4205698634976335e-06, + "logits/chosen": -0.5480877757072449, + "logits/rejected": -0.6450909376144409, + "logps/chosen": -52.73519515991211, + "logps/rejected": -98.13914489746094, + "loss": 0.5988, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2805819511413574, + "rewards/margins": 8.106112480163574, + "rewards/rejected": -4.825531482696533, + "step": 12037 + }, + { + "epoch": 3.01, + "grad_norm": 3.96122407913208, + "learning_rate": 3.4198241401750594e-06, + "logits/chosen": -0.48170793056488037, + "logits/rejected": -0.5515522956848145, + "logps/chosen": -47.8415412902832, + "logps/rejected": -97.36328887939453, + "loss": 0.589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3997597694396973, + "rewards/margins": 6.892117500305176, + "rewards/rejected": -3.4923572540283203, + "step": 12038 + }, + { + "epoch": 3.01, + "grad_norm": 3.55049729347229, + "learning_rate": 3.4190784559003116e-06, + "logits/chosen": -0.45322346687316895, + "logits/rejected": -0.5765719413757324, + "logps/chosen": -65.5404052734375, + "logps/rejected": -94.56500244140625, + "loss": 0.6013, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.170133113861084, + "rewards/margins": 6.780292987823486, + "rewards/rejected": -3.6101603507995605, + "step": 12039 + }, + { + "epoch": 3.01, + "grad_norm": 3.8414957523345947, + "learning_rate": 3.4183328106918177e-06, + "logits/chosen": -0.49568119645118713, + "logits/rejected": -0.6044544577598572, + "logps/chosen": -57.24369430541992, + "logps/rejected": -92.40119171142578, + "loss": 0.5416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.126451253890991, + "rewards/margins": 6.519774913787842, + "rewards/rejected": -3.393324375152588, + "step": 12040 + }, + { + "epoch": 3.01, + "grad_norm": 5.037481784820557, + "learning_rate": 3.4175872045680015e-06, + "logits/chosen": -0.48195260763168335, + "logits/rejected": -0.5588167905807495, + "logps/chosen": -50.26613235473633, + "logps/rejected": -90.02218627929688, + "loss": 0.6234, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1891579627990723, + "rewards/margins": 5.753474235534668, + "rewards/rejected": -2.5643162727355957, + "step": 12041 + }, + { + "epoch": 3.01, + "grad_norm": 18.603349685668945, + "learning_rate": 3.416841637547288e-06, + "logits/chosen": -0.48598283529281616, + "logits/rejected": -0.5131270885467529, + "logps/chosen": -48.87564468383789, + "logps/rejected": -111.49687957763672, + "loss": 0.619, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.846696376800537, + "rewards/margins": 6.233026504516602, + "rewards/rejected": -3.386329412460327, + "step": 12042 + }, + { + "epoch": 3.01, + "grad_norm": 3.310272216796875, + "learning_rate": 3.4160961096481017e-06, + "logits/chosen": -0.5341359972953796, + "logits/rejected": -0.6067841053009033, + "logps/chosen": -54.00977325439453, + "logps/rejected": -104.70899963378906, + "loss": 0.6651, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9188308715820312, + "rewards/margins": 6.59935188293457, + "rewards/rejected": -3.68052077293396, + "step": 12043 + }, + { + "epoch": 3.01, + "grad_norm": 4.179055690765381, + "learning_rate": 3.415350620888867e-06, + "logits/chosen": -0.5343388915061951, + "logits/rejected": -0.5727104544639587, + "logps/chosen": -43.216670989990234, + "logps/rejected": -91.20254516601562, + "loss": 0.6268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.091050148010254, + "rewards/margins": 6.265657901763916, + "rewards/rejected": -3.174607992172241, + "step": 12044 + }, + { + "epoch": 3.01, + "grad_norm": 7.590217113494873, + "learning_rate": 3.4146051712880023e-06, + "logits/chosen": -0.5401569604873657, + "logits/rejected": -0.6453770399093628, + "logps/chosen": -59.86668014526367, + "logps/rejected": -98.0533218383789, + "loss": 0.6759, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.891038417816162, + "rewards/margins": 6.263323783874512, + "rewards/rejected": -3.3722853660583496, + "step": 12045 + }, + { + "epoch": 3.01, + "grad_norm": 4.669960975646973, + "learning_rate": 3.4138597608639297e-06, + "logits/chosen": -0.3812573254108429, + "logits/rejected": -0.4874010384082794, + "logps/chosen": -53.39984130859375, + "logps/rejected": -95.0055160522461, + "loss": 0.5674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0311975479125977, + "rewards/margins": 6.320802688598633, + "rewards/rejected": -3.289604902267456, + "step": 12046 + }, + { + "epoch": 3.01, + "grad_norm": 7.753361225128174, + "learning_rate": 3.4131143896350723e-06, + "logits/chosen": -0.5218689441680908, + "logits/rejected": -0.6050474047660828, + "logps/chosen": -57.6159782409668, + "logps/rejected": -99.19337463378906, + "loss": 0.7356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8173820972442627, + "rewards/margins": 6.918679714202881, + "rewards/rejected": -4.101297855377197, + "step": 12047 + }, + { + "epoch": 3.01, + "grad_norm": 2.245112419128418, + "learning_rate": 3.412369057619843e-06, + "logits/chosen": -0.49127712845802307, + "logits/rejected": -0.6013538837432861, + "logps/chosen": -50.0711669921875, + "logps/rejected": -107.31554412841797, + "loss": 0.5467, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3560500144958496, + "rewards/margins": 8.437461853027344, + "rewards/rejected": -5.081411838531494, + "step": 12048 + }, + { + "epoch": 3.01, + "grad_norm": 5.856493949890137, + "learning_rate": 3.411623764836664e-06, + "logits/chosen": -0.4754894971847534, + "logits/rejected": -0.5581363439559937, + "logps/chosen": -54.050201416015625, + "logps/rejected": -92.45748901367188, + "loss": 0.6636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0574841499328613, + "rewards/margins": 5.247365951538086, + "rewards/rejected": -2.1898820400238037, + "step": 12049 + }, + { + "epoch": 3.01, + "grad_norm": 6.287380695343018, + "learning_rate": 3.410878511303951e-06, + "logits/chosen": -0.5152876377105713, + "logits/rejected": -0.5644485354423523, + "logps/chosen": -63.4885368347168, + "logps/rejected": -106.54763793945312, + "loss": 0.702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0041747093200684, + "rewards/margins": 5.603581428527832, + "rewards/rejected": -2.5994067192077637, + "step": 12050 + }, + { + "epoch": 3.01, + "grad_norm": 4.263055324554443, + "learning_rate": 3.410133297040122e-06, + "logits/chosen": -0.53623366355896, + "logits/rejected": -0.621531069278717, + "logps/chosen": -62.13784408569336, + "logps/rejected": -98.03141784667969, + "loss": 0.6833, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8047075271606445, + "rewards/margins": 6.267032623291016, + "rewards/rejected": -3.462325096130371, + "step": 12051 + }, + { + "epoch": 3.01, + "grad_norm": 3.3194291591644287, + "learning_rate": 3.4093881220635883e-06, + "logits/chosen": -0.458673894405365, + "logits/rejected": -0.5130248069763184, + "logps/chosen": -52.212440490722656, + "logps/rejected": -97.10835266113281, + "loss": 0.6062, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1834683418273926, + "rewards/margins": 6.670988082885742, + "rewards/rejected": -3.4875190258026123, + "step": 12052 + }, + { + "epoch": 3.02, + "grad_norm": 8.282682418823242, + "learning_rate": 3.4086429863927668e-06, + "logits/chosen": -0.39127877354621887, + "logits/rejected": -0.5080564618110657, + "logps/chosen": -58.502681732177734, + "logps/rejected": -84.9872055053711, + "loss": 0.6369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.097576141357422, + "rewards/margins": 6.903992652893066, + "rewards/rejected": -3.8064169883728027, + "step": 12053 + }, + { + "epoch": 3.02, + "grad_norm": 1.9763295650482178, + "learning_rate": 3.407897890046069e-06, + "logits/chosen": -0.5238258838653564, + "logits/rejected": -0.6547142267227173, + "logps/chosen": -76.26951599121094, + "logps/rejected": -122.01932525634766, + "loss": 0.6122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.335744857788086, + "rewards/margins": 8.773435592651367, + "rewards/rejected": -5.4376912117004395, + "step": 12054 + }, + { + "epoch": 3.02, + "grad_norm": 4.137777805328369, + "learning_rate": 3.4071528330419096e-06, + "logits/chosen": -0.4792076349258423, + "logits/rejected": -0.5323024392127991, + "logps/chosen": -57.215301513671875, + "logps/rejected": -100.51741027832031, + "loss": 0.6403, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.328479766845703, + "rewards/margins": 6.30851936340332, + "rewards/rejected": -2.980039596557617, + "step": 12055 + }, + { + "epoch": 3.02, + "grad_norm": 6.05151891708374, + "learning_rate": 3.406407815398697e-06, + "logits/chosen": -0.5046035051345825, + "logits/rejected": -0.5954827666282654, + "logps/chosen": -56.150291442871094, + "logps/rejected": -112.74089813232422, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.762691020965576, + "rewards/margins": 7.95241641998291, + "rewards/rejected": -5.189724922180176, + "step": 12056 + }, + { + "epoch": 3.02, + "grad_norm": 2.7913050651550293, + "learning_rate": 3.405662837134841e-06, + "logits/chosen": -0.5157715678215027, + "logits/rejected": -0.6070241332054138, + "logps/chosen": -49.2972297668457, + "logps/rejected": -95.08308410644531, + "loss": 0.6415, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.089920997619629, + "rewards/margins": 6.434539794921875, + "rewards/rejected": -3.344619035720825, + "step": 12057 + }, + { + "epoch": 3.02, + "grad_norm": 5.291675090789795, + "learning_rate": 3.404917898268755e-06, + "logits/chosen": -0.5528563857078552, + "logits/rejected": -0.6394347548484802, + "logps/chosen": -50.896461486816406, + "logps/rejected": -104.2227554321289, + "loss": 0.5459, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.338853359222412, + "rewards/margins": 7.4062910079956055, + "rewards/rejected": -4.067437171936035, + "step": 12058 + }, + { + "epoch": 3.02, + "grad_norm": 4.505798816680908, + "learning_rate": 3.4041729988188416e-06, + "logits/chosen": -0.43127623200416565, + "logits/rejected": -0.5199087858200073, + "logps/chosen": -55.68510818481445, + "logps/rejected": -105.11283874511719, + "loss": 0.6148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.303410291671753, + "rewards/margins": 6.435144424438477, + "rewards/rejected": -3.1317343711853027, + "step": 12059 + }, + { + "epoch": 3.02, + "grad_norm": 5.528514385223389, + "learning_rate": 3.403428138803511e-06, + "logits/chosen": -0.5346738696098328, + "logits/rejected": -0.6419832110404968, + "logps/chosen": -52.89246368408203, + "logps/rejected": -105.89775085449219, + "loss": 0.6425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9679558277130127, + "rewards/margins": 7.078328609466553, + "rewards/rejected": -4.110372543334961, + "step": 12060 + }, + { + "epoch": 3.02, + "grad_norm": 2.0878958702087402, + "learning_rate": 3.402683318241169e-06, + "logits/chosen": -0.4551001191139221, + "logits/rejected": -0.5900996327400208, + "logps/chosen": -41.64069366455078, + "logps/rejected": -92.7608642578125, + "loss": 0.527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.324603319168091, + "rewards/margins": 7.757748126983643, + "rewards/rejected": -4.433144569396973, + "step": 12061 + }, + { + "epoch": 3.02, + "grad_norm": 4.752833843231201, + "learning_rate": 3.4019385371502246e-06, + "logits/chosen": -0.45426633954048157, + "logits/rejected": -0.513545036315918, + "logps/chosen": -63.37470245361328, + "logps/rejected": -115.17239379882812, + "loss": 0.749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.197366714477539, + "rewards/margins": 6.197520732879639, + "rewards/rejected": -3.0001540184020996, + "step": 12062 + }, + { + "epoch": 3.02, + "grad_norm": 5.814483165740967, + "learning_rate": 3.401193795549075e-06, + "logits/chosen": -0.5349786281585693, + "logits/rejected": -0.5998352766036987, + "logps/chosen": -49.645137786865234, + "logps/rejected": -102.70650482177734, + "loss": 0.6015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0278515815734863, + "rewards/margins": 6.369129180908203, + "rewards/rejected": -3.341277599334717, + "step": 12063 + }, + { + "epoch": 3.02, + "grad_norm": 5.904261589050293, + "learning_rate": 3.400449093456128e-06, + "logits/chosen": -0.4424106180667877, + "logits/rejected": -0.5524998903274536, + "logps/chosen": -50.27755355834961, + "logps/rejected": -76.91313171386719, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1604225635528564, + "rewards/margins": 5.863224983215332, + "rewards/rejected": -2.702802896499634, + "step": 12064 + }, + { + "epoch": 3.02, + "grad_norm": 5.245514392852783, + "learning_rate": 3.3997044308897865e-06, + "logits/chosen": -0.47335946559906006, + "logits/rejected": -0.5498266220092773, + "logps/chosen": -55.99675750732422, + "logps/rejected": -95.3412094116211, + "loss": 0.6171, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.183927297592163, + "rewards/margins": 5.620362281799316, + "rewards/rejected": -2.436434507369995, + "step": 12065 + }, + { + "epoch": 3.02, + "grad_norm": 3.9292643070220947, + "learning_rate": 3.3989598078684497e-06, + "logits/chosen": -0.5762845277786255, + "logits/rejected": -0.6872739195823669, + "logps/chosen": -60.077945709228516, + "logps/rejected": -97.83477020263672, + "loss": 0.6629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.022164821624756, + "rewards/margins": 6.854730606079102, + "rewards/rejected": -3.832566499710083, + "step": 12066 + }, + { + "epoch": 3.02, + "grad_norm": 4.530391693115234, + "learning_rate": 3.3982152244105193e-06, + "logits/chosen": -0.4973754584789276, + "logits/rejected": -0.590562641620636, + "logps/chosen": -58.601036071777344, + "logps/rejected": -99.1520004272461, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4339728355407715, + "rewards/margins": 5.956298351287842, + "rewards/rejected": -2.5223255157470703, + "step": 12067 + }, + { + "epoch": 3.02, + "grad_norm": 5.198118209838867, + "learning_rate": 3.397470680534394e-06, + "logits/chosen": -0.5376113057136536, + "logits/rejected": -0.6083669662475586, + "logps/chosen": -48.953147888183594, + "logps/rejected": -101.24411010742188, + "loss": 0.6706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.083566427230835, + "rewards/margins": 5.9446210861206055, + "rewards/rejected": -2.8610548973083496, + "step": 12068 + }, + { + "epoch": 3.02, + "grad_norm": 5.230793476104736, + "learning_rate": 3.3967261762584735e-06, + "logits/chosen": -0.4629395306110382, + "logits/rejected": -0.566128671169281, + "logps/chosen": -59.01478958129883, + "logps/rejected": -91.96430969238281, + "loss": 0.7084, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.170947551727295, + "rewards/margins": 5.98974084854126, + "rewards/rejected": -2.818794012069702, + "step": 12069 + }, + { + "epoch": 3.02, + "grad_norm": 5.487985134124756, + "learning_rate": 3.395981711601154e-06, + "logits/chosen": -0.5578297972679138, + "logits/rejected": -0.635732114315033, + "logps/chosen": -53.488189697265625, + "logps/rejected": -106.34004974365234, + "loss": 0.6849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9193320274353027, + "rewards/margins": 6.315683364868164, + "rewards/rejected": -3.3963515758514404, + "step": 12070 + }, + { + "epoch": 3.02, + "grad_norm": 8.149798393249512, + "learning_rate": 3.3952372865808326e-06, + "logits/chosen": -0.48997893929481506, + "logits/rejected": -0.5272774696350098, + "logps/chosen": -60.4280891418457, + "logps/rejected": -89.96759796142578, + "loss": 0.7376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.715721607208252, + "rewards/margins": 5.094869613647461, + "rewards/rejected": -2.379148006439209, + "step": 12071 + }, + { + "epoch": 3.02, + "grad_norm": 6.441179275512695, + "learning_rate": 3.3944929012159056e-06, + "logits/chosen": -0.574776828289032, + "logits/rejected": -0.6450940370559692, + "logps/chosen": -52.01062774658203, + "logps/rejected": -115.68728637695312, + "loss": 0.6187, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095208168029785, + "rewards/margins": 6.8512959480285645, + "rewards/rejected": -3.7560882568359375, + "step": 12072 + }, + { + "epoch": 3.02, + "grad_norm": 4.434639930725098, + "learning_rate": 3.3937485555247664e-06, + "logits/chosen": -0.4992220401763916, + "logits/rejected": -0.581496000289917, + "logps/chosen": -53.60608673095703, + "logps/rejected": -96.89762878417969, + "loss": 0.6124, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.096797466278076, + "rewards/margins": 6.659553527832031, + "rewards/rejected": -3.562756061553955, + "step": 12073 + }, + { + "epoch": 3.02, + "grad_norm": 5.005081653594971, + "learning_rate": 3.3930042495258063e-06, + "logits/chosen": -0.4306410849094391, + "logits/rejected": -0.5261853933334351, + "logps/chosen": -60.31821060180664, + "logps/rejected": -103.07219696044922, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1677987575531006, + "rewards/margins": 6.064470291137695, + "rewards/rejected": -2.8966715335845947, + "step": 12074 + }, + { + "epoch": 3.02, + "grad_norm": 7.564104080200195, + "learning_rate": 3.3922599832374224e-06, + "logits/chosen": -0.47132810950279236, + "logits/rejected": -0.5429771542549133, + "logps/chosen": -56.456050872802734, + "logps/rejected": -98.33753967285156, + "loss": 0.6663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.945589303970337, + "rewards/margins": 6.357151985168457, + "rewards/rejected": -3.411562442779541, + "step": 12075 + }, + { + "epoch": 3.02, + "grad_norm": 4.144740104675293, + "learning_rate": 3.3915157566780067e-06, + "logits/chosen": -0.4998737871646881, + "logits/rejected": -0.5545727610588074, + "logps/chosen": -46.058685302734375, + "logps/rejected": -103.08963012695312, + "loss": 0.5316, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.132242441177368, + "rewards/margins": 6.877432346343994, + "rewards/rejected": -3.745189666748047, + "step": 12076 + }, + { + "epoch": 3.02, + "grad_norm": 6.551277160644531, + "learning_rate": 3.3907715698659437e-06, + "logits/chosen": -0.4334331154823303, + "logits/rejected": -0.5423101186752319, + "logps/chosen": -47.70729446411133, + "logps/rejected": -81.85111999511719, + "loss": 0.6011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.865480899810791, + "rewards/margins": 6.398187637329102, + "rewards/rejected": -3.532707691192627, + "step": 12077 + }, + { + "epoch": 3.02, + "grad_norm": 5.3676371574401855, + "learning_rate": 3.3900274228196284e-06, + "logits/chosen": -0.5361263155937195, + "logits/rejected": -0.5603072643280029, + "logps/chosen": -44.100460052490234, + "logps/rejected": -101.58264923095703, + "loss": 0.6342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1955642700195312, + "rewards/margins": 5.768661975860596, + "rewards/rejected": -2.5730979442596436, + "step": 12078 + }, + { + "epoch": 3.02, + "grad_norm": 5.065428256988525, + "learning_rate": 3.3892833155574477e-06, + "logits/chosen": -0.44434118270874023, + "logits/rejected": -0.5587490200996399, + "logps/chosen": -59.44397735595703, + "logps/rejected": -97.66070556640625, + "loss": 0.6695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8767919540405273, + "rewards/margins": 6.124664783477783, + "rewards/rejected": -3.247873067855835, + "step": 12079 + }, + { + "epoch": 3.02, + "grad_norm": 3.453632116317749, + "learning_rate": 3.3885392480977907e-06, + "logits/chosen": -0.543837308883667, + "logits/rejected": -0.5740203857421875, + "logps/chosen": -44.22440719604492, + "logps/rejected": -116.98152160644531, + "loss": 0.5547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.486743450164795, + "rewards/margins": 7.151924133300781, + "rewards/rejected": -3.665179491043091, + "step": 12080 + }, + { + "epoch": 3.02, + "grad_norm": 3.821854591369629, + "learning_rate": 3.387795220459042e-06, + "logits/chosen": -0.6150844097137451, + "logits/rejected": -0.6995126605033875, + "logps/chosen": -51.07617950439453, + "logps/rejected": -93.28213500976562, + "loss": 0.657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0484912395477295, + "rewards/margins": 7.017977237701416, + "rewards/rejected": -3.9694857597351074, + "step": 12081 + }, + { + "epoch": 3.02, + "grad_norm": 5.310894966125488, + "learning_rate": 3.387051232659588e-06, + "logits/chosen": -0.5554956197738647, + "logits/rejected": -0.6463629007339478, + "logps/chosen": -55.378143310546875, + "logps/rejected": -90.80260467529297, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1660776138305664, + "rewards/margins": 6.7039875984191895, + "rewards/rejected": -3.537909746170044, + "step": 12082 + }, + { + "epoch": 3.02, + "grad_norm": 4.973193645477295, + "learning_rate": 3.3863072847178145e-06, + "logits/chosen": -0.5501211285591125, + "logits/rejected": -0.5886275768280029, + "logps/chosen": -45.364498138427734, + "logps/rejected": -96.03446197509766, + "loss": 0.6823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.872398614883423, + "rewards/margins": 6.299994945526123, + "rewards/rejected": -3.4275965690612793, + "step": 12083 + }, + { + "epoch": 3.02, + "grad_norm": 8.156102180480957, + "learning_rate": 3.3855633766521035e-06, + "logits/chosen": -0.5560373067855835, + "logits/rejected": -0.6211581230163574, + "logps/chosen": -46.85879135131836, + "logps/rejected": -101.75688171386719, + "loss": 0.658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2531962394714355, + "rewards/margins": 6.845112323760986, + "rewards/rejected": -3.5919153690338135, + "step": 12084 + }, + { + "epoch": 3.02, + "grad_norm": 4.833631992340088, + "learning_rate": 3.384819508480839e-06, + "logits/chosen": -0.5032578110694885, + "logits/rejected": -0.6081928610801697, + "logps/chosen": -48.82685089111328, + "logps/rejected": -93.00699615478516, + "loss": 0.6124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9038643836975098, + "rewards/margins": 6.899170875549316, + "rewards/rejected": -3.9953064918518066, + "step": 12085 + }, + { + "epoch": 3.02, + "grad_norm": 4.747791767120361, + "learning_rate": 3.3840756802224006e-06, + "logits/chosen": -0.5740618705749512, + "logits/rejected": -0.6466838121414185, + "logps/chosen": -52.20192337036133, + "logps/rejected": -96.72529602050781, + "loss": 0.6971, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.102294445037842, + "rewards/margins": 5.44495964050293, + "rewards/rejected": -2.342665195465088, + "step": 12086 + }, + { + "epoch": 3.02, + "grad_norm": 3.7742183208465576, + "learning_rate": 3.3833318918951754e-06, + "logits/chosen": -0.5123407244682312, + "logits/rejected": -0.6033502817153931, + "logps/chosen": -53.37652587890625, + "logps/rejected": -102.79554748535156, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1474170684814453, + "rewards/margins": 6.895528793334961, + "rewards/rejected": -3.7481119632720947, + "step": 12087 + }, + { + "epoch": 3.02, + "grad_norm": 5.725071907043457, + "learning_rate": 3.3825881435175358e-06, + "logits/chosen": -0.5240981578826904, + "logits/rejected": -0.6053560376167297, + "logps/chosen": -60.73186492919922, + "logps/rejected": -101.06401062011719, + "loss": 0.6784, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.935213565826416, + "rewards/margins": 5.957712173461914, + "rewards/rejected": -3.022498369216919, + "step": 12088 + }, + { + "epoch": 3.02, + "grad_norm": 2.9113540649414062, + "learning_rate": 3.3818444351078615e-06, + "logits/chosen": -0.5853829383850098, + "logits/rejected": -0.6719546914100647, + "logps/chosen": -56.392173767089844, + "logps/rejected": -81.41365051269531, + "loss": 0.5956, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0768256187438965, + "rewards/margins": 6.3631696701049805, + "rewards/rejected": -3.286343574523926, + "step": 12089 + }, + { + "epoch": 3.02, + "grad_norm": 8.327205657958984, + "learning_rate": 3.381100766684537e-06, + "logits/chosen": -0.5762715339660645, + "logits/rejected": -0.5987541675567627, + "logps/chosen": -49.567134857177734, + "logps/rejected": -110.20002746582031, + "loss": 0.7198, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1916935443878174, + "rewards/margins": 6.17195987701416, + "rewards/rejected": -2.980266571044922, + "step": 12090 + }, + { + "epoch": 3.02, + "grad_norm": 2.116286516189575, + "learning_rate": 3.3803571382659307e-06, + "logits/chosen": -0.5183246731758118, + "logits/rejected": -0.5966293811798096, + "logps/chosen": -50.03603744506836, + "logps/rejected": -100.6183853149414, + "loss": 0.5217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1616742610931396, + "rewards/margins": 7.305260181427002, + "rewards/rejected": -4.143585205078125, + "step": 12091 + }, + { + "epoch": 3.02, + "grad_norm": 3.5314011573791504, + "learning_rate": 3.3796135498704235e-06, + "logits/chosen": -0.5250809192657471, + "logits/rejected": -0.5509217381477356, + "logps/chosen": -53.93571853637695, + "logps/rejected": -116.8588638305664, + "loss": 0.5971, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135924816131592, + "rewards/margins": 6.191354751586914, + "rewards/rejected": -3.0554299354553223, + "step": 12092 + }, + { + "epoch": 3.03, + "grad_norm": 12.805760383605957, + "learning_rate": 3.378870001516389e-06, + "logits/chosen": -0.471981018781662, + "logits/rejected": -0.5424752831459045, + "logps/chosen": -59.41283416748047, + "logps/rejected": -98.47388458251953, + "loss": 0.7816, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8340790271759033, + "rewards/margins": 5.822606563568115, + "rewards/rejected": -2.988527536392212, + "step": 12093 + }, + { + "epoch": 3.03, + "grad_norm": 6.10274600982666, + "learning_rate": 3.378126493222202e-06, + "logits/chosen": -0.5090974569320679, + "logits/rejected": -0.5740013718605042, + "logps/chosen": -60.43791198730469, + "logps/rejected": -99.09576416015625, + "loss": 0.6795, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.207916736602783, + "rewards/margins": 6.46917200088501, + "rewards/rejected": -3.2612550258636475, + "step": 12094 + }, + { + "epoch": 3.03, + "grad_norm": 5.8755364418029785, + "learning_rate": 3.3773830250062335e-06, + "logits/chosen": -0.5176470875740051, + "logits/rejected": -0.6167611479759216, + "logps/chosen": -59.740936279296875, + "logps/rejected": -98.38931274414062, + "loss": 0.6734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0774083137512207, + "rewards/margins": 5.994588375091553, + "rewards/rejected": -2.9171793460845947, + "step": 12095 + }, + { + "epoch": 3.03, + "grad_norm": 2.7056074142456055, + "learning_rate": 3.376639596886856e-06, + "logits/chosen": -0.5188334584236145, + "logits/rejected": -0.58050537109375, + "logps/chosen": -53.14665603637695, + "logps/rejected": -109.93646240234375, + "loss": 0.6133, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0780656337738037, + "rewards/margins": 6.8495306968688965, + "rewards/rejected": -3.77146577835083, + "step": 12096 + }, + { + "epoch": 3.03, + "grad_norm": 4.338569641113281, + "learning_rate": 3.3758962088824423e-06, + "logits/chosen": -0.5402213335037231, + "logits/rejected": -0.6407427191734314, + "logps/chosen": -57.550785064697266, + "logps/rejected": -103.47899627685547, + "loss": 0.6443, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.83886456489563, + "rewards/margins": 7.376035690307617, + "rewards/rejected": -4.537171363830566, + "step": 12097 + }, + { + "epoch": 3.03, + "grad_norm": 8.634783744812012, + "learning_rate": 3.3751528610113594e-06, + "logits/chosen": -0.5731143951416016, + "logits/rejected": -0.6665156483650208, + "logps/chosen": -56.16865539550781, + "logps/rejected": -123.60476684570312, + "loss": 0.6867, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1578359603881836, + "rewards/margins": 8.003440856933594, + "rewards/rejected": -4.845605850219727, + "step": 12098 + }, + { + "epoch": 3.03, + "grad_norm": 3.6717562675476074, + "learning_rate": 3.3744095532919774e-06, + "logits/chosen": -0.5089654326438904, + "logits/rejected": -0.569251298904419, + "logps/chosen": -70.20355987548828, + "logps/rejected": -110.00314331054688, + "loss": 0.6534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8822813034057617, + "rewards/margins": 6.8125200271606445, + "rewards/rejected": -3.930239200592041, + "step": 12099 + }, + { + "epoch": 3.03, + "grad_norm": 5.17352819442749, + "learning_rate": 3.373666285742665e-06, + "logits/chosen": -0.5683600902557373, + "logits/rejected": -0.6591309905052185, + "logps/chosen": -58.7728157043457, + "logps/rejected": -102.77156829833984, + "loss": 0.6481, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.854919672012329, + "rewards/margins": 6.069600582122803, + "rewards/rejected": -3.2146811485290527, + "step": 12100 + }, + { + "epoch": 3.03, + "grad_norm": 5.792688846588135, + "learning_rate": 3.372923058381789e-06, + "logits/chosen": -0.5740984678268433, + "logits/rejected": -0.6236461997032166, + "logps/chosen": -40.934478759765625, + "logps/rejected": -95.21898651123047, + "loss": 0.6045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.085489511489868, + "rewards/margins": 6.205366611480713, + "rewards/rejected": -3.119877338409424, + "step": 12101 + }, + { + "epoch": 3.03, + "grad_norm": 6.395425319671631, + "learning_rate": 3.3721798712277142e-06, + "logits/chosen": -0.47010713815689087, + "logits/rejected": -0.5342737436294556, + "logps/chosen": -61.84353256225586, + "logps/rejected": -120.28189086914062, + "loss": 0.6061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.040797472000122, + "rewards/margins": 7.4574737548828125, + "rewards/rejected": -4.416676044464111, + "step": 12102 + }, + { + "epoch": 3.03, + "grad_norm": 2.6086843013763428, + "learning_rate": 3.371436724298805e-06, + "logits/chosen": -0.6167047619819641, + "logits/rejected": -0.6910576820373535, + "logps/chosen": -51.31241226196289, + "logps/rejected": -106.48033142089844, + "loss": 0.5779, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.051882028579712, + "rewards/margins": 7.323184013366699, + "rewards/rejected": -4.271301746368408, + "step": 12103 + }, + { + "epoch": 3.03, + "grad_norm": 6.092864513397217, + "learning_rate": 3.3706936176134274e-06, + "logits/chosen": -0.5564197897911072, + "logits/rejected": -0.6192424297332764, + "logps/chosen": -64.668212890625, + "logps/rejected": -131.34158325195312, + "loss": 0.6509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9452226161956787, + "rewards/margins": 8.45284366607666, + "rewards/rejected": -5.5076212882995605, + "step": 12104 + }, + { + "epoch": 3.03, + "grad_norm": 5.974093914031982, + "learning_rate": 3.3699505511899457e-06, + "logits/chosen": -0.5414336919784546, + "logits/rejected": -0.6212124228477478, + "logps/chosen": -57.67848205566406, + "logps/rejected": -92.61732482910156, + "loss": 0.6661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.174525022506714, + "rewards/margins": 5.639344692230225, + "rewards/rejected": -2.4648194313049316, + "step": 12105 + }, + { + "epoch": 3.03, + "grad_norm": 5.133411884307861, + "learning_rate": 3.369207525046717e-06, + "logits/chosen": -0.5033877491950989, + "logits/rejected": -0.5874547958374023, + "logps/chosen": -59.076297760009766, + "logps/rejected": -104.36735534667969, + "loss": 0.6345, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8941292762756348, + "rewards/margins": 6.044028282165527, + "rewards/rejected": -3.1498990058898926, + "step": 12106 + }, + { + "epoch": 3.03, + "grad_norm": 2.96144962310791, + "learning_rate": 3.3684645392021053e-06, + "logits/chosen": -0.6182308793067932, + "logits/rejected": -0.6946427226066589, + "logps/chosen": -54.004825592041016, + "logps/rejected": -91.16436767578125, + "loss": 0.5873, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1519925594329834, + "rewards/margins": 5.894498825073242, + "rewards/rejected": -2.7425060272216797, + "step": 12107 + }, + { + "epoch": 3.03, + "grad_norm": 10.885573387145996, + "learning_rate": 3.367721593674472e-06, + "logits/chosen": -0.5011781454086304, + "logits/rejected": -0.5677292943000793, + "logps/chosen": -58.097564697265625, + "logps/rejected": -112.51069641113281, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0699126720428467, + "rewards/margins": 6.96779727935791, + "rewards/rejected": -3.8978848457336426, + "step": 12108 + }, + { + "epoch": 3.03, + "grad_norm": 14.694127082824707, + "learning_rate": 3.3669786884821725e-06, + "logits/chosen": -0.569509744644165, + "logits/rejected": -0.6267513632774353, + "logps/chosen": -56.46696472167969, + "logps/rejected": -102.42025756835938, + "loss": 0.6673, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.791801929473877, + "rewards/margins": 6.276710033416748, + "rewards/rejected": -3.484907627105713, + "step": 12109 + }, + { + "epoch": 3.03, + "grad_norm": 4.314905643463135, + "learning_rate": 3.3662358236435664e-06, + "logits/chosen": -0.5504547357559204, + "logits/rejected": -0.6311745047569275, + "logps/chosen": -49.61244201660156, + "logps/rejected": -100.40798950195312, + "loss": 0.6363, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.895634651184082, + "rewards/margins": 6.112800121307373, + "rewards/rejected": -3.217165231704712, + "step": 12110 + }, + { + "epoch": 3.03, + "grad_norm": 4.819877624511719, + "learning_rate": 3.365492999177012e-06, + "logits/chosen": -0.45252546668052673, + "logits/rejected": -0.5215032696723938, + "logps/chosen": -60.62955856323242, + "logps/rejected": -109.73497009277344, + "loss": 0.6225, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.741086483001709, + "rewards/margins": 7.052707672119141, + "rewards/rejected": -4.311621189117432, + "step": 12111 + }, + { + "epoch": 3.03, + "grad_norm": 4.938635349273682, + "learning_rate": 3.364750215100864e-06, + "logits/chosen": -0.5732167363166809, + "logits/rejected": -0.637485682964325, + "logps/chosen": -55.80912780761719, + "logps/rejected": -94.07818603515625, + "loss": 0.6583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0686705112457275, + "rewards/margins": 6.772897243499756, + "rewards/rejected": -3.7042267322540283, + "step": 12112 + }, + { + "epoch": 3.03, + "grad_norm": 17.293718338012695, + "learning_rate": 3.3640074714334776e-06, + "logits/chosen": -0.4911031424999237, + "logits/rejected": -0.5663245916366577, + "logps/chosen": -50.69533920288086, + "logps/rejected": -94.9885025024414, + "loss": 0.6722, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139268159866333, + "rewards/margins": 7.089404106140137, + "rewards/rejected": -3.9501357078552246, + "step": 12113 + }, + { + "epoch": 3.03, + "grad_norm": 10.009536743164062, + "learning_rate": 3.3632647681932062e-06, + "logits/chosen": -0.5726684927940369, + "logits/rejected": -0.6525750160217285, + "logps/chosen": -63.16876983642578, + "logps/rejected": -102.59228515625, + "loss": 0.7125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8402462005615234, + "rewards/margins": 6.488884449005127, + "rewards/rejected": -3.6486384868621826, + "step": 12114 + }, + { + "epoch": 3.03, + "grad_norm": 4.787627696990967, + "learning_rate": 3.3625221053984037e-06, + "logits/chosen": -0.5653872489929199, + "logits/rejected": -0.6497114896774292, + "logps/chosen": -47.568214416503906, + "logps/rejected": -97.69339752197266, + "loss": 0.5425, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.965991735458374, + "rewards/margins": 6.577524185180664, + "rewards/rejected": -3.61153244972229, + "step": 12115 + }, + { + "epoch": 3.03, + "grad_norm": 18.675413131713867, + "learning_rate": 3.3617794830674212e-06, + "logits/chosen": -0.5514369010925293, + "logits/rejected": -0.6287176609039307, + "logps/chosen": -57.057315826416016, + "logps/rejected": -127.2193603515625, + "loss": 0.6511, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.992466926574707, + "rewards/margins": 8.051654815673828, + "rewards/rejected": -5.059187412261963, + "step": 12116 + }, + { + "epoch": 3.03, + "grad_norm": 6.985101699829102, + "learning_rate": 3.3610369012186107e-06, + "logits/chosen": -0.5236435532569885, + "logits/rejected": -0.6023514866828918, + "logps/chosen": -58.31917190551758, + "logps/rejected": -93.35809326171875, + "loss": 0.7987, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1897144317626953, + "rewards/margins": 5.932821273803711, + "rewards/rejected": -2.7431068420410156, + "step": 12117 + }, + { + "epoch": 3.03, + "grad_norm": 3.2713675498962402, + "learning_rate": 3.36029435987032e-06, + "logits/chosen": -0.5449440479278564, + "logits/rejected": -0.6308650970458984, + "logps/chosen": -48.46856689453125, + "logps/rejected": -99.8487548828125, + "loss": 0.5761, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8083198070526123, + "rewards/margins": 6.581757068634033, + "rewards/rejected": -3.773437261581421, + "step": 12118 + }, + { + "epoch": 3.03, + "grad_norm": 3.928029775619507, + "learning_rate": 3.3595518590409027e-06, + "logits/chosen": -0.5031675100326538, + "logits/rejected": -0.6004478335380554, + "logps/chosen": -48.88718795776367, + "logps/rejected": -98.61526489257812, + "loss": 0.5293, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.865370273590088, + "rewards/margins": 6.646625518798828, + "rewards/rejected": -3.781254768371582, + "step": 12119 + }, + { + "epoch": 3.03, + "grad_norm": 3.857548475265503, + "learning_rate": 3.358809398748701e-06, + "logits/chosen": -0.5124679207801819, + "logits/rejected": -0.6228163242340088, + "logps/chosen": -61.666412353515625, + "logps/rejected": -118.47441101074219, + "loss": 0.6017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.027482748031616, + "rewards/margins": 7.706910610198975, + "rewards/rejected": -4.679427623748779, + "step": 12120 + }, + { + "epoch": 3.03, + "grad_norm": 13.010872840881348, + "learning_rate": 3.358066979012066e-06, + "logits/chosen": -0.510926365852356, + "logits/rejected": -0.5749029517173767, + "logps/chosen": -50.18479919433594, + "logps/rejected": -110.49562072753906, + "loss": 0.668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.754173755645752, + "rewards/margins": 7.588430404663086, + "rewards/rejected": -4.834256649017334, + "step": 12121 + }, + { + "epoch": 3.03, + "grad_norm": 5.368624687194824, + "learning_rate": 3.3573245998493432e-06, + "logits/chosen": -0.5450701713562012, + "logits/rejected": -0.6268423795700073, + "logps/chosen": -68.38638305664062, + "logps/rejected": -105.87041473388672, + "loss": 0.725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.294882297515869, + "rewards/margins": 7.309100151062012, + "rewards/rejected": -4.014218330383301, + "step": 12122 + }, + { + "epoch": 3.03, + "grad_norm": 3.6590054035186768, + "learning_rate": 3.3565822612788747e-06, + "logits/chosen": -0.5634868144989014, + "logits/rejected": -0.6152134537696838, + "logps/chosen": -50.34568405151367, + "logps/rejected": -96.5019302368164, + "loss": 0.652, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.019524574279785, + "rewards/margins": 6.201896667480469, + "rewards/rejected": -3.1823718547821045, + "step": 12123 + }, + { + "epoch": 3.03, + "grad_norm": 7.442630290985107, + "learning_rate": 3.3558399633190074e-06, + "logits/chosen": -0.5354127883911133, + "logits/rejected": -0.5965207815170288, + "logps/chosen": -58.849884033203125, + "logps/rejected": -116.24595642089844, + "loss": 0.6259, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.453958511352539, + "rewards/margins": 7.613979339599609, + "rewards/rejected": -4.16002082824707, + "step": 12124 + }, + { + "epoch": 3.03, + "grad_norm": 4.811436653137207, + "learning_rate": 3.3550977059880833e-06, + "logits/chosen": -0.5568121671676636, + "logits/rejected": -0.6779686808586121, + "logps/chosen": -51.68849182128906, + "logps/rejected": -95.13704681396484, + "loss": 0.6123, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1180267333984375, + "rewards/margins": 7.355600357055664, + "rewards/rejected": -4.237573623657227, + "step": 12125 + }, + { + "epoch": 3.03, + "grad_norm": 4.316246509552002, + "learning_rate": 3.354355489304446e-06, + "logits/chosen": -0.44218772649765015, + "logits/rejected": -0.5548105835914612, + "logps/chosen": -58.13420104980469, + "logps/rejected": -96.89046478271484, + "loss": 0.6606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6944355964660645, + "rewards/margins": 5.854288578033447, + "rewards/rejected": -3.1598525047302246, + "step": 12126 + }, + { + "epoch": 3.03, + "grad_norm": 3.763967275619507, + "learning_rate": 3.3536133132864334e-06, + "logits/chosen": -0.5901528596878052, + "logits/rejected": -0.6300152540206909, + "logps/chosen": -51.792850494384766, + "logps/rejected": -109.99842071533203, + "loss": 0.5923, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.139538526535034, + "rewards/margins": 7.104619979858398, + "rewards/rejected": -3.965080976486206, + "step": 12127 + }, + { + "epoch": 3.03, + "grad_norm": 2.7157297134399414, + "learning_rate": 3.352871177952387e-06, + "logits/chosen": -0.48213866353034973, + "logits/rejected": -0.5364350080490112, + "logps/chosen": -59.61421585083008, + "logps/rejected": -98.48335266113281, + "loss": 0.6086, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.906348943710327, + "rewards/margins": 6.492896556854248, + "rewards/rejected": -3.586548328399658, + "step": 12128 + }, + { + "epoch": 3.03, + "grad_norm": 5.769072532653809, + "learning_rate": 3.3521290833206454e-06, + "logits/chosen": -0.39421403408050537, + "logits/rejected": -0.5192606449127197, + "logps/chosen": -67.46126556396484, + "logps/rejected": -123.55634307861328, + "loss": 0.6238, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8836703300476074, + "rewards/margins": 6.300189018249512, + "rewards/rejected": -3.4165186882019043, + "step": 12129 + }, + { + "epoch": 3.03, + "grad_norm": 4.426991939544678, + "learning_rate": 3.351387029409549e-06, + "logits/chosen": -0.46454647183418274, + "logits/rejected": -0.6219415664672852, + "logps/chosen": -67.38069152832031, + "logps/rejected": -91.64777374267578, + "loss": 0.6403, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0450918674468994, + "rewards/margins": 7.341931343078613, + "rewards/rejected": -4.296839237213135, + "step": 12130 + }, + { + "epoch": 3.03, + "grad_norm": 6.1195244789123535, + "learning_rate": 3.3506450162374314e-06, + "logits/chosen": -0.48994696140289307, + "logits/rejected": -0.5469436645507812, + "logps/chosen": -53.973876953125, + "logps/rejected": -108.66455078125, + "loss": 0.6331, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3020529747009277, + "rewards/margins": 6.71689510345459, + "rewards/rejected": -3.414842128753662, + "step": 12131 + }, + { + "epoch": 3.03, + "grad_norm": 5.73441743850708, + "learning_rate": 3.3499030438226283e-06, + "logits/chosen": -0.4825236201286316, + "logits/rejected": -0.549219012260437, + "logps/chosen": -50.728912353515625, + "logps/rejected": -101.80455780029297, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1032490730285645, + "rewards/margins": 7.006073474884033, + "rewards/rejected": -3.9028244018554688, + "step": 12132 + }, + { + "epoch": 3.04, + "grad_norm": 9.76795482635498, + "learning_rate": 3.3491611121834792e-06, + "logits/chosen": -0.4108901023864746, + "logits/rejected": -0.4973854124546051, + "logps/chosen": -56.683319091796875, + "logps/rejected": -102.73057556152344, + "loss": 0.6152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.18426513671875, + "rewards/margins": 6.489084243774414, + "rewards/rejected": -3.3048195838928223, + "step": 12133 + }, + { + "epoch": 3.04, + "grad_norm": 6.4442458152771, + "learning_rate": 3.348419221338312e-06, + "logits/chosen": -0.4474945664405823, + "logits/rejected": -0.540823757648468, + "logps/chosen": -53.9295539855957, + "logps/rejected": -109.83995819091797, + "loss": 0.5943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9712395668029785, + "rewards/margins": 7.085350513458252, + "rewards/rejected": -4.114109992980957, + "step": 12134 + }, + { + "epoch": 3.04, + "grad_norm": 3.4209046363830566, + "learning_rate": 3.3476773713054633e-06, + "logits/chosen": -0.5174974799156189, + "logits/rejected": -0.6117967367172241, + "logps/chosen": -52.169158935546875, + "logps/rejected": -78.2563705444336, + "loss": 0.613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.216303825378418, + "rewards/margins": 5.697375774383545, + "rewards/rejected": -2.481071710586548, + "step": 12135 + }, + { + "epoch": 3.04, + "grad_norm": 4.138882160186768, + "learning_rate": 3.346935562103265e-06, + "logits/chosen": -0.47403958439826965, + "logits/rejected": -0.5912640690803528, + "logps/chosen": -57.59270095825195, + "logps/rejected": -101.93164825439453, + "loss": 0.6412, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.919050693511963, + "rewards/margins": 7.446037292480469, + "rewards/rejected": -4.526987075805664, + "step": 12136 + }, + { + "epoch": 3.04, + "grad_norm": 4.277516841888428, + "learning_rate": 3.3461937937500478e-06, + "logits/chosen": -0.4813353419303894, + "logits/rejected": -0.5918524861335754, + "logps/chosen": -53.264747619628906, + "logps/rejected": -110.9355697631836, + "loss": 0.6009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.774003267288208, + "rewards/margins": 7.771007537841797, + "rewards/rejected": -4.997004508972168, + "step": 12137 + }, + { + "epoch": 3.04, + "grad_norm": 6.595694541931152, + "learning_rate": 3.34545206626414e-06, + "logits/chosen": -0.49404236674308777, + "logits/rejected": -0.5540359020233154, + "logps/chosen": -55.66254425048828, + "logps/rejected": -95.56367492675781, + "loss": 0.707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1843652725219727, + "rewards/margins": 6.110029220581055, + "rewards/rejected": -2.925663709640503, + "step": 12138 + }, + { + "epoch": 3.04, + "grad_norm": 8.85598373413086, + "learning_rate": 3.3447103796638714e-06, + "logits/chosen": -0.46943044662475586, + "logits/rejected": -0.6180496215820312, + "logps/chosen": -67.03822326660156, + "logps/rejected": -90.94640350341797, + "loss": 0.7217, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1625118255615234, + "rewards/margins": 6.510275840759277, + "rewards/rejected": -3.347764253616333, + "step": 12139 + }, + { + "epoch": 3.04, + "grad_norm": 4.078817844390869, + "learning_rate": 3.343968733967571e-06, + "logits/chosen": -0.4952372908592224, + "logits/rejected": -0.5581720471382141, + "logps/chosen": -56.799903869628906, + "logps/rejected": -113.21324920654297, + "loss": 0.6496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0619304180145264, + "rewards/margins": 6.132043361663818, + "rewards/rejected": -3.070113182067871, + "step": 12140 + }, + { + "epoch": 3.04, + "grad_norm": 2.9126932621002197, + "learning_rate": 3.3432271291935636e-06, + "logits/chosen": -0.47860583662986755, + "logits/rejected": -0.5477629899978638, + "logps/chosen": -46.49153137207031, + "logps/rejected": -108.78309631347656, + "loss": 0.5292, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.110785722732544, + "rewards/margins": 7.242802619934082, + "rewards/rejected": -4.132017135620117, + "step": 12141 + }, + { + "epoch": 3.04, + "grad_norm": 5.023107528686523, + "learning_rate": 3.342485565360176e-06, + "logits/chosen": -0.5101014971733093, + "logits/rejected": -0.5974634289741516, + "logps/chosen": -57.24317169189453, + "logps/rejected": -105.90754699707031, + "loss": 0.7597, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.97316837310791, + "rewards/margins": 6.079259872436523, + "rewards/rejected": -3.106091260910034, + "step": 12142 + }, + { + "epoch": 3.04, + "grad_norm": 2.888338804244995, + "learning_rate": 3.3417440424857327e-06, + "logits/chosen": -0.5097828507423401, + "logits/rejected": -0.5415764451026917, + "logps/chosen": -48.32691955566406, + "logps/rejected": -98.2865982055664, + "loss": 0.6006, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3079140186309814, + "rewards/margins": 6.149298667907715, + "rewards/rejected": -2.8413851261138916, + "step": 12143 + }, + { + "epoch": 3.04, + "grad_norm": 2.2112605571746826, + "learning_rate": 3.34100256058856e-06, + "logits/chosen": -0.5765945315361023, + "logits/rejected": -0.6460217833518982, + "logps/chosen": -53.21751022338867, + "logps/rejected": -107.16841888427734, + "loss": 0.5584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.057166337966919, + "rewards/margins": 6.401027202606201, + "rewards/rejected": -3.343860626220703, + "step": 12144 + }, + { + "epoch": 3.04, + "grad_norm": 1.8826762437820435, + "learning_rate": 3.3402611196869764e-06, + "logits/chosen": -0.5965697765350342, + "logits/rejected": -0.7103761434555054, + "logps/chosen": -60.57741165161133, + "logps/rejected": -91.3197250366211, + "loss": 0.6142, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.69559383392334, + "rewards/margins": 7.12954568862915, + "rewards/rejected": -3.4339518547058105, + "step": 12145 + }, + { + "epoch": 3.04, + "grad_norm": 5.201896667480469, + "learning_rate": 3.3395197197993056e-06, + "logits/chosen": -0.5854196548461914, + "logits/rejected": -0.6891548037528992, + "logps/chosen": -53.263916015625, + "logps/rejected": -82.1490478515625, + "loss": 0.7121, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.962785482406616, + "rewards/margins": 5.403703689575195, + "rewards/rejected": -2.44091796875, + "step": 12146 + }, + { + "epoch": 3.04, + "grad_norm": 4.999111175537109, + "learning_rate": 3.3387783609438717e-06, + "logits/chosen": -0.4742145538330078, + "logits/rejected": -0.5779731869697571, + "logps/chosen": -53.71416473388672, + "logps/rejected": -86.29853820800781, + "loss": 0.6089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105456829071045, + "rewards/margins": 5.973223686218262, + "rewards/rejected": -2.8677666187286377, + "step": 12147 + }, + { + "epoch": 3.04, + "grad_norm": 2.1955721378326416, + "learning_rate": 3.3380370431389895e-06, + "logits/chosen": -0.5660423636436462, + "logits/rejected": -0.6180279850959778, + "logps/chosen": -40.963558197021484, + "logps/rejected": -102.26316833496094, + "loss": 0.4687, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295710325241089, + "rewards/margins": 6.578367233276367, + "rewards/rejected": -3.2826569080352783, + "step": 12148 + }, + { + "epoch": 3.04, + "grad_norm": 17.11324119567871, + "learning_rate": 3.337295766402978e-06, + "logits/chosen": -0.48514360189437866, + "logits/rejected": -0.6110307574272156, + "logps/chosen": -65.5890121459961, + "logps/rejected": -83.17328643798828, + "loss": 0.6676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8154120445251465, + "rewards/margins": 5.328718662261963, + "rewards/rejected": -2.5133068561553955, + "step": 12149 + }, + { + "epoch": 3.04, + "grad_norm": 6.843489646911621, + "learning_rate": 3.3365545307541587e-06, + "logits/chosen": -0.47145557403564453, + "logits/rejected": -0.5555230379104614, + "logps/chosen": -58.22835922241211, + "logps/rejected": -111.578369140625, + "loss": 0.6433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.946321725845337, + "rewards/margins": 6.25574254989624, + "rewards/rejected": -3.3094215393066406, + "step": 12150 + }, + { + "epoch": 3.04, + "grad_norm": 2.872957468032837, + "learning_rate": 3.335813336210847e-06, + "logits/chosen": -0.587885856628418, + "logits/rejected": -0.640596866607666, + "logps/chosen": -52.54226303100586, + "logps/rejected": -109.55779266357422, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.736692428588867, + "rewards/margins": 6.951128005981445, + "rewards/rejected": -4.214436054229736, + "step": 12151 + }, + { + "epoch": 3.04, + "grad_norm": 6.856655120849609, + "learning_rate": 3.3350721827913568e-06, + "logits/chosen": -0.4696815013885498, + "logits/rejected": -0.6059616804122925, + "logps/chosen": -62.835880279541016, + "logps/rejected": -90.20633697509766, + "loss": 0.8116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0175557136535645, + "rewards/margins": 6.5962018966674805, + "rewards/rejected": -3.5786452293395996, + "step": 12152 + }, + { + "epoch": 3.04, + "grad_norm": 4.029043674468994, + "learning_rate": 3.334331070514003e-06, + "logits/chosen": -0.5953485369682312, + "logits/rejected": -0.653668999671936, + "logps/chosen": -49.92793273925781, + "logps/rejected": -103.51712036132812, + "loss": 0.6411, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.232388496398926, + "rewards/margins": 6.643247604370117, + "rewards/rejected": -3.4108593463897705, + "step": 12153 + }, + { + "epoch": 3.04, + "grad_norm": 6.449484825134277, + "learning_rate": 3.333589999397101e-06, + "logits/chosen": -0.4141097068786621, + "logits/rejected": -0.5623308420181274, + "logps/chosen": -62.389442443847656, + "logps/rejected": -86.57803344726562, + "loss": 0.6242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.341097831726074, + "rewards/margins": 6.037977695465088, + "rewards/rejected": -2.6968801021575928, + "step": 12154 + }, + { + "epoch": 3.04, + "grad_norm": 10.795692443847656, + "learning_rate": 3.3328489694589637e-06, + "logits/chosen": -0.5681107044219971, + "logits/rejected": -0.6516460180282593, + "logps/chosen": -55.62356185913086, + "logps/rejected": -94.34382629394531, + "loss": 0.6229, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077043294906616, + "rewards/margins": 6.302763938903809, + "rewards/rejected": -3.2257208824157715, + "step": 12155 + }, + { + "epoch": 3.04, + "grad_norm": 7.746073246002197, + "learning_rate": 3.3321079807179e-06, + "logits/chosen": -0.5239381790161133, + "logits/rejected": -0.5256528258323669, + "logps/chosen": -41.283424377441406, + "logps/rejected": -93.5250244140625, + "loss": 0.6609, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.14174485206604, + "rewards/margins": 5.175277233123779, + "rewards/rejected": -2.0335330963134766, + "step": 12156 + }, + { + "epoch": 3.04, + "grad_norm": 6.595232009887695, + "learning_rate": 3.331367033192223e-06, + "logits/chosen": -0.5033167600631714, + "logits/rejected": -0.6264044642448425, + "logps/chosen": -60.220252990722656, + "logps/rejected": -121.4421615600586, + "loss": 0.5603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3046321868896484, + "rewards/margins": 8.328526496887207, + "rewards/rejected": -5.0238938331604, + "step": 12157 + }, + { + "epoch": 3.04, + "grad_norm": 3.637244701385498, + "learning_rate": 3.3306261269002415e-06, + "logits/chosen": -0.5274335741996765, + "logits/rejected": -0.6353766918182373, + "logps/chosen": -59.62126922607422, + "logps/rejected": -86.23944854736328, + "loss": 0.6559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.05279278755188, + "rewards/margins": 5.001125335693359, + "rewards/rejected": -1.94833242893219, + "step": 12158 + }, + { + "epoch": 3.04, + "grad_norm": 5.581954479217529, + "learning_rate": 3.329885261860264e-06, + "logits/chosen": -0.5516884326934814, + "logits/rejected": -0.626725971698761, + "logps/chosen": -54.59278106689453, + "logps/rejected": -109.4147720336914, + "loss": 0.638, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.21476411819458, + "rewards/margins": 7.878468990325928, + "rewards/rejected": -4.663704872131348, + "step": 12159 + }, + { + "epoch": 3.04, + "grad_norm": 4.687920093536377, + "learning_rate": 3.329144438090597e-06, + "logits/chosen": -0.4824003279209137, + "logits/rejected": -0.6112256646156311, + "logps/chosen": -48.571922302246094, + "logps/rejected": -87.34893798828125, + "loss": 0.5888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.879502773284912, + "rewards/margins": 7.258302688598633, + "rewards/rejected": -4.378799915313721, + "step": 12160 + }, + { + "epoch": 3.04, + "grad_norm": 3.384674310684204, + "learning_rate": 3.3284036556095478e-06, + "logits/chosen": -0.5320395231246948, + "logits/rejected": -0.5691111087799072, + "logps/chosen": -47.901729583740234, + "logps/rejected": -99.75769805908203, + "loss": 0.5846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1762189865112305, + "rewards/margins": 6.062147617340088, + "rewards/rejected": -2.8859286308288574, + "step": 12161 + }, + { + "epoch": 3.04, + "grad_norm": 2.004958391189575, + "learning_rate": 3.3276629144354256e-06, + "logits/chosen": -0.5865907073020935, + "logits/rejected": -0.6839085817337036, + "logps/chosen": -49.124053955078125, + "logps/rejected": -92.67372131347656, + "loss": 0.5554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.278921604156494, + "rewards/margins": 7.807692527770996, + "rewards/rejected": -4.528770923614502, + "step": 12162 + }, + { + "epoch": 3.04, + "grad_norm": 6.919412612915039, + "learning_rate": 3.3269222145865277e-06, + "logits/chosen": -0.5402280688285828, + "logits/rejected": -0.5894833207130432, + "logps/chosen": -53.838470458984375, + "logps/rejected": -100.93533325195312, + "loss": 0.6476, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0522639751434326, + "rewards/margins": 6.28342866897583, + "rewards/rejected": -3.2311646938323975, + "step": 12163 + }, + { + "epoch": 3.04, + "grad_norm": 3.8351080417633057, + "learning_rate": 3.3261815560811627e-06, + "logits/chosen": -0.4657803773880005, + "logits/rejected": -0.5165577530860901, + "logps/chosen": -50.015419006347656, + "logps/rejected": -104.88114929199219, + "loss": 0.6041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.321197271347046, + "rewards/margins": 6.014196872711182, + "rewards/rejected": -2.692999839782715, + "step": 12164 + }, + { + "epoch": 3.04, + "grad_norm": 7.844130039215088, + "learning_rate": 3.325440938937634e-06, + "logits/chosen": -0.5188769698143005, + "logits/rejected": -0.6291583776473999, + "logps/chosen": -48.54294967651367, + "logps/rejected": -106.94921875, + "loss": 0.5381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.238272190093994, + "rewards/margins": 8.048637390136719, + "rewards/rejected": -4.810364723205566, + "step": 12165 + }, + { + "epoch": 3.04, + "grad_norm": 13.32586669921875, + "learning_rate": 3.324700363174238e-06, + "logits/chosen": -0.50035560131073, + "logits/rejected": -0.58176589012146, + "logps/chosen": -51.1300048828125, + "logps/rejected": -94.32938385009766, + "loss": 0.6636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.967130661010742, + "rewards/margins": 6.016171932220459, + "rewards/rejected": -3.049041748046875, + "step": 12166 + }, + { + "epoch": 3.04, + "grad_norm": 2.5470845699310303, + "learning_rate": 3.323959828809279e-06, + "logits/chosen": -0.4927886426448822, + "logits/rejected": -0.5692459344863892, + "logps/chosen": -48.20183181762695, + "logps/rejected": -105.62158203125, + "loss": 0.5487, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1021804809570312, + "rewards/margins": 7.264254570007324, + "rewards/rejected": -4.162073612213135, + "step": 12167 + }, + { + "epoch": 3.04, + "grad_norm": 13.762561798095703, + "learning_rate": 3.323219335861055e-06, + "logits/chosen": -0.5350199937820435, + "logits/rejected": -0.589534342288971, + "logps/chosen": -57.6697883605957, + "logps/rejected": -104.65202331542969, + "loss": 0.7454, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.959012031555176, + "rewards/margins": 6.257076263427734, + "rewards/rejected": -3.2980635166168213, + "step": 12168 + }, + { + "epoch": 3.04, + "grad_norm": 2.1637561321258545, + "learning_rate": 3.3224788843478654e-06, + "logits/chosen": -0.5004278421401978, + "logits/rejected": -0.5920819640159607, + "logps/chosen": -59.08578109741211, + "logps/rejected": -94.80996704101562, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0582451820373535, + "rewards/margins": 6.625005722045898, + "rewards/rejected": -3.566760301589966, + "step": 12169 + }, + { + "epoch": 3.04, + "grad_norm": 5.752874374389648, + "learning_rate": 3.3217384742880064e-06, + "logits/chosen": -0.5021259188652039, + "logits/rejected": -0.5875410437583923, + "logps/chosen": -58.76050567626953, + "logps/rejected": -100.35279083251953, + "loss": 0.8089, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9075114727020264, + "rewards/margins": 6.515608787536621, + "rewards/rejected": -3.6080970764160156, + "step": 12170 + }, + { + "epoch": 3.04, + "grad_norm": 5.388704299926758, + "learning_rate": 3.320998105699775e-06, + "logits/chosen": -0.5183001756668091, + "logits/rejected": -0.6357333064079285, + "logps/chosen": -62.19560241699219, + "logps/rejected": -88.50042724609375, + "loss": 0.6955, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.756002187728882, + "rewards/margins": 5.80280876159668, + "rewards/rejected": -3.046806812286377, + "step": 12171 + }, + { + "epoch": 3.04, + "grad_norm": 3.829476833343506, + "learning_rate": 3.320257778601467e-06, + "logits/chosen": -0.5425601005554199, + "logits/rejected": -0.5782667398452759, + "logps/chosen": -49.151954650878906, + "logps/rejected": -105.90255737304688, + "loss": 0.6637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.588770627975464, + "rewards/margins": 7.482696533203125, + "rewards/rejected": -3.893925666809082, + "step": 12172 + }, + { + "epoch": 3.05, + "grad_norm": 4.668202877044678, + "learning_rate": 3.3195174930113744e-06, + "logits/chosen": -0.567843496799469, + "logits/rejected": -0.6720138788223267, + "logps/chosen": -53.64250564575195, + "logps/rejected": -101.06417846679688, + "loss": 0.6169, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1815543174743652, + "rewards/margins": 7.333817481994629, + "rewards/rejected": -4.152263641357422, + "step": 12173 + }, + { + "epoch": 3.05, + "grad_norm": 12.654999732971191, + "learning_rate": 3.3187772489477925e-06, + "logits/chosen": -0.4664400517940521, + "logits/rejected": -0.6055519580841064, + "logps/chosen": -62.22234344482422, + "logps/rejected": -101.07847595214844, + "loss": 0.6185, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0169405937194824, + "rewards/margins": 6.538824081420898, + "rewards/rejected": -3.521883487701416, + "step": 12174 + }, + { + "epoch": 3.05, + "grad_norm": 4.288302421569824, + "learning_rate": 3.318037046429012e-06, + "logits/chosen": -0.4508039951324463, + "logits/rejected": -0.5492672324180603, + "logps/chosen": -47.751075744628906, + "logps/rejected": -82.34536743164062, + "loss": 0.6063, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.981416702270508, + "rewards/margins": 6.098457336425781, + "rewards/rejected": -3.1170406341552734, + "step": 12175 + }, + { + "epoch": 3.05, + "grad_norm": 3.4295012950897217, + "learning_rate": 3.317296885473328e-06, + "logits/chosen": -0.49218297004699707, + "logits/rejected": -0.5974913835525513, + "logps/chosen": -57.01089096069336, + "logps/rejected": -113.49589538574219, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1461267471313477, + "rewards/margins": 7.643843650817871, + "rewards/rejected": -4.497716903686523, + "step": 12176 + }, + { + "epoch": 3.05, + "grad_norm": 5.749207973480225, + "learning_rate": 3.316556766099025e-06, + "logits/chosen": -0.46269938349723816, + "logits/rejected": -0.5430043339729309, + "logps/chosen": -54.2669677734375, + "logps/rejected": -116.29016876220703, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0367889404296875, + "rewards/margins": 6.739073753356934, + "rewards/rejected": -3.702284336090088, + "step": 12177 + }, + { + "epoch": 3.05, + "grad_norm": 4.869024753570557, + "learning_rate": 3.315816688324396e-06, + "logits/chosen": -0.49718958139419556, + "logits/rejected": -0.5834827423095703, + "logps/chosen": -70.27535247802734, + "logps/rejected": -91.343505859375, + "loss": 0.6528, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2784616947174072, + "rewards/margins": 6.088531017303467, + "rewards/rejected": -2.8100693225860596, + "step": 12178 + }, + { + "epoch": 3.05, + "grad_norm": 6.525714874267578, + "learning_rate": 3.3150766521677293e-06, + "logits/chosen": -0.49218976497650146, + "logits/rejected": -0.5190322399139404, + "logps/chosen": -59.65769958496094, + "logps/rejected": -123.49335479736328, + "loss": 0.7655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057800054550171, + "rewards/margins": 6.738568305969238, + "rewards/rejected": -3.680767774581909, + "step": 12179 + }, + { + "epoch": 3.05, + "grad_norm": 2.865410804748535, + "learning_rate": 3.314336657647308e-06, + "logits/chosen": -0.5480194091796875, + "logits/rejected": -0.6423385143280029, + "logps/chosen": -64.79570770263672, + "logps/rejected": -105.66496276855469, + "loss": 0.7196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139928102493286, + "rewards/margins": 7.8758955001831055, + "rewards/rejected": -4.735967636108398, + "step": 12180 + }, + { + "epoch": 3.05, + "grad_norm": 13.903441429138184, + "learning_rate": 3.313596704781422e-06, + "logits/chosen": -0.5182986259460449, + "logits/rejected": -0.5981770157814026, + "logps/chosen": -56.69697189331055, + "logps/rejected": -107.59437561035156, + "loss": 0.6334, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.924304962158203, + "rewards/margins": 6.8742523193359375, + "rewards/rejected": -3.9499475955963135, + "step": 12181 + }, + { + "epoch": 3.05, + "grad_norm": 4.099948406219482, + "learning_rate": 3.312856793588355e-06, + "logits/chosen": -0.5561478137969971, + "logits/rejected": -0.6574708819389343, + "logps/chosen": -62.15350341796875, + "logps/rejected": -90.29566192626953, + "loss": 0.6647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.264650821685791, + "rewards/margins": 6.374716281890869, + "rewards/rejected": -3.110064744949341, + "step": 12182 + }, + { + "epoch": 3.05, + "grad_norm": 5.193102836608887, + "learning_rate": 3.3121169240863927e-06, + "logits/chosen": -0.513930082321167, + "logits/rejected": -0.5772969126701355, + "logps/chosen": -51.49007034301758, + "logps/rejected": -106.10568237304688, + "loss": 0.6325, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.392983913421631, + "rewards/margins": 8.040435791015625, + "rewards/rejected": -4.647451400756836, + "step": 12183 + }, + { + "epoch": 3.05, + "grad_norm": 4.3744282722473145, + "learning_rate": 3.311377096293815e-06, + "logits/chosen": -0.5923938155174255, + "logits/rejected": -0.6519899964332581, + "logps/chosen": -55.04583740234375, + "logps/rejected": -99.01764678955078, + "loss": 0.6916, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2542598247528076, + "rewards/margins": 5.960999965667725, + "rewards/rejected": -2.706740379333496, + "step": 12184 + }, + { + "epoch": 3.05, + "grad_norm": 3.63269305229187, + "learning_rate": 3.3106373102289057e-06, + "logits/chosen": -0.5483831167221069, + "logits/rejected": -0.5978628396987915, + "logps/chosen": -53.371864318847656, + "logps/rejected": -101.03021240234375, + "loss": 0.5765, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1756577491760254, + "rewards/margins": 5.895968914031982, + "rewards/rejected": -2.7203116416931152, + "step": 12185 + }, + { + "epoch": 3.05, + "grad_norm": 5.090458393096924, + "learning_rate": 3.309897565909945e-06, + "logits/chosen": -0.5423617362976074, + "logits/rejected": -0.5918000340461731, + "logps/chosen": -55.57708740234375, + "logps/rejected": -113.57677459716797, + "loss": 0.6, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3586857318878174, + "rewards/margins": 5.879655838012695, + "rewards/rejected": -2.520969867706299, + "step": 12186 + }, + { + "epoch": 3.05, + "grad_norm": 4.057841777801514, + "learning_rate": 3.309157863355214e-06, + "logits/chosen": -0.5255410671234131, + "logits/rejected": -0.5948923230171204, + "logps/chosen": -49.28429412841797, + "logps/rejected": -122.85498809814453, + "loss": 0.5379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1172049045562744, + "rewards/margins": 7.903586387634277, + "rewards/rejected": -4.786381244659424, + "step": 12187 + }, + { + "epoch": 3.05, + "grad_norm": 5.638439178466797, + "learning_rate": 3.3084182025829905e-06, + "logits/chosen": -0.5676314830780029, + "logits/rejected": -0.6138602495193481, + "logps/chosen": -47.21044921875, + "logps/rejected": -104.96936798095703, + "loss": 0.5809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3854801654815674, + "rewards/margins": 7.466928005218506, + "rewards/rejected": -4.081448554992676, + "step": 12188 + }, + { + "epoch": 3.05, + "grad_norm": 4.674056529998779, + "learning_rate": 3.3076785836115498e-06, + "logits/chosen": -0.4804439842700958, + "logits/rejected": -0.5869082808494568, + "logps/chosen": -51.19959259033203, + "logps/rejected": -109.18667602539062, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00594162940979, + "rewards/margins": 6.67832088470459, + "rewards/rejected": -3.6723790168762207, + "step": 12189 + }, + { + "epoch": 3.05, + "grad_norm": 3.6834216117858887, + "learning_rate": 3.3069390064591756e-06, + "logits/chosen": -0.5994631052017212, + "logits/rejected": -0.6760472655296326, + "logps/chosen": -48.34368896484375, + "logps/rejected": -99.44766235351562, + "loss": 0.5961, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9762017726898193, + "rewards/margins": 7.230287551879883, + "rewards/rejected": -4.254085540771484, + "step": 12190 + }, + { + "epoch": 3.05, + "grad_norm": 5.131727695465088, + "learning_rate": 3.306199471144137e-06, + "logits/chosen": -0.5226866006851196, + "logits/rejected": -0.5653302073478699, + "logps/chosen": -57.976409912109375, + "logps/rejected": -101.475830078125, + "loss": 0.6919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.985074520111084, + "rewards/margins": 6.2941694259643555, + "rewards/rejected": -3.3090944290161133, + "step": 12191 + }, + { + "epoch": 3.05, + "grad_norm": 3.496885061264038, + "learning_rate": 3.305459977684711e-06, + "logits/chosen": -0.5046948790550232, + "logits/rejected": -0.537882924079895, + "logps/chosen": -49.294715881347656, + "logps/rejected": -120.08094787597656, + "loss": 0.6361, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.957242488861084, + "rewards/margins": 6.339972496032715, + "rewards/rejected": -3.382730007171631, + "step": 12192 + }, + { + "epoch": 3.05, + "grad_norm": 6.367014408111572, + "learning_rate": 3.304720526099171e-06, + "logits/chosen": -0.48917025327682495, + "logits/rejected": -0.5960522890090942, + "logps/chosen": -59.617862701416016, + "logps/rejected": -100.27860260009766, + "loss": 0.6237, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4586329460144043, + "rewards/margins": 7.6525092124938965, + "rewards/rejected": -4.193876266479492, + "step": 12193 + }, + { + "epoch": 3.05, + "grad_norm": 2.556063652038574, + "learning_rate": 3.3039811164057916e-06, + "logits/chosen": -0.4719604253768921, + "logits/rejected": -0.5663776397705078, + "logps/chosen": -52.8157844543457, + "logps/rejected": -106.88815307617188, + "loss": 0.6178, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3567259311676025, + "rewards/margins": 7.935318946838379, + "rewards/rejected": -4.5785932540893555, + "step": 12194 + }, + { + "epoch": 3.05, + "grad_norm": 17.5494384765625, + "learning_rate": 3.3032417486228418e-06, + "logits/chosen": -0.5137451887130737, + "logits/rejected": -0.586888313293457, + "logps/chosen": -50.921592712402344, + "logps/rejected": -95.0126953125, + "loss": 0.6392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098121166229248, + "rewards/margins": 6.636236667633057, + "rewards/rejected": -3.5381155014038086, + "step": 12195 + }, + { + "epoch": 3.05, + "grad_norm": 6.797121524810791, + "learning_rate": 3.3025024227685933e-06, + "logits/chosen": -0.5315445065498352, + "logits/rejected": -0.5577667355537415, + "logps/chosen": -52.839874267578125, + "logps/rejected": -111.47074127197266, + "loss": 0.6269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2392213344573975, + "rewards/margins": 6.845972537994385, + "rewards/rejected": -3.6067512035369873, + "step": 12196 + }, + { + "epoch": 3.05, + "grad_norm": 3.8503293991088867, + "learning_rate": 3.301763138861316e-06, + "logits/chosen": -0.4179888367652893, + "logits/rejected": -0.5307433605194092, + "logps/chosen": -70.95085144042969, + "logps/rejected": -111.02902221679688, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1254541873931885, + "rewards/margins": 6.474440097808838, + "rewards/rejected": -3.3489856719970703, + "step": 12197 + }, + { + "epoch": 3.05, + "grad_norm": 6.499725341796875, + "learning_rate": 3.301023896919277e-06, + "logits/chosen": -0.4828082323074341, + "logits/rejected": -0.5816824436187744, + "logps/chosen": -51.73429870605469, + "logps/rejected": -83.70537567138672, + "loss": 0.6204, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.388117551803589, + "rewards/margins": 6.118197917938232, + "rewards/rejected": -2.7300808429718018, + "step": 12198 + }, + { + "epoch": 3.05, + "grad_norm": 6.083810329437256, + "learning_rate": 3.300284696960745e-06, + "logits/chosen": -0.5777873992919922, + "logits/rejected": -0.6512901186943054, + "logps/chosen": -42.68273162841797, + "logps/rejected": -103.4438247680664, + "loss": 0.5612, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2035558223724365, + "rewards/margins": 7.795803546905518, + "rewards/rejected": -4.592247486114502, + "step": 12199 + }, + { + "epoch": 3.05, + "grad_norm": 7.647948741912842, + "learning_rate": 3.2995455390039853e-06, + "logits/chosen": -0.4408288598060608, + "logits/rejected": -0.5141385793685913, + "logps/chosen": -59.68682861328125, + "logps/rejected": -119.09895324707031, + "loss": 0.7182, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.804314136505127, + "rewards/margins": 7.146022796630859, + "rewards/rejected": -4.341708660125732, + "step": 12200 + }, + { + "epoch": 3.05, + "grad_norm": 4.693271160125732, + "learning_rate": 3.2988064230672656e-06, + "logits/chosen": -0.49094071984291077, + "logits/rejected": -0.5680478811264038, + "logps/chosen": -53.523189544677734, + "logps/rejected": -108.03935241699219, + "loss": 0.6148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.97377347946167, + "rewards/margins": 7.049399375915527, + "rewards/rejected": -4.075625896453857, + "step": 12201 + }, + { + "epoch": 3.05, + "grad_norm": 5.610342025756836, + "learning_rate": 3.2980673491688474e-06, + "logits/chosen": -0.5009795427322388, + "logits/rejected": -0.6157398819923401, + "logps/chosen": -51.76805114746094, + "logps/rejected": -96.32434844970703, + "loss": 0.5626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000232696533203, + "rewards/margins": 7.528636932373047, + "rewards/rejected": -4.528404235839844, + "step": 12202 + }, + { + "epoch": 3.05, + "grad_norm": 5.439608573913574, + "learning_rate": 3.2973283173269953e-06, + "logits/chosen": -0.4451403021812439, + "logits/rejected": -0.49502235651016235, + "logps/chosen": -83.28005981445312, + "logps/rejected": -107.29830932617188, + "loss": 0.6979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.896049737930298, + "rewards/margins": 5.7553839683532715, + "rewards/rejected": -2.8593339920043945, + "step": 12203 + }, + { + "epoch": 3.05, + "grad_norm": 3.021777868270874, + "learning_rate": 3.296589327559973e-06, + "logits/chosen": -0.49875420331954956, + "logits/rejected": -0.5526037812232971, + "logps/chosen": -58.825374603271484, + "logps/rejected": -112.58570861816406, + "loss": 0.6284, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9375715255737305, + "rewards/margins": 6.8829426765441895, + "rewards/rejected": -3.945371627807617, + "step": 12204 + }, + { + "epoch": 3.05, + "grad_norm": 3.4836184978485107, + "learning_rate": 3.2958503798860386e-06, + "logits/chosen": -0.5269128084182739, + "logits/rejected": -0.6042038202285767, + "logps/chosen": -59.06529998779297, + "logps/rejected": -95.10919952392578, + "loss": 0.6568, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7619142532348633, + "rewards/margins": 7.169360637664795, + "rewards/rejected": -4.40744686126709, + "step": 12205 + }, + { + "epoch": 3.05, + "grad_norm": 4.435569763183594, + "learning_rate": 3.2951114743234534e-06, + "logits/chosen": -0.5057074427604675, + "logits/rejected": -0.5962899923324585, + "logps/chosen": -76.87150573730469, + "logps/rejected": -103.44541931152344, + "loss": 0.6735, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.54028582572937, + "rewards/margins": 6.166831970214844, + "rewards/rejected": -3.6265459060668945, + "step": 12206 + }, + { + "epoch": 3.05, + "grad_norm": 5.200039863586426, + "learning_rate": 3.294372610890478e-06, + "logits/chosen": -0.557102620601654, + "logits/rejected": -0.6534550786018372, + "logps/chosen": -57.53835678100586, + "logps/rejected": -78.86151885986328, + "loss": 0.697, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2053513526916504, + "rewards/margins": 5.942928791046143, + "rewards/rejected": -2.737577438354492, + "step": 12207 + }, + { + "epoch": 3.05, + "grad_norm": 6.349699974060059, + "learning_rate": 3.2936337896053717e-06, + "logits/chosen": -0.4445435404777527, + "logits/rejected": -0.5054248571395874, + "logps/chosen": -60.73039245605469, + "logps/rejected": -128.73666381835938, + "loss": 0.5873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3962550163269043, + "rewards/margins": 7.073195934295654, + "rewards/rejected": -3.676940679550171, + "step": 12208 + }, + { + "epoch": 3.05, + "grad_norm": 2.7736523151397705, + "learning_rate": 3.2928950104863855e-06, + "logits/chosen": -0.5268694162368774, + "logits/rejected": -0.6567566990852356, + "logps/chosen": -55.94845199584961, + "logps/rejected": -101.58544158935547, + "loss": 0.6184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032996654510498, + "rewards/margins": 7.72085428237915, + "rewards/rejected": -4.687857151031494, + "step": 12209 + }, + { + "epoch": 3.05, + "grad_norm": 3.437718629837036, + "learning_rate": 3.292156273551781e-06, + "logits/chosen": -0.4785996079444885, + "logits/rejected": -0.5442118644714355, + "logps/chosen": -60.93537902832031, + "logps/rejected": -116.21691131591797, + "loss": 0.5872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9010512828826904, + "rewards/margins": 7.951902389526367, + "rewards/rejected": -5.050851345062256, + "step": 12210 + }, + { + "epoch": 3.05, + "grad_norm": 6.997643947601318, + "learning_rate": 3.291417578819812e-06, + "logits/chosen": -0.6490494012832642, + "logits/rejected": -0.7470941543579102, + "logps/chosen": -71.98458099365234, + "logps/rejected": -100.64341735839844, + "loss": 0.7401, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0458333492279053, + "rewards/margins": 6.885761260986328, + "rewards/rejected": -3.83992862701416, + "step": 12211 + }, + { + "epoch": 3.06, + "grad_norm": 4.2337565422058105, + "learning_rate": 3.2906789263087326e-06, + "logits/chosen": -0.48600995540618896, + "logits/rejected": -0.6419736742973328, + "logps/chosen": -61.91691589355469, + "logps/rejected": -88.87193298339844, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0990049839019775, + "rewards/margins": 7.135112762451172, + "rewards/rejected": -4.036107063293457, + "step": 12212 + }, + { + "epoch": 3.06, + "grad_norm": 6.3681769371032715, + "learning_rate": 3.2899403160367936e-06, + "logits/chosen": -0.5433642268180847, + "logits/rejected": -0.5582061409950256, + "logps/chosen": -61.65803909301758, + "logps/rejected": -98.553466796875, + "loss": 0.6973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.998211622238159, + "rewards/margins": 5.422290325164795, + "rewards/rejected": -2.424079418182373, + "step": 12213 + }, + { + "epoch": 3.06, + "grad_norm": 4.693688869476318, + "learning_rate": 3.2892017480222484e-06, + "logits/chosen": -0.552435576915741, + "logits/rejected": -0.5981262922286987, + "logps/chosen": -47.89714813232422, + "logps/rejected": -117.19720458984375, + "loss": 0.6246, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.129615545272827, + "rewards/margins": 6.754195213317871, + "rewards/rejected": -3.624579668045044, + "step": 12214 + }, + { + "epoch": 3.06, + "grad_norm": 4.905158519744873, + "learning_rate": 3.288463222283349e-06, + "logits/chosen": -0.5805844068527222, + "logits/rejected": -0.6505870819091797, + "logps/chosen": -56.66935348510742, + "logps/rejected": -101.03522491455078, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051929473876953, + "rewards/margins": 6.892934799194336, + "rewards/rejected": -3.841005325317383, + "step": 12215 + }, + { + "epoch": 3.06, + "grad_norm": 5.410035133361816, + "learning_rate": 3.287724738838343e-06, + "logits/chosen": -0.4267989695072174, + "logits/rejected": -0.5384184122085571, + "logps/chosen": -60.23332977294922, + "logps/rejected": -90.78392028808594, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143239736557007, + "rewards/margins": 6.741291522979736, + "rewards/rejected": -3.5980520248413086, + "step": 12216 + }, + { + "epoch": 3.06, + "grad_norm": 7.791202545166016, + "learning_rate": 3.2869862977054795e-06, + "logits/chosen": -0.5344316959381104, + "logits/rejected": -0.6366376876831055, + "logps/chosen": -64.85931396484375, + "logps/rejected": -91.47647094726562, + "loss": 0.6698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.871344566345215, + "rewards/margins": 6.712606906890869, + "rewards/rejected": -3.841262102127075, + "step": 12217 + }, + { + "epoch": 3.06, + "grad_norm": 3.0722477436065674, + "learning_rate": 3.2862478989030056e-06, + "logits/chosen": -0.5364158749580383, + "logits/rejected": -0.6098147034645081, + "logps/chosen": -42.81060028076172, + "logps/rejected": -91.74707794189453, + "loss": 0.5184, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8047831058502197, + "rewards/margins": 6.072746753692627, + "rewards/rejected": -3.2679636478424072, + "step": 12218 + }, + { + "epoch": 3.06, + "grad_norm": 5.445648670196533, + "learning_rate": 3.2855095424491725e-06, + "logits/chosen": -0.4258464574813843, + "logits/rejected": -0.5451660752296448, + "logps/chosen": -62.94720458984375, + "logps/rejected": -100.75577545166016, + "loss": 0.6982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0082461833953857, + "rewards/margins": 6.824150085449219, + "rewards/rejected": -3.815904140472412, + "step": 12219 + }, + { + "epoch": 3.06, + "grad_norm": 5.916876792907715, + "learning_rate": 3.28477122836222e-06, + "logits/chosen": -0.5852565765380859, + "logits/rejected": -0.6387326121330261, + "logps/chosen": -48.813140869140625, + "logps/rejected": -97.44554138183594, + "loss": 0.5935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2672836780548096, + "rewards/margins": 6.568840980529785, + "rewards/rejected": -3.3015568256378174, + "step": 12220 + }, + { + "epoch": 3.06, + "grad_norm": 7.651469707489014, + "learning_rate": 3.2840329566603934e-06, + "logits/chosen": -0.4768792986869812, + "logits/rejected": -0.5113643407821655, + "logps/chosen": -54.616966247558594, + "logps/rejected": -95.22816467285156, + "loss": 0.7752, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.254481554031372, + "rewards/margins": 4.780768394470215, + "rewards/rejected": -1.5262870788574219, + "step": 12221 + }, + { + "epoch": 3.06, + "grad_norm": 6.065969944000244, + "learning_rate": 3.283294727361941e-06, + "logits/chosen": -0.555618166923523, + "logits/rejected": -0.6642802357673645, + "logps/chosen": -62.12692642211914, + "logps/rejected": -110.0695571899414, + "loss": 0.6824, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8095033168792725, + "rewards/margins": 6.813120365142822, + "rewards/rejected": -4.003617286682129, + "step": 12222 + }, + { + "epoch": 3.06, + "grad_norm": 3.657189130783081, + "learning_rate": 3.282556540485099e-06, + "logits/chosen": -0.4870876967906952, + "logits/rejected": -0.5908116698265076, + "logps/chosen": -65.66008758544922, + "logps/rejected": -93.14076232910156, + "loss": 0.6323, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8197410106658936, + "rewards/margins": 6.085109710693359, + "rewards/rejected": -3.265368938446045, + "step": 12223 + }, + { + "epoch": 3.06, + "grad_norm": 5.4586262702941895, + "learning_rate": 3.281818396048112e-06, + "logits/chosen": -0.48465538024902344, + "logits/rejected": -0.5781832337379456, + "logps/chosen": -64.49514770507812, + "logps/rejected": -95.4814682006836, + "loss": 0.6851, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3660147190093994, + "rewards/margins": 6.872983455657959, + "rewards/rejected": -3.5069687366485596, + "step": 12224 + }, + { + "epoch": 3.06, + "grad_norm": 7.069784164428711, + "learning_rate": 3.281080294069221e-06, + "logits/chosen": -0.560512125492096, + "logits/rejected": -0.5806781649589539, + "logps/chosen": -53.88990020751953, + "logps/rejected": -117.31635284423828, + "loss": 0.7127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1259360313415527, + "rewards/margins": 6.588342189788818, + "rewards/rejected": -3.4624056816101074, + "step": 12225 + }, + { + "epoch": 3.06, + "grad_norm": 4.382907867431641, + "learning_rate": 3.2803422345666645e-06, + "logits/chosen": -0.503480076789856, + "logits/rejected": -0.5328930616378784, + "logps/chosen": -53.32320785522461, + "logps/rejected": -107.08335876464844, + "loss": 0.6725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3079535961151123, + "rewards/margins": 6.868955135345459, + "rewards/rejected": -3.5610015392303467, + "step": 12226 + }, + { + "epoch": 3.06, + "grad_norm": 23.18495750427246, + "learning_rate": 3.2796042175586795e-06, + "logits/chosen": -0.5672473907470703, + "logits/rejected": -0.6192862391471863, + "logps/chosen": -53.10997772216797, + "logps/rejected": -114.50065612792969, + "loss": 0.7502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.911545515060425, + "rewards/margins": 6.774683952331543, + "rewards/rejected": -3.863137722015381, + "step": 12227 + }, + { + "epoch": 3.06, + "grad_norm": 6.7212138175964355, + "learning_rate": 3.2788662430635043e-06, + "logits/chosen": -0.45543456077575684, + "logits/rejected": -0.524691641330719, + "logps/chosen": -55.188201904296875, + "logps/rejected": -94.9193115234375, + "loss": 0.6108, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1509289741516113, + "rewards/margins": 6.260133743286133, + "rewards/rejected": -3.1092042922973633, + "step": 12228 + }, + { + "epoch": 3.06, + "grad_norm": 4.232389450073242, + "learning_rate": 3.2781283110993768e-06, + "logits/chosen": -0.4986238479614258, + "logits/rejected": -0.6041396856307983, + "logps/chosen": -50.20515060424805, + "logps/rejected": -92.16644287109375, + "loss": 0.6582, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8886375427246094, + "rewards/margins": 6.505997657775879, + "rewards/rejected": -3.6173596382141113, + "step": 12229 + }, + { + "epoch": 3.06, + "grad_norm": 8.161688804626465, + "learning_rate": 3.2773904216845276e-06, + "logits/chosen": -0.5437089204788208, + "logits/rejected": -0.6118483543395996, + "logps/chosen": -59.113014221191406, + "logps/rejected": -90.83401489257812, + "loss": 0.653, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.855940103530884, + "rewards/margins": 5.412198066711426, + "rewards/rejected": -2.556257724761963, + "step": 12230 + }, + { + "epoch": 3.06, + "grad_norm": 7.081912517547607, + "learning_rate": 3.2766525748371947e-06, + "logits/chosen": -0.5054824352264404, + "logits/rejected": -0.586155116558075, + "logps/chosen": -50.89976501464844, + "logps/rejected": -110.57562255859375, + "loss": 0.6044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1672682762145996, + "rewards/margins": 7.142902374267578, + "rewards/rejected": -3.9756343364715576, + "step": 12231 + }, + { + "epoch": 3.06, + "grad_norm": 5.217398643493652, + "learning_rate": 3.275914770575608e-06, + "logits/chosen": -0.5509785413742065, + "logits/rejected": -0.6202711462974548, + "logps/chosen": -65.2163314819336, + "logps/rejected": -96.950927734375, + "loss": 0.7739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3224165439605713, + "rewards/margins": 6.585113525390625, + "rewards/rejected": -3.2626967430114746, + "step": 12232 + }, + { + "epoch": 3.06, + "grad_norm": 7.537123680114746, + "learning_rate": 3.275177008918004e-06, + "logits/chosen": -0.49949848651885986, + "logits/rejected": -0.5658356547355652, + "logps/chosen": -57.766822814941406, + "logps/rejected": -120.96824645996094, + "loss": 0.6973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8475756645202637, + "rewards/margins": 7.597232341766357, + "rewards/rejected": -4.749655723571777, + "step": 12233 + }, + { + "epoch": 3.06, + "grad_norm": 2.8599348068237305, + "learning_rate": 3.2744392898826092e-06, + "logits/chosen": -0.42550379037857056, + "logits/rejected": -0.538551390171051, + "logps/chosen": -52.806983947753906, + "logps/rejected": -104.2800521850586, + "loss": 0.5492, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4279487133026123, + "rewards/margins": 7.4351959228515625, + "rewards/rejected": -4.007247447967529, + "step": 12234 + }, + { + "epoch": 3.06, + "grad_norm": 3.1501924991607666, + "learning_rate": 3.2737016134876537e-06, + "logits/chosen": -0.4623059630393982, + "logits/rejected": -0.494841605424881, + "logps/chosen": -53.127410888671875, + "logps/rejected": -107.73143005371094, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4631733894348145, + "rewards/margins": 7.4926042556762695, + "rewards/rejected": -4.029429912567139, + "step": 12235 + }, + { + "epoch": 3.06, + "grad_norm": 7.954895496368408, + "learning_rate": 3.2729639797513683e-06, + "logits/chosen": -0.43835729360580444, + "logits/rejected": -0.577267587184906, + "logps/chosen": -57.749671936035156, + "logps/rejected": -93.31990051269531, + "loss": 0.5517, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3160901069641113, + "rewards/margins": 7.175899982452393, + "rewards/rejected": -3.8598098754882812, + "step": 12236 + }, + { + "epoch": 3.06, + "grad_norm": 5.073375701904297, + "learning_rate": 3.2722263886919812e-06, + "logits/chosen": -0.5004850625991821, + "logits/rejected": -0.5952270030975342, + "logps/chosen": -54.744720458984375, + "logps/rejected": -110.07119750976562, + "loss": 0.6026, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.789069175720215, + "rewards/margins": 6.940876007080078, + "rewards/rejected": -4.151806831359863, + "step": 12237 + }, + { + "epoch": 3.06, + "grad_norm": 1.7271312475204468, + "learning_rate": 3.2714888403277167e-06, + "logits/chosen": -0.567704975605011, + "logits/rejected": -0.6950971484184265, + "logps/chosen": -49.49198532104492, + "logps/rejected": -104.61127471923828, + "loss": 0.5973, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3069963455200195, + "rewards/margins": 9.27302074432373, + "rewards/rejected": -5.9660234451293945, + "step": 12238 + }, + { + "epoch": 3.06, + "grad_norm": 7.068892478942871, + "learning_rate": 3.2707513346768017e-06, + "logits/chosen": -0.5532500743865967, + "logits/rejected": -0.5948351621627808, + "logps/chosen": -49.664974212646484, + "logps/rejected": -125.07470703125, + "loss": 0.6341, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8717846870422363, + "rewards/margins": 7.737828731536865, + "rewards/rejected": -4.866044044494629, + "step": 12239 + }, + { + "epoch": 3.06, + "grad_norm": 9.118992805480957, + "learning_rate": 3.2700138717574614e-06, + "logits/chosen": -0.5745697617530823, + "logits/rejected": -0.6025352478027344, + "logps/chosen": -49.05361557006836, + "logps/rejected": -96.40701293945312, + "loss": 0.7081, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9096767902374268, + "rewards/margins": 5.664865970611572, + "rewards/rejected": -2.7551894187927246, + "step": 12240 + }, + { + "epoch": 3.06, + "grad_norm": 7.020886421203613, + "learning_rate": 3.269276451587917e-06, + "logits/chosen": -0.48166465759277344, + "logits/rejected": -0.5668900012969971, + "logps/chosen": -56.43048858642578, + "logps/rejected": -103.66746520996094, + "loss": 0.8036, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.680732011795044, + "rewards/margins": 6.632105827331543, + "rewards/rejected": -3.95137357711792, + "step": 12241 + }, + { + "epoch": 3.06, + "grad_norm": 5.620092868804932, + "learning_rate": 3.268539074186393e-06, + "logits/chosen": -0.5877602100372314, + "logits/rejected": -0.6890968084335327, + "logps/chosen": -52.43080520629883, + "logps/rejected": -106.91596221923828, + "loss": 0.7004, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2727200984954834, + "rewards/margins": 7.116498947143555, + "rewards/rejected": -3.8437788486480713, + "step": 12242 + }, + { + "epoch": 3.06, + "grad_norm": 9.998948097229004, + "learning_rate": 3.2678017395711098e-06, + "logits/chosen": -0.454881876707077, + "logits/rejected": -0.5408668518066406, + "logps/chosen": -53.510902404785156, + "logps/rejected": -95.52418518066406, + "loss": 0.724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.144646167755127, + "rewards/margins": 5.929498672485352, + "rewards/rejected": -2.7848527431488037, + "step": 12243 + }, + { + "epoch": 3.06, + "grad_norm": 4.751147270202637, + "learning_rate": 3.2670644477602886e-06, + "logits/chosen": -0.5611423850059509, + "logits/rejected": -0.5867279171943665, + "logps/chosen": -51.123565673828125, + "logps/rejected": -95.623291015625, + "loss": 0.6606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0517055988311768, + "rewards/margins": 5.315338611602783, + "rewards/rejected": -2.2636325359344482, + "step": 12244 + }, + { + "epoch": 3.06, + "grad_norm": 4.615649700164795, + "learning_rate": 3.2663271987721478e-06, + "logits/chosen": -0.46871352195739746, + "logits/rejected": -0.5159378051757812, + "logps/chosen": -60.32209777832031, + "logps/rejected": -117.46414184570312, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.116401195526123, + "rewards/margins": 6.025130748748779, + "rewards/rejected": -2.908729076385498, + "step": 12245 + }, + { + "epoch": 3.06, + "grad_norm": 4.741203784942627, + "learning_rate": 3.2655899926249056e-06, + "logits/chosen": -0.482282429933548, + "logits/rejected": -0.5792694091796875, + "logps/chosen": -54.09208297729492, + "logps/rejected": -98.69900512695312, + "loss": 0.6644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0363035202026367, + "rewards/margins": 6.175539970397949, + "rewards/rejected": -3.1392364501953125, + "step": 12246 + }, + { + "epoch": 3.06, + "grad_norm": 2.6776201725006104, + "learning_rate": 3.264852829336781e-06, + "logits/chosen": -0.4950105547904968, + "logits/rejected": -0.5994723439216614, + "logps/chosen": -56.25330352783203, + "logps/rejected": -119.02067565917969, + "loss": 0.5441, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.02675199508667, + "rewards/margins": 8.586087226867676, + "rewards/rejected": -5.559335231781006, + "step": 12247 + }, + { + "epoch": 3.06, + "grad_norm": 6.565855026245117, + "learning_rate": 3.2641157089259875e-06, + "logits/chosen": -0.5029414892196655, + "logits/rejected": -0.5656448602676392, + "logps/chosen": -46.818485260009766, + "logps/rejected": -98.53366088867188, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9944992065429688, + "rewards/margins": 6.537854194641113, + "rewards/rejected": -3.5433545112609863, + "step": 12248 + }, + { + "epoch": 3.06, + "grad_norm": 7.294553279876709, + "learning_rate": 3.2633786314107395e-06, + "logits/chosen": -0.5170546174049377, + "logits/rejected": -0.6119490265846252, + "logps/chosen": -51.0933837890625, + "logps/rejected": -105.59536743164062, + "loss": 0.5941, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9409704208374023, + "rewards/margins": 6.597658157348633, + "rewards/rejected": -3.6566879749298096, + "step": 12249 + }, + { + "epoch": 3.06, + "grad_norm": 4.839272975921631, + "learning_rate": 3.262641596809254e-06, + "logits/chosen": -0.4121876657009125, + "logits/rejected": -0.551367998123169, + "logps/chosen": -71.59606170654297, + "logps/rejected": -94.34884643554688, + "loss": 0.6092, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.95465087890625, + "rewards/margins": 6.145443916320801, + "rewards/rejected": -3.19079327583313, + "step": 12250 + }, + { + "epoch": 3.06, + "grad_norm": 4.844391345977783, + "learning_rate": 3.2619046051397453e-06, + "logits/chosen": -0.5056610107421875, + "logits/rejected": -0.5486555099487305, + "logps/chosen": -52.108787536621094, + "logps/rejected": -101.68486785888672, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.217766284942627, + "rewards/margins": 6.731822967529297, + "rewards/rejected": -3.514056444168091, + "step": 12251 + }, + { + "epoch": 3.07, + "grad_norm": 2.607684373855591, + "learning_rate": 3.2611676564204187e-06, + "logits/chosen": -0.4677971601486206, + "logits/rejected": -0.5472362637519836, + "logps/chosen": -49.42625427246094, + "logps/rejected": -119.0660629272461, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9147067070007324, + "rewards/margins": 7.6423420906066895, + "rewards/rejected": -4.727635383605957, + "step": 12252 + }, + { + "epoch": 3.07, + "grad_norm": 5.15146017074585, + "learning_rate": 3.26043075066949e-06, + "logits/chosen": -0.44965115189552307, + "logits/rejected": -0.5938256978988647, + "logps/chosen": -59.82479476928711, + "logps/rejected": -97.32603454589844, + "loss": 0.6337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.465564489364624, + "rewards/margins": 6.36134672164917, + "rewards/rejected": -3.895782470703125, + "step": 12253 + }, + { + "epoch": 3.07, + "grad_norm": 8.85187816619873, + "learning_rate": 3.259693887905169e-06, + "logits/chosen": -0.4918394088745117, + "logits/rejected": -0.5812007188796997, + "logps/chosen": -65.25066375732422, + "logps/rejected": -114.17382049560547, + "loss": 0.7158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7996020317077637, + "rewards/margins": 6.985726833343506, + "rewards/rejected": -4.186124801635742, + "step": 12254 + }, + { + "epoch": 3.07, + "grad_norm": 1.847115397453308, + "learning_rate": 3.258957068145662e-06, + "logits/chosen": -0.6405659317970276, + "logits/rejected": -0.7039552927017212, + "logps/chosen": -52.99677276611328, + "logps/rejected": -98.6133041381836, + "loss": 0.5555, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1991426944732666, + "rewards/margins": 7.258700370788574, + "rewards/rejected": -4.059557914733887, + "step": 12255 + }, + { + "epoch": 3.07, + "grad_norm": 5.716558456420898, + "learning_rate": 3.258220291409177e-06, + "logits/chosen": -0.6007509231567383, + "logits/rejected": -0.6584211587905884, + "logps/chosen": -60.27398681640625, + "logps/rejected": -103.06871032714844, + "loss": 0.7055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.888765335083008, + "rewards/margins": 6.076488971710205, + "rewards/rejected": -3.1877236366271973, + "step": 12256 + }, + { + "epoch": 3.07, + "grad_norm": 3.885826587677002, + "learning_rate": 3.257483557713922e-06, + "logits/chosen": -0.5124852061271667, + "logits/rejected": -0.6063995361328125, + "logps/chosen": -52.56074905395508, + "logps/rejected": -101.47242736816406, + "loss": 0.5696, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.396134614944458, + "rewards/margins": 6.557165145874023, + "rewards/rejected": -3.1610312461853027, + "step": 12257 + }, + { + "epoch": 3.07, + "grad_norm": 5.065489292144775, + "learning_rate": 3.2567468670781023e-06, + "logits/chosen": -0.5122093558311462, + "logits/rejected": -0.6100783348083496, + "logps/chosen": -59.068416595458984, + "logps/rejected": -99.01453399658203, + "loss": 0.6139, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.780895233154297, + "rewards/margins": 6.521657943725586, + "rewards/rejected": -3.740762710571289, + "step": 12258 + }, + { + "epoch": 3.07, + "grad_norm": 4.63286018371582, + "learning_rate": 3.2560102195199213e-06, + "logits/chosen": -0.5236160755157471, + "logits/rejected": -0.5927254557609558, + "logps/chosen": -53.71577835083008, + "logps/rejected": -91.23338317871094, + "loss": 0.6642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1031415462493896, + "rewards/margins": 5.965058326721191, + "rewards/rejected": -2.8619165420532227, + "step": 12259 + }, + { + "epoch": 3.07, + "grad_norm": 13.051931381225586, + "learning_rate": 3.2552736150575827e-06, + "logits/chosen": -0.5348827838897705, + "logits/rejected": -0.5490366220474243, + "logps/chosen": -61.957496643066406, + "logps/rejected": -127.51860809326172, + "loss": 0.6977, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1246368885040283, + "rewards/margins": 7.013082504272461, + "rewards/rejected": -3.8884458541870117, + "step": 12260 + }, + { + "epoch": 3.07, + "grad_norm": 3.565065860748291, + "learning_rate": 3.2545370537092867e-06, + "logits/chosen": -0.47392338514328003, + "logits/rejected": -0.4944378733634949, + "logps/chosen": -49.677162170410156, + "logps/rejected": -121.57090759277344, + "loss": 0.5867, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1092493534088135, + "rewards/margins": 5.609915256500244, + "rewards/rejected": -2.500666618347168, + "step": 12261 + }, + { + "epoch": 3.07, + "grad_norm": 36.20662307739258, + "learning_rate": 3.2538005354932413e-06, + "logits/chosen": -0.5163140892982483, + "logits/rejected": -0.5594083070755005, + "logps/chosen": -45.328128814697266, + "logps/rejected": -103.85502624511719, + "loss": 0.7788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.10355281829834, + "rewards/margins": 6.791949272155762, + "rewards/rejected": -3.68839693069458, + "step": 12262 + }, + { + "epoch": 3.07, + "grad_norm": 5.959166049957275, + "learning_rate": 3.253064060427639e-06, + "logits/chosen": -0.6033843755722046, + "logits/rejected": -0.6690850257873535, + "logps/chosen": -59.44708251953125, + "logps/rejected": -101.2444076538086, + "loss": 0.6873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.091850757598877, + "rewards/margins": 5.961709976196289, + "rewards/rejected": -2.869858503341675, + "step": 12263 + }, + { + "epoch": 3.07, + "grad_norm": 25.52237319946289, + "learning_rate": 3.252327628530681e-06, + "logits/chosen": -0.6012462973594666, + "logits/rejected": -0.6633444428443909, + "logps/chosen": -43.03474807739258, + "logps/rejected": -106.67528533935547, + "loss": 0.6876, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1353201866149902, + "rewards/margins": 7.744187355041504, + "rewards/rejected": -4.608867168426514, + "step": 12264 + }, + { + "epoch": 3.07, + "grad_norm": 6.157412528991699, + "learning_rate": 3.2515912398205695e-06, + "logits/chosen": -0.47727712988853455, + "logits/rejected": -0.5066834688186646, + "logps/chosen": -61.496822357177734, + "logps/rejected": -106.41561889648438, + "loss": 0.734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.284268379211426, + "rewards/margins": 6.025883674621582, + "rewards/rejected": -2.741614818572998, + "step": 12265 + }, + { + "epoch": 3.07, + "grad_norm": 3.6876354217529297, + "learning_rate": 3.250854894315495e-06, + "logits/chosen": -0.5602087378501892, + "logits/rejected": -0.6692144870758057, + "logps/chosen": -63.0426025390625, + "logps/rejected": -91.42762756347656, + "loss": 0.7083, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.669264554977417, + "rewards/margins": 6.505959510803223, + "rewards/rejected": -3.8366951942443848, + "step": 12266 + }, + { + "epoch": 3.07, + "grad_norm": 3.3579206466674805, + "learning_rate": 3.2501185920336574e-06, + "logits/chosen": -0.5946388840675354, + "logits/rejected": -0.6269234418869019, + "logps/chosen": -53.719825744628906, + "logps/rejected": -112.5296630859375, + "loss": 0.6248, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.932589054107666, + "rewards/margins": 7.39975118637085, + "rewards/rejected": -4.467162132263184, + "step": 12267 + }, + { + "epoch": 3.07, + "grad_norm": 4.153648376464844, + "learning_rate": 3.2493823329932504e-06, + "logits/chosen": -0.5244566798210144, + "logits/rejected": -0.6218781471252441, + "logps/chosen": -69.48188781738281, + "logps/rejected": -100.41511535644531, + "loss": 0.6854, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.87971568107605, + "rewards/margins": 6.6811957359313965, + "rewards/rejected": -3.801480770111084, + "step": 12268 + }, + { + "epoch": 3.07, + "grad_norm": 5.845254898071289, + "learning_rate": 3.2486461172124693e-06, + "logits/chosen": -0.45897823572158813, + "logits/rejected": -0.5433811545372009, + "logps/chosen": -56.16851043701172, + "logps/rejected": -102.15193176269531, + "loss": 0.7513, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0317044258117676, + "rewards/margins": 6.734912872314453, + "rewards/rejected": -3.7032077312469482, + "step": 12269 + }, + { + "epoch": 3.07, + "grad_norm": 18.410110473632812, + "learning_rate": 3.247909944709505e-06, + "logits/chosen": -0.5425946712493896, + "logits/rejected": -0.6234522461891174, + "logps/chosen": -66.52822875976562, + "logps/rejected": -89.83905029296875, + "loss": 0.7088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3124241828918457, + "rewards/margins": 5.7906012535095215, + "rewards/rejected": -2.478177547454834, + "step": 12270 + }, + { + "epoch": 3.07, + "grad_norm": 4.0675225257873535, + "learning_rate": 3.247173815502549e-06, + "logits/chosen": -0.4867895543575287, + "logits/rejected": -0.5950707793235779, + "logps/chosen": -64.17195129394531, + "logps/rejected": -106.366455078125, + "loss": 0.5939, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.843108654022217, + "rewards/margins": 7.263520240783691, + "rewards/rejected": -4.420412540435791, + "step": 12271 + }, + { + "epoch": 3.07, + "grad_norm": 3.170475959777832, + "learning_rate": 3.2464377296097933e-06, + "logits/chosen": -0.5458304286003113, + "logits/rejected": -0.6278484463691711, + "logps/chosen": -52.79986572265625, + "logps/rejected": -101.76834106445312, + "loss": 0.6083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103846549987793, + "rewards/margins": 7.282027721405029, + "rewards/rejected": -4.178181171417236, + "step": 12272 + }, + { + "epoch": 3.07, + "grad_norm": 10.399528503417969, + "learning_rate": 3.2457016870494256e-06, + "logits/chosen": -0.5026660561561584, + "logits/rejected": -0.6293616890907288, + "logps/chosen": -54.07474136352539, + "logps/rejected": -102.67142486572266, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8171567916870117, + "rewards/margins": 6.789664268493652, + "rewards/rejected": -3.9725077152252197, + "step": 12273 + }, + { + "epoch": 3.07, + "grad_norm": 5.517347812652588, + "learning_rate": 3.244965687839635e-06, + "logits/chosen": -0.4847111999988556, + "logits/rejected": -0.6012165546417236, + "logps/chosen": -54.65180206298828, + "logps/rejected": -105.4210433959961, + "loss": 0.6522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9470248222351074, + "rewards/margins": 7.0794243812561035, + "rewards/rejected": -4.132399559020996, + "step": 12274 + }, + { + "epoch": 3.07, + "grad_norm": 4.596286773681641, + "learning_rate": 3.2442297319986092e-06, + "logits/chosen": -0.49043479561805725, + "logits/rejected": -0.5678002238273621, + "logps/chosen": -49.016414642333984, + "logps/rejected": -113.03092193603516, + "loss": 0.5473, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2029287815093994, + "rewards/margins": 8.127801895141602, + "rewards/rejected": -4.924871921539307, + "step": 12275 + }, + { + "epoch": 3.07, + "grad_norm": 4.671440601348877, + "learning_rate": 3.2434938195445355e-06, + "logits/chosen": -0.6246257424354553, + "logits/rejected": -0.7142643332481384, + "logps/chosen": -50.4162712097168, + "logps/rejected": -89.68531036376953, + "loss": 0.6744, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1319937705993652, + "rewards/margins": 6.11922025680542, + "rewards/rejected": -2.9872264862060547, + "step": 12276 + }, + { + "epoch": 3.07, + "grad_norm": 3.0279736518859863, + "learning_rate": 3.2427579504955963e-06, + "logits/chosen": -0.5185391902923584, + "logits/rejected": -0.648553729057312, + "logps/chosen": -59.96815490722656, + "logps/rejected": -90.22655487060547, + "loss": 0.6142, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9088845252990723, + "rewards/margins": 6.34574556350708, + "rewards/rejected": -3.436861038208008, + "step": 12277 + }, + { + "epoch": 3.07, + "grad_norm": 10.192878723144531, + "learning_rate": 3.2420221248699768e-06, + "logits/chosen": -0.4607078731060028, + "logits/rejected": -0.5789585709571838, + "logps/chosen": -54.604339599609375, + "logps/rejected": -86.35981750488281, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.951566696166992, + "rewards/margins": 5.807301044464111, + "rewards/rejected": -2.855734348297119, + "step": 12278 + }, + { + "epoch": 3.07, + "grad_norm": 8.770127296447754, + "learning_rate": 3.2412863426858633e-06, + "logits/chosen": -0.527645468711853, + "logits/rejected": -0.6563251614570618, + "logps/chosen": -66.38499450683594, + "logps/rejected": -98.962158203125, + "loss": 0.5689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7682383060455322, + "rewards/margins": 5.944878578186035, + "rewards/rejected": -3.176640033721924, + "step": 12279 + }, + { + "epoch": 3.07, + "grad_norm": 9.382827758789062, + "learning_rate": 3.2405506039614332e-06, + "logits/chosen": -0.48880788683891296, + "logits/rejected": -0.5974090695381165, + "logps/chosen": -47.43601989746094, + "logps/rejected": -86.21856689453125, + "loss": 0.5667, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140155792236328, + "rewards/margins": 6.422662258148193, + "rewards/rejected": -3.2825067043304443, + "step": 12280 + }, + { + "epoch": 3.07, + "grad_norm": 3.826176881790161, + "learning_rate": 3.239814908714867e-06, + "logits/chosen": -0.6075987219810486, + "logits/rejected": -0.6783717274665833, + "logps/chosen": -56.53703308105469, + "logps/rejected": -114.45848083496094, + "loss": 0.7037, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.815150260925293, + "rewards/margins": 8.143864631652832, + "rewards/rejected": -5.3287153244018555, + "step": 12281 + }, + { + "epoch": 3.07, + "grad_norm": 2.804943561553955, + "learning_rate": 3.2390792569643485e-06, + "logits/chosen": -0.5441089272499084, + "logits/rejected": -0.6683695316314697, + "logps/chosen": -55.737545013427734, + "logps/rejected": -100.3191146850586, + "loss": 0.5958, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2750329971313477, + "rewards/margins": 7.586552619934082, + "rewards/rejected": -4.311519622802734, + "step": 12282 + }, + { + "epoch": 3.07, + "grad_norm": 7.412570953369141, + "learning_rate": 3.2383436487280557e-06, + "logits/chosen": -0.43904852867126465, + "logits/rejected": -0.5083810687065125, + "logps/chosen": -59.48227310180664, + "logps/rejected": -104.73536682128906, + "loss": 0.6243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0612950325012207, + "rewards/margins": 6.547845840454102, + "rewards/rejected": -3.4865505695343018, + "step": 12283 + }, + { + "epoch": 3.07, + "grad_norm": 3.6392757892608643, + "learning_rate": 3.2376080840241635e-06, + "logits/chosen": -0.562228798866272, + "logits/rejected": -0.5975016951560974, + "logps/chosen": -42.92582321166992, + "logps/rejected": -110.71992492675781, + "loss": 0.5572, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6279735565185547, + "rewards/margins": 8.242862701416016, + "rewards/rejected": -4.614889144897461, + "step": 12284 + }, + { + "epoch": 3.07, + "grad_norm": 7.996018409729004, + "learning_rate": 3.2368725628708507e-06, + "logits/chosen": -0.5194208025932312, + "logits/rejected": -0.5797157883644104, + "logps/chosen": -89.78804016113281, + "logps/rejected": -99.71467590332031, + "loss": 0.6106, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0627524852752686, + "rewards/margins": 7.2852396965026855, + "rewards/rejected": -4.222487926483154, + "step": 12285 + }, + { + "epoch": 3.07, + "grad_norm": 11.316365242004395, + "learning_rate": 3.236137085286293e-06, + "logits/chosen": -0.511515736579895, + "logits/rejected": -0.6294424533843994, + "logps/chosen": -63.77295684814453, + "logps/rejected": -106.91409301757812, + "loss": 0.8049, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.782611131668091, + "rewards/margins": 5.925151824951172, + "rewards/rejected": -3.1425399780273438, + "step": 12286 + }, + { + "epoch": 3.07, + "grad_norm": 2.712745428085327, + "learning_rate": 3.2354016512886626e-06, + "logits/chosen": -0.47788864374160767, + "logits/rejected": -0.5653828382492065, + "logps/chosen": -56.14884948730469, + "logps/rejected": -80.24920654296875, + "loss": 0.5804, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.276254653930664, + "rewards/margins": 6.163234233856201, + "rewards/rejected": -2.886979579925537, + "step": 12287 + }, + { + "epoch": 3.07, + "grad_norm": 4.974842071533203, + "learning_rate": 3.2346662608961347e-06, + "logits/chosen": -0.4632304608821869, + "logits/rejected": -0.5775704979896545, + "logps/chosen": -64.85014343261719, + "logps/rejected": -92.85381317138672, + "loss": 0.723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.99271297454834, + "rewards/margins": 4.594269275665283, + "rewards/rejected": -1.6015567779541016, + "step": 12288 + }, + { + "epoch": 3.07, + "grad_norm": 5.094089984893799, + "learning_rate": 3.2339309141268815e-06, + "logits/chosen": -0.540046751499176, + "logits/rejected": -0.5912935733795166, + "logps/chosen": -53.180702209472656, + "logps/rejected": -106.50251007080078, + "loss": 0.6809, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.224808692932129, + "rewards/margins": 6.452200412750244, + "rewards/rejected": -3.227391481399536, + "step": 12289 + }, + { + "epoch": 3.07, + "grad_norm": 4.3975911140441895, + "learning_rate": 3.2331956109990747e-06, + "logits/chosen": -0.5098950862884521, + "logits/rejected": -0.5978832244873047, + "logps/chosen": -66.04241943359375, + "logps/rejected": -98.24332427978516, + "loss": 0.7156, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4587724208831787, + "rewards/margins": 5.911362648010254, + "rewards/rejected": -3.4525907039642334, + "step": 12290 + }, + { + "epoch": 3.07, + "grad_norm": 4.7539777755737305, + "learning_rate": 3.2324603515308824e-06, + "logits/chosen": -0.4342458248138428, + "logits/rejected": -0.49295473098754883, + "logps/chosen": -53.982177734375, + "logps/rejected": -128.68057250976562, + "loss": 0.5673, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4854345321655273, + "rewards/margins": 8.536849975585938, + "rewards/rejected": -5.05141544342041, + "step": 12291 + }, + { + "epoch": 3.08, + "grad_norm": 4.3395209312438965, + "learning_rate": 3.231725135740474e-06, + "logits/chosen": -0.4870626926422119, + "logits/rejected": -0.6027835607528687, + "logps/chosen": -48.537445068359375, + "logps/rejected": -84.42798614501953, + "loss": 0.5884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1772267818450928, + "rewards/margins": 7.022227764129639, + "rewards/rejected": -3.8450019359588623, + "step": 12292 + }, + { + "epoch": 3.08, + "grad_norm": 2.9347150325775146, + "learning_rate": 3.230989963646019e-06, + "logits/chosen": -0.4587670564651489, + "logits/rejected": -0.5648314356803894, + "logps/chosen": -55.85433578491211, + "logps/rejected": -108.867431640625, + "loss": 0.5613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0435359477996826, + "rewards/margins": 6.667305946350098, + "rewards/rejected": -3.623769998550415, + "step": 12293 + }, + { + "epoch": 3.08, + "grad_norm": 31.173891067504883, + "learning_rate": 3.230254835265686e-06, + "logits/chosen": -0.5083850622177124, + "logits/rejected": -0.6175334453582764, + "logps/chosen": -61.36046600341797, + "logps/rejected": -85.13522338867188, + "loss": 0.8742, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9032793045043945, + "rewards/margins": 5.206199645996094, + "rewards/rejected": -2.302920341491699, + "step": 12294 + }, + { + "epoch": 3.08, + "grad_norm": 8.255009651184082, + "learning_rate": 3.2295197506176353e-06, + "logits/chosen": -0.5169591903686523, + "logits/rejected": -0.5649654865264893, + "logps/chosen": -48.36726760864258, + "logps/rejected": -114.75373840332031, + "loss": 0.6041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.121795654296875, + "rewards/margins": 6.784192085266113, + "rewards/rejected": -3.662396192550659, + "step": 12295 + }, + { + "epoch": 3.08, + "grad_norm": 4.183018684387207, + "learning_rate": 3.2287847097200354e-06, + "logits/chosen": -0.4985811710357666, + "logits/rejected": -0.5710545182228088, + "logps/chosen": -56.03649139404297, + "logps/rejected": -102.91673278808594, + "loss": 0.613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2095108032226562, + "rewards/margins": 7.055379867553711, + "rewards/rejected": -3.8458688259124756, + "step": 12296 + }, + { + "epoch": 3.08, + "grad_norm": 5.510020732879639, + "learning_rate": 3.2280497125910504e-06, + "logits/chosen": -0.5459691286087036, + "logits/rejected": -0.6312212347984314, + "logps/chosen": -54.31426239013672, + "logps/rejected": -108.77823638916016, + "loss": 0.6783, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2581868171691895, + "rewards/margins": 7.960043430328369, + "rewards/rejected": -4.701857089996338, + "step": 12297 + }, + { + "epoch": 3.08, + "grad_norm": 5.3876237869262695, + "learning_rate": 3.227314759248841e-06, + "logits/chosen": -0.49317729473114014, + "logits/rejected": -0.564011812210083, + "logps/chosen": -59.091766357421875, + "logps/rejected": -135.60362243652344, + "loss": 0.6416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2164652347564697, + "rewards/margins": 9.24937915802002, + "rewards/rejected": -6.0329132080078125, + "step": 12298 + }, + { + "epoch": 3.08, + "grad_norm": 6.2387919425964355, + "learning_rate": 3.2265798497115697e-06, + "logits/chosen": -0.45189204812049866, + "logits/rejected": -0.5361199975013733, + "logps/chosen": -62.76804733276367, + "logps/rejected": -105.00326538085938, + "loss": 0.8306, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.934875011444092, + "rewards/margins": 5.343432426452637, + "rewards/rejected": -2.408557653427124, + "step": 12299 + }, + { + "epoch": 3.08, + "grad_norm": 5.136336803436279, + "learning_rate": 3.225844983997396e-06, + "logits/chosen": -0.5254368782043457, + "logits/rejected": -0.5791503190994263, + "logps/chosen": -71.13892364501953, + "logps/rejected": -100.29834747314453, + "loss": 0.7871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.837087869644165, + "rewards/margins": 5.9717230796813965, + "rewards/rejected": -3.1346347332000732, + "step": 12300 + }, + { + "epoch": 3.08, + "grad_norm": 14.311466217041016, + "learning_rate": 3.225110162124481e-06, + "logits/chosen": -0.5025720000267029, + "logits/rejected": -0.5615400075912476, + "logps/chosen": -49.18830108642578, + "logps/rejected": -96.83951568603516, + "loss": 0.6186, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9380319118499756, + "rewards/margins": 6.444177150726318, + "rewards/rejected": -3.506146192550659, + "step": 12301 + }, + { + "epoch": 3.08, + "grad_norm": 11.970272064208984, + "learning_rate": 3.224375384110981e-06, + "logits/chosen": -0.5674281716346741, + "logits/rejected": -0.6183990240097046, + "logps/chosen": -42.29916763305664, + "logps/rejected": -101.67181396484375, + "loss": 0.6286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1526012420654297, + "rewards/margins": 6.1408538818359375, + "rewards/rejected": -2.9882524013519287, + "step": 12302 + }, + { + "epoch": 3.08, + "grad_norm": 6.851158618927002, + "learning_rate": 3.223640649975054e-06, + "logits/chosen": -0.481103777885437, + "logits/rejected": -0.5689141154289246, + "logps/chosen": -64.08415222167969, + "logps/rejected": -103.09440612792969, + "loss": 0.6766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0715198516845703, + "rewards/margins": 6.284320831298828, + "rewards/rejected": -3.212801456451416, + "step": 12303 + }, + { + "epoch": 3.08, + "grad_norm": 5.964145183563232, + "learning_rate": 3.222905959734856e-06, + "logits/chosen": -0.6048198342323303, + "logits/rejected": -0.6620896458625793, + "logps/chosen": -50.699676513671875, + "logps/rejected": -122.82246398925781, + "loss": 0.5895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.933211088180542, + "rewards/margins": 7.0602288246154785, + "rewards/rejected": -4.127017974853516, + "step": 12304 + }, + { + "epoch": 3.08, + "grad_norm": 5.642073154449463, + "learning_rate": 3.222171313408542e-06, + "logits/chosen": -0.5702185034751892, + "logits/rejected": -0.6300877332687378, + "logps/chosen": -53.874183654785156, + "logps/rejected": -107.99160766601562, + "loss": 0.5916, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6969552040100098, + "rewards/margins": 6.798526763916016, + "rewards/rejected": -4.101571083068848, + "step": 12305 + }, + { + "epoch": 3.08, + "grad_norm": 3.3269176483154297, + "learning_rate": 3.2214367110142654e-06, + "logits/chosen": -0.46004897356033325, + "logits/rejected": -0.54086834192276, + "logps/chosen": -53.06709671020508, + "logps/rejected": -112.86358642578125, + "loss": 0.5445, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.013991355895996, + "rewards/margins": 7.3948750495910645, + "rewards/rejected": -4.380883693695068, + "step": 12306 + }, + { + "epoch": 3.08, + "grad_norm": 9.99292278289795, + "learning_rate": 3.2207021525701782e-06, + "logits/chosen": -0.5142192244529724, + "logits/rejected": -0.5996112823486328, + "logps/chosen": -63.581199645996094, + "logps/rejected": -114.53773498535156, + "loss": 0.7273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.954415798187256, + "rewards/margins": 5.846734523773193, + "rewards/rejected": -2.8923189640045166, + "step": 12307 + }, + { + "epoch": 3.08, + "grad_norm": 3.4147934913635254, + "learning_rate": 3.2199676380944368e-06, + "logits/chosen": -0.5737309455871582, + "logits/rejected": -0.6661889553070068, + "logps/chosen": -71.23514556884766, + "logps/rejected": -93.6622543334961, + "loss": 0.6077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.882719039916992, + "rewards/margins": 6.4789934158325195, + "rewards/rejected": -3.5962741374969482, + "step": 12308 + }, + { + "epoch": 3.08, + "grad_norm": 8.620540618896484, + "learning_rate": 3.219233167605185e-06, + "logits/chosen": -0.5378757119178772, + "logits/rejected": -0.5782551765441895, + "logps/chosen": -56.08674621582031, + "logps/rejected": -95.6041259765625, + "loss": 0.6497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.187798500061035, + "rewards/margins": 6.4146809577941895, + "rewards/rejected": -3.2268824577331543, + "step": 12309 + }, + { + "epoch": 3.08, + "grad_norm": 3.997546911239624, + "learning_rate": 3.218498741120577e-06, + "logits/chosen": -0.573278546333313, + "logits/rejected": -0.6634801626205444, + "logps/chosen": -56.309837341308594, + "logps/rejected": -106.1787109375, + "loss": 0.6158, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1753954887390137, + "rewards/margins": 6.711189270019531, + "rewards/rejected": -3.5357933044433594, + "step": 12310 + }, + { + "epoch": 3.08, + "grad_norm": 3.4481163024902344, + "learning_rate": 3.2177643586587613e-06, + "logits/chosen": -0.5151534676551819, + "logits/rejected": -0.6184320449829102, + "logps/chosen": -47.73740005493164, + "logps/rejected": -107.96954345703125, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9601690769195557, + "rewards/margins": 7.804357051849365, + "rewards/rejected": -4.8441877365112305, + "step": 12311 + }, + { + "epoch": 3.08, + "grad_norm": 7.679973602294922, + "learning_rate": 3.2170300202378823e-06, + "logits/chosen": -0.5379554629325867, + "logits/rejected": -0.6175243854522705, + "logps/chosen": -59.78063201904297, + "logps/rejected": -106.08057403564453, + "loss": 0.6774, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0377464294433594, + "rewards/margins": 6.108396530151367, + "rewards/rejected": -3.070650577545166, + "step": 12312 + }, + { + "epoch": 3.08, + "grad_norm": 5.196496486663818, + "learning_rate": 3.216295725876088e-06, + "logits/chosen": -0.45654645562171936, + "logits/rejected": -0.5702878832817078, + "logps/chosen": -66.22433471679688, + "logps/rejected": -97.78025817871094, + "loss": 0.7457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.268899917602539, + "rewards/margins": 6.265066146850586, + "rewards/rejected": -2.996166229248047, + "step": 12313 + }, + { + "epoch": 3.08, + "grad_norm": 3.430420160293579, + "learning_rate": 3.2155614755915236e-06, + "logits/chosen": -0.5080515742301941, + "logits/rejected": -0.5848970413208008, + "logps/chosen": -51.36346435546875, + "logps/rejected": -101.87875366210938, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0917348861694336, + "rewards/margins": 7.27651309967041, + "rewards/rejected": -4.184778213500977, + "step": 12314 + }, + { + "epoch": 3.08, + "grad_norm": 4.197619915008545, + "learning_rate": 3.214827269402334e-06, + "logits/chosen": -0.5516747236251831, + "logits/rejected": -0.6430248618125916, + "logps/chosen": -52.059696197509766, + "logps/rejected": -99.76545715332031, + "loss": 0.6575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1456680297851562, + "rewards/margins": 6.676273345947266, + "rewards/rejected": -3.530604362487793, + "step": 12315 + }, + { + "epoch": 3.08, + "grad_norm": 4.267673969268799, + "learning_rate": 3.214093107326661e-06, + "logits/chosen": -0.5960628390312195, + "logits/rejected": -0.6818339824676514, + "logps/chosen": -67.52165985107422, + "logps/rejected": -93.09857177734375, + "loss": 0.6854, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.903494119644165, + "rewards/margins": 6.032200813293457, + "rewards/rejected": -3.12870717048645, + "step": 12316 + }, + { + "epoch": 3.08, + "grad_norm": 5.557828426361084, + "learning_rate": 3.2133589893826465e-06, + "logits/chosen": -0.6599192023277283, + "logits/rejected": -0.7159278392791748, + "logps/chosen": -50.44375228881836, + "logps/rejected": -83.05360412597656, + "loss": 0.7048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.155261516571045, + "rewards/margins": 6.112675666809082, + "rewards/rejected": -2.957414388656616, + "step": 12317 + }, + { + "epoch": 3.08, + "grad_norm": 2.81650710105896, + "learning_rate": 3.212624915588432e-06, + "logits/chosen": -0.4866192936897278, + "logits/rejected": -0.5787889957427979, + "logps/chosen": -53.66667175292969, + "logps/rejected": -97.67963409423828, + "loss": 0.5734, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.014008045196533, + "rewards/margins": 6.818190574645996, + "rewards/rejected": -3.8041820526123047, + "step": 12318 + }, + { + "epoch": 3.08, + "grad_norm": 9.703475952148438, + "learning_rate": 3.211890885962158e-06, + "logits/chosen": -0.479105681180954, + "logits/rejected": -0.5276846885681152, + "logps/chosen": -52.47210693359375, + "logps/rejected": -113.25981140136719, + "loss": 0.6973, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8018133640289307, + "rewards/margins": 6.9141645431518555, + "rewards/rejected": -4.1123504638671875, + "step": 12319 + }, + { + "epoch": 3.08, + "grad_norm": 4.733597755432129, + "learning_rate": 3.211156900521961e-06, + "logits/chosen": -0.535582423210144, + "logits/rejected": -0.6320415735244751, + "logps/chosen": -60.75917053222656, + "logps/rejected": -89.41284942626953, + "loss": 0.6911, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.89203143119812, + "rewards/margins": 6.43471622467041, + "rewards/rejected": -3.542684555053711, + "step": 12320 + }, + { + "epoch": 3.08, + "grad_norm": 13.97932243347168, + "learning_rate": 3.210422959285978e-06, + "logits/chosen": -0.5922279953956604, + "logits/rejected": -0.6972052454948425, + "logps/chosen": -54.20152282714844, + "logps/rejected": -97.32627868652344, + "loss": 0.6282, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.883864641189575, + "rewards/margins": 6.579644203186035, + "rewards/rejected": -3.695779323577881, + "step": 12321 + }, + { + "epoch": 3.08, + "grad_norm": 3.563368558883667, + "learning_rate": 3.209689062272351e-06, + "logits/chosen": -0.515484631061554, + "logits/rejected": -0.5135138034820557, + "logps/chosen": -56.50464630126953, + "logps/rejected": -116.89761352539062, + "loss": 0.6847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.975188970565796, + "rewards/margins": 6.288986682891846, + "rewards/rejected": -3.313798427581787, + "step": 12322 + }, + { + "epoch": 3.08, + "grad_norm": 2.8125064373016357, + "learning_rate": 3.208955209499209e-06, + "logits/chosen": -0.5358577966690063, + "logits/rejected": -0.6451600193977356, + "logps/chosen": -45.22258758544922, + "logps/rejected": -82.17910766601562, + "loss": 0.5684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.434979200363159, + "rewards/margins": 6.660714149475098, + "rewards/rejected": -3.2257351875305176, + "step": 12323 + }, + { + "epoch": 3.08, + "grad_norm": 6.4133124351501465, + "learning_rate": 3.2082214009846887e-06, + "logits/chosen": -0.511381208896637, + "logits/rejected": -0.5390187501907349, + "logps/chosen": -50.24557113647461, + "logps/rejected": -119.32328033447266, + "loss": 0.6158, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.208620548248291, + "rewards/margins": 6.849330425262451, + "rewards/rejected": -3.6407103538513184, + "step": 12324 + }, + { + "epoch": 3.08, + "grad_norm": 6.9943695068359375, + "learning_rate": 3.2074876367469236e-06, + "logits/chosen": -0.5526167154312134, + "logits/rejected": -0.5800994634628296, + "logps/chosen": -60.78462219238281, + "logps/rejected": -113.23674774169922, + "loss": 0.7458, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.163516044616699, + "rewards/margins": 6.469122409820557, + "rewards/rejected": -3.305605888366699, + "step": 12325 + }, + { + "epoch": 3.08, + "grad_norm": 7.525336265563965, + "learning_rate": 3.206753916804047e-06, + "logits/chosen": -0.5640358328819275, + "logits/rejected": -0.6646881103515625, + "logps/chosen": -58.165618896484375, + "logps/rejected": -92.96097564697266, + "loss": 0.6625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9837279319763184, + "rewards/margins": 6.765104293823242, + "rewards/rejected": -3.781376600265503, + "step": 12326 + }, + { + "epoch": 3.08, + "grad_norm": 2.428208589553833, + "learning_rate": 3.206020241174187e-06, + "logits/chosen": -0.524574875831604, + "logits/rejected": -0.6187731027603149, + "logps/chosen": -54.29433059692383, + "logps/rejected": -107.90126037597656, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3203394412994385, + "rewards/margins": 7.483025550842285, + "rewards/rejected": -4.162685871124268, + "step": 12327 + }, + { + "epoch": 3.08, + "grad_norm": 5.251601696014404, + "learning_rate": 3.2052866098754755e-06, + "logits/chosen": -0.5205886960029602, + "logits/rejected": -0.6041001081466675, + "logps/chosen": -59.741111755371094, + "logps/rejected": -99.46977996826172, + "loss": 0.6749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.289769172668457, + "rewards/margins": 6.81357479095459, + "rewards/rejected": -3.52380633354187, + "step": 12328 + }, + { + "epoch": 3.08, + "grad_norm": 3.4763810634613037, + "learning_rate": 3.2045530229260415e-06, + "logits/chosen": -0.5284977555274963, + "logits/rejected": -0.6365596652030945, + "logps/chosen": -59.38606643676758, + "logps/rejected": -94.4229736328125, + "loss": 0.6302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9043869972229004, + "rewards/margins": 6.314225673675537, + "rewards/rejected": -3.4098381996154785, + "step": 12329 + }, + { + "epoch": 3.08, + "grad_norm": 4.127223968505859, + "learning_rate": 3.2038194803440115e-06, + "logits/chosen": -0.5008046627044678, + "logits/rejected": -0.6118248701095581, + "logps/chosen": -57.943443298339844, + "logps/rejected": -106.90253448486328, + "loss": 0.6044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7466793060302734, + "rewards/margins": 7.258250713348389, + "rewards/rejected": -4.511570930480957, + "step": 12330 + }, + { + "epoch": 3.08, + "grad_norm": 4.018935680389404, + "learning_rate": 3.2030859821475125e-06, + "logits/chosen": -0.5255454778671265, + "logits/rejected": -0.6277774572372437, + "logps/chosen": -56.635372161865234, + "logps/rejected": -98.74403381347656, + "loss": 0.5648, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.346811294555664, + "rewards/margins": 7.141297340393066, + "rewards/rejected": -3.7944862842559814, + "step": 12331 + }, + { + "epoch": 3.09, + "grad_norm": 11.674626350402832, + "learning_rate": 3.20235252835467e-06, + "logits/chosen": -0.5200648903846741, + "logits/rejected": -0.5986785888671875, + "logps/chosen": -58.54186248779297, + "logps/rejected": -106.77066802978516, + "loss": 0.6812, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.652751922607422, + "rewards/margins": 6.400824546813965, + "rewards/rejected": -3.748073101043701, + "step": 12332 + }, + { + "epoch": 3.09, + "grad_norm": 2.517381191253662, + "learning_rate": 3.2016191189836086e-06, + "logits/chosen": -0.5384623408317566, + "logits/rejected": -0.6357723474502563, + "logps/chosen": -49.75149154663086, + "logps/rejected": -93.5517349243164, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9744105339050293, + "rewards/margins": 7.238180160522461, + "rewards/rejected": -4.263769626617432, + "step": 12333 + }, + { + "epoch": 3.09, + "grad_norm": 6.556194305419922, + "learning_rate": 3.200885754052452e-06, + "logits/chosen": -0.48954737186431885, + "logits/rejected": -0.5695505142211914, + "logps/chosen": -54.72003936767578, + "logps/rejected": -84.42940521240234, + "loss": 0.5962, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182433605194092, + "rewards/margins": 6.579204082489014, + "rewards/rejected": -3.396770715713501, + "step": 12334 + }, + { + "epoch": 3.09, + "grad_norm": 9.536643981933594, + "learning_rate": 3.2001524335793214e-06, + "logits/chosen": -0.543785810470581, + "logits/rejected": -0.6229218244552612, + "logps/chosen": -56.078514099121094, + "logps/rejected": -104.13526916503906, + "loss": 0.69, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2363665103912354, + "rewards/margins": 6.716809272766113, + "rewards/rejected": -3.480443000793457, + "step": 12335 + }, + { + "epoch": 3.09, + "grad_norm": 6.748955726623535, + "learning_rate": 3.1994191575823395e-06, + "logits/chosen": -0.49663740396499634, + "logits/rejected": -0.6223413944244385, + "logps/chosen": -53.911502838134766, + "logps/rejected": -95.62871551513672, + "loss": 0.6575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.003178596496582, + "rewards/margins": 6.546442031860352, + "rewards/rejected": -3.5432629585266113, + "step": 12336 + }, + { + "epoch": 3.09, + "grad_norm": 4.976647853851318, + "learning_rate": 3.1986859260796253e-06, + "logits/chosen": -0.5819367170333862, + "logits/rejected": -0.6003933548927307, + "logps/chosen": -46.556297302246094, + "logps/rejected": -101.54481506347656, + "loss": 0.5967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2453768253326416, + "rewards/margins": 5.942291736602783, + "rewards/rejected": -2.6969149112701416, + "step": 12337 + }, + { + "epoch": 3.09, + "grad_norm": 10.749543190002441, + "learning_rate": 3.1979527390892962e-06, + "logits/chosen": -0.5419681668281555, + "logits/rejected": -0.6481311321258545, + "logps/chosen": -49.35605239868164, + "logps/rejected": -119.62382507324219, + "loss": 0.5724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1715481281280518, + "rewards/margins": 8.498213768005371, + "rewards/rejected": -5.326664924621582, + "step": 12338 + }, + { + "epoch": 3.09, + "grad_norm": 5.043588638305664, + "learning_rate": 3.197219596629473e-06, + "logits/chosen": -0.4960089325904846, + "logits/rejected": -0.580617368221283, + "logps/chosen": -46.871517181396484, + "logps/rejected": -94.50995635986328, + "loss": 0.5574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.260061025619507, + "rewards/margins": 6.994528770446777, + "rewards/rejected": -3.7344677448272705, + "step": 12339 + }, + { + "epoch": 3.09, + "grad_norm": 6.015498638153076, + "learning_rate": 3.196486498718272e-06, + "logits/chosen": -0.6044219732284546, + "logits/rejected": -0.6635512709617615, + "logps/chosen": -54.10870361328125, + "logps/rejected": -108.23468780517578, + "loss": 0.6119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148486852645874, + "rewards/margins": 6.905295372009277, + "rewards/rejected": -3.7568087577819824, + "step": 12340 + }, + { + "epoch": 3.09, + "grad_norm": 8.436165809631348, + "learning_rate": 3.1957534453738058e-06, + "logits/chosen": -0.547580361366272, + "logits/rejected": -0.5545742511749268, + "logps/chosen": -65.3817138671875, + "logps/rejected": -115.79695129394531, + "loss": 0.6511, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1306207180023193, + "rewards/margins": 6.582818984985352, + "rewards/rejected": -3.4521982669830322, + "step": 12341 + }, + { + "epoch": 3.09, + "grad_norm": 3.3052849769592285, + "learning_rate": 3.195020436614192e-06, + "logits/chosen": -0.4781929850578308, + "logits/rejected": -0.569922924041748, + "logps/chosen": -57.32115936279297, + "logps/rejected": -112.95317840576172, + "loss": 0.6353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0821287631988525, + "rewards/margins": 7.03880500793457, + "rewards/rejected": -3.9566762447357178, + "step": 12342 + }, + { + "epoch": 3.09, + "grad_norm": 4.985683917999268, + "learning_rate": 3.194287472457543e-06, + "logits/chosen": -0.5021814703941345, + "logits/rejected": -0.5846965909004211, + "logps/chosen": -53.13026428222656, + "logps/rejected": -72.60693359375, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.993986129760742, + "rewards/margins": 5.433219909667969, + "rewards/rejected": -2.4392337799072266, + "step": 12343 + }, + { + "epoch": 3.09, + "grad_norm": 3.663745164871216, + "learning_rate": 3.1935545529219726e-06, + "logits/chosen": -0.5063824653625488, + "logits/rejected": -0.545232355594635, + "logps/chosen": -48.48008728027344, + "logps/rejected": -116.05440521240234, + "loss": 0.6353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3583602905273438, + "rewards/margins": 7.026721477508545, + "rewards/rejected": -3.668361186981201, + "step": 12344 + }, + { + "epoch": 3.09, + "grad_norm": 2.902085781097412, + "learning_rate": 3.1928216780255883e-06, + "logits/chosen": -0.549680769443512, + "logits/rejected": -0.6486007571220398, + "logps/chosen": -44.487545013427734, + "logps/rejected": -104.5402603149414, + "loss": 0.5054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.160174608230591, + "rewards/margins": 6.789461135864258, + "rewards/rejected": -3.629286050796509, + "step": 12345 + }, + { + "epoch": 3.09, + "grad_norm": 4.143796920776367, + "learning_rate": 3.192088847786504e-06, + "logits/chosen": -0.5419842004776001, + "logits/rejected": -0.636496901512146, + "logps/chosen": -55.89905548095703, + "logps/rejected": -102.1387939453125, + "loss": 0.6211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.796414613723755, + "rewards/margins": 6.603415489196777, + "rewards/rejected": -3.8070011138916016, + "step": 12346 + }, + { + "epoch": 3.09, + "grad_norm": 3.137888193130493, + "learning_rate": 3.191356062222828e-06, + "logits/chosen": -0.5010291337966919, + "logits/rejected": -0.5922438502311707, + "logps/chosen": -61.33693313598633, + "logps/rejected": -102.23146057128906, + "loss": 0.6458, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.20698881149292, + "rewards/margins": 6.940176010131836, + "rewards/rejected": -3.7331879138946533, + "step": 12347 + }, + { + "epoch": 3.09, + "grad_norm": 4.021546840667725, + "learning_rate": 3.1906233213526653e-06, + "logits/chosen": -0.5814962983131409, + "logits/rejected": -0.6771638989448547, + "logps/chosen": -56.62991714477539, + "logps/rejected": -111.88726043701172, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.894308567047119, + "rewards/margins": 7.101042747497559, + "rewards/rejected": -4.206733703613281, + "step": 12348 + }, + { + "epoch": 3.09, + "grad_norm": 6.693717956542969, + "learning_rate": 3.1898906251941263e-06, + "logits/chosen": -0.4937950074672699, + "logits/rejected": -0.5826746821403503, + "logps/chosen": -65.25859832763672, + "logps/rejected": -92.04644012451172, + "loss": 0.6812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7818071842193604, + "rewards/margins": 5.528331279754639, + "rewards/rejected": -2.7465248107910156, + "step": 12349 + }, + { + "epoch": 3.09, + "grad_norm": 5.044101238250732, + "learning_rate": 3.189157973765313e-06, + "logits/chosen": -0.48927414417266846, + "logits/rejected": -0.5864307880401611, + "logps/chosen": -50.8680305480957, + "logps/rejected": -107.3492431640625, + "loss": 0.6204, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8849191665649414, + "rewards/margins": 6.854068756103516, + "rewards/rejected": -3.969149589538574, + "step": 12350 + }, + { + "epoch": 3.09, + "grad_norm": 5.172333240509033, + "learning_rate": 3.188425367084336e-06, + "logits/chosen": -0.5090622901916504, + "logits/rejected": -0.6203142404556274, + "logps/chosen": -53.951236724853516, + "logps/rejected": -100.70696258544922, + "loss": 0.6157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.890636920928955, + "rewards/margins": 6.4787139892578125, + "rewards/rejected": -3.5880770683288574, + "step": 12351 + }, + { + "epoch": 3.09, + "grad_norm": 4.384872913360596, + "learning_rate": 3.1876928051692923e-06, + "logits/chosen": -0.5212976932525635, + "logits/rejected": -0.580498993396759, + "logps/chosen": -57.76686477661133, + "logps/rejected": -96.13892364501953, + "loss": 0.7271, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1741034984588623, + "rewards/margins": 5.598968982696533, + "rewards/rejected": -2.42486572265625, + "step": 12352 + }, + { + "epoch": 3.09, + "grad_norm": 5.256584167480469, + "learning_rate": 3.1869602880382877e-06, + "logits/chosen": -0.5440899729728699, + "logits/rejected": -0.6499309539794922, + "logps/chosen": -59.290016174316406, + "logps/rejected": -107.44755554199219, + "loss": 0.6941, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1873841285705566, + "rewards/margins": 6.936734199523926, + "rewards/rejected": -3.749349594116211, + "step": 12353 + }, + { + "epoch": 3.09, + "grad_norm": 6.601120948791504, + "learning_rate": 3.186227815709425e-06, + "logits/chosen": -0.4818916320800781, + "logits/rejected": -0.5899330377578735, + "logps/chosen": -54.53880310058594, + "logps/rejected": -108.14957427978516, + "loss": 0.7228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.045069932937622, + "rewards/margins": 5.860480785369873, + "rewards/rejected": -2.81541109085083, + "step": 12354 + }, + { + "epoch": 3.09, + "grad_norm": 6.163271427154541, + "learning_rate": 3.185495388200799e-06, + "logits/chosen": -0.5403122901916504, + "logits/rejected": -0.5881897211074829, + "logps/chosen": -51.83635711669922, + "logps/rejected": -111.68817901611328, + "loss": 0.6451, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.927834987640381, + "rewards/margins": 6.994967937469482, + "rewards/rejected": -4.067132949829102, + "step": 12355 + }, + { + "epoch": 3.09, + "grad_norm": 2.911566734313965, + "learning_rate": 3.1847630055305128e-06, + "logits/chosen": -0.475212037563324, + "logits/rejected": -0.5962030291557312, + "logps/chosen": -63.032344818115234, + "logps/rejected": -101.22775268554688, + "loss": 0.5851, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2238762378692627, + "rewards/margins": 7.813467979431152, + "rewards/rejected": -4.589591026306152, + "step": 12356 + }, + { + "epoch": 3.09, + "grad_norm": 7.77146053314209, + "learning_rate": 3.184030667716664e-06, + "logits/chosen": -0.4351043999195099, + "logits/rejected": -0.5322498083114624, + "logps/chosen": -60.30982971191406, + "logps/rejected": -118.47311401367188, + "loss": 0.5893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.80556058883667, + "rewards/margins": 7.055203437805176, + "rewards/rejected": -4.249642372131348, + "step": 12357 + }, + { + "epoch": 3.09, + "grad_norm": 7.419487476348877, + "learning_rate": 3.1832983747773496e-06, + "logits/chosen": -0.5212933421134949, + "logits/rejected": -0.604636013507843, + "logps/chosen": -51.59836959838867, + "logps/rejected": -109.23548126220703, + "loss": 0.59, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.153477191925049, + "rewards/margins": 6.606485843658447, + "rewards/rejected": -3.4530093669891357, + "step": 12358 + }, + { + "epoch": 3.09, + "grad_norm": 3.2988293170928955, + "learning_rate": 3.182566126730664e-06, + "logits/chosen": -0.6458559632301331, + "logits/rejected": -0.6881715655326843, + "logps/chosen": -46.8039436340332, + "logps/rejected": -101.69169616699219, + "loss": 0.5715, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0725255012512207, + "rewards/margins": 6.443905353546143, + "rewards/rejected": -3.371379852294922, + "step": 12359 + }, + { + "epoch": 3.09, + "grad_norm": 6.787739276885986, + "learning_rate": 3.1818339235947015e-06, + "logits/chosen": -0.573623538017273, + "logits/rejected": -0.6649253964424133, + "logps/chosen": -56.305267333984375, + "logps/rejected": -110.55864715576172, + "loss": 0.5401, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9974851608276367, + "rewards/margins": 7.376874923706055, + "rewards/rejected": -4.379389762878418, + "step": 12360 + }, + { + "epoch": 3.09, + "grad_norm": 3.9003942012786865, + "learning_rate": 3.1811017653875585e-06, + "logits/chosen": -0.45625805854797363, + "logits/rejected": -0.6181090474128723, + "logps/chosen": -57.101200103759766, + "logps/rejected": -98.21953582763672, + "loss": 0.6262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1205556392669678, + "rewards/margins": 7.856017112731934, + "rewards/rejected": -4.735461711883545, + "step": 12361 + }, + { + "epoch": 3.09, + "grad_norm": 6.703307151794434, + "learning_rate": 3.180369652127325e-06, + "logits/chosen": -0.47519221901893616, + "logits/rejected": -0.6017471551895142, + "logps/chosen": -57.102230072021484, + "logps/rejected": -116.22506713867188, + "loss": 0.5403, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.112729072570801, + "rewards/margins": 8.306900024414062, + "rewards/rejected": -5.19417142868042, + "step": 12362 + }, + { + "epoch": 3.09, + "grad_norm": 7.3732194900512695, + "learning_rate": 3.179637583832092e-06, + "logits/chosen": -0.5276866555213928, + "logits/rejected": -0.6015332341194153, + "logps/chosen": -54.080047607421875, + "logps/rejected": -117.90082550048828, + "loss": 0.648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8468422889709473, + "rewards/margins": 7.544046401977539, + "rewards/rejected": -4.697205066680908, + "step": 12363 + }, + { + "epoch": 3.09, + "grad_norm": 3.8731424808502197, + "learning_rate": 3.17890556051995e-06, + "logits/chosen": -0.44262266159057617, + "logits/rejected": -0.545039176940918, + "logps/chosen": -53.2862663269043, + "logps/rejected": -106.34284973144531, + "loss": 0.5496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9957199096679688, + "rewards/margins": 7.412843704223633, + "rewards/rejected": -4.417123794555664, + "step": 12364 + }, + { + "epoch": 3.09, + "grad_norm": 3.191267728805542, + "learning_rate": 3.178173582208991e-06, + "logits/chosen": -0.5303533673286438, + "logits/rejected": -0.6244940757751465, + "logps/chosen": -49.67494201660156, + "logps/rejected": -115.85423278808594, + "loss": 0.5444, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.266127586364746, + "rewards/margins": 8.004859924316406, + "rewards/rejected": -4.73873233795166, + "step": 12365 + }, + { + "epoch": 3.09, + "grad_norm": 5.962968826293945, + "learning_rate": 3.177441648917299e-06, + "logits/chosen": -0.5280771851539612, + "logits/rejected": -0.643591046333313, + "logps/chosen": -63.493534088134766, + "logps/rejected": -96.27899932861328, + "loss": 0.6527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0277669429779053, + "rewards/margins": 5.939786911010742, + "rewards/rejected": -2.912020206451416, + "step": 12366 + }, + { + "epoch": 3.09, + "grad_norm": 4.343913555145264, + "learning_rate": 3.1767097606629615e-06, + "logits/chosen": -0.5504844188690186, + "logits/rejected": -0.6366065740585327, + "logps/chosen": -50.96856689453125, + "logps/rejected": -98.98541259765625, + "loss": 0.578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.833472728729248, + "rewards/margins": 6.209067344665527, + "rewards/rejected": -3.3755946159362793, + "step": 12367 + }, + { + "epoch": 3.09, + "grad_norm": 4.033862590789795, + "learning_rate": 3.175977917464066e-06, + "logits/chosen": -0.5504695177078247, + "logits/rejected": -0.5961251258850098, + "logps/chosen": -47.841758728027344, + "logps/rejected": -93.9706802368164, + "loss": 0.6064, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2098305225372314, + "rewards/margins": 6.2182393074035645, + "rewards/rejected": -3.008409023284912, + "step": 12368 + }, + { + "epoch": 3.09, + "grad_norm": 5.973080158233643, + "learning_rate": 3.1752461193386974e-06, + "logits/chosen": -0.5308067798614502, + "logits/rejected": -0.6163604855537415, + "logps/chosen": -46.20140838623047, + "logps/rejected": -111.67584228515625, + "loss": 0.598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.196591377258301, + "rewards/margins": 6.286142826080322, + "rewards/rejected": -3.0895514488220215, + "step": 12369 + }, + { + "epoch": 3.09, + "grad_norm": 4.182408332824707, + "learning_rate": 3.174514366304937e-06, + "logits/chosen": -0.45464539527893066, + "logits/rejected": -0.5719736218452454, + "logps/chosen": -53.62091827392578, + "logps/rejected": -82.32249450683594, + "loss": 0.6367, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1395747661590576, + "rewards/margins": 6.160958290100098, + "rewards/rejected": -3.021383762359619, + "step": 12370 + }, + { + "epoch": 3.09, + "grad_norm": 6.3515625, + "learning_rate": 3.1737826583808684e-06, + "logits/chosen": -0.46422287821769714, + "logits/rejected": -0.5683159828186035, + "logps/chosen": -47.34593963623047, + "logps/rejected": -87.56143951416016, + "loss": 0.6112, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.068798065185547, + "rewards/margins": 5.69142484664917, + "rewards/rejected": -2.6226272583007812, + "step": 12371 + }, + { + "epoch": 3.1, + "grad_norm": 5.709962844848633, + "learning_rate": 3.1730509955845734e-06, + "logits/chosen": -0.5363901853561401, + "logits/rejected": -0.6343846321105957, + "logps/chosen": -47.62862777709961, + "logps/rejected": -96.32572174072266, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2424120903015137, + "rewards/margins": 6.8965911865234375, + "rewards/rejected": -3.654179096221924, + "step": 12372 + }, + { + "epoch": 3.1, + "grad_norm": 8.440810203552246, + "learning_rate": 3.172319377934131e-06, + "logits/chosen": -0.577865719795227, + "logits/rejected": -0.6398105621337891, + "logps/chosen": -51.180763244628906, + "logps/rejected": -102.60177612304688, + "loss": 0.6604, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0787923336029053, + "rewards/margins": 6.626886367797852, + "rewards/rejected": -3.548093557357788, + "step": 12373 + }, + { + "epoch": 3.1, + "grad_norm": 7.546627998352051, + "learning_rate": 3.171587805447621e-06, + "logits/chosen": -0.5311260223388672, + "logits/rejected": -0.6485378742218018, + "logps/chosen": -57.34803771972656, + "logps/rejected": -114.0166244506836, + "loss": 0.5995, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.780890941619873, + "rewards/margins": 7.86232852935791, + "rewards/rejected": -5.081438064575195, + "step": 12374 + }, + { + "epoch": 3.1, + "grad_norm": 4.5539631843566895, + "learning_rate": 3.1708562781431206e-06, + "logits/chosen": -0.5442816019058228, + "logits/rejected": -0.6424906849861145, + "logps/chosen": -53.09408187866211, + "logps/rejected": -110.18132019042969, + "loss": 0.73, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8627452850341797, + "rewards/margins": 6.67162561416626, + "rewards/rejected": -3.80888032913208, + "step": 12375 + }, + { + "epoch": 3.1, + "grad_norm": 5.596814155578613, + "learning_rate": 3.1701247960387082e-06, + "logits/chosen": -0.5718833208084106, + "logits/rejected": -0.6178258657455444, + "logps/chosen": -63.20820236206055, + "logps/rejected": -112.79369354248047, + "loss": 0.6702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.021801471710205, + "rewards/margins": 6.844432830810547, + "rewards/rejected": -3.822631359100342, + "step": 12376 + }, + { + "epoch": 3.1, + "grad_norm": 3.499584674835205, + "learning_rate": 3.1693933591524573e-06, + "logits/chosen": -0.5247929096221924, + "logits/rejected": -0.5946319699287415, + "logps/chosen": -54.06850814819336, + "logps/rejected": -116.95499420166016, + "loss": 0.6188, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.864025831222534, + "rewards/margins": 7.016632080078125, + "rewards/rejected": -4.15260648727417, + "step": 12377 + }, + { + "epoch": 3.1, + "grad_norm": 11.386775970458984, + "learning_rate": 3.1686619675024435e-06, + "logits/chosen": -0.5174939036369324, + "logits/rejected": -0.6077674627304077, + "logps/chosen": -53.83929443359375, + "logps/rejected": -85.64432525634766, + "loss": 0.6753, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.71939754486084, + "rewards/margins": 6.051560401916504, + "rewards/rejected": -3.332162857055664, + "step": 12378 + }, + { + "epoch": 3.1, + "grad_norm": 5.390069961547852, + "learning_rate": 3.1679306211067416e-06, + "logits/chosen": -0.5263607501983643, + "logits/rejected": -0.6386385560035706, + "logps/chosen": -56.80198669433594, + "logps/rejected": -99.92341613769531, + "loss": 0.6272, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.959860324859619, + "rewards/margins": 7.598937034606934, + "rewards/rejected": -4.6390767097473145, + "step": 12379 + }, + { + "epoch": 3.1, + "grad_norm": 3.7899210453033447, + "learning_rate": 3.167199319983422e-06, + "logits/chosen": -0.5276739597320557, + "logits/rejected": -0.6161553263664246, + "logps/chosen": -52.92820739746094, + "logps/rejected": -102.74180603027344, + "loss": 0.6347, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2383439540863037, + "rewards/margins": 6.338196754455566, + "rewards/rejected": -3.0998528003692627, + "step": 12380 + }, + { + "epoch": 3.1, + "grad_norm": 7.3536272048950195, + "learning_rate": 3.1664680641505553e-06, + "logits/chosen": -0.522930383682251, + "logits/rejected": -0.600471019744873, + "logps/chosen": -51.55884552001953, + "logps/rejected": -123.90402221679688, + "loss": 0.6457, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.885319948196411, + "rewards/margins": 8.003426551818848, + "rewards/rejected": -5.118106365203857, + "step": 12381 + }, + { + "epoch": 3.1, + "grad_norm": 7.007680892944336, + "learning_rate": 3.1657368536262138e-06, + "logits/chosen": -0.47688519954681396, + "logits/rejected": -0.5856422185897827, + "logps/chosen": -63.57508850097656, + "logps/rejected": -106.93191528320312, + "loss": 0.7829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8775107860565186, + "rewards/margins": 4.864410877227783, + "rewards/rejected": -1.9869003295898438, + "step": 12382 + }, + { + "epoch": 3.1, + "grad_norm": 12.366668701171875, + "learning_rate": 3.1650056884284687e-06, + "logits/chosen": -0.49648186564445496, + "logits/rejected": -0.5789873003959656, + "logps/chosen": -60.93205642700195, + "logps/rejected": -105.20354461669922, + "loss": 0.782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.966460704803467, + "rewards/margins": 5.856509685516357, + "rewards/rejected": -2.890049457550049, + "step": 12383 + }, + { + "epoch": 3.1, + "grad_norm": 2.7502551078796387, + "learning_rate": 3.1642745685753806e-06, + "logits/chosen": -0.5447583198547363, + "logits/rejected": -0.6405621767044067, + "logps/chosen": -51.46445083618164, + "logps/rejected": -106.07618713378906, + "loss": 0.5715, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172083616256714, + "rewards/margins": 7.425937652587891, + "rewards/rejected": -4.2538533210754395, + "step": 12384 + }, + { + "epoch": 3.1, + "grad_norm": 5.435296058654785, + "learning_rate": 3.1635434940850217e-06, + "logits/chosen": -0.469648540019989, + "logits/rejected": -0.5311518907546997, + "logps/chosen": -56.28089904785156, + "logps/rejected": -101.94444274902344, + "loss": 0.6688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0176942348480225, + "rewards/margins": 5.95761775970459, + "rewards/rejected": -2.9399240016937256, + "step": 12385 + }, + { + "epoch": 3.1, + "grad_norm": 3.8062405586242676, + "learning_rate": 3.162812464975458e-06, + "logits/chosen": -0.5116932988166809, + "logits/rejected": -0.5671799778938293, + "logps/chosen": -53.28458023071289, + "logps/rejected": -111.15007781982422, + "loss": 0.6349, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.814979314804077, + "rewards/margins": 6.682357311248779, + "rewards/rejected": -3.8673782348632812, + "step": 12386 + }, + { + "epoch": 3.1, + "grad_norm": 12.317935943603516, + "learning_rate": 3.162081481264751e-06, + "logits/chosen": -0.5224651098251343, + "logits/rejected": -0.6276232004165649, + "logps/chosen": -51.44025421142578, + "logps/rejected": -100.49942016601562, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.910787343978882, + "rewards/margins": 7.393919467926025, + "rewards/rejected": -4.4831318855285645, + "step": 12387 + }, + { + "epoch": 3.1, + "grad_norm": 4.108512878417969, + "learning_rate": 3.1613505429709656e-06, + "logits/chosen": -0.5849460363388062, + "logits/rejected": -0.6680872440338135, + "logps/chosen": -44.6355094909668, + "logps/rejected": -94.99790954589844, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2952141761779785, + "rewards/margins": 7.146631240844727, + "rewards/rejected": -3.8514161109924316, + "step": 12388 + }, + { + "epoch": 3.1, + "grad_norm": 3.4191219806671143, + "learning_rate": 3.160619650112164e-06, + "logits/chosen": -0.4698494076728821, + "logits/rejected": -0.5461417436599731, + "logps/chosen": -55.942832946777344, + "logps/rejected": -98.34140014648438, + "loss": 0.5725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4063351154327393, + "rewards/margins": 6.002264499664307, + "rewards/rejected": -2.595928430557251, + "step": 12389 + }, + { + "epoch": 3.1, + "grad_norm": 10.929844856262207, + "learning_rate": 3.159888802706408e-06, + "logits/chosen": -0.49761807918548584, + "logits/rejected": -0.5491084456443787, + "logps/chosen": -49.80865478515625, + "logps/rejected": -104.22364807128906, + "loss": 0.6684, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1513147354125977, + "rewards/margins": 5.928969860076904, + "rewards/rejected": -2.7776553630828857, + "step": 12390 + }, + { + "epoch": 3.1, + "grad_norm": 3.470355272293091, + "learning_rate": 3.159158000771756e-06, + "logits/chosen": -0.550309419631958, + "logits/rejected": -0.6539334654808044, + "logps/chosen": -45.27801513671875, + "logps/rejected": -95.18287658691406, + "loss": 0.5356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2553415298461914, + "rewards/margins": 7.554791450500488, + "rewards/rejected": -4.299449920654297, + "step": 12391 + }, + { + "epoch": 3.1, + "grad_norm": 6.088961601257324, + "learning_rate": 3.158427244326268e-06, + "logits/chosen": -0.5477783679962158, + "logits/rejected": -0.6170124411582947, + "logps/chosen": -50.212059020996094, + "logps/rejected": -105.50233459472656, + "loss": 0.6937, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.53645658493042, + "rewards/margins": 7.4073710441589355, + "rewards/rejected": -3.8709144592285156, + "step": 12392 + }, + { + "epoch": 3.1, + "grad_norm": 12.309365272521973, + "learning_rate": 3.157696533388003e-06, + "logits/chosen": -0.4715227484703064, + "logits/rejected": -0.5206667184829712, + "logps/chosen": -57.53341293334961, + "logps/rejected": -89.08097076416016, + "loss": 0.6292, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9143972396850586, + "rewards/margins": 5.712663650512695, + "rewards/rejected": -2.7982664108276367, + "step": 12393 + }, + { + "epoch": 3.1, + "grad_norm": 14.539429664611816, + "learning_rate": 3.156965867975014e-06, + "logits/chosen": -0.5103981494903564, + "logits/rejected": -0.5910845398902893, + "logps/chosen": -57.998958587646484, + "logps/rejected": -110.68766021728516, + "loss": 0.6866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.741783380508423, + "rewards/margins": 6.563553333282471, + "rewards/rejected": -3.821770191192627, + "step": 12394 + }, + { + "epoch": 3.1, + "grad_norm": 11.036808967590332, + "learning_rate": 3.15623524810536e-06, + "logits/chosen": -0.5741276741027832, + "logits/rejected": -0.6439353227615356, + "logps/chosen": -46.54167556762695, + "logps/rejected": -89.74431610107422, + "loss": 0.6321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8464622497558594, + "rewards/margins": 5.307849407196045, + "rewards/rejected": -2.4613869190216064, + "step": 12395 + }, + { + "epoch": 3.1, + "grad_norm": 3.8766133785247803, + "learning_rate": 3.1555046737970923e-06, + "logits/chosen": -0.4950890839099884, + "logits/rejected": -0.5431129336357117, + "logps/chosen": -49.351951599121094, + "logps/rejected": -102.32266235351562, + "loss": 0.6141, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0511982440948486, + "rewards/margins": 6.931925296783447, + "rewards/rejected": -3.8807272911071777, + "step": 12396 + }, + { + "epoch": 3.1, + "grad_norm": 4.124855995178223, + "learning_rate": 3.154774145068269e-06, + "logits/chosen": -0.4642222225666046, + "logits/rejected": -0.5510925054550171, + "logps/chosen": -63.89528274536133, + "logps/rejected": -123.43412017822266, + "loss": 0.6575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.193974494934082, + "rewards/margins": 7.541123390197754, + "rewards/rejected": -4.3471479415893555, + "step": 12397 + }, + { + "epoch": 3.1, + "grad_norm": 3.8780148029327393, + "learning_rate": 3.1540436619369367e-06, + "logits/chosen": -0.6433127522468567, + "logits/rejected": -0.6812201142311096, + "logps/chosen": -50.620155334472656, + "logps/rejected": -101.81564331054688, + "loss": 0.5812, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2123682498931885, + "rewards/margins": 5.912450313568115, + "rewards/rejected": -2.7000820636749268, + "step": 12398 + }, + { + "epoch": 3.1, + "grad_norm": 4.522823333740234, + "learning_rate": 3.15331322442115e-06, + "logits/chosen": -0.5899792909622192, + "logits/rejected": -0.6591723561286926, + "logps/chosen": -52.96729278564453, + "logps/rejected": -101.72071838378906, + "loss": 0.629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2326197624206543, + "rewards/margins": 6.154483795166016, + "rewards/rejected": -2.9218637943267822, + "step": 12399 + }, + { + "epoch": 3.1, + "grad_norm": 13.094315528869629, + "learning_rate": 3.1525828325389574e-06, + "logits/chosen": -0.4743708670139313, + "logits/rejected": -0.5591219663619995, + "logps/chosen": -56.17680358886719, + "logps/rejected": -116.32147216796875, + "loss": 0.6412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.819277286529541, + "rewards/margins": 6.856185436248779, + "rewards/rejected": -4.03690767288208, + "step": 12400 + }, + { + "epoch": 3.1, + "grad_norm": 8.604101181030273, + "learning_rate": 3.1518524863084094e-06, + "logits/chosen": -0.5166822671890259, + "logits/rejected": -0.5486436486244202, + "logps/chosen": -51.33020782470703, + "logps/rejected": -101.15940856933594, + "loss": 0.6456, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0624256134033203, + "rewards/margins": 5.407009601593018, + "rewards/rejected": -2.3445842266082764, + "step": 12401 + }, + { + "epoch": 3.1, + "grad_norm": 4.830276966094971, + "learning_rate": 3.151122185747551e-06, + "logits/chosen": -0.5468831062316895, + "logits/rejected": -0.5977088212966919, + "logps/chosen": -46.734657287597656, + "logps/rejected": -93.42037200927734, + "loss": 0.5703, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3409695625305176, + "rewards/margins": 5.9488301277160645, + "rewards/rejected": -2.6078603267669678, + "step": 12402 + }, + { + "epoch": 3.1, + "grad_norm": 3.790815830230713, + "learning_rate": 3.1503919308744303e-06, + "logits/chosen": -0.5380789041519165, + "logits/rejected": -0.6278725862503052, + "logps/chosen": -52.697723388671875, + "logps/rejected": -96.88246154785156, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9953198432922363, + "rewards/margins": 6.167664051055908, + "rewards/rejected": -3.172344207763672, + "step": 12403 + }, + { + "epoch": 3.1, + "grad_norm": 3.83369779586792, + "learning_rate": 3.1496617217070926e-06, + "logits/chosen": -0.6011368632316589, + "logits/rejected": -0.5873909592628479, + "logps/chosen": -41.396942138671875, + "logps/rejected": -117.35994720458984, + "loss": 0.6968, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1959774494171143, + "rewards/margins": 7.142214775085449, + "rewards/rejected": -3.946237564086914, + "step": 12404 + }, + { + "epoch": 3.1, + "grad_norm": 5.63986873626709, + "learning_rate": 3.1489315582635814e-06, + "logits/chosen": -0.5438316464424133, + "logits/rejected": -0.6088975667953491, + "logps/chosen": -40.93077850341797, + "logps/rejected": -82.87017822265625, + "loss": 0.663, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3530845642089844, + "rewards/margins": 6.5472822189331055, + "rewards/rejected": -3.194197177886963, + "step": 12405 + }, + { + "epoch": 3.1, + "grad_norm": 2.906374931335449, + "learning_rate": 3.1482014405619398e-06, + "logits/chosen": -0.46697816252708435, + "logits/rejected": -0.5450018048286438, + "logps/chosen": -46.15947723388672, + "logps/rejected": -100.51116180419922, + "loss": 0.5472, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9291114807128906, + "rewards/margins": 7.600343704223633, + "rewards/rejected": -4.671232223510742, + "step": 12406 + }, + { + "epoch": 3.1, + "grad_norm": 3.9101428985595703, + "learning_rate": 3.1474713686202096e-06, + "logits/chosen": -0.5255811214447021, + "logits/rejected": -0.608338475227356, + "logps/chosen": -60.42217254638672, + "logps/rejected": -85.0744857788086, + "loss": 0.69, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.335233211517334, + "rewards/margins": 5.9676337242126465, + "rewards/rejected": -2.6324002742767334, + "step": 12407 + }, + { + "epoch": 3.1, + "grad_norm": 5.625021934509277, + "learning_rate": 3.1467413424564353e-06, + "logits/chosen": -0.5290718078613281, + "logits/rejected": -0.5997839570045471, + "logps/chosen": -70.79581451416016, + "logps/rejected": -117.6197509765625, + "loss": 0.7054, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3629677295684814, + "rewards/margins": 7.347724437713623, + "rewards/rejected": -3.9847571849823, + "step": 12408 + }, + { + "epoch": 3.1, + "grad_norm": 3.8803062438964844, + "learning_rate": 3.1460113620886517e-06, + "logits/chosen": -0.4601050019264221, + "logits/rejected": -0.5589668154716492, + "logps/chosen": -55.35964584350586, + "logps/rejected": -102.68730163574219, + "loss": 0.586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4074110984802246, + "rewards/margins": 7.4237775802612305, + "rewards/rejected": -4.016366481781006, + "step": 12409 + }, + { + "epoch": 3.1, + "grad_norm": 7.569692134857178, + "learning_rate": 3.1452814275348986e-06, + "logits/chosen": -0.5235545039176941, + "logits/rejected": -0.5592822432518005, + "logps/chosen": -56.51206588745117, + "logps/rejected": -98.76446533203125, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.814096212387085, + "rewards/margins": 5.037216663360596, + "rewards/rejected": -2.22312068939209, + "step": 12410 + }, + { + "epoch": 3.1, + "grad_norm": 3.6429178714752197, + "learning_rate": 3.1445515388132165e-06, + "logits/chosen": -0.5201660990715027, + "logits/rejected": -0.6045902967453003, + "logps/chosen": -53.74119567871094, + "logps/rejected": -111.1497802734375, + "loss": 0.5683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.916982650756836, + "rewards/margins": 6.638926029205322, + "rewards/rejected": -3.7219438552856445, + "step": 12411 + }, + { + "epoch": 3.11, + "grad_norm": 4.550854682922363, + "learning_rate": 3.1438216959416367e-06, + "logits/chosen": -0.5649656653404236, + "logits/rejected": -0.642199695110321, + "logps/chosen": -51.447200775146484, + "logps/rejected": -100.41091918945312, + "loss": 0.6488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.966282606124878, + "rewards/margins": 6.44916296005249, + "rewards/rejected": -3.4828810691833496, + "step": 12412 + }, + { + "epoch": 3.11, + "grad_norm": 8.151015281677246, + "learning_rate": 3.1430918989381988e-06, + "logits/chosen": -0.5946688652038574, + "logits/rejected": -0.6708040237426758, + "logps/chosen": -55.75446701049805, + "logps/rejected": -93.15176391601562, + "loss": 0.7117, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0947787761688232, + "rewards/margins": 6.9660964012146, + "rewards/rejected": -3.8713178634643555, + "step": 12413 + }, + { + "epoch": 3.11, + "grad_norm": 6.001729488372803, + "learning_rate": 3.1423621478209344e-06, + "logits/chosen": -0.5273883938789368, + "logits/rejected": -0.6368588209152222, + "logps/chosen": -57.74388122558594, + "logps/rejected": -91.39791870117188, + "loss": 0.5865, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.683051824569702, + "rewards/margins": 5.967270851135254, + "rewards/rejected": -3.284219264984131, + "step": 12414 + }, + { + "epoch": 3.11, + "grad_norm": 9.250164985656738, + "learning_rate": 3.141632442607878e-06, + "logits/chosen": -0.5706161856651306, + "logits/rejected": -0.6377269625663757, + "logps/chosen": -58.948177337646484, + "logps/rejected": -105.76113891601562, + "loss": 0.6567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.111278533935547, + "rewards/margins": 6.587618827819824, + "rewards/rejected": -3.47633957862854, + "step": 12415 + }, + { + "epoch": 3.11, + "grad_norm": 6.685643196105957, + "learning_rate": 3.1409027833170604e-06, + "logits/chosen": -0.4489911198616028, + "logits/rejected": -0.5888998508453369, + "logps/chosen": -80.7357177734375, + "logps/rejected": -102.18791198730469, + "loss": 0.6412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7735776901245117, + "rewards/margins": 6.8915510177612305, + "rewards/rejected": -4.117973327636719, + "step": 12416 + }, + { + "epoch": 3.11, + "grad_norm": 4.041614055633545, + "learning_rate": 3.1401731699665116e-06, + "logits/chosen": -0.5552805066108704, + "logits/rejected": -0.6544772386550903, + "logps/chosen": -54.550479888916016, + "logps/rejected": -88.7488784790039, + "loss": 0.6674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.93076491355896, + "rewards/margins": 6.052441596984863, + "rewards/rejected": -3.121676445007324, + "step": 12417 + }, + { + "epoch": 3.11, + "grad_norm": 3.7286667823791504, + "learning_rate": 3.1394436025742637e-06, + "logits/chosen": -0.5250458121299744, + "logits/rejected": -0.5799874067306519, + "logps/chosen": -46.860801696777344, + "logps/rejected": -113.11872863769531, + "loss": 0.5861, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.066922187805176, + "rewards/margins": 8.26753044128418, + "rewards/rejected": -5.20060920715332, + "step": 12418 + }, + { + "epoch": 3.11, + "grad_norm": 4.496034622192383, + "learning_rate": 3.1387140811583416e-06, + "logits/chosen": -0.4528777003288269, + "logits/rejected": -0.5676324367523193, + "logps/chosen": -55.78163146972656, + "logps/rejected": -94.42024230957031, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9972832202911377, + "rewards/margins": 6.708078861236572, + "rewards/rejected": -3.7107958793640137, + "step": 12419 + }, + { + "epoch": 3.11, + "grad_norm": 3.165531635284424, + "learning_rate": 3.1379846057367745e-06, + "logits/chosen": -0.5378901958465576, + "logits/rejected": -0.6562497615814209, + "logps/chosen": -57.294395446777344, + "logps/rejected": -84.80339813232422, + "loss": 0.6455, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3431248664855957, + "rewards/margins": 7.354541778564453, + "rewards/rejected": -4.011416912078857, + "step": 12420 + }, + { + "epoch": 3.11, + "grad_norm": 3.45847487449646, + "learning_rate": 3.1372551763275888e-06, + "logits/chosen": -0.48767638206481934, + "logits/rejected": -0.5892429947853088, + "logps/chosen": -52.516075134277344, + "logps/rejected": -99.94462585449219, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1664416790008545, + "rewards/margins": 7.276003837585449, + "rewards/rejected": -4.109561920166016, + "step": 12421 + }, + { + "epoch": 3.11, + "grad_norm": 9.865621566772461, + "learning_rate": 3.1365257929488102e-06, + "logits/chosen": -0.5114577412605286, + "logits/rejected": -0.56439208984375, + "logps/chosen": -51.62334442138672, + "logps/rejected": -118.97703552246094, + "loss": 0.6136, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.868408441543579, + "rewards/margins": 7.316218852996826, + "rewards/rejected": -4.447810649871826, + "step": 12422 + }, + { + "epoch": 3.11, + "grad_norm": 6.488589286804199, + "learning_rate": 3.1357964556184595e-06, + "logits/chosen": -0.4851066470146179, + "logits/rejected": -0.5983094573020935, + "logps/chosen": -75.20417785644531, + "logps/rejected": -104.08080291748047, + "loss": 0.7537, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9204695224761963, + "rewards/margins": 6.839029312133789, + "rewards/rejected": -3.9185595512390137, + "step": 12423 + }, + { + "epoch": 3.11, + "grad_norm": 4.052945137023926, + "learning_rate": 3.1350671643545605e-06, + "logits/chosen": -0.5406253933906555, + "logits/rejected": -0.5836041569709778, + "logps/chosen": -46.100059509277344, + "logps/rejected": -122.43669128417969, + "loss": 0.6177, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0543417930603027, + "rewards/margins": 8.17934513092041, + "rewards/rejected": -5.125002861022949, + "step": 12424 + }, + { + "epoch": 3.11, + "grad_norm": 5.027695655822754, + "learning_rate": 3.1343379191751366e-06, + "logits/chosen": -0.5465483665466309, + "logits/rejected": -0.5805318355560303, + "logps/chosen": -40.759883880615234, + "logps/rejected": -106.67584228515625, + "loss": 0.5544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.116831064224243, + "rewards/margins": 6.307253837585449, + "rewards/rejected": -3.190422773361206, + "step": 12425 + }, + { + "epoch": 3.11, + "grad_norm": 4.639980316162109, + "learning_rate": 3.133608720098209e-06, + "logits/chosen": -0.5360618829727173, + "logits/rejected": -0.5908588171005249, + "logps/chosen": -50.32036209106445, + "logps/rejected": -103.5584945678711, + "loss": 0.5832, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9888429641723633, + "rewards/margins": 6.6356520652771, + "rewards/rejected": -3.6468091011047363, + "step": 12426 + }, + { + "epoch": 3.11, + "grad_norm": 1.2953152656555176, + "learning_rate": 3.1328795671417923e-06, + "logits/chosen": -0.5755259990692139, + "logits/rejected": -0.6457093954086304, + "logps/chosen": -48.684932708740234, + "logps/rejected": -117.19525909423828, + "loss": 0.5413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3565421104431152, + "rewards/margins": 8.607511520385742, + "rewards/rejected": -5.2509684562683105, + "step": 12427 + }, + { + "epoch": 3.11, + "grad_norm": 6.023926258087158, + "learning_rate": 3.1321504603239085e-06, + "logits/chosen": -0.5078887343406677, + "logits/rejected": -0.5995073318481445, + "logps/chosen": -53.71207046508789, + "logps/rejected": -99.11221313476562, + "loss": 0.5914, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9467673301696777, + "rewards/margins": 6.704517841339111, + "rewards/rejected": -3.7577507495880127, + "step": 12428 + }, + { + "epoch": 3.11, + "grad_norm": 8.412854194641113, + "learning_rate": 3.1314213996625746e-06, + "logits/chosen": -0.5001615881919861, + "logits/rejected": -0.5486706495285034, + "logps/chosen": -56.78009796142578, + "logps/rejected": -106.04766082763672, + "loss": 0.6314, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.950075387954712, + "rewards/margins": 5.717367649078369, + "rewards/rejected": -2.7672924995422363, + "step": 12429 + }, + { + "epoch": 3.11, + "grad_norm": 5.038395881652832, + "learning_rate": 3.1306923851758044e-06, + "logits/chosen": -0.5088449120521545, + "logits/rejected": -0.6061402559280396, + "logps/chosen": -64.84513092041016, + "logps/rejected": -117.53958892822266, + "loss": 0.7106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.822599411010742, + "rewards/margins": 6.900967597961426, + "rewards/rejected": -4.078368186950684, + "step": 12430 + }, + { + "epoch": 3.11, + "grad_norm": 5.089274883270264, + "learning_rate": 3.1299634168816134e-06, + "logits/chosen": -0.5441138744354248, + "logits/rejected": -0.6327136158943176, + "logps/chosen": -53.67414474487305, + "logps/rejected": -101.06040954589844, + "loss": 0.6515, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9514834880828857, + "rewards/margins": 5.671791076660156, + "rewards/rejected": -2.720306634902954, + "step": 12431 + }, + { + "epoch": 3.11, + "grad_norm": 6.476896286010742, + "learning_rate": 3.1292344947980156e-06, + "logits/chosen": -0.5631989240646362, + "logits/rejected": -0.6645048260688782, + "logps/chosen": -60.51444625854492, + "logps/rejected": -101.98149108886719, + "loss": 0.7053, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.894932508468628, + "rewards/margins": 6.151794910430908, + "rewards/rejected": -3.2568626403808594, + "step": 12432 + }, + { + "epoch": 3.11, + "grad_norm": 6.187932014465332, + "learning_rate": 3.1285056189430243e-06, + "logits/chosen": -0.5427389740943909, + "logits/rejected": -0.6053730249404907, + "logps/chosen": -55.2205810546875, + "logps/rejected": -110.82736206054688, + "loss": 0.6477, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.161527156829834, + "rewards/margins": 6.8550262451171875, + "rewards/rejected": -3.6934993267059326, + "step": 12433 + }, + { + "epoch": 3.11, + "grad_norm": 6.952874660491943, + "learning_rate": 3.127776789334649e-06, + "logits/chosen": -0.5943218469619751, + "logits/rejected": -0.650535523891449, + "logps/chosen": -48.562904357910156, + "logps/rejected": -105.9195785522461, + "loss": 0.6007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9950459003448486, + "rewards/margins": 6.766280174255371, + "rewards/rejected": -3.7712342739105225, + "step": 12434 + }, + { + "epoch": 3.11, + "grad_norm": 4.0952467918396, + "learning_rate": 3.1270480059909005e-06, + "logits/chosen": -0.5209389328956604, + "logits/rejected": -0.6455116271972656, + "logps/chosen": -67.97909545898438, + "logps/rejected": -99.64046478271484, + "loss": 0.6101, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1048905849456787, + "rewards/margins": 7.264862537384033, + "rewards/rejected": -4.159972190856934, + "step": 12435 + }, + { + "epoch": 3.11, + "grad_norm": 3.423283576965332, + "learning_rate": 3.126319268929788e-06, + "logits/chosen": -0.44996178150177, + "logits/rejected": -0.5607470870018005, + "logps/chosen": -61.79644775390625, + "logps/rejected": -92.31007385253906, + "loss": 0.5311, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.926729202270508, + "rewards/margins": 6.340697288513184, + "rewards/rejected": -3.413968086242676, + "step": 12436 + }, + { + "epoch": 3.11, + "grad_norm": 5.197710990905762, + "learning_rate": 3.1255905781693195e-06, + "logits/chosen": -0.5672292113304138, + "logits/rejected": -0.6691938042640686, + "logps/chosen": -48.54457092285156, + "logps/rejected": -99.37217712402344, + "loss": 0.648, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0580906867980957, + "rewards/margins": 6.697756767272949, + "rewards/rejected": -3.639666795730591, + "step": 12437 + }, + { + "epoch": 3.11, + "grad_norm": 3.6392924785614014, + "learning_rate": 3.124861933727501e-06, + "logits/chosen": -0.628756046295166, + "logits/rejected": -0.7391288876533508, + "logps/chosen": -51.463294982910156, + "logps/rejected": -89.83367919921875, + "loss": 0.6096, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0090084075927734, + "rewards/margins": 6.846285343170166, + "rewards/rejected": -3.8372769355773926, + "step": 12438 + }, + { + "epoch": 3.11, + "grad_norm": 1.706902027130127, + "learning_rate": 3.124133335622338e-06, + "logits/chosen": -0.47912901639938354, + "logits/rejected": -0.563173770904541, + "logps/chosen": -55.57172393798828, + "logps/rejected": -115.43331146240234, + "loss": 0.5576, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.092383623123169, + "rewards/margins": 7.600494861602783, + "rewards/rejected": -4.508111476898193, + "step": 12439 + }, + { + "epoch": 3.11, + "grad_norm": 4.201337814331055, + "learning_rate": 3.123404783871838e-06, + "logits/chosen": -0.5325897932052612, + "logits/rejected": -0.601459264755249, + "logps/chosen": -47.961917877197266, + "logps/rejected": -107.7877197265625, + "loss": 0.5483, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.404960870742798, + "rewards/margins": 7.454862117767334, + "rewards/rejected": -4.049901008605957, + "step": 12440 + }, + { + "epoch": 3.11, + "grad_norm": 3.5184993743896484, + "learning_rate": 3.122676278493999e-06, + "logits/chosen": -0.506269097328186, + "logits/rejected": -0.5709449648857117, + "logps/chosen": -41.721702575683594, + "logps/rejected": -100.60071563720703, + "loss": 0.5928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3532028198242188, + "rewards/margins": 6.892541408538818, + "rewards/rejected": -3.539339065551758, + "step": 12441 + }, + { + "epoch": 3.11, + "grad_norm": 1.789773941040039, + "learning_rate": 3.121947819506827e-06, + "logits/chosen": -0.4979372024536133, + "logits/rejected": -0.5993158221244812, + "logps/chosen": -65.34356689453125, + "logps/rejected": -92.94023895263672, + "loss": 0.6025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3575356006622314, + "rewards/margins": 7.77596378326416, + "rewards/rejected": -4.41842794418335, + "step": 12442 + }, + { + "epoch": 3.11, + "grad_norm": 6.281520843505859, + "learning_rate": 3.121219406928324e-06, + "logits/chosen": -0.5814773440361023, + "logits/rejected": -0.6520724296569824, + "logps/chosen": -51.517642974853516, + "logps/rejected": -105.0589599609375, + "loss": 0.6112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2464218139648438, + "rewards/margins": 7.261709213256836, + "rewards/rejected": -4.01528787612915, + "step": 12443 + }, + { + "epoch": 3.11, + "grad_norm": 4.953988075256348, + "learning_rate": 3.1204910407764846e-06, + "logits/chosen": -0.4880650043487549, + "logits/rejected": -0.5324639081954956, + "logps/chosen": -54.208133697509766, + "logps/rejected": -110.96815490722656, + "loss": 0.6398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0664641857147217, + "rewards/margins": 6.215788841247559, + "rewards/rejected": -3.149324417114258, + "step": 12444 + }, + { + "epoch": 3.11, + "grad_norm": 5.308079242706299, + "learning_rate": 3.1197627210693122e-06, + "logits/chosen": -0.5780495405197144, + "logits/rejected": -0.6392685770988464, + "logps/chosen": -55.68377685546875, + "logps/rejected": -107.78010559082031, + "loss": 0.7248, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9659814834594727, + "rewards/margins": 5.328179836273193, + "rewards/rejected": -2.3621983528137207, + "step": 12445 + }, + { + "epoch": 3.11, + "grad_norm": 8.68088436126709, + "learning_rate": 3.1190344478248024e-06, + "logits/chosen": -0.531487226486206, + "logits/rejected": -0.6667331457138062, + "logps/chosen": -55.962371826171875, + "logps/rejected": -106.0677719116211, + "loss": 0.5772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1244027614593506, + "rewards/margins": 7.343477249145508, + "rewards/rejected": -4.219074726104736, + "step": 12446 + }, + { + "epoch": 3.11, + "grad_norm": 4.4556145668029785, + "learning_rate": 3.118306221060953e-06, + "logits/chosen": -0.5267118811607361, + "logits/rejected": -0.5283446907997131, + "logps/chosen": -46.3757209777832, + "logps/rejected": -119.2144775390625, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.951817750930786, + "rewards/margins": 6.434205532073975, + "rewards/rejected": -3.4823880195617676, + "step": 12447 + }, + { + "epoch": 3.11, + "grad_norm": 3.5536770820617676, + "learning_rate": 3.1175780407957578e-06, + "logits/chosen": -0.5933240652084351, + "logits/rejected": -0.6613659262657166, + "logps/chosen": -58.66950225830078, + "logps/rejected": -106.72303009033203, + "loss": 0.6943, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.223752498626709, + "rewards/margins": 7.565396785736084, + "rewards/rejected": -4.341644287109375, + "step": 12448 + }, + { + "epoch": 3.11, + "grad_norm": 2.8620219230651855, + "learning_rate": 3.1168499070472113e-06, + "logits/chosen": -0.5678843259811401, + "logits/rejected": -0.6614730358123779, + "logps/chosen": -58.20329666137695, + "logps/rejected": -101.54703521728516, + "loss": 0.6436, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.237044095993042, + "rewards/margins": 7.185391426086426, + "rewards/rejected": -3.9483470916748047, + "step": 12449 + }, + { + "epoch": 3.11, + "grad_norm": 15.07600212097168, + "learning_rate": 3.1161218198333065e-06, + "logits/chosen": -0.5459287762641907, + "logits/rejected": -0.622231125831604, + "logps/chosen": -62.722412109375, + "logps/rejected": -100.87643432617188, + "loss": 0.8602, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8218483924865723, + "rewards/margins": 6.138587474822998, + "rewards/rejected": -3.316739320755005, + "step": 12450 + }, + { + "epoch": 3.11, + "grad_norm": 4.352145671844482, + "learning_rate": 3.115393779172037e-06, + "logits/chosen": -0.4696054458618164, + "logits/rejected": -0.5778905153274536, + "logps/chosen": -63.704891204833984, + "logps/rejected": -106.24954223632812, + "loss": 0.6126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.990473747253418, + "rewards/margins": 6.967784404754639, + "rewards/rejected": -3.9773106575012207, + "step": 12451 + }, + { + "epoch": 3.12, + "grad_norm": 5.744104385375977, + "learning_rate": 3.11466578508139e-06, + "logits/chosen": -0.5135017037391663, + "logits/rejected": -0.5834685564041138, + "logps/chosen": -53.14987564086914, + "logps/rejected": -101.37726593017578, + "loss": 0.5888, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1690707206726074, + "rewards/margins": 7.026778221130371, + "rewards/rejected": -3.857707977294922, + "step": 12452 + }, + { + "epoch": 3.12, + "grad_norm": 2.775027275085449, + "learning_rate": 3.1139378375793556e-06, + "logits/chosen": -0.5221775770187378, + "logits/rejected": -0.5818896293640137, + "logps/chosen": -43.04645538330078, + "logps/rejected": -111.75685119628906, + "loss": 0.5057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.929471969604492, + "rewards/margins": 7.381417274475098, + "rewards/rejected": -4.4519453048706055, + "step": 12453 + }, + { + "epoch": 3.12, + "grad_norm": 5.119651794433594, + "learning_rate": 3.1132099366839263e-06, + "logits/chosen": -0.5258238911628723, + "logits/rejected": -0.6034775972366333, + "logps/chosen": -56.16896438598633, + "logps/rejected": -87.15699005126953, + "loss": 0.7337, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.00810170173645, + "rewards/margins": 5.7798075675964355, + "rewards/rejected": -2.7717063426971436, + "step": 12454 + }, + { + "epoch": 3.12, + "grad_norm": 5.559659481048584, + "learning_rate": 3.112482082413085e-06, + "logits/chosen": -0.5334573984146118, + "logits/rejected": -0.5808818936347961, + "logps/chosen": -44.56239700317383, + "logps/rejected": -110.6839828491211, + "loss": 0.5604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.053985834121704, + "rewards/margins": 7.630666255950928, + "rewards/rejected": -4.5766801834106445, + "step": 12455 + }, + { + "epoch": 3.12, + "grad_norm": 3.913424253463745, + "learning_rate": 3.111754274784818e-06, + "logits/chosen": -0.5176109075546265, + "logits/rejected": -0.620478630065918, + "logps/chosen": -58.49256896972656, + "logps/rejected": -96.69210815429688, + "loss": 0.5459, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.108680486679077, + "rewards/margins": 6.928889274597168, + "rewards/rejected": -3.8202085494995117, + "step": 12456 + }, + { + "epoch": 3.12, + "grad_norm": 3.974891185760498, + "learning_rate": 3.111026513817112e-06, + "logits/chosen": -0.45447948575019836, + "logits/rejected": -0.5485615730285645, + "logps/chosen": -51.704505920410156, + "logps/rejected": -93.17386627197266, + "loss": 0.6004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.445930242538452, + "rewards/margins": 6.573736190795898, + "rewards/rejected": -3.1278061866760254, + "step": 12457 + }, + { + "epoch": 3.12, + "grad_norm": 5.095363140106201, + "learning_rate": 3.110298799527951e-06, + "logits/chosen": -0.5329386591911316, + "logits/rejected": -0.5798783302307129, + "logps/chosen": -55.745628356933594, + "logps/rejected": -102.15156555175781, + "loss": 0.6429, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1142117977142334, + "rewards/margins": 5.768547058105469, + "rewards/rejected": -2.654334545135498, + "step": 12458 + }, + { + "epoch": 3.12, + "grad_norm": 4.320927619934082, + "learning_rate": 3.109571131935316e-06, + "logits/chosen": -0.5159252882003784, + "logits/rejected": -0.5626612305641174, + "logps/chosen": -57.73722839355469, + "logps/rejected": -99.7767333984375, + "loss": 0.6936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2109451293945312, + "rewards/margins": 6.363798141479492, + "rewards/rejected": -3.15285325050354, + "step": 12459 + }, + { + "epoch": 3.12, + "grad_norm": 25.208099365234375, + "learning_rate": 3.1088435110571884e-06, + "logits/chosen": -0.5447350144386292, + "logits/rejected": -0.5898758769035339, + "logps/chosen": -55.825355529785156, + "logps/rejected": -127.89520263671875, + "loss": 0.755, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8292136192321777, + "rewards/margins": 7.583032131195068, + "rewards/rejected": -4.753818511962891, + "step": 12460 + }, + { + "epoch": 3.12, + "grad_norm": 8.35153579711914, + "learning_rate": 3.108115936911551e-06, + "logits/chosen": -0.5036163926124573, + "logits/rejected": -0.6214858889579773, + "logps/chosen": -57.65668869018555, + "logps/rejected": -95.29476165771484, + "loss": 0.6997, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8024067878723145, + "rewards/margins": 7.0197319984436035, + "rewards/rejected": -4.217325210571289, + "step": 12461 + }, + { + "epoch": 3.12, + "grad_norm": 8.330583572387695, + "learning_rate": 3.10738840951638e-06, + "logits/chosen": -0.5090107321739197, + "logits/rejected": -0.5928307175636292, + "logps/chosen": -51.115638732910156, + "logps/rejected": -116.4179458618164, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.322539806365967, + "rewards/margins": 8.453934669494629, + "rewards/rejected": -5.1313958168029785, + "step": 12462 + }, + { + "epoch": 3.12, + "grad_norm": 3.8431310653686523, + "learning_rate": 3.106660928889654e-06, + "logits/chosen": -0.4842704236507416, + "logits/rejected": -0.5231773853302002, + "logps/chosen": -52.96818161010742, + "logps/rejected": -95.48008728027344, + "loss": 0.7055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9786360263824463, + "rewards/margins": 5.632699966430664, + "rewards/rejected": -2.654064178466797, + "step": 12463 + }, + { + "epoch": 3.12, + "grad_norm": 4.411563396453857, + "learning_rate": 3.10593349504935e-06, + "logits/chosen": -0.5374724268913269, + "logits/rejected": -0.6618478894233704, + "logps/chosen": -74.2310791015625, + "logps/rejected": -86.88497161865234, + "loss": 0.7107, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4354748725891113, + "rewards/margins": 6.208130836486816, + "rewards/rejected": -2.772655725479126, + "step": 12464 + }, + { + "epoch": 3.12, + "grad_norm": 4.360743045806885, + "learning_rate": 3.1052061080134454e-06, + "logits/chosen": -0.5540267825126648, + "logits/rejected": -0.6085919737815857, + "logps/chosen": -51.16537094116211, + "logps/rejected": -90.47407531738281, + "loss": 0.7206, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.21952486038208, + "rewards/margins": 6.135581016540527, + "rewards/rejected": -2.9160566329956055, + "step": 12465 + }, + { + "epoch": 3.12, + "grad_norm": 8.331083297729492, + "learning_rate": 3.1044787677999128e-06, + "logits/chosen": -0.6210494041442871, + "logits/rejected": -0.700260579586029, + "logps/chosen": -43.958282470703125, + "logps/rejected": -106.73263549804688, + "loss": 0.6169, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.840930461883545, + "rewards/margins": 7.141509532928467, + "rewards/rejected": -4.30057954788208, + "step": 12466 + }, + { + "epoch": 3.12, + "grad_norm": 11.806807518005371, + "learning_rate": 3.1037514744267244e-06, + "logits/chosen": -0.5617330074310303, + "logits/rejected": -0.597496747970581, + "logps/chosen": -48.031097412109375, + "logps/rejected": -93.16078186035156, + "loss": 0.8944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.185669183731079, + "rewards/margins": 5.496023654937744, + "rewards/rejected": -2.310354709625244, + "step": 12467 + }, + { + "epoch": 3.12, + "grad_norm": 5.686554431915283, + "learning_rate": 3.1030242279118573e-06, + "logits/chosen": -0.49466943740844727, + "logits/rejected": -0.5578365325927734, + "logps/chosen": -56.96104049682617, + "logps/rejected": -131.71253967285156, + "loss": 0.6275, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2690045833587646, + "rewards/margins": 6.707104682922363, + "rewards/rejected": -3.4380993843078613, + "step": 12468 + }, + { + "epoch": 3.12, + "grad_norm": 8.202021598815918, + "learning_rate": 3.1022970282732767e-06, + "logits/chosen": -0.5513512492179871, + "logits/rejected": -0.6385639309883118, + "logps/chosen": -56.84222412109375, + "logps/rejected": -95.07897186279297, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3238561153411865, + "rewards/margins": 5.657681465148926, + "rewards/rejected": -2.333824872970581, + "step": 12469 + }, + { + "epoch": 3.12, + "grad_norm": 5.719315528869629, + "learning_rate": 3.101569875528955e-06, + "logits/chosen": -0.5488319993019104, + "logits/rejected": -0.5863733291625977, + "logps/chosen": -62.88811111450195, + "logps/rejected": -110.85853576660156, + "loss": 0.6421, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3670172691345215, + "rewards/margins": 6.256131172180176, + "rewards/rejected": -2.8891139030456543, + "step": 12470 + }, + { + "epoch": 3.12, + "grad_norm": 2.649888515472412, + "learning_rate": 3.1008427696968613e-06, + "logits/chosen": -0.5455790162086487, + "logits/rejected": -0.675011932849884, + "logps/chosen": -60.93569564819336, + "logps/rejected": -124.8264389038086, + "loss": 0.6031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0721678733825684, + "rewards/margins": 7.590353012084961, + "rewards/rejected": -4.518185138702393, + "step": 12471 + }, + { + "epoch": 3.12, + "grad_norm": 2.7746357917785645, + "learning_rate": 3.100115710794963e-06, + "logits/chosen": -0.5555705428123474, + "logits/rejected": -0.6775487661361694, + "logps/chosen": -62.48136520385742, + "logps/rejected": -121.34760284423828, + "loss": 0.6298, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2295405864715576, + "rewards/margins": 8.734087944030762, + "rewards/rejected": -5.504547119140625, + "step": 12472 + }, + { + "epoch": 3.12, + "grad_norm": 9.132857322692871, + "learning_rate": 3.0993886988412263e-06, + "logits/chosen": -0.5206007957458496, + "logits/rejected": -0.6150636076927185, + "logps/chosen": -77.0286865234375, + "logps/rejected": -100.96592712402344, + "loss": 0.818, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.696356773376465, + "rewards/margins": 5.727121829986572, + "rewards/rejected": -3.030764579772949, + "step": 12473 + }, + { + "epoch": 3.12, + "grad_norm": 8.737979888916016, + "learning_rate": 3.098661733853616e-06, + "logits/chosen": -0.5071561336517334, + "logits/rejected": -0.6197410225868225, + "logps/chosen": -63.896575927734375, + "logps/rejected": -106.13180541992188, + "loss": 0.6606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.846339702606201, + "rewards/margins": 6.692059516906738, + "rewards/rejected": -3.845719814300537, + "step": 12474 + }, + { + "epoch": 3.12, + "grad_norm": 4.514691352844238, + "learning_rate": 3.0979348158500977e-06, + "logits/chosen": -0.4727250337600708, + "logits/rejected": -0.5709185004234314, + "logps/chosen": -48.92988204956055, + "logps/rejected": -104.03924560546875, + "loss": 0.5311, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1330513954162598, + "rewards/margins": 7.553882122039795, + "rewards/rejected": -4.420830726623535, + "step": 12475 + }, + { + "epoch": 3.12, + "grad_norm": 8.150406837463379, + "learning_rate": 3.097207944848633e-06, + "logits/chosen": -0.48658326268196106, + "logits/rejected": -0.5792415738105774, + "logps/chosen": -56.15067672729492, + "logps/rejected": -104.07566833496094, + "loss": 0.7326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.293315887451172, + "rewards/margins": 7.068431377410889, + "rewards/rejected": -3.775115728378296, + "step": 12476 + }, + { + "epoch": 3.12, + "grad_norm": 6.915759086608887, + "learning_rate": 3.096481120867183e-06, + "logits/chosen": -0.6129170060157776, + "logits/rejected": -0.728721559047699, + "logps/chosen": -60.851829528808594, + "logps/rejected": -100.14665222167969, + "loss": 0.6533, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.805833339691162, + "rewards/margins": 6.594242095947266, + "rewards/rejected": -3.7884092330932617, + "step": 12477 + }, + { + "epoch": 3.12, + "grad_norm": 10.766205787658691, + "learning_rate": 3.09575434392371e-06, + "logits/chosen": -0.5070164799690247, + "logits/rejected": -0.5824808478355408, + "logps/chosen": -60.77500915527344, + "logps/rejected": -116.01404571533203, + "loss": 0.6641, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9282209873199463, + "rewards/margins": 6.716124057769775, + "rewards/rejected": -3.787902593612671, + "step": 12478 + }, + { + "epoch": 3.12, + "grad_norm": 6.0171284675598145, + "learning_rate": 3.095027614036173e-06, + "logits/chosen": -0.5298497080802917, + "logits/rejected": -0.5934005379676819, + "logps/chosen": -51.133811950683594, + "logps/rejected": -122.69823455810547, + "loss": 0.686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.104647397994995, + "rewards/margins": 6.743800163269043, + "rewards/rejected": -3.639153003692627, + "step": 12479 + }, + { + "epoch": 3.12, + "grad_norm": 3.7356786727905273, + "learning_rate": 3.0943009312225285e-06, + "logits/chosen": -0.5592925548553467, + "logits/rejected": -0.6487593054771423, + "logps/chosen": -48.65437316894531, + "logps/rejected": -95.38748168945312, + "loss": 0.5488, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.048624277114868, + "rewards/margins": 7.012353420257568, + "rewards/rejected": -3.9637293815612793, + "step": 12480 + }, + { + "epoch": 3.12, + "grad_norm": 4.1169328689575195, + "learning_rate": 3.0935742955007353e-06, + "logits/chosen": -0.49045035243034363, + "logits/rejected": -0.5711654424667358, + "logps/chosen": -47.20789337158203, + "logps/rejected": -96.16490173339844, + "loss": 0.6294, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9322657585144043, + "rewards/margins": 6.326589584350586, + "rewards/rejected": -3.3943240642547607, + "step": 12481 + }, + { + "epoch": 3.12, + "grad_norm": 11.617439270019531, + "learning_rate": 3.092847706888749e-06, + "logits/chosen": -0.5648339986801147, + "logits/rejected": -0.6822286248207092, + "logps/chosen": -63.91874694824219, + "logps/rejected": -88.95317840576172, + "loss": 0.6648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9232234954833984, + "rewards/margins": 6.213754653930664, + "rewards/rejected": -3.290531635284424, + "step": 12482 + }, + { + "epoch": 3.12, + "grad_norm": 2.681941032409668, + "learning_rate": 3.0921211654045256e-06, + "logits/chosen": -0.5962291955947876, + "logits/rejected": -0.6653070449829102, + "logps/chosen": -51.64415740966797, + "logps/rejected": -119.46553039550781, + "loss": 0.5724, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.265385150909424, + "rewards/margins": 8.209364891052246, + "rewards/rejected": -4.943979263305664, + "step": 12483 + }, + { + "epoch": 3.12, + "grad_norm": 9.000896453857422, + "learning_rate": 3.091394671066016e-06, + "logits/chosen": -0.46797245740890503, + "logits/rejected": -0.520702600479126, + "logps/chosen": -48.90516662597656, + "logps/rejected": -100.12738800048828, + "loss": 0.6202, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9315037727355957, + "rewards/margins": 6.162502288818359, + "rewards/rejected": -3.2309982776641846, + "step": 12484 + }, + { + "epoch": 3.12, + "grad_norm": 5.470855236053467, + "learning_rate": 3.0906682238911743e-06, + "logits/chosen": -0.5721564292907715, + "logits/rejected": -0.6964385509490967, + "logps/chosen": -51.601768493652344, + "logps/rejected": -109.6402359008789, + "loss": 0.5643, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.147191047668457, + "rewards/margins": 8.211645126342773, + "rewards/rejected": -5.064453125, + "step": 12485 + }, + { + "epoch": 3.12, + "grad_norm": 6.147942543029785, + "learning_rate": 3.089941823897954e-06, + "logits/chosen": -0.48302143812179565, + "logits/rejected": -0.5972938537597656, + "logps/chosen": -55.64509582519531, + "logps/rejected": -85.35408020019531, + "loss": 0.6758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.806201219558716, + "rewards/margins": 6.430540561676025, + "rewards/rejected": -3.6243391036987305, + "step": 12486 + }, + { + "epoch": 3.12, + "grad_norm": 3.517134189605713, + "learning_rate": 3.0892154711042997e-06, + "logits/chosen": -0.5114302635192871, + "logits/rejected": -0.6119179129600525, + "logps/chosen": -56.38546371459961, + "logps/rejected": -86.91392517089844, + "loss": 0.538, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989785671234131, + "rewards/margins": 6.147500514984131, + "rewards/rejected": -3.15771484375, + "step": 12487 + }, + { + "epoch": 3.12, + "grad_norm": 12.904059410095215, + "learning_rate": 3.0884891655281646e-06, + "logits/chosen": -0.519054651260376, + "logits/rejected": -0.5905826091766357, + "logps/chosen": -69.46573638916016, + "logps/rejected": -95.99998474121094, + "loss": 0.7034, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9903318881988525, + "rewards/margins": 5.87106466293335, + "rewards/rejected": -2.8807332515716553, + "step": 12488 + }, + { + "epoch": 3.12, + "grad_norm": 5.108415126800537, + "learning_rate": 3.087762907187496e-06, + "logits/chosen": -0.5884814262390137, + "logits/rejected": -0.6600186824798584, + "logps/chosen": -46.326255798339844, + "logps/rejected": -107.92787170410156, + "loss": 0.5803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.104985237121582, + "rewards/margins": 7.187605857849121, + "rewards/rejected": -4.082620620727539, + "step": 12489 + }, + { + "epoch": 3.12, + "grad_norm": 6.1724042892456055, + "learning_rate": 3.0870366961002395e-06, + "logits/chosen": -0.5401536822319031, + "logits/rejected": -0.6386457085609436, + "logps/chosen": -60.46061706542969, + "logps/rejected": -104.17375183105469, + "loss": 0.6915, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0622105598449707, + "rewards/margins": 6.922720909118652, + "rewards/rejected": -3.8605103492736816, + "step": 12490 + }, + { + "epoch": 3.12, + "grad_norm": 3.9300482273101807, + "learning_rate": 3.086310532284341e-06, + "logits/chosen": -0.5992490649223328, + "logits/rejected": -0.6524481773376465, + "logps/chosen": -50.91415023803711, + "logps/rejected": -118.80038452148438, + "loss": 0.5661, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8684473037719727, + "rewards/margins": 7.616246700286865, + "rewards/rejected": -4.747799873352051, + "step": 12491 + }, + { + "epoch": 3.13, + "grad_norm": 4.279702663421631, + "learning_rate": 3.0855844157577447e-06, + "logits/chosen": -0.5105717182159424, + "logits/rejected": -0.5338026285171509, + "logps/chosen": -51.04106140136719, + "logps/rejected": -98.51348114013672, + "loss": 0.6894, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2273576259613037, + "rewards/margins": 5.691828727722168, + "rewards/rejected": -2.464470863342285, + "step": 12492 + }, + { + "epoch": 3.13, + "grad_norm": 3.3884575366973877, + "learning_rate": 3.0848583465383947e-06, + "logits/chosen": -0.5610133409500122, + "logits/rejected": -0.6403790712356567, + "logps/chosen": -71.41558837890625, + "logps/rejected": -118.93553924560547, + "loss": 0.6644, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6869821548461914, + "rewards/margins": 8.210676193237305, + "rewards/rejected": -5.523694038391113, + "step": 12493 + }, + { + "epoch": 3.13, + "grad_norm": 5.001433849334717, + "learning_rate": 3.0841323246442306e-06, + "logits/chosen": -0.5119484066963196, + "logits/rejected": -0.6309566497802734, + "logps/chosen": -72.7353286743164, + "logps/rejected": -105.3173828125, + "loss": 0.6648, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.069009780883789, + "rewards/margins": 6.63955020904541, + "rewards/rejected": -3.570540428161621, + "step": 12494 + }, + { + "epoch": 3.13, + "grad_norm": 8.337639808654785, + "learning_rate": 3.0834063500931947e-06, + "logits/chosen": -0.5790514349937439, + "logits/rejected": -0.656936764717102, + "logps/chosen": -51.80207824707031, + "logps/rejected": -97.00967407226562, + "loss": 0.6486, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.921262264251709, + "rewards/margins": 5.883822441101074, + "rewards/rejected": -2.9625606536865234, + "step": 12495 + }, + { + "epoch": 3.13, + "grad_norm": 4.098719120025635, + "learning_rate": 3.082680422903226e-06, + "logits/chosen": -0.5199974775314331, + "logits/rejected": -0.6609283089637756, + "logps/chosen": -52.52656555175781, + "logps/rejected": -99.36280059814453, + "loss": 0.5385, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0889644622802734, + "rewards/margins": 8.321224212646484, + "rewards/rejected": -5.232259750366211, + "step": 12496 + }, + { + "epoch": 3.13, + "grad_norm": 5.0112080574035645, + "learning_rate": 3.081954543092266e-06, + "logits/chosen": -0.6050567030906677, + "logits/rejected": -0.6882545948028564, + "logps/chosen": -53.633018493652344, + "logps/rejected": -104.79432678222656, + "loss": 0.6104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.938296318054199, + "rewards/margins": 6.807181358337402, + "rewards/rejected": -3.868885040283203, + "step": 12497 + }, + { + "epoch": 3.13, + "grad_norm": 2.1339170932769775, + "learning_rate": 3.081228710678247e-06, + "logits/chosen": -0.45650631189346313, + "logits/rejected": -0.5391224026679993, + "logps/chosen": -49.08744812011719, + "logps/rejected": -106.08052062988281, + "loss": 0.5435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.293846607208252, + "rewards/margins": 8.381585121154785, + "rewards/rejected": -5.087738990783691, + "step": 12498 + }, + { + "epoch": 3.13, + "grad_norm": 4.571007251739502, + "learning_rate": 3.080502925679107e-06, + "logits/chosen": -0.5294938683509827, + "logits/rejected": -0.5521718263626099, + "logps/chosen": -44.98332977294922, + "logps/rejected": -116.55436706542969, + "loss": 0.5577, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3147780895233154, + "rewards/margins": 7.386147499084473, + "rewards/rejected": -4.07136869430542, + "step": 12499 + }, + { + "epoch": 3.13, + "grad_norm": 12.618725776672363, + "learning_rate": 3.079777188112784e-06, + "logits/chosen": -0.4828117787837982, + "logits/rejected": -0.5762879848480225, + "logps/chosen": -52.425228118896484, + "logps/rejected": -102.59576416015625, + "loss": 0.5983, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85508394241333, + "rewards/margins": 6.763533592224121, + "rewards/rejected": -3.908449172973633, + "step": 12500 + }, + { + "epoch": 3.13, + "grad_norm": 5.419832706451416, + "learning_rate": 3.0790514979972064e-06, + "logits/chosen": -0.541292130947113, + "logits/rejected": -0.6392719745635986, + "logps/chosen": -53.7879638671875, + "logps/rejected": -81.60604858398438, + "loss": 0.6116, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.953752279281616, + "rewards/margins": 5.963937282562256, + "rewards/rejected": -3.0101847648620605, + "step": 12501 + }, + { + "epoch": 3.13, + "grad_norm": 6.053884983062744, + "learning_rate": 3.0783258553503105e-06, + "logits/chosen": -0.4790835976600647, + "logits/rejected": -0.509355366230011, + "logps/chosen": -50.48324203491211, + "logps/rejected": -95.92967987060547, + "loss": 0.6735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.034742832183838, + "rewards/margins": 5.658817291259766, + "rewards/rejected": -2.624074935913086, + "step": 12502 + }, + { + "epoch": 3.13, + "grad_norm": 8.851787567138672, + "learning_rate": 3.077600260190027e-06, + "logits/chosen": -0.5786987543106079, + "logits/rejected": -0.6527886390686035, + "logps/chosen": -42.889007568359375, + "logps/rejected": -101.61446380615234, + "loss": 0.5782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.349491834640503, + "rewards/margins": 6.62804651260376, + "rewards/rejected": -3.278555393218994, + "step": 12503 + }, + { + "epoch": 3.13, + "grad_norm": 6.525018692016602, + "learning_rate": 3.076874712534286e-06, + "logits/chosen": -0.5965057611465454, + "logits/rejected": -0.6091780662536621, + "logps/chosen": -50.03612518310547, + "logps/rejected": -113.86124420166016, + "loss": 0.6434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.944956064224243, + "rewards/margins": 5.904146671295166, + "rewards/rejected": -2.959190845489502, + "step": 12504 + }, + { + "epoch": 3.13, + "grad_norm": 6.963936805725098, + "learning_rate": 3.076149212401016e-06, + "logits/chosen": -0.521409273147583, + "logits/rejected": -0.5913736820220947, + "logps/chosen": -61.19353103637695, + "logps/rejected": -118.65019226074219, + "loss": 0.5917, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0024659633636475, + "rewards/margins": 6.855731010437012, + "rewards/rejected": -3.8532650470733643, + "step": 12505 + }, + { + "epoch": 3.13, + "grad_norm": 3.507669687271118, + "learning_rate": 3.075423759808145e-06, + "logits/chosen": -0.5090349912643433, + "logits/rejected": -0.5750471353530884, + "logps/chosen": -57.08713912963867, + "logps/rejected": -102.53812408447266, + "loss": 0.6884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.248875856399536, + "rewards/margins": 6.581302642822266, + "rewards/rejected": -3.3324263095855713, + "step": 12506 + }, + { + "epoch": 3.13, + "grad_norm": 15.761089324951172, + "learning_rate": 3.0746983547736e-06, + "logits/chosen": -0.4380094110965729, + "logits/rejected": -0.5399216413497925, + "logps/chosen": -69.74076843261719, + "logps/rejected": -112.92501068115234, + "loss": 0.6713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.69808030128479, + "rewards/margins": 6.427778244018555, + "rewards/rejected": -3.7296977043151855, + "step": 12507 + }, + { + "epoch": 3.13, + "grad_norm": 26.76113510131836, + "learning_rate": 3.0739729973153075e-06, + "logits/chosen": -0.48221200704574585, + "logits/rejected": -0.5506302714347839, + "logps/chosen": -62.46455001831055, + "logps/rejected": -114.57849884033203, + "loss": 0.7723, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7017226219177246, + "rewards/margins": 6.641415596008301, + "rewards/rejected": -3.9396934509277344, + "step": 12508 + }, + { + "epoch": 3.13, + "grad_norm": 23.144367218017578, + "learning_rate": 3.07324768745119e-06, + "logits/chosen": -0.5107685327529907, + "logits/rejected": -0.5445586442947388, + "logps/chosen": -52.98572540283203, + "logps/rejected": -108.89608001708984, + "loss": 0.6271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.155088424682617, + "rewards/margins": 6.597257614135742, + "rewards/rejected": -3.442169427871704, + "step": 12509 + }, + { + "epoch": 3.13, + "grad_norm": 5.182073593139648, + "learning_rate": 3.07252242519917e-06, + "logits/chosen": -0.5107945203781128, + "logits/rejected": -0.5435940623283386, + "logps/chosen": -55.07077407836914, + "logps/rejected": -108.16024780273438, + "loss": 0.7171, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0863871574401855, + "rewards/margins": 6.583444118499756, + "rewards/rejected": -3.497056484222412, + "step": 12510 + }, + { + "epoch": 3.13, + "grad_norm": 8.399582862854004, + "learning_rate": 3.071797210577175e-06, + "logits/chosen": -0.6095777153968811, + "logits/rejected": -0.6429484486579895, + "logps/chosen": -49.59375, + "logps/rejected": -104.3916015625, + "loss": 0.6803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8236172199249268, + "rewards/margins": 5.585877895355225, + "rewards/rejected": -2.762260675430298, + "step": 12511 + }, + { + "epoch": 3.13, + "grad_norm": 22.266860961914062, + "learning_rate": 3.0710720436031193e-06, + "logits/chosen": -0.5745609998703003, + "logits/rejected": -0.6386392712593079, + "logps/chosen": -51.946678161621094, + "logps/rejected": -107.59960174560547, + "loss": 0.6861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3207592964172363, + "rewards/margins": 7.153388023376465, + "rewards/rejected": -3.8326287269592285, + "step": 12512 + }, + { + "epoch": 3.13, + "grad_norm": 9.60454273223877, + "learning_rate": 3.0703469242949233e-06, + "logits/chosen": -0.5154886245727539, + "logits/rejected": -0.6130715608596802, + "logps/chosen": -53.37395477294922, + "logps/rejected": -111.87334442138672, + "loss": 0.6626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0254502296447754, + "rewards/margins": 6.137867450714111, + "rewards/rejected": -3.112417697906494, + "step": 12513 + }, + { + "epoch": 3.13, + "grad_norm": 8.532036781311035, + "learning_rate": 3.0696218526705095e-06, + "logits/chosen": -0.5383042097091675, + "logits/rejected": -0.5870231986045837, + "logps/chosen": -60.44298553466797, + "logps/rejected": -93.6823501586914, + "loss": 0.7413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7835307121276855, + "rewards/margins": 5.487202167510986, + "rewards/rejected": -2.703671932220459, + "step": 12514 + }, + { + "epoch": 3.13, + "grad_norm": 7.404848575592041, + "learning_rate": 3.068896828747794e-06, + "logits/chosen": -0.5567479133605957, + "logits/rejected": -0.6037644743919373, + "logps/chosen": -55.267093658447266, + "logps/rejected": -103.43218994140625, + "loss": 0.6636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8823540210723877, + "rewards/margins": 5.951671123504639, + "rewards/rejected": -3.06931734085083, + "step": 12515 + }, + { + "epoch": 3.13, + "grad_norm": 4.723426818847656, + "learning_rate": 3.06817185254469e-06, + "logits/chosen": -0.5637003779411316, + "logits/rejected": -0.6588433384895325, + "logps/chosen": -44.51861572265625, + "logps/rejected": -113.70020294189453, + "loss": 0.637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0920791625976562, + "rewards/margins": 7.593912124633789, + "rewards/rejected": -4.501832485198975, + "step": 12516 + }, + { + "epoch": 3.13, + "grad_norm": 3.3865914344787598, + "learning_rate": 3.067446924079115e-06, + "logits/chosen": -0.5770605802536011, + "logits/rejected": -0.6473550796508789, + "logps/chosen": -48.035247802734375, + "logps/rejected": -96.95663452148438, + "loss": 0.6214, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.893968105316162, + "rewards/margins": 6.965384006500244, + "rewards/rejected": -4.07141637802124, + "step": 12517 + }, + { + "epoch": 3.13, + "grad_norm": 4.411604404449463, + "learning_rate": 3.0667220433689826e-06, + "logits/chosen": -0.5296364426612854, + "logits/rejected": -0.5999597311019897, + "logps/chosen": -55.08066177368164, + "logps/rejected": -109.71736145019531, + "loss": 0.6121, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.845433235168457, + "rewards/margins": 7.099709987640381, + "rewards/rejected": -4.254277229309082, + "step": 12518 + }, + { + "epoch": 3.13, + "grad_norm": 4.1395487785339355, + "learning_rate": 3.065997210432204e-06, + "logits/chosen": -0.5254059433937073, + "logits/rejected": -0.5961087942123413, + "logps/chosen": -55.87325668334961, + "logps/rejected": -95.40882110595703, + "loss": 0.589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.870540142059326, + "rewards/margins": 6.647048473358154, + "rewards/rejected": -3.776508331298828, + "step": 12519 + }, + { + "epoch": 3.13, + "grad_norm": 3.433589220046997, + "learning_rate": 3.0652724252866918e-06, + "logits/chosen": -0.51530921459198, + "logits/rejected": -0.6167593598365784, + "logps/chosen": -60.386390686035156, + "logps/rejected": -98.859130859375, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.170074224472046, + "rewards/margins": 6.802870750427246, + "rewards/rejected": -3.6327965259552, + "step": 12520 + }, + { + "epoch": 3.13, + "grad_norm": 5.312684059143066, + "learning_rate": 3.064547687950355e-06, + "logits/chosen": -0.5452129244804382, + "logits/rejected": -0.6666827201843262, + "logps/chosen": -47.279624938964844, + "logps/rejected": -98.00437927246094, + "loss": 0.5754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.185091018676758, + "rewards/margins": 7.285140514373779, + "rewards/rejected": -4.1000494956970215, + "step": 12521 + }, + { + "epoch": 3.13, + "grad_norm": 4.346690654754639, + "learning_rate": 3.063822998441105e-06, + "logits/chosen": -0.5462590456008911, + "logits/rejected": -0.5958238840103149, + "logps/chosen": -58.298221588134766, + "logps/rejected": -92.44380187988281, + "loss": 0.6695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.167360782623291, + "rewards/margins": 5.170462608337402, + "rewards/rejected": -2.0031020641326904, + "step": 12522 + }, + { + "epoch": 3.13, + "grad_norm": 5.543505668640137, + "learning_rate": 3.063098356776847e-06, + "logits/chosen": -0.5317321419715881, + "logits/rejected": -0.6131817102432251, + "logps/chosen": -51.504817962646484, + "logps/rejected": -97.66201782226562, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9494340419769287, + "rewards/margins": 5.502216339111328, + "rewards/rejected": -2.5527825355529785, + "step": 12523 + }, + { + "epoch": 3.13, + "grad_norm": 17.016931533813477, + "learning_rate": 3.062373762975489e-06, + "logits/chosen": -0.46755465865135193, + "logits/rejected": -0.5758194923400879, + "logps/chosen": -67.96075439453125, + "logps/rejected": -105.75296020507812, + "loss": 0.8051, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7492337226867676, + "rewards/margins": 5.9108452796936035, + "rewards/rejected": -3.161611557006836, + "step": 12524 + }, + { + "epoch": 3.13, + "grad_norm": 7.609513282775879, + "learning_rate": 3.061649217054937e-06, + "logits/chosen": -0.7166455388069153, + "logits/rejected": -0.7958440780639648, + "logps/chosen": -57.35285568237305, + "logps/rejected": -101.10308837890625, + "loss": 0.6769, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1767690181732178, + "rewards/margins": 7.509067535400391, + "rewards/rejected": -4.332298755645752, + "step": 12525 + }, + { + "epoch": 3.13, + "grad_norm": 4.684732913970947, + "learning_rate": 3.0609247190330938e-06, + "logits/chosen": -0.48817962408065796, + "logits/rejected": -0.5594709515571594, + "logps/chosen": -47.994232177734375, + "logps/rejected": -102.12393951416016, + "loss": 0.5671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0782158374786377, + "rewards/margins": 6.406023979187012, + "rewards/rejected": -3.327808380126953, + "step": 12526 + }, + { + "epoch": 3.13, + "grad_norm": 4.744450092315674, + "learning_rate": 3.060200268927862e-06, + "logits/chosen": -0.5867112874984741, + "logits/rejected": -0.6558261513710022, + "logps/chosen": -52.055747985839844, + "logps/rejected": -98.18101501464844, + "loss": 0.6308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0458014011383057, + "rewards/margins": 6.405701160430908, + "rewards/rejected": -3.3598995208740234, + "step": 12527 + }, + { + "epoch": 3.13, + "grad_norm": 6.774609565734863, + "learning_rate": 3.0594758667571462e-06, + "logits/chosen": -0.494018018245697, + "logits/rejected": -0.5703679323196411, + "logps/chosen": -52.87001037597656, + "logps/rejected": -109.36344909667969, + "loss": 0.6527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.4663338661193848, + "rewards/margins": 7.160375595092773, + "rewards/rejected": -4.694041728973389, + "step": 12528 + }, + { + "epoch": 3.13, + "grad_norm": 4.318065643310547, + "learning_rate": 3.0587515125388478e-06, + "logits/chosen": -0.5464351773262024, + "logits/rejected": -0.6178956627845764, + "logps/chosen": -54.33840560913086, + "logps/rejected": -124.40872192382812, + "loss": 0.5786, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2034764289855957, + "rewards/margins": 8.957740783691406, + "rewards/rejected": -5.754264831542969, + "step": 12529 + }, + { + "epoch": 3.13, + "grad_norm": 4.509596824645996, + "learning_rate": 3.0580272062908605e-06, + "logits/chosen": -0.5175305008888245, + "logits/rejected": -0.5867308974266052, + "logps/chosen": -59.3978157043457, + "logps/rejected": -105.04222106933594, + "loss": 0.6774, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.730705976486206, + "rewards/margins": 5.9241180419921875, + "rewards/rejected": -3.1934118270874023, + "step": 12530 + }, + { + "epoch": 3.13, + "grad_norm": 3.573697090148926, + "learning_rate": 3.0573029480310883e-06, + "logits/chosen": -0.5640528202056885, + "logits/rejected": -0.6554524898529053, + "logps/chosen": -50.8785400390625, + "logps/rejected": -92.05172729492188, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.010279655456543, + "rewards/margins": 7.834480285644531, + "rewards/rejected": -4.824200630187988, + "step": 12531 + }, + { + "epoch": 3.14, + "grad_norm": 4.116913795471191, + "learning_rate": 3.0565787377774263e-06, + "logits/chosen": -0.5366670489311218, + "logits/rejected": -0.631623387336731, + "logps/chosen": -65.69817352294922, + "logps/rejected": -101.57782745361328, + "loss": 0.578, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1814746856689453, + "rewards/margins": 6.446264266967773, + "rewards/rejected": -3.2647898197174072, + "step": 12532 + }, + { + "epoch": 3.14, + "grad_norm": 5.499844551086426, + "learning_rate": 3.055854575547772e-06, + "logits/chosen": -0.558809220790863, + "logits/rejected": -0.6353847980499268, + "logps/chosen": -45.528106689453125, + "logps/rejected": -92.19712829589844, + "loss": 0.6359, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1705994606018066, + "rewards/margins": 5.940559387207031, + "rewards/rejected": -2.7699599266052246, + "step": 12533 + }, + { + "epoch": 3.14, + "grad_norm": 4.838677883148193, + "learning_rate": 3.055130461360018e-06, + "logits/chosen": -0.4741152822971344, + "logits/rejected": -0.6050083041191101, + "logps/chosen": -60.68174743652344, + "logps/rejected": -98.89396667480469, + "loss": 0.6756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5637922286987305, + "rewards/margins": 7.107420921325684, + "rewards/rejected": -3.5436289310455322, + "step": 12534 + }, + { + "epoch": 3.14, + "grad_norm": 8.230780601501465, + "learning_rate": 3.054406395232059e-06, + "logits/chosen": -0.555027961730957, + "logits/rejected": -0.5808705687522888, + "logps/chosen": -56.04902648925781, + "logps/rejected": -87.85552215576172, + "loss": 0.6905, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.952425956726074, + "rewards/margins": 6.003934860229492, + "rewards/rejected": -3.051508903503418, + "step": 12535 + }, + { + "epoch": 3.14, + "grad_norm": 4.604122161865234, + "learning_rate": 3.053682377181788e-06, + "logits/chosen": -0.521970808506012, + "logits/rejected": -0.5618836283683777, + "logps/chosen": -43.543907165527344, + "logps/rejected": -96.56723022460938, + "loss": 0.559, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.486006259918213, + "rewards/margins": 7.076643943786621, + "rewards/rejected": -3.590637683868408, + "step": 12536 + }, + { + "epoch": 3.14, + "grad_norm": 3.5493290424346924, + "learning_rate": 3.0529584072270944e-06, + "logits/chosen": -0.5322261452674866, + "logits/rejected": -0.6134386658668518, + "logps/chosen": -62.20447540283203, + "logps/rejected": -104.5125503540039, + "loss": 0.6881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.237886667251587, + "rewards/margins": 7.1830291748046875, + "rewards/rejected": -3.9451420307159424, + "step": 12537 + }, + { + "epoch": 3.14, + "grad_norm": 2.7636666297912598, + "learning_rate": 3.052234485385871e-06, + "logits/chosen": -0.5565851330757141, + "logits/rejected": -0.6258332133293152, + "logps/chosen": -43.031646728515625, + "logps/rejected": -102.85438537597656, + "loss": 0.548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.223219633102417, + "rewards/margins": 7.460978031158447, + "rewards/rejected": -4.237758159637451, + "step": 12538 + }, + { + "epoch": 3.14, + "grad_norm": 4.818996906280518, + "learning_rate": 3.051510611676003e-06, + "logits/chosen": -0.5584502816200256, + "logits/rejected": -0.6585659384727478, + "logps/chosen": -50.61760330200195, + "logps/rejected": -100.85231018066406, + "loss": 0.5691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1503262519836426, + "rewards/margins": 6.09036922454834, + "rewards/rejected": -2.94004225730896, + "step": 12539 + }, + { + "epoch": 3.14, + "grad_norm": 7.7999348640441895, + "learning_rate": 3.0507867861153835e-06, + "logits/chosen": -0.48351186513900757, + "logits/rejected": -0.5471469163894653, + "logps/chosen": -55.917449951171875, + "logps/rejected": -100.77381134033203, + "loss": 0.6925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.823122262954712, + "rewards/margins": 5.368441104888916, + "rewards/rejected": -2.5453193187713623, + "step": 12540 + }, + { + "epoch": 3.14, + "grad_norm": 7.630490303039551, + "learning_rate": 3.0500630087218943e-06, + "logits/chosen": -0.5310905575752258, + "logits/rejected": -0.5668216347694397, + "logps/chosen": -51.16462707519531, + "logps/rejected": -108.36884307861328, + "loss": 0.5973, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.351390838623047, + "rewards/margins": 6.655818939208984, + "rewards/rejected": -3.3044283390045166, + "step": 12541 + }, + { + "epoch": 3.14, + "grad_norm": 2.9002134799957275, + "learning_rate": 3.04933927951342e-06, + "logits/chosen": -0.5146265029907227, + "logits/rejected": -0.6079615950584412, + "logps/chosen": -57.15013885498047, + "logps/rejected": -87.27530670166016, + "loss": 0.6067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3171818256378174, + "rewards/margins": 6.698364734649658, + "rewards/rejected": -3.38118314743042, + "step": 12542 + }, + { + "epoch": 3.14, + "grad_norm": 16.717557907104492, + "learning_rate": 3.048615598507851e-06, + "logits/chosen": -0.45426931977272034, + "logits/rejected": -0.5893963575363159, + "logps/chosen": -63.32595443725586, + "logps/rejected": -93.17798614501953, + "loss": 0.8593, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.154306411743164, + "rewards/margins": 5.614851951599121, + "rewards/rejected": -2.460545778274536, + "step": 12543 + }, + { + "epoch": 3.14, + "grad_norm": 5.88661003112793, + "learning_rate": 3.0478919657230625e-06, + "logits/chosen": -0.5159881114959717, + "logits/rejected": -0.5671306848526001, + "logps/chosen": -50.33488845825195, + "logps/rejected": -116.36139678955078, + "loss": 0.5814, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0265016555786133, + "rewards/margins": 6.004417419433594, + "rewards/rejected": -2.9779162406921387, + "step": 12544 + }, + { + "epoch": 3.14, + "grad_norm": 3.460507869720459, + "learning_rate": 3.0471683811769416e-06, + "logits/chosen": -0.4478394389152527, + "logits/rejected": -0.5484011173248291, + "logps/chosen": -60.544837951660156, + "logps/rejected": -103.4870376586914, + "loss": 0.5754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118969678878784, + "rewards/margins": 6.913962364196777, + "rewards/rejected": -3.794992208480835, + "step": 12545 + }, + { + "epoch": 3.14, + "grad_norm": 6.5696306228637695, + "learning_rate": 3.0464448448873663e-06, + "logits/chosen": -0.5781477689743042, + "logits/rejected": -0.6768363118171692, + "logps/chosen": -55.50533676147461, + "logps/rejected": -105.21004486083984, + "loss": 0.706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8753321170806885, + "rewards/margins": 7.104757308959961, + "rewards/rejected": -4.22942590713501, + "step": 12546 + }, + { + "epoch": 3.14, + "grad_norm": 3.1223692893981934, + "learning_rate": 3.045721356872218e-06, + "logits/chosen": -0.5010459423065186, + "logits/rejected": -0.6305751204490662, + "logps/chosen": -58.49958801269531, + "logps/rejected": -95.45262145996094, + "loss": 0.5771, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.121711254119873, + "rewards/margins": 6.905481815338135, + "rewards/rejected": -3.7837705612182617, + "step": 12547 + }, + { + "epoch": 3.14, + "grad_norm": 6.321862697601318, + "learning_rate": 3.044997917149373e-06, + "logits/chosen": -0.5512211322784424, + "logits/rejected": -0.6084052920341492, + "logps/chosen": -56.280418395996094, + "logps/rejected": -112.53038024902344, + "loss": 0.6223, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.102928400039673, + "rewards/margins": 7.6667561531066895, + "rewards/rejected": -4.5638275146484375, + "step": 12548 + }, + { + "epoch": 3.14, + "grad_norm": 5.400115489959717, + "learning_rate": 3.044274525736709e-06, + "logits/chosen": -0.5079699754714966, + "logits/rejected": -0.573071300983429, + "logps/chosen": -61.51503372192383, + "logps/rejected": -105.00405883789062, + "loss": 0.6487, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0831875801086426, + "rewards/margins": 6.841259956359863, + "rewards/rejected": -3.7580726146698, + "step": 12549 + }, + { + "epoch": 3.14, + "grad_norm": 3.159818649291992, + "learning_rate": 3.0435511826521024e-06, + "logits/chosen": -0.5055572986602783, + "logits/rejected": -0.5836018323898315, + "logps/chosen": -52.83427810668945, + "logps/rejected": -113.66692352294922, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9486441612243652, + "rewards/margins": 6.893349647521973, + "rewards/rejected": -3.9447059631347656, + "step": 12550 + }, + { + "epoch": 3.14, + "grad_norm": 6.072895526885986, + "learning_rate": 3.042827887913426e-06, + "logits/chosen": -0.6013303995132446, + "logits/rejected": -0.604643702507019, + "logps/chosen": -54.645790100097656, + "logps/rejected": -141.79254150390625, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2404017448425293, + "rewards/margins": 8.57861328125, + "rewards/rejected": -5.338211536407471, + "step": 12551 + }, + { + "epoch": 3.14, + "grad_norm": 4.944685935974121, + "learning_rate": 3.042104641538554e-06, + "logits/chosen": -0.5827472805976868, + "logits/rejected": -0.6409649848937988, + "logps/chosen": -45.45460891723633, + "logps/rejected": -109.31028747558594, + "loss": 0.6494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2279417514801025, + "rewards/margins": 7.232626914978027, + "rewards/rejected": -4.004684925079346, + "step": 12552 + }, + { + "epoch": 3.14, + "grad_norm": 5.115041732788086, + "learning_rate": 3.041381443545359e-06, + "logits/chosen": -0.4984048008918762, + "logits/rejected": -0.5660055875778198, + "logps/chosen": -57.33386993408203, + "logps/rejected": -104.8265380859375, + "loss": 0.5996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0281479358673096, + "rewards/margins": 6.9000959396362305, + "rewards/rejected": -3.8719482421875, + "step": 12553 + }, + { + "epoch": 3.14, + "grad_norm": 4.300481796264648, + "learning_rate": 3.0406582939517133e-06, + "logits/chosen": -0.5687352418899536, + "logits/rejected": -0.6384690403938293, + "logps/chosen": -48.571807861328125, + "logps/rejected": -82.5396957397461, + "loss": 0.6736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.396965503692627, + "rewards/margins": 5.716670989990234, + "rewards/rejected": -2.3197054862976074, + "step": 12554 + }, + { + "epoch": 3.14, + "grad_norm": 8.474042892456055, + "learning_rate": 3.0399351927754837e-06, + "logits/chosen": -0.5167646408081055, + "logits/rejected": -0.570949375629425, + "logps/chosen": -58.75700378417969, + "logps/rejected": -108.51061248779297, + "loss": 0.6723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0093135833740234, + "rewards/margins": 6.321105003356934, + "rewards/rejected": -3.31179141998291, + "step": 12555 + }, + { + "epoch": 3.14, + "grad_norm": 8.338028907775879, + "learning_rate": 3.039212140034539e-06, + "logits/chosen": -0.4811522662639618, + "logits/rejected": -0.5782417058944702, + "logps/chosen": -63.371620178222656, + "logps/rejected": -97.94010925292969, + "loss": 0.6998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.943591833114624, + "rewards/margins": 7.032198905944824, + "rewards/rejected": -4.088607311248779, + "step": 12556 + }, + { + "epoch": 3.14, + "grad_norm": 5.184249401092529, + "learning_rate": 3.0384891357467496e-06, + "logits/chosen": -0.4100486934185028, + "logits/rejected": -0.5046261548995972, + "logps/chosen": -58.78815841674805, + "logps/rejected": -105.36993408203125, + "loss": 0.5993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.039975166320801, + "rewards/margins": 6.713940143585205, + "rewards/rejected": -3.6739649772644043, + "step": 12557 + }, + { + "epoch": 3.14, + "grad_norm": 6.0816874504089355, + "learning_rate": 3.037766179929982e-06, + "logits/chosen": -0.532546877861023, + "logits/rejected": -0.5720863342285156, + "logps/chosen": -45.3163948059082, + "logps/rejected": -108.82962799072266, + "loss": 0.5936, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.149111270904541, + "rewards/margins": 6.478963851928711, + "rewards/rejected": -3.329853057861328, + "step": 12558 + }, + { + "epoch": 3.14, + "grad_norm": 4.15263032913208, + "learning_rate": 3.037043272602096e-06, + "logits/chosen": -0.5225775837898254, + "logits/rejected": -0.6311852335929871, + "logps/chosen": -54.817901611328125, + "logps/rejected": -92.08573150634766, + "loss": 0.7096, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.869077682495117, + "rewards/margins": 6.977174758911133, + "rewards/rejected": -4.108097553253174, + "step": 12559 + }, + { + "epoch": 3.14, + "grad_norm": 4.1819844245910645, + "learning_rate": 3.036320413780959e-06, + "logits/chosen": -0.48175427317619324, + "logits/rejected": -0.4919232726097107, + "logps/chosen": -53.42533874511719, + "logps/rejected": -130.91127014160156, + "loss": 0.5883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.065115451812744, + "rewards/margins": 7.450967311859131, + "rewards/rejected": -4.38585090637207, + "step": 12560 + }, + { + "epoch": 3.14, + "grad_norm": 8.807496070861816, + "learning_rate": 3.0355976034844348e-06, + "logits/chosen": -0.4772454500198364, + "logits/rejected": -0.5843931436538696, + "logps/chosen": -56.07168197631836, + "logps/rejected": -95.86831665039062, + "loss": 1.0188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1267659664154053, + "rewards/margins": 6.316457271575928, + "rewards/rejected": -3.1896913051605225, + "step": 12561 + }, + { + "epoch": 3.14, + "grad_norm": 6.986248016357422, + "learning_rate": 3.0348748417303826e-06, + "logits/chosen": -0.5943567156791687, + "logits/rejected": -0.6596929430961609, + "logps/chosen": -57.767574310302734, + "logps/rejected": -89.20934295654297, + "loss": 0.7063, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9135332107543945, + "rewards/margins": 5.4539995193481445, + "rewards/rejected": -2.540465831756592, + "step": 12562 + }, + { + "epoch": 3.14, + "grad_norm": 3.7717440128326416, + "learning_rate": 3.034152128536663e-06, + "logits/chosen": -0.5758864283561707, + "logits/rejected": -0.6272439956665039, + "logps/chosen": -52.75266647338867, + "logps/rejected": -116.43753051757812, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.872753143310547, + "rewards/margins": 7.765562534332275, + "rewards/rejected": -4.892808437347412, + "step": 12563 + }, + { + "epoch": 3.14, + "grad_norm": 4.818387985229492, + "learning_rate": 3.0334294639211347e-06, + "logits/chosen": -0.47783738374710083, + "logits/rejected": -0.5201764702796936, + "logps/chosen": -46.317161560058594, + "logps/rejected": -100.89274597167969, + "loss": 0.519, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.159721851348877, + "rewards/margins": 6.525470733642578, + "rewards/rejected": -3.365748882293701, + "step": 12564 + }, + { + "epoch": 3.14, + "grad_norm": 6.243654727935791, + "learning_rate": 3.032706847901658e-06, + "logits/chosen": -0.4478819966316223, + "logits/rejected": -0.5649815201759338, + "logps/chosen": -61.9561653137207, + "logps/rejected": -121.04226684570312, + "loss": 0.6003, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.159980297088623, + "rewards/margins": 8.361406326293945, + "rewards/rejected": -5.201425552368164, + "step": 12565 + }, + { + "epoch": 3.14, + "grad_norm": 4.816740989685059, + "learning_rate": 3.0319842804960865e-06, + "logits/chosen": -0.5183256268501282, + "logits/rejected": -0.5895649790763855, + "logps/chosen": -61.70912170410156, + "logps/rejected": -102.08086395263672, + "loss": 0.6772, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.913616180419922, + "rewards/margins": 7.011354446411133, + "rewards/rejected": -4.097737789154053, + "step": 12566 + }, + { + "epoch": 3.14, + "grad_norm": 7.7584004402160645, + "learning_rate": 3.031261761722276e-06, + "logits/chosen": -0.5551945567131042, + "logits/rejected": -0.5990587472915649, + "logps/chosen": -55.162723541259766, + "logps/rejected": -123.22817993164062, + "loss": 0.6788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1642584800720215, + "rewards/margins": 8.388721466064453, + "rewards/rejected": -5.224462985992432, + "step": 12567 + }, + { + "epoch": 3.14, + "grad_norm": 5.714491844177246, + "learning_rate": 3.0305392915980823e-06, + "logits/chosen": -0.5565057396888733, + "logits/rejected": -0.622093141078949, + "logps/chosen": -51.978759765625, + "logps/rejected": -113.89616394042969, + "loss": 0.5846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.18578839302063, + "rewards/margins": 7.121275424957275, + "rewards/rejected": -3.935487747192383, + "step": 12568 + }, + { + "epoch": 3.14, + "grad_norm": 4.017027378082275, + "learning_rate": 3.0298168701413577e-06, + "logits/chosen": -0.5149871110916138, + "logits/rejected": -0.6280702352523804, + "logps/chosen": -56.290706634521484, + "logps/rejected": -82.48648071289062, + "loss": 0.6629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1871373653411865, + "rewards/margins": 6.049902439117432, + "rewards/rejected": -2.862764835357666, + "step": 12569 + }, + { + "epoch": 3.14, + "grad_norm": 9.755956649780273, + "learning_rate": 3.029094497369952e-06, + "logits/chosen": -0.5270661115646362, + "logits/rejected": -0.5741496682167053, + "logps/chosen": -64.1761703491211, + "logps/rejected": -108.19986724853516, + "loss": 0.7431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6875128746032715, + "rewards/margins": 5.980566024780273, + "rewards/rejected": -3.293053150177002, + "step": 12570 + }, + { + "epoch": 3.14, + "grad_norm": 6.310330867767334, + "learning_rate": 3.0283721733017185e-06, + "logits/chosen": -0.4729207456111908, + "logits/rejected": -0.5932003855705261, + "logps/chosen": -67.55241394042969, + "logps/rejected": -99.98886108398438, + "loss": 0.652, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8346729278564453, + "rewards/margins": 6.648892879486084, + "rewards/rejected": -3.8142201900482178, + "step": 12571 + }, + { + "epoch": 3.15, + "grad_norm": 5.298356056213379, + "learning_rate": 3.0276498979545075e-06, + "logits/chosen": -0.5418907403945923, + "logits/rejected": -0.6335045099258423, + "logps/chosen": -61.596099853515625, + "logps/rejected": -103.89250946044922, + "loss": 0.6568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9472920894622803, + "rewards/margins": 6.674029350280762, + "rewards/rejected": -3.7267379760742188, + "step": 12572 + }, + { + "epoch": 3.15, + "grad_norm": 4.78024959564209, + "learning_rate": 3.0269276713461636e-06, + "logits/chosen": -0.5725393295288086, + "logits/rejected": -0.5811257362365723, + "logps/chosen": -54.913719177246094, + "logps/rejected": -109.410888671875, + "loss": 0.6061, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7260632514953613, + "rewards/margins": 6.1704182624816895, + "rewards/rejected": -3.4443557262420654, + "step": 12573 + }, + { + "epoch": 3.15, + "grad_norm": 7.697381973266602, + "learning_rate": 3.026205493494536e-06, + "logits/chosen": -0.5255885720252991, + "logits/rejected": -0.5862517356872559, + "logps/chosen": -51.38689041137695, + "logps/rejected": -120.85621643066406, + "loss": 0.6321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4037864208221436, + "rewards/margins": 6.834941864013672, + "rewards/rejected": -3.4311554431915283, + "step": 12574 + }, + { + "epoch": 3.15, + "grad_norm": 2.334277391433716, + "learning_rate": 3.0254833644174707e-06, + "logits/chosen": -0.589648425579071, + "logits/rejected": -0.687799870967865, + "logps/chosen": -45.992950439453125, + "logps/rejected": -107.02349090576172, + "loss": 0.5291, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9386954307556152, + "rewards/margins": 7.956087112426758, + "rewards/rejected": -5.017392158508301, + "step": 12575 + }, + { + "epoch": 3.15, + "grad_norm": 7.2762932777404785, + "learning_rate": 3.024761284132811e-06, + "logits/chosen": -0.5112881064414978, + "logits/rejected": -0.5774849057197571, + "logps/chosen": -47.54387283325195, + "logps/rejected": -104.6418228149414, + "loss": 0.6571, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.056245803833008, + "rewards/margins": 5.833444118499756, + "rewards/rejected": -2.777198314666748, + "step": 12576 + }, + { + "epoch": 3.15, + "grad_norm": 3.929517984390259, + "learning_rate": 3.0240392526584007e-06, + "logits/chosen": -0.4919871687889099, + "logits/rejected": -0.5367992520332336, + "logps/chosen": -64.45915222167969, + "logps/rejected": -120.8838119506836, + "loss": 0.6587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0807833671569824, + "rewards/margins": 7.196081161499023, + "rewards/rejected": -4.115298271179199, + "step": 12577 + }, + { + "epoch": 3.15, + "grad_norm": 3.1543917655944824, + "learning_rate": 3.023317270012082e-06, + "logits/chosen": -0.5774241089820862, + "logits/rejected": -0.683204174041748, + "logps/chosen": -51.602787017822266, + "logps/rejected": -102.94976043701172, + "loss": 0.5712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.147653579711914, + "rewards/margins": 7.665340423583984, + "rewards/rejected": -4.5176873207092285, + "step": 12578 + }, + { + "epoch": 3.15, + "grad_norm": 7.399299621582031, + "learning_rate": 3.0225953362116978e-06, + "logits/chosen": -0.5408199429512024, + "logits/rejected": -0.6157112717628479, + "logps/chosen": -55.801490783691406, + "logps/rejected": -125.1419906616211, + "loss": 0.6431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.942662239074707, + "rewards/margins": 7.676278114318848, + "rewards/rejected": -4.733616352081299, + "step": 12579 + }, + { + "epoch": 3.15, + "grad_norm": 4.707407474517822, + "learning_rate": 3.0218734512750847e-06, + "logits/chosen": -0.4993613064289093, + "logits/rejected": -0.5545039772987366, + "logps/chosen": -47.67113494873047, + "logps/rejected": -116.5765380859375, + "loss": 0.5724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000548839569092, + "rewards/margins": 6.064035892486572, + "rewards/rejected": -3.0634877681732178, + "step": 12580 + }, + { + "epoch": 3.15, + "grad_norm": 13.054205894470215, + "learning_rate": 3.0211516152200828e-06, + "logits/chosen": -0.5268442034721375, + "logits/rejected": -0.6413037776947021, + "logps/chosen": -53.39986038208008, + "logps/rejected": -100.76888275146484, + "loss": 0.5797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8227832317352295, + "rewards/margins": 7.163888931274414, + "rewards/rejected": -4.341105937957764, + "step": 12581 + }, + { + "epoch": 3.15, + "grad_norm": 5.659017086029053, + "learning_rate": 3.020429828064527e-06, + "logits/chosen": -0.4743278920650482, + "logits/rejected": -0.5903786420822144, + "logps/chosen": -51.47066116333008, + "logps/rejected": -104.00250244140625, + "loss": 0.6174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8968000411987305, + "rewards/margins": 7.740062236785889, + "rewards/rejected": -4.843262672424316, + "step": 12582 + }, + { + "epoch": 3.15, + "grad_norm": 3.112687349319458, + "learning_rate": 3.0197080898262604e-06, + "logits/chosen": -0.5320351123809814, + "logits/rejected": -0.6415015459060669, + "logps/chosen": -52.05957794189453, + "logps/rejected": -92.31088256835938, + "loss": 0.5949, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9880855083465576, + "rewards/margins": 7.320099830627441, + "rewards/rejected": -4.332014083862305, + "step": 12583 + }, + { + "epoch": 3.15, + "grad_norm": 9.493206977844238, + "learning_rate": 3.0189864005231106e-06, + "logits/chosen": -0.5308144688606262, + "logits/rejected": -0.5830632448196411, + "logps/chosen": -61.064170837402344, + "logps/rejected": -114.442138671875, + "loss": 0.702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8705761432647705, + "rewards/margins": 6.382640838623047, + "rewards/rejected": -3.5120651721954346, + "step": 12584 + }, + { + "epoch": 3.15, + "grad_norm": 3.8344507217407227, + "learning_rate": 3.0182647601729133e-06, + "logits/chosen": -0.5360569357872009, + "logits/rejected": -0.6176976561546326, + "logps/chosen": -53.97349166870117, + "logps/rejected": -80.7394790649414, + "loss": 0.6677, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1789751052856445, + "rewards/margins": 5.434189796447754, + "rewards/rejected": -2.2552149295806885, + "step": 12585 + }, + { + "epoch": 3.15, + "grad_norm": 3.364452600479126, + "learning_rate": 3.017543168793504e-06, + "logits/chosen": -0.5238935947418213, + "logits/rejected": -0.602271556854248, + "logps/chosen": -47.41990661621094, + "logps/rejected": -106.29874420166016, + "loss": 0.5468, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.116351842880249, + "rewards/margins": 7.925719261169434, + "rewards/rejected": -4.809366703033447, + "step": 12586 + }, + { + "epoch": 3.15, + "grad_norm": 7.6384663581848145, + "learning_rate": 3.016821626402709e-06, + "logits/chosen": -0.5184518098831177, + "logits/rejected": -0.6368929743766785, + "logps/chosen": -63.32352828979492, + "logps/rejected": -90.67620849609375, + "loss": 0.7237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046854257583618, + "rewards/margins": 6.77889347076416, + "rewards/rejected": -3.732038736343384, + "step": 12587 + }, + { + "epoch": 3.15, + "grad_norm": 5.260861873626709, + "learning_rate": 3.016100133018362e-06, + "logits/chosen": -0.494320273399353, + "logits/rejected": -0.5563745498657227, + "logps/chosen": -60.828338623046875, + "logps/rejected": -105.08088684082031, + "loss": 0.7157, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9923017024993896, + "rewards/margins": 6.824665546417236, + "rewards/rejected": -3.832364082336426, + "step": 12588 + }, + { + "epoch": 3.15, + "grad_norm": 6.060003757476807, + "learning_rate": 3.015378688658291e-06, + "logits/chosen": -0.4561389684677124, + "logits/rejected": -0.5610156059265137, + "logps/chosen": -62.69528579711914, + "logps/rejected": -96.97429656982422, + "loss": 0.6743, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.209472417831421, + "rewards/margins": 6.687607288360596, + "rewards/rejected": -3.4781341552734375, + "step": 12589 + }, + { + "epoch": 3.15, + "grad_norm": 3.2274036407470703, + "learning_rate": 3.014657293340324e-06, + "logits/chosen": -0.4937041401863098, + "logits/rejected": -0.563298225402832, + "logps/chosen": -61.06338882446289, + "logps/rejected": -115.26162719726562, + "loss": 0.6142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3966774940490723, + "rewards/margins": 6.733761787414551, + "rewards/rejected": -3.3370845317840576, + "step": 12590 + }, + { + "epoch": 3.15, + "grad_norm": 6.239700794219971, + "learning_rate": 3.0139359470822864e-06, + "logits/chosen": -0.5744615793228149, + "logits/rejected": -0.6007769107818604, + "logps/chosen": -47.122562408447266, + "logps/rejected": -109.10274505615234, + "loss": 0.6597, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0072922706604004, + "rewards/margins": 5.55223274230957, + "rewards/rejected": -2.544940948486328, + "step": 12591 + }, + { + "epoch": 3.15, + "grad_norm": 5.3865556716918945, + "learning_rate": 3.0132146499020044e-06, + "logits/chosen": -0.5901801586151123, + "logits/rejected": -0.6120009422302246, + "logps/chosen": -48.95035171508789, + "logps/rejected": -119.24592590332031, + "loss": 0.7073, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.152873992919922, + "rewards/margins": 7.625669956207275, + "rewards/rejected": -4.4727959632873535, + "step": 12592 + }, + { + "epoch": 3.15, + "grad_norm": 8.258038520812988, + "learning_rate": 3.0124934018173026e-06, + "logits/chosen": -0.5397096872329712, + "logits/rejected": -0.5983830094337463, + "logps/chosen": -60.73686981201172, + "logps/rejected": -120.15362548828125, + "loss": 0.6122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.185237407684326, + "rewards/margins": 7.103985786437988, + "rewards/rejected": -3.9187490940093994, + "step": 12593 + }, + { + "epoch": 3.15, + "grad_norm": 4.962287425994873, + "learning_rate": 3.0117722028460007e-06, + "logits/chosen": -0.5252488255500793, + "logits/rejected": -0.6101818680763245, + "logps/chosen": -55.289268493652344, + "logps/rejected": -119.83232116699219, + "loss": 0.5831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1429970264434814, + "rewards/margins": 7.322726726531982, + "rewards/rejected": -4.179730415344238, + "step": 12594 + }, + { + "epoch": 3.15, + "grad_norm": 6.1509904861450195, + "learning_rate": 3.0110510530059234e-06, + "logits/chosen": -0.48890629410743713, + "logits/rejected": -0.5749303102493286, + "logps/chosen": -66.92938232421875, + "logps/rejected": -109.44378662109375, + "loss": 0.6652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0037851333618164, + "rewards/margins": 7.273728847503662, + "rewards/rejected": -4.2699432373046875, + "step": 12595 + }, + { + "epoch": 3.15, + "grad_norm": 1.84676992893219, + "learning_rate": 3.01032995231489e-06, + "logits/chosen": -0.519965410232544, + "logits/rejected": -0.6433132886886597, + "logps/chosen": -59.743553161621094, + "logps/rejected": -98.53363037109375, + "loss": 0.5386, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9970080852508545, + "rewards/margins": 6.7777323722839355, + "rewards/rejected": -3.7807247638702393, + "step": 12596 + }, + { + "epoch": 3.15, + "grad_norm": 6.114794731140137, + "learning_rate": 3.009608900790721e-06, + "logits/chosen": -0.5547679662704468, + "logits/rejected": -0.5795496106147766, + "logps/chosen": -60.80918884277344, + "logps/rejected": -114.61451721191406, + "loss": 0.7013, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.864352226257324, + "rewards/margins": 6.256372928619385, + "rewards/rejected": -3.3920207023620605, + "step": 12597 + }, + { + "epoch": 3.15, + "grad_norm": 5.959779262542725, + "learning_rate": 3.0088878984512314e-06, + "logits/chosen": -0.48314157128334045, + "logits/rejected": -0.5350009202957153, + "logps/chosen": -59.90449905395508, + "logps/rejected": -103.3559799194336, + "loss": 0.7846, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.225918769836426, + "rewards/margins": 5.583352565765381, + "rewards/rejected": -2.3574342727661133, + "step": 12598 + }, + { + "epoch": 3.15, + "grad_norm": 6.449711322784424, + "learning_rate": 3.008166945314239e-06, + "logits/chosen": -0.49406641721725464, + "logits/rejected": -0.6314007043838501, + "logps/chosen": -61.65209197998047, + "logps/rejected": -79.74395751953125, + "loss": 0.7183, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.091151714324951, + "rewards/margins": 5.358825206756592, + "rewards/rejected": -2.2676730155944824, + "step": 12599 + }, + { + "epoch": 3.15, + "grad_norm": 4.149231433868408, + "learning_rate": 3.0074460413975636e-06, + "logits/chosen": -0.4746836721897125, + "logits/rejected": -0.5524835586547852, + "logps/chosen": -49.936798095703125, + "logps/rejected": -97.77462005615234, + "loss": 0.5889, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0916647911071777, + "rewards/margins": 6.939848899841309, + "rewards/rejected": -3.848184108734131, + "step": 12600 + }, + { + "epoch": 3.15, + "grad_norm": 5.64790678024292, + "learning_rate": 3.0067251867190133e-06, + "logits/chosen": -0.49930301308631897, + "logits/rejected": -0.592602014541626, + "logps/chosen": -45.22856140136719, + "logps/rejected": -97.9964828491211, + "loss": 0.5538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.255502700805664, + "rewards/margins": 6.332243919372559, + "rewards/rejected": -3.0767412185668945, + "step": 12601 + }, + { + "epoch": 3.15, + "grad_norm": 5.291317462921143, + "learning_rate": 3.006004381296403e-06, + "logits/chosen": -0.5044732093811035, + "logits/rejected": -0.5906336307525635, + "logps/chosen": -49.16428756713867, + "logps/rejected": -101.01209259033203, + "loss": 0.5825, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1297338008880615, + "rewards/margins": 6.884203910827637, + "rewards/rejected": -3.754470109939575, + "step": 12602 + }, + { + "epoch": 3.15, + "grad_norm": 7.214801788330078, + "learning_rate": 3.005283625147546e-06, + "logits/chosen": -0.5529898405075073, + "logits/rejected": -0.6325510740280151, + "logps/chosen": -55.680320739746094, + "logps/rejected": -100.09192657470703, + "loss": 0.6967, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7104694843292236, + "rewards/margins": 5.879419326782227, + "rewards/rejected": -3.168950080871582, + "step": 12603 + }, + { + "epoch": 3.15, + "grad_norm": 4.334197998046875, + "learning_rate": 3.0045629182902536e-06, + "logits/chosen": -0.5199477076530457, + "logits/rejected": -0.6058332324028015, + "logps/chosen": -45.94233703613281, + "logps/rejected": -89.06114959716797, + "loss": 0.6229, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2168009281158447, + "rewards/margins": 6.40037202835083, + "rewards/rejected": -3.1835708618164062, + "step": 12604 + }, + { + "epoch": 3.15, + "grad_norm": 4.894781112670898, + "learning_rate": 3.0038422607423332e-06, + "logits/chosen": -0.46651339530944824, + "logits/rejected": -0.5545543432235718, + "logps/chosen": -57.150054931640625, + "logps/rejected": -107.48033905029297, + "loss": 0.7056, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9098286628723145, + "rewards/margins": 6.815167427062988, + "rewards/rejected": -3.905339241027832, + "step": 12605 + }, + { + "epoch": 3.15, + "grad_norm": 12.900217056274414, + "learning_rate": 3.0031216525215934e-06, + "logits/chosen": -0.4730609655380249, + "logits/rejected": -0.5762307047843933, + "logps/chosen": -51.55752182006836, + "logps/rejected": -92.56718444824219, + "loss": 0.6501, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0998032093048096, + "rewards/margins": 6.130110740661621, + "rewards/rejected": -3.0303075313568115, + "step": 12606 + }, + { + "epoch": 3.15, + "grad_norm": 6.679142475128174, + "learning_rate": 3.0024010936458426e-06, + "logits/chosen": -0.5070313811302185, + "logits/rejected": -0.5755984783172607, + "logps/chosen": -46.59770202636719, + "logps/rejected": -87.7528076171875, + "loss": 0.7288, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1044557094573975, + "rewards/margins": 5.926522254943848, + "rewards/rejected": -2.822066307067871, + "step": 12607 + }, + { + "epoch": 3.15, + "grad_norm": 6.407470226287842, + "learning_rate": 3.0016805841328845e-06, + "logits/chosen": -0.5266379714012146, + "logits/rejected": -0.5603711605072021, + "logps/chosen": -51.91926193237305, + "logps/rejected": -99.17306518554688, + "loss": 0.642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031432628631592, + "rewards/margins": 5.618980407714844, + "rewards/rejected": -2.58754825592041, + "step": 12608 + }, + { + "epoch": 3.15, + "grad_norm": 3.5668232440948486, + "learning_rate": 3.0009601240005247e-06, + "logits/chosen": -0.4521004557609558, + "logits/rejected": -0.5393164753913879, + "logps/chosen": -55.04985046386719, + "logps/rejected": -96.01446533203125, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.006493330001831, + "rewards/margins": 6.890120983123779, + "rewards/rejected": -3.8836276531219482, + "step": 12609 + }, + { + "epoch": 3.15, + "grad_norm": 10.805238723754883, + "learning_rate": 3.000239713266566e-06, + "logits/chosen": -0.6057804226875305, + "logits/rejected": -0.6425546407699585, + "logps/chosen": -55.894901275634766, + "logps/rejected": -111.11090087890625, + "loss": 0.7522, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9543559551239014, + "rewards/margins": 6.908387184143066, + "rewards/rejected": -3.954031229019165, + "step": 12610 + }, + { + "epoch": 3.15, + "grad_norm": 2.6558077335357666, + "learning_rate": 2.999519351948813e-06, + "logits/chosen": -0.5183449983596802, + "logits/rejected": -0.5842321515083313, + "logps/chosen": -50.13905334472656, + "logps/rejected": -100.77517700195312, + "loss": 0.5419, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.042916774749756, + "rewards/margins": 6.629148960113525, + "rewards/rejected": -3.5862324237823486, + "step": 12611 + }, + { + "epoch": 3.16, + "grad_norm": 4.507111549377441, + "learning_rate": 2.998799040065063e-06, + "logits/chosen": -0.5745436549186707, + "logits/rejected": -0.6767488121986389, + "logps/chosen": -75.51356506347656, + "logps/rejected": -110.31781768798828, + "loss": 0.6918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.931997537612915, + "rewards/margins": 6.4342851638793945, + "rewards/rejected": -3.5022878646850586, + "step": 12612 + }, + { + "epoch": 3.16, + "grad_norm": 3.5999741554260254, + "learning_rate": 2.998078777633117e-06, + "logits/chosen": -0.5035092830657959, + "logits/rejected": -0.5842087268829346, + "logps/chosen": -51.18342971801758, + "logps/rejected": -93.38430786132812, + "loss": 0.5841, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.208200693130493, + "rewards/margins": 6.350511074066162, + "rewards/rejected": -3.14231014251709, + "step": 12613 + }, + { + "epoch": 3.16, + "grad_norm": 6.757107257843018, + "learning_rate": 2.9973585646707738e-06, + "logits/chosen": -0.5284115076065063, + "logits/rejected": -0.6487216353416443, + "logps/chosen": -61.387359619140625, + "logps/rejected": -93.32734680175781, + "loss": 0.6484, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.12660551071167, + "rewards/margins": 6.566338539123535, + "rewards/rejected": -3.4397330284118652, + "step": 12614 + }, + { + "epoch": 3.16, + "grad_norm": 2.846353530883789, + "learning_rate": 2.996638401195833e-06, + "logits/chosen": -0.4346965551376343, + "logits/rejected": -0.5757690072059631, + "logps/chosen": -52.85655975341797, + "logps/rejected": -91.41424560546875, + "loss": 0.5928, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1186084747314453, + "rewards/margins": 7.80087423324585, + "rewards/rejected": -4.682265758514404, + "step": 12615 + }, + { + "epoch": 3.16, + "grad_norm": 4.949540615081787, + "learning_rate": 2.9959182872260845e-06, + "logits/chosen": -0.45761895179748535, + "logits/rejected": -0.5641943216323853, + "logps/chosen": -53.39213180541992, + "logps/rejected": -124.1243896484375, + "loss": 0.5719, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4110960960388184, + "rewards/margins": 8.862237930297852, + "rewards/rejected": -5.451141834259033, + "step": 12616 + }, + { + "epoch": 3.16, + "grad_norm": 4.770196437835693, + "learning_rate": 2.9951982227793286e-06, + "logits/chosen": -0.4286099672317505, + "logits/rejected": -0.5235715508460999, + "logps/chosen": -59.18553161621094, + "logps/rejected": -99.13556671142578, + "loss": 0.6724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.002424716949463, + "rewards/margins": 4.588232517242432, + "rewards/rejected": -1.5858080387115479, + "step": 12617 + }, + { + "epoch": 3.16, + "grad_norm": 2.601752996444702, + "learning_rate": 2.9944782078733584e-06, + "logits/chosen": -0.420896977186203, + "logits/rejected": -0.5182231664657593, + "logps/chosen": -58.2299919128418, + "logps/rejected": -106.54466247558594, + "loss": 0.58, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1333320140838623, + "rewards/margins": 6.941570281982422, + "rewards/rejected": -3.8082380294799805, + "step": 12618 + }, + { + "epoch": 3.16, + "grad_norm": 2.2077367305755615, + "learning_rate": 2.9937582425259625e-06, + "logits/chosen": -0.5176001787185669, + "logits/rejected": -0.6324183344841003, + "logps/chosen": -48.16889190673828, + "logps/rejected": -114.78657531738281, + "loss": 0.5412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.879979133605957, + "rewards/margins": 8.653546333312988, + "rewards/rejected": -5.773568153381348, + "step": 12619 + }, + { + "epoch": 3.16, + "grad_norm": 28.899328231811523, + "learning_rate": 2.9930383267549345e-06, + "logits/chosen": -0.5167158246040344, + "logits/rejected": -0.5958857536315918, + "logps/chosen": -51.80950927734375, + "logps/rejected": -95.10111999511719, + "loss": 0.6033, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.523831367492676, + "rewards/margins": 6.72675895690918, + "rewards/rejected": -3.202927350997925, + "step": 12620 + }, + { + "epoch": 3.16, + "grad_norm": 2.455235719680786, + "learning_rate": 2.9923184605780654e-06, + "logits/chosen": -0.5308113694190979, + "logits/rejected": -0.6196911334991455, + "logps/chosen": -57.305274963378906, + "logps/rejected": -110.20632934570312, + "loss": 0.6262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1688480377197266, + "rewards/margins": 6.89919900894165, + "rewards/rejected": -3.730351448059082, + "step": 12621 + }, + { + "epoch": 3.16, + "grad_norm": 3.0154354572296143, + "learning_rate": 2.991598644013143e-06, + "logits/chosen": -0.4956808090209961, + "logits/rejected": -0.5698937177658081, + "logps/chosen": -53.50096130371094, + "logps/rejected": -119.31878662109375, + "loss": 0.5366, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.941605567932129, + "rewards/margins": 7.626618385314941, + "rewards/rejected": -4.6850128173828125, + "step": 12622 + }, + { + "epoch": 3.16, + "grad_norm": 5.676424026489258, + "learning_rate": 2.990878877077953e-06, + "logits/chosen": -0.5748900175094604, + "logits/rejected": -0.643574059009552, + "logps/chosen": -55.40236282348633, + "logps/rejected": -104.23924255371094, + "loss": 0.6067, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.44089412689209, + "rewards/margins": 6.816303253173828, + "rewards/rejected": -3.375408411026001, + "step": 12623 + }, + { + "epoch": 3.16, + "grad_norm": 2.6612277030944824, + "learning_rate": 2.9901591597902834e-06, + "logits/chosen": -0.4299215078353882, + "logits/rejected": -0.5735971331596375, + "logps/chosen": -53.02193069458008, + "logps/rejected": -91.8287124633789, + "loss": 0.5557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1534533500671387, + "rewards/margins": 7.8244853019714355, + "rewards/rejected": -4.671031951904297, + "step": 12624 + }, + { + "epoch": 3.16, + "grad_norm": 5.006731986999512, + "learning_rate": 2.9894394921679194e-06, + "logits/chosen": -0.5191113948822021, + "logits/rejected": -0.6085078716278076, + "logps/chosen": -46.44567108154297, + "logps/rejected": -94.90756225585938, + "loss": 0.6202, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8161611557006836, + "rewards/margins": 6.932678699493408, + "rewards/rejected": -4.116518020629883, + "step": 12625 + }, + { + "epoch": 3.16, + "grad_norm": 1.9311678409576416, + "learning_rate": 2.988719874228643e-06, + "logits/chosen": -0.49969470500946045, + "logits/rejected": -0.6018741726875305, + "logps/chosen": -53.79996871948242, + "logps/rejected": -100.66130828857422, + "loss": 0.5906, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.287075996398926, + "rewards/margins": 8.267047882080078, + "rewards/rejected": -4.979973316192627, + "step": 12626 + }, + { + "epoch": 3.16, + "grad_norm": 5.330454349517822, + "learning_rate": 2.9880003059902384e-06, + "logits/chosen": -0.5026464462280273, + "logits/rejected": -0.5707126259803772, + "logps/chosen": -52.80833053588867, + "logps/rejected": -104.00284576416016, + "loss": 0.622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.152873992919922, + "rewards/margins": 7.70638370513916, + "rewards/rejected": -4.553509712219238, + "step": 12627 + }, + { + "epoch": 3.16, + "grad_norm": 6.170112609863281, + "learning_rate": 2.9872807874704845e-06, + "logits/chosen": -0.5183160305023193, + "logits/rejected": -0.5632377862930298, + "logps/chosen": -57.65780258178711, + "logps/rejected": -119.9657974243164, + "loss": 0.6713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0701448917388916, + "rewards/margins": 6.151247978210449, + "rewards/rejected": -3.0811028480529785, + "step": 12628 + }, + { + "epoch": 3.16, + "grad_norm": 7.6959614753723145, + "learning_rate": 2.986561318687167e-06, + "logits/chosen": -0.5558061003684998, + "logits/rejected": -0.6668286919593811, + "logps/chosen": -55.60978317260742, + "logps/rejected": -101.62991333007812, + "loss": 0.7946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.312326431274414, + "rewards/margins": 6.727868556976318, + "rewards/rejected": -3.415541410446167, + "step": 12629 + }, + { + "epoch": 3.16, + "grad_norm": 4.261108875274658, + "learning_rate": 2.9858418996580573e-06, + "logits/chosen": -0.5279086828231812, + "logits/rejected": -0.5922385454177856, + "logps/chosen": -48.7641716003418, + "logps/rejected": -130.99642944335938, + "loss": 0.5725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1738762855529785, + "rewards/margins": 8.325342178344727, + "rewards/rejected": -5.15146541595459, + "step": 12630 + }, + { + "epoch": 3.16, + "grad_norm": 3.0949063301086426, + "learning_rate": 2.9851225304009378e-06, + "logits/chosen": -0.5227802395820618, + "logits/rejected": -0.6137189865112305, + "logps/chosen": -51.33100891113281, + "logps/rejected": -111.10760498046875, + "loss": 0.5257, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.137143135070801, + "rewards/margins": 6.933450222015381, + "rewards/rejected": -3.796306610107422, + "step": 12631 + }, + { + "epoch": 3.16, + "grad_norm": 5.681519985198975, + "learning_rate": 2.984403210933585e-06, + "logits/chosen": -0.5766907930374146, + "logits/rejected": -0.6266763806343079, + "logps/chosen": -49.75129318237305, + "logps/rejected": -119.32994842529297, + "loss": 0.6345, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4437384605407715, + "rewards/margins": 7.041001319885254, + "rewards/rejected": -3.5972628593444824, + "step": 12632 + }, + { + "epoch": 3.16, + "grad_norm": 9.272233009338379, + "learning_rate": 2.983683941273771e-06, + "logits/chosen": -0.5983734726905823, + "logits/rejected": -0.6848708987236023, + "logps/chosen": -51.82810974121094, + "logps/rejected": -86.24527740478516, + "loss": 0.703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7615396976470947, + "rewards/margins": 4.823892593383789, + "rewards/rejected": -2.0623528957366943, + "step": 12633 + }, + { + "epoch": 3.16, + "grad_norm": 2.604139804840088, + "learning_rate": 2.982964721439272e-06, + "logits/chosen": -0.5513470768928528, + "logits/rejected": -0.6932717561721802, + "logps/chosen": -57.77009582519531, + "logps/rejected": -87.01290130615234, + "loss": 0.5826, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1804306507110596, + "rewards/margins": 7.1272711753845215, + "rewards/rejected": -3.94684100151062, + "step": 12634 + }, + { + "epoch": 3.16, + "grad_norm": 4.189706325531006, + "learning_rate": 2.98224555144786e-06, + "logits/chosen": -0.5057454109191895, + "logits/rejected": -0.581357479095459, + "logps/chosen": -68.75440216064453, + "logps/rejected": -98.24569702148438, + "loss": 0.7166, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.501554489135742, + "rewards/margins": 5.667773723602295, + "rewards/rejected": -2.1662189960479736, + "step": 12635 + }, + { + "epoch": 3.16, + "grad_norm": 4.709486961364746, + "learning_rate": 2.9815264313173075e-06, + "logits/chosen": -0.5104873180389404, + "logits/rejected": -0.6190637946128845, + "logps/chosen": -60.38585662841797, + "logps/rejected": -94.0560302734375, + "loss": 0.6837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9585742950439453, + "rewards/margins": 6.829350471496582, + "rewards/rejected": -3.8707759380340576, + "step": 12636 + }, + { + "epoch": 3.16, + "grad_norm": 4.44421911239624, + "learning_rate": 2.980807361065383e-06, + "logits/chosen": -0.4968016445636749, + "logits/rejected": -0.6138738989830017, + "logps/chosen": -63.52813720703125, + "logps/rejected": -87.06236267089844, + "loss": 0.6483, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1724014282226562, + "rewards/margins": 5.818088054656982, + "rewards/rejected": -2.645686626434326, + "step": 12637 + }, + { + "epoch": 3.16, + "grad_norm": 2.2530863285064697, + "learning_rate": 2.9800883407098565e-06, + "logits/chosen": -0.5185598134994507, + "logits/rejected": -0.6414888501167297, + "logps/chosen": -54.995479583740234, + "logps/rejected": -90.94010925292969, + "loss": 0.5599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0567750930786133, + "rewards/margins": 7.263791084289551, + "rewards/rejected": -4.207016468048096, + "step": 12638 + }, + { + "epoch": 3.16, + "grad_norm": 4.110376834869385, + "learning_rate": 2.979369370268495e-06, + "logits/chosen": -0.5561865568161011, + "logits/rejected": -0.5684505105018616, + "logps/chosen": -52.262123107910156, + "logps/rejected": -111.85197448730469, + "loss": 0.6803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0446810722351074, + "rewards/margins": 6.708107948303223, + "rewards/rejected": -3.6634268760681152, + "step": 12639 + }, + { + "epoch": 3.16, + "grad_norm": 2.4995594024658203, + "learning_rate": 2.9786504497590674e-06, + "logits/chosen": -0.48724862933158875, + "logits/rejected": -0.5794005990028381, + "logps/chosen": -51.809852600097656, + "logps/rejected": -124.07733154296875, + "loss": 0.5554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0308539867401123, + "rewards/margins": 8.356982231140137, + "rewards/rejected": -5.326128005981445, + "step": 12640 + }, + { + "epoch": 3.16, + "grad_norm": 4.307213306427002, + "learning_rate": 2.977931579199336e-06, + "logits/chosen": -0.49348628520965576, + "logits/rejected": -0.5719662308692932, + "logps/chosen": -50.626319885253906, + "logps/rejected": -118.82302856445312, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5241401195526123, + "rewards/margins": 8.056930541992188, + "rewards/rejected": -4.532789707183838, + "step": 12641 + }, + { + "epoch": 3.16, + "grad_norm": 4.226978302001953, + "learning_rate": 2.9772127586070642e-06, + "logits/chosen": -0.5922490954399109, + "logits/rejected": -0.6704416275024414, + "logps/chosen": -46.42408752441406, + "logps/rejected": -94.87105560302734, + "loss": 0.6229, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119117021560669, + "rewards/margins": 6.919398307800293, + "rewards/rejected": -3.8002817630767822, + "step": 12642 + }, + { + "epoch": 3.16, + "grad_norm": 6.116229057312012, + "learning_rate": 2.9764939880000203e-06, + "logits/chosen": -0.5328854918479919, + "logits/rejected": -0.5768833756446838, + "logps/chosen": -57.71635055541992, + "logps/rejected": -103.31929779052734, + "loss": 0.7617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.151047468185425, + "rewards/margins": 5.324761390686035, + "rewards/rejected": -2.1737136840820312, + "step": 12643 + }, + { + "epoch": 3.16, + "grad_norm": 5.5818257331848145, + "learning_rate": 2.9757752673959605e-06, + "logits/chosen": -0.5912582278251648, + "logits/rejected": -0.6887488961219788, + "logps/chosen": -51.142391204833984, + "logps/rejected": -103.11528015136719, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.708576202392578, + "rewards/margins": 6.987258434295654, + "rewards/rejected": -4.278681755065918, + "step": 12644 + }, + { + "epoch": 3.16, + "grad_norm": 5.46103048324585, + "learning_rate": 2.9750565968126453e-06, + "logits/chosen": -0.5215710401535034, + "logits/rejected": -0.5782884955406189, + "logps/chosen": -51.38574981689453, + "logps/rejected": -103.1534652709961, + "loss": 0.6014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5050806999206543, + "rewards/margins": 6.413137912750244, + "rewards/rejected": -2.908057689666748, + "step": 12645 + }, + { + "epoch": 3.16, + "grad_norm": 3.496523857116699, + "learning_rate": 2.974337976267837e-06, + "logits/chosen": -0.5313763618469238, + "logits/rejected": -0.5573847889900208, + "logps/chosen": -45.482364654541016, + "logps/rejected": -111.20726013183594, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140625238418579, + "rewards/margins": 7.174164772033691, + "rewards/rejected": -4.033539295196533, + "step": 12646 + }, + { + "epoch": 3.16, + "grad_norm": 4.277713298797607, + "learning_rate": 2.973619405779292e-06, + "logits/chosen": -0.5369908213615417, + "logits/rejected": -0.6578345894813538, + "logps/chosen": -60.92598342895508, + "logps/rejected": -95.33716583251953, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0632073879241943, + "rewards/margins": 6.931509017944336, + "rewards/rejected": -3.8683009147644043, + "step": 12647 + }, + { + "epoch": 3.16, + "grad_norm": 15.665181159973145, + "learning_rate": 2.9729008853647668e-06, + "logits/chosen": -0.4758908152580261, + "logits/rejected": -0.5582760572433472, + "logps/chosen": -62.52355194091797, + "logps/rejected": -112.80184936523438, + "loss": 0.5953, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7908260822296143, + "rewards/margins": 6.927541255950928, + "rewards/rejected": -4.136714458465576, + "step": 12648 + }, + { + "epoch": 3.16, + "grad_norm": 6.096712589263916, + "learning_rate": 2.9721824150420166e-06, + "logits/chosen": -0.5780421495437622, + "logits/rejected": -0.6916259527206421, + "logps/chosen": -65.23819732666016, + "logps/rejected": -96.8751220703125, + "loss": 0.6351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.320258140563965, + "rewards/margins": 7.5276408195495605, + "rewards/rejected": -4.207383155822754, + "step": 12649 + }, + { + "epoch": 3.16, + "grad_norm": 6.543191909790039, + "learning_rate": 2.971463994828797e-06, + "logits/chosen": -0.4951154291629791, + "logits/rejected": -0.5693551301956177, + "logps/chosen": -58.760215759277344, + "logps/rejected": -94.70577239990234, + "loss": 0.6936, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.936089277267456, + "rewards/margins": 6.17518949508667, + "rewards/rejected": -3.239100217819214, + "step": 12650 + }, + { + "epoch": 3.16, + "grad_norm": 4.574378967285156, + "learning_rate": 2.9707456247428577e-06, + "logits/chosen": -0.524754524230957, + "logits/rejected": -0.6405380964279175, + "logps/chosen": -73.59215545654297, + "logps/rejected": -121.8150863647461, + "loss": 0.708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990485906600952, + "rewards/margins": 7.730220317840576, + "rewards/rejected": -4.739734172821045, + "step": 12651 + }, + { + "epoch": 3.17, + "grad_norm": 6.835792064666748, + "learning_rate": 2.9700273048019533e-06, + "logits/chosen": -0.47804003953933716, + "logits/rejected": -0.5602908730506897, + "logps/chosen": -64.47542572021484, + "logps/rejected": -111.11183166503906, + "loss": 0.7665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.146543025970459, + "rewards/margins": 5.968827247619629, + "rewards/rejected": -2.822283983230591, + "step": 12652 + }, + { + "epoch": 3.17, + "grad_norm": 4.167879104614258, + "learning_rate": 2.9693090350238333e-06, + "logits/chosen": -0.5013846755027771, + "logits/rejected": -0.5423811078071594, + "logps/chosen": -55.64948654174805, + "logps/rejected": -103.0887451171875, + "loss": 0.6168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9803152084350586, + "rewards/margins": 6.700155258178711, + "rewards/rejected": -3.7198400497436523, + "step": 12653 + }, + { + "epoch": 3.17, + "grad_norm": 3.5458719730377197, + "learning_rate": 2.9685908154262482e-06, + "logits/chosen": -0.4316982626914978, + "logits/rejected": -0.559866189956665, + "logps/chosen": -56.90076446533203, + "logps/rejected": -105.74726867675781, + "loss": 0.6737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3949694633483887, + "rewards/margins": 7.96557092666626, + "rewards/rejected": -4.570601463317871, + "step": 12654 + }, + { + "epoch": 3.17, + "grad_norm": 4.556604862213135, + "learning_rate": 2.9678726460269438e-06, + "logits/chosen": -0.52980637550354, + "logits/rejected": -0.6532623767852783, + "logps/chosen": -57.82380294799805, + "logps/rejected": -106.26596069335938, + "loss": 0.6186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3153769969940186, + "rewards/margins": 7.08029842376709, + "rewards/rejected": -3.7649216651916504, + "step": 12655 + }, + { + "epoch": 3.17, + "grad_norm": 17.129613876342773, + "learning_rate": 2.9671545268436674e-06, + "logits/chosen": -0.5486330986022949, + "logits/rejected": -0.6087278723716736, + "logps/chosen": -50.994361877441406, + "logps/rejected": -84.30958557128906, + "loss": 0.5896, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.41513991355896, + "rewards/margins": 5.771966457366943, + "rewards/rejected": -2.3568267822265625, + "step": 12656 + }, + { + "epoch": 3.17, + "grad_norm": 8.736377716064453, + "learning_rate": 2.9664364578941658e-06, + "logits/chosen": -0.5669911503791809, + "logits/rejected": -0.6130839586257935, + "logps/chosen": -55.45481872558594, + "logps/rejected": -127.04882049560547, + "loss": 0.695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.13592529296875, + "rewards/margins": 8.458237648010254, + "rewards/rejected": -5.322311878204346, + "step": 12657 + }, + { + "epoch": 3.17, + "grad_norm": 2.997028350830078, + "learning_rate": 2.965718439196181e-06, + "logits/chosen": -0.5119302868843079, + "logits/rejected": -0.6355559825897217, + "logps/chosen": -58.09105682373047, + "logps/rejected": -82.90144348144531, + "loss": 0.5944, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1668508052825928, + "rewards/margins": 6.6839680671691895, + "rewards/rejected": -3.5171167850494385, + "step": 12658 + }, + { + "epoch": 3.17, + "grad_norm": 23.63650894165039, + "learning_rate": 2.9650004707674574e-06, + "logits/chosen": -0.6089591383934021, + "logits/rejected": -0.6899250745773315, + "logps/chosen": -46.87248229980469, + "logps/rejected": -119.20179748535156, + "loss": 0.6301, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8655624389648438, + "rewards/margins": 7.30734920501709, + "rewards/rejected": -4.441786766052246, + "step": 12659 + }, + { + "epoch": 3.17, + "grad_norm": 6.92261266708374, + "learning_rate": 2.9642825526257367e-06, + "logits/chosen": -0.524182915687561, + "logits/rejected": -0.5827988982200623, + "logps/chosen": -59.614013671875, + "logps/rejected": -102.7854995727539, + "loss": 0.6685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.83378005027771, + "rewards/margins": 6.893037796020508, + "rewards/rejected": -4.059257507324219, + "step": 12660 + }, + { + "epoch": 3.17, + "grad_norm": 6.767139911651611, + "learning_rate": 2.9635646847887623e-06, + "logits/chosen": -0.5402112007141113, + "logits/rejected": -0.6075243353843689, + "logps/chosen": -56.8043212890625, + "logps/rejected": -104.52760314941406, + "loss": 0.7731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.202448844909668, + "rewards/margins": 6.563971042633057, + "rewards/rejected": -3.3615221977233887, + "step": 12661 + }, + { + "epoch": 3.17, + "grad_norm": 9.58035659790039, + "learning_rate": 2.962846867274267e-06, + "logits/chosen": -0.4694063067436218, + "logits/rejected": -0.5397243499755859, + "logps/chosen": -65.87736511230469, + "logps/rejected": -100.42770385742188, + "loss": 0.831, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.831170082092285, + "rewards/margins": 5.218342304229736, + "rewards/rejected": -2.387171745300293, + "step": 12662 + }, + { + "epoch": 3.17, + "grad_norm": 5.4137492179870605, + "learning_rate": 2.962129100099993e-06, + "logits/chosen": -0.472789466381073, + "logits/rejected": -0.5127564668655396, + "logps/chosen": -61.071754455566406, + "logps/rejected": -100.22745513916016, + "loss": 0.7184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0343849658966064, + "rewards/margins": 5.434079170227051, + "rewards/rejected": -2.3996942043304443, + "step": 12663 + }, + { + "epoch": 3.17, + "grad_norm": 2.6998581886291504, + "learning_rate": 2.9614113832836773e-06, + "logits/chosen": -0.5054972171783447, + "logits/rejected": -0.5862084031105042, + "logps/chosen": -54.6541862487793, + "logps/rejected": -108.60157775878906, + "loss": 0.539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168497323989868, + "rewards/margins": 7.280341148376465, + "rewards/rejected": -4.111843109130859, + "step": 12664 + }, + { + "epoch": 3.17, + "grad_norm": 5.2420783042907715, + "learning_rate": 2.960693716843055e-06, + "logits/chosen": -0.5194060206413269, + "logits/rejected": -0.6489611864089966, + "logps/chosen": -55.85274124145508, + "logps/rejected": -99.76918029785156, + "loss": 0.5609, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8121955394744873, + "rewards/margins": 7.206945419311523, + "rewards/rejected": -4.394749641418457, + "step": 12665 + }, + { + "epoch": 3.17, + "grad_norm": 7.3094000816345215, + "learning_rate": 2.959976100795859e-06, + "logits/chosen": -0.526902437210083, + "logits/rejected": -0.6375340819358826, + "logps/chosen": -54.55916976928711, + "logps/rejected": -101.2605972290039, + "loss": 0.647, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8291215896606445, + "rewards/margins": 7.237569808959961, + "rewards/rejected": -4.408448219299316, + "step": 12666 + }, + { + "epoch": 3.17, + "grad_norm": 7.013710021972656, + "learning_rate": 2.9592585351598248e-06, + "logits/chosen": -0.5472573041915894, + "logits/rejected": -0.5738552808761597, + "logps/chosen": -52.719085693359375, + "logps/rejected": -99.75863647460938, + "loss": 0.6827, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5071768760681152, + "rewards/margins": 5.9401044845581055, + "rewards/rejected": -2.4329276084899902, + "step": 12667 + }, + { + "epoch": 3.17, + "grad_norm": 10.861274719238281, + "learning_rate": 2.9585410199526825e-06, + "logits/chosen": -0.4884764850139618, + "logits/rejected": -0.578953742980957, + "logps/chosen": -57.72185516357422, + "logps/rejected": -102.84913635253906, + "loss": 0.8093, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.713430643081665, + "rewards/margins": 6.193639755249023, + "rewards/rejected": -3.4802088737487793, + "step": 12668 + }, + { + "epoch": 3.17, + "grad_norm": 8.090185165405273, + "learning_rate": 2.9578235551921624e-06, + "logits/chosen": -0.5608183145523071, + "logits/rejected": -0.6190318465232849, + "logps/chosen": -57.40636444091797, + "logps/rejected": -83.16619110107422, + "loss": 0.8707, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.180302381515503, + "rewards/margins": 5.013369560241699, + "rewards/rejected": -1.833067536354065, + "step": 12669 + }, + { + "epoch": 3.17, + "grad_norm": 8.26970100402832, + "learning_rate": 2.9571061408959943e-06, + "logits/chosen": -0.5795595645904541, + "logits/rejected": -0.6543583273887634, + "logps/chosen": -47.34921646118164, + "logps/rejected": -103.09453582763672, + "loss": 0.6816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0856125354766846, + "rewards/margins": 6.983062744140625, + "rewards/rejected": -3.8974509239196777, + "step": 12670 + }, + { + "epoch": 3.17, + "grad_norm": 5.894681453704834, + "learning_rate": 2.9563887770819056e-06, + "logits/chosen": -0.4432068467140198, + "logits/rejected": -0.4852571487426758, + "logps/chosen": -56.75392150878906, + "logps/rejected": -114.29324340820312, + "loss": 0.6731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158337354660034, + "rewards/margins": 6.061711311340332, + "rewards/rejected": -2.9033737182617188, + "step": 12671 + }, + { + "epoch": 3.17, + "grad_norm": 1.855864405632019, + "learning_rate": 2.9556714637676264e-06, + "logits/chosen": -0.4940272569656372, + "logits/rejected": -0.5697392821311951, + "logps/chosen": -43.82373809814453, + "logps/rejected": -91.64421081542969, + "loss": 0.5272, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.206162452697754, + "rewards/margins": 6.645105838775635, + "rewards/rejected": -3.4389426708221436, + "step": 12672 + }, + { + "epoch": 3.17, + "grad_norm": 9.156449317932129, + "learning_rate": 2.954954200970879e-06, + "logits/chosen": -0.48956233263015747, + "logits/rejected": -0.5927630066871643, + "logps/chosen": -54.96630096435547, + "logps/rejected": -75.38601684570312, + "loss": 0.844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.092707872390747, + "rewards/margins": 4.996151924133301, + "rewards/rejected": -1.9034439325332642, + "step": 12673 + }, + { + "epoch": 3.17, + "grad_norm": 4.768862247467041, + "learning_rate": 2.954236988709386e-06, + "logits/chosen": -0.6448309421539307, + "logits/rejected": -0.7102920413017273, + "logps/chosen": -50.507930755615234, + "logps/rejected": -130.1931610107422, + "loss": 0.6618, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.184474229812622, + "rewards/margins": 8.464529037475586, + "rewards/rejected": -5.280054092407227, + "step": 12674 + }, + { + "epoch": 3.17, + "grad_norm": 4.369210720062256, + "learning_rate": 2.953519827000877e-06, + "logits/chosen": -0.5796493291854858, + "logits/rejected": -0.5959301590919495, + "logps/chosen": -52.80753707885742, + "logps/rejected": -115.4452896118164, + "loss": 0.6324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.332385540008545, + "rewards/margins": 6.248071670532227, + "rewards/rejected": -2.91568660736084, + "step": 12675 + }, + { + "epoch": 3.17, + "grad_norm": 4.010849475860596, + "learning_rate": 2.952802715863066e-06, + "logits/chosen": -0.501467227935791, + "logits/rejected": -0.5985539555549622, + "logps/chosen": -52.17485427856445, + "logps/rejected": -106.97168731689453, + "loss": 0.5729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9549765586853027, + "rewards/margins": 6.929116249084473, + "rewards/rejected": -3.974140167236328, + "step": 12676 + }, + { + "epoch": 3.17, + "grad_norm": 3.0899784564971924, + "learning_rate": 2.952085655313679e-06, + "logits/chosen": -0.5141963958740234, + "logits/rejected": -0.5830714106559753, + "logps/chosen": -49.113914489746094, + "logps/rejected": -134.821533203125, + "loss": 0.4807, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.000027894973755, + "rewards/margins": 8.286917686462402, + "rewards/rejected": -5.286890029907227, + "step": 12677 + }, + { + "epoch": 3.17, + "grad_norm": 4.132833957672119, + "learning_rate": 2.9513686453704337e-06, + "logits/chosen": -0.5588392019271851, + "logits/rejected": -0.6915112733840942, + "logps/chosen": -61.98146057128906, + "logps/rejected": -87.92120361328125, + "loss": 0.6757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0859920978546143, + "rewards/margins": 6.775843620300293, + "rewards/rejected": -3.689851760864258, + "step": 12678 + }, + { + "epoch": 3.17, + "grad_norm": 4.656584739685059, + "learning_rate": 2.950651686051049e-06, + "logits/chosen": -0.5893104076385498, + "logits/rejected": -0.6658029556274414, + "logps/chosen": -51.261234283447266, + "logps/rejected": -95.49494934082031, + "loss": 0.6049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1404261589050293, + "rewards/margins": 6.458987712860107, + "rewards/rejected": -3.318561553955078, + "step": 12679 + }, + { + "epoch": 3.17, + "grad_norm": 7.070308685302734, + "learning_rate": 2.9499347773732393e-06, + "logits/chosen": -0.5897578597068787, + "logits/rejected": -0.5968813896179199, + "logps/chosen": -62.186744689941406, + "logps/rejected": -132.75274658203125, + "loss": 0.7484, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9305906295776367, + "rewards/margins": 7.012363910675049, + "rewards/rejected": -4.081772804260254, + "step": 12680 + }, + { + "epoch": 3.17, + "grad_norm": 13.957173347473145, + "learning_rate": 2.9492179193547225e-06, + "logits/chosen": -0.5690934658050537, + "logits/rejected": -0.6125407218933105, + "logps/chosen": -48.377201080322266, + "logps/rejected": -110.96185302734375, + "loss": 0.6729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.86488676071167, + "rewards/margins": 6.619225025177002, + "rewards/rejected": -3.754338502883911, + "step": 12681 + }, + { + "epoch": 3.17, + "grad_norm": 3.8325111865997314, + "learning_rate": 2.9485011120132133e-06, + "logits/chosen": -0.5771229863166809, + "logits/rejected": -0.6522119641304016, + "logps/chosen": -53.84840774536133, + "logps/rejected": -106.9103775024414, + "loss": 0.5974, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7806334495544434, + "rewards/margins": 7.008902072906494, + "rewards/rejected": -4.228268623352051, + "step": 12682 + }, + { + "epoch": 3.17, + "grad_norm": 2.771454334259033, + "learning_rate": 2.9477843553664227e-06, + "logits/chosen": -0.49287599325180054, + "logits/rejected": -0.5612442493438721, + "logps/chosen": -68.53247833251953, + "logps/rejected": -116.45189666748047, + "loss": 0.6841, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3575525283813477, + "rewards/margins": 6.9868879318237305, + "rewards/rejected": -3.6293349266052246, + "step": 12683 + }, + { + "epoch": 3.17, + "grad_norm": 5.179327011108398, + "learning_rate": 2.9470676494320634e-06, + "logits/chosen": -0.5171204209327698, + "logits/rejected": -0.5637021064758301, + "logps/chosen": -52.66828155517578, + "logps/rejected": -120.56382751464844, + "loss": 0.5997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2748830318450928, + "rewards/margins": 8.192682266235352, + "rewards/rejected": -4.917799949645996, + "step": 12684 + }, + { + "epoch": 3.17, + "grad_norm": 3.5940499305725098, + "learning_rate": 2.9463509942278444e-06, + "logits/chosen": -0.5489360094070435, + "logits/rejected": -0.6454514265060425, + "logps/chosen": -43.044578552246094, + "logps/rejected": -104.64241027832031, + "loss": 0.5146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2914960384368896, + "rewards/margins": 7.94505500793457, + "rewards/rejected": -4.65355920791626, + "step": 12685 + }, + { + "epoch": 3.17, + "grad_norm": 8.07119369506836, + "learning_rate": 2.9456343897714813e-06, + "logits/chosen": -0.4544318914413452, + "logits/rejected": -0.5154517889022827, + "logps/chosen": -57.717041015625, + "logps/rejected": -106.44685363769531, + "loss": 0.6874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.31601619720459, + "rewards/margins": 6.780308246612549, + "rewards/rejected": -3.4642930030822754, + "step": 12686 + }, + { + "epoch": 3.17, + "grad_norm": 4.322849273681641, + "learning_rate": 2.9449178360806742e-06, + "logits/chosen": -0.5312697887420654, + "logits/rejected": -0.5662198662757874, + "logps/chosen": -43.88032531738281, + "logps/rejected": -92.66766357421875, + "loss": 0.597, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7518599033355713, + "rewards/margins": 5.867132186889648, + "rewards/rejected": -3.115272045135498, + "step": 12687 + }, + { + "epoch": 3.17, + "grad_norm": 3.8689072132110596, + "learning_rate": 2.9442013331731325e-06, + "logits/chosen": -0.5295358896255493, + "logits/rejected": -0.601830005645752, + "logps/chosen": -46.74903106689453, + "logps/rejected": -95.24566650390625, + "loss": 0.578, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.280120849609375, + "rewards/margins": 6.863023281097412, + "rewards/rejected": -3.5829029083251953, + "step": 12688 + }, + { + "epoch": 3.17, + "grad_norm": 3.2444703578948975, + "learning_rate": 2.943484881066564e-06, + "logits/chosen": -0.5021909475326538, + "logits/rejected": -0.5678807497024536, + "logps/chosen": -47.1054573059082, + "logps/rejected": -99.83013916015625, + "loss": 0.5232, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1222453117370605, + "rewards/margins": 6.934218406677246, + "rewards/rejected": -3.8119728565216064, + "step": 12689 + }, + { + "epoch": 3.17, + "grad_norm": 6.389464378356934, + "learning_rate": 2.9427684797786716e-06, + "logits/chosen": -0.47512906789779663, + "logits/rejected": -0.5492373108863831, + "logps/chosen": -64.17159271240234, + "logps/rejected": -119.15006256103516, + "loss": 0.6421, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9146175384521484, + "rewards/margins": 6.621243000030518, + "rewards/rejected": -3.706624984741211, + "step": 12690 + }, + { + "epoch": 3.17, + "grad_norm": 4.300820827484131, + "learning_rate": 2.9420521293271576e-06, + "logits/chosen": -0.561854362487793, + "logits/rejected": -0.648792028427124, + "logps/chosen": -54.191558837890625, + "logps/rejected": -112.77264404296875, + "loss": 0.6198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3637778759002686, + "rewards/margins": 7.031469345092773, + "rewards/rejected": -3.667691707611084, + "step": 12691 + }, + { + "epoch": 3.18, + "grad_norm": 2.7140491008758545, + "learning_rate": 2.9413358297297243e-06, + "logits/chosen": -0.5239225029945374, + "logits/rejected": -0.6161013245582581, + "logps/chosen": -52.0390625, + "logps/rejected": -100.66458892822266, + "loss": 0.5547, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9553749561309814, + "rewards/margins": 7.314043998718262, + "rewards/rejected": -4.358668327331543, + "step": 12692 + }, + { + "epoch": 3.18, + "grad_norm": 5.1395649909973145, + "learning_rate": 2.940619581004074e-06, + "logits/chosen": -0.4145021438598633, + "logits/rejected": -0.4983065128326416, + "logps/chosen": -67.18370819091797, + "logps/rejected": -122.3144302368164, + "loss": 0.7029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.093090057373047, + "rewards/margins": 7.327773571014404, + "rewards/rejected": -4.234683513641357, + "step": 12693 + }, + { + "epoch": 3.18, + "grad_norm": 8.821443557739258, + "learning_rate": 2.9399033831679036e-06, + "logits/chosen": -0.48684167861938477, + "logits/rejected": -0.5799283385276794, + "logps/chosen": -48.78699493408203, + "logps/rejected": -108.63662719726562, + "loss": 0.6128, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.854989528656006, + "rewards/margins": 7.398612022399902, + "rewards/rejected": -4.543622970581055, + "step": 12694 + }, + { + "epoch": 3.18, + "grad_norm": 9.905071258544922, + "learning_rate": 2.9391872362389108e-06, + "logits/chosen": -0.5604707598686218, + "logits/rejected": -0.6015876531600952, + "logps/chosen": -46.50067901611328, + "logps/rejected": -115.6978988647461, + "loss": 0.6532, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.366419792175293, + "rewards/margins": 7.4792304039001465, + "rewards/rejected": -4.112810134887695, + "step": 12695 + }, + { + "epoch": 3.18, + "grad_norm": 6.849905014038086, + "learning_rate": 2.9384711402347935e-06, + "logits/chosen": -0.6450062394142151, + "logits/rejected": -0.6591362357139587, + "logps/chosen": -47.24089050292969, + "logps/rejected": -94.0720443725586, + "loss": 0.7955, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.048197031021118, + "rewards/margins": 5.293139457702637, + "rewards/rejected": -2.2449419498443604, + "step": 12696 + }, + { + "epoch": 3.18, + "grad_norm": 2.924877166748047, + "learning_rate": 2.9377550951732482e-06, + "logits/chosen": -0.4998478591442108, + "logits/rejected": -0.6389790773391724, + "logps/chosen": -70.76947784423828, + "logps/rejected": -106.58425903320312, + "loss": 0.637, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3044424057006836, + "rewards/margins": 7.847492694854736, + "rewards/rejected": -4.543050289154053, + "step": 12697 + }, + { + "epoch": 3.18, + "grad_norm": 3.165463924407959, + "learning_rate": 2.937039101071967e-06, + "logits/chosen": -0.4907355308532715, + "logits/rejected": -0.5987159013748169, + "logps/chosen": -50.520751953125, + "logps/rejected": -109.90412902832031, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0384721755981445, + "rewards/margins": 8.049901962280273, + "rewards/rejected": -5.011429786682129, + "step": 12698 + }, + { + "epoch": 3.18, + "grad_norm": 8.250411033630371, + "learning_rate": 2.936323157948644e-06, + "logits/chosen": -0.47713765501976013, + "logits/rejected": -0.5808066129684448, + "logps/chosen": -59.980655670166016, + "logps/rejected": -96.0503158569336, + "loss": 0.7074, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.015841484069824, + "rewards/margins": 6.6827192306518555, + "rewards/rejected": -3.6668777465820312, + "step": 12699 + }, + { + "epoch": 3.18, + "grad_norm": 3.746349334716797, + "learning_rate": 2.935607265820972e-06, + "logits/chosen": -0.6138343214988708, + "logits/rejected": -0.647834837436676, + "logps/chosen": -45.35203552246094, + "logps/rejected": -109.53046417236328, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.116614818572998, + "rewards/margins": 6.801689147949219, + "rewards/rejected": -3.6850745677948, + "step": 12700 + }, + { + "epoch": 3.18, + "grad_norm": 7.950429439544678, + "learning_rate": 2.9348914247066392e-06, + "logits/chosen": -0.5933433771133423, + "logits/rejected": -0.6606943607330322, + "logps/chosen": -46.587677001953125, + "logps/rejected": -105.01370239257812, + "loss": 0.7002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1969804763793945, + "rewards/margins": 7.499309062957764, + "rewards/rejected": -4.302328586578369, + "step": 12701 + }, + { + "epoch": 3.18, + "grad_norm": 6.017942428588867, + "learning_rate": 2.9341756346233342e-06, + "logits/chosen": -0.5345050692558289, + "logits/rejected": -0.6309272646903992, + "logps/chosen": -55.16740036010742, + "logps/rejected": -94.8133773803711, + "loss": 0.6461, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0261170864105225, + "rewards/margins": 5.746469497680664, + "rewards/rejected": -2.7203524112701416, + "step": 12702 + }, + { + "epoch": 3.18, + "grad_norm": 4.702426433563232, + "learning_rate": 2.9334598955887483e-06, + "logits/chosen": -0.5646396279335022, + "logits/rejected": -0.636443018913269, + "logps/chosen": -51.12967300415039, + "logps/rejected": -104.11958312988281, + "loss": 0.6632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.922722578048706, + "rewards/margins": 7.475874423980713, + "rewards/rejected": -4.553152561187744, + "step": 12703 + }, + { + "epoch": 3.18, + "grad_norm": 3.7366349697113037, + "learning_rate": 2.932744207620568e-06, + "logits/chosen": -0.5058190226554871, + "logits/rejected": -0.6231017112731934, + "logps/chosen": -60.79826354980469, + "logps/rejected": -104.48993682861328, + "loss": 0.5977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0567092895507812, + "rewards/margins": 6.979327201843262, + "rewards/rejected": -3.922617197036743, + "step": 12704 + }, + { + "epoch": 3.18, + "grad_norm": 4.804745674133301, + "learning_rate": 2.932028570736474e-06, + "logits/chosen": -0.5505826473236084, + "logits/rejected": -0.6351457238197327, + "logps/chosen": -53.530513763427734, + "logps/rejected": -103.12179565429688, + "loss": 0.6395, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6316280364990234, + "rewards/margins": 7.645711898803711, + "rewards/rejected": -5.0140838623046875, + "step": 12705 + }, + { + "epoch": 3.18, + "grad_norm": 16.743886947631836, + "learning_rate": 2.9313129849541554e-06, + "logits/chosen": -0.5822416543960571, + "logits/rejected": -0.6354463696479797, + "logps/chosen": -44.743221282958984, + "logps/rejected": -114.64665985107422, + "loss": 0.6866, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6905789375305176, + "rewards/margins": 7.186552047729492, + "rewards/rejected": -4.495973110198975, + "step": 12706 + }, + { + "epoch": 3.18, + "grad_norm": 2.510371685028076, + "learning_rate": 2.9305974502912936e-06, + "logits/chosen": -0.5280442237854004, + "logits/rejected": -0.6486019492149353, + "logps/chosen": -55.71412658691406, + "logps/rejected": -106.09030151367188, + "loss": 0.5268, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.030056953430176, + "rewards/margins": 7.345877170562744, + "rewards/rejected": -4.315820217132568, + "step": 12707 + }, + { + "epoch": 3.18, + "grad_norm": 3.0162410736083984, + "learning_rate": 2.9298819667655692e-06, + "logits/chosen": -0.5249924063682556, + "logits/rejected": -0.6206228733062744, + "logps/chosen": -55.141319274902344, + "logps/rejected": -110.81686401367188, + "loss": 0.5867, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.348579168319702, + "rewards/margins": 8.323375701904297, + "rewards/rejected": -4.974797248840332, + "step": 12708 + }, + { + "epoch": 3.18, + "grad_norm": 4.395105361938477, + "learning_rate": 2.929166534394663e-06, + "logits/chosen": -0.5270997285842896, + "logits/rejected": -0.6136946678161621, + "logps/chosen": -56.02338409423828, + "logps/rejected": -103.1219482421875, + "loss": 0.5879, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2471001148223877, + "rewards/margins": 7.095237731933594, + "rewards/rejected": -3.848137855529785, + "step": 12709 + }, + { + "epoch": 3.18, + "grad_norm": 7.9440083503723145, + "learning_rate": 2.9284511531962543e-06, + "logits/chosen": -0.4219864010810852, + "logits/rejected": -0.47597160935401917, + "logps/chosen": -63.66278076171875, + "logps/rejected": -115.28875732421875, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.908318519592285, + "rewards/margins": 6.0782647132873535, + "rewards/rejected": -3.1699466705322266, + "step": 12710 + }, + { + "epoch": 3.18, + "grad_norm": 5.834137916564941, + "learning_rate": 2.9277358231880227e-06, + "logits/chosen": -0.5831154584884644, + "logits/rejected": -0.718877911567688, + "logps/chosen": -51.81747817993164, + "logps/rejected": -91.25635528564453, + "loss": 0.5466, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1157052516937256, + "rewards/margins": 7.1131696701049805, + "rewards/rejected": -3.997464418411255, + "step": 12711 + }, + { + "epoch": 3.18, + "grad_norm": 7.223453044891357, + "learning_rate": 2.927020544387642e-06, + "logits/chosen": -0.49425798654556274, + "logits/rejected": -0.6295634508132935, + "logps/chosen": -57.35466766357422, + "logps/rejected": -92.86824798583984, + "loss": 0.5923, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8912410736083984, + "rewards/margins": 7.327699184417725, + "rewards/rejected": -4.436458587646484, + "step": 12712 + }, + { + "epoch": 3.18, + "grad_norm": 2.543199300765991, + "learning_rate": 2.926305316812788e-06, + "logits/chosen": -0.556317150592804, + "logits/rejected": -0.60603928565979, + "logps/chosen": -44.08641815185547, + "logps/rejected": -107.39772033691406, + "loss": 0.5377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.456170082092285, + "rewards/margins": 7.020287990570068, + "rewards/rejected": -3.564117670059204, + "step": 12713 + }, + { + "epoch": 3.18, + "grad_norm": 8.463340759277344, + "learning_rate": 2.9255901404811365e-06, + "logits/chosen": -0.6397560238838196, + "logits/rejected": -0.683405876159668, + "logps/chosen": -55.95313262939453, + "logps/rejected": -105.96986389160156, + "loss": 0.6743, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0168793201446533, + "rewards/margins": 6.410135746002197, + "rewards/rejected": -3.393256187438965, + "step": 12714 + }, + { + "epoch": 3.18, + "grad_norm": 3.843475341796875, + "learning_rate": 2.9248750154103585e-06, + "logits/chosen": -0.5444208979606628, + "logits/rejected": -0.6498299241065979, + "logps/chosen": -53.90220642089844, + "logps/rejected": -93.30450439453125, + "loss": 0.5959, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2732560634613037, + "rewards/margins": 7.125706672668457, + "rewards/rejected": -3.852450370788574, + "step": 12715 + }, + { + "epoch": 3.18, + "grad_norm": 10.18199634552002, + "learning_rate": 2.9241599416181254e-06, + "logits/chosen": -0.529594898223877, + "logits/rejected": -0.6249674558639526, + "logps/chosen": -56.58876037597656, + "logps/rejected": -114.59056854248047, + "loss": 0.6339, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.858898401260376, + "rewards/margins": 8.029695510864258, + "rewards/rejected": -5.170797348022461, + "step": 12716 + }, + { + "epoch": 3.18, + "grad_norm": 6.853818416595459, + "learning_rate": 2.9234449191221072e-06, + "logits/chosen": -0.50996333360672, + "logits/rejected": -0.6339377164840698, + "logps/chosen": -64.08312225341797, + "logps/rejected": -114.04481506347656, + "loss": 0.6724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.077868938446045, + "rewards/margins": 7.834146022796631, + "rewards/rejected": -4.756278038024902, + "step": 12717 + }, + { + "epoch": 3.18, + "grad_norm": 2.88547420501709, + "learning_rate": 2.9227299479399773e-06, + "logits/chosen": -0.5978103876113892, + "logits/rejected": -0.6365251541137695, + "logps/chosen": -60.4563102722168, + "logps/rejected": -125.0063247680664, + "loss": 0.5718, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1650445461273193, + "rewards/margins": 7.594791412353516, + "rewards/rejected": -4.429747104644775, + "step": 12718 + }, + { + "epoch": 3.18, + "grad_norm": 3.2247681617736816, + "learning_rate": 2.9220150280893967e-06, + "logits/chosen": -0.5825793147087097, + "logits/rejected": -0.6532332897186279, + "logps/chosen": -49.507537841796875, + "logps/rejected": -106.5456771850586, + "loss": 0.5395, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4355688095092773, + "rewards/margins": 8.680057525634766, + "rewards/rejected": -5.244488716125488, + "step": 12719 + }, + { + "epoch": 3.18, + "grad_norm": 4.298556327819824, + "learning_rate": 2.9213001595880354e-06, + "logits/chosen": -0.5136778354644775, + "logits/rejected": -0.5251832604408264, + "logps/chosen": -55.61764144897461, + "logps/rejected": -129.29248046875, + "loss": 0.642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972543954849243, + "rewards/margins": 7.882137298583984, + "rewards/rejected": -4.90959358215332, + "step": 12720 + }, + { + "epoch": 3.18, + "grad_norm": 21.420318603515625, + "learning_rate": 2.9205853424535586e-06, + "logits/chosen": -0.5271298885345459, + "logits/rejected": -0.6320631504058838, + "logps/chosen": -62.4708251953125, + "logps/rejected": -97.6412353515625, + "loss": 0.7501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.327122449874878, + "rewards/margins": 7.109219551086426, + "rewards/rejected": -3.782097101211548, + "step": 12721 + }, + { + "epoch": 3.18, + "grad_norm": 6.04207181930542, + "learning_rate": 2.9198705767036305e-06, + "logits/chosen": -0.5139697194099426, + "logits/rejected": -0.5944334268569946, + "logps/chosen": -55.68013000488281, + "logps/rejected": -111.40324401855469, + "loss": 0.6563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.683358907699585, + "rewards/margins": 7.2902913093566895, + "rewards/rejected": -4.606932163238525, + "step": 12722 + }, + { + "epoch": 3.18, + "grad_norm": 1.9658030271530151, + "learning_rate": 2.9191558623559136e-06, + "logits/chosen": -0.5496283769607544, + "logits/rejected": -0.6481655240058899, + "logps/chosen": -45.99169921875, + "logps/rejected": -118.02548217773438, + "loss": 0.5068, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1186225414276123, + "rewards/margins": 8.523449897766113, + "rewards/rejected": -5.404827117919922, + "step": 12723 + }, + { + "epoch": 3.18, + "grad_norm": 9.305609703063965, + "learning_rate": 2.918441199428066e-06, + "logits/chosen": -0.5178579688072205, + "logits/rejected": -0.5954588651657104, + "logps/chosen": -52.40385055541992, + "logps/rejected": -133.3025360107422, + "loss": 0.5846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0138766765594482, + "rewards/margins": 8.80724048614502, + "rewards/rejected": -5.79336404800415, + "step": 12724 + }, + { + "epoch": 3.18, + "grad_norm": 2.9762351512908936, + "learning_rate": 2.9177265879377527e-06, + "logits/chosen": -0.5522805452346802, + "logits/rejected": -0.6351010203361511, + "logps/chosen": -56.22587966918945, + "logps/rejected": -104.75228881835938, + "loss": 0.6214, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8517556190490723, + "rewards/margins": 7.475763320922852, + "rewards/rejected": -4.6240081787109375, + "step": 12725 + }, + { + "epoch": 3.18, + "grad_norm": 2.5785417556762695, + "learning_rate": 2.9170120279026283e-06, + "logits/chosen": -0.5736223459243774, + "logits/rejected": -0.653536319732666, + "logps/chosen": -67.72813415527344, + "logps/rejected": -88.85543823242188, + "loss": 0.6637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.245897054672241, + "rewards/margins": 7.285445213317871, + "rewards/rejected": -4.039548397064209, + "step": 12726 + }, + { + "epoch": 3.18, + "grad_norm": 19.150880813598633, + "learning_rate": 2.9162975193403543e-06, + "logits/chosen": -0.5389816761016846, + "logits/rejected": -0.6497393846511841, + "logps/chosen": -65.75769805908203, + "logps/rejected": -109.9979248046875, + "loss": 0.7544, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.190699338912964, + "rewards/margins": 8.70083999633789, + "rewards/rejected": -5.5101399421691895, + "step": 12727 + }, + { + "epoch": 3.18, + "grad_norm": 5.978610515594482, + "learning_rate": 2.915583062268582e-06, + "logits/chosen": -0.5355336666107178, + "logits/rejected": -0.6643953323364258, + "logps/chosen": -57.409423828125, + "logps/rejected": -96.06346130371094, + "loss": 0.6095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1555936336517334, + "rewards/margins": 8.156499862670898, + "rewards/rejected": -5.000905990600586, + "step": 12728 + }, + { + "epoch": 3.18, + "grad_norm": 4.135972499847412, + "learning_rate": 2.914868656704972e-06, + "logits/chosen": -0.528182327747345, + "logits/rejected": -0.6033653616905212, + "logps/chosen": -59.094207763671875, + "logps/rejected": -106.60758972167969, + "loss": 0.6258, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0401523113250732, + "rewards/margins": 7.02081298828125, + "rewards/rejected": -3.9806604385375977, + "step": 12729 + }, + { + "epoch": 3.18, + "grad_norm": 3.257219076156616, + "learning_rate": 2.914154302667176e-06, + "logits/chosen": -0.5218244194984436, + "logits/rejected": -0.6172947287559509, + "logps/chosen": -52.69352722167969, + "logps/rejected": -108.9009017944336, + "loss": 0.6306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.846835136413574, + "rewards/margins": 6.968975067138672, + "rewards/rejected": -4.1221394538879395, + "step": 12730 + }, + { + "epoch": 3.18, + "grad_norm": 7.057349681854248, + "learning_rate": 2.913440000172843e-06, + "logits/chosen": -0.5246164798736572, + "logits/rejected": -0.5967555046081543, + "logps/chosen": -55.094112396240234, + "logps/rejected": -118.99818420410156, + "loss": 0.6353, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0065245628356934, + "rewards/margins": 7.376101493835449, + "rewards/rejected": -4.369576930999756, + "step": 12731 + }, + { + "epoch": 3.19, + "grad_norm": 6.436765193939209, + "learning_rate": 2.912725749239629e-06, + "logits/chosen": -0.539851188659668, + "logits/rejected": -0.6152105927467346, + "logps/chosen": -55.328895568847656, + "logps/rejected": -92.0007095336914, + "loss": 0.6564, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.57918643951416, + "rewards/margins": 6.238383769989014, + "rewards/rejected": -2.6591975688934326, + "step": 12732 + }, + { + "epoch": 3.19, + "grad_norm": 3.64540433883667, + "learning_rate": 2.912011549885182e-06, + "logits/chosen": -0.5121257901191711, + "logits/rejected": -0.5681073665618896, + "logps/chosen": -58.92086410522461, + "logps/rejected": -101.87138366699219, + "loss": 0.623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.329294204711914, + "rewards/margins": 6.174643039703369, + "rewards/rejected": -2.845348358154297, + "step": 12733 + }, + { + "epoch": 3.19, + "grad_norm": 3.3726913928985596, + "learning_rate": 2.911297402127148e-06, + "logits/chosen": -0.46445438265800476, + "logits/rejected": -0.594916582107544, + "logps/chosen": -49.498329162597656, + "logps/rejected": -78.95230102539062, + "loss": 0.6212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.928241014480591, + "rewards/margins": 6.676708221435547, + "rewards/rejected": -3.748467445373535, + "step": 12734 + }, + { + "epoch": 3.19, + "grad_norm": 15.233142852783203, + "learning_rate": 2.91058330598318e-06, + "logits/chosen": -0.5329982042312622, + "logits/rejected": -0.6273185014724731, + "logps/chosen": -54.68604278564453, + "logps/rejected": -92.25067901611328, + "loss": 0.6206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5880796909332275, + "rewards/margins": 6.075437545776367, + "rewards/rejected": -3.487358331680298, + "step": 12735 + }, + { + "epoch": 3.19, + "grad_norm": 4.249969482421875, + "learning_rate": 2.90986926147092e-06, + "logits/chosen": -0.4971814453601837, + "logits/rejected": -0.5775576829910278, + "logps/chosen": -42.06869888305664, + "logps/rejected": -84.99504089355469, + "loss": 0.5936, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.024796485900879, + "rewards/margins": 6.795774459838867, + "rewards/rejected": -3.7709779739379883, + "step": 12736 + }, + { + "epoch": 3.19, + "grad_norm": 4.058058738708496, + "learning_rate": 2.909155268608012e-06, + "logits/chosen": -0.5652115345001221, + "logits/rejected": -0.6191693544387817, + "logps/chosen": -53.42350769042969, + "logps/rejected": -97.27811431884766, + "loss": 0.6932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.225743293762207, + "rewards/margins": 6.382390975952148, + "rewards/rejected": -3.156647205352783, + "step": 12737 + }, + { + "epoch": 3.19, + "grad_norm": 6.692300319671631, + "learning_rate": 2.9084413274121015e-06, + "logits/chosen": -0.5260745286941528, + "logits/rejected": -0.5842748880386353, + "logps/chosen": -52.652496337890625, + "logps/rejected": -107.22166442871094, + "loss": 0.67, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.19174861907959, + "rewards/margins": 6.0012383460998535, + "rewards/rejected": -2.8094897270202637, + "step": 12738 + }, + { + "epoch": 3.19, + "grad_norm": 3.29310941696167, + "learning_rate": 2.9077274379008335e-06, + "logits/chosen": -0.5916712284088135, + "logits/rejected": -0.6622239351272583, + "logps/chosen": -57.14079284667969, + "logps/rejected": -101.1778564453125, + "loss": 0.6868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2770752906799316, + "rewards/margins": 6.843008995056152, + "rewards/rejected": -3.5659334659576416, + "step": 12739 + }, + { + "epoch": 3.19, + "grad_norm": 8.728690147399902, + "learning_rate": 2.9070136000918426e-06, + "logits/chosen": -0.4421325922012329, + "logits/rejected": -0.5189402103424072, + "logps/chosen": -62.36288833618164, + "logps/rejected": -115.75704193115234, + "loss": 0.6714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0596539974212646, + "rewards/margins": 7.19002628326416, + "rewards/rejected": -4.130371570587158, + "step": 12740 + }, + { + "epoch": 3.19, + "grad_norm": 7.167690277099609, + "learning_rate": 2.906299814002772e-06, + "logits/chosen": -0.5205664038658142, + "logits/rejected": -0.5612671375274658, + "logps/chosen": -60.09571838378906, + "logps/rejected": -104.75055694580078, + "loss": 0.7127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.914304256439209, + "rewards/margins": 6.9802398681640625, + "rewards/rejected": -4.065936088562012, + "step": 12741 + }, + { + "epoch": 3.19, + "grad_norm": 12.344894409179688, + "learning_rate": 2.905586079651262e-06, + "logits/chosen": -0.4097740054130554, + "logits/rejected": -0.4818548560142517, + "logps/chosen": -54.63851547241211, + "logps/rejected": -89.60032653808594, + "loss": 0.788, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.679266929626465, + "rewards/margins": 5.27004861831665, + "rewards/rejected": -2.5907812118530273, + "step": 12742 + }, + { + "epoch": 3.19, + "grad_norm": 3.3350918292999268, + "learning_rate": 2.9048723970549475e-06, + "logits/chosen": -0.5129362940788269, + "logits/rejected": -0.5939375162124634, + "logps/chosen": -55.35032272338867, + "logps/rejected": -102.92922973632812, + "loss": 0.574, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.230020523071289, + "rewards/margins": 6.507917404174805, + "rewards/rejected": -3.277897357940674, + "step": 12743 + }, + { + "epoch": 3.19, + "grad_norm": 4.485174179077148, + "learning_rate": 2.9041587662314626e-06, + "logits/chosen": -0.6003689169883728, + "logits/rejected": -0.6503762006759644, + "logps/chosen": -44.39252471923828, + "logps/rejected": -98.08433532714844, + "loss": 0.6084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114619255065918, + "rewards/margins": 6.502447605133057, + "rewards/rejected": -3.3878281116485596, + "step": 12744 + }, + { + "epoch": 3.19, + "grad_norm": 5.287407875061035, + "learning_rate": 2.903445187198446e-06, + "logits/chosen": -0.5098035931587219, + "logits/rejected": -0.561934769153595, + "logps/chosen": -53.36437225341797, + "logps/rejected": -96.83972930908203, + "loss": 0.7618, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.048157215118408, + "rewards/margins": 6.0048065185546875, + "rewards/rejected": -2.9566490650177, + "step": 12745 + }, + { + "epoch": 3.19, + "grad_norm": 3.783963441848755, + "learning_rate": 2.9027316599735262e-06, + "logits/chosen": -0.4768572747707367, + "logits/rejected": -0.5556473731994629, + "logps/chosen": -54.867408752441406, + "logps/rejected": -101.72247314453125, + "loss": 0.6742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9852280616760254, + "rewards/margins": 6.459469795227051, + "rewards/rejected": -3.4742419719696045, + "step": 12746 + }, + { + "epoch": 3.19, + "grad_norm": 4.483717441558838, + "learning_rate": 2.9020181845743402e-06, + "logits/chosen": -0.47906947135925293, + "logits/rejected": -0.5586609840393066, + "logps/chosen": -50.8628044128418, + "logps/rejected": -110.37362670898438, + "loss": 0.5431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3131489753723145, + "rewards/margins": 7.772026062011719, + "rewards/rejected": -4.458877086639404, + "step": 12747 + }, + { + "epoch": 3.19, + "grad_norm": 5.57786226272583, + "learning_rate": 2.9013047610185164e-06, + "logits/chosen": -0.4475080668926239, + "logits/rejected": -0.5386390686035156, + "logps/chosen": -55.61650848388672, + "logps/rejected": -98.48767852783203, + "loss": 0.616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0889084339141846, + "rewards/margins": 6.571291923522949, + "rewards/rejected": -3.4823837280273438, + "step": 12748 + }, + { + "epoch": 3.19, + "grad_norm": 4.593743801116943, + "learning_rate": 2.9005913893236824e-06, + "logits/chosen": -0.490189790725708, + "logits/rejected": -0.5646790862083435, + "logps/chosen": -49.13429260253906, + "logps/rejected": -115.03662109375, + "loss": 0.5384, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.420146942138672, + "rewards/margins": 7.683588027954102, + "rewards/rejected": -4.26344108581543, + "step": 12749 + }, + { + "epoch": 3.19, + "grad_norm": 4.022824764251709, + "learning_rate": 2.8998780695074696e-06, + "logits/chosen": -0.4838026762008667, + "logits/rejected": -0.6048682928085327, + "logps/chosen": -48.58967208862305, + "logps/rejected": -101.07725524902344, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.444026470184326, + "rewards/margins": 7.39058256149292, + "rewards/rejected": -3.9465558528900146, + "step": 12750 + }, + { + "epoch": 3.19, + "grad_norm": 4.454583168029785, + "learning_rate": 2.899164801587504e-06, + "logits/chosen": -0.4807855784893036, + "logits/rejected": -0.5659250617027283, + "logps/chosen": -50.13274383544922, + "logps/rejected": -106.50442504882812, + "loss": 0.5851, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4950132369995117, + "rewards/margins": 7.324956893920898, + "rewards/rejected": -3.8299429416656494, + "step": 12751 + }, + { + "epoch": 3.19, + "grad_norm": 7.20228385925293, + "learning_rate": 2.898451585581409e-06, + "logits/chosen": -0.555952250957489, + "logits/rejected": -0.6431536674499512, + "logps/chosen": -57.39588165283203, + "logps/rejected": -104.07111358642578, + "loss": 0.7587, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.158492088317871, + "rewards/margins": 7.312206268310547, + "rewards/rejected": -4.153714656829834, + "step": 12752 + }, + { + "epoch": 3.19, + "grad_norm": 3.192884922027588, + "learning_rate": 2.89773842150681e-06, + "logits/chosen": -0.5246450304985046, + "logits/rejected": -0.6388186812400818, + "logps/chosen": -56.81239318847656, + "logps/rejected": -115.71379089355469, + "loss": 0.6481, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.44698166847229, + "rewards/margins": 7.637197494506836, + "rewards/rejected": -4.190216064453125, + "step": 12753 + }, + { + "epoch": 3.19, + "grad_norm": 5.1417975425720215, + "learning_rate": 2.897025309381335e-06, + "logits/chosen": -0.5129307508468628, + "logits/rejected": -0.5579270720481873, + "logps/chosen": -52.17747116088867, + "logps/rejected": -107.91592407226562, + "loss": 0.6327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1893393993377686, + "rewards/margins": 6.87441349029541, + "rewards/rejected": -3.6850743293762207, + "step": 12754 + }, + { + "epoch": 3.19, + "grad_norm": 4.426154613494873, + "learning_rate": 2.896312249222597e-06, + "logits/chosen": -0.5403262376785278, + "logits/rejected": -0.6236172914505005, + "logps/chosen": -50.662689208984375, + "logps/rejected": -115.11570739746094, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9931602478027344, + "rewards/margins": 7.816890716552734, + "rewards/rejected": -4.823730945587158, + "step": 12755 + }, + { + "epoch": 3.19, + "grad_norm": 4.833774089813232, + "learning_rate": 2.8955992410482215e-06, + "logits/chosen": -0.514907956123352, + "logits/rejected": -0.596294105052948, + "logps/chosen": -58.67109298706055, + "logps/rejected": -104.07003784179688, + "loss": 0.6832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.818331241607666, + "rewards/margins": 6.778541088104248, + "rewards/rejected": -3.960210084915161, + "step": 12756 + }, + { + "epoch": 3.19, + "grad_norm": 4.759308815002441, + "learning_rate": 2.8948862848758306e-06, + "logits/chosen": -0.49790623784065247, + "logits/rejected": -0.597131073474884, + "logps/chosen": -73.19124603271484, + "logps/rejected": -80.09003448486328, + "loss": 0.613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.001323699951172, + "rewards/margins": 6.600017070770264, + "rewards/rejected": -3.5986931324005127, + "step": 12757 + }, + { + "epoch": 3.19, + "grad_norm": 6.270033836364746, + "learning_rate": 2.8941733807230347e-06, + "logits/chosen": -0.5935246348381042, + "logits/rejected": -0.6639574766159058, + "logps/chosen": -45.31079864501953, + "logps/rejected": -112.3111343383789, + "loss": 0.6266, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.942018747329712, + "rewards/margins": 7.595844268798828, + "rewards/rejected": -4.653825283050537, + "step": 12758 + }, + { + "epoch": 3.19, + "grad_norm": 6.869322299957275, + "learning_rate": 2.8934605286074546e-06, + "logits/chosen": -0.4506496787071228, + "logits/rejected": -0.5607044100761414, + "logps/chosen": -59.48290252685547, + "logps/rejected": -92.38277435302734, + "loss": 0.6413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2111713886260986, + "rewards/margins": 5.854130268096924, + "rewards/rejected": -2.642958164215088, + "step": 12759 + }, + { + "epoch": 3.19, + "grad_norm": 5.2094526290893555, + "learning_rate": 2.8927477285467074e-06, + "logits/chosen": -0.5056644678115845, + "logits/rejected": -0.5637403726577759, + "logps/chosen": -44.878822326660156, + "logps/rejected": -105.25057220458984, + "loss": 0.5232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.260326623916626, + "rewards/margins": 6.794097423553467, + "rewards/rejected": -3.5337705612182617, + "step": 12760 + }, + { + "epoch": 3.19, + "grad_norm": 7.559780597686768, + "learning_rate": 2.892034980558406e-06, + "logits/chosen": -0.5449385046958923, + "logits/rejected": -0.6546190977096558, + "logps/chosen": -46.7480583190918, + "logps/rejected": -97.6116943359375, + "loss": 0.5472, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.40667986869812, + "rewards/margins": 7.726236343383789, + "rewards/rejected": -4.31955623626709, + "step": 12761 + }, + { + "epoch": 3.19, + "grad_norm": 4.0362443923950195, + "learning_rate": 2.89132228466016e-06, + "logits/chosen": -0.5368546843528748, + "logits/rejected": -0.635592520236969, + "logps/chosen": -55.59614181518555, + "logps/rejected": -101.83172607421875, + "loss": 0.6582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1737091541290283, + "rewards/margins": 7.396340370178223, + "rewards/rejected": -4.222630977630615, + "step": 12762 + }, + { + "epoch": 3.19, + "grad_norm": 2.7920665740966797, + "learning_rate": 2.8906096408695836e-06, + "logits/chosen": -0.5381761789321899, + "logits/rejected": -0.6372326612472534, + "logps/chosen": -51.45643997192383, + "logps/rejected": -111.53163146972656, + "loss": 0.6125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1165857315063477, + "rewards/margins": 8.263254165649414, + "rewards/rejected": -5.146667957305908, + "step": 12763 + }, + { + "epoch": 3.19, + "grad_norm": 2.8246572017669678, + "learning_rate": 2.889897049204289e-06, + "logits/chosen": -0.5144373178482056, + "logits/rejected": -0.588858425617218, + "logps/chosen": -60.369598388671875, + "logps/rejected": -118.28047943115234, + "loss": 0.578, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.58886981010437, + "rewards/margins": 9.162939071655273, + "rewards/rejected": -5.574069976806641, + "step": 12764 + }, + { + "epoch": 3.19, + "grad_norm": 4.672554016113281, + "learning_rate": 2.8891845096818796e-06, + "logits/chosen": -0.5857700109481812, + "logits/rejected": -0.6131042242050171, + "logps/chosen": -49.27956771850586, + "logps/rejected": -87.2077407836914, + "loss": 0.6965, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.069244146347046, + "rewards/margins": 5.349640369415283, + "rewards/rejected": -2.2803966999053955, + "step": 12765 + }, + { + "epoch": 3.19, + "grad_norm": 6.586877346038818, + "learning_rate": 2.888472022319968e-06, + "logits/chosen": -0.5832614302635193, + "logits/rejected": -0.6570135354995728, + "logps/chosen": -52.25151062011719, + "logps/rejected": -81.15829467773438, + "loss": 0.6393, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1063356399536133, + "rewards/margins": 5.351770401000977, + "rewards/rejected": -2.2454347610473633, + "step": 12766 + }, + { + "epoch": 3.19, + "grad_norm": 11.785167694091797, + "learning_rate": 2.8877595871361565e-06, + "logits/chosen": -0.4899219870567322, + "logits/rejected": -0.5120875239372253, + "logps/chosen": -60.051231384277344, + "logps/rejected": -127.05528259277344, + "loss": 0.8738, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4514920711517334, + "rewards/margins": 6.441101551055908, + "rewards/rejected": -3.989609718322754, + "step": 12767 + }, + { + "epoch": 3.19, + "grad_norm": 3.5980310440063477, + "learning_rate": 2.8870472041480548e-06, + "logits/chosen": -0.5651761293411255, + "logits/rejected": -0.6675377488136292, + "logps/chosen": -48.62831115722656, + "logps/rejected": -78.95065307617188, + "loss": 0.6502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1161105632781982, + "rewards/margins": 6.3719305992126465, + "rewards/rejected": -3.2558202743530273, + "step": 12768 + }, + { + "epoch": 3.19, + "grad_norm": 11.956879615783691, + "learning_rate": 2.886334873373262e-06, + "logits/chosen": -0.5728261470794678, + "logits/rejected": -0.627227246761322, + "logps/chosen": -59.09384536743164, + "logps/rejected": -112.5767822265625, + "loss": 0.7229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9734201431274414, + "rewards/margins": 6.695820331573486, + "rewards/rejected": -3.7224009037017822, + "step": 12769 + }, + { + "epoch": 3.19, + "grad_norm": 4.851236343383789, + "learning_rate": 2.885622594829384e-06, + "logits/chosen": -0.4699816405773163, + "logits/rejected": -0.5545997023582458, + "logps/chosen": -51.32526397705078, + "logps/rejected": -104.30146789550781, + "loss": 0.4918, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2023324966430664, + "rewards/margins": 6.39578914642334, + "rewards/rejected": -3.1934566497802734, + "step": 12770 + }, + { + "epoch": 3.19, + "grad_norm": 12.50760555267334, + "learning_rate": 2.8849103685340196e-06, + "logits/chosen": -0.5688036680221558, + "logits/rejected": -0.6632042527198792, + "logps/chosen": -55.92021179199219, + "logps/rejected": -94.94837188720703, + "loss": 0.7511, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.138481616973877, + "rewards/margins": 6.3131866455078125, + "rewards/rejected": -3.1747050285339355, + "step": 12771 + }, + { + "epoch": 3.2, + "grad_norm": 5.4626312255859375, + "learning_rate": 2.8841981945047714e-06, + "logits/chosen": -0.4811306595802307, + "logits/rejected": -0.5824047923088074, + "logps/chosen": -62.88877487182617, + "logps/rejected": -92.7547836303711, + "loss": 0.7144, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.707524299621582, + "rewards/margins": 6.319962501525879, + "rewards/rejected": -3.612438678741455, + "step": 12772 + }, + { + "epoch": 3.2, + "grad_norm": 4.501948356628418, + "learning_rate": 2.883486072759236e-06, + "logits/chosen": -0.5330591797828674, + "logits/rejected": -0.5775718092918396, + "logps/chosen": -57.12867736816406, + "logps/rejected": -112.04849243164062, + "loss": 0.632, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1885807514190674, + "rewards/margins": 7.72946834564209, + "rewards/rejected": -4.540886878967285, + "step": 12773 + }, + { + "epoch": 3.2, + "grad_norm": 3.8251709938049316, + "learning_rate": 2.8827740033150094e-06, + "logits/chosen": -0.4852507412433624, + "logits/rejected": -0.5805291533470154, + "logps/chosen": -58.3857536315918, + "logps/rejected": -102.8656234741211, + "loss": 0.5971, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.131870746612549, + "rewards/margins": 6.8674235343933105, + "rewards/rejected": -3.7355527877807617, + "step": 12774 + }, + { + "epoch": 3.2, + "grad_norm": 4.5046892166137695, + "learning_rate": 2.8820619861896908e-06, + "logits/chosen": -0.5450595617294312, + "logits/rejected": -0.6232557892799377, + "logps/chosen": -45.70684051513672, + "logps/rejected": -114.85171508789062, + "loss": 0.5655, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9869561195373535, + "rewards/margins": 6.943949222564697, + "rewards/rejected": -3.9569926261901855, + "step": 12775 + }, + { + "epoch": 3.2, + "grad_norm": 5.473662853240967, + "learning_rate": 2.881350021400873e-06, + "logits/chosen": -0.5497114062309265, + "logits/rejected": -0.63656085729599, + "logps/chosen": -45.315773010253906, + "logps/rejected": -100.52534484863281, + "loss": 0.5343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1769471168518066, + "rewards/margins": 6.624100685119629, + "rewards/rejected": -3.447153091430664, + "step": 12776 + }, + { + "epoch": 3.2, + "grad_norm": 6.008908748626709, + "learning_rate": 2.8806381089661484e-06, + "logits/chosen": -0.5032902956008911, + "logits/rejected": -0.556613028049469, + "logps/chosen": -43.31730270385742, + "logps/rejected": -111.0712890625, + "loss": 0.4962, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3715980052948, + "rewards/margins": 6.705996513366699, + "rewards/rejected": -3.3343989849090576, + "step": 12777 + }, + { + "epoch": 3.2, + "grad_norm": 20.418703079223633, + "learning_rate": 2.879926248903112e-06, + "logits/chosen": -0.5192515254020691, + "logits/rejected": -0.5650200247764587, + "logps/chosen": -59.68357467651367, + "logps/rejected": -96.10981750488281, + "loss": 0.7302, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8155627250671387, + "rewards/margins": 4.815540313720703, + "rewards/rejected": -1.9999773502349854, + "step": 12778 + }, + { + "epoch": 3.2, + "grad_norm": 5.631032943725586, + "learning_rate": 2.8792144412293533e-06, + "logits/chosen": -0.4966229796409607, + "logits/rejected": -0.5489311814308167, + "logps/chosen": -58.7447395324707, + "logps/rejected": -100.86555480957031, + "loss": 0.6372, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1129307746887207, + "rewards/margins": 6.014204502105713, + "rewards/rejected": -2.9012742042541504, + "step": 12779 + }, + { + "epoch": 3.2, + "grad_norm": 16.374393463134766, + "learning_rate": 2.878502685962459e-06, + "logits/chosen": -0.5274726748466492, + "logits/rejected": -0.5878655910491943, + "logps/chosen": -52.58485794067383, + "logps/rejected": -110.20330047607422, + "loss": 0.6859, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.143840789794922, + "rewards/margins": 6.384005546569824, + "rewards/rejected": -3.2401645183563232, + "step": 12780 + }, + { + "epoch": 3.2, + "grad_norm": 5.936657428741455, + "learning_rate": 2.87779098312002e-06, + "logits/chosen": -0.5184363126754761, + "logits/rejected": -0.6430272459983826, + "logps/chosen": -60.52201843261719, + "logps/rejected": -105.71585083007812, + "loss": 0.692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8283989429473877, + "rewards/margins": 7.673836708068848, + "rewards/rejected": -4.845437526702881, + "step": 12781 + }, + { + "epoch": 3.2, + "grad_norm": 16.45716667175293, + "learning_rate": 2.8770793327196277e-06, + "logits/chosen": -0.531597375869751, + "logits/rejected": -0.6231098175048828, + "logps/chosen": -50.70912170410156, + "logps/rejected": -95.87577819824219, + "loss": 0.6498, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8959856033325195, + "rewards/margins": 6.17844820022583, + "rewards/rejected": -3.2824625968933105, + "step": 12782 + }, + { + "epoch": 3.2, + "grad_norm": 4.236498832702637, + "learning_rate": 2.876367734778859e-06, + "logits/chosen": -0.4888337254524231, + "logits/rejected": -0.5412362217903137, + "logps/chosen": -47.83448791503906, + "logps/rejected": -105.43794250488281, + "loss": 0.6846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.064814567565918, + "rewards/margins": 7.272920608520508, + "rewards/rejected": -4.208106517791748, + "step": 12783 + }, + { + "epoch": 3.2, + "grad_norm": 5.175585746765137, + "learning_rate": 2.8756561893153013e-06, + "logits/chosen": -0.47316914796829224, + "logits/rejected": -0.6235297918319702, + "logps/chosen": -65.16289520263672, + "logps/rejected": -86.97438049316406, + "loss": 0.7659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1428639888763428, + "rewards/margins": 6.415440082550049, + "rewards/rejected": -3.2725765705108643, + "step": 12784 + }, + { + "epoch": 3.2, + "grad_norm": 3.0419602394104004, + "learning_rate": 2.8749446963465416e-06, + "logits/chosen": -0.5476664900779724, + "logits/rejected": -0.6583899855613708, + "logps/chosen": -52.021488189697266, + "logps/rejected": -99.08118438720703, + "loss": 0.6111, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0218894481658936, + "rewards/margins": 6.942469120025635, + "rewards/rejected": -3.920579433441162, + "step": 12785 + }, + { + "epoch": 3.2, + "grad_norm": 4.369161128997803, + "learning_rate": 2.8742332558901588e-06, + "logits/chosen": -0.47985339164733887, + "logits/rejected": -0.5134801864624023, + "logps/chosen": -61.64653015136719, + "logps/rejected": -108.57390594482422, + "loss": 0.6576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0087475776672363, + "rewards/margins": 5.517360687255859, + "rewards/rejected": -2.508612871170044, + "step": 12786 + }, + { + "epoch": 3.2, + "grad_norm": 6.972026348114014, + "learning_rate": 2.873521867963731e-06, + "logits/chosen": -0.572819709777832, + "logits/rejected": -0.6398413181304932, + "logps/chosen": -59.56079864501953, + "logps/rejected": -128.65208435058594, + "loss": 0.5823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.952483892440796, + "rewards/margins": 7.586273193359375, + "rewards/rejected": -4.633789539337158, + "step": 12787 + }, + { + "epoch": 3.2, + "grad_norm": 5.972066879272461, + "learning_rate": 2.8728105325848414e-06, + "logits/chosen": -0.5195973515510559, + "logits/rejected": -0.5575318336486816, + "logps/chosen": -54.0124397277832, + "logps/rejected": -101.12602233886719, + "loss": 0.7453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.161869525909424, + "rewards/margins": 5.679769992828369, + "rewards/rejected": -2.5179007053375244, + "step": 12788 + }, + { + "epoch": 3.2, + "grad_norm": 6.712353229522705, + "learning_rate": 2.8720992497710665e-06, + "logits/chosen": -0.5315959453582764, + "logits/rejected": -0.5483383536338806, + "logps/chosen": -50.61380386352539, + "logps/rejected": -98.78673553466797, + "loss": 0.6476, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1346795558929443, + "rewards/margins": 6.494984149932861, + "rewards/rejected": -3.3603038787841797, + "step": 12789 + }, + { + "epoch": 3.2, + "grad_norm": 5.142971992492676, + "learning_rate": 2.8713880195399805e-06, + "logits/chosen": -0.488469660282135, + "logits/rejected": -0.6041210293769836, + "logps/chosen": -55.45140075683594, + "logps/rejected": -95.46987915039062, + "loss": 0.6178, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273993730545044, + "rewards/margins": 6.6398234367370605, + "rewards/rejected": -3.3658299446105957, + "step": 12790 + }, + { + "epoch": 3.2, + "grad_norm": 8.413076400756836, + "learning_rate": 2.8706768419091624e-06, + "logits/chosen": -0.49093493819236755, + "logits/rejected": -0.558170735836029, + "logps/chosen": -53.687469482421875, + "logps/rejected": -108.04679870605469, + "loss": 0.6987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.837946653366089, + "rewards/margins": 6.309515476226807, + "rewards/rejected": -3.471569061279297, + "step": 12791 + }, + { + "epoch": 3.2, + "grad_norm": 12.619237899780273, + "learning_rate": 2.8699657168961827e-06, + "logits/chosen": -0.5878346562385559, + "logits/rejected": -0.6544198393821716, + "logps/chosen": -57.381900787353516, + "logps/rejected": -114.43401336669922, + "loss": 0.6649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.101306676864624, + "rewards/margins": 7.715332508087158, + "rewards/rejected": -4.614026069641113, + "step": 12792 + }, + { + "epoch": 3.2, + "grad_norm": 10.569718360900879, + "learning_rate": 2.8692546445186175e-06, + "logits/chosen": -0.47940027713775635, + "logits/rejected": -0.5968763828277588, + "logps/chosen": -72.77200317382812, + "logps/rejected": -103.22795104980469, + "loss": 0.7641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.79793119430542, + "rewards/margins": 6.38084077835083, + "rewards/rejected": -3.582909345626831, + "step": 12793 + }, + { + "epoch": 3.2, + "grad_norm": 2.5388576984405518, + "learning_rate": 2.8685436247940366e-06, + "logits/chosen": -0.6390936970710754, + "logits/rejected": -0.7376677989959717, + "logps/chosen": -46.31806564331055, + "logps/rejected": -100.46261596679688, + "loss": 0.5612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.302703380584717, + "rewards/margins": 7.654960632324219, + "rewards/rejected": -4.35225772857666, + "step": 12794 + }, + { + "epoch": 3.2, + "grad_norm": 1.7104321718215942, + "learning_rate": 2.867832657740008e-06, + "logits/chosen": -0.44831541180610657, + "logits/rejected": -0.5826071500778198, + "logps/chosen": -56.18203353881836, + "logps/rejected": -102.55523681640625, + "loss": 0.5477, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.207545757293701, + "rewards/margins": 7.373054027557373, + "rewards/rejected": -4.165507793426514, + "step": 12795 + }, + { + "epoch": 3.2, + "grad_norm": 5.824656963348389, + "learning_rate": 2.8671217433741017e-06, + "logits/chosen": -0.4599662721157074, + "logits/rejected": -0.5264579057693481, + "logps/chosen": -72.84150695800781, + "logps/rejected": -84.88067626953125, + "loss": 0.7439, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.963712215423584, + "rewards/margins": 6.181376934051514, + "rewards/rejected": -3.2176644802093506, + "step": 12796 + }, + { + "epoch": 3.2, + "grad_norm": 4.978545188903809, + "learning_rate": 2.86641088171389e-06, + "logits/chosen": -0.44724053144454956, + "logits/rejected": -0.5267248153686523, + "logps/chosen": -64.34246063232422, + "logps/rejected": -108.1495132446289, + "loss": 0.6893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.790415048599243, + "rewards/margins": 5.68108606338501, + "rewards/rejected": -2.8906707763671875, + "step": 12797 + }, + { + "epoch": 3.2, + "grad_norm": 7.755503177642822, + "learning_rate": 2.8657000727769314e-06, + "logits/chosen": -0.45819219946861267, + "logits/rejected": -0.4874001145362854, + "logps/chosen": -56.17335891723633, + "logps/rejected": -113.67733764648438, + "loss": 0.6837, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876260995864868, + "rewards/margins": 6.169942378997803, + "rewards/rejected": -3.2936813831329346, + "step": 12798 + }, + { + "epoch": 3.2, + "grad_norm": 7.299689292907715, + "learning_rate": 2.8649893165807934e-06, + "logits/chosen": -0.5828076004981995, + "logits/rejected": -0.6466422080993652, + "logps/chosen": -51.69475173950195, + "logps/rejected": -110.57769775390625, + "loss": 0.7546, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.729184150695801, + "rewards/margins": 6.95489501953125, + "rewards/rejected": -4.225711345672607, + "step": 12799 + }, + { + "epoch": 3.2, + "grad_norm": 3.6912343502044678, + "learning_rate": 2.8642786131430445e-06, + "logits/chosen": -0.48757660388946533, + "logits/rejected": -0.5842204093933105, + "logps/chosen": -51.51689147949219, + "logps/rejected": -105.77908325195312, + "loss": 0.5231, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1323699951171875, + "rewards/margins": 7.6828155517578125, + "rewards/rejected": -4.550445079803467, + "step": 12800 + }, + { + "epoch": 3.2, + "grad_norm": 5.193835735321045, + "learning_rate": 2.863567962481239e-06, + "logits/chosen": -0.5532839894294739, + "logits/rejected": -0.6533591747283936, + "logps/chosen": -48.0068244934082, + "logps/rejected": -95.60283660888672, + "loss": 0.5404, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2174072265625, + "rewards/margins": 6.875886917114258, + "rewards/rejected": -3.6584794521331787, + "step": 12801 + }, + { + "epoch": 3.2, + "grad_norm": 4.04628849029541, + "learning_rate": 2.862857364612941e-06, + "logits/chosen": -0.5940757989883423, + "logits/rejected": -0.7029539346694946, + "logps/chosen": -60.29574203491211, + "logps/rejected": -131.66748046875, + "loss": 0.6049, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8274309635162354, + "rewards/margins": 9.377158164978027, + "rewards/rejected": -6.549727439880371, + "step": 12802 + }, + { + "epoch": 3.2, + "grad_norm": 5.05810546875, + "learning_rate": 2.8621468195557135e-06, + "logits/chosen": -0.6231513023376465, + "logits/rejected": -0.69735187292099, + "logps/chosen": -54.619441986083984, + "logps/rejected": -110.84793853759766, + "loss": 0.5611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0259299278259277, + "rewards/margins": 8.026420593261719, + "rewards/rejected": -5.000489711761475, + "step": 12803 + }, + { + "epoch": 3.2, + "grad_norm": 3.4359564781188965, + "learning_rate": 2.861436327327112e-06, + "logits/chosen": -0.5165100693702698, + "logits/rejected": -0.6121855974197388, + "logps/chosen": -69.04891967773438, + "logps/rejected": -89.15564727783203, + "loss": 0.6262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3331899642944336, + "rewards/margins": 6.684457302093506, + "rewards/rejected": -3.3512673377990723, + "step": 12804 + }, + { + "epoch": 3.2, + "grad_norm": 6.365314483642578, + "learning_rate": 2.8607258879446903e-06, + "logits/chosen": -0.4516949951648712, + "logits/rejected": -0.5790418386459351, + "logps/chosen": -63.321353912353516, + "logps/rejected": -123.57669067382812, + "loss": 0.6034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7674827575683594, + "rewards/margins": 7.043206691741943, + "rewards/rejected": -4.275723457336426, + "step": 12805 + }, + { + "epoch": 3.2, + "grad_norm": 3.354555606842041, + "learning_rate": 2.86001550142601e-06, + "logits/chosen": -0.5551732778549194, + "logits/rejected": -0.5958834290504456, + "logps/chosen": -48.465763092041016, + "logps/rejected": -106.46110534667969, + "loss": 0.6265, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6807494163513184, + "rewards/margins": 6.776793956756592, + "rewards/rejected": -3.0960447788238525, + "step": 12806 + }, + { + "epoch": 3.2, + "grad_norm": 3.6793980598449707, + "learning_rate": 2.859305167788623e-06, + "logits/chosen": -0.4887217879295349, + "logits/rejected": -0.5771596431732178, + "logps/chosen": -59.70924377441406, + "logps/rejected": -92.66065216064453, + "loss": 0.59, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.209916830062866, + "rewards/margins": 5.528116226196289, + "rewards/rejected": -2.318199634552002, + "step": 12807 + }, + { + "epoch": 3.2, + "grad_norm": 7.578546524047852, + "learning_rate": 2.85859488705008e-06, + "logits/chosen": -0.5213146209716797, + "logits/rejected": -0.6390891671180725, + "logps/chosen": -57.199432373046875, + "logps/rejected": -110.09909057617188, + "loss": 0.6539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.877171039581299, + "rewards/margins": 7.801125526428223, + "rewards/rejected": -4.923954486846924, + "step": 12808 + }, + { + "epoch": 3.2, + "grad_norm": 5.0766448974609375, + "learning_rate": 2.8578846592279376e-06, + "logits/chosen": -0.5135913491249084, + "logits/rejected": -0.6158487200737, + "logps/chosen": -55.36763000488281, + "logps/rejected": -105.9218978881836, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.696648597717285, + "rewards/margins": 6.918750762939453, + "rewards/rejected": -4.222103118896484, + "step": 12809 + }, + { + "epoch": 3.2, + "grad_norm": 5.942112922668457, + "learning_rate": 2.8571744843397412e-06, + "logits/chosen": -0.5315078496932983, + "logits/rejected": -0.6043927073478699, + "logps/chosen": -51.866825103759766, + "logps/rejected": -102.2451400756836, + "loss": 0.617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2808938026428223, + "rewards/margins": 5.8206257820129395, + "rewards/rejected": -2.5397324562072754, + "step": 12810 + }, + { + "epoch": 3.2, + "grad_norm": 6.800108909606934, + "learning_rate": 2.8564643624030447e-06, + "logits/chosen": -0.45044052600860596, + "logits/rejected": -0.5334774851799011, + "logps/chosen": -50.07893753051758, + "logps/rejected": -86.53357696533203, + "loss": 0.7083, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.920046091079712, + "rewards/margins": 5.484331130981445, + "rewards/rejected": -2.5642852783203125, + "step": 12811 + }, + { + "epoch": 3.21, + "grad_norm": 5.303391456604004, + "learning_rate": 2.855754293435394e-06, + "logits/chosen": -0.5328475832939148, + "logits/rejected": -0.6368958950042725, + "logps/chosen": -55.606605529785156, + "logps/rejected": -112.806640625, + "loss": 0.6466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0162410736083984, + "rewards/margins": 7.587332725524902, + "rewards/rejected": -4.571091175079346, + "step": 12812 + }, + { + "epoch": 3.21, + "grad_norm": 3.99042010307312, + "learning_rate": 2.855044277454333e-06, + "logits/chosen": -0.5628507137298584, + "logits/rejected": -0.6542159914970398, + "logps/chosen": -48.82316970825195, + "logps/rejected": -103.22702026367188, + "loss": 0.5975, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3951468467712402, + "rewards/margins": 8.168747901916504, + "rewards/rejected": -4.7736005783081055, + "step": 12813 + }, + { + "epoch": 3.21, + "grad_norm": 2.781250238418579, + "learning_rate": 2.8543343144774116e-06, + "logits/chosen": -0.525041937828064, + "logits/rejected": -0.6308748126029968, + "logps/chosen": -52.4100341796875, + "logps/rejected": -101.03643798828125, + "loss": 0.5556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1567015647888184, + "rewards/margins": 7.405102729797363, + "rewards/rejected": -4.248401641845703, + "step": 12814 + }, + { + "epoch": 3.21, + "grad_norm": 4.580328464508057, + "learning_rate": 2.8536244045221697e-06, + "logits/chosen": -0.5225695371627808, + "logits/rejected": -0.5793054103851318, + "logps/chosen": -61.585533142089844, + "logps/rejected": -123.90057373046875, + "loss": 0.5715, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0064291954040527, + "rewards/margins": 6.422562599182129, + "rewards/rejected": -3.416132926940918, + "step": 12815 + }, + { + "epoch": 3.21, + "grad_norm": 12.676283836364746, + "learning_rate": 2.852914547606153e-06, + "logits/chosen": -0.5563758611679077, + "logits/rejected": -0.6300984025001526, + "logps/chosen": -60.111900329589844, + "logps/rejected": -114.75216674804688, + "loss": 0.7632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.010833501815796, + "rewards/margins": 7.47938346862793, + "rewards/rejected": -4.468549728393555, + "step": 12816 + }, + { + "epoch": 3.21, + "grad_norm": 10.482542991638184, + "learning_rate": 2.8522047437469005e-06, + "logits/chosen": -0.4665556848049164, + "logits/rejected": -0.5776053071022034, + "logps/chosen": -60.99405288696289, + "logps/rejected": -89.70345306396484, + "loss": 0.6509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2005531787872314, + "rewards/margins": 6.5935564041137695, + "rewards/rejected": -3.393003225326538, + "step": 12817 + }, + { + "epoch": 3.21, + "grad_norm": 4.133472919464111, + "learning_rate": 2.851494992961955e-06, + "logits/chosen": -0.5727386474609375, + "logits/rejected": -0.6471785306930542, + "logps/chosen": -55.288124084472656, + "logps/rejected": -107.10584259033203, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.758045196533203, + "rewards/margins": 7.051028728485107, + "rewards/rejected": -4.292983531951904, + "step": 12818 + }, + { + "epoch": 3.21, + "grad_norm": 6.526284217834473, + "learning_rate": 2.850785295268853e-06, + "logits/chosen": -0.47933122515678406, + "logits/rejected": -0.5648749470710754, + "logps/chosen": -50.99723815917969, + "logps/rejected": -109.77378845214844, + "loss": 0.655, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.01969838142395, + "rewards/margins": 7.081079483032227, + "rewards/rejected": -4.061380863189697, + "step": 12819 + }, + { + "epoch": 3.21, + "grad_norm": 24.662878036499023, + "learning_rate": 2.8500756506851307e-06, + "logits/chosen": -0.589962363243103, + "logits/rejected": -0.6551536917686462, + "logps/chosen": -52.30596923828125, + "logps/rejected": -108.50595092773438, + "loss": 0.6879, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2555315494537354, + "rewards/margins": 7.621460437774658, + "rewards/rejected": -4.365928649902344, + "step": 12820 + }, + { + "epoch": 3.21, + "grad_norm": 4.874855041503906, + "learning_rate": 2.849366059228328e-06, + "logits/chosen": -0.5668162107467651, + "logits/rejected": -0.6853798031806946, + "logps/chosen": -47.19756317138672, + "logps/rejected": -98.87690734863281, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306870937347412, + "rewards/margins": 8.575081825256348, + "rewards/rejected": -5.268211364746094, + "step": 12821 + }, + { + "epoch": 3.21, + "grad_norm": 5.932804107666016, + "learning_rate": 2.8486565209159774e-06, + "logits/chosen": -0.49504444003105164, + "logits/rejected": -0.5274238586425781, + "logps/chosen": -61.00493240356445, + "logps/rejected": -118.47859191894531, + "loss": 0.713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.829148769378662, + "rewards/margins": 7.096022605895996, + "rewards/rejected": -4.266873836517334, + "step": 12822 + }, + { + "epoch": 3.21, + "grad_norm": 19.431989669799805, + "learning_rate": 2.84794703576561e-06, + "logits/chosen": -0.5761898756027222, + "logits/rejected": -0.6773031949996948, + "logps/chosen": -46.656349182128906, + "logps/rejected": -81.39990234375, + "loss": 0.6205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108302593231201, + "rewards/margins": 6.532143592834473, + "rewards/rejected": -3.4238407611846924, + "step": 12823 + }, + { + "epoch": 3.21, + "grad_norm": 11.333024024963379, + "learning_rate": 2.8472376037947606e-06, + "logits/chosen": -0.6371440887451172, + "logits/rejected": -0.6419990062713623, + "logps/chosen": -79.44027709960938, + "logps/rejected": -88.20310974121094, + "loss": 0.721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.162956476211548, + "rewards/margins": 6.186191558837891, + "rewards/rejected": -3.0232348442077637, + "step": 12824 + }, + { + "epoch": 3.21, + "grad_norm": 4.507138729095459, + "learning_rate": 2.846528225020964e-06, + "logits/chosen": -0.6392707824707031, + "logits/rejected": -0.6741182208061218, + "logps/chosen": -49.54399108886719, + "logps/rejected": -102.73653411865234, + "loss": 0.6292, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3386764526367188, + "rewards/margins": 6.801087856292725, + "rewards/rejected": -3.462411403656006, + "step": 12825 + }, + { + "epoch": 3.21, + "grad_norm": 4.082450866699219, + "learning_rate": 2.8458188994617418e-06, + "logits/chosen": -0.46958184242248535, + "logits/rejected": -0.5895277857780457, + "logps/chosen": -83.27117919921875, + "logps/rejected": -146.04400634765625, + "loss": 0.6111, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.020552635192871, + "rewards/margins": 8.758454322814941, + "rewards/rejected": -5.73790168762207, + "step": 12826 + }, + { + "epoch": 3.21, + "grad_norm": 7.796195983886719, + "learning_rate": 2.8451096271346257e-06, + "logits/chosen": -0.5842872858047485, + "logits/rejected": -0.6326141953468323, + "logps/chosen": -53.780242919921875, + "logps/rejected": -107.55506896972656, + "loss": 0.7461, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7845940589904785, + "rewards/margins": 6.234462261199951, + "rewards/rejected": -3.4498677253723145, + "step": 12827 + }, + { + "epoch": 3.21, + "grad_norm": 2.6695291996002197, + "learning_rate": 2.8444004080571443e-06, + "logits/chosen": -0.6200447082519531, + "logits/rejected": -0.7171008586883545, + "logps/chosen": -43.651947021484375, + "logps/rejected": -93.14967346191406, + "loss": 0.5573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.292753219604492, + "rewards/margins": 7.464322090148926, + "rewards/rejected": -4.171568870544434, + "step": 12828 + }, + { + "epoch": 3.21, + "grad_norm": 9.814705848693848, + "learning_rate": 2.843691242246822e-06, + "logits/chosen": -0.47967058420181274, + "logits/rejected": -0.6245030760765076, + "logps/chosen": -59.3272590637207, + "logps/rejected": -95.32902526855469, + "loss": 0.6249, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.997523546218872, + "rewards/margins": 7.343008518218994, + "rewards/rejected": -4.345485210418701, + "step": 12829 + }, + { + "epoch": 3.21, + "grad_norm": 7.130573749542236, + "learning_rate": 2.84298212972118e-06, + "logits/chosen": -0.500100314617157, + "logits/rejected": -0.6359906196594238, + "logps/chosen": -52.325050354003906, + "logps/rejected": -102.6550064086914, + "loss": 0.6126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9127702713012695, + "rewards/margins": 7.5509538650512695, + "rewards/rejected": -4.63818359375, + "step": 12830 + }, + { + "epoch": 3.21, + "grad_norm": 4.561645030975342, + "learning_rate": 2.8422730704977464e-06, + "logits/chosen": -0.4469364881515503, + "logits/rejected": -0.534607470035553, + "logps/chosen": -63.12469482421875, + "logps/rejected": -104.68683624267578, + "loss": 0.6364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0177690982818604, + "rewards/margins": 7.470692157745361, + "rewards/rejected": -4.45292329788208, + "step": 12831 + }, + { + "epoch": 3.21, + "grad_norm": 3.6425724029541016, + "learning_rate": 2.841564064594041e-06, + "logits/chosen": -0.5332770347595215, + "logits/rejected": -0.6196878552436829, + "logps/chosen": -45.71796798706055, + "logps/rejected": -123.36441802978516, + "loss": 0.5479, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2054100036621094, + "rewards/margins": 8.291828155517578, + "rewards/rejected": -5.086418151855469, + "step": 12832 + }, + { + "epoch": 3.21, + "grad_norm": 12.83217716217041, + "learning_rate": 2.8408551120275807e-06, + "logits/chosen": -0.4751516878604889, + "logits/rejected": -0.5537294745445251, + "logps/chosen": -48.22269821166992, + "logps/rejected": -103.267822265625, + "loss": 0.6353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9018495082855225, + "rewards/margins": 6.63714599609375, + "rewards/rejected": -3.7352962493896484, + "step": 12833 + }, + { + "epoch": 3.21, + "grad_norm": 5.6032538414001465, + "learning_rate": 2.8401462128158884e-06, + "logits/chosen": -0.39854565262794495, + "logits/rejected": -0.5207605361938477, + "logps/chosen": -64.73270416259766, + "logps/rejected": -119.28097534179688, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9834771156311035, + "rewards/margins": 6.743993282318115, + "rewards/rejected": -3.760516405105591, + "step": 12834 + }, + { + "epoch": 3.21, + "grad_norm": 6.613924980163574, + "learning_rate": 2.8394373669764796e-06, + "logits/chosen": -0.5779014825820923, + "logits/rejected": -0.6738929748535156, + "logps/chosen": -63.96935272216797, + "logps/rejected": -87.86080932617188, + "loss": 0.8271, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.884183645248413, + "rewards/margins": 5.819903373718262, + "rewards/rejected": -2.9357199668884277, + "step": 12835 + }, + { + "epoch": 3.21, + "grad_norm": 4.1823835372924805, + "learning_rate": 2.8387285745268733e-06, + "logits/chosen": -0.4440593123435974, + "logits/rejected": -0.5304729342460632, + "logps/chosen": -58.97991180419922, + "logps/rejected": -118.39520263671875, + "loss": 0.5712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.026313543319702, + "rewards/margins": 6.927712440490723, + "rewards/rejected": -3.9013991355895996, + "step": 12836 + }, + { + "epoch": 3.21, + "grad_norm": 7.576849460601807, + "learning_rate": 2.8380198354845823e-06, + "logits/chosen": -0.5515722632408142, + "logits/rejected": -0.682182252407074, + "logps/chosen": -60.3802490234375, + "logps/rejected": -93.21298217773438, + "loss": 0.7507, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1528048515319824, + "rewards/margins": 5.955194473266602, + "rewards/rejected": -2.8023900985717773, + "step": 12837 + }, + { + "epoch": 3.21, + "grad_norm": 7.798500061035156, + "learning_rate": 2.837311149867119e-06, + "logits/chosen": -0.5143730640411377, + "logits/rejected": -0.5812942385673523, + "logps/chosen": -47.00701141357422, + "logps/rejected": -109.70565032958984, + "loss": 0.6246, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.131143808364868, + "rewards/margins": 6.343572616577148, + "rewards/rejected": -3.2124295234680176, + "step": 12838 + }, + { + "epoch": 3.21, + "grad_norm": 2.6862447261810303, + "learning_rate": 2.836602517692e-06, + "logits/chosen": -0.5031048655509949, + "logits/rejected": -0.6172103881835938, + "logps/chosen": -52.51299285888672, + "logps/rejected": -99.33009338378906, + "loss": 0.6288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.49025821685791, + "rewards/margins": 7.934695243835449, + "rewards/rejected": -4.444437026977539, + "step": 12839 + }, + { + "epoch": 3.21, + "grad_norm": 4.732133865356445, + "learning_rate": 2.8358939389767337e-06, + "logits/chosen": -0.5401359796524048, + "logits/rejected": -0.6454771757125854, + "logps/chosen": -60.4384765625, + "logps/rejected": -80.72434997558594, + "loss": 0.6346, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.116370916366577, + "rewards/margins": 6.411433219909668, + "rewards/rejected": -3.2950620651245117, + "step": 12840 + }, + { + "epoch": 3.21, + "grad_norm": 9.977131843566895, + "learning_rate": 2.8351854137388273e-06, + "logits/chosen": -0.5261102914810181, + "logits/rejected": -0.5748342275619507, + "logps/chosen": -50.65803909301758, + "logps/rejected": -101.04442596435547, + "loss": 0.5984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9417710304260254, + "rewards/margins": 6.030802249908447, + "rewards/rejected": -3.089031457901001, + "step": 12841 + }, + { + "epoch": 3.21, + "grad_norm": 4.5361175537109375, + "learning_rate": 2.834476941995793e-06, + "logits/chosen": -0.47618168592453003, + "logits/rejected": -0.5316419005393982, + "logps/chosen": -55.23634719848633, + "logps/rejected": -94.87273406982422, + "loss": 0.6622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.920837640762329, + "rewards/margins": 6.00967264175415, + "rewards/rejected": -3.088834762573242, + "step": 12842 + }, + { + "epoch": 3.21, + "grad_norm": 5.181975364685059, + "learning_rate": 2.8337685237651403e-06, + "logits/chosen": -0.5806556940078735, + "logits/rejected": -0.6007048487663269, + "logps/chosen": -56.13410186767578, + "logps/rejected": -115.3880844116211, + "loss": 0.6943, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1960039138793945, + "rewards/margins": 6.1980695724487305, + "rewards/rejected": -3.002065420150757, + "step": 12843 + }, + { + "epoch": 3.21, + "grad_norm": 3.477374315261841, + "learning_rate": 2.833060159064367e-06, + "logits/chosen": -0.5651119351387024, + "logits/rejected": -0.6436938047409058, + "logps/chosen": -55.652015686035156, + "logps/rejected": -113.2196044921875, + "loss": 0.589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9765501022338867, + "rewards/margins": 7.219968795776367, + "rewards/rejected": -4.2434186935424805, + "step": 12844 + }, + { + "epoch": 3.21, + "grad_norm": 2.6137685775756836, + "learning_rate": 2.8323518479109824e-06, + "logits/chosen": -0.4660398066043854, + "logits/rejected": -0.5410377383232117, + "logps/chosen": -52.669803619384766, + "logps/rejected": -94.383544921875, + "loss": 0.5186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1638572216033936, + "rewards/margins": 6.314403533935547, + "rewards/rejected": -3.1505465507507324, + "step": 12845 + }, + { + "epoch": 3.21, + "grad_norm": 5.3541460037231445, + "learning_rate": 2.8316435903224913e-06, + "logits/chosen": -0.5033531188964844, + "logits/rejected": -0.5748974084854126, + "logps/chosen": -53.45429611206055, + "logps/rejected": -95.72105407714844, + "loss": 0.709, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.301339626312256, + "rewards/margins": 5.4634504318237305, + "rewards/rejected": -2.1621105670928955, + "step": 12846 + }, + { + "epoch": 3.21, + "grad_norm": 4.493139743804932, + "learning_rate": 2.8309353863163936e-06, + "logits/chosen": -0.49652355909347534, + "logits/rejected": -0.6054850816726685, + "logps/chosen": -59.533573150634766, + "logps/rejected": -108.24958801269531, + "loss": 0.7551, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.015894651412964, + "rewards/margins": 6.48820686340332, + "rewards/rejected": -3.472311496734619, + "step": 12847 + }, + { + "epoch": 3.21, + "grad_norm": 6.192129611968994, + "learning_rate": 2.830227235910187e-06, + "logits/chosen": -0.5908347368240356, + "logits/rejected": -0.6457794904708862, + "logps/chosen": -48.663795471191406, + "logps/rejected": -95.62632751464844, + "loss": 0.5722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5151472091674805, + "rewards/margins": 6.834809303283691, + "rewards/rejected": -3.31966233253479, + "step": 12848 + }, + { + "epoch": 3.21, + "grad_norm": 4.632768630981445, + "learning_rate": 2.8295191391213763e-06, + "logits/chosen": -0.48483315110206604, + "logits/rejected": -0.4817638695240021, + "logps/chosen": -48.55195236206055, + "logps/rejected": -111.18478393554688, + "loss": 0.6768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.297792434692383, + "rewards/margins": 5.77160120010376, + "rewards/rejected": -2.473809242248535, + "step": 12849 + }, + { + "epoch": 3.21, + "grad_norm": 1.630346417427063, + "learning_rate": 2.8288110959674552e-06, + "logits/chosen": -0.48709142208099365, + "logits/rejected": -0.534487247467041, + "logps/chosen": -44.65927505493164, + "logps/rejected": -106.45982360839844, + "loss": 0.4785, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1149821281433105, + "rewards/margins": 8.245389938354492, + "rewards/rejected": -5.130407333374023, + "step": 12850 + }, + { + "epoch": 3.21, + "grad_norm": 6.3420257568359375, + "learning_rate": 2.8281031064659203e-06, + "logits/chosen": -0.5214785933494568, + "logits/rejected": -0.6222511529922485, + "logps/chosen": -54.91414260864258, + "logps/rejected": -94.75761413574219, + "loss": 0.6758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.463660717010498, + "rewards/margins": 5.636252403259277, + "rewards/rejected": -2.1725916862487793, + "step": 12851 + }, + { + "epoch": 3.22, + "grad_norm": 4.978094100952148, + "learning_rate": 2.8273951706342696e-06, + "logits/chosen": -0.5504168272018433, + "logits/rejected": -0.5929344296455383, + "logps/chosen": -54.4146614074707, + "logps/rejected": -89.9625015258789, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.019756317138672, + "rewards/margins": 5.888289928436279, + "rewards/rejected": -2.8685338497161865, + "step": 12852 + }, + { + "epoch": 3.22, + "grad_norm": 3.919389009475708, + "learning_rate": 2.826687288489992e-06, + "logits/chosen": -0.5430510640144348, + "logits/rejected": -0.6133426427841187, + "logps/chosen": -62.078651428222656, + "logps/rejected": -99.02487182617188, + "loss": 0.6611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135039806365967, + "rewards/margins": 7.1361260414123535, + "rewards/rejected": -4.001086235046387, + "step": 12853 + }, + { + "epoch": 3.22, + "grad_norm": 3.4210243225097656, + "learning_rate": 2.8259794600505862e-06, + "logits/chosen": -0.49783432483673096, + "logits/rejected": -0.5703161954879761, + "logps/chosen": -67.22688293457031, + "logps/rejected": -93.92023468017578, + "loss": 0.6773, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0307888984680176, + "rewards/margins": 5.929118633270264, + "rewards/rejected": -2.898329496383667, + "step": 12854 + }, + { + "epoch": 3.22, + "grad_norm": 4.08492374420166, + "learning_rate": 2.8252716853335404e-06, + "logits/chosen": -0.520843505859375, + "logits/rejected": -0.5935829281806946, + "logps/chosen": -50.01115798950195, + "logps/rejected": -97.78530883789062, + "loss": 0.6936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.333448886871338, + "rewards/margins": 7.201999664306641, + "rewards/rejected": -3.8685500621795654, + "step": 12855 + }, + { + "epoch": 3.22, + "grad_norm": 3.768571376800537, + "learning_rate": 2.8245639643563423e-06, + "logits/chosen": -0.4633947014808655, + "logits/rejected": -0.5405386686325073, + "logps/chosen": -46.655029296875, + "logps/rejected": -104.63047790527344, + "loss": 0.5993, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.092008590698242, + "rewards/margins": 7.1977219581604, + "rewards/rejected": -4.105712890625, + "step": 12856 + }, + { + "epoch": 3.22, + "grad_norm": 9.875382423400879, + "learning_rate": 2.823856297136484e-06, + "logits/chosen": -0.5545986890792847, + "logits/rejected": -0.6031876802444458, + "logps/chosen": -62.724884033203125, + "logps/rejected": -92.23478698730469, + "loss": 1.0202, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1089894771575928, + "rewards/margins": 4.727542877197266, + "rewards/rejected": -1.6185531616210938, + "step": 12857 + }, + { + "epoch": 3.22, + "grad_norm": 5.251866817474365, + "learning_rate": 2.82314868369145e-06, + "logits/chosen": -0.48997753858566284, + "logits/rejected": -0.5816514492034912, + "logps/chosen": -50.81495666503906, + "logps/rejected": -111.59126281738281, + "loss": 0.5886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2238240242004395, + "rewards/margins": 7.675381660461426, + "rewards/rejected": -4.451557636260986, + "step": 12858 + }, + { + "epoch": 3.22, + "grad_norm": 4.728315353393555, + "learning_rate": 2.8224411240387284e-06, + "logits/chosen": -0.492252379655838, + "logits/rejected": -0.5595332980155945, + "logps/chosen": -62.780723571777344, + "logps/rejected": -90.288818359375, + "loss": 0.6806, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904662847518921, + "rewards/margins": 5.574041366577148, + "rewards/rejected": -2.6693785190582275, + "step": 12859 + }, + { + "epoch": 3.22, + "grad_norm": 3.4555673599243164, + "learning_rate": 2.821733618195802e-06, + "logits/chosen": -0.47059306502342224, + "logits/rejected": -0.5223467350006104, + "logps/chosen": -54.692161560058594, + "logps/rejected": -116.95046997070312, + "loss": 0.5511, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7628262042999268, + "rewards/margins": 7.228395462036133, + "rewards/rejected": -4.465569019317627, + "step": 12860 + }, + { + "epoch": 3.22, + "grad_norm": 10.449211120605469, + "learning_rate": 2.8210261661801563e-06, + "logits/chosen": -0.46888619661331177, + "logits/rejected": -0.5802870988845825, + "logps/chosen": -52.038970947265625, + "logps/rejected": -95.9931411743164, + "loss": 0.6723, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7795894145965576, + "rewards/margins": 6.087607383728027, + "rewards/rejected": -3.308018207550049, + "step": 12861 + }, + { + "epoch": 3.22, + "grad_norm": 20.012571334838867, + "learning_rate": 2.8203187680092714e-06, + "logits/chosen": -0.5309053063392639, + "logits/rejected": -0.6096562743186951, + "logps/chosen": -58.219356536865234, + "logps/rejected": -101.53959655761719, + "loss": 0.6776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.974532127380371, + "rewards/margins": 5.794616222381592, + "rewards/rejected": -2.8200840950012207, + "step": 12862 + }, + { + "epoch": 3.22, + "grad_norm": 3.2310597896575928, + "learning_rate": 2.819611423700626e-06, + "logits/chosen": -0.558465838432312, + "logits/rejected": -0.5952874422073364, + "logps/chosen": -60.8724365234375, + "logps/rejected": -101.39665985107422, + "loss": 0.6366, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.244415521621704, + "rewards/margins": 6.025293827056885, + "rewards/rejected": -2.7808780670166016, + "step": 12863 + }, + { + "epoch": 3.22, + "grad_norm": 3.2396724224090576, + "learning_rate": 2.818904133271704e-06, + "logits/chosen": -0.47007063031196594, + "logits/rejected": -0.5905106067657471, + "logps/chosen": -74.49162292480469, + "logps/rejected": -99.10382080078125, + "loss": 0.6141, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.90169095993042, + "rewards/margins": 6.706600666046143, + "rewards/rejected": -3.8049097061157227, + "step": 12864 + }, + { + "epoch": 3.22, + "grad_norm": 4.724110126495361, + "learning_rate": 2.818196896739981e-06, + "logits/chosen": -0.5453615784645081, + "logits/rejected": -0.5952086448669434, + "logps/chosen": -49.6264762878418, + "logps/rejected": -85.86920928955078, + "loss": 0.5643, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2992095947265625, + "rewards/margins": 6.092014312744141, + "rewards/rejected": -2.792804718017578, + "step": 12865 + }, + { + "epoch": 3.22, + "grad_norm": 10.744696617126465, + "learning_rate": 2.8174897141229306e-06, + "logits/chosen": -0.439675897359848, + "logits/rejected": -0.5201088786125183, + "logps/chosen": -62.41807556152344, + "logps/rejected": -102.72242736816406, + "loss": 0.6609, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.220449924468994, + "rewards/margins": 6.022128105163574, + "rewards/rejected": -2.801677942276001, + "step": 12866 + }, + { + "epoch": 3.22, + "grad_norm": 3.5192182064056396, + "learning_rate": 2.816782585438033e-06, + "logits/chosen": -0.46845191717147827, + "logits/rejected": -0.5762009620666504, + "logps/chosen": -62.53364562988281, + "logps/rejected": -103.56668090820312, + "loss": 0.6316, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4682345390319824, + "rewards/margins": 6.32758903503418, + "rewards/rejected": -2.8593547344207764, + "step": 12867 + }, + { + "epoch": 3.22, + "grad_norm": 10.34900188446045, + "learning_rate": 2.8160755107027603e-06, + "logits/chosen": -0.5986267924308777, + "logits/rejected": -0.6643527746200562, + "logps/chosen": -52.3741455078125, + "logps/rejected": -96.46260833740234, + "loss": 0.6693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2724552154541016, + "rewards/margins": 6.46441650390625, + "rewards/rejected": -3.1919612884521484, + "step": 12868 + }, + { + "epoch": 3.22, + "grad_norm": 6.159423828125, + "learning_rate": 2.8153684899345824e-06, + "logits/chosen": -0.504073977470398, + "logits/rejected": -0.6261250376701355, + "logps/chosen": -54.7886848449707, + "logps/rejected": -90.08985137939453, + "loss": 0.5652, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8523733615875244, + "rewards/margins": 6.897039413452148, + "rewards/rejected": -4.044666290283203, + "step": 12869 + }, + { + "epoch": 3.22, + "grad_norm": 4.655476093292236, + "learning_rate": 2.814661523150973e-06, + "logits/chosen": -0.5534953474998474, + "logits/rejected": -0.6187174320220947, + "logps/chosen": -60.374122619628906, + "logps/rejected": -100.91730499267578, + "loss": 0.6526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984919786453247, + "rewards/margins": 5.653317451477051, + "rewards/rejected": -2.6683974266052246, + "step": 12870 + }, + { + "epoch": 3.22, + "grad_norm": 5.0508294105529785, + "learning_rate": 2.8139546103694054e-06, + "logits/chosen": -0.65302973985672, + "logits/rejected": -0.7480113506317139, + "logps/chosen": -56.84381103515625, + "logps/rejected": -111.25180053710938, + "loss": 0.6642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0588927268981934, + "rewards/margins": 8.262121200561523, + "rewards/rejected": -5.203228950500488, + "step": 12871 + }, + { + "epoch": 3.22, + "grad_norm": 4.324860572814941, + "learning_rate": 2.813247751607341e-06, + "logits/chosen": -0.5269644260406494, + "logits/rejected": -0.5961424708366394, + "logps/chosen": -51.14994430541992, + "logps/rejected": -99.0462646484375, + "loss": 0.6017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1363062858581543, + "rewards/margins": 6.91015625, + "rewards/rejected": -3.7738499641418457, + "step": 12872 + }, + { + "epoch": 3.22, + "grad_norm": 3.0324814319610596, + "learning_rate": 2.8125409468822503e-06, + "logits/chosen": -0.46664565801620483, + "logits/rejected": -0.5717421770095825, + "logps/chosen": -58.71638870239258, + "logps/rejected": -91.41142272949219, + "loss": 0.575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.122542142868042, + "rewards/margins": 5.9923319816589355, + "rewards/rejected": -2.869788885116577, + "step": 12873 + }, + { + "epoch": 3.22, + "grad_norm": 7.220216751098633, + "learning_rate": 2.8118341962116014e-06, + "logits/chosen": -0.6126777529716492, + "logits/rejected": -0.7352651953697205, + "logps/chosen": -55.31476974487305, + "logps/rejected": -87.81502532958984, + "loss": 0.7873, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.949568033218384, + "rewards/margins": 5.838076114654541, + "rewards/rejected": -2.888507604598999, + "step": 12874 + }, + { + "epoch": 3.22, + "grad_norm": 3.8002567291259766, + "learning_rate": 2.8111274996128567e-06, + "logits/chosen": -0.5288869142532349, + "logits/rejected": -0.5754378437995911, + "logps/chosen": -46.52156448364258, + "logps/rejected": -135.20431518554688, + "loss": 0.5596, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1309151649475098, + "rewards/margins": 7.901113033294678, + "rewards/rejected": -4.770197868347168, + "step": 12875 + }, + { + "epoch": 3.22, + "grad_norm": 4.236581325531006, + "learning_rate": 2.8104208571034786e-06, + "logits/chosen": -0.587683379650116, + "logits/rejected": -0.6874743103981018, + "logps/chosen": -48.63232421875, + "logps/rejected": -98.79647064208984, + "loss": 0.6198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4109902381896973, + "rewards/margins": 6.528609275817871, + "rewards/rejected": -3.1176187992095947, + "step": 12876 + }, + { + "epoch": 3.22, + "grad_norm": 3.119537830352783, + "learning_rate": 2.8097142687009317e-06, + "logits/chosen": -0.5520840883255005, + "logits/rejected": -0.6695866584777832, + "logps/chosen": -49.97265625, + "logps/rejected": -94.008056640625, + "loss": 0.5916, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.971816062927246, + "rewards/margins": 7.000000953674316, + "rewards/rejected": -4.02818489074707, + "step": 12877 + }, + { + "epoch": 3.22, + "grad_norm": 15.129586219787598, + "learning_rate": 2.8090077344226724e-06, + "logits/chosen": -0.5265238881111145, + "logits/rejected": -0.5845492482185364, + "logps/chosen": -61.840660095214844, + "logps/rejected": -106.98454284667969, + "loss": 0.6346, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0601444244384766, + "rewards/margins": 6.160390377044678, + "rewards/rejected": -3.100245952606201, + "step": 12878 + }, + { + "epoch": 3.22, + "grad_norm": 2.967650890350342, + "learning_rate": 2.808301254286164e-06, + "logits/chosen": -0.5529966354370117, + "logits/rejected": -0.6132990717887878, + "logps/chosen": -50.51003646850586, + "logps/rejected": -95.93013000488281, + "loss": 0.6548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2256217002868652, + "rewards/margins": 6.475268840789795, + "rewards/rejected": -3.2496466636657715, + "step": 12879 + }, + { + "epoch": 3.22, + "grad_norm": 5.112815856933594, + "learning_rate": 2.8075948283088637e-06, + "logits/chosen": -0.4820205569267273, + "logits/rejected": -0.5607267618179321, + "logps/chosen": -56.58057403564453, + "logps/rejected": -92.29388427734375, + "loss": 0.662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7641525268554688, + "rewards/margins": 5.853673458099365, + "rewards/rejected": -3.0895204544067383, + "step": 12880 + }, + { + "epoch": 3.22, + "grad_norm": 5.24506139755249, + "learning_rate": 2.8068884565082255e-06, + "logits/chosen": -0.4149891138076782, + "logits/rejected": -0.4443260133266449, + "logps/chosen": -56.30707550048828, + "logps/rejected": -105.71918487548828, + "loss": 0.7518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8794686794281006, + "rewards/margins": 5.522161483764648, + "rewards/rejected": -2.6426925659179688, + "step": 12881 + }, + { + "epoch": 3.22, + "grad_norm": 8.786029815673828, + "learning_rate": 2.8061821389017074e-06, + "logits/chosen": -0.5284173488616943, + "logits/rejected": -0.6144473552703857, + "logps/chosen": -54.59169006347656, + "logps/rejected": -113.8677978515625, + "loss": 0.6308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7880449295043945, + "rewards/margins": 6.773366928100586, + "rewards/rejected": -3.9853219985961914, + "step": 12882 + }, + { + "epoch": 3.22, + "grad_norm": 2.837005376815796, + "learning_rate": 2.8054758755067624e-06, + "logits/chosen": -0.541813850402832, + "logits/rejected": -0.5968464612960815, + "logps/chosen": -53.511024475097656, + "logps/rejected": -100.88041687011719, + "loss": 0.6241, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.06775164604187, + "rewards/margins": 6.326942443847656, + "rewards/rejected": -3.259190559387207, + "step": 12883 + }, + { + "epoch": 3.22, + "grad_norm": 8.7910737991333, + "learning_rate": 2.8047696663408407e-06, + "logits/chosen": -0.5257956981658936, + "logits/rejected": -0.5658844113349915, + "logps/chosen": -50.12727355957031, + "logps/rejected": -100.99710845947266, + "loss": 0.6881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1986143589019775, + "rewards/margins": 5.9066996574401855, + "rewards/rejected": -2.708085060119629, + "step": 12884 + }, + { + "epoch": 3.22, + "grad_norm": 4.741023063659668, + "learning_rate": 2.804063511421396e-06, + "logits/chosen": -0.5073720216751099, + "logits/rejected": -0.5982058048248291, + "logps/chosen": -53.21632385253906, + "logps/rejected": -108.94700622558594, + "loss": 0.6061, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.069673538208008, + "rewards/margins": 6.36139440536499, + "rewards/rejected": -3.2917206287384033, + "step": 12885 + }, + { + "epoch": 3.22, + "grad_norm": 4.3886003494262695, + "learning_rate": 2.8033574107658813e-06, + "logits/chosen": -0.5749348998069763, + "logits/rejected": -0.6286670565605164, + "logps/chosen": -41.27165222167969, + "logps/rejected": -122.56861877441406, + "loss": 0.5362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0732474327087402, + "rewards/margins": 7.863224029541016, + "rewards/rejected": -4.789976119995117, + "step": 12886 + }, + { + "epoch": 3.22, + "grad_norm": 5.081981182098389, + "learning_rate": 2.8026513643917375e-06, + "logits/chosen": -0.5619573593139648, + "logits/rejected": -0.6209110021591187, + "logps/chosen": -57.068538665771484, + "logps/rejected": -110.24431610107422, + "loss": 0.6806, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0419161319732666, + "rewards/margins": 5.811463832855225, + "rewards/rejected": -2.7695472240448, + "step": 12887 + }, + { + "epoch": 3.22, + "grad_norm": 3.085632085800171, + "learning_rate": 2.8019453723164147e-06, + "logits/chosen": -0.5429692268371582, + "logits/rejected": -0.6491805911064148, + "logps/chosen": -52.24890899658203, + "logps/rejected": -105.22396850585938, + "loss": 0.5526, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.993478298187256, + "rewards/margins": 6.949492931365967, + "rewards/rejected": -3.956015110015869, + "step": 12888 + }, + { + "epoch": 3.22, + "grad_norm": 4.519928932189941, + "learning_rate": 2.801239434557363e-06, + "logits/chosen": -0.4530520439147949, + "logits/rejected": -0.5824060440063477, + "logps/chosen": -64.3622817993164, + "logps/rejected": -106.85639953613281, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4169552326202393, + "rewards/margins": 7.524756908416748, + "rewards/rejected": -5.10780143737793, + "step": 12889 + }, + { + "epoch": 3.22, + "grad_norm": 6.83467960357666, + "learning_rate": 2.800533551132023e-06, + "logits/chosen": -0.520465612411499, + "logits/rejected": -0.5840564370155334, + "logps/chosen": -57.22365188598633, + "logps/rejected": -115.81009674072266, + "loss": 0.6834, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9459011554718018, + "rewards/margins": 6.903388500213623, + "rewards/rejected": -3.957486629486084, + "step": 12890 + }, + { + "epoch": 3.22, + "grad_norm": 5.577213287353516, + "learning_rate": 2.799827722057836e-06, + "logits/chosen": -0.510249674320221, + "logits/rejected": -0.5886551737785339, + "logps/chosen": -58.705039978027344, + "logps/rejected": -96.67868041992188, + "loss": 0.709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0136196613311768, + "rewards/margins": 6.188326835632324, + "rewards/rejected": -3.17470645904541, + "step": 12891 + }, + { + "epoch": 3.23, + "grad_norm": 2.76941180229187, + "learning_rate": 2.799121947352249e-06, + "logits/chosen": -0.5516196489334106, + "logits/rejected": -0.6448867917060852, + "logps/chosen": -56.827857971191406, + "logps/rejected": -97.35755157470703, + "loss": 0.6235, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225451707839966, + "rewards/margins": 6.963933944702148, + "rewards/rejected": -3.7384824752807617, + "step": 12892 + }, + { + "epoch": 3.23, + "grad_norm": 4.297443389892578, + "learning_rate": 2.7984162270327e-06, + "logits/chosen": -0.5166265964508057, + "logits/rejected": -0.6203041672706604, + "logps/chosen": -69.43057250976562, + "logps/rejected": -100.2990493774414, + "loss": 0.6002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.666461944580078, + "rewards/margins": 6.687333106994629, + "rewards/rejected": -4.020871162414551, + "step": 12893 + }, + { + "epoch": 3.23, + "grad_norm": 4.688154220581055, + "learning_rate": 2.7977105611166246e-06, + "logits/chosen": -0.5231669545173645, + "logits/rejected": -0.6592455506324768, + "logps/chosen": -62.868408203125, + "logps/rejected": -92.18669891357422, + "loss": 0.5992, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9594576358795166, + "rewards/margins": 7.249508857727051, + "rewards/rejected": -4.290051460266113, + "step": 12894 + }, + { + "epoch": 3.23, + "grad_norm": 8.177627563476562, + "learning_rate": 2.7970049496214657e-06, + "logits/chosen": -0.5373345017433167, + "logits/rejected": -0.6030423641204834, + "logps/chosen": -57.81466293334961, + "logps/rejected": -88.59400939941406, + "loss": 0.6883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994792938232422, + "rewards/margins": 5.630627155303955, + "rewards/rejected": -2.635833978652954, + "step": 12895 + }, + { + "epoch": 3.23, + "grad_norm": 27.840431213378906, + "learning_rate": 2.7962993925646586e-06, + "logits/chosen": -0.6612235307693481, + "logits/rejected": -0.7333124279975891, + "logps/chosen": -49.048057556152344, + "logps/rejected": -93.32579803466797, + "loss": 0.7919, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8729045391082764, + "rewards/margins": 6.596270561218262, + "rewards/rejected": -3.723365545272827, + "step": 12896 + }, + { + "epoch": 3.23, + "grad_norm": 4.353634357452393, + "learning_rate": 2.795593889963635e-06, + "logits/chosen": -0.5654581189155579, + "logits/rejected": -0.6313586235046387, + "logps/chosen": -46.76817321777344, + "logps/rejected": -105.2158432006836, + "loss": 0.6772, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2256712913513184, + "rewards/margins": 6.741220474243164, + "rewards/rejected": -3.5155491828918457, + "step": 12897 + }, + { + "epoch": 3.23, + "grad_norm": 4.7143168449401855, + "learning_rate": 2.794888441835833e-06, + "logits/chosen": -0.465739369392395, + "logits/rejected": -0.5510708093643188, + "logps/chosen": -55.054603576660156, + "logps/rejected": -98.53813171386719, + "loss": 0.5854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7357945442199707, + "rewards/margins": 6.817535400390625, + "rewards/rejected": -4.081740379333496, + "step": 12898 + }, + { + "epoch": 3.23, + "grad_norm": 10.085434913635254, + "learning_rate": 2.794183048198681e-06, + "logits/chosen": -0.4853125512599945, + "logits/rejected": -0.5494725108146667, + "logps/chosen": -52.40774154663086, + "logps/rejected": -114.49480438232422, + "loss": 0.6028, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6750059127807617, + "rewards/margins": 6.824454307556152, + "rewards/rejected": -4.149447917938232, + "step": 12899 + }, + { + "epoch": 3.23, + "grad_norm": 4.634737014770508, + "learning_rate": 2.793477709069614e-06, + "logits/chosen": -0.5360125303268433, + "logits/rejected": -0.5707550644874573, + "logps/chosen": -59.186546325683594, + "logps/rejected": -124.6883773803711, + "loss": 0.6054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9911558628082275, + "rewards/margins": 8.009946823120117, + "rewards/rejected": -5.0187907218933105, + "step": 12900 + }, + { + "epoch": 3.23, + "grad_norm": 9.47042465209961, + "learning_rate": 2.792772424466059e-06, + "logits/chosen": -0.5994583964347839, + "logits/rejected": -0.6436313390731812, + "logps/chosen": -44.364620208740234, + "logps/rejected": -98.19475555419922, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9600279331207275, + "rewards/margins": 5.862112998962402, + "rewards/rejected": -2.902085304260254, + "step": 12901 + }, + { + "epoch": 3.23, + "grad_norm": 3.1679680347442627, + "learning_rate": 2.7920671944054456e-06, + "logits/chosen": -0.4985525608062744, + "logits/rejected": -0.6322858333587646, + "logps/chosen": -59.85273742675781, + "logps/rejected": -87.97506713867188, + "loss": 0.6041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118405342102051, + "rewards/margins": 7.190289497375488, + "rewards/rejected": -4.071883678436279, + "step": 12902 + }, + { + "epoch": 3.23, + "grad_norm": 7.659755706787109, + "learning_rate": 2.7913620189051993e-06, + "logits/chosen": -0.519757866859436, + "logits/rejected": -0.530422031879425, + "logps/chosen": -58.88488006591797, + "logps/rejected": -113.36528015136719, + "loss": 0.6763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.740964651107788, + "rewards/margins": 6.193016052246094, + "rewards/rejected": -3.4520514011383057, + "step": 12903 + }, + { + "epoch": 3.23, + "grad_norm": 18.146026611328125, + "learning_rate": 2.7906568979827476e-06, + "logits/chosen": -0.491252601146698, + "logits/rejected": -0.5374661684036255, + "logps/chosen": -54.052066802978516, + "logps/rejected": -126.46228790283203, + "loss": 0.625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0557172298431396, + "rewards/margins": 7.6585540771484375, + "rewards/rejected": -4.6028361320495605, + "step": 12904 + }, + { + "epoch": 3.23, + "grad_norm": 4.400544166564941, + "learning_rate": 2.7899518316555152e-06, + "logits/chosen": -0.5681909322738647, + "logits/rejected": -0.6695345640182495, + "logps/chosen": -68.4263916015625, + "logps/rejected": -117.39224243164062, + "loss": 0.6887, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8790745735168457, + "rewards/margins": 8.312957763671875, + "rewards/rejected": -5.433883190155029, + "step": 12905 + }, + { + "epoch": 3.23, + "grad_norm": 10.136651992797852, + "learning_rate": 2.7892468199409206e-06, + "logits/chosen": -0.5472496151924133, + "logits/rejected": -0.6182045340538025, + "logps/chosen": -48.14949417114258, + "logps/rejected": -109.1511459350586, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.996453046798706, + "rewards/margins": 7.117219924926758, + "rewards/rejected": -4.120767116546631, + "step": 12906 + }, + { + "epoch": 3.23, + "grad_norm": 6.122653961181641, + "learning_rate": 2.788541862856391e-06, + "logits/chosen": -0.4440952241420746, + "logits/rejected": -0.5204282999038696, + "logps/chosen": -62.1380500793457, + "logps/rejected": -115.6929702758789, + "loss": 0.6415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.840333938598633, + "rewards/margins": 6.6714067459106445, + "rewards/rejected": -3.831073045730591, + "step": 12907 + }, + { + "epoch": 3.23, + "grad_norm": 9.547420501708984, + "learning_rate": 2.7878369604193445e-06, + "logits/chosen": -0.5377832651138306, + "logits/rejected": -0.5705454349517822, + "logps/chosen": -66.79846954345703, + "logps/rejected": -101.656494140625, + "loss": 0.7441, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.213167428970337, + "rewards/margins": 6.003067970275879, + "rewards/rejected": -2.789900302886963, + "step": 12908 + }, + { + "epoch": 3.23, + "grad_norm": 4.247176647186279, + "learning_rate": 2.787132112647197e-06, + "logits/chosen": -0.5879150629043579, + "logits/rejected": -0.6094440817832947, + "logps/chosen": -52.423702239990234, + "logps/rejected": -102.8312759399414, + "loss": 0.6084, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.825362205505371, + "rewards/margins": 5.808908939361572, + "rewards/rejected": -2.983546733856201, + "step": 12909 + }, + { + "epoch": 3.23, + "grad_norm": 21.084768295288086, + "learning_rate": 2.7864273195573716e-06, + "logits/chosen": -0.6214969158172607, + "logits/rejected": -0.7029333114624023, + "logps/chosen": -50.246402740478516, + "logps/rejected": -108.05072784423828, + "loss": 0.6297, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1233415603637695, + "rewards/margins": 7.570407867431641, + "rewards/rejected": -4.447065830230713, + "step": 12910 + }, + { + "epoch": 3.23, + "grad_norm": 3.684081792831421, + "learning_rate": 2.7857225811672813e-06, + "logits/chosen": -0.5167842507362366, + "logits/rejected": -0.5975496768951416, + "logps/chosen": -52.327613830566406, + "logps/rejected": -106.56202697753906, + "loss": 0.6155, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1504859924316406, + "rewards/margins": 6.493053913116455, + "rewards/rejected": -3.3425681591033936, + "step": 12911 + }, + { + "epoch": 3.23, + "grad_norm": 6.788602352142334, + "learning_rate": 2.7850178974943386e-06, + "logits/chosen": -0.4906383752822876, + "logits/rejected": -0.5706164836883545, + "logps/chosen": -56.17188262939453, + "logps/rejected": -87.61140441894531, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9368062019348145, + "rewards/margins": 5.425049781799316, + "rewards/rejected": -2.48824405670166, + "step": 12912 + }, + { + "epoch": 3.23, + "grad_norm": 3.722905158996582, + "learning_rate": 2.7843132685559603e-06, + "logits/chosen": -0.4972841441631317, + "logits/rejected": -0.5664486885070801, + "logps/chosen": -53.901641845703125, + "logps/rejected": -107.80267333984375, + "loss": 0.6383, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.847752332687378, + "rewards/margins": 7.115142822265625, + "rewards/rejected": -4.267390727996826, + "step": 12913 + }, + { + "epoch": 3.23, + "grad_norm": 4.112299919128418, + "learning_rate": 2.7836086943695616e-06, + "logits/chosen": -0.5813139081001282, + "logits/rejected": -0.6410138010978699, + "logps/chosen": -51.28111267089844, + "logps/rejected": -94.70991516113281, + "loss": 0.595, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8180556297302246, + "rewards/margins": 6.530676364898682, + "rewards/rejected": -3.7126212120056152, + "step": 12914 + }, + { + "epoch": 3.23, + "grad_norm": 4.142407417297363, + "learning_rate": 2.7829041749525455e-06, + "logits/chosen": -0.4766958951950073, + "logits/rejected": -0.4968581795692444, + "logps/chosen": -48.37586975097656, + "logps/rejected": -125.1935043334961, + "loss": 0.5474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0544776916503906, + "rewards/margins": 7.819156646728516, + "rewards/rejected": -4.764678955078125, + "step": 12915 + }, + { + "epoch": 3.23, + "grad_norm": 2.9621386528015137, + "learning_rate": 2.782199710322325e-06, + "logits/chosen": -0.42246124148368835, + "logits/rejected": -0.5652254223823547, + "logps/chosen": -67.05531311035156, + "logps/rejected": -108.61959838867188, + "loss": 0.5881, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1008827686309814, + "rewards/margins": 7.716647624969482, + "rewards/rejected": -4.61576509475708, + "step": 12916 + }, + { + "epoch": 3.23, + "grad_norm": 18.544818878173828, + "learning_rate": 2.781495300496312e-06, + "logits/chosen": -0.5195531845092773, + "logits/rejected": -0.6199700236320496, + "logps/chosen": -53.49086380004883, + "logps/rejected": -129.87576293945312, + "loss": 0.6121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9138095378875732, + "rewards/margins": 8.3837890625, + "rewards/rejected": -5.469979286193848, + "step": 12917 + }, + { + "epoch": 3.23, + "grad_norm": 3.1507580280303955, + "learning_rate": 2.7807909454919087e-06, + "logits/chosen": -0.5439352989196777, + "logits/rejected": -0.6047185063362122, + "logps/chosen": -48.15757751464844, + "logps/rejected": -111.14234924316406, + "loss": 0.5469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3706717491149902, + "rewards/margins": 7.3944010734558105, + "rewards/rejected": -4.02372932434082, + "step": 12918 + }, + { + "epoch": 3.23, + "grad_norm": 3.528773307800293, + "learning_rate": 2.7800866453265198e-06, + "logits/chosen": -0.5837237238883972, + "logits/rejected": -0.691478431224823, + "logps/chosen": -48.04270935058594, + "logps/rejected": -96.04531860351562, + "loss": 0.546, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4158201217651367, + "rewards/margins": 7.331830978393555, + "rewards/rejected": -3.916011095046997, + "step": 12919 + }, + { + "epoch": 3.23, + "grad_norm": 5.474126815795898, + "learning_rate": 2.7793824000175528e-06, + "logits/chosen": -0.5147576332092285, + "logits/rejected": -0.5943715572357178, + "logps/chosen": -53.311439514160156, + "logps/rejected": -104.7315902709961, + "loss": 0.6769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.62583327293396, + "rewards/margins": 6.325528144836426, + "rewards/rejected": -3.699695348739624, + "step": 12920 + }, + { + "epoch": 3.23, + "grad_norm": 4.566417694091797, + "learning_rate": 2.7786782095824095e-06, + "logits/chosen": -0.5143465995788574, + "logits/rejected": -0.5448311567306519, + "logps/chosen": -56.25959396362305, + "logps/rejected": -97.491455078125, + "loss": 0.671, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1079611778259277, + "rewards/margins": 5.120047569274902, + "rewards/rejected": -2.0120866298675537, + "step": 12921 + }, + { + "epoch": 3.23, + "grad_norm": 5.492070198059082, + "learning_rate": 2.7779740740384876e-06, + "logits/chosen": -0.5211275815963745, + "logits/rejected": -0.5788917541503906, + "logps/chosen": -61.523983001708984, + "logps/rejected": -110.719970703125, + "loss": 0.6884, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8721468448638916, + "rewards/margins": 6.11522102355957, + "rewards/rejected": -3.243074417114258, + "step": 12922 + }, + { + "epoch": 3.23, + "grad_norm": 7.054550647735596, + "learning_rate": 2.7772699934031915e-06, + "logits/chosen": -0.507106602191925, + "logits/rejected": -0.602906346321106, + "logps/chosen": -64.93077850341797, + "logps/rejected": -92.57123565673828, + "loss": 0.7116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8742363452911377, + "rewards/margins": 6.480956077575684, + "rewards/rejected": -3.606719732284546, + "step": 12923 + }, + { + "epoch": 3.23, + "grad_norm": 2.911445379257202, + "learning_rate": 2.7765659676939166e-06, + "logits/chosen": -0.5817723870277405, + "logits/rejected": -0.6882463693618774, + "logps/chosen": -52.20280838012695, + "logps/rejected": -92.44817352294922, + "loss": 0.5758, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.333940267562866, + "rewards/margins": 7.072169303894043, + "rewards/rejected": -3.738229751586914, + "step": 12924 + }, + { + "epoch": 3.23, + "grad_norm": 7.1590704917907715, + "learning_rate": 2.7758619969280637e-06, + "logits/chosen": -0.554969310760498, + "logits/rejected": -0.664031982421875, + "logps/chosen": -63.362754821777344, + "logps/rejected": -82.45709991455078, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1275248527526855, + "rewards/margins": 5.852245807647705, + "rewards/rejected": -2.7247211933135986, + "step": 12925 + }, + { + "epoch": 3.23, + "grad_norm": 3.191014051437378, + "learning_rate": 2.775158081123026e-06, + "logits/chosen": -0.6009135842323303, + "logits/rejected": -0.6937671899795532, + "logps/chosen": -53.54860305786133, + "logps/rejected": -101.54025268554688, + "loss": 0.6594, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.43237566947937, + "rewards/margins": 7.692145347595215, + "rewards/rejected": -4.259769916534424, + "step": 12926 + }, + { + "epoch": 3.23, + "grad_norm": 2.6069204807281494, + "learning_rate": 2.774454220296198e-06, + "logits/chosen": -0.5084186792373657, + "logits/rejected": -0.6034219861030579, + "logps/chosen": -45.91578674316406, + "logps/rejected": -90.63298034667969, + "loss": 0.5573, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3116536140441895, + "rewards/margins": 7.105741500854492, + "rewards/rejected": -3.7940874099731445, + "step": 12927 + }, + { + "epoch": 3.23, + "grad_norm": 8.015905380249023, + "learning_rate": 2.773750414464972e-06, + "logits/chosen": -0.5625075101852417, + "logits/rejected": -0.6144452095031738, + "logps/chosen": -45.892940521240234, + "logps/rejected": -98.15727233886719, + "loss": 0.6676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0900728702545166, + "rewards/margins": 6.202476501464844, + "rewards/rejected": -3.112403392791748, + "step": 12928 + }, + { + "epoch": 3.23, + "grad_norm": 4.479780673980713, + "learning_rate": 2.773046663646745e-06, + "logits/chosen": -0.47657835483551025, + "logits/rejected": -0.6163570880889893, + "logps/chosen": -52.99637985229492, + "logps/rejected": -88.64463806152344, + "loss": 0.683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4232802391052246, + "rewards/margins": 6.414026737213135, + "rewards/rejected": -2.99074649810791, + "step": 12929 + }, + { + "epoch": 3.23, + "grad_norm": 3.073837995529175, + "learning_rate": 2.7723429678589005e-06, + "logits/chosen": -0.5882747769355774, + "logits/rejected": -0.7032812237739563, + "logps/chosen": -59.00261306762695, + "logps/rejected": -98.87970733642578, + "loss": 0.6026, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9186179637908936, + "rewards/margins": 6.879215240478516, + "rewards/rejected": -3.9605977535247803, + "step": 12930 + }, + { + "epoch": 3.23, + "grad_norm": 4.777978897094727, + "learning_rate": 2.77163932711883e-06, + "logits/chosen": -0.5542286038398743, + "logits/rejected": -0.6301924586296082, + "logps/chosen": -49.752288818359375, + "logps/rejected": -106.95774841308594, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1023080348968506, + "rewards/margins": 7.0065717697143555, + "rewards/rejected": -3.904264450073242, + "step": 12931 + }, + { + "epoch": 3.24, + "grad_norm": 2.7488248348236084, + "learning_rate": 2.7709357414439265e-06, + "logits/chosen": -0.5252954959869385, + "logits/rejected": -0.628885805606842, + "logps/chosen": -55.21514892578125, + "logps/rejected": -118.5686264038086, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9037535190582275, + "rewards/margins": 7.51746940612793, + "rewards/rejected": -4.613716125488281, + "step": 12932 + }, + { + "epoch": 3.24, + "grad_norm": 6.12460470199585, + "learning_rate": 2.770232210851568e-06, + "logits/chosen": -0.583023726940155, + "logits/rejected": -0.6623183488845825, + "logps/chosen": -47.09801483154297, + "logps/rejected": -99.87129211425781, + "loss": 0.6088, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07462739944458, + "rewards/margins": 5.496962547302246, + "rewards/rejected": -2.422335147857666, + "step": 12933 + }, + { + "epoch": 3.24, + "grad_norm": 6.17919397354126, + "learning_rate": 2.769528735359143e-06, + "logits/chosen": -0.5067617297172546, + "logits/rejected": -0.5879145860671997, + "logps/chosen": -55.01833724975586, + "logps/rejected": -97.82209777832031, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.820343494415283, + "rewards/margins": 6.046474456787109, + "rewards/rejected": -3.226130962371826, + "step": 12934 + }, + { + "epoch": 3.24, + "grad_norm": 8.231029510498047, + "learning_rate": 2.7688253149840373e-06, + "logits/chosen": -0.4965555667877197, + "logits/rejected": -0.5554108023643494, + "logps/chosen": -50.92127227783203, + "logps/rejected": -94.11961364746094, + "loss": 0.6841, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2380847930908203, + "rewards/margins": 5.991487503051758, + "rewards/rejected": -2.7534027099609375, + "step": 12935 + }, + { + "epoch": 3.24, + "grad_norm": 3.9514386653900146, + "learning_rate": 2.7681219497436307e-06, + "logits/chosen": -0.5468946695327759, + "logits/rejected": -0.6500440239906311, + "logps/chosen": -49.772762298583984, + "logps/rejected": -100.5169677734375, + "loss": 0.657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.22540283203125, + "rewards/margins": 6.649633407592773, + "rewards/rejected": -3.4242305755615234, + "step": 12936 + }, + { + "epoch": 3.24, + "grad_norm": 5.57651948928833, + "learning_rate": 2.767418639655303e-06, + "logits/chosen": -0.4891357421875, + "logits/rejected": -0.5654054880142212, + "logps/chosen": -57.453773498535156, + "logps/rejected": -111.39463806152344, + "loss": 0.6324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9478793144226074, + "rewards/margins": 6.579241752624512, + "rewards/rejected": -3.6313624382019043, + "step": 12937 + }, + { + "epoch": 3.24, + "grad_norm": 17.79935073852539, + "learning_rate": 2.766715384736438e-06, + "logits/chosen": -0.5511893033981323, + "logits/rejected": -0.6238216757774353, + "logps/chosen": -56.65662384033203, + "logps/rejected": -78.62539672851562, + "loss": 0.7115, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.002821207046509, + "rewards/margins": 4.719378471374512, + "rewards/rejected": -1.716557502746582, + "step": 12938 + }, + { + "epoch": 3.24, + "grad_norm": 2.1914994716644287, + "learning_rate": 2.766012185004411e-06, + "logits/chosen": -0.5360947251319885, + "logits/rejected": -0.6485645771026611, + "logps/chosen": -50.9267463684082, + "logps/rejected": -103.57957458496094, + "loss": 0.539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0645017623901367, + "rewards/margins": 7.6524505615234375, + "rewards/rejected": -4.587948799133301, + "step": 12939 + }, + { + "epoch": 3.24, + "grad_norm": 4.5670247077941895, + "learning_rate": 2.7653090404765964e-06, + "logits/chosen": -0.5146797299385071, + "logits/rejected": -0.6141324043273926, + "logps/chosen": -61.81282043457031, + "logps/rejected": -102.34963989257812, + "loss": 0.6694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5562379360198975, + "rewards/margins": 6.728976249694824, + "rewards/rejected": -4.172739028930664, + "step": 12940 + }, + { + "epoch": 3.24, + "grad_norm": 4.102531433105469, + "learning_rate": 2.764605951170375e-06, + "logits/chosen": -0.5952563285827637, + "logits/rejected": -0.6618534922599792, + "logps/chosen": -49.839786529541016, + "logps/rejected": -101.0351791381836, + "loss": 0.6652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0934672355651855, + "rewards/margins": 6.5806732177734375, + "rewards/rejected": -3.487206220626831, + "step": 12941 + }, + { + "epoch": 3.24, + "grad_norm": 5.880068302154541, + "learning_rate": 2.7639029171031163e-06, + "logits/chosen": -0.5530288219451904, + "logits/rejected": -0.6396522521972656, + "logps/chosen": -51.72922897338867, + "logps/rejected": -89.32503509521484, + "loss": 0.724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.044344663619995, + "rewards/margins": 5.613264083862305, + "rewards/rejected": -2.5689189434051514, + "step": 12942 + }, + { + "epoch": 3.24, + "grad_norm": 23.02044677734375, + "learning_rate": 2.763199938292196e-06, + "logits/chosen": -0.45853665471076965, + "logits/rejected": -0.5413631200790405, + "logps/chosen": -57.63433074951172, + "logps/rejected": -108.38897705078125, + "loss": 0.6582, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9013969898223877, + "rewards/margins": 7.023475646972656, + "rewards/rejected": -4.1220784187316895, + "step": 12943 + }, + { + "epoch": 3.24, + "grad_norm": 4.876297950744629, + "learning_rate": 2.7624970147549835e-06, + "logits/chosen": -0.4974439740180969, + "logits/rejected": -0.6184406876564026, + "logps/chosen": -64.03118133544922, + "logps/rejected": -92.15411376953125, + "loss": 0.7267, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9856998920440674, + "rewards/margins": 6.958232402801514, + "rewards/rejected": -3.972532272338867, + "step": 12944 + }, + { + "epoch": 3.24, + "grad_norm": 19.699575424194336, + "learning_rate": 2.761794146508851e-06, + "logits/chosen": -0.5777760148048401, + "logits/rejected": -0.6448155641555786, + "logps/chosen": -63.19086837768555, + "logps/rejected": -101.88929748535156, + "loss": 0.7009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8322770595550537, + "rewards/margins": 7.1253981590271, + "rewards/rejected": -4.293120861053467, + "step": 12945 + }, + { + "epoch": 3.24, + "grad_norm": 3.313514471054077, + "learning_rate": 2.7610913335711665e-06, + "logits/chosen": -0.5494153499603271, + "logits/rejected": -0.6685943603515625, + "logps/chosen": -55.03294372558594, + "logps/rejected": -96.37873077392578, + "loss": 0.5605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.95493221282959, + "rewards/margins": 6.584039688110352, + "rewards/rejected": -3.6291069984436035, + "step": 12946 + }, + { + "epoch": 3.24, + "grad_norm": 7.271572113037109, + "learning_rate": 2.760388575959294e-06, + "logits/chosen": -0.5284575819969177, + "logits/rejected": -0.5932375192642212, + "logps/chosen": -50.84782409667969, + "logps/rejected": -104.43572998046875, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8517165184020996, + "rewards/margins": 5.931283473968506, + "rewards/rejected": -3.079566478729248, + "step": 12947 + }, + { + "epoch": 3.24, + "grad_norm": 3.1492176055908203, + "learning_rate": 2.759685873690604e-06, + "logits/chosen": -0.5319297909736633, + "logits/rejected": -0.5974998474121094, + "logps/chosen": -47.66339111328125, + "logps/rejected": -127.34622192382812, + "loss": 0.5263, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.350159168243408, + "rewards/margins": 7.3380513191223145, + "rewards/rejected": -3.9878921508789062, + "step": 12948 + }, + { + "epoch": 3.24, + "grad_norm": 3.020526885986328, + "learning_rate": 2.758983226782457e-06, + "logits/chosen": -0.5475746393203735, + "logits/rejected": -0.6002564430236816, + "logps/chosen": -50.69879913330078, + "logps/rejected": -97.2774658203125, + "loss": 0.6777, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1741058826446533, + "rewards/margins": 6.018899917602539, + "rewards/rejected": -2.8447935581207275, + "step": 12949 + }, + { + "epoch": 3.24, + "grad_norm": 3.168300151824951, + "learning_rate": 2.7582806352522194e-06, + "logits/chosen": -0.5996837615966797, + "logits/rejected": -0.6288820505142212, + "logps/chosen": -38.49729537963867, + "logps/rejected": -105.59263610839844, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5633883476257324, + "rewards/margins": 6.832914352416992, + "rewards/rejected": -3.269526958465576, + "step": 12950 + }, + { + "epoch": 3.24, + "grad_norm": 3.2215991020202637, + "learning_rate": 2.757578099117253e-06, + "logits/chosen": -0.5757098197937012, + "logits/rejected": -0.5930293202400208, + "logps/chosen": -48.301597595214844, + "logps/rejected": -114.214599609375, + "loss": 0.5505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4355666637420654, + "rewards/margins": 6.835873603820801, + "rewards/rejected": -3.4003071784973145, + "step": 12951 + }, + { + "epoch": 3.24, + "grad_norm": 6.027993202209473, + "learning_rate": 2.756875618394914e-06, + "logits/chosen": -0.5366486310958862, + "logits/rejected": -0.6726284027099609, + "logps/chosen": -48.90365219116211, + "logps/rejected": -85.48943328857422, + "loss": 0.5576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0300307273864746, + "rewards/margins": 6.930631160736084, + "rewards/rejected": -3.9006009101867676, + "step": 12952 + }, + { + "epoch": 3.24, + "grad_norm": 3.20766544342041, + "learning_rate": 2.756173193102567e-06, + "logits/chosen": -0.5039635300636292, + "logits/rejected": -0.6453484892845154, + "logps/chosen": -50.8191032409668, + "logps/rejected": -100.2999038696289, + "loss": 0.5227, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2350339889526367, + "rewards/margins": 8.227489471435547, + "rewards/rejected": -4.992455005645752, + "step": 12953 + }, + { + "epoch": 3.24, + "grad_norm": 8.138138771057129, + "learning_rate": 2.7554708232575665e-06, + "logits/chosen": -0.5098355412483215, + "logits/rejected": -0.6459941267967224, + "logps/chosen": -66.81840515136719, + "logps/rejected": -93.88371276855469, + "loss": 0.7126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.697910785675049, + "rewards/margins": 6.860827445983887, + "rewards/rejected": -4.16291618347168, + "step": 12954 + }, + { + "epoch": 3.24, + "grad_norm": 6.83825159072876, + "learning_rate": 2.754768508877268e-06, + "logits/chosen": -0.5408639311790466, + "logits/rejected": -0.6323146224021912, + "logps/chosen": -56.49395751953125, + "logps/rejected": -100.77571868896484, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.179704189300537, + "rewards/margins": 7.488474369049072, + "rewards/rejected": -4.308770179748535, + "step": 12955 + }, + { + "epoch": 3.24, + "grad_norm": 9.644430160522461, + "learning_rate": 2.7540662499790272e-06, + "logits/chosen": -0.5104539394378662, + "logits/rejected": -0.5804078578948975, + "logps/chosen": -48.026180267333984, + "logps/rejected": -105.96591186523438, + "loss": 0.5799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2801907062530518, + "rewards/margins": 7.693417549133301, + "rewards/rejected": -4.41322660446167, + "step": 12956 + }, + { + "epoch": 3.24, + "grad_norm": 1.768815040588379, + "learning_rate": 2.7533640465802023e-06, + "logits/chosen": -0.5374904870986938, + "logits/rejected": -0.5744385719299316, + "logps/chosen": -47.57437515258789, + "logps/rejected": -127.41844940185547, + "loss": 0.5397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.146782398223877, + "rewards/margins": 8.677152633666992, + "rewards/rejected": -5.530370235443115, + "step": 12957 + }, + { + "epoch": 3.24, + "grad_norm": 5.4235429763793945, + "learning_rate": 2.752661898698138e-06, + "logits/chosen": -0.507401168346405, + "logits/rejected": -0.6631651520729065, + "logps/chosen": -62.92633056640625, + "logps/rejected": -90.69161224365234, + "loss": 0.5863, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2087011337280273, + "rewards/margins": 8.135539054870605, + "rewards/rejected": -4.926837921142578, + "step": 12958 + }, + { + "epoch": 3.24, + "grad_norm": 8.141624450683594, + "learning_rate": 2.7519598063501877e-06, + "logits/chosen": -0.49205252528190613, + "logits/rejected": -0.5683043599128723, + "logps/chosen": -51.35417938232422, + "logps/rejected": -87.88105773925781, + "loss": 0.6681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9430487155914307, + "rewards/margins": 6.129963397979736, + "rewards/rejected": -3.1869149208068848, + "step": 12959 + }, + { + "epoch": 3.24, + "grad_norm": 5.6912946701049805, + "learning_rate": 2.7512577695537044e-06, + "logits/chosen": -0.5354468822479248, + "logits/rejected": -0.6040881872177124, + "logps/chosen": -47.43491744995117, + "logps/rejected": -124.60195922851562, + "loss": 0.6257, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.115665912628174, + "rewards/margins": 8.240859985351562, + "rewards/rejected": -5.125195026397705, + "step": 12960 + }, + { + "epoch": 3.24, + "grad_norm": 6.5844340324401855, + "learning_rate": 2.7505557883260333e-06, + "logits/chosen": -0.49575841426849365, + "logits/rejected": -0.578447699546814, + "logps/chosen": -49.21803665161133, + "logps/rejected": -95.17364501953125, + "loss": 0.5492, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9967596530914307, + "rewards/margins": 6.59246826171875, + "rewards/rejected": -3.5957093238830566, + "step": 12961 + }, + { + "epoch": 3.24, + "grad_norm": 9.051369667053223, + "learning_rate": 2.749853862684519e-06, + "logits/chosen": -0.46498584747314453, + "logits/rejected": -0.4790040850639343, + "logps/chosen": -53.598567962646484, + "logps/rejected": -117.6004867553711, + "loss": 0.6598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3817899227142334, + "rewards/margins": 6.256497383117676, + "rewards/rejected": -2.8747079372406006, + "step": 12962 + }, + { + "epoch": 3.24, + "grad_norm": 3.9985811710357666, + "learning_rate": 2.7491519926465117e-06, + "logits/chosen": -0.6459609270095825, + "logits/rejected": -0.7279112339019775, + "logps/chosen": -60.98017883300781, + "logps/rejected": -116.40919494628906, + "loss": 0.6152, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9538276195526123, + "rewards/margins": 7.7240986824035645, + "rewards/rejected": -4.770270824432373, + "step": 12963 + }, + { + "epoch": 3.24, + "grad_norm": 7.848753929138184, + "learning_rate": 2.7484501782293515e-06, + "logits/chosen": -0.5304014086723328, + "logits/rejected": -0.6006582379341125, + "logps/chosen": -52.492977142333984, + "logps/rejected": -86.58747100830078, + "loss": 0.6151, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.082566022872925, + "rewards/margins": 5.596561431884766, + "rewards/rejected": -2.513995409011841, + "step": 12964 + }, + { + "epoch": 3.24, + "grad_norm": 6.84454870223999, + "learning_rate": 2.7477484194503818e-06, + "logits/chosen": -0.6513369083404541, + "logits/rejected": -0.6991463303565979, + "logps/chosen": -58.98231506347656, + "logps/rejected": -95.00399017333984, + "loss": 0.6933, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.411637306213379, + "rewards/margins": 5.844402313232422, + "rewards/rejected": -2.4327645301818848, + "step": 12965 + }, + { + "epoch": 3.24, + "grad_norm": 18.300357818603516, + "learning_rate": 2.7470467163269456e-06, + "logits/chosen": -0.501422643661499, + "logits/rejected": -0.550547182559967, + "logps/chosen": -49.63715362548828, + "logps/rejected": -111.06067657470703, + "loss": 0.6794, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.043438196182251, + "rewards/margins": 6.402199745178223, + "rewards/rejected": -3.3587608337402344, + "step": 12966 + }, + { + "epoch": 3.24, + "grad_norm": 4.107977867126465, + "learning_rate": 2.746345068876379e-06, + "logits/chosen": -0.5837806463241577, + "logits/rejected": -0.6745708584785461, + "logps/chosen": -50.61354064941406, + "logps/rejected": -94.24634552001953, + "loss": 0.5817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.843318462371826, + "rewards/margins": 6.386384963989258, + "rewards/rejected": -3.5430662631988525, + "step": 12967 + }, + { + "epoch": 3.24, + "grad_norm": 5.874927997589111, + "learning_rate": 2.745643477116026e-06, + "logits/chosen": -0.5731610059738159, + "logits/rejected": -0.6679630279541016, + "logps/chosen": -49.80739212036133, + "logps/rejected": -102.58633422851562, + "loss": 0.7159, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.646239757537842, + "rewards/margins": 7.445538520812988, + "rewards/rejected": -4.799299240112305, + "step": 12968 + }, + { + "epoch": 3.24, + "grad_norm": 3.661761522293091, + "learning_rate": 2.7449419410632193e-06, + "logits/chosen": -0.5078050494194031, + "logits/rejected": -0.6034776568412781, + "logps/chosen": -46.66041946411133, + "logps/rejected": -101.49616241455078, + "loss": 0.5814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3627569675445557, + "rewards/margins": 7.761430740356445, + "rewards/rejected": -4.3986735343933105, + "step": 12969 + }, + { + "epoch": 3.24, + "grad_norm": 4.911710739135742, + "learning_rate": 2.744240460735294e-06, + "logits/chosen": -0.49210673570632935, + "logits/rejected": -0.6097802519798279, + "logps/chosen": -53.341209411621094, + "logps/rejected": -95.16343688964844, + "loss": 0.5934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.837395429611206, + "rewards/margins": 7.120978832244873, + "rewards/rejected": -4.283583164215088, + "step": 12970 + }, + { + "epoch": 3.24, + "grad_norm": 7.399637699127197, + "learning_rate": 2.7435390361495884e-06, + "logits/chosen": -0.5613573789596558, + "logits/rejected": -0.6592884063720703, + "logps/chosen": -48.789772033691406, + "logps/rejected": -99.08868408203125, + "loss": 0.6242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.444243907928467, + "rewards/margins": 6.45796537399292, + "rewards/rejected": -3.013721466064453, + "step": 12971 + }, + { + "epoch": 3.25, + "grad_norm": 4.495508670806885, + "learning_rate": 2.7428376673234338e-06, + "logits/chosen": -0.5780819654464722, + "logits/rejected": -0.6414623260498047, + "logps/chosen": -48.731109619140625, + "logps/rejected": -107.65637969970703, + "loss": 0.6438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972423791885376, + "rewards/margins": 7.07383918762207, + "rewards/rejected": -4.101415634155273, + "step": 12972 + }, + { + "epoch": 3.25, + "grad_norm": 4.0170183181762695, + "learning_rate": 2.7421363542741586e-06, + "logits/chosen": -0.4661976099014282, + "logits/rejected": -0.5126243233680725, + "logps/chosen": -51.75261688232422, + "logps/rejected": -116.7245101928711, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3395040035247803, + "rewards/margins": 6.739807605743408, + "rewards/rejected": -3.400303602218628, + "step": 12973 + }, + { + "epoch": 3.25, + "grad_norm": 4.021513938903809, + "learning_rate": 2.7414350970190962e-06, + "logits/chosen": -0.5268569588661194, + "logits/rejected": -0.6144707798957825, + "logps/chosen": -54.731449127197266, + "logps/rejected": -102.6497802734375, + "loss": 0.6084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.183894634246826, + "rewards/margins": 6.655848979949951, + "rewards/rejected": -3.471954107284546, + "step": 12974 + }, + { + "epoch": 3.25, + "grad_norm": 7.9661784172058105, + "learning_rate": 2.740733895575578e-06, + "logits/chosen": -0.46123459935188293, + "logits/rejected": -0.49914735555648804, + "logps/chosen": -52.9276237487793, + "logps/rejected": -110.56626892089844, + "loss": 0.6647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0963313579559326, + "rewards/margins": 6.159819602966309, + "rewards/rejected": -3.063488245010376, + "step": 12975 + }, + { + "epoch": 3.25, + "grad_norm": 5.524808883666992, + "learning_rate": 2.740032749960925e-06, + "logits/chosen": -0.6025705933570862, + "logits/rejected": -0.6757076382637024, + "logps/chosen": -48.50147247314453, + "logps/rejected": -111.98878479003906, + "loss": 0.6158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087099552154541, + "rewards/margins": 7.007767677307129, + "rewards/rejected": -3.920668840408325, + "step": 12976 + }, + { + "epoch": 3.25, + "grad_norm": 5.96807336807251, + "learning_rate": 2.739331660192467e-06, + "logits/chosen": -0.5280653834342957, + "logits/rejected": -0.6370319724082947, + "logps/chosen": -61.75505828857422, + "logps/rejected": -88.69186401367188, + "loss": 0.699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.021613836288452, + "rewards/margins": 5.065821647644043, + "rewards/rejected": -2.044208288192749, + "step": 12977 + }, + { + "epoch": 3.25, + "grad_norm": 7.953479766845703, + "learning_rate": 2.7386306262875294e-06, + "logits/chosen": -0.5521031022071838, + "logits/rejected": -0.647527277469635, + "logps/chosen": -53.089561462402344, + "logps/rejected": -91.30239868164062, + "loss": 0.6256, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1914258003234863, + "rewards/margins": 6.687980651855469, + "rewards/rejected": -3.496556282043457, + "step": 12978 + }, + { + "epoch": 3.25, + "grad_norm": 3.6470987796783447, + "learning_rate": 2.7379296482634345e-06, + "logits/chosen": -0.5085468888282776, + "logits/rejected": -0.6050796508789062, + "logps/chosen": -50.633018493652344, + "logps/rejected": -103.96843719482422, + "loss": 0.5565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0677008628845215, + "rewards/margins": 7.553544521331787, + "rewards/rejected": -4.485844135284424, + "step": 12979 + }, + { + "epoch": 3.25, + "grad_norm": 8.21225357055664, + "learning_rate": 2.737228726137502e-06, + "logits/chosen": -0.5279816389083862, + "logits/rejected": -0.593673586845398, + "logps/chosen": -51.9890251159668, + "logps/rejected": -99.03208923339844, + "loss": 0.7286, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.195361852645874, + "rewards/margins": 6.063685894012451, + "rewards/rejected": -2.868323802947998, + "step": 12980 + }, + { + "epoch": 3.25, + "grad_norm": 3.3495285511016846, + "learning_rate": 2.736527859927057e-06, + "logits/chosen": -0.4882842004299164, + "logits/rejected": -0.6006403565406799, + "logps/chosen": -49.91815185546875, + "logps/rejected": -108.9811019897461, + "loss": 0.532, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1484944820404053, + "rewards/margins": 7.310793876647949, + "rewards/rejected": -4.162299633026123, + "step": 12981 + }, + { + "epoch": 3.25, + "grad_norm": 4.268740177154541, + "learning_rate": 2.7358270496494154e-06, + "logits/chosen": -0.6105114221572876, + "logits/rejected": -0.6951314210891724, + "logps/chosen": -58.678871154785156, + "logps/rejected": -121.86013793945312, + "loss": 0.7535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3580515384674072, + "rewards/margins": 8.091142654418945, + "rewards/rejected": -4.733090877532959, + "step": 12982 + }, + { + "epoch": 3.25, + "grad_norm": 3.897477626800537, + "learning_rate": 2.735126295321894e-06, + "logits/chosen": -0.5876019597053528, + "logits/rejected": -0.6978954076766968, + "logps/chosen": -51.52911376953125, + "logps/rejected": -92.74479675292969, + "loss": 0.6179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.866157054901123, + "rewards/margins": 7.712610244750977, + "rewards/rejected": -4.8464531898498535, + "step": 12983 + }, + { + "epoch": 3.25, + "grad_norm": 2.4957969188690186, + "learning_rate": 2.734425596961813e-06, + "logits/chosen": -0.48756080865859985, + "logits/rejected": -0.5568527579307556, + "logps/chosen": -57.654693603515625, + "logps/rejected": -108.56831359863281, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5164456367492676, + "rewards/margins": 7.585297584533691, + "rewards/rejected": -4.068851947784424, + "step": 12984 + }, + { + "epoch": 3.25, + "grad_norm": 21.4671630859375, + "learning_rate": 2.733724954586483e-06, + "logits/chosen": -0.5877038240432739, + "logits/rejected": -0.6549872159957886, + "logps/chosen": -60.93358612060547, + "logps/rejected": -109.55632781982422, + "loss": 0.7486, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.215057849884033, + "rewards/margins": 6.532802104949951, + "rewards/rejected": -3.3177435398101807, + "step": 12985 + }, + { + "epoch": 3.25, + "grad_norm": 7.71314001083374, + "learning_rate": 2.7330243682132226e-06, + "logits/chosen": -0.5417567491531372, + "logits/rejected": -0.618264377117157, + "logps/chosen": -44.21259307861328, + "logps/rejected": -98.01202392578125, + "loss": 0.6175, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1501564979553223, + "rewards/margins": 7.69309139251709, + "rewards/rejected": -4.542934894561768, + "step": 12986 + }, + { + "epoch": 3.25, + "grad_norm": 5.685973644256592, + "learning_rate": 2.732323837859341e-06, + "logits/chosen": -0.531510055065155, + "logits/rejected": -0.6008365154266357, + "logps/chosen": -55.13801574707031, + "logps/rejected": -125.11497497558594, + "loss": 0.5837, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3056929111480713, + "rewards/margins": 8.783327102661133, + "rewards/rejected": -5.477633476257324, + "step": 12987 + }, + { + "epoch": 3.25, + "grad_norm": 8.985889434814453, + "learning_rate": 2.7316233635421464e-06, + "logits/chosen": -0.5864697098731995, + "logits/rejected": -0.5991590023040771, + "logps/chosen": -59.31526565551758, + "logps/rejected": -109.83604431152344, + "loss": 0.763, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5359537601470947, + "rewards/margins": 5.941384315490723, + "rewards/rejected": -3.405431032180786, + "step": 12988 + }, + { + "epoch": 3.25, + "grad_norm": 5.4605631828308105, + "learning_rate": 2.7309229452789543e-06, + "logits/chosen": -0.5632073283195496, + "logits/rejected": -0.6523067951202393, + "logps/chosen": -53.58489227294922, + "logps/rejected": -121.36104583740234, + "loss": 0.6591, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9832751750946045, + "rewards/margins": 7.64783239364624, + "rewards/rejected": -4.664556980133057, + "step": 12989 + }, + { + "epoch": 3.25, + "grad_norm": 16.475711822509766, + "learning_rate": 2.7302225830870678e-06, + "logits/chosen": -0.5351711511611938, + "logits/rejected": -0.6281312704086304, + "logps/chosen": -46.21104049682617, + "logps/rejected": -111.56082916259766, + "loss": 0.5405, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.026463031768799, + "rewards/margins": 7.392906665802002, + "rewards/rejected": -4.366443157196045, + "step": 12990 + }, + { + "epoch": 3.25, + "grad_norm": 3.42195463180542, + "learning_rate": 2.7295222769837972e-06, + "logits/chosen": -0.5624656081199646, + "logits/rejected": -0.6287158131599426, + "logps/chosen": -47.103858947753906, + "logps/rejected": -102.0127944946289, + "loss": 0.5609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.056346893310547, + "rewards/margins": 6.674684524536133, + "rewards/rejected": -3.618338108062744, + "step": 12991 + }, + { + "epoch": 3.25, + "grad_norm": 4.917136192321777, + "learning_rate": 2.7288220269864437e-06, + "logits/chosen": -0.5451433062553406, + "logits/rejected": -0.6045763492584229, + "logps/chosen": -52.723419189453125, + "logps/rejected": -118.8753662109375, + "loss": 0.5877, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1903529167175293, + "rewards/margins": 7.2473273277282715, + "rewards/rejected": -4.056974411010742, + "step": 12992 + }, + { + "epoch": 3.25, + "grad_norm": 8.048502922058105, + "learning_rate": 2.7281218331123162e-06, + "logits/chosen": -0.6039987802505493, + "logits/rejected": -0.6626541018486023, + "logps/chosen": -50.831329345703125, + "logps/rejected": -107.25164794921875, + "loss": 0.6908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1069841384887695, + "rewards/margins": 6.1506805419921875, + "rewards/rejected": -3.0436959266662598, + "step": 12993 + }, + { + "epoch": 3.25, + "grad_norm": 5.188544273376465, + "learning_rate": 2.7274216953787143e-06, + "logits/chosen": -0.45958811044692993, + "logits/rejected": -0.5308310389518738, + "logps/chosen": -51.81490707397461, + "logps/rejected": -109.88648986816406, + "loss": 0.5709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923698902130127, + "rewards/margins": 6.5289764404296875, + "rewards/rejected": -3.6052775382995605, + "step": 12994 + }, + { + "epoch": 3.25, + "grad_norm": 5.494684219360352, + "learning_rate": 2.7267216138029373e-06, + "logits/chosen": -0.5685633420944214, + "logits/rejected": -0.7135196924209595, + "logps/chosen": -53.358680725097656, + "logps/rejected": -94.83502960205078, + "loss": 0.5473, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2260923385620117, + "rewards/margins": 7.6932854652404785, + "rewards/rejected": -4.467193603515625, + "step": 12995 + }, + { + "epoch": 3.25, + "grad_norm": 4.518549919128418, + "learning_rate": 2.72602158840229e-06, + "logits/chosen": -0.5009441375732422, + "logits/rejected": -0.592342734336853, + "logps/chosen": -55.41541290283203, + "logps/rejected": -105.5218505859375, + "loss": 0.6052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.232330799102783, + "rewards/margins": 8.121421813964844, + "rewards/rejected": -4.889091491699219, + "step": 12996 + }, + { + "epoch": 3.25, + "grad_norm": 4.239490509033203, + "learning_rate": 2.7253216191940673e-06, + "logits/chosen": -0.623267412185669, + "logits/rejected": -0.711534321308136, + "logps/chosen": -56.42649459838867, + "logps/rejected": -93.21223449707031, + "loss": 0.6208, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3108410835266113, + "rewards/margins": 6.690112590789795, + "rewards/rejected": -3.3792717456817627, + "step": 12997 + }, + { + "epoch": 3.25, + "grad_norm": 3.1687254905700684, + "learning_rate": 2.724621706195565e-06, + "logits/chosen": -0.556976318359375, + "logits/rejected": -0.6205905675888062, + "logps/chosen": -49.224029541015625, + "logps/rejected": -109.1878433227539, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4386396408081055, + "rewards/margins": 7.896765232086182, + "rewards/rejected": -4.458125591278076, + "step": 12998 + }, + { + "epoch": 3.25, + "grad_norm": 2.704378366470337, + "learning_rate": 2.7239218494240804e-06, + "logits/chosen": -0.46804672479629517, + "logits/rejected": -0.5466988682746887, + "logps/chosen": -59.2630615234375, + "logps/rejected": -99.66666412353516, + "loss": 0.634, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.925365686416626, + "rewards/margins": 6.745385646820068, + "rewards/rejected": -3.8200204372406006, + "step": 12999 + }, + { + "epoch": 3.25, + "grad_norm": 5.80450963973999, + "learning_rate": 2.7232220488969126e-06, + "logits/chosen": -0.5851606726646423, + "logits/rejected": -0.6317853331565857, + "logps/chosen": -52.788543701171875, + "logps/rejected": -106.2013168334961, + "loss": 0.7141, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1456618309020996, + "rewards/margins": 6.937530040740967, + "rewards/rejected": -3.791867733001709, + "step": 13000 + }, + { + "epoch": 3.25, + "grad_norm": 8.889969825744629, + "learning_rate": 2.7225223046313455e-06, + "logits/chosen": -0.4840146005153656, + "logits/rejected": -0.5879529714584351, + "logps/chosen": -59.06781005859375, + "logps/rejected": -101.96060180664062, + "loss": 0.5784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0982136726379395, + "rewards/margins": 6.906740188598633, + "rewards/rejected": -3.8085269927978516, + "step": 13001 + }, + { + "epoch": 3.25, + "grad_norm": 7.322225570678711, + "learning_rate": 2.7218226166446736e-06, + "logits/chosen": -0.5689533948898315, + "logits/rejected": -0.652058482170105, + "logps/chosen": -60.36130905151367, + "logps/rejected": -107.64111328125, + "loss": 0.6377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.751316547393799, + "rewards/margins": 6.875704765319824, + "rewards/rejected": -4.124388694763184, + "step": 13002 + }, + { + "epoch": 3.25, + "grad_norm": 5.262044429779053, + "learning_rate": 2.7211229849541932e-06, + "logits/chosen": -0.5563616752624512, + "logits/rejected": -0.6743060946464539, + "logps/chosen": -68.70902252197266, + "logps/rejected": -104.85247802734375, + "loss": 0.6692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0713870525360107, + "rewards/margins": 7.29920768737793, + "rewards/rejected": -4.22782039642334, + "step": 13003 + }, + { + "epoch": 3.25, + "grad_norm": 5.097148418426514, + "learning_rate": 2.720423409577183e-06, + "logits/chosen": -0.49821242690086365, + "logits/rejected": -0.6007007360458374, + "logps/chosen": -60.51001739501953, + "logps/rejected": -92.75867462158203, + "loss": 0.5841, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.303997755050659, + "rewards/margins": 7.241475582122803, + "rewards/rejected": -3.9374773502349854, + "step": 13004 + }, + { + "epoch": 3.25, + "grad_norm": 3.3741490840911865, + "learning_rate": 2.7197238905309355e-06, + "logits/chosen": -0.5273823738098145, + "logits/rejected": -0.6125786900520325, + "logps/chosen": -56.704490661621094, + "logps/rejected": -97.54209899902344, + "loss": 0.6082, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1180858612060547, + "rewards/margins": 6.098361968994141, + "rewards/rejected": -2.980276346206665, + "step": 13005 + }, + { + "epoch": 3.25, + "grad_norm": 4.694886684417725, + "learning_rate": 2.7190244278327383e-06, + "logits/chosen": -0.5681838989257812, + "logits/rejected": -0.6360692381858826, + "logps/chosen": -47.117835998535156, + "logps/rejected": -107.98861694335938, + "loss": 0.611, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.163466215133667, + "rewards/margins": 7.009856224060059, + "rewards/rejected": -3.8463892936706543, + "step": 13006 + }, + { + "epoch": 3.25, + "grad_norm": 8.848241806030273, + "learning_rate": 2.7183250214998726e-06, + "logits/chosen": -0.48568612337112427, + "logits/rejected": -0.558208703994751, + "logps/chosen": -58.08015823364258, + "logps/rejected": -102.42383575439453, + "loss": 0.6422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874391555786133, + "rewards/margins": 6.601446628570557, + "rewards/rejected": -3.7270545959472656, + "step": 13007 + }, + { + "epoch": 3.25, + "grad_norm": 2.30035138130188, + "learning_rate": 2.717625671549621e-06, + "logits/chosen": -0.5225411653518677, + "logits/rejected": -0.6197778582572937, + "logps/chosen": -48.301429748535156, + "logps/rejected": -87.68780517578125, + "loss": 0.5356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3688013553619385, + "rewards/margins": 6.253891468048096, + "rewards/rejected": -2.88508939743042, + "step": 13008 + }, + { + "epoch": 3.25, + "grad_norm": 7.168314456939697, + "learning_rate": 2.7169263779992694e-06, + "logits/chosen": -0.642045259475708, + "logits/rejected": -0.6979087591171265, + "logps/chosen": -56.28253173828125, + "logps/rejected": -92.32569885253906, + "loss": 0.6709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.991431713104248, + "rewards/margins": 6.306624889373779, + "rewards/rejected": -3.3151931762695312, + "step": 13009 + }, + { + "epoch": 3.25, + "grad_norm": 5.0322980880737305, + "learning_rate": 2.716227140866092e-06, + "logits/chosen": -0.4610413610935211, + "logits/rejected": -0.5239359140396118, + "logps/chosen": -53.68954086303711, + "logps/rejected": -79.32276916503906, + "loss": 0.63, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25772762298584, + "rewards/margins": 5.251217365264893, + "rewards/rejected": -1.9934887886047363, + "step": 13010 + }, + { + "epoch": 3.25, + "grad_norm": 3.6591925621032715, + "learning_rate": 2.7155279601673744e-06, + "logits/chosen": -0.6036330461502075, + "logits/rejected": -0.6966162919998169, + "logps/chosen": -60.58693313598633, + "logps/rejected": -109.73112487792969, + "loss": 0.6445, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.165085554122925, + "rewards/margins": 7.765811920166016, + "rewards/rejected": -4.600726127624512, + "step": 13011 + }, + { + "epoch": 3.26, + "grad_norm": 3.440199851989746, + "learning_rate": 2.7148288359203906e-06, + "logits/chosen": -0.5226562023162842, + "logits/rejected": -0.6589627265930176, + "logps/chosen": -63.33562088012695, + "logps/rejected": -96.61114501953125, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1033663749694824, + "rewards/margins": 7.393902778625488, + "rewards/rejected": -4.290535926818848, + "step": 13012 + }, + { + "epoch": 3.26, + "grad_norm": 18.80621910095215, + "learning_rate": 2.7141297681424146e-06, + "logits/chosen": -0.5863559246063232, + "logits/rejected": -0.6708342432975769, + "logps/chosen": -48.21356201171875, + "logps/rejected": -101.2681884765625, + "loss": 0.6836, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.951533555984497, + "rewards/margins": 6.796752452850342, + "rewards/rejected": -3.8452184200286865, + "step": 13013 + }, + { + "epoch": 3.26, + "grad_norm": 22.095134735107422, + "learning_rate": 2.7134307568507255e-06, + "logits/chosen": -0.5468371510505676, + "logits/rejected": -0.6425612568855286, + "logps/chosen": -70.87281036376953, + "logps/rejected": -88.48013305664062, + "loss": 0.7233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.951664686203003, + "rewards/margins": 6.05129337310791, + "rewards/rejected": -3.099628210067749, + "step": 13014 + }, + { + "epoch": 3.26, + "grad_norm": 6.189356803894043, + "learning_rate": 2.7127318020625943e-06, + "logits/chosen": -0.5388752222061157, + "logits/rejected": -0.6171056032180786, + "logps/chosen": -49.79890441894531, + "logps/rejected": -94.6244888305664, + "loss": 0.5806, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9714581966400146, + "rewards/margins": 6.496959209442139, + "rewards/rejected": -3.525501012802124, + "step": 13015 + }, + { + "epoch": 3.26, + "grad_norm": 3.037107467651367, + "learning_rate": 2.7120329037952907e-06, + "logits/chosen": -0.6164252758026123, + "logits/rejected": -0.6894922852516174, + "logps/chosen": -49.27910614013672, + "logps/rejected": -107.5335464477539, + "loss": 0.6302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9577670097351074, + "rewards/margins": 7.845640182495117, + "rewards/rejected": -4.88787317276001, + "step": 13016 + }, + { + "epoch": 3.26, + "grad_norm": 6.971181869506836, + "learning_rate": 2.7113340620660888e-06, + "logits/chosen": -0.5194565057754517, + "logits/rejected": -0.593842089176178, + "logps/chosen": -50.93629455566406, + "logps/rejected": -88.0691909790039, + "loss": 0.6915, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7840280532836914, + "rewards/margins": 5.278246879577637, + "rewards/rejected": -2.4942188262939453, + "step": 13017 + }, + { + "epoch": 3.26, + "grad_norm": 17.297950744628906, + "learning_rate": 2.7106352768922594e-06, + "logits/chosen": -0.4925123453140259, + "logits/rejected": -0.5482646822929382, + "logps/chosen": -52.543373107910156, + "logps/rejected": -98.91679382324219, + "loss": 0.6583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.596745252609253, + "rewards/margins": 5.833775520324707, + "rewards/rejected": -3.237030029296875, + "step": 13018 + }, + { + "epoch": 3.26, + "grad_norm": 4.576606273651123, + "learning_rate": 2.709936548291064e-06, + "logits/chosen": -0.5336392521858215, + "logits/rejected": -0.6578033566474915, + "logps/chosen": -54.145469665527344, + "logps/rejected": -106.19176483154297, + "loss": 0.6652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.356088876724243, + "rewards/margins": 7.706384181976318, + "rewards/rejected": -4.3502960205078125, + "step": 13019 + }, + { + "epoch": 3.26, + "grad_norm": 4.103509902954102, + "learning_rate": 2.709237876279772e-06, + "logits/chosen": -0.5663280487060547, + "logits/rejected": -0.6550443768501282, + "logps/chosen": -47.16156005859375, + "logps/rejected": -95.25848388671875, + "loss": 0.5832, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164299488067627, + "rewards/margins": 6.928791046142578, + "rewards/rejected": -3.7644920349121094, + "step": 13020 + }, + { + "epoch": 3.26, + "grad_norm": 4.2551655769348145, + "learning_rate": 2.708539260875651e-06, + "logits/chosen": -0.535483717918396, + "logits/rejected": -0.6366410255432129, + "logps/chosen": -51.647499084472656, + "logps/rejected": -90.55406951904297, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1484150886535645, + "rewards/margins": 6.4737372398376465, + "rewards/rejected": -3.325322389602661, + "step": 13021 + }, + { + "epoch": 3.26, + "grad_norm": 4.00360631942749, + "learning_rate": 2.707840702095962e-06, + "logits/chosen": -0.5706746578216553, + "logits/rejected": -0.6183506846427917, + "logps/chosen": -55.75908279418945, + "logps/rejected": -113.77568054199219, + "loss": 0.6514, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6071739196777344, + "rewards/margins": 7.2154388427734375, + "rewards/rejected": -3.6082653999328613, + "step": 13022 + }, + { + "epoch": 3.26, + "grad_norm": 3.9641332626342773, + "learning_rate": 2.7071421999579652e-06, + "logits/chosen": -0.48870089650154114, + "logits/rejected": -0.5598412752151489, + "logps/chosen": -56.84954833984375, + "logps/rejected": -93.38480377197266, + "loss": 0.6879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4113969802856445, + "rewards/margins": 6.350207328796387, + "rewards/rejected": -2.938810110092163, + "step": 13023 + }, + { + "epoch": 3.26, + "grad_norm": 2.970367670059204, + "learning_rate": 2.7064437544789257e-06, + "logits/chosen": -0.5482891201972961, + "logits/rejected": -0.6174359917640686, + "logps/chosen": -46.07319259643555, + "logps/rejected": -99.22802734375, + "loss": 0.5992, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1291255950927734, + "rewards/margins": 6.705721855163574, + "rewards/rejected": -3.576596260070801, + "step": 13024 + }, + { + "epoch": 3.26, + "grad_norm": 10.937111854553223, + "learning_rate": 2.7057453656761003e-06, + "logits/chosen": -0.5931180715560913, + "logits/rejected": -0.652920126914978, + "logps/chosen": -59.51268768310547, + "logps/rejected": -120.8416748046875, + "loss": 0.6698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.044963836669922, + "rewards/margins": 6.978701591491699, + "rewards/rejected": -3.9337377548217773, + "step": 13025 + }, + { + "epoch": 3.26, + "grad_norm": 15.292919158935547, + "learning_rate": 2.7050470335667457e-06, + "logits/chosen": -0.48834389448165894, + "logits/rejected": -0.5815166234970093, + "logps/chosen": -62.75410079956055, + "logps/rejected": -102.00337219238281, + "loss": 0.7063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9093849658966064, + "rewards/margins": 6.8318071365356445, + "rewards/rejected": -3.922421932220459, + "step": 13026 + }, + { + "epoch": 3.26, + "grad_norm": 6.884438514709473, + "learning_rate": 2.704348758168121e-06, + "logits/chosen": -0.5465975999832153, + "logits/rejected": -0.6359979510307312, + "logps/chosen": -59.314701080322266, + "logps/rejected": -96.71195983886719, + "loss": 0.6145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.890082597732544, + "rewards/margins": 6.650817394256592, + "rewards/rejected": -3.7607340812683105, + "step": 13027 + }, + { + "epoch": 3.26, + "grad_norm": 4.560387134552002, + "learning_rate": 2.703650539497481e-06, + "logits/chosen": -0.5356501340866089, + "logits/rejected": -0.6427607536315918, + "logps/chosen": -47.169776916503906, + "logps/rejected": -99.6110610961914, + "loss": 0.5541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.304394006729126, + "rewards/margins": 7.773097038269043, + "rewards/rejected": -4.468703269958496, + "step": 13028 + }, + { + "epoch": 3.26, + "grad_norm": 2.8670313358306885, + "learning_rate": 2.702952377572076e-06, + "logits/chosen": -0.5675880908966064, + "logits/rejected": -0.652649462223053, + "logps/chosen": -43.205238342285156, + "logps/rejected": -98.71356964111328, + "loss": 0.5222, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3410820960998535, + "rewards/margins": 7.851280689239502, + "rewards/rejected": -4.510199069976807, + "step": 13029 + }, + { + "epoch": 3.26, + "grad_norm": 3.3839800357818604, + "learning_rate": 2.702254272409164e-06, + "logits/chosen": -0.5293047428131104, + "logits/rejected": -0.6206557154655457, + "logps/chosen": -49.837703704833984, + "logps/rejected": -89.08546447753906, + "loss": 0.5671, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2676048278808594, + "rewards/margins": 6.222647666931152, + "rewards/rejected": -2.955043315887451, + "step": 13030 + }, + { + "epoch": 3.26, + "grad_norm": 4.2098212242126465, + "learning_rate": 2.70155622402599e-06, + "logits/chosen": -0.6238192915916443, + "logits/rejected": -0.6781848669052124, + "logps/chosen": -54.19638442993164, + "logps/rejected": -94.59599304199219, + "loss": 0.6393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031588077545166, + "rewards/margins": 6.022579193115234, + "rewards/rejected": -2.9909911155700684, + "step": 13031 + }, + { + "epoch": 3.26, + "grad_norm": 5.177859783172607, + "learning_rate": 2.700858232439809e-06, + "logits/chosen": -0.5749145746231079, + "logits/rejected": -0.6147717833518982, + "logps/chosen": -63.76642990112305, + "logps/rejected": -123.27613067626953, + "loss": 0.6439, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2772457599639893, + "rewards/margins": 7.861467361450195, + "rewards/rejected": -4.584221839904785, + "step": 13032 + }, + { + "epoch": 3.26, + "grad_norm": 3.3376107215881348, + "learning_rate": 2.700160297667864e-06, + "logits/chosen": -0.5438542366027832, + "logits/rejected": -0.645219087600708, + "logps/chosen": -48.55711364746094, + "logps/rejected": -77.88934326171875, + "loss": 0.6443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3490986824035645, + "rewards/margins": 6.502074241638184, + "rewards/rejected": -3.152975559234619, + "step": 13033 + }, + { + "epoch": 3.26, + "grad_norm": 3.6099557876586914, + "learning_rate": 2.699462419727407e-06, + "logits/chosen": -0.4765462577342987, + "logits/rejected": -0.530613899230957, + "logps/chosen": -52.431488037109375, + "logps/rejected": -107.0072021484375, + "loss": 0.6106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100722551345825, + "rewards/margins": 6.723753452301025, + "rewards/rejected": -3.6230309009552, + "step": 13034 + }, + { + "epoch": 3.26, + "grad_norm": 1.6379386186599731, + "learning_rate": 2.698764598635678e-06, + "logits/chosen": -0.5577819347381592, + "logits/rejected": -0.6670696139335632, + "logps/chosen": -54.630889892578125, + "logps/rejected": -115.07463073730469, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.092064380645752, + "rewards/margins": 8.458610534667969, + "rewards/rejected": -5.366546154022217, + "step": 13035 + }, + { + "epoch": 3.26, + "grad_norm": 3.726977586746216, + "learning_rate": 2.698066834409926e-06, + "logits/chosen": -0.5088813900947571, + "logits/rejected": -0.6096820831298828, + "logps/chosen": -50.22194290161133, + "logps/rejected": -106.96601104736328, + "loss": 0.6112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087597608566284, + "rewards/margins": 7.731960296630859, + "rewards/rejected": -4.644362449645996, + "step": 13036 + }, + { + "epoch": 3.26, + "grad_norm": 5.9656596183776855, + "learning_rate": 2.69736912706739e-06, + "logits/chosen": -0.5078179836273193, + "logits/rejected": -0.5758662223815918, + "logps/chosen": -45.15372848510742, + "logps/rejected": -84.20934295654297, + "loss": 0.6781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1523144245147705, + "rewards/margins": 5.947144508361816, + "rewards/rejected": -2.794830083847046, + "step": 13037 + }, + { + "epoch": 3.26, + "grad_norm": 5.999374866485596, + "learning_rate": 2.696671476625311e-06, + "logits/chosen": -0.5046799182891846, + "logits/rejected": -0.6023882627487183, + "logps/chosen": -57.103389739990234, + "logps/rejected": -94.53404235839844, + "loss": 0.6494, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1182923316955566, + "rewards/margins": 6.844472885131836, + "rewards/rejected": -3.7261810302734375, + "step": 13038 + }, + { + "epoch": 3.26, + "grad_norm": 8.856021881103516, + "learning_rate": 2.6959738831009315e-06, + "logits/chosen": -0.5003103613853455, + "logits/rejected": -0.533234715461731, + "logps/chosen": -49.619075775146484, + "logps/rejected": -104.03012084960938, + "loss": 0.6651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9410932064056396, + "rewards/margins": 6.151658535003662, + "rewards/rejected": -3.2105655670166016, + "step": 13039 + }, + { + "epoch": 3.26, + "grad_norm": 10.161643981933594, + "learning_rate": 2.6952763465114874e-06, + "logits/chosen": -0.5307125449180603, + "logits/rejected": -0.5778429508209229, + "logps/chosen": -60.40245819091797, + "logps/rejected": -102.98389434814453, + "loss": 0.7978, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7480931282043457, + "rewards/margins": 5.524710178375244, + "rewards/rejected": -2.7766175270080566, + "step": 13040 + }, + { + "epoch": 3.26, + "grad_norm": 11.549026489257812, + "learning_rate": 2.6945788668742153e-06, + "logits/chosen": -0.5109643936157227, + "logits/rejected": -0.6074456572532654, + "logps/chosen": -54.2745361328125, + "logps/rejected": -124.94925689697266, + "loss": 0.6941, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8522284030914307, + "rewards/margins": 7.646975994110107, + "rewards/rejected": -4.794747829437256, + "step": 13041 + }, + { + "epoch": 3.26, + "grad_norm": 4.55172061920166, + "learning_rate": 2.693881444206351e-06, + "logits/chosen": -0.4889775514602661, + "logits/rejected": -0.5817501544952393, + "logps/chosen": -52.09617233276367, + "logps/rejected": -91.86002349853516, + "loss": 0.5326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2393507957458496, + "rewards/margins": 6.201080322265625, + "rewards/rejected": -2.961729049682617, + "step": 13042 + }, + { + "epoch": 3.26, + "grad_norm": 3.7771854400634766, + "learning_rate": 2.693184078525133e-06, + "logits/chosen": -0.6591408252716064, + "logits/rejected": -0.6910339593887329, + "logps/chosen": -47.5815544128418, + "logps/rejected": -96.83708190917969, + "loss": 0.6373, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.435206174850464, + "rewards/margins": 6.489940166473389, + "rewards/rejected": -3.0547335147857666, + "step": 13043 + }, + { + "epoch": 3.26, + "grad_norm": 5.310979843139648, + "learning_rate": 2.6924867698477858e-06, + "logits/chosen": -0.5407072901725769, + "logits/rejected": -0.6136071681976318, + "logps/chosen": -54.694679260253906, + "logps/rejected": -90.59762573242188, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1885244846343994, + "rewards/margins": 6.092126369476318, + "rewards/rejected": -2.90360164642334, + "step": 13044 + }, + { + "epoch": 3.26, + "grad_norm": 5.435397624969482, + "learning_rate": 2.6917895181915456e-06, + "logits/chosen": -0.5902994871139526, + "logits/rejected": -0.6715079545974731, + "logps/chosen": -57.239078521728516, + "logps/rejected": -111.15678405761719, + "loss": 0.7139, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0340259075164795, + "rewards/margins": 7.550815582275391, + "rewards/rejected": -4.516789436340332, + "step": 13045 + }, + { + "epoch": 3.26, + "grad_norm": 16.667095184326172, + "learning_rate": 2.691092323573645e-06, + "logits/chosen": -0.5519607067108154, + "logits/rejected": -0.6120874881744385, + "logps/chosen": -59.66641616821289, + "logps/rejected": -96.87734985351562, + "loss": 0.7805, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7627005577087402, + "rewards/margins": 5.722570419311523, + "rewards/rejected": -2.9598703384399414, + "step": 13046 + }, + { + "epoch": 3.26, + "grad_norm": 6.328153133392334, + "learning_rate": 2.690395186011305e-06, + "logits/chosen": -0.5143536329269409, + "logits/rejected": -0.649799108505249, + "logps/chosen": -54.809974670410156, + "logps/rejected": -90.2037582397461, + "loss": 0.5756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0724942684173584, + "rewards/margins": 6.703987121582031, + "rewards/rejected": -3.631493091583252, + "step": 13047 + }, + { + "epoch": 3.26, + "grad_norm": 14.400328636169434, + "learning_rate": 2.6896981055217563e-06, + "logits/chosen": -0.6060189604759216, + "logits/rejected": -0.6886157393455505, + "logps/chosen": -66.53341674804688, + "logps/rejected": -99.60713195800781, + "loss": 0.6697, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.098653793334961, + "rewards/margins": 7.052171230316162, + "rewards/rejected": -3.953517436981201, + "step": 13048 + }, + { + "epoch": 3.26, + "grad_norm": 7.6821770668029785, + "learning_rate": 2.689001082122228e-06, + "logits/chosen": -0.5846623182296753, + "logits/rejected": -0.6505581736564636, + "logps/chosen": -57.615482330322266, + "logps/rejected": -100.00991821289062, + "loss": 0.7306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.871730327606201, + "rewards/margins": 5.319586753845215, + "rewards/rejected": -2.4478557109832764, + "step": 13049 + }, + { + "epoch": 3.26, + "grad_norm": 5.797370433807373, + "learning_rate": 2.6883041158299393e-06, + "logits/chosen": -0.6204689145088196, + "logits/rejected": -0.7231013774871826, + "logps/chosen": -49.812705993652344, + "logps/rejected": -90.21065521240234, + "loss": 0.5826, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.088853120803833, + "rewards/margins": 6.392940998077393, + "rewards/rejected": -3.3040881156921387, + "step": 13050 + }, + { + "epoch": 3.26, + "grad_norm": 4.0560994148254395, + "learning_rate": 2.687607206662114e-06, + "logits/chosen": -0.537501335144043, + "logits/rejected": -0.6721275448799133, + "logps/chosen": -61.24945068359375, + "logps/rejected": -103.91842651367188, + "loss": 0.6704, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.889056921005249, + "rewards/margins": 8.22752857208252, + "rewards/rejected": -5.338472366333008, + "step": 13051 + }, + { + "epoch": 3.27, + "grad_norm": 7.911653518676758, + "learning_rate": 2.6869103546359758e-06, + "logits/chosen": -0.5760203003883362, + "logits/rejected": -0.6540610194206238, + "logps/chosen": -53.150447845458984, + "logps/rejected": -97.50030517578125, + "loss": 0.7319, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1384260654449463, + "rewards/margins": 5.894376754760742, + "rewards/rejected": -2.755950450897217, + "step": 13052 + }, + { + "epoch": 3.27, + "grad_norm": 4.618352890014648, + "learning_rate": 2.6862135597687433e-06, + "logits/chosen": -0.537024736404419, + "logits/rejected": -0.6364649534225464, + "logps/chosen": -54.08552169799805, + "logps/rejected": -99.35176849365234, + "loss": 0.5884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2243337631225586, + "rewards/margins": 6.953863620758057, + "rewards/rejected": -3.729530096054077, + "step": 13053 + }, + { + "epoch": 3.27, + "grad_norm": 18.505630493164062, + "learning_rate": 2.6855168220776325e-06, + "logits/chosen": -0.6052318811416626, + "logits/rejected": -0.6649953722953796, + "logps/chosen": -51.63613510131836, + "logps/rejected": -107.03860473632812, + "loss": 0.6712, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1018447875976562, + "rewards/margins": 6.059408187866211, + "rewards/rejected": -2.9575634002685547, + "step": 13054 + }, + { + "epoch": 3.27, + "grad_norm": 11.3423490524292, + "learning_rate": 2.6848201415798646e-06, + "logits/chosen": -0.5000072717666626, + "logits/rejected": -0.6180543899536133, + "logps/chosen": -74.84315490722656, + "logps/rejected": -119.9290542602539, + "loss": 0.7669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.770740270614624, + "rewards/margins": 5.78030252456665, + "rewards/rejected": -3.0095629692077637, + "step": 13055 + }, + { + "epoch": 3.27, + "grad_norm": 3.698212146759033, + "learning_rate": 2.684123518292652e-06, + "logits/chosen": -0.5228292346000671, + "logits/rejected": -0.5668248534202576, + "logps/chosen": -59.37059783935547, + "logps/rejected": -108.85105895996094, + "loss": 0.6176, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8865721225738525, + "rewards/margins": 6.901776313781738, + "rewards/rejected": -4.015203475952148, + "step": 13056 + }, + { + "epoch": 3.27, + "grad_norm": 5.4216084480285645, + "learning_rate": 2.6834269522332123e-06, + "logits/chosen": -0.5275608897209167, + "logits/rejected": -0.5542436242103577, + "logps/chosen": -50.65718078613281, + "logps/rejected": -114.68359375, + "loss": 0.5772, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.117558717727661, + "rewards/margins": 6.244495391845703, + "rewards/rejected": -3.126936435699463, + "step": 13057 + }, + { + "epoch": 3.27, + "grad_norm": 8.043159484863281, + "learning_rate": 2.6827304434187563e-06, + "logits/chosen": -0.5168255567550659, + "logits/rejected": -0.5461878180503845, + "logps/chosen": -53.05106735229492, + "logps/rejected": -134.83526611328125, + "loss": 0.646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.237791061401367, + "rewards/margins": 6.844378471374512, + "rewards/rejected": -3.6065869331359863, + "step": 13058 + }, + { + "epoch": 3.27, + "grad_norm": 3.954591751098633, + "learning_rate": 2.6820339918664944e-06, + "logits/chosen": -0.5463415384292603, + "logits/rejected": -0.640078067779541, + "logps/chosen": -53.889286041259766, + "logps/rejected": -98.21146392822266, + "loss": 0.6123, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.211744785308838, + "rewards/margins": 7.0999836921691895, + "rewards/rejected": -3.8882386684417725, + "step": 13059 + }, + { + "epoch": 3.27, + "grad_norm": 17.350811004638672, + "learning_rate": 2.6813375975936396e-06, + "logits/chosen": -0.6145941019058228, + "logits/rejected": -0.6497775912284851, + "logps/chosen": -41.848995208740234, + "logps/rejected": -106.32642364501953, + "loss": 0.5887, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.309537410736084, + "rewards/margins": 7.1507768630981445, + "rewards/rejected": -3.8412396907806396, + "step": 13060 + }, + { + "epoch": 3.27, + "grad_norm": 16.053058624267578, + "learning_rate": 2.6806412606173993e-06, + "logits/chosen": -0.5602625012397766, + "logits/rejected": -0.6108008623123169, + "logps/chosen": -55.68861770629883, + "logps/rejected": -115.4637222290039, + "loss": 0.6194, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9087777137756348, + "rewards/margins": 7.8402419090271, + "rewards/rejected": -4.931464195251465, + "step": 13061 + }, + { + "epoch": 3.27, + "grad_norm": 5.686217308044434, + "learning_rate": 2.679944980954978e-06, + "logits/chosen": -0.5758025646209717, + "logits/rejected": -0.6433149576187134, + "logps/chosen": -53.28196716308594, + "logps/rejected": -91.13421630859375, + "loss": 0.6496, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9542675018310547, + "rewards/margins": 6.22972297668457, + "rewards/rejected": -3.2754554748535156, + "step": 13062 + }, + { + "epoch": 3.27, + "grad_norm": 2.2417685985565186, + "learning_rate": 2.679248758623585e-06, + "logits/chosen": -0.551668643951416, + "logits/rejected": -0.5828454494476318, + "logps/chosen": -49.618919372558594, + "logps/rejected": -118.42852783203125, + "loss": 0.4843, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172339916229248, + "rewards/margins": 7.628308296203613, + "rewards/rejected": -4.455968379974365, + "step": 13063 + }, + { + "epoch": 3.27, + "grad_norm": 7.576875686645508, + "learning_rate": 2.6785525936404246e-06, + "logits/chosen": -0.565258264541626, + "logits/rejected": -0.6242868900299072, + "logps/chosen": -56.311058044433594, + "logps/rejected": -99.37825012207031, + "loss": 0.6751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7825095653533936, + "rewards/margins": 5.585181713104248, + "rewards/rejected": -2.8026719093322754, + "step": 13064 + }, + { + "epoch": 3.27, + "grad_norm": 9.967081069946289, + "learning_rate": 2.6778564860226998e-06, + "logits/chosen": -0.5458815097808838, + "logits/rejected": -0.6452393531799316, + "logps/chosen": -56.55669021606445, + "logps/rejected": -105.71034240722656, + "loss": 0.6427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9229202270507812, + "rewards/margins": 6.670651912689209, + "rewards/rejected": -3.7477312088012695, + "step": 13065 + }, + { + "epoch": 3.27, + "grad_norm": 4.9176926612854, + "learning_rate": 2.6771604357876078e-06, + "logits/chosen": -0.521264910697937, + "logits/rejected": -0.5506990551948547, + "logps/chosen": -53.06111145019531, + "logps/rejected": -127.83013916015625, + "loss": 0.6181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4618234634399414, + "rewards/margins": 7.566802501678467, + "rewards/rejected": -4.104979515075684, + "step": 13066 + }, + { + "epoch": 3.27, + "grad_norm": 8.67296314239502, + "learning_rate": 2.6764644429523552e-06, + "logits/chosen": -0.5368951559066772, + "logits/rejected": -0.606892466545105, + "logps/chosen": -72.19632720947266, + "logps/rejected": -106.36486053466797, + "loss": 0.8636, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8785524368286133, + "rewards/margins": 6.854559898376465, + "rewards/rejected": -3.9760076999664307, + "step": 13067 + }, + { + "epoch": 3.27, + "grad_norm": 10.601259231567383, + "learning_rate": 2.6757685075341374e-06, + "logits/chosen": -0.5618787407875061, + "logits/rejected": -0.6511861085891724, + "logps/chosen": -56.7725830078125, + "logps/rejected": -129.13327026367188, + "loss": 0.585, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9717307090759277, + "rewards/margins": 8.130597114562988, + "rewards/rejected": -5.158866882324219, + "step": 13068 + }, + { + "epoch": 3.27, + "grad_norm": 4.859751224517822, + "learning_rate": 2.6750726295501493e-06, + "logits/chosen": -0.5704373121261597, + "logits/rejected": -0.6724965572357178, + "logps/chosen": -51.42195129394531, + "logps/rejected": -104.76008605957031, + "loss": 0.5788, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.939953327178955, + "rewards/margins": 7.3113250732421875, + "rewards/rejected": -4.371371269226074, + "step": 13069 + }, + { + "epoch": 3.27, + "grad_norm": 22.395591735839844, + "learning_rate": 2.6743768090175927e-06, + "logits/chosen": -0.4979843497276306, + "logits/rejected": -0.5966551303863525, + "logps/chosen": -58.65495681762695, + "logps/rejected": -117.04466247558594, + "loss": 0.7332, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9818477630615234, + "rewards/margins": 5.933539867401123, + "rewards/rejected": -2.9516921043395996, + "step": 13070 + }, + { + "epoch": 3.27, + "grad_norm": 8.369085311889648, + "learning_rate": 2.6736810459536575e-06, + "logits/chosen": -0.49921727180480957, + "logits/rejected": -0.5576946139335632, + "logps/chosen": -59.9381217956543, + "logps/rejected": -121.98267364501953, + "loss": 0.6433, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9116952419281006, + "rewards/margins": 6.767844200134277, + "rewards/rejected": -3.856149196624756, + "step": 13071 + }, + { + "epoch": 3.27, + "grad_norm": 4.598433494567871, + "learning_rate": 2.672985340375537e-06, + "logits/chosen": -0.5970401763916016, + "logits/rejected": -0.6368678212165833, + "logps/chosen": -46.452110290527344, + "logps/rejected": -99.86067962646484, + "loss": 0.6222, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3349404335021973, + "rewards/margins": 6.4338812828063965, + "rewards/rejected": -3.0989415645599365, + "step": 13072 + }, + { + "epoch": 3.27, + "grad_norm": 6.684057235717773, + "learning_rate": 2.672289692300425e-06, + "logits/chosen": -0.5604979991912842, + "logits/rejected": -0.6379446387290955, + "logps/chosen": -47.235206604003906, + "logps/rejected": -96.57235717773438, + "loss": 0.5817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8371715545654297, + "rewards/margins": 6.4522786140441895, + "rewards/rejected": -3.6151070594787598, + "step": 13073 + }, + { + "epoch": 3.27, + "grad_norm": 3.549981117248535, + "learning_rate": 2.6715941017455093e-06, + "logits/chosen": -0.449079692363739, + "logits/rejected": -0.5910030007362366, + "logps/chosen": -72.09742736816406, + "logps/rejected": -103.2260513305664, + "loss": 0.6219, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1064951419830322, + "rewards/margins": 8.185575485229492, + "rewards/rejected": -5.079080104827881, + "step": 13074 + }, + { + "epoch": 3.27, + "grad_norm": 9.915592193603516, + "learning_rate": 2.6708985687279816e-06, + "logits/chosen": -0.6240312457084656, + "logits/rejected": -0.683541476726532, + "logps/chosen": -58.78014373779297, + "logps/rejected": -119.5705337524414, + "loss": 0.7156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.835695743560791, + "rewards/margins": 7.09727668762207, + "rewards/rejected": -4.261580467224121, + "step": 13075 + }, + { + "epoch": 3.27, + "grad_norm": 4.172961711883545, + "learning_rate": 2.670203093265026e-06, + "logits/chosen": -0.5293623208999634, + "logits/rejected": -0.6647867560386658, + "logps/chosen": -58.991153717041016, + "logps/rejected": -109.01580810546875, + "loss": 0.6112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0632071495056152, + "rewards/margins": 7.4018330574035645, + "rewards/rejected": -4.338625431060791, + "step": 13076 + }, + { + "epoch": 3.27, + "grad_norm": 3.3704264163970947, + "learning_rate": 2.6695076753738312e-06, + "logits/chosen": -0.5028331279754639, + "logits/rejected": -0.501631498336792, + "logps/chosen": -41.219757080078125, + "logps/rejected": -100.80244445800781, + "loss": 0.5844, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0853748321533203, + "rewards/margins": 5.975795745849609, + "rewards/rejected": -2.890420913696289, + "step": 13077 + }, + { + "epoch": 3.27, + "grad_norm": 4.560874938964844, + "learning_rate": 2.668812315071581e-06, + "logits/chosen": -0.5500693321228027, + "logits/rejected": -0.6058897376060486, + "logps/chosen": -56.78337860107422, + "logps/rejected": -108.00685119628906, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2249019145965576, + "rewards/margins": 6.789370059967041, + "rewards/rejected": -3.5644686222076416, + "step": 13078 + }, + { + "epoch": 3.27, + "grad_norm": 9.339523315429688, + "learning_rate": 2.6681170123754565e-06, + "logits/chosen": -0.5626574754714966, + "logits/rejected": -0.6286130547523499, + "logps/chosen": -59.42335510253906, + "logps/rejected": -120.82886505126953, + "loss": 0.611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9498841762542725, + "rewards/margins": 7.590428829193115, + "rewards/rejected": -4.640544891357422, + "step": 13079 + }, + { + "epoch": 3.27, + "grad_norm": 6.889956951141357, + "learning_rate": 2.6674217673026427e-06, + "logits/chosen": -0.5589190721511841, + "logits/rejected": -0.6352797150611877, + "logps/chosen": -61.867000579833984, + "logps/rejected": -106.5961685180664, + "loss": 0.7066, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9287517070770264, + "rewards/margins": 7.144343376159668, + "rewards/rejected": -4.2155914306640625, + "step": 13080 + }, + { + "epoch": 3.27, + "grad_norm": 8.467902183532715, + "learning_rate": 2.6667265798703155e-06, + "logits/chosen": -0.6237553358078003, + "logits/rejected": -0.6504402160644531, + "logps/chosen": -63.499725341796875, + "logps/rejected": -108.03871154785156, + "loss": 0.6551, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7886815071105957, + "rewards/margins": 6.3704071044921875, + "rewards/rejected": -3.5817246437072754, + "step": 13081 + }, + { + "epoch": 3.27, + "grad_norm": 7.133803367614746, + "learning_rate": 2.6660314500956587e-06, + "logits/chosen": -0.5407106876373291, + "logits/rejected": -0.6154676079750061, + "logps/chosen": -58.4536247253418, + "logps/rejected": -109.01730346679688, + "loss": 0.6477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0511586666107178, + "rewards/margins": 6.931722640991211, + "rewards/rejected": -3.8805642127990723, + "step": 13082 + }, + { + "epoch": 3.27, + "grad_norm": 4.094451427459717, + "learning_rate": 2.6653363779958476e-06, + "logits/chosen": -0.5045437216758728, + "logits/rejected": -0.5660532116889954, + "logps/chosen": -59.19902801513672, + "logps/rejected": -122.43498229980469, + "loss": 0.5917, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.737910747528076, + "rewards/margins": 7.304253578186035, + "rewards/rejected": -4.566342830657959, + "step": 13083 + }, + { + "epoch": 3.27, + "grad_norm": 9.589446067810059, + "learning_rate": 2.6646413635880553e-06, + "logits/chosen": -0.5422253608703613, + "logits/rejected": -0.5905141234397888, + "logps/chosen": -54.051300048828125, + "logps/rejected": -111.11283874511719, + "loss": 0.561, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9899065494537354, + "rewards/margins": 6.949344635009766, + "rewards/rejected": -3.9594380855560303, + "step": 13084 + }, + { + "epoch": 3.27, + "grad_norm": 6.314566135406494, + "learning_rate": 2.663946406889461e-06, + "logits/chosen": -0.6044971942901611, + "logits/rejected": -0.6430069208145142, + "logps/chosen": -59.13113021850586, + "logps/rejected": -121.16658020019531, + "loss": 0.6685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1583759784698486, + "rewards/margins": 7.684000015258789, + "rewards/rejected": -4.5256242752075195, + "step": 13085 + }, + { + "epoch": 3.27, + "grad_norm": 5.381219387054443, + "learning_rate": 2.6632515079172362e-06, + "logits/chosen": -0.5885920524597168, + "logits/rejected": -0.7384048700332642, + "logps/chosen": -56.00823974609375, + "logps/rejected": -96.04164123535156, + "loss": 0.5914, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2901508808135986, + "rewards/margins": 8.520798683166504, + "rewards/rejected": -5.230647087097168, + "step": 13086 + }, + { + "epoch": 3.27, + "grad_norm": 5.0839643478393555, + "learning_rate": 2.6625566666885496e-06, + "logits/chosen": -0.5201681852340698, + "logits/rejected": -0.5926219820976257, + "logps/chosen": -51.99380111694336, + "logps/rejected": -95.92803192138672, + "loss": 0.6624, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0064902305603027, + "rewards/margins": 6.446730613708496, + "rewards/rejected": -3.440239906311035, + "step": 13087 + }, + { + "epoch": 3.27, + "grad_norm": 3.901933431625366, + "learning_rate": 2.6618618832205744e-06, + "logits/chosen": -0.5741803646087646, + "logits/rejected": -0.6182662844657898, + "logps/chosen": -49.35434341430664, + "logps/rejected": -119.88075256347656, + "loss": 0.5949, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.062302350997925, + "rewards/margins": 7.3098225593566895, + "rewards/rejected": -4.247520446777344, + "step": 13088 + }, + { + "epoch": 3.27, + "grad_norm": 3.6645615100860596, + "learning_rate": 2.6611671575304826e-06, + "logits/chosen": -0.5643434524536133, + "logits/rejected": -0.6129657030105591, + "logps/chosen": -55.81697463989258, + "logps/rejected": -111.58211517333984, + "loss": 0.6106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.007139205932617, + "rewards/margins": 7.48378324508667, + "rewards/rejected": -4.476644039154053, + "step": 13089 + }, + { + "epoch": 3.27, + "grad_norm": 5.063594818115234, + "learning_rate": 2.6604724896354338e-06, + "logits/chosen": -0.42439132928848267, + "logits/rejected": -0.5816828012466431, + "logps/chosen": -66.68708038330078, + "logps/rejected": -95.46968078613281, + "loss": 0.6318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1366829872131348, + "rewards/margins": 6.685472011566162, + "rewards/rejected": -3.5487890243530273, + "step": 13090 + }, + { + "epoch": 3.27, + "grad_norm": 10.582186698913574, + "learning_rate": 2.659777879552599e-06, + "logits/chosen": -0.5856891870498657, + "logits/rejected": -0.6902364492416382, + "logps/chosen": -54.88764190673828, + "logps/rejected": -102.76609802246094, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1856305599212646, + "rewards/margins": 7.154654502868652, + "rewards/rejected": -3.9690232276916504, + "step": 13091 + }, + { + "epoch": 3.28, + "grad_norm": 4.923028469085693, + "learning_rate": 2.6590833272991435e-06, + "logits/chosen": -0.5638689994812012, + "logits/rejected": -0.626686155796051, + "logps/chosen": -51.27643966674805, + "logps/rejected": -98.12090301513672, + "loss": 0.5862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2736923694610596, + "rewards/margins": 6.637517929077148, + "rewards/rejected": -3.363825559616089, + "step": 13092 + }, + { + "epoch": 3.28, + "grad_norm": 6.568387031555176, + "learning_rate": 2.6583888328922293e-06, + "logits/chosen": -0.5495897531509399, + "logits/rejected": -0.5738093852996826, + "logps/chosen": -49.83721923828125, + "logps/rejected": -88.93451690673828, + "loss": 0.7418, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.983206272125244, + "rewards/margins": 5.095296859741211, + "rewards/rejected": -2.1120903491973877, + "step": 13093 + }, + { + "epoch": 3.28, + "grad_norm": 7.472297668457031, + "learning_rate": 2.6576943963490152e-06, + "logits/chosen": -0.47183531522750854, + "logits/rejected": -0.5709206461906433, + "logps/chosen": -66.33792114257812, + "logps/rejected": -103.21903991699219, + "loss": 0.6416, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.893526554107666, + "rewards/margins": 7.207336902618408, + "rewards/rejected": -4.313810348510742, + "step": 13094 + }, + { + "epoch": 3.28, + "grad_norm": 5.378133296966553, + "learning_rate": 2.657000017686666e-06, + "logits/chosen": -0.6004806160926819, + "logits/rejected": -0.6444725394248962, + "logps/chosen": -52.73905563354492, + "logps/rejected": -98.03713989257812, + "loss": 0.7005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.061663866043091, + "rewards/margins": 5.688633918762207, + "rewards/rejected": -2.626970052719116, + "step": 13095 + }, + { + "epoch": 3.28, + "grad_norm": 3.8713488578796387, + "learning_rate": 2.656305696922339e-06, + "logits/chosen": -0.4943143129348755, + "logits/rejected": -0.6019298434257507, + "logps/chosen": -55.5654182434082, + "logps/rejected": -98.66929626464844, + "loss": 0.5878, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9056236743927, + "rewards/margins": 6.121799945831299, + "rewards/rejected": -3.2161762714385986, + "step": 13096 + }, + { + "epoch": 3.28, + "grad_norm": 7.389645576477051, + "learning_rate": 2.6556114340731885e-06, + "logits/chosen": -0.5580820441246033, + "logits/rejected": -0.5857337117195129, + "logps/chosen": -49.6036262512207, + "logps/rejected": -111.27572631835938, + "loss": 0.5617, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1373345851898193, + "rewards/margins": 6.575228214263916, + "rewards/rejected": -3.4378933906555176, + "step": 13097 + }, + { + "epoch": 3.28, + "grad_norm": 5.081210613250732, + "learning_rate": 2.6549172291563753e-06, + "logits/chosen": -0.5599743127822876, + "logits/rejected": -0.625625491142273, + "logps/chosen": -57.134273529052734, + "logps/rejected": -93.86575317382812, + "loss": 0.6227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.107952117919922, + "rewards/margins": 6.171073913574219, + "rewards/rejected": -3.0631213188171387, + "step": 13098 + }, + { + "epoch": 3.28, + "grad_norm": 8.78904914855957, + "learning_rate": 2.6542230821890497e-06, + "logits/chosen": -0.6470574736595154, + "logits/rejected": -0.7149322032928467, + "logps/chosen": -53.0885009765625, + "logps/rejected": -82.66717529296875, + "loss": 0.673, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.295732021331787, + "rewards/margins": 5.982240676879883, + "rewards/rejected": -2.686509132385254, + "step": 13099 + }, + { + "epoch": 3.28, + "grad_norm": 3.1687967777252197, + "learning_rate": 2.653528993188369e-06, + "logits/chosen": -0.5680676698684692, + "logits/rejected": -0.6284087300300598, + "logps/chosen": -51.92506408691406, + "logps/rejected": -99.35062408447266, + "loss": 0.6371, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0213301181793213, + "rewards/margins": 6.574000358581543, + "rewards/rejected": -3.552670478820801, + "step": 13100 + }, + { + "epoch": 3.28, + "grad_norm": 7.2980241775512695, + "learning_rate": 2.652834962171482e-06, + "logits/chosen": -0.5331640243530273, + "logits/rejected": -0.5885346531867981, + "logps/chosen": -49.07514572143555, + "logps/rejected": -119.10983276367188, + "loss": 0.668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0780580043792725, + "rewards/margins": 6.786142349243164, + "rewards/rejected": -3.7080843448638916, + "step": 13101 + }, + { + "epoch": 3.28, + "grad_norm": 5.320509910583496, + "learning_rate": 2.6521409891555385e-06, + "logits/chosen": -0.490119606256485, + "logits/rejected": -0.5583285093307495, + "logps/chosen": -65.7527084350586, + "logps/rejected": -112.11581420898438, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1349058151245117, + "rewards/margins": 6.764068603515625, + "rewards/rejected": -3.6291635036468506, + "step": 13102 + }, + { + "epoch": 3.28, + "grad_norm": 5.888826370239258, + "learning_rate": 2.6514470741576904e-06, + "logits/chosen": -0.5568234920501709, + "logits/rejected": -0.6700741052627563, + "logps/chosen": -51.70832824707031, + "logps/rejected": -93.5186767578125, + "loss": 0.6763, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9901905059814453, + "rewards/margins": 6.552964687347412, + "rewards/rejected": -3.562774181365967, + "step": 13103 + }, + { + "epoch": 3.28, + "grad_norm": 4.80942964553833, + "learning_rate": 2.6507532171950824e-06, + "logits/chosen": -0.4296785593032837, + "logits/rejected": -0.48143982887268066, + "logps/chosen": -50.33956527709961, + "logps/rejected": -105.54985046386719, + "loss": 0.5797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1816415786743164, + "rewards/margins": 7.065560340881348, + "rewards/rejected": -3.8839190006256104, + "step": 13104 + }, + { + "epoch": 3.28, + "grad_norm": 6.388784885406494, + "learning_rate": 2.65005941828486e-06, + "logits/chosen": -0.5617353320121765, + "logits/rejected": -0.6241453289985657, + "logps/chosen": -43.926231384277344, + "logps/rejected": -128.0572967529297, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023560047149658, + "rewards/margins": 7.835319995880127, + "rewards/rejected": -4.811761379241943, + "step": 13105 + }, + { + "epoch": 3.28, + "grad_norm": 4.184526443481445, + "learning_rate": 2.6493656774441683e-06, + "logits/chosen": -0.5238258838653564, + "logits/rejected": -0.5943769812583923, + "logps/chosen": -53.718231201171875, + "logps/rejected": -90.737060546875, + "loss": 0.6356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9774272441864014, + "rewards/margins": 6.747020244598389, + "rewards/rejected": -3.7695934772491455, + "step": 13106 + }, + { + "epoch": 3.28, + "grad_norm": 3.0692107677459717, + "learning_rate": 2.6486719946901527e-06, + "logits/chosen": -0.5445756912231445, + "logits/rejected": -0.6315834522247314, + "logps/chosen": -63.3330078125, + "logps/rejected": -90.13275146484375, + "loss": 0.5934, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0435261726379395, + "rewards/margins": 6.601430892944336, + "rewards/rejected": -3.5579049587249756, + "step": 13107 + }, + { + "epoch": 3.28, + "grad_norm": 4.347497940063477, + "learning_rate": 2.647978370039954e-06, + "logits/chosen": -0.5221060514450073, + "logits/rejected": -0.6451378464698792, + "logps/chosen": -56.01395034790039, + "logps/rejected": -88.44207763671875, + "loss": 0.5387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2846319675445557, + "rewards/margins": 6.2074079513549805, + "rewards/rejected": -2.922776222229004, + "step": 13108 + }, + { + "epoch": 3.28, + "grad_norm": 3.1416125297546387, + "learning_rate": 2.647284803510709e-06, + "logits/chosen": -0.5200308561325073, + "logits/rejected": -0.6130949258804321, + "logps/chosen": -65.79460144042969, + "logps/rejected": -126.06018829345703, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120436668395996, + "rewards/margins": 7.4845733642578125, + "rewards/rejected": -4.364137172698975, + "step": 13109 + }, + { + "epoch": 3.28, + "grad_norm": 4.8995866775512695, + "learning_rate": 2.646591295119561e-06, + "logits/chosen": -0.5343159437179565, + "logits/rejected": -0.6132565140724182, + "logps/chosen": -58.839378356933594, + "logps/rejected": -100.30191040039062, + "loss": 0.6892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1160669326782227, + "rewards/margins": 7.439878463745117, + "rewards/rejected": -4.3238115310668945, + "step": 13110 + }, + { + "epoch": 3.28, + "grad_norm": 2.5814616680145264, + "learning_rate": 2.6458978448836453e-06, + "logits/chosen": -0.5289422869682312, + "logits/rejected": -0.6217617392539978, + "logps/chosen": -61.40500259399414, + "logps/rejected": -112.48876190185547, + "loss": 0.585, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3365960121154785, + "rewards/margins": 8.016541481018066, + "rewards/rejected": -4.67994499206543, + "step": 13111 + }, + { + "epoch": 3.28, + "grad_norm": 9.38254451751709, + "learning_rate": 2.6452044528200958e-06, + "logits/chosen": -0.5871121287345886, + "logits/rejected": -0.6506015062332153, + "logps/chosen": -52.73908996582031, + "logps/rejected": -102.41847229003906, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.218400001525879, + "rewards/margins": 7.293876647949219, + "rewards/rejected": -4.075477123260498, + "step": 13112 + }, + { + "epoch": 3.28, + "grad_norm": 5.980746746063232, + "learning_rate": 2.6445111189460517e-06, + "logits/chosen": -0.458789587020874, + "logits/rejected": -0.5878183245658875, + "logps/chosen": -64.2994155883789, + "logps/rejected": -101.47151184082031, + "loss": 0.6356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0917980670928955, + "rewards/margins": 7.40720272064209, + "rewards/rejected": -4.315404415130615, + "step": 13113 + }, + { + "epoch": 3.28, + "grad_norm": 2.5909409523010254, + "learning_rate": 2.643817843278642e-06, + "logits/chosen": -0.5411593914031982, + "logits/rejected": -0.6075475811958313, + "logps/chosen": -46.9318962097168, + "logps/rejected": -129.39620971679688, + "loss": 0.4771, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.154721975326538, + "rewards/margins": 9.088805198669434, + "rewards/rejected": -5.934083461761475, + "step": 13114 + }, + { + "epoch": 3.28, + "grad_norm": 14.865947723388672, + "learning_rate": 2.643124625834998e-06, + "logits/chosen": -0.5203412771224976, + "logits/rejected": -0.5820322036743164, + "logps/chosen": -55.66697311401367, + "logps/rejected": -109.83746337890625, + "loss": 0.5873, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.455584764480591, + "rewards/margins": 6.879270553588867, + "rewards/rejected": -3.423685073852539, + "step": 13115 + }, + { + "epoch": 3.28, + "grad_norm": 12.473583221435547, + "learning_rate": 2.6424314666322537e-06, + "logits/chosen": -0.6186360716819763, + "logits/rejected": -0.6861934661865234, + "logps/chosen": -54.749168395996094, + "logps/rejected": -113.24639892578125, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0724966526031494, + "rewards/margins": 6.587068557739258, + "rewards/rejected": -3.5145721435546875, + "step": 13116 + }, + { + "epoch": 3.28, + "grad_norm": 2.8028759956359863, + "learning_rate": 2.641738365687533e-06, + "logits/chosen": -0.4742651581764221, + "logits/rejected": -0.5440508723258972, + "logps/chosen": -57.715476989746094, + "logps/rejected": -101.21427917480469, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.781930685043335, + "rewards/margins": 6.307046890258789, + "rewards/rejected": -3.5251169204711914, + "step": 13117 + }, + { + "epoch": 3.28, + "grad_norm": 12.221397399902344, + "learning_rate": 2.6410453230179677e-06, + "logits/chosen": -0.5679915547370911, + "logits/rejected": -0.659916877746582, + "logps/chosen": -62.79450607299805, + "logps/rejected": -104.42317199707031, + "loss": 0.7118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.002156972885132, + "rewards/margins": 6.711660385131836, + "rewards/rejected": -3.7095038890838623, + "step": 13118 + }, + { + "epoch": 3.28, + "grad_norm": 4.598519325256348, + "learning_rate": 2.64035233864068e-06, + "logits/chosen": -0.5051611065864563, + "logits/rejected": -0.5744563341140747, + "logps/chosen": -50.345436096191406, + "logps/rejected": -103.96651458740234, + "loss": 0.5777, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1351370811462402, + "rewards/margins": 6.202029705047607, + "rewards/rejected": -3.066892147064209, + "step": 13119 + }, + { + "epoch": 3.28, + "grad_norm": 27.5783748626709, + "learning_rate": 2.639659412572797e-06, + "logits/chosen": -0.53318190574646, + "logits/rejected": -0.5926026105880737, + "logps/chosen": -49.95268249511719, + "logps/rejected": -104.08334350585938, + "loss": 0.6477, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.208312511444092, + "rewards/margins": 7.133710861206055, + "rewards/rejected": -3.925398111343384, + "step": 13120 + }, + { + "epoch": 3.28, + "grad_norm": 3.255110025405884, + "learning_rate": 2.638966544831442e-06, + "logits/chosen": -0.5451125502586365, + "logits/rejected": -0.6445589661598206, + "logps/chosen": -52.605865478515625, + "logps/rejected": -102.62939453125, + "loss": 0.5556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1036906242370605, + "rewards/margins": 7.039984703063965, + "rewards/rejected": -3.936293601989746, + "step": 13121 + }, + { + "epoch": 3.28, + "grad_norm": 6.808048725128174, + "learning_rate": 2.638273735433731e-06, + "logits/chosen": -0.5806682109832764, + "logits/rejected": -0.5815949440002441, + "logps/chosen": -45.16447067260742, + "logps/rejected": -120.45523071289062, + "loss": 0.6833, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0708365440368652, + "rewards/margins": 6.486429214477539, + "rewards/rejected": -3.4155921936035156, + "step": 13122 + }, + { + "epoch": 3.28, + "grad_norm": 5.883810520172119, + "learning_rate": 2.6375809843967913e-06, + "logits/chosen": -0.5289968252182007, + "logits/rejected": -0.5668152570724487, + "logps/chosen": -56.987579345703125, + "logps/rejected": -116.29034423828125, + "loss": 0.6675, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7870066165924072, + "rewards/margins": 5.890566349029541, + "rewards/rejected": -3.103560447692871, + "step": 13123 + }, + { + "epoch": 3.28, + "grad_norm": 7.771599769592285, + "learning_rate": 2.6368882917377357e-06, + "logits/chosen": -0.5062300562858582, + "logits/rejected": -0.5461716055870056, + "logps/chosen": -57.53478240966797, + "logps/rejected": -99.39543151855469, + "loss": 0.7146, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1925296783447266, + "rewards/margins": 6.36478328704834, + "rewards/rejected": -3.1722536087036133, + "step": 13124 + }, + { + "epoch": 3.28, + "grad_norm": 4.127809524536133, + "learning_rate": 2.6361956574736867e-06, + "logits/chosen": -0.6060518026351929, + "logits/rejected": -0.6970412731170654, + "logps/chosen": -44.09960174560547, + "logps/rejected": -97.98548126220703, + "loss": 0.582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.271397352218628, + "rewards/margins": 7.3161091804504395, + "rewards/rejected": -4.044712066650391, + "step": 13125 + }, + { + "epoch": 3.28, + "grad_norm": 4.938198089599609, + "learning_rate": 2.6355030816217566e-06, + "logits/chosen": -0.5301270484924316, + "logits/rejected": -0.6312923431396484, + "logps/chosen": -54.88131332397461, + "logps/rejected": -106.11453247070312, + "loss": 0.6166, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8992748260498047, + "rewards/margins": 6.742269039154053, + "rewards/rejected": -3.842994451522827, + "step": 13126 + }, + { + "epoch": 3.28, + "grad_norm": 5.615264415740967, + "learning_rate": 2.6348105641990586e-06, + "logits/chosen": -0.5469006896018982, + "logits/rejected": -0.6218383312225342, + "logps/chosen": -48.277809143066406, + "logps/rejected": -88.699951171875, + "loss": 0.5857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9264891147613525, + "rewards/margins": 6.1091766357421875, + "rewards/rejected": -3.182687282562256, + "step": 13127 + }, + { + "epoch": 3.28, + "grad_norm": 21.851165771484375, + "learning_rate": 2.6341181052227094e-06, + "logits/chosen": -0.5219423174858093, + "logits/rejected": -0.5798875093460083, + "logps/chosen": -67.16278076171875, + "logps/rejected": -114.31683349609375, + "loss": 0.7335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.837561845779419, + "rewards/margins": 6.894966125488281, + "rewards/rejected": -4.057404518127441, + "step": 13128 + }, + { + "epoch": 3.28, + "grad_norm": 5.879770278930664, + "learning_rate": 2.6334257047098176e-06, + "logits/chosen": -0.4675179421901703, + "logits/rejected": -0.6315505504608154, + "logps/chosen": -78.0185546875, + "logps/rejected": -93.26713562011719, + "loss": 0.6977, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.170241355895996, + "rewards/margins": 5.873452663421631, + "rewards/rejected": -2.7032110691070557, + "step": 13129 + }, + { + "epoch": 3.28, + "grad_norm": 4.662182807922363, + "learning_rate": 2.6327333626774933e-06, + "logits/chosen": -0.5785359740257263, + "logits/rejected": -0.6253467202186584, + "logps/chosen": -48.66139602661133, + "logps/rejected": -92.30668640136719, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.648162364959717, + "rewards/margins": 6.344955921173096, + "rewards/rejected": -3.696794271469116, + "step": 13130 + }, + { + "epoch": 3.28, + "grad_norm": 3.216312885284424, + "learning_rate": 2.632041079142844e-06, + "logits/chosen": -0.5882574319839478, + "logits/rejected": -0.6452365517616272, + "logps/chosen": -49.22125244140625, + "logps/rejected": -96.11175537109375, + "loss": 0.6495, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.817034959793091, + "rewards/margins": 6.550136089324951, + "rewards/rejected": -3.7331008911132812, + "step": 13131 + }, + { + "epoch": 3.29, + "grad_norm": 12.091526985168457, + "learning_rate": 2.631348854122983e-06, + "logits/chosen": -0.5900425910949707, + "logits/rejected": -0.6862724423408508, + "logps/chosen": -60.13926315307617, + "logps/rejected": -105.14937591552734, + "loss": 0.76, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.81363582611084, + "rewards/margins": 5.82778263092041, + "rewards/rejected": -3.0141468048095703, + "step": 13132 + }, + { + "epoch": 3.29, + "grad_norm": 12.41063117980957, + "learning_rate": 2.6306566876350072e-06, + "logits/chosen": -0.5422633290290833, + "logits/rejected": -0.559095561504364, + "logps/chosen": -52.71406173706055, + "logps/rejected": -126.75489807128906, + "loss": 0.6029, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8231163024902344, + "rewards/margins": 7.062236785888672, + "rewards/rejected": -4.2391204833984375, + "step": 13133 + }, + { + "epoch": 3.29, + "grad_norm": 2.321965217590332, + "learning_rate": 2.629964579696025e-06, + "logits/chosen": -0.5388345718383789, + "logits/rejected": -0.681634783744812, + "logps/chosen": -64.45851135253906, + "logps/rejected": -93.56529998779297, + "loss": 0.5658, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081007719039917, + "rewards/margins": 7.271121978759766, + "rewards/rejected": -4.190114498138428, + "step": 13134 + }, + { + "epoch": 3.29, + "grad_norm": 4.094658374786377, + "learning_rate": 2.6292725303231427e-06, + "logits/chosen": -0.4770854711532593, + "logits/rejected": -0.6114847660064697, + "logps/chosen": -61.5405158996582, + "logps/rejected": -108.81380462646484, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.059499740600586, + "rewards/margins": 7.325819969177246, + "rewards/rejected": -4.26632022857666, + "step": 13135 + }, + { + "epoch": 3.29, + "grad_norm": 4.280628681182861, + "learning_rate": 2.6285805395334537e-06, + "logits/chosen": -0.5618000030517578, + "logits/rejected": -0.6294518709182739, + "logps/chosen": -58.25328063964844, + "logps/rejected": -110.29080963134766, + "loss": 0.6753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03275203704834, + "rewards/margins": 7.0793304443359375, + "rewards/rejected": -4.046578407287598, + "step": 13136 + }, + { + "epoch": 3.29, + "grad_norm": 4.127385139465332, + "learning_rate": 2.627888607344062e-06, + "logits/chosen": -0.4801863431930542, + "logits/rejected": -0.6094738841056824, + "logps/chosen": -58.819061279296875, + "logps/rejected": -94.64492797851562, + "loss": 0.6363, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8191473484039307, + "rewards/margins": 6.247572422027588, + "rewards/rejected": -3.4284255504608154, + "step": 13137 + }, + { + "epoch": 3.29, + "grad_norm": 2.8667383193969727, + "learning_rate": 2.627196733772067e-06, + "logits/chosen": -0.549692690372467, + "logits/rejected": -0.6709474325180054, + "logps/chosen": -57.56740951538086, + "logps/rejected": -99.22325134277344, + "loss": 0.6405, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0167436599731445, + "rewards/margins": 6.943939208984375, + "rewards/rejected": -3.9271955490112305, + "step": 13138 + }, + { + "epoch": 3.29, + "grad_norm": 3.365417718887329, + "learning_rate": 2.626504918834566e-06, + "logits/chosen": -0.5433074831962585, + "logits/rejected": -0.6505104899406433, + "logps/chosen": -61.8875732421875, + "logps/rejected": -100.79650115966797, + "loss": 0.5981, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9653377532958984, + "rewards/margins": 6.238002300262451, + "rewards/rejected": -3.2726645469665527, + "step": 13139 + }, + { + "epoch": 3.29, + "grad_norm": 6.225637435913086, + "learning_rate": 2.6258131625486504e-06, + "logits/chosen": -0.5473542809486389, + "logits/rejected": -0.6072826385498047, + "logps/chosen": -55.83131790161133, + "logps/rejected": -117.10958862304688, + "loss": 0.7227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1480681896209717, + "rewards/margins": 8.081075668334961, + "rewards/rejected": -4.93300724029541, + "step": 13140 + }, + { + "epoch": 3.29, + "grad_norm": 3.757329225540161, + "learning_rate": 2.625121464931419e-06, + "logits/chosen": -0.5703373551368713, + "logits/rejected": -0.6288368701934814, + "logps/chosen": -53.490943908691406, + "logps/rejected": -120.66517639160156, + "loss": 0.6859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0330569744110107, + "rewards/margins": 8.41184139251709, + "rewards/rejected": -5.378784656524658, + "step": 13141 + }, + { + "epoch": 3.29, + "grad_norm": 2.967517852783203, + "learning_rate": 2.62442982599996e-06, + "logits/chosen": -0.5289414525032043, + "logits/rejected": -0.6516821384429932, + "logps/chosen": -59.76757049560547, + "logps/rejected": -90.413330078125, + "loss": 0.5775, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.436889171600342, + "rewards/margins": 7.2712531089782715, + "rewards/rejected": -3.8343639373779297, + "step": 13142 + }, + { + "epoch": 3.29, + "grad_norm": 2.866370916366577, + "learning_rate": 2.623738245771369e-06, + "logits/chosen": -0.5642393827438354, + "logits/rejected": -0.6458732485771179, + "logps/chosen": -58.894981384277344, + "logps/rejected": -106.24991607666016, + "loss": 0.6368, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4885101318359375, + "rewards/margins": 7.410051345825195, + "rewards/rejected": -3.921541690826416, + "step": 13143 + }, + { + "epoch": 3.29, + "grad_norm": 2.735233783721924, + "learning_rate": 2.6230467242627333e-06, + "logits/chosen": -0.5247228145599365, + "logits/rejected": -0.6300089359283447, + "logps/chosen": -47.51136779785156, + "logps/rejected": -93.19210052490234, + "loss": 0.5087, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1658823490142822, + "rewards/margins": 6.9108734130859375, + "rewards/rejected": -3.744990825653076, + "step": 13144 + }, + { + "epoch": 3.29, + "grad_norm": 11.795595169067383, + "learning_rate": 2.6223552614911397e-06, + "logits/chosen": -0.5039234161376953, + "logits/rejected": -0.5926675796508789, + "logps/chosen": -65.27532958984375, + "logps/rejected": -96.73239135742188, + "loss": 0.7733, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0471646785736084, + "rewards/margins": 6.107492923736572, + "rewards/rejected": -3.060328483581543, + "step": 13145 + }, + { + "epoch": 3.29, + "grad_norm": 4.716121673583984, + "learning_rate": 2.6216638574736777e-06, + "logits/chosen": -0.6451045870780945, + "logits/rejected": -0.7084671258926392, + "logps/chosen": -54.60024642944336, + "logps/rejected": -100.34007263183594, + "loss": 0.6816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7098214626312256, + "rewards/margins": 6.234264373779297, + "rewards/rejected": -3.524442195892334, + "step": 13146 + }, + { + "epoch": 3.29, + "grad_norm": 2.140702486038208, + "learning_rate": 2.620972512227432e-06, + "logits/chosen": -0.5209695100784302, + "logits/rejected": -0.6034259796142578, + "logps/chosen": -44.89997863769531, + "logps/rejected": -91.63663482666016, + "loss": 0.5002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.527533769607544, + "rewards/margins": 7.068103313446045, + "rewards/rejected": -3.540569543838501, + "step": 13147 + }, + { + "epoch": 3.29, + "grad_norm": 3.6026744842529297, + "learning_rate": 2.620281225769484e-06, + "logits/chosen": -0.5025637149810791, + "logits/rejected": -0.530595064163208, + "logps/chosen": -51.76284408569336, + "logps/rejected": -113.0181884765625, + "loss": 0.6199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.204742431640625, + "rewards/margins": 6.915863990783691, + "rewards/rejected": -3.7111213207244873, + "step": 13148 + }, + { + "epoch": 3.29, + "grad_norm": 8.358970642089844, + "learning_rate": 2.6195899981169183e-06, + "logits/chosen": -0.6292665004730225, + "logits/rejected": -0.69974684715271, + "logps/chosen": -51.624271392822266, + "logps/rejected": -101.78585052490234, + "loss": 0.6326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2713992595672607, + "rewards/margins": 6.082244873046875, + "rewards/rejected": -2.8108456134796143, + "step": 13149 + }, + { + "epoch": 3.29, + "grad_norm": 7.096480369567871, + "learning_rate": 2.618898829286819e-06, + "logits/chosen": -0.5544565916061401, + "logits/rejected": -0.6616829633712769, + "logps/chosen": -57.2750358581543, + "logps/rejected": -113.41680908203125, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948369026184082, + "rewards/margins": 7.2729597091674805, + "rewards/rejected": -4.324591159820557, + "step": 13150 + }, + { + "epoch": 3.29, + "grad_norm": 2.317619562149048, + "learning_rate": 2.6182077192962586e-06, + "logits/chosen": -0.5512205362319946, + "logits/rejected": -0.5793302059173584, + "logps/chosen": -45.931671142578125, + "logps/rejected": -98.31686401367188, + "loss": 0.5396, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4687016010284424, + "rewards/margins": 6.382650375366211, + "rewards/rejected": -2.9139490127563477, + "step": 13151 + }, + { + "epoch": 3.29, + "grad_norm": 3.057882308959961, + "learning_rate": 2.617516668162319e-06, + "logits/chosen": -0.5298843383789062, + "logits/rejected": -0.6037636399269104, + "logps/chosen": -56.485877990722656, + "logps/rejected": -99.8808822631836, + "loss": 0.5707, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2496860027313232, + "rewards/margins": 6.3117570877075195, + "rewards/rejected": -3.0620718002319336, + "step": 13152 + }, + { + "epoch": 3.29, + "grad_norm": 15.63710880279541, + "learning_rate": 2.6168256759020784e-06, + "logits/chosen": -0.5598815083503723, + "logits/rejected": -0.6148298382759094, + "logps/chosen": -55.933250427246094, + "logps/rejected": -104.70992279052734, + "loss": 0.7386, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7111425399780273, + "rewards/margins": 5.836345672607422, + "rewards/rejected": -3.1252031326293945, + "step": 13153 + }, + { + "epoch": 3.29, + "grad_norm": 5.617664337158203, + "learning_rate": 2.6161347425326108e-06, + "logits/chosen": -0.48166200518608093, + "logits/rejected": -0.5446581840515137, + "logps/chosen": -57.98543167114258, + "logps/rejected": -99.87743377685547, + "loss": 0.664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.321645736694336, + "rewards/margins": 6.638502597808838, + "rewards/rejected": -3.316856622695923, + "step": 13154 + }, + { + "epoch": 3.29, + "grad_norm": 14.388915061950684, + "learning_rate": 2.6154438680709863e-06, + "logits/chosen": -0.5069253444671631, + "logits/rejected": -0.6335002183914185, + "logps/chosen": -56.67680358886719, + "logps/rejected": -88.80979919433594, + "loss": 0.6933, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2998368740081787, + "rewards/margins": 7.671571254730225, + "rewards/rejected": -4.371734619140625, + "step": 13155 + }, + { + "epoch": 3.29, + "grad_norm": 4.42441987991333, + "learning_rate": 2.614753052534283e-06, + "logits/chosen": -0.5040961503982544, + "logits/rejected": -0.5734291672706604, + "logps/chosen": -45.672306060791016, + "logps/rejected": -87.23192596435547, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9915831089019775, + "rewards/margins": 6.326295852661133, + "rewards/rejected": -3.334712505340576, + "step": 13156 + }, + { + "epoch": 3.29, + "grad_norm": 4.1942667961120605, + "learning_rate": 2.614062295939569e-06, + "logits/chosen": -0.5194365382194519, + "logits/rejected": -0.5527220368385315, + "logps/chosen": -51.66493225097656, + "logps/rejected": -119.62388610839844, + "loss": 0.7003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1465768814086914, + "rewards/margins": 7.341128349304199, + "rewards/rejected": -4.194551467895508, + "step": 13157 + }, + { + "epoch": 3.29, + "grad_norm": 2.4769558906555176, + "learning_rate": 2.613371598303911e-06, + "logits/chosen": -0.587012529373169, + "logits/rejected": -0.6986173987388611, + "logps/chosen": -51.45128631591797, + "logps/rejected": -93.39863586425781, + "loss": 0.5995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1113686561584473, + "rewards/margins": 6.963857173919678, + "rewards/rejected": -3.8524880409240723, + "step": 13158 + }, + { + "epoch": 3.29, + "grad_norm": 4.370944976806641, + "learning_rate": 2.612680959644381e-06, + "logits/chosen": -0.5810469388961792, + "logits/rejected": -0.6270045638084412, + "logps/chosen": -53.686553955078125, + "logps/rejected": -130.3653564453125, + "loss": 0.6289, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.955305814743042, + "rewards/margins": 8.198982238769531, + "rewards/rejected": -5.24367618560791, + "step": 13159 + }, + { + "epoch": 3.29, + "grad_norm": 5.998348712921143, + "learning_rate": 2.6119903799780445e-06, + "logits/chosen": -0.5732084512710571, + "logits/rejected": -0.6189056038856506, + "logps/chosen": -73.69961547851562, + "logps/rejected": -125.75294494628906, + "loss": 0.7079, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.26531982421875, + "rewards/margins": 7.500558853149414, + "rewards/rejected": -4.235239028930664, + "step": 13160 + }, + { + "epoch": 3.29, + "grad_norm": 5.608645915985107, + "learning_rate": 2.611299859321963e-06, + "logits/chosen": -0.5461857318878174, + "logits/rejected": -0.5930266380310059, + "logps/chosen": -55.469520568847656, + "logps/rejected": -102.8904800415039, + "loss": 0.7512, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3023881912231445, + "rewards/margins": 6.47452449798584, + "rewards/rejected": -3.1721367835998535, + "step": 13161 + }, + { + "epoch": 3.29, + "grad_norm": 5.480058193206787, + "learning_rate": 2.610609397693203e-06, + "logits/chosen": -0.5501519441604614, + "logits/rejected": -0.6262648701667786, + "logps/chosen": -63.42030334472656, + "logps/rejected": -101.25785827636719, + "loss": 0.7698, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1833243370056152, + "rewards/margins": 6.5294013023376465, + "rewards/rejected": -3.3460769653320312, + "step": 13162 + }, + { + "epoch": 3.29, + "grad_norm": 4.505186080932617, + "learning_rate": 2.6099189951088282e-06, + "logits/chosen": -0.5374597311019897, + "logits/rejected": -0.5932586193084717, + "logps/chosen": -60.36430740356445, + "logps/rejected": -105.73219299316406, + "loss": 0.6889, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2577006816864014, + "rewards/margins": 6.662015438079834, + "rewards/rejected": -3.4043145179748535, + "step": 13163 + }, + { + "epoch": 3.29, + "grad_norm": 6.079548358917236, + "learning_rate": 2.609228651585897e-06, + "logits/chosen": -0.5290039777755737, + "logits/rejected": -0.5745772123336792, + "logps/chosen": -53.993927001953125, + "logps/rejected": -100.69688415527344, + "loss": 0.6952, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.339372396469116, + "rewards/margins": 6.530455112457275, + "rewards/rejected": -3.1910829544067383, + "step": 13164 + }, + { + "epoch": 3.29, + "grad_norm": 4.495344161987305, + "learning_rate": 2.6085383671414677e-06, + "logits/chosen": -0.49092158675193787, + "logits/rejected": -0.5882779359817505, + "logps/chosen": -62.991947174072266, + "logps/rejected": -103.95072174072266, + "loss": 0.6439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1625630855560303, + "rewards/margins": 6.781703472137451, + "rewards/rejected": -3.619140625, + "step": 13165 + }, + { + "epoch": 3.29, + "grad_norm": 15.604215621948242, + "learning_rate": 2.6078481417926004e-06, + "logits/chosen": -0.5727065801620483, + "logits/rejected": -0.6267280578613281, + "logps/chosen": -46.24283218383789, + "logps/rejected": -100.49827575683594, + "loss": 0.6128, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3288750648498535, + "rewards/margins": 7.6349992752075195, + "rewards/rejected": -4.306124210357666, + "step": 13166 + }, + { + "epoch": 3.29, + "grad_norm": 11.167322158813477, + "learning_rate": 2.60715797555635e-06, + "logits/chosen": -0.625842273235321, + "logits/rejected": -0.6897515654563904, + "logps/chosen": -51.95930099487305, + "logps/rejected": -128.7318572998047, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0571999549865723, + "rewards/margins": 8.097724914550781, + "rewards/rejected": -5.040524959564209, + "step": 13167 + }, + { + "epoch": 3.29, + "grad_norm": 5.1081862449646, + "learning_rate": 2.6064678684497693e-06, + "logits/chosen": -0.5561670064926147, + "logits/rejected": -0.6098910570144653, + "logps/chosen": -56.47165298461914, + "logps/rejected": -106.6449966430664, + "loss": 0.6921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.23087739944458, + "rewards/margins": 6.104902744293213, + "rewards/rejected": -2.874025583267212, + "step": 13168 + }, + { + "epoch": 3.29, + "grad_norm": 2.654404401779175, + "learning_rate": 2.6057778204899147e-06, + "logits/chosen": -0.5154219269752502, + "logits/rejected": -0.6341203451156616, + "logps/chosen": -57.746986389160156, + "logps/rejected": -96.24916076660156, + "loss": 0.5816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9402623176574707, + "rewards/margins": 7.722023010253906, + "rewards/rejected": -4.7817606925964355, + "step": 13169 + }, + { + "epoch": 3.29, + "grad_norm": 3.0880112648010254, + "learning_rate": 2.6050878316938355e-06, + "logits/chosen": -0.6235575079917908, + "logits/rejected": -0.6831435561180115, + "logps/chosen": -43.716583251953125, + "logps/rejected": -99.36600494384766, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.968949794769287, + "rewards/margins": 7.117316722869873, + "rewards/rejected": -4.148367404937744, + "step": 13170 + }, + { + "epoch": 3.29, + "grad_norm": 9.739644050598145, + "learning_rate": 2.604397902078585e-06, + "logits/chosen": -0.4989830255508423, + "logits/rejected": -0.6067014932632446, + "logps/chosen": -65.523681640625, + "logps/rejected": -133.07533264160156, + "loss": 0.6738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.841728925704956, + "rewards/margins": 7.183247089385986, + "rewards/rejected": -4.341518402099609, + "step": 13171 + }, + { + "epoch": 3.3, + "grad_norm": 2.474653959274292, + "learning_rate": 2.6037080316612106e-06, + "logits/chosen": -0.5444862246513367, + "logits/rejected": -0.6324014067649841, + "logps/chosen": -51.55359649658203, + "logps/rejected": -117.56705474853516, + "loss": 0.5402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5182130336761475, + "rewards/margins": 7.76333475112915, + "rewards/rejected": -4.245121479034424, + "step": 13172 + }, + { + "epoch": 3.3, + "grad_norm": 3.805495023727417, + "learning_rate": 2.603018220458757e-06, + "logits/chosen": -0.48148053884506226, + "logits/rejected": -0.5819025039672852, + "logps/chosen": -58.60246276855469, + "logps/rejected": -94.18119812011719, + "loss": 0.5313, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984250783920288, + "rewards/margins": 7.045120716094971, + "rewards/rejected": -4.0608696937561035, + "step": 13173 + }, + { + "epoch": 3.3, + "grad_norm": 6.360605239868164, + "learning_rate": 2.6023284684882734e-06, + "logits/chosen": -0.5205888748168945, + "logits/rejected": -0.5574215650558472, + "logps/chosen": -46.1204948425293, + "logps/rejected": -117.1659927368164, + "loss": 0.6535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0168769359588623, + "rewards/margins": 6.514371871948242, + "rewards/rejected": -3.497494697570801, + "step": 13174 + }, + { + "epoch": 3.3, + "grad_norm": 4.31008768081665, + "learning_rate": 2.601638775766807e-06, + "logits/chosen": -0.5371236801147461, + "logits/rejected": -0.6184532642364502, + "logps/chosen": -47.898765563964844, + "logps/rejected": -99.11283111572266, + "loss": 0.6056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2738609313964844, + "rewards/margins": 7.406046390533447, + "rewards/rejected": -4.132184982299805, + "step": 13175 + }, + { + "epoch": 3.3, + "grad_norm": 31.94400978088379, + "learning_rate": 2.600949142311394e-06, + "logits/chosen": -0.48829352855682373, + "logits/rejected": -0.6445452570915222, + "logps/chosen": -70.21354675292969, + "logps/rejected": -97.76075744628906, + "loss": 0.699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.618852376937866, + "rewards/margins": 7.154019355773926, + "rewards/rejected": -4.5351667404174805, + "step": 13176 + }, + { + "epoch": 3.3, + "grad_norm": 5.806901931762695, + "learning_rate": 2.6002595681390796e-06, + "logits/chosen": -0.4767169952392578, + "logits/rejected": -0.5438525676727295, + "logps/chosen": -58.11424255371094, + "logps/rejected": -96.41209411621094, + "loss": 0.6602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9424967765808105, + "rewards/margins": 6.449100971221924, + "rewards/rejected": -3.5066046714782715, + "step": 13177 + }, + { + "epoch": 3.3, + "grad_norm": 6.811811923980713, + "learning_rate": 2.599570053266908e-06, + "logits/chosen": -0.5970540046691895, + "logits/rejected": -0.6524956822395325, + "logps/chosen": -62.223350524902344, + "logps/rejected": -104.37799072265625, + "loss": 0.7346, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.843496084213257, + "rewards/margins": 6.827814102172852, + "rewards/rejected": -3.9843180179595947, + "step": 13178 + }, + { + "epoch": 3.3, + "grad_norm": 3.581723928451538, + "learning_rate": 2.5988805977119088e-06, + "logits/chosen": -0.5473519563674927, + "logits/rejected": -0.6290481090545654, + "logps/chosen": -45.48759078979492, + "logps/rejected": -86.89813995361328, + "loss": 0.6819, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.404128313064575, + "rewards/margins": 6.37253475189209, + "rewards/rejected": -2.9684064388275146, + "step": 13179 + }, + { + "epoch": 3.3, + "grad_norm": 4.15335750579834, + "learning_rate": 2.598191201491124e-06, + "logits/chosen": -0.5849140882492065, + "logits/rejected": -0.6580874919891357, + "logps/chosen": -49.15104293823242, + "logps/rejected": -87.55304718017578, + "loss": 0.6012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8516669273376465, + "rewards/margins": 6.609424591064453, + "rewards/rejected": -3.7577579021453857, + "step": 13180 + }, + { + "epoch": 3.3, + "grad_norm": 6.130287170410156, + "learning_rate": 2.5975018646215914e-06, + "logits/chosen": -0.6006811857223511, + "logits/rejected": -0.6664898991584778, + "logps/chosen": -57.65720748901367, + "logps/rejected": -112.00250244140625, + "loss": 0.6625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8338825702667236, + "rewards/margins": 6.518301486968994, + "rewards/rejected": -3.6844191551208496, + "step": 13181 + }, + { + "epoch": 3.3, + "grad_norm": 4.745941638946533, + "learning_rate": 2.596812587120343e-06, + "logits/chosen": -0.5657064318656921, + "logits/rejected": -0.6359073519706726, + "logps/chosen": -50.88300704956055, + "logps/rejected": -97.76206970214844, + "loss": 0.7182, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.238672971725464, + "rewards/margins": 6.219702243804932, + "rewards/rejected": -2.9810287952423096, + "step": 13182 + }, + { + "epoch": 3.3, + "grad_norm": 1.8504828214645386, + "learning_rate": 2.596123369004411e-06, + "logits/chosen": -0.5051039457321167, + "logits/rejected": -0.6024858355522156, + "logps/chosen": -53.69938659667969, + "logps/rejected": -103.86504364013672, + "loss": 0.5165, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2315988540649414, + "rewards/margins": 7.827807426452637, + "rewards/rejected": -4.596209526062012, + "step": 13183 + }, + { + "epoch": 3.3, + "grad_norm": 2.82861065864563, + "learning_rate": 2.595434210290828e-06, + "logits/chosen": -0.5674829483032227, + "logits/rejected": -0.6822165250778198, + "logps/chosen": -56.25460433959961, + "logps/rejected": -95.01007843017578, + "loss": 0.5375, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.109635353088379, + "rewards/margins": 7.3763628005981445, + "rewards/rejected": -4.266727447509766, + "step": 13184 + }, + { + "epoch": 3.3, + "grad_norm": 5.846867561340332, + "learning_rate": 2.594745110996624e-06, + "logits/chosen": -0.5870555639266968, + "logits/rejected": -0.6841014623641968, + "logps/chosen": -45.185516357421875, + "logps/rejected": -83.51177978515625, + "loss": 0.6256, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.063549757003784, + "rewards/margins": 6.334219932556152, + "rewards/rejected": -3.2706704139709473, + "step": 13185 + }, + { + "epoch": 3.3, + "grad_norm": 13.517546653747559, + "learning_rate": 2.594056071138824e-06, + "logits/chosen": -0.5456435084342957, + "logits/rejected": -0.6525200605392456, + "logps/chosen": -64.01998901367188, + "logps/rejected": -101.42109680175781, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8666834831237793, + "rewards/margins": 6.198032855987549, + "rewards/rejected": -3.3313498497009277, + "step": 13186 + }, + { + "epoch": 3.3, + "grad_norm": 7.780872344970703, + "learning_rate": 2.59336709073446e-06, + "logits/chosen": -0.5492115020751953, + "logits/rejected": -0.5479667782783508, + "logps/chosen": -49.809173583984375, + "logps/rejected": -115.55223846435547, + "loss": 0.7349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.259683609008789, + "rewards/margins": 6.138940334320068, + "rewards/rejected": -2.8792567253112793, + "step": 13187 + }, + { + "epoch": 3.3, + "grad_norm": 4.004763126373291, + "learning_rate": 2.5926781698005525e-06, + "logits/chosen": -0.5819096565246582, + "logits/rejected": -0.636103093624115, + "logps/chosen": -53.68798065185547, + "logps/rejected": -100.02049255371094, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2168402671813965, + "rewards/margins": 6.882741451263428, + "rewards/rejected": -3.665900945663452, + "step": 13188 + }, + { + "epoch": 3.3, + "grad_norm": 3.0932061672210693, + "learning_rate": 2.59198930835413e-06, + "logits/chosen": -0.5829967260360718, + "logits/rejected": -0.6841480731964111, + "logps/chosen": -47.69186019897461, + "logps/rejected": -111.54595947265625, + "loss": 0.552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.500271797180176, + "rewards/margins": 9.110424995422363, + "rewards/rejected": -5.6101531982421875, + "step": 13189 + }, + { + "epoch": 3.3, + "grad_norm": 3.568565607070923, + "learning_rate": 2.5913005064122137e-06, + "logits/chosen": -0.5563409924507141, + "logits/rejected": -0.6477216482162476, + "logps/chosen": -59.48455810546875, + "logps/rejected": -111.01822662353516, + "loss": 0.6258, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.22841739654541, + "rewards/margins": 7.827136993408203, + "rewards/rejected": -4.598719596862793, + "step": 13190 + }, + { + "epoch": 3.3, + "grad_norm": 9.284058570861816, + "learning_rate": 2.5906117639918216e-06, + "logits/chosen": -0.5832849740982056, + "logits/rejected": -0.657267689704895, + "logps/chosen": -53.299827575683594, + "logps/rejected": -116.92070007324219, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.480475425720215, + "rewards/margins": 6.200104713439941, + "rewards/rejected": -2.719630002975464, + "step": 13191 + }, + { + "epoch": 3.3, + "grad_norm": 3.8017847537994385, + "learning_rate": 2.5899230811099772e-06, + "logits/chosen": -0.5099051594734192, + "logits/rejected": -0.6147638559341431, + "logps/chosen": -48.963539123535156, + "logps/rejected": -99.44963836669922, + "loss": 0.5491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0538864135742188, + "rewards/margins": 6.986052513122559, + "rewards/rejected": -3.932166337966919, + "step": 13192 + }, + { + "epoch": 3.3, + "grad_norm": 6.522525787353516, + "learning_rate": 2.589234457783697e-06, + "logits/chosen": -0.5195890665054321, + "logits/rejected": -0.5881345272064209, + "logps/chosen": -56.37273025512695, + "logps/rejected": -108.78439331054688, + "loss": 0.576, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.241833448410034, + "rewards/margins": 7.623327732086182, + "rewards/rejected": -4.38149356842041, + "step": 13193 + }, + { + "epoch": 3.3, + "grad_norm": 10.248307228088379, + "learning_rate": 2.588545894029996e-06, + "logits/chosen": -0.645197868347168, + "logits/rejected": -0.7288273572921753, + "logps/chosen": -57.17354202270508, + "logps/rejected": -92.69213104248047, + "loss": 0.7154, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.873823642730713, + "rewards/margins": 6.105173110961914, + "rewards/rejected": -3.231348752975464, + "step": 13194 + }, + { + "epoch": 3.3, + "grad_norm": 3.7320454120635986, + "learning_rate": 2.587857389865891e-06, + "logits/chosen": -0.5761964321136475, + "logits/rejected": -0.6305540204048157, + "logps/chosen": -52.55923080444336, + "logps/rejected": -94.90425109863281, + "loss": 0.6552, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.357910633087158, + "rewards/margins": 5.870495319366455, + "rewards/rejected": -2.5125844478607178, + "step": 13195 + }, + { + "epoch": 3.3, + "grad_norm": 10.076725006103516, + "learning_rate": 2.587168945308397e-06, + "logits/chosen": -0.5252731442451477, + "logits/rejected": -0.6208321452140808, + "logps/chosen": -58.72283935546875, + "logps/rejected": -96.33355712890625, + "loss": 0.633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9560928344726562, + "rewards/margins": 6.319562911987305, + "rewards/rejected": -3.3634698390960693, + "step": 13196 + }, + { + "epoch": 3.3, + "grad_norm": 6.52748441696167, + "learning_rate": 2.586480560374525e-06, + "logits/chosen": -0.521286129951477, + "logits/rejected": -0.6133179068565369, + "logps/chosen": -59.73708724975586, + "logps/rejected": -93.62862396240234, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.854212522506714, + "rewards/margins": 6.886612415313721, + "rewards/rejected": -4.032399654388428, + "step": 13197 + }, + { + "epoch": 3.3, + "grad_norm": 3.01465106010437, + "learning_rate": 2.5857922350812836e-06, + "logits/chosen": -0.5282533168792725, + "logits/rejected": -0.6491883397102356, + "logps/chosen": -55.81144332885742, + "logps/rejected": -99.90731811523438, + "loss": 0.5416, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9823291301727295, + "rewards/margins": 7.383662700653076, + "rewards/rejected": -4.401334762573242, + "step": 13198 + }, + { + "epoch": 3.3, + "grad_norm": 5.010983467102051, + "learning_rate": 2.585103969445686e-06, + "logits/chosen": -0.5913984775543213, + "logits/rejected": -0.6584863066673279, + "logps/chosen": -54.880760192871094, + "logps/rejected": -111.96916961669922, + "loss": 0.6672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0832691192626953, + "rewards/margins": 7.047808647155762, + "rewards/rejected": -3.964539051055908, + "step": 13199 + }, + { + "epoch": 3.3, + "grad_norm": 3.167806386947632, + "learning_rate": 2.5844157634847374e-06, + "logits/chosen": -0.5967330932617188, + "logits/rejected": -0.6902353763580322, + "logps/chosen": -59.55720901489258, + "logps/rejected": -94.47515869140625, + "loss": 0.6741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2370264530181885, + "rewards/margins": 6.335605144500732, + "rewards/rejected": -3.098578691482544, + "step": 13200 + }, + { + "epoch": 3.3, + "grad_norm": 3.3458807468414307, + "learning_rate": 2.583727617215443e-06, + "logits/chosen": -0.6296804547309875, + "logits/rejected": -0.7030806541442871, + "logps/chosen": -61.32061004638672, + "logps/rejected": -109.23397827148438, + "loss": 0.6141, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.001605987548828, + "rewards/margins": 7.4108476638793945, + "rewards/rejected": -4.40924072265625, + "step": 13201 + }, + { + "epoch": 3.3, + "grad_norm": 2.9335246086120605, + "learning_rate": 2.583039530654811e-06, + "logits/chosen": -0.5486429929733276, + "logits/rejected": -0.6527818441390991, + "logps/chosen": -48.711360931396484, + "logps/rejected": -110.41499328613281, + "loss": 0.5249, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.111971855163574, + "rewards/margins": 7.580657958984375, + "rewards/rejected": -4.468686103820801, + "step": 13202 + }, + { + "epoch": 3.3, + "grad_norm": 5.810513019561768, + "learning_rate": 2.582351503819843e-06, + "logits/chosen": -0.5512983202934265, + "logits/rejected": -0.6274229288101196, + "logps/chosen": -59.1749382019043, + "logps/rejected": -112.31227111816406, + "loss": 0.6955, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1588268280029297, + "rewards/margins": 6.812469005584717, + "rewards/rejected": -3.653642416000366, + "step": 13203 + }, + { + "epoch": 3.3, + "grad_norm": 4.771407127380371, + "learning_rate": 2.5816635367275387e-06, + "logits/chosen": -0.5983166694641113, + "logits/rejected": -0.6549574732780457, + "logps/chosen": -48.170257568359375, + "logps/rejected": -83.30492401123047, + "loss": 0.6707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0893170833587646, + "rewards/margins": 5.743797779083252, + "rewards/rejected": -2.654480218887329, + "step": 13204 + }, + { + "epoch": 3.3, + "grad_norm": 4.903730392456055, + "learning_rate": 2.5809756293949032e-06, + "logits/chosen": -0.586763858795166, + "logits/rejected": -0.6475309133529663, + "logps/chosen": -56.81024169921875, + "logps/rejected": -99.47258758544922, + "loss": 0.6596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0386297702789307, + "rewards/margins": 6.747560501098633, + "rewards/rejected": -3.7089309692382812, + "step": 13205 + }, + { + "epoch": 3.3, + "grad_norm": 5.604221343994141, + "learning_rate": 2.5802877818389312e-06, + "logits/chosen": -0.5683909058570862, + "logits/rejected": -0.6216787695884705, + "logps/chosen": -56.41852569580078, + "logps/rejected": -107.03836059570312, + "loss": 0.6933, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7582309246063232, + "rewards/margins": 6.858164310455322, + "rewards/rejected": -4.099933624267578, + "step": 13206 + }, + { + "epoch": 3.3, + "grad_norm": 6.04518461227417, + "learning_rate": 2.579599994076623e-06, + "logits/chosen": -0.5901177525520325, + "logits/rejected": -0.6428039073944092, + "logps/chosen": -50.786865234375, + "logps/rejected": -84.97724914550781, + "loss": 0.7164, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.291513681411743, + "rewards/margins": 5.464295864105225, + "rewards/rejected": -2.1727824211120605, + "step": 13207 + }, + { + "epoch": 3.3, + "grad_norm": 5.523461818695068, + "learning_rate": 2.5789122661249723e-06, + "logits/chosen": -0.4870814085006714, + "logits/rejected": -0.5634068846702576, + "logps/chosen": -59.082401275634766, + "logps/rejected": -98.30165100097656, + "loss": 0.6673, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.022165298461914, + "rewards/margins": 5.917909622192383, + "rewards/rejected": -2.895744562149048, + "step": 13208 + }, + { + "epoch": 3.3, + "grad_norm": 6.739748477935791, + "learning_rate": 2.5782245980009756e-06, + "logits/chosen": -0.527801513671875, + "logits/rejected": -0.6006129384040833, + "logps/chosen": -46.48145294189453, + "logps/rejected": -87.97219848632812, + "loss": 0.6588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0823822021484375, + "rewards/margins": 6.040204048156738, + "rewards/rejected": -2.9578213691711426, + "step": 13209 + }, + { + "epoch": 3.3, + "grad_norm": 5.841495990753174, + "learning_rate": 2.577536989721625e-06, + "logits/chosen": -0.5370497703552246, + "logits/rejected": -0.6334753036499023, + "logps/chosen": -58.8972282409668, + "logps/rejected": -100.78276062011719, + "loss": 0.6562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9955544471740723, + "rewards/margins": 6.600177764892578, + "rewards/rejected": -3.604623317718506, + "step": 13210 + }, + { + "epoch": 3.3, + "grad_norm": 5.414416790008545, + "learning_rate": 2.5768494413039113e-06, + "logits/chosen": -0.5190205574035645, + "logits/rejected": -0.5771193504333496, + "logps/chosen": -63.01832962036133, + "logps/rejected": -95.89139556884766, + "loss": 0.6926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1047136783599854, + "rewards/margins": 5.946802616119385, + "rewards/rejected": -2.8420889377593994, + "step": 13211 + }, + { + "epoch": 3.31, + "grad_norm": 4.942816734313965, + "learning_rate": 2.5761619527648263e-06, + "logits/chosen": -0.5489755868911743, + "logits/rejected": -0.633519172668457, + "logps/chosen": -52.546409606933594, + "logps/rejected": -101.99114990234375, + "loss": 0.6367, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1140177249908447, + "rewards/margins": 6.274234771728516, + "rewards/rejected": -3.160217046737671, + "step": 13212 + }, + { + "epoch": 3.31, + "grad_norm": 3.7497453689575195, + "learning_rate": 2.5754745241213558e-06, + "logits/chosen": -0.5845963358879089, + "logits/rejected": -0.6306148767471313, + "logps/chosen": -49.90208435058594, + "logps/rejected": -106.37826538085938, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.166180372238159, + "rewards/margins": 6.393437385559082, + "rewards/rejected": -3.2272567749023438, + "step": 13213 + }, + { + "epoch": 3.31, + "grad_norm": 4.895814895629883, + "learning_rate": 2.5747871553904902e-06, + "logits/chosen": -0.641314685344696, + "logits/rejected": -0.7278088331222534, + "logps/chosen": -53.636924743652344, + "logps/rejected": -95.76823425292969, + "loss": 0.5753, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5446579456329346, + "rewards/margins": 7.450964450836182, + "rewards/rejected": -3.9063069820404053, + "step": 13214 + }, + { + "epoch": 3.31, + "grad_norm": 2.664130926132202, + "learning_rate": 2.574099846589214e-06, + "logits/chosen": -0.5392393469810486, + "logits/rejected": -0.6790469884872437, + "logps/chosen": -47.28307342529297, + "logps/rejected": -81.93058013916016, + "loss": 0.5186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.239748239517212, + "rewards/margins": 7.5360612869262695, + "rewards/rejected": -4.296313285827637, + "step": 13215 + }, + { + "epoch": 3.31, + "grad_norm": 5.431685924530029, + "learning_rate": 2.5734125977345093e-06, + "logits/chosen": -0.49374252557754517, + "logits/rejected": -0.5719038844108582, + "logps/chosen": -72.02747344970703, + "logps/rejected": -102.73098754882812, + "loss": 0.6999, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1778781414031982, + "rewards/margins": 5.607394218444824, + "rewards/rejected": -2.4295156002044678, + "step": 13216 + }, + { + "epoch": 3.31, + "grad_norm": 2.8549256324768066, + "learning_rate": 2.5727254088433617e-06, + "logits/chosen": -0.611497163772583, + "logits/rejected": -0.6967614889144897, + "logps/chosen": -45.314735412597656, + "logps/rejected": -100.13406372070312, + "loss": 0.5131, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.075122117996216, + "rewards/margins": 7.339189052581787, + "rewards/rejected": -4.26406717300415, + "step": 13217 + }, + { + "epoch": 3.31, + "grad_norm": 4.049210548400879, + "learning_rate": 2.5720382799327516e-06, + "logits/chosen": -0.5165108442306519, + "logits/rejected": -0.586437463760376, + "logps/chosen": -43.27554702758789, + "logps/rejected": -108.42787170410156, + "loss": 0.5621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0723977088928223, + "rewards/margins": 7.416883945465088, + "rewards/rejected": -4.344486713409424, + "step": 13218 + }, + { + "epoch": 3.31, + "grad_norm": 4.9240641593933105, + "learning_rate": 2.571351211019655e-06, + "logits/chosen": -0.5728500485420227, + "logits/rejected": -0.6263847947120667, + "logps/chosen": -66.61157989501953, + "logps/rejected": -94.60504150390625, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0657033920288086, + "rewards/margins": 5.60509729385376, + "rewards/rejected": -2.539393663406372, + "step": 13219 + }, + { + "epoch": 3.31, + "grad_norm": 12.558643341064453, + "learning_rate": 2.5706642021210543e-06, + "logits/chosen": -0.6544642448425293, + "logits/rejected": -0.7179173231124878, + "logps/chosen": -57.31352233886719, + "logps/rejected": -99.8376693725586, + "loss": 0.8037, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1375951766967773, + "rewards/margins": 5.759606838226318, + "rewards/rejected": -2.622011423110962, + "step": 13220 + }, + { + "epoch": 3.31, + "grad_norm": 2.7278971672058105, + "learning_rate": 2.5699772532539285e-06, + "logits/chosen": -0.5810264348983765, + "logits/rejected": -0.6185635328292847, + "logps/chosen": -42.06609344482422, + "logps/rejected": -107.9482650756836, + "loss": 0.5677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2052664756774902, + "rewards/margins": 6.646028995513916, + "rewards/rejected": -3.440762519836426, + "step": 13221 + }, + { + "epoch": 3.31, + "grad_norm": 5.322582721710205, + "learning_rate": 2.569290364435246e-06, + "logits/chosen": -0.5872546434402466, + "logits/rejected": -0.692331075668335, + "logps/chosen": -63.91569137573242, + "logps/rejected": -113.59097290039062, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0611155033111572, + "rewards/margins": 7.456295967102051, + "rewards/rejected": -4.3951802253723145, + "step": 13222 + }, + { + "epoch": 3.31, + "grad_norm": 4.855027675628662, + "learning_rate": 2.568603535681984e-06, + "logits/chosen": -0.5494632124900818, + "logits/rejected": -0.5454202890396118, + "logps/chosen": -48.41493225097656, + "logps/rejected": -140.9385223388672, + "loss": 0.6183, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115309715270996, + "rewards/margins": 8.237092018127441, + "rewards/rejected": -5.121783256530762, + "step": 13223 + }, + { + "epoch": 3.31, + "grad_norm": 4.101282119750977, + "learning_rate": 2.567916767011116e-06, + "logits/chosen": -0.49816980957984924, + "logits/rejected": -0.5790491104125977, + "logps/chosen": -60.815391540527344, + "logps/rejected": -97.22339630126953, + "loss": 0.5727, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7986574172973633, + "rewards/margins": 5.749977111816406, + "rewards/rejected": -2.951319694519043, + "step": 13224 + }, + { + "epoch": 3.31, + "grad_norm": 5.571485996246338, + "learning_rate": 2.567230058439612e-06, + "logits/chosen": -0.5586952567100525, + "logits/rejected": -0.6583718657493591, + "logps/chosen": -58.39720153808594, + "logps/rejected": -112.52888488769531, + "loss": 0.6112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6991686820983887, + "rewards/margins": 7.587826728820801, + "rewards/rejected": -4.88865852355957, + "step": 13225 + }, + { + "epoch": 3.31, + "grad_norm": 4.859243392944336, + "learning_rate": 2.5665434099844388e-06, + "logits/chosen": -0.5997898578643799, + "logits/rejected": -0.6840553879737854, + "logps/chosen": -53.963966369628906, + "logps/rejected": -90.19200134277344, + "loss": 0.6006, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0550971031188965, + "rewards/margins": 6.0433149337768555, + "rewards/rejected": -2.988217830657959, + "step": 13226 + }, + { + "epoch": 3.31, + "grad_norm": 3.3106486797332764, + "learning_rate": 2.565856821662569e-06, + "logits/chosen": -0.5642072558403015, + "logits/rejected": -0.6151036620140076, + "logps/chosen": -47.536376953125, + "logps/rejected": -100.30169677734375, + "loss": 0.5634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0280675888061523, + "rewards/margins": 6.759559154510498, + "rewards/rejected": -3.7314913272857666, + "step": 13227 + }, + { + "epoch": 3.31, + "grad_norm": 3.01198148727417, + "learning_rate": 2.565170293490965e-06, + "logits/chosen": -0.6223917007446289, + "logits/rejected": -0.6889426708221436, + "logps/chosen": -53.698516845703125, + "logps/rejected": -112.4996109008789, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9187207221984863, + "rewards/margins": 7.601301193237305, + "rewards/rejected": -4.682580471038818, + "step": 13228 + }, + { + "epoch": 3.31, + "grad_norm": 12.876273155212402, + "learning_rate": 2.564483825486592e-06, + "logits/chosen": -0.5424021482467651, + "logits/rejected": -0.5856939554214478, + "logps/chosen": -62.629638671875, + "logps/rejected": -121.88812255859375, + "loss": 0.6415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.261845827102661, + "rewards/margins": 7.809543609619141, + "rewards/rejected": -4.547698020935059, + "step": 13229 + }, + { + "epoch": 3.31, + "grad_norm": 5.054757118225098, + "learning_rate": 2.5637974176664156e-06, + "logits/chosen": -0.5475966334342957, + "logits/rejected": -0.6179689764976501, + "logps/chosen": -54.93083572387695, + "logps/rejected": -103.10667419433594, + "loss": 0.7427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.098649740219116, + "rewards/margins": 6.293910980224609, + "rewards/rejected": -3.1952614784240723, + "step": 13230 + }, + { + "epoch": 3.31, + "grad_norm": 4.109143257141113, + "learning_rate": 2.563111070047394e-06, + "logits/chosen": -0.5580734610557556, + "logits/rejected": -0.6275447010993958, + "logps/chosen": -56.433799743652344, + "logps/rejected": -98.54650115966797, + "loss": 0.571, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1439220905303955, + "rewards/margins": 6.684863090515137, + "rewards/rejected": -3.5409414768218994, + "step": 13231 + }, + { + "epoch": 3.31, + "grad_norm": 11.3135404586792, + "learning_rate": 2.562424782646492e-06, + "logits/chosen": -0.5813100337982178, + "logits/rejected": -0.649967610836029, + "logps/chosen": -53.93505859375, + "logps/rejected": -97.93522644042969, + "loss": 0.6423, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9560353755950928, + "rewards/margins": 5.554748058319092, + "rewards/rejected": -2.598712921142578, + "step": 13232 + }, + { + "epoch": 3.31, + "grad_norm": 4.891341686248779, + "learning_rate": 2.561738555480667e-06, + "logits/chosen": -0.5754640102386475, + "logits/rejected": -0.6904259920120239, + "logps/chosen": -49.49764633178711, + "logps/rejected": -97.32765197753906, + "loss": 0.5968, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1310315132141113, + "rewards/margins": 7.799337863922119, + "rewards/rejected": -4.668306350708008, + "step": 13233 + }, + { + "epoch": 3.31, + "grad_norm": 4.514632701873779, + "learning_rate": 2.561052388566873e-06, + "logits/chosen": -0.5417050719261169, + "logits/rejected": -0.6290168762207031, + "logps/chosen": -45.70014190673828, + "logps/rejected": -118.65747833251953, + "loss": 0.6152, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2896006107330322, + "rewards/margins": 7.6645708084106445, + "rewards/rejected": -4.374970436096191, + "step": 13234 + }, + { + "epoch": 3.31, + "grad_norm": 12.655162811279297, + "learning_rate": 2.560366281922071e-06, + "logits/chosen": -0.6279770731925964, + "logits/rejected": -0.6940696239471436, + "logps/chosen": -46.05732727050781, + "logps/rejected": -94.24942016601562, + "loss": 0.6374, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.107287883758545, + "rewards/margins": 6.960485458374023, + "rewards/rejected": -3.8531973361968994, + "step": 13235 + }, + { + "epoch": 3.31, + "grad_norm": 3.801276922225952, + "learning_rate": 2.559680235563213e-06, + "logits/chosen": -0.5823901891708374, + "logits/rejected": -0.6164513230323792, + "logps/chosen": -58.01218795776367, + "logps/rejected": -105.27179718017578, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.410675048828125, + "rewards/margins": 5.420729637145996, + "rewards/rejected": -2.010054588317871, + "step": 13236 + }, + { + "epoch": 3.31, + "grad_norm": 2.9145729541778564, + "learning_rate": 2.55899424950725e-06, + "logits/chosen": -0.6112513542175293, + "logits/rejected": -0.6732223629951477, + "logps/chosen": -58.640472412109375, + "logps/rejected": -119.24458312988281, + "loss": 0.6192, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0898027420043945, + "rewards/margins": 7.282931804656982, + "rewards/rejected": -4.193129062652588, + "step": 13237 + }, + { + "epoch": 3.31, + "grad_norm": 6.55824613571167, + "learning_rate": 2.5583083237711358e-06, + "logits/chosen": -0.5278446674346924, + "logits/rejected": -0.586288571357727, + "logps/chosen": -48.8756217956543, + "logps/rejected": -117.55308532714844, + "loss": 0.6498, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1479921340942383, + "rewards/margins": 7.342030048370361, + "rewards/rejected": -4.194038391113281, + "step": 13238 + }, + { + "epoch": 3.31, + "grad_norm": 11.5806884765625, + "learning_rate": 2.557622458371821e-06, + "logits/chosen": -0.618107259273529, + "logits/rejected": -0.6680229306221008, + "logps/chosen": -62.40042495727539, + "logps/rejected": -117.65188598632812, + "loss": 0.8058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9065940380096436, + "rewards/margins": 7.7207183837890625, + "rewards/rejected": -4.814124584197998, + "step": 13239 + }, + { + "epoch": 3.31, + "grad_norm": 14.717272758483887, + "learning_rate": 2.556936653326254e-06, + "logits/chosen": -0.5859148502349854, + "logits/rejected": -0.6524629592895508, + "logps/chosen": -57.92619323730469, + "logps/rejected": -101.49227142333984, + "loss": 0.6811, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.698228359222412, + "rewards/margins": 5.941452980041504, + "rewards/rejected": -3.24322509765625, + "step": 13240 + }, + { + "epoch": 3.31, + "grad_norm": 3.131772756576538, + "learning_rate": 2.5562509086513787e-06, + "logits/chosen": -0.6160291433334351, + "logits/rejected": -0.6552962064743042, + "logps/chosen": -58.73300552368164, + "logps/rejected": -107.45318603515625, + "loss": 0.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2268409729003906, + "rewards/margins": 7.518902778625488, + "rewards/rejected": -4.292061805725098, + "step": 13241 + }, + { + "epoch": 3.31, + "grad_norm": 4.732209205627441, + "learning_rate": 2.5555652243641448e-06, + "logits/chosen": -0.5383984446525574, + "logits/rejected": -0.5895375609397888, + "logps/chosen": -55.31779479980469, + "logps/rejected": -98.60399627685547, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.222007989883423, + "rewards/margins": 6.214571952819824, + "rewards/rejected": -2.992563486099243, + "step": 13242 + }, + { + "epoch": 3.31, + "grad_norm": 8.206854820251465, + "learning_rate": 2.554879600481494e-06, + "logits/chosen": -0.5110177397727966, + "logits/rejected": -0.5914828777313232, + "logps/chosen": -47.23573303222656, + "logps/rejected": -87.32540893554688, + "loss": 0.5717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7420732975006104, + "rewards/margins": 6.192604064941406, + "rewards/rejected": -3.450530529022217, + "step": 13243 + }, + { + "epoch": 3.31, + "grad_norm": 13.72877025604248, + "learning_rate": 2.5541940370203677e-06, + "logits/chosen": -0.5238611698150635, + "logits/rejected": -0.5666810870170593, + "logps/chosen": -63.89777755737305, + "logps/rejected": -124.0137939453125, + "loss": 0.7708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.880816698074341, + "rewards/margins": 6.479991436004639, + "rewards/rejected": -3.599175214767456, + "step": 13244 + }, + { + "epoch": 3.31, + "grad_norm": 6.808953762054443, + "learning_rate": 2.5535085339977097e-06, + "logits/chosen": -0.46474534273147583, + "logits/rejected": -0.5716520547866821, + "logps/chosen": -68.08695220947266, + "logps/rejected": -110.78121948242188, + "loss": 0.6245, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9112706184387207, + "rewards/margins": 6.082884311676025, + "rewards/rejected": -3.171613931655884, + "step": 13245 + }, + { + "epoch": 3.31, + "grad_norm": 11.161800384521484, + "learning_rate": 2.5528230914304574e-06, + "logits/chosen": -0.5651894211769104, + "logits/rejected": -0.6659299731254578, + "logps/chosen": -53.871551513671875, + "logps/rejected": -95.77152252197266, + "loss": 0.5726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1388282775878906, + "rewards/margins": 7.826996326446533, + "rewards/rejected": -4.688167572021484, + "step": 13246 + }, + { + "epoch": 3.31, + "grad_norm": 3.7824463844299316, + "learning_rate": 2.5521377093355483e-06, + "logits/chosen": -0.47690877318382263, + "logits/rejected": -0.6064823269844055, + "logps/chosen": -64.55890655517578, + "logps/rejected": -82.96771240234375, + "loss": 0.6061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.041158437728882, + "rewards/margins": 5.902702331542969, + "rewards/rejected": -2.861544132232666, + "step": 13247 + }, + { + "epoch": 3.31, + "grad_norm": 3.6832685470581055, + "learning_rate": 2.551452387729921e-06, + "logits/chosen": -0.6372114419937134, + "logits/rejected": -0.7197949290275574, + "logps/chosen": -61.92431640625, + "logps/rejected": -94.54855346679688, + "loss": 0.6675, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1802656650543213, + "rewards/margins": 6.848266124725342, + "rewards/rejected": -3.6680006980895996, + "step": 13248 + }, + { + "epoch": 3.31, + "grad_norm": 5.504367351531982, + "learning_rate": 2.550767126630509e-06, + "logits/chosen": -0.6140871644020081, + "logits/rejected": -0.6993659734725952, + "logps/chosen": -60.80690383911133, + "logps/rejected": -106.2950668334961, + "loss": 0.6096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.991633653640747, + "rewards/margins": 7.298766136169434, + "rewards/rejected": -4.307131767272949, + "step": 13249 + }, + { + "epoch": 3.31, + "grad_norm": 6.123153209686279, + "learning_rate": 2.5500819260542476e-06, + "logits/chosen": -0.49862372875213623, + "logits/rejected": -0.5724501013755798, + "logps/chosen": -65.68687438964844, + "logps/rejected": -104.00999450683594, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.000626564025879, + "rewards/margins": 6.573361396789551, + "rewards/rejected": -3.5727343559265137, + "step": 13250 + }, + { + "epoch": 3.31, + "grad_norm": 24.084848403930664, + "learning_rate": 2.5493967860180656e-06, + "logits/chosen": -0.5231172442436218, + "logits/rejected": -0.5837339162826538, + "logps/chosen": -56.74694061279297, + "logps/rejected": -111.00859069824219, + "loss": 0.6434, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.258390188217163, + "rewards/margins": 6.839138984680176, + "rewards/rejected": -3.58074951171875, + "step": 13251 + }, + { + "epoch": 3.32, + "grad_norm": 3.615267038345337, + "learning_rate": 2.548711706538898e-06, + "logits/chosen": -0.5500085949897766, + "logits/rejected": -0.6403510570526123, + "logps/chosen": -52.16687774658203, + "logps/rejected": -98.49815368652344, + "loss": 0.6057, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2310678958892822, + "rewards/margins": 6.510461807250977, + "rewards/rejected": -3.279393196105957, + "step": 13252 + }, + { + "epoch": 3.32, + "grad_norm": 3.4966189861297607, + "learning_rate": 2.548026687633671e-06, + "logits/chosen": -0.5972437858581543, + "logits/rejected": -0.6772087812423706, + "logps/chosen": -49.6763801574707, + "logps/rejected": -120.08340454101562, + "loss": 0.6183, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.859952449798584, + "rewards/margins": 7.860798358917236, + "rewards/rejected": -5.000845432281494, + "step": 13253 + }, + { + "epoch": 3.32, + "grad_norm": 4.126640796661377, + "learning_rate": 2.5473417293193113e-06, + "logits/chosen": -0.5142251253128052, + "logits/rejected": -0.5608769655227661, + "logps/chosen": -59.41818618774414, + "logps/rejected": -109.08242797851562, + "loss": 0.6096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1845335960388184, + "rewards/margins": 6.815575122833252, + "rewards/rejected": -3.6310415267944336, + "step": 13254 + }, + { + "epoch": 3.32, + "grad_norm": 7.932367324829102, + "learning_rate": 2.546656831612748e-06, + "logits/chosen": -0.5105465650558472, + "logits/rejected": -0.5372501015663147, + "logps/chosen": -50.80574035644531, + "logps/rejected": -115.79104614257812, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.030351400375366, + "rewards/margins": 6.873591423034668, + "rewards/rejected": -3.8432393074035645, + "step": 13255 + }, + { + "epoch": 3.32, + "grad_norm": 4.493417263031006, + "learning_rate": 2.5459719945309013e-06, + "logits/chosen": -0.5547482371330261, + "logits/rejected": -0.6409795880317688, + "logps/chosen": -53.71002960205078, + "logps/rejected": -96.56903076171875, + "loss": 0.5288, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4286859035491943, + "rewards/margins": 6.92488956451416, + "rewards/rejected": -3.4962031841278076, + "step": 13256 + }, + { + "epoch": 3.32, + "grad_norm": 8.37209415435791, + "learning_rate": 2.5452872180906995e-06, + "logits/chosen": -0.4923183023929596, + "logits/rejected": -0.6116082668304443, + "logps/chosen": -61.47874450683594, + "logps/rejected": -93.56800842285156, + "loss": 0.68, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9443559646606445, + "rewards/margins": 6.439132213592529, + "rewards/rejected": -3.4947757720947266, + "step": 13257 + }, + { + "epoch": 3.32, + "grad_norm": 2.34452748298645, + "learning_rate": 2.5446025023090613e-06, + "logits/chosen": -0.588529109954834, + "logits/rejected": -0.6335719227790833, + "logps/chosen": -51.74113464355469, + "logps/rejected": -129.49911499023438, + "loss": 0.6058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.131985902786255, + "rewards/margins": 7.846649169921875, + "rewards/rejected": -4.714663505554199, + "step": 13258 + }, + { + "epoch": 3.32, + "grad_norm": 4.840198993682861, + "learning_rate": 2.5439178472029046e-06, + "logits/chosen": -0.5447785258293152, + "logits/rejected": -0.6494585275650024, + "logps/chosen": -55.83692932128906, + "logps/rejected": -84.93482971191406, + "loss": 0.6611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2949891090393066, + "rewards/margins": 6.193614482879639, + "rewards/rejected": -2.8986260890960693, + "step": 13259 + }, + { + "epoch": 3.32, + "grad_norm": 3.827558755874634, + "learning_rate": 2.543233252789153e-06, + "logits/chosen": -0.5262836217880249, + "logits/rejected": -0.644664466381073, + "logps/chosen": -74.85755920410156, + "logps/rejected": -103.38945770263672, + "loss": 0.6682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2071030139923096, + "rewards/margins": 7.84474515914917, + "rewards/rejected": -4.637642860412598, + "step": 13260 + }, + { + "epoch": 3.32, + "grad_norm": 4.695594310760498, + "learning_rate": 2.5425487190847203e-06, + "logits/chosen": -0.5612970590591431, + "logits/rejected": -0.6623923778533936, + "logps/chosen": -47.599884033203125, + "logps/rejected": -111.89835357666016, + "loss": 0.5927, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.98557448387146, + "rewards/margins": 7.7576494216918945, + "rewards/rejected": -4.772075176239014, + "step": 13261 + }, + { + "epoch": 3.32, + "grad_norm": 4.1194844245910645, + "learning_rate": 2.5418642461065206e-06, + "logits/chosen": -0.5716922283172607, + "logits/rejected": -0.632540225982666, + "logps/chosen": -51.12884521484375, + "logps/rejected": -82.62802124023438, + "loss": 0.6415, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3453569412231445, + "rewards/margins": 6.118070602416992, + "rewards/rejected": -2.7727136611938477, + "step": 13262 + }, + { + "epoch": 3.32, + "grad_norm": 4.8861470222473145, + "learning_rate": 2.5411798338714698e-06, + "logits/chosen": -0.49265414476394653, + "logits/rejected": -0.5730482935905457, + "logps/chosen": -57.224430084228516, + "logps/rejected": -109.3199691772461, + "loss": 0.6579, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9521055221557617, + "rewards/margins": 6.612898826599121, + "rewards/rejected": -3.6607937812805176, + "step": 13263 + }, + { + "epoch": 3.32, + "grad_norm": 4.434854507446289, + "learning_rate": 2.5404954823964855e-06, + "logits/chosen": -0.5125317573547363, + "logits/rejected": -0.6100355386734009, + "logps/chosen": -55.55894088745117, + "logps/rejected": -106.84884643554688, + "loss": 0.6227, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9883952140808105, + "rewards/margins": 7.049740314483643, + "rewards/rejected": -4.061345100402832, + "step": 13264 + }, + { + "epoch": 3.32, + "grad_norm": 7.316734313964844, + "learning_rate": 2.539811191698469e-06, + "logits/chosen": -0.5854138135910034, + "logits/rejected": -0.6994879841804504, + "logps/chosen": -54.65401077270508, + "logps/rejected": -95.93096923828125, + "loss": 0.6678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9791412353515625, + "rewards/margins": 7.01318359375, + "rewards/rejected": -4.0340423583984375, + "step": 13265 + }, + { + "epoch": 3.32, + "grad_norm": 9.849335670471191, + "learning_rate": 2.5391269617943346e-06, + "logits/chosen": -0.5804623365402222, + "logits/rejected": -0.6840049624443054, + "logps/chosen": -47.33959197998047, + "logps/rejected": -106.00675201416016, + "loss": 0.5285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3054819107055664, + "rewards/margins": 7.977738380432129, + "rewards/rejected": -4.672255992889404, + "step": 13266 + }, + { + "epoch": 3.32, + "grad_norm": 5.070235729217529, + "learning_rate": 2.538442792700995e-06, + "logits/chosen": -0.5746033191680908, + "logits/rejected": -0.6123197078704834, + "logps/chosen": -54.920997619628906, + "logps/rejected": -112.8567123413086, + "loss": 0.654, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1482913494110107, + "rewards/margins": 6.559255599975586, + "rewards/rejected": -3.4109644889831543, + "step": 13267 + }, + { + "epoch": 3.32, + "grad_norm": 3.112342596054077, + "learning_rate": 2.5377586844353484e-06, + "logits/chosen": -0.5064669251441956, + "logits/rejected": -0.59122234582901, + "logps/chosen": -63.2984619140625, + "logps/rejected": -149.99636840820312, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4326107501983643, + "rewards/margins": 8.977424621582031, + "rewards/rejected": -5.5448150634765625, + "step": 13268 + }, + { + "epoch": 3.32, + "grad_norm": 2.8236753940582275, + "learning_rate": 2.5370746370143035e-06, + "logits/chosen": -0.5633037686347961, + "logits/rejected": -0.6207195520401001, + "logps/chosen": -54.367515563964844, + "logps/rejected": -112.55560302734375, + "loss": 0.6322, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4311249256134033, + "rewards/margins": 7.335275173187256, + "rewards/rejected": -3.9041500091552734, + "step": 13269 + }, + { + "epoch": 3.32, + "grad_norm": 1.7043296098709106, + "learning_rate": 2.5363906504547658e-06, + "logits/chosen": -0.5582364201545715, + "logits/rejected": -0.5960630178451538, + "logps/chosen": -55.35315704345703, + "logps/rejected": -118.22488403320312, + "loss": 0.5477, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.254182815551758, + "rewards/margins": 7.4549560546875, + "rewards/rejected": -4.200772762298584, + "step": 13270 + }, + { + "epoch": 3.32, + "grad_norm": 4.7254533767700195, + "learning_rate": 2.5357067247736354e-06, + "logits/chosen": -0.5110735893249512, + "logits/rejected": -0.542538583278656, + "logps/chosen": -52.04265213012695, + "logps/rejected": -126.89218139648438, + "loss": 0.6408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9014387130737305, + "rewards/margins": 7.856115341186523, + "rewards/rejected": -4.954677104949951, + "step": 13271 + }, + { + "epoch": 3.32, + "grad_norm": 16.91524887084961, + "learning_rate": 2.5350228599878114e-06, + "logits/chosen": -0.4780777096748352, + "logits/rejected": -0.54915452003479, + "logps/chosen": -70.3243637084961, + "logps/rejected": -87.4954833984375, + "loss": 0.85, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.908052921295166, + "rewards/margins": 5.276910781860352, + "rewards/rejected": -2.3688580989837646, + "step": 13272 + }, + { + "epoch": 3.32, + "grad_norm": 18.025569915771484, + "learning_rate": 2.534339056114196e-06, + "logits/chosen": -0.476888507604599, + "logits/rejected": -0.5835819840431213, + "logps/chosen": -56.29491424560547, + "logps/rejected": -113.71713256835938, + "loss": 0.628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.933114767074585, + "rewards/margins": 8.160633087158203, + "rewards/rejected": -5.227518558502197, + "step": 13273 + }, + { + "epoch": 3.32, + "grad_norm": 3.9375264644622803, + "learning_rate": 2.5336553131696857e-06, + "logits/chosen": -0.6052320599555969, + "logits/rejected": -0.677588701248169, + "logps/chosen": -41.362640380859375, + "logps/rejected": -92.05851745605469, + "loss": 0.5385, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2631802558898926, + "rewards/margins": 6.650216102600098, + "rewards/rejected": -3.387035369873047, + "step": 13274 + }, + { + "epoch": 3.32, + "grad_norm": 4.324343204498291, + "learning_rate": 2.5329716311711733e-06, + "logits/chosen": -0.493838369846344, + "logits/rejected": -0.5603092908859253, + "logps/chosen": -55.525142669677734, + "logps/rejected": -99.04850769042969, + "loss": 0.6656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.643610954284668, + "rewards/margins": 5.708615303039551, + "rewards/rejected": -3.065004825592041, + "step": 13275 + }, + { + "epoch": 3.32, + "grad_norm": 7.671606063842773, + "learning_rate": 2.5322880101355585e-06, + "logits/chosen": -0.5950279235839844, + "logits/rejected": -0.6351690292358398, + "logps/chosen": -51.93779373168945, + "logps/rejected": -98.27706909179688, + "loss": 0.7663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.973189115524292, + "rewards/margins": 5.1323652267456055, + "rewards/rejected": -2.1591756343841553, + "step": 13276 + }, + { + "epoch": 3.32, + "grad_norm": 3.5503463745117188, + "learning_rate": 2.5316044500797294e-06, + "logits/chosen": -0.5974826216697693, + "logits/rejected": -0.6897577047348022, + "logps/chosen": -65.33697509765625, + "logps/rejected": -98.89515686035156, + "loss": 0.6629, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.956378698348999, + "rewards/margins": 6.291808128356934, + "rewards/rejected": -3.3354291915893555, + "step": 13277 + }, + { + "epoch": 3.32, + "grad_norm": 3.0637145042419434, + "learning_rate": 2.5309209510205817e-06, + "logits/chosen": -0.5033660531044006, + "logits/rejected": -0.5966643691062927, + "logps/chosen": -59.92609405517578, + "logps/rejected": -115.58938598632812, + "loss": 0.5688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.489809036254883, + "rewards/margins": 7.5007710456848145, + "rewards/rejected": -4.010962009429932, + "step": 13278 + }, + { + "epoch": 3.32, + "grad_norm": 5.046731472015381, + "learning_rate": 2.5302375129750036e-06, + "logits/chosen": -0.48018091917037964, + "logits/rejected": -0.574344277381897, + "logps/chosen": -55.14247512817383, + "logps/rejected": -115.81924438476562, + "loss": 0.5639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7677931785583496, + "rewards/margins": 7.575992584228516, + "rewards/rejected": -4.808199882507324, + "step": 13279 + }, + { + "epoch": 3.32, + "grad_norm": 12.196304321289062, + "learning_rate": 2.529554135959882e-06, + "logits/chosen": -0.5097857117652893, + "logits/rejected": -0.5708920955657959, + "logps/chosen": -48.292179107666016, + "logps/rejected": -91.52615356445312, + "loss": 0.7632, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2941904067993164, + "rewards/margins": 5.897115707397461, + "rewards/rejected": -2.6029255390167236, + "step": 13280 + }, + { + "epoch": 3.32, + "grad_norm": 4.566104888916016, + "learning_rate": 2.528870819992105e-06, + "logits/chosen": -0.5778950452804565, + "logits/rejected": -0.62857985496521, + "logps/chosen": -60.3307991027832, + "logps/rejected": -109.70022583007812, + "loss": 0.5745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.122908353805542, + "rewards/margins": 6.807093143463135, + "rewards/rejected": -3.6841847896575928, + "step": 13281 + }, + { + "epoch": 3.32, + "grad_norm": 7.181350231170654, + "learning_rate": 2.52818756508856e-06, + "logits/chosen": -0.5402584671974182, + "logits/rejected": -0.6260286569595337, + "logps/chosen": -59.87053680419922, + "logps/rejected": -101.744384765625, + "loss": 0.7036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.086338996887207, + "rewards/margins": 6.069116115570068, + "rewards/rejected": -2.982776641845703, + "step": 13282 + }, + { + "epoch": 3.32, + "grad_norm": 6.0948486328125, + "learning_rate": 2.5275043712661295e-06, + "logits/chosen": -0.5913426876068115, + "logits/rejected": -0.6530981063842773, + "logps/chosen": -53.630615234375, + "logps/rejected": -101.60338592529297, + "loss": 0.7005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1119842529296875, + "rewards/margins": 6.748504161834717, + "rewards/rejected": -3.6365199089050293, + "step": 13283 + }, + { + "epoch": 3.32, + "grad_norm": 13.465453147888184, + "learning_rate": 2.5268212385416948e-06, + "logits/chosen": -0.5359174609184265, + "logits/rejected": -0.5436099171638489, + "logps/chosen": -61.92539596557617, + "logps/rejected": -134.6581268310547, + "loss": 0.6809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7365047931671143, + "rewards/margins": 6.454138278961182, + "rewards/rejected": -3.7176337242126465, + "step": 13284 + }, + { + "epoch": 3.32, + "grad_norm": 4.107594966888428, + "learning_rate": 2.5261381669321394e-06, + "logits/chosen": -0.6288844347000122, + "logits/rejected": -0.741826593875885, + "logps/chosen": -50.37495803833008, + "logps/rejected": -104.0280532836914, + "loss": 0.5706, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2631776332855225, + "rewards/margins": 8.239591598510742, + "rewards/rejected": -4.976415157318115, + "step": 13285 + }, + { + "epoch": 3.32, + "grad_norm": 12.552794456481934, + "learning_rate": 2.5254551564543406e-06, + "logits/chosen": -0.4554115831851959, + "logits/rejected": -0.5760443806648254, + "logps/chosen": -59.015037536621094, + "logps/rejected": -117.8580551147461, + "loss": 0.6009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7619433403015137, + "rewards/margins": 7.965912342071533, + "rewards/rejected": -5.203968524932861, + "step": 13286 + }, + { + "epoch": 3.32, + "grad_norm": 13.287150382995605, + "learning_rate": 2.524772207125176e-06, + "logits/chosen": -0.5737903118133545, + "logits/rejected": -0.6137087941169739, + "logps/chosen": -60.08265686035156, + "logps/rejected": -109.13629150390625, + "loss": 0.8198, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6939048767089844, + "rewards/margins": 6.189094543457031, + "rewards/rejected": -3.495189666748047, + "step": 13287 + }, + { + "epoch": 3.32, + "grad_norm": 5.140265941619873, + "learning_rate": 2.524089318961524e-06, + "logits/chosen": -0.5548002123832703, + "logits/rejected": -0.597019612789154, + "logps/chosen": -58.7268180847168, + "logps/rejected": -100.10317993164062, + "loss": 0.6626, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.234192371368408, + "rewards/margins": 7.075833797454834, + "rewards/rejected": -3.841641902923584, + "step": 13288 + }, + { + "epoch": 3.32, + "grad_norm": 7.0807204246521, + "learning_rate": 2.5234064919802587e-06, + "logits/chosen": -0.5347333550453186, + "logits/rejected": -0.5422986745834351, + "logps/chosen": -60.382347106933594, + "logps/rejected": -127.66522979736328, + "loss": 0.7056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3605222702026367, + "rewards/margins": 8.217691421508789, + "rewards/rejected": -4.8571696281433105, + "step": 13289 + }, + { + "epoch": 3.32, + "grad_norm": 3.4743874073028564, + "learning_rate": 2.5227237261982513e-06, + "logits/chosen": -0.6296101212501526, + "logits/rejected": -0.6867737174034119, + "logps/chosen": -47.270057678222656, + "logps/rejected": -96.21266174316406, + "loss": 0.6084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1388847827911377, + "rewards/margins": 5.917238712310791, + "rewards/rejected": -2.7783544063568115, + "step": 13290 + }, + { + "epoch": 3.32, + "grad_norm": 26.142417907714844, + "learning_rate": 2.5220410216323776e-06, + "logits/chosen": -0.6248673796653748, + "logits/rejected": -0.7116648554801941, + "logps/chosen": -49.7269287109375, + "logps/rejected": -111.31709289550781, + "loss": 0.7728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.672960042953491, + "rewards/margins": 6.990622520446777, + "rewards/rejected": -4.317661762237549, + "step": 13291 + }, + { + "epoch": 3.33, + "grad_norm": 3.7247061729431152, + "learning_rate": 2.5213583782995055e-06, + "logits/chosen": -0.5770410895347595, + "logits/rejected": -0.6580438613891602, + "logps/chosen": -50.64497756958008, + "logps/rejected": -95.2432632446289, + "loss": 0.6325, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1176059246063232, + "rewards/margins": 6.133984565734863, + "rewards/rejected": -3.016378879547119, + "step": 13292 + }, + { + "epoch": 3.33, + "grad_norm": 23.64546775817871, + "learning_rate": 2.520675796216503e-06, + "logits/chosen": -0.573161244392395, + "logits/rejected": -0.6672123670578003, + "logps/chosen": -55.466373443603516, + "logps/rejected": -114.54247283935547, + "loss": 0.6997, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.908715009689331, + "rewards/margins": 7.930999755859375, + "rewards/rejected": -5.022284030914307, + "step": 13293 + }, + { + "epoch": 3.33, + "grad_norm": 27.60814094543457, + "learning_rate": 2.5199932754002386e-06, + "logits/chosen": -0.5846736431121826, + "logits/rejected": -0.6214752793312073, + "logps/chosen": -51.8026123046875, + "logps/rejected": -95.93445587158203, + "loss": 0.9442, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6530513763427734, + "rewards/margins": 4.902162075042725, + "rewards/rejected": -2.2491111755371094, + "step": 13294 + }, + { + "epoch": 3.33, + "grad_norm": 3.253053903579712, + "learning_rate": 2.5193108158675795e-06, + "logits/chosen": -0.6167906522750854, + "logits/rejected": -0.667587399482727, + "logps/chosen": -47.27809524536133, + "logps/rejected": -87.72691345214844, + "loss": 0.5779, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.265390157699585, + "rewards/margins": 5.880712509155273, + "rewards/rejected": -2.6153225898742676, + "step": 13295 + }, + { + "epoch": 3.33, + "grad_norm": 6.714505672454834, + "learning_rate": 2.518628417635389e-06, + "logits/chosen": -0.5614667534828186, + "logits/rejected": -0.6317340731620789, + "logps/chosen": -69.54948425292969, + "logps/rejected": -101.89645385742188, + "loss": 0.8055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9573848247528076, + "rewards/margins": 5.9924421310424805, + "rewards/rejected": -3.0350568294525146, + "step": 13296 + }, + { + "epoch": 3.33, + "grad_norm": 5.814345359802246, + "learning_rate": 2.517946080720528e-06, + "logits/chosen": -0.46061190962791443, + "logits/rejected": -0.5082509517669678, + "logps/chosen": -50.6003303527832, + "logps/rejected": -125.05196380615234, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1459360122680664, + "rewards/margins": 7.368902206420898, + "rewards/rejected": -4.222965717315674, + "step": 13297 + }, + { + "epoch": 3.33, + "grad_norm": 12.408981323242188, + "learning_rate": 2.517263805139861e-06, + "logits/chosen": -0.6090962886810303, + "logits/rejected": -0.6753785610198975, + "logps/chosen": -58.853878021240234, + "logps/rejected": -89.76348876953125, + "loss": 0.846, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.20175838470459, + "rewards/margins": 5.971324920654297, + "rewards/rejected": -2.7695658206939697, + "step": 13298 + }, + { + "epoch": 3.33, + "grad_norm": 21.545745849609375, + "learning_rate": 2.5165815909102463e-06, + "logits/chosen": -0.6289859414100647, + "logits/rejected": -0.6752555966377258, + "logps/chosen": -46.38512420654297, + "logps/rejected": -105.85896301269531, + "loss": 0.6759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8761727809906006, + "rewards/margins": 6.300927639007568, + "rewards/rejected": -3.424755334854126, + "step": 13299 + }, + { + "epoch": 3.33, + "grad_norm": 3.1061201095581055, + "learning_rate": 2.5158994380485403e-06, + "logits/chosen": -0.5276263356208801, + "logits/rejected": -0.6121674180030823, + "logps/chosen": -53.906455993652344, + "logps/rejected": -98.67617797851562, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8838818073272705, + "rewards/margins": 6.274407386779785, + "rewards/rejected": -3.3905248641967773, + "step": 13300 + }, + { + "epoch": 3.33, + "grad_norm": 4.401501178741455, + "learning_rate": 2.5152173465716034e-06, + "logits/chosen": -0.5402190685272217, + "logits/rejected": -0.6295949220657349, + "logps/chosen": -48.34397506713867, + "logps/rejected": -104.6036376953125, + "loss": 0.5845, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1741747856140137, + "rewards/margins": 6.753244400024414, + "rewards/rejected": -3.5790698528289795, + "step": 13301 + }, + { + "epoch": 3.33, + "grad_norm": 17.624887466430664, + "learning_rate": 2.514535316496286e-06, + "logits/chosen": -0.6084578633308411, + "logits/rejected": -0.7137372493743896, + "logps/chosen": -54.94569396972656, + "logps/rejected": -90.02163696289062, + "loss": 0.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.644573211669922, + "rewards/margins": 6.6147236824035645, + "rewards/rejected": -3.9701504707336426, + "step": 13302 + }, + { + "epoch": 3.33, + "grad_norm": 14.447757720947266, + "learning_rate": 2.5138533478394474e-06, + "logits/chosen": -0.6421884298324585, + "logits/rejected": -0.6949765086174011, + "logps/chosen": -48.3326301574707, + "logps/rejected": -114.02616119384766, + "loss": 0.6353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1612355709075928, + "rewards/margins": 7.685337066650391, + "rewards/rejected": -4.524101734161377, + "step": 13303 + }, + { + "epoch": 3.33, + "grad_norm": 6.877700328826904, + "learning_rate": 2.5131714406179365e-06, + "logits/chosen": -0.5794756412506104, + "logits/rejected": -0.6498242616653442, + "logps/chosen": -45.847835540771484, + "logps/rejected": -107.66895294189453, + "loss": 0.5812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2853662967681885, + "rewards/margins": 7.100981712341309, + "rewards/rejected": -3.81561541557312, + "step": 13304 + }, + { + "epoch": 3.33, + "grad_norm": 3.526839017868042, + "learning_rate": 2.5124895948486027e-06, + "logits/chosen": -0.5469022393226624, + "logits/rejected": -0.6107656955718994, + "logps/chosen": -61.4183349609375, + "logps/rejected": -114.49391174316406, + "loss": 0.6551, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0993385314941406, + "rewards/margins": 7.021707534790039, + "rewards/rejected": -3.9223690032958984, + "step": 13305 + }, + { + "epoch": 3.33, + "grad_norm": 4.111918926239014, + "learning_rate": 2.5118078105482967e-06, + "logits/chosen": -0.49033743143081665, + "logits/rejected": -0.5724478363990784, + "logps/chosen": -49.324459075927734, + "logps/rejected": -116.44529724121094, + "loss": 0.5701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1747984886169434, + "rewards/margins": 6.744693756103516, + "rewards/rejected": -3.5698952674865723, + "step": 13306 + }, + { + "epoch": 3.33, + "grad_norm": 6.071070194244385, + "learning_rate": 2.51112608773387e-06, + "logits/chosen": -0.5650939345359802, + "logits/rejected": -0.6767145395278931, + "logps/chosen": -62.59918975830078, + "logps/rejected": -105.17488098144531, + "loss": 0.7367, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.240706443786621, + "rewards/margins": 7.238083839416504, + "rewards/rejected": -3.997377634048462, + "step": 13307 + }, + { + "epoch": 3.33, + "grad_norm": 4.389920711517334, + "learning_rate": 2.510444426422161e-06, + "logits/chosen": -0.4979681372642517, + "logits/rejected": -0.5675209760665894, + "logps/chosen": -49.800148010253906, + "logps/rejected": -84.73959350585938, + "loss": 0.6519, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0603280067443848, + "rewards/margins": 5.468143463134766, + "rewards/rejected": -2.40781569480896, + "step": 13308 + }, + { + "epoch": 3.33, + "grad_norm": 6.084274768829346, + "learning_rate": 2.509762826630019e-06, + "logits/chosen": -0.5013875365257263, + "logits/rejected": -0.5684539079666138, + "logps/chosen": -45.02143096923828, + "logps/rejected": -100.92564392089844, + "loss": 0.5285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.04652738571167, + "rewards/margins": 5.9561285972595215, + "rewards/rejected": -2.9096012115478516, + "step": 13309 + }, + { + "epoch": 3.33, + "grad_norm": 11.285867691040039, + "learning_rate": 2.509081288374289e-06, + "logits/chosen": -0.5380210876464844, + "logits/rejected": -0.611615002155304, + "logps/chosen": -46.94196701049805, + "logps/rejected": -123.91486358642578, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3121495246887207, + "rewards/margins": 8.343613624572754, + "rewards/rejected": -5.031463623046875, + "step": 13310 + }, + { + "epoch": 3.33, + "grad_norm": 5.575448513031006, + "learning_rate": 2.5083998116718067e-06, + "logits/chosen": -0.5477957129478455, + "logits/rejected": -0.6377776265144348, + "logps/chosen": -72.70452117919922, + "logps/rejected": -92.27393341064453, + "loss": 0.7923, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0527238845825195, + "rewards/margins": 5.7316999435424805, + "rewards/rejected": -2.67897629737854, + "step": 13311 + }, + { + "epoch": 3.33, + "grad_norm": 4.427846431732178, + "learning_rate": 2.507718396539415e-06, + "logits/chosen": -0.5362859964370728, + "logits/rejected": -0.5905343294143677, + "logps/chosen": -47.870262145996094, + "logps/rejected": -88.39815521240234, + "loss": 0.6297, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0560507774353027, + "rewards/margins": 5.499950885772705, + "rewards/rejected": -2.4439001083374023, + "step": 13312 + }, + { + "epoch": 3.33, + "grad_norm": 3.0392658710479736, + "learning_rate": 2.507037042993955e-06, + "logits/chosen": -0.5905084609985352, + "logits/rejected": -0.6612817645072937, + "logps/chosen": -47.5138053894043, + "logps/rejected": -117.59000396728516, + "loss": 0.5764, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052088737487793, + "rewards/margins": 8.252472877502441, + "rewards/rejected": -5.200384140014648, + "step": 13313 + }, + { + "epoch": 3.33, + "grad_norm": 36.802818298339844, + "learning_rate": 2.5063557510522606e-06, + "logits/chosen": -0.48003068566322327, + "logits/rejected": -0.571952223777771, + "logps/chosen": -69.21377563476562, + "logps/rejected": -101.88658142089844, + "loss": 0.7071, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.146286725997925, + "rewards/margins": 5.64068078994751, + "rewards/rejected": -2.494394063949585, + "step": 13314 + }, + { + "epoch": 3.33, + "grad_norm": 3.9603145122528076, + "learning_rate": 2.5056745207311668e-06, + "logits/chosen": -0.5487788319587708, + "logits/rejected": -0.6710256338119507, + "logps/chosen": -58.92863082885742, + "logps/rejected": -85.98757934570312, + "loss": 0.6021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5733907222747803, + "rewards/margins": 6.758315563201904, + "rewards/rejected": -3.184924364089966, + "step": 13315 + }, + { + "epoch": 3.33, + "grad_norm": 4.301329135894775, + "learning_rate": 2.50499335204751e-06, + "logits/chosen": -0.47263848781585693, + "logits/rejected": -0.5927051901817322, + "logps/chosen": -56.213932037353516, + "logps/rejected": -104.26838684082031, + "loss": 0.5954, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.659956455230713, + "rewards/margins": 6.296690940856934, + "rewards/rejected": -3.6367347240448, + "step": 13316 + }, + { + "epoch": 3.33, + "grad_norm": 3.0092782974243164, + "learning_rate": 2.504312245018121e-06, + "logits/chosen": -0.5361834764480591, + "logits/rejected": -0.5913746356964111, + "logps/chosen": -47.78956604003906, + "logps/rejected": -101.65657806396484, + "loss": 0.5829, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.99218487739563, + "rewards/margins": 6.942682266235352, + "rewards/rejected": -3.9504971504211426, + "step": 13317 + }, + { + "epoch": 3.33, + "grad_norm": 6.552111625671387, + "learning_rate": 2.50363119965983e-06, + "logits/chosen": -0.5881005525588989, + "logits/rejected": -0.6618078351020813, + "logps/chosen": -58.26788330078125, + "logps/rejected": -109.14173889160156, + "loss": 0.7008, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9072957038879395, + "rewards/margins": 6.150224208831787, + "rewards/rejected": -3.2429277896881104, + "step": 13318 + }, + { + "epoch": 3.33, + "grad_norm": 4.6982831954956055, + "learning_rate": 2.502950215989469e-06, + "logits/chosen": -0.5530695915222168, + "logits/rejected": -0.590085506439209, + "logps/chosen": -45.21324157714844, + "logps/rejected": -111.83148956298828, + "loss": 0.5538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8954720497131348, + "rewards/margins": 6.9842047691345215, + "rewards/rejected": -4.088732719421387, + "step": 13319 + }, + { + "epoch": 3.33, + "grad_norm": 5.523916244506836, + "learning_rate": 2.5022692940238617e-06, + "logits/chosen": -0.5733816027641296, + "logits/rejected": -0.6696634888648987, + "logps/chosen": -45.963340759277344, + "logps/rejected": -106.5312271118164, + "loss": 0.5496, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0268030166625977, + "rewards/margins": 8.073320388793945, + "rewards/rejected": -5.046516418457031, + "step": 13320 + }, + { + "epoch": 3.33, + "grad_norm": 6.695833683013916, + "learning_rate": 2.501588433779839e-06, + "logits/chosen": -0.5078432559967041, + "logits/rejected": -0.5882468223571777, + "logps/chosen": -51.35686111450195, + "logps/rejected": -97.28837585449219, + "loss": 0.6045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.076458215713501, + "rewards/margins": 7.165011405944824, + "rewards/rejected": -4.088553428649902, + "step": 13321 + }, + { + "epoch": 3.33, + "grad_norm": 16.364744186401367, + "learning_rate": 2.5009076352742234e-06, + "logits/chosen": -0.5793702602386475, + "logits/rejected": -0.7095131278038025, + "logps/chosen": -60.449249267578125, + "logps/rejected": -93.41629791259766, + "loss": 0.7792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7849345207214355, + "rewards/margins": 7.049630165100098, + "rewards/rejected": -4.2646965980529785, + "step": 13322 + }, + { + "epoch": 3.33, + "grad_norm": 2.8390896320343018, + "learning_rate": 2.5002268985238367e-06, + "logits/chosen": -0.504322350025177, + "logits/rejected": -0.6483404636383057, + "logps/chosen": -63.16279602050781, + "logps/rejected": -90.31800842285156, + "loss": 0.5959, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.158780097961426, + "rewards/margins": 7.27559757232666, + "rewards/rejected": -4.116817474365234, + "step": 13323 + }, + { + "epoch": 3.33, + "grad_norm": 2.374845027923584, + "learning_rate": 2.499546223545504e-06, + "logits/chosen": -0.6026837229728699, + "logits/rejected": -0.6884592771530151, + "logps/chosen": -46.80044174194336, + "logps/rejected": -103.07308959960938, + "loss": 0.5321, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5750832557678223, + "rewards/margins": 7.853344917297363, + "rewards/rejected": -4.278261184692383, + "step": 13324 + }, + { + "epoch": 3.33, + "grad_norm": 10.64120101928711, + "learning_rate": 2.4988656103560417e-06, + "logits/chosen": -0.5362096428871155, + "logits/rejected": -0.624816358089447, + "logps/chosen": -49.41792297363281, + "logps/rejected": -97.2032699584961, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9604270458221436, + "rewards/margins": 6.2648725509643555, + "rewards/rejected": -3.3044450283050537, + "step": 13325 + }, + { + "epoch": 3.33, + "grad_norm": 6.836430072784424, + "learning_rate": 2.498185058972273e-06, + "logits/chosen": -0.5915277004241943, + "logits/rejected": -0.6743156909942627, + "logps/chosen": -55.49463653564453, + "logps/rejected": -82.41862487792969, + "loss": 0.6634, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8975369930267334, + "rewards/margins": 5.546820163726807, + "rewards/rejected": -2.6492831707000732, + "step": 13326 + }, + { + "epoch": 3.33, + "grad_norm": 4.646721363067627, + "learning_rate": 2.4975045694110094e-06, + "logits/chosen": -0.605734646320343, + "logits/rejected": -0.6696205139160156, + "logps/chosen": -52.93560028076172, + "logps/rejected": -95.00489807128906, + "loss": 0.5865, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2173330783843994, + "rewards/margins": 5.586243152618408, + "rewards/rejected": -2.368910312652588, + "step": 13327 + }, + { + "epoch": 3.33, + "grad_norm": 2.9085676670074463, + "learning_rate": 2.4968241416890725e-06, + "logits/chosen": -0.5353331565856934, + "logits/rejected": -0.652210533618927, + "logps/chosen": -48.7923583984375, + "logps/rejected": -100.02938079833984, + "loss": 0.5811, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2995574474334717, + "rewards/margins": 8.501092910766602, + "rewards/rejected": -5.201537132263184, + "step": 13328 + }, + { + "epoch": 3.33, + "grad_norm": 4.42047119140625, + "learning_rate": 2.496143775823273e-06, + "logits/chosen": -0.5800638198852539, + "logits/rejected": -0.6074509620666504, + "logps/chosen": -47.80628967285156, + "logps/rejected": -92.4139404296875, + "loss": 0.5302, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2912490367889404, + "rewards/margins": 6.20374059677124, + "rewards/rejected": -2.912491798400879, + "step": 13329 + }, + { + "epoch": 3.33, + "grad_norm": 2.8896126747131348, + "learning_rate": 2.495463471830423e-06, + "logits/chosen": -0.5684973001480103, + "logits/rejected": -0.629243016242981, + "logps/chosen": -46.30046081542969, + "logps/rejected": -96.93623352050781, + "loss": 0.5512, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4987845420837402, + "rewards/margins": 6.927408695220947, + "rewards/rejected": -3.428623914718628, + "step": 13330 + }, + { + "epoch": 3.33, + "grad_norm": 4.098040580749512, + "learning_rate": 2.4947832297273356e-06, + "logits/chosen": -0.46256110072135925, + "logits/rejected": -0.5843132138252258, + "logps/chosen": -71.72516632080078, + "logps/rejected": -109.45032501220703, + "loss": 0.6531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7693657875061035, + "rewards/margins": 6.873266220092773, + "rewards/rejected": -4.103899955749512, + "step": 13331 + }, + { + "epoch": 3.34, + "grad_norm": 8.776285171508789, + "learning_rate": 2.49410304953082e-06, + "logits/chosen": -0.5120492577552795, + "logits/rejected": -0.5643508434295654, + "logps/chosen": -42.53264236450195, + "logps/rejected": -119.43408203125, + "loss": 0.5524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2579736709594727, + "rewards/margins": 6.1003031730651855, + "rewards/rejected": -2.8423290252685547, + "step": 13332 + }, + { + "epoch": 3.34, + "grad_norm": 7.2739410400390625, + "learning_rate": 2.493422931257681e-06, + "logits/chosen": -0.5090309977531433, + "logits/rejected": -0.5927842855453491, + "logps/chosen": -52.851646423339844, + "logps/rejected": -103.94095611572266, + "loss": 0.5125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9692752361297607, + "rewards/margins": 7.218451499938965, + "rewards/rejected": -4.249176025390625, + "step": 13333 + }, + { + "epoch": 3.34, + "grad_norm": 7.648014068603516, + "learning_rate": 2.4927428749247298e-06, + "logits/chosen": -0.5344040393829346, + "logits/rejected": -0.6293268203735352, + "logps/chosen": -63.65251541137695, + "logps/rejected": -99.702880859375, + "loss": 0.5891, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2560219764709473, + "rewards/margins": 6.630965232849121, + "rewards/rejected": -3.3749430179595947, + "step": 13334 + }, + { + "epoch": 3.34, + "grad_norm": 5.99117374420166, + "learning_rate": 2.4920628805487684e-06, + "logits/chosen": -0.5873221158981323, + "logits/rejected": -0.7085404992103577, + "logps/chosen": -51.861228942871094, + "logps/rejected": -85.43343353271484, + "loss": 0.5991, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0863521099090576, + "rewards/margins": 6.625487804412842, + "rewards/rejected": -3.539135217666626, + "step": 13335 + }, + { + "epoch": 3.34, + "grad_norm": 3.143979787826538, + "learning_rate": 2.491382948146599e-06, + "logits/chosen": -0.583574116230011, + "logits/rejected": -0.6770687103271484, + "logps/chosen": -47.017860412597656, + "logps/rejected": -124.96612548828125, + "loss": 0.5577, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3389346599578857, + "rewards/margins": 9.076024055480957, + "rewards/rejected": -5.73708963394165, + "step": 13336 + }, + { + "epoch": 3.34, + "grad_norm": 3.939678192138672, + "learning_rate": 2.4907030777350253e-06, + "logits/chosen": -0.588007390499115, + "logits/rejected": -0.6695504784584045, + "logps/chosen": -46.276302337646484, + "logps/rejected": -101.90262603759766, + "loss": 0.6091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.13082218170166, + "rewards/margins": 7.070643901824951, + "rewards/rejected": -3.939821720123291, + "step": 13337 + }, + { + "epoch": 3.34, + "grad_norm": 6.739354610443115, + "learning_rate": 2.4900232693308497e-06, + "logits/chosen": -0.5495470762252808, + "logits/rejected": -0.6578901410102844, + "logps/chosen": -52.63349533081055, + "logps/rejected": -82.57799530029297, + "loss": 0.6429, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9769155979156494, + "rewards/margins": 6.9934983253479, + "rewards/rejected": -4.016582489013672, + "step": 13338 + }, + { + "epoch": 3.34, + "grad_norm": 3.705476760864258, + "learning_rate": 2.4893435229508688e-06, + "logits/chosen": -0.5431598424911499, + "logits/rejected": -0.5982018709182739, + "logps/chosen": -50.40503692626953, + "logps/rejected": -107.1448745727539, + "loss": 0.5923, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.156313419342041, + "rewards/margins": 7.360642910003662, + "rewards/rejected": -4.204329490661621, + "step": 13339 + }, + { + "epoch": 3.34, + "grad_norm": 6.081522464752197, + "learning_rate": 2.488663838611878e-06, + "logits/chosen": -0.5718021392822266, + "logits/rejected": -0.6024640798568726, + "logps/chosen": -49.04988098144531, + "logps/rejected": -90.38580322265625, + "loss": 0.7418, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7704081535339355, + "rewards/margins": 5.285265922546387, + "rewards/rejected": -2.514857769012451, + "step": 13340 + }, + { + "epoch": 3.34, + "grad_norm": 2.7255890369415283, + "learning_rate": 2.4879842163306766e-06, + "logits/chosen": -0.5258253812789917, + "logits/rejected": -0.590998113155365, + "logps/chosen": -48.75905227661133, + "logps/rejected": -106.06576538085938, + "loss": 0.576, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1134908199310303, + "rewards/margins": 7.582267761230469, + "rewards/rejected": -4.468776702880859, + "step": 13341 + }, + { + "epoch": 3.34, + "grad_norm": 2.1937320232391357, + "learning_rate": 2.487304656124057e-06, + "logits/chosen": -0.5423352718353271, + "logits/rejected": -0.629706859588623, + "logps/chosen": -60.640907287597656, + "logps/rejected": -103.0550308227539, + "loss": 0.6385, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1985726356506348, + "rewards/margins": 6.944906711578369, + "rewards/rejected": -3.7463338375091553, + "step": 13342 + }, + { + "epoch": 3.34, + "grad_norm": 4.71816873550415, + "learning_rate": 2.48662515800881e-06, + "logits/chosen": -0.540556788444519, + "logits/rejected": -0.6370538473129272, + "logps/chosen": -54.35649108886719, + "logps/rejected": -101.96129608154297, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1312127113342285, + "rewards/margins": 7.075153350830078, + "rewards/rejected": -3.943939685821533, + "step": 13343 + }, + { + "epoch": 3.34, + "grad_norm": 7.023054122924805, + "learning_rate": 2.485945722001731e-06, + "logits/chosen": -0.5379108190536499, + "logits/rejected": -0.6351189613342285, + "logps/chosen": -53.151512145996094, + "logps/rejected": -120.83350372314453, + "loss": 0.6393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.625580310821533, + "rewards/margins": 9.030741691589355, + "rewards/rejected": -6.405160903930664, + "step": 13344 + }, + { + "epoch": 3.34, + "grad_norm": 4.343884468078613, + "learning_rate": 2.4852663481196053e-06, + "logits/chosen": -0.5365654230117798, + "logits/rejected": -0.6186287999153137, + "logps/chosen": -60.74763870239258, + "logps/rejected": -129.22560119628906, + "loss": 0.6422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0214147567749023, + "rewards/margins": 8.048589706420898, + "rewards/rejected": -5.02717399597168, + "step": 13345 + }, + { + "epoch": 3.34, + "grad_norm": 12.524295806884766, + "learning_rate": 2.484587036379225e-06, + "logits/chosen": -0.5814575552940369, + "logits/rejected": -0.6908873319625854, + "logps/chosen": -54.51338195800781, + "logps/rejected": -102.50013732910156, + "loss": 0.6577, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.103438138961792, + "rewards/margins": 6.634922981262207, + "rewards/rejected": -3.531484842300415, + "step": 13346 + }, + { + "epoch": 3.34, + "grad_norm": 8.264545440673828, + "learning_rate": 2.4839077867973745e-06, + "logits/chosen": -0.598196268081665, + "logits/rejected": -0.6569187641143799, + "logps/chosen": -58.69332504272461, + "logps/rejected": -112.07749938964844, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.999721050262451, + "rewards/margins": 8.117136001586914, + "rewards/rejected": -5.117415904998779, + "step": 13347 + }, + { + "epoch": 3.34, + "grad_norm": 2.8326804637908936, + "learning_rate": 2.4832285993908365e-06, + "logits/chosen": -0.5658257603645325, + "logits/rejected": -0.6485154628753662, + "logps/chosen": -57.45396041870117, + "logps/rejected": -114.81111145019531, + "loss": 0.6316, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8719406127929688, + "rewards/margins": 7.557695388793945, + "rewards/rejected": -4.685754776000977, + "step": 13348 + }, + { + "epoch": 3.34, + "grad_norm": 7.184292316436768, + "learning_rate": 2.482549474176399e-06, + "logits/chosen": -0.6638838052749634, + "logits/rejected": -0.719017505645752, + "logps/chosen": -49.54317855834961, + "logps/rejected": -108.42302703857422, + "loss": 0.6767, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032707452774048, + "rewards/margins": 7.490159034729004, + "rewards/rejected": -4.457450866699219, + "step": 13349 + }, + { + "epoch": 3.34, + "grad_norm": 12.74553108215332, + "learning_rate": 2.481870411170842e-06, + "logits/chosen": -0.5050916075706482, + "logits/rejected": -0.6046081781387329, + "logps/chosen": -46.88398742675781, + "logps/rejected": -124.53247833251953, + "loss": 0.6442, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8402132987976074, + "rewards/margins": 8.073881149291992, + "rewards/rejected": -5.233666896820068, + "step": 13350 + }, + { + "epoch": 3.34, + "grad_norm": 6.726109981536865, + "learning_rate": 2.481191410390943e-06, + "logits/chosen": -0.5500391721725464, + "logits/rejected": -0.6338157057762146, + "logps/chosen": -61.51478958129883, + "logps/rejected": -118.50881958007812, + "loss": 0.64, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.199458599090576, + "rewards/margins": 7.604681491851807, + "rewards/rejected": -4.4052228927612305, + "step": 13351 + }, + { + "epoch": 3.34, + "grad_norm": 5.250444412231445, + "learning_rate": 2.4805124718534834e-06, + "logits/chosen": -0.603918194770813, + "logits/rejected": -0.7058758735656738, + "logps/chosen": -58.47236633300781, + "logps/rejected": -80.27037811279297, + "loss": 0.6255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9846134185791016, + "rewards/margins": 6.301534652709961, + "rewards/rejected": -3.3169212341308594, + "step": 13352 + }, + { + "epoch": 3.34, + "grad_norm": 5.602682590484619, + "learning_rate": 2.4798335955752446e-06, + "logits/chosen": -0.526459813117981, + "logits/rejected": -0.6100140810012817, + "logps/chosen": -54.13252258300781, + "logps/rejected": -99.92859649658203, + "loss": 0.6852, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.313899517059326, + "rewards/margins": 7.002628326416016, + "rewards/rejected": -3.6887290477752686, + "step": 13353 + }, + { + "epoch": 3.34, + "grad_norm": 3.095242500305176, + "learning_rate": 2.4791547815729946e-06, + "logits/chosen": -0.6085025072097778, + "logits/rejected": -0.6525972485542297, + "logps/chosen": -53.83649444580078, + "logps/rejected": -117.31338500976562, + "loss": 0.6308, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.050973415374756, + "rewards/margins": 7.20137882232666, + "rewards/rejected": -4.150405406951904, + "step": 13354 + }, + { + "epoch": 3.34, + "grad_norm": 5.397253036499023, + "learning_rate": 2.478476029863511e-06, + "logits/chosen": -0.5667678117752075, + "logits/rejected": -0.6066583395004272, + "logps/chosen": -48.59161376953125, + "logps/rejected": -95.36592864990234, + "loss": 0.6394, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.037384033203125, + "rewards/margins": 6.834446907043457, + "rewards/rejected": -3.7970633506774902, + "step": 13355 + }, + { + "epoch": 3.34, + "grad_norm": 7.891015529632568, + "learning_rate": 2.4777973404635687e-06, + "logits/chosen": -0.5487455129623413, + "logits/rejected": -0.6115779280662537, + "logps/chosen": -57.05902862548828, + "logps/rejected": -122.81938171386719, + "loss": 0.6375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1547632217407227, + "rewards/margins": 8.30325698852539, + "rewards/rejected": -5.148493766784668, + "step": 13356 + }, + { + "epoch": 3.34, + "grad_norm": 5.41158390045166, + "learning_rate": 2.4771187133899367e-06, + "logits/chosen": -0.5197175145149231, + "logits/rejected": -0.639869749546051, + "logps/chosen": -57.67706298828125, + "logps/rejected": -112.41297912597656, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8219826221466064, + "rewards/margins": 7.7951436042785645, + "rewards/rejected": -4.973161697387695, + "step": 13357 + }, + { + "epoch": 3.34, + "grad_norm": 3.3446602821350098, + "learning_rate": 2.4764401486593826e-06, + "logits/chosen": -0.6006321907043457, + "logits/rejected": -0.6955553293228149, + "logps/chosen": -47.78076171875, + "logps/rejected": -89.68081665039062, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0457231998443604, + "rewards/margins": 6.963563919067383, + "rewards/rejected": -3.917840003967285, + "step": 13358 + }, + { + "epoch": 3.34, + "grad_norm": 5.8072333335876465, + "learning_rate": 2.4757616462886786e-06, + "logits/chosen": -0.600145697593689, + "logits/rejected": -0.61648029088974, + "logps/chosen": -62.27665710449219, + "logps/rejected": -135.89794921875, + "loss": 0.736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9884064197540283, + "rewards/margins": 7.372809410095215, + "rewards/rejected": -4.384402275085449, + "step": 13359 + }, + { + "epoch": 3.34, + "grad_norm": 5.887679576873779, + "learning_rate": 2.4750832062945897e-06, + "logits/chosen": -0.5544522404670715, + "logits/rejected": -0.6294118762016296, + "logps/chosen": -54.46506118774414, + "logps/rejected": -122.74942016601562, + "loss": 0.561, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.181821346282959, + "rewards/margins": 8.123003959655762, + "rewards/rejected": -4.941182613372803, + "step": 13360 + }, + { + "epoch": 3.34, + "grad_norm": 7.188495635986328, + "learning_rate": 2.474404828693878e-06, + "logits/chosen": -0.5545921325683594, + "logits/rejected": -0.6344473361968994, + "logps/chosen": -57.3129997253418, + "logps/rejected": -98.9188232421875, + "loss": 0.6865, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1256167888641357, + "rewards/margins": 7.2683305740356445, + "rewards/rejected": -4.142712593078613, + "step": 13361 + }, + { + "epoch": 3.34, + "grad_norm": 3.772132158279419, + "learning_rate": 2.4737265135033113e-06, + "logits/chosen": -0.5761919021606445, + "logits/rejected": -0.6282240748405457, + "logps/chosen": -61.74552917480469, + "logps/rejected": -105.82263946533203, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2343714237213135, + "rewards/margins": 6.647972583770752, + "rewards/rejected": -3.413600444793701, + "step": 13362 + }, + { + "epoch": 3.34, + "grad_norm": 5.404396057128906, + "learning_rate": 2.473048260739648e-06, + "logits/chosen": -0.5085282325744629, + "logits/rejected": -0.632117748260498, + "logps/chosen": -76.6417465209961, + "logps/rejected": -89.30357360839844, + "loss": 0.7625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8903648853302, + "rewards/margins": 6.98776912689209, + "rewards/rejected": -4.097404956817627, + "step": 13363 + }, + { + "epoch": 3.34, + "grad_norm": 4.631223201751709, + "learning_rate": 2.4723700704196513e-06, + "logits/chosen": -0.6099326014518738, + "logits/rejected": -0.7049822807312012, + "logps/chosen": -45.398765563964844, + "logps/rejected": -98.35563659667969, + "loss": 0.5752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1777615547180176, + "rewards/margins": 7.678665637969971, + "rewards/rejected": -4.500903606414795, + "step": 13364 + }, + { + "epoch": 3.34, + "grad_norm": 3.6328940391540527, + "learning_rate": 2.4716919425600787e-06, + "logits/chosen": -0.5590915083885193, + "logits/rejected": -0.6230422854423523, + "logps/chosen": -52.73409652709961, + "logps/rejected": -117.60885620117188, + "loss": 0.6995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1807379722595215, + "rewards/margins": 7.086490631103516, + "rewards/rejected": -3.9057517051696777, + "step": 13365 + }, + { + "epoch": 3.34, + "grad_norm": 3.119939088821411, + "learning_rate": 2.471013877177686e-06, + "logits/chosen": -0.5698485374450684, + "logits/rejected": -0.6373021602630615, + "logps/chosen": -53.64487838745117, + "logps/rejected": -102.74220275878906, + "loss": 0.6017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994419574737549, + "rewards/margins": 6.817928314208984, + "rewards/rejected": -3.8235087394714355, + "step": 13366 + }, + { + "epoch": 3.34, + "grad_norm": 6.072563171386719, + "learning_rate": 2.470335874289232e-06, + "logits/chosen": -0.533454418182373, + "logits/rejected": -0.5945650339126587, + "logps/chosen": -57.154788970947266, + "logps/rejected": -100.30805206298828, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4796152114868164, + "rewards/margins": 6.824512958526611, + "rewards/rejected": -3.344897747039795, + "step": 13367 + }, + { + "epoch": 3.34, + "grad_norm": 8.110530853271484, + "learning_rate": 2.4696579339114697e-06, + "logits/chosen": -0.520354151725769, + "logits/rejected": -0.6097694635391235, + "logps/chosen": -57.42918014526367, + "logps/rejected": -102.82713317871094, + "loss": 0.5927, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148599863052368, + "rewards/margins": 7.436552047729492, + "rewards/rejected": -4.287952899932861, + "step": 13368 + }, + { + "epoch": 3.34, + "grad_norm": 4.427454948425293, + "learning_rate": 2.4689800560611486e-06, + "logits/chosen": -0.5925819277763367, + "logits/rejected": -0.6650078892707825, + "logps/chosen": -54.98246765136719, + "logps/rejected": -115.78968811035156, + "loss": 0.599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.057016372680664, + "rewards/margins": 7.7358832359313965, + "rewards/rejected": -4.678866386413574, + "step": 13369 + }, + { + "epoch": 3.34, + "grad_norm": 2.922896146774292, + "learning_rate": 2.468302240755023e-06, + "logits/chosen": -0.530737042427063, + "logits/rejected": -0.6330636739730835, + "logps/chosen": -54.480674743652344, + "logps/rejected": -116.59147644042969, + "loss": 0.5798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.863845109939575, + "rewards/margins": 7.705877304077148, + "rewards/rejected": -4.842031955718994, + "step": 13370 + }, + { + "epoch": 3.34, + "grad_norm": 2.024935483932495, + "learning_rate": 2.467624488009843e-06, + "logits/chosen": -0.5162017345428467, + "logits/rejected": -0.5916440486907959, + "logps/chosen": -52.62059783935547, + "logps/rejected": -115.3631591796875, + "loss": 0.5306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.896289825439453, + "rewards/margins": 8.120097160339355, + "rewards/rejected": -5.223807334899902, + "step": 13371 + }, + { + "epoch": 3.35, + "grad_norm": 3.1773433685302734, + "learning_rate": 2.4669467978423563e-06, + "logits/chosen": -0.6031804084777832, + "logits/rejected": -0.7152817845344543, + "logps/chosen": -50.14015197753906, + "logps/rejected": -95.31224060058594, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0010883808135986, + "rewards/margins": 6.718074798583984, + "rewards/rejected": -3.7169861793518066, + "step": 13372 + }, + { + "epoch": 3.35, + "grad_norm": 2.793783664703369, + "learning_rate": 2.466269170269306e-06, + "logits/chosen": -0.5682188868522644, + "logits/rejected": -0.6244062781333923, + "logps/chosen": -52.84979248046875, + "logps/rejected": -116.35040283203125, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.30001163482666, + "rewards/margins": 7.262273788452148, + "rewards/rejected": -3.9622621536254883, + "step": 13373 + }, + { + "epoch": 3.35, + "grad_norm": 5.062503814697266, + "learning_rate": 2.4655916053074417e-06, + "logits/chosen": -0.5674703121185303, + "logits/rejected": -0.645807683467865, + "logps/chosen": -56.31924819946289, + "logps/rejected": -89.96410369873047, + "loss": 0.7209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3709380626678467, + "rewards/margins": 6.818085670471191, + "rewards/rejected": -3.447148084640503, + "step": 13374 + }, + { + "epoch": 3.35, + "grad_norm": 4.366410732269287, + "learning_rate": 2.4649141029735047e-06, + "logits/chosen": -0.572583019733429, + "logits/rejected": -0.6638807058334351, + "logps/chosen": -55.91565704345703, + "logps/rejected": -103.61636352539062, + "loss": 0.6393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.524181365966797, + "rewards/margins": 7.460179805755615, + "rewards/rejected": -3.9359982013702393, + "step": 13375 + }, + { + "epoch": 3.35, + "grad_norm": 10.01718807220459, + "learning_rate": 2.464236663284234e-06, + "logits/chosen": -0.6293167471885681, + "logits/rejected": -0.7027933597564697, + "logps/chosen": -62.03055191040039, + "logps/rejected": -110.75304412841797, + "loss": 0.7913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1392297744750977, + "rewards/margins": 6.4993391036987305, + "rewards/rejected": -3.3601090908050537, + "step": 13376 + }, + { + "epoch": 3.35, + "grad_norm": 6.600399971008301, + "learning_rate": 2.463559286256375e-06, + "logits/chosen": -0.4778243899345398, + "logits/rejected": -0.5578335523605347, + "logps/chosen": -76.9164810180664, + "logps/rejected": -111.40534973144531, + "loss": 0.7505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9394454956054688, + "rewards/margins": 7.684989929199219, + "rewards/rejected": -4.745543956756592, + "step": 13377 + }, + { + "epoch": 3.35, + "grad_norm": 4.888564586639404, + "learning_rate": 2.462881971906664e-06, + "logits/chosen": -0.5510801076889038, + "logits/rejected": -0.6520049571990967, + "logps/chosen": -63.79475784301758, + "logps/rejected": -97.55052947998047, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.764469623565674, + "rewards/margins": 7.227843761444092, + "rewards/rejected": -4.463374614715576, + "step": 13378 + }, + { + "epoch": 3.35, + "grad_norm": 4.5327982902526855, + "learning_rate": 2.462204720251836e-06, + "logits/chosen": -0.5514459013938904, + "logits/rejected": -0.5750459432601929, + "logps/chosen": -59.31974411010742, + "logps/rejected": -109.9446029663086, + "loss": 0.6457, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.054635524749756, + "rewards/margins": 5.296695232391357, + "rewards/rejected": -2.2420597076416016, + "step": 13379 + }, + { + "epoch": 3.35, + "grad_norm": 13.132307052612305, + "learning_rate": 2.461527531308629e-06, + "logits/chosen": -0.5113388299942017, + "logits/rejected": -0.5782049894332886, + "logps/chosen": -53.16880416870117, + "logps/rejected": -126.79570007324219, + "loss": 0.7637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1558475494384766, + "rewards/margins": 7.6925506591796875, + "rewards/rejected": -4.536703109741211, + "step": 13380 + }, + { + "epoch": 3.35, + "grad_norm": 3.1666953563690186, + "learning_rate": 2.4608504050937805e-06, + "logits/chosen": -0.5277314782142639, + "logits/rejected": -0.5753712058067322, + "logps/chosen": -50.664146423339844, + "logps/rejected": -124.33564758300781, + "loss": 0.59, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1761481761932373, + "rewards/margins": 7.709808826446533, + "rewards/rejected": -4.533660888671875, + "step": 13381 + }, + { + "epoch": 3.35, + "grad_norm": 1.8024013042449951, + "learning_rate": 2.4601733416240146e-06, + "logits/chosen": -0.5081796050071716, + "logits/rejected": -0.6363707780838013, + "logps/chosen": -54.86504364013672, + "logps/rejected": -114.24247741699219, + "loss": 0.5631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1864306926727295, + "rewards/margins": 7.854582786560059, + "rewards/rejected": -4.66815185546875, + "step": 13382 + }, + { + "epoch": 3.35, + "grad_norm": 5.007697105407715, + "learning_rate": 2.4594963409160676e-06, + "logits/chosen": -0.5384551882743835, + "logits/rejected": -0.6280018091201782, + "logps/chosen": -60.19898223876953, + "logps/rejected": -91.88297271728516, + "loss": 0.5995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.232112407684326, + "rewards/margins": 6.114582538604736, + "rewards/rejected": -2.882469654083252, + "step": 13383 + }, + { + "epoch": 3.35, + "grad_norm": 3.916611433029175, + "learning_rate": 2.45881940298667e-06, + "logits/chosen": -0.5006592273712158, + "logits/rejected": -0.6143495440483093, + "logps/chosen": -61.90394592285156, + "logps/rejected": -84.06144714355469, + "loss": 0.6591, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9738385677337646, + "rewards/margins": 5.599006175994873, + "rewards/rejected": -2.6251676082611084, + "step": 13384 + }, + { + "epoch": 3.35, + "grad_norm": 4.100768566131592, + "learning_rate": 2.4581425278525484e-06, + "logits/chosen": -0.6221830248832703, + "logits/rejected": -0.6857784986495972, + "logps/chosen": -52.425880432128906, + "logps/rejected": -101.95155334472656, + "loss": 0.5934, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.524890422821045, + "rewards/margins": 7.01779842376709, + "rewards/rejected": -3.492908000946045, + "step": 13385 + }, + { + "epoch": 3.35, + "grad_norm": 3.270927667617798, + "learning_rate": 2.457465715530426e-06, + "logits/chosen": -0.5911169052124023, + "logits/rejected": -0.6187771558761597, + "logps/chosen": -52.8154296875, + "logps/rejected": -117.81501007080078, + "loss": 0.611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.52420711517334, + "rewards/margins": 7.595614910125732, + "rewards/rejected": -4.071408271789551, + "step": 13386 + }, + { + "epoch": 3.35, + "grad_norm": 6.688473701477051, + "learning_rate": 2.4567889660370318e-06, + "logits/chosen": -0.574988842010498, + "logits/rejected": -0.6722770929336548, + "logps/chosen": -54.98295211791992, + "logps/rejected": -111.8333969116211, + "loss": 0.6482, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7950146198272705, + "rewards/margins": 7.645538330078125, + "rewards/rejected": -4.850523471832275, + "step": 13387 + }, + { + "epoch": 3.35, + "grad_norm": 12.156564712524414, + "learning_rate": 2.456112279389086e-06, + "logits/chosen": -0.6141826510429382, + "logits/rejected": -0.7033793926239014, + "logps/chosen": -53.453678131103516, + "logps/rejected": -107.25749206542969, + "loss": 0.7151, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.076577663421631, + "rewards/margins": 6.811834812164307, + "rewards/rejected": -3.7352566719055176, + "step": 13388 + }, + { + "epoch": 3.35, + "grad_norm": 16.364002227783203, + "learning_rate": 2.4554356556033133e-06, + "logits/chosen": -0.5220690369606018, + "logits/rejected": -0.611251175403595, + "logps/chosen": -72.26942443847656, + "logps/rejected": -92.99419403076172, + "loss": 0.7232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.657724142074585, + "rewards/margins": 6.099727153778076, + "rewards/rejected": -3.4420034885406494, + "step": 13389 + }, + { + "epoch": 3.35, + "grad_norm": 4.061253547668457, + "learning_rate": 2.4547590946964324e-06, + "logits/chosen": -0.5738685131072998, + "logits/rejected": -0.6353504657745361, + "logps/chosen": -58.37961959838867, + "logps/rejected": -114.33004760742188, + "loss": 0.5758, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1570215225219727, + "rewards/margins": 8.522659301757812, + "rewards/rejected": -5.36563777923584, + "step": 13390 + }, + { + "epoch": 3.35, + "grad_norm": 8.641063690185547, + "learning_rate": 2.454082596685159e-06, + "logits/chosen": -0.5762666463851929, + "logits/rejected": -0.6257339715957642, + "logps/chosen": -62.95713424682617, + "logps/rejected": -117.86949157714844, + "loss": 0.7507, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.100069999694824, + "rewards/margins": 8.202471733093262, + "rewards/rejected": -5.1024017333984375, + "step": 13391 + }, + { + "epoch": 3.35, + "grad_norm": 11.549725532531738, + "learning_rate": 2.453406161586215e-06, + "logits/chosen": -0.5078790187835693, + "logits/rejected": -0.5697081685066223, + "logps/chosen": -54.47374725341797, + "logps/rejected": -104.17005157470703, + "loss": 0.6422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.98584246635437, + "rewards/margins": 6.441938877105713, + "rewards/rejected": -3.456096649169922, + "step": 13392 + }, + { + "epoch": 3.35, + "grad_norm": 6.5034966468811035, + "learning_rate": 2.452729789416313e-06, + "logits/chosen": -0.6295667290687561, + "logits/rejected": -0.6571074724197388, + "logps/chosen": -44.865867614746094, + "logps/rejected": -111.51332092285156, + "loss": 0.5799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3838367462158203, + "rewards/margins": 6.669279098510742, + "rewards/rejected": -3.285442590713501, + "step": 13393 + }, + { + "epoch": 3.35, + "grad_norm": 3.923250436782837, + "learning_rate": 2.452053480192166e-06, + "logits/chosen": -0.5663461089134216, + "logits/rejected": -0.63618403673172, + "logps/chosen": -61.43531799316406, + "logps/rejected": -95.60039520263672, + "loss": 0.6765, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.24552059173584, + "rewards/margins": 6.99176025390625, + "rewards/rejected": -3.746238946914673, + "step": 13394 + }, + { + "epoch": 3.35, + "grad_norm": 5.237586498260498, + "learning_rate": 2.451377233930487e-06, + "logits/chosen": -0.5745037198066711, + "logits/rejected": -0.633187472820282, + "logps/chosen": -55.7285270690918, + "logps/rejected": -123.66124725341797, + "loss": 0.6076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.206234931945801, + "rewards/margins": 7.106882095336914, + "rewards/rejected": -3.9006471633911133, + "step": 13395 + }, + { + "epoch": 3.35, + "grad_norm": 4.748563766479492, + "learning_rate": 2.4507010506479915e-06, + "logits/chosen": -0.5156964063644409, + "logits/rejected": -0.5843973755836487, + "logps/chosen": -56.86454772949219, + "logps/rejected": -100.98434448242188, + "loss": 0.6506, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1871960163116455, + "rewards/margins": 6.690630912780762, + "rewards/rejected": -3.503434419631958, + "step": 13396 + }, + { + "epoch": 3.35, + "grad_norm": 3.2667832374572754, + "learning_rate": 2.4500249303613805e-06, + "logits/chosen": -0.5378979444503784, + "logits/rejected": -0.6239395141601562, + "logps/chosen": -58.120849609375, + "logps/rejected": -115.87672424316406, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1158010959625244, + "rewards/margins": 7.4131245613098145, + "rewards/rejected": -4.297323703765869, + "step": 13397 + }, + { + "epoch": 3.35, + "grad_norm": 4.68417501449585, + "learning_rate": 2.449348873087366e-06, + "logits/chosen": -0.4900245666503906, + "logits/rejected": -0.5619114637374878, + "logps/chosen": -55.28981018066406, + "logps/rejected": -103.79790496826172, + "loss": 0.5648, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0338196754455566, + "rewards/margins": 6.565238952636719, + "rewards/rejected": -3.531419038772583, + "step": 13398 + }, + { + "epoch": 3.35, + "grad_norm": 6.834243297576904, + "learning_rate": 2.4486728788426576e-06, + "logits/chosen": -0.6223080158233643, + "logits/rejected": -0.7105468511581421, + "logps/chosen": -56.219276428222656, + "logps/rejected": -97.18734741210938, + "loss": 0.6914, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.16469669342041, + "rewards/margins": 7.462185859680176, + "rewards/rejected": -4.297489166259766, + "step": 13399 + }, + { + "epoch": 3.35, + "grad_norm": 3.1380395889282227, + "learning_rate": 2.4479969476439523e-06, + "logits/chosen": -0.5937002897262573, + "logits/rejected": -0.6675192713737488, + "logps/chosen": -57.54850387573242, + "logps/rejected": -109.32553100585938, + "loss": 0.6001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3148906230926514, + "rewards/margins": 7.561544418334961, + "rewards/rejected": -4.2466535568237305, + "step": 13400 + }, + { + "epoch": 3.35, + "grad_norm": 5.237860679626465, + "learning_rate": 2.4473210795079574e-06, + "logits/chosen": -0.49077755212783813, + "logits/rejected": -0.5689928531646729, + "logps/chosen": -56.409828186035156, + "logps/rejected": -121.6962890625, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.932704210281372, + "rewards/margins": 7.179835319519043, + "rewards/rejected": -4.247131824493408, + "step": 13401 + }, + { + "epoch": 3.35, + "grad_norm": 11.705838203430176, + "learning_rate": 2.4466452744513747e-06, + "logits/chosen": -0.5363695025444031, + "logits/rejected": -0.610927402973175, + "logps/chosen": -64.589111328125, + "logps/rejected": -111.2536392211914, + "loss": 0.7698, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6373136043548584, + "rewards/margins": 5.935262203216553, + "rewards/rejected": -3.2979490756988525, + "step": 13402 + }, + { + "epoch": 3.35, + "grad_norm": 3.8055031299591064, + "learning_rate": 2.4459695324909034e-06, + "logits/chosen": -0.5145458579063416, + "logits/rejected": -0.6428568959236145, + "logps/chosen": -66.453857421875, + "logps/rejected": -99.00086975097656, + "loss": 0.5768, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.237860679626465, + "rewards/margins": 7.035962104797363, + "rewards/rejected": -3.798100709915161, + "step": 13403 + }, + { + "epoch": 3.35, + "grad_norm": 2.907792091369629, + "learning_rate": 2.4452938536432395e-06, + "logits/chosen": -0.5544626712799072, + "logits/rejected": -0.6687281131744385, + "logps/chosen": -52.054107666015625, + "logps/rejected": -117.065185546875, + "loss": 0.5603, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1477842330932617, + "rewards/margins": 9.121591567993164, + "rewards/rejected": -5.973806381225586, + "step": 13404 + }, + { + "epoch": 3.35, + "grad_norm": 2.5145063400268555, + "learning_rate": 2.4446182379250843e-06, + "logits/chosen": -0.5179659724235535, + "logits/rejected": -0.5876924395561218, + "logps/chosen": -58.87885665893555, + "logps/rejected": -92.900146484375, + "loss": 0.6122, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.31736421585083, + "rewards/margins": 6.5479841232299805, + "rewards/rejected": -3.2306201457977295, + "step": 13405 + }, + { + "epoch": 3.35, + "grad_norm": 3.527545928955078, + "learning_rate": 2.44394268535313e-06, + "logits/chosen": -0.5496288537979126, + "logits/rejected": -0.5897338390350342, + "logps/chosen": -48.76397705078125, + "logps/rejected": -110.12003326416016, + "loss": 0.5591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2554030418395996, + "rewards/margins": 6.676806449890137, + "rewards/rejected": -3.4214038848876953, + "step": 13406 + }, + { + "epoch": 3.35, + "grad_norm": 10.183368682861328, + "learning_rate": 2.443267195944069e-06, + "logits/chosen": -0.6189812421798706, + "logits/rejected": -0.6777582764625549, + "logps/chosen": -53.115577697753906, + "logps/rejected": -117.24142456054688, + "loss": 0.7097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.978628158569336, + "rewards/margins": 7.665612697601318, + "rewards/rejected": -4.686985015869141, + "step": 13407 + }, + { + "epoch": 3.35, + "grad_norm": 4.088181018829346, + "learning_rate": 2.4425917697145966e-06, + "logits/chosen": -0.6497191190719604, + "logits/rejected": -0.6768982410430908, + "logps/chosen": -52.853084564208984, + "logps/rejected": -128.97471618652344, + "loss": 0.6609, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.945309638977051, + "rewards/margins": 7.088796138763428, + "rewards/rejected": -4.143486976623535, + "step": 13408 + }, + { + "epoch": 3.35, + "grad_norm": 2.5347180366516113, + "learning_rate": 2.4419164066814006e-06, + "logits/chosen": -0.5578633546829224, + "logits/rejected": -0.6861627101898193, + "logps/chosen": -61.91877746582031, + "logps/rejected": -93.93653869628906, + "loss": 0.5697, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9712748527526855, + "rewards/margins": 7.145814895629883, + "rewards/rejected": -4.174539566040039, + "step": 13409 + }, + { + "epoch": 3.35, + "grad_norm": 3.895096778869629, + "learning_rate": 2.4412411068611723e-06, + "logits/chosen": -0.44894713163375854, + "logits/rejected": -0.5388088822364807, + "logps/chosen": -64.68035888671875, + "logps/rejected": -94.35755920410156, + "loss": 0.6284, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.379451274871826, + "rewards/margins": 5.463575839996338, + "rewards/rejected": -2.0841240882873535, + "step": 13410 + }, + { + "epoch": 3.35, + "grad_norm": 4.312075614929199, + "learning_rate": 2.4405658702705983e-06, + "logits/chosen": -0.5216034054756165, + "logits/rejected": -0.6014151573181152, + "logps/chosen": -83.15794372558594, + "logps/rejected": -99.04493713378906, + "loss": 0.6962, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.285362958908081, + "rewards/margins": 5.199228286743164, + "rewards/rejected": -1.9138652086257935, + "step": 13411 + }, + { + "epoch": 3.36, + "grad_norm": 11.529125213623047, + "learning_rate": 2.439890696926362e-06, + "logits/chosen": -0.47716760635375977, + "logits/rejected": -0.5367243885993958, + "logps/chosen": -55.02552032470703, + "logps/rejected": -104.8823013305664, + "loss": 0.633, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2326250076293945, + "rewards/margins": 6.152348518371582, + "rewards/rejected": -2.9197235107421875, + "step": 13412 + }, + { + "epoch": 3.36, + "grad_norm": 4.803969860076904, + "learning_rate": 2.4392155868451494e-06, + "logits/chosen": -0.573783814907074, + "logits/rejected": -0.6761690378189087, + "logps/chosen": -52.02627182006836, + "logps/rejected": -111.98653411865234, + "loss": 0.5831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3168833255767822, + "rewards/margins": 9.012738227844238, + "rewards/rejected": -5.695854663848877, + "step": 13413 + }, + { + "epoch": 3.36, + "grad_norm": 2.1398937702178955, + "learning_rate": 2.438540540043645e-06, + "logits/chosen": -0.5803415775299072, + "logits/rejected": -0.6228852868080139, + "logps/chosen": -50.22223663330078, + "logps/rejected": -90.82025909423828, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4572010040283203, + "rewards/margins": 7.195856094360352, + "rewards/rejected": -3.7386550903320312, + "step": 13414 + }, + { + "epoch": 3.36, + "grad_norm": 4.101904392242432, + "learning_rate": 2.437865556538529e-06, + "logits/chosen": -0.5416598320007324, + "logits/rejected": -0.6437273025512695, + "logps/chosen": -73.32487487792969, + "logps/rejected": -117.78286743164062, + "loss": 0.6447, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.352442502975464, + "rewards/margins": 8.17009449005127, + "rewards/rejected": -4.817652225494385, + "step": 13415 + }, + { + "epoch": 3.36, + "grad_norm": 8.950174331665039, + "learning_rate": 2.437190636346478e-06, + "logits/chosen": -0.5537575483322144, + "logits/rejected": -0.6399754285812378, + "logps/chosen": -60.18560028076172, + "logps/rejected": -106.75012969970703, + "loss": 0.7045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2710154056549072, + "rewards/margins": 7.427422046661377, + "rewards/rejected": -4.156406402587891, + "step": 13416 + }, + { + "epoch": 3.36, + "grad_norm": 3.480219602584839, + "learning_rate": 2.4365157794841742e-06, + "logits/chosen": -0.5874501466751099, + "logits/rejected": -0.659666895866394, + "logps/chosen": -67.50110626220703, + "logps/rejected": -112.08392333984375, + "loss": 0.7532, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9551751613616943, + "rewards/margins": 7.083867073059082, + "rewards/rejected": -4.128692626953125, + "step": 13417 + }, + { + "epoch": 3.36, + "grad_norm": 4.025023460388184, + "learning_rate": 2.4358409859682924e-06, + "logits/chosen": -0.5420278310775757, + "logits/rejected": -0.6480549573898315, + "logps/chosen": -57.59315490722656, + "logps/rejected": -103.62824249267578, + "loss": 0.5996, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1258034706115723, + "rewards/margins": 7.81201171875, + "rewards/rejected": -4.686207294464111, + "step": 13418 + }, + { + "epoch": 3.36, + "grad_norm": 5.799195289611816, + "learning_rate": 2.435166255815505e-06, + "logits/chosen": -0.5816981792449951, + "logits/rejected": -0.6374225616455078, + "logps/chosen": -56.68595886230469, + "logps/rejected": -115.01311492919922, + "loss": 0.6034, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3592283725738525, + "rewards/margins": 7.356685638427734, + "rewards/rejected": -3.9974565505981445, + "step": 13419 + }, + { + "epoch": 3.36, + "grad_norm": 5.769659996032715, + "learning_rate": 2.43449158904249e-06, + "logits/chosen": -0.5777308344841003, + "logits/rejected": -0.7068573236465454, + "logps/chosen": -56.69120788574219, + "logps/rejected": -99.67610931396484, + "loss": 0.6037, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.201777458190918, + "rewards/margins": 6.756988048553467, + "rewards/rejected": -3.5552101135253906, + "step": 13420 + }, + { + "epoch": 3.36, + "grad_norm": 6.829748153686523, + "learning_rate": 2.433816985665916e-06, + "logits/chosen": -0.5423317551612854, + "logits/rejected": -0.6113121509552002, + "logps/chosen": -61.25827407836914, + "logps/rejected": -113.06828308105469, + "loss": 0.6712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9954092502593994, + "rewards/margins": 7.113390922546387, + "rewards/rejected": -4.117981910705566, + "step": 13421 + }, + { + "epoch": 3.36, + "grad_norm": 4.329436302185059, + "learning_rate": 2.4331424457024522e-06, + "logits/chosen": -0.5038137435913086, + "logits/rejected": -0.6251533627510071, + "logps/chosen": -57.4387092590332, + "logps/rejected": -92.05742645263672, + "loss": 0.5334, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0722577571868896, + "rewards/margins": 6.883177280426025, + "rewards/rejected": -3.810919761657715, + "step": 13422 + }, + { + "epoch": 3.36, + "grad_norm": 2.1565914154052734, + "learning_rate": 2.4324679691687706e-06, + "logits/chosen": -0.5950242877006531, + "logits/rejected": -0.7006224989891052, + "logps/chosen": -65.2601089477539, + "logps/rejected": -107.135498046875, + "loss": 0.6419, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.993842124938965, + "rewards/margins": 7.408575534820557, + "rewards/rejected": -4.414732933044434, + "step": 13423 + }, + { + "epoch": 3.36, + "grad_norm": 6.489982604980469, + "learning_rate": 2.4317935560815363e-06, + "logits/chosen": -0.5106757283210754, + "logits/rejected": -0.628045916557312, + "logps/chosen": -51.94145965576172, + "logps/rejected": -98.68402862548828, + "loss": 0.6292, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.169227123260498, + "rewards/margins": 7.015773296356201, + "rewards/rejected": -3.8465466499328613, + "step": 13424 + }, + { + "epoch": 3.36, + "grad_norm": 3.0825765132904053, + "learning_rate": 2.4311192064574134e-06, + "logits/chosen": -0.5232099890708923, + "logits/rejected": -0.6084962487220764, + "logps/chosen": -59.30222702026367, + "logps/rejected": -108.4975357055664, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.245807409286499, + "rewards/margins": 6.874971866607666, + "rewards/rejected": -3.6291635036468506, + "step": 13425 + }, + { + "epoch": 3.36, + "grad_norm": 8.533527374267578, + "learning_rate": 2.430444920313067e-06, + "logits/chosen": -0.5191249847412109, + "logits/rejected": -0.5988757014274597, + "logps/chosen": -53.36988067626953, + "logps/rejected": -117.72286224365234, + "loss": 0.5436, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1437158584594727, + "rewards/margins": 8.115889549255371, + "rewards/rejected": -4.972173690795898, + "step": 13426 + }, + { + "epoch": 3.36, + "grad_norm": 8.34190845489502, + "learning_rate": 2.4297706976651618e-06, + "logits/chosen": -0.5787855982780457, + "logits/rejected": -0.6057615280151367, + "logps/chosen": -57.39128494262695, + "logps/rejected": -106.78821563720703, + "loss": 0.8113, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.388026237487793, + "rewards/margins": 5.727778911590576, + "rewards/rejected": -2.339752674102783, + "step": 13427 + }, + { + "epoch": 3.36, + "grad_norm": 6.568599700927734, + "learning_rate": 2.4290965385303562e-06, + "logits/chosen": -0.5708662271499634, + "logits/rejected": -0.6218467950820923, + "logps/chosen": -51.134159088134766, + "logps/rejected": -96.24211120605469, + "loss": 0.7029, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3411834239959717, + "rewards/margins": 6.538273811340332, + "rewards/rejected": -3.1970903873443604, + "step": 13428 + }, + { + "epoch": 3.36, + "grad_norm": 11.24953556060791, + "learning_rate": 2.428422442925308e-06, + "logits/chosen": -0.5182493329048157, + "logits/rejected": -0.5876368880271912, + "logps/chosen": -62.453739166259766, + "logps/rejected": -112.5029296875, + "loss": 0.6394, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.715275287628174, + "rewards/margins": 6.628838539123535, + "rewards/rejected": -3.9135637283325195, + "step": 13429 + }, + { + "epoch": 3.36, + "grad_norm": 7.851250171661377, + "learning_rate": 2.4277484108666783e-06, + "logits/chosen": -0.5443986058235168, + "logits/rejected": -0.5820070505142212, + "logps/chosen": -52.42031478881836, + "logps/rejected": -121.62930297851562, + "loss": 0.7024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3513803482055664, + "rewards/margins": 7.209746837615967, + "rewards/rejected": -3.858366012573242, + "step": 13430 + }, + { + "epoch": 3.36, + "grad_norm": 2.978713274002075, + "learning_rate": 2.4270744423711214e-06, + "logits/chosen": -0.5361406803131104, + "logits/rejected": -0.6489979028701782, + "logps/chosen": -52.3463020324707, + "logps/rejected": -100.34770202636719, + "loss": 0.6027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2498939037323, + "rewards/margins": 7.070559024810791, + "rewards/rejected": -3.820664882659912, + "step": 13431 + }, + { + "epoch": 3.36, + "grad_norm": 5.719888687133789, + "learning_rate": 2.4264005374552894e-06, + "logits/chosen": -0.516156017780304, + "logits/rejected": -0.5617958903312683, + "logps/chosen": -49.07699203491211, + "logps/rejected": -94.97553253173828, + "loss": 0.6326, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1262667179107666, + "rewards/margins": 5.967455863952637, + "rewards/rejected": -2.841189384460449, + "step": 13432 + }, + { + "epoch": 3.36, + "grad_norm": 6.601708889007568, + "learning_rate": 2.42572669613584e-06, + "logits/chosen": -0.5554707050323486, + "logits/rejected": -0.5989320874214172, + "logps/chosen": -64.05017852783203, + "logps/rejected": -95.79722595214844, + "loss": 0.7228, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1913630962371826, + "rewards/margins": 5.552942276000977, + "rewards/rejected": -2.361578941345215, + "step": 13433 + }, + { + "epoch": 3.36, + "grad_norm": 5.267703533172607, + "learning_rate": 2.4250529184294203e-06, + "logits/chosen": -0.6015467643737793, + "logits/rejected": -0.7272327542304993, + "logps/chosen": -51.9049072265625, + "logps/rejected": -114.43363952636719, + "loss": 0.6018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0607969760894775, + "rewards/margins": 7.4777936935424805, + "rewards/rejected": -4.416996479034424, + "step": 13434 + }, + { + "epoch": 3.36, + "grad_norm": 15.144423484802246, + "learning_rate": 2.4243792043526833e-06, + "logits/chosen": -0.5888679027557373, + "logits/rejected": -0.6725809574127197, + "logps/chosen": -59.372314453125, + "logps/rejected": -96.25717163085938, + "loss": 0.7933, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1736364364624023, + "rewards/margins": 5.742544174194336, + "rewards/rejected": -2.5689072608947754, + "step": 13435 + }, + { + "epoch": 3.36, + "grad_norm": 6.603369235992432, + "learning_rate": 2.423705553922275e-06, + "logits/chosen": -0.5168647766113281, + "logits/rejected": -0.5911222696304321, + "logps/chosen": -54.9791259765625, + "logps/rejected": -88.075439453125, + "loss": 0.5764, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5344855785369873, + "rewards/margins": 6.231995582580566, + "rewards/rejected": -2.697510242462158, + "step": 13436 + }, + { + "epoch": 3.36, + "grad_norm": 5.5151824951171875, + "learning_rate": 2.423031967154842e-06, + "logits/chosen": -0.5196565985679626, + "logits/rejected": -0.6097320318222046, + "logps/chosen": -56.781795501708984, + "logps/rejected": -106.39258575439453, + "loss": 0.6877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1701319217681885, + "rewards/margins": 7.372881889343262, + "rewards/rejected": -4.202750205993652, + "step": 13437 + }, + { + "epoch": 3.36, + "grad_norm": 6.9211506843566895, + "learning_rate": 2.42235844406703e-06, + "logits/chosen": -0.5067219734191895, + "logits/rejected": -0.5786961317062378, + "logps/chosen": -57.15580749511719, + "logps/rejected": -95.03904724121094, + "loss": 0.6462, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.116821527481079, + "rewards/margins": 6.201764106750488, + "rewards/rejected": -3.0849430561065674, + "step": 13438 + }, + { + "epoch": 3.36, + "grad_norm": 2.9034457206726074, + "learning_rate": 2.4216849846754857e-06, + "logits/chosen": -0.5200121998786926, + "logits/rejected": -0.632142961025238, + "logps/chosen": -53.56660842895508, + "logps/rejected": -92.98658752441406, + "loss": 0.5747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0967600345611572, + "rewards/margins": 6.922738552093506, + "rewards/rejected": -3.8259782791137695, + "step": 13439 + }, + { + "epoch": 3.36, + "grad_norm": 5.3585686683654785, + "learning_rate": 2.4210115889968446e-06, + "logits/chosen": -0.6612354516983032, + "logits/rejected": -0.7254615426063538, + "logps/chosen": -53.41231918334961, + "logps/rejected": -95.00079345703125, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176406145095825, + "rewards/margins": 6.0948028564453125, + "rewards/rejected": -2.9183969497680664, + "step": 13440 + }, + { + "epoch": 3.36, + "grad_norm": 11.356188774108887, + "learning_rate": 2.4203382570477495e-06, + "logits/chosen": -0.5682942271232605, + "logits/rejected": -0.6395294666290283, + "logps/chosen": -54.00099182128906, + "logps/rejected": -135.1448516845703, + "loss": 0.6042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.688512086868286, + "rewards/margins": 8.008071899414062, + "rewards/rejected": -5.3195600509643555, + "step": 13441 + }, + { + "epoch": 3.36, + "grad_norm": 5.490847110748291, + "learning_rate": 2.419664988844845e-06, + "logits/chosen": -0.5774551033973694, + "logits/rejected": -0.6443687677383423, + "logps/chosen": -47.926090240478516, + "logps/rejected": -105.84980010986328, + "loss": 0.6458, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1044373512268066, + "rewards/margins": 6.316153526306152, + "rewards/rejected": -3.2117156982421875, + "step": 13442 + }, + { + "epoch": 3.36, + "grad_norm": 6.212146282196045, + "learning_rate": 2.4189917844047584e-06, + "logits/chosen": -0.577128529548645, + "logits/rejected": -0.6250584721565247, + "logps/chosen": -63.62076950073242, + "logps/rejected": -108.81676483154297, + "loss": 0.6887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0610597133636475, + "rewards/margins": 5.965963363647461, + "rewards/rejected": -2.9049031734466553, + "step": 13443 + }, + { + "epoch": 3.36, + "grad_norm": 17.228696823120117, + "learning_rate": 2.4183186437441305e-06, + "logits/chosen": -0.49073636531829834, + "logits/rejected": -0.519920289516449, + "logps/chosen": -49.879905700683594, + "logps/rejected": -108.4545669555664, + "loss": 0.6242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9243533611297607, + "rewards/margins": 6.9472856521606445, + "rewards/rejected": -4.022932052612305, + "step": 13444 + }, + { + "epoch": 3.36, + "grad_norm": 1.4874978065490723, + "learning_rate": 2.417645566879596e-06, + "logits/chosen": -0.5463076829910278, + "logits/rejected": -0.6118228435516357, + "logps/chosen": -42.219573974609375, + "logps/rejected": -126.96125030517578, + "loss": 0.4929, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3612611293792725, + "rewards/margins": 8.392086029052734, + "rewards/rejected": -5.030824661254883, + "step": 13445 + }, + { + "epoch": 3.36, + "grad_norm": 8.322216033935547, + "learning_rate": 2.4169725538277873e-06, + "logits/chosen": -0.4987730085849762, + "logits/rejected": -0.5435953140258789, + "logps/chosen": -54.6138916015625, + "logps/rejected": -106.33396911621094, + "loss": 0.6671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8516054153442383, + "rewards/margins": 5.926753044128418, + "rewards/rejected": -3.0751471519470215, + "step": 13446 + }, + { + "epoch": 3.36, + "grad_norm": 5.120053291320801, + "learning_rate": 2.4162996046053316e-06, + "logits/chosen": -0.4926508367061615, + "logits/rejected": -0.6028130054473877, + "logps/chosen": -56.58110046386719, + "logps/rejected": -114.65105438232422, + "loss": 0.57, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.258129119873047, + "rewards/margins": 8.302909851074219, + "rewards/rejected": -5.044780731201172, + "step": 13447 + }, + { + "epoch": 3.36, + "grad_norm": 4.739470481872559, + "learning_rate": 2.4156267192288635e-06, + "logits/chosen": -0.5424290895462036, + "logits/rejected": -0.5639081001281738, + "logps/chosen": -51.3541259765625, + "logps/rejected": -95.34797668457031, + "loss": 0.712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8802566528320312, + "rewards/margins": 5.760423183441162, + "rewards/rejected": -2.880166530609131, + "step": 13448 + }, + { + "epoch": 3.36, + "grad_norm": 3.2804148197174072, + "learning_rate": 2.4149538977150073e-06, + "logits/chosen": -0.6012610197067261, + "logits/rejected": -0.6666956543922424, + "logps/chosen": -51.19544219970703, + "logps/rejected": -104.00148010253906, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.431061267852783, + "rewards/margins": 6.805004119873047, + "rewards/rejected": -3.3739428520202637, + "step": 13449 + }, + { + "epoch": 3.36, + "grad_norm": 3.9538023471832275, + "learning_rate": 2.414281140080388e-06, + "logits/chosen": -0.579046905040741, + "logits/rejected": -0.6322205066680908, + "logps/chosen": -47.688148498535156, + "logps/rejected": -93.42838287353516, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3436648845672607, + "rewards/margins": 6.986578464508057, + "rewards/rejected": -3.642913341522217, + "step": 13450 + }, + { + "epoch": 3.36, + "grad_norm": 3.4855597019195557, + "learning_rate": 2.4136084463416342e-06, + "logits/chosen": -0.5480948090553284, + "logits/rejected": -0.6209450364112854, + "logps/chosen": -45.24673080444336, + "logps/rejected": -122.87786865234375, + "loss": 0.6119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.463819742202759, + "rewards/margins": 8.20930004119873, + "rewards/rejected": -4.745479583740234, + "step": 13451 + }, + { + "epoch": 3.37, + "grad_norm": 4.6584601402282715, + "learning_rate": 2.412935816515365e-06, + "logits/chosen": -0.5882943272590637, + "logits/rejected": -0.6736303567886353, + "logps/chosen": -64.41767120361328, + "logps/rejected": -109.86113739013672, + "loss": 0.6785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5973751544952393, + "rewards/margins": 7.930174827575684, + "rewards/rejected": -4.332799911499023, + "step": 13452 + }, + { + "epoch": 3.37, + "grad_norm": 6.110694885253906, + "learning_rate": 2.4122632506182054e-06, + "logits/chosen": -0.5886568427085876, + "logits/rejected": -0.6440590620040894, + "logps/chosen": -47.61070251464844, + "logps/rejected": -100.23869323730469, + "loss": 0.6206, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0382847785949707, + "rewards/margins": 5.682629108428955, + "rewards/rejected": -2.6443448066711426, + "step": 13453 + }, + { + "epoch": 3.37, + "grad_norm": 4.43470573425293, + "learning_rate": 2.4115907486667727e-06, + "logits/chosen": -0.5545409917831421, + "logits/rejected": -0.643600344657898, + "logps/chosen": -45.568504333496094, + "logps/rejected": -96.58236694335938, + "loss": 0.6144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1697754859924316, + "rewards/margins": 7.323060989379883, + "rewards/rejected": -4.153285503387451, + "step": 13454 + }, + { + "epoch": 3.37, + "grad_norm": 3.8286280632019043, + "learning_rate": 2.4109183106776846e-06, + "logits/chosen": -0.530083417892456, + "logits/rejected": -0.6528289914131165, + "logps/chosen": -54.45954513549805, + "logps/rejected": -98.37545013427734, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023003339767456, + "rewards/margins": 7.667583465576172, + "rewards/rejected": -4.6445794105529785, + "step": 13455 + }, + { + "epoch": 3.37, + "grad_norm": 4.610793590545654, + "learning_rate": 2.4102459366675602e-06, + "logits/chosen": -0.5908483862876892, + "logits/rejected": -0.6867422461509705, + "logps/chosen": -63.283817291259766, + "logps/rejected": -99.08135986328125, + "loss": 0.7272, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.697232484817505, + "rewards/margins": 5.9230451583862305, + "rewards/rejected": -3.2258121967315674, + "step": 13456 + }, + { + "epoch": 3.37, + "grad_norm": 2.7018380165100098, + "learning_rate": 2.4095736266530116e-06, + "logits/chosen": -0.5535826683044434, + "logits/rejected": -0.6355710029602051, + "logps/chosen": -74.53330993652344, + "logps/rejected": -107.17294311523438, + "loss": 0.6048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.025879383087158, + "rewards/margins": 7.884981155395508, + "rewards/rejected": -4.85910177230835, + "step": 13457 + }, + { + "epoch": 3.37, + "grad_norm": 6.623183727264404, + "learning_rate": 2.4089013806506563e-06, + "logits/chosen": -0.557289183139801, + "logits/rejected": -0.6216702461242676, + "logps/chosen": -55.83420181274414, + "logps/rejected": -84.31048583984375, + "loss": 0.8114, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8373830318450928, + "rewards/margins": 5.464349269866943, + "rewards/rejected": -2.626966953277588, + "step": 13458 + }, + { + "epoch": 3.37, + "grad_norm": 4.237208366394043, + "learning_rate": 2.408229198677101e-06, + "logits/chosen": -0.6403590440750122, + "logits/rejected": -0.7515833377838135, + "logps/chosen": -52.071075439453125, + "logps/rejected": -114.08657836914062, + "loss": 0.6257, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1131765842437744, + "rewards/margins": 8.227081298828125, + "rewards/rejected": -5.113905429840088, + "step": 13459 + }, + { + "epoch": 3.37, + "grad_norm": 3.457388401031494, + "learning_rate": 2.407557080748962e-06, + "logits/chosen": -0.5542185306549072, + "logits/rejected": -0.6605425477027893, + "logps/chosen": -57.62444305419922, + "logps/rejected": -105.15460205078125, + "loss": 0.6783, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8956172466278076, + "rewards/margins": 7.011043071746826, + "rewards/rejected": -4.1154255867004395, + "step": 13460 + }, + { + "epoch": 3.37, + "grad_norm": 6.935020446777344, + "learning_rate": 2.406885026882844e-06, + "logits/chosen": -0.5231775045394897, + "logits/rejected": -0.6688430905342102, + "logps/chosen": -57.58422088623047, + "logps/rejected": -91.18206024169922, + "loss": 0.69, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.056791305541992, + "rewards/margins": 7.215002059936523, + "rewards/rejected": -4.158210277557373, + "step": 13461 + }, + { + "epoch": 3.37, + "grad_norm": 2.752260208129883, + "learning_rate": 2.406213037095354e-06, + "logits/chosen": -0.47759297490119934, + "logits/rejected": -0.5732274651527405, + "logps/chosen": -51.34103775024414, + "logps/rejected": -94.29371643066406, + "loss": 0.5628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1516947746276855, + "rewards/margins": 7.368678569793701, + "rewards/rejected": -4.216983795166016, + "step": 13462 + }, + { + "epoch": 3.37, + "grad_norm": 3.7958455085754395, + "learning_rate": 2.4055411114031003e-06, + "logits/chosen": -0.4997348189353943, + "logits/rejected": -0.596418559551239, + "logps/chosen": -68.15482330322266, + "logps/rejected": -96.3007583618164, + "loss": 0.6341, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9588963985443115, + "rewards/margins": 6.123352527618408, + "rewards/rejected": -3.1644554138183594, + "step": 13463 + }, + { + "epoch": 3.37, + "grad_norm": 34.703712463378906, + "learning_rate": 2.404869249822685e-06, + "logits/chosen": -0.5414984822273254, + "logits/rejected": -0.605991542339325, + "logps/chosen": -63.029579162597656, + "logps/rejected": -90.34930419921875, + "loss": 0.8323, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.903975009918213, + "rewards/margins": 4.538016319274902, + "rewards/rejected": -1.6340411901474, + "step": 13464 + }, + { + "epoch": 3.37, + "grad_norm": 6.964960098266602, + "learning_rate": 2.4041974523707098e-06, + "logits/chosen": -0.5744565725326538, + "logits/rejected": -0.6598711609840393, + "logps/chosen": -56.5694465637207, + "logps/rejected": -106.98270416259766, + "loss": 0.699, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0167360305786133, + "rewards/margins": 5.258291244506836, + "rewards/rejected": -2.2415552139282227, + "step": 13465 + }, + { + "epoch": 3.37, + "grad_norm": 4.257961273193359, + "learning_rate": 2.403525719063778e-06, + "logits/chosen": -0.48295891284942627, + "logits/rejected": -0.5301406383514404, + "logps/chosen": -39.50775146484375, + "logps/rejected": -113.2514877319336, + "loss": 0.5674, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2354888916015625, + "rewards/margins": 7.587067604064941, + "rewards/rejected": -4.351579666137695, + "step": 13466 + }, + { + "epoch": 3.37, + "grad_norm": 2.0606930255889893, + "learning_rate": 2.4028540499184873e-06, + "logits/chosen": -0.6003563404083252, + "logits/rejected": -0.7046060562133789, + "logps/chosen": -54.14438247680664, + "logps/rejected": -104.70580291748047, + "loss": 0.5584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989999294281006, + "rewards/margins": 8.384248733520508, + "rewards/rejected": -5.394250869750977, + "step": 13467 + }, + { + "epoch": 3.37, + "grad_norm": 23.927160263061523, + "learning_rate": 2.4021824449514335e-06, + "logits/chosen": -0.5690562725067139, + "logits/rejected": -0.7006552815437317, + "logps/chosen": -60.503013610839844, + "logps/rejected": -98.06105041503906, + "loss": 0.6977, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.754549026489258, + "rewards/margins": 7.431380748748779, + "rewards/rejected": -4.67683219909668, + "step": 13468 + }, + { + "epoch": 3.37, + "grad_norm": 6.676271915435791, + "learning_rate": 2.4015109041792143e-06, + "logits/chosen": -0.5083488821983337, + "logits/rejected": -0.5644669532775879, + "logps/chosen": -54.50062561035156, + "logps/rejected": -107.98643493652344, + "loss": 0.6076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9276092052459717, + "rewards/margins": 6.595531463623047, + "rewards/rejected": -3.6679224967956543, + "step": 13469 + }, + { + "epoch": 3.37, + "grad_norm": 3.638753890991211, + "learning_rate": 2.400839427618427e-06, + "logits/chosen": -0.5513557195663452, + "logits/rejected": -0.6206103563308716, + "logps/chosen": -49.65694808959961, + "logps/rejected": -115.98690795898438, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.174271583557129, + "rewards/margins": 6.967929840087891, + "rewards/rejected": -3.7936580181121826, + "step": 13470 + }, + { + "epoch": 3.37, + "grad_norm": 3.2930331230163574, + "learning_rate": 2.4001680152856623e-06, + "logits/chosen": -0.6306321620941162, + "logits/rejected": -0.7117425203323364, + "logps/chosen": -53.093170166015625, + "logps/rejected": -106.93379211425781, + "loss": 0.5702, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.851578950881958, + "rewards/margins": 7.192414283752441, + "rewards/rejected": -4.340835094451904, + "step": 13471 + }, + { + "epoch": 3.37, + "grad_norm": 3.390589714050293, + "learning_rate": 2.399496667197509e-06, + "logits/chosen": -0.6090613007545471, + "logits/rejected": -0.679530918598175, + "logps/chosen": -57.06117248535156, + "logps/rejected": -107.29315185546875, + "loss": 0.6347, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2507755756378174, + "rewards/margins": 7.510488510131836, + "rewards/rejected": -4.259712219238281, + "step": 13472 + }, + { + "epoch": 3.37, + "grad_norm": 3.660059690475464, + "learning_rate": 2.398825383370561e-06, + "logits/chosen": -0.5554615259170532, + "logits/rejected": -0.6278589367866516, + "logps/chosen": -51.476348876953125, + "logps/rejected": -103.41561889648438, + "loss": 0.6061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.076709032058716, + "rewards/margins": 6.889115333557129, + "rewards/rejected": -3.812406063079834, + "step": 13473 + }, + { + "epoch": 3.37, + "grad_norm": 4.229282379150391, + "learning_rate": 2.3981541638214047e-06, + "logits/chosen": -0.5465081334114075, + "logits/rejected": -0.6285936236381531, + "logps/chosen": -56.47669982910156, + "logps/rejected": -100.22451782226562, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0598392486572266, + "rewards/margins": 7.518647193908691, + "rewards/rejected": -4.458807945251465, + "step": 13474 + }, + { + "epoch": 3.37, + "grad_norm": 27.7365665435791, + "learning_rate": 2.397483008566624e-06, + "logits/chosen": -0.4464481770992279, + "logits/rejected": -0.5539883375167847, + "logps/chosen": -60.87803268432617, + "logps/rejected": -110.60968780517578, + "loss": 0.7422, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0123918056488037, + "rewards/margins": 6.271017074584961, + "rewards/rejected": -3.2586252689361572, + "step": 13475 + }, + { + "epoch": 3.37, + "grad_norm": 6.4066853523254395, + "learning_rate": 2.3968119176228085e-06, + "logits/chosen": -0.585754930973053, + "logits/rejected": -0.7033271193504333, + "logps/chosen": -64.28544616699219, + "logps/rejected": -106.53326416015625, + "loss": 0.6394, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.444525957107544, + "rewards/margins": 7.6047892570495605, + "rewards/rejected": -4.160262584686279, + "step": 13476 + }, + { + "epoch": 3.37, + "grad_norm": 6.240604400634766, + "learning_rate": 2.3961408910065374e-06, + "logits/chosen": -0.5510888695716858, + "logits/rejected": -0.5794526934623718, + "logps/chosen": -72.23839569091797, + "logps/rejected": -108.75305938720703, + "loss": 0.7312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.956326723098755, + "rewards/margins": 6.3047661781311035, + "rewards/rejected": -3.3484396934509277, + "step": 13477 + }, + { + "epoch": 3.37, + "grad_norm": 29.56941032409668, + "learning_rate": 2.395469928734397e-06, + "logits/chosen": -0.5291678309440613, + "logits/rejected": -0.6430360674858093, + "logps/chosen": -60.40787887573242, + "logps/rejected": -114.07618713378906, + "loss": 0.6324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0835771560668945, + "rewards/margins": 6.410641670227051, + "rewards/rejected": -3.3270645141601562, + "step": 13478 + }, + { + "epoch": 3.37, + "grad_norm": 3.4811227321624756, + "learning_rate": 2.394799030822964e-06, + "logits/chosen": -0.5380872488021851, + "logits/rejected": -0.5880535840988159, + "logps/chosen": -47.600486755371094, + "logps/rejected": -103.21148681640625, + "loss": 0.6391, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.093682050704956, + "rewards/margins": 6.46854305267334, + "rewards/rejected": -3.374861240386963, + "step": 13479 + }, + { + "epoch": 3.37, + "grad_norm": 5.427138805389404, + "learning_rate": 2.3941281972888164e-06, + "logits/chosen": -0.5094223022460938, + "logits/rejected": -0.5963312387466431, + "logps/chosen": -62.7276496887207, + "logps/rejected": -99.16056823730469, + "loss": 0.6762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4328742027282715, + "rewards/margins": 7.966884613037109, + "rewards/rejected": -4.534010410308838, + "step": 13480 + }, + { + "epoch": 3.37, + "grad_norm": 4.7865142822265625, + "learning_rate": 2.393457428148535e-06, + "logits/chosen": -0.5959372520446777, + "logits/rejected": -0.6172366142272949, + "logps/chosen": -46.29832077026367, + "logps/rejected": -111.70889282226562, + "loss": 0.6078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9227263927459717, + "rewards/margins": 6.698642253875732, + "rewards/rejected": -3.7759158611297607, + "step": 13481 + }, + { + "epoch": 3.37, + "grad_norm": 26.447240829467773, + "learning_rate": 2.392786723418693e-06, + "logits/chosen": -0.5401997566223145, + "logits/rejected": -0.6642560362815857, + "logps/chosen": -54.21192932128906, + "logps/rejected": -101.62786102294922, + "loss": 0.584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0924832820892334, + "rewards/margins": 8.204071998596191, + "rewards/rejected": -5.111588954925537, + "step": 13482 + }, + { + "epoch": 3.37, + "grad_norm": 3.8468568325042725, + "learning_rate": 2.3921160831158618e-06, + "logits/chosen": -0.575129508972168, + "logits/rejected": -0.6733765006065369, + "logps/chosen": -54.055763244628906, + "logps/rejected": -114.37532043457031, + "loss": 0.5412, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.196575403213501, + "rewards/margins": 8.103570938110352, + "rewards/rejected": -4.90699577331543, + "step": 13483 + }, + { + "epoch": 3.37, + "grad_norm": 5.605751037597656, + "learning_rate": 2.3914455072566168e-06, + "logits/chosen": -0.6321607828140259, + "logits/rejected": -0.6550706624984741, + "logps/chosen": -46.76383972167969, + "logps/rejected": -124.74404907226562, + "loss": 0.6153, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.073549509048462, + "rewards/margins": 6.349536418914795, + "rewards/rejected": -3.275986433029175, + "step": 13484 + }, + { + "epoch": 3.37, + "grad_norm": 5.1134467124938965, + "learning_rate": 2.390774995857531e-06, + "logits/chosen": -0.6189652681350708, + "logits/rejected": -0.6658893823623657, + "logps/chosen": -50.5753173828125, + "logps/rejected": -107.12960815429688, + "loss": 0.612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9598090648651123, + "rewards/margins": 5.869261264801025, + "rewards/rejected": -2.909452438354492, + "step": 13485 + }, + { + "epoch": 3.37, + "grad_norm": 2.9731760025024414, + "learning_rate": 2.390104548935167e-06, + "logits/chosen": -0.5593886375427246, + "logits/rejected": -0.6501127481460571, + "logps/chosen": -59.27227020263672, + "logps/rejected": -108.57007598876953, + "loss": 0.6341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0372610092163086, + "rewards/margins": 7.327866554260254, + "rewards/rejected": -4.290605545043945, + "step": 13486 + }, + { + "epoch": 3.37, + "grad_norm": 6.398645401000977, + "learning_rate": 2.3894341665060954e-06, + "logits/chosen": -0.5966979265213013, + "logits/rejected": -0.6720871925354004, + "logps/chosen": -55.70826721191406, + "logps/rejected": -119.0007553100586, + "loss": 0.618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0759902000427246, + "rewards/margins": 7.533844947814941, + "rewards/rejected": -4.457855224609375, + "step": 13487 + }, + { + "epoch": 3.37, + "grad_norm": 12.735386848449707, + "learning_rate": 2.388763848586884e-06, + "logits/chosen": -0.6237562298774719, + "logits/rejected": -0.6602601408958435, + "logps/chosen": -43.02405548095703, + "logps/rejected": -101.049072265625, + "loss": 0.6534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.021393299102783, + "rewards/margins": 6.218814849853516, + "rewards/rejected": -3.197421073913574, + "step": 13488 + }, + { + "epoch": 3.37, + "grad_norm": 3.5978128910064697, + "learning_rate": 2.3880935951940957e-06, + "logits/chosen": -0.5363376140594482, + "logits/rejected": -0.5925771594047546, + "logps/chosen": -50.88473892211914, + "logps/rejected": -95.46322631835938, + "loss": 0.6294, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1864235401153564, + "rewards/margins": 6.292903900146484, + "rewards/rejected": -3.106480598449707, + "step": 13489 + }, + { + "epoch": 3.37, + "grad_norm": 6.080671310424805, + "learning_rate": 2.3874234063442907e-06, + "logits/chosen": -0.52340167760849, + "logits/rejected": -0.6039391756057739, + "logps/chosen": -60.26884078979492, + "logps/rejected": -111.23426818847656, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9790611267089844, + "rewards/margins": 6.49020528793335, + "rewards/rejected": -3.5111443996429443, + "step": 13490 + }, + { + "epoch": 3.37, + "grad_norm": 3.6101672649383545, + "learning_rate": 2.386753282054034e-06, + "logits/chosen": -0.5346115231513977, + "logits/rejected": -0.6126042008399963, + "logps/chosen": -55.56899642944336, + "logps/rejected": -103.64495086669922, + "loss": 0.6237, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.953275442123413, + "rewards/margins": 7.585573196411133, + "rewards/rejected": -4.632298469543457, + "step": 13491 + }, + { + "epoch": 3.38, + "grad_norm": 3.712778091430664, + "learning_rate": 2.3860832223398832e-06, + "logits/chosen": -0.5760243535041809, + "logits/rejected": -0.6641916632652283, + "logps/chosen": -54.792972564697266, + "logps/rejected": -111.61259460449219, + "loss": 0.5588, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.987260341644287, + "rewards/margins": 7.845853805541992, + "rewards/rejected": -4.858593940734863, + "step": 13492 + }, + { + "epoch": 3.38, + "grad_norm": 4.886354923248291, + "learning_rate": 2.385413227218395e-06, + "logits/chosen": -0.510341465473175, + "logits/rejected": -0.5620360374450684, + "logps/chosen": -55.859336853027344, + "logps/rejected": -111.81266784667969, + "loss": 0.6114, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2558600902557373, + "rewards/margins": 6.533971309661865, + "rewards/rejected": -3.278111457824707, + "step": 13493 + }, + { + "epoch": 3.38, + "grad_norm": 9.404496192932129, + "learning_rate": 2.3847432967061283e-06, + "logits/chosen": -0.519888699054718, + "logits/rejected": -0.6181503534317017, + "logps/chosen": -51.676025390625, + "logps/rejected": -88.01197814941406, + "loss": 0.65, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7075765132904053, + "rewards/margins": 6.309004783630371, + "rewards/rejected": -3.6014273166656494, + "step": 13494 + }, + { + "epoch": 3.38, + "grad_norm": 2.7197399139404297, + "learning_rate": 2.3840734308196345e-06, + "logits/chosen": -0.5176177024841309, + "logits/rejected": -0.6197865605354309, + "logps/chosen": -55.56941604614258, + "logps/rejected": -98.68124389648438, + "loss": 0.5679, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121689796447754, + "rewards/margins": 6.226923942565918, + "rewards/rejected": -3.105233907699585, + "step": 13495 + }, + { + "epoch": 3.38, + "grad_norm": 2.5391900539398193, + "learning_rate": 2.383403629575471e-06, + "logits/chosen": -0.5317937135696411, + "logits/rejected": -0.6267716884613037, + "logps/chosen": -51.90703582763672, + "logps/rejected": -114.04039001464844, + "loss": 0.5463, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0792927742004395, + "rewards/margins": 7.788127899169922, + "rewards/rejected": -4.708835124969482, + "step": 13496 + }, + { + "epoch": 3.38, + "grad_norm": 10.497612953186035, + "learning_rate": 2.382733892990187e-06, + "logits/chosen": -0.5685389637947083, + "logits/rejected": -0.6121752262115479, + "logps/chosen": -60.4926643371582, + "logps/rejected": -114.98159790039062, + "loss": 0.8137, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.933640718460083, + "rewards/margins": 5.670716285705566, + "rewards/rejected": -2.7370758056640625, + "step": 13497 + }, + { + "epoch": 3.38, + "grad_norm": 8.300799369812012, + "learning_rate": 2.38206422108033e-06, + "logits/chosen": -0.5241983532905579, + "logits/rejected": -0.6088455319404602, + "logps/chosen": -54.37842559814453, + "logps/rejected": -109.6507568359375, + "loss": 0.5515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0038461685180664, + "rewards/margins": 7.710726261138916, + "rewards/rejected": -4.706879615783691, + "step": 13498 + }, + { + "epoch": 3.38, + "grad_norm": 1.9801603555679321, + "learning_rate": 2.3813946138624524e-06, + "logits/chosen": -0.5716196298599243, + "logits/rejected": -0.6605404615402222, + "logps/chosen": -50.768375396728516, + "logps/rejected": -116.29679870605469, + "loss": 0.4923, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7413382530212402, + "rewards/margins": 8.443273544311523, + "rewards/rejected": -5.701934814453125, + "step": 13499 + }, + { + "epoch": 3.38, + "grad_norm": 5.2070746421813965, + "learning_rate": 2.380725071353098e-06, + "logits/chosen": -0.46188291907310486, + "logits/rejected": -0.541691243648529, + "logps/chosen": -71.2166748046875, + "logps/rejected": -108.34162902832031, + "loss": 0.6468, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6317896842956543, + "rewards/margins": 6.389880180358887, + "rewards/rejected": -3.758090019226074, + "step": 13500 + }, + { + "epoch": 3.38, + "grad_norm": 5.560104846954346, + "learning_rate": 2.380055593568814e-06, + "logits/chosen": -0.5915708541870117, + "logits/rejected": -0.6422325372695923, + "logps/chosen": -51.15210723876953, + "logps/rejected": -104.52690887451172, + "loss": 0.6849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1325221061706543, + "rewards/margins": 5.456615924835205, + "rewards/rejected": -2.3240935802459717, + "step": 13501 + }, + { + "epoch": 3.38, + "grad_norm": 7.414731025695801, + "learning_rate": 2.3793861805261416e-06, + "logits/chosen": -0.5621194243431091, + "logits/rejected": -0.6342759132385254, + "logps/chosen": -55.64311599731445, + "logps/rejected": -86.7750244140625, + "loss": 0.7749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0415778160095215, + "rewards/margins": 6.064592361450195, + "rewards/rejected": -3.023014783859253, + "step": 13502 + }, + { + "epoch": 3.38, + "grad_norm": 2.726252794265747, + "learning_rate": 2.378716832241626e-06, + "logits/chosen": -0.5445523262023926, + "logits/rejected": -0.6398161053657532, + "logps/chosen": -60.918663024902344, + "logps/rejected": -112.41634368896484, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1779415607452393, + "rewards/margins": 7.6245245933532715, + "rewards/rejected": -4.446582794189453, + "step": 13503 + }, + { + "epoch": 3.38, + "grad_norm": 3.8936562538146973, + "learning_rate": 2.3780475487318056e-06, + "logits/chosen": -0.6004695296287537, + "logits/rejected": -0.6899235844612122, + "logps/chosen": -46.003028869628906, + "logps/rejected": -112.91825866699219, + "loss": 0.5873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.905560255050659, + "rewards/margins": 8.485343933105469, + "rewards/rejected": -5.579782009124756, + "step": 13504 + }, + { + "epoch": 3.38, + "grad_norm": 9.689458847045898, + "learning_rate": 2.3773783300132174e-06, + "logits/chosen": -0.5966578125953674, + "logits/rejected": -0.6539644002914429, + "logps/chosen": -48.243370056152344, + "logps/rejected": -127.19219207763672, + "loss": 0.5924, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.003028392791748, + "rewards/margins": 7.488738536834717, + "rewards/rejected": -4.4857096672058105, + "step": 13505 + }, + { + "epoch": 3.38, + "grad_norm": 9.484637260437012, + "learning_rate": 2.376709176102402e-06, + "logits/chosen": -0.5448853969573975, + "logits/rejected": -0.6589787006378174, + "logps/chosen": -62.03078079223633, + "logps/rejected": -108.11962127685547, + "loss": 0.6303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.86515474319458, + "rewards/margins": 7.148425102233887, + "rewards/rejected": -4.283270359039307, + "step": 13506 + }, + { + "epoch": 3.38, + "grad_norm": 3.2885587215423584, + "learning_rate": 2.376040087015893e-06, + "logits/chosen": -0.4727745056152344, + "logits/rejected": -0.6374202370643616, + "logps/chosen": -62.414737701416016, + "logps/rejected": -92.3489761352539, + "loss": 0.5555, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3766722679138184, + "rewards/margins": 7.123363494873047, + "rewards/rejected": -3.7466917037963867, + "step": 13507 + }, + { + "epoch": 3.38, + "grad_norm": 2.0155177116394043, + "learning_rate": 2.375371062770222e-06, + "logits/chosen": -0.5362389087677002, + "logits/rejected": -0.6382536292076111, + "logps/chosen": -44.33376693725586, + "logps/rejected": -95.3154296875, + "loss": 0.5522, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2319295406341553, + "rewards/margins": 7.711622714996338, + "rewards/rejected": -4.479693412780762, + "step": 13508 + }, + { + "epoch": 3.38, + "grad_norm": 4.760770320892334, + "learning_rate": 2.3747021033819263e-06, + "logits/chosen": -0.577691912651062, + "logits/rejected": -0.6515448689460754, + "logps/chosen": -53.8095588684082, + "logps/rejected": -111.47888946533203, + "loss": 0.595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0580222606658936, + "rewards/margins": 6.897431373596191, + "rewards/rejected": -3.839409351348877, + "step": 13509 + }, + { + "epoch": 3.38, + "grad_norm": 8.401758193969727, + "learning_rate": 2.374033208867534e-06, + "logits/chosen": -0.6702888607978821, + "logits/rejected": -0.7115333080291748, + "logps/chosen": -56.743263244628906, + "logps/rejected": -119.64451599121094, + "loss": 0.6604, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0541927814483643, + "rewards/margins": 6.48356819152832, + "rewards/rejected": -3.429375648498535, + "step": 13510 + }, + { + "epoch": 3.38, + "grad_norm": 6.169252395629883, + "learning_rate": 2.3733643792435724e-06, + "logits/chosen": -0.5900176763534546, + "logits/rejected": -0.6509606838226318, + "logps/chosen": -58.54346466064453, + "logps/rejected": -100.42431640625, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.070971727371216, + "rewards/margins": 6.383306980133057, + "rewards/rejected": -3.3123350143432617, + "step": 13511 + }, + { + "epoch": 3.38, + "grad_norm": 3.8652126789093018, + "learning_rate": 2.3726956145265705e-06, + "logits/chosen": -0.5556535124778748, + "logits/rejected": -0.6503993272781372, + "logps/chosen": -58.14733123779297, + "logps/rejected": -104.31383514404297, + "loss": 0.5945, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3625614643096924, + "rewards/margins": 6.996610641479492, + "rewards/rejected": -3.634049415588379, + "step": 13512 + }, + { + "epoch": 3.38, + "grad_norm": 3.854307174682617, + "learning_rate": 2.3720269147330582e-06, + "logits/chosen": -0.5316680669784546, + "logits/rejected": -0.6377032399177551, + "logps/chosen": -52.39422607421875, + "logps/rejected": -90.6466064453125, + "loss": 0.6025, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9396843910217285, + "rewards/margins": 6.838460445404053, + "rewards/rejected": -3.898776054382324, + "step": 13513 + }, + { + "epoch": 3.38, + "grad_norm": 3.727412462234497, + "learning_rate": 2.3713582798795525e-06, + "logits/chosen": -0.6107190251350403, + "logits/rejected": -0.6811566352844238, + "logps/chosen": -46.830135345458984, + "logps/rejected": -94.4453125, + "loss": 0.584, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3568034172058105, + "rewards/margins": 6.2483320236206055, + "rewards/rejected": -2.891528606414795, + "step": 13514 + }, + { + "epoch": 3.38, + "grad_norm": 4.90973424911499, + "learning_rate": 2.3706897099825792e-06, + "logits/chosen": -0.5432240962982178, + "logits/rejected": -0.6126306056976318, + "logps/chosen": -48.12511444091797, + "logps/rejected": -111.92681884765625, + "loss": 0.6556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.048727512359619, + "rewards/margins": 7.751113414764404, + "rewards/rejected": -4.702384948730469, + "step": 13515 + }, + { + "epoch": 3.38, + "grad_norm": 9.01368522644043, + "learning_rate": 2.3700212050586625e-06, + "logits/chosen": -0.5947198867797852, + "logits/rejected": -0.6235741972923279, + "logps/chosen": -49.63813018798828, + "logps/rejected": -111.37615966796875, + "loss": 0.8276, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9466917514801025, + "rewards/margins": 5.978682994842529, + "rewards/rejected": -3.0319912433624268, + "step": 13516 + }, + { + "epoch": 3.38, + "grad_norm": 5.647365093231201, + "learning_rate": 2.369352765124319e-06, + "logits/chosen": -0.6654998660087585, + "logits/rejected": -0.7014033198356628, + "logps/chosen": -44.08955383300781, + "logps/rejected": -107.79354858398438, + "loss": 0.657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1531059741973877, + "rewards/margins": 6.972095489501953, + "rewards/rejected": -3.818988800048828, + "step": 13517 + }, + { + "epoch": 3.38, + "grad_norm": 4.059394359588623, + "learning_rate": 2.368684390196064e-06, + "logits/chosen": -0.5589151978492737, + "logits/rejected": -0.6373558044433594, + "logps/chosen": -60.12713623046875, + "logps/rejected": -109.27015686035156, + "loss": 0.56, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1966826915740967, + "rewards/margins": 7.702845096588135, + "rewards/rejected": -4.506162643432617, + "step": 13518 + }, + { + "epoch": 3.38, + "grad_norm": 6.829806804656982, + "learning_rate": 2.3680160802904196e-06, + "logits/chosen": -0.5153660774230957, + "logits/rejected": -0.6007955074310303, + "logps/chosen": -58.29457092285156, + "logps/rejected": -96.89686584472656, + "loss": 0.6492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0583322048187256, + "rewards/margins": 6.624110698699951, + "rewards/rejected": -3.5657777786254883, + "step": 13519 + }, + { + "epoch": 3.38, + "grad_norm": 9.289639472961426, + "learning_rate": 2.367347835423895e-06, + "logits/chosen": -0.5317520499229431, + "logits/rejected": -0.6029177904129028, + "logps/chosen": -58.82650375366211, + "logps/rejected": -107.87785339355469, + "loss": 0.647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081366539001465, + "rewards/margins": 7.163955211639404, + "rewards/rejected": -4.0825886726379395, + "step": 13520 + }, + { + "epoch": 3.38, + "grad_norm": 6.765945911407471, + "learning_rate": 2.366679655613007e-06, + "logits/chosen": -0.5649144649505615, + "logits/rejected": -0.6535550355911255, + "logps/chosen": -58.05372619628906, + "logps/rejected": -107.7603530883789, + "loss": 0.5925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4872138500213623, + "rewards/margins": 7.040940284729004, + "rewards/rejected": -4.5537261962890625, + "step": 13521 + }, + { + "epoch": 3.38, + "grad_norm": 5.911120414733887, + "learning_rate": 2.366011540874266e-06, + "logits/chosen": -0.5761184692382812, + "logits/rejected": -0.6204466223716736, + "logps/chosen": -46.735172271728516, + "logps/rejected": -88.6869125366211, + "loss": 0.5639, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1920833587646484, + "rewards/margins": 6.015157222747803, + "rewards/rejected": -2.8230741024017334, + "step": 13522 + }, + { + "epoch": 3.38, + "grad_norm": 5.839049816131592, + "learning_rate": 2.3653434912241803e-06, + "logits/chosen": -0.5638183355331421, + "logits/rejected": -0.5857340097427368, + "logps/chosen": -47.10810852050781, + "logps/rejected": -92.2076416015625, + "loss": 0.662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.950836181640625, + "rewards/margins": 6.410707950592041, + "rewards/rejected": -3.459871530532837, + "step": 13523 + }, + { + "epoch": 3.38, + "grad_norm": 2.4101786613464355, + "learning_rate": 2.3646755066792606e-06, + "logits/chosen": -0.6234042644500732, + "logits/rejected": -0.6647876501083374, + "logps/chosen": -51.492164611816406, + "logps/rejected": -112.19863891601562, + "loss": 0.5893, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.136932134628296, + "rewards/margins": 7.925379753112793, + "rewards/rejected": -4.788447380065918, + "step": 13524 + }, + { + "epoch": 3.38, + "grad_norm": 3.543823719024658, + "learning_rate": 2.3640075872560113e-06, + "logits/chosen": -0.45967626571655273, + "logits/rejected": -0.5489700436592102, + "logps/chosen": -53.9466667175293, + "logps/rejected": -99.4813003540039, + "loss": 0.5282, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1252472400665283, + "rewards/margins": 6.250441551208496, + "rewards/rejected": -3.1251943111419678, + "step": 13525 + }, + { + "epoch": 3.38, + "grad_norm": 3.979058265686035, + "learning_rate": 2.3633397329709374e-06, + "logits/chosen": -0.5653563737869263, + "logits/rejected": -0.6387612223625183, + "logps/chosen": -53.61181640625, + "logps/rejected": -108.71538543701172, + "loss": 0.6458, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.915773391723633, + "rewards/margins": 7.345310688018799, + "rewards/rejected": -4.429536819458008, + "step": 13526 + }, + { + "epoch": 3.38, + "grad_norm": 3.751532554626465, + "learning_rate": 2.3626719438405432e-06, + "logits/chosen": -0.5533167123794556, + "logits/rejected": -0.5807979702949524, + "logps/chosen": -49.325660705566406, + "logps/rejected": -110.50363159179688, + "loss": 0.6428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.298316240310669, + "rewards/margins": 5.950157642364502, + "rewards/rejected": -2.651841402053833, + "step": 13527 + }, + { + "epoch": 3.38, + "grad_norm": 5.362566947937012, + "learning_rate": 2.3620042198813337e-06, + "logits/chosen": -0.4921433925628662, + "logits/rejected": -0.5770164132118225, + "logps/chosen": -61.22200012207031, + "logps/rejected": -114.74717712402344, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9735093116760254, + "rewards/margins": 7.537477970123291, + "rewards/rejected": -4.563968181610107, + "step": 13528 + }, + { + "epoch": 3.38, + "grad_norm": 5.061445236206055, + "learning_rate": 2.3613365611098027e-06, + "logits/chosen": -0.5589858293533325, + "logits/rejected": -0.6487663984298706, + "logps/chosen": -50.278236389160156, + "logps/rejected": -122.40859985351562, + "loss": 0.5845, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.321986436843872, + "rewards/margins": 7.805482864379883, + "rewards/rejected": -4.48349666595459, + "step": 13529 + }, + { + "epoch": 3.38, + "grad_norm": 3.735353946685791, + "learning_rate": 2.3606689675424516e-06, + "logits/chosen": -0.5209662914276123, + "logits/rejected": -0.6024795770645142, + "logps/chosen": -59.21482467651367, + "logps/rejected": -112.58405303955078, + "loss": 0.6381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7930774688720703, + "rewards/margins": 7.181844711303711, + "rewards/rejected": -4.388767242431641, + "step": 13530 + }, + { + "epoch": 3.38, + "grad_norm": 3.4901790618896484, + "learning_rate": 2.3600014391957807e-06, + "logits/chosen": -0.5183141827583313, + "logits/rejected": -0.5915853977203369, + "logps/chosen": -59.78952407836914, + "logps/rejected": -116.89598846435547, + "loss": 0.5448, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9799718856811523, + "rewards/margins": 7.383079528808594, + "rewards/rejected": -4.4031081199646, + "step": 13531 + }, + { + "epoch": 3.39, + "grad_norm": 20.53614044189453, + "learning_rate": 2.3593339760862792e-06, + "logits/chosen": -0.5841500163078308, + "logits/rejected": -0.6703628301620483, + "logps/chosen": -63.019020080566406, + "logps/rejected": -109.585205078125, + "loss": 0.7367, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9465718269348145, + "rewards/margins": 7.437926292419434, + "rewards/rejected": -4.491354465484619, + "step": 13532 + }, + { + "epoch": 3.39, + "grad_norm": 4.100395679473877, + "learning_rate": 2.3586665782304443e-06, + "logits/chosen": -0.5297095775604248, + "logits/rejected": -0.6055791974067688, + "logps/chosen": -58.16022491455078, + "logps/rejected": -107.46007537841797, + "loss": 0.6391, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089261531829834, + "rewards/margins": 6.737380504608154, + "rewards/rejected": -3.6481192111968994, + "step": 13533 + }, + { + "epoch": 3.39, + "grad_norm": 16.926990509033203, + "learning_rate": 2.3579992456447693e-06, + "logits/chosen": -0.5407208800315857, + "logits/rejected": -0.5971819758415222, + "logps/chosen": -66.8328857421875, + "logps/rejected": -99.09334564208984, + "loss": 0.7957, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9580013751983643, + "rewards/margins": 5.830446720123291, + "rewards/rejected": -2.8724453449249268, + "step": 13534 + }, + { + "epoch": 3.39, + "grad_norm": 3.0524935722351074, + "learning_rate": 2.3573319783457428e-06, + "logits/chosen": -0.5362399220466614, + "logits/rejected": -0.6292460560798645, + "logps/chosen": -65.30541229248047, + "logps/rejected": -103.48344421386719, + "loss": 0.6785, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9265761375427246, + "rewards/margins": 7.156068325042725, + "rewards/rejected": -4.229492664337158, + "step": 13535 + }, + { + "epoch": 3.39, + "grad_norm": 5.1144890785217285, + "learning_rate": 2.3566647763498517e-06, + "logits/chosen": -0.5620650053024292, + "logits/rejected": -0.6173393726348877, + "logps/chosen": -57.35845947265625, + "logps/rejected": -117.92361450195312, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1689422130584717, + "rewards/margins": 7.320620536804199, + "rewards/rejected": -4.151678562164307, + "step": 13536 + }, + { + "epoch": 3.39, + "grad_norm": 2.4677436351776123, + "learning_rate": 2.3559976396735885e-06, + "logits/chosen": -0.5561304092407227, + "logits/rejected": -0.6334195137023926, + "logps/chosen": -49.2426872253418, + "logps/rejected": -110.67977905273438, + "loss": 0.5542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.994609832763672, + "rewards/margins": 8.204253196716309, + "rewards/rejected": -5.2096428871154785, + "step": 13537 + }, + { + "epoch": 3.39, + "grad_norm": 6.517960548400879, + "learning_rate": 2.355330568333435e-06, + "logits/chosen": -0.514214038848877, + "logits/rejected": -0.5893682241439819, + "logps/chosen": -60.373573303222656, + "logps/rejected": -136.60110473632812, + "loss": 0.7013, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9849283695220947, + "rewards/margins": 8.925033569335938, + "rewards/rejected": -5.940104961395264, + "step": 13538 + }, + { + "epoch": 3.39, + "grad_norm": 7.476912021636963, + "learning_rate": 2.3546635623458747e-06, + "logits/chosen": -0.5179287195205688, + "logits/rejected": -0.5559837222099304, + "logps/chosen": -60.32041549682617, + "logps/rejected": -116.97618103027344, + "loss": 0.6933, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.930661201477051, + "rewards/margins": 6.073521614074707, + "rewards/rejected": -3.14285945892334, + "step": 13539 + }, + { + "epoch": 3.39, + "grad_norm": 5.332734107971191, + "learning_rate": 2.353996621727393e-06, + "logits/chosen": -0.5816897749900818, + "logits/rejected": -0.5922881960868835, + "logps/chosen": -50.128028869628906, + "logps/rejected": -130.76979064941406, + "loss": 0.7174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.897000789642334, + "rewards/margins": 7.448668003082275, + "rewards/rejected": -4.5516676902771, + "step": 13540 + }, + { + "epoch": 3.39, + "grad_norm": 31.589420318603516, + "learning_rate": 2.353329746494468e-06, + "logits/chosen": -0.6060333847999573, + "logits/rejected": -0.6336379051208496, + "logps/chosen": -51.236976623535156, + "logps/rejected": -107.41634368896484, + "loss": 0.7664, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1468825340270996, + "rewards/margins": 6.774168968200684, + "rewards/rejected": -3.627286672592163, + "step": 13541 + }, + { + "epoch": 3.39, + "grad_norm": 12.774019241333008, + "learning_rate": 2.352662936663581e-06, + "logits/chosen": -0.5536316633224487, + "logits/rejected": -0.6378036737442017, + "logps/chosen": -52.81492233276367, + "logps/rejected": -110.89326477050781, + "loss": 0.599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0139684677124023, + "rewards/margins": 7.3971052169799805, + "rewards/rejected": -4.38313627243042, + "step": 13542 + }, + { + "epoch": 3.39, + "grad_norm": 6.732717990875244, + "learning_rate": 2.3519961922512095e-06, + "logits/chosen": -0.5423005223274231, + "logits/rejected": -0.5919523239135742, + "logps/chosen": -55.37363815307617, + "logps/rejected": -104.89739990234375, + "loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0692341327667236, + "rewards/margins": 6.797826290130615, + "rewards/rejected": -3.7285919189453125, + "step": 13543 + }, + { + "epoch": 3.39, + "grad_norm": 6.043520927429199, + "learning_rate": 2.3513295132738268e-06, + "logits/chosen": -0.5352467894554138, + "logits/rejected": -0.6259687542915344, + "logps/chosen": -63.34809112548828, + "logps/rejected": -96.68196105957031, + "loss": 0.704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176609516143799, + "rewards/margins": 6.78436279296875, + "rewards/rejected": -3.607753276824951, + "step": 13544 + }, + { + "epoch": 3.39, + "grad_norm": 5.20031213760376, + "learning_rate": 2.3506628997479085e-06, + "logits/chosen": -0.6080095171928406, + "logits/rejected": -0.6796138286590576, + "logps/chosen": -50.70732879638672, + "logps/rejected": -95.70092010498047, + "loss": 0.6378, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.175593137741089, + "rewards/margins": 5.918830394744873, + "rewards/rejected": -2.743237018585205, + "step": 13545 + }, + { + "epoch": 3.39, + "grad_norm": 5.006772994995117, + "learning_rate": 2.34999635168993e-06, + "logits/chosen": -0.5455014109611511, + "logits/rejected": -0.6576012372970581, + "logps/chosen": -45.84790802001953, + "logps/rejected": -97.89762878417969, + "loss": 0.5881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0791847705841064, + "rewards/margins": 7.539030075073242, + "rewards/rejected": -4.459845066070557, + "step": 13546 + }, + { + "epoch": 3.39, + "grad_norm": 4.592016220092773, + "learning_rate": 2.3493298691163602e-06, + "logits/chosen": -0.5653592944145203, + "logits/rejected": -0.6436352729797363, + "logps/chosen": -54.02898025512695, + "logps/rejected": -100.79313659667969, + "loss": 0.6026, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3511128425598145, + "rewards/margins": 6.65345573425293, + "rewards/rejected": -3.302342414855957, + "step": 13547 + }, + { + "epoch": 3.39, + "grad_norm": 4.743475437164307, + "learning_rate": 2.3486634520436667e-06, + "logits/chosen": -0.5529444217681885, + "logits/rejected": -0.6222758889198303, + "logps/chosen": -52.67491912841797, + "logps/rejected": -103.93263244628906, + "loss": 0.6723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.843982696533203, + "rewards/margins": 6.947848320007324, + "rewards/rejected": -4.103865623474121, + "step": 13548 + }, + { + "epoch": 3.39, + "grad_norm": 2.304291009902954, + "learning_rate": 2.3479971004883216e-06, + "logits/chosen": -0.5396191477775574, + "logits/rejected": -0.6540101170539856, + "logps/chosen": -49.75728225708008, + "logps/rejected": -103.27349090576172, + "loss": 0.5048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4249141216278076, + "rewards/margins": 8.110296249389648, + "rewards/rejected": -4.6853814125061035, + "step": 13549 + }, + { + "epoch": 3.39, + "grad_norm": 10.828814506530762, + "learning_rate": 2.3473308144667894e-06, + "logits/chosen": -0.6219824552536011, + "logits/rejected": -0.700803816318512, + "logps/chosen": -48.192771911621094, + "logps/rejected": -111.86495208740234, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5699524879455566, + "rewards/margins": 7.563390731811523, + "rewards/rejected": -4.993437767028809, + "step": 13550 + }, + { + "epoch": 3.39, + "grad_norm": 7.131547927856445, + "learning_rate": 2.346664593995532e-06, + "logits/chosen": -0.5115776062011719, + "logits/rejected": -0.6223999857902527, + "logps/chosen": -71.05715942382812, + "logps/rejected": -100.18788146972656, + "loss": 0.6741, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8223628997802734, + "rewards/margins": 6.363745212554932, + "rewards/rejected": -3.541382312774658, + "step": 13551 + }, + { + "epoch": 3.39, + "grad_norm": 44.50541305541992, + "learning_rate": 2.3459984390910162e-06, + "logits/chosen": -0.5192023515701294, + "logits/rejected": -0.6253976821899414, + "logps/chosen": -58.497806549072266, + "logps/rejected": -118.80663299560547, + "loss": 0.5989, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2545461654663086, + "rewards/margins": 7.66276741027832, + "rewards/rejected": -4.40822172164917, + "step": 13552 + }, + { + "epoch": 3.39, + "grad_norm": 14.573963165283203, + "learning_rate": 2.3453323497697024e-06, + "logits/chosen": -0.571643054485321, + "logits/rejected": -0.6954997777938843, + "logps/chosen": -57.45009231567383, + "logps/rejected": -97.07207489013672, + "loss": 0.6637, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0628068447113037, + "rewards/margins": 6.884498596191406, + "rewards/rejected": -3.8216919898986816, + "step": 13553 + }, + { + "epoch": 3.39, + "grad_norm": 8.71664810180664, + "learning_rate": 2.3446663260480473e-06, + "logits/chosen": -0.5576444864273071, + "logits/rejected": -0.6163132786750793, + "logps/chosen": -53.428829193115234, + "logps/rejected": -102.42520904541016, + "loss": 0.6777, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8363540172576904, + "rewards/margins": 6.419568061828613, + "rewards/rejected": -3.583214282989502, + "step": 13554 + }, + { + "epoch": 3.39, + "grad_norm": 3.957429885864258, + "learning_rate": 2.3440003679425123e-06, + "logits/chosen": -0.5612950325012207, + "logits/rejected": -0.610132098197937, + "logps/chosen": -63.37718963623047, + "logps/rejected": -98.40312957763672, + "loss": 0.7017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.056366443634033, + "rewards/margins": 6.624637603759766, + "rewards/rejected": -3.568270683288574, + "step": 13555 + }, + { + "epoch": 3.39, + "grad_norm": 3.554011344909668, + "learning_rate": 2.343334475469557e-06, + "logits/chosen": -0.5444551706314087, + "logits/rejected": -0.668220043182373, + "logps/chosen": -67.75226593017578, + "logps/rejected": -95.90723419189453, + "loss": 0.663, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8688488006591797, + "rewards/margins": 7.635283470153809, + "rewards/rejected": -4.7664337158203125, + "step": 13556 + }, + { + "epoch": 3.39, + "grad_norm": 5.950063228607178, + "learning_rate": 2.3426686486456286e-06, + "logits/chosen": -0.5201154947280884, + "logits/rejected": -0.5729283690452576, + "logps/chosen": -60.88407897949219, + "logps/rejected": -107.32430267333984, + "loss": 0.6659, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.179468870162964, + "rewards/margins": 6.746493339538574, + "rewards/rejected": -3.5670247077941895, + "step": 13557 + }, + { + "epoch": 3.39, + "grad_norm": 5.302422523498535, + "learning_rate": 2.3420028874871843e-06, + "logits/chosen": -0.5178072452545166, + "logits/rejected": -0.5715122222900391, + "logps/chosen": -59.2740592956543, + "logps/rejected": -106.30738067626953, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.206331729888916, + "rewards/margins": 6.530084609985352, + "rewards/rejected": -3.3237526416778564, + "step": 13558 + }, + { + "epoch": 3.39, + "grad_norm": 5.153617858886719, + "learning_rate": 2.3413371920106774e-06, + "logits/chosen": -0.5427746176719666, + "logits/rejected": -0.6419402956962585, + "logps/chosen": -50.035308837890625, + "logps/rejected": -105.23628997802734, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.34661865234375, + "rewards/margins": 7.554654121398926, + "rewards/rejected": -4.208034992218018, + "step": 13559 + }, + { + "epoch": 3.39, + "grad_norm": 6.621654987335205, + "learning_rate": 2.340671562232557e-06, + "logits/chosen": -0.5857703685760498, + "logits/rejected": -0.6605026721954346, + "logps/chosen": -50.62417984008789, + "logps/rejected": -98.0860366821289, + "loss": 0.7646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1153807640075684, + "rewards/margins": 6.891284465789795, + "rewards/rejected": -3.77590274810791, + "step": 13560 + }, + { + "epoch": 3.39, + "grad_norm": 4.467384338378906, + "learning_rate": 2.3400059981692685e-06, + "logits/chosen": -0.5768120288848877, + "logits/rejected": -0.6596810221672058, + "logps/chosen": -55.03124237060547, + "logps/rejected": -78.63288116455078, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2357733249664307, + "rewards/margins": 5.102990627288818, + "rewards/rejected": -1.8672171831130981, + "step": 13561 + }, + { + "epoch": 3.39, + "grad_norm": 3.7116873264312744, + "learning_rate": 2.339340499837263e-06, + "logits/chosen": -0.5206360816955566, + "logits/rejected": -0.6095618009567261, + "logps/chosen": -46.2657585144043, + "logps/rejected": -110.2241439819336, + "loss": 0.5385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2399373054504395, + "rewards/margins": 7.962339401245117, + "rewards/rejected": -4.722402572631836, + "step": 13562 + }, + { + "epoch": 3.39, + "grad_norm": 3.704850435256958, + "learning_rate": 2.3386750672529838e-06, + "logits/chosen": -0.546990156173706, + "logits/rejected": -0.6509377956390381, + "logps/chosen": -66.1238021850586, + "logps/rejected": -123.75228118896484, + "loss": 0.6002, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.567790985107422, + "rewards/margins": 8.612961769104004, + "rewards/rejected": -5.04517126083374, + "step": 13563 + }, + { + "epoch": 3.39, + "grad_norm": 2.836733341217041, + "learning_rate": 2.338009700432873e-06, + "logits/chosen": -0.647650957107544, + "logits/rejected": -0.7262610197067261, + "logps/chosen": -61.28044891357422, + "logps/rejected": -101.01701354980469, + "loss": 0.6697, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0428590774536133, + "rewards/margins": 7.420558452606201, + "rewards/rejected": -4.377699851989746, + "step": 13564 + }, + { + "epoch": 3.39, + "grad_norm": 6.440043926239014, + "learning_rate": 2.3373443993933755e-06, + "logits/chosen": -0.5290855169296265, + "logits/rejected": -0.5751180648803711, + "logps/chosen": -51.895599365234375, + "logps/rejected": -96.25675201416016, + "loss": 0.566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.29921817779541, + "rewards/margins": 6.0522050857543945, + "rewards/rejected": -2.7529866695404053, + "step": 13565 + }, + { + "epoch": 3.39, + "grad_norm": 7.637966156005859, + "learning_rate": 2.336679164150928e-06, + "logits/chosen": -0.49354639649391174, + "logits/rejected": -0.5926674008369446, + "logps/chosen": -66.63580322265625, + "logps/rejected": -100.13074493408203, + "loss": 0.6891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.126756429672241, + "rewards/margins": 6.812129020690918, + "rewards/rejected": -3.685372829437256, + "step": 13566 + }, + { + "epoch": 3.39, + "grad_norm": 5.012099742889404, + "learning_rate": 2.3360139947219735e-06, + "logits/chosen": -0.5340071320533752, + "logits/rejected": -0.6412206292152405, + "logps/chosen": -64.40676879882812, + "logps/rejected": -96.48407745361328, + "loss": 0.7853, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0277624130249023, + "rewards/margins": 6.798859119415283, + "rewards/rejected": -3.7710962295532227, + "step": 13567 + }, + { + "epoch": 3.39, + "grad_norm": 4.783342361450195, + "learning_rate": 2.335348891122946e-06, + "logits/chosen": -0.5524789094924927, + "logits/rejected": -0.5777802467346191, + "logps/chosen": -57.43946075439453, + "logps/rejected": -111.84526062011719, + "loss": 0.6455, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.050621509552002, + "rewards/margins": 6.661933422088623, + "rewards/rejected": -3.611311912536621, + "step": 13568 + }, + { + "epoch": 3.39, + "grad_norm": 27.376792907714844, + "learning_rate": 2.3346838533702805e-06, + "logits/chosen": -0.5593191385269165, + "logits/rejected": -0.6613689661026001, + "logps/chosen": -44.91095733642578, + "logps/rejected": -78.13465881347656, + "loss": 0.6669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.097022533416748, + "rewards/margins": 5.726192951202393, + "rewards/rejected": -2.6291706562042236, + "step": 13569 + }, + { + "epoch": 3.39, + "grad_norm": 3.9394357204437256, + "learning_rate": 2.3340188814804115e-06, + "logits/chosen": -0.5113672614097595, + "logits/rejected": -0.5711531043052673, + "logps/chosen": -51.08308029174805, + "logps/rejected": -112.00259399414062, + "loss": 0.5948, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.143430233001709, + "rewards/margins": 7.448513031005859, + "rewards/rejected": -4.30508279800415, + "step": 13570 + }, + { + "epoch": 3.39, + "grad_norm": 4.030300140380859, + "learning_rate": 2.3333539754697755e-06, + "logits/chosen": -0.5696533918380737, + "logits/rejected": -0.6395472288131714, + "logps/chosen": -51.356956481933594, + "logps/rejected": -105.75125122070312, + "loss": 0.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.239363431930542, + "rewards/margins": 7.366340637207031, + "rewards/rejected": -4.126977920532227, + "step": 13571 + }, + { + "epoch": 3.4, + "grad_norm": 6.876026153564453, + "learning_rate": 2.332689135354795e-06, + "logits/chosen": -0.5930241942405701, + "logits/rejected": -0.6682083010673523, + "logps/chosen": -53.226680755615234, + "logps/rejected": -140.91262817382812, + "loss": 0.6042, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1751675605773926, + "rewards/margins": 9.182265281677246, + "rewards/rejected": -6.007098197937012, + "step": 13572 + }, + { + "epoch": 3.4, + "grad_norm": 2.2386274337768555, + "learning_rate": 2.332024361151904e-06, + "logits/chosen": -0.6148681640625, + "logits/rejected": -0.6690001487731934, + "logps/chosen": -48.54913330078125, + "logps/rejected": -104.18997192382812, + "loss": 0.5506, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.251431703567505, + "rewards/margins": 7.421450138092041, + "rewards/rejected": -4.170018672943115, + "step": 13573 + }, + { + "epoch": 3.4, + "grad_norm": 6.431844234466553, + "learning_rate": 2.3313596528775316e-06, + "logits/chosen": -0.5817834734916687, + "logits/rejected": -0.6342157125473022, + "logps/chosen": -59.22197341918945, + "logps/rejected": -105.88810729980469, + "loss": 0.6549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.962440013885498, + "rewards/margins": 6.847070693969727, + "rewards/rejected": -3.884629964828491, + "step": 13574 + }, + { + "epoch": 3.4, + "grad_norm": 2.0394647121429443, + "learning_rate": 2.3306950105480975e-06, + "logits/chosen": -0.49007686972618103, + "logits/rejected": -0.6403746604919434, + "logps/chosen": -59.02217102050781, + "logps/rejected": -101.39693450927734, + "loss": 0.5496, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.366525650024414, + "rewards/margins": 8.056849479675293, + "rewards/rejected": -4.690324783325195, + "step": 13575 + }, + { + "epoch": 3.4, + "grad_norm": 4.630951404571533, + "learning_rate": 2.3300304341800286e-06, + "logits/chosen": -0.5407203435897827, + "logits/rejected": -0.6310762166976929, + "logps/chosen": -46.22685241699219, + "logps/rejected": -90.27832794189453, + "loss": 0.5645, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0035789012908936, + "rewards/margins": 6.450245380401611, + "rewards/rejected": -3.446666717529297, + "step": 13576 + }, + { + "epoch": 3.4, + "grad_norm": 10.367432594299316, + "learning_rate": 2.329365923789749e-06, + "logits/chosen": -0.5472652316093445, + "logits/rejected": -0.6177429556846619, + "logps/chosen": -42.93733596801758, + "logps/rejected": -95.7375259399414, + "loss": 0.6049, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.828591823577881, + "rewards/margins": 6.2760796546936035, + "rewards/rejected": -3.4474880695343018, + "step": 13577 + }, + { + "epoch": 3.4, + "grad_norm": 6.209629535675049, + "learning_rate": 2.328701479393679e-06, + "logits/chosen": -0.545974850654602, + "logits/rejected": -0.5618607401847839, + "logps/chosen": -68.08466339111328, + "logps/rejected": -116.74261474609375, + "loss": 0.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1563777923583984, + "rewards/margins": 6.592098236083984, + "rewards/rejected": -3.435720682144165, + "step": 13578 + }, + { + "epoch": 3.4, + "grad_norm": 3.4229660034179688, + "learning_rate": 2.3280371010082342e-06, + "logits/chosen": -0.5442617535591125, + "logits/rejected": -0.6403226852416992, + "logps/chosen": -52.787109375, + "logps/rejected": -119.90312957763672, + "loss": 0.5694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9617655277252197, + "rewards/margins": 8.178939819335938, + "rewards/rejected": -5.217174530029297, + "step": 13579 + }, + { + "epoch": 3.4, + "grad_norm": 4.243880748748779, + "learning_rate": 2.3273727886498372e-06, + "logits/chosen": -0.5792556405067444, + "logits/rejected": -0.6545939445495605, + "logps/chosen": -52.7292594909668, + "logps/rejected": -116.23262023925781, + "loss": 0.6396, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.091005802154541, + "rewards/margins": 7.736262321472168, + "rewards/rejected": -4.645256042480469, + "step": 13580 + }, + { + "epoch": 3.4, + "grad_norm": 4.810962200164795, + "learning_rate": 2.3267085423349007e-06, + "logits/chosen": -0.4863898456096649, + "logits/rejected": -0.6088568568229675, + "logps/chosen": -61.353267669677734, + "logps/rejected": -110.93284606933594, + "loss": 0.651, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3240625858306885, + "rewards/margins": 6.725170135498047, + "rewards/rejected": -3.4011077880859375, + "step": 13581 + }, + { + "epoch": 3.4, + "grad_norm": 3.748185873031616, + "learning_rate": 2.326044362079838e-06, + "logits/chosen": -0.58058762550354, + "logits/rejected": -0.6357098817825317, + "logps/chosen": -42.483943939208984, + "logps/rejected": -97.86763763427734, + "loss": 0.5457, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9412460327148438, + "rewards/margins": 6.7566237449646, + "rewards/rejected": -3.8153774738311768, + "step": 13582 + }, + { + "epoch": 3.4, + "grad_norm": 5.94028377532959, + "learning_rate": 2.325380247901065e-06, + "logits/chosen": -0.6014601588249207, + "logits/rejected": -0.6812634468078613, + "logps/chosen": -45.23551559448242, + "logps/rejected": -117.91170501708984, + "loss": 0.5586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3222618103027344, + "rewards/margins": 8.698481559753418, + "rewards/rejected": -5.376219272613525, + "step": 13583 + }, + { + "epoch": 3.4, + "grad_norm": 5.665019989013672, + "learning_rate": 2.3247161998149892e-06, + "logits/chosen": -0.5105723142623901, + "logits/rejected": -0.5805494785308838, + "logps/chosen": -58.0075798034668, + "logps/rejected": -97.2287826538086, + "loss": 0.6759, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.899367094039917, + "rewards/margins": 6.066588878631592, + "rewards/rejected": -3.167221784591675, + "step": 13584 + }, + { + "epoch": 3.4, + "grad_norm": 7.318907737731934, + "learning_rate": 2.324052217838023e-06, + "logits/chosen": -0.5684767961502075, + "logits/rejected": -0.6230123043060303, + "logps/chosen": -42.189353942871094, + "logps/rejected": -90.7048110961914, + "loss": 0.6126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.066408634185791, + "rewards/margins": 6.390947341918945, + "rewards/rejected": -3.3245391845703125, + "step": 13585 + }, + { + "epoch": 3.4, + "grad_norm": 5.650683879852295, + "learning_rate": 2.3233883019865735e-06, + "logits/chosen": -0.6188392043113708, + "logits/rejected": -0.7141121625900269, + "logps/chosen": -53.9488639831543, + "logps/rejected": -86.39913177490234, + "loss": 0.6228, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1515636444091797, + "rewards/margins": 6.904330253601074, + "rewards/rejected": -3.7527668476104736, + "step": 13586 + }, + { + "epoch": 3.4, + "grad_norm": 2.576979160308838, + "learning_rate": 2.3227244522770437e-06, + "logits/chosen": -0.5972496271133423, + "logits/rejected": -0.6808886528015137, + "logps/chosen": -46.45771408081055, + "logps/rejected": -84.39195251464844, + "loss": 0.5217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0203676223754883, + "rewards/margins": 7.457711696624756, + "rewards/rejected": -4.437343597412109, + "step": 13587 + }, + { + "epoch": 3.4, + "grad_norm": 5.981515407562256, + "learning_rate": 2.3220606687258422e-06, + "logits/chosen": -0.5445640087127686, + "logits/rejected": -0.6417613625526428, + "logps/chosen": -54.80730056762695, + "logps/rejected": -103.6644287109375, + "loss": 0.5838, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1201627254486084, + "rewards/margins": 8.550708770751953, + "rewards/rejected": -5.430545806884766, + "step": 13588 + }, + { + "epoch": 3.4, + "grad_norm": 11.502445220947266, + "learning_rate": 2.3213969513493685e-06, + "logits/chosen": -0.5878637433052063, + "logits/rejected": -0.6856021881103516, + "logps/chosen": -45.40316390991211, + "logps/rejected": -100.79118347167969, + "loss": 0.6081, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.971870183944702, + "rewards/margins": 7.141455173492432, + "rewards/rejected": -4.169585227966309, + "step": 13589 + }, + { + "epoch": 3.4, + "grad_norm": 4.252269744873047, + "learning_rate": 2.320733300164027e-06, + "logits/chosen": -0.5843930840492249, + "logits/rejected": -0.602156400680542, + "logps/chosen": -46.73773956298828, + "logps/rejected": -117.94647979736328, + "loss": 0.5657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.326016902923584, + "rewards/margins": 6.255758285522461, + "rewards/rejected": -2.929741144180298, + "step": 13590 + }, + { + "epoch": 3.4, + "grad_norm": 14.854135513305664, + "learning_rate": 2.3200697151862134e-06, + "logits/chosen": -0.5249056816101074, + "logits/rejected": -0.5793952345848083, + "logps/chosen": -57.027523040771484, + "logps/rejected": -108.59512329101562, + "loss": 0.8156, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.919036865234375, + "rewards/margins": 5.688535213470459, + "rewards/rejected": -2.769498586654663, + "step": 13591 + }, + { + "epoch": 3.4, + "grad_norm": 7.480907917022705, + "learning_rate": 2.3194061964323296e-06, + "logits/chosen": -0.5699581503868103, + "logits/rejected": -0.6093390583992004, + "logps/chosen": -50.047725677490234, + "logps/rejected": -109.40847778320312, + "loss": 0.6901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.743086576461792, + "rewards/margins": 7.034775733947754, + "rewards/rejected": -4.291689395904541, + "step": 13592 + }, + { + "epoch": 3.4, + "grad_norm": 8.959094047546387, + "learning_rate": 2.3187427439187695e-06, + "logits/chosen": -0.5814070105552673, + "logits/rejected": -0.6755331158638, + "logps/chosen": -69.81404113769531, + "logps/rejected": -108.85958099365234, + "loss": 0.7565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1857502460479736, + "rewards/margins": 6.69409704208374, + "rewards/rejected": -3.5083467960357666, + "step": 13593 + }, + { + "epoch": 3.4, + "grad_norm": 3.402416229248047, + "learning_rate": 2.3180793576619265e-06, + "logits/chosen": -0.5748533010482788, + "logits/rejected": -0.6466227769851685, + "logps/chosen": -47.433834075927734, + "logps/rejected": -111.19361877441406, + "loss": 0.5588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3754982948303223, + "rewards/margins": 8.761314392089844, + "rewards/rejected": -5.38581657409668, + "step": 13594 + }, + { + "epoch": 3.4, + "grad_norm": 5.513846397399902, + "learning_rate": 2.3174160376781973e-06, + "logits/chosen": -0.6601942777633667, + "logits/rejected": -0.7511190176010132, + "logps/chosen": -53.798240661621094, + "logps/rejected": -100.97419738769531, + "loss": 0.673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0103981494903564, + "rewards/margins": 7.4294023513793945, + "rewards/rejected": -4.419004440307617, + "step": 13595 + }, + { + "epoch": 3.4, + "grad_norm": 4.909422397613525, + "learning_rate": 2.316752783983971e-06, + "logits/chosen": -0.5970431566238403, + "logits/rejected": -0.6465651392936707, + "logps/chosen": -52.066043853759766, + "logps/rejected": -103.0050277709961, + "loss": 0.6845, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9697232246398926, + "rewards/margins": 5.725008010864258, + "rewards/rejected": -2.755284547805786, + "step": 13596 + }, + { + "epoch": 3.4, + "grad_norm": 2.8624093532562256, + "learning_rate": 2.3160895965956354e-06, + "logits/chosen": -0.5201152563095093, + "logits/rejected": -0.5750454068183899, + "logps/chosen": -45.94487762451172, + "logps/rejected": -130.68370056152344, + "loss": 0.5234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2078123092651367, + "rewards/margins": 9.322035789489746, + "rewards/rejected": -6.114223480224609, + "step": 13597 + }, + { + "epoch": 3.4, + "grad_norm": 4.252961158752441, + "learning_rate": 2.3154264755295823e-06, + "logits/chosen": -0.5607302784919739, + "logits/rejected": -0.6407090425491333, + "logps/chosen": -58.299407958984375, + "logps/rejected": -100.35952758789062, + "loss": 0.625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0312845706939697, + "rewards/margins": 6.206744194030762, + "rewards/rejected": -3.17546010017395, + "step": 13598 + }, + { + "epoch": 3.4, + "grad_norm": 8.894906044006348, + "learning_rate": 2.3147634208021967e-06, + "logits/chosen": -0.5264461040496826, + "logits/rejected": -0.6030849814414978, + "logps/chosen": -53.5646858215332, + "logps/rejected": -109.62094116210938, + "loss": 0.5875, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.49123477935791, + "rewards/margins": 7.836440086364746, + "rewards/rejected": -4.345206260681152, + "step": 13599 + }, + { + "epoch": 3.4, + "grad_norm": 5.875636100769043, + "learning_rate": 2.3141004324298615e-06, + "logits/chosen": -0.5676823854446411, + "logits/rejected": -0.6245630979537964, + "logps/chosen": -58.102542877197266, + "logps/rejected": -104.99031066894531, + "loss": 0.6245, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9466989040374756, + "rewards/margins": 5.641193866729736, + "rewards/rejected": -2.694495439529419, + "step": 13600 + }, + { + "epoch": 3.4, + "grad_norm": 17.4401912689209, + "learning_rate": 2.3134375104289613e-06, + "logits/chosen": -0.5761101245880127, + "logits/rejected": -0.6477874517440796, + "logps/chosen": -52.019866943359375, + "logps/rejected": -98.13856506347656, + "loss": 0.8161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3768470287323, + "rewards/margins": 6.1605706214904785, + "rewards/rejected": -2.783723831176758, + "step": 13601 + }, + { + "epoch": 3.4, + "grad_norm": 2.615670919418335, + "learning_rate": 2.3127746548158798e-06, + "logits/chosen": -0.47996729612350464, + "logits/rejected": -0.5755913257598877, + "logps/chosen": -56.615196228027344, + "logps/rejected": -93.25240325927734, + "loss": 0.5662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2869508266448975, + "rewards/margins": 6.10643196105957, + "rewards/rejected": -2.8194806575775146, + "step": 13602 + }, + { + "epoch": 3.4, + "grad_norm": 4.628337383270264, + "learning_rate": 2.312111865606995e-06, + "logits/chosen": -0.5078741312026978, + "logits/rejected": -0.6342242956161499, + "logps/chosen": -56.39226150512695, + "logps/rejected": -92.32566833496094, + "loss": 0.6631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1870760917663574, + "rewards/margins": 7.318793296813965, + "rewards/rejected": -4.131716728210449, + "step": 13603 + }, + { + "epoch": 3.4, + "grad_norm": 5.5153913497924805, + "learning_rate": 2.311449142818682e-06, + "logits/chosen": -0.6112977266311646, + "logits/rejected": -0.698600709438324, + "logps/chosen": -59.92147445678711, + "logps/rejected": -92.41240692138672, + "loss": 0.7241, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.239813804626465, + "rewards/margins": 6.529422760009766, + "rewards/rejected": -3.28960919380188, + "step": 13604 + }, + { + "epoch": 3.4, + "grad_norm": 6.106942176818848, + "learning_rate": 2.3107864864673228e-06, + "logits/chosen": -0.5221525430679321, + "logits/rejected": -0.5918135643005371, + "logps/chosen": -49.29632568359375, + "logps/rejected": -108.86631774902344, + "loss": 0.6127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1788489818573, + "rewards/margins": 7.422591209411621, + "rewards/rejected": -4.2437424659729, + "step": 13605 + }, + { + "epoch": 3.4, + "grad_norm": 2.5488452911376953, + "learning_rate": 2.31012389656929e-06, + "logits/chosen": -0.5434374809265137, + "logits/rejected": -0.6412670016288757, + "logps/chosen": -49.50840759277344, + "logps/rejected": -86.80479431152344, + "loss": 0.5703, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1983511447906494, + "rewards/margins": 6.821836948394775, + "rewards/rejected": -3.623485565185547, + "step": 13606 + }, + { + "epoch": 3.4, + "grad_norm": 5.50641393661499, + "learning_rate": 2.3094613731409552e-06, + "logits/chosen": -0.5300322771072388, + "logits/rejected": -0.6221379637718201, + "logps/chosen": -66.4120864868164, + "logps/rejected": -89.5341796875, + "loss": 0.6857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0619168281555176, + "rewards/margins": 6.32595157623291, + "rewards/rejected": -3.2640345096588135, + "step": 13607 + }, + { + "epoch": 3.4, + "grad_norm": 3.65820050239563, + "learning_rate": 2.3087989161986933e-06, + "logits/chosen": -0.5660380125045776, + "logits/rejected": -0.6475786566734314, + "logps/chosen": -59.840919494628906, + "logps/rejected": -98.83657836914062, + "loss": 0.6887, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1002087593078613, + "rewards/margins": 7.281157493591309, + "rewards/rejected": -4.1809492111206055, + "step": 13608 + }, + { + "epoch": 3.4, + "grad_norm": 2.7862610816955566, + "learning_rate": 2.3081365257588706e-06, + "logits/chosen": -0.5250759720802307, + "logits/rejected": -0.6487878561019897, + "logps/chosen": -48.95987319946289, + "logps/rejected": -84.1751708984375, + "loss": 0.5548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.549973726272583, + "rewards/margins": 7.457049369812012, + "rewards/rejected": -3.9070754051208496, + "step": 13609 + }, + { + "epoch": 3.4, + "grad_norm": 7.560617923736572, + "learning_rate": 2.30747420183786e-06, + "logits/chosen": -0.5441374778747559, + "logits/rejected": -0.6172875761985779, + "logps/chosen": -54.03816223144531, + "logps/rejected": -96.31217193603516, + "loss": 0.5713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.712811231613159, + "rewards/margins": 6.201779365539551, + "rewards/rejected": -3.488967180252075, + "step": 13610 + }, + { + "epoch": 3.4, + "grad_norm": 2.9510066509246826, + "learning_rate": 2.3068119444520253e-06, + "logits/chosen": -0.4720615744590759, + "logits/rejected": -0.5477958917617798, + "logps/chosen": -66.98065948486328, + "logps/rejected": -91.36306762695312, + "loss": 0.6997, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.219053268432617, + "rewards/margins": 6.490818023681641, + "rewards/rejected": -3.2717649936676025, + "step": 13611 + }, + { + "epoch": 3.41, + "grad_norm": 4.014756679534912, + "learning_rate": 2.3061497536177303e-06, + "logits/chosen": -0.5494657754898071, + "logits/rejected": -0.6053547859191895, + "logps/chosen": -57.79649353027344, + "logps/rejected": -94.4249267578125, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1739320755004883, + "rewards/margins": 6.588762283325195, + "rewards/rejected": -3.414830207824707, + "step": 13612 + }, + { + "epoch": 3.41, + "grad_norm": 7.612861633300781, + "learning_rate": 2.3054876293513418e-06, + "logits/chosen": -0.5538601279258728, + "logits/rejected": -0.6434810757637024, + "logps/chosen": -50.84516525268555, + "logps/rejected": -79.73499298095703, + "loss": 0.7099, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0560145378112793, + "rewards/margins": 5.614882946014404, + "rewards/rejected": -2.558867931365967, + "step": 13613 + }, + { + "epoch": 3.41, + "grad_norm": 3.667299509048462, + "learning_rate": 2.3048255716692205e-06, + "logits/chosen": -0.5464376211166382, + "logits/rejected": -0.6015732884407043, + "logps/chosen": -51.81629180908203, + "logps/rejected": -109.80712127685547, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.251107692718506, + "rewards/margins": 6.955965995788574, + "rewards/rejected": -3.7048583030700684, + "step": 13614 + }, + { + "epoch": 3.41, + "grad_norm": 4.144808769226074, + "learning_rate": 2.304163580587724e-06, + "logits/chosen": -0.6000544428825378, + "logits/rejected": -0.6836846470832825, + "logps/chosen": -52.1140251159668, + "logps/rejected": -82.65681457519531, + "loss": 0.6979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182141065597534, + "rewards/margins": 6.102960586547852, + "rewards/rejected": -2.9208197593688965, + "step": 13615 + }, + { + "epoch": 3.41, + "grad_norm": 4.195592880249023, + "learning_rate": 2.303501656123212e-06, + "logits/chosen": -0.6009307503700256, + "logits/rejected": -0.6784917116165161, + "logps/chosen": -47.0780143737793, + "logps/rejected": -90.43250274658203, + "loss": 0.6409, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.181981086730957, + "rewards/margins": 5.9597249031066895, + "rewards/rejected": -2.7777442932128906, + "step": 13616 + }, + { + "epoch": 3.41, + "grad_norm": 5.985405445098877, + "learning_rate": 2.302839798292047e-06, + "logits/chosen": -0.5491100549697876, + "logits/rejected": -0.6381949186325073, + "logps/chosen": -56.88131332397461, + "logps/rejected": -98.76323699951172, + "loss": 0.682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094902515411377, + "rewards/margins": 6.67057466506958, + "rewards/rejected": -3.575672149658203, + "step": 13617 + }, + { + "epoch": 3.41, + "grad_norm": 2.5087273120880127, + "learning_rate": 2.302178007110575e-06, + "logits/chosen": -0.5135685205459595, + "logits/rejected": -0.6345958113670349, + "logps/chosen": -56.99262619018555, + "logps/rejected": -96.61713409423828, + "loss": 0.5585, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1944327354431152, + "rewards/margins": 7.512715816497803, + "rewards/rejected": -4.3182830810546875, + "step": 13618 + }, + { + "epoch": 3.41, + "grad_norm": 3.430330753326416, + "learning_rate": 2.301516282595155e-06, + "logits/chosen": -0.6469314098358154, + "logits/rejected": -0.7639280557632446, + "logps/chosen": -55.866485595703125, + "logps/rejected": -112.1705322265625, + "loss": 0.6469, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081228256225586, + "rewards/margins": 8.776372909545898, + "rewards/rejected": -5.695144176483154, + "step": 13619 + }, + { + "epoch": 3.41, + "grad_norm": 7.831273555755615, + "learning_rate": 2.3008546247621387e-06, + "logits/chosen": -0.5533922910690308, + "logits/rejected": -0.6421890258789062, + "logps/chosen": -47.227821350097656, + "logps/rejected": -95.41544342041016, + "loss": 0.5533, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4716179370880127, + "rewards/margins": 6.59089994430542, + "rewards/rejected": -3.119281530380249, + "step": 13620 + }, + { + "epoch": 3.41, + "grad_norm": 2.777414560317993, + "learning_rate": 2.300193033627876e-06, + "logits/chosen": -0.5664242506027222, + "logits/rejected": -0.6096892952919006, + "logps/chosen": -53.6767463684082, + "logps/rejected": -117.37866973876953, + "loss": 0.5818, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2049543857574463, + "rewards/margins": 6.6953125, + "rewards/rejected": -3.4903578758239746, + "step": 13621 + }, + { + "epoch": 3.41, + "grad_norm": 11.438675880432129, + "learning_rate": 2.2995315092087127e-06, + "logits/chosen": -0.5653964281082153, + "logits/rejected": -0.6439816355705261, + "logps/chosen": -62.00606155395508, + "logps/rejected": -109.3458480834961, + "loss": 0.6462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8199188709259033, + "rewards/margins": 6.214949607849121, + "rewards/rejected": -3.3950304985046387, + "step": 13622 + }, + { + "epoch": 3.41, + "grad_norm": 5.438332557678223, + "learning_rate": 2.298870051521001e-06, + "logits/chosen": -0.5451772212982178, + "logits/rejected": -0.5974210500717163, + "logps/chosen": -52.79483413696289, + "logps/rejected": -123.9244384765625, + "loss": 0.6404, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7643072605133057, + "rewards/margins": 7.01446533203125, + "rewards/rejected": -4.250158786773682, + "step": 13623 + }, + { + "epoch": 3.41, + "grad_norm": 7.867207050323486, + "learning_rate": 2.2982086605810828e-06, + "logits/chosen": -0.4235524535179138, + "logits/rejected": -0.50584477186203, + "logps/chosen": -61.025306701660156, + "logps/rejected": -97.03776550292969, + "loss": 0.6878, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0973398685455322, + "rewards/margins": 6.554626941680908, + "rewards/rejected": -3.457287073135376, + "step": 13624 + }, + { + "epoch": 3.41, + "grad_norm": 6.503180503845215, + "learning_rate": 2.2975473364053004e-06, + "logits/chosen": -0.561343789100647, + "logits/rejected": -0.6563096642494202, + "logps/chosen": -45.93362045288086, + "logps/rejected": -95.3058853149414, + "loss": 0.5958, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1892237663269043, + "rewards/margins": 6.710384368896484, + "rewards/rejected": -3.52116060256958, + "step": 13625 + }, + { + "epoch": 3.41, + "grad_norm": 11.2473726272583, + "learning_rate": 2.2968860790099996e-06, + "logits/chosen": -0.5130411386489868, + "logits/rejected": -0.5173718333244324, + "logps/chosen": -53.13896179199219, + "logps/rejected": -111.25227355957031, + "loss": 0.6397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9773354530334473, + "rewards/margins": 6.255219459533691, + "rewards/rejected": -3.277884006500244, + "step": 13626 + }, + { + "epoch": 3.41, + "grad_norm": 2.440840005874634, + "learning_rate": 2.296224888411517e-06, + "logits/chosen": -0.5834088921546936, + "logits/rejected": -0.6555699706077576, + "logps/chosen": -52.23344421386719, + "logps/rejected": -109.67547607421875, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3524394035339355, + "rewards/margins": 8.292397499084473, + "rewards/rejected": -4.939957618713379, + "step": 13627 + }, + { + "epoch": 3.41, + "grad_norm": 16.683351516723633, + "learning_rate": 2.2955637646261957e-06, + "logits/chosen": -0.572851300239563, + "logits/rejected": -0.6821990609169006, + "logps/chosen": -55.70768737792969, + "logps/rejected": -105.342041015625, + "loss": 0.6078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0306222438812256, + "rewards/margins": 8.37255573272705, + "rewards/rejected": -5.341933250427246, + "step": 13628 + }, + { + "epoch": 3.41, + "grad_norm": 3.918811559677124, + "learning_rate": 2.29490270767037e-06, + "logits/chosen": -0.5506103038787842, + "logits/rejected": -0.63051438331604, + "logps/chosen": -55.3262825012207, + "logps/rejected": -126.20166015625, + "loss": 0.5357, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.224170684814453, + "rewards/margins": 8.654311180114746, + "rewards/rejected": -5.430141448974609, + "step": 13629 + }, + { + "epoch": 3.41, + "grad_norm": 4.19811487197876, + "learning_rate": 2.2942417175603737e-06, + "logits/chosen": -0.6470695734024048, + "logits/rejected": -0.6555308699607849, + "logps/chosen": -56.052024841308594, + "logps/rejected": -117.4716567993164, + "loss": 0.6915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.842984676361084, + "rewards/margins": 6.0948076248168945, + "rewards/rejected": -3.2518227100372314, + "step": 13630 + }, + { + "epoch": 3.41, + "grad_norm": 4.178468704223633, + "learning_rate": 2.293580794312545e-06, + "logits/chosen": -0.5637970566749573, + "logits/rejected": -0.6426479816436768, + "logps/chosen": -47.99110412597656, + "logps/rejected": -104.25050354003906, + "loss": 0.6366, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4420242309570312, + "rewards/margins": 6.459444522857666, + "rewards/rejected": -3.0174202919006348, + "step": 13631 + }, + { + "epoch": 3.41, + "grad_norm": 10.244032859802246, + "learning_rate": 2.292919937943211e-06, + "logits/chosen": -0.5047551393508911, + "logits/rejected": -0.6242486238479614, + "logps/chosen": -63.834693908691406, + "logps/rejected": -91.60262298583984, + "loss": 0.6429, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5510382652282715, + "rewards/margins": 6.813103675842285, + "rewards/rejected": -3.2620651721954346, + "step": 13632 + }, + { + "epoch": 3.41, + "grad_norm": 4.895031929016113, + "learning_rate": 2.2922591484687066e-06, + "logits/chosen": -0.49814942479133606, + "logits/rejected": -0.5801197290420532, + "logps/chosen": -58.541908264160156, + "logps/rejected": -103.99211120605469, + "loss": 0.6669, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2033193111419678, + "rewards/margins": 7.5122480392456055, + "rewards/rejected": -4.308928489685059, + "step": 13633 + }, + { + "epoch": 3.41, + "grad_norm": 3.7734596729278564, + "learning_rate": 2.291598425905357e-06, + "logits/chosen": -0.5701077580451965, + "logits/rejected": -0.624794602394104, + "logps/chosen": -51.25310134887695, + "logps/rejected": -116.93548583984375, + "loss": 0.6476, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.218515396118164, + "rewards/margins": 7.57373571395874, + "rewards/rejected": -4.355220317840576, + "step": 13634 + }, + { + "epoch": 3.41, + "grad_norm": 12.976518630981445, + "learning_rate": 2.290937770269493e-06, + "logits/chosen": -0.5475057363510132, + "logits/rejected": -0.653091311454773, + "logps/chosen": -60.37094497680664, + "logps/rejected": -122.08065032958984, + "loss": 0.5955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9494197368621826, + "rewards/margins": 7.290709018707275, + "rewards/rejected": -4.341289043426514, + "step": 13635 + }, + { + "epoch": 3.41, + "grad_norm": 4.034123420715332, + "learning_rate": 2.2902771815774385e-06, + "logits/chosen": -0.6148896217346191, + "logits/rejected": -0.6686050891876221, + "logps/chosen": -46.959495544433594, + "logps/rejected": -108.78990936279297, + "loss": 0.5286, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.952305555343628, + "rewards/margins": 6.498854637145996, + "rewards/rejected": -3.546548843383789, + "step": 13636 + }, + { + "epoch": 3.41, + "grad_norm": 3.278289556503296, + "learning_rate": 2.2896166598455143e-06, + "logits/chosen": -0.49956023693084717, + "logits/rejected": -0.5595558881759644, + "logps/chosen": -56.35091781616211, + "logps/rejected": -120.86817932128906, + "loss": 0.5848, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.88293194770813, + "rewards/margins": 7.080310821533203, + "rewards/rejected": -4.197378635406494, + "step": 13637 + }, + { + "epoch": 3.41, + "grad_norm": 11.204891204833984, + "learning_rate": 2.2889562050900484e-06, + "logits/chosen": -0.568691074848175, + "logits/rejected": -0.6661586761474609, + "logps/chosen": -59.75282287597656, + "logps/rejected": -92.79496002197266, + "loss": 0.7011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.214632511138916, + "rewards/margins": 6.185892581939697, + "rewards/rejected": -2.9712603092193604, + "step": 13638 + }, + { + "epoch": 3.41, + "grad_norm": 3.73665714263916, + "learning_rate": 2.288295817327357e-06, + "logits/chosen": -0.46783769130706787, + "logits/rejected": -0.5630402565002441, + "logps/chosen": -55.57378387451172, + "logps/rejected": -97.24581909179688, + "loss": 0.5158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.296755790710449, + "rewards/margins": 6.820860385894775, + "rewards/rejected": -3.524104595184326, + "step": 13639 + }, + { + "epoch": 3.41, + "grad_norm": 4.3366498947143555, + "learning_rate": 2.287635496573759e-06, + "logits/chosen": -0.5152698159217834, + "logits/rejected": -0.6176847815513611, + "logps/chosen": -54.63411331176758, + "logps/rejected": -100.96186065673828, + "loss": 0.6452, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1744794845581055, + "rewards/margins": 6.804736614227295, + "rewards/rejected": -3.6302571296691895, + "step": 13640 + }, + { + "epoch": 3.41, + "grad_norm": 5.527562141418457, + "learning_rate": 2.286975242845575e-06, + "logits/chosen": -0.5190989971160889, + "logits/rejected": -0.5640597343444824, + "logps/chosen": -68.02812194824219, + "logps/rejected": -121.35588073730469, + "loss": 0.6615, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9462532997131348, + "rewards/margins": 5.953329086303711, + "rewards/rejected": -3.0070760250091553, + "step": 13641 + }, + { + "epoch": 3.41, + "grad_norm": 3.468374490737915, + "learning_rate": 2.286315056159118e-06, + "logits/chosen": -0.5313510298728943, + "logits/rejected": -0.5844171643257141, + "logps/chosen": -48.215293884277344, + "logps/rejected": -115.29553985595703, + "loss": 0.5474, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0413379669189453, + "rewards/margins": 7.979851722717285, + "rewards/rejected": -4.93851375579834, + "step": 13642 + }, + { + "epoch": 3.41, + "grad_norm": 13.852287292480469, + "learning_rate": 2.285654936530701e-06, + "logits/chosen": -0.6067829132080078, + "logits/rejected": -0.6397271156311035, + "logps/chosen": -58.976654052734375, + "logps/rejected": -111.54083251953125, + "loss": 0.7069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.3304383754730225, + "rewards/margins": 6.303159713745117, + "rewards/rejected": -3.9727213382720947, + "step": 13643 + }, + { + "epoch": 3.41, + "grad_norm": 2.84476900100708, + "learning_rate": 2.2849948839766374e-06, + "logits/chosen": -0.54679274559021, + "logits/rejected": -0.6153669953346252, + "logps/chosen": -53.61267852783203, + "logps/rejected": -126.52180480957031, + "loss": 0.5667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.300203561782837, + "rewards/margins": 8.47025203704834, + "rewards/rejected": -5.170048713684082, + "step": 13644 + }, + { + "epoch": 3.41, + "grad_norm": 8.085000038146973, + "learning_rate": 2.2843348985132415e-06, + "logits/chosen": -0.5737036466598511, + "logits/rejected": -0.6754471659660339, + "logps/chosen": -60.44167709350586, + "logps/rejected": -96.31390380859375, + "loss": 0.6557, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7842118740081787, + "rewards/margins": 5.9512457847595215, + "rewards/rejected": -3.1670336723327637, + "step": 13645 + }, + { + "epoch": 3.41, + "grad_norm": 3.3307039737701416, + "learning_rate": 2.283674980156815e-06, + "logits/chosen": -0.5325659513473511, + "logits/rejected": -0.6222357749938965, + "logps/chosen": -44.07170486450195, + "logps/rejected": -97.41337585449219, + "loss": 0.5345, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.053175210952759, + "rewards/margins": 7.4641032218933105, + "rewards/rejected": -4.410928249359131, + "step": 13646 + }, + { + "epoch": 3.41, + "grad_norm": 2.36279559135437, + "learning_rate": 2.283015128923669e-06, + "logits/chosen": -0.535800039768219, + "logits/rejected": -0.5936145186424255, + "logps/chosen": -43.752464294433594, + "logps/rejected": -106.21855163574219, + "loss": 0.4815, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0046098232269287, + "rewards/margins": 8.157943725585938, + "rewards/rejected": -5.153334617614746, + "step": 13647 + }, + { + "epoch": 3.41, + "grad_norm": 8.085493087768555, + "learning_rate": 2.282355344830111e-06, + "logits/chosen": -0.569240152835846, + "logits/rejected": -0.6066656112670898, + "logps/chosen": -51.521358489990234, + "logps/rejected": -110.96647644042969, + "loss": 0.6358, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.148477792739868, + "rewards/margins": 6.626548767089844, + "rewards/rejected": -3.478071451187134, + "step": 13648 + }, + { + "epoch": 3.41, + "grad_norm": 5.002853870391846, + "learning_rate": 2.2816956278924437e-06, + "logits/chosen": -0.45277172327041626, + "logits/rejected": -0.590848445892334, + "logps/chosen": -63.97643280029297, + "logps/rejected": -95.57598876953125, + "loss": 0.6988, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0737338066101074, + "rewards/margins": 6.0471930503845215, + "rewards/rejected": -2.973459243774414, + "step": 13649 + }, + { + "epoch": 3.41, + "grad_norm": 6.247041702270508, + "learning_rate": 2.2810359781269657e-06, + "logits/chosen": -0.5659117698669434, + "logits/rejected": -0.6612887978553772, + "logps/chosen": -58.376670837402344, + "logps/rejected": -107.60897064208984, + "loss": 0.593, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.35905122756958, + "rewards/margins": 7.285396099090576, + "rewards/rejected": -3.9263453483581543, + "step": 13650 + }, + { + "epoch": 3.41, + "grad_norm": 3.2377307415008545, + "learning_rate": 2.2803763955499834e-06, + "logits/chosen": -0.44305241107940674, + "logits/rejected": -0.5614508986473083, + "logps/chosen": -51.07286071777344, + "logps/rejected": -87.35043334960938, + "loss": 0.5983, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2388529777526855, + "rewards/margins": 6.510110378265381, + "rewards/rejected": -3.2712574005126953, + "step": 13651 + }, + { + "epoch": 3.42, + "grad_norm": 4.371204376220703, + "learning_rate": 2.279716880177791e-06, + "logits/chosen": -0.5519988536834717, + "logits/rejected": -0.6770634651184082, + "logps/chosen": -55.94318389892578, + "logps/rejected": -94.59999084472656, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1757898330688477, + "rewards/margins": 7.57302713394165, + "rewards/rejected": -4.3972368240356445, + "step": 13652 + }, + { + "epoch": 3.42, + "grad_norm": 3.6816608905792236, + "learning_rate": 2.27905743202669e-06, + "logits/chosen": -0.6012505292892456, + "logits/rejected": -0.6194519996643066, + "logps/chosen": -56.50180435180664, + "logps/rejected": -120.38671875, + "loss": 0.7005, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1243526935577393, + "rewards/margins": 6.320509433746338, + "rewards/rejected": -3.1961567401885986, + "step": 13653 + }, + { + "epoch": 3.42, + "grad_norm": 2.3513972759246826, + "learning_rate": 2.2783980511129738e-06, + "logits/chosen": -0.5391473174095154, + "logits/rejected": -0.6502649188041687, + "logps/chosen": -47.82666778564453, + "logps/rejected": -99.06028747558594, + "loss": 0.5255, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.784526824951172, + "rewards/margins": 7.242914199829102, + "rewards/rejected": -4.45838737487793, + "step": 13654 + }, + { + "epoch": 3.42, + "grad_norm": 5.781332969665527, + "learning_rate": 2.277738737452935e-06, + "logits/chosen": -0.5754993557929993, + "logits/rejected": -0.6732444763183594, + "logps/chosen": -68.21483612060547, + "logps/rejected": -106.6506576538086, + "loss": 0.6814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2053115367889404, + "rewards/margins": 6.488218307495117, + "rewards/rejected": -3.2829067707061768, + "step": 13655 + }, + { + "epoch": 3.42, + "grad_norm": 4.845006942749023, + "learning_rate": 2.2770794910628685e-06, + "logits/chosen": -0.4997630715370178, + "logits/rejected": -0.5339618921279907, + "logps/chosen": -42.93341827392578, + "logps/rejected": -99.16072845458984, + "loss": 0.5481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2074978351593018, + "rewards/margins": 6.139082908630371, + "rewards/rejected": -2.9315850734710693, + "step": 13656 + }, + { + "epoch": 3.42, + "grad_norm": 6.712369441986084, + "learning_rate": 2.2764203119590645e-06, + "logits/chosen": -0.4842127561569214, + "logits/rejected": -0.5125792026519775, + "logps/chosen": -56.69811248779297, + "logps/rejected": -93.58859252929688, + "loss": 0.8942, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8136587142944336, + "rewards/margins": 4.606361389160156, + "rewards/rejected": -1.7927031517028809, + "step": 13657 + }, + { + "epoch": 3.42, + "grad_norm": 9.813654899597168, + "learning_rate": 2.27576120015781e-06, + "logits/chosen": -0.569534182548523, + "logits/rejected": -0.6278281211853027, + "logps/chosen": -51.1697883605957, + "logps/rejected": -104.47144317626953, + "loss": 0.6722, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8564977645874023, + "rewards/margins": 6.9617109298706055, + "rewards/rejected": -4.105213165283203, + "step": 13658 + }, + { + "epoch": 3.42, + "grad_norm": 2.647629737854004, + "learning_rate": 2.275102155675393e-06, + "logits/chosen": -0.5607284307479858, + "logits/rejected": -0.5916997790336609, + "logps/chosen": -53.36589431762695, + "logps/rejected": -116.84032440185547, + "loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.136270761489868, + "rewards/margins": 6.911657810211182, + "rewards/rejected": -3.7753870487213135, + "step": 13659 + }, + { + "epoch": 3.42, + "grad_norm": 5.305633068084717, + "learning_rate": 2.274443178528105e-06, + "logits/chosen": -0.5610754489898682, + "logits/rejected": -0.6276958584785461, + "logps/chosen": -46.33075714111328, + "logps/rejected": -104.24332427978516, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0556585788726807, + "rewards/margins": 6.735557556152344, + "rewards/rejected": -3.679898738861084, + "step": 13660 + }, + { + "epoch": 3.42, + "grad_norm": 3.0693259239196777, + "learning_rate": 2.273784268732221e-06, + "logits/chosen": -0.5605512857437134, + "logits/rejected": -0.6140967011451721, + "logps/chosen": -51.748661041259766, + "logps/rejected": -109.82122039794922, + "loss": 0.5643, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3311891555786133, + "rewards/margins": 6.02724027633667, + "rewards/rejected": -2.6960508823394775, + "step": 13661 + }, + { + "epoch": 3.42, + "grad_norm": 2.800435781478882, + "learning_rate": 2.273125426304027e-06, + "logits/chosen": -0.5329661965370178, + "logits/rejected": -0.5788566470146179, + "logps/chosen": -61.5571174621582, + "logps/rejected": -118.1773681640625, + "loss": 0.6119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1530601978302, + "rewards/margins": 7.839923858642578, + "rewards/rejected": -4.686863899230957, + "step": 13662 + }, + { + "epoch": 3.42, + "grad_norm": 4.557204723358154, + "learning_rate": 2.272466651259806e-06, + "logits/chosen": -0.45979130268096924, + "logits/rejected": -0.5489752888679504, + "logps/chosen": -64.896240234375, + "logps/rejected": -109.90579986572266, + "loss": 0.6898, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.964277744293213, + "rewards/margins": 6.935370445251465, + "rewards/rejected": -3.971092700958252, + "step": 13663 + }, + { + "epoch": 3.42, + "grad_norm": 5.3916335105896, + "learning_rate": 2.2718079436158354e-06, + "logits/chosen": -0.4728023409843445, + "logits/rejected": -0.5653268694877625, + "logps/chosen": -59.960391998291016, + "logps/rejected": -100.2163314819336, + "loss": 0.6338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.805396795272827, + "rewards/margins": 6.8656005859375, + "rewards/rejected": -4.06020450592041, + "step": 13664 + }, + { + "epoch": 3.42, + "grad_norm": 15.389629364013672, + "learning_rate": 2.271149303388391e-06, + "logits/chosen": -0.5179668068885803, + "logits/rejected": -0.6419884562492371, + "logps/chosen": -53.40291976928711, + "logps/rejected": -117.88919830322266, + "loss": 0.6635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.025369644165039, + "rewards/margins": 8.119421005249023, + "rewards/rejected": -5.094051837921143, + "step": 13665 + }, + { + "epoch": 3.42, + "grad_norm": 5.927828788757324, + "learning_rate": 2.2704907305937508e-06, + "logits/chosen": -0.594627857208252, + "logits/rejected": -0.6784518361091614, + "logps/chosen": -54.11177062988281, + "logps/rejected": -92.76824188232422, + "loss": 0.6614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.239767551422119, + "rewards/margins": 7.222836494445801, + "rewards/rejected": -3.9830691814422607, + "step": 13666 + }, + { + "epoch": 3.42, + "grad_norm": 2.571913242340088, + "learning_rate": 2.269832225248188e-06, + "logits/chosen": -0.5768060684204102, + "logits/rejected": -0.6857783794403076, + "logps/chosen": -53.823795318603516, + "logps/rejected": -95.65567779541016, + "loss": 0.5492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.293508529663086, + "rewards/margins": 7.405097961425781, + "rewards/rejected": -4.1115899085998535, + "step": 13667 + }, + { + "epoch": 3.42, + "grad_norm": 10.325481414794922, + "learning_rate": 2.2691737873679737e-06, + "logits/chosen": -0.5953554511070251, + "logits/rejected": -0.6144605278968811, + "logps/chosen": -50.523826599121094, + "logps/rejected": -84.2474365234375, + "loss": 0.6531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0688490867614746, + "rewards/margins": 5.072696685791016, + "rewards/rejected": -2.003847360610962, + "step": 13668 + }, + { + "epoch": 3.42, + "grad_norm": 5.578949451446533, + "learning_rate": 2.2685154169693813e-06, + "logits/chosen": -0.5150470733642578, + "logits/rejected": -0.5985329747200012, + "logps/chosen": -45.34906768798828, + "logps/rejected": -110.2457046508789, + "loss": 0.6094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.01076078414917, + "rewards/margins": 6.351073741912842, + "rewards/rejected": -3.3403124809265137, + "step": 13669 + }, + { + "epoch": 3.42, + "grad_norm": 30.787527084350586, + "learning_rate": 2.267857114068678e-06, + "logits/chosen": -0.5226746797561646, + "logits/rejected": -0.6176668405532837, + "logps/chosen": -58.57196044921875, + "logps/rejected": -93.8944320678711, + "loss": 0.768, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9311842918395996, + "rewards/margins": 6.128381729125977, + "rewards/rejected": -3.197197198867798, + "step": 13670 + }, + { + "epoch": 3.42, + "grad_norm": 14.627331733703613, + "learning_rate": 2.2671988786821296e-06, + "logits/chosen": -0.4958444833755493, + "logits/rejected": -0.5913020968437195, + "logps/chosen": -62.698707580566406, + "logps/rejected": -92.05443572998047, + "loss": 0.7621, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8550422191619873, + "rewards/margins": 5.912519454956055, + "rewards/rejected": -3.0574774742126465, + "step": 13671 + }, + { + "epoch": 3.42, + "grad_norm": 55.731101989746094, + "learning_rate": 2.2665407108260057e-06, + "logits/chosen": -0.5457739233970642, + "logits/rejected": -0.517322301864624, + "logps/chosen": -45.8894157409668, + "logps/rejected": -129.93275451660156, + "loss": 0.6388, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.829838752746582, + "rewards/margins": 7.840773105621338, + "rewards/rejected": -5.010934829711914, + "step": 13672 + }, + { + "epoch": 3.42, + "grad_norm": 3.2003870010375977, + "learning_rate": 2.2658826105165664e-06, + "logits/chosen": -0.5229209065437317, + "logits/rejected": -0.5923669338226318, + "logps/chosen": -49.845603942871094, + "logps/rejected": -114.75146484375, + "loss": 0.559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.422089099884033, + "rewards/margins": 7.3969221115112305, + "rewards/rejected": -3.974832534790039, + "step": 13673 + }, + { + "epoch": 3.42, + "grad_norm": 2.7199466228485107, + "learning_rate": 2.2652245777700775e-06, + "logits/chosen": -0.504614531993866, + "logits/rejected": -0.5555856227874756, + "logps/chosen": -58.99119567871094, + "logps/rejected": -111.99823760986328, + "loss": 0.6176, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2849106788635254, + "rewards/margins": 6.614406585693359, + "rewards/rejected": -3.3294966220855713, + "step": 13674 + }, + { + "epoch": 3.42, + "grad_norm": 3.185119867324829, + "learning_rate": 2.2645666126027964e-06, + "logits/chosen": -0.5062380433082581, + "logits/rejected": -0.6145744919776917, + "logps/chosen": -65.13449096679688, + "logps/rejected": -93.14398193359375, + "loss": 0.6683, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2129766941070557, + "rewards/margins": 6.284267902374268, + "rewards/rejected": -3.07129168510437, + "step": 13675 + }, + { + "epoch": 3.42, + "grad_norm": 3.829512596130371, + "learning_rate": 2.263908715030986e-06, + "logits/chosen": -0.5216342210769653, + "logits/rejected": -0.6086568236351013, + "logps/chosen": -55.90801239013672, + "logps/rejected": -98.20555114746094, + "loss": 0.612, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1197731494903564, + "rewards/margins": 7.166697978973389, + "rewards/rejected": -4.046924114227295, + "step": 13676 + }, + { + "epoch": 3.42, + "grad_norm": 3.042733907699585, + "learning_rate": 2.2632508850708996e-06, + "logits/chosen": -0.6002362966537476, + "logits/rejected": -0.6894235014915466, + "logps/chosen": -47.21073532104492, + "logps/rejected": -116.01160430908203, + "loss": 0.5242, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1380116939544678, + "rewards/margins": 8.981164932250977, + "rewards/rejected": -5.84315299987793, + "step": 13677 + }, + { + "epoch": 3.42, + "grad_norm": 16.925003051757812, + "learning_rate": 2.2625931227387965e-06, + "logits/chosen": -0.5221303105354309, + "logits/rejected": -0.5745360851287842, + "logps/chosen": -68.160888671875, + "logps/rejected": -117.0732650756836, + "loss": 0.7413, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6898281574249268, + "rewards/margins": 6.92021369934082, + "rewards/rejected": -4.230384826660156, + "step": 13678 + }, + { + "epoch": 3.42, + "grad_norm": 15.381103515625, + "learning_rate": 2.2619354280509288e-06, + "logits/chosen": -0.5653080940246582, + "logits/rejected": -0.6364169716835022, + "logps/chosen": -48.566734313964844, + "logps/rejected": -90.8729019165039, + "loss": 0.6596, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.961416721343994, + "rewards/margins": 5.547271728515625, + "rewards/rejected": -2.5858545303344727, + "step": 13679 + }, + { + "epoch": 3.42, + "grad_norm": 6.690812110900879, + "learning_rate": 2.2612778010235476e-06, + "logits/chosen": -0.5439838171005249, + "logits/rejected": -0.6100795269012451, + "logps/chosen": -54.08097457885742, + "logps/rejected": -120.842041015625, + "loss": 0.67, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0144381523132324, + "rewards/margins": 8.271475791931152, + "rewards/rejected": -5.257037162780762, + "step": 13680 + }, + { + "epoch": 3.42, + "grad_norm": 6.353133201599121, + "learning_rate": 2.2606202416729063e-06, + "logits/chosen": -0.5798433423042297, + "logits/rejected": -0.6507328152656555, + "logps/chosen": -55.10131072998047, + "logps/rejected": -107.33058166503906, + "loss": 0.6865, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2072577476501465, + "rewards/margins": 7.197412967681885, + "rewards/rejected": -3.9901554584503174, + "step": 13681 + }, + { + "epoch": 3.42, + "grad_norm": 1.9869611263275146, + "learning_rate": 2.2599627500152527e-06, + "logits/chosen": -0.5051064491271973, + "logits/rejected": -0.667799711227417, + "logps/chosen": -68.11003112792969, + "logps/rejected": -89.76771545410156, + "loss": 0.5667, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8369455337524414, + "rewards/margins": 7.702439785003662, + "rewards/rejected": -3.8654942512512207, + "step": 13682 + }, + { + "epoch": 3.42, + "grad_norm": 9.411698341369629, + "learning_rate": 2.2593053260668323e-06, + "logits/chosen": -0.5719618201255798, + "logits/rejected": -0.6654407978057861, + "logps/chosen": -51.9739875793457, + "logps/rejected": -100.75904846191406, + "loss": 0.6167, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.119004011154175, + "rewards/margins": 7.047486305236816, + "rewards/rejected": -3.9284822940826416, + "step": 13683 + }, + { + "epoch": 3.42, + "grad_norm": 3.8063604831695557, + "learning_rate": 2.258647969843894e-06, + "logits/chosen": -0.5451539158821106, + "logits/rejected": -0.6512227654457092, + "logps/chosen": -49.8275032043457, + "logps/rejected": -94.44230651855469, + "loss": 0.5599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2466301918029785, + "rewards/margins": 6.747618198394775, + "rewards/rejected": -3.500988006591797, + "step": 13684 + }, + { + "epoch": 3.42, + "grad_norm": 5.347836494445801, + "learning_rate": 2.2579906813626807e-06, + "logits/chosen": -0.5433008074760437, + "logits/rejected": -0.6810254454612732, + "logps/chosen": -57.75357437133789, + "logps/rejected": -109.01803588867188, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.01094126701355, + "rewards/margins": 8.339559555053711, + "rewards/rejected": -5.32861852645874, + "step": 13685 + }, + { + "epoch": 3.42, + "grad_norm": 2.2576606273651123, + "learning_rate": 2.257333460639432e-06, + "logits/chosen": -0.6053270101547241, + "logits/rejected": -0.7141969203948975, + "logps/chosen": -47.56712341308594, + "logps/rejected": -99.28852844238281, + "loss": 0.5443, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.18346905708313, + "rewards/margins": 8.279328346252441, + "rewards/rejected": -5.095859527587891, + "step": 13686 + }, + { + "epoch": 3.42, + "grad_norm": 27.26010513305664, + "learning_rate": 2.2566763076903908e-06, + "logits/chosen": -0.5429738163948059, + "logits/rejected": -0.6562444567680359, + "logps/chosen": -48.41108703613281, + "logps/rejected": -91.89631652832031, + "loss": 0.5888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0391998291015625, + "rewards/margins": 7.858555793762207, + "rewards/rejected": -4.8193559646606445, + "step": 13687 + }, + { + "epoch": 3.42, + "grad_norm": 10.053417205810547, + "learning_rate": 2.2560192225317996e-06, + "logits/chosen": -0.5391049385070801, + "logits/rejected": -0.6321283578872681, + "logps/chosen": -64.62151336669922, + "logps/rejected": -93.34567260742188, + "loss": 0.7173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0999510288238525, + "rewards/margins": 5.501692295074463, + "rewards/rejected": -2.4017415046691895, + "step": 13688 + }, + { + "epoch": 3.42, + "grad_norm": 6.790510177612305, + "learning_rate": 2.2553622051798887e-06, + "logits/chosen": -0.48567572236061096, + "logits/rejected": -0.5573335886001587, + "logps/chosen": -47.630584716796875, + "logps/rejected": -98.6493148803711, + "loss": 0.6275, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0015976428985596, + "rewards/margins": 6.324079513549805, + "rewards/rejected": -3.322481870651245, + "step": 13689 + }, + { + "epoch": 3.42, + "grad_norm": 6.629221439361572, + "learning_rate": 2.2547052556508964e-06, + "logits/chosen": -0.5629168152809143, + "logits/rejected": -0.6530746221542358, + "logps/chosen": -50.4378547668457, + "logps/rejected": -100.32797241210938, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6465940475463867, + "rewards/margins": 6.598411560058594, + "rewards/rejected": -3.9518182277679443, + "step": 13690 + }, + { + "epoch": 3.42, + "grad_norm": 19.472217559814453, + "learning_rate": 2.2540483739610592e-06, + "logits/chosen": -0.6109067797660828, + "logits/rejected": -0.6638001799583435, + "logps/chosen": -60.17772674560547, + "logps/rejected": -111.12146759033203, + "loss": 0.8973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.609811305999756, + "rewards/margins": 6.478774547576904, + "rewards/rejected": -3.868962526321411, + "step": 13691 + }, + { + "epoch": 3.43, + "grad_norm": 3.4921984672546387, + "learning_rate": 2.2533915601266075e-06, + "logits/chosen": -0.5619661808013916, + "logits/rejected": -0.6372824907302856, + "logps/chosen": -51.2809944152832, + "logps/rejected": -104.23665618896484, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.169642686843872, + "rewards/margins": 7.423028945922852, + "rewards/rejected": -4.2533860206604, + "step": 13692 + }, + { + "epoch": 3.43, + "grad_norm": 9.439621925354004, + "learning_rate": 2.25273481416377e-06, + "logits/chosen": -0.5103681087493896, + "logits/rejected": -0.6655480861663818, + "logps/chosen": -68.76575469970703, + "logps/rejected": -82.72438049316406, + "loss": 0.6757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.223806381225586, + "rewards/margins": 6.700403213500977, + "rewards/rejected": -3.4765965938568115, + "step": 13693 + }, + { + "epoch": 3.43, + "grad_norm": 15.431731224060059, + "learning_rate": 2.2520781360887795e-06, + "logits/chosen": -0.5372340679168701, + "logits/rejected": -0.6613640189170837, + "logps/chosen": -58.30500793457031, + "logps/rejected": -105.65348815917969, + "loss": 0.6488, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1445391178131104, + "rewards/margins": 8.549221992492676, + "rewards/rejected": -5.404682636260986, + "step": 13694 + }, + { + "epoch": 3.43, + "grad_norm": 3.264582633972168, + "learning_rate": 2.2514215259178608e-06, + "logits/chosen": -0.5657967329025269, + "logits/rejected": -0.605486273765564, + "logps/chosen": -57.33860778808594, + "logps/rejected": -109.97103118896484, + "loss": 0.584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.158379554748535, + "rewards/margins": 6.064857482910156, + "rewards/rejected": -2.906477451324463, + "step": 13695 + }, + { + "epoch": 3.43, + "grad_norm": 7.46047830581665, + "learning_rate": 2.2507649836672378e-06, + "logits/chosen": -0.5311543345451355, + "logits/rejected": -0.5966765284538269, + "logps/chosen": -53.68635940551758, + "logps/rejected": -99.40682983398438, + "loss": 0.6714, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.77709698677063, + "rewards/margins": 7.045609951019287, + "rewards/rejected": -4.268513202667236, + "step": 13696 + }, + { + "epoch": 3.43, + "grad_norm": 3.3160147666931152, + "learning_rate": 2.2501085093531383e-06, + "logits/chosen": -0.47156503796577454, + "logits/rejected": -0.5932629704475403, + "logps/chosen": -61.13239669799805, + "logps/rejected": -85.05500793457031, + "loss": 0.6383, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.692128896713257, + "rewards/margins": 7.058688163757324, + "rewards/rejected": -3.3665592670440674, + "step": 13697 + }, + { + "epoch": 3.43, + "grad_norm": 5.621368408203125, + "learning_rate": 2.24945210299178e-06, + "logits/chosen": -0.5461084246635437, + "logits/rejected": -0.6002569198608398, + "logps/chosen": -52.74529266357422, + "logps/rejected": -104.25719451904297, + "loss": 0.6591, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0797595977783203, + "rewards/margins": 6.533037185668945, + "rewards/rejected": -3.453277587890625, + "step": 13698 + }, + { + "epoch": 3.43, + "grad_norm": 3.7808945178985596, + "learning_rate": 2.2487957645993887e-06, + "logits/chosen": -0.5702398419380188, + "logits/rejected": -0.6135697960853577, + "logps/chosen": -52.080604553222656, + "logps/rejected": -124.71261596679688, + "loss": 0.6105, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7635393142700195, + "rewards/margins": 7.56528902053833, + "rewards/rejected": -4.801749229431152, + "step": 13699 + }, + { + "epoch": 3.43, + "grad_norm": 8.2465238571167, + "learning_rate": 2.2481394941921795e-06, + "logits/chosen": -0.6035121083259583, + "logits/rejected": -0.6673994064331055, + "logps/chosen": -59.22595977783203, + "logps/rejected": -107.93453216552734, + "loss": 0.5988, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.901505470275879, + "rewards/margins": 7.393561363220215, + "rewards/rejected": -4.492056369781494, + "step": 13700 + }, + { + "epoch": 3.43, + "grad_norm": 21.39596176147461, + "learning_rate": 2.2474832917863694e-06, + "logits/chosen": -0.5262263417243958, + "logits/rejected": -0.5926996469497681, + "logps/chosen": -61.47871780395508, + "logps/rejected": -115.22154235839844, + "loss": 0.6785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.640690565109253, + "rewards/margins": 7.917270660400391, + "rewards/rejected": -5.276580810546875, + "step": 13701 + }, + { + "epoch": 3.43, + "grad_norm": 4.261544227600098, + "learning_rate": 2.246827157398174e-06, + "logits/chosen": -0.5558762550354004, + "logits/rejected": -0.6277458071708679, + "logps/chosen": -48.28096389770508, + "logps/rejected": -103.84307098388672, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1227426528930664, + "rewards/margins": 7.117035388946533, + "rewards/rejected": -3.994292736053467, + "step": 13702 + }, + { + "epoch": 3.43, + "grad_norm": 5.5786452293396, + "learning_rate": 2.2461710910438124e-06, + "logits/chosen": -0.4980999231338501, + "logits/rejected": -0.5269057750701904, + "logps/chosen": -52.678775787353516, + "logps/rejected": -120.17388153076172, + "loss": 0.5787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1428754329681396, + "rewards/margins": 7.78242826461792, + "rewards/rejected": -4.639553070068359, + "step": 13703 + }, + { + "epoch": 3.43, + "grad_norm": 5.13864278793335, + "learning_rate": 2.245515092739488e-06, + "logits/chosen": -0.6119755506515503, + "logits/rejected": -0.687730073928833, + "logps/chosen": -54.85722732543945, + "logps/rejected": -90.95423889160156, + "loss": 0.6494, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9731192588806152, + "rewards/margins": 6.6358866691589355, + "rewards/rejected": -3.6627678871154785, + "step": 13704 + }, + { + "epoch": 3.43, + "grad_norm": 3.4700348377227783, + "learning_rate": 2.2448591625014153e-06, + "logits/chosen": -0.46660780906677246, + "logits/rejected": -0.6070163249969482, + "logps/chosen": -70.58686828613281, + "logps/rejected": -100.79524230957031, + "loss": 0.6564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1010353565216064, + "rewards/margins": 7.656566619873047, + "rewards/rejected": -4.555531978607178, + "step": 13705 + }, + { + "epoch": 3.43, + "grad_norm": 5.362624168395996, + "learning_rate": 2.2442033003458073e-06, + "logits/chosen": -0.5385162234306335, + "logits/rejected": -0.6152961850166321, + "logps/chosen": -54.930171966552734, + "logps/rejected": -105.50946044921875, + "loss": 0.6791, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.379396677017212, + "rewards/margins": 7.010702133178711, + "rewards/rejected": -3.6313059329986572, + "step": 13706 + }, + { + "epoch": 3.43, + "grad_norm": 8.599651336669922, + "learning_rate": 2.2435475062888636e-06, + "logits/chosen": -0.5131211280822754, + "logits/rejected": -0.6075322031974792, + "logps/chosen": -78.28590393066406, + "logps/rejected": -100.32327270507812, + "loss": 0.8417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8431644439697266, + "rewards/margins": 6.307950019836426, + "rewards/rejected": -3.4647860527038574, + "step": 13707 + }, + { + "epoch": 3.43, + "grad_norm": 5.329469203948975, + "learning_rate": 2.242891780346792e-06, + "logits/chosen": -0.518067479133606, + "logits/rejected": -0.5801207423210144, + "logps/chosen": -49.71037292480469, + "logps/rejected": -106.92424774169922, + "loss": 0.5857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0515098571777344, + "rewards/margins": 6.389454364776611, + "rewards/rejected": -3.3379440307617188, + "step": 13708 + }, + { + "epoch": 3.43, + "grad_norm": 4.683425426483154, + "learning_rate": 2.2422361225358e-06, + "logits/chosen": -0.49896618723869324, + "logits/rejected": -0.5763211846351624, + "logps/chosen": -56.10883331298828, + "logps/rejected": -107.21534729003906, + "loss": 0.5971, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.754242181777954, + "rewards/margins": 6.659589767456055, + "rewards/rejected": -3.905348300933838, + "step": 13709 + }, + { + "epoch": 3.43, + "grad_norm": 3.153109550476074, + "learning_rate": 2.2415805328720856e-06, + "logits/chosen": -0.49076908826828003, + "logits/rejected": -0.5486230254173279, + "logps/chosen": -58.794921875, + "logps/rejected": -125.1734390258789, + "loss": 0.6827, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.121523857116699, + "rewards/margins": 6.984485626220703, + "rewards/rejected": -3.862962245941162, + "step": 13710 + }, + { + "epoch": 3.43, + "grad_norm": 13.487043380737305, + "learning_rate": 2.240925011371849e-06, + "logits/chosen": -0.5333102941513062, + "logits/rejected": -0.6056526303291321, + "logps/chosen": -63.75980758666992, + "logps/rejected": -104.57644653320312, + "loss": 0.7261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.072310447692871, + "rewards/margins": 7.371677875518799, + "rewards/rejected": -4.299367427825928, + "step": 13711 + }, + { + "epoch": 3.43, + "grad_norm": 7.214300632476807, + "learning_rate": 2.240269558051292e-06, + "logits/chosen": -0.5118721127510071, + "logits/rejected": -0.5351431369781494, + "logps/chosen": -52.60102081298828, + "logps/rejected": -111.86578369140625, + "loss": 0.6306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9945120811462402, + "rewards/margins": 6.5103302001953125, + "rewards/rejected": -3.5158183574676514, + "step": 13712 + }, + { + "epoch": 3.43, + "grad_norm": 5.250207424163818, + "learning_rate": 2.239614172926609e-06, + "logits/chosen": -0.5729750990867615, + "logits/rejected": -0.5946004986763, + "logps/chosen": -50.423622131347656, + "logps/rejected": -119.71631622314453, + "loss": 0.6052, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1324639320373535, + "rewards/margins": 7.880311012268066, + "rewards/rejected": -4.747847557067871, + "step": 13713 + }, + { + "epoch": 3.43, + "grad_norm": 4.042075157165527, + "learning_rate": 2.2389588560139946e-06, + "logits/chosen": -0.5446614623069763, + "logits/rejected": -0.6263868808746338, + "logps/chosen": -61.632144927978516, + "logps/rejected": -115.5867691040039, + "loss": 0.6656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.022228240966797, + "rewards/margins": 7.682167053222656, + "rewards/rejected": -4.659938812255859, + "step": 13714 + }, + { + "epoch": 3.43, + "grad_norm": 3.3149914741516113, + "learning_rate": 2.2383036073296456e-06, + "logits/chosen": -0.46692654490470886, + "logits/rejected": -0.5771650075912476, + "logps/chosen": -59.02843475341797, + "logps/rejected": -107.86027526855469, + "loss": 0.6219, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4150917530059814, + "rewards/margins": 7.987188816070557, + "rewards/rejected": -4.572096824645996, + "step": 13715 + }, + { + "epoch": 3.43, + "grad_norm": 3.064837694168091, + "learning_rate": 2.2376484268897498e-06, + "logits/chosen": -0.4957329034805298, + "logits/rejected": -0.5604526996612549, + "logps/chosen": -53.80487823486328, + "logps/rejected": -120.51914978027344, + "loss": 0.568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139456272125244, + "rewards/margins": 7.897629261016846, + "rewards/rejected": -4.758172512054443, + "step": 13716 + }, + { + "epoch": 3.43, + "grad_norm": 4.648463249206543, + "learning_rate": 2.236993314710501e-06, + "logits/chosen": -0.44054195284843445, + "logits/rejected": -0.5595875382423401, + "logps/chosen": -70.05085754394531, + "logps/rejected": -103.27545928955078, + "loss": 0.6359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.088705062866211, + "rewards/margins": 7.5339202880859375, + "rewards/rejected": -4.445215702056885, + "step": 13717 + }, + { + "epoch": 3.43, + "grad_norm": 3.468148708343506, + "learning_rate": 2.236338270808085e-06, + "logits/chosen": -0.5835459232330322, + "logits/rejected": -0.6414916515350342, + "logps/chosen": -51.61784362792969, + "logps/rejected": -108.62388610839844, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0751144886016846, + "rewards/margins": 6.442180633544922, + "rewards/rejected": -3.367065906524658, + "step": 13718 + }, + { + "epoch": 3.43, + "grad_norm": 5.914889335632324, + "learning_rate": 2.2356832951986916e-06, + "logits/chosen": -0.5449802875518799, + "logits/rejected": -0.6395103335380554, + "logps/chosen": -56.85859298706055, + "logps/rejected": -110.60912322998047, + "loss": 0.6599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.228928327560425, + "rewards/margins": 7.563487529754639, + "rewards/rejected": -4.334559440612793, + "step": 13719 + }, + { + "epoch": 3.43, + "grad_norm": 6.98983097076416, + "learning_rate": 2.235028387898504e-06, + "logits/chosen": -0.6032465696334839, + "logits/rejected": -0.6821388602256775, + "logps/chosen": -57.517425537109375, + "logps/rejected": -107.9067611694336, + "loss": 0.7052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9227585792541504, + "rewards/margins": 6.132724761962891, + "rewards/rejected": -3.2099666595458984, + "step": 13720 + }, + { + "epoch": 3.43, + "grad_norm": 11.193158149719238, + "learning_rate": 2.2343735489237046e-06, + "logits/chosen": -0.585972249507904, + "logits/rejected": -0.69158536195755, + "logps/chosen": -50.610008239746094, + "logps/rejected": -111.52068328857422, + "loss": 0.6525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.961611270904541, + "rewards/margins": 7.06492805480957, + "rewards/rejected": -4.1033172607421875, + "step": 13721 + }, + { + "epoch": 3.43, + "grad_norm": 3.894483804702759, + "learning_rate": 2.2337187782904777e-06, + "logits/chosen": -0.5246367454528809, + "logits/rejected": -0.6243383288383484, + "logps/chosen": -68.51441955566406, + "logps/rejected": -125.0447769165039, + "loss": 0.62, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7920262813568115, + "rewards/margins": 8.37274169921875, + "rewards/rejected": -5.580714225769043, + "step": 13722 + }, + { + "epoch": 3.43, + "grad_norm": 7.438314437866211, + "learning_rate": 2.233064076015001e-06, + "logits/chosen": -0.5467946529388428, + "logits/rejected": -0.5732454061508179, + "logps/chosen": -51.33561325073242, + "logps/rejected": -97.04780578613281, + "loss": 0.7514, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8490896224975586, + "rewards/margins": 5.752405166625977, + "rewards/rejected": -2.903315544128418, + "step": 13723 + }, + { + "epoch": 3.43, + "grad_norm": 2.9014060497283936, + "learning_rate": 2.232409442113455e-06, + "logits/chosen": -0.5275940299034119, + "logits/rejected": -0.5940997004508972, + "logps/chosen": -64.55693817138672, + "logps/rejected": -103.6368408203125, + "loss": 0.6636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.932614326477051, + "rewards/margins": 7.577092170715332, + "rewards/rejected": -4.644477844238281, + "step": 13724 + }, + { + "epoch": 3.43, + "grad_norm": 3.9143402576446533, + "learning_rate": 2.2317548766020157e-06, + "logits/chosen": -0.5544602274894714, + "logits/rejected": -0.6360334157943726, + "logps/chosen": -39.675758361816406, + "logps/rejected": -104.38318634033203, + "loss": 0.5475, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.235257863998413, + "rewards/margins": 6.373415946960449, + "rewards/rejected": -3.138157844543457, + "step": 13725 + }, + { + "epoch": 3.43, + "grad_norm": 5.781414031982422, + "learning_rate": 2.2311003794968563e-06, + "logits/chosen": -0.48974156379699707, + "logits/rejected": -0.5838963985443115, + "logps/chosen": -69.85009002685547, + "logps/rejected": -110.57766723632812, + "loss": 0.6311, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0543107986450195, + "rewards/margins": 6.781016826629639, + "rewards/rejected": -3.726706027984619, + "step": 13726 + }, + { + "epoch": 3.43, + "grad_norm": 12.099154472351074, + "learning_rate": 2.230445950814153e-06, + "logits/chosen": -0.5404470562934875, + "logits/rejected": -0.607301652431488, + "logps/chosen": -52.39303207397461, + "logps/rejected": -87.81204223632812, + "loss": 0.6191, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4342498779296875, + "rewards/margins": 5.75575065612793, + "rewards/rejected": -2.321500539779663, + "step": 13727 + }, + { + "epoch": 3.43, + "grad_norm": 4.093544006347656, + "learning_rate": 2.229791590570076e-06, + "logits/chosen": -0.4830039441585541, + "logits/rejected": -0.5547226667404175, + "logps/chosen": -64.89282989501953, + "logps/rejected": -95.44116973876953, + "loss": 0.6208, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.991516590118408, + "rewards/margins": 6.282418251037598, + "rewards/rejected": -3.2909011840820312, + "step": 13728 + }, + { + "epoch": 3.43, + "grad_norm": 7.85788106918335, + "learning_rate": 2.229137298780794e-06, + "logits/chosen": -0.4938110411167145, + "logits/rejected": -0.5893337726593018, + "logps/chosen": -65.0533676147461, + "logps/rejected": -109.6771240234375, + "loss": 0.6997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.060807228088379, + "rewards/margins": 6.105939865112305, + "rewards/rejected": -3.0451323986053467, + "step": 13729 + }, + { + "epoch": 3.43, + "grad_norm": 5.4828009605407715, + "learning_rate": 2.228483075462477e-06, + "logits/chosen": -0.5509181022644043, + "logits/rejected": -0.6785322427749634, + "logps/chosen": -53.3558235168457, + "logps/rejected": -89.14363861083984, + "loss": 0.5863, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2781670093536377, + "rewards/margins": 7.216036796569824, + "rewards/rejected": -3.9378700256347656, + "step": 13730 + }, + { + "epoch": 3.43, + "grad_norm": 5.432773590087891, + "learning_rate": 2.227828920631294e-06, + "logits/chosen": -0.5742271542549133, + "logits/rejected": -0.6673697233200073, + "logps/chosen": -45.85218048095703, + "logps/rejected": -96.50070190429688, + "loss": 0.6491, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.123403787612915, + "rewards/margins": 6.890172958374023, + "rewards/rejected": -3.7667694091796875, + "step": 13731 + }, + { + "epoch": 3.44, + "grad_norm": 13.800748825073242, + "learning_rate": 2.2271748343034034e-06, + "logits/chosen": -0.5138818025588989, + "logits/rejected": -0.5776988863945007, + "logps/chosen": -51.86616516113281, + "logps/rejected": -104.12403869628906, + "loss": 0.587, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2406694889068604, + "rewards/margins": 6.9271392822265625, + "rewards/rejected": -3.686469554901123, + "step": 13732 + }, + { + "epoch": 3.44, + "grad_norm": 4.17859411239624, + "learning_rate": 2.226520816494972e-06, + "logits/chosen": -0.5005207061767578, + "logits/rejected": -0.5907238125801086, + "logps/chosen": -62.43025588989258, + "logps/rejected": -92.58000183105469, + "loss": 0.5636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904712677001953, + "rewards/margins": 6.109029769897461, + "rewards/rejected": -3.204317092895508, + "step": 13733 + }, + { + "epoch": 3.44, + "grad_norm": 2.976717948913574, + "learning_rate": 2.2258668672221635e-06, + "logits/chosen": -0.5809158086776733, + "logits/rejected": -0.6204611659049988, + "logps/chosen": -46.96348571777344, + "logps/rejected": -109.74176025390625, + "loss": 0.5413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1907472610473633, + "rewards/margins": 7.119138240814209, + "rewards/rejected": -3.9283909797668457, + "step": 13734 + }, + { + "epoch": 3.44, + "grad_norm": 4.717531681060791, + "learning_rate": 2.225212986501135e-06, + "logits/chosen": -0.6163350343704224, + "logits/rejected": -0.6844965815544128, + "logps/chosen": -55.755279541015625, + "logps/rejected": -116.8073501586914, + "loss": 0.6208, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5281457901000977, + "rewards/margins": 7.879347801208496, + "rewards/rejected": -4.351202011108398, + "step": 13735 + }, + { + "epoch": 3.44, + "grad_norm": 11.56675910949707, + "learning_rate": 2.224559174348043e-06, + "logits/chosen": -0.48131006956100464, + "logits/rejected": -0.5735641121864319, + "logps/chosen": -61.674312591552734, + "logps/rejected": -84.9786376953125, + "loss": 0.7048, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0302977561950684, + "rewards/margins": 5.804537773132324, + "rewards/rejected": -2.7742397785186768, + "step": 13736 + }, + { + "epoch": 3.44, + "grad_norm": 7.334779739379883, + "learning_rate": 2.223905430779048e-06, + "logits/chosen": -0.5315039753913879, + "logits/rejected": -0.6030463576316833, + "logps/chosen": -51.4903678894043, + "logps/rejected": -94.56289672851562, + "loss": 0.6792, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.878265380859375, + "rewards/margins": 5.6120195388793945, + "rewards/rejected": -2.7337546348571777, + "step": 13737 + }, + { + "epoch": 3.44, + "grad_norm": 7.073272705078125, + "learning_rate": 2.223251755810303e-06, + "logits/chosen": -0.5384408235549927, + "logits/rejected": -0.6356550455093384, + "logps/chosen": -53.38179016113281, + "logps/rejected": -86.84366607666016, + "loss": 0.6943, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.852065324783325, + "rewards/margins": 6.599231719970703, + "rewards/rejected": -3.747166395187378, + "step": 13738 + }, + { + "epoch": 3.44, + "grad_norm": 11.44442367553711, + "learning_rate": 2.2225981494579578e-06, + "logits/chosen": -0.5515471696853638, + "logits/rejected": -0.6215105056762695, + "logps/chosen": -47.087921142578125, + "logps/rejected": -119.2763671875, + "loss": 0.5974, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0129611492156982, + "rewards/margins": 8.261200904846191, + "rewards/rejected": -5.2482404708862305, + "step": 13739 + }, + { + "epoch": 3.44, + "grad_norm": 4.511131763458252, + "learning_rate": 2.221944611738169e-06, + "logits/chosen": -0.5389069318771362, + "logits/rejected": -0.6249318718910217, + "logps/chosen": -72.10928344726562, + "logps/rejected": -109.10643005371094, + "loss": 0.6629, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.868342161178589, + "rewards/margins": 7.155814170837402, + "rewards/rejected": -4.287471294403076, + "step": 13740 + }, + { + "epoch": 3.44, + "grad_norm": 12.600722312927246, + "learning_rate": 2.2212911426670817e-06, + "logits/chosen": -0.6031373739242554, + "logits/rejected": -0.6756100058555603, + "logps/chosen": -49.672523498535156, + "logps/rejected": -105.60018157958984, + "loss": 0.6349, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904975414276123, + "rewards/margins": 6.4430437088012695, + "rewards/rejected": -3.5380678176879883, + "step": 13741 + }, + { + "epoch": 3.44, + "grad_norm": 4.483885765075684, + "learning_rate": 2.220637742260848e-06, + "logits/chosen": -0.5906327366828918, + "logits/rejected": -0.6280332803726196, + "logps/chosen": -51.2089729309082, + "logps/rejected": -123.14735412597656, + "loss": 0.6569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876664161682129, + "rewards/margins": 7.310108184814453, + "rewards/rejected": -4.433444023132324, + "step": 13742 + }, + { + "epoch": 3.44, + "grad_norm": 2.7475767135620117, + "learning_rate": 2.219984410535611e-06, + "logits/chosen": -0.5137792229652405, + "logits/rejected": -0.6116992831230164, + "logps/chosen": -50.256874084472656, + "logps/rejected": -99.15542602539062, + "loss": 0.6242, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.236443281173706, + "rewards/margins": 7.780324459075928, + "rewards/rejected": -4.543881416320801, + "step": 13743 + }, + { + "epoch": 3.44, + "grad_norm": 6.064176559448242, + "learning_rate": 2.219331147507515e-06, + "logits/chosen": -0.5731332302093506, + "logits/rejected": -0.628316342830658, + "logps/chosen": -55.17055130004883, + "logps/rejected": -108.44670104980469, + "loss": 0.674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8359694480895996, + "rewards/margins": 6.234908580780029, + "rewards/rejected": -3.398939609527588, + "step": 13744 + }, + { + "epoch": 3.44, + "grad_norm": 8.449888229370117, + "learning_rate": 2.218677953192706e-06, + "logits/chosen": -0.49417644739151, + "logits/rejected": -0.5862467885017395, + "logps/chosen": -58.78359603881836, + "logps/rejected": -97.77223205566406, + "loss": 0.6803, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270995855331421, + "rewards/margins": 7.009701728820801, + "rewards/rejected": -3.738706350326538, + "step": 13745 + }, + { + "epoch": 3.44, + "grad_norm": 2.72454571723938, + "learning_rate": 2.2180248276073228e-06, + "logits/chosen": -0.5586065053939819, + "logits/rejected": -0.6476194858551025, + "logps/chosen": -47.649017333984375, + "logps/rejected": -103.28654479980469, + "loss": 0.5729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.110501289367676, + "rewards/margins": 7.448708534240723, + "rewards/rejected": -4.338207721710205, + "step": 13746 + }, + { + "epoch": 3.44, + "grad_norm": 2.3144478797912598, + "learning_rate": 2.2173717707675034e-06, + "logits/chosen": -0.5804586410522461, + "logits/rejected": -0.6618887782096863, + "logps/chosen": -59.32630920410156, + "logps/rejected": -113.80150604248047, + "loss": 0.5937, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3272030353546143, + "rewards/margins": 7.51816987991333, + "rewards/rejected": -4.190967082977295, + "step": 13747 + }, + { + "epoch": 3.44, + "grad_norm": 6.34685754776001, + "learning_rate": 2.216718782689388e-06, + "logits/chosen": -0.5772901773452759, + "logits/rejected": -0.643625020980835, + "logps/chosen": -51.70840835571289, + "logps/rejected": -117.69265747070312, + "loss": 0.6348, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.227860927581787, + "rewards/margins": 7.2968244552612305, + "rewards/rejected": -4.068964004516602, + "step": 13748 + }, + { + "epoch": 3.44, + "grad_norm": 3.400994062423706, + "learning_rate": 2.216065863389115e-06, + "logits/chosen": -0.5527597665786743, + "logits/rejected": -0.6214682459831238, + "logps/chosen": -47.71153259277344, + "logps/rejected": -112.31268310546875, + "loss": 0.6052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.066990852355957, + "rewards/margins": 8.114099502563477, + "rewards/rejected": -5.047109127044678, + "step": 13749 + }, + { + "epoch": 3.44, + "grad_norm": 6.212236404418945, + "learning_rate": 2.215413012882812e-06, + "logits/chosen": -0.5965155363082886, + "logits/rejected": -0.68597412109375, + "logps/chosen": -45.716941833496094, + "logps/rejected": -116.21076965332031, + "loss": 0.5695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3845632076263428, + "rewards/margins": 7.6868486404418945, + "rewards/rejected": -4.302286148071289, + "step": 13750 + }, + { + "epoch": 3.44, + "grad_norm": 4.819909572601318, + "learning_rate": 2.2147602311866147e-06, + "logits/chosen": -0.5545505285263062, + "logits/rejected": -0.6462085247039795, + "logps/chosen": -53.631248474121094, + "logps/rejected": -100.36399841308594, + "loss": 0.5584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1890549659729004, + "rewards/margins": 6.346040725708008, + "rewards/rejected": -3.1569855213165283, + "step": 13751 + }, + { + "epoch": 3.44, + "grad_norm": 2.731670379638672, + "learning_rate": 2.2141075183166566e-06, + "logits/chosen": -0.4823777973651886, + "logits/rejected": -0.5875443816184998, + "logps/chosen": -63.511146545410156, + "logps/rejected": -92.4154281616211, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0794787406921387, + "rewards/margins": 6.9382243156433105, + "rewards/rejected": -3.858745574951172, + "step": 13752 + }, + { + "epoch": 3.44, + "grad_norm": 15.507878303527832, + "learning_rate": 2.2134548742890653e-06, + "logits/chosen": -0.5603448152542114, + "logits/rejected": -0.6518699526786804, + "logps/chosen": -63.40509796142578, + "logps/rejected": -113.55193328857422, + "loss": 0.7051, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.67189884185791, + "rewards/margins": 6.09363317489624, + "rewards/rejected": -3.42173433303833, + "step": 13753 + }, + { + "epoch": 3.44, + "grad_norm": 7.382841110229492, + "learning_rate": 2.212802299119966e-06, + "logits/chosen": -0.5059091448783875, + "logits/rejected": -0.6271958947181702, + "logps/chosen": -60.240013122558594, + "logps/rejected": -88.68556213378906, + "loss": 0.6268, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8137826919555664, + "rewards/margins": 5.771587371826172, + "rewards/rejected": -2.9578051567077637, + "step": 13754 + }, + { + "epoch": 3.44, + "grad_norm": 5.147373199462891, + "learning_rate": 2.212149792825489e-06, + "logits/chosen": -0.5530423521995544, + "logits/rejected": -0.6029990911483765, + "logps/chosen": -62.77189254760742, + "logps/rejected": -119.02122497558594, + "loss": 0.6476, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9703545570373535, + "rewards/margins": 7.180608749389648, + "rewards/rejected": -4.210254669189453, + "step": 13755 + }, + { + "epoch": 3.44, + "grad_norm": 3.1714909076690674, + "learning_rate": 2.2114973554217544e-06, + "logits/chosen": -0.45192694664001465, + "logits/rejected": -0.5773632526397705, + "logps/chosen": -58.6532096862793, + "logps/rejected": -101.45806121826172, + "loss": 0.6, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8023521900177, + "rewards/margins": 7.079712867736816, + "rewards/rejected": -4.277360439300537, + "step": 13756 + }, + { + "epoch": 3.44, + "grad_norm": 5.6624555587768555, + "learning_rate": 2.2108449869248854e-06, + "logits/chosen": -0.555161714553833, + "logits/rejected": -0.5933882594108582, + "logps/chosen": -53.47044372558594, + "logps/rejected": -91.96530151367188, + "loss": 0.6885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0322818756103516, + "rewards/margins": 5.6293511390686035, + "rewards/rejected": -2.597069025039673, + "step": 13757 + }, + { + "epoch": 3.44, + "grad_norm": 5.995446681976318, + "learning_rate": 2.210192687351005e-06, + "logits/chosen": -0.5224727988243103, + "logits/rejected": -0.6385592222213745, + "logps/chosen": -71.04761505126953, + "logps/rejected": -103.50357818603516, + "loss": 0.7531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0721845626831055, + "rewards/margins": 6.447394371032715, + "rewards/rejected": -3.3752105236053467, + "step": 13758 + }, + { + "epoch": 3.44, + "grad_norm": 21.165218353271484, + "learning_rate": 2.2095404567162286e-06, + "logits/chosen": -0.4988820552825928, + "logits/rejected": -0.5477780103683472, + "logps/chosen": -60.678794860839844, + "logps/rejected": -100.32371520996094, + "loss": 0.7259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.996878147125244, + "rewards/margins": 5.386065483093262, + "rewards/rejected": -2.3891868591308594, + "step": 13759 + }, + { + "epoch": 3.44, + "grad_norm": 3.623539447784424, + "learning_rate": 2.208888295036678e-06, + "logits/chosen": -0.6137030124664307, + "logits/rejected": -0.6392016410827637, + "logps/chosen": -48.246917724609375, + "logps/rejected": -109.5885238647461, + "loss": 0.6318, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0151801109313965, + "rewards/margins": 6.971314430236816, + "rewards/rejected": -3.9561338424682617, + "step": 13760 + }, + { + "epoch": 3.44, + "grad_norm": 3.4541091918945312, + "learning_rate": 2.208236202328466e-06, + "logits/chosen": -0.622377872467041, + "logits/rejected": -0.6909345388412476, + "logps/chosen": -47.00607681274414, + "logps/rejected": -86.91337585449219, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.230503559112549, + "rewards/margins": 6.712410926818848, + "rewards/rejected": -3.481907367706299, + "step": 13761 + }, + { + "epoch": 3.44, + "grad_norm": 5.956570625305176, + "learning_rate": 2.2075841786077053e-06, + "logits/chosen": -0.5466402173042297, + "logits/rejected": -0.6621619462966919, + "logps/chosen": -59.1144905090332, + "logps/rejected": -95.28614044189453, + "loss": 0.7136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0046470165252686, + "rewards/margins": 6.734095573425293, + "rewards/rejected": -3.7294490337371826, + "step": 13762 + }, + { + "epoch": 3.44, + "grad_norm": 7.679666519165039, + "learning_rate": 2.2069322238905112e-06, + "logits/chosen": -0.5547291040420532, + "logits/rejected": -0.6064826250076294, + "logps/chosen": -65.38443756103516, + "logps/rejected": -120.521484375, + "loss": 0.7154, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.225813865661621, + "rewards/margins": 6.602110862731934, + "rewards/rejected": -3.3762972354888916, + "step": 13763 + }, + { + "epoch": 3.44, + "grad_norm": 4.730984210968018, + "learning_rate": 2.206280338192991e-06, + "logits/chosen": -0.621826708316803, + "logits/rejected": -0.7179828882217407, + "logps/chosen": -56.433082580566406, + "logps/rejected": -103.17774200439453, + "loss": 0.68, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3906917572021484, + "rewards/margins": 6.477611541748047, + "rewards/rejected": -3.0869200229644775, + "step": 13764 + }, + { + "epoch": 3.44, + "grad_norm": 5.327047824859619, + "learning_rate": 2.205628521531257e-06, + "logits/chosen": -0.5636621713638306, + "logits/rejected": -0.6430796980857849, + "logps/chosen": -42.2924690246582, + "logps/rejected": -120.49934387207031, + "loss": 0.511, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.159501791000366, + "rewards/margins": 8.031000137329102, + "rewards/rejected": -4.8714985847473145, + "step": 13765 + }, + { + "epoch": 3.44, + "grad_norm": 1.6371697187423706, + "learning_rate": 2.204976773921413e-06, + "logits/chosen": -0.5466715097427368, + "logits/rejected": -0.6409639716148376, + "logps/chosen": -55.331268310546875, + "logps/rejected": -108.44430541992188, + "loss": 0.6234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.147960662841797, + "rewards/margins": 7.761907577514648, + "rewards/rejected": -4.61394739151001, + "step": 13766 + }, + { + "epoch": 3.44, + "grad_norm": 10.920534133911133, + "learning_rate": 2.2043250953795668e-06, + "logits/chosen": -0.5913602113723755, + "logits/rejected": -0.650153636932373, + "logps/chosen": -51.464805603027344, + "logps/rejected": -95.94883728027344, + "loss": 0.7181, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.805946111679077, + "rewards/margins": 5.461223125457764, + "rewards/rejected": -2.6552774906158447, + "step": 13767 + }, + { + "epoch": 3.44, + "grad_norm": 10.787997245788574, + "learning_rate": 2.2036734859218217e-06, + "logits/chosen": -0.5619077682495117, + "logits/rejected": -0.6473350524902344, + "logps/chosen": -47.29805374145508, + "logps/rejected": -104.98004150390625, + "loss": 0.622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.225943088531494, + "rewards/margins": 7.256370544433594, + "rewards/rejected": -4.030426502227783, + "step": 13768 + }, + { + "epoch": 3.44, + "grad_norm": 4.183157444000244, + "learning_rate": 2.2030219455642775e-06, + "logits/chosen": -0.5253725647926331, + "logits/rejected": -0.5786501169204712, + "logps/chosen": -49.422325134277344, + "logps/rejected": -111.78612518310547, + "loss": 0.7051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1378915309906006, + "rewards/margins": 6.807430744171143, + "rewards/rejected": -3.669539451599121, + "step": 13769 + }, + { + "epoch": 3.44, + "grad_norm": 7.574726581573486, + "learning_rate": 2.2023704743230373e-06, + "logits/chosen": -0.6154543161392212, + "logits/rejected": -0.6803568005561829, + "logps/chosen": -51.56076431274414, + "logps/rejected": -111.38298797607422, + "loss": 0.5796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.212141513824463, + "rewards/margins": 7.29676628112793, + "rewards/rejected": -4.084624290466309, + "step": 13770 + }, + { + "epoch": 3.44, + "grad_norm": 6.401010513305664, + "learning_rate": 2.201719072214199e-06, + "logits/chosen": -0.4843152165412903, + "logits/rejected": -0.5446925759315491, + "logps/chosen": -53.44716262817383, + "logps/rejected": -105.01626586914062, + "loss": 0.74, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2080488204956055, + "rewards/margins": 5.817748546600342, + "rewards/rejected": -2.6096999645233154, + "step": 13771 + }, + { + "epoch": 3.45, + "grad_norm": 5.082906246185303, + "learning_rate": 2.2010677392538566e-06, + "logits/chosen": -0.5104431509971619, + "logits/rejected": -0.6329406499862671, + "logps/chosen": -66.60466766357422, + "logps/rejected": -101.64832305908203, + "loss": 0.6874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.026538372039795, + "rewards/margins": 7.0134735107421875, + "rewards/rejected": -3.9869346618652344, + "step": 13772 + }, + { + "epoch": 3.45, + "grad_norm": 3.201997995376587, + "learning_rate": 2.200416475458107e-06, + "logits/chosen": -0.5549443960189819, + "logits/rejected": -0.6605916023254395, + "logps/chosen": -48.03721618652344, + "logps/rejected": -84.83505249023438, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.289818286895752, + "rewards/margins": 6.656305313110352, + "rewards/rejected": -3.3664870262145996, + "step": 13773 + }, + { + "epoch": 3.45, + "grad_norm": 4.817912578582764, + "learning_rate": 2.199765280843048e-06, + "logits/chosen": -0.5107871890068054, + "logits/rejected": -0.5864878296852112, + "logps/chosen": -51.34123611450195, + "logps/rejected": -86.41228485107422, + "loss": 0.6416, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6538493633270264, + "rewards/margins": 5.8482770919799805, + "rewards/rejected": -2.194427490234375, + "step": 13774 + }, + { + "epoch": 3.45, + "grad_norm": 2.4372477531433105, + "learning_rate": 2.199114155424764e-06, + "logits/chosen": -0.46913161873817444, + "logits/rejected": -0.590688169002533, + "logps/chosen": -60.346221923828125, + "logps/rejected": -105.24861145019531, + "loss": 0.5824, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2504489421844482, + "rewards/margins": 8.183069229125977, + "rewards/rejected": -4.932620525360107, + "step": 13775 + }, + { + "epoch": 3.45, + "grad_norm": 4.448184013366699, + "learning_rate": 2.1984630992193484e-06, + "logits/chosen": -0.49781614542007446, + "logits/rejected": -0.511928141117096, + "logps/chosen": -58.60456848144531, + "logps/rejected": -122.82066345214844, + "loss": 0.5984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0939509868621826, + "rewards/margins": 6.829245090484619, + "rewards/rejected": -3.735293388366699, + "step": 13776 + }, + { + "epoch": 3.45, + "grad_norm": 2.214175224304199, + "learning_rate": 2.197812112242893e-06, + "logits/chosen": -0.6539721488952637, + "logits/rejected": -0.7365323305130005, + "logps/chosen": -48.89478302001953, + "logps/rejected": -93.26756286621094, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6473069190979004, + "rewards/margins": 7.80009651184082, + "rewards/rejected": -4.15278959274292, + "step": 13777 + }, + { + "epoch": 3.45, + "grad_norm": 6.042660713195801, + "learning_rate": 2.197161194511477e-06, + "logits/chosen": -0.5537744164466858, + "logits/rejected": -0.6169579029083252, + "logps/chosen": -54.258426666259766, + "logps/rejected": -115.87355041503906, + "loss": 0.6843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0362772941589355, + "rewards/margins": 7.103476524353027, + "rewards/rejected": -4.067198753356934, + "step": 13778 + }, + { + "epoch": 3.45, + "grad_norm": 7.702773571014404, + "learning_rate": 2.196510346041189e-06, + "logits/chosen": -0.5326974391937256, + "logits/rejected": -0.6027473211288452, + "logps/chosen": -70.2112045288086, + "logps/rejected": -98.18097686767578, + "loss": 0.6813, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1384217739105225, + "rewards/margins": 5.981205940246582, + "rewards/rejected": -2.8427846431732178, + "step": 13779 + }, + { + "epoch": 3.45, + "grad_norm": 3.6266493797302246, + "learning_rate": 2.1958595668481147e-06, + "logits/chosen": -0.49306100606918335, + "logits/rejected": -0.5958687663078308, + "logps/chosen": -55.86821365356445, + "logps/rejected": -94.36109161376953, + "loss": 0.5246, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0463368892669678, + "rewards/margins": 6.082851409912109, + "rewards/rejected": -3.0365140438079834, + "step": 13780 + }, + { + "epoch": 3.45, + "grad_norm": 6.2204084396362305, + "learning_rate": 2.1952088569483327e-06, + "logits/chosen": -0.5845544338226318, + "logits/rejected": -0.5949245691299438, + "logps/chosen": -53.47307586669922, + "logps/rejected": -117.84513854980469, + "loss": 0.651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011399030685425, + "rewards/margins": 6.291427135467529, + "rewards/rejected": -3.280027389526367, + "step": 13781 + }, + { + "epoch": 3.45, + "grad_norm": 2.370685577392578, + "learning_rate": 2.1945582163579216e-06, + "logits/chosen": -0.5511776804924011, + "logits/rejected": -0.630855917930603, + "logps/chosen": -50.36561965942383, + "logps/rejected": -119.8744888305664, + "loss": 0.5555, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9889562129974365, + "rewards/margins": 8.259635925292969, + "rewards/rejected": -5.270679950714111, + "step": 13782 + }, + { + "epoch": 3.45, + "grad_norm": 4.824080944061279, + "learning_rate": 2.1939076450929627e-06, + "logits/chosen": -0.5624409317970276, + "logits/rejected": -0.6740376949310303, + "logps/chosen": -58.615089416503906, + "logps/rejected": -94.11001586914062, + "loss": 0.6942, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.049777030944824, + "rewards/margins": 6.616718292236328, + "rewards/rejected": -3.566941022872925, + "step": 13783 + }, + { + "epoch": 3.45, + "grad_norm": 3.196535110473633, + "learning_rate": 2.193257143169528e-06, + "logits/chosen": -0.5925881862640381, + "logits/rejected": -0.6449043154716492, + "logps/chosen": -82.32752990722656, + "logps/rejected": -117.31201171875, + "loss": 0.6582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1615848541259766, + "rewards/margins": 7.596640586853027, + "rewards/rejected": -4.435055732727051, + "step": 13784 + }, + { + "epoch": 3.45, + "grad_norm": 23.267488479614258, + "learning_rate": 2.1926067106036973e-06, + "logits/chosen": -0.561109721660614, + "logits/rejected": -0.6544637084007263, + "logps/chosen": -56.14588165283203, + "logps/rejected": -90.89212036132812, + "loss": 0.807, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.737244129180908, + "rewards/margins": 5.197535991668701, + "rewards/rejected": -2.460292100906372, + "step": 13785 + }, + { + "epoch": 3.45, + "grad_norm": 6.464471340179443, + "learning_rate": 2.1919563474115408e-06, + "logits/chosen": -0.4894988238811493, + "logits/rejected": -0.5194635987281799, + "logps/chosen": -63.17588806152344, + "logps/rejected": -112.9109115600586, + "loss": 0.5817, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.255608081817627, + "rewards/margins": 6.536513328552246, + "rewards/rejected": -3.280905246734619, + "step": 13786 + }, + { + "epoch": 3.45, + "grad_norm": 6.610849857330322, + "learning_rate": 2.191306053609128e-06, + "logits/chosen": -0.524158239364624, + "logits/rejected": -0.5704606175422668, + "logps/chosen": -51.57735824584961, + "logps/rejected": -130.3592529296875, + "loss": 0.7139, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1426005363464355, + "rewards/margins": 7.453956604003906, + "rewards/rejected": -4.311356067657471, + "step": 13787 + }, + { + "epoch": 3.45, + "grad_norm": 6.492717742919922, + "learning_rate": 2.190655829212532e-06, + "logits/chosen": -0.5386737585067749, + "logits/rejected": -0.604463517665863, + "logps/chosen": -66.18103790283203, + "logps/rejected": -113.65019226074219, + "loss": 0.7123, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7573230266571045, + "rewards/margins": 6.991948127746582, + "rewards/rejected": -4.23462438583374, + "step": 13788 + }, + { + "epoch": 3.45, + "grad_norm": 5.822388648986816, + "learning_rate": 2.190005674237818e-06, + "logits/chosen": -0.43449097871780396, + "logits/rejected": -0.6089333295822144, + "logps/chosen": -73.56507873535156, + "logps/rejected": -103.70977783203125, + "loss": 0.6926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8072080612182617, + "rewards/margins": 7.001955032348633, + "rewards/rejected": -4.194746971130371, + "step": 13789 + }, + { + "epoch": 3.45, + "grad_norm": 8.213544845581055, + "learning_rate": 2.189355588701051e-06, + "logits/chosen": -0.5285820960998535, + "logits/rejected": -0.5878096222877502, + "logps/chosen": -65.35234069824219, + "logps/rejected": -111.89368438720703, + "loss": 0.6506, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6968495845794678, + "rewards/margins": 7.1059651374816895, + "rewards/rejected": -4.409115791320801, + "step": 13790 + }, + { + "epoch": 3.45, + "grad_norm": 4.014322280883789, + "learning_rate": 2.188705572618297e-06, + "logits/chosen": -0.5779842734336853, + "logits/rejected": -0.6176190376281738, + "logps/chosen": -62.75105667114258, + "logps/rejected": -120.74946594238281, + "loss": 0.6789, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3231022357940674, + "rewards/margins": 7.151707649230957, + "rewards/rejected": -3.8286049365997314, + "step": 13791 + }, + { + "epoch": 3.45, + "grad_norm": 7.434356689453125, + "learning_rate": 2.1880556260056222e-06, + "logits/chosen": -0.5420682430267334, + "logits/rejected": -0.6252350807189941, + "logps/chosen": -52.52655792236328, + "logps/rejected": -106.35285186767578, + "loss": 0.5554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1383590698242188, + "rewards/margins": 7.374664306640625, + "rewards/rejected": -4.236304759979248, + "step": 13792 + }, + { + "epoch": 3.45, + "grad_norm": 3.1819300651550293, + "learning_rate": 2.18740574887908e-06, + "logits/chosen": -0.5210620164871216, + "logits/rejected": -0.6360418200492859, + "logps/chosen": -60.44532775878906, + "logps/rejected": -93.20835876464844, + "loss": 0.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0519447326660156, + "rewards/margins": 6.6248016357421875, + "rewards/rejected": -3.57285737991333, + "step": 13793 + }, + { + "epoch": 3.45, + "grad_norm": 5.406570911407471, + "learning_rate": 2.186755941254734e-06, + "logits/chosen": -0.47729718685150146, + "logits/rejected": -0.5729756355285645, + "logps/chosen": -60.66805648803711, + "logps/rejected": -124.18350982666016, + "loss": 0.6158, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0705559253692627, + "rewards/margins": 7.395780563354492, + "rewards/rejected": -4.32522439956665, + "step": 13794 + }, + { + "epoch": 3.45, + "grad_norm": 3.4745423793792725, + "learning_rate": 2.1861062031486423e-06, + "logits/chosen": -0.5041423439979553, + "logits/rejected": -0.5962048768997192, + "logps/chosen": -52.82622528076172, + "logps/rejected": -105.5694580078125, + "loss": 0.6684, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9620351791381836, + "rewards/margins": 6.815699577331543, + "rewards/rejected": -3.8536643981933594, + "step": 13795 + }, + { + "epoch": 3.45, + "grad_norm": 12.002918243408203, + "learning_rate": 2.1854565345768593e-06, + "logits/chosen": -0.524316668510437, + "logits/rejected": -0.6115474700927734, + "logps/chosen": -65.11344909667969, + "logps/rejected": -95.24080657958984, + "loss": 0.732, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8982045650482178, + "rewards/margins": 6.092504978179932, + "rewards/rejected": -3.1943001747131348, + "step": 13796 + }, + { + "epoch": 3.45, + "grad_norm": 3.294572353363037, + "learning_rate": 2.1848069355554373e-06, + "logits/chosen": -0.5617405772209167, + "logits/rejected": -0.6888176798820496, + "logps/chosen": -49.3419189453125, + "logps/rejected": -99.189453125, + "loss": 0.6303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.299333333969116, + "rewards/margins": 8.380485534667969, + "rewards/rejected": -5.081151962280273, + "step": 13797 + }, + { + "epoch": 3.45, + "grad_norm": 5.016369342803955, + "learning_rate": 2.1841574061004323e-06, + "logits/chosen": -0.45373278856277466, + "logits/rejected": -0.5522301197052002, + "logps/chosen": -70.63337707519531, + "logps/rejected": -114.68494415283203, + "loss": 0.5888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0236237049102783, + "rewards/margins": 7.184035778045654, + "rewards/rejected": -4.160411357879639, + "step": 13798 + }, + { + "epoch": 3.45, + "grad_norm": 2.6088156700134277, + "learning_rate": 2.1835079462278923e-06, + "logits/chosen": -0.5322256684303284, + "logits/rejected": -0.6597086191177368, + "logps/chosen": -59.493560791015625, + "logps/rejected": -112.16394805908203, + "loss": 0.5792, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7463877201080322, + "rewards/margins": 6.880268096923828, + "rewards/rejected": -4.133880138397217, + "step": 13799 + }, + { + "epoch": 3.45, + "grad_norm": 11.644875526428223, + "learning_rate": 2.182858555953865e-06, + "logits/chosen": -0.5327818393707275, + "logits/rejected": -0.6215722560882568, + "logps/chosen": -56.272377014160156, + "logps/rejected": -109.83245086669922, + "loss": 0.5821, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.070148229598999, + "rewards/margins": 7.293715953826904, + "rewards/rejected": -4.223568439483643, + "step": 13800 + }, + { + "epoch": 3.45, + "grad_norm": 7.778433799743652, + "learning_rate": 2.1822092352944017e-06, + "logits/chosen": -0.5707226991653442, + "logits/rejected": -0.6386655569076538, + "logps/chosen": -46.749855041503906, + "logps/rejected": -94.86288452148438, + "loss": 0.6794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177827835083008, + "rewards/margins": 6.888192176818848, + "rewards/rejected": -3.71036434173584, + "step": 13801 + }, + { + "epoch": 3.45, + "grad_norm": 4.952726364135742, + "learning_rate": 2.181559984265545e-06, + "logits/chosen": -0.5401172637939453, + "logits/rejected": -0.6023118495941162, + "logps/chosen": -52.80426025390625, + "logps/rejected": -99.10490417480469, + "loss": 0.6488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2162926197052, + "rewards/margins": 5.8203277587890625, + "rewards/rejected": -2.604034662246704, + "step": 13802 + }, + { + "epoch": 3.45, + "grad_norm": 8.966432571411133, + "learning_rate": 2.180910802883337e-06, + "logits/chosen": -0.5988810658454895, + "logits/rejected": -0.6549491882324219, + "logps/chosen": -47.039955139160156, + "logps/rejected": -124.17195129394531, + "loss": 0.6423, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9072234630584717, + "rewards/margins": 7.653911590576172, + "rewards/rejected": -4.746688365936279, + "step": 13803 + }, + { + "epoch": 3.45, + "grad_norm": 2.887146234512329, + "learning_rate": 2.180261691163823e-06, + "logits/chosen": -0.5010360479354858, + "logits/rejected": -0.5787631273269653, + "logps/chosen": -49.27587127685547, + "logps/rejected": -94.47749328613281, + "loss": 0.5404, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1261866092681885, + "rewards/margins": 6.499948501586914, + "rewards/rejected": -3.373762369155884, + "step": 13804 + }, + { + "epoch": 3.45, + "grad_norm": 5.591415882110596, + "learning_rate": 2.1796126491230407e-06, + "logits/chosen": -0.5420753955841064, + "logits/rejected": -0.5994939208030701, + "logps/chosen": -42.653533935546875, + "logps/rejected": -115.1454086303711, + "loss": 0.569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3867671489715576, + "rewards/margins": 7.511441230773926, + "rewards/rejected": -4.124673843383789, + "step": 13805 + }, + { + "epoch": 3.45, + "grad_norm": 6.754519462585449, + "learning_rate": 2.178963676777031e-06, + "logits/chosen": -0.49199336767196655, + "logits/rejected": -0.5950503945350647, + "logps/chosen": -63.6966667175293, + "logps/rejected": -88.89892578125, + "loss": 0.6569, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1860477924346924, + "rewards/margins": 5.129583358764648, + "rewards/rejected": -1.9435354471206665, + "step": 13806 + }, + { + "epoch": 3.45, + "grad_norm": 5.478600025177002, + "learning_rate": 2.178314774141828e-06, + "logits/chosen": -0.49551764130592346, + "logits/rejected": -0.5598633289337158, + "logps/chosen": -55.51900100708008, + "logps/rejected": -123.33057403564453, + "loss": 0.5744, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.339517593383789, + "rewards/margins": 7.880771636962891, + "rewards/rejected": -4.541254043579102, + "step": 13807 + }, + { + "epoch": 3.45, + "grad_norm": 10.557889938354492, + "learning_rate": 2.17766594123347e-06, + "logits/chosen": -0.5700395703315735, + "logits/rejected": -0.6665099859237671, + "logps/chosen": -46.15850067138672, + "logps/rejected": -113.00687408447266, + "loss": 0.5817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1760571002960205, + "rewards/margins": 7.691272258758545, + "rewards/rejected": -4.5152153968811035, + "step": 13808 + }, + { + "epoch": 3.45, + "grad_norm": 5.1993632316589355, + "learning_rate": 2.1770171780679866e-06, + "logits/chosen": -0.515083372592926, + "logits/rejected": -0.5164508819580078, + "logps/chosen": -48.06363296508789, + "logps/rejected": -98.42491912841797, + "loss": 0.6794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.494771957397461, + "rewards/margins": 6.238891124725342, + "rewards/rejected": -2.744119644165039, + "step": 13809 + }, + { + "epoch": 3.45, + "grad_norm": 7.079290866851807, + "learning_rate": 2.176368484661413e-06, + "logits/chosen": -0.6046552658081055, + "logits/rejected": -0.6647570133209229, + "logps/chosen": -46.39228057861328, + "logps/rejected": -95.91935729980469, + "loss": 0.5879, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.191434860229492, + "rewards/margins": 6.439198970794678, + "rewards/rejected": -3.2477641105651855, + "step": 13810 + }, + { + "epoch": 3.45, + "grad_norm": 21.275554656982422, + "learning_rate": 2.175719861029778e-06, + "logits/chosen": -0.45914411544799805, + "logits/rejected": -0.552686333656311, + "logps/chosen": -58.568458557128906, + "logps/rejected": -106.55113220214844, + "loss": 0.6627, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1175284385681152, + "rewards/margins": 7.934030532836914, + "rewards/rejected": -4.816501617431641, + "step": 13811 + }, + { + "epoch": 3.46, + "grad_norm": 39.256778717041016, + "learning_rate": 2.1750713071891072e-06, + "logits/chosen": -0.5115102529525757, + "logits/rejected": -0.5856097340583801, + "logps/chosen": -51.937721252441406, + "logps/rejected": -96.29078674316406, + "loss": 0.6957, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.008565902709961, + "rewards/margins": 7.001421928405762, + "rewards/rejected": -3.99285626411438, + "step": 13812 + }, + { + "epoch": 3.46, + "grad_norm": 3.424234390258789, + "learning_rate": 2.174422823155431e-06, + "logits/chosen": -0.5507957935333252, + "logits/rejected": -0.6367619633674622, + "logps/chosen": -53.023170471191406, + "logps/rejected": -121.21022033691406, + "loss": 0.5133, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.887293815612793, + "rewards/margins": 8.087343215942383, + "rewards/rejected": -5.200048923492432, + "step": 13813 + }, + { + "epoch": 3.46, + "grad_norm": 4.891712188720703, + "learning_rate": 2.1737744089447723e-06, + "logits/chosen": -0.5712911486625671, + "logits/rejected": -0.683992862701416, + "logps/chosen": -73.74408721923828, + "logps/rejected": -99.70225524902344, + "loss": 0.6667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.239776611328125, + "rewards/margins": 7.373653411865234, + "rewards/rejected": -4.133876323699951, + "step": 13814 + }, + { + "epoch": 3.46, + "grad_norm": 3.8450872898101807, + "learning_rate": 2.1731260645731517e-06, + "logits/chosen": -0.5099225640296936, + "logits/rejected": -0.5758646726608276, + "logps/chosen": -49.494911193847656, + "logps/rejected": -154.50389099121094, + "loss": 0.5046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9685728549957275, + "rewards/margins": 9.751348495483398, + "rewards/rejected": -6.782775402069092, + "step": 13815 + }, + { + "epoch": 3.46, + "grad_norm": 11.400720596313477, + "learning_rate": 2.172477790056595e-06, + "logits/chosen": -0.5204949975013733, + "logits/rejected": -0.6312921047210693, + "logps/chosen": -62.823486328125, + "logps/rejected": -104.310546875, + "loss": 0.6744, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7825849056243896, + "rewards/margins": 6.8478569984436035, + "rewards/rejected": -4.065272331237793, + "step": 13816 + }, + { + "epoch": 3.46, + "grad_norm": 38.47053146362305, + "learning_rate": 2.17182958541112e-06, + "logits/chosen": -0.5484740734100342, + "logits/rejected": -0.623055636882782, + "logps/chosen": -56.27072525024414, + "logps/rejected": -113.65077209472656, + "loss": 0.6548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0253539085388184, + "rewards/margins": 7.71942663192749, + "rewards/rejected": -4.694072723388672, + "step": 13817 + }, + { + "epoch": 3.46, + "grad_norm": 7.493363857269287, + "learning_rate": 2.1711814506527417e-06, + "logits/chosen": -0.49801626801490784, + "logits/rejected": -0.5243453979492188, + "logps/chosen": -52.695411682128906, + "logps/rejected": -129.91403198242188, + "loss": 0.6044, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0936203002929688, + "rewards/margins": 8.050661087036133, + "rewards/rejected": -4.957040786743164, + "step": 13818 + }, + { + "epoch": 3.46, + "grad_norm": 14.115399360656738, + "learning_rate": 2.170533385797479e-06, + "logits/chosen": -0.49769359827041626, + "logits/rejected": -0.599497377872467, + "logps/chosen": -59.486698150634766, + "logps/rejected": -89.9991226196289, + "loss": 0.6322, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.288440465927124, + "rewards/margins": 6.455957412719727, + "rewards/rejected": -3.1675174236297607, + "step": 13819 + }, + { + "epoch": 3.46, + "grad_norm": 15.156885147094727, + "learning_rate": 2.16988539086135e-06, + "logits/chosen": -0.5998653173446655, + "logits/rejected": -0.7166150808334351, + "logps/chosen": -50.477256774902344, + "logps/rejected": -92.92619323730469, + "loss": 0.6418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1807284355163574, + "rewards/margins": 6.379236221313477, + "rewards/rejected": -3.198507785797119, + "step": 13820 + }, + { + "epoch": 3.46, + "grad_norm": 5.119009971618652, + "learning_rate": 2.1692374658603587e-06, + "logits/chosen": -0.5780412554740906, + "logits/rejected": -0.6355952620506287, + "logps/chosen": -51.272701263427734, + "logps/rejected": -107.39777374267578, + "loss": 0.5929, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.194431781768799, + "rewards/margins": 6.38406229019165, + "rewards/rejected": -3.1896309852600098, + "step": 13821 + }, + { + "epoch": 3.46, + "grad_norm": 10.430806159973145, + "learning_rate": 2.168589610810521e-06, + "logits/chosen": -0.5603775382041931, + "logits/rejected": -0.6276947259902954, + "logps/chosen": -58.30374526977539, + "logps/rejected": -119.56221771240234, + "loss": 0.7329, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8176686763763428, + "rewards/margins": 6.952047824859619, + "rewards/rejected": -4.134378910064697, + "step": 13822 + }, + { + "epoch": 3.46, + "grad_norm": 10.016351699829102, + "learning_rate": 2.1679418257278474e-06, + "logits/chosen": -0.6328970789909363, + "logits/rejected": -0.7435773611068726, + "logps/chosen": -42.998714447021484, + "logps/rejected": -91.32534790039062, + "loss": 0.6196, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9863812923431396, + "rewards/margins": 7.065568923950195, + "rewards/rejected": -4.079188346862793, + "step": 13823 + }, + { + "epoch": 3.46, + "grad_norm": 10.251803398132324, + "learning_rate": 2.1672941106283434e-06, + "logits/chosen": -0.589137613773346, + "logits/rejected": -0.6268176436424255, + "logps/chosen": -58.8731689453125, + "logps/rejected": -100.98676300048828, + "loss": 0.8194, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.242886781692505, + "rewards/margins": 5.951869964599609, + "rewards/rejected": -2.708983898162842, + "step": 13824 + }, + { + "epoch": 3.46, + "grad_norm": 8.485424995422363, + "learning_rate": 2.1666464655280133e-06, + "logits/chosen": -0.5154464840888977, + "logits/rejected": -0.5657863020896912, + "logps/chosen": -60.231658935546875, + "logps/rejected": -91.0914535522461, + "loss": 0.7091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.299293279647827, + "rewards/margins": 5.146728515625, + "rewards/rejected": -1.8474347591400146, + "step": 13825 + }, + { + "epoch": 3.46, + "grad_norm": 4.665643215179443, + "learning_rate": 2.165998890442864e-06, + "logits/chosen": -0.5377169847488403, + "logits/rejected": -0.6188048124313354, + "logps/chosen": -47.412227630615234, + "logps/rejected": -113.39051818847656, + "loss": 0.5673, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140420913696289, + "rewards/margins": 8.05636215209961, + "rewards/rejected": -4.9159417152404785, + "step": 13826 + }, + { + "epoch": 3.46, + "grad_norm": 5.804358005523682, + "learning_rate": 2.1653513853888976e-06, + "logits/chosen": -0.5161415934562683, + "logits/rejected": -0.6307975053787231, + "logps/chosen": -61.43122100830078, + "logps/rejected": -103.16153717041016, + "loss": 0.5427, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9327921867370605, + "rewards/margins": 6.986131191253662, + "rewards/rejected": -4.053338527679443, + "step": 13827 + }, + { + "epoch": 3.46, + "grad_norm": 5.0054450035095215, + "learning_rate": 2.164703950382111e-06, + "logits/chosen": -0.5755043029785156, + "logits/rejected": -0.6178716421127319, + "logps/chosen": -47.71544647216797, + "logps/rejected": -104.23876190185547, + "loss": 0.6045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8883676528930664, + "rewards/margins": 6.713555812835693, + "rewards/rejected": -3.8251876831054688, + "step": 13828 + }, + { + "epoch": 3.46, + "grad_norm": 3.434049367904663, + "learning_rate": 2.164056585438508e-06, + "logits/chosen": -0.5959125757217407, + "logits/rejected": -0.6227148771286011, + "logps/chosen": -48.456199645996094, + "logps/rejected": -110.46371459960938, + "loss": 0.5469, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.461733818054199, + "rewards/margins": 7.27225399017334, + "rewards/rejected": -3.8105201721191406, + "step": 13829 + }, + { + "epoch": 3.46, + "grad_norm": 11.86431884765625, + "learning_rate": 2.16340929057408e-06, + "logits/chosen": -0.6059848070144653, + "logits/rejected": -0.7007185816764832, + "logps/chosen": -54.532554626464844, + "logps/rejected": -107.06639862060547, + "loss": 0.7908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.004798173904419, + "rewards/margins": 7.68137788772583, + "rewards/rejected": -4.676579475402832, + "step": 13830 + }, + { + "epoch": 3.46, + "grad_norm": 5.549200057983398, + "learning_rate": 2.162762065804829e-06, + "logits/chosen": -0.5289106369018555, + "logits/rejected": -0.6402968168258667, + "logps/chosen": -57.82567596435547, + "logps/rejected": -94.01168823242188, + "loss": 0.7368, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.165085792541504, + "rewards/margins": 6.0308451652526855, + "rewards/rejected": -2.8657593727111816, + "step": 13831 + }, + { + "epoch": 3.46, + "grad_norm": 20.882984161376953, + "learning_rate": 2.162114911146744e-06, + "logits/chosen": -0.5406985878944397, + "logits/rejected": -0.6392094492912292, + "logps/chosen": -60.86216735839844, + "logps/rejected": -96.25115966796875, + "loss": 0.7894, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4947659969329834, + "rewards/margins": 7.117161273956299, + "rewards/rejected": -4.622395038604736, + "step": 13832 + }, + { + "epoch": 3.46, + "grad_norm": 11.22398567199707, + "learning_rate": 2.1614678266158166e-06, + "logits/chosen": -0.4912077486515045, + "logits/rejected": -0.5868955850601196, + "logps/chosen": -70.713623046875, + "logps/rejected": -110.52166748046875, + "loss": 0.7293, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2153549194335938, + "rewards/margins": 6.329480171203613, + "rewards/rejected": -3.1141247749328613, + "step": 13833 + }, + { + "epoch": 3.46, + "grad_norm": 4.503432273864746, + "learning_rate": 2.1608208122280406e-06, + "logits/chosen": -0.5618051290512085, + "logits/rejected": -0.6105136871337891, + "logps/chosen": -43.91072082519531, + "logps/rejected": -120.08967590332031, + "loss": 0.5426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2515408992767334, + "rewards/margins": 7.687772750854492, + "rewards/rejected": -4.436232566833496, + "step": 13834 + }, + { + "epoch": 3.46, + "grad_norm": 6.309882164001465, + "learning_rate": 2.1601738679994017e-06, + "logits/chosen": -0.49855607748031616, + "logits/rejected": -0.599904477596283, + "logps/chosen": -61.228904724121094, + "logps/rejected": -93.13561248779297, + "loss": 0.6589, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9547691345214844, + "rewards/margins": 6.355324745178223, + "rewards/rejected": -3.4005556106567383, + "step": 13835 + }, + { + "epoch": 3.46, + "grad_norm": 3.4823453426361084, + "learning_rate": 2.1595269939458853e-06, + "logits/chosen": -0.5712476372718811, + "logits/rejected": -0.6389676332473755, + "logps/chosen": -71.84932708740234, + "logps/rejected": -119.20436096191406, + "loss": 0.62, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2335612773895264, + "rewards/margins": 7.310312747955322, + "rewards/rejected": -4.076751708984375, + "step": 13836 + }, + { + "epoch": 3.46, + "grad_norm": 3.603158950805664, + "learning_rate": 2.158880190083478e-06, + "logits/chosen": -0.5854473114013672, + "logits/rejected": -0.6469981074333191, + "logps/chosen": -56.74375534057617, + "logps/rejected": -120.68730163574219, + "loss": 0.6222, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225470542907715, + "rewards/margins": 8.33350944519043, + "rewards/rejected": -5.108038902282715, + "step": 13837 + }, + { + "epoch": 3.46, + "grad_norm": 8.093077659606934, + "learning_rate": 2.1582334564281656e-06, + "logits/chosen": -0.5465744733810425, + "logits/rejected": -0.667460024356842, + "logps/chosen": -56.194847106933594, + "logps/rejected": -88.16825103759766, + "loss": 0.6649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.314934492111206, + "rewards/margins": 6.097829818725586, + "rewards/rejected": -2.782895088195801, + "step": 13838 + }, + { + "epoch": 3.46, + "grad_norm": 4.740846157073975, + "learning_rate": 2.157586792995927e-06, + "logits/chosen": -0.4902516305446625, + "logits/rejected": -0.5854355096817017, + "logps/chosen": -54.155311584472656, + "logps/rejected": -113.86219787597656, + "loss": 0.5854, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7400970458984375, + "rewards/margins": 8.616923332214355, + "rewards/rejected": -5.876826763153076, + "step": 13839 + }, + { + "epoch": 3.46, + "grad_norm": 4.713177680969238, + "learning_rate": 2.15694019980274e-06, + "logits/chosen": -0.4841565787792206, + "logits/rejected": -0.584716796875, + "logps/chosen": -76.515380859375, + "logps/rejected": -118.90299987792969, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9064419269561768, + "rewards/margins": 7.931167125701904, + "rewards/rejected": -5.02472448348999, + "step": 13840 + }, + { + "epoch": 3.46, + "grad_norm": 2.8933074474334717, + "learning_rate": 2.1562936768645864e-06, + "logits/chosen": -0.5935075283050537, + "logits/rejected": -0.6607054471969604, + "logps/chosen": -44.71604919433594, + "logps/rejected": -106.47763061523438, + "loss": 0.549, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2715630531311035, + "rewards/margins": 7.5974626541137695, + "rewards/rejected": -4.325900077819824, + "step": 13841 + }, + { + "epoch": 3.46, + "grad_norm": 7.456360816955566, + "learning_rate": 2.1556472241974408e-06, + "logits/chosen": -0.520073413848877, + "logits/rejected": -0.5655999779701233, + "logps/chosen": -69.69624328613281, + "logps/rejected": -119.13700866699219, + "loss": 0.7585, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9159207344055176, + "rewards/margins": 6.897755146026611, + "rewards/rejected": -3.981834650039673, + "step": 13842 + }, + { + "epoch": 3.46, + "grad_norm": 5.231240272521973, + "learning_rate": 2.1550008418172764e-06, + "logits/chosen": -0.4891551733016968, + "logits/rejected": -0.6013476848602295, + "logps/chosen": -53.1063117980957, + "logps/rejected": -106.1996078491211, + "loss": 0.5613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946000099182129, + "rewards/margins": 6.886433124542236, + "rewards/rejected": -3.9404335021972656, + "step": 13843 + }, + { + "epoch": 3.46, + "grad_norm": 3.2732884883880615, + "learning_rate": 2.1543545297400686e-06, + "logits/chosen": -0.5178526043891907, + "logits/rejected": -0.5624152421951294, + "logps/chosen": -67.36341857910156, + "logps/rejected": -107.96204376220703, + "loss": 0.5808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1372017860412598, + "rewards/margins": 6.324375629425049, + "rewards/rejected": -3.187173843383789, + "step": 13844 + }, + { + "epoch": 3.46, + "grad_norm": 8.083910942077637, + "learning_rate": 2.1537082879817873e-06, + "logits/chosen": -0.4887733459472656, + "logits/rejected": -0.5690685510635376, + "logps/chosen": -60.00967788696289, + "logps/rejected": -109.82868194580078, + "loss": 0.656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.101814031600952, + "rewards/margins": 7.669517993927002, + "rewards/rejected": -4.567703723907471, + "step": 13845 + }, + { + "epoch": 3.46, + "grad_norm": 5.077511787414551, + "learning_rate": 2.1530621165583993e-06, + "logits/chosen": -0.5122054815292358, + "logits/rejected": -0.6035251617431641, + "logps/chosen": -57.83909606933594, + "logps/rejected": -113.24893188476562, + "loss": 0.6081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2077345848083496, + "rewards/margins": 7.217649936676025, + "rewards/rejected": -4.009914875030518, + "step": 13846 + }, + { + "epoch": 3.46, + "grad_norm": 6.8427019119262695, + "learning_rate": 2.152416015485877e-06, + "logits/chosen": -0.5298989415168762, + "logits/rejected": -0.5940744876861572, + "logps/chosen": -63.538475036621094, + "logps/rejected": -82.13397216796875, + "loss": 0.784, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1647465229034424, + "rewards/margins": 4.594359397888184, + "rewards/rejected": -1.4296131134033203, + "step": 13847 + }, + { + "epoch": 3.46, + "grad_norm": 30.576576232910156, + "learning_rate": 2.1517699847801816e-06, + "logits/chosen": -0.5039401650428772, + "logits/rejected": -0.5669661164283752, + "logps/chosen": -54.41413497924805, + "logps/rejected": -106.3003158569336, + "loss": 0.7075, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.911005973815918, + "rewards/margins": 7.0004496574401855, + "rewards/rejected": -4.089443206787109, + "step": 13848 + }, + { + "epoch": 3.46, + "grad_norm": 10.748777389526367, + "learning_rate": 2.1511240244572817e-06, + "logits/chosen": -0.6045843362808228, + "logits/rejected": -0.6737614870071411, + "logps/chosen": -53.687469482421875, + "logps/rejected": -91.17919921875, + "loss": 0.7427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.035254716873169, + "rewards/margins": 6.930236339569092, + "rewards/rejected": -3.894981622695923, + "step": 13849 + }, + { + "epoch": 3.46, + "grad_norm": 3.0255818367004395, + "learning_rate": 2.150478134533135e-06, + "logits/chosen": -0.5768589973449707, + "logits/rejected": -0.6573980450630188, + "logps/chosen": -50.64105987548828, + "logps/rejected": -123.3741455078125, + "loss": 0.56, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3832314014434814, + "rewards/margins": 8.559379577636719, + "rewards/rejected": -5.176148414611816, + "step": 13850 + }, + { + "epoch": 3.47, + "grad_norm": 5.484847545623779, + "learning_rate": 2.1498323150237065e-06, + "logits/chosen": -0.6019573211669922, + "logits/rejected": -0.6663050055503845, + "logps/chosen": -45.469234466552734, + "logps/rejected": -103.07087707519531, + "loss": 0.531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1412501335144043, + "rewards/margins": 7.288344383239746, + "rewards/rejected": -4.147094249725342, + "step": 13851 + }, + { + "epoch": 3.47, + "grad_norm": 3.2913641929626465, + "learning_rate": 2.149186565944953e-06, + "logits/chosen": -0.4301152229309082, + "logits/rejected": -0.4877326488494873, + "logps/chosen": -73.59609985351562, + "logps/rejected": -112.50230407714844, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.071863889694214, + "rewards/margins": 6.790783882141113, + "rewards/rejected": -3.7189202308654785, + "step": 13852 + }, + { + "epoch": 3.47, + "grad_norm": 8.61363410949707, + "learning_rate": 2.14854088731283e-06, + "logits/chosen": -0.5395696759223938, + "logits/rejected": -0.6794367432594299, + "logps/chosen": -66.17715454101562, + "logps/rejected": -97.40593719482422, + "loss": 0.7031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9731411933898926, + "rewards/margins": 6.351830005645752, + "rewards/rejected": -3.378688335418701, + "step": 13853 + }, + { + "epoch": 3.47, + "grad_norm": 4.666443347930908, + "learning_rate": 2.1478952791432965e-06, + "logits/chosen": -0.47741279006004333, + "logits/rejected": -0.5275893211364746, + "logps/chosen": -55.69176483154297, + "logps/rejected": -111.68375396728516, + "loss": 0.6844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.189267158508301, + "rewards/margins": 7.242153644561768, + "rewards/rejected": -4.052886486053467, + "step": 13854 + }, + { + "epoch": 3.47, + "grad_norm": 9.972349166870117, + "learning_rate": 2.1472497414523023e-06, + "logits/chosen": -0.5145199298858643, + "logits/rejected": -0.62706458568573, + "logps/chosen": -60.33910369873047, + "logps/rejected": -112.2579116821289, + "loss": 0.6969, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.868816614151001, + "rewards/margins": 7.422167778015137, + "rewards/rejected": -4.553351402282715, + "step": 13855 + }, + { + "epoch": 3.47, + "grad_norm": 7.914620399475098, + "learning_rate": 2.146604274255803e-06, + "logits/chosen": -0.4266512989997864, + "logits/rejected": -0.510320782661438, + "logps/chosen": -63.37535858154297, + "logps/rejected": -94.57099914550781, + "loss": 0.6265, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7382521629333496, + "rewards/margins": 6.198490142822266, + "rewards/rejected": -3.460237979888916, + "step": 13856 + }, + { + "epoch": 3.47, + "grad_norm": 6.758835792541504, + "learning_rate": 2.1459588775697475e-06, + "logits/chosen": -0.5534224510192871, + "logits/rejected": -0.6619473099708557, + "logps/chosen": -49.455169677734375, + "logps/rejected": -96.62649536132812, + "loss": 0.5673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0554418563842773, + "rewards/margins": 7.561925888061523, + "rewards/rejected": -4.506484508514404, + "step": 13857 + }, + { + "epoch": 3.47, + "grad_norm": 9.910989761352539, + "learning_rate": 2.145313551410082e-06, + "logits/chosen": -0.49877920746803284, + "logits/rejected": -0.5869693756103516, + "logps/chosen": -57.531158447265625, + "logps/rejected": -98.04618072509766, + "loss": 0.663, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6130270957946777, + "rewards/margins": 5.8758087158203125, + "rewards/rejected": -3.2627811431884766, + "step": 13858 + }, + { + "epoch": 3.47, + "grad_norm": 2.972464084625244, + "learning_rate": 2.144668295792756e-06, + "logits/chosen": -0.46868425607681274, + "logits/rejected": -0.5863779783248901, + "logps/chosen": -56.376708984375, + "logps/rejected": -99.40174102783203, + "loss": 0.5661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.272320508956909, + "rewards/margins": 7.568955898284912, + "rewards/rejected": -4.296634674072266, + "step": 13859 + }, + { + "epoch": 3.47, + "grad_norm": 3.5510334968566895, + "learning_rate": 2.1440231107337147e-06, + "logits/chosen": -0.4676278233528137, + "logits/rejected": -0.5428469181060791, + "logps/chosen": -56.04936981201172, + "logps/rejected": -136.39010620117188, + "loss": 0.6068, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0259785652160645, + "rewards/margins": 9.189998626708984, + "rewards/rejected": -6.16402006149292, + "step": 13860 + }, + { + "epoch": 3.47, + "grad_norm": 5.242514610290527, + "learning_rate": 2.1433779962488976e-06, + "logits/chosen": -0.6121158003807068, + "logits/rejected": -0.6397266983985901, + "logps/chosen": -56.99812316894531, + "logps/rejected": -113.74559020996094, + "loss": 0.733, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8355538845062256, + "rewards/margins": 5.7538580894470215, + "rewards/rejected": -2.918303966522217, + "step": 13861 + }, + { + "epoch": 3.47, + "grad_norm": 3.4286062717437744, + "learning_rate": 2.1427329523542486e-06, + "logits/chosen": -0.4660431146621704, + "logits/rejected": -0.5261496901512146, + "logps/chosen": -55.58966827392578, + "logps/rejected": -102.89752960205078, + "loss": 0.6316, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.388127326965332, + "rewards/margins": 6.792579650878906, + "rewards/rejected": -3.404451847076416, + "step": 13862 + }, + { + "epoch": 3.47, + "grad_norm": 7.069372653961182, + "learning_rate": 2.142087979065711e-06, + "logits/chosen": -0.5932719707489014, + "logits/rejected": -0.6083747148513794, + "logps/chosen": -44.87824249267578, + "logps/rejected": -126.287109375, + "loss": 0.6002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.278130054473877, + "rewards/margins": 7.584402084350586, + "rewards/rejected": -4.306273460388184, + "step": 13863 + }, + { + "epoch": 3.47, + "grad_norm": 7.179802894592285, + "learning_rate": 2.1414430763992163e-06, + "logits/chosen": -0.5588563084602356, + "logits/rejected": -0.6790841817855835, + "logps/chosen": -64.4246826171875, + "logps/rejected": -103.08383178710938, + "loss": 0.6897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0576391220092773, + "rewards/margins": 7.217763900756836, + "rewards/rejected": -4.160124778747559, + "step": 13864 + }, + { + "epoch": 3.47, + "grad_norm": 3.1995956897735596, + "learning_rate": 2.140798244370703e-06, + "logits/chosen": -0.5212416052818298, + "logits/rejected": -0.6279944777488708, + "logps/chosen": -53.227134704589844, + "logps/rejected": -106.81837463378906, + "loss": 0.5451, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3881497383117676, + "rewards/margins": 7.610538005828857, + "rewards/rejected": -4.22238826751709, + "step": 13865 + }, + { + "epoch": 3.47, + "grad_norm": 4.743943214416504, + "learning_rate": 2.1401534829961085e-06, + "logits/chosen": -0.5396734476089478, + "logits/rejected": -0.6480642557144165, + "logps/chosen": -54.731353759765625, + "logps/rejected": -99.05766296386719, + "loss": 0.6525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.06994891166687, + "rewards/margins": 7.0177483558654785, + "rewards/rejected": -3.947798728942871, + "step": 13866 + }, + { + "epoch": 3.47, + "grad_norm": 9.491032600402832, + "learning_rate": 2.1395087922913634e-06, + "logits/chosen": -0.49715596437454224, + "logits/rejected": -0.5724661350250244, + "logps/chosen": -47.99653625488281, + "logps/rejected": -103.6051025390625, + "loss": 0.6101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1125240325927734, + "rewards/margins": 5.8334736824035645, + "rewards/rejected": -2.720949172973633, + "step": 13867 + }, + { + "epoch": 3.47, + "grad_norm": 18.618988037109375, + "learning_rate": 2.138864172272396e-06, + "logits/chosen": -0.4545583128929138, + "logits/rejected": -0.5396530628204346, + "logps/chosen": -64.70707702636719, + "logps/rejected": -104.8518295288086, + "loss": 0.7661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8836591243743896, + "rewards/margins": 5.708036422729492, + "rewards/rejected": -2.8243775367736816, + "step": 13868 + }, + { + "epoch": 3.47, + "grad_norm": 3.6053526401519775, + "learning_rate": 2.1382196229551406e-06, + "logits/chosen": -0.5059139728546143, + "logits/rejected": -0.5580682754516602, + "logps/chosen": -51.08966827392578, + "logps/rejected": -108.36741638183594, + "loss": 0.6126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3106424808502197, + "rewards/margins": 7.144863128662109, + "rewards/rejected": -3.8342204093933105, + "step": 13869 + }, + { + "epoch": 3.47, + "grad_norm": 5.357516765594482, + "learning_rate": 2.137575144355522e-06, + "logits/chosen": -0.5561277866363525, + "logits/rejected": -0.6361381411552429, + "logps/chosen": -54.06147384643555, + "logps/rejected": -125.932373046875, + "loss": 0.6315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2706820964813232, + "rewards/margins": 8.174537658691406, + "rewards/rejected": -4.903855800628662, + "step": 13870 + }, + { + "epoch": 3.47, + "grad_norm": 5.333071231842041, + "learning_rate": 2.1369307364894647e-06, + "logits/chosen": -0.5575261116027832, + "logits/rejected": -0.5944821238517761, + "logps/chosen": -54.9799919128418, + "logps/rejected": -97.61653900146484, + "loss": 0.6036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4991846084594727, + "rewards/margins": 6.024710655212402, + "rewards/rejected": -2.5255260467529297, + "step": 13871 + }, + { + "epoch": 3.47, + "grad_norm": 5.527179718017578, + "learning_rate": 2.1362863993728953e-06, + "logits/chosen": -0.5625640749931335, + "logits/rejected": -0.667086660861969, + "logps/chosen": -55.89160919189453, + "logps/rejected": -104.08665466308594, + "loss": 0.6053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1589651107788086, + "rewards/margins": 7.726219654083252, + "rewards/rejected": -4.56725549697876, + "step": 13872 + }, + { + "epoch": 3.47, + "grad_norm": 7.721793174743652, + "learning_rate": 2.1356421330217342e-06, + "logits/chosen": -0.4703490436077118, + "logits/rejected": -0.553621232509613, + "logps/chosen": -59.573272705078125, + "logps/rejected": -105.70304107666016, + "loss": 0.6839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9289326667785645, + "rewards/margins": 6.211551189422607, + "rewards/rejected": -3.282618522644043, + "step": 13873 + }, + { + "epoch": 3.47, + "grad_norm": 4.301298141479492, + "learning_rate": 2.134997937451904e-06, + "logits/chosen": -0.5424519777297974, + "logits/rejected": -0.6110051870346069, + "logps/chosen": -61.96955108642578, + "logps/rejected": -104.77302551269531, + "loss": 0.6633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2116804122924805, + "rewards/margins": 6.866135597229004, + "rewards/rejected": -3.6544549465179443, + "step": 13874 + }, + { + "epoch": 3.47, + "grad_norm": 4.092987060546875, + "learning_rate": 2.1343538126793217e-06, + "logits/chosen": -0.5447818040847778, + "logits/rejected": -0.6330041289329529, + "logps/chosen": -53.86678695678711, + "logps/rejected": -127.1025619506836, + "loss": 0.5584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9937398433685303, + "rewards/margins": 7.752750396728516, + "rewards/rejected": -4.759010314941406, + "step": 13875 + }, + { + "epoch": 3.47, + "grad_norm": 2.870535373687744, + "learning_rate": 2.1337097587199035e-06, + "logits/chosen": -0.5747760534286499, + "logits/rejected": -0.6396129727363586, + "logps/chosen": -57.10624313354492, + "logps/rejected": -114.32276916503906, + "loss": 0.5806, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9911067485809326, + "rewards/margins": 7.429586887359619, + "rewards/rejected": -4.438480377197266, + "step": 13876 + }, + { + "epoch": 3.47, + "grad_norm": 3.7185699939727783, + "learning_rate": 2.133065775589567e-06, + "logits/chosen": -0.48936381936073303, + "logits/rejected": -0.5852414965629578, + "logps/chosen": -55.265018463134766, + "logps/rejected": -102.0654296875, + "loss": 0.554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9382424354553223, + "rewards/margins": 7.107565402984619, + "rewards/rejected": -4.169322967529297, + "step": 13877 + }, + { + "epoch": 3.47, + "grad_norm": 3.4327335357666016, + "learning_rate": 2.1324218633042253e-06, + "logits/chosen": -0.5758526921272278, + "logits/rejected": -0.6525595188140869, + "logps/chosen": -46.984745025634766, + "logps/rejected": -101.51615142822266, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0878798961639404, + "rewards/margins": 6.896644115447998, + "rewards/rejected": -3.8087642192840576, + "step": 13878 + }, + { + "epoch": 3.47, + "grad_norm": 4.4353508949279785, + "learning_rate": 2.131778021879787e-06, + "logits/chosen": -0.5654081702232361, + "logits/rejected": -0.6258476972579956, + "logps/chosen": -57.51719665527344, + "logps/rejected": -108.89700317382812, + "loss": 0.6502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0157885551452637, + "rewards/margins": 5.874627590179443, + "rewards/rejected": -2.8588385581970215, + "step": 13879 + }, + { + "epoch": 3.47, + "grad_norm": 5.319180488586426, + "learning_rate": 2.1311342513321653e-06, + "logits/chosen": -0.5032809972763062, + "logits/rejected": -0.5629940629005432, + "logps/chosen": -71.68211364746094, + "logps/rejected": -113.22884368896484, + "loss": 0.6065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9426028728485107, + "rewards/margins": 6.731402397155762, + "rewards/rejected": -3.7887990474700928, + "step": 13880 + }, + { + "epoch": 3.47, + "grad_norm": 3.5819692611694336, + "learning_rate": 2.1304905516772713e-06, + "logits/chosen": -0.5439521670341492, + "logits/rejected": -0.6674996614456177, + "logps/chosen": -54.07019805908203, + "logps/rejected": -113.00221252441406, + "loss": 0.6044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0532243251800537, + "rewards/margins": 8.231424331665039, + "rewards/rejected": -5.178199768066406, + "step": 13881 + }, + { + "epoch": 3.47, + "grad_norm": 6.861718654632568, + "learning_rate": 2.1298469229310036e-06, + "logits/chosen": -0.5646821856498718, + "logits/rejected": -0.6868218183517456, + "logps/chosen": -59.70672607421875, + "logps/rejected": -111.7137680053711, + "loss": 0.6767, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2705798149108887, + "rewards/margins": 8.502880096435547, + "rewards/rejected": -5.232300758361816, + "step": 13882 + }, + { + "epoch": 3.47, + "grad_norm": 3.1306543350219727, + "learning_rate": 2.1292033651092716e-06, + "logits/chosen": -0.5052328705787659, + "logits/rejected": -0.6026380658149719, + "logps/chosen": -49.180259704589844, + "logps/rejected": -114.08301544189453, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1203391551971436, + "rewards/margins": 8.227904319763184, + "rewards/rejected": -5.1075639724731445, + "step": 13883 + }, + { + "epoch": 3.47, + "grad_norm": 4.108235836029053, + "learning_rate": 2.12855987822798e-06, + "logits/chosen": -0.5168181657791138, + "logits/rejected": -0.6135086417198181, + "logps/chosen": -53.89894104003906, + "logps/rejected": -96.2939453125, + "loss": 0.5825, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.136686325073242, + "rewards/margins": 7.9117512702941895, + "rewards/rejected": -4.775064468383789, + "step": 13884 + }, + { + "epoch": 3.47, + "grad_norm": 4.186982154846191, + "learning_rate": 2.1279164623030275e-06, + "logits/chosen": -0.5553982257843018, + "logits/rejected": -0.6267092823982239, + "logps/chosen": -43.519649505615234, + "logps/rejected": -100.72390747070312, + "loss": 0.5909, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3252816200256348, + "rewards/margins": 7.184577465057373, + "rewards/rejected": -3.8592963218688965, + "step": 13885 + }, + { + "epoch": 3.47, + "grad_norm": 27.27309799194336, + "learning_rate": 2.1272731173503127e-06, + "logits/chosen": -0.5645941495895386, + "logits/rejected": -0.6332708597183228, + "logps/chosen": -46.30206298828125, + "logps/rejected": -112.65850830078125, + "loss": 0.6379, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.052351474761963, + "rewards/margins": 7.3982462882995605, + "rewards/rejected": -4.345895290374756, + "step": 13886 + }, + { + "epoch": 3.47, + "grad_norm": 2.7679011821746826, + "learning_rate": 2.1266298433857363e-06, + "logits/chosen": -0.49841582775115967, + "logits/rejected": -0.6247459053993225, + "logps/chosen": -56.821678161621094, + "logps/rejected": -111.63843536376953, + "loss": 0.5343, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2788100242614746, + "rewards/margins": 8.202361106872559, + "rewards/rejected": -4.923551559448242, + "step": 13887 + }, + { + "epoch": 3.47, + "grad_norm": 6.22856330871582, + "learning_rate": 2.1259866404251926e-06, + "logits/chosen": -0.5395506620407104, + "logits/rejected": -0.6291320323944092, + "logps/chosen": -51.37287521362305, + "logps/rejected": -96.43091583251953, + "loss": 0.6823, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2010912895202637, + "rewards/margins": 7.243997573852539, + "rewards/rejected": -4.042905807495117, + "step": 13888 + }, + { + "epoch": 3.47, + "grad_norm": 14.970499992370605, + "learning_rate": 2.125343508484575e-06, + "logits/chosen": -0.5358734726905823, + "logits/rejected": -0.6823527812957764, + "logps/chosen": -60.69672393798828, + "logps/rejected": -98.48577880859375, + "loss": 0.7639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.135087251663208, + "rewards/margins": 6.775819301605225, + "rewards/rejected": -3.6407318115234375, + "step": 13889 + }, + { + "epoch": 3.47, + "grad_norm": 6.221726894378662, + "learning_rate": 2.1247004475797777e-06, + "logits/chosen": -0.5937995910644531, + "logits/rejected": -0.6516434550285339, + "logps/chosen": -55.607810974121094, + "logps/rejected": -100.5169677734375, + "loss": 0.6584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0826237201690674, + "rewards/margins": 6.211455345153809, + "rewards/rejected": -3.1288321018218994, + "step": 13890 + }, + { + "epoch": 3.48, + "grad_norm": 5.889821529388428, + "learning_rate": 2.1240574577266893e-06, + "logits/chosen": -0.542617678642273, + "logits/rejected": -0.5881778001785278, + "logps/chosen": -57.41658020019531, + "logps/rejected": -116.16777038574219, + "loss": 0.584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.239199638366699, + "rewards/margins": 6.51750373840332, + "rewards/rejected": -3.278303623199463, + "step": 13891 + }, + { + "epoch": 3.48, + "grad_norm": 3.798905849456787, + "learning_rate": 2.123414538941202e-06, + "logits/chosen": -0.5265157222747803, + "logits/rejected": -0.6187188625335693, + "logps/chosen": -49.236053466796875, + "logps/rejected": -103.78507232666016, + "loss": 0.5844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5317575931549072, + "rewards/margins": 7.206601142883301, + "rewards/rejected": -4.6748433113098145, + "step": 13892 + }, + { + "epoch": 3.48, + "grad_norm": 3.3008368015289307, + "learning_rate": 2.1227716912391993e-06, + "logits/chosen": -0.5270167589187622, + "logits/rejected": -0.6161328554153442, + "logps/chosen": -51.6126708984375, + "logps/rejected": -96.66481018066406, + "loss": 0.5591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.320600748062134, + "rewards/margins": 6.294685363769531, + "rewards/rejected": -2.97408390045166, + "step": 13893 + }, + { + "epoch": 3.48, + "grad_norm": 4.828752517700195, + "learning_rate": 2.1221289146365703e-06, + "logits/chosen": -0.5125726461410522, + "logits/rejected": -0.5824549198150635, + "logps/chosen": -63.81742477416992, + "logps/rejected": -103.70621490478516, + "loss": 0.6986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9080305099487305, + "rewards/margins": 6.94883394241333, + "rewards/rejected": -4.0408034324646, + "step": 13894 + }, + { + "epoch": 3.48, + "grad_norm": 2.2429938316345215, + "learning_rate": 2.1214862091491966e-06, + "logits/chosen": -0.46696919202804565, + "logits/rejected": -0.5968666076660156, + "logps/chosen": -51.99831008911133, + "logps/rejected": -111.58758544921875, + "loss": 0.5057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0824177265167236, + "rewards/margins": 8.123087882995605, + "rewards/rejected": -5.0406694412231445, + "step": 13895 + }, + { + "epoch": 3.48, + "grad_norm": 4.354242324829102, + "learning_rate": 2.1208435747929583e-06, + "logits/chosen": -0.572684109210968, + "logits/rejected": -0.620890736579895, + "logps/chosen": -52.773712158203125, + "logps/rejected": -116.11322021484375, + "loss": 0.5857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1997275352478027, + "rewards/margins": 6.801847457885742, + "rewards/rejected": -3.6021196842193604, + "step": 13896 + }, + { + "epoch": 3.48, + "grad_norm": 5.133448600769043, + "learning_rate": 2.12020101158374e-06, + "logits/chosen": -0.5747457146644592, + "logits/rejected": -0.6942895650863647, + "logps/chosen": -52.7934455871582, + "logps/rejected": -106.1234130859375, + "loss": 0.5982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1305692195892334, + "rewards/margins": 7.762704372406006, + "rewards/rejected": -4.632134914398193, + "step": 13897 + }, + { + "epoch": 3.48, + "grad_norm": 3.950854778289795, + "learning_rate": 2.119558519537416e-06, + "logits/chosen": -0.49704793095588684, + "logits/rejected": -0.60850989818573, + "logps/chosen": -80.03386688232422, + "logps/rejected": -123.43604278564453, + "loss": 0.6565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168015241622925, + "rewards/margins": 7.611062049865723, + "rewards/rejected": -4.443046569824219, + "step": 13898 + }, + { + "epoch": 3.48, + "grad_norm": 9.888419151306152, + "learning_rate": 2.1189160986698666e-06, + "logits/chosen": -0.6093774437904358, + "logits/rejected": -0.711554765701294, + "logps/chosen": -48.56943130493164, + "logps/rejected": -117.22189331054688, + "loss": 0.5398, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.973463535308838, + "rewards/margins": 8.496474266052246, + "rewards/rejected": -5.523010730743408, + "step": 13899 + }, + { + "epoch": 3.48, + "grad_norm": 3.615661382675171, + "learning_rate": 2.1182737489969647e-06, + "logits/chosen": -0.560470700263977, + "logits/rejected": -0.6580020785331726, + "logps/chosen": -50.98957824707031, + "logps/rejected": -114.98484802246094, + "loss": 0.5504, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0744996070861816, + "rewards/margins": 8.473691940307617, + "rewards/rejected": -5.399191856384277, + "step": 13900 + }, + { + "epoch": 3.48, + "grad_norm": 4.053528785705566, + "learning_rate": 2.117631470534582e-06, + "logits/chosen": -0.49384576082229614, + "logits/rejected": -0.5851638913154602, + "logps/chosen": -64.95326232910156, + "logps/rejected": -101.10779571533203, + "loss": 0.6104, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1416468620300293, + "rewards/margins": 7.155315399169922, + "rewards/rejected": -4.013668060302734, + "step": 13901 + }, + { + "epoch": 3.48, + "grad_norm": 14.349557876586914, + "learning_rate": 2.1169892632985926e-06, + "logits/chosen": -0.552676260471344, + "logits/rejected": -0.6106319427490234, + "logps/chosen": -67.80457305908203, + "logps/rejected": -116.60169219970703, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6199426651000977, + "rewards/margins": 6.38790225982666, + "rewards/rejected": -3.7679591178894043, + "step": 13902 + }, + { + "epoch": 3.48, + "grad_norm": 4.3279523849487305, + "learning_rate": 2.1163471273048657e-06, + "logits/chosen": -0.5259327292442322, + "logits/rejected": -0.5722109079360962, + "logps/chosen": -52.50749206542969, + "logps/rejected": -113.19026947021484, + "loss": 0.6643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.991426467895508, + "rewards/margins": 6.920319557189941, + "rewards/rejected": -3.9288930892944336, + "step": 13903 + }, + { + "epoch": 3.48, + "grad_norm": 3.316385269165039, + "learning_rate": 2.1157050625692666e-06, + "logits/chosen": -0.5788407325744629, + "logits/rejected": -0.6527299880981445, + "logps/chosen": -50.68111801147461, + "logps/rejected": -113.85324096679688, + "loss": 0.5678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1854846477508545, + "rewards/margins": 8.22915267944336, + "rewards/rejected": -5.043667793273926, + "step": 13904 + }, + { + "epoch": 3.48, + "grad_norm": 4.75669527053833, + "learning_rate": 2.115063069107663e-06, + "logits/chosen": -0.5444208383560181, + "logits/rejected": -0.5849330425262451, + "logps/chosen": -44.347904205322266, + "logps/rejected": -99.59379577636719, + "loss": 0.6707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2132678031921387, + "rewards/margins": 6.439053058624268, + "rewards/rejected": -3.225785493850708, + "step": 13905 + }, + { + "epoch": 3.48, + "grad_norm": 4.990482807159424, + "learning_rate": 2.1144211469359234e-06, + "logits/chosen": -0.5007972717285156, + "logits/rejected": -0.6172669529914856, + "logps/chosen": -55.38364028930664, + "logps/rejected": -82.30844116210938, + "loss": 0.6102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5027222633361816, + "rewards/margins": 6.711821556091309, + "rewards/rejected": -3.209099531173706, + "step": 13906 + }, + { + "epoch": 3.48, + "grad_norm": 9.761452674865723, + "learning_rate": 2.113779296069903e-06, + "logits/chosen": -0.5315518379211426, + "logits/rejected": -0.6270430088043213, + "logps/chosen": -71.6279296875, + "logps/rejected": -106.39958190917969, + "loss": 0.8268, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1787259578704834, + "rewards/margins": 6.846553802490234, + "rewards/rejected": -3.667828321456909, + "step": 13907 + }, + { + "epoch": 3.48, + "grad_norm": 4.292969226837158, + "learning_rate": 2.1131375165254656e-06, + "logits/chosen": -0.5408650040626526, + "logits/rejected": -0.6375011801719666, + "logps/chosen": -49.916908264160156, + "logps/rejected": -104.93563842773438, + "loss": 0.5666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3797028064727783, + "rewards/margins": 7.5589165687561035, + "rewards/rejected": -4.179214000701904, + "step": 13908 + }, + { + "epoch": 3.48, + "grad_norm": 6.87872314453125, + "learning_rate": 2.112495808318475e-06, + "logits/chosen": -0.584356963634491, + "logits/rejected": -0.6021863222122192, + "logps/chosen": -51.71855163574219, + "logps/rejected": -115.67509460449219, + "loss": 0.6588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.130758047103882, + "rewards/margins": 6.464883327484131, + "rewards/rejected": -3.33412504196167, + "step": 13909 + }, + { + "epoch": 3.48, + "grad_norm": 12.808956146240234, + "learning_rate": 2.11185417146478e-06, + "logits/chosen": -0.5146588087081909, + "logits/rejected": -0.6126495599746704, + "logps/chosen": -53.71168518066406, + "logps/rejected": -94.16199493408203, + "loss": 0.599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2870430946350098, + "rewards/margins": 7.426448345184326, + "rewards/rejected": -4.139404773712158, + "step": 13910 + }, + { + "epoch": 3.48, + "grad_norm": 13.798885345458984, + "learning_rate": 2.111212605980241e-06, + "logits/chosen": -0.526607096195221, + "logits/rejected": -0.5866847038269043, + "logps/chosen": -57.13508605957031, + "logps/rejected": -108.07218170166016, + "loss": 0.8011, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0308446884155273, + "rewards/margins": 6.692397117614746, + "rewards/rejected": -3.661552667617798, + "step": 13911 + }, + { + "epoch": 3.48, + "grad_norm": 7.727904319763184, + "learning_rate": 2.1105711118807128e-06, + "logits/chosen": -0.5537669658660889, + "logits/rejected": -0.6429962515830994, + "logps/chosen": -66.12763977050781, + "logps/rejected": -106.21868133544922, + "loss": 0.6903, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8670666217803955, + "rewards/margins": 6.352407455444336, + "rewards/rejected": -3.4853408336639404, + "step": 13912 + }, + { + "epoch": 3.48, + "grad_norm": 5.245250225067139, + "learning_rate": 2.1099296891820455e-06, + "logits/chosen": -0.6065376996994019, + "logits/rejected": -0.6584892868995667, + "logps/chosen": -55.28932571411133, + "logps/rejected": -101.67625427246094, + "loss": 0.6959, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.002533435821533, + "rewards/margins": 6.295022010803223, + "rewards/rejected": -3.2924880981445312, + "step": 13913 + }, + { + "epoch": 3.48, + "grad_norm": 2.801788330078125, + "learning_rate": 2.109288337900088e-06, + "logits/chosen": -0.5262840390205383, + "logits/rejected": -0.6065307855606079, + "logps/chosen": -59.77240753173828, + "logps/rejected": -112.15972900390625, + "loss": 0.5986, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.171243667602539, + "rewards/margins": 7.748283863067627, + "rewards/rejected": -4.577040195465088, + "step": 13914 + }, + { + "epoch": 3.48, + "grad_norm": 8.189481735229492, + "learning_rate": 2.108647058050692e-06, + "logits/chosen": -0.5790849924087524, + "logits/rejected": -0.6649419665336609, + "logps/chosen": -64.05462646484375, + "logps/rejected": -96.36066436767578, + "loss": 0.7094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118227005004883, + "rewards/margins": 6.8947248458862305, + "rewards/rejected": -3.7764976024627686, + "step": 13915 + }, + { + "epoch": 3.48, + "grad_norm": 3.8801357746124268, + "learning_rate": 2.108005849649701e-06, + "logits/chosen": -0.5150519609451294, + "logits/rejected": -0.627272367477417, + "logps/chosen": -54.29132843017578, + "logps/rejected": -100.62582397460938, + "loss": 0.6075, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.298704147338867, + "rewards/margins": 7.291049957275391, + "rewards/rejected": -3.9923453330993652, + "step": 13916 + }, + { + "epoch": 3.48, + "grad_norm": 9.291265487670898, + "learning_rate": 2.1073647127129627e-06, + "logits/chosen": -0.6185157895088196, + "logits/rejected": -0.6821139454841614, + "logps/chosen": -54.75519561767578, + "logps/rejected": -100.04585266113281, + "loss": 0.6412, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0151476860046387, + "rewards/margins": 6.501552104949951, + "rewards/rejected": -3.4864041805267334, + "step": 13917 + }, + { + "epoch": 3.48, + "grad_norm": 9.881006240844727, + "learning_rate": 2.10672364725632e-06, + "logits/chosen": -0.5434317588806152, + "logits/rejected": -0.6414031386375427, + "logps/chosen": -65.6390151977539, + "logps/rejected": -99.24417114257812, + "loss": 0.7529, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8931992053985596, + "rewards/margins": 6.435347557067871, + "rewards/rejected": -3.5421485900878906, + "step": 13918 + }, + { + "epoch": 3.48, + "grad_norm": 3.5347044467926025, + "learning_rate": 2.106082653295611e-06, + "logits/chosen": -0.5764725208282471, + "logits/rejected": -0.6445247530937195, + "logps/chosen": -61.997703552246094, + "logps/rejected": -101.1524887084961, + "loss": 0.6472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0831685066223145, + "rewards/margins": 6.173943519592285, + "rewards/rejected": -3.0907750129699707, + "step": 13919 + }, + { + "epoch": 3.48, + "grad_norm": 7.399306774139404, + "learning_rate": 2.1054417308466795e-06, + "logits/chosen": -0.4891088306903839, + "logits/rejected": -0.5391634106636047, + "logps/chosen": -59.42521667480469, + "logps/rejected": -120.261962890625, + "loss": 0.7304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.406564712524414, + "rewards/margins": 7.034660816192627, + "rewards/rejected": -3.628096580505371, + "step": 13920 + }, + { + "epoch": 3.48, + "grad_norm": 11.016700744628906, + "learning_rate": 2.1048008799253624e-06, + "logits/chosen": -0.4990614950656891, + "logits/rejected": -0.6274830102920532, + "logps/chosen": -74.52840423583984, + "logps/rejected": -107.49102783203125, + "loss": 0.6785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.41345477104187, + "rewards/margins": 6.871148586273193, + "rewards/rejected": -4.457694053649902, + "step": 13921 + }, + { + "epoch": 3.48, + "grad_norm": 2.8891663551330566, + "learning_rate": 2.1041601005474922e-06, + "logits/chosen": -0.5096337199211121, + "logits/rejected": -0.5841242671012878, + "logps/chosen": -51.044921875, + "logps/rejected": -110.06092071533203, + "loss": 0.5741, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1580076217651367, + "rewards/margins": 8.006850242614746, + "rewards/rejected": -4.848842620849609, + "step": 13922 + }, + { + "epoch": 3.48, + "grad_norm": 3.808530569076538, + "learning_rate": 2.1035193927289065e-06, + "logits/chosen": -0.6061837673187256, + "logits/rejected": -0.6781587600708008, + "logps/chosen": -44.65718078613281, + "logps/rejected": -102.99159240722656, + "loss": 0.602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205191135406494, + "rewards/margins": 7.46656608581543, + "rewards/rejected": -4.261374473571777, + "step": 13923 + }, + { + "epoch": 3.48, + "grad_norm": 6.75936222076416, + "learning_rate": 2.102878756485441e-06, + "logits/chosen": -0.5811034440994263, + "logits/rejected": -0.6472436785697937, + "logps/chosen": -51.22964096069336, + "logps/rejected": -100.1819839477539, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.504737615585327, + "rewards/margins": 6.99143123626709, + "rewards/rejected": -3.486694097518921, + "step": 13924 + }, + { + "epoch": 3.48, + "grad_norm": 3.2631900310516357, + "learning_rate": 2.10223819183292e-06, + "logits/chosen": -0.5820163488388062, + "logits/rejected": -0.6923280954360962, + "logps/chosen": -69.43244171142578, + "logps/rejected": -114.04486083984375, + "loss": 0.6615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.246636390686035, + "rewards/margins": 8.503244400024414, + "rewards/rejected": -5.256608009338379, + "step": 13925 + }, + { + "epoch": 3.48, + "grad_norm": 3.7035791873931885, + "learning_rate": 2.1015976987871747e-06, + "logits/chosen": -0.46941471099853516, + "logits/rejected": -0.5799277424812317, + "logps/chosen": -51.49494171142578, + "logps/rejected": -94.0212631225586, + "loss": 0.5801, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3147337436676025, + "rewards/margins": 6.166587829589844, + "rewards/rejected": -2.8518543243408203, + "step": 13926 + }, + { + "epoch": 3.48, + "grad_norm": 4.853264808654785, + "learning_rate": 2.100957277364035e-06, + "logits/chosen": -0.587458610534668, + "logits/rejected": -0.6605376601219177, + "logps/chosen": -44.785030364990234, + "logps/rejected": -96.99685668945312, + "loss": 0.5402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1477208137512207, + "rewards/margins": 7.122867107391357, + "rewards/rejected": -3.975146532058716, + "step": 13927 + }, + { + "epoch": 3.48, + "grad_norm": 5.949254512786865, + "learning_rate": 2.1003169275793245e-06, + "logits/chosen": -0.5257045030593872, + "logits/rejected": -0.6505575180053711, + "logps/chosen": -70.26140594482422, + "logps/rejected": -81.78860473632812, + "loss": 0.7663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1345739364624023, + "rewards/margins": 4.880670547485352, + "rewards/rejected": -1.7460963726043701, + "step": 13928 + }, + { + "epoch": 3.48, + "grad_norm": 7.697727680206299, + "learning_rate": 2.099676649448866e-06, + "logits/chosen": -0.48359501361846924, + "logits/rejected": -0.5503694415092468, + "logps/chosen": -57.15118408203125, + "logps/rejected": -109.68888092041016, + "loss": 0.694, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.06803035736084, + "rewards/margins": 6.629388809204102, + "rewards/rejected": -3.5613579750061035, + "step": 13929 + }, + { + "epoch": 3.48, + "grad_norm": 4.14877986907959, + "learning_rate": 2.0990364429884828e-06, + "logits/chosen": -0.4786103665828705, + "logits/rejected": -0.5536236763000488, + "logps/chosen": -55.16590118408203, + "logps/rejected": -132.9507293701172, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2185163497924805, + "rewards/margins": 8.373075485229492, + "rewards/rejected": -5.154559135437012, + "step": 13930 + }, + { + "epoch": 3.49, + "grad_norm": 6.8726806640625, + "learning_rate": 2.098396308213996e-06, + "logits/chosen": -0.5580439567565918, + "logits/rejected": -0.6662006974220276, + "logps/chosen": -59.120750427246094, + "logps/rejected": -111.25403594970703, + "loss": 0.669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1331191062927246, + "rewards/margins": 7.368264675140381, + "rewards/rejected": -4.235145092010498, + "step": 13931 + }, + { + "epoch": 3.49, + "grad_norm": 2.2250261306762695, + "learning_rate": 2.09775624514122e-06, + "logits/chosen": -0.4875684678554535, + "logits/rejected": -0.5343437790870667, + "logps/chosen": -70.56087493896484, + "logps/rejected": -123.11949157714844, + "loss": 0.5785, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.013183832168579, + "rewards/margins": 8.004535675048828, + "rewards/rejected": -4.991352558135986, + "step": 13932 + }, + { + "epoch": 3.49, + "grad_norm": 5.598970890045166, + "learning_rate": 2.0971162537859765e-06, + "logits/chosen": -0.5597472190856934, + "logits/rejected": -0.6764540076255798, + "logps/chosen": -43.16567611694336, + "logps/rejected": -99.03716278076172, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2767772674560547, + "rewards/margins": 7.758643627166748, + "rewards/rejected": -4.481866836547852, + "step": 13933 + }, + { + "epoch": 3.49, + "grad_norm": 3.0908327102661133, + "learning_rate": 2.0964763341640776e-06, + "logits/chosen": -0.5585636496543884, + "logits/rejected": -0.6172106266021729, + "logps/chosen": -43.584651947021484, + "logps/rejected": -112.3558120727539, + "loss": 0.4916, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.894862174987793, + "rewards/margins": 7.219276428222656, + "rewards/rejected": -4.324413776397705, + "step": 13934 + }, + { + "epoch": 3.49, + "grad_norm": 2.420316696166992, + "learning_rate": 2.0958364862913356e-06, + "logits/chosen": -0.5695266723632812, + "logits/rejected": -0.6310204267501831, + "logps/chosen": -58.65656280517578, + "logps/rejected": -113.87044525146484, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8588976860046387, + "rewards/margins": 7.803971290588379, + "rewards/rejected": -4.945073127746582, + "step": 13935 + }, + { + "epoch": 3.49, + "grad_norm": 5.178908824920654, + "learning_rate": 2.095196710183564e-06, + "logits/chosen": -0.5189260840415955, + "logits/rejected": -0.5820972323417664, + "logps/chosen": -59.96738052368164, + "logps/rejected": -121.71842193603516, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8528966903686523, + "rewards/margins": 7.449326038360596, + "rewards/rejected": -4.596429347991943, + "step": 13936 + }, + { + "epoch": 3.49, + "grad_norm": 4.251283168792725, + "learning_rate": 2.09455700585657e-06, + "logits/chosen": -0.6054858565330505, + "logits/rejected": -0.6588035821914673, + "logps/chosen": -51.46917724609375, + "logps/rejected": -100.96614074707031, + "loss": 0.6367, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3518073558807373, + "rewards/margins": 6.861544609069824, + "rewards/rejected": -3.509737014770508, + "step": 13937 + }, + { + "epoch": 3.49, + "grad_norm": 6.289852142333984, + "learning_rate": 2.0939173733261646e-06, + "logits/chosen": -0.5359914302825928, + "logits/rejected": -0.6187863945960999, + "logps/chosen": -50.15242004394531, + "logps/rejected": -104.56898498535156, + "loss": 0.5331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.351590394973755, + "rewards/margins": 7.9373674392700195, + "rewards/rejected": -4.585777282714844, + "step": 13938 + }, + { + "epoch": 3.49, + "grad_norm": 15.816264152526855, + "learning_rate": 2.0932778126081505e-06, + "logits/chosen": -0.5566062331199646, + "logits/rejected": -0.6158748269081116, + "logps/chosen": -43.045475006103516, + "logps/rejected": -119.99185943603516, + "loss": 0.6511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8727352619171143, + "rewards/margins": 6.7478346824646, + "rewards/rejected": -3.875098943710327, + "step": 13939 + }, + { + "epoch": 3.49, + "grad_norm": 8.267370223999023, + "learning_rate": 2.0926383237183346e-06, + "logits/chosen": -0.5663250684738159, + "logits/rejected": -0.5767492651939392, + "logps/chosen": -49.138553619384766, + "logps/rejected": -114.7973861694336, + "loss": 0.6841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9509902000427246, + "rewards/margins": 7.744815826416016, + "rewards/rejected": -4.793824672698975, + "step": 13940 + }, + { + "epoch": 3.49, + "grad_norm": 7.766568183898926, + "learning_rate": 2.0919989066725184e-06, + "logits/chosen": -0.5526983141899109, + "logits/rejected": -0.5708661079406738, + "logps/chosen": -57.62724685668945, + "logps/rejected": -114.43382263183594, + "loss": 0.7041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0946898460388184, + "rewards/margins": 6.735818862915039, + "rewards/rejected": -3.641129493713379, + "step": 13941 + }, + { + "epoch": 3.49, + "grad_norm": 2.762530565261841, + "learning_rate": 2.0913595614865007e-06, + "logits/chosen": -0.5725725889205933, + "logits/rejected": -0.654366672039032, + "logps/chosen": -46.372867584228516, + "logps/rejected": -106.44451141357422, + "loss": 0.5267, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3333308696746826, + "rewards/margins": 6.707536697387695, + "rewards/rejected": -3.374206066131592, + "step": 13942 + }, + { + "epoch": 3.49, + "grad_norm": 5.8902506828308105, + "learning_rate": 2.0907202881760834e-06, + "logits/chosen": -0.5203490257263184, + "logits/rejected": -0.6195275783538818, + "logps/chosen": -49.5562858581543, + "logps/rejected": -94.89957427978516, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.206364393234253, + "rewards/margins": 6.680912971496582, + "rewards/rejected": -3.47454833984375, + "step": 13943 + }, + { + "epoch": 3.49, + "grad_norm": 3.847444534301758, + "learning_rate": 2.09008108675706e-06, + "logits/chosen": -0.567634105682373, + "logits/rejected": -0.6539463996887207, + "logps/chosen": -53.389434814453125, + "logps/rejected": -101.49314880371094, + "loss": 0.6077, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8715744018554688, + "rewards/margins": 6.940927028656006, + "rewards/rejected": -4.069352149963379, + "step": 13944 + }, + { + "epoch": 3.49, + "grad_norm": 5.738885879516602, + "learning_rate": 2.0894419572452308e-06, + "logits/chosen": -0.49597790837287903, + "logits/rejected": -0.5738160610198975, + "logps/chosen": -52.184303283691406, + "logps/rejected": -89.71591186523438, + "loss": 0.6398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.126647472381592, + "rewards/margins": 6.178714275360107, + "rewards/rejected": -3.0520663261413574, + "step": 13945 + }, + { + "epoch": 3.49, + "grad_norm": 3.079740285873413, + "learning_rate": 2.0888028996563864e-06, + "logits/chosen": -0.6203400492668152, + "logits/rejected": -0.7065893411636353, + "logps/chosen": -50.55379867553711, + "logps/rejected": -89.52428436279297, + "loss": 0.5217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0618393421173096, + "rewards/margins": 7.044950008392334, + "rewards/rejected": -3.983111619949341, + "step": 13946 + }, + { + "epoch": 3.49, + "grad_norm": 7.354219913482666, + "learning_rate": 2.088163914006317e-06, + "logits/chosen": -0.5546844005584717, + "logits/rejected": -0.616989016532898, + "logps/chosen": -58.74855041503906, + "logps/rejected": -93.45899200439453, + "loss": 0.7468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.809699535369873, + "rewards/margins": 5.4085373878479, + "rewards/rejected": -2.5988383293151855, + "step": 13947 + }, + { + "epoch": 3.49, + "grad_norm": 6.042010307312012, + "learning_rate": 2.0875250003108145e-06, + "logits/chosen": -0.49479708075523376, + "logits/rejected": -0.5730584263801575, + "logps/chosen": -60.998409271240234, + "logps/rejected": -103.5145492553711, + "loss": 0.6806, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0336403846740723, + "rewards/margins": 6.660843849182129, + "rewards/rejected": -3.627203941345215, + "step": 13948 + }, + { + "epoch": 3.49, + "grad_norm": 4.365046501159668, + "learning_rate": 2.08688615858567e-06, + "logits/chosen": -0.637671947479248, + "logits/rejected": -0.7109739780426025, + "logps/chosen": -64.59994506835938, + "logps/rejected": -101.50321960449219, + "loss": 0.6935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1011545658111572, + "rewards/margins": 6.900570392608643, + "rewards/rejected": -3.7994155883789062, + "step": 13949 + }, + { + "epoch": 3.49, + "grad_norm": 4.285833358764648, + "learning_rate": 2.0862473888466645e-06, + "logits/chosen": -0.5657894611358643, + "logits/rejected": -0.6240354180335999, + "logps/chosen": -74.72804260253906, + "logps/rejected": -98.99488830566406, + "loss": 0.7253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.005323886871338, + "rewards/margins": 6.499635696411133, + "rewards/rejected": -3.494311809539795, + "step": 13950 + }, + { + "epoch": 3.49, + "grad_norm": 4.5446600914001465, + "learning_rate": 2.0856086911095847e-06, + "logits/chosen": -0.4786462187767029, + "logits/rejected": -0.5272368788719177, + "logps/chosen": -53.04647445678711, + "logps/rejected": -108.26567840576172, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9104113578796387, + "rewards/margins": 7.124283313751221, + "rewards/rejected": -4.213871955871582, + "step": 13951 + }, + { + "epoch": 3.49, + "grad_norm": 3.030994415283203, + "learning_rate": 2.0849700653902176e-06, + "logits/chosen": -0.5481524467468262, + "logits/rejected": -0.6378974914550781, + "logps/chosen": -60.92502212524414, + "logps/rejected": -129.41574096679688, + "loss": 0.6127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2752952575683594, + "rewards/margins": 9.072101593017578, + "rewards/rejected": -5.796806335449219, + "step": 13952 + }, + { + "epoch": 3.49, + "grad_norm": 4.009594440460205, + "learning_rate": 2.0843315117043373e-06, + "logits/chosen": -0.5786826014518738, + "logits/rejected": -0.6678312420845032, + "logps/chosen": -53.09883117675781, + "logps/rejected": -120.17124938964844, + "loss": 0.6359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1345205307006836, + "rewards/margins": 8.422590255737305, + "rewards/rejected": -5.288069725036621, + "step": 13953 + }, + { + "epoch": 3.49, + "grad_norm": 7.442895412445068, + "learning_rate": 2.0836930300677266e-06, + "logits/chosen": -0.6027575135231018, + "logits/rejected": -0.6700005531311035, + "logps/chosen": -63.55620193481445, + "logps/rejected": -95.63827514648438, + "loss": 0.8714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.919933319091797, + "rewards/margins": 5.914336204528809, + "rewards/rejected": -2.99440336227417, + "step": 13954 + }, + { + "epoch": 3.49, + "grad_norm": 5.817948818206787, + "learning_rate": 2.0830546204961645e-06, + "logits/chosen": -0.5736029148101807, + "logits/rejected": -0.5912287831306458, + "logps/chosen": -42.352848052978516, + "logps/rejected": -101.77044677734375, + "loss": 0.5715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.224818229675293, + "rewards/margins": 5.919116497039795, + "rewards/rejected": -2.694298505783081, + "step": 13955 + }, + { + "epoch": 3.49, + "grad_norm": 5.448991298675537, + "learning_rate": 2.0824162830054257e-06, + "logits/chosen": -0.6235910654067993, + "logits/rejected": -0.7022764682769775, + "logps/chosen": -53.383087158203125, + "logps/rejected": -105.15289306640625, + "loss": 0.6967, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7780604362487793, + "rewards/margins": 7.62969970703125, + "rewards/rejected": -4.851639747619629, + "step": 13956 + }, + { + "epoch": 3.49, + "grad_norm": 13.72426700592041, + "learning_rate": 2.0817780176112816e-06, + "logits/chosen": -0.4640714228153229, + "logits/rejected": -0.5913983583450317, + "logps/chosen": -64.92863464355469, + "logps/rejected": -88.72239685058594, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4084649085998535, + "rewards/margins": 6.927639007568359, + "rewards/rejected": -3.5191736221313477, + "step": 13957 + }, + { + "epoch": 3.49, + "grad_norm": 5.5225725173950195, + "learning_rate": 2.081139824329509e-06, + "logits/chosen": -0.5256285071372986, + "logits/rejected": -0.598305344581604, + "logps/chosen": -54.53052520751953, + "logps/rejected": -128.16192626953125, + "loss": 0.6008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2419395446777344, + "rewards/margins": 9.066402435302734, + "rewards/rejected": -5.824463844299316, + "step": 13958 + }, + { + "epoch": 3.49, + "grad_norm": 7.0891642570495605, + "learning_rate": 2.0805017031758756e-06, + "logits/chosen": -0.4942333400249481, + "logits/rejected": -0.6153682470321655, + "logps/chosen": -71.53268432617188, + "logps/rejected": -90.90132141113281, + "loss": 0.6921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7101120948791504, + "rewards/margins": 5.847867488861084, + "rewards/rejected": -3.1377549171447754, + "step": 13959 + }, + { + "epoch": 3.49, + "grad_norm": 5.92012882232666, + "learning_rate": 2.0798636541661488e-06, + "logits/chosen": -0.5387704372406006, + "logits/rejected": -0.6278006434440613, + "logps/chosen": -56.248313903808594, + "logps/rejected": -114.34272766113281, + "loss": 0.5852, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0859107971191406, + "rewards/margins": 7.30360221862793, + "rewards/rejected": -4.217690944671631, + "step": 13960 + }, + { + "epoch": 3.49, + "grad_norm": 2.7114460468292236, + "learning_rate": 2.079225677316099e-06, + "logits/chosen": -0.620122492313385, + "logits/rejected": -0.6864988803863525, + "logps/chosen": -50.503013610839844, + "logps/rejected": -124.85052490234375, + "loss": 0.5883, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158088207244873, + "rewards/margins": 8.617461204528809, + "rewards/rejected": -5.459372520446777, + "step": 13961 + }, + { + "epoch": 3.49, + "grad_norm": 3.9713034629821777, + "learning_rate": 2.0785877726414877e-06, + "logits/chosen": -0.5655238032341003, + "logits/rejected": -0.6784954071044922, + "logps/chosen": -57.50684356689453, + "logps/rejected": -99.7399673461914, + "loss": 0.6302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5985898971557617, + "rewards/margins": 7.871621131896973, + "rewards/rejected": -4.273031234741211, + "step": 13962 + }, + { + "epoch": 3.49, + "grad_norm": 48.3369255065918, + "learning_rate": 2.077949940158081e-06, + "logits/chosen": -0.6108817458152771, + "logits/rejected": -0.7191736698150635, + "logps/chosen": -61.27603530883789, + "logps/rejected": -81.6289291381836, + "loss": 0.7813, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.857759475708008, + "rewards/margins": 6.035027980804443, + "rewards/rejected": -3.1772677898406982, + "step": 13963 + }, + { + "epoch": 3.49, + "grad_norm": 4.892452239990234, + "learning_rate": 2.07731217988164e-06, + "logits/chosen": -0.4982255697250366, + "logits/rejected": -0.5814648270606995, + "logps/chosen": -56.215110778808594, + "logps/rejected": -103.09640502929688, + "loss": 0.7002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089338541030884, + "rewards/margins": 6.149715423583984, + "rewards/rejected": -3.0603771209716797, + "step": 13964 + }, + { + "epoch": 3.49, + "grad_norm": 6.602901935577393, + "learning_rate": 2.076674491827922e-06, + "logits/chosen": -0.6181696057319641, + "logits/rejected": -0.628061056137085, + "logps/chosen": -46.113624572753906, + "logps/rejected": -98.40172576904297, + "loss": 0.653, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5185577869415283, + "rewards/margins": 6.012089729309082, + "rewards/rejected": -2.4935317039489746, + "step": 13965 + }, + { + "epoch": 3.49, + "grad_norm": 9.391653060913086, + "learning_rate": 2.076036876012688e-06, + "logits/chosen": -0.5037298798561096, + "logits/rejected": -0.5825454592704773, + "logps/chosen": -49.36949920654297, + "logps/rejected": -103.2413558959961, + "loss": 0.6365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.864457607269287, + "rewards/margins": 7.1469831466674805, + "rewards/rejected": -4.282524585723877, + "step": 13966 + }, + { + "epoch": 3.49, + "grad_norm": 5.69437313079834, + "learning_rate": 2.075399332451693e-06, + "logits/chosen": -0.5891388058662415, + "logits/rejected": -0.6756871342658997, + "logps/chosen": -53.757442474365234, + "logps/rejected": -112.83740234375, + "loss": 0.6072, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7735328674316406, + "rewards/margins": 7.623420715332031, + "rewards/rejected": -4.849886894226074, + "step": 13967 + }, + { + "epoch": 3.49, + "grad_norm": 49.596561431884766, + "learning_rate": 2.0747618611606894e-06, + "logits/chosen": -0.5523674488067627, + "logits/rejected": -0.6447967886924744, + "logps/chosen": -62.693904876708984, + "logps/rejected": -99.96763610839844, + "loss": 0.8695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.752898693084717, + "rewards/margins": 5.893394470214844, + "rewards/rejected": -3.140495777130127, + "step": 13968 + }, + { + "epoch": 3.49, + "grad_norm": 6.043649673461914, + "learning_rate": 2.074124462155432e-06, + "logits/chosen": -0.5797048807144165, + "logits/rejected": -0.6614863872528076, + "logps/chosen": -59.839725494384766, + "logps/rejected": -104.5802001953125, + "loss": 0.6411, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2126951217651367, + "rewards/margins": 7.584719181060791, + "rewards/rejected": -4.372023582458496, + "step": 13969 + }, + { + "epoch": 3.49, + "grad_norm": 6.188208103179932, + "learning_rate": 2.073487135451673e-06, + "logits/chosen": -0.5770612955093384, + "logits/rejected": -0.6247917413711548, + "logps/chosen": -56.26826858520508, + "logps/rejected": -119.44689178466797, + "loss": 0.6988, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.213198184967041, + "rewards/margins": 6.8433380126953125, + "rewards/rejected": -3.6301398277282715, + "step": 13970 + }, + { + "epoch": 3.5, + "grad_norm": 6.686397075653076, + "learning_rate": 2.072849881065159e-06, + "logits/chosen": -0.6087473630905151, + "logits/rejected": -0.7262066602706909, + "logps/chosen": -58.637237548828125, + "logps/rejected": -83.03520202636719, + "loss": 0.5844, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.174345016479492, + "rewards/margins": 7.0712738037109375, + "rewards/rejected": -3.896928548812866, + "step": 13971 + }, + { + "epoch": 3.5, + "grad_norm": 6.512042999267578, + "learning_rate": 2.0722126990116364e-06, + "logits/chosen": -0.5081581473350525, + "logits/rejected": -0.5940134525299072, + "logps/chosen": -68.43310546875, + "logps/rejected": -125.70672607421875, + "loss": 0.6576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2496094703674316, + "rewards/margins": 8.633971214294434, + "rewards/rejected": -5.38436222076416, + "step": 13972 + }, + { + "epoch": 3.5, + "grad_norm": 8.038542747497559, + "learning_rate": 2.0715755893068547e-06, + "logits/chosen": -0.5932491421699524, + "logits/rejected": -0.6492445468902588, + "logps/chosen": -53.788936614990234, + "logps/rejected": -107.46487426757812, + "loss": 0.6387, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0994882583618164, + "rewards/margins": 6.806203842163086, + "rewards/rejected": -3.7067155838012695, + "step": 13973 + }, + { + "epoch": 3.5, + "grad_norm": 3.57521653175354, + "learning_rate": 2.0709385519665537e-06, + "logits/chosen": -0.5974677801132202, + "logits/rejected": -0.6755505800247192, + "logps/chosen": -43.80217742919922, + "logps/rejected": -112.38529968261719, + "loss": 0.5402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2626330852508545, + "rewards/margins": 8.531180381774902, + "rewards/rejected": -5.268547058105469, + "step": 13974 + }, + { + "epoch": 3.5, + "grad_norm": 4.195201873779297, + "learning_rate": 2.070301587006476e-06, + "logits/chosen": -0.5255947709083557, + "logits/rejected": -0.6570063829421997, + "logps/chosen": -58.48908615112305, + "logps/rejected": -110.87789154052734, + "loss": 0.5506, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9677491188049316, + "rewards/margins": 7.930948734283447, + "rewards/rejected": -4.963200092315674, + "step": 13975 + }, + { + "epoch": 3.5, + "grad_norm": 2.194873809814453, + "learning_rate": 2.0696646944423633e-06, + "logits/chosen": -0.5135886073112488, + "logits/rejected": -0.6432660818099976, + "logps/chosen": -67.55622863769531, + "logps/rejected": -94.33306121826172, + "loss": 0.5971, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8760528564453125, + "rewards/margins": 8.20200252532959, + "rewards/rejected": -5.325949668884277, + "step": 13976 + }, + { + "epoch": 3.5, + "grad_norm": 4.6097941398620605, + "learning_rate": 2.0690278742899526e-06, + "logits/chosen": -0.6163483262062073, + "logits/rejected": -0.6739877462387085, + "logps/chosen": -45.656307220458984, + "logps/rejected": -96.6647720336914, + "loss": 0.6329, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.053628921508789, + "rewards/margins": 6.734580993652344, + "rewards/rejected": -3.6809520721435547, + "step": 13977 + }, + { + "epoch": 3.5, + "grad_norm": 9.89595890045166, + "learning_rate": 2.0683911265649796e-06, + "logits/chosen": -0.5605137944221497, + "logits/rejected": -0.6368021368980408, + "logps/chosen": -64.58439636230469, + "logps/rejected": -105.44293212890625, + "loss": 0.7837, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9710450172424316, + "rewards/margins": 6.706777095794678, + "rewards/rejected": -3.735731840133667, + "step": 13978 + }, + { + "epoch": 3.5, + "grad_norm": 2.1158783435821533, + "learning_rate": 2.0677544512831817e-06, + "logits/chosen": -0.5713139772415161, + "logits/rejected": -0.6441764831542969, + "logps/chosen": -49.93641662597656, + "logps/rejected": -96.36643981933594, + "loss": 0.519, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1847617626190186, + "rewards/margins": 6.160500526428223, + "rewards/rejected": -2.975738048553467, + "step": 13979 + }, + { + "epoch": 3.5, + "grad_norm": 7.393056869506836, + "learning_rate": 2.0671178484602883e-06, + "logits/chosen": -0.5314658880233765, + "logits/rejected": -0.578075647354126, + "logps/chosen": -47.872352600097656, + "logps/rejected": -123.12368774414062, + "loss": 0.4685, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295884132385254, + "rewards/margins": 8.206366539001465, + "rewards/rejected": -4.910482406616211, + "step": 13980 + }, + { + "epoch": 3.5, + "grad_norm": 8.409868240356445, + "learning_rate": 2.0664813181120343e-06, + "logits/chosen": -0.5728209614753723, + "logits/rejected": -0.63429194688797, + "logps/chosen": -61.555721282958984, + "logps/rejected": -130.55067443847656, + "loss": 0.6738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.996066093444824, + "rewards/margins": 7.944809913635254, + "rewards/rejected": -4.94874382019043, + "step": 13981 + }, + { + "epoch": 3.5, + "grad_norm": 5.536940097808838, + "learning_rate": 2.065844860254145e-06, + "logits/chosen": -0.6447328329086304, + "logits/rejected": -0.7460836172103882, + "logps/chosen": -43.83633041381836, + "logps/rejected": -103.89566040039062, + "loss": 0.5016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9283905029296875, + "rewards/margins": 7.003873825073242, + "rewards/rejected": -4.075483322143555, + "step": 13982 + }, + { + "epoch": 3.5, + "grad_norm": 2.838444471359253, + "learning_rate": 2.065208474902352e-06, + "logits/chosen": -0.5116596817970276, + "logits/rejected": -0.6155194640159607, + "logps/chosen": -60.211727142333984, + "logps/rejected": -99.9427490234375, + "loss": 0.6092, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1974968910217285, + "rewards/margins": 6.79248571395874, + "rewards/rejected": -3.5949888229370117, + "step": 13983 + }, + { + "epoch": 3.5, + "grad_norm": 4.147797584533691, + "learning_rate": 2.064572162072379e-06, + "logits/chosen": -0.5309619903564453, + "logits/rejected": -0.6547262668609619, + "logps/chosen": -49.81623077392578, + "logps/rejected": -113.91657257080078, + "loss": 0.5344, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4158222675323486, + "rewards/margins": 9.693870544433594, + "rewards/rejected": -6.278048038482666, + "step": 13984 + }, + { + "epoch": 3.5, + "grad_norm": 3.8823976516723633, + "learning_rate": 2.063935921779949e-06, + "logits/chosen": -0.5818180441856384, + "logits/rejected": -0.62235426902771, + "logps/chosen": -53.691612243652344, + "logps/rejected": -96.18609619140625, + "loss": 0.6419, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9806926250457764, + "rewards/margins": 7.313397407531738, + "rewards/rejected": -4.332705497741699, + "step": 13985 + }, + { + "epoch": 3.5, + "grad_norm": 12.085996627807617, + "learning_rate": 2.0632997540407867e-06, + "logits/chosen": -0.6019939184188843, + "logits/rejected": -0.6808979511260986, + "logps/chosen": -48.941837310791016, + "logps/rejected": -106.03076934814453, + "loss": 0.6258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9833836555480957, + "rewards/margins": 6.560389518737793, + "rewards/rejected": -3.5770058631896973, + "step": 13986 + }, + { + "epoch": 3.5, + "grad_norm": 16.57223892211914, + "learning_rate": 2.062663658870609e-06, + "logits/chosen": -0.604674756526947, + "logits/rejected": -0.6670071482658386, + "logps/chosen": -52.34849548339844, + "logps/rejected": -130.96006774902344, + "loss": 0.6155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.379209518432617, + "rewards/margins": 8.643884658813477, + "rewards/rejected": -5.264674663543701, + "step": 13987 + }, + { + "epoch": 3.5, + "grad_norm": 12.174842834472656, + "learning_rate": 2.0620276362851386e-06, + "logits/chosen": -0.5827159881591797, + "logits/rejected": -0.6162727475166321, + "logps/chosen": -48.59065246582031, + "logps/rejected": -97.43719482421875, + "loss": 0.7252, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2436790466308594, + "rewards/margins": 6.453569412231445, + "rewards/rejected": -3.209890365600586, + "step": 13988 + }, + { + "epoch": 3.5, + "grad_norm": 3.1591804027557373, + "learning_rate": 2.0613916863000906e-06, + "logits/chosen": -0.6483585834503174, + "logits/rejected": -0.7082776427268982, + "logps/chosen": -47.82339096069336, + "logps/rejected": -101.96111297607422, + "loss": 0.57, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5929901599884033, + "rewards/margins": 7.176695823669434, + "rewards/rejected": -3.583705425262451, + "step": 13989 + }, + { + "epoch": 3.5, + "grad_norm": 13.462864875793457, + "learning_rate": 2.0607558089311776e-06, + "logits/chosen": -0.5505068302154541, + "logits/rejected": -0.6309493780136108, + "logps/chosen": -58.22480010986328, + "logps/rejected": -113.4165267944336, + "loss": 0.6357, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.440565347671509, + "rewards/margins": 7.7648468017578125, + "rewards/rejected": -4.324281692504883, + "step": 13990 + }, + { + "epoch": 3.5, + "grad_norm": 2.988585948944092, + "learning_rate": 2.0601200041941168e-06, + "logits/chosen": -0.5419399738311768, + "logits/rejected": -0.6420612931251526, + "logps/chosen": -56.36203384399414, + "logps/rejected": -98.86676788330078, + "loss": 0.6209, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.14518141746521, + "rewards/margins": 7.305734157562256, + "rewards/rejected": -4.160552978515625, + "step": 13991 + }, + { + "epoch": 3.5, + "grad_norm": 4.269409656524658, + "learning_rate": 2.0594842721046175e-06, + "logits/chosen": -0.4907723069190979, + "logits/rejected": -0.5626611709594727, + "logps/chosen": -58.93073654174805, + "logps/rejected": -100.51528930664062, + "loss": 0.645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.403460741043091, + "rewards/margins": 6.975841522216797, + "rewards/rejected": -3.5723812580108643, + "step": 13992 + }, + { + "epoch": 3.5, + "grad_norm": 3.5168354511260986, + "learning_rate": 2.058848612678388e-06, + "logits/chosen": -0.5733715891838074, + "logits/rejected": -0.6662912368774414, + "logps/chosen": -44.61022186279297, + "logps/rejected": -85.98994445800781, + "loss": 0.5529, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3604812622070312, + "rewards/margins": 6.734297275543213, + "rewards/rejected": -3.3738162517547607, + "step": 13993 + }, + { + "epoch": 3.5, + "grad_norm": 12.05023193359375, + "learning_rate": 2.058213025931137e-06, + "logits/chosen": -0.5144742131233215, + "logits/rejected": -0.5663151144981384, + "logps/chosen": -68.31968688964844, + "logps/rejected": -109.31401824951172, + "loss": 0.7271, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0955331325531006, + "rewards/margins": 6.8506059646606445, + "rewards/rejected": -3.755072593688965, + "step": 13994 + }, + { + "epoch": 3.5, + "grad_norm": 8.29123592376709, + "learning_rate": 2.057577511878576e-06, + "logits/chosen": -0.617138683795929, + "logits/rejected": -0.6676012277603149, + "logps/chosen": -50.589839935302734, + "logps/rejected": -78.83330535888672, + "loss": 0.6578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2940797805786133, + "rewards/margins": 5.302597999572754, + "rewards/rejected": -2.0085179805755615, + "step": 13995 + }, + { + "epoch": 3.5, + "grad_norm": 4.662010192871094, + "learning_rate": 2.0569420705364e-06, + "logits/chosen": -0.4834834039211273, + "logits/rejected": -0.5936951041221619, + "logps/chosen": -53.026058197021484, + "logps/rejected": -123.33161926269531, + "loss": 0.6104, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1507139205932617, + "rewards/margins": 8.297637939453125, + "rewards/rejected": -5.146923065185547, + "step": 13996 + }, + { + "epoch": 3.5, + "grad_norm": 6.24727725982666, + "learning_rate": 2.0563067019203155e-06, + "logits/chosen": -0.5652474761009216, + "logits/rejected": -0.6965495347976685, + "logps/chosen": -51.88113021850586, + "logps/rejected": -80.74417114257812, + "loss": 0.6563, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.104466438293457, + "rewards/margins": 6.2517499923706055, + "rewards/rejected": -3.1472837924957275, + "step": 13997 + }, + { + "epoch": 3.5, + "grad_norm": 4.1449174880981445, + "learning_rate": 2.055671406046025e-06, + "logits/chosen": -0.5467185378074646, + "logits/rejected": -0.6670193672180176, + "logps/chosen": -61.153228759765625, + "logps/rejected": -95.72286987304688, + "loss": 0.5923, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4411933422088623, + "rewards/margins": 7.565067768096924, + "rewards/rejected": -4.123874664306641, + "step": 13998 + }, + { + "epoch": 3.5, + "grad_norm": 1.8779125213623047, + "learning_rate": 2.0550361829292265e-06, + "logits/chosen": -0.5413002371788025, + "logits/rejected": -0.6650876998901367, + "logps/chosen": -45.68696594238281, + "logps/rejected": -107.38264465332031, + "loss": 0.5094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2743546962738037, + "rewards/margins": 7.908840179443359, + "rewards/rejected": -4.634484767913818, + "step": 13999 + }, + { + "epoch": 3.5, + "grad_norm": 7.797316551208496, + "learning_rate": 2.0544010325856146e-06, + "logits/chosen": -0.5499165058135986, + "logits/rejected": -0.5779799222946167, + "logps/chosen": -57.81241989135742, + "logps/rejected": -109.67816162109375, + "loss": 0.7133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0688796043395996, + "rewards/margins": 5.5066938400268555, + "rewards/rejected": -2.437814235687256, + "step": 14000 + }, + { + "epoch": 3.5, + "grad_norm": 3.5720343589782715, + "learning_rate": 2.053765955030888e-06, + "logits/chosen": -0.5705288648605347, + "logits/rejected": -0.6099535822868347, + "logps/chosen": -40.24630355834961, + "logps/rejected": -109.46465301513672, + "loss": 0.5854, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2530508041381836, + "rewards/margins": 6.0617828369140625, + "rewards/rejected": -2.8087315559387207, + "step": 14001 + }, + { + "epoch": 3.5, + "grad_norm": 4.568950176239014, + "learning_rate": 2.0531309502807378e-06, + "logits/chosen": -0.4722024202346802, + "logits/rejected": -0.5745512247085571, + "logps/chosen": -58.72495651245117, + "logps/rejected": -131.11550903320312, + "loss": 0.6288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8353466987609863, + "rewards/margins": 8.18686294555664, + "rewards/rejected": -5.35151481628418, + "step": 14002 + }, + { + "epoch": 3.5, + "grad_norm": 10.267629623413086, + "learning_rate": 2.052496018350855e-06, + "logits/chosen": -0.6256712079048157, + "logits/rejected": -0.6999974250793457, + "logps/chosen": -54.59095764160156, + "logps/rejected": -93.79400634765625, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.992246627807617, + "rewards/margins": 6.39172887802124, + "rewards/rejected": -3.3994827270507812, + "step": 14003 + }, + { + "epoch": 3.5, + "grad_norm": 10.685800552368164, + "learning_rate": 2.0518611592569326e-06, + "logits/chosen": -0.5425812602043152, + "logits/rejected": -0.5978420376777649, + "logps/chosen": -48.74365997314453, + "logps/rejected": -105.22386169433594, + "loss": 0.6561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.003837823867798, + "rewards/margins": 6.650484561920166, + "rewards/rejected": -3.64664626121521, + "step": 14004 + }, + { + "epoch": 3.5, + "grad_norm": 5.530854225158691, + "learning_rate": 2.0512263730146547e-06, + "logits/chosen": -0.5178535580635071, + "logits/rejected": -0.5716143846511841, + "logps/chosen": -59.886573791503906, + "logps/rejected": -108.82052612304688, + "loss": 0.6858, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9549920558929443, + "rewards/margins": 6.449239730834961, + "rewards/rejected": -3.4942474365234375, + "step": 14005 + }, + { + "epoch": 3.5, + "grad_norm": 21.382259368896484, + "learning_rate": 2.0505916596397113e-06, + "logits/chosen": -0.4884546399116516, + "logits/rejected": -0.5256190896034241, + "logps/chosen": -62.131370544433594, + "logps/rejected": -98.19932556152344, + "loss": 0.9497, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.730623722076416, + "rewards/margins": 5.436678409576416, + "rewards/rejected": -2.706054925918579, + "step": 14006 + }, + { + "epoch": 3.5, + "grad_norm": 9.337278366088867, + "learning_rate": 2.049957019147785e-06, + "logits/chosen": -0.5535391569137573, + "logits/rejected": -0.7060115337371826, + "logps/chosen": -55.87709045410156, + "logps/rejected": -100.36932373046875, + "loss": 0.6036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069716215133667, + "rewards/margins": 7.709290504455566, + "rewards/rejected": -4.6395745277404785, + "step": 14007 + }, + { + "epoch": 3.5, + "grad_norm": 3.9747326374053955, + "learning_rate": 2.0493224515545574e-06, + "logits/chosen": -0.5680084228515625, + "logits/rejected": -0.6202614307403564, + "logps/chosen": -54.009376525878906, + "logps/rejected": -90.19439697265625, + "loss": 0.6163, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9405417442321777, + "rewards/margins": 6.209926128387451, + "rewards/rejected": -3.2693843841552734, + "step": 14008 + }, + { + "epoch": 3.5, + "grad_norm": 16.736404418945312, + "learning_rate": 2.0486879568757117e-06, + "logits/chosen": -0.5432124733924866, + "logits/rejected": -0.6996200680732727, + "logps/chosen": -61.73832321166992, + "logps/rejected": -89.19371032714844, + "loss": 0.6656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8943586349487305, + "rewards/margins": 6.591471195220947, + "rewards/rejected": -3.6971123218536377, + "step": 14009 + }, + { + "epoch": 3.5, + "grad_norm": 4.241127014160156, + "learning_rate": 2.048053535126926e-06, + "logits/chosen": -0.47171691060066223, + "logits/rejected": -0.6773514747619629, + "logps/chosen": -73.20729064941406, + "logps/rejected": -109.59493255615234, + "loss": 0.5961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3111720085144043, + "rewards/margins": 8.526060104370117, + "rewards/rejected": -5.214887619018555, + "step": 14010 + }, + { + "epoch": 3.51, + "grad_norm": 7.066571235656738, + "learning_rate": 2.0474191863238753e-06, + "logits/chosen": -0.6139850616455078, + "logits/rejected": -0.6669202446937561, + "logps/chosen": -51.50849533081055, + "logps/rejected": -112.54785919189453, + "loss": 0.7058, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1177079677581787, + "rewards/margins": 6.710292816162109, + "rewards/rejected": -3.592585325241089, + "step": 14011 + }, + { + "epoch": 3.51, + "grad_norm": 4.1763081550598145, + "learning_rate": 2.0467849104822368e-06, + "logits/chosen": -0.5079350471496582, + "logits/rejected": -0.5978407859802246, + "logps/chosen": -50.11346435546875, + "logps/rejected": -86.49261474609375, + "loss": 0.6465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.958996534347534, + "rewards/margins": 6.052122116088867, + "rewards/rejected": -3.093125820159912, + "step": 14012 + }, + { + "epoch": 3.51, + "grad_norm": 16.27556610107422, + "learning_rate": 2.046150707617687e-06, + "logits/chosen": -0.598686158657074, + "logits/rejected": -0.7066898941993713, + "logps/chosen": -66.10533905029297, + "logps/rejected": -105.42291259765625, + "loss": 0.7167, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.145808458328247, + "rewards/margins": 7.571643829345703, + "rewards/rejected": -5.4258341789245605, + "step": 14013 + }, + { + "epoch": 3.51, + "grad_norm": 10.383798599243164, + "learning_rate": 2.045516577745894e-06, + "logits/chosen": -0.5109329223632812, + "logits/rejected": -0.6343653202056885, + "logps/chosen": -71.28443908691406, + "logps/rejected": -90.51958465576172, + "loss": 0.7659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9097790718078613, + "rewards/margins": 5.855165004730225, + "rewards/rejected": -2.9453859329223633, + "step": 14014 + }, + { + "epoch": 3.51, + "grad_norm": 8.217830657958984, + "learning_rate": 2.044882520882528e-06, + "logits/chosen": -0.5193871855735779, + "logits/rejected": -0.578471302986145, + "logps/chosen": -61.433746337890625, + "logps/rejected": -105.56692504882812, + "loss": 0.7899, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8116471767425537, + "rewards/margins": 6.506763458251953, + "rewards/rejected": -3.695115804672241, + "step": 14015 + }, + { + "epoch": 3.51, + "grad_norm": 3.5457539558410645, + "learning_rate": 2.0442485370432593e-06, + "logits/chosen": -0.590775191783905, + "logits/rejected": -0.6680428385734558, + "logps/chosen": -52.69710159301758, + "logps/rejected": -82.92365264892578, + "loss": 0.7854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5899226665496826, + "rewards/margins": 6.1872782707214355, + "rewards/rejected": -3.597355604171753, + "step": 14016 + }, + { + "epoch": 3.51, + "grad_norm": 4.217341423034668, + "learning_rate": 2.0436146262437533e-06, + "logits/chosen": -0.522180438041687, + "logits/rejected": -0.6152845025062561, + "logps/chosen": -62.137542724609375, + "logps/rejected": -119.46685028076172, + "loss": 0.5733, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0839593410491943, + "rewards/margins": 8.126008033752441, + "rewards/rejected": -5.042049884796143, + "step": 14017 + }, + { + "epoch": 3.51, + "grad_norm": 4.541460037231445, + "learning_rate": 2.0429807884996727e-06, + "logits/chosen": -0.48809367418289185, + "logits/rejected": -0.5381878614425659, + "logps/chosen": -58.540611267089844, + "logps/rejected": -99.96614074707031, + "loss": 0.6884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.983628988265991, + "rewards/margins": 5.488424777984619, + "rewards/rejected": -2.504795789718628, + "step": 14018 + }, + { + "epoch": 3.51, + "grad_norm": 10.841604232788086, + "learning_rate": 2.0423470238266833e-06, + "logits/chosen": -0.5021987557411194, + "logits/rejected": -0.5894396901130676, + "logps/chosen": -74.30392456054688, + "logps/rejected": -119.58747863769531, + "loss": 0.7217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7741219997406006, + "rewards/margins": 7.573240756988525, + "rewards/rejected": -4.799118995666504, + "step": 14019 + }, + { + "epoch": 3.51, + "grad_norm": 3.796724557876587, + "learning_rate": 2.041713332240446e-06, + "logits/chosen": -0.5170702338218689, + "logits/rejected": -0.6502444744110107, + "logps/chosen": -52.44617462158203, + "logps/rejected": -79.75035858154297, + "loss": 0.5362, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.166443347930908, + "rewards/margins": 6.649489879608154, + "rewards/rejected": -3.483046531677246, + "step": 14020 + }, + { + "epoch": 3.51, + "grad_norm": 6.086580753326416, + "learning_rate": 2.0410797137566162e-06, + "logits/chosen": -0.5251854658126831, + "logits/rejected": -0.5400820374488831, + "logps/chosen": -46.05712127685547, + "logps/rejected": -120.27037048339844, + "loss": 0.6164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1708712577819824, + "rewards/margins": 6.903239727020264, + "rewards/rejected": -3.7323687076568604, + "step": 14021 + }, + { + "epoch": 3.51, + "grad_norm": 6.649606704711914, + "learning_rate": 2.040446168390856e-06, + "logits/chosen": -0.4913008213043213, + "logits/rejected": -0.5150989294052124, + "logps/chosen": -55.527408599853516, + "logps/rejected": -132.9143829345703, + "loss": 0.6171, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.971724271774292, + "rewards/margins": 7.1938371658325195, + "rewards/rejected": -4.222112655639648, + "step": 14022 + }, + { + "epoch": 3.51, + "grad_norm": 4.2919793128967285, + "learning_rate": 2.0398126961588172e-06, + "logits/chosen": -0.5405352115631104, + "logits/rejected": -0.6345509886741638, + "logps/chosen": -52.37356948852539, + "logps/rejected": -114.46382904052734, + "loss": 0.6247, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0716633796691895, + "rewards/margins": 8.162774085998535, + "rewards/rejected": -5.0911102294921875, + "step": 14023 + }, + { + "epoch": 3.51, + "grad_norm": 4.257460117340088, + "learning_rate": 2.0391792970761564e-06, + "logits/chosen": -0.5299888849258423, + "logits/rejected": -0.6194700598716736, + "logps/chosen": -52.145179748535156, + "logps/rejected": -85.66677856445312, + "loss": 0.6176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.956827163696289, + "rewards/margins": 5.7900872230529785, + "rewards/rejected": -2.8332600593566895, + "step": 14024 + }, + { + "epoch": 3.51, + "grad_norm": 9.914484977722168, + "learning_rate": 2.038545971158523e-06, + "logits/chosen": -0.553242027759552, + "logits/rejected": -0.6515801548957825, + "logps/chosen": -49.427589416503906, + "logps/rejected": -89.48129272460938, + "loss": 0.7282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.288987159729004, + "rewards/margins": 7.5682783126831055, + "rewards/rejected": -4.27929162979126, + "step": 14025 + }, + { + "epoch": 3.51, + "grad_norm": 23.872859954833984, + "learning_rate": 2.0379127184215706e-06, + "logits/chosen": -0.5452751517295837, + "logits/rejected": -0.6365318298339844, + "logps/chosen": -56.16336441040039, + "logps/rejected": -95.40806579589844, + "loss": 0.6869, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1874892711639404, + "rewards/margins": 5.933675289154053, + "rewards/rejected": -2.746185779571533, + "step": 14026 + }, + { + "epoch": 3.51, + "grad_norm": 8.669049263000488, + "learning_rate": 2.0372795388809457e-06, + "logits/chosen": -0.5344294905662537, + "logits/rejected": -0.6552004218101501, + "logps/chosen": -46.97744369506836, + "logps/rejected": -90.7254409790039, + "loss": 0.5839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.915958881378174, + "rewards/margins": 7.028416633605957, + "rewards/rejected": -4.112457752227783, + "step": 14027 + }, + { + "epoch": 3.51, + "grad_norm": 3.4611825942993164, + "learning_rate": 2.0366464325522926e-06, + "logits/chosen": -0.5563005805015564, + "logits/rejected": -0.6207911968231201, + "logps/chosen": -64.59766387939453, + "logps/rejected": -117.486328125, + "loss": 0.6459, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0587992668151855, + "rewards/margins": 6.955709457397461, + "rewards/rejected": -3.8969099521636963, + "step": 14028 + }, + { + "epoch": 3.51, + "grad_norm": 4.650076866149902, + "learning_rate": 2.03601339945126e-06, + "logits/chosen": -0.5352379679679871, + "logits/rejected": -0.6011978387832642, + "logps/chosen": -55.779502868652344, + "logps/rejected": -96.85731506347656, + "loss": 0.6118, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3751206398010254, + "rewards/margins": 6.802095890045166, + "rewards/rejected": -3.426975965499878, + "step": 14029 + }, + { + "epoch": 3.51, + "grad_norm": 3.29436993598938, + "learning_rate": 2.035380439593487e-06, + "logits/chosen": -0.5015496611595154, + "logits/rejected": -0.5862900614738464, + "logps/chosen": -49.815673828125, + "logps/rejected": -114.62120819091797, + "loss": 0.6236, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.363628625869751, + "rewards/margins": 7.455355644226074, + "rewards/rejected": -4.091726779937744, + "step": 14030 + }, + { + "epoch": 3.51, + "grad_norm": 7.404783248901367, + "learning_rate": 2.0347475529946186e-06, + "logits/chosen": -0.5354757308959961, + "logits/rejected": -0.5842068791389465, + "logps/chosen": -59.40271759033203, + "logps/rejected": -114.38092803955078, + "loss": 0.6347, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2670836448669434, + "rewards/margins": 6.446923732757568, + "rewards/rejected": -3.179839849472046, + "step": 14031 + }, + { + "epoch": 3.51, + "grad_norm": 4.154367923736572, + "learning_rate": 2.0341147396702922e-06, + "logits/chosen": -0.5323320627212524, + "logits/rejected": -0.6042686700820923, + "logps/chosen": -63.082374572753906, + "logps/rejected": -105.56682586669922, + "loss": 0.5829, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.005244255065918, + "rewards/margins": 7.137079238891602, + "rewards/rejected": -4.131834506988525, + "step": 14032 + }, + { + "epoch": 3.51, + "grad_norm": 3.870887279510498, + "learning_rate": 2.033481999636143e-06, + "logits/chosen": -0.5290161371231079, + "logits/rejected": -0.5618388652801514, + "logps/chosen": -60.45429611206055, + "logps/rejected": -128.39044189453125, + "loss": 0.6194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.239187240600586, + "rewards/margins": 8.487508773803711, + "rewards/rejected": -5.248320579528809, + "step": 14033 + }, + { + "epoch": 3.51, + "grad_norm": 4.724803924560547, + "learning_rate": 2.0328493329078114e-06, + "logits/chosen": -0.5133908987045288, + "logits/rejected": -0.5933071374893188, + "logps/chosen": -56.60990524291992, + "logps/rejected": -103.80557250976562, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0123400688171387, + "rewards/margins": 6.916386127471924, + "rewards/rejected": -3.904045581817627, + "step": 14034 + }, + { + "epoch": 3.51, + "grad_norm": 3.8998332023620605, + "learning_rate": 2.0322167395009286e-06, + "logits/chosen": -0.5430036783218384, + "logits/rejected": -0.6522711515426636, + "logps/chosen": -57.817848205566406, + "logps/rejected": -95.61019897460938, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1256422996520996, + "rewards/margins": 7.272934913635254, + "rewards/rejected": -4.147292137145996, + "step": 14035 + }, + { + "epoch": 3.51, + "grad_norm": 4.737053871154785, + "learning_rate": 2.0315842194311247e-06, + "logits/chosen": -0.5617335438728333, + "logits/rejected": -0.6792757511138916, + "logps/chosen": -63.7706413269043, + "logps/rejected": -98.15154266357422, + "loss": 0.5956, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.926028251647949, + "rewards/margins": 6.90370512008667, + "rewards/rejected": -3.9776768684387207, + "step": 14036 + }, + { + "epoch": 3.51, + "grad_norm": 9.319992065429688, + "learning_rate": 2.030951772714032e-06, + "logits/chosen": -0.574306845664978, + "logits/rejected": -0.6496308445930481, + "logps/chosen": -53.90462112426758, + "logps/rejected": -91.07015991210938, + "loss": 0.7615, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.929076910018921, + "rewards/margins": 5.903865337371826, + "rewards/rejected": -2.9747889041900635, + "step": 14037 + }, + { + "epoch": 3.51, + "grad_norm": 6.4095611572265625, + "learning_rate": 2.030319399365283e-06, + "logits/chosen": -0.5780623555183411, + "logits/rejected": -0.6387174725532532, + "logps/chosen": -50.225685119628906, + "logps/rejected": -115.96941375732422, + "loss": 0.5921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.966818332672119, + "rewards/margins": 5.5824689865112305, + "rewards/rejected": -2.6156506538391113, + "step": 14038 + }, + { + "epoch": 3.51, + "grad_norm": 5.0524678230285645, + "learning_rate": 2.029687099400497e-06, + "logits/chosen": -0.5446112155914307, + "logits/rejected": -0.6658383011817932, + "logps/chosen": -52.557777404785156, + "logps/rejected": -88.6576156616211, + "loss": 0.6218, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3358726501464844, + "rewards/margins": 6.595296382904053, + "rewards/rejected": -3.2594237327575684, + "step": 14039 + }, + { + "epoch": 3.51, + "grad_norm": 3.13492751121521, + "learning_rate": 2.0290548728353015e-06, + "logits/chosen": -0.4500560462474823, + "logits/rejected": -0.519328773021698, + "logps/chosen": -57.747100830078125, + "logps/rejected": -103.85189056396484, + "loss": 0.6139, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.007051467895508, + "rewards/margins": 6.6784772872924805, + "rewards/rejected": -3.6714253425598145, + "step": 14040 + }, + { + "epoch": 3.51, + "grad_norm": 3.721855640411377, + "learning_rate": 2.028422719685324e-06, + "logits/chosen": -0.5191907286643982, + "logits/rejected": -0.6014495491981506, + "logps/chosen": -58.81528091430664, + "logps/rejected": -100.89907836914062, + "loss": 0.5488, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989335298538208, + "rewards/margins": 6.943611145019531, + "rewards/rejected": -3.954275131225586, + "step": 14041 + }, + { + "epoch": 3.51, + "grad_norm": 4.0230326652526855, + "learning_rate": 2.0277906399661778e-06, + "logits/chosen": -0.580142080783844, + "logits/rejected": -0.694271981716156, + "logps/chosen": -54.21613311767578, + "logps/rejected": -98.24414825439453, + "loss": 0.6443, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6625990867614746, + "rewards/margins": 7.363692283630371, + "rewards/rejected": -4.7010931968688965, + "step": 14042 + }, + { + "epoch": 3.51, + "grad_norm": 6.141880035400391, + "learning_rate": 2.0271586336934866e-06, + "logits/chosen": -0.5471465587615967, + "logits/rejected": -0.5566697120666504, + "logps/chosen": -59.87607955932617, + "logps/rejected": -97.28072357177734, + "loss": 0.7821, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.019655466079712, + "rewards/margins": 5.306095123291016, + "rewards/rejected": -2.286440134048462, + "step": 14043 + }, + { + "epoch": 3.51, + "grad_norm": 7.066099643707275, + "learning_rate": 2.02652670088287e-06, + "logits/chosen": -0.5679196715354919, + "logits/rejected": -0.6390944123268127, + "logps/chosen": -48.46440124511719, + "logps/rejected": -103.26268005371094, + "loss": 0.6662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.179657220840454, + "rewards/margins": 7.3675079345703125, + "rewards/rejected": -4.1878509521484375, + "step": 14044 + }, + { + "epoch": 3.51, + "grad_norm": 2.7462449073791504, + "learning_rate": 2.0258948415499406e-06, + "logits/chosen": -0.49965226650238037, + "logits/rejected": -0.6011403799057007, + "logps/chosen": -46.756221771240234, + "logps/rejected": -96.71231079101562, + "loss": 0.4603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.109063148498535, + "rewards/margins": 6.973520278930664, + "rewards/rejected": -3.864457130432129, + "step": 14045 + }, + { + "epoch": 3.51, + "grad_norm": 2.153777837753296, + "learning_rate": 2.0252630557103115e-06, + "logits/chosen": -0.521487832069397, + "logits/rejected": -0.6199805736541748, + "logps/chosen": -55.7159423828125, + "logps/rejected": -112.19583892822266, + "loss": 0.5132, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0751802921295166, + "rewards/margins": 8.116514205932617, + "rewards/rejected": -5.04133415222168, + "step": 14046 + }, + { + "epoch": 3.51, + "grad_norm": 4.41098690032959, + "learning_rate": 2.024631343379598e-06, + "logits/chosen": -0.48812946677207947, + "logits/rejected": -0.58720463514328, + "logps/chosen": -51.43701171875, + "logps/rejected": -102.19111633300781, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.109297037124634, + "rewards/margins": 6.867416858673096, + "rewards/rejected": -3.758119821548462, + "step": 14047 + }, + { + "epoch": 3.51, + "grad_norm": 18.76034164428711, + "learning_rate": 2.023999704573409e-06, + "logits/chosen": -0.5794418454170227, + "logits/rejected": -0.6357661485671997, + "logps/chosen": -48.186546325683594, + "logps/rejected": -114.56654357910156, + "loss": 0.6313, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.760024309158325, + "rewards/margins": 7.55131721496582, + "rewards/rejected": -4.791293144226074, + "step": 14048 + }, + { + "epoch": 3.51, + "grad_norm": 7.8041911125183105, + "learning_rate": 2.023368139307351e-06, + "logits/chosen": -0.6227617263793945, + "logits/rejected": -0.6461832523345947, + "logps/chosen": -44.87247085571289, + "logps/rejected": -97.49131774902344, + "loss": 0.6387, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9699957370758057, + "rewards/margins": 6.360788345336914, + "rewards/rejected": -3.3907928466796875, + "step": 14049 + }, + { + "epoch": 3.51, + "grad_norm": 5.293407917022705, + "learning_rate": 2.022736647597034e-06, + "logits/chosen": -0.5032806396484375, + "logits/rejected": -0.6041597723960876, + "logps/chosen": -59.76734924316406, + "logps/rejected": -112.88404846191406, + "loss": 0.6623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.22599196434021, + "rewards/margins": 6.961573600769043, + "rewards/rejected": -3.735581159591675, + "step": 14050 + }, + { + "epoch": 3.52, + "grad_norm": 3.667727470397949, + "learning_rate": 2.0221052294580596e-06, + "logits/chosen": -0.5114057064056396, + "logits/rejected": -0.6442893147468567, + "logps/chosen": -50.109806060791016, + "logps/rejected": -99.81095123291016, + "loss": 0.5643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9042248725891113, + "rewards/margins": 6.882630348205566, + "rewards/rejected": -3.978405714035034, + "step": 14051 + }, + { + "epoch": 3.52, + "grad_norm": 3.774437427520752, + "learning_rate": 2.021473884906034e-06, + "logits/chosen": -0.5190573930740356, + "logits/rejected": -0.6000759601593018, + "logps/chosen": -60.93305587768555, + "logps/rejected": -112.91546630859375, + "loss": 0.6136, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.129427909851074, + "rewards/margins": 7.29579496383667, + "rewards/rejected": -4.1663665771484375, + "step": 14052 + }, + { + "epoch": 3.52, + "grad_norm": 2.5507912635803223, + "learning_rate": 2.020842613956557e-06, + "logits/chosen": -0.615892231464386, + "logits/rejected": -0.702666699886322, + "logps/chosen": -60.559444427490234, + "logps/rejected": -100.52586364746094, + "loss": 0.6525, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4089314937591553, + "rewards/margins": 6.771689414978027, + "rewards/rejected": -3.3627572059631348, + "step": 14053 + }, + { + "epoch": 3.52, + "grad_norm": 3.325979709625244, + "learning_rate": 2.020211416625225e-06, + "logits/chosen": -0.5782272815704346, + "logits/rejected": -0.6074776649475098, + "logps/chosen": -47.38540267944336, + "logps/rejected": -101.96293640136719, + "loss": 0.5668, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1223371028900146, + "rewards/margins": 6.433713912963867, + "rewards/rejected": -3.3113765716552734, + "step": 14054 + }, + { + "epoch": 3.52, + "grad_norm": 8.331663131713867, + "learning_rate": 2.0195802929276393e-06, + "logits/chosen": -0.6320971250534058, + "logits/rejected": -0.6916940212249756, + "logps/chosen": -53.30762481689453, + "logps/rejected": -94.20044708251953, + "loss": 0.6078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.837696075439453, + "rewards/margins": 7.031439304351807, + "rewards/rejected": -4.1937432289123535, + "step": 14055 + }, + { + "epoch": 3.52, + "grad_norm": 4.50329065322876, + "learning_rate": 2.0189492428793968e-06, + "logits/chosen": -0.60578852891922, + "logits/rejected": -0.6842194199562073, + "logps/chosen": -53.259849548339844, + "logps/rejected": -92.51126098632812, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0759189128875732, + "rewards/margins": 6.582807540893555, + "rewards/rejected": -3.5068888664245605, + "step": 14056 + }, + { + "epoch": 3.52, + "grad_norm": 2.877492904663086, + "learning_rate": 2.0183182664960883e-06, + "logits/chosen": -0.47692060470581055, + "logits/rejected": -0.5775815844535828, + "logps/chosen": -65.37908172607422, + "logps/rejected": -121.25521850585938, + "loss": 0.5622, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9988434314727783, + "rewards/margins": 7.876657485961914, + "rewards/rejected": -4.877814292907715, + "step": 14057 + }, + { + "epoch": 3.52, + "grad_norm": 15.414074897766113, + "learning_rate": 2.017687363793306e-06, + "logits/chosen": -0.5350930690765381, + "logits/rejected": -0.6429702043533325, + "logps/chosen": -58.7731819152832, + "logps/rejected": -114.18373107910156, + "loss": 0.6653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.850177526473999, + "rewards/margins": 7.172197341918945, + "rewards/rejected": -4.322020530700684, + "step": 14058 + }, + { + "epoch": 3.52, + "grad_norm": 4.836703300476074, + "learning_rate": 2.0170565347866423e-06, + "logits/chosen": -0.5704557299613953, + "logits/rejected": -0.6190160512924194, + "logps/chosen": -54.296722412109375, + "logps/rejected": -111.77938842773438, + "loss": 0.6369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.697780132293701, + "rewards/margins": 6.774481296539307, + "rewards/rejected": -4.0767011642456055, + "step": 14059 + }, + { + "epoch": 3.52, + "grad_norm": 4.290907382965088, + "learning_rate": 2.0164257794916843e-06, + "logits/chosen": -0.6369096636772156, + "logits/rejected": -0.6831375956535339, + "logps/chosen": -50.6381721496582, + "logps/rejected": -116.77177429199219, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9051527976989746, + "rewards/margins": 7.603520393371582, + "rewards/rejected": -4.698367118835449, + "step": 14060 + }, + { + "epoch": 3.52, + "grad_norm": 11.528999328613281, + "learning_rate": 2.015795097924018e-06, + "logits/chosen": -0.5237805843353271, + "logits/rejected": -0.630651593208313, + "logps/chosen": -56.35810852050781, + "logps/rejected": -87.23760986328125, + "loss": 0.7503, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.839348077774048, + "rewards/margins": 5.517098426818848, + "rewards/rejected": -2.6777503490448, + "step": 14061 + }, + { + "epoch": 3.52, + "grad_norm": 4.840480327606201, + "learning_rate": 2.0151644900992294e-06, + "logits/chosen": -0.520187258720398, + "logits/rejected": -0.5533263683319092, + "logps/chosen": -59.13562774658203, + "logps/rejected": -123.8389663696289, + "loss": 0.6611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100403070449829, + "rewards/margins": 7.135806083679199, + "rewards/rejected": -4.035403251647949, + "step": 14062 + }, + { + "epoch": 3.52, + "grad_norm": 13.744119644165039, + "learning_rate": 2.014533956032902e-06, + "logits/chosen": -0.6062185764312744, + "logits/rejected": -0.6812040209770203, + "logps/chosen": -64.88131713867188, + "logps/rejected": -92.3299560546875, + "loss": 0.7951, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6571617126464844, + "rewards/margins": 5.424188137054443, + "rewards/rejected": -2.767026662826538, + "step": 14063 + }, + { + "epoch": 3.52, + "grad_norm": 5.454533100128174, + "learning_rate": 2.0139034957406134e-06, + "logits/chosen": -0.5178385376930237, + "logits/rejected": -0.6026559472084045, + "logps/chosen": -59.21635437011719, + "logps/rejected": -105.95116424560547, + "loss": 0.6843, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3237662315368652, + "rewards/margins": 6.388384819030762, + "rewards/rejected": -3.0646185874938965, + "step": 14064 + }, + { + "epoch": 3.52, + "grad_norm": 4.569106101989746, + "learning_rate": 2.013273109237948e-06, + "logits/chosen": -0.5144737958908081, + "logits/rejected": -0.5724475979804993, + "logps/chosen": -63.63508605957031, + "logps/rejected": -95.9583740234375, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3745925426483154, + "rewards/margins": 6.631734371185303, + "rewards/rejected": -3.257141590118408, + "step": 14065 + }, + { + "epoch": 3.52, + "grad_norm": 3.9395816326141357, + "learning_rate": 2.012642796540481e-06, + "logits/chosen": -0.5131217837333679, + "logits/rejected": -0.541262686252594, + "logps/chosen": -59.446556091308594, + "logps/rejected": -112.08769989013672, + "loss": 0.6751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930816411972046, + "rewards/margins": 7.3742146492004395, + "rewards/rejected": -4.443398475646973, + "step": 14066 + }, + { + "epoch": 3.52, + "grad_norm": 9.697980880737305, + "learning_rate": 2.0120125576637855e-06, + "logits/chosen": -0.49562495946884155, + "logits/rejected": -0.5738312602043152, + "logps/chosen": -61.09624099731445, + "logps/rejected": -97.58687591552734, + "loss": 0.8024, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6993653774261475, + "rewards/margins": 5.654382705688477, + "rewards/rejected": -2.9550178050994873, + "step": 14067 + }, + { + "epoch": 3.52, + "grad_norm": 4.480906009674072, + "learning_rate": 2.0113823926234387e-06, + "logits/chosen": -0.4750146269798279, + "logits/rejected": -0.5578627586364746, + "logps/chosen": -67.11672973632812, + "logps/rejected": -101.60658264160156, + "loss": 0.6888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.180485725402832, + "rewards/margins": 6.795709609985352, + "rewards/rejected": -3.6152241230010986, + "step": 14068 + }, + { + "epoch": 3.52, + "grad_norm": 8.11498737335205, + "learning_rate": 2.010752301435013e-06, + "logits/chosen": -0.5793914198875427, + "logits/rejected": -0.6475555896759033, + "logps/chosen": -54.95516586303711, + "logps/rejected": -115.07257080078125, + "loss": 0.6931, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.214348793029785, + "rewards/margins": 6.788051128387451, + "rewards/rejected": -3.573702573776245, + "step": 14069 + }, + { + "epoch": 3.52, + "grad_norm": 9.074152946472168, + "learning_rate": 2.0101222841140775e-06, + "logits/chosen": -0.49063238501548767, + "logits/rejected": -0.5426963567733765, + "logps/chosen": -57.78929901123047, + "logps/rejected": -98.69781494140625, + "loss": 0.7849, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.121908664703369, + "rewards/margins": 5.207549095153809, + "rewards/rejected": -2.0856404304504395, + "step": 14070 + }, + { + "epoch": 3.52, + "grad_norm": 6.2209858894348145, + "learning_rate": 2.009492340676199e-06, + "logits/chosen": -0.5003479719161987, + "logits/rejected": -0.5810303688049316, + "logps/chosen": -61.62520217895508, + "logps/rejected": -96.6261978149414, + "loss": 0.6579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.864208459854126, + "rewards/margins": 6.121955871582031, + "rewards/rejected": -3.257747173309326, + "step": 14071 + }, + { + "epoch": 3.52, + "grad_norm": 3.7115542888641357, + "learning_rate": 2.0088624711369477e-06, + "logits/chosen": -0.5481317639350891, + "logits/rejected": -0.6183823347091675, + "logps/chosen": -55.766056060791016, + "logps/rejected": -107.91419982910156, + "loss": 0.6515, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2784440517425537, + "rewards/margins": 8.008048057556152, + "rewards/rejected": -4.729604244232178, + "step": 14072 + }, + { + "epoch": 3.52, + "grad_norm": 5.557125091552734, + "learning_rate": 2.008232675511886e-06, + "logits/chosen": -0.4694106876850128, + "logits/rejected": -0.5823725461959839, + "logps/chosen": -60.62464904785156, + "logps/rejected": -110.38395690917969, + "loss": 0.6198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8842055797576904, + "rewards/margins": 7.220115661621094, + "rewards/rejected": -4.335909843444824, + "step": 14073 + }, + { + "epoch": 3.52, + "grad_norm": 6.787519454956055, + "learning_rate": 2.0076029538165746e-06, + "logits/chosen": -0.4695189893245697, + "logits/rejected": -0.5526396036148071, + "logps/chosen": -59.40994644165039, + "logps/rejected": -89.2262954711914, + "loss": 0.7448, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9393727779388428, + "rewards/margins": 5.315498352050781, + "rewards/rejected": -2.3761258125305176, + "step": 14074 + }, + { + "epoch": 3.52, + "grad_norm": 4.531993865966797, + "learning_rate": 2.0069733060665797e-06, + "logits/chosen": -0.5802748799324036, + "logits/rejected": -0.6865420937538147, + "logps/chosen": -54.670387268066406, + "logps/rejected": -115.69499206542969, + "loss": 0.5754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.867593288421631, + "rewards/margins": 7.837983131408691, + "rewards/rejected": -4.970390319824219, + "step": 14075 + }, + { + "epoch": 3.52, + "grad_norm": 15.581103324890137, + "learning_rate": 2.006343732277456e-06, + "logits/chosen": -0.5152435898780823, + "logits/rejected": -0.6345580816268921, + "logps/chosen": -50.25982666015625, + "logps/rejected": -87.8292007446289, + "loss": 0.6275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0935263633728027, + "rewards/margins": 6.130133628845215, + "rewards/rejected": -3.036607027053833, + "step": 14076 + }, + { + "epoch": 3.52, + "grad_norm": 2.7726762294769287, + "learning_rate": 2.0057142324647645e-06, + "logits/chosen": -0.5562617182731628, + "logits/rejected": -0.6192402243614197, + "logps/chosen": -51.36995315551758, + "logps/rejected": -91.82349395751953, + "loss": 0.6207, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.219604015350342, + "rewards/margins": 6.006760597229004, + "rewards/rejected": -2.787156343460083, + "step": 14077 + }, + { + "epoch": 3.52, + "grad_norm": 4.624269008636475, + "learning_rate": 2.005084806644059e-06, + "logits/chosen": -0.4665610194206238, + "logits/rejected": -0.5550520420074463, + "logps/chosen": -50.63941955566406, + "logps/rejected": -106.08486938476562, + "loss": 0.6176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0148091316223145, + "rewards/margins": 7.452244758605957, + "rewards/rejected": -4.437435626983643, + "step": 14078 + }, + { + "epoch": 3.52, + "grad_norm": 2.783997058868408, + "learning_rate": 2.004455454830892e-06, + "logits/chosen": -0.49736177921295166, + "logits/rejected": -0.5720152854919434, + "logps/chosen": -67.81565856933594, + "logps/rejected": -100.556884765625, + "loss": 0.5879, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0679144859313965, + "rewards/margins": 6.417497158050537, + "rewards/rejected": -3.3495824337005615, + "step": 14079 + }, + { + "epoch": 3.52, + "grad_norm": 35.82688903808594, + "learning_rate": 2.003826177040817e-06, + "logits/chosen": -0.5058901309967041, + "logits/rejected": -0.6364439725875854, + "logps/chosen": -70.72077941894531, + "logps/rejected": -103.54483032226562, + "loss": 0.7362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7561137676239014, + "rewards/margins": 6.386742115020752, + "rewards/rejected": -3.6306278705596924, + "step": 14080 + }, + { + "epoch": 3.52, + "grad_norm": 5.325560092926025, + "learning_rate": 2.0031969732893868e-06, + "logits/chosen": -0.5135393738746643, + "logits/rejected": -0.5645714998245239, + "logps/chosen": -64.0313949584961, + "logps/rejected": -114.23747253417969, + "loss": 0.6522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.169583559036255, + "rewards/margins": 7.000176906585693, + "rewards/rejected": -3.8305935859680176, + "step": 14081 + }, + { + "epoch": 3.52, + "grad_norm": 4.500114440917969, + "learning_rate": 2.0025678435921442e-06, + "logits/chosen": -0.6044235825538635, + "logits/rejected": -0.6740564703941345, + "logps/chosen": -58.6712760925293, + "logps/rejected": -115.73241424560547, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1339094638824463, + "rewards/margins": 6.598475933074951, + "rewards/rejected": -3.464566469192505, + "step": 14082 + }, + { + "epoch": 3.52, + "grad_norm": 5.583367347717285, + "learning_rate": 2.0019387879646384e-06, + "logits/chosen": -0.4286937415599823, + "logits/rejected": -0.4783787131309509, + "logps/chosen": -60.545719146728516, + "logps/rejected": -107.56822967529297, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0431771278381348, + "rewards/margins": 6.520850658416748, + "rewards/rejected": -3.4776737689971924, + "step": 14083 + }, + { + "epoch": 3.52, + "grad_norm": 8.908045768737793, + "learning_rate": 2.0013098064224175e-06, + "logits/chosen": -0.5919345617294312, + "logits/rejected": -0.7229292392730713, + "logps/chosen": -63.6727294921875, + "logps/rejected": -119.85870361328125, + "loss": 0.6935, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9825899600982666, + "rewards/margins": 8.4270601272583, + "rewards/rejected": -5.4444708824157715, + "step": 14084 + }, + { + "epoch": 3.52, + "grad_norm": 8.765416145324707, + "learning_rate": 2.0006808989810167e-06, + "logits/chosen": -0.4721298813819885, + "logits/rejected": -0.5809796452522278, + "logps/chosen": -49.30582809448242, + "logps/rejected": -91.65325927734375, + "loss": 0.6217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2274134159088135, + "rewards/margins": 6.50484037399292, + "rewards/rejected": -3.2774267196655273, + "step": 14085 + }, + { + "epoch": 3.52, + "grad_norm": 11.65039348602295, + "learning_rate": 2.000052065655982e-06, + "logits/chosen": -0.5125036239624023, + "logits/rejected": -0.5941678285598755, + "logps/chosen": -55.66228485107422, + "logps/rejected": -84.28060913085938, + "loss": 0.648, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2069921493530273, + "rewards/margins": 5.536377429962158, + "rewards/rejected": -2.32938551902771, + "step": 14086 + }, + { + "epoch": 3.52, + "grad_norm": 7.29371452331543, + "learning_rate": 1.9994233064628526e-06, + "logits/chosen": -0.5527611374855042, + "logits/rejected": -0.6359171867370605, + "logps/chosen": -58.28340148925781, + "logps/rejected": -107.65670776367188, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.066068172454834, + "rewards/margins": 6.728785514831543, + "rewards/rejected": -3.6627180576324463, + "step": 14087 + }, + { + "epoch": 3.52, + "grad_norm": 3.8280715942382812, + "learning_rate": 1.998794621417165e-06, + "logits/chosen": -0.45219695568084717, + "logits/rejected": -0.557060956954956, + "logps/chosen": -61.94209289550781, + "logps/rejected": -127.27365112304688, + "loss": 0.6081, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2886016368865967, + "rewards/margins": 8.33172607421875, + "rewards/rejected": -5.043124198913574, + "step": 14088 + }, + { + "epoch": 3.52, + "grad_norm": 2.9524426460266113, + "learning_rate": 1.998166010534453e-06, + "logits/chosen": -0.5957744717597961, + "logits/rejected": -0.6529712677001953, + "logps/chosen": -51.41355514526367, + "logps/rejected": -81.58218383789062, + "loss": 0.613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.913792133331299, + "rewards/margins": 6.147890090942383, + "rewards/rejected": -3.234097719192505, + "step": 14089 + }, + { + "epoch": 3.52, + "grad_norm": 5.819942951202393, + "learning_rate": 1.997537473830254e-06, + "logits/chosen": -0.4666723608970642, + "logits/rejected": -0.5760133862495422, + "logps/chosen": -74.19055938720703, + "logps/rejected": -104.71131134033203, + "loss": 0.6372, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7719569206237793, + "rewards/margins": 6.360951900482178, + "rewards/rejected": -3.5889952182769775, + "step": 14090 + }, + { + "epoch": 3.53, + "grad_norm": 5.896609783172607, + "learning_rate": 1.9969090113200968e-06, + "logits/chosen": -0.49657461047172546, + "logits/rejected": -0.6013683676719666, + "logps/chosen": -70.38203430175781, + "logps/rejected": -111.94015502929688, + "loss": 0.7163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3262317180633545, + "rewards/margins": 7.466946601867676, + "rewards/rejected": -4.1407151222229, + "step": 14091 + }, + { + "epoch": 3.53, + "grad_norm": 17.500516891479492, + "learning_rate": 1.9962806230195105e-06, + "logits/chosen": -0.5341695547103882, + "logits/rejected": -0.5779527425765991, + "logps/chosen": -52.66738510131836, + "logps/rejected": -92.19975280761719, + "loss": 0.6896, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9108176231384277, + "rewards/margins": 5.952357292175293, + "rewards/rejected": -3.0415406227111816, + "step": 14092 + }, + { + "epoch": 3.53, + "grad_norm": 14.136523246765137, + "learning_rate": 1.9956523089440267e-06, + "logits/chosen": -0.5022003650665283, + "logits/rejected": -0.5888813138008118, + "logps/chosen": -57.33737564086914, + "logps/rejected": -98.72286224365234, + "loss": 0.7146, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8975138664245605, + "rewards/margins": 6.084461688995361, + "rewards/rejected": -3.18694806098938, + "step": 14093 + }, + { + "epoch": 3.53, + "grad_norm": 10.062877655029297, + "learning_rate": 1.995024069109168e-06, + "logits/chosen": -0.5766152143478394, + "logits/rejected": -0.6587721705436707, + "logps/chosen": -57.48255157470703, + "logps/rejected": -94.42030334472656, + "loss": 0.69, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1029984951019287, + "rewards/margins": 6.0335774421691895, + "rewards/rejected": -2.9305784702301025, + "step": 14094 + }, + { + "epoch": 3.53, + "grad_norm": 2.75722336769104, + "learning_rate": 1.9943959035304633e-06, + "logits/chosen": -0.5035682320594788, + "logits/rejected": -0.5313451886177063, + "logps/chosen": -47.286685943603516, + "logps/rejected": -136.97323608398438, + "loss": 0.6123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4756436347961426, + "rewards/margins": 8.455665588378906, + "rewards/rejected": -4.980021953582764, + "step": 14095 + }, + { + "epoch": 3.53, + "grad_norm": 4.611985206604004, + "learning_rate": 1.993767812223432e-06, + "logits/chosen": -0.5450168251991272, + "logits/rejected": -0.627545177936554, + "logps/chosen": -48.878211975097656, + "logps/rejected": -108.15973663330078, + "loss": 0.5317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037888526916504, + "rewards/margins": 6.995711326599121, + "rewards/rejected": -3.95782208442688, + "step": 14096 + }, + { + "epoch": 3.53, + "grad_norm": 17.835472106933594, + "learning_rate": 1.9931397952035946e-06, + "logits/chosen": -0.6200515031814575, + "logits/rejected": -0.7168188691139221, + "logps/chosen": -53.143524169921875, + "logps/rejected": -90.71115112304688, + "loss": 0.7574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.844959020614624, + "rewards/margins": 6.230969429016113, + "rewards/rejected": -3.3860106468200684, + "step": 14097 + }, + { + "epoch": 3.53, + "grad_norm": 4.4582085609436035, + "learning_rate": 1.9925118524864724e-06, + "logits/chosen": -0.5115382075309753, + "logits/rejected": -0.5857193470001221, + "logps/chosen": -52.31473159790039, + "logps/rejected": -95.65843200683594, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.066901922225952, + "rewards/margins": 5.8973493576049805, + "rewards/rejected": -2.8304474353790283, + "step": 14098 + }, + { + "epoch": 3.53, + "grad_norm": 3.062641143798828, + "learning_rate": 1.991883984087581e-06, + "logits/chosen": -0.5593018531799316, + "logits/rejected": -0.6822501420974731, + "logps/chosen": -54.95368576049805, + "logps/rejected": -92.22550201416016, + "loss": 0.5931, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5402719974517822, + "rewards/margins": 7.379772186279297, + "rewards/rejected": -3.8394999504089355, + "step": 14099 + }, + { + "epoch": 3.53, + "grad_norm": 12.463722229003906, + "learning_rate": 1.991256190022434e-06, + "logits/chosen": -0.6130150556564331, + "logits/rejected": -0.6559211611747742, + "logps/chosen": -52.65067672729492, + "logps/rejected": -142.59860229492188, + "loss": 0.6759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7270569801330566, + "rewards/margins": 9.339646339416504, + "rewards/rejected": -6.612590312957764, + "step": 14100 + }, + { + "epoch": 3.53, + "grad_norm": 7.411975860595703, + "learning_rate": 1.9906284703065478e-06, + "logits/chosen": -0.5456459522247314, + "logits/rejected": -0.616435170173645, + "logps/chosen": -51.552337646484375, + "logps/rejected": -107.33213806152344, + "loss": 0.6563, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9793860912323, + "rewards/margins": 7.25701904296875, + "rewards/rejected": -4.277634143829346, + "step": 14101 + }, + { + "epoch": 3.53, + "grad_norm": 2.759054660797119, + "learning_rate": 1.990000824955434e-06, + "logits/chosen": -0.5349608063697815, + "logits/rejected": -0.6331326961517334, + "logps/chosen": -51.382179260253906, + "logps/rejected": -109.85138702392578, + "loss": 0.5675, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0844056606292725, + "rewards/margins": 8.020645141601562, + "rewards/rejected": -4.936239719390869, + "step": 14102 + }, + { + "epoch": 3.53, + "grad_norm": 3.8379616737365723, + "learning_rate": 1.9893732539846015e-06, + "logits/chosen": -0.5329611897468567, + "logits/rejected": -0.6473973393440247, + "logps/chosen": -67.23503875732422, + "logps/rejected": -108.96696472167969, + "loss": 0.7006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0067811012268066, + "rewards/margins": 7.122638702392578, + "rewards/rejected": -4.115857124328613, + "step": 14103 + }, + { + "epoch": 3.53, + "grad_norm": 5.289368152618408, + "learning_rate": 1.9887457574095565e-06, + "logits/chosen": -0.5503864288330078, + "logits/rejected": -0.4992467164993286, + "logps/chosen": -50.80425262451172, + "logps/rejected": -130.66812133789062, + "loss": 0.6862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1568167209625244, + "rewards/margins": 6.791286468505859, + "rewards/rejected": -3.634469509124756, + "step": 14104 + }, + { + "epoch": 3.53, + "grad_norm": 8.770766258239746, + "learning_rate": 1.9881183352458083e-06, + "logits/chosen": -0.5145940184593201, + "logits/rejected": -0.5656120777130127, + "logps/chosen": -56.5480842590332, + "logps/rejected": -122.33647155761719, + "loss": 0.6525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.292635440826416, + "rewards/margins": 8.545310020446777, + "rewards/rejected": -5.252674579620361, + "step": 14105 + }, + { + "epoch": 3.53, + "grad_norm": 3.613503932952881, + "learning_rate": 1.9874909875088598e-06, + "logits/chosen": -0.5839108228683472, + "logits/rejected": -0.7130053043365479, + "logps/chosen": -47.70480728149414, + "logps/rejected": -101.80906677246094, + "loss": 0.5156, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934136152267456, + "rewards/margins": 6.887474060058594, + "rewards/rejected": -3.9533376693725586, + "step": 14106 + }, + { + "epoch": 3.53, + "grad_norm": 3.0409719944000244, + "learning_rate": 1.9868637142142115e-06, + "logits/chosen": -0.5407167673110962, + "logits/rejected": -0.5912327766418457, + "logps/chosen": -51.868804931640625, + "logps/rejected": -103.62643432617188, + "loss": 0.5757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.51358699798584, + "rewards/margins": 7.355926513671875, + "rewards/rejected": -3.842339277267456, + "step": 14107 + }, + { + "epoch": 3.53, + "grad_norm": 4.038425445556641, + "learning_rate": 1.9862365153773673e-06, + "logits/chosen": -0.592501163482666, + "logits/rejected": -0.6492749452590942, + "logps/chosen": -51.67665100097656, + "logps/rejected": -91.93392181396484, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094238758087158, + "rewards/margins": 5.874980449676514, + "rewards/rejected": -2.7807419300079346, + "step": 14108 + }, + { + "epoch": 3.53, + "grad_norm": 13.230778694152832, + "learning_rate": 1.985609391013825e-06, + "logits/chosen": -0.5328106880187988, + "logits/rejected": -0.5666138529777527, + "logps/chosen": -53.112796783447266, + "logps/rejected": -100.201171875, + "loss": 0.8088, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0489518642425537, + "rewards/margins": 6.043820381164551, + "rewards/rejected": -2.994868516921997, + "step": 14109 + }, + { + "epoch": 3.53, + "grad_norm": 2.8033971786499023, + "learning_rate": 1.9849823411390784e-06, + "logits/chosen": -0.555064857006073, + "logits/rejected": -0.6203384399414062, + "logps/chosen": -44.66812515258789, + "logps/rejected": -126.09420013427734, + "loss": 0.5663, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.099404811859131, + "rewards/margins": 9.216339111328125, + "rewards/rejected": -6.116934299468994, + "step": 14110 + }, + { + "epoch": 3.53, + "grad_norm": 8.673951148986816, + "learning_rate": 1.9843553657686254e-06, + "logits/chosen": -0.5451047420501709, + "logits/rejected": -0.6038631200790405, + "logps/chosen": -59.91679382324219, + "logps/rejected": -88.84960174560547, + "loss": 0.6242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8514280319213867, + "rewards/margins": 5.068445205688477, + "rewards/rejected": -2.2170169353485107, + "step": 14111 + }, + { + "epoch": 3.53, + "grad_norm": 5.101386547088623, + "learning_rate": 1.9837284649179606e-06, + "logits/chosen": -0.5543399453163147, + "logits/rejected": -0.6319804787635803, + "logps/chosen": -66.14554595947266, + "logps/rejected": -109.40469360351562, + "loss": 0.6419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.114365339279175, + "rewards/margins": 7.699855327606201, + "rewards/rejected": -4.585489749908447, + "step": 14112 + }, + { + "epoch": 3.53, + "grad_norm": 3.0166003704071045, + "learning_rate": 1.9831016386025737e-06, + "logits/chosen": -0.5287617444992065, + "logits/rejected": -0.6349201202392578, + "logps/chosen": -55.47057342529297, + "logps/rejected": -91.71391296386719, + "loss": 0.586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.471458911895752, + "rewards/margins": 7.786795616149902, + "rewards/rejected": -4.31533670425415, + "step": 14113 + }, + { + "epoch": 3.53, + "grad_norm": 4.2740559577941895, + "learning_rate": 1.982474886837952e-06, + "logits/chosen": -0.4935593903064728, + "logits/rejected": -0.6136995553970337, + "logps/chosen": -65.18952941894531, + "logps/rejected": -103.09048461914062, + "loss": 0.6802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0488440990448, + "rewards/margins": 7.183928489685059, + "rewards/rejected": -4.13508415222168, + "step": 14114 + }, + { + "epoch": 3.53, + "grad_norm": 6.973249912261963, + "learning_rate": 1.9818482096395875e-06, + "logits/chosen": -0.5671453475952148, + "logits/rejected": -0.6622366905212402, + "logps/chosen": -60.763126373291016, + "logps/rejected": -131.56878662109375, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2067465782165527, + "rewards/margins": 9.328036308288574, + "rewards/rejected": -6.121290683746338, + "step": 14115 + }, + { + "epoch": 3.53, + "grad_norm": 5.925343990325928, + "learning_rate": 1.9812216070229647e-06, + "logits/chosen": -0.5186696648597717, + "logits/rejected": -0.5882259607315063, + "logps/chosen": -56.153465270996094, + "logps/rejected": -93.71243286132812, + "loss": 0.7297, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.1618103981018066, + "rewards/margins": 6.250946998596191, + "rewards/rejected": -3.0891366004943848, + "step": 14116 + }, + { + "epoch": 3.53, + "grad_norm": 4.357944011688232, + "learning_rate": 1.9805950790035645e-06, + "logits/chosen": -0.580150842666626, + "logits/rejected": -0.6010538935661316, + "logps/chosen": -48.94432067871094, + "logps/rejected": -98.30841064453125, + "loss": 0.6457, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065566062927246, + "rewards/margins": 5.758229732513428, + "rewards/rejected": -2.6926639080047607, + "step": 14117 + }, + { + "epoch": 3.53, + "grad_norm": 5.592204570770264, + "learning_rate": 1.979968625596873e-06, + "logits/chosen": -0.43890732526779175, + "logits/rejected": -0.5263875126838684, + "logps/chosen": -50.91825866699219, + "logps/rejected": -110.61378479003906, + "loss": 0.5832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3714537620544434, + "rewards/margins": 6.753608703613281, + "rewards/rejected": -3.382154941558838, + "step": 14118 + }, + { + "epoch": 3.53, + "grad_norm": 2.5565080642700195, + "learning_rate": 1.9793422468183676e-06, + "logits/chosen": -0.5759782195091248, + "logits/rejected": -0.6766196489334106, + "logps/chosen": -43.159793853759766, + "logps/rejected": -102.05233764648438, + "loss": 0.4928, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.962083578109741, + "rewards/margins": 8.444662094116211, + "rewards/rejected": -5.482579231262207, + "step": 14119 + }, + { + "epoch": 3.53, + "grad_norm": 7.180619716644287, + "learning_rate": 1.9787159426835296e-06, + "logits/chosen": -0.5960008502006531, + "logits/rejected": -0.6793123483657837, + "logps/chosen": -56.168758392333984, + "logps/rejected": -109.00825500488281, + "loss": 0.5946, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.925795078277588, + "rewards/margins": 8.430931091308594, + "rewards/rejected": -5.5051350593566895, + "step": 14120 + }, + { + "epoch": 3.53, + "grad_norm": 7.710564613342285, + "learning_rate": 1.978089713207835e-06, + "logits/chosen": -0.4799540638923645, + "logits/rejected": -0.6024667620658875, + "logps/chosen": -67.01238250732422, + "logps/rejected": -112.25377655029297, + "loss": 0.7226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9655065536499023, + "rewards/margins": 7.18467378616333, + "rewards/rejected": -4.219167709350586, + "step": 14121 + }, + { + "epoch": 3.53, + "grad_norm": 11.821512222290039, + "learning_rate": 1.9774635584067552e-06, + "logits/chosen": -0.5807598233222961, + "logits/rejected": -0.6086796522140503, + "logps/chosen": -54.015296936035156, + "logps/rejected": -108.28630828857422, + "loss": 0.6321, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0848915576934814, + "rewards/margins": 6.017200469970703, + "rewards/rejected": -2.932309150695801, + "step": 14122 + }, + { + "epoch": 3.53, + "grad_norm": 4.45820426940918, + "learning_rate": 1.9768374782957683e-06, + "logits/chosen": -0.5412012338638306, + "logits/rejected": -0.6323949694633484, + "logps/chosen": -60.323001861572266, + "logps/rejected": -99.54022216796875, + "loss": 0.6952, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.206800937652588, + "rewards/margins": 6.745997428894043, + "rewards/rejected": -3.539196729660034, + "step": 14123 + }, + { + "epoch": 3.53, + "grad_norm": 11.094827651977539, + "learning_rate": 1.9762114728903425e-06, + "logits/chosen": -0.5301856994628906, + "logits/rejected": -0.638789713382721, + "logps/chosen": -64.47291564941406, + "logps/rejected": -110.05970764160156, + "loss": 0.7083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.680830717086792, + "rewards/margins": 7.527955532073975, + "rewards/rejected": -4.847125053405762, + "step": 14124 + }, + { + "epoch": 3.53, + "grad_norm": 11.728638648986816, + "learning_rate": 1.9755855422059455e-06, + "logits/chosen": -0.5652114748954773, + "logits/rejected": -0.6069810390472412, + "logps/chosen": -54.816078186035156, + "logps/rejected": -127.33314514160156, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1493096351623535, + "rewards/margins": 7.264071464538574, + "rewards/rejected": -4.114762306213379, + "step": 14125 + }, + { + "epoch": 3.53, + "grad_norm": 4.605276584625244, + "learning_rate": 1.9749596862580467e-06, + "logits/chosen": -0.5706579685211182, + "logits/rejected": -0.6830670833587646, + "logps/chosen": -58.535335540771484, + "logps/rejected": -102.22418212890625, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3675649166107178, + "rewards/margins": 7.283666610717773, + "rewards/rejected": -3.9161016941070557, + "step": 14126 + }, + { + "epoch": 3.53, + "grad_norm": 5.295734882354736, + "learning_rate": 1.974333905062116e-06, + "logits/chosen": -0.5136902928352356, + "logits/rejected": -0.6083186864852905, + "logps/chosen": -51.84400939941406, + "logps/rejected": -81.13772583007812, + "loss": 0.6733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.867847442626953, + "rewards/margins": 5.3989434242248535, + "rewards/rejected": -2.5310959815979004, + "step": 14127 + }, + { + "epoch": 3.53, + "grad_norm": 3.8661530017852783, + "learning_rate": 1.9737081986336083e-06, + "logits/chosen": -0.5635583400726318, + "logits/rejected": -0.6696808934211731, + "logps/chosen": -58.92337417602539, + "logps/rejected": -111.95501708984375, + "loss": 0.5649, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.313746452331543, + "rewards/margins": 7.124598503112793, + "rewards/rejected": -3.810851573944092, + "step": 14128 + }, + { + "epoch": 3.53, + "grad_norm": 4.507379055023193, + "learning_rate": 1.97308256698799e-06, + "logits/chosen": -0.5076379179954529, + "logits/rejected": -0.5782501697540283, + "logps/chosen": -56.46562957763672, + "logps/rejected": -99.06336975097656, + "loss": 0.6781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8873913288116455, + "rewards/margins": 6.309758186340332, + "rewards/rejected": -3.4223668575286865, + "step": 14129 + }, + { + "epoch": 3.53, + "grad_norm": 2.5385982990264893, + "learning_rate": 1.972457010140723e-06, + "logits/chosen": -0.5290494561195374, + "logits/rejected": -0.6548416614532471, + "logps/chosen": -59.79573059082031, + "logps/rejected": -121.27604675292969, + "loss": 0.5668, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.24177885055542, + "rewards/margins": 8.633048057556152, + "rewards/rejected": -5.391269683837891, + "step": 14130 + }, + { + "epoch": 3.54, + "grad_norm": 6.30690860748291, + "learning_rate": 1.9718315281072635e-06, + "logits/chosen": -0.5463611483573914, + "logits/rejected": -0.6553124785423279, + "logps/chosen": -70.44198608398438, + "logps/rejected": -96.63006591796875, + "loss": 0.666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8502678871154785, + "rewards/margins": 6.83465576171875, + "rewards/rejected": -3.9843878746032715, + "step": 14131 + }, + { + "epoch": 3.54, + "grad_norm": 7.6142730712890625, + "learning_rate": 1.971206120903066e-06, + "logits/chosen": -0.594918966293335, + "logits/rejected": -0.7185413837432861, + "logps/chosen": -54.17349624633789, + "logps/rejected": -103.46588897705078, + "loss": 0.65, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.760070562362671, + "rewards/margins": 8.104649543762207, + "rewards/rejected": -5.344578742980957, + "step": 14132 + }, + { + "epoch": 3.54, + "grad_norm": 5.736159324645996, + "learning_rate": 1.970580788543589e-06, + "logits/chosen": -0.6098849773406982, + "logits/rejected": -0.6679666638374329, + "logps/chosen": -45.3889274597168, + "logps/rejected": -108.36565399169922, + "loss": 0.6045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8748745918273926, + "rewards/margins": 7.857193946838379, + "rewards/rejected": -4.982318878173828, + "step": 14133 + }, + { + "epoch": 3.54, + "grad_norm": 4.009562015533447, + "learning_rate": 1.9699555310442824e-06, + "logits/chosen": -0.5750917196273804, + "logits/rejected": -0.6433696150779724, + "logps/chosen": -56.53798294067383, + "logps/rejected": -108.44429779052734, + "loss": 0.6405, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.458922863006592, + "rewards/margins": 7.2581024169921875, + "rewards/rejected": -3.7991795539855957, + "step": 14134 + }, + { + "epoch": 3.54, + "grad_norm": 2.97493052482605, + "learning_rate": 1.9693303484205962e-06, + "logits/chosen": -0.5762133002281189, + "logits/rejected": -0.6342122554779053, + "logps/chosen": -50.176414489746094, + "logps/rejected": -113.87451171875, + "loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.726107597351074, + "rewards/margins": 8.020066261291504, + "rewards/rejected": -4.293959140777588, + "step": 14135 + }, + { + "epoch": 3.54, + "grad_norm": 4.811944961547852, + "learning_rate": 1.968705240687982e-06, + "logits/chosen": -0.5949774384498596, + "logits/rejected": -0.7033955454826355, + "logps/chosen": -53.48101806640625, + "logps/rejected": -92.10063171386719, + "loss": 0.6609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.966251850128174, + "rewards/margins": 6.793661117553711, + "rewards/rejected": -3.827409029006958, + "step": 14136 + }, + { + "epoch": 3.54, + "grad_norm": 4.34576416015625, + "learning_rate": 1.9680802078618837e-06, + "logits/chosen": -0.5674086809158325, + "logits/rejected": -0.6573327779769897, + "logps/chosen": -53.77323913574219, + "logps/rejected": -116.9041519165039, + "loss": 0.5942, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0063414573669434, + "rewards/margins": 8.347734451293945, + "rewards/rejected": -5.341392993927002, + "step": 14137 + }, + { + "epoch": 3.54, + "grad_norm": 4.429544925689697, + "learning_rate": 1.9674552499577503e-06, + "logits/chosen": -0.43501901626586914, + "logits/rejected": -0.5802597999572754, + "logps/chosen": -69.05674743652344, + "logps/rejected": -94.31558227539062, + "loss": 0.5911, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0976572036743164, + "rewards/margins": 6.948330879211426, + "rewards/rejected": -3.8506736755371094, + "step": 14138 + }, + { + "epoch": 3.54, + "grad_norm": 3.0240161418914795, + "learning_rate": 1.9668303669910237e-06, + "logits/chosen": -0.5269685983657837, + "logits/rejected": -0.5981283187866211, + "logps/chosen": -43.40263366699219, + "logps/rejected": -117.30485534667969, + "loss": 0.518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.477094888687134, + "rewards/margins": 8.750788688659668, + "rewards/rejected": -5.273693084716797, + "step": 14139 + }, + { + "epoch": 3.54, + "grad_norm": 5.447689056396484, + "learning_rate": 1.9662055589771427e-06, + "logits/chosen": -0.5358908176422119, + "logits/rejected": -0.5970973968505859, + "logps/chosen": -58.718727111816406, + "logps/rejected": -112.79299926757812, + "loss": 0.7218, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0833334922790527, + "rewards/margins": 7.154117107391357, + "rewards/rejected": -4.070783615112305, + "step": 14140 + }, + { + "epoch": 3.54, + "grad_norm": 3.9572062492370605, + "learning_rate": 1.9655808259315512e-06, + "logits/chosen": -0.5671521425247192, + "logits/rejected": -0.657751202583313, + "logps/chosen": -49.01309585571289, + "logps/rejected": -103.09693908691406, + "loss": 0.5919, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1527373790740967, + "rewards/margins": 6.112152099609375, + "rewards/rejected": -2.95941424369812, + "step": 14141 + }, + { + "epoch": 3.54, + "grad_norm": 20.631418228149414, + "learning_rate": 1.964956167869685e-06, + "logits/chosen": -0.6166542172431946, + "logits/rejected": -0.7116648554801941, + "logps/chosen": -59.8859748840332, + "logps/rejected": -106.24878692626953, + "loss": 0.7472, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2808678150177, + "rewards/margins": 7.238934516906738, + "rewards/rejected": -3.958066940307617, + "step": 14142 + }, + { + "epoch": 3.54, + "grad_norm": 3.9723596572875977, + "learning_rate": 1.9643315848069782e-06, + "logits/chosen": -0.5386680960655212, + "logits/rejected": -0.6186094880104065, + "logps/chosen": -49.674278259277344, + "logps/rejected": -95.63279724121094, + "loss": 0.6217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.055347204208374, + "rewards/margins": 6.258476734161377, + "rewards/rejected": -3.203129291534424, + "step": 14143 + }, + { + "epoch": 3.54, + "grad_norm": 4.505982398986816, + "learning_rate": 1.9637070767588668e-06, + "logits/chosen": -0.5572446584701538, + "logits/rejected": -0.632505476474762, + "logps/chosen": -53.67905044555664, + "logps/rejected": -107.0853271484375, + "loss": 0.6281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.089035749435425, + "rewards/margins": 7.426970958709717, + "rewards/rejected": -4.337935447692871, + "step": 14144 + }, + { + "epoch": 3.54, + "grad_norm": 2.994624137878418, + "learning_rate": 1.963082643740785e-06, + "logits/chosen": -0.5818803906440735, + "logits/rejected": -0.6331554055213928, + "logps/chosen": -53.72972869873047, + "logps/rejected": -108.67312622070312, + "loss": 0.5698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3945748805999756, + "rewards/margins": 6.796696662902832, + "rewards/rejected": -3.4021217823028564, + "step": 14145 + }, + { + "epoch": 3.54, + "grad_norm": 3.8788790702819824, + "learning_rate": 1.962458285768161e-06, + "logits/chosen": -0.47663724422454834, + "logits/rejected": -0.5707883238792419, + "logps/chosen": -50.510650634765625, + "logps/rejected": -102.86149597167969, + "loss": 0.6073, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1753437519073486, + "rewards/margins": 6.197168827056885, + "rewards/rejected": -3.021825075149536, + "step": 14146 + }, + { + "epoch": 3.54, + "grad_norm": 9.713208198547363, + "learning_rate": 1.9618340028564226e-06, + "logits/chosen": -0.5796027779579163, + "logits/rejected": -0.7138238549232483, + "logps/chosen": -63.600162506103516, + "logps/rejected": -105.96234130859375, + "loss": 0.6705, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.872068405151367, + "rewards/margins": 7.113134384155273, + "rewards/rejected": -4.2410664558410645, + "step": 14147 + }, + { + "epoch": 3.54, + "grad_norm": 4.61641788482666, + "learning_rate": 1.9612097950209984e-06, + "logits/chosen": -0.5054841637611389, + "logits/rejected": -0.577398955821991, + "logps/chosen": -50.13329315185547, + "logps/rejected": -99.9028091430664, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.039567470550537, + "rewards/margins": 5.871222496032715, + "rewards/rejected": -2.8316543102264404, + "step": 14148 + }, + { + "epoch": 3.54, + "grad_norm": 5.5873894691467285, + "learning_rate": 1.960585662277313e-06, + "logits/chosen": -0.39918169379234314, + "logits/rejected": -0.4542112648487091, + "logps/chosen": -66.04508209228516, + "logps/rejected": -111.43896484375, + "loss": 0.652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.178506374359131, + "rewards/margins": 5.6944475173950195, + "rewards/rejected": -2.5159413814544678, + "step": 14149 + }, + { + "epoch": 3.54, + "grad_norm": 5.771751880645752, + "learning_rate": 1.9599616046407867e-06, + "logits/chosen": -0.523292064666748, + "logits/rejected": -0.5984949469566345, + "logps/chosen": -71.59521484375, + "logps/rejected": -107.85264587402344, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.989409923553467, + "rewards/margins": 6.844542503356934, + "rewards/rejected": -3.855133056640625, + "step": 14150 + }, + { + "epoch": 3.54, + "grad_norm": 4.9145426750183105, + "learning_rate": 1.9593376221268444e-06, + "logits/chosen": -0.53888338804245, + "logits/rejected": -0.6505885720252991, + "logps/chosen": -47.45542907714844, + "logps/rejected": -96.8354721069336, + "loss": 0.5584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4873604774475098, + "rewards/margins": 6.939249515533447, + "rewards/rejected": -3.4518895149230957, + "step": 14151 + }, + { + "epoch": 3.54, + "grad_norm": 8.067184448242188, + "learning_rate": 1.958713714750904e-06, + "logits/chosen": -0.5461869835853577, + "logits/rejected": -0.6426438093185425, + "logps/chosen": -52.6641960144043, + "logps/rejected": -93.3148193359375, + "loss": 0.6429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2329673767089844, + "rewards/margins": 7.165156364440918, + "rewards/rejected": -3.932189464569092, + "step": 14152 + }, + { + "epoch": 3.54, + "grad_norm": 7.829306602478027, + "learning_rate": 1.9580898825283807e-06, + "logits/chosen": -0.6401547789573669, + "logits/rejected": -0.7155852317810059, + "logps/chosen": -72.69271087646484, + "logps/rejected": -99.02108764648438, + "loss": 0.797, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0119709968566895, + "rewards/margins": 6.616823673248291, + "rewards/rejected": -3.6048524379730225, + "step": 14153 + }, + { + "epoch": 3.54, + "grad_norm": 5.729038238525391, + "learning_rate": 1.9574661254746935e-06, + "logits/chosen": -0.5098713636398315, + "logits/rejected": -0.5982568264007568, + "logps/chosen": -51.050838470458984, + "logps/rejected": -100.51240539550781, + "loss": 0.6111, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9399495124816895, + "rewards/margins": 7.091783046722412, + "rewards/rejected": -4.151833534240723, + "step": 14154 + }, + { + "epoch": 3.54, + "grad_norm": 17.255165100097656, + "learning_rate": 1.9568424436052524e-06, + "logits/chosen": -0.5164510607719421, + "logits/rejected": -0.6526072025299072, + "logps/chosen": -53.637577056884766, + "logps/rejected": -82.28860473632812, + "loss": 0.6562, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1488351821899414, + "rewards/margins": 6.047802925109863, + "rewards/rejected": -2.8989675045013428, + "step": 14155 + }, + { + "epoch": 3.54, + "grad_norm": 5.083330154418945, + "learning_rate": 1.9562188369354735e-06, + "logits/chosen": -0.5846456289291382, + "logits/rejected": -0.6993774771690369, + "logps/chosen": -51.4036979675293, + "logps/rejected": -107.05070495605469, + "loss": 0.6323, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0666301250457764, + "rewards/margins": 7.904425144195557, + "rewards/rejected": -4.837795257568359, + "step": 14156 + }, + { + "epoch": 3.54, + "grad_norm": 2.124572992324829, + "learning_rate": 1.955595305480763e-06, + "logits/chosen": -0.519977331161499, + "logits/rejected": -0.6135685443878174, + "logps/chosen": -58.35758590698242, + "logps/rejected": -102.69232940673828, + "loss": 0.6013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6167891025543213, + "rewards/margins": 7.549654006958008, + "rewards/rejected": -4.932865142822266, + "step": 14157 + }, + { + "epoch": 3.54, + "grad_norm": 3.559307813644409, + "learning_rate": 1.954971849256532e-06, + "logits/chosen": -0.5815342664718628, + "logits/rejected": -0.6514473557472229, + "logps/chosen": -52.79946517944336, + "logps/rejected": -109.11514282226562, + "loss": 0.6, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.269132137298584, + "rewards/margins": 8.07402229309082, + "rewards/rejected": -4.8048906326293945, + "step": 14158 + }, + { + "epoch": 3.54, + "grad_norm": 1.4088153839111328, + "learning_rate": 1.954348468278186e-06, + "logits/chosen": -0.5821576714515686, + "logits/rejected": -0.6617949604988098, + "logps/chosen": -44.97248077392578, + "logps/rejected": -125.51301574707031, + "loss": 0.5238, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0642573833465576, + "rewards/margins": 9.557258605957031, + "rewards/rejected": -6.4930009841918945, + "step": 14159 + }, + { + "epoch": 3.54, + "grad_norm": 4.777597427368164, + "learning_rate": 1.953725162561127e-06, + "logits/chosen": -0.5498818755149841, + "logits/rejected": -0.5709096193313599, + "logps/chosen": -49.68456268310547, + "logps/rejected": -106.32522583007812, + "loss": 0.6105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1168079376220703, + "rewards/margins": 6.571106433868408, + "rewards/rejected": -3.454298496246338, + "step": 14160 + }, + { + "epoch": 3.54, + "grad_norm": 3.739060163497925, + "learning_rate": 1.9531019321207613e-06, + "logits/chosen": -0.5411531925201416, + "logits/rejected": -0.5703704953193665, + "logps/chosen": -51.13875961303711, + "logps/rejected": -103.42190551757812, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0865492820739746, + "rewards/margins": 5.98525333404541, + "rewards/rejected": -2.8987038135528564, + "step": 14161 + }, + { + "epoch": 3.54, + "grad_norm": 6.338798522949219, + "learning_rate": 1.9524787769724858e-06, + "logits/chosen": -0.5548215508460999, + "logits/rejected": -0.5943624973297119, + "logps/chosen": -56.74761199951172, + "logps/rejected": -97.99281311035156, + "loss": 0.7506, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2554450035095215, + "rewards/margins": 6.609426498413086, + "rewards/rejected": -3.3539819717407227, + "step": 14162 + }, + { + "epoch": 3.54, + "grad_norm": 2.340759038925171, + "learning_rate": 1.9518556971317026e-06, + "logits/chosen": -0.45064887404441833, + "logits/rejected": -0.5948400497436523, + "logps/chosen": -64.18045806884766, + "logps/rejected": -101.1853256225586, + "loss": 0.5409, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3399345874786377, + "rewards/margins": 8.83997917175293, + "rewards/rejected": -5.500043869018555, + "step": 14163 + }, + { + "epoch": 3.54, + "grad_norm": 5.616547107696533, + "learning_rate": 1.9512326926138076e-06, + "logits/chosen": -0.6072402000427246, + "logits/rejected": -0.6623989343643188, + "logps/chosen": -46.703636169433594, + "logps/rejected": -124.3351821899414, + "loss": 0.6661, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.002530097961426, + "rewards/margins": 8.08099365234375, + "rewards/rejected": -5.078463554382324, + "step": 14164 + }, + { + "epoch": 3.54, + "grad_norm": 12.483797073364258, + "learning_rate": 1.9506097634341937e-06, + "logits/chosen": -0.549086332321167, + "logits/rejected": -0.618743360042572, + "logps/chosen": -50.768821716308594, + "logps/rejected": -100.34306335449219, + "loss": 0.695, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9582266807556152, + "rewards/margins": 6.737580299377441, + "rewards/rejected": -3.7793545722961426, + "step": 14165 + }, + { + "epoch": 3.54, + "grad_norm": 8.99569320678711, + "learning_rate": 1.9499869096082576e-06, + "logits/chosen": -0.5571945309638977, + "logits/rejected": -0.6365317702293396, + "logps/chosen": -52.13119888305664, + "logps/rejected": -92.31534576416016, + "loss": 0.6361, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9471497535705566, + "rewards/margins": 5.418384075164795, + "rewards/rejected": -2.4712345600128174, + "step": 14166 + }, + { + "epoch": 3.54, + "grad_norm": 4.967752456665039, + "learning_rate": 1.9493641311513893e-06, + "logits/chosen": -0.544381856918335, + "logits/rejected": -0.6169911623001099, + "logps/chosen": -49.80522155761719, + "logps/rejected": -99.37420654296875, + "loss": 0.612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0431554317474365, + "rewards/margins": 6.87196159362793, + "rewards/rejected": -3.828806161880493, + "step": 14167 + }, + { + "epoch": 3.54, + "grad_norm": 7.206462383270264, + "learning_rate": 1.948741428078976e-06, + "logits/chosen": -0.5375076532363892, + "logits/rejected": -0.5985347032546997, + "logps/chosen": -49.67626190185547, + "logps/rejected": -94.45794677734375, + "loss": 0.6679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3888449668884277, + "rewards/margins": 6.931107521057129, + "rewards/rejected": -3.5422630310058594, + "step": 14168 + }, + { + "epoch": 3.54, + "grad_norm": 5.89794397354126, + "learning_rate": 1.948118800406408e-06, + "logits/chosen": -0.6013942956924438, + "logits/rejected": -0.6735362410545349, + "logps/chosen": -43.34030532836914, + "logps/rejected": -86.4600601196289, + "loss": 0.6546, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0941379070281982, + "rewards/margins": 5.402283191680908, + "rewards/rejected": -2.308145046234131, + "step": 14169 + }, + { + "epoch": 3.54, + "grad_norm": 4.4484100341796875, + "learning_rate": 1.947496248149074e-06, + "logits/chosen": -0.5125653147697449, + "logits/rejected": -0.6128159761428833, + "logps/chosen": -51.62071990966797, + "logps/rejected": -112.54937744140625, + "loss": 0.5354, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9239978790283203, + "rewards/margins": 7.124914169311523, + "rewards/rejected": -4.200916290283203, + "step": 14170 + }, + { + "epoch": 3.55, + "grad_norm": 13.130782127380371, + "learning_rate": 1.946873771322351e-06, + "logits/chosen": -0.5461299419403076, + "logits/rejected": -0.5915796756744385, + "logps/chosen": -48.710540771484375, + "logps/rejected": -87.71259307861328, + "loss": 0.721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9253363609313965, + "rewards/margins": 5.036989212036133, + "rewards/rejected": -2.1116533279418945, + "step": 14171 + }, + { + "epoch": 3.55, + "grad_norm": 5.877390384674072, + "learning_rate": 1.946251369941624e-06, + "logits/chosen": -0.49496588110923767, + "logits/rejected": -0.5697298049926758, + "logps/chosen": -65.55370330810547, + "logps/rejected": -112.49927520751953, + "loss": 0.7023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.953589916229248, + "rewards/margins": 6.400542259216309, + "rewards/rejected": -3.4469523429870605, + "step": 14172 + }, + { + "epoch": 3.55, + "grad_norm": 1.9817830324172974, + "learning_rate": 1.945629044022278e-06, + "logits/chosen": -0.5306938886642456, + "logits/rejected": -0.6264612674713135, + "logps/chosen": -43.90690612792969, + "logps/rejected": -100.07017517089844, + "loss": 0.4974, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8788790702819824, + "rewards/margins": 7.4648919105529785, + "rewards/rejected": -4.586012840270996, + "step": 14173 + }, + { + "epoch": 3.55, + "grad_norm": 9.509267807006836, + "learning_rate": 1.9450067935796835e-06, + "logits/chosen": -0.6749724745750427, + "logits/rejected": -0.7293357253074646, + "logps/chosen": -42.801910400390625, + "logps/rejected": -91.74329376220703, + "loss": 0.6751, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.261613607406616, + "rewards/margins": 6.115054607391357, + "rewards/rejected": -2.853440523147583, + "step": 14174 + }, + { + "epoch": 3.55, + "grad_norm": 4.0957350730896, + "learning_rate": 1.9443846186292204e-06, + "logits/chosen": -0.5976331233978271, + "logits/rejected": -0.728198230266571, + "logps/chosen": -69.62705993652344, + "logps/rejected": -99.37777709960938, + "loss": 0.8069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0229270458221436, + "rewards/margins": 6.858760356903076, + "rewards/rejected": -3.8358335494995117, + "step": 14175 + }, + { + "epoch": 3.55, + "grad_norm": 5.679643154144287, + "learning_rate": 1.943762519186265e-06, + "logits/chosen": -0.6064250469207764, + "logits/rejected": -0.6741601824760437, + "logps/chosen": -45.186546325683594, + "logps/rejected": -116.03534698486328, + "loss": 0.5971, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.383392810821533, + "rewards/margins": 7.187019348144531, + "rewards/rejected": -3.8036258220672607, + "step": 14176 + }, + { + "epoch": 3.55, + "grad_norm": 4.678361415863037, + "learning_rate": 1.9431404952661887e-06, + "logits/chosen": -0.5804009437561035, + "logits/rejected": -0.6770262718200684, + "logps/chosen": -58.61211395263672, + "logps/rejected": -94.15392303466797, + "loss": 0.5908, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.060257911682129, + "rewards/margins": 6.819377422332764, + "rewards/rejected": -3.7591185569763184, + "step": 14177 + }, + { + "epoch": 3.55, + "grad_norm": 5.77205228805542, + "learning_rate": 1.9425185468843606e-06, + "logits/chosen": -0.5935299396514893, + "logits/rejected": -0.6734070181846619, + "logps/chosen": -58.959449768066406, + "logps/rejected": -122.76414489746094, + "loss": 0.6488, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.778681755065918, + "rewards/margins": 8.096073150634766, + "rewards/rejected": -5.317391395568848, + "step": 14178 + }, + { + "epoch": 3.55, + "grad_norm": 4.070079326629639, + "learning_rate": 1.9418966740561528e-06, + "logits/chosen": -0.545638918876648, + "logits/rejected": -0.635137140750885, + "logps/chosen": -54.59415817260742, + "logps/rejected": -105.16448974609375, + "loss": 0.5935, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.431305408477783, + "rewards/margins": 7.078534126281738, + "rewards/rejected": -3.6472277641296387, + "step": 14179 + }, + { + "epoch": 3.55, + "grad_norm": 2.255805015563965, + "learning_rate": 1.941274876796931e-06, + "logits/chosen": -0.4730015695095062, + "logits/rejected": -0.5526901483535767, + "logps/chosen": -57.804386138916016, + "logps/rejected": -118.74937438964844, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1646811962127686, + "rewards/margins": 8.56539249420166, + "rewards/rejected": -5.4007110595703125, + "step": 14180 + }, + { + "epoch": 3.55, + "grad_norm": 13.049181938171387, + "learning_rate": 1.9406531551220586e-06, + "logits/chosen": -0.5653614401817322, + "logits/rejected": -0.666111946105957, + "logps/chosen": -53.17645263671875, + "logps/rejected": -107.49884796142578, + "loss": 0.5741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0513577461242676, + "rewards/margins": 6.550515174865723, + "rewards/rejected": -3.4991581439971924, + "step": 14181 + }, + { + "epoch": 3.55, + "grad_norm": 8.897378921508789, + "learning_rate": 1.940031509046903e-06, + "logits/chosen": -0.5907209515571594, + "logits/rejected": -0.6759529113769531, + "logps/chosen": -56.860565185546875, + "logps/rejected": -122.91004180908203, + "loss": 0.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065209150314331, + "rewards/margins": 8.324165344238281, + "rewards/rejected": -5.258955955505371, + "step": 14182 + }, + { + "epoch": 3.55, + "grad_norm": 12.711126327514648, + "learning_rate": 1.939409938586821e-06, + "logits/chosen": -0.5323895215988159, + "logits/rejected": -0.5750930905342102, + "logps/chosen": -62.38642883300781, + "logps/rejected": -107.18229675292969, + "loss": 0.6592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3451170921325684, + "rewards/margins": 6.76818323135376, + "rewards/rejected": -3.4230659008026123, + "step": 14183 + }, + { + "epoch": 3.55, + "grad_norm": 4.868663311004639, + "learning_rate": 1.938788443757177e-06, + "logits/chosen": -0.5786759853363037, + "logits/rejected": -0.6327056884765625, + "logps/chosen": -58.824649810791016, + "logps/rejected": -106.77884674072266, + "loss": 0.5834, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.030850648880005, + "rewards/margins": 7.166122913360596, + "rewards/rejected": -4.13527250289917, + "step": 14184 + }, + { + "epoch": 3.55, + "grad_norm": 4.583050727844238, + "learning_rate": 1.9381670245733263e-06, + "logits/chosen": -0.6427015662193298, + "logits/rejected": -0.6976009011268616, + "logps/chosen": -54.46056365966797, + "logps/rejected": -115.56790161132812, + "loss": 0.6447, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7619266510009766, + "rewards/margins": 6.857461452484131, + "rewards/rejected": -4.095534801483154, + "step": 14185 + }, + { + "epoch": 3.55, + "grad_norm": 4.729682445526123, + "learning_rate": 1.937545681050624e-06, + "logits/chosen": -0.5133886337280273, + "logits/rejected": -0.5719896554946899, + "logps/chosen": -56.4028205871582, + "logps/rejected": -115.05279541015625, + "loss": 0.6475, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8095624446868896, + "rewards/margins": 7.093654155731201, + "rewards/rejected": -4.284091472625732, + "step": 14186 + }, + { + "epoch": 3.55, + "grad_norm": 8.02489185333252, + "learning_rate": 1.936924413204425e-06, + "logits/chosen": -0.5610314607620239, + "logits/rejected": -0.6108133792877197, + "logps/chosen": -53.17344665527344, + "logps/rejected": -104.30913543701172, + "loss": 0.6233, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4215941429138184, + "rewards/margins": 6.986700057983398, + "rewards/rejected": -3.565105438232422, + "step": 14187 + }, + { + "epoch": 3.55, + "grad_norm": 3.7501513957977295, + "learning_rate": 1.936303221050083e-06, + "logits/chosen": -0.47717738151550293, + "logits/rejected": -0.530411958694458, + "logps/chosen": -56.19710922241211, + "logps/rejected": -104.56074523925781, + "loss": 0.5759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8427438735961914, + "rewards/margins": 7.040398597717285, + "rewards/rejected": -4.1976542472839355, + "step": 14188 + }, + { + "epoch": 3.55, + "grad_norm": 5.101579666137695, + "learning_rate": 1.935682104602948e-06, + "logits/chosen": -0.5268585681915283, + "logits/rejected": -0.5814771056175232, + "logps/chosen": -63.49990463256836, + "logps/rejected": -109.98295593261719, + "loss": 0.6049, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8556556701660156, + "rewards/margins": 6.236551284790039, + "rewards/rejected": -3.3808956146240234, + "step": 14189 + }, + { + "epoch": 3.55, + "grad_norm": 18.157331466674805, + "learning_rate": 1.935061063878365e-06, + "logits/chosen": -0.5377611517906189, + "logits/rejected": -0.5877802968025208, + "logps/chosen": -58.05836486816406, + "logps/rejected": -123.15379333496094, + "loss": 0.6148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6463370323181152, + "rewards/margins": 7.107797145843506, + "rewards/rejected": -4.461459159851074, + "step": 14190 + }, + { + "epoch": 3.55, + "grad_norm": 3.1905884742736816, + "learning_rate": 1.9344400988916856e-06, + "logits/chosen": -0.5602047443389893, + "logits/rejected": -0.6241228580474854, + "logps/chosen": -59.483341217041016, + "logps/rejected": -106.76778411865234, + "loss": 0.6481, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9184224605560303, + "rewards/margins": 6.304215431213379, + "rewards/rejected": -3.3857929706573486, + "step": 14191 + }, + { + "epoch": 3.55, + "grad_norm": 4.271719932556152, + "learning_rate": 1.9338192096582515e-06, + "logits/chosen": -0.6117721199989319, + "logits/rejected": -0.6924466490745544, + "logps/chosen": -47.022315979003906, + "logps/rejected": -85.45278930664062, + "loss": 0.6331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.343146324157715, + "rewards/margins": 6.291183948516846, + "rewards/rejected": -2.948037624359131, + "step": 14192 + }, + { + "epoch": 3.55, + "grad_norm": 5.507415771484375, + "learning_rate": 1.933198396193405e-06, + "logits/chosen": -0.5901399850845337, + "logits/rejected": -0.6385886669158936, + "logps/chosen": -68.9524917602539, + "logps/rejected": -97.15177154541016, + "loss": 0.7095, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.825521230697632, + "rewards/margins": 5.5831804275512695, + "rewards/rejected": -2.7576589584350586, + "step": 14193 + }, + { + "epoch": 3.55, + "grad_norm": 2.3901727199554443, + "learning_rate": 1.932577658512489e-06, + "logits/chosen": -0.493562251329422, + "logits/rejected": -0.5727755427360535, + "logps/chosen": -62.06059646606445, + "logps/rejected": -113.16023254394531, + "loss": 0.6141, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.342475652694702, + "rewards/margins": 7.9828948974609375, + "rewards/rejected": -4.6404194831848145, + "step": 14194 + }, + { + "epoch": 3.55, + "grad_norm": 12.233207702636719, + "learning_rate": 1.931956996630842e-06, + "logits/chosen": -0.5544285178184509, + "logits/rejected": -0.6008150577545166, + "logps/chosen": -60.5187873840332, + "logps/rejected": -105.20159149169922, + "loss": 0.7019, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6517837047576904, + "rewards/margins": 6.427379131317139, + "rewards/rejected": -3.7755954265594482, + "step": 14195 + }, + { + "epoch": 3.55, + "grad_norm": 5.1918864250183105, + "learning_rate": 1.9313364105637993e-06, + "logits/chosen": -0.527207612991333, + "logits/rejected": -0.641248345375061, + "logps/chosen": -52.70426559448242, + "logps/rejected": -98.4979476928711, + "loss": 0.5846, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.96675443649292, + "rewards/margins": 6.405208587646484, + "rewards/rejected": -3.4384543895721436, + "step": 14196 + }, + { + "epoch": 3.55, + "grad_norm": 3.8601417541503906, + "learning_rate": 1.9307159003266994e-06, + "logits/chosen": -0.6127343773841858, + "logits/rejected": -0.7005806565284729, + "logps/chosen": -44.68010711669922, + "logps/rejected": -99.12081909179688, + "loss": 0.6312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2318167686462402, + "rewards/margins": 6.9802398681640625, + "rewards/rejected": -3.748422145843506, + "step": 14197 + }, + { + "epoch": 3.55, + "grad_norm": 4.323996067047119, + "learning_rate": 1.930095465934875e-06, + "logits/chosen": -0.4847654104232788, + "logits/rejected": -0.5367827415466309, + "logps/chosen": -49.828468322753906, + "logps/rejected": -127.92022705078125, + "loss": 0.5528, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.357485294342041, + "rewards/margins": 8.119182586669922, + "rewards/rejected": -4.761698246002197, + "step": 14198 + }, + { + "epoch": 3.55, + "grad_norm": 3.1269073486328125, + "learning_rate": 1.9294751074036543e-06, + "logits/chosen": -0.5793765783309937, + "logits/rejected": -0.6493953466415405, + "logps/chosen": -48.87266540527344, + "logps/rejected": -94.55486297607422, + "loss": 0.5746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.121993064880371, + "rewards/margins": 7.2030029296875, + "rewards/rejected": -4.081010818481445, + "step": 14199 + }, + { + "epoch": 3.55, + "grad_norm": 8.024194717407227, + "learning_rate": 1.92885482474837e-06, + "logits/chosen": -0.4898131489753723, + "logits/rejected": -0.5736480355262756, + "logps/chosen": -65.50108337402344, + "logps/rejected": -110.75150299072266, + "loss": 0.6607, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9402987957000732, + "rewards/margins": 7.388128757476807, + "rewards/rejected": -4.447829723358154, + "step": 14200 + }, + { + "epoch": 3.55, + "grad_norm": 22.9786434173584, + "learning_rate": 1.9282346179843515e-06, + "logits/chosen": -0.5452892184257507, + "logits/rejected": -0.6298854351043701, + "logps/chosen": -61.67747116088867, + "logps/rejected": -103.25952911376953, + "loss": 0.7483, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.048046827316284, + "rewards/margins": 6.518221378326416, + "rewards/rejected": -3.470174789428711, + "step": 14201 + }, + { + "epoch": 3.55, + "grad_norm": 3.74802303314209, + "learning_rate": 1.9276144871269225e-06, + "logits/chosen": -0.5660095810890198, + "logits/rejected": -0.6270430684089661, + "logps/chosen": -45.109039306640625, + "logps/rejected": -95.54896545410156, + "loss": 0.5462, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9463744163513184, + "rewards/margins": 6.618796348571777, + "rewards/rejected": -3.672421932220459, + "step": 14202 + }, + { + "epoch": 3.55, + "grad_norm": 1.5485502481460571, + "learning_rate": 1.926994432191406e-06, + "logits/chosen": -0.5888625383377075, + "logits/rejected": -0.6775541305541992, + "logps/chosen": -45.84417724609375, + "logps/rejected": -117.09794616699219, + "loss": 0.5096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.316481113433838, + "rewards/margins": 9.18522834777832, + "rewards/rejected": -5.868747234344482, + "step": 14203 + }, + { + "epoch": 3.55, + "grad_norm": 7.489602565765381, + "learning_rate": 1.926374453193127e-06, + "logits/chosen": -0.5008345246315002, + "logits/rejected": -0.6067168712615967, + "logps/chosen": -59.84184265136719, + "logps/rejected": -85.95957946777344, + "loss": 0.5824, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0584707260131836, + "rewards/margins": 6.184406280517578, + "rewards/rejected": -3.1259355545043945, + "step": 14204 + }, + { + "epoch": 3.55, + "grad_norm": 4.844703674316406, + "learning_rate": 1.9257545501474053e-06, + "logits/chosen": -0.6165680289268494, + "logits/rejected": -0.680290937423706, + "logps/chosen": -58.38505935668945, + "logps/rejected": -129.1202392578125, + "loss": 0.6295, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930619955062866, + "rewards/margins": 8.447816848754883, + "rewards/rejected": -5.517197608947754, + "step": 14205 + }, + { + "epoch": 3.55, + "grad_norm": 10.11551284790039, + "learning_rate": 1.925134723069556e-06, + "logits/chosen": -0.49291396141052246, + "logits/rejected": -0.5925648212432861, + "logps/chosen": -54.24184036254883, + "logps/rejected": -104.35447692871094, + "loss": 0.6422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078969955444336, + "rewards/margins": 7.191025257110596, + "rewards/rejected": -4.112055778503418, + "step": 14206 + }, + { + "epoch": 3.55, + "grad_norm": 3.2958147525787354, + "learning_rate": 1.9245149719749e-06, + "logits/chosen": -0.6237578988075256, + "logits/rejected": -0.6966714262962341, + "logps/chosen": -44.70158386230469, + "logps/rejected": -103.97100067138672, + "loss": 0.5612, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.997065782546997, + "rewards/margins": 7.414631366729736, + "rewards/rejected": -4.41756534576416, + "step": 14207 + }, + { + "epoch": 3.55, + "grad_norm": 14.42208480834961, + "learning_rate": 1.9238952968787495e-06, + "logits/chosen": -0.5227717161178589, + "logits/rejected": -0.5953749418258667, + "logps/chosen": -58.960628509521484, + "logps/rejected": -107.78279876708984, + "loss": 0.6989, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0486836433410645, + "rewards/margins": 6.426124572753906, + "rewards/rejected": -3.3774404525756836, + "step": 14208 + }, + { + "epoch": 3.55, + "grad_norm": 9.7208833694458, + "learning_rate": 1.923275697796419e-06, + "logits/chosen": -0.5405974984169006, + "logits/rejected": -0.6400027275085449, + "logps/chosen": -53.32914733886719, + "logps/rejected": -100.21295928955078, + "loss": 0.6973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2649567127227783, + "rewards/margins": 6.375217437744141, + "rewards/rejected": -3.110260486602783, + "step": 14209 + }, + { + "epoch": 3.55, + "grad_norm": 3.4360122680664062, + "learning_rate": 1.9226561747432188e-06, + "logits/chosen": -0.5186237096786499, + "logits/rejected": -0.6479395031929016, + "logps/chosen": -63.934173583984375, + "logps/rejected": -91.10747528076172, + "loss": 0.6223, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.245283603668213, + "rewards/margins": 6.792962074279785, + "rewards/rejected": -3.5476787090301514, + "step": 14210 + }, + { + "epoch": 3.56, + "grad_norm": 6.5576171875, + "learning_rate": 1.922036727734456e-06, + "logits/chosen": -0.5590426325798035, + "logits/rejected": -0.6260635852813721, + "logps/chosen": -46.68903732299805, + "logps/rejected": -102.35704803466797, + "loss": 0.5591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0501272678375244, + "rewards/margins": 7.059471130371094, + "rewards/rejected": -4.00934362411499, + "step": 14211 + }, + { + "epoch": 3.56, + "grad_norm": 2.9711389541625977, + "learning_rate": 1.9214173567854395e-06, + "logits/chosen": -0.5525553226470947, + "logits/rejected": -0.6306157112121582, + "logps/chosen": -49.12907409667969, + "logps/rejected": -92.12716674804688, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.279841423034668, + "rewards/margins": 6.970525741577148, + "rewards/rejected": -3.6906843185424805, + "step": 14212 + }, + { + "epoch": 3.56, + "grad_norm": 5.3033294677734375, + "learning_rate": 1.920798061911478e-06, + "logits/chosen": -0.5703876614570618, + "logits/rejected": -0.5934087038040161, + "logps/chosen": -48.56586456298828, + "logps/rejected": -117.11186981201172, + "loss": 0.6453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3587145805358887, + "rewards/margins": 7.703794479370117, + "rewards/rejected": -4.345080375671387, + "step": 14213 + }, + { + "epoch": 3.56, + "grad_norm": 4.8385009765625, + "learning_rate": 1.9201788431278684e-06, + "logits/chosen": -0.4746505320072174, + "logits/rejected": -0.5355567336082458, + "logps/chosen": -61.3514518737793, + "logps/rejected": -120.7571792602539, + "loss": 0.6737, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.262104034423828, + "rewards/margins": 6.4068827629089355, + "rewards/rejected": -3.144778251647949, + "step": 14214 + }, + { + "epoch": 3.56, + "grad_norm": 3.3426856994628906, + "learning_rate": 1.919559700449915e-06, + "logits/chosen": -0.5657681226730347, + "logits/rejected": -0.6349983215332031, + "logps/chosen": -48.00426483154297, + "logps/rejected": -116.94955444335938, + "loss": 0.5482, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.288883686065674, + "rewards/margins": 7.556726455688477, + "rewards/rejected": -4.267842769622803, + "step": 14215 + }, + { + "epoch": 3.56, + "grad_norm": 5.538871765136719, + "learning_rate": 1.9189406338929217e-06, + "logits/chosen": -0.4733717441558838, + "logits/rejected": -0.5613968372344971, + "logps/chosen": -64.24127197265625, + "logps/rejected": -101.9573745727539, + "loss": 0.7008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.960937023162842, + "rewards/margins": 7.0637712478637695, + "rewards/rejected": -4.1028337478637695, + "step": 14216 + }, + { + "epoch": 3.56, + "grad_norm": 3.278123140335083, + "learning_rate": 1.918321643472179e-06, + "logits/chosen": -0.4443714916706085, + "logits/rejected": -0.5419138669967651, + "logps/chosen": -55.97332000732422, + "logps/rejected": -86.35707092285156, + "loss": 0.6195, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.684138774871826, + "rewards/margins": 6.303444862365723, + "rewards/rejected": -3.6193060874938965, + "step": 14217 + }, + { + "epoch": 3.56, + "grad_norm": 9.32483196258545, + "learning_rate": 1.9177027292029867e-06, + "logits/chosen": -0.6137815117835999, + "logits/rejected": -0.7159085869789124, + "logps/chosen": -59.896244049072266, + "logps/rejected": -92.65084075927734, + "loss": 0.6648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.016439199447632, + "rewards/margins": 6.7803635597229, + "rewards/rejected": -3.7639243602752686, + "step": 14218 + }, + { + "epoch": 3.56, + "grad_norm": 4.185838222503662, + "learning_rate": 1.9170838911006397e-06, + "logits/chosen": -0.5385235548019409, + "logits/rejected": -0.5856478214263916, + "logps/chosen": -46.833168029785156, + "logps/rejected": -118.62126922607422, + "loss": 0.5818, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1329455375671387, + "rewards/margins": 6.777419090270996, + "rewards/rejected": -3.644473075866699, + "step": 14219 + }, + { + "epoch": 3.56, + "grad_norm": 3.9587697982788086, + "learning_rate": 1.916465129180429e-06, + "logits/chosen": -0.6261030435562134, + "logits/rejected": -0.682660698890686, + "logps/chosen": -49.15278244018555, + "logps/rejected": -111.68475341796875, + "loss": 0.6306, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.09101939201355, + "rewards/margins": 7.1936516761779785, + "rewards/rejected": -4.102632522583008, + "step": 14220 + }, + { + "epoch": 3.56, + "grad_norm": 5.403188705444336, + "learning_rate": 1.9158464434576425e-06, + "logits/chosen": -0.6029098629951477, + "logits/rejected": -0.6614605188369751, + "logps/chosen": -43.73286437988281, + "logps/rejected": -81.75523376464844, + "loss": 0.5772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2344274520874023, + "rewards/margins": 5.577670574188232, + "rewards/rejected": -2.343243360519409, + "step": 14221 + }, + { + "epoch": 3.56, + "grad_norm": 6.199565887451172, + "learning_rate": 1.915227833947573e-06, + "logits/chosen": -0.5663995742797852, + "logits/rejected": -0.6563234329223633, + "logps/chosen": -56.50908279418945, + "logps/rejected": -93.05516052246094, + "loss": 0.6086, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.112394332885742, + "rewards/margins": 7.2794575691223145, + "rewards/rejected": -4.167063236236572, + "step": 14222 + }, + { + "epoch": 3.56, + "grad_norm": 2.1867928504943848, + "learning_rate": 1.9146093006655046e-06, + "logits/chosen": -0.5140905976295471, + "logits/rejected": -0.6230670809745789, + "logps/chosen": -53.48931121826172, + "logps/rejected": -84.80567169189453, + "loss": 0.5725, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1856703758239746, + "rewards/margins": 6.926888942718506, + "rewards/rejected": -3.7412190437316895, + "step": 14223 + }, + { + "epoch": 3.56, + "grad_norm": 5.144310474395752, + "learning_rate": 1.9139908436267207e-06, + "logits/chosen": -0.5666303038597107, + "logits/rejected": -0.6294741630554199, + "logps/chosen": -52.56513977050781, + "logps/rejected": -113.47514343261719, + "loss": 0.7047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.973426103591919, + "rewards/margins": 8.233675956726074, + "rewards/rejected": -5.260250091552734, + "step": 14224 + }, + { + "epoch": 3.56, + "grad_norm": 7.999405860900879, + "learning_rate": 1.9133724628465074e-06, + "logits/chosen": -0.6448130011558533, + "logits/rejected": -0.7699081897735596, + "logps/chosen": -59.64836883544922, + "logps/rejected": -87.76689147949219, + "loss": 0.6926, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8674044609069824, + "rewards/margins": 6.29322624206543, + "rewards/rejected": -3.4258222579956055, + "step": 14225 + }, + { + "epoch": 3.56, + "grad_norm": 6.220132350921631, + "learning_rate": 1.9127541583401414e-06, + "logits/chosen": -0.563296377658844, + "logits/rejected": -0.6028072834014893, + "logps/chosen": -48.630496978759766, + "logps/rejected": -114.0355224609375, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4393150806427, + "rewards/margins": 7.684960842132568, + "rewards/rejected": -4.245645999908447, + "step": 14226 + }, + { + "epoch": 3.56, + "grad_norm": 7.985446929931641, + "learning_rate": 1.9121359301229066e-06, + "logits/chosen": -0.48746514320373535, + "logits/rejected": -0.5719959735870361, + "logps/chosen": -63.927894592285156, + "logps/rejected": -123.57601928710938, + "loss": 0.5924, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.916604995727539, + "rewards/margins": 7.903920650482178, + "rewards/rejected": -4.987315654754639, + "step": 14227 + }, + { + "epoch": 3.56, + "grad_norm": 4.59541130065918, + "learning_rate": 1.911517778210076e-06, + "logits/chosen": -0.43892788887023926, + "logits/rejected": -0.5017656087875366, + "logps/chosen": -66.09418487548828, + "logps/rejected": -128.36367797851562, + "loss": 0.6788, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3343453407287598, + "rewards/margins": 7.242712020874023, + "rewards/rejected": -3.9083669185638428, + "step": 14228 + }, + { + "epoch": 3.56, + "grad_norm": 6.169580459594727, + "learning_rate": 1.910899702616925e-06, + "logits/chosen": -0.5285073518753052, + "logits/rejected": -0.6057248115539551, + "logps/chosen": -56.33454513549805, + "logps/rejected": -108.66683197021484, + "loss": 0.7213, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.191218376159668, + "rewards/margins": 6.600329399108887, + "rewards/rejected": -3.409111261367798, + "step": 14229 + }, + { + "epoch": 3.56, + "grad_norm": 17.301395416259766, + "learning_rate": 1.9102817033587303e-06, + "logits/chosen": -0.6027767658233643, + "logits/rejected": -0.6837749481201172, + "logps/chosen": -58.967063903808594, + "logps/rejected": -93.02381134033203, + "loss": 0.6785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9399948120117188, + "rewards/margins": 6.601864337921143, + "rewards/rejected": -3.661869525909424, + "step": 14230 + }, + { + "epoch": 3.56, + "grad_norm": 3.4970216751098633, + "learning_rate": 1.9096637804507588e-06, + "logits/chosen": -0.4939873516559601, + "logits/rejected": -0.5612644553184509, + "logps/chosen": -56.355777740478516, + "logps/rejected": -125.86771392822266, + "loss": 0.58, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1241369247436523, + "rewards/margins": 6.950056552886963, + "rewards/rejected": -3.8259196281433105, + "step": 14231 + }, + { + "epoch": 3.56, + "grad_norm": 5.16087532043457, + "learning_rate": 1.909045933908284e-06, + "logits/chosen": -0.5812615156173706, + "logits/rejected": -0.6315475702285767, + "logps/chosen": -51.48903274536133, + "logps/rejected": -98.60610961914062, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.007007598876953, + "rewards/margins": 6.406391620635986, + "rewards/rejected": -3.399384021759033, + "step": 14232 + }, + { + "epoch": 3.56, + "grad_norm": 6.496367454528809, + "learning_rate": 1.9084281637465706e-06, + "logits/chosen": -0.5070309042930603, + "logits/rejected": -0.6161326766014099, + "logps/chosen": -51.85206985473633, + "logps/rejected": -133.27679443359375, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.768101453781128, + "rewards/margins": 9.222350120544434, + "rewards/rejected": -6.454248428344727, + "step": 14233 + }, + { + "epoch": 3.56, + "grad_norm": 4.043724060058594, + "learning_rate": 1.907810469980887e-06, + "logits/chosen": -0.6060877442359924, + "logits/rejected": -0.621654212474823, + "logps/chosen": -44.49676513671875, + "logps/rejected": -116.85986328125, + "loss": 0.6676, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0851409435272217, + "rewards/margins": 7.922766208648682, + "rewards/rejected": -4.837625503540039, + "step": 14234 + }, + { + "epoch": 3.56, + "grad_norm": 5.265302658081055, + "learning_rate": 1.9071928526264955e-06, + "logits/chosen": -0.5881551504135132, + "logits/rejected": -0.6929998993873596, + "logps/chosen": -62.873870849609375, + "logps/rejected": -100.64158630371094, + "loss": 0.6944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2005810737609863, + "rewards/margins": 7.098724842071533, + "rewards/rejected": -3.898144245147705, + "step": 14235 + }, + { + "epoch": 3.56, + "grad_norm": 3.04750657081604, + "learning_rate": 1.906575311698657e-06, + "logits/chosen": -0.5910651087760925, + "logits/rejected": -0.6871154308319092, + "logps/chosen": -50.37529373168945, + "logps/rejected": -110.1152114868164, + "loss": 0.5555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.622014045715332, + "rewards/margins": 8.292243957519531, + "rewards/rejected": -4.670229434967041, + "step": 14236 + }, + { + "epoch": 3.56, + "grad_norm": 3.6135385036468506, + "learning_rate": 1.9059578472126344e-06, + "logits/chosen": -0.5156278610229492, + "logits/rejected": -0.6252016425132751, + "logps/chosen": -56.50766372680664, + "logps/rejected": -99.84552001953125, + "loss": 0.624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.056462526321411, + "rewards/margins": 7.598476409912109, + "rewards/rejected": -4.542014122009277, + "step": 14237 + }, + { + "epoch": 3.56, + "grad_norm": 2.628037929534912, + "learning_rate": 1.9053404591836843e-06, + "logits/chosen": -0.5448476672172546, + "logits/rejected": -0.6663514375686646, + "logps/chosen": -53.519813537597656, + "logps/rejected": -102.13289642333984, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2270963191986084, + "rewards/margins": 8.213546752929688, + "rewards/rejected": -4.986449718475342, + "step": 14238 + }, + { + "epoch": 3.56, + "grad_norm": 1.635361671447754, + "learning_rate": 1.9047231476270612e-06, + "logits/chosen": -0.5542236566543579, + "logits/rejected": -0.6623886227607727, + "logps/chosen": -51.978233337402344, + "logps/rejected": -123.3810806274414, + "loss": 0.5464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.344123363494873, + "rewards/margins": 9.485711097717285, + "rewards/rejected": -6.141587257385254, + "step": 14239 + }, + { + "epoch": 3.56, + "grad_norm": 2.8952481746673584, + "learning_rate": 1.9041059125580236e-06, + "logits/chosen": -0.5049691796302795, + "logits/rejected": -0.6459865570068359, + "logps/chosen": -55.74577331542969, + "logps/rejected": -110.07131958007812, + "loss": 0.5787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2756357192993164, + "rewards/margins": 7.831088066101074, + "rewards/rejected": -4.555452346801758, + "step": 14240 + }, + { + "epoch": 3.56, + "grad_norm": 4.190924167633057, + "learning_rate": 1.9034887539918212e-06, + "logits/chosen": -0.5592618584632874, + "logits/rejected": -0.6268796920776367, + "logps/chosen": -55.05133056640625, + "logps/rejected": -112.05414581298828, + "loss": 0.6618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0151584148406982, + "rewards/margins": 7.460813522338867, + "rewards/rejected": -4.445655345916748, + "step": 14241 + }, + { + "epoch": 3.56, + "grad_norm": 3.4606242179870605, + "learning_rate": 1.902871671943704e-06, + "logits/chosen": -0.5153095126152039, + "logits/rejected": -0.5905144214630127, + "logps/chosen": -51.90196228027344, + "logps/rejected": -99.31791687011719, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9420604705810547, + "rewards/margins": 6.61745548248291, + "rewards/rejected": -3.6753954887390137, + "step": 14242 + }, + { + "epoch": 3.56, + "grad_norm": 3.1067955493927, + "learning_rate": 1.902254666428922e-06, + "logits/chosen": -0.56571364402771, + "logits/rejected": -0.6205645203590393, + "logps/chosen": -55.59516906738281, + "logps/rejected": -114.63790893554688, + "loss": 0.6303, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3825337886810303, + "rewards/margins": 6.388644695281982, + "rewards/rejected": -3.006110906600952, + "step": 14243 + }, + { + "epoch": 3.56, + "grad_norm": 4.91870641708374, + "learning_rate": 1.9016377374627236e-06, + "logits/chosen": -0.5172789096832275, + "logits/rejected": -0.6003315448760986, + "logps/chosen": -54.53153991699219, + "logps/rejected": -102.07659912109375, + "loss": 0.5237, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4567646980285645, + "rewards/margins": 6.269153594970703, + "rewards/rejected": -2.8123886585235596, + "step": 14244 + }, + { + "epoch": 3.56, + "grad_norm": 5.729637145996094, + "learning_rate": 1.901020885060353e-06, + "logits/chosen": -0.6138793230056763, + "logits/rejected": -0.6981269121170044, + "logps/chosen": -68.46456909179688, + "logps/rejected": -100.41891479492188, + "loss": 0.7243, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7813546657562256, + "rewards/margins": 6.169228553771973, + "rewards/rejected": -3.387873411178589, + "step": 14245 + }, + { + "epoch": 3.56, + "grad_norm": 5.93673038482666, + "learning_rate": 1.90040410923705e-06, + "logits/chosen": -0.5281095504760742, + "logits/rejected": -0.6621373891830444, + "logps/chosen": -55.01565933227539, + "logps/rejected": -122.95386505126953, + "loss": 0.6561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8615875244140625, + "rewards/margins": 8.774574279785156, + "rewards/rejected": -5.9129862785339355, + "step": 14246 + }, + { + "epoch": 3.56, + "grad_norm": 6.178975582122803, + "learning_rate": 1.8997874100080604e-06, + "logits/chosen": -0.6403456926345825, + "logits/rejected": -0.7212154865264893, + "logps/chosen": -48.50990676879883, + "logps/rejected": -107.97622680664062, + "loss": 0.6668, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.84525203704834, + "rewards/margins": 7.576624393463135, + "rewards/rejected": -4.731373310089111, + "step": 14247 + }, + { + "epoch": 3.56, + "grad_norm": 5.301607608795166, + "learning_rate": 1.8991707873886212e-06, + "logits/chosen": -0.5457733869552612, + "logits/rejected": -0.6276770830154419, + "logps/chosen": -54.40819549560547, + "logps/rejected": -100.96073150634766, + "loss": 0.7469, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0470669269561768, + "rewards/margins": 6.77232551574707, + "rewards/rejected": -3.7252585887908936, + "step": 14248 + }, + { + "epoch": 3.56, + "grad_norm": 3.278367519378662, + "learning_rate": 1.8985542413939684e-06, + "logits/chosen": -0.5376092195510864, + "logits/rejected": -0.6351818442344666, + "logps/chosen": -67.23133087158203, + "logps/rejected": -122.23541259765625, + "loss": 0.6446, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7869768142700195, + "rewards/margins": 8.610302925109863, + "rewards/rejected": -5.823326110839844, + "step": 14249 + }, + { + "epoch": 3.56, + "grad_norm": 2.8490824699401855, + "learning_rate": 1.897937772039341e-06, + "logits/chosen": -0.4586411118507385, + "logits/rejected": -0.5326910614967346, + "logps/chosen": -48.76445388793945, + "logps/rejected": -120.99562072753906, + "loss": 0.5646, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4485859870910645, + "rewards/margins": 8.764060020446777, + "rewards/rejected": -5.315473556518555, + "step": 14250 + }, + { + "epoch": 3.57, + "grad_norm": 4.832265377044678, + "learning_rate": 1.8973213793399691e-06, + "logits/chosen": -0.5952274799346924, + "logits/rejected": -0.6691017150878906, + "logps/chosen": -44.71271514892578, + "logps/rejected": -111.54155731201172, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.763572931289673, + "rewards/margins": 8.059289932250977, + "rewards/rejected": -5.295717239379883, + "step": 14251 + }, + { + "epoch": 3.57, + "grad_norm": 3.1107959747314453, + "learning_rate": 1.8967050633110873e-06, + "logits/chosen": -0.5471060276031494, + "logits/rejected": -0.6147607564926147, + "logps/chosen": -57.21961975097656, + "logps/rejected": -94.61236572265625, + "loss": 0.6128, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.60677170753479, + "rewards/margins": 6.367950439453125, + "rewards/rejected": -2.761178970336914, + "step": 14252 + }, + { + "epoch": 3.57, + "grad_norm": 7.0366926193237305, + "learning_rate": 1.8960888239679248e-06, + "logits/chosen": -0.47231194376945496, + "logits/rejected": -0.5215808153152466, + "logps/chosen": -62.077144622802734, + "logps/rejected": -131.02626037597656, + "loss": 0.6931, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3288238048553467, + "rewards/margins": 7.627017974853516, + "rewards/rejected": -4.29819393157959, + "step": 14253 + }, + { + "epoch": 3.57, + "grad_norm": 5.161002159118652, + "learning_rate": 1.8954726613257068e-06, + "logits/chosen": -0.4595019817352295, + "logits/rejected": -0.522806704044342, + "logps/chosen": -47.80940246582031, + "logps/rejected": -114.34944915771484, + "loss": 0.5419, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3196918964385986, + "rewards/margins": 6.614414215087891, + "rewards/rejected": -3.294722557067871, + "step": 14254 + }, + { + "epoch": 3.57, + "grad_norm": 9.995667457580566, + "learning_rate": 1.8948565753996634e-06, + "logits/chosen": -0.5468235611915588, + "logits/rejected": -0.6058948636054993, + "logps/chosen": -55.04528045654297, + "logps/rejected": -122.90226745605469, + "loss": 0.5746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2980153560638428, + "rewards/margins": 7.695845603942871, + "rewards/rejected": -4.397830009460449, + "step": 14255 + }, + { + "epoch": 3.57, + "grad_norm": 10.128742218017578, + "learning_rate": 1.8942405662050167e-06, + "logits/chosen": -0.4849000871181488, + "logits/rejected": -0.578218936920166, + "logps/chosen": -60.99406051635742, + "logps/rejected": -121.08496856689453, + "loss": 0.7823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225830316543579, + "rewards/margins": 7.803550720214844, + "rewards/rejected": -4.577720642089844, + "step": 14256 + }, + { + "epoch": 3.57, + "grad_norm": 8.274402618408203, + "learning_rate": 1.8936246337569875e-06, + "logits/chosen": -0.48812365531921387, + "logits/rejected": -0.5421386957168579, + "logps/chosen": -54.629154205322266, + "logps/rejected": -122.09130096435547, + "loss": 0.6289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8195910453796387, + "rewards/margins": 8.26764965057373, + "rewards/rejected": -5.448058605194092, + "step": 14257 + }, + { + "epoch": 3.57, + "grad_norm": 8.062857627868652, + "learning_rate": 1.8930087780707973e-06, + "logits/chosen": -0.5375503301620483, + "logits/rejected": -0.5867518186569214, + "logps/chosen": -62.4661750793457, + "logps/rejected": -104.25664520263672, + "loss": 0.7816, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.94675874710083, + "rewards/margins": 6.390252113342285, + "rewards/rejected": -3.4434938430786133, + "step": 14258 + }, + { + "epoch": 3.57, + "grad_norm": 12.89730453491211, + "learning_rate": 1.8923929991616691e-06, + "logits/chosen": -0.5710408687591553, + "logits/rejected": -0.6541379690170288, + "logps/chosen": -51.86171340942383, + "logps/rejected": -90.76234436035156, + "loss": 0.6529, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4732894897460938, + "rewards/margins": 6.578065872192383, + "rewards/rejected": -3.104776382446289, + "step": 14259 + }, + { + "epoch": 3.57, + "grad_norm": 7.40620756149292, + "learning_rate": 1.8917772970448112e-06, + "logits/chosen": -0.5097737312316895, + "logits/rejected": -0.6002709269523621, + "logps/chosen": -60.827823638916016, + "logps/rejected": -107.39978790283203, + "loss": 0.6772, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1585819721221924, + "rewards/margins": 7.288569450378418, + "rewards/rejected": -4.129988193511963, + "step": 14260 + }, + { + "epoch": 3.57, + "grad_norm": 4.312283039093018, + "learning_rate": 1.8911616717354425e-06, + "logits/chosen": -0.5498430728912354, + "logits/rejected": -0.6376453042030334, + "logps/chosen": -62.1990852355957, + "logps/rejected": -110.55484008789062, + "loss": 0.6267, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.119575262069702, + "rewards/margins": 7.328338623046875, + "rewards/rejected": -4.208763599395752, + "step": 14261 + }, + { + "epoch": 3.57, + "grad_norm": 6.588212966918945, + "learning_rate": 1.890546123248777e-06, + "logits/chosen": -0.4966025650501251, + "logits/rejected": -0.5527105927467346, + "logps/chosen": -67.45172119140625, + "logps/rejected": -113.67904663085938, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5293800830841064, + "rewards/margins": 7.475892066955566, + "rewards/rejected": -3.946512460708618, + "step": 14262 + }, + { + "epoch": 3.57, + "grad_norm": 6.165699481964111, + "learning_rate": 1.8899306516000248e-06, + "logits/chosen": -0.5638112425804138, + "logits/rejected": -0.6539286971092224, + "logps/chosen": -70.38572692871094, + "logps/rejected": -106.94039154052734, + "loss": 0.6711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0996735095977783, + "rewards/margins": 6.263590335845947, + "rewards/rejected": -3.163917303085327, + "step": 14263 + }, + { + "epoch": 3.57, + "grad_norm": 3.5589845180511475, + "learning_rate": 1.8893152568043922e-06, + "logits/chosen": -0.5300949811935425, + "logits/rejected": -0.552580714225769, + "logps/chosen": -50.48104476928711, + "logps/rejected": -118.33940124511719, + "loss": 0.5995, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.154775619506836, + "rewards/margins": 6.958974838256836, + "rewards/rejected": -3.8041985034942627, + "step": 14264 + }, + { + "epoch": 3.57, + "grad_norm": 4.162415027618408, + "learning_rate": 1.88869993887709e-06, + "logits/chosen": -0.5095456838607788, + "logits/rejected": -0.5903123617172241, + "logps/chosen": -56.998504638671875, + "logps/rejected": -115.34054565429688, + "loss": 0.6039, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.159785032272339, + "rewards/margins": 6.880270481109619, + "rewards/rejected": -3.7204856872558594, + "step": 14265 + }, + { + "epoch": 3.57, + "grad_norm": 4.855930328369141, + "learning_rate": 1.8880846978333216e-06, + "logits/chosen": -0.5460894107818604, + "logits/rejected": -0.6748837232589722, + "logps/chosen": -53.115638732910156, + "logps/rejected": -88.76498413085938, + "loss": 0.6101, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3101930618286133, + "rewards/margins": 6.982303142547607, + "rewards/rejected": -3.672110080718994, + "step": 14266 + }, + { + "epoch": 3.57, + "grad_norm": 8.127386093139648, + "learning_rate": 1.8874695336882886e-06, + "logits/chosen": -0.5221402645111084, + "logits/rejected": -0.6324359774589539, + "logps/chosen": -75.6514892578125, + "logps/rejected": -121.485107421875, + "loss": 0.7521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8983922004699707, + "rewards/margins": 7.339552402496338, + "rewards/rejected": -4.441160202026367, + "step": 14267 + }, + { + "epoch": 3.57, + "grad_norm": 7.307713031768799, + "learning_rate": 1.8868544464571958e-06, + "logits/chosen": -0.5310551524162292, + "logits/rejected": -0.6441106200218201, + "logps/chosen": -57.129600524902344, + "logps/rejected": -101.18696594238281, + "loss": 0.6207, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.065829038619995, + "rewards/margins": 6.817532062530518, + "rewards/rejected": -3.751702308654785, + "step": 14268 + }, + { + "epoch": 3.57, + "grad_norm": 3.2526471614837646, + "learning_rate": 1.8862394361552395e-06, + "logits/chosen": -0.5777257680892944, + "logits/rejected": -0.6844477653503418, + "logps/chosen": -72.0969009399414, + "logps/rejected": -99.59220886230469, + "loss": 0.703, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2663192749023438, + "rewards/margins": 7.926912307739258, + "rewards/rejected": -4.660593032836914, + "step": 14269 + }, + { + "epoch": 3.57, + "grad_norm": 9.339390754699707, + "learning_rate": 1.88562450279762e-06, + "logits/chosen": -0.5041351914405823, + "logits/rejected": -0.6381623148918152, + "logps/chosen": -66.90687561035156, + "logps/rejected": -104.3309555053711, + "loss": 0.8062, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.645125150680542, + "rewards/margins": 6.369142532348633, + "rewards/rejected": -3.724018096923828, + "step": 14270 + }, + { + "epoch": 3.57, + "grad_norm": 5.057342529296875, + "learning_rate": 1.8850096463995315e-06, + "logits/chosen": -0.5835529565811157, + "logits/rejected": -0.6365930438041687, + "logps/chosen": -49.661678314208984, + "logps/rejected": -124.72792053222656, + "loss": 0.6035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8723626136779785, + "rewards/margins": 7.218483924865723, + "rewards/rejected": -4.346120834350586, + "step": 14271 + }, + { + "epoch": 3.57, + "grad_norm": 5.306821346282959, + "learning_rate": 1.8843948669761664e-06, + "logits/chosen": -0.4962935745716095, + "logits/rejected": -0.5520606637001038, + "logps/chosen": -56.24011993408203, + "logps/rejected": -98.5927505493164, + "loss": 0.5581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0226712226867676, + "rewards/margins": 6.442539215087891, + "rewards/rejected": -3.419867515563965, + "step": 14272 + }, + { + "epoch": 3.57, + "grad_norm": 4.01471471786499, + "learning_rate": 1.8837801645427195e-06, + "logits/chosen": -0.5035409927368164, + "logits/rejected": -0.6102556586265564, + "logps/chosen": -53.84023666381836, + "logps/rejected": -110.49565887451172, + "loss": 0.6047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2581887245178223, + "rewards/margins": 7.462123870849609, + "rewards/rejected": -4.203935623168945, + "step": 14273 + }, + { + "epoch": 3.57, + "grad_norm": 5.12561559677124, + "learning_rate": 1.88316553911438e-06, + "logits/chosen": -0.47627973556518555, + "logits/rejected": -0.567464292049408, + "logps/chosen": -52.407936096191406, + "logps/rejected": -102.08674621582031, + "loss": 0.6184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.999530792236328, + "rewards/margins": 6.657602310180664, + "rewards/rejected": -3.658071756362915, + "step": 14274 + }, + { + "epoch": 3.57, + "grad_norm": 5.324523448944092, + "learning_rate": 1.8825509907063328e-06, + "logits/chosen": -0.5571669936180115, + "logits/rejected": -0.6043353080749512, + "logps/chosen": -54.99885559082031, + "logps/rejected": -124.551025390625, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7067906856536865, + "rewards/margins": 7.588029861450195, + "rewards/rejected": -4.88123893737793, + "step": 14275 + }, + { + "epoch": 3.57, + "grad_norm": 4.258323669433594, + "learning_rate": 1.881936519333767e-06, + "logits/chosen": -0.5935741662979126, + "logits/rejected": -0.6623646020889282, + "logps/chosen": -48.64973449707031, + "logps/rejected": -94.06614685058594, + "loss": 0.633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0322036743164062, + "rewards/margins": 5.773264408111572, + "rewards/rejected": -2.741060733795166, + "step": 14276 + }, + { + "epoch": 3.57, + "grad_norm": 5.992528915405273, + "learning_rate": 1.8813221250118673e-06, + "logits/chosen": -0.5463312864303589, + "logits/rejected": -0.6093201637268066, + "logps/chosen": -47.736385345458984, + "logps/rejected": -105.13517761230469, + "loss": 0.5859, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.892758369445801, + "rewards/margins": 6.869051456451416, + "rewards/rejected": -3.976292848587036, + "step": 14277 + }, + { + "epoch": 3.57, + "grad_norm": 8.563210487365723, + "learning_rate": 1.880707807755816e-06, + "logits/chosen": -0.5987083911895752, + "logits/rejected": -0.6747986674308777, + "logps/chosen": -66.46110534667969, + "logps/rejected": -123.71896362304688, + "loss": 0.6633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.609283208847046, + "rewards/margins": 6.190345764160156, + "rewards/rejected": -3.581061840057373, + "step": 14278 + }, + { + "epoch": 3.57, + "grad_norm": 4.975219249725342, + "learning_rate": 1.8800935675807902e-06, + "logits/chosen": -0.5775662064552307, + "logits/rejected": -0.626958966255188, + "logps/chosen": -57.85680389404297, + "logps/rejected": -112.59562683105469, + "loss": 0.6776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.338602066040039, + "rewards/margins": 7.350945472717285, + "rewards/rejected": -4.012343406677246, + "step": 14279 + }, + { + "epoch": 3.57, + "grad_norm": 5.118434906005859, + "learning_rate": 1.8794794045019727e-06, + "logits/chosen": -0.48465991020202637, + "logits/rejected": -0.5837985277175903, + "logps/chosen": -55.836612701416016, + "logps/rejected": -113.49365234375, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3341054916381836, + "rewards/margins": 7.693564414978027, + "rewards/rejected": -4.35945987701416, + "step": 14280 + }, + { + "epoch": 3.57, + "grad_norm": 5.657993316650391, + "learning_rate": 1.8788653185345385e-06, + "logits/chosen": -0.5607246160507202, + "logits/rejected": -0.6510041356086731, + "logps/chosen": -59.4537353515625, + "logps/rejected": -113.12329864501953, + "loss": 0.7268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7030410766601562, + "rewards/margins": 7.727536201477051, + "rewards/rejected": -5.024494171142578, + "step": 14281 + }, + { + "epoch": 3.57, + "grad_norm": 5.657342910766602, + "learning_rate": 1.87825130969366e-06, + "logits/chosen": -0.5606514811515808, + "logits/rejected": -0.6658697128295898, + "logps/chosen": -55.73991012573242, + "logps/rejected": -97.8021011352539, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.937452793121338, + "rewards/margins": 7.419192790985107, + "rewards/rejected": -4.4817399978637695, + "step": 14282 + }, + { + "epoch": 3.57, + "grad_norm": 3.15073823928833, + "learning_rate": 1.8776373779945139e-06, + "logits/chosen": -0.5601103901863098, + "logits/rejected": -0.6215217113494873, + "logps/chosen": -62.193939208984375, + "logps/rejected": -114.44578552246094, + "loss": 0.5856, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7477431297302246, + "rewards/margins": 6.934742450714111, + "rewards/rejected": -4.1869988441467285, + "step": 14283 + }, + { + "epoch": 3.57, + "grad_norm": 2.972168207168579, + "learning_rate": 1.8770235234522693e-06, + "logits/chosen": -0.5881266593933105, + "logits/rejected": -0.638939380645752, + "logps/chosen": -53.137229919433594, + "logps/rejected": -99.94469451904297, + "loss": 0.6445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3948044776916504, + "rewards/margins": 7.113210678100586, + "rewards/rejected": -3.7184062004089355, + "step": 14284 + }, + { + "epoch": 3.57, + "grad_norm": 16.37274932861328, + "learning_rate": 1.876409746082094e-06, + "logits/chosen": -0.5558160543441772, + "logits/rejected": -0.6273247003555298, + "logps/chosen": -53.67017364501953, + "logps/rejected": -101.37278747558594, + "loss": 0.6417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7392683029174805, + "rewards/margins": 6.310962677001953, + "rewards/rejected": -3.5716946125030518, + "step": 14285 + }, + { + "epoch": 3.57, + "grad_norm": 4.366453170776367, + "learning_rate": 1.8757960458991554e-06, + "logits/chosen": -0.6076524257659912, + "logits/rejected": -0.636928141117096, + "logps/chosen": -46.153560638427734, + "logps/rejected": -96.70500183105469, + "loss": 0.5819, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9390406608581543, + "rewards/margins": 5.742361545562744, + "rewards/rejected": -2.80332088470459, + "step": 14286 + }, + { + "epoch": 3.57, + "grad_norm": 3.5482096672058105, + "learning_rate": 1.8751824229186234e-06, + "logits/chosen": -0.4913520812988281, + "logits/rejected": -0.6046845316886902, + "logps/chosen": -64.17056274414062, + "logps/rejected": -87.8391342163086, + "loss": 0.6081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2340822219848633, + "rewards/margins": 6.528931140899658, + "rewards/rejected": -3.294849157333374, + "step": 14287 + }, + { + "epoch": 3.57, + "grad_norm": 2.865790843963623, + "learning_rate": 1.8745688771556543e-06, + "logits/chosen": -0.5020354986190796, + "logits/rejected": -0.5932093262672424, + "logps/chosen": -57.60939407348633, + "logps/rejected": -104.63838958740234, + "loss": 0.6103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3125264644622803, + "rewards/margins": 7.647448539733887, + "rewards/rejected": -4.334922790527344, + "step": 14288 + }, + { + "epoch": 3.57, + "grad_norm": 13.101397514343262, + "learning_rate": 1.8739554086254119e-06, + "logits/chosen": -0.6005210876464844, + "logits/rejected": -0.650915801525116, + "logps/chosen": -54.6217041015625, + "logps/rejected": -105.23051452636719, + "loss": 0.8129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.834761142730713, + "rewards/margins": 6.316649436950684, + "rewards/rejected": -3.4818882942199707, + "step": 14289 + }, + { + "epoch": 3.57, + "grad_norm": 5.028226852416992, + "learning_rate": 1.873342017343059e-06, + "logits/chosen": -0.47768276929855347, + "logits/rejected": -0.5300943851470947, + "logps/chosen": -49.87975311279297, + "logps/rejected": -101.22544860839844, + "loss": 0.6443, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.279068946838379, + "rewards/margins": 6.3615241050720215, + "rewards/rejected": -3.0824551582336426, + "step": 14290 + }, + { + "epoch": 3.58, + "grad_norm": 8.403144836425781, + "learning_rate": 1.8727287033237495e-06, + "logits/chosen": -0.5172473192214966, + "logits/rejected": -0.5805346369743347, + "logps/chosen": -51.950355529785156, + "logps/rejected": -121.76884460449219, + "loss": 0.5548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.091888189315796, + "rewards/margins": 7.845152378082275, + "rewards/rejected": -4.7532639503479, + "step": 14291 + }, + { + "epoch": 3.58, + "grad_norm": 7.001908302307129, + "learning_rate": 1.872115466582639e-06, + "logits/chosen": -0.5527341365814209, + "logits/rejected": -0.6164615154266357, + "logps/chosen": -51.620487213134766, + "logps/rejected": -111.98389434814453, + "loss": 0.6254, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.116933584213257, + "rewards/margins": 7.114555835723877, + "rewards/rejected": -3.997622013092041, + "step": 14292 + }, + { + "epoch": 3.58, + "grad_norm": 6.257872104644775, + "learning_rate": 1.8715023071348836e-06, + "logits/chosen": -0.5157870054244995, + "logits/rejected": -0.6144692301750183, + "logps/chosen": -53.31536865234375, + "logps/rejected": -86.15703582763672, + "loss": 0.6447, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2666540145874023, + "rewards/margins": 6.749004364013672, + "rewards/rejected": -3.4823508262634277, + "step": 14293 + }, + { + "epoch": 3.58, + "grad_norm": 3.9776320457458496, + "learning_rate": 1.8708892249956322e-06, + "logits/chosen": -0.4542674124240875, + "logits/rejected": -0.5686997175216675, + "logps/chosen": -64.50060272216797, + "logps/rejected": -104.09542846679688, + "loss": 0.5882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.746786117553711, + "rewards/margins": 7.090200424194336, + "rewards/rejected": -4.343414306640625, + "step": 14294 + }, + { + "epoch": 3.58, + "grad_norm": 2.7429423332214355, + "learning_rate": 1.8702762201800384e-06, + "logits/chosen": -0.46534398198127747, + "logits/rejected": -0.5646623969078064, + "logps/chosen": -46.325653076171875, + "logps/rejected": -91.42080688476562, + "loss": 0.5654, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.296144723892212, + "rewards/margins": 6.896960258483887, + "rewards/rejected": -3.600815534591675, + "step": 14295 + }, + { + "epoch": 3.58, + "grad_norm": 8.18249225616455, + "learning_rate": 1.8696632927032477e-06, + "logits/chosen": -0.5689986944198608, + "logits/rejected": -0.6245432496070862, + "logps/chosen": -49.127986907958984, + "logps/rejected": -107.74424743652344, + "loss": 0.5769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.948648452758789, + "rewards/margins": 7.1061015129089355, + "rewards/rejected": -4.157452583312988, + "step": 14296 + }, + { + "epoch": 3.58, + "grad_norm": 5.798272132873535, + "learning_rate": 1.8690504425804052e-06, + "logits/chosen": -0.5366656184196472, + "logits/rejected": -0.639854907989502, + "logps/chosen": -66.88511657714844, + "logps/rejected": -86.8987808227539, + "loss": 0.7143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7205543518066406, + "rewards/margins": 6.166467666625977, + "rewards/rejected": -3.445913791656494, + "step": 14297 + }, + { + "epoch": 3.58, + "grad_norm": 3.2964367866516113, + "learning_rate": 1.8684376698266581e-06, + "logits/chosen": -0.6214410662651062, + "logits/rejected": -0.7556654810905457, + "logps/chosen": -52.43792724609375, + "logps/rejected": -97.22608947753906, + "loss": 0.604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.15421199798584, + "rewards/margins": 7.905555725097656, + "rewards/rejected": -4.751343727111816, + "step": 14298 + }, + { + "epoch": 3.58, + "grad_norm": 4.3522047996521, + "learning_rate": 1.867824974457147e-06, + "logits/chosen": -0.5186755061149597, + "logits/rejected": -0.6134552955627441, + "logps/chosen": -58.912574768066406, + "logps/rejected": -99.3661117553711, + "loss": 0.628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1644678115844727, + "rewards/margins": 7.491217136383057, + "rewards/rejected": -4.326749801635742, + "step": 14299 + }, + { + "epoch": 3.58, + "grad_norm": 6.873406410217285, + "learning_rate": 1.867212356487011e-06, + "logits/chosen": -0.5668154954910278, + "logits/rejected": -0.6374629735946655, + "logps/chosen": -48.26270294189453, + "logps/rejected": -113.42889404296875, + "loss": 0.6924, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.179873466491699, + "rewards/margins": 7.235471248626709, + "rewards/rejected": -4.05559778213501, + "step": 14300 + }, + { + "epoch": 3.58, + "grad_norm": 2.292264461517334, + "learning_rate": 1.866599815931389e-06, + "logits/chosen": -0.4913885295391083, + "logits/rejected": -0.5705562829971313, + "logps/chosen": -72.82514953613281, + "logps/rejected": -98.84207153320312, + "loss": 0.597, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205754041671753, + "rewards/margins": 7.602565765380859, + "rewards/rejected": -4.396811485290527, + "step": 14301 + }, + { + "epoch": 3.58, + "grad_norm": 7.956015110015869, + "learning_rate": 1.8659873528054223e-06, + "logits/chosen": -0.5851541757583618, + "logits/rejected": -0.6501208543777466, + "logps/chosen": -72.75984191894531, + "logps/rejected": -122.23973846435547, + "loss": 0.7209, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9606785774230957, + "rewards/margins": 6.817205429077148, + "rewards/rejected": -3.8565263748168945, + "step": 14302 + }, + { + "epoch": 3.58, + "grad_norm": 3.804433584213257, + "learning_rate": 1.8653749671242382e-06, + "logits/chosen": -0.5401594638824463, + "logits/rejected": -0.6039941310882568, + "logps/chosen": -58.3620491027832, + "logps/rejected": -117.3234634399414, + "loss": 0.6043, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3904690742492676, + "rewards/margins": 7.444984436035156, + "rewards/rejected": -4.054515361785889, + "step": 14303 + }, + { + "epoch": 3.58, + "grad_norm": 5.099139213562012, + "learning_rate": 1.8647626589029727e-06, + "logits/chosen": -0.5865957736968994, + "logits/rejected": -0.6789852976799011, + "logps/chosen": -50.94660949707031, + "logps/rejected": -101.27509307861328, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.106121778488159, + "rewards/margins": 7.514564514160156, + "rewards/rejected": -4.408442497253418, + "step": 14304 + }, + { + "epoch": 3.58, + "grad_norm": 2.5302791595458984, + "learning_rate": 1.8641504281567602e-06, + "logits/chosen": -0.5223707556724548, + "logits/rejected": -0.59700608253479, + "logps/chosen": -60.98680114746094, + "logps/rejected": -108.7135238647461, + "loss": 0.6206, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.667975425720215, + "rewards/margins": 7.15371036529541, + "rewards/rejected": -4.485734462738037, + "step": 14305 + }, + { + "epoch": 3.58, + "grad_norm": 4.778509140014648, + "learning_rate": 1.8635382749007225e-06, + "logits/chosen": -0.418043851852417, + "logits/rejected": -0.5269472002983093, + "logps/chosen": -59.726173400878906, + "logps/rejected": -113.92207336425781, + "loss": 0.5546, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9565091133117676, + "rewards/margins": 7.420562744140625, + "rewards/rejected": -4.464053630828857, + "step": 14306 + }, + { + "epoch": 3.58, + "grad_norm": 1.9348498582839966, + "learning_rate": 1.8629261991499903e-06, + "logits/chosen": -0.5225995779037476, + "logits/rejected": -0.5715861916542053, + "logps/chosen": -51.912872314453125, + "logps/rejected": -131.89178466796875, + "loss": 0.6027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.205559730529785, + "rewards/margins": 8.786438941955566, + "rewards/rejected": -5.5808796882629395, + "step": 14307 + }, + { + "epoch": 3.58, + "grad_norm": 9.809449195861816, + "learning_rate": 1.8623142009196904e-06, + "logits/chosen": -0.55384761095047, + "logits/rejected": -0.6215656995773315, + "logps/chosen": -51.95240020751953, + "logps/rejected": -105.59734344482422, + "loss": 0.6457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.114858865737915, + "rewards/margins": 6.674964904785156, + "rewards/rejected": -3.560105562210083, + "step": 14308 + }, + { + "epoch": 3.58, + "grad_norm": 5.205246448516846, + "learning_rate": 1.8617022802249435e-06, + "logits/chosen": -0.5152179002761841, + "logits/rejected": -0.5976734161376953, + "logps/chosen": -53.67399597167969, + "logps/rejected": -95.41456604003906, + "loss": 0.5645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.959683656692505, + "rewards/margins": 6.656283855438232, + "rewards/rejected": -3.6966004371643066, + "step": 14309 + }, + { + "epoch": 3.58, + "grad_norm": 4.148284912109375, + "learning_rate": 1.8610904370808696e-06, + "logits/chosen": -0.4909856617450714, + "logits/rejected": -0.6188804507255554, + "logps/chosen": -62.41750717163086, + "logps/rejected": -107.5652847290039, + "loss": 0.6642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0073206424713135, + "rewards/margins": 8.455214500427246, + "rewards/rejected": -5.447894096374512, + "step": 14310 + }, + { + "epoch": 3.58, + "grad_norm": 5.03286600112915, + "learning_rate": 1.860478671502592e-06, + "logits/chosen": -0.5967453122138977, + "logits/rejected": -0.63401198387146, + "logps/chosen": -49.989952087402344, + "logps/rejected": -113.52792358398438, + "loss": 0.689, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.023951292037964, + "rewards/margins": 7.255739212036133, + "rewards/rejected": -4.231788158416748, + "step": 14311 + }, + { + "epoch": 3.58, + "grad_norm": 14.740716934204102, + "learning_rate": 1.8598669835052252e-06, + "logits/chosen": -0.5497134923934937, + "logits/rejected": -0.6236213445663452, + "logps/chosen": -59.49494552612305, + "logps/rejected": -93.89221954345703, + "loss": 0.5807, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8415732383728027, + "rewards/margins": 7.095177173614502, + "rewards/rejected": -4.253603935241699, + "step": 14312 + }, + { + "epoch": 3.58, + "grad_norm": 4.1105804443359375, + "learning_rate": 1.8592553731038837e-06, + "logits/chosen": -0.5552107095718384, + "logits/rejected": -0.6179999709129333, + "logps/chosen": -50.63572692871094, + "logps/rejected": -104.88511657714844, + "loss": 0.5874, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.06638240814209, + "rewards/margins": 6.967308044433594, + "rewards/rejected": -3.900926113128662, + "step": 14313 + }, + { + "epoch": 3.58, + "grad_norm": 7.352555274963379, + "learning_rate": 1.8586438403136841e-06, + "logits/chosen": -0.5026500225067139, + "logits/rejected": -0.6403685808181763, + "logps/chosen": -64.51481628417969, + "logps/rejected": -91.24626159667969, + "loss": 0.654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7144742012023926, + "rewards/margins": 6.078578948974609, + "rewards/rejected": -3.3641042709350586, + "step": 14314 + }, + { + "epoch": 3.58, + "grad_norm": 3.7905056476593018, + "learning_rate": 1.858032385149735e-06, + "logits/chosen": -0.5200539231300354, + "logits/rejected": -0.5948528051376343, + "logps/chosen": -71.04658508300781, + "logps/rejected": -118.06268310546875, + "loss": 0.7216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.768296003341675, + "rewards/margins": 7.132499694824219, + "rewards/rejected": -4.364203453063965, + "step": 14315 + }, + { + "epoch": 3.58, + "grad_norm": 4.057908058166504, + "learning_rate": 1.8574210076271492e-06, + "logits/chosen": -0.5112386345863342, + "logits/rejected": -0.6121960878372192, + "logps/chosen": -48.876548767089844, + "logps/rejected": -111.25025939941406, + "loss": 0.6083, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.955016851425171, + "rewards/margins": 8.163156509399414, + "rewards/rejected": -5.208139419555664, + "step": 14316 + }, + { + "epoch": 3.58, + "grad_norm": 5.368089199066162, + "learning_rate": 1.8568097077610332e-06, + "logits/chosen": -0.45960474014282227, + "logits/rejected": -0.5633403658866882, + "logps/chosen": -53.3780517578125, + "logps/rejected": -99.77315521240234, + "loss": 0.5937, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.106977939605713, + "rewards/margins": 7.7606611251831055, + "rewards/rejected": -4.653683662414551, + "step": 14317 + }, + { + "epoch": 3.58, + "grad_norm": 2.5556395053863525, + "learning_rate": 1.8561984855664906e-06, + "logits/chosen": -0.5498723983764648, + "logits/rejected": -0.6368411779403687, + "logps/chosen": -59.089115142822266, + "logps/rejected": -99.46401977539062, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1032886505126953, + "rewards/margins": 7.477177143096924, + "rewards/rejected": -4.373888969421387, + "step": 14318 + }, + { + "epoch": 3.58, + "grad_norm": 4.929560661315918, + "learning_rate": 1.8555873410586272e-06, + "logits/chosen": -0.49017345905303955, + "logits/rejected": -0.5611249804496765, + "logps/chosen": -55.59807205200195, + "logps/rejected": -97.46295928955078, + "loss": 0.6817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139028549194336, + "rewards/margins": 6.362844467163086, + "rewards/rejected": -3.22381591796875, + "step": 14319 + }, + { + "epoch": 3.58, + "grad_norm": 20.738372802734375, + "learning_rate": 1.8549762742525468e-06, + "logits/chosen": -0.5773279070854187, + "logits/rejected": -0.6475653648376465, + "logps/chosen": -46.00544738769531, + "logps/rejected": -100.37429809570312, + "loss": 0.793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9834370613098145, + "rewards/margins": 7.090540885925293, + "rewards/rejected": -4.107104301452637, + "step": 14320 + }, + { + "epoch": 3.58, + "grad_norm": 5.113037109375, + "learning_rate": 1.8543652851633481e-06, + "logits/chosen": -0.5343102216720581, + "logits/rejected": -0.6089144945144653, + "logps/chosen": -67.96910858154297, + "logps/rejected": -125.63078308105469, + "loss": 0.6269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4242489337921143, + "rewards/margins": 7.230955123901367, + "rewards/rejected": -3.8067071437835693, + "step": 14321 + }, + { + "epoch": 3.58, + "grad_norm": 24.741823196411133, + "learning_rate": 1.853754373806127e-06, + "logits/chosen": -0.4873523414134979, + "logits/rejected": -0.5963775515556335, + "logps/chosen": -57.8634033203125, + "logps/rejected": -123.67210388183594, + "loss": 0.672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0556800365448, + "rewards/margins": 8.219292640686035, + "rewards/rejected": -5.163611888885498, + "step": 14322 + }, + { + "epoch": 3.58, + "grad_norm": 4.012408256530762, + "learning_rate": 1.853143540195984e-06, + "logits/chosen": -0.5294570326805115, + "logits/rejected": -0.5958282351493835, + "logps/chosen": -49.37425994873047, + "logps/rejected": -121.36670684814453, + "loss": 0.6121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1267459392547607, + "rewards/margins": 7.7638630867004395, + "rewards/rejected": -4.6371169090271, + "step": 14323 + }, + { + "epoch": 3.58, + "grad_norm": 3.227206230163574, + "learning_rate": 1.8525327843480113e-06, + "logits/chosen": -0.43766602873802185, + "logits/rejected": -0.5934464931488037, + "logps/chosen": -70.04442596435547, + "logps/rejected": -85.9222412109375, + "loss": 0.617, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9014036655426025, + "rewards/margins": 7.343425750732422, + "rewards/rejected": -4.44202184677124, + "step": 14324 + }, + { + "epoch": 3.58, + "grad_norm": 3.689405918121338, + "learning_rate": 1.851922106277299e-06, + "logits/chosen": -0.5984214544296265, + "logits/rejected": -0.7070468664169312, + "logps/chosen": -52.711639404296875, + "logps/rejected": -96.85322570800781, + "loss": 0.6375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0282206535339355, + "rewards/margins": 6.904670238494873, + "rewards/rejected": -3.8764491081237793, + "step": 14325 + }, + { + "epoch": 3.58, + "grad_norm": 5.449813365936279, + "learning_rate": 1.8513115059989423e-06, + "logits/chosen": -0.610611081123352, + "logits/rejected": -0.6752774715423584, + "logps/chosen": -70.95508575439453, + "logps/rejected": -108.21941375732422, + "loss": 0.6736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.715456008911133, + "rewards/margins": 6.127686023712158, + "rewards/rejected": -3.4122297763824463, + "step": 14326 + }, + { + "epoch": 3.58, + "grad_norm": 3.706042766571045, + "learning_rate": 1.850700983528027e-06, + "logits/chosen": -0.45393064618110657, + "logits/rejected": -0.5108029246330261, + "logps/chosen": -62.492584228515625, + "logps/rejected": -102.93791198730469, + "loss": 0.65, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.764503002166748, + "rewards/margins": 5.654384136199951, + "rewards/rejected": -2.8898813724517822, + "step": 14327 + }, + { + "epoch": 3.58, + "grad_norm": 12.53406047821045, + "learning_rate": 1.8500905388796387e-06, + "logits/chosen": -0.5745388865470886, + "logits/rejected": -0.5779219269752502, + "logps/chosen": -61.24274444580078, + "logps/rejected": -118.5848388671875, + "loss": 0.8965, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1045475006103516, + "rewards/margins": 5.913254737854004, + "rewards/rejected": -2.808706760406494, + "step": 14328 + }, + { + "epoch": 3.58, + "grad_norm": 5.518847942352295, + "learning_rate": 1.8494801720688633e-06, + "logits/chosen": -0.6153616905212402, + "logits/rejected": -0.694487452507019, + "logps/chosen": -49.24570083618164, + "logps/rejected": -92.38230895996094, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2488956451416016, + "rewards/margins": 6.121257305145264, + "rewards/rejected": -2.872361898422241, + "step": 14329 + }, + { + "epoch": 3.58, + "grad_norm": 5.1636881828308105, + "learning_rate": 1.8488698831107872e-06, + "logits/chosen": -0.5521885752677917, + "logits/rejected": -0.5906989574432373, + "logps/chosen": -71.86067199707031, + "logps/rejected": -119.03816986083984, + "loss": 0.7913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.101764678955078, + "rewards/margins": 6.776313781738281, + "rewards/rejected": -3.674548864364624, + "step": 14330 + }, + { + "epoch": 3.59, + "grad_norm": 7.913344383239746, + "learning_rate": 1.8482596720204848e-06, + "logits/chosen": -0.5925049781799316, + "logits/rejected": -0.7151681780815125, + "logps/chosen": -55.309295654296875, + "logps/rejected": -107.30436706542969, + "loss": 0.6339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.160172939300537, + "rewards/margins": 7.176476955413818, + "rewards/rejected": -4.016304016113281, + "step": 14331 + }, + { + "epoch": 3.59, + "grad_norm": 6.336400985717773, + "learning_rate": 1.8476495388130377e-06, + "logits/chosen": -0.43772152066230774, + "logits/rejected": -0.5222580432891846, + "logps/chosen": -59.8897705078125, + "logps/rejected": -105.26945495605469, + "loss": 0.6659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9178528785705566, + "rewards/margins": 5.713819980621338, + "rewards/rejected": -2.795966863632202, + "step": 14332 + }, + { + "epoch": 3.59, + "grad_norm": 23.347614288330078, + "learning_rate": 1.847039483503526e-06, + "logits/chosen": -0.5326455235481262, + "logits/rejected": -0.6005310416221619, + "logps/chosen": -54.27326583862305, + "logps/rejected": -98.47084045410156, + "loss": 0.7307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1197662353515625, + "rewards/margins": 6.025153160095215, + "rewards/rejected": -2.905386447906494, + "step": 14333 + }, + { + "epoch": 3.59, + "grad_norm": 3.8652164936065674, + "learning_rate": 1.846429506107022e-06, + "logits/chosen": -0.4993053078651428, + "logits/rejected": -0.576998233795166, + "logps/chosen": -71.62468719482422, + "logps/rejected": -95.27979278564453, + "loss": 0.6275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1624915599823, + "rewards/margins": 6.432463645935059, + "rewards/rejected": -3.2699718475341797, + "step": 14334 + }, + { + "epoch": 3.59, + "grad_norm": 6.021697998046875, + "learning_rate": 1.8458196066385976e-06, + "logits/chosen": -0.4784989356994629, + "logits/rejected": -0.5662010312080383, + "logps/chosen": -64.37960815429688, + "logps/rejected": -113.33834838867188, + "loss": 0.6986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1672816276550293, + "rewards/margins": 6.869497299194336, + "rewards/rejected": -3.7022156715393066, + "step": 14335 + }, + { + "epoch": 3.59, + "grad_norm": 4.786848545074463, + "learning_rate": 1.8452097851133272e-06, + "logits/chosen": -0.527452290058136, + "logits/rejected": -0.6463825106620789, + "logps/chosen": -51.08596420288086, + "logps/rejected": -107.22014617919922, + "loss": 0.5391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2390880584716797, + "rewards/margins": 8.053890228271484, + "rewards/rejected": -4.814801216125488, + "step": 14336 + }, + { + "epoch": 3.59, + "grad_norm": 4.678408145904541, + "learning_rate": 1.8446000415462784e-06, + "logits/chosen": -0.6325597763061523, + "logits/rejected": -0.7246724367141724, + "logps/chosen": -46.190731048583984, + "logps/rejected": -93.84953308105469, + "loss": 0.5442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2593019008636475, + "rewards/margins": 7.3269124031066895, + "rewards/rejected": -4.067611217498779, + "step": 14337 + }, + { + "epoch": 3.59, + "grad_norm": 7.844038009643555, + "learning_rate": 1.8439903759525173e-06, + "logits/chosen": -0.5350847244262695, + "logits/rejected": -0.619421124458313, + "logps/chosen": -66.18986511230469, + "logps/rejected": -104.048095703125, + "loss": 0.7573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.129855155944824, + "rewards/margins": 6.245671272277832, + "rewards/rejected": -3.1158154010772705, + "step": 14338 + }, + { + "epoch": 3.59, + "grad_norm": 9.754046440124512, + "learning_rate": 1.8433807883471128e-06, + "logits/chosen": -0.6229175925254822, + "logits/rejected": -0.7403779625892639, + "logps/chosen": -50.43756866455078, + "logps/rejected": -89.71923065185547, + "loss": 0.6014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.212602376937866, + "rewards/margins": 8.018545150756836, + "rewards/rejected": -4.805943489074707, + "step": 14339 + }, + { + "epoch": 3.59, + "grad_norm": 3.1729841232299805, + "learning_rate": 1.8427712787451247e-06, + "logits/chosen": -0.5160424709320068, + "logits/rejected": -0.6306890845298767, + "logps/chosen": -60.12039566040039, + "logps/rejected": -118.42241668701172, + "loss": 0.5594, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023272752761841, + "rewards/margins": 8.79168701171875, + "rewards/rejected": -5.76841402053833, + "step": 14340 + }, + { + "epoch": 3.59, + "grad_norm": 4.560169696807861, + "learning_rate": 1.8421618471616182e-06, + "logits/chosen": -0.5799959301948547, + "logits/rejected": -0.6982330083847046, + "logps/chosen": -53.652259826660156, + "logps/rejected": -82.47380828857422, + "loss": 0.6773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.201528549194336, + "rewards/margins": 6.53851318359375, + "rewards/rejected": -3.336984634399414, + "step": 14341 + }, + { + "epoch": 3.59, + "grad_norm": 13.794451713562012, + "learning_rate": 1.8415524936116508e-06, + "logits/chosen": -0.4960338771343231, + "logits/rejected": -0.5936028361320496, + "logps/chosen": -62.99278259277344, + "logps/rejected": -96.84939575195312, + "loss": 0.6603, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103471279144287, + "rewards/margins": 6.284692287445068, + "rewards/rejected": -3.1812210083007812, + "step": 14342 + }, + { + "epoch": 3.59, + "grad_norm": 2.485825300216675, + "learning_rate": 1.84094321811028e-06, + "logits/chosen": -0.5674237012863159, + "logits/rejected": -0.6881129145622253, + "logps/chosen": -57.75431823730469, + "logps/rejected": -102.51095581054688, + "loss": 0.6069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1595401763916016, + "rewards/margins": 8.454093933105469, + "rewards/rejected": -5.294553756713867, + "step": 14343 + }, + { + "epoch": 3.59, + "grad_norm": 4.4716315269470215, + "learning_rate": 1.8403340206725617e-06, + "logits/chosen": -0.5440660715103149, + "logits/rejected": -0.622812032699585, + "logps/chosen": -50.06338882446289, + "logps/rejected": -113.7718734741211, + "loss": 0.6312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3736796379089355, + "rewards/margins": 6.996234893798828, + "rewards/rejected": -3.6225547790527344, + "step": 14344 + }, + { + "epoch": 3.59, + "grad_norm": 10.938799858093262, + "learning_rate": 1.8397249013135538e-06, + "logits/chosen": -0.555648148059845, + "logits/rejected": -0.6171126961708069, + "logps/chosen": -52.41973114013672, + "logps/rejected": -100.18914031982422, + "loss": 0.58, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9902114868164062, + "rewards/margins": 6.016096115112305, + "rewards/rejected": -3.0258846282958984, + "step": 14345 + }, + { + "epoch": 3.59, + "grad_norm": 2.1671979427337646, + "learning_rate": 1.8391158600483023e-06, + "logits/chosen": -0.527923583984375, + "logits/rejected": -0.6395626664161682, + "logps/chosen": -55.21821975708008, + "logps/rejected": -98.05915832519531, + "loss": 0.5683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2712574005126953, + "rewards/margins": 7.0336785316467285, + "rewards/rejected": -3.7624213695526123, + "step": 14346 + }, + { + "epoch": 3.59, + "grad_norm": 4.191512584686279, + "learning_rate": 1.8385068968918595e-06, + "logits/chosen": -0.5126698613166809, + "logits/rejected": -0.6248579621315002, + "logps/chosen": -51.41474151611328, + "logps/rejected": -94.57943725585938, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2459774017333984, + "rewards/margins": 6.778088092803955, + "rewards/rejected": -3.5321099758148193, + "step": 14347 + }, + { + "epoch": 3.59, + "grad_norm": 4.812045097351074, + "learning_rate": 1.8378980118592771e-06, + "logits/chosen": -0.4611192047595978, + "logits/rejected": -0.5110453367233276, + "logps/chosen": -56.290428161621094, + "logps/rejected": -108.41969299316406, + "loss": 0.7289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.898167133331299, + "rewards/margins": 6.4410786628723145, + "rewards/rejected": -3.5429115295410156, + "step": 14348 + }, + { + "epoch": 3.59, + "grad_norm": 4.263222694396973, + "learning_rate": 1.8372892049655955e-06, + "logits/chosen": -0.5693787932395935, + "logits/rejected": -0.673658549785614, + "logps/chosen": -55.16282272338867, + "logps/rejected": -88.20587921142578, + "loss": 0.6097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3593618869781494, + "rewards/margins": 6.83472204208374, + "rewards/rejected": -3.475360155105591, + "step": 14349 + }, + { + "epoch": 3.59, + "grad_norm": 8.604907035827637, + "learning_rate": 1.8366804762258612e-06, + "logits/chosen": -0.6305484771728516, + "logits/rejected": -0.719171941280365, + "logps/chosen": -55.95865249633789, + "logps/rejected": -96.04175567626953, + "loss": 0.6653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8544273376464844, + "rewards/margins": 5.789556980133057, + "rewards/rejected": -2.935129404067993, + "step": 14350 + }, + { + "epoch": 3.59, + "grad_norm": 4.721921920776367, + "learning_rate": 1.8360718256551186e-06, + "logits/chosen": -0.455447793006897, + "logits/rejected": -0.5290425419807434, + "logps/chosen": -58.91941452026367, + "logps/rejected": -114.12705993652344, + "loss": 0.7508, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.196842670440674, + "rewards/margins": 7.41154146194458, + "rewards/rejected": -4.214698791503906, + "step": 14351 + }, + { + "epoch": 3.59, + "grad_norm": 5.679161071777344, + "learning_rate": 1.835463253268407e-06, + "logits/chosen": -0.533340573310852, + "logits/rejected": -0.5857448577880859, + "logps/chosen": -50.61925506591797, + "logps/rejected": -106.19062042236328, + "loss": 0.6058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205594539642334, + "rewards/margins": 6.8120951652526855, + "rewards/rejected": -3.6065008640289307, + "step": 14352 + }, + { + "epoch": 3.59, + "grad_norm": 7.975546836853027, + "learning_rate": 1.834854759080762e-06, + "logits/chosen": -0.5585368275642395, + "logits/rejected": -0.6445029377937317, + "logps/chosen": -53.55080795288086, + "logps/rejected": -102.91960144042969, + "loss": 0.7145, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.032597064971924, + "rewards/margins": 6.715445518493652, + "rewards/rejected": -3.6828479766845703, + "step": 14353 + }, + { + "epoch": 3.59, + "grad_norm": 4.464412212371826, + "learning_rate": 1.8342463431072249e-06, + "logits/chosen": -0.4691787362098694, + "logits/rejected": -0.588550865650177, + "logps/chosen": -61.32426452636719, + "logps/rejected": -98.34747314453125, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.928414821624756, + "rewards/margins": 6.479555606842041, + "rewards/rejected": -3.551140785217285, + "step": 14354 + }, + { + "epoch": 3.59, + "grad_norm": 4.883896827697754, + "learning_rate": 1.8336380053628272e-06, + "logits/chosen": -0.5139630436897278, + "logits/rejected": -0.6026365160942078, + "logps/chosen": -45.520606994628906, + "logps/rejected": -90.57894897460938, + "loss": 0.6168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0678770542144775, + "rewards/margins": 7.247043132781982, + "rewards/rejected": -4.179166316986084, + "step": 14355 + }, + { + "epoch": 3.59, + "grad_norm": 7.0485453605651855, + "learning_rate": 1.8330297458626013e-06, + "logits/chosen": -0.508027195930481, + "logits/rejected": -0.5366805791854858, + "logps/chosen": -43.376522064208984, + "logps/rejected": -121.32054138183594, + "loss": 0.6582, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9644269943237305, + "rewards/margins": 7.093408584594727, + "rewards/rejected": -4.128981590270996, + "step": 14356 + }, + { + "epoch": 3.59, + "grad_norm": 5.630993843078613, + "learning_rate": 1.8324215646215798e-06, + "logits/chosen": -0.5186638236045837, + "logits/rejected": -0.5473374724388123, + "logps/chosen": -50.738441467285156, + "logps/rejected": -119.58988952636719, + "loss": 0.51, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.061397075653076, + "rewards/margins": 7.039222717285156, + "rewards/rejected": -3.97782564163208, + "step": 14357 + }, + { + "epoch": 3.59, + "grad_norm": 5.453470230102539, + "learning_rate": 1.8318134616547895e-06, + "logits/chosen": -0.4512288570404053, + "logits/rejected": -0.5349629521369934, + "logps/chosen": -54.76470184326172, + "logps/rejected": -97.38423156738281, + "loss": 0.6847, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1772162914276123, + "rewards/margins": 5.783406734466553, + "rewards/rejected": -2.6061904430389404, + "step": 14358 + }, + { + "epoch": 3.59, + "grad_norm": 5.395249843597412, + "learning_rate": 1.8312054369772602e-06, + "logits/chosen": -0.5825523138046265, + "logits/rejected": -0.605613112449646, + "logps/chosen": -49.48947525024414, + "logps/rejected": -106.0382080078125, + "loss": 0.6275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1968343257904053, + "rewards/margins": 7.148663520812988, + "rewards/rejected": -3.951828718185425, + "step": 14359 + }, + { + "epoch": 3.59, + "grad_norm": 3.2693698406219482, + "learning_rate": 1.8305974906040146e-06, + "logits/chosen": -0.45569032430648804, + "logits/rejected": -0.5370912551879883, + "logps/chosen": -50.328819274902344, + "logps/rejected": -109.60319519042969, + "loss": 0.5329, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.000253677368164, + "rewards/margins": 8.134660720825195, + "rewards/rejected": -5.134406566619873, + "step": 14360 + }, + { + "epoch": 3.59, + "grad_norm": 8.335968971252441, + "learning_rate": 1.8299896225500745e-06, + "logits/chosen": -0.505439281463623, + "logits/rejected": -0.5921339988708496, + "logps/chosen": -57.68225860595703, + "logps/rejected": -107.68377685546875, + "loss": 0.7478, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0490124225616455, + "rewards/margins": 6.446336269378662, + "rewards/rejected": -3.3973236083984375, + "step": 14361 + }, + { + "epoch": 3.59, + "grad_norm": 13.047250747680664, + "learning_rate": 1.8293818328304647e-06, + "logits/chosen": -0.6270092725753784, + "logits/rejected": -0.6296112537384033, + "logps/chosen": -41.698551177978516, + "logps/rejected": -106.64662170410156, + "loss": 0.5966, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.884023904800415, + "rewards/margins": 5.811207294464111, + "rewards/rejected": -2.9271836280822754, + "step": 14362 + }, + { + "epoch": 3.59, + "grad_norm": 2.2146410942077637, + "learning_rate": 1.8287741214602e-06, + "logits/chosen": -0.5051358938217163, + "logits/rejected": -0.603188157081604, + "logps/chosen": -52.01569366455078, + "logps/rejected": -100.20408630371094, + "loss": 0.528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3917808532714844, + "rewards/margins": 7.353909492492676, + "rewards/rejected": -3.9621284008026123, + "step": 14363 + }, + { + "epoch": 3.59, + "grad_norm": 4.473075866699219, + "learning_rate": 1.8281664884543022e-06, + "logits/chosen": -0.5774526000022888, + "logits/rejected": -0.6526826024055481, + "logps/chosen": -58.652652740478516, + "logps/rejected": -123.48558807373047, + "loss": 0.626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.615309715270996, + "rewards/margins": 6.765388488769531, + "rewards/rejected": -4.150079250335693, + "step": 14364 + }, + { + "epoch": 3.59, + "grad_norm": 4.621365547180176, + "learning_rate": 1.827558933827782e-06, + "logits/chosen": -0.5665570497512817, + "logits/rejected": -0.6583359241485596, + "logps/chosen": -51.183555603027344, + "logps/rejected": -105.49977111816406, + "loss": 0.5615, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2991209030151367, + "rewards/margins": 7.662840843200684, + "rewards/rejected": -4.363719940185547, + "step": 14365 + }, + { + "epoch": 3.59, + "grad_norm": 3.778153657913208, + "learning_rate": 1.8269514575956565e-06, + "logits/chosen": -0.6820197701454163, + "logits/rejected": -0.736023485660553, + "logps/chosen": -44.61125946044922, + "logps/rejected": -106.58280944824219, + "loss": 0.6294, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.125027656555176, + "rewards/margins": 7.712242603302002, + "rewards/rejected": -4.587214469909668, + "step": 14366 + }, + { + "epoch": 3.59, + "grad_norm": 3.6095051765441895, + "learning_rate": 1.826344059772936e-06, + "logits/chosen": -0.4709528386592865, + "logits/rejected": -0.5521642565727234, + "logps/chosen": -50.95838165283203, + "logps/rejected": -104.92680358886719, + "loss": 0.5612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1925442218780518, + "rewards/margins": 6.554910182952881, + "rewards/rejected": -3.362365961074829, + "step": 14367 + }, + { + "epoch": 3.59, + "grad_norm": 15.406064987182617, + "learning_rate": 1.8257367403746272e-06, + "logits/chosen": -0.5696250200271606, + "logits/rejected": -0.6461530327796936, + "logps/chosen": -54.38474655151367, + "logps/rejected": -102.73866271972656, + "loss": 0.6259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.078639030456543, + "rewards/margins": 6.990915298461914, + "rewards/rejected": -3.912275791168213, + "step": 14368 + }, + { + "epoch": 3.59, + "grad_norm": 5.898833751678467, + "learning_rate": 1.825129499415742e-06, + "logits/chosen": -0.4846043288707733, + "logits/rejected": -0.5562496781349182, + "logps/chosen": -45.58209228515625, + "logps/rejected": -99.288330078125, + "loss": 0.5601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.617253303527832, + "rewards/margins": 6.039047718048096, + "rewards/rejected": -2.4217946529388428, + "step": 14369 + }, + { + "epoch": 3.59, + "grad_norm": 4.273975849151611, + "learning_rate": 1.8245223369112836e-06, + "logits/chosen": -0.5353368520736694, + "logits/rejected": -0.6121875643730164, + "logps/chosen": -60.71678924560547, + "logps/rejected": -106.71629333496094, + "loss": 0.6679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.144540786743164, + "rewards/margins": 6.6619415283203125, + "rewards/rejected": -3.5174007415771484, + "step": 14370 + }, + { + "epoch": 3.6, + "grad_norm": 3.5629360675811768, + "learning_rate": 1.8239152528762538e-06, + "logits/chosen": -0.5538337826728821, + "logits/rejected": -0.6096091270446777, + "logps/chosen": -47.665794372558594, + "logps/rejected": -111.3790512084961, + "loss": 0.5481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.325242042541504, + "rewards/margins": 6.378584861755371, + "rewards/rejected": -3.0533432960510254, + "step": 14371 + }, + { + "epoch": 3.6, + "grad_norm": 3.5320727825164795, + "learning_rate": 1.8233082473256587e-06, + "logits/chosen": -0.5631122589111328, + "logits/rejected": -0.6276931166648865, + "logps/chosen": -44.5906867980957, + "logps/rejected": -121.27864837646484, + "loss": 0.5299, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.981248617172241, + "rewards/margins": 7.85149621963501, + "rewards/rejected": -4.8702473640441895, + "step": 14372 + }, + { + "epoch": 3.6, + "grad_norm": 7.508416652679443, + "learning_rate": 1.8227013202744953e-06, + "logits/chosen": -0.5012580156326294, + "logits/rejected": -0.5200271010398865, + "logps/chosen": -54.1812744140625, + "logps/rejected": -102.61864471435547, + "loss": 0.739, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2450366020202637, + "rewards/margins": 5.549045085906982, + "rewards/rejected": -2.3040080070495605, + "step": 14373 + }, + { + "epoch": 3.6, + "grad_norm": 4.466617107391357, + "learning_rate": 1.8220944717377598e-06, + "logits/chosen": -0.5486959218978882, + "logits/rejected": -0.62760329246521, + "logps/chosen": -46.8180046081543, + "logps/rejected": -112.0643539428711, + "loss": 0.6597, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.012660026550293, + "rewards/margins": 7.4263763427734375, + "rewards/rejected": -4.4137163162231445, + "step": 14374 + }, + { + "epoch": 3.6, + "grad_norm": 2.1306753158569336, + "learning_rate": 1.8214877017304505e-06, + "logits/chosen": -0.5204359889030457, + "logits/rejected": -0.6544538140296936, + "logps/chosen": -50.629920959472656, + "logps/rejected": -100.99961853027344, + "loss": 0.561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.125309944152832, + "rewards/margins": 7.9694695472717285, + "rewards/rejected": -4.844159126281738, + "step": 14375 + }, + { + "epoch": 3.6, + "grad_norm": 8.22610092163086, + "learning_rate": 1.8208810102675628e-06, + "logits/chosen": -0.5664985179901123, + "logits/rejected": -0.5916122198104858, + "logps/chosen": -51.81157302856445, + "logps/rejected": -125.02749633789062, + "loss": 0.6129, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1314218044281006, + "rewards/margins": 7.520824432373047, + "rewards/rejected": -4.389402389526367, + "step": 14376 + }, + { + "epoch": 3.6, + "grad_norm": 4.356550693511963, + "learning_rate": 1.8202743973640863e-06, + "logits/chosen": -0.4890877902507782, + "logits/rejected": -0.5918766856193542, + "logps/chosen": -45.545833587646484, + "logps/rejected": -108.4854736328125, + "loss": 0.5275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.175900459289551, + "rewards/margins": 7.136631965637207, + "rewards/rejected": -3.9607317447662354, + "step": 14377 + }, + { + "epoch": 3.6, + "grad_norm": 4.707378387451172, + "learning_rate": 1.8196678630350101e-06, + "logits/chosen": -0.5424510836601257, + "logits/rejected": -0.6403225064277649, + "logps/chosen": -66.48597717285156, + "logps/rejected": -115.03180694580078, + "loss": 0.6261, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2326507568359375, + "rewards/margins": 8.037348747253418, + "rewards/rejected": -4.804697036743164, + "step": 14378 + }, + { + "epoch": 3.6, + "grad_norm": 6.446697235107422, + "learning_rate": 1.8190614072953256e-06, + "logits/chosen": -0.47984182834625244, + "logits/rejected": -0.5665419697761536, + "logps/chosen": -55.56858825683594, + "logps/rejected": -102.17649841308594, + "loss": 0.666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.126631259918213, + "rewards/margins": 7.079418182373047, + "rewards/rejected": -3.952786922454834, + "step": 14379 + }, + { + "epoch": 3.6, + "grad_norm": 4.60897159576416, + "learning_rate": 1.8184550301600168e-06, + "logits/chosen": -0.4891843795776367, + "logits/rejected": -0.5948777794837952, + "logps/chosen": -56.06355285644531, + "logps/rejected": -93.55479431152344, + "loss": 0.6153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0639169216156006, + "rewards/margins": 6.709643363952637, + "rewards/rejected": -3.6457266807556152, + "step": 14380 + }, + { + "epoch": 3.6, + "grad_norm": 2.3250110149383545, + "learning_rate": 1.8178487316440667e-06, + "logits/chosen": -0.5614007115364075, + "logits/rejected": -0.6272193789482117, + "logps/chosen": -46.23845672607422, + "logps/rejected": -103.80061340332031, + "loss": 0.5307, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.125643253326416, + "rewards/margins": 8.13720989227295, + "rewards/rejected": -5.011567115783691, + "step": 14381 + }, + { + "epoch": 3.6, + "grad_norm": 6.023690700531006, + "learning_rate": 1.8172425117624608e-06, + "logits/chosen": -0.4732125401496887, + "logits/rejected": -0.6320185661315918, + "logps/chosen": -62.2708740234375, + "logps/rejected": -98.61256408691406, + "loss": 0.6774, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8009486198425293, + "rewards/margins": 6.154675006866455, + "rewards/rejected": -3.353726387023926, + "step": 14382 + }, + { + "epoch": 3.6, + "grad_norm": 8.227771759033203, + "learning_rate": 1.816636370530176e-06, + "logits/chosen": -0.5744934678077698, + "logits/rejected": -0.6733671426773071, + "logps/chosen": -57.86429977416992, + "logps/rejected": -84.91767883300781, + "loss": 0.8598, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.571326971054077, + "rewards/margins": 5.7200927734375, + "rewards/rejected": -2.148766279220581, + "step": 14383 + }, + { + "epoch": 3.6, + "grad_norm": 7.519890308380127, + "learning_rate": 1.8160303079621943e-06, + "logits/chosen": -0.5354063510894775, + "logits/rejected": -0.6093999147415161, + "logps/chosen": -55.657318115234375, + "logps/rejected": -102.6849136352539, + "loss": 0.6795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.121915340423584, + "rewards/margins": 7.107660293579102, + "rewards/rejected": -3.9857449531555176, + "step": 14384 + }, + { + "epoch": 3.6, + "grad_norm": 9.631619453430176, + "learning_rate": 1.8154243240734904e-06, + "logits/chosen": -0.5775611996650696, + "logits/rejected": -0.6606085300445557, + "logps/chosen": -56.0896110534668, + "logps/rejected": -97.93360137939453, + "loss": 0.7786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0204756259918213, + "rewards/margins": 6.788351058959961, + "rewards/rejected": -3.7678756713867188, + "step": 14385 + }, + { + "epoch": 3.6, + "grad_norm": 4.364293098449707, + "learning_rate": 1.814818418879037e-06, + "logits/chosen": -0.5013024210929871, + "logits/rejected": -0.5878870487213135, + "logps/chosen": -46.83440399169922, + "logps/rejected": -91.6748046875, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0889103412628174, + "rewards/margins": 6.685400009155273, + "rewards/rejected": -3.5964887142181396, + "step": 14386 + }, + { + "epoch": 3.6, + "grad_norm": 2.9122564792633057, + "learning_rate": 1.8142125923938098e-06, + "logits/chosen": -0.5482358932495117, + "logits/rejected": -0.5905217528343201, + "logps/chosen": -60.25130844116211, + "logps/rejected": -108.49556732177734, + "loss": 0.6188, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.315549373626709, + "rewards/margins": 6.673913478851318, + "rewards/rejected": -3.358363628387451, + "step": 14387 + }, + { + "epoch": 3.6, + "grad_norm": 2.9291083812713623, + "learning_rate": 1.8136068446327782e-06, + "logits/chosen": -0.5063151121139526, + "logits/rejected": -0.5982629656791687, + "logps/chosen": -55.58000564575195, + "logps/rejected": -100.62305450439453, + "loss": 0.5638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3998770713806152, + "rewards/margins": 7.443328857421875, + "rewards/rejected": -4.043450832366943, + "step": 14388 + }, + { + "epoch": 3.6, + "grad_norm": 8.46301555633545, + "learning_rate": 1.813001175610909e-06, + "logits/chosen": -0.5195987820625305, + "logits/rejected": -0.5787711143493652, + "logps/chosen": -56.10976791381836, + "logps/rejected": -123.2780532836914, + "loss": 0.7575, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.824718952178955, + "rewards/margins": 6.647982120513916, + "rewards/rejected": -3.82326340675354, + "step": 14389 + }, + { + "epoch": 3.6, + "grad_norm": 3.2134227752685547, + "learning_rate": 1.81239558534317e-06, + "logits/chosen": -0.5543453097343445, + "logits/rejected": -0.6045752763748169, + "logps/chosen": -48.75970458984375, + "logps/rejected": -96.43482208251953, + "loss": 0.558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.014310121536255, + "rewards/margins": 6.441895484924316, + "rewards/rejected": -3.4275853633880615, + "step": 14390 + }, + { + "epoch": 3.6, + "grad_norm": 3.184396505355835, + "learning_rate": 1.8117900738445304e-06, + "logits/chosen": -0.5263031721115112, + "logits/rejected": -0.6020699739456177, + "logps/chosen": -57.283451080322266, + "logps/rejected": -119.89276885986328, + "loss": 0.5759, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2492940425872803, + "rewards/margins": 8.030573844909668, + "rewards/rejected": -4.781280517578125, + "step": 14391 + }, + { + "epoch": 3.6, + "grad_norm": 4.123501777648926, + "learning_rate": 1.811184641129946e-06, + "logits/chosen": -0.6265547275543213, + "logits/rejected": -0.7162302732467651, + "logps/chosen": -53.96925735473633, + "logps/rejected": -85.81474304199219, + "loss": 0.6584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.242543935775757, + "rewards/margins": 5.645805358886719, + "rewards/rejected": -2.4032609462738037, + "step": 14392 + }, + { + "epoch": 3.6, + "grad_norm": 22.85393524169922, + "learning_rate": 1.8105792872143807e-06, + "logits/chosen": -0.5329185724258423, + "logits/rejected": -0.6119691729545593, + "logps/chosen": -51.99179458618164, + "logps/rejected": -96.85704040527344, + "loss": 0.6411, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0432333946228027, + "rewards/margins": 6.30502986907959, + "rewards/rejected": -3.26179575920105, + "step": 14393 + }, + { + "epoch": 3.6, + "grad_norm": 6.494018077850342, + "learning_rate": 1.8099740121127956e-06, + "logits/chosen": -0.49927374720573425, + "logits/rejected": -0.5703794956207275, + "logps/chosen": -48.36229705810547, + "logps/rejected": -114.14618682861328, + "loss": 0.5785, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0967769622802734, + "rewards/margins": 7.41458797454834, + "rewards/rejected": -4.317811012268066, + "step": 14394 + }, + { + "epoch": 3.6, + "grad_norm": 2.268084764480591, + "learning_rate": 1.8093688158401458e-06, + "logits/chosen": -0.5437665581703186, + "logits/rejected": -0.6036247611045837, + "logps/chosen": -58.75812911987305, + "logps/rejected": -107.26066589355469, + "loss": 0.5798, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1322507858276367, + "rewards/margins": 7.203968524932861, + "rewards/rejected": -4.071717739105225, + "step": 14395 + }, + { + "epoch": 3.6, + "grad_norm": 4.843968868255615, + "learning_rate": 1.8087636984113844e-06, + "logits/chosen": -0.4931052327156067, + "logits/rejected": -0.6269274950027466, + "logps/chosen": -54.70503234863281, + "logps/rejected": -101.5098876953125, + "loss": 0.6083, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9787824153900146, + "rewards/margins": 7.837339401245117, + "rewards/rejected": -4.858556270599365, + "step": 14396 + }, + { + "epoch": 3.6, + "grad_norm": 6.749236106872559, + "learning_rate": 1.8081586598414675e-06, + "logits/chosen": -0.5323904156684875, + "logits/rejected": -0.6016063690185547, + "logps/chosen": -64.71903228759766, + "logps/rejected": -101.03837585449219, + "loss": 0.6792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.65000057220459, + "rewards/margins": 6.79397439956665, + "rewards/rejected": -4.1439738273620605, + "step": 14397 + }, + { + "epoch": 3.6, + "grad_norm": 4.9260783195495605, + "learning_rate": 1.8075537001453458e-06, + "logits/chosen": -0.644580066204071, + "logits/rejected": -0.682363748550415, + "logps/chosen": -54.787723541259766, + "logps/rejected": -104.93057250976562, + "loss": 0.6071, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9771859645843506, + "rewards/margins": 6.81525993347168, + "rewards/rejected": -3.8380744457244873, + "step": 14398 + }, + { + "epoch": 3.6, + "grad_norm": 2.2299416065216064, + "learning_rate": 1.806948819337966e-06, + "logits/chosen": -0.5650753378868103, + "logits/rejected": -0.64894700050354, + "logps/chosen": -56.42238235473633, + "logps/rejected": -110.62710571289062, + "loss": 0.5466, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.446593761444092, + "rewards/margins": 8.067137718200684, + "rewards/rejected": -4.620543956756592, + "step": 14399 + }, + { + "epoch": 3.6, + "grad_norm": 2.562117099761963, + "learning_rate": 1.8063440174342783e-06, + "logits/chosen": -0.5220481157302856, + "logits/rejected": -0.6311346888542175, + "logps/chosen": -58.43077087402344, + "logps/rejected": -101.33187103271484, + "loss": 0.6007, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0348854064941406, + "rewards/margins": 7.6862359046936035, + "rewards/rejected": -4.651350975036621, + "step": 14400 + }, + { + "epoch": 3.6, + "grad_norm": 2.3827621936798096, + "learning_rate": 1.8057392944492258e-06, + "logits/chosen": -0.5682502388954163, + "logits/rejected": -0.6678467988967896, + "logps/chosen": -52.79313278198242, + "logps/rejected": -119.57500457763672, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.542278289794922, + "rewards/margins": 9.261927604675293, + "rewards/rejected": -5.719648838043213, + "step": 14401 + }, + { + "epoch": 3.6, + "grad_norm": 3.2655467987060547, + "learning_rate": 1.8051346503977546e-06, + "logits/chosen": -0.4769151210784912, + "logits/rejected": -0.5195863246917725, + "logps/chosen": -46.74185562133789, + "logps/rejected": -118.85698699951172, + "loss": 0.5152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.227635383605957, + "rewards/margins": 7.515507698059082, + "rewards/rejected": -4.287871837615967, + "step": 14402 + }, + { + "epoch": 3.6, + "grad_norm": 7.981184959411621, + "learning_rate": 1.804530085294804e-06, + "logits/chosen": -0.5249739289283752, + "logits/rejected": -0.5624637603759766, + "logps/chosen": -46.05298614501953, + "logps/rejected": -92.7031478881836, + "loss": 0.6598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0188186168670654, + "rewards/margins": 6.1017680168151855, + "rewards/rejected": -3.082949161529541, + "step": 14403 + }, + { + "epoch": 3.6, + "grad_norm": 4.7726545333862305, + "learning_rate": 1.8039255991553124e-06, + "logits/chosen": -0.6057336330413818, + "logits/rejected": -0.6631828546524048, + "logps/chosen": -48.72095489501953, + "logps/rejected": -97.15496826171875, + "loss": 0.6915, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8029069900512695, + "rewards/margins": 7.1956963539123535, + "rewards/rejected": -4.392789840698242, + "step": 14404 + }, + { + "epoch": 3.6, + "grad_norm": 3.1511600017547607, + "learning_rate": 1.80332119199422e-06, + "logits/chosen": -0.5027223825454712, + "logits/rejected": -0.5641469955444336, + "logps/chosen": -49.99445724487305, + "logps/rejected": -113.6243896484375, + "loss": 0.5322, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.566844940185547, + "rewards/margins": 7.930412292480469, + "rewards/rejected": -4.363567352294922, + "step": 14405 + }, + { + "epoch": 3.6, + "grad_norm": 3.6417043209075928, + "learning_rate": 1.8027168638264602e-06, + "logits/chosen": -0.5295619964599609, + "logits/rejected": -0.5903644561767578, + "logps/chosen": -51.206199645996094, + "logps/rejected": -119.94256591796875, + "loss": 0.591, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.972223997116089, + "rewards/margins": 8.041189193725586, + "rewards/rejected": -5.068965435028076, + "step": 14406 + }, + { + "epoch": 3.6, + "grad_norm": 4.870993614196777, + "learning_rate": 1.8021126146669687e-06, + "logits/chosen": -0.5202964544296265, + "logits/rejected": -0.6098498702049255, + "logps/chosen": -53.49098587036133, + "logps/rejected": -103.7176742553711, + "loss": 0.604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0861611366271973, + "rewards/margins": 7.648228168487549, + "rewards/rejected": -4.562066555023193, + "step": 14407 + }, + { + "epoch": 3.6, + "grad_norm": 7.398679256439209, + "learning_rate": 1.8015084445306741e-06, + "logits/chosen": -0.48240432143211365, + "logits/rejected": -0.5704946517944336, + "logps/chosen": -65.70217895507812, + "logps/rejected": -103.01888275146484, + "loss": 0.668, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.561870574951172, + "rewards/margins": 6.718600273132324, + "rewards/rejected": -4.156729221343994, + "step": 14408 + }, + { + "epoch": 3.6, + "grad_norm": 3.608430862426758, + "learning_rate": 1.8009043534325098e-06, + "logits/chosen": -0.5597891807556152, + "logits/rejected": -0.6781889200210571, + "logps/chosen": -53.35337448120117, + "logps/rejected": -83.68035125732422, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.925917387008667, + "rewards/margins": 6.2227325439453125, + "rewards/rejected": -3.2968153953552246, + "step": 14409 + }, + { + "epoch": 3.6, + "grad_norm": 7.6760382652282715, + "learning_rate": 1.8003003413874015e-06, + "logits/chosen": -0.5983326435089111, + "logits/rejected": -0.6819326877593994, + "logps/chosen": -53.98504638671875, + "logps/rejected": -112.03934478759766, + "loss": 0.5761, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9274394512176514, + "rewards/margins": 7.322774887084961, + "rewards/rejected": -4.395336151123047, + "step": 14410 + }, + { + "epoch": 3.61, + "grad_norm": 4.164315700531006, + "learning_rate": 1.7996964084102735e-06, + "logits/chosen": -0.49468687176704407, + "logits/rejected": -0.5952963829040527, + "logps/chosen": -52.69969177246094, + "logps/rejected": -109.5366439819336, + "loss": 0.5512, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9736859798431396, + "rewards/margins": 7.660438537597656, + "rewards/rejected": -4.686752796173096, + "step": 14411 + }, + { + "epoch": 3.61, + "grad_norm": 3.9776015281677246, + "learning_rate": 1.799092554516053e-06, + "logits/chosen": -0.5116336941719055, + "logits/rejected": -0.603090763092041, + "logps/chosen": -59.38974380493164, + "logps/rejected": -107.96255493164062, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9542691707611084, + "rewards/margins": 7.407824993133545, + "rewards/rejected": -4.453555583953857, + "step": 14412 + }, + { + "epoch": 3.61, + "grad_norm": 5.015223026275635, + "learning_rate": 1.7984887797196604e-06, + "logits/chosen": -0.5682430267333984, + "logits/rejected": -0.649607241153717, + "logps/chosen": -62.08868408203125, + "logps/rejected": -119.11969757080078, + "loss": 0.6501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.439359426498413, + "rewards/margins": 8.106073379516602, + "rewards/rejected": -4.666714668273926, + "step": 14413 + }, + { + "epoch": 3.61, + "grad_norm": 6.474395751953125, + "learning_rate": 1.797885084036013e-06, + "logits/chosen": -0.5457911491394043, + "logits/rejected": -0.6567634344100952, + "logps/chosen": -56.3342170715332, + "logps/rejected": -103.39863586425781, + "loss": 0.6931, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934849500656128, + "rewards/margins": 6.539585113525391, + "rewards/rejected": -3.6047353744506836, + "step": 14414 + }, + { + "epoch": 3.61, + "grad_norm": 3.4136030673980713, + "learning_rate": 1.7972814674800332e-06, + "logits/chosen": -0.5965519547462463, + "logits/rejected": -0.6232486367225647, + "logps/chosen": -66.23821258544922, + "logps/rejected": -96.36122131347656, + "loss": 0.6122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1525678634643555, + "rewards/margins": 6.051654815673828, + "rewards/rejected": -2.8990867137908936, + "step": 14415 + }, + { + "epoch": 3.61, + "grad_norm": 2.8086695671081543, + "learning_rate": 1.7966779300666349e-06, + "logits/chosen": -0.49038881063461304, + "logits/rejected": -0.6249359250068665, + "logps/chosen": -65.95586395263672, + "logps/rejected": -90.46257019042969, + "loss": 0.5787, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.786632537841797, + "rewards/margins": 6.9205780029296875, + "rewards/rejected": -4.133945465087891, + "step": 14416 + }, + { + "epoch": 3.61, + "grad_norm": 7.45376443862915, + "learning_rate": 1.7960744718107303e-06, + "logits/chosen": -0.5888634324073792, + "logits/rejected": -0.6497496962547302, + "logps/chosen": -55.79703140258789, + "logps/rejected": -128.29507446289062, + "loss": 0.6092, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.159029006958008, + "rewards/margins": 8.15988540649414, + "rewards/rejected": -5.000857353210449, + "step": 14417 + }, + { + "epoch": 3.61, + "grad_norm": 5.850372791290283, + "learning_rate": 1.7954710927272334e-06, + "logits/chosen": -0.5000572204589844, + "logits/rejected": -0.5944085121154785, + "logps/chosen": -62.310333251953125, + "logps/rejected": -101.87541198730469, + "loss": 0.6021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0357182025909424, + "rewards/margins": 6.956825256347656, + "rewards/rejected": -3.9211068153381348, + "step": 14418 + }, + { + "epoch": 3.61, + "grad_norm": 3.9505186080932617, + "learning_rate": 1.7948677928310577e-06, + "logits/chosen": -0.48644450306892395, + "logits/rejected": -0.5220639109611511, + "logps/chosen": -55.81315612792969, + "logps/rejected": -131.07748413085938, + "loss": 0.5881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.16586971282959, + "rewards/margins": 7.72775411605835, + "rewards/rejected": -4.56188440322876, + "step": 14419 + }, + { + "epoch": 3.61, + "grad_norm": 6.840900421142578, + "learning_rate": 1.7942645721371043e-06, + "logits/chosen": -0.5712074041366577, + "logits/rejected": -0.6254704594612122, + "logps/chosen": -51.003204345703125, + "logps/rejected": -131.90597534179688, + "loss": 0.602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.973457098007202, + "rewards/margins": 8.20765495300293, + "rewards/rejected": -5.234197616577148, + "step": 14420 + }, + { + "epoch": 3.61, + "grad_norm": 5.8263750076293945, + "learning_rate": 1.7936614306602835e-06, + "logits/chosen": -0.527726411819458, + "logits/rejected": -0.6007695198059082, + "logps/chosen": -61.87865447998047, + "logps/rejected": -100.70752716064453, + "loss": 0.664, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1787710189819336, + "rewards/margins": 6.969286918640137, + "rewards/rejected": -3.790515899658203, + "step": 14421 + }, + { + "epoch": 3.61, + "grad_norm": 3.554596185684204, + "learning_rate": 1.7930583684155006e-06, + "logits/chosen": -0.6098237037658691, + "logits/rejected": -0.6714248657226562, + "logps/chosen": -41.31641387939453, + "logps/rejected": -119.98213958740234, + "loss": 0.5077, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2775375843048096, + "rewards/margins": 8.442222595214844, + "rewards/rejected": -5.164684772491455, + "step": 14422 + }, + { + "epoch": 3.61, + "grad_norm": 5.246095180511475, + "learning_rate": 1.792455385417657e-06, + "logits/chosen": -0.5700652003288269, + "logits/rejected": -0.6578570008277893, + "logps/chosen": -52.532554626464844, + "logps/rejected": -119.64949035644531, + "loss": 0.6012, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0966668128967285, + "rewards/margins": 7.268744468688965, + "rewards/rejected": -4.172077655792236, + "step": 14423 + }, + { + "epoch": 3.61, + "grad_norm": 4.590559959411621, + "learning_rate": 1.7918524816816502e-06, + "logits/chosen": -0.4620111584663391, + "logits/rejected": -0.5344585180282593, + "logps/chosen": -63.16530227661133, + "logps/rejected": -119.67749786376953, + "loss": 0.6874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.869339942932129, + "rewards/margins": 6.5087785720825195, + "rewards/rejected": -3.6394381523132324, + "step": 14424 + }, + { + "epoch": 3.61, + "grad_norm": 13.737796783447266, + "learning_rate": 1.7912496572223825e-06, + "logits/chosen": -0.5378090739250183, + "logits/rejected": -0.630825400352478, + "logps/chosen": -59.66781234741211, + "logps/rejected": -97.86949157714844, + "loss": 0.7744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.1970300674438477, + "rewards/margins": 6.099280834197998, + "rewards/rejected": -3.9022510051727295, + "step": 14425 + }, + { + "epoch": 3.61, + "grad_norm": 7.5578999519348145, + "learning_rate": 1.7906469120547475e-06, + "logits/chosen": -0.5216964483261108, + "logits/rejected": -0.5656471848487854, + "logps/chosen": -62.019134521484375, + "logps/rejected": -116.09691619873047, + "loss": 0.8619, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.025904655456543, + "rewards/margins": 5.849499702453613, + "rewards/rejected": -2.823594570159912, + "step": 14426 + }, + { + "epoch": 3.61, + "grad_norm": 4.506531238555908, + "learning_rate": 1.7900442461936423e-06, + "logits/chosen": -0.5926746726036072, + "logits/rejected": -0.6754809617996216, + "logps/chosen": -53.024452209472656, + "logps/rejected": -100.81028747558594, + "loss": 0.6479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.697709560394287, + "rewards/margins": 7.3161797523498535, + "rewards/rejected": -4.618470191955566, + "step": 14427 + }, + { + "epoch": 3.61, + "grad_norm": 4.7186126708984375, + "learning_rate": 1.7894416596539588e-06, + "logits/chosen": -0.5206235647201538, + "logits/rejected": -0.5712606906890869, + "logps/chosen": -58.88069152832031, + "logps/rejected": -99.70047760009766, + "loss": 0.6261, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3215322494506836, + "rewards/margins": 6.3146867752075195, + "rewards/rejected": -2.9931538105010986, + "step": 14428 + }, + { + "epoch": 3.61, + "grad_norm": 6.230856895446777, + "learning_rate": 1.7888391524505843e-06, + "logits/chosen": -0.5672551393508911, + "logits/rejected": -0.6156600713729858, + "logps/chosen": -50.81942367553711, + "logps/rejected": -109.98076629638672, + "loss": 0.6271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8128623962402344, + "rewards/margins": 6.678071022033691, + "rewards/rejected": -3.8652091026306152, + "step": 14429 + }, + { + "epoch": 3.61, + "grad_norm": 5.03587007522583, + "learning_rate": 1.7882367245984117e-06, + "logits/chosen": -0.6238164901733398, + "logits/rejected": -0.6917935013771057, + "logps/chosen": -59.229469299316406, + "logps/rejected": -110.63502502441406, + "loss": 0.6568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6624250411987305, + "rewards/margins": 7.517697334289551, + "rewards/rejected": -4.85527229309082, + "step": 14430 + }, + { + "epoch": 3.61, + "grad_norm": 9.670761108398438, + "learning_rate": 1.7876343761123261e-06, + "logits/chosen": -0.5246176719665527, + "logits/rejected": -0.5206102132797241, + "logps/chosen": -68.43179321289062, + "logps/rejected": -120.30448150634766, + "loss": 0.6966, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.862006425857544, + "rewards/margins": 6.238974571228027, + "rewards/rejected": -3.376967668533325, + "step": 14431 + }, + { + "epoch": 3.61, + "grad_norm": 5.548070907592773, + "learning_rate": 1.78703210700721e-06, + "logits/chosen": -0.5488578677177429, + "logits/rejected": -0.655062735080719, + "logps/chosen": -55.31495666503906, + "logps/rejected": -103.66938781738281, + "loss": 0.7127, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.530698776245117, + "rewards/margins": 7.0626630783081055, + "rewards/rejected": -4.531963348388672, + "step": 14432 + }, + { + "epoch": 3.61, + "grad_norm": 4.6284332275390625, + "learning_rate": 1.7864299172979482e-06, + "logits/chosen": -0.5933564305305481, + "logits/rejected": -0.639350950717926, + "logps/chosen": -40.7188606262207, + "logps/rejected": -115.6199951171875, + "loss": 0.5913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.097937822341919, + "rewards/margins": 7.092185974121094, + "rewards/rejected": -3.994248390197754, + "step": 14433 + }, + { + "epoch": 3.61, + "grad_norm": 2.473081588745117, + "learning_rate": 1.7858278069994245e-06, + "logits/chosen": -0.5549232959747314, + "logits/rejected": -0.5934607982635498, + "logps/chosen": -57.41253662109375, + "logps/rejected": -104.10276794433594, + "loss": 0.6206, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.892378568649292, + "rewards/margins": 7.028659820556641, + "rewards/rejected": -4.1362810134887695, + "step": 14434 + }, + { + "epoch": 3.61, + "grad_norm": 7.434097766876221, + "learning_rate": 1.7852257761265107e-06, + "logits/chosen": -0.5760087370872498, + "logits/rejected": -0.6772931814193726, + "logps/chosen": -48.19744110107422, + "logps/rejected": -95.37594604492188, + "loss": 0.6445, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0137746334075928, + "rewards/margins": 7.112746238708496, + "rewards/rejected": -4.098971366882324, + "step": 14435 + }, + { + "epoch": 3.61, + "grad_norm": 8.514410972595215, + "learning_rate": 1.7846238246940879e-06, + "logits/chosen": -0.5732512474060059, + "logits/rejected": -0.6295449733734131, + "logps/chosen": -41.90556716918945, + "logps/rejected": -107.49209594726562, + "loss": 0.5608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2581675052642822, + "rewards/margins": 6.79136848449707, + "rewards/rejected": -3.53320050239563, + "step": 14436 + }, + { + "epoch": 3.61, + "grad_norm": 30.870708465576172, + "learning_rate": 1.7840219527170333e-06, + "logits/chosen": -0.4506548047065735, + "logits/rejected": -0.5647217035293579, + "logps/chosen": -57.322845458984375, + "logps/rejected": -99.31233978271484, + "loss": 0.8727, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.747405529022217, + "rewards/margins": 7.044216156005859, + "rewards/rejected": -4.296810626983643, + "step": 14437 + }, + { + "epoch": 3.61, + "grad_norm": 3.723728656768799, + "learning_rate": 1.7834201602102141e-06, + "logits/chosen": -0.5499546527862549, + "logits/rejected": -0.6607075929641724, + "logps/chosen": -54.641944885253906, + "logps/rejected": -104.94425201416016, + "loss": 0.6332, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9819447994232178, + "rewards/margins": 7.806342124938965, + "rewards/rejected": -4.824397087097168, + "step": 14438 + }, + { + "epoch": 3.61, + "grad_norm": 3.020374298095703, + "learning_rate": 1.7828184471885036e-06, + "logits/chosen": -0.5144925713539124, + "logits/rejected": -0.5771279335021973, + "logps/chosen": -64.88613891601562, + "logps/rejected": -122.2200927734375, + "loss": 0.6268, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8872299194335938, + "rewards/margins": 8.242307662963867, + "rewards/rejected": -5.355076789855957, + "step": 14439 + }, + { + "epoch": 3.61, + "grad_norm": 3.931859016418457, + "learning_rate": 1.7822168136667729e-06, + "logits/chosen": -0.5926181077957153, + "logits/rejected": -0.6638373136520386, + "logps/chosen": -55.071449279785156, + "logps/rejected": -133.1020965576172, + "loss": 0.5536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2256674766540527, + "rewards/margins": 9.052403450012207, + "rewards/rejected": -5.826735973358154, + "step": 14440 + }, + { + "epoch": 3.61, + "grad_norm": 5.1592512130737305, + "learning_rate": 1.7816152596598874e-06, + "logits/chosen": -0.5821590423583984, + "logits/rejected": -0.6462838649749756, + "logps/chosen": -40.093868255615234, + "logps/rejected": -104.08267211914062, + "loss": 0.6109, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1717441082000732, + "rewards/margins": 7.438518524169922, + "rewards/rejected": -4.2667741775512695, + "step": 14441 + }, + { + "epoch": 3.61, + "grad_norm": 6.366888523101807, + "learning_rate": 1.7810137851827108e-06, + "logits/chosen": -0.5372997522354126, + "logits/rejected": -0.6382337212562561, + "logps/chosen": -55.3634033203125, + "logps/rejected": -94.15752410888672, + "loss": 0.7398, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9974544048309326, + "rewards/margins": 5.918768882751465, + "rewards/rejected": -2.9213144779205322, + "step": 14442 + }, + { + "epoch": 3.61, + "grad_norm": 5.883144378662109, + "learning_rate": 1.7804123902501087e-06, + "logits/chosen": -0.5797707438468933, + "logits/rejected": -0.6325175166130066, + "logps/chosen": -54.79911422729492, + "logps/rejected": -129.05120849609375, + "loss": 0.6162, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2005715370178223, + "rewards/margins": 7.229671478271484, + "rewards/rejected": -4.029099941253662, + "step": 14443 + }, + { + "epoch": 3.61, + "grad_norm": 4.8207597732543945, + "learning_rate": 1.7798110748769409e-06, + "logits/chosen": -0.5318186283111572, + "logits/rejected": -0.603302538394928, + "logps/chosen": -54.261810302734375, + "logps/rejected": -97.30022430419922, + "loss": 0.6169, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023312568664551, + "rewards/margins": 5.394758224487305, + "rewards/rejected": -2.371446371078491, + "step": 14444 + }, + { + "epoch": 3.61, + "grad_norm": 7.439254283905029, + "learning_rate": 1.779209839078065e-06, + "logits/chosen": -0.46604859828948975, + "logits/rejected": -0.5823330283164978, + "logps/chosen": -56.344085693359375, + "logps/rejected": -82.64677429199219, + "loss": 0.6262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0686075687408447, + "rewards/margins": 5.909481048583984, + "rewards/rejected": -2.840873956680298, + "step": 14445 + }, + { + "epoch": 3.61, + "grad_norm": 2.9990973472595215, + "learning_rate": 1.7786086828683413e-06, + "logits/chosen": -0.5683242082595825, + "logits/rejected": -0.6641912460327148, + "logps/chosen": -58.880760192871094, + "logps/rejected": -118.95606994628906, + "loss": 0.5966, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.092043876647949, + "rewards/margins": 8.966301918029785, + "rewards/rejected": -5.874258041381836, + "step": 14446 + }, + { + "epoch": 3.61, + "grad_norm": 3.7496039867401123, + "learning_rate": 1.7780076062626222e-06, + "logits/chosen": -0.5394101142883301, + "logits/rejected": -0.6268722414970398, + "logps/chosen": -56.91703414916992, + "logps/rejected": -91.79302215576172, + "loss": 0.5996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.111091136932373, + "rewards/margins": 6.451320171356201, + "rewards/rejected": -3.340229034423828, + "step": 14447 + }, + { + "epoch": 3.61, + "grad_norm": 2.6630585193634033, + "learning_rate": 1.7774066092757635e-06, + "logits/chosen": -0.5246230363845825, + "logits/rejected": -0.6361615657806396, + "logps/chosen": -55.875587463378906, + "logps/rejected": -113.39913177490234, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8150458335876465, + "rewards/margins": 7.234100818634033, + "rewards/rejected": -4.419054985046387, + "step": 14448 + }, + { + "epoch": 3.61, + "grad_norm": 6.218389987945557, + "learning_rate": 1.776805691922614e-06, + "logits/chosen": -0.5623085498809814, + "logits/rejected": -0.6125925779342651, + "logps/chosen": -49.291259765625, + "logps/rejected": -119.54751586914062, + "loss": 0.622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5677216053009033, + "rewards/margins": 7.793027400970459, + "rewards/rejected": -4.225305557250977, + "step": 14449 + }, + { + "epoch": 3.61, + "grad_norm": 6.985533237457275, + "learning_rate": 1.7762048542180255e-06, + "logits/chosen": -0.507002592086792, + "logits/rejected": -0.592402458190918, + "logps/chosen": -60.2409553527832, + "logps/rejected": -103.2083511352539, + "loss": 0.6494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7872846126556396, + "rewards/margins": 7.611806869506836, + "rewards/rejected": -4.824522018432617, + "step": 14450 + }, + { + "epoch": 3.62, + "grad_norm": 3.6169614791870117, + "learning_rate": 1.7756040961768423e-06, + "logits/chosen": -0.5180556774139404, + "logits/rejected": -0.5854378342628479, + "logps/chosen": -73.9112548828125, + "logps/rejected": -117.44334411621094, + "loss": 0.7177, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2222976684570312, + "rewards/margins": 7.844925403594971, + "rewards/rejected": -4.622628211975098, + "step": 14451 + }, + { + "epoch": 3.62, + "grad_norm": 5.782762050628662, + "learning_rate": 1.775003417813913e-06, + "logits/chosen": -0.5454113483428955, + "logits/rejected": -0.5936290621757507, + "logps/chosen": -44.0698356628418, + "logps/rejected": -96.92851257324219, + "loss": 0.6143, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.177621841430664, + "rewards/margins": 6.395537376403809, + "rewards/rejected": -3.2179150581359863, + "step": 14452 + }, + { + "epoch": 3.62, + "grad_norm": 12.849944114685059, + "learning_rate": 1.77440281914408e-06, + "logits/chosen": -0.5218960046768188, + "logits/rejected": -0.5392087697982788, + "logps/chosen": -54.034114837646484, + "logps/rejected": -105.1024398803711, + "loss": 0.7454, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0759899616241455, + "rewards/margins": 6.568798065185547, + "rewards/rejected": -3.4928085803985596, + "step": 14453 + }, + { + "epoch": 3.62, + "grad_norm": 6.102696418762207, + "learning_rate": 1.7738023001821825e-06, + "logits/chosen": -0.5003507137298584, + "logits/rejected": -0.557002067565918, + "logps/chosen": -46.268455505371094, + "logps/rejected": -100.99274444580078, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.776045322418213, + "rewards/margins": 6.168849468231201, + "rewards/rejected": -3.3928041458129883, + "step": 14454 + }, + { + "epoch": 3.62, + "grad_norm": 3.2865700721740723, + "learning_rate": 1.773201860943063e-06, + "logits/chosen": -0.5595337152481079, + "logits/rejected": -0.6362652778625488, + "logps/chosen": -58.0976448059082, + "logps/rejected": -93.12471008300781, + "loss": 0.6532, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1305155754089355, + "rewards/margins": 7.126147270202637, + "rewards/rejected": -3.9956319332122803, + "step": 14455 + }, + { + "epoch": 3.62, + "grad_norm": 5.240610599517822, + "learning_rate": 1.7726015014415577e-06, + "logits/chosen": -0.5493411421775818, + "logits/rejected": -0.619312047958374, + "logps/chosen": -50.199310302734375, + "logps/rejected": -98.37970733642578, + "loss": 0.6096, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1743335723876953, + "rewards/margins": 6.374423027038574, + "rewards/rejected": -3.200089454650879, + "step": 14456 + }, + { + "epoch": 3.62, + "grad_norm": 7.328519344329834, + "learning_rate": 1.772001221692501e-06, + "logits/chosen": -0.575792133808136, + "logits/rejected": -0.6646786332130432, + "logps/chosen": -71.40228271484375, + "logps/rejected": -97.39556121826172, + "loss": 0.769, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8023226261138916, + "rewards/margins": 6.613429546356201, + "rewards/rejected": -3.8111069202423096, + "step": 14457 + }, + { + "epoch": 3.62, + "grad_norm": 4.0204644203186035, + "learning_rate": 1.7714010217107285e-06, + "logits/chosen": -0.5173242092132568, + "logits/rejected": -0.603053867816925, + "logps/chosen": -61.64866638183594, + "logps/rejected": -113.67903137207031, + "loss": 0.6351, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1306684017181396, + "rewards/margins": 6.78952693939209, + "rewards/rejected": -3.658858299255371, + "step": 14458 + }, + { + "epoch": 3.62, + "grad_norm": 2.2642526626586914, + "learning_rate": 1.7708009015110716e-06, + "logits/chosen": -0.5490232706069946, + "logits/rejected": -0.6222044229507446, + "logps/chosen": -49.431087493896484, + "logps/rejected": -88.84580993652344, + "loss": 0.5709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.193054676055908, + "rewards/margins": 6.681821346282959, + "rewards/rejected": -3.4887678623199463, + "step": 14459 + }, + { + "epoch": 3.62, + "grad_norm": 5.7016801834106445, + "learning_rate": 1.7702008611083572e-06, + "logits/chosen": -0.6127351522445679, + "logits/rejected": -0.6554286479949951, + "logps/chosen": -43.80553436279297, + "logps/rejected": -118.21395111083984, + "loss": 0.5977, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1546895503997803, + "rewards/margins": 8.220071792602539, + "rewards/rejected": -5.065382957458496, + "step": 14460 + }, + { + "epoch": 3.62, + "grad_norm": 3.927832841873169, + "learning_rate": 1.7696009005174142e-06, + "logits/chosen": -0.5093645453453064, + "logits/rejected": -0.5726558566093445, + "logps/chosen": -55.828521728515625, + "logps/rejected": -103.95781707763672, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0058772563934326, + "rewards/margins": 6.4884467124938965, + "rewards/rejected": -3.4825687408447266, + "step": 14461 + }, + { + "epoch": 3.62, + "grad_norm": 3.871825933456421, + "learning_rate": 1.7690010197530732e-06, + "logits/chosen": -0.5733828544616699, + "logits/rejected": -0.6466529965400696, + "logps/chosen": -50.89512252807617, + "logps/rejected": -94.82373046875, + "loss": 0.642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.7108120918273926, + "rewards/margins": 8.16744327545166, + "rewards/rejected": -4.456631660461426, + "step": 14462 + }, + { + "epoch": 3.62, + "grad_norm": 13.23215103149414, + "learning_rate": 1.7684012188301497e-06, + "logits/chosen": -0.5878308415412903, + "logits/rejected": -0.6589040756225586, + "logps/chosen": -56.66820526123047, + "logps/rejected": -105.1965103149414, + "loss": 0.7311, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.728860855102539, + "rewards/margins": 6.705095291137695, + "rewards/rejected": -2.976234197616577, + "step": 14463 + }, + { + "epoch": 3.62, + "grad_norm": 3.312527894973755, + "learning_rate": 1.76780149776347e-06, + "logits/chosen": -0.5416643619537354, + "logits/rejected": -0.6676859855651855, + "logps/chosen": -56.03234100341797, + "logps/rejected": -96.56713104248047, + "loss": 0.6383, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1621012687683105, + "rewards/margins": 7.494056701660156, + "rewards/rejected": -4.331955432891846, + "step": 14464 + }, + { + "epoch": 3.62, + "grad_norm": 2.957674026489258, + "learning_rate": 1.7672018565678544e-06, + "logits/chosen": -0.6309413313865662, + "logits/rejected": -0.7575029134750366, + "logps/chosen": -56.21243667602539, + "logps/rejected": -116.31678771972656, + "loss": 0.59, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0585241317749023, + "rewards/margins": 8.979045867919922, + "rewards/rejected": -5.920522212982178, + "step": 14465 + }, + { + "epoch": 3.62, + "grad_norm": 6.964205265045166, + "learning_rate": 1.7666022952581196e-06, + "logits/chosen": -0.5852102041244507, + "logits/rejected": -0.622795820236206, + "logps/chosen": -62.38214874267578, + "logps/rejected": -108.82814025878906, + "loss": 0.89, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.7950944900512695, + "rewards/margins": 5.7586541175842285, + "rewards/rejected": -2.96355938911438, + "step": 14466 + }, + { + "epoch": 3.62, + "grad_norm": 3.020102024078369, + "learning_rate": 1.7660028138490799e-06, + "logits/chosen": -0.46692925691604614, + "logits/rejected": -0.562231719493866, + "logps/chosen": -55.971744537353516, + "logps/rejected": -107.10320281982422, + "loss": 0.5122, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.583604097366333, + "rewards/margins": 7.3163161277771, + "rewards/rejected": -3.7327122688293457, + "step": 14467 + }, + { + "epoch": 3.62, + "grad_norm": 8.078364372253418, + "learning_rate": 1.765403412355552e-06, + "logits/chosen": -0.5614405274391174, + "logits/rejected": -0.6062039136886597, + "logps/chosen": -53.3150520324707, + "logps/rejected": -97.54839324951172, + "loss": 0.6132, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.915191650390625, + "rewards/margins": 5.456650257110596, + "rewards/rejected": -2.5414581298828125, + "step": 14468 + }, + { + "epoch": 3.62, + "grad_norm": 6.072378158569336, + "learning_rate": 1.7648040907923458e-06, + "logits/chosen": -0.5295482277870178, + "logits/rejected": -0.5916191339492798, + "logps/chosen": -52.864036560058594, + "logps/rejected": -111.841064453125, + "loss": 0.5536, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.93593430519104, + "rewards/margins": 7.067103385925293, + "rewards/rejected": -4.131168842315674, + "step": 14469 + }, + { + "epoch": 3.62, + "grad_norm": 4.10831880569458, + "learning_rate": 1.7642048491742704e-06, + "logits/chosen": -0.5303055047988892, + "logits/rejected": -0.5924326181411743, + "logps/chosen": -57.02811813354492, + "logps/rejected": -106.53970336914062, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.950610637664795, + "rewards/margins": 6.643472671508789, + "rewards/rejected": -3.6928627490997314, + "step": 14470 + }, + { + "epoch": 3.62, + "grad_norm": 2.58874249458313, + "learning_rate": 1.763605687516136e-06, + "logits/chosen": -0.572113573551178, + "logits/rejected": -0.5846690535545349, + "logps/chosen": -36.6309928894043, + "logps/rejected": -120.68124389648438, + "loss": 0.5123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.430842399597168, + "rewards/margins": 8.051992416381836, + "rewards/rejected": -4.621150016784668, + "step": 14471 + }, + { + "epoch": 3.62, + "grad_norm": 9.16450309753418, + "learning_rate": 1.7630066058327467e-06, + "logits/chosen": -0.5160893797874451, + "logits/rejected": -0.5838022232055664, + "logps/chosen": -52.37138366699219, + "logps/rejected": -101.54734802246094, + "loss": 0.5837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1729416847229004, + "rewards/margins": 5.899472713470459, + "rewards/rejected": -2.7265312671661377, + "step": 14472 + }, + { + "epoch": 3.62, + "grad_norm": 6.647649765014648, + "learning_rate": 1.7624076041389082e-06, + "logits/chosen": -0.5647386908531189, + "logits/rejected": -0.6098400354385376, + "logps/chosen": -54.73590850830078, + "logps/rejected": -118.38496398925781, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0303008556365967, + "rewards/margins": 6.779335975646973, + "rewards/rejected": -3.7490346431732178, + "step": 14473 + }, + { + "epoch": 3.62, + "grad_norm": 3.344785690307617, + "learning_rate": 1.7618086824494219e-06, + "logits/chosen": -0.6201344728469849, + "logits/rejected": -0.6601672172546387, + "logps/chosen": -51.426063537597656, + "logps/rejected": -96.47441864013672, + "loss": 0.5613, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1964497566223145, + "rewards/margins": 6.29691219329834, + "rewards/rejected": -3.100461721420288, + "step": 14474 + }, + { + "epoch": 3.62, + "grad_norm": 4.59169864654541, + "learning_rate": 1.7612098407790855e-06, + "logits/chosen": -0.5644539594650269, + "logits/rejected": -0.6425723433494568, + "logps/chosen": -56.32801818847656, + "logps/rejected": -119.38636779785156, + "loss": 0.6408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1085729598999023, + "rewards/margins": 8.49708080291748, + "rewards/rejected": -5.388506889343262, + "step": 14475 + }, + { + "epoch": 3.62, + "grad_norm": 5.630398273468018, + "learning_rate": 1.760611079142699e-06, + "logits/chosen": -0.5070344805717468, + "logits/rejected": -0.5696164965629578, + "logps/chosen": -49.42681121826172, + "logps/rejected": -91.02418518066406, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.095552921295166, + "rewards/margins": 5.760544776916504, + "rewards/rejected": -2.664991855621338, + "step": 14476 + }, + { + "epoch": 3.62, + "grad_norm": 9.578028678894043, + "learning_rate": 1.7600123975550614e-06, + "logits/chosen": -0.48752543330192566, + "logits/rejected": -0.6457281708717346, + "logps/chosen": -57.71883773803711, + "logps/rejected": -90.18583679199219, + "loss": 0.5982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1169145107269287, + "rewards/margins": 6.140002250671387, + "rewards/rejected": -3.023087501525879, + "step": 14477 + }, + { + "epoch": 3.62, + "grad_norm": 5.355844497680664, + "learning_rate": 1.7594137960309604e-06, + "logits/chosen": -0.5781610012054443, + "logits/rejected": -0.6279684901237488, + "logps/chosen": -50.51935577392578, + "logps/rejected": -115.74569702148438, + "loss": 0.6451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3307273387908936, + "rewards/margins": 8.444395065307617, + "rewards/rejected": -5.113667011260986, + "step": 14478 + }, + { + "epoch": 3.62, + "grad_norm": 3.5285747051239014, + "learning_rate": 1.758815274585191e-06, + "logits/chosen": -0.626965343952179, + "logits/rejected": -0.6711735129356384, + "logps/chosen": -51.09737014770508, + "logps/rejected": -115.60720825195312, + "loss": 0.6227, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0396201610565186, + "rewards/margins": 7.175983428955078, + "rewards/rejected": -4.1363630294799805, + "step": 14479 + }, + { + "epoch": 3.62, + "grad_norm": 3.048889636993408, + "learning_rate": 1.7582168332325477e-06, + "logits/chosen": -0.509926438331604, + "logits/rejected": -0.5155373811721802, + "logps/chosen": -47.345924377441406, + "logps/rejected": -147.56988525390625, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0227210521698, + "rewards/margins": 7.5638322830200195, + "rewards/rejected": -4.541110515594482, + "step": 14480 + }, + { + "epoch": 3.62, + "grad_norm": 5.029788017272949, + "learning_rate": 1.757618471987811e-06, + "logits/chosen": -0.639761745929718, + "logits/rejected": -0.7309479117393494, + "logps/chosen": -47.15791320800781, + "logps/rejected": -113.84342956542969, + "loss": 0.5932, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.23535418510437, + "rewards/margins": 7.817269325256348, + "rewards/rejected": -4.58191442489624, + "step": 14481 + }, + { + "epoch": 3.62, + "grad_norm": 5.942605972290039, + "learning_rate": 1.7570201908657702e-06, + "logits/chosen": -0.5463789701461792, + "logits/rejected": -0.6412984132766724, + "logps/chosen": -62.85674285888672, + "logps/rejected": -107.40194702148438, + "loss": 0.7523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6585917472839355, + "rewards/margins": 6.611605644226074, + "rewards/rejected": -3.9530138969421387, + "step": 14482 + }, + { + "epoch": 3.62, + "grad_norm": 6.066387176513672, + "learning_rate": 1.7564219898812119e-06, + "logits/chosen": -0.4891132116317749, + "logits/rejected": -0.5083645582199097, + "logps/chosen": -51.92107009887695, + "logps/rejected": -120.98731994628906, + "loss": 0.6693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0853495597839355, + "rewards/margins": 7.504533767700195, + "rewards/rejected": -4.41918420791626, + "step": 14483 + }, + { + "epoch": 3.62, + "grad_norm": 35.52958297729492, + "learning_rate": 1.755823869048916e-06, + "logits/chosen": -0.600552499294281, + "logits/rejected": -0.6497674584388733, + "logps/chosen": -47.42158126831055, + "logps/rejected": -104.45388793945312, + "loss": 0.6467, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.395580768585205, + "rewards/margins": 6.792863845825195, + "rewards/rejected": -3.397282600402832, + "step": 14484 + }, + { + "epoch": 3.62, + "grad_norm": 5.81619930267334, + "learning_rate": 1.7552258283836615e-06, + "logits/chosen": -0.6167963743209839, + "logits/rejected": -0.6632490754127502, + "logps/chosen": -48.77834701538086, + "logps/rejected": -105.58476257324219, + "loss": 0.5812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9186806678771973, + "rewards/margins": 7.096622943878174, + "rewards/rejected": -4.177942752838135, + "step": 14485 + }, + { + "epoch": 3.62, + "grad_norm": 3.5240590572357178, + "learning_rate": 1.7546278679002292e-06, + "logits/chosen": -0.48263978958129883, + "logits/rejected": -0.6089221239089966, + "logps/chosen": -60.91471862792969, + "logps/rejected": -77.2326431274414, + "loss": 0.6201, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2552621364593506, + "rewards/margins": 6.970297336578369, + "rewards/rejected": -3.7150347232818604, + "step": 14486 + }, + { + "epoch": 3.62, + "grad_norm": 3.1054599285125732, + "learning_rate": 1.7540299876133949e-06, + "logits/chosen": -0.5348564386367798, + "logits/rejected": -0.6000261306762695, + "logps/chosen": -49.46672821044922, + "logps/rejected": -94.20338439941406, + "loss": 0.5688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3230390548706055, + "rewards/margins": 7.401954650878906, + "rewards/rejected": -4.078915596008301, + "step": 14487 + }, + { + "epoch": 3.62, + "grad_norm": 3.9118337631225586, + "learning_rate": 1.7534321875379307e-06, + "logits/chosen": -0.5352977514266968, + "logits/rejected": -0.5980353355407715, + "logps/chosen": -60.54051208496094, + "logps/rejected": -120.15396881103516, + "loss": 0.627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2174856662750244, + "rewards/margins": 7.922682762145996, + "rewards/rejected": -4.705195903778076, + "step": 14488 + }, + { + "epoch": 3.62, + "grad_norm": 2.9042513370513916, + "learning_rate": 1.752834467688611e-06, + "logits/chosen": -0.5438128709793091, + "logits/rejected": -0.6323248744010925, + "logps/chosen": -53.36896896362305, + "logps/rejected": -104.70893096923828, + "loss": 0.5644, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.41713809967041, + "rewards/margins": 7.799220085144043, + "rewards/rejected": -4.382081985473633, + "step": 14489 + }, + { + "epoch": 3.62, + "grad_norm": 2.8010683059692383, + "learning_rate": 1.7522368280802048e-06, + "logits/chosen": -0.5731292366981506, + "logits/rejected": -0.6665441393852234, + "logps/chosen": -55.99369430541992, + "logps/rejected": -109.22522735595703, + "loss": 0.6087, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150001287460327, + "rewards/margins": 7.626171588897705, + "rewards/rejected": -4.476170539855957, + "step": 14490 + }, + { + "epoch": 3.63, + "grad_norm": 21.015941619873047, + "learning_rate": 1.7516392687274825e-06, + "logits/chosen": -0.5031691193580627, + "logits/rejected": -0.6242036819458008, + "logps/chosen": -63.37644958496094, + "logps/rejected": -119.03799438476562, + "loss": 0.547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3049230575561523, + "rewards/margins": 8.666156768798828, + "rewards/rejected": -5.361234664916992, + "step": 14491 + }, + { + "epoch": 3.63, + "grad_norm": 1.5534260272979736, + "learning_rate": 1.7510417896452086e-06, + "logits/chosen": -0.5387879014015198, + "logits/rejected": -0.6672609448432922, + "logps/chosen": -51.20848846435547, + "logps/rejected": -87.38063049316406, + "loss": 0.5488, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2998197078704834, + "rewards/margins": 7.81056547164917, + "rewards/rejected": -4.510745525360107, + "step": 14492 + }, + { + "epoch": 3.63, + "grad_norm": 6.24782657623291, + "learning_rate": 1.750444390848146e-06, + "logits/chosen": -0.5235630869865417, + "logits/rejected": -0.5732899308204651, + "logps/chosen": -54.306095123291016, + "logps/rejected": -113.26667022705078, + "loss": 0.6203, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.338141918182373, + "rewards/margins": 6.6092143058776855, + "rewards/rejected": -3.2710721492767334, + "step": 14493 + }, + { + "epoch": 3.63, + "grad_norm": 8.838311195373535, + "learning_rate": 1.7498470723510613e-06, + "logits/chosen": -0.5183629393577576, + "logits/rejected": -0.6179943680763245, + "logps/chosen": -57.932090759277344, + "logps/rejected": -95.68939971923828, + "loss": 0.6492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8924646377563477, + "rewards/margins": 6.880531311035156, + "rewards/rejected": -3.9880669116973877, + "step": 14494 + }, + { + "epoch": 3.63, + "grad_norm": 6.822883605957031, + "learning_rate": 1.7492498341687104e-06, + "logits/chosen": -0.4928186535835266, + "logits/rejected": -0.562642514705658, + "logps/chosen": -65.39141845703125, + "logps/rejected": -98.54117584228516, + "loss": 0.6707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.164271831512451, + "rewards/margins": 5.237768650054932, + "rewards/rejected": -2.073496103286743, + "step": 14495 + }, + { + "epoch": 3.63, + "grad_norm": 8.118932723999023, + "learning_rate": 1.7486526763158551e-06, + "logits/chosen": -0.5124391913414001, + "logits/rejected": -0.6170677542686462, + "logps/chosen": -56.58294677734375, + "logps/rejected": -121.54251098632812, + "loss": 0.6303, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.063430070877075, + "rewards/margins": 6.5680999755859375, + "rewards/rejected": -3.5046701431274414, + "step": 14496 + }, + { + "epoch": 3.63, + "grad_norm": 4.196569919586182, + "learning_rate": 1.7480555988072494e-06, + "logits/chosen": -0.5132020711898804, + "logits/rejected": -0.5787987112998962, + "logps/chosen": -54.41899108886719, + "logps/rejected": -108.21682739257812, + "loss": 0.5798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0757906436920166, + "rewards/margins": 6.90203857421875, + "rewards/rejected": -3.8262481689453125, + "step": 14497 + }, + { + "epoch": 3.63, + "grad_norm": 9.930472373962402, + "learning_rate": 1.74745860165765e-06, + "logits/chosen": -0.4654255509376526, + "logits/rejected": -0.5330650806427002, + "logps/chosen": -55.780029296875, + "logps/rejected": -102.76695251464844, + "loss": 0.6442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0251264572143555, + "rewards/margins": 6.950502872467041, + "rewards/rejected": -3.9253764152526855, + "step": 14498 + }, + { + "epoch": 3.63, + "grad_norm": 3.5830047130584717, + "learning_rate": 1.7468616848818082e-06, + "logits/chosen": -0.5248876214027405, + "logits/rejected": -0.628844141960144, + "logps/chosen": -59.92314910888672, + "logps/rejected": -111.52203369140625, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.125344753265381, + "rewards/margins": 7.672317981719971, + "rewards/rejected": -4.54697322845459, + "step": 14499 + }, + { + "epoch": 3.63, + "grad_norm": 8.79246711730957, + "learning_rate": 1.7462648484944727e-06, + "logits/chosen": -0.5398188829421997, + "logits/rejected": -0.611294150352478, + "logps/chosen": -53.13870620727539, + "logps/rejected": -106.08653259277344, + "loss": 0.6941, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.170285701751709, + "rewards/margins": 6.458142280578613, + "rewards/rejected": -3.2878570556640625, + "step": 14500 + }, + { + "epoch": 3.63, + "grad_norm": 3.332381010055542, + "learning_rate": 1.7456680925103953e-06, + "logits/chosen": -0.5292312502861023, + "logits/rejected": -0.6199800372123718, + "logps/chosen": -53.404876708984375, + "logps/rejected": -96.47402954101562, + "loss": 0.5995, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2295262813568115, + "rewards/margins": 6.8448805809021, + "rewards/rejected": -3.615353584289551, + "step": 14501 + }, + { + "epoch": 3.63, + "grad_norm": 4.293196678161621, + "learning_rate": 1.7450714169443211e-06, + "logits/chosen": -0.5189225077629089, + "logits/rejected": -0.6338732242584229, + "logps/chosen": -63.332000732421875, + "logps/rejected": -96.99055480957031, + "loss": 0.7163, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.003267288208008, + "rewards/margins": 6.725675582885742, + "rewards/rejected": -3.7224082946777344, + "step": 14502 + }, + { + "epoch": 3.63, + "grad_norm": 5.859836578369141, + "learning_rate": 1.744474821810992e-06, + "logits/chosen": -0.598068356513977, + "logits/rejected": -0.7072202563285828, + "logps/chosen": -57.6063346862793, + "logps/rejected": -87.92118835449219, + "loss": 0.7254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.300316572189331, + "rewards/margins": 7.049888610839844, + "rewards/rejected": -3.74957275390625, + "step": 14503 + }, + { + "epoch": 3.63, + "grad_norm": 4.988438129425049, + "learning_rate": 1.743878307125153e-06, + "logits/chosen": -0.589932918548584, + "logits/rejected": -0.6703939437866211, + "logps/chosen": -68.78374481201172, + "logps/rejected": -96.8822250366211, + "loss": 0.7579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0783801078796387, + "rewards/margins": 7.560669422149658, + "rewards/rejected": -4.4822893142700195, + "step": 14504 + }, + { + "epoch": 3.63, + "grad_norm": 8.409066200256348, + "learning_rate": 1.7432818729015482e-06, + "logits/chosen": -0.5520033836364746, + "logits/rejected": -0.656835675239563, + "logps/chosen": -56.8839111328125, + "logps/rejected": -104.10260009765625, + "loss": 0.6354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9799532890319824, + "rewards/margins": 7.5603790283203125, + "rewards/rejected": -4.58042573928833, + "step": 14505 + }, + { + "epoch": 3.63, + "grad_norm": 6.953971862792969, + "learning_rate": 1.7426855191549085e-06, + "logits/chosen": -0.5173730254173279, + "logits/rejected": -0.6061099171638489, + "logps/chosen": -57.56951904296875, + "logps/rejected": -104.4142837524414, + "loss": 0.6715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017465114593506, + "rewards/margins": 6.782952785491943, + "rewards/rejected": -3.7654876708984375, + "step": 14506 + }, + { + "epoch": 3.63, + "grad_norm": 4.752630233764648, + "learning_rate": 1.7420892458999745e-06, + "logits/chosen": -0.6484760642051697, + "logits/rejected": -0.7676876783370972, + "logps/chosen": -58.37196731567383, + "logps/rejected": -91.83975982666016, + "loss": 0.6939, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7096991539001465, + "rewards/margins": 6.468594074249268, + "rewards/rejected": -3.758894920349121, + "step": 14507 + }, + { + "epoch": 3.63, + "grad_norm": 4.362245082855225, + "learning_rate": 1.7414930531514818e-06, + "logits/chosen": -0.6275450587272644, + "logits/rejected": -0.6475032567977905, + "logps/chosen": -48.99638366699219, + "logps/rejected": -106.9742431640625, + "loss": 0.6227, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2633938789367676, + "rewards/margins": 7.195814609527588, + "rewards/rejected": -3.9324212074279785, + "step": 14508 + }, + { + "epoch": 3.63, + "grad_norm": 12.893998146057129, + "learning_rate": 1.7408969409241621e-06, + "logits/chosen": -0.5728450417518616, + "logits/rejected": -0.6665832996368408, + "logps/chosen": -48.248321533203125, + "logps/rejected": -104.6500015258789, + "loss": 0.7198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051372528076172, + "rewards/margins": 7.52090311050415, + "rewards/rejected": -4.4695305824279785, + "step": 14509 + }, + { + "epoch": 3.63, + "grad_norm": 3.1448287963867188, + "learning_rate": 1.7403009092327438e-06, + "logits/chosen": -0.5713073015213013, + "logits/rejected": -0.6371811032295227, + "logps/chosen": -49.387203216552734, + "logps/rejected": -93.15130615234375, + "loss": 0.6247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1311092376708984, + "rewards/margins": 7.08356237411499, + "rewards/rejected": -3.9524528980255127, + "step": 14510 + }, + { + "epoch": 3.63, + "grad_norm": 4.643585205078125, + "learning_rate": 1.739704958091959e-06, + "logits/chosen": -0.52495938539505, + "logits/rejected": -0.5775099992752075, + "logps/chosen": -55.73028564453125, + "logps/rejected": -112.01260375976562, + "loss": 0.6335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2784295082092285, + "rewards/margins": 6.934876441955566, + "rewards/rejected": -3.656446933746338, + "step": 14511 + }, + { + "epoch": 3.63, + "grad_norm": 4.5237345695495605, + "learning_rate": 1.7391090875165317e-06, + "logits/chosen": -0.506793737411499, + "logits/rejected": -0.6067326068878174, + "logps/chosen": -60.65040588378906, + "logps/rejected": -106.55941009521484, + "loss": 0.6422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.668544292449951, + "rewards/margins": 6.29410982131958, + "rewards/rejected": -3.625565528869629, + "step": 14512 + }, + { + "epoch": 3.63, + "grad_norm": 4.810216426849365, + "learning_rate": 1.7385132975211866e-06, + "logits/chosen": -0.6026073694229126, + "logits/rejected": -0.6221110820770264, + "logps/chosen": -36.91049575805664, + "logps/rejected": -114.63400268554688, + "loss": 0.5195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.122039556503296, + "rewards/margins": 6.997280120849609, + "rewards/rejected": -3.875239849090576, + "step": 14513 + }, + { + "epoch": 3.63, + "grad_norm": 6.603190898895264, + "learning_rate": 1.7379175881206477e-06, + "logits/chosen": -0.5273079872131348, + "logits/rejected": -0.6346617937088013, + "logps/chosen": -54.779014587402344, + "logps/rejected": -89.9056625366211, + "loss": 0.6798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2170190811157227, + "rewards/margins": 7.741357803344727, + "rewards/rejected": -4.5243377685546875, + "step": 14514 + }, + { + "epoch": 3.63, + "grad_norm": 7.206642150878906, + "learning_rate": 1.7373219593296343e-06, + "logits/chosen": -0.465368390083313, + "logits/rejected": -0.5927669405937195, + "logps/chosen": -59.572967529296875, + "logps/rejected": -104.75021362304688, + "loss": 0.6205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017178535461426, + "rewards/margins": 7.273935317993164, + "rewards/rejected": -4.2567572593688965, + "step": 14515 + }, + { + "epoch": 3.63, + "grad_norm": 2.427931547164917, + "learning_rate": 1.7367264111628669e-06, + "logits/chosen": -0.5511513352394104, + "logits/rejected": -0.6799573302268982, + "logps/chosen": -55.1888427734375, + "logps/rejected": -117.67625427246094, + "loss": 0.5777, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3908233642578125, + "rewards/margins": 8.312376022338867, + "rewards/rejected": -4.921553134918213, + "step": 14516 + }, + { + "epoch": 3.63, + "grad_norm": 6.759720802307129, + "learning_rate": 1.7361309436350603e-06, + "logits/chosen": -0.5169981122016907, + "logits/rejected": -0.592695951461792, + "logps/chosen": -57.645111083984375, + "logps/rejected": -98.66896057128906, + "loss": 0.677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9041495323181152, + "rewards/margins": 5.822962284088135, + "rewards/rejected": -2.9188127517700195, + "step": 14517 + }, + { + "epoch": 3.63, + "grad_norm": 3.9264421463012695, + "learning_rate": 1.7355355567609284e-06, + "logits/chosen": -0.5983381271362305, + "logits/rejected": -0.6382333040237427, + "logps/chosen": -53.41276931762695, + "logps/rejected": -122.75391387939453, + "loss": 0.6568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.929440975189209, + "rewards/margins": 7.360696792602539, + "rewards/rejected": -4.431256294250488, + "step": 14518 + }, + { + "epoch": 3.63, + "grad_norm": 6.524077892303467, + "learning_rate": 1.7349402505551871e-06, + "logits/chosen": -0.6026633381843567, + "logits/rejected": -0.6662247776985168, + "logps/chosen": -45.31177520751953, + "logps/rejected": -85.80121612548828, + "loss": 0.7627, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0351197719573975, + "rewards/margins": 6.833750247955322, + "rewards/rejected": -3.798630714416504, + "step": 14519 + }, + { + "epoch": 3.63, + "grad_norm": 2.7661423683166504, + "learning_rate": 1.7343450250325445e-06, + "logits/chosen": -0.5569135546684265, + "logits/rejected": -0.6228554248809814, + "logps/chosen": -54.710472106933594, + "logps/rejected": -99.23294067382812, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.091400146484375, + "rewards/margins": 7.403325080871582, + "rewards/rejected": -4.311924457550049, + "step": 14520 + }, + { + "epoch": 3.63, + "grad_norm": 5.188694953918457, + "learning_rate": 1.7337498802077085e-06, + "logits/chosen": -0.586055338382721, + "logits/rejected": -0.6463169455528259, + "logps/chosen": -55.30925750732422, + "logps/rejected": -132.55322265625, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.044499397277832, + "rewards/margins": 8.527824401855469, + "rewards/rejected": -5.4833245277404785, + "step": 14521 + }, + { + "epoch": 3.63, + "grad_norm": 17.98410987854004, + "learning_rate": 1.7331548160953871e-06, + "logits/chosen": -0.5466663241386414, + "logits/rejected": -0.6277562975883484, + "logps/chosen": -55.07954406738281, + "logps/rejected": -101.7531509399414, + "loss": 0.6992, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0888524055480957, + "rewards/margins": 6.938676834106445, + "rewards/rejected": -3.8498239517211914, + "step": 14522 + }, + { + "epoch": 3.63, + "grad_norm": 5.324833393096924, + "learning_rate": 1.7325598327102883e-06, + "logits/chosen": -0.5753290057182312, + "logits/rejected": -0.6696574687957764, + "logps/chosen": -49.88629913330078, + "logps/rejected": -113.7000732421875, + "loss": 0.6193, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.207695722579956, + "rewards/margins": 8.04684066772461, + "rewards/rejected": -4.839144706726074, + "step": 14523 + }, + { + "epoch": 3.63, + "grad_norm": 4.222472190856934, + "learning_rate": 1.7319649300671082e-06, + "logits/chosen": -0.5777792930603027, + "logits/rejected": -0.6547292470932007, + "logps/chosen": -56.43855667114258, + "logps/rejected": -113.33826446533203, + "loss": 0.5631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2681527137756348, + "rewards/margins": 7.485284805297852, + "rewards/rejected": -4.217132568359375, + "step": 14524 + }, + { + "epoch": 3.63, + "grad_norm": 4.93524169921875, + "learning_rate": 1.7313701081805506e-06, + "logits/chosen": -0.6113834381103516, + "logits/rejected": -0.7110660672187805, + "logps/chosen": -51.295448303222656, + "logps/rejected": -110.8164291381836, + "loss": 0.5996, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.018526077270508, + "rewards/margins": 6.973284721374512, + "rewards/rejected": -3.954758644104004, + "step": 14525 + }, + { + "epoch": 3.63, + "grad_norm": 3.6410329341888428, + "learning_rate": 1.7307753670653161e-06, + "logits/chosen": -0.5589864253997803, + "logits/rejected": -0.6092108488082886, + "logps/chosen": -55.248321533203125, + "logps/rejected": -109.72554016113281, + "loss": 0.6476, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.080014228820801, + "rewards/margins": 7.314960956573486, + "rewards/rejected": -4.234947204589844, + "step": 14526 + }, + { + "epoch": 3.63, + "grad_norm": 8.330578804016113, + "learning_rate": 1.7301807067360992e-06, + "logits/chosen": -0.5610004663467407, + "logits/rejected": -0.5878310203552246, + "logps/chosen": -52.1384162902832, + "logps/rejected": -98.15621948242188, + "loss": 0.6814, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2510876655578613, + "rewards/margins": 6.279767036437988, + "rewards/rejected": -3.028679132461548, + "step": 14527 + }, + { + "epoch": 3.63, + "grad_norm": 18.237567901611328, + "learning_rate": 1.7295861272075937e-06, + "logits/chosen": -0.5325778126716614, + "logits/rejected": -0.6266250610351562, + "logps/chosen": -64.7795181274414, + "logps/rejected": -107.27217864990234, + "loss": 0.7824, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.15112566947937, + "rewards/margins": 8.055644035339355, + "rewards/rejected": -4.904519081115723, + "step": 14528 + }, + { + "epoch": 3.63, + "grad_norm": 7.362378120422363, + "learning_rate": 1.7289916284944947e-06, + "logits/chosen": -0.5565255284309387, + "logits/rejected": -0.6746036410331726, + "logps/chosen": -64.2363052368164, + "logps/rejected": -82.57235717773438, + "loss": 0.7067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0019099712371826, + "rewards/margins": 5.999198913574219, + "rewards/rejected": -2.9972896575927734, + "step": 14529 + }, + { + "epoch": 3.63, + "grad_norm": 2.2608935832977295, + "learning_rate": 1.7283972106114922e-06, + "logits/chosen": -0.533220112323761, + "logits/rejected": -0.6360360383987427, + "logps/chosen": -60.232322692871094, + "logps/rejected": -99.60075378417969, + "loss": 0.5851, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.302323818206787, + "rewards/margins": 7.840574741363525, + "rewards/rejected": -4.538249492645264, + "step": 14530 + }, + { + "epoch": 3.64, + "grad_norm": 4.781877040863037, + "learning_rate": 1.7278028735732727e-06, + "logits/chosen": -0.6084010601043701, + "logits/rejected": -0.6220203638076782, + "logps/chosen": -45.251407623291016, + "logps/rejected": -103.96051025390625, + "loss": 0.5582, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.968545436859131, + "rewards/margins": 6.81727933883667, + "rewards/rejected": -3.848733901977539, + "step": 14531 + }, + { + "epoch": 3.64, + "grad_norm": 4.715765953063965, + "learning_rate": 1.7272086173945263e-06, + "logits/chosen": -0.5809628963470459, + "logits/rejected": -0.6317875981330872, + "logps/chosen": -55.03807830810547, + "logps/rejected": -108.1782455444336, + "loss": 0.6495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1314857006073, + "rewards/margins": 7.08780574798584, + "rewards/rejected": -3.95632004737854, + "step": 14532 + }, + { + "epoch": 3.64, + "grad_norm": 7.133229732513428, + "learning_rate": 1.7266144420899344e-06, + "logits/chosen": -0.603073239326477, + "logits/rejected": -0.7079575061798096, + "logps/chosen": -50.877967834472656, + "logps/rejected": -117.71033477783203, + "loss": 0.6296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9358134269714355, + "rewards/margins": 7.7653608322143555, + "rewards/rejected": -4.829546928405762, + "step": 14533 + }, + { + "epoch": 3.64, + "grad_norm": 4.075445175170898, + "learning_rate": 1.7260203476741833e-06, + "logits/chosen": -0.5866512060165405, + "logits/rejected": -0.6719851493835449, + "logps/chosen": -55.45888900756836, + "logps/rejected": -94.12344360351562, + "loss": 0.5791, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188220739364624, + "rewards/margins": 7.104711055755615, + "rewards/rejected": -3.916491985321045, + "step": 14534 + }, + { + "epoch": 3.64, + "grad_norm": 10.969317436218262, + "learning_rate": 1.7254263341619515e-06, + "logits/chosen": -0.5808464288711548, + "logits/rejected": -0.6222525238990784, + "logps/chosen": -43.877655029296875, + "logps/rejected": -113.29723358154297, + "loss": 0.777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.80551815032959, + "rewards/margins": 7.208950042724609, + "rewards/rejected": -4.4034318923950195, + "step": 14535 + }, + { + "epoch": 3.64, + "grad_norm": 8.107131004333496, + "learning_rate": 1.7248324015679162e-06, + "logits/chosen": -0.5644294023513794, + "logits/rejected": -0.6536555290222168, + "logps/chosen": -76.57212829589844, + "logps/rejected": -100.89582061767578, + "loss": 0.6679, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8488214015960693, + "rewards/margins": 6.583892822265625, + "rewards/rejected": -3.735071897506714, + "step": 14536 + }, + { + "epoch": 3.64, + "grad_norm": 3.7692654132843018, + "learning_rate": 1.7242385499067576e-06, + "logits/chosen": -0.5492969751358032, + "logits/rejected": -0.6139305830001831, + "logps/chosen": -52.53443145751953, + "logps/rejected": -113.61785888671875, + "loss": 0.5683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4020681381225586, + "rewards/margins": 7.258636474609375, + "rewards/rejected": -3.8565688133239746, + "step": 14537 + }, + { + "epoch": 3.64, + "grad_norm": 9.541450500488281, + "learning_rate": 1.723644779193147e-06, + "logits/chosen": -0.4969223737716675, + "logits/rejected": -0.562981903553009, + "logps/chosen": -58.1767692565918, + "logps/rejected": -111.78592681884766, + "loss": 0.7164, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2301738262176514, + "rewards/margins": 6.663006782531738, + "rewards/rejected": -3.432833194732666, + "step": 14538 + }, + { + "epoch": 3.64, + "grad_norm": 3.922879457473755, + "learning_rate": 1.7230510894417607e-06, + "logits/chosen": -0.5488786697387695, + "logits/rejected": -0.6269937753677368, + "logps/chosen": -49.68745803833008, + "logps/rejected": -98.6568832397461, + "loss": 0.6431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.839918613433838, + "rewards/margins": 6.462674140930176, + "rewards/rejected": -3.622755289077759, + "step": 14539 + }, + { + "epoch": 3.64, + "grad_norm": 9.135459899902344, + "learning_rate": 1.7224574806672662e-06, + "logits/chosen": -0.4657900333404541, + "logits/rejected": -0.5869550108909607, + "logps/chosen": -50.79830551147461, + "logps/rejected": -107.67637634277344, + "loss": 0.5581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.237863063812256, + "rewards/margins": 7.820959091186523, + "rewards/rejected": -4.583096027374268, + "step": 14540 + }, + { + "epoch": 3.64, + "grad_norm": 2.191143035888672, + "learning_rate": 1.7218639528843345e-06, + "logits/chosen": -0.4886685907840729, + "logits/rejected": -0.619247317314148, + "logps/chosen": -56.99956512451172, + "logps/rejected": -112.43994903564453, + "loss": 0.5284, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0395100116729736, + "rewards/margins": 8.218156814575195, + "rewards/rejected": -5.178647041320801, + "step": 14541 + }, + { + "epoch": 3.64, + "grad_norm": 20.23396110534668, + "learning_rate": 1.7212705061076313e-06, + "logits/chosen": -0.6374439597129822, + "logits/rejected": -0.6687684059143066, + "logps/chosen": -49.670772552490234, + "logps/rejected": -110.36357116699219, + "loss": 0.7085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8589279651641846, + "rewards/margins": 6.701611518859863, + "rewards/rejected": -3.842684268951416, + "step": 14542 + }, + { + "epoch": 3.64, + "grad_norm": 3.543332815170288, + "learning_rate": 1.7206771403518196e-06, + "logits/chosen": -0.5241754651069641, + "logits/rejected": -0.5932795405387878, + "logps/chosen": -58.91001892089844, + "logps/rejected": -102.26504516601562, + "loss": 0.6012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9771621227264404, + "rewards/margins": 6.62441873550415, + "rewards/rejected": -3.647256851196289, + "step": 14543 + }, + { + "epoch": 3.64, + "grad_norm": 6.375813961029053, + "learning_rate": 1.7200838556315658e-06, + "logits/chosen": -0.5150030851364136, + "logits/rejected": -0.5651712417602539, + "logps/chosen": -54.41211700439453, + "logps/rejected": -117.56448364257812, + "loss": 0.6314, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0433106422424316, + "rewards/margins": 6.846783638000488, + "rewards/rejected": -3.8034722805023193, + "step": 14544 + }, + { + "epoch": 3.64, + "grad_norm": 5.245820999145508, + "learning_rate": 1.7194906519615284e-06, + "logits/chosen": -0.598378598690033, + "logits/rejected": -0.6498234868049622, + "logps/chosen": -45.531734466552734, + "logps/rejected": -100.19226837158203, + "loss": 0.6621, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3359262943267822, + "rewards/margins": 7.498207092285156, + "rewards/rejected": -4.162280559539795, + "step": 14545 + }, + { + "epoch": 3.64, + "grad_norm": 5.228696346282959, + "learning_rate": 1.7188975293563648e-06, + "logits/chosen": -0.49875950813293457, + "logits/rejected": -0.5888513326644897, + "logps/chosen": -56.81620788574219, + "logps/rejected": -103.76274108886719, + "loss": 0.6341, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.196645736694336, + "rewards/margins": 7.454448699951172, + "rewards/rejected": -4.257802963256836, + "step": 14546 + }, + { + "epoch": 3.64, + "grad_norm": 3.7222495079040527, + "learning_rate": 1.7183044878307348e-06, + "logits/chosen": -0.5021829605102539, + "logits/rejected": -0.596477746963501, + "logps/chosen": -62.36626434326172, + "logps/rejected": -101.86451721191406, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.12400484085083, + "rewards/margins": 6.362979412078857, + "rewards/rejected": -3.2389745712280273, + "step": 14547 + }, + { + "epoch": 3.64, + "grad_norm": 3.4612317085266113, + "learning_rate": 1.7177115273992916e-06, + "logits/chosen": -0.5631623864173889, + "logits/rejected": -0.6446565389633179, + "logps/chosen": -50.51389694213867, + "logps/rejected": -108.88174438476562, + "loss": 0.5042, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.014578342437744, + "rewards/margins": 7.410346984863281, + "rewards/rejected": -4.395769119262695, + "step": 14548 + }, + { + "epoch": 3.64, + "grad_norm": 2.2335081100463867, + "learning_rate": 1.717118648076686e-06, + "logits/chosen": -0.5856335759162903, + "logits/rejected": -0.7117547988891602, + "logps/chosen": -48.54386520385742, + "logps/rejected": -85.8133544921875, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1240463256835938, + "rewards/margins": 6.569949150085449, + "rewards/rejected": -3.4459028244018555, + "step": 14549 + }, + { + "epoch": 3.64, + "grad_norm": 2.426252603530884, + "learning_rate": 1.7165258498775705e-06, + "logits/chosen": -0.46870434284210205, + "logits/rejected": -0.5738627314567566, + "logps/chosen": -49.06718444824219, + "logps/rejected": -101.50755310058594, + "loss": 0.5432, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.282050609588623, + "rewards/margins": 7.901110649108887, + "rewards/rejected": -4.619060039520264, + "step": 14550 + }, + { + "epoch": 3.64, + "grad_norm": 4.273910999298096, + "learning_rate": 1.7159331328165968e-06, + "logits/chosen": -0.527553141117096, + "logits/rejected": -0.6294399499893188, + "logps/chosen": -48.593544006347656, + "logps/rejected": -95.49744415283203, + "loss": 0.5955, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3466219902038574, + "rewards/margins": 6.8836259841918945, + "rewards/rejected": -3.537003755569458, + "step": 14551 + }, + { + "epoch": 3.64, + "grad_norm": 5.2450642585754395, + "learning_rate": 1.7153404969084054e-06, + "logits/chosen": -0.48457837104797363, + "logits/rejected": -0.5655626058578491, + "logps/chosen": -61.71273422241211, + "logps/rejected": -103.24203491210938, + "loss": 0.7105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4196267127990723, + "rewards/margins": 7.339735507965088, + "rewards/rejected": -3.9201085567474365, + "step": 14552 + }, + { + "epoch": 3.64, + "grad_norm": 3.3034956455230713, + "learning_rate": 1.7147479421676438e-06, + "logits/chosen": -0.4959977865219116, + "logits/rejected": -0.6047300100326538, + "logps/chosen": -56.328468322753906, + "logps/rejected": -106.41000366210938, + "loss": 0.5872, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1805577278137207, + "rewards/margins": 8.288554191589355, + "rewards/rejected": -5.107996463775635, + "step": 14553 + }, + { + "epoch": 3.64, + "grad_norm": 5.443948745727539, + "learning_rate": 1.7141554686089568e-06, + "logits/chosen": -0.5723726749420166, + "logits/rejected": -0.6558108329772949, + "logps/chosen": -42.095340728759766, + "logps/rejected": -125.67142486572266, + "loss": 0.5166, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.072732448577881, + "rewards/margins": 7.738414764404297, + "rewards/rejected": -4.665682792663574, + "step": 14554 + }, + { + "epoch": 3.64, + "grad_norm": 3.12618350982666, + "learning_rate": 1.7135630762469828e-06, + "logits/chosen": -0.5478724241256714, + "logits/rejected": -0.6743592619895935, + "logps/chosen": -56.3287239074707, + "logps/rejected": -91.0962142944336, + "loss": 0.5728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100512742996216, + "rewards/margins": 7.592155456542969, + "rewards/rejected": -4.491641998291016, + "step": 14555 + }, + { + "epoch": 3.64, + "grad_norm": 10.967479705810547, + "learning_rate": 1.7129707650963594e-06, + "logits/chosen": -0.5381742119789124, + "logits/rejected": -0.624048113822937, + "logps/chosen": -59.294464111328125, + "logps/rejected": -121.90454864501953, + "loss": 0.6916, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9547362327575684, + "rewards/margins": 7.285171031951904, + "rewards/rejected": -4.3304338455200195, + "step": 14556 + }, + { + "epoch": 3.64, + "grad_norm": 4.139358997344971, + "learning_rate": 1.7123785351717258e-06, + "logits/chosen": -0.5296906232833862, + "logits/rejected": -0.597143292427063, + "logps/chosen": -72.35369873046875, + "logps/rejected": -115.4830093383789, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.008334159851074, + "rewards/margins": 7.261970520019531, + "rewards/rejected": -4.253636837005615, + "step": 14557 + }, + { + "epoch": 3.64, + "grad_norm": 4.201426029205322, + "learning_rate": 1.7117863864877144e-06, + "logits/chosen": -0.5741134285926819, + "logits/rejected": -0.7173745036125183, + "logps/chosen": -58.90974807739258, + "logps/rejected": -99.17782592773438, + "loss": 0.5072, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.14020037651062, + "rewards/margins": 7.8721537590026855, + "rewards/rejected": -4.731953144073486, + "step": 14558 + }, + { + "epoch": 3.64, + "grad_norm": 3.662271022796631, + "learning_rate": 1.7111943190589603e-06, + "logits/chosen": -0.5343963503837585, + "logits/rejected": -0.6184598207473755, + "logps/chosen": -47.735374450683594, + "logps/rejected": -97.29290008544922, + "loss": 0.5857, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8458030223846436, + "rewards/margins": 6.505805969238281, + "rewards/rejected": -3.6600027084350586, + "step": 14559 + }, + { + "epoch": 3.64, + "grad_norm": 9.81342887878418, + "learning_rate": 1.7106023329000932e-06, + "logits/chosen": -0.4807151257991791, + "logits/rejected": -0.603055477142334, + "logps/chosen": -57.05714416503906, + "logps/rejected": -101.7081069946289, + "loss": 0.6668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5987613201141357, + "rewards/margins": 6.3224639892578125, + "rewards/rejected": -3.7237026691436768, + "step": 14560 + }, + { + "epoch": 3.64, + "grad_norm": 4.608489990234375, + "learning_rate": 1.710010428025739e-06, + "logits/chosen": -0.48892366886138916, + "logits/rejected": -0.5635954737663269, + "logps/chosen": -55.03668975830078, + "logps/rejected": -103.52058410644531, + "loss": 0.5996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.97672700881958, + "rewards/margins": 6.941919803619385, + "rewards/rejected": -3.9651925563812256, + "step": 14561 + }, + { + "epoch": 3.64, + "grad_norm": 5.137314319610596, + "learning_rate": 1.7094186044505284e-06, + "logits/chosen": -0.5176690816879272, + "logits/rejected": -0.6349849104881287, + "logps/chosen": -63.90526580810547, + "logps/rejected": -89.59313201904297, + "loss": 0.6837, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8516836166381836, + "rewards/margins": 6.388575553894043, + "rewards/rejected": -3.536891460418701, + "step": 14562 + }, + { + "epoch": 3.64, + "grad_norm": 2.8737051486968994, + "learning_rate": 1.708826862189084e-06, + "logits/chosen": -0.4635896682739258, + "logits/rejected": -0.5454862713813782, + "logps/chosen": -58.66189956665039, + "logps/rejected": -119.92716979980469, + "loss": 0.6529, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.205592155456543, + "rewards/margins": 8.37551498413086, + "rewards/rejected": -5.169922351837158, + "step": 14563 + }, + { + "epoch": 3.64, + "grad_norm": 7.755318641662598, + "learning_rate": 1.7082352012560266e-06, + "logits/chosen": -0.5719797611236572, + "logits/rejected": -0.642691969871521, + "logps/chosen": -54.023372650146484, + "logps/rejected": -101.75633239746094, + "loss": 0.7245, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946927785873413, + "rewards/margins": 7.221917152404785, + "rewards/rejected": -4.274989604949951, + "step": 14564 + }, + { + "epoch": 3.64, + "grad_norm": 4.206839561462402, + "learning_rate": 1.7076436216659791e-06, + "logits/chosen": -0.5708082914352417, + "logits/rejected": -0.6378999948501587, + "logps/chosen": -53.753929138183594, + "logps/rejected": -98.55389404296875, + "loss": 0.6242, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2709431648254395, + "rewards/margins": 6.828122138977051, + "rewards/rejected": -3.5571787357330322, + "step": 14565 + }, + { + "epoch": 3.64, + "grad_norm": 5.135108470916748, + "learning_rate": 1.707052123433563e-06, + "logits/chosen": -0.5684842467308044, + "logits/rejected": -0.6258566379547119, + "logps/chosen": -48.957008361816406, + "logps/rejected": -103.95308685302734, + "loss": 0.591, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0945982933044434, + "rewards/margins": 6.395281791687012, + "rewards/rejected": -3.3006837368011475, + "step": 14566 + }, + { + "epoch": 3.64, + "grad_norm": 55.583702087402344, + "learning_rate": 1.7064607065733874e-06, + "logits/chosen": -0.47720959782600403, + "logits/rejected": -0.5540234446525574, + "logps/chosen": -60.806697845458984, + "logps/rejected": -105.34390258789062, + "loss": 0.7712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0035769939422607, + "rewards/margins": 7.181196212768555, + "rewards/rejected": -4.177619934082031, + "step": 14567 + }, + { + "epoch": 3.64, + "grad_norm": 3.165403366088867, + "learning_rate": 1.7058693711000719e-06, + "logits/chosen": -0.5473551750183105, + "logits/rejected": -0.6587211489677429, + "logps/chosen": -50.385658264160156, + "logps/rejected": -94.09089660644531, + "loss": 0.5265, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.525662660598755, + "rewards/margins": 8.074793815612793, + "rewards/rejected": -4.549130916595459, + "step": 14568 + }, + { + "epoch": 3.64, + "grad_norm": 5.618889331817627, + "learning_rate": 1.705278117028229e-06, + "logits/chosen": -0.5185074806213379, + "logits/rejected": -0.6036736965179443, + "logps/chosen": -63.10784149169922, + "logps/rejected": -93.52212524414062, + "loss": 0.7274, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.299729824066162, + "rewards/margins": 6.161399841308594, + "rewards/rejected": -2.8616697788238525, + "step": 14569 + }, + { + "epoch": 3.64, + "grad_norm": 10.28276538848877, + "learning_rate": 1.7046869443724684e-06, + "logits/chosen": -0.5501586198806763, + "logits/rejected": -0.6885306239128113, + "logps/chosen": -56.00205612182617, + "logps/rejected": -96.28992462158203, + "loss": 0.672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.062528371810913, + "rewards/margins": 6.8758978843688965, + "rewards/rejected": -3.8133697509765625, + "step": 14570 + }, + { + "epoch": 3.65, + "grad_norm": 3.6621382236480713, + "learning_rate": 1.7040958531473967e-06, + "logits/chosen": -0.5665649175643921, + "logits/rejected": -0.6647571921348572, + "logps/chosen": -55.49998474121094, + "logps/rejected": -110.08528137207031, + "loss": 0.5997, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093658447265625, + "rewards/margins": 7.9351983070373535, + "rewards/rejected": -4.8415398597717285, + "step": 14571 + }, + { + "epoch": 3.65, + "grad_norm": 5.1113600730896, + "learning_rate": 1.7035048433676237e-06, + "logits/chosen": -0.5737701654434204, + "logits/rejected": -0.6252530813217163, + "logps/chosen": -46.222267150878906, + "logps/rejected": -109.82252502441406, + "loss": 0.5805, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1260695457458496, + "rewards/margins": 7.051576614379883, + "rewards/rejected": -3.925507068634033, + "step": 14572 + }, + { + "epoch": 3.65, + "grad_norm": 1.8041677474975586, + "learning_rate": 1.702913915047753e-06, + "logits/chosen": -0.5580658912658691, + "logits/rejected": -0.6342304944992065, + "logps/chosen": -52.26880645751953, + "logps/rejected": -95.0896224975586, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3214147090911865, + "rewards/margins": 7.8728790283203125, + "rewards/rejected": -4.551465034484863, + "step": 14573 + }, + { + "epoch": 3.65, + "grad_norm": 12.95720100402832, + "learning_rate": 1.702323068202384e-06, + "logits/chosen": -0.5203437805175781, + "logits/rejected": -0.6271957159042358, + "logps/chosen": -65.33174896240234, + "logps/rejected": -104.1913833618164, + "loss": 0.7127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.457465648651123, + "rewards/margins": 6.311903476715088, + "rewards/rejected": -3.8544375896453857, + "step": 14574 + }, + { + "epoch": 3.65, + "grad_norm": 2.930100202560425, + "learning_rate": 1.701732302846122e-06, + "logits/chosen": -0.5581378936767578, + "logits/rejected": -0.6190901398658752, + "logps/chosen": -46.08082580566406, + "logps/rejected": -107.62086486816406, + "loss": 0.5426, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.189340829849243, + "rewards/margins": 7.845023155212402, + "rewards/rejected": -4.655681610107422, + "step": 14575 + }, + { + "epoch": 3.65, + "grad_norm": 7.069615840911865, + "learning_rate": 1.701141618993562e-06, + "logits/chosen": -0.5864158272743225, + "logits/rejected": -0.6912553906440735, + "logps/chosen": -47.96709442138672, + "logps/rejected": -100.30821990966797, + "loss": 0.7162, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1245014667510986, + "rewards/margins": 6.902544975280762, + "rewards/rejected": -3.778043031692505, + "step": 14576 + }, + { + "epoch": 3.65, + "grad_norm": 2.9756999015808105, + "learning_rate": 1.7005510166593004e-06, + "logits/chosen": -0.48711133003234863, + "logits/rejected": -0.5794219970703125, + "logps/chosen": -54.99932861328125, + "logps/rejected": -95.80891418457031, + "loss": 0.5764, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.766490936279297, + "rewards/margins": 6.875594139099121, + "rewards/rejected": -4.109103202819824, + "step": 14577 + }, + { + "epoch": 3.65, + "grad_norm": 3.4644827842712402, + "learning_rate": 1.699960495857934e-06, + "logits/chosen": -0.5275003910064697, + "logits/rejected": -0.6192837953567505, + "logps/chosen": -59.09675216674805, + "logps/rejected": -104.78106689453125, + "loss": 0.6087, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0963659286499023, + "rewards/margins": 6.858219146728516, + "rewards/rejected": -3.7618532180786133, + "step": 14578 + }, + { + "epoch": 3.65, + "grad_norm": 5.1871843338012695, + "learning_rate": 1.6993700566040526e-06, + "logits/chosen": -0.5429385304450989, + "logits/rejected": -0.600376546382904, + "logps/chosen": -39.977718353271484, + "logps/rejected": -93.98043060302734, + "loss": 0.5683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.347301959991455, + "rewards/margins": 6.703249931335449, + "rewards/rejected": -3.3559484481811523, + "step": 14579 + }, + { + "epoch": 3.65, + "grad_norm": 3.4528825283050537, + "learning_rate": 1.698779698912249e-06, + "logits/chosen": -0.5073615908622742, + "logits/rejected": -0.6160340309143066, + "logps/chosen": -54.976715087890625, + "logps/rejected": -105.62992858886719, + "loss": 0.6525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0749619007110596, + "rewards/margins": 7.857461929321289, + "rewards/rejected": -4.782500267028809, + "step": 14580 + }, + { + "epoch": 3.65, + "grad_norm": 5.981405258178711, + "learning_rate": 1.698189422797109e-06, + "logits/chosen": -0.5599027872085571, + "logits/rejected": -0.6206958293914795, + "logps/chosen": -60.73346710205078, + "logps/rejected": -125.37818908691406, + "loss": 0.6599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8661675453186035, + "rewards/margins": 7.081661224365234, + "rewards/rejected": -4.215493202209473, + "step": 14581 + }, + { + "epoch": 3.65, + "grad_norm": 4.038088321685791, + "learning_rate": 1.6975992282732217e-06, + "logits/chosen": -0.5791417360305786, + "logits/rejected": -0.6281813383102417, + "logps/chosen": -51.91291427612305, + "logps/rejected": -109.9609603881836, + "loss": 0.5949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015812873840332, + "rewards/margins": 7.137728691101074, + "rewards/rejected": -4.121915817260742, + "step": 14582 + }, + { + "epoch": 3.65, + "grad_norm": 10.118132591247559, + "learning_rate": 1.6970091153551687e-06, + "logits/chosen": -0.4961099326610565, + "logits/rejected": -0.5449742674827576, + "logps/chosen": -55.09784698486328, + "logps/rejected": -110.98986053466797, + "loss": 0.6985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1561617851257324, + "rewards/margins": 6.11677885055542, + "rewards/rejected": -2.9606170654296875, + "step": 14583 + }, + { + "epoch": 3.65, + "grad_norm": 11.667683601379395, + "learning_rate": 1.696419084057535e-06, + "logits/chosen": -0.5644619464874268, + "logits/rejected": -0.6257779598236084, + "logps/chosen": -50.803993225097656, + "logps/rejected": -112.44659423828125, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.104494333267212, + "rewards/margins": 6.994510173797607, + "rewards/rejected": -3.8900153636932373, + "step": 14584 + }, + { + "epoch": 3.65, + "grad_norm": 2.965970754623413, + "learning_rate": 1.6958291343949003e-06, + "logits/chosen": -0.5825519561767578, + "logits/rejected": -0.6287018060684204, + "logps/chosen": -45.39003372192383, + "logps/rejected": -111.04460906982422, + "loss": 0.5419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.999318838119507, + "rewards/margins": 6.287436485290527, + "rewards/rejected": -3.288118362426758, + "step": 14585 + }, + { + "epoch": 3.65, + "grad_norm": 9.38299560546875, + "learning_rate": 1.6952392663818395e-06, + "logits/chosen": -0.5379029512405396, + "logits/rejected": -0.5928758978843689, + "logps/chosen": -51.7257080078125, + "logps/rejected": -106.1895523071289, + "loss": 0.6382, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0061044692993164, + "rewards/margins": 7.171137809753418, + "rewards/rejected": -4.165033340454102, + "step": 14586 + }, + { + "epoch": 3.65, + "grad_norm": 5.899197578430176, + "learning_rate": 1.6946494800329339e-06, + "logits/chosen": -0.5566355586051941, + "logits/rejected": -0.5846058130264282, + "logps/chosen": -49.628257751464844, + "logps/rejected": -125.06099700927734, + "loss": 0.6321, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0150845050811768, + "rewards/margins": 8.33463191986084, + "rewards/rejected": -5.319548606872559, + "step": 14587 + }, + { + "epoch": 3.65, + "grad_norm": 10.44588851928711, + "learning_rate": 1.6940597753627553e-06, + "logits/chosen": -0.48975279927253723, + "logits/rejected": -0.561664342880249, + "logps/chosen": -57.04227828979492, + "logps/rejected": -114.01792907714844, + "loss": 0.6128, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1214475631713867, + "rewards/margins": 7.594517707824707, + "rewards/rejected": -4.473069667816162, + "step": 14588 + }, + { + "epoch": 3.65, + "grad_norm": 4.040463924407959, + "learning_rate": 1.6934701523858742e-06, + "logits/chosen": -0.5637401938438416, + "logits/rejected": -0.6746125817298889, + "logps/chosen": -57.073360443115234, + "logps/rejected": -116.87466430664062, + "loss": 0.6417, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2275092601776123, + "rewards/margins": 8.026304244995117, + "rewards/rejected": -4.798795700073242, + "step": 14589 + }, + { + "epoch": 3.65, + "grad_norm": 2.2701315879821777, + "learning_rate": 1.6928806111168645e-06, + "logits/chosen": -0.4953192472457886, + "logits/rejected": -0.6157213449478149, + "logps/chosen": -63.3277702331543, + "logps/rejected": -104.36929321289062, + "loss": 0.6693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290839910507202, + "rewards/margins": 7.375560760498047, + "rewards/rejected": -4.084721565246582, + "step": 14590 + }, + { + "epoch": 3.65, + "grad_norm": 12.67628288269043, + "learning_rate": 1.6922911515702923e-06, + "logits/chosen": -0.5772404074668884, + "logits/rejected": -0.6517650485038757, + "logps/chosen": -59.257598876953125, + "logps/rejected": -98.29200744628906, + "loss": 0.6454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8450746536254883, + "rewards/margins": 6.417534828186035, + "rewards/rejected": -3.5724599361419678, + "step": 14591 + }, + { + "epoch": 3.65, + "grad_norm": 4.261870861053467, + "learning_rate": 1.6917017737607222e-06, + "logits/chosen": -0.5266528129577637, + "logits/rejected": -0.683436930179596, + "logps/chosen": -55.05017852783203, + "logps/rejected": -118.37301635742188, + "loss": 0.5913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965425968170166, + "rewards/margins": 7.824673652648926, + "rewards/rejected": -4.85924768447876, + "step": 14592 + }, + { + "epoch": 3.65, + "grad_norm": 4.606897830963135, + "learning_rate": 1.69111247770272e-06, + "logits/chosen": -0.5230209827423096, + "logits/rejected": -0.5881466865539551, + "logps/chosen": -49.28848648071289, + "logps/rejected": -113.28263854980469, + "loss": 0.6037, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.188861608505249, + "rewards/margins": 7.583411693572998, + "rewards/rejected": -4.394550323486328, + "step": 14593 + }, + { + "epoch": 3.65, + "grad_norm": 6.8829121589660645, + "learning_rate": 1.6905232634108516e-06, + "logits/chosen": -0.4879654347896576, + "logits/rejected": -0.5077023506164551, + "logps/chosen": -60.263973236083984, + "logps/rejected": -107.49144744873047, + "loss": 0.7569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.030900001525879, + "rewards/margins": 6.047130107879639, + "rewards/rejected": -3.0162298679351807, + "step": 14594 + }, + { + "epoch": 3.65, + "grad_norm": 39.872314453125, + "learning_rate": 1.6899341308996704e-06, + "logits/chosen": -0.5683161616325378, + "logits/rejected": -0.6793217658996582, + "logps/chosen": -63.48173522949219, + "logps/rejected": -110.56795501708984, + "loss": 0.6698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1445624828338623, + "rewards/margins": 7.307591915130615, + "rewards/rejected": -4.16303014755249, + "step": 14595 + }, + { + "epoch": 3.65, + "grad_norm": 12.481633186340332, + "learning_rate": 1.6893450801837368e-06, + "logits/chosen": -0.5474884510040283, + "logits/rejected": -0.6529380083084106, + "logps/chosen": -50.80363082885742, + "logps/rejected": -91.56970977783203, + "loss": 0.5958, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8751964569091797, + "rewards/margins": 7.099955081939697, + "rewards/rejected": -4.224759101867676, + "step": 14596 + }, + { + "epoch": 3.65, + "grad_norm": 4.795498847961426, + "learning_rate": 1.6887561112776101e-06, + "logits/chosen": -0.5042952299118042, + "logits/rejected": -0.578767716884613, + "logps/chosen": -63.104835510253906, + "logps/rejected": -115.17672729492188, + "loss": 0.6371, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.053281545639038, + "rewards/margins": 6.8430681228637695, + "rewards/rejected": -3.7897863388061523, + "step": 14597 + }, + { + "epoch": 3.65, + "grad_norm": 8.976327896118164, + "learning_rate": 1.6881672241958413e-06, + "logits/chosen": -0.5335821509361267, + "logits/rejected": -0.5784913897514343, + "logps/chosen": -46.76040267944336, + "logps/rejected": -113.44010925292969, + "loss": 0.5579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.377509593963623, + "rewards/margins": 7.2388482093811035, + "rewards/rejected": -3.861339569091797, + "step": 14598 + }, + { + "epoch": 3.65, + "grad_norm": 18.54522132873535, + "learning_rate": 1.6875784189529815e-06, + "logits/chosen": -0.5685233473777771, + "logits/rejected": -0.6266756653785706, + "logps/chosen": -55.507293701171875, + "logps/rejected": -112.51632690429688, + "loss": 0.6778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.458702325820923, + "rewards/margins": 7.3042449951171875, + "rewards/rejected": -4.845542907714844, + "step": 14599 + }, + { + "epoch": 3.65, + "grad_norm": 4.088597297668457, + "learning_rate": 1.6869896955635839e-06, + "logits/chosen": -0.5789162516593933, + "logits/rejected": -0.6193632483482361, + "logps/chosen": -52.79347610473633, + "logps/rejected": -107.11187744140625, + "loss": 0.6627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2930874824523926, + "rewards/margins": 7.103892803192139, + "rewards/rejected": -3.8108062744140625, + "step": 14600 + }, + { + "epoch": 3.65, + "grad_norm": 9.905545234680176, + "learning_rate": 1.6864010540421948e-06, + "logits/chosen": -0.5793756246566772, + "logits/rejected": -0.6914938688278198, + "logps/chosen": -51.49104309082031, + "logps/rejected": -107.4070816040039, + "loss": 0.6155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4444830417633057, + "rewards/margins": 8.551258087158203, + "rewards/rejected": -5.106775283813477, + "step": 14601 + }, + { + "epoch": 3.65, + "grad_norm": 4.588893890380859, + "learning_rate": 1.6858124944033588e-06, + "logits/chosen": -0.561808168888092, + "logits/rejected": -0.6191374063491821, + "logps/chosen": -49.09635543823242, + "logps/rejected": -112.58258819580078, + "loss": 0.6402, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4716217517852783, + "rewards/margins": 8.194994926452637, + "rewards/rejected": -4.723372936248779, + "step": 14602 + }, + { + "epoch": 3.65, + "grad_norm": 2.542633056640625, + "learning_rate": 1.685224016661623e-06, + "logits/chosen": -0.4867583215236664, + "logits/rejected": -0.5330570936203003, + "logps/chosen": -65.77056884765625, + "logps/rejected": -123.51586151123047, + "loss": 0.5826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.109609603881836, + "rewards/margins": 7.623035430908203, + "rewards/rejected": -4.513425827026367, + "step": 14603 + }, + { + "epoch": 3.65, + "grad_norm": 4.958741188049316, + "learning_rate": 1.684635620831525e-06, + "logits/chosen": -0.5428584218025208, + "logits/rejected": -0.6515654921531677, + "logps/chosen": -57.22882080078125, + "logps/rejected": -109.41954040527344, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9022960662841797, + "rewards/margins": 7.929361820220947, + "rewards/rejected": -5.027065277099609, + "step": 14604 + }, + { + "epoch": 3.65, + "grad_norm": 3.9084789752960205, + "learning_rate": 1.6840473069276097e-06, + "logits/chosen": -0.5248104333877563, + "logits/rejected": -0.6136635541915894, + "logps/chosen": -62.71615219116211, + "logps/rejected": -115.74303436279297, + "loss": 0.6442, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1567604541778564, + "rewards/margins": 7.149382591247559, + "rewards/rejected": -3.992621660232544, + "step": 14605 + }, + { + "epoch": 3.65, + "grad_norm": 3.7706005573272705, + "learning_rate": 1.683459074964412e-06, + "logits/chosen": -0.5353467464447021, + "logits/rejected": -0.6271405220031738, + "logps/chosen": -42.872554779052734, + "logps/rejected": -97.71330261230469, + "loss": 0.5387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5979576110839844, + "rewards/margins": 7.200611591339111, + "rewards/rejected": -3.602654218673706, + "step": 14606 + }, + { + "epoch": 3.65, + "grad_norm": 3.275146961212158, + "learning_rate": 1.6828709249564662e-06, + "logits/chosen": -0.5187106132507324, + "logits/rejected": -0.62013840675354, + "logps/chosen": -62.497554779052734, + "logps/rejected": -100.96087646484375, + "loss": 0.632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.043189525604248, + "rewards/margins": 6.288244247436523, + "rewards/rejected": -3.245054006576538, + "step": 14607 + }, + { + "epoch": 3.65, + "grad_norm": 9.59989070892334, + "learning_rate": 1.6822828569183104e-06, + "logits/chosen": -0.5968593955039978, + "logits/rejected": -0.6983373761177063, + "logps/chosen": -46.55186462402344, + "logps/rejected": -100.19697570800781, + "loss": 0.5954, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0532538890838623, + "rewards/margins": 6.799591541290283, + "rewards/rejected": -3.746337413787842, + "step": 14608 + }, + { + "epoch": 3.65, + "grad_norm": 3.857694625854492, + "learning_rate": 1.6816948708644737e-06, + "logits/chosen": -0.4957561492919922, + "logits/rejected": -0.5612592697143555, + "logps/chosen": -52.48326110839844, + "logps/rejected": -117.1672592163086, + "loss": 0.5746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3352537155151367, + "rewards/margins": 7.961577415466309, + "rewards/rejected": -4.626324653625488, + "step": 14609 + }, + { + "epoch": 3.65, + "grad_norm": 3.224762201309204, + "learning_rate": 1.6811069668094848e-06, + "logits/chosen": -0.5115371346473694, + "logits/rejected": -0.5857070088386536, + "logps/chosen": -54.42853546142578, + "logps/rejected": -108.33590698242188, + "loss": 0.5603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0894973278045654, + "rewards/margins": 6.541338920593262, + "rewards/rejected": -3.4518418312072754, + "step": 14610 + }, + { + "epoch": 3.66, + "grad_norm": 6.394631862640381, + "learning_rate": 1.6805191447678727e-06, + "logits/chosen": -0.5200129747390747, + "logits/rejected": -0.587462842464447, + "logps/chosen": -51.86346435546875, + "logps/rejected": -106.51416015625, + "loss": 0.5296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0742955207824707, + "rewards/margins": 6.980111122131348, + "rewards/rejected": -3.905815601348877, + "step": 14611 + }, + { + "epoch": 3.66, + "grad_norm": 3.2647950649261475, + "learning_rate": 1.679931404754167e-06, + "logits/chosen": -0.5913512706756592, + "logits/rejected": -0.6859427690505981, + "logps/chosen": -49.53752136230469, + "logps/rejected": -105.8997573852539, + "loss": 0.5756, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3701584339141846, + "rewards/margins": 7.48915433883667, + "rewards/rejected": -4.118995666503906, + "step": 14612 + }, + { + "epoch": 3.66, + "grad_norm": 5.269651412963867, + "learning_rate": 1.6793437467828838e-06, + "logits/chosen": -0.6447797417640686, + "logits/rejected": -0.7191396355628967, + "logps/chosen": -51.69721221923828, + "logps/rejected": -96.89971160888672, + "loss": 0.6259, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9814720153808594, + "rewards/margins": 6.740918159484863, + "rewards/rejected": -3.7594456672668457, + "step": 14613 + }, + { + "epoch": 3.66, + "grad_norm": 6.765111446380615, + "learning_rate": 1.678756170868549e-06, + "logits/chosen": -0.5438246726989746, + "logits/rejected": -0.6503927111625671, + "logps/chosen": -49.4326286315918, + "logps/rejected": -100.35839080810547, + "loss": 0.5798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.81487774848938, + "rewards/margins": 8.056018829345703, + "rewards/rejected": -5.241140842437744, + "step": 14614 + }, + { + "epoch": 3.66, + "grad_norm": 25.974483489990234, + "learning_rate": 1.6781686770256834e-06, + "logits/chosen": -0.5154126286506653, + "logits/rejected": -0.5868362188339233, + "logps/chosen": -67.74058532714844, + "logps/rejected": -104.06954956054688, + "loss": 0.8009, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.634476900100708, + "rewards/margins": 6.917541027069092, + "rewards/rejected": -4.283063888549805, + "step": 14615 + }, + { + "epoch": 3.66, + "grad_norm": 4.452613353729248, + "learning_rate": 1.6775812652688029e-06, + "logits/chosen": -0.5431579947471619, + "logits/rejected": -0.6632145643234253, + "logps/chosen": -68.03958129882812, + "logps/rejected": -107.18438720703125, + "loss": 0.6844, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.963684558868408, + "rewards/margins": 7.4741530418396, + "rewards/rejected": -4.510468006134033, + "step": 14616 + }, + { + "epoch": 3.66, + "grad_norm": 3.5071682929992676, + "learning_rate": 1.676993935612422e-06, + "logits/chosen": -0.5210548043251038, + "logits/rejected": -0.6189947128295898, + "logps/chosen": -56.737510681152344, + "logps/rejected": -113.91789245605469, + "loss": 0.5671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6990110874176025, + "rewards/margins": 6.847659587860107, + "rewards/rejected": -4.148648738861084, + "step": 14617 + }, + { + "epoch": 3.66, + "grad_norm": 6.1442461013793945, + "learning_rate": 1.6764066880710566e-06, + "logits/chosen": -0.5388494729995728, + "logits/rejected": -0.6072616577148438, + "logps/chosen": -53.5907096862793, + "logps/rejected": -107.5762710571289, + "loss": 0.5897, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9922008514404297, + "rewards/margins": 6.516224384307861, + "rewards/rejected": -3.5240230560302734, + "step": 14618 + }, + { + "epoch": 3.66, + "grad_norm": 5.610436916351318, + "learning_rate": 1.6758195226592171e-06, + "logits/chosen": -0.496279239654541, + "logits/rejected": -0.5379278659820557, + "logps/chosen": -59.384193420410156, + "logps/rejected": -121.26819610595703, + "loss": 0.6614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.185884952545166, + "rewards/margins": 6.813957691192627, + "rewards/rejected": -3.62807297706604, + "step": 14619 + }, + { + "epoch": 3.66, + "grad_norm": 3.130455255508423, + "learning_rate": 1.6752324393914115e-06, + "logits/chosen": -0.6183933615684509, + "logits/rejected": -0.6508686542510986, + "logps/chosen": -47.050453186035156, + "logps/rejected": -97.59591674804688, + "loss": 0.6425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2107951641082764, + "rewards/margins": 6.993660926818848, + "rewards/rejected": -3.7828660011291504, + "step": 14620 + }, + { + "epoch": 3.66, + "grad_norm": 5.969150543212891, + "learning_rate": 1.6746454382821504e-06, + "logits/chosen": -0.5199869275093079, + "logits/rejected": -0.5949956774711609, + "logps/chosen": -64.22077178955078, + "logps/rejected": -106.14517211914062, + "loss": 0.6853, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.654775857925415, + "rewards/margins": 6.019835472106934, + "rewards/rejected": -3.3650598526000977, + "step": 14621 + }, + { + "epoch": 3.66, + "grad_norm": 3.2669661045074463, + "learning_rate": 1.6740585193459359e-06, + "logits/chosen": -0.6060516834259033, + "logits/rejected": -0.6830990314483643, + "logps/chosen": -54.84529113769531, + "logps/rejected": -108.09796142578125, + "loss": 0.5766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984625816345215, + "rewards/margins": 7.126857757568359, + "rewards/rejected": -4.142232418060303, + "step": 14622 + }, + { + "epoch": 3.66, + "grad_norm": 6.992725849151611, + "learning_rate": 1.6734716825972746e-06, + "logits/chosen": -0.5363376140594482, + "logits/rejected": -0.584441065788269, + "logps/chosen": -53.94230651855469, + "logps/rejected": -99.87833404541016, + "loss": 0.5985, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.160188913345337, + "rewards/margins": 6.232328414916992, + "rewards/rejected": -3.0721397399902344, + "step": 14623 + }, + { + "epoch": 3.66, + "grad_norm": 4.814975261688232, + "learning_rate": 1.672884928050665e-06, + "logits/chosen": -0.5284961462020874, + "logits/rejected": -0.6040176749229431, + "logps/chosen": -59.737037658691406, + "logps/rejected": -97.72393798828125, + "loss": 0.6791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.932783365249634, + "rewards/margins": 6.068852424621582, + "rewards/rejected": -3.1360692977905273, + "step": 14624 + }, + { + "epoch": 3.66, + "grad_norm": 11.46906566619873, + "learning_rate": 1.6722982557206086e-06, + "logits/chosen": -0.5154405832290649, + "logits/rejected": -0.5922122597694397, + "logps/chosen": -55.77577590942383, + "logps/rejected": -85.21710205078125, + "loss": 0.7696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9252254962921143, + "rewards/margins": 4.974937438964844, + "rewards/rejected": -2.0497119426727295, + "step": 14625 + }, + { + "epoch": 3.66, + "grad_norm": 8.455175399780273, + "learning_rate": 1.671711665621602e-06, + "logits/chosen": -0.49589332938194275, + "logits/rejected": -0.5992299318313599, + "logps/chosen": -52.51019287109375, + "logps/rejected": -104.51217651367188, + "loss": 0.592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0134661197662354, + "rewards/margins": 6.868133068084717, + "rewards/rejected": -3.8546676635742188, + "step": 14626 + }, + { + "epoch": 3.66, + "grad_norm": 10.280303001403809, + "learning_rate": 1.6711251577681393e-06, + "logits/chosen": -0.5819617509841919, + "logits/rejected": -0.6905573010444641, + "logps/chosen": -45.412322998046875, + "logps/rejected": -107.85716247558594, + "loss": 0.5574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.901360273361206, + "rewards/margins": 7.813164710998535, + "rewards/rejected": -4.911805152893066, + "step": 14627 + }, + { + "epoch": 3.66, + "grad_norm": 7.664609432220459, + "learning_rate": 1.6705387321747158e-06, + "logits/chosen": -0.3876483142375946, + "logits/rejected": -0.5061795115470886, + "logps/chosen": -65.18250274658203, + "logps/rejected": -92.0663070678711, + "loss": 0.762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.237617254257202, + "rewards/margins": 6.601922988891602, + "rewards/rejected": -3.3643054962158203, + "step": 14628 + }, + { + "epoch": 3.66, + "grad_norm": 7.376842498779297, + "learning_rate": 1.6699523888558194e-06, + "logits/chosen": -0.5272364020347595, + "logits/rejected": -0.5830717086791992, + "logps/chosen": -52.89131164550781, + "logps/rejected": -117.66130065917969, + "loss": 0.6215, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.283823013305664, + "rewards/margins": 7.244257926940918, + "rewards/rejected": -3.960435390472412, + "step": 14629 + }, + { + "epoch": 3.66, + "grad_norm": 2.533630847930908, + "learning_rate": 1.6693661278259438e-06, + "logits/chosen": -0.5473088622093201, + "logits/rejected": -0.6358057260513306, + "logps/chosen": -67.06410217285156, + "logps/rejected": -117.3373794555664, + "loss": 0.6734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.840711832046509, + "rewards/margins": 7.186118125915527, + "rewards/rejected": -4.345406532287598, + "step": 14630 + }, + { + "epoch": 3.66, + "grad_norm": 4.846479892730713, + "learning_rate": 1.6687799490995738e-06, + "logits/chosen": -0.5293735265731812, + "logits/rejected": -0.5692833065986633, + "logps/chosen": -63.20100021362305, + "logps/rejected": -103.43122863769531, + "loss": 0.6683, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.701866865158081, + "rewards/margins": 5.403445720672607, + "rewards/rejected": -2.7015790939331055, + "step": 14631 + }, + { + "epoch": 3.66, + "grad_norm": 3.671539068222046, + "learning_rate": 1.6681938526911928e-06, + "logits/chosen": -0.5150855779647827, + "logits/rejected": -0.6121720671653748, + "logps/chosen": -56.965789794921875, + "logps/rejected": -115.29680633544922, + "loss": 0.612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.669701099395752, + "rewards/margins": 8.844015121459961, + "rewards/rejected": -5.174314498901367, + "step": 14632 + }, + { + "epoch": 3.66, + "grad_norm": 3.007425546646118, + "learning_rate": 1.6676078386152873e-06, + "logits/chosen": -0.6222371459007263, + "logits/rejected": -0.6966225504875183, + "logps/chosen": -66.32266235351562, + "logps/rejected": -100.45013427734375, + "loss": 0.7048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.230334997177124, + "rewards/margins": 7.633267402648926, + "rewards/rejected": -4.402933120727539, + "step": 14633 + }, + { + "epoch": 3.66, + "grad_norm": 3.9716835021972656, + "learning_rate": 1.6670219068863363e-06, + "logits/chosen": -0.5888254046440125, + "logits/rejected": -0.6958956718444824, + "logps/chosen": -48.634979248046875, + "logps/rejected": -97.8571548461914, + "loss": 0.5597, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.088850736618042, + "rewards/margins": 6.839080810546875, + "rewards/rejected": -3.750229835510254, + "step": 14634 + }, + { + "epoch": 3.66, + "grad_norm": 4.119563102722168, + "learning_rate": 1.6664360575188177e-06, + "logits/chosen": -0.4981083273887634, + "logits/rejected": -0.6620756387710571, + "logps/chosen": -65.23017883300781, + "logps/rejected": -90.27472686767578, + "loss": 0.565, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8781261444091797, + "rewards/margins": 6.500392913818359, + "rewards/rejected": -3.622267007827759, + "step": 14635 + }, + { + "epoch": 3.66, + "grad_norm": 6.938145637512207, + "learning_rate": 1.6658502905272101e-06, + "logits/chosen": -0.5559625625610352, + "logits/rejected": -0.6597455739974976, + "logps/chosen": -51.47911071777344, + "logps/rejected": -114.20091247558594, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4030027389526367, + "rewards/margins": 8.550922393798828, + "rewards/rejected": -5.14792013168335, + "step": 14636 + }, + { + "epoch": 3.66, + "grad_norm": 3.1086432933807373, + "learning_rate": 1.6652646059259908e-06, + "logits/chosen": -0.493508905172348, + "logits/rejected": -0.581670880317688, + "logps/chosen": -51.72182083129883, + "logps/rejected": -121.60913848876953, + "loss": 0.4615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8636395931243896, + "rewards/margins": 8.203497886657715, + "rewards/rejected": -5.339858531951904, + "step": 14637 + }, + { + "epoch": 3.66, + "grad_norm": 11.616802215576172, + "learning_rate": 1.6646790037296273e-06, + "logits/chosen": -0.5609528422355652, + "logits/rejected": -0.657845139503479, + "logps/chosen": -66.42212677001953, + "logps/rejected": -118.15840148925781, + "loss": 0.6306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.908273696899414, + "rewards/margins": 7.391758441925049, + "rewards/rejected": -4.483484745025635, + "step": 14638 + }, + { + "epoch": 3.66, + "grad_norm": 5.464474201202393, + "learning_rate": 1.664093483952593e-06, + "logits/chosen": -0.5446847677230835, + "logits/rejected": -0.6159787774085999, + "logps/chosen": -50.48432922363281, + "logps/rejected": -112.64720153808594, + "loss": 0.5967, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1810359954833984, + "rewards/margins": 7.090280532836914, + "rewards/rejected": -3.9092447757720947, + "step": 14639 + }, + { + "epoch": 3.66, + "grad_norm": 4.680637359619141, + "learning_rate": 1.6635080466093585e-06, + "logits/chosen": -0.442371129989624, + "logits/rejected": -0.5365885496139526, + "logps/chosen": -66.81334686279297, + "logps/rejected": -108.68632507324219, + "loss": 0.6766, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.242386817932129, + "rewards/margins": 7.153105735778809, + "rewards/rejected": -3.910719394683838, + "step": 14640 + }, + { + "epoch": 3.66, + "grad_norm": 2.5316267013549805, + "learning_rate": 1.6629226917143893e-06, + "logits/chosen": -0.6277652382850647, + "logits/rejected": -0.7128281593322754, + "logps/chosen": -40.1362419128418, + "logps/rejected": -96.41936492919922, + "loss": 0.5452, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2068698406219482, + "rewards/margins": 7.663348197937012, + "rewards/rejected": -4.456478595733643, + "step": 14641 + }, + { + "epoch": 3.66, + "grad_norm": 5.595107078552246, + "learning_rate": 1.6623374192821478e-06, + "logits/chosen": -0.4973454475402832, + "logits/rejected": -0.5445618033409119, + "logps/chosen": -49.14220428466797, + "logps/rejected": -102.26692962646484, + "loss": 0.6822, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.190758228302002, + "rewards/margins": 5.7917022705078125, + "rewards/rejected": -2.6009442806243896, + "step": 14642 + }, + { + "epoch": 3.66, + "grad_norm": 6.17515230178833, + "learning_rate": 1.661752229327101e-06, + "logits/chosen": -0.5286584496498108, + "logits/rejected": -0.625534176826477, + "logps/chosen": -63.303443908691406, + "logps/rejected": -105.06305694580078, + "loss": 0.7445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.826852560043335, + "rewards/margins": 5.941288471221924, + "rewards/rejected": -3.1144351959228516, + "step": 14643 + }, + { + "epoch": 3.66, + "grad_norm": 4.439075946807861, + "learning_rate": 1.6611671218637071e-06, + "logits/chosen": -0.5274539589881897, + "logits/rejected": -0.5909821391105652, + "logps/chosen": -53.81064987182617, + "logps/rejected": -101.11563873291016, + "loss": 0.5983, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.018139362335205, + "rewards/margins": 6.368866920471191, + "rewards/rejected": -3.3507275581359863, + "step": 14644 + }, + { + "epoch": 3.66, + "grad_norm": 3.4634602069854736, + "learning_rate": 1.660582096906423e-06, + "logits/chosen": -0.5367758274078369, + "logits/rejected": -0.5884590148925781, + "logps/chosen": -51.92798614501953, + "logps/rejected": -111.5192642211914, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.80704665184021, + "rewards/margins": 6.737288475036621, + "rewards/rejected": -3.930241823196411, + "step": 14645 + }, + { + "epoch": 3.66, + "grad_norm": 22.561433792114258, + "learning_rate": 1.659997154469709e-06, + "logits/chosen": -0.5331483483314514, + "logits/rejected": -0.6355910897254944, + "logps/chosen": -61.27359390258789, + "logps/rejected": -105.70553588867188, + "loss": 0.6636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.859968662261963, + "rewards/margins": 6.6884846687316895, + "rewards/rejected": -3.8285155296325684, + "step": 14646 + }, + { + "epoch": 3.66, + "grad_norm": 4.727550029754639, + "learning_rate": 1.6594122945680163e-06, + "logits/chosen": -0.5556212663650513, + "logits/rejected": -0.608122706413269, + "logps/chosen": -48.61046600341797, + "logps/rejected": -103.07990264892578, + "loss": 0.6765, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.217973232269287, + "rewards/margins": 6.778508186340332, + "rewards/rejected": -3.5605344772338867, + "step": 14647 + }, + { + "epoch": 3.66, + "grad_norm": 6.83364200592041, + "learning_rate": 1.6588275172158008e-06, + "logits/chosen": -0.5084853768348694, + "logits/rejected": -0.5601602792739868, + "logps/chosen": -69.28656005859375, + "logps/rejected": -103.8746566772461, + "loss": 0.706, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0748825073242188, + "rewards/margins": 4.9820051193237305, + "rewards/rejected": -1.9071228504180908, + "step": 14648 + }, + { + "epoch": 3.66, + "grad_norm": 10.37471866607666, + "learning_rate": 1.658242822427511e-06, + "logits/chosen": -0.5569713711738586, + "logits/rejected": -0.6340183615684509, + "logps/chosen": -54.833946228027344, + "logps/rejected": -109.96788024902344, + "loss": 0.6414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8133652210235596, + "rewards/margins": 7.849676132202148, + "rewards/rejected": -5.03631067276001, + "step": 14649 + }, + { + "epoch": 3.66, + "grad_norm": 6.630372524261475, + "learning_rate": 1.6576582102175941e-06, + "logits/chosen": -0.5893718004226685, + "logits/rejected": -0.6662390232086182, + "logps/chosen": -51.4162712097168, + "logps/rejected": -110.19029235839844, + "loss": 0.6483, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9507875442504883, + "rewards/margins": 7.696399688720703, + "rewards/rejected": -4.745612144470215, + "step": 14650 + }, + { + "epoch": 3.67, + "grad_norm": 7.922523021697998, + "learning_rate": 1.6570736806004994e-06, + "logits/chosen": -0.5793865323066711, + "logits/rejected": -0.6579099893569946, + "logps/chosen": -46.563331604003906, + "logps/rejected": -95.65229797363281, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.113027811050415, + "rewards/margins": 6.643069267272949, + "rewards/rejected": -3.530041456222534, + "step": 14651 + }, + { + "epoch": 3.67, + "grad_norm": 6.977196216583252, + "learning_rate": 1.6564892335906697e-06, + "logits/chosen": -0.5039538145065308, + "logits/rejected": -0.5751463174819946, + "logps/chosen": -64.44044494628906, + "logps/rejected": -89.17032623291016, + "loss": 0.6648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923753023147583, + "rewards/margins": 5.637585639953613, + "rewards/rejected": -2.7138326168060303, + "step": 14652 + }, + { + "epoch": 3.67, + "grad_norm": 7.657301902770996, + "learning_rate": 1.655904869202546e-06, + "logits/chosen": -0.5091559886932373, + "logits/rejected": -0.5730332136154175, + "logps/chosen": -57.703548431396484, + "logps/rejected": -124.0010757446289, + "loss": 0.6429, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3767151832580566, + "rewards/margins": 6.666589736938477, + "rewards/rejected": -3.28987455368042, + "step": 14653 + }, + { + "epoch": 3.67, + "grad_norm": 5.713255405426025, + "learning_rate": 1.6553205874505695e-06, + "logits/chosen": -0.5297085046768188, + "logits/rejected": -0.6173797249794006, + "logps/chosen": -54.25374984741211, + "logps/rejected": -109.95023345947266, + "loss": 0.6711, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8852264881134033, + "rewards/margins": 7.349615097045898, + "rewards/rejected": -4.464388847351074, + "step": 14654 + }, + { + "epoch": 3.67, + "grad_norm": 2.778395652770996, + "learning_rate": 1.6547363883491824e-06, + "logits/chosen": -0.573646605014801, + "logits/rejected": -0.6550920009613037, + "logps/chosen": -50.510345458984375, + "logps/rejected": -108.84501647949219, + "loss": 0.5856, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9996206760406494, + "rewards/margins": 7.414093017578125, + "rewards/rejected": -4.414471626281738, + "step": 14655 + }, + { + "epoch": 3.67, + "grad_norm": 10.664298057556152, + "learning_rate": 1.654152271912814e-06, + "logits/chosen": -0.4291151165962219, + "logits/rejected": -0.5115663409233093, + "logps/chosen": -69.13137817382812, + "logps/rejected": -103.86087036132812, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.019209861755371, + "rewards/margins": 5.730615615844727, + "rewards/rejected": -2.7114059925079346, + "step": 14656 + }, + { + "epoch": 3.67, + "grad_norm": 5.988706588745117, + "learning_rate": 1.6535682381559014e-06, + "logits/chosen": -0.4968867897987366, + "logits/rejected": -0.6093421578407288, + "logps/chosen": -56.01498794555664, + "logps/rejected": -105.61759948730469, + "loss": 0.6511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1851091384887695, + "rewards/margins": 7.899957180023193, + "rewards/rejected": -4.714848518371582, + "step": 14657 + }, + { + "epoch": 3.67, + "grad_norm": 5.112185955047607, + "learning_rate": 1.6529842870928792e-06, + "logits/chosen": -0.5965557098388672, + "logits/rejected": -0.6691009998321533, + "logps/chosen": -61.905731201171875, + "logps/rejected": -91.01587677001953, + "loss": 0.7135, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1514205932617188, + "rewards/margins": 6.150546073913574, + "rewards/rejected": -2.9991254806518555, + "step": 14658 + }, + { + "epoch": 3.67, + "grad_norm": 7.361146926879883, + "learning_rate": 1.6524004187381753e-06, + "logits/chosen": -0.5103570818901062, + "logits/rejected": -0.5403675436973572, + "logps/chosen": -56.47026443481445, + "logps/rejected": -100.00756072998047, + "loss": 0.8282, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.961608409881592, + "rewards/margins": 6.0825605392456055, + "rewards/rejected": -3.120952606201172, + "step": 14659 + }, + { + "epoch": 3.67, + "grad_norm": 9.42473316192627, + "learning_rate": 1.6518166331062157e-06, + "logits/chosen": -0.4983711540699005, + "logits/rejected": -0.5833978056907654, + "logps/chosen": -49.69500732421875, + "logps/rejected": -87.52269744873047, + "loss": 0.5762, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.131045341491699, + "rewards/margins": 6.530716896057129, + "rewards/rejected": -3.399671792984009, + "step": 14660 + }, + { + "epoch": 3.67, + "grad_norm": 3.0613579750061035, + "learning_rate": 1.6512329302114304e-06, + "logits/chosen": -0.5137389898300171, + "logits/rejected": -0.5902794599533081, + "logps/chosen": -57.704811096191406, + "logps/rejected": -111.03282165527344, + "loss": 0.6608, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.850036859512329, + "rewards/margins": 7.129361152648926, + "rewards/rejected": -4.279324531555176, + "step": 14661 + }, + { + "epoch": 3.67, + "grad_norm": 3.6195545196533203, + "learning_rate": 1.6506493100682413e-06, + "logits/chosen": -0.4156562089920044, + "logits/rejected": -0.5392562747001648, + "logps/chosen": -62.347286224365234, + "logps/rejected": -98.8672103881836, + "loss": 0.5891, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.156301259994507, + "rewards/margins": 6.846118450164795, + "rewards/rejected": -3.6898176670074463, + "step": 14662 + }, + { + "epoch": 3.67, + "grad_norm": 6.616659641265869, + "learning_rate": 1.6500657726910686e-06, + "logits/chosen": -0.5252713561058044, + "logits/rejected": -0.5962350368499756, + "logps/chosen": -57.399139404296875, + "logps/rejected": -90.614013671875, + "loss": 0.7606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876889228820801, + "rewards/margins": 5.597586154937744, + "rewards/rejected": -2.7206966876983643, + "step": 14663 + }, + { + "epoch": 3.67, + "grad_norm": 4.7839274406433105, + "learning_rate": 1.6494823180943348e-06, + "logits/chosen": -0.48373594880104065, + "logits/rejected": -0.5730632543563843, + "logps/chosen": -60.00484848022461, + "logps/rejected": -97.07646179199219, + "loss": 0.7844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.998086452484131, + "rewards/margins": 5.75353479385376, + "rewards/rejected": -2.7554478645324707, + "step": 14664 + }, + { + "epoch": 3.67, + "grad_norm": 3.434396743774414, + "learning_rate": 1.648898946292456e-06, + "logits/chosen": -0.5126314163208008, + "logits/rejected": -0.5930415987968445, + "logps/chosen": -49.688682556152344, + "logps/rejected": -100.89591979980469, + "loss": 0.5302, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0025174617767334, + "rewards/margins": 6.721699237823486, + "rewards/rejected": -3.719182014465332, + "step": 14665 + }, + { + "epoch": 3.67, + "grad_norm": 7.85653018951416, + "learning_rate": 1.64831565729985e-06, + "logits/chosen": -0.5318722128868103, + "logits/rejected": -0.6067368984222412, + "logps/chosen": -55.16865539550781, + "logps/rejected": -103.93345642089844, + "loss": 0.6756, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8089983463287354, + "rewards/margins": 6.976340293884277, + "rewards/rejected": -4.167342185974121, + "step": 14666 + }, + { + "epoch": 3.67, + "grad_norm": 14.240225791931152, + "learning_rate": 1.6477324511309267e-06, + "logits/chosen": -0.5470321178436279, + "logits/rejected": -0.6312774419784546, + "logps/chosen": -55.981407165527344, + "logps/rejected": -122.457763671875, + "loss": 0.5275, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5857927799224854, + "rewards/margins": 7.839901924133301, + "rewards/rejected": -4.254108905792236, + "step": 14667 + }, + { + "epoch": 3.67, + "grad_norm": 11.173371315002441, + "learning_rate": 1.6471493278001026e-06, + "logits/chosen": -0.5326387882232666, + "logits/rejected": -0.5773189663887024, + "logps/chosen": -61.009605407714844, + "logps/rejected": -116.20193481445312, + "loss": 0.8161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9805383682250977, + "rewards/margins": 6.088067054748535, + "rewards/rejected": -3.1075291633605957, + "step": 14668 + }, + { + "epoch": 3.67, + "grad_norm": 13.422891616821289, + "learning_rate": 1.6465662873217847e-06, + "logits/chosen": -0.5254059433937073, + "logits/rejected": -0.5772976875305176, + "logps/chosen": -55.01102828979492, + "logps/rejected": -107.8310317993164, + "loss": 0.7522, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7796883583068848, + "rewards/margins": 6.227450370788574, + "rewards/rejected": -3.4477620124816895, + "step": 14669 + }, + { + "epoch": 3.67, + "grad_norm": 4.031449317932129, + "learning_rate": 1.645983329710379e-06, + "logits/chosen": -0.45203208923339844, + "logits/rejected": -0.53534334897995, + "logps/chosen": -67.01988983154297, + "logps/rejected": -119.49687957763672, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094647169113159, + "rewards/margins": 8.71044921875, + "rewards/rejected": -5.61580228805542, + "step": 14670 + }, + { + "epoch": 3.67, + "grad_norm": 8.850422859191895, + "learning_rate": 1.6454004549802944e-06, + "logits/chosen": -0.5624944567680359, + "logits/rejected": -0.6079267263412476, + "logps/chosen": -51.5004997253418, + "logps/rejected": -95.30677795410156, + "loss": 0.6567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0629489421844482, + "rewards/margins": 5.541973114013672, + "rewards/rejected": -2.4790234565734863, + "step": 14671 + }, + { + "epoch": 3.67, + "grad_norm": 4.636754512786865, + "learning_rate": 1.6448176631459312e-06, + "logits/chosen": -0.5658569931983948, + "logits/rejected": -0.616014301776886, + "logps/chosen": -56.51202392578125, + "logps/rejected": -116.38069152832031, + "loss": 0.687, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.700927495956421, + "rewards/margins": 6.23920202255249, + "rewards/rejected": -3.5382750034332275, + "step": 14672 + }, + { + "epoch": 3.67, + "grad_norm": 22.535375595092773, + "learning_rate": 1.6442349542216935e-06, + "logits/chosen": -0.5660922527313232, + "logits/rejected": -0.6619177460670471, + "logps/chosen": -57.07958221435547, + "logps/rejected": -97.82368469238281, + "loss": 0.867, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8912997245788574, + "rewards/margins": 7.165462493896484, + "rewards/rejected": -4.274163246154785, + "step": 14673 + }, + { + "epoch": 3.67, + "grad_norm": 19.19959259033203, + "learning_rate": 1.6436523282219797e-06, + "logits/chosen": -0.5543922781944275, + "logits/rejected": -0.6472987532615662, + "logps/chosen": -50.581634521484375, + "logps/rejected": -121.87799072265625, + "loss": 0.6494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.988784074783325, + "rewards/margins": 7.061368942260742, + "rewards/rejected": -4.072585105895996, + "step": 14674 + }, + { + "epoch": 3.67, + "grad_norm": 6.573609828948975, + "learning_rate": 1.6430697851611855e-06, + "logits/chosen": -0.5694667100906372, + "logits/rejected": -0.6530417203903198, + "logps/chosen": -69.01178741455078, + "logps/rejected": -108.43844604492188, + "loss": 0.8317, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.178025007247925, + "rewards/margins": 5.288380146026611, + "rewards/rejected": -2.1103549003601074, + "step": 14675 + }, + { + "epoch": 3.67, + "grad_norm": 15.730504989624023, + "learning_rate": 1.6424873250537087e-06, + "logits/chosen": -0.5648927688598633, + "logits/rejected": -0.6182808876037598, + "logps/chosen": -66.38362121582031, + "logps/rejected": -123.15958404541016, + "loss": 0.7569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8437390327453613, + "rewards/margins": 6.614864826202393, + "rewards/rejected": -3.7711265087127686, + "step": 14676 + }, + { + "epoch": 3.67, + "grad_norm": 5.652750015258789, + "learning_rate": 1.6419049479139415e-06, + "logits/chosen": -0.6152245998382568, + "logits/rejected": -0.6442650556564331, + "logps/chosen": -64.8902587890625, + "logps/rejected": -83.86968231201172, + "loss": 0.5244, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2769248485565186, + "rewards/margins": 7.020471096038818, + "rewards/rejected": -3.7435462474823, + "step": 14677 + }, + { + "epoch": 3.67, + "grad_norm": 5.173122406005859, + "learning_rate": 1.6413226537562731e-06, + "logits/chosen": -0.5797263383865356, + "logits/rejected": -0.6281759142875671, + "logps/chosen": -55.34727478027344, + "logps/rejected": -122.69072723388672, + "loss": 0.6576, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.370676040649414, + "rewards/margins": 7.359164237976074, + "rewards/rejected": -3.988487720489502, + "step": 14678 + }, + { + "epoch": 3.67, + "grad_norm": 6.031841278076172, + "learning_rate": 1.6407404425950946e-06, + "logits/chosen": -0.5335761904716492, + "logits/rejected": -0.610309362411499, + "logps/chosen": -46.60282897949219, + "logps/rejected": -100.18993377685547, + "loss": 0.6669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.733199119567871, + "rewards/margins": 6.616375923156738, + "rewards/rejected": -3.883176803588867, + "step": 14679 + }, + { + "epoch": 3.67, + "grad_norm": 6.9373650550842285, + "learning_rate": 1.6401583144447958e-06, + "logits/chosen": -0.44300395250320435, + "logits/rejected": -0.5555435419082642, + "logps/chosen": -77.9969711303711, + "logps/rejected": -95.60966491699219, + "loss": 0.7286, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.993669033050537, + "rewards/margins": 6.538815021514893, + "rewards/rejected": -3.5451459884643555, + "step": 14680 + }, + { + "epoch": 3.67, + "grad_norm": 9.418883323669434, + "learning_rate": 1.6395762693197553e-06, + "logits/chosen": -0.5862581133842468, + "logits/rejected": -0.6778509616851807, + "logps/chosen": -53.546180725097656, + "logps/rejected": -90.25492858886719, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8746397495269775, + "rewards/margins": 6.33876895904541, + "rewards/rejected": -3.4641284942626953, + "step": 14681 + }, + { + "epoch": 3.67, + "grad_norm": 8.545401573181152, + "learning_rate": 1.6389943072343601e-06, + "logits/chosen": -0.5214934349060059, + "logits/rejected": -0.556927502155304, + "logps/chosen": -52.717018127441406, + "logps/rejected": -92.07229614257812, + "loss": 0.6926, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020646810531616, + "rewards/margins": 5.555753231048584, + "rewards/rejected": -2.5351061820983887, + "step": 14682 + }, + { + "epoch": 3.67, + "grad_norm": 14.129786491394043, + "learning_rate": 1.6384124282029934e-06, + "logits/chosen": -0.489779531955719, + "logits/rejected": -0.5297377109527588, + "logps/chosen": -65.91751861572266, + "logps/rejected": -106.3494873046875, + "loss": 0.876, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7002696990966797, + "rewards/margins": 4.786616325378418, + "rewards/rejected": -2.086346387863159, + "step": 14683 + }, + { + "epoch": 3.67, + "grad_norm": 10.151185035705566, + "learning_rate": 1.6378306322400277e-06, + "logits/chosen": -0.5673093795776367, + "logits/rejected": -0.6402155160903931, + "logps/chosen": -53.09313201904297, + "logps/rejected": -111.41632080078125, + "loss": 0.6734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8668975830078125, + "rewards/margins": 7.341106414794922, + "rewards/rejected": -4.474208354949951, + "step": 14684 + }, + { + "epoch": 3.67, + "grad_norm": 6.849747657775879, + "learning_rate": 1.6372489193598434e-06, + "logits/chosen": -0.5551655888557434, + "logits/rejected": -0.662735104560852, + "logps/chosen": -58.03687286376953, + "logps/rejected": -103.50041961669922, + "loss": 0.6603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7381691932678223, + "rewards/margins": 7.0719451904296875, + "rewards/rejected": -4.333775520324707, + "step": 14685 + }, + { + "epoch": 3.67, + "grad_norm": 5.139050483703613, + "learning_rate": 1.6366672895768171e-06, + "logits/chosen": -0.5975856781005859, + "logits/rejected": -0.6607849597930908, + "logps/chosen": -55.09703063964844, + "logps/rejected": -101.34183502197266, + "loss": 0.6426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.147768497467041, + "rewards/margins": 6.617008209228516, + "rewards/rejected": -3.4692397117614746, + "step": 14686 + }, + { + "epoch": 3.67, + "grad_norm": 37.10268020629883, + "learning_rate": 1.636085742905319e-06, + "logits/chosen": -0.59283447265625, + "logits/rejected": -0.662711501121521, + "logps/chosen": -54.52796936035156, + "logps/rejected": -117.89813232421875, + "loss": 0.8273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1460371017456055, + "rewards/margins": 6.45981502532959, + "rewards/rejected": -3.3137779235839844, + "step": 14687 + }, + { + "epoch": 3.67, + "grad_norm": 5.945957183837891, + "learning_rate": 1.6355042793597186e-06, + "logits/chosen": -0.5058908462524414, + "logits/rejected": -0.5446207523345947, + "logps/chosen": -69.25582122802734, + "logps/rejected": -111.70477294921875, + "loss": 0.764, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.038175106048584, + "rewards/margins": 5.8511552810668945, + "rewards/rejected": -2.8129806518554688, + "step": 14688 + }, + { + "epoch": 3.67, + "grad_norm": 4.987782001495361, + "learning_rate": 1.634922898954388e-06, + "logits/chosen": -0.6191046237945557, + "logits/rejected": -0.6561082601547241, + "logps/chosen": -46.68365478515625, + "logps/rejected": -114.65031433105469, + "loss": 0.582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0460801124572754, + "rewards/margins": 7.157409191131592, + "rewards/rejected": -4.111329078674316, + "step": 14689 + }, + { + "epoch": 3.67, + "grad_norm": 5.0679168701171875, + "learning_rate": 1.63434160170369e-06, + "logits/chosen": -0.5802900791168213, + "logits/rejected": -0.6508651971817017, + "logps/chosen": -50.246742248535156, + "logps/rejected": -88.61563110351562, + "loss": 0.5923, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4096672534942627, + "rewards/margins": 6.095737457275391, + "rewards/rejected": -2.686069965362549, + "step": 14690 + }, + { + "epoch": 3.68, + "grad_norm": 3.9989914894104004, + "learning_rate": 1.6337603876219927e-06, + "logits/chosen": -0.5372986793518066, + "logits/rejected": -0.6542127132415771, + "logps/chosen": -53.94623947143555, + "logps/rejected": -110.75431060791016, + "loss": 0.5676, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1855320930480957, + "rewards/margins": 8.377364158630371, + "rewards/rejected": -5.191832065582275, + "step": 14691 + }, + { + "epoch": 3.68, + "grad_norm": 2.1776123046875, + "learning_rate": 1.6331792567236565e-06, + "logits/chosen": -0.5372234582901001, + "logits/rejected": -0.6374523639678955, + "logps/chosen": -48.3773078918457, + "logps/rejected": -100.66651153564453, + "loss": 0.5572, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164479970932007, + "rewards/margins": 8.678537368774414, + "rewards/rejected": -5.514057636260986, + "step": 14692 + }, + { + "epoch": 3.68, + "grad_norm": 5.593409538269043, + "learning_rate": 1.6325982090230402e-06, + "logits/chosen": -0.44703447818756104, + "logits/rejected": -0.5632851123809814, + "logps/chosen": -56.728477478027344, + "logps/rejected": -106.64179992675781, + "loss": 0.5491, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1629228591918945, + "rewards/margins": 6.859364032745361, + "rewards/rejected": -3.6964409351348877, + "step": 14693 + }, + { + "epoch": 3.68, + "grad_norm": 13.67137622833252, + "learning_rate": 1.6320172445345063e-06, + "logits/chosen": -0.5935707092285156, + "logits/rejected": -0.6465845108032227, + "logps/chosen": -65.43147277832031, + "logps/rejected": -94.88259887695312, + "loss": 0.7379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.849956750869751, + "rewards/margins": 5.963068962097168, + "rewards/rejected": -3.113111734390259, + "step": 14694 + }, + { + "epoch": 3.68, + "grad_norm": 8.250128746032715, + "learning_rate": 1.6314363632724077e-06, + "logits/chosen": -0.6240032315254211, + "logits/rejected": -0.6993094086647034, + "logps/chosen": -44.46213912963867, + "logps/rejected": -111.28729248046875, + "loss": 0.5751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0290145874023438, + "rewards/margins": 7.670537948608398, + "rewards/rejected": -4.6415228843688965, + "step": 14695 + }, + { + "epoch": 3.68, + "grad_norm": 7.697347640991211, + "learning_rate": 1.6308555652510982e-06, + "logits/chosen": -0.5603277683258057, + "logits/rejected": -0.5935834050178528, + "logps/chosen": -48.654640197753906, + "logps/rejected": -105.12378692626953, + "loss": 0.5753, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1053528785705566, + "rewards/margins": 6.238264083862305, + "rewards/rejected": -3.132911443710327, + "step": 14696 + }, + { + "epoch": 3.68, + "grad_norm": 5.322539806365967, + "learning_rate": 1.6302748504849314e-06, + "logits/chosen": -0.5395358800888062, + "logits/rejected": -0.6303557753562927, + "logps/chosen": -53.6744499206543, + "logps/rejected": -103.55804443359375, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.932009220123291, + "rewards/margins": 7.2733941078186035, + "rewards/rejected": -4.3413848876953125, + "step": 14697 + }, + { + "epoch": 3.68, + "grad_norm": 41.422760009765625, + "learning_rate": 1.629694218988261e-06, + "logits/chosen": -0.5581374764442444, + "logits/rejected": -0.6518546342849731, + "logps/chosen": -56.467247009277344, + "logps/rejected": -105.6570816040039, + "loss": 0.7556, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9842216968536377, + "rewards/margins": 7.188420295715332, + "rewards/rejected": -4.204198360443115, + "step": 14698 + }, + { + "epoch": 3.68, + "grad_norm": 3.937044143676758, + "learning_rate": 1.6291136707754267e-06, + "logits/chosen": -0.5998356342315674, + "logits/rejected": -0.6274964809417725, + "logps/chosen": -48.0782356262207, + "logps/rejected": -125.62139892578125, + "loss": 0.5811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4615588188171387, + "rewards/margins": 8.241079330444336, + "rewards/rejected": -4.779520034790039, + "step": 14699 + }, + { + "epoch": 3.68, + "grad_norm": 7.42918586730957, + "learning_rate": 1.62853320586078e-06, + "logits/chosen": -0.5756239891052246, + "logits/rejected": -0.6330012083053589, + "logps/chosen": -47.13349151611328, + "logps/rejected": -110.45844268798828, + "loss": 0.5335, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1251072883605957, + "rewards/margins": 6.475279331207275, + "rewards/rejected": -3.350172519683838, + "step": 14700 + }, + { + "epoch": 3.68, + "grad_norm": 4.333806991577148, + "learning_rate": 1.6279528242586655e-06, + "logits/chosen": -0.5227643251419067, + "logits/rejected": -0.5817952156066895, + "logps/chosen": -39.805328369140625, + "logps/rejected": -101.13645935058594, + "loss": 0.6246, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0771145820617676, + "rewards/margins": 6.6337103843688965, + "rewards/rejected": -3.556595802307129, + "step": 14701 + }, + { + "epoch": 3.68, + "grad_norm": 4.54723596572876, + "learning_rate": 1.6273725259834227e-06, + "logits/chosen": -0.6369195580482483, + "logits/rejected": -0.5904511213302612, + "logps/chosen": -81.89128112792969, + "logps/rejected": -115.71955871582031, + "loss": 0.7257, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.228492259979248, + "rewards/margins": 7.155270099639893, + "rewards/rejected": -3.9267783164978027, + "step": 14702 + }, + { + "epoch": 3.68, + "grad_norm": 14.143898963928223, + "learning_rate": 1.6267923110493911e-06, + "logits/chosen": -0.5211274027824402, + "logits/rejected": -0.603201150894165, + "logps/chosen": -49.808345794677734, + "logps/rejected": -115.47444152832031, + "loss": 0.6349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.337279796600342, + "rewards/margins": 7.240868091583252, + "rewards/rejected": -3.90358829498291, + "step": 14703 + }, + { + "epoch": 3.68, + "grad_norm": 6.041640281677246, + "learning_rate": 1.6262121794709107e-06, + "logits/chosen": -0.6286563873291016, + "logits/rejected": -0.7103962302207947, + "logps/chosen": -51.15838623046875, + "logps/rejected": -103.95179748535156, + "loss": 0.6914, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1082043647766113, + "rewards/margins": 6.578665733337402, + "rewards/rejected": -3.470461845397949, + "step": 14704 + }, + { + "epoch": 3.68, + "grad_norm": 3.673199415206909, + "learning_rate": 1.6256321312623163e-06, + "logits/chosen": -0.5315273404121399, + "logits/rejected": -0.5979611277580261, + "logps/chosen": -60.81753921508789, + "logps/rejected": -100.53671264648438, + "loss": 0.6052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0773866176605225, + "rewards/margins": 5.586996555328369, + "rewards/rejected": -2.5096094608306885, + "step": 14705 + }, + { + "epoch": 3.68, + "grad_norm": 4.222343921661377, + "learning_rate": 1.6250521664379387e-06, + "logits/chosen": -0.5825725197792053, + "logits/rejected": -0.6916824579238892, + "logps/chosen": -49.1046257019043, + "logps/rejected": -115.82968139648438, + "loss": 0.5228, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.290801763534546, + "rewards/margins": 8.684333801269531, + "rewards/rejected": -5.393531799316406, + "step": 14706 + }, + { + "epoch": 3.68, + "grad_norm": 3.2776174545288086, + "learning_rate": 1.6244722850121143e-06, + "logits/chosen": -0.5990104675292969, + "logits/rejected": -0.6487597227096558, + "logps/chosen": -53.68709945678711, + "logps/rejected": -118.53971862792969, + "loss": 0.5883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.88631010055542, + "rewards/margins": 7.770453929901123, + "rewards/rejected": -4.884143829345703, + "step": 14707 + }, + { + "epoch": 3.68, + "grad_norm": 3.658179759979248, + "learning_rate": 1.6238924869991695e-06, + "logits/chosen": -0.5444297790527344, + "logits/rejected": -0.6289485096931458, + "logps/chosen": -57.518287658691406, + "logps/rejected": -101.08539581298828, + "loss": 0.659, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3718698024749756, + "rewards/margins": 7.208355903625488, + "rewards/rejected": -3.8364858627319336, + "step": 14708 + }, + { + "epoch": 3.68, + "grad_norm": 7.200192928314209, + "learning_rate": 1.6233127724134306e-06, + "logits/chosen": -0.48752671480178833, + "logits/rejected": -0.5617837309837341, + "logps/chosen": -61.698890686035156, + "logps/rejected": -90.77449035644531, + "loss": 0.6577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0057075023651123, + "rewards/margins": 5.5349297523498535, + "rewards/rejected": -2.5292224884033203, + "step": 14709 + }, + { + "epoch": 3.68, + "grad_norm": 3.6506991386413574, + "learning_rate": 1.622733141269227e-06, + "logits/chosen": -0.5024988651275635, + "logits/rejected": -0.5939018130302429, + "logps/chosen": -78.45008850097656, + "logps/rejected": -103.89423370361328, + "loss": 0.6867, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8776512145996094, + "rewards/margins": 5.482760429382324, + "rewards/rejected": -2.605109214782715, + "step": 14710 + }, + { + "epoch": 3.68, + "grad_norm": 6.408716678619385, + "learning_rate": 1.6221535935808774e-06, + "logits/chosen": -0.5714104771614075, + "logits/rejected": -0.6426911354064941, + "logps/chosen": -47.7834358215332, + "logps/rejected": -107.26341247558594, + "loss": 0.5543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1523871421813965, + "rewards/margins": 8.115803718566895, + "rewards/rejected": -4.96341609954834, + "step": 14711 + }, + { + "epoch": 3.68, + "grad_norm": 2.2363390922546387, + "learning_rate": 1.6215741293627075e-06, + "logits/chosen": -0.5176240801811218, + "logits/rejected": -0.5835426449775696, + "logps/chosen": -52.430057525634766, + "logps/rejected": -104.51349639892578, + "loss": 0.5858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3876545429229736, + "rewards/margins": 6.661074161529541, + "rewards/rejected": -3.2734193801879883, + "step": 14712 + }, + { + "epoch": 3.68, + "grad_norm": 4.586342811584473, + "learning_rate": 1.6209947486290323e-06, + "logits/chosen": -0.5392429828643799, + "logits/rejected": -0.5776026248931885, + "logps/chosen": -42.21311569213867, + "logps/rejected": -117.86034393310547, + "loss": 0.602, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2801384925842285, + "rewards/margins": 7.7861433029174805, + "rewards/rejected": -4.50600528717041, + "step": 14713 + }, + { + "epoch": 3.68, + "grad_norm": 3.5833630561828613, + "learning_rate": 1.6204154513941727e-06, + "logits/chosen": -0.49884507060050964, + "logits/rejected": -0.6056802272796631, + "logps/chosen": -61.643959045410156, + "logps/rejected": -105.74383544921875, + "loss": 0.5544, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8653242588043213, + "rewards/margins": 7.417905807495117, + "rewards/rejected": -4.552581787109375, + "step": 14714 + }, + { + "epoch": 3.68, + "grad_norm": 3.6389060020446777, + "learning_rate": 1.6198362376724419e-06, + "logits/chosen": -0.6054969429969788, + "logits/rejected": -0.6815000772476196, + "logps/chosen": -56.29658126831055, + "logps/rejected": -134.9075469970703, + "loss": 0.6644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.383408546447754, + "rewards/margins": 9.10543441772461, + "rewards/rejected": -5.722026824951172, + "step": 14715 + }, + { + "epoch": 3.68, + "grad_norm": 7.260465621948242, + "learning_rate": 1.6192571074781516e-06, + "logits/chosen": -0.5376390814781189, + "logits/rejected": -0.6429783701896667, + "logps/chosen": -60.7193717956543, + "logps/rejected": -114.61640167236328, + "loss": 0.61, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8273985385894775, + "rewards/margins": 7.914093971252441, + "rewards/rejected": -5.086695194244385, + "step": 14716 + }, + { + "epoch": 3.68, + "grad_norm": 6.149951457977295, + "learning_rate": 1.6186780608256154e-06, + "logits/chosen": -0.5639499425888062, + "logits/rejected": -0.6612356901168823, + "logps/chosen": -53.73174285888672, + "logps/rejected": -93.65829467773438, + "loss": 0.6484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.228476047515869, + "rewards/margins": 6.608800888061523, + "rewards/rejected": -3.380324602127075, + "step": 14717 + }, + { + "epoch": 3.68, + "grad_norm": 54.38700485229492, + "learning_rate": 1.6180990977291393e-06, + "logits/chosen": -0.5229128003120422, + "logits/rejected": -0.6108993291854858, + "logps/chosen": -63.487667083740234, + "logps/rejected": -91.11123657226562, + "loss": 1.0074, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9557759761810303, + "rewards/margins": 6.075138092041016, + "rewards/rejected": -3.1193621158599854, + "step": 14718 + }, + { + "epoch": 3.68, + "grad_norm": 15.55461597442627, + "learning_rate": 1.6175202182030336e-06, + "logits/chosen": -0.5456337928771973, + "logits/rejected": -0.6080765128135681, + "logps/chosen": -54.82798385620117, + "logps/rejected": -103.28756713867188, + "loss": 0.6662, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0096006393432617, + "rewards/margins": 6.01323127746582, + "rewards/rejected": -3.0036306381225586, + "step": 14719 + }, + { + "epoch": 3.68, + "grad_norm": 24.823654174804688, + "learning_rate": 1.6169414222616009e-06, + "logits/chosen": -0.5231068134307861, + "logits/rejected": -0.597258448600769, + "logps/chosen": -63.67070388793945, + "logps/rejected": -109.30873107910156, + "loss": 0.6677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.270106077194214, + "rewards/margins": 6.777488708496094, + "rewards/rejected": -3.507382392883301, + "step": 14720 + }, + { + "epoch": 3.68, + "grad_norm": 3.160154104232788, + "learning_rate": 1.616362709919142e-06, + "logits/chosen": -0.5197652578353882, + "logits/rejected": -0.6469293832778931, + "logps/chosen": -52.357666015625, + "logps/rejected": -107.50635528564453, + "loss": 0.5773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4715638160705566, + "rewards/margins": 8.295690536499023, + "rewards/rejected": -4.824126243591309, + "step": 14721 + }, + { + "epoch": 3.68, + "grad_norm": 5.157304286956787, + "learning_rate": 1.61578408118996e-06, + "logits/chosen": -0.49480384588241577, + "logits/rejected": -0.5808756947517395, + "logps/chosen": -57.17893981933594, + "logps/rejected": -96.01142883300781, + "loss": 0.6349, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9190711975097656, + "rewards/margins": 5.856093406677246, + "rewards/rejected": -2.9370222091674805, + "step": 14722 + }, + { + "epoch": 3.68, + "grad_norm": 8.77702522277832, + "learning_rate": 1.615205536088356e-06, + "logits/chosen": -0.5564814805984497, + "logits/rejected": -0.6035267114639282, + "logps/chosen": -54.66204071044922, + "logps/rejected": -92.63347625732422, + "loss": 0.8419, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.753170967102051, + "rewards/margins": 4.860091209411621, + "rewards/rejected": -2.1069202423095703, + "step": 14723 + }, + { + "epoch": 3.68, + "grad_norm": 4.29155969619751, + "learning_rate": 1.6146270746286201e-06, + "logits/chosen": -0.5356745719909668, + "logits/rejected": -0.6057666540145874, + "logps/chosen": -66.90291595458984, + "logps/rejected": -109.84083557128906, + "loss": 0.605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.249807357788086, + "rewards/margins": 7.14375638961792, + "rewards/rejected": -3.893949270248413, + "step": 14724 + }, + { + "epoch": 3.68, + "grad_norm": 3.813608169555664, + "learning_rate": 1.61404869682505e-06, + "logits/chosen": -0.5801215171813965, + "logits/rejected": -0.6517429947853088, + "logps/chosen": -51.068092346191406, + "logps/rejected": -97.33863830566406, + "loss": 0.6537, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.227674961090088, + "rewards/margins": 6.556258201599121, + "rewards/rejected": -3.328582763671875, + "step": 14725 + }, + { + "epoch": 3.68, + "grad_norm": 6.525885105133057, + "learning_rate": 1.6134704026919417e-06, + "logits/chosen": -0.5828865766525269, + "logits/rejected": -0.6677366495132446, + "logps/chosen": -59.50208282470703, + "logps/rejected": -100.84383392333984, + "loss": 0.6618, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.78874135017395, + "rewards/margins": 6.806737899780273, + "rewards/rejected": -4.017996311187744, + "step": 14726 + }, + { + "epoch": 3.68, + "grad_norm": 3.080378532409668, + "learning_rate": 1.6128921922435786e-06, + "logits/chosen": -0.5601067543029785, + "logits/rejected": -0.6168158650398254, + "logps/chosen": -48.626731872558594, + "logps/rejected": -121.86939239501953, + "loss": 0.5396, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5016844272613525, + "rewards/margins": 8.551292419433594, + "rewards/rejected": -5.04960823059082, + "step": 14727 + }, + { + "epoch": 3.68, + "grad_norm": 3.942638397216797, + "learning_rate": 1.6123140654942514e-06, + "logits/chosen": -0.5098389387130737, + "logits/rejected": -0.6028927564620972, + "logps/chosen": -54.523353576660156, + "logps/rejected": -99.38729095458984, + "loss": 0.6955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977125406265259, + "rewards/margins": 7.171133995056152, + "rewards/rejected": -4.194008827209473, + "step": 14728 + }, + { + "epoch": 3.68, + "grad_norm": 9.66840934753418, + "learning_rate": 1.6117360224582495e-06, + "logits/chosen": -0.6484197378158569, + "logits/rejected": -0.7285736203193665, + "logps/chosen": -53.95109939575195, + "logps/rejected": -125.5915756225586, + "loss": 0.5844, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2422585487365723, + "rewards/margins": 8.163186073303223, + "rewards/rejected": -4.92092752456665, + "step": 14729 + }, + { + "epoch": 3.68, + "grad_norm": 4.193113803863525, + "learning_rate": 1.611158063149854e-06, + "logits/chosen": -0.5265790224075317, + "logits/rejected": -0.6274702548980713, + "logps/chosen": -57.658267974853516, + "logps/rejected": -116.876220703125, + "loss": 0.5655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0702621936798096, + "rewards/margins": 8.25149154663086, + "rewards/rejected": -5.181229114532471, + "step": 14730 + }, + { + "epoch": 3.69, + "grad_norm": 3.2284514904022217, + "learning_rate": 1.6105801875833453e-06, + "logits/chosen": -0.5199403166770935, + "logits/rejected": -0.5914626121520996, + "logps/chosen": -55.19075012207031, + "logps/rejected": -108.28705596923828, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2910094261169434, + "rewards/margins": 6.9751434326171875, + "rewards/rejected": -3.684133768081665, + "step": 14731 + }, + { + "epoch": 3.69, + "grad_norm": 3.282792568206787, + "learning_rate": 1.6100023957730077e-06, + "logits/chosen": -0.4877947270870209, + "logits/rejected": -0.5986363887786865, + "logps/chosen": -63.389984130859375, + "logps/rejected": -99.01084899902344, + "loss": 0.6327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.451050281524658, + "rewards/margins": 6.884299278259277, + "rewards/rejected": -3.433248519897461, + "step": 14732 + }, + { + "epoch": 3.69, + "grad_norm": 7.582329750061035, + "learning_rate": 1.609424687733116e-06, + "logits/chosen": -0.6536191701889038, + "logits/rejected": -0.634365439414978, + "logps/chosen": -69.66250610351562, + "logps/rejected": -105.70878601074219, + "loss": 0.6574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.122541904449463, + "rewards/margins": 5.915921211242676, + "rewards/rejected": -2.7933785915374756, + "step": 14733 + }, + { + "epoch": 3.69, + "grad_norm": 4.016461372375488, + "learning_rate": 1.6088470634779452e-06, + "logits/chosen": -0.5229964256286621, + "logits/rejected": -0.6137431263923645, + "logps/chosen": -54.16036605834961, + "logps/rejected": -122.50346374511719, + "loss": 0.5351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0938920974731445, + "rewards/margins": 7.40791130065918, + "rewards/rejected": -4.314018726348877, + "step": 14734 + }, + { + "epoch": 3.69, + "grad_norm": 3.871969699859619, + "learning_rate": 1.6082695230217721e-06, + "logits/chosen": -0.5883080363273621, + "logits/rejected": -0.7076659798622131, + "logps/chosen": -50.93073272705078, + "logps/rejected": -108.9646224975586, + "loss": 0.5911, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2070348262786865, + "rewards/margins": 7.970970630645752, + "rewards/rejected": -4.76393461227417, + "step": 14735 + }, + { + "epoch": 3.69, + "grad_norm": 4.703510761260986, + "learning_rate": 1.6076920663788652e-06, + "logits/chosen": -0.46411556005477905, + "logits/rejected": -0.5560564398765564, + "logps/chosen": -62.24547576904297, + "logps/rejected": -108.21354675292969, + "loss": 0.5821, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.073619842529297, + "rewards/margins": 6.266540050506592, + "rewards/rejected": -3.1929192543029785, + "step": 14736 + }, + { + "epoch": 3.69, + "grad_norm": 9.533978462219238, + "learning_rate": 1.6071146935634968e-06, + "logits/chosen": -0.6154098510742188, + "logits/rejected": -0.6340059041976929, + "logps/chosen": -47.36043930053711, + "logps/rejected": -106.7688980102539, + "loss": 0.7428, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7945749759674072, + "rewards/margins": 6.774630069732666, + "rewards/rejected": -3.9800546169281006, + "step": 14737 + }, + { + "epoch": 3.69, + "grad_norm": 3.099881410598755, + "learning_rate": 1.6065374045899334e-06, + "logits/chosen": -0.4866745173931122, + "logits/rejected": -0.6304542422294617, + "logps/chosen": -64.41691589355469, + "logps/rejected": -92.80443572998047, + "loss": 0.556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037076234817505, + "rewards/margins": 7.62979793548584, + "rewards/rejected": -4.592720985412598, + "step": 14738 + }, + { + "epoch": 3.69, + "grad_norm": 3.6401619911193848, + "learning_rate": 1.6059601994724389e-06, + "logits/chosen": -0.5626040101051331, + "logits/rejected": -0.6154852509498596, + "logps/chosen": -56.62971496582031, + "logps/rejected": -107.86246490478516, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2021982669830322, + "rewards/margins": 7.016294479370117, + "rewards/rejected": -3.814096450805664, + "step": 14739 + }, + { + "epoch": 3.69, + "grad_norm": 3.8603386878967285, + "learning_rate": 1.6053830782252794e-06, + "logits/chosen": -0.5671178102493286, + "logits/rejected": -0.609668493270874, + "logps/chosen": -57.356441497802734, + "logps/rejected": -111.90534210205078, + "loss": 0.7136, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2750940322875977, + "rewards/margins": 5.9962568283081055, + "rewards/rejected": -2.7211625576019287, + "step": 14740 + }, + { + "epoch": 3.69, + "grad_norm": 5.3464250564575195, + "learning_rate": 1.6048060408627142e-06, + "logits/chosen": -0.5480939149856567, + "logits/rejected": -0.6375125050544739, + "logps/chosen": -56.06231689453125, + "logps/rejected": -100.79219818115234, + "loss": 0.7282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8534743785858154, + "rewards/margins": 6.505014896392822, + "rewards/rejected": -3.6515402793884277, + "step": 14741 + }, + { + "epoch": 3.69, + "grad_norm": 2.3175511360168457, + "learning_rate": 1.6042290873990018e-06, + "logits/chosen": -0.5231702327728271, + "logits/rejected": -0.6018162369728088, + "logps/chosen": -52.87916946411133, + "logps/rejected": -104.54322814941406, + "loss": 0.5476, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182854175567627, + "rewards/margins": 7.430143356323242, + "rewards/rejected": -4.247289180755615, + "step": 14742 + }, + { + "epoch": 3.69, + "grad_norm": 3.9437923431396484, + "learning_rate": 1.6036522178484005e-06, + "logits/chosen": -0.5020750761032104, + "logits/rejected": -0.6072996854782104, + "logps/chosen": -52.433448791503906, + "logps/rejected": -107.95564270019531, + "loss": 0.589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.842515230178833, + "rewards/margins": 7.581465721130371, + "rewards/rejected": -4.738950252532959, + "step": 14743 + }, + { + "epoch": 3.69, + "grad_norm": 1.957534909248352, + "learning_rate": 1.6030754322251667e-06, + "logits/chosen": -0.5837791562080383, + "logits/rejected": -0.6770638823509216, + "logps/chosen": -53.956478118896484, + "logps/rejected": -103.3290786743164, + "loss": 0.5939, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2340760231018066, + "rewards/margins": 7.775304794311523, + "rewards/rejected": -4.541228771209717, + "step": 14744 + }, + { + "epoch": 3.69, + "grad_norm": 4.621178150177002, + "learning_rate": 1.602498730543553e-06, + "logits/chosen": -0.5212327837944031, + "logits/rejected": -0.5985292792320251, + "logps/chosen": -62.99530029296875, + "logps/rejected": -87.91456604003906, + "loss": 0.7118, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.524595260620117, + "rewards/margins": 6.624805450439453, + "rewards/rejected": -3.100210189819336, + "step": 14745 + }, + { + "epoch": 3.69, + "grad_norm": 4.330843448638916, + "learning_rate": 1.601922112817807e-06, + "logits/chosen": -0.5332399010658264, + "logits/rejected": -0.6149857044219971, + "logps/chosen": -49.501373291015625, + "logps/rejected": -103.95568084716797, + "loss": 0.662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.411464214324951, + "rewards/margins": 6.877801895141602, + "rewards/rejected": -3.4663376808166504, + "step": 14746 + }, + { + "epoch": 3.69, + "grad_norm": 9.258234024047852, + "learning_rate": 1.6013455790621818e-06, + "logits/chosen": -0.5633183717727661, + "logits/rejected": -0.5694791078567505, + "logps/chosen": -46.124202728271484, + "logps/rejected": -113.30332946777344, + "loss": 0.7449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2489166259765625, + "rewards/margins": 6.919609546661377, + "rewards/rejected": -3.6706933975219727, + "step": 14747 + }, + { + "epoch": 3.69, + "grad_norm": 5.163473129272461, + "learning_rate": 1.6007691292909223e-06, + "logits/chosen": -0.6348300576210022, + "logits/rejected": -0.7216312289237976, + "logps/chosen": -64.3751449584961, + "logps/rejected": -95.0001449584961, + "loss": 0.7521, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.516697883605957, + "rewards/margins": 6.530055046081543, + "rewards/rejected": -3.013357162475586, + "step": 14748 + }, + { + "epoch": 3.69, + "grad_norm": 5.810948848724365, + "learning_rate": 1.6001927635182718e-06, + "logits/chosen": -0.5601155161857605, + "logits/rejected": -0.6340451240539551, + "logps/chosen": -52.69596862792969, + "logps/rejected": -85.89765930175781, + "loss": 0.7451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094963312149048, + "rewards/margins": 5.06782341003418, + "rewards/rejected": -1.972860336303711, + "step": 14749 + }, + { + "epoch": 3.69, + "grad_norm": 1.931860089302063, + "learning_rate": 1.5996164817584753e-06, + "logits/chosen": -0.5660635232925415, + "logits/rejected": -0.6742553114891052, + "logps/chosen": -42.656654357910156, + "logps/rejected": -92.63400268554688, + "loss": 0.4756, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1555044651031494, + "rewards/margins": 7.3871026039123535, + "rewards/rejected": -4.231596946716309, + "step": 14750 + }, + { + "epoch": 3.69, + "grad_norm": 4.000225067138672, + "learning_rate": 1.599040284025773e-06, + "logits/chosen": -0.5287156105041504, + "logits/rejected": -0.6042317152023315, + "logps/chosen": -58.35696029663086, + "logps/rejected": -93.67535400390625, + "loss": 0.6629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2760372161865234, + "rewards/margins": 6.664821624755859, + "rewards/rejected": -3.388784170150757, + "step": 14751 + }, + { + "epoch": 3.69, + "grad_norm": 3.165817975997925, + "learning_rate": 1.5984641703344006e-06, + "logits/chosen": -0.5312100648880005, + "logits/rejected": -0.595138669013977, + "logps/chosen": -45.908512115478516, + "logps/rejected": -98.54303741455078, + "loss": 0.6279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4923040866851807, + "rewards/margins": 7.096342086791992, + "rewards/rejected": -3.6040377616882324, + "step": 14752 + }, + { + "epoch": 3.69, + "grad_norm": 7.5741448402404785, + "learning_rate": 1.5978881406985986e-06, + "logits/chosen": -0.550434947013855, + "logits/rejected": -0.6347739100456238, + "logps/chosen": -54.88317108154297, + "logps/rejected": -101.32102966308594, + "loss": 0.6812, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.175597667694092, + "rewards/margins": 6.682639122009277, + "rewards/rejected": -3.5070412158966064, + "step": 14753 + }, + { + "epoch": 3.69, + "grad_norm": 4.731673240661621, + "learning_rate": 1.5973121951325975e-06, + "logits/chosen": -0.5989682674407959, + "logits/rejected": -0.6819876432418823, + "logps/chosen": -48.43456268310547, + "logps/rejected": -117.28366088867188, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1031785011291504, + "rewards/margins": 7.688971042633057, + "rewards/rejected": -4.585792541503906, + "step": 14754 + }, + { + "epoch": 3.69, + "grad_norm": 4.2838335037231445, + "learning_rate": 1.596736333650633e-06, + "logits/chosen": -0.5760364532470703, + "logits/rejected": -0.6851099133491516, + "logps/chosen": -45.937522888183594, + "logps/rejected": -88.86253356933594, + "loss": 0.5614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1704795360565186, + "rewards/margins": 7.368960857391357, + "rewards/rejected": -4.19848108291626, + "step": 14755 + }, + { + "epoch": 3.69, + "grad_norm": 6.120757102966309, + "learning_rate": 1.596160556266932e-06, + "logits/chosen": -0.4869474172592163, + "logits/rejected": -0.5971430540084839, + "logps/chosen": -64.62859344482422, + "logps/rejected": -98.90985107421875, + "loss": 0.6191, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.415925979614258, + "rewards/margins": 8.0804443359375, + "rewards/rejected": -4.664517879486084, + "step": 14756 + }, + { + "epoch": 3.69, + "grad_norm": 11.270403861999512, + "learning_rate": 1.5955848629957255e-06, + "logits/chosen": -0.475612610578537, + "logits/rejected": -0.5766485929489136, + "logps/chosen": -63.063873291015625, + "logps/rejected": -97.12174987792969, + "loss": 0.7669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.890353202819824, + "rewards/margins": 6.1133832931518555, + "rewards/rejected": -3.223029613494873, + "step": 14757 + }, + { + "epoch": 3.69, + "grad_norm": 5.529366493225098, + "learning_rate": 1.5950092538512384e-06, + "logits/chosen": -0.5660954117774963, + "logits/rejected": -0.6346839070320129, + "logps/chosen": -48.9393196105957, + "logps/rejected": -99.46199798583984, + "loss": 0.6265, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.127133846282959, + "rewards/margins": 6.764458179473877, + "rewards/rejected": -3.637324571609497, + "step": 14758 + }, + { + "epoch": 3.69, + "grad_norm": 2.9138097763061523, + "learning_rate": 1.5944337288476925e-06, + "logits/chosen": -0.5463320016860962, + "logits/rejected": -0.6070358157157898, + "logps/chosen": -54.79509735107422, + "logps/rejected": -113.66521453857422, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9474120140075684, + "rewards/margins": 7.092545509338379, + "rewards/rejected": -4.145133018493652, + "step": 14759 + }, + { + "epoch": 3.69, + "grad_norm": 5.243396282196045, + "learning_rate": 1.5938582879993137e-06, + "logits/chosen": -0.6018918752670288, + "logits/rejected": -0.7409428954124451, + "logps/chosen": -54.331565856933594, + "logps/rejected": -94.5779800415039, + "loss": 0.6964, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.312221050262451, + "rewards/margins": 7.164864540100098, + "rewards/rejected": -3.8526434898376465, + "step": 14760 + }, + { + "epoch": 3.69, + "grad_norm": 7.22503137588501, + "learning_rate": 1.5932829313203174e-06, + "logits/chosen": -0.5476466417312622, + "logits/rejected": -0.6333239078521729, + "logps/chosen": -48.992095947265625, + "logps/rejected": -102.34677124023438, + "loss": 0.6455, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6748266220092773, + "rewards/margins": 6.542866230010986, + "rewards/rejected": -3.868039846420288, + "step": 14761 + }, + { + "epoch": 3.69, + "grad_norm": 14.436149597167969, + "learning_rate": 1.5927076588249246e-06, + "logits/chosen": -0.486293226480484, + "logits/rejected": -0.5966624021530151, + "logps/chosen": -53.639984130859375, + "logps/rejected": -101.6790542602539, + "loss": 0.5897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3483972549438477, + "rewards/margins": 7.240760326385498, + "rewards/rejected": -3.8923635482788086, + "step": 14762 + }, + { + "epoch": 3.69, + "grad_norm": 3.9105241298675537, + "learning_rate": 1.5921324705273504e-06, + "logits/chosen": -0.4826365113258362, + "logits/rejected": -0.5753215551376343, + "logps/chosen": -51.82234573364258, + "logps/rejected": -103.18388366699219, + "loss": 0.6151, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.209057331085205, + "rewards/margins": 6.915195465087891, + "rewards/rejected": -3.7061376571655273, + "step": 14763 + }, + { + "epoch": 3.69, + "grad_norm": 6.715968608856201, + "learning_rate": 1.591557366441806e-06, + "logits/chosen": -0.552760899066925, + "logits/rejected": -0.6328697800636292, + "logps/chosen": -50.28246307373047, + "logps/rejected": -98.76339721679688, + "loss": 0.6279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0915427207946777, + "rewards/margins": 6.928297519683838, + "rewards/rejected": -3.83675479888916, + "step": 14764 + }, + { + "epoch": 3.69, + "grad_norm": 14.8798828125, + "learning_rate": 1.5909823465825058e-06, + "logits/chosen": -0.5300455093383789, + "logits/rejected": -0.6315553784370422, + "logps/chosen": -70.98597717285156, + "logps/rejected": -83.16963958740234, + "loss": 0.6819, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7646703720092773, + "rewards/margins": 6.005202293395996, + "rewards/rejected": -3.2405319213867188, + "step": 14765 + }, + { + "epoch": 3.69, + "grad_norm": 12.748954772949219, + "learning_rate": 1.5904074109636576e-06, + "logits/chosen": -0.5011477470397949, + "logits/rejected": -0.5765472650527954, + "logps/chosen": -69.11731719970703, + "logps/rejected": -89.25232696533203, + "loss": 0.8079, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.837423801422119, + "rewards/margins": 5.058754920959473, + "rewards/rejected": -2.2213313579559326, + "step": 14766 + }, + { + "epoch": 3.69, + "grad_norm": 5.137425899505615, + "learning_rate": 1.589832559599468e-06, + "logits/chosen": -0.6194152235984802, + "logits/rejected": -0.7084923982620239, + "logps/chosen": -44.109130859375, + "logps/rejected": -94.8001937866211, + "loss": 0.5682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.982024669647217, + "rewards/margins": 6.872281551361084, + "rewards/rejected": -3.890256643295288, + "step": 14767 + }, + { + "epoch": 3.69, + "grad_norm": 3.258147716522217, + "learning_rate": 1.5892577925041425e-06, + "logits/chosen": -0.510476291179657, + "logits/rejected": -0.5782557129859924, + "logps/chosen": -47.8749885559082, + "logps/rejected": -105.49285125732422, + "loss": 0.6346, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0845184326171875, + "rewards/margins": 7.544250965118408, + "rewards/rejected": -4.459732532501221, + "step": 14768 + }, + { + "epoch": 3.69, + "grad_norm": 43.98193359375, + "learning_rate": 1.588683109691888e-06, + "logits/chosen": -0.5120857954025269, + "logits/rejected": -0.5358554124832153, + "logps/chosen": -58.11761474609375, + "logps/rejected": -102.95672607421875, + "loss": 0.7033, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8483195304870605, + "rewards/margins": 4.64352560043335, + "rewards/rejected": -1.7952064275741577, + "step": 14769 + }, + { + "epoch": 3.69, + "grad_norm": 6.047701835632324, + "learning_rate": 1.588108511176899e-06, + "logits/chosen": -0.5308446288108826, + "logits/rejected": -0.6197803020477295, + "logps/chosen": -58.371307373046875, + "logps/rejected": -98.99972534179688, + "loss": 0.6401, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8684842586517334, + "rewards/margins": 6.757171630859375, + "rewards/rejected": -3.8886876106262207, + "step": 14770 + }, + { + "epoch": 3.7, + "grad_norm": 4.169511795043945, + "learning_rate": 1.5875339969733778e-06, + "logits/chosen": -0.5908375978469849, + "logits/rejected": -0.670813798904419, + "logps/chosen": -61.76189422607422, + "logps/rejected": -102.23854064941406, + "loss": 0.7648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1718642711639404, + "rewards/margins": 7.396962642669678, + "rewards/rejected": -4.22509765625, + "step": 14771 + }, + { + "epoch": 3.7, + "grad_norm": 3.6039047241210938, + "learning_rate": 1.5869595670955229e-06, + "logits/chosen": -0.5519473552703857, + "logits/rejected": -0.6343571543693542, + "logps/chosen": -46.01262283325195, + "logps/rejected": -88.34309387207031, + "loss": 0.5898, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2184791564941406, + "rewards/margins": 6.624497890472412, + "rewards/rejected": -3.4060187339782715, + "step": 14772 + }, + { + "epoch": 3.7, + "grad_norm": 6.468968391418457, + "learning_rate": 1.5863852215575277e-06, + "logits/chosen": -0.6212695240974426, + "logits/rejected": -0.6990517377853394, + "logps/chosen": -51.6357421875, + "logps/rejected": -111.739501953125, + "loss": 0.6337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.173874616622925, + "rewards/margins": 7.3970136642456055, + "rewards/rejected": -4.223138809204102, + "step": 14773 + }, + { + "epoch": 3.7, + "grad_norm": 22.046430587768555, + "learning_rate": 1.5858109603735827e-06, + "logits/chosen": -0.563239336013794, + "logits/rejected": -0.6721755266189575, + "logps/chosen": -61.513954162597656, + "logps/rejected": -105.81450653076172, + "loss": 0.6667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.201683521270752, + "rewards/margins": 7.129850387573242, + "rewards/rejected": -3.9281671047210693, + "step": 14774 + }, + { + "epoch": 3.7, + "grad_norm": 5.557712554931641, + "learning_rate": 1.585236783557882e-06, + "logits/chosen": -0.5990287661552429, + "logits/rejected": -0.6730753183364868, + "logps/chosen": -54.64406204223633, + "logps/rejected": -101.52661895751953, + "loss": 0.6375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9840807914733887, + "rewards/margins": 7.131403923034668, + "rewards/rejected": -4.147323131561279, + "step": 14775 + }, + { + "epoch": 3.7, + "grad_norm": 3.905163526535034, + "learning_rate": 1.5846626911246122e-06, + "logits/chosen": -0.6221945881843567, + "logits/rejected": -0.7216143608093262, + "logps/chosen": -58.74552917480469, + "logps/rejected": -100.16802215576172, + "loss": 0.6544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3487789630889893, + "rewards/margins": 6.913699626922607, + "rewards/rejected": -3.5649213790893555, + "step": 14776 + }, + { + "epoch": 3.7, + "grad_norm": 3.74259877204895, + "learning_rate": 1.5840886830879588e-06, + "logits/chosen": -0.6012066006660461, + "logits/rejected": -0.6726435422897339, + "logps/chosen": -52.6131591796875, + "logps/rejected": -84.62616729736328, + "loss": 0.6454, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150930881500244, + "rewards/margins": 6.317984580993652, + "rewards/rejected": -3.1670539379119873, + "step": 14777 + }, + { + "epoch": 3.7, + "grad_norm": 6.287954330444336, + "learning_rate": 1.5835147594621086e-06, + "logits/chosen": -0.5728088617324829, + "logits/rejected": -0.6502231359481812, + "logps/chosen": -50.5313835144043, + "logps/rejected": -97.9287338256836, + "loss": 0.6436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1578335762023926, + "rewards/margins": 6.190159320831299, + "rewards/rejected": -3.0323262214660645, + "step": 14778 + }, + { + "epoch": 3.7, + "grad_norm": 9.818999290466309, + "learning_rate": 1.5829409202612412e-06, + "logits/chosen": -0.5776873230934143, + "logits/rejected": -0.6496791839599609, + "logps/chosen": -48.80420684814453, + "logps/rejected": -113.09085083007812, + "loss": 0.6923, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.145373821258545, + "rewards/margins": 7.320523738861084, + "rewards/rejected": -4.175150394439697, + "step": 14779 + }, + { + "epoch": 3.7, + "grad_norm": 34.171932220458984, + "learning_rate": 1.5823671654995399e-06, + "logits/chosen": -0.5136831998825073, + "logits/rejected": -0.6478144526481628, + "logps/chosen": -63.592350006103516, + "logps/rejected": -110.74503326416016, + "loss": 0.698, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2541098594665527, + "rewards/margins": 8.638067245483398, + "rewards/rejected": -5.3839569091796875, + "step": 14780 + }, + { + "epoch": 3.7, + "grad_norm": 4.259859085083008, + "learning_rate": 1.5817934951911807e-06, + "logits/chosen": -0.5246514081954956, + "logits/rejected": -0.6644817590713501, + "logps/chosen": -49.375343322753906, + "logps/rejected": -91.13765716552734, + "loss": 0.5285, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98258638381958, + "rewards/margins": 7.039935111999512, + "rewards/rejected": -4.057348728179932, + "step": 14781 + }, + { + "epoch": 3.7, + "grad_norm": 2.7847750186920166, + "learning_rate": 1.5812199093503389e-06, + "logits/chosen": -0.5617526769638062, + "logits/rejected": -0.683523416519165, + "logps/chosen": -61.231903076171875, + "logps/rejected": -110.35576629638672, + "loss": 0.5656, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1929116249084473, + "rewards/margins": 7.481842517852783, + "rewards/rejected": -4.288930892944336, + "step": 14782 + }, + { + "epoch": 3.7, + "grad_norm": 4.8684306144714355, + "learning_rate": 1.580646407991191e-06, + "logits/chosen": -0.5599814653396606, + "logits/rejected": -0.6629824638366699, + "logps/chosen": -51.24315643310547, + "logps/rejected": -94.93992614746094, + "loss": 0.5606, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.302283525466919, + "rewards/margins": 7.758471488952637, + "rewards/rejected": -4.456188201904297, + "step": 14783 + }, + { + "epoch": 3.7, + "grad_norm": 7.1993207931518555, + "learning_rate": 1.5800729911279071e-06, + "logits/chosen": -0.510719895362854, + "logits/rejected": -0.5443398952484131, + "logps/chosen": -53.655879974365234, + "logps/rejected": -97.00650024414062, + "loss": 0.7084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9969937801361084, + "rewards/margins": 5.4240193367004395, + "rewards/rejected": -2.42702579498291, + "step": 14784 + }, + { + "epoch": 3.7, + "grad_norm": 4.242602825164795, + "learning_rate": 1.5794996587746558e-06, + "logits/chosen": -0.5703784823417664, + "logits/rejected": -0.6720230579376221, + "logps/chosen": -52.45891571044922, + "logps/rejected": -91.21868896484375, + "loss": 0.5769, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0407278537750244, + "rewards/margins": 7.045746326446533, + "rewards/rejected": -4.0050177574157715, + "step": 14785 + }, + { + "epoch": 3.7, + "grad_norm": 4.9313249588012695, + "learning_rate": 1.578926410945606e-06, + "logits/chosen": -0.5317044854164124, + "logits/rejected": -0.6256383657455444, + "logps/chosen": -56.92821502685547, + "logps/rejected": -102.590087890625, + "loss": 0.6383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0578486919403076, + "rewards/margins": 7.463731288909912, + "rewards/rejected": -4.405882835388184, + "step": 14786 + }, + { + "epoch": 3.7, + "grad_norm": 3.3812496662139893, + "learning_rate": 1.5783532476549252e-06, + "logits/chosen": -0.557200014591217, + "logits/rejected": -0.6457298994064331, + "logps/chosen": -53.67705535888672, + "logps/rejected": -98.30333709716797, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.284166097640991, + "rewards/margins": 8.03921127319336, + "rewards/rejected": -4.755044937133789, + "step": 14787 + }, + { + "epoch": 3.7, + "grad_norm": 4.192432403564453, + "learning_rate": 1.577780168916775e-06, + "logits/chosen": -0.5048784613609314, + "logits/rejected": -0.5466311573982239, + "logps/chosen": -57.37033462524414, + "logps/rejected": -98.58560180664062, + "loss": 0.67, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.077038049697876, + "rewards/margins": 6.082745552062988, + "rewards/rejected": -3.005706787109375, + "step": 14788 + }, + { + "epoch": 3.7, + "grad_norm": 7.430106163024902, + "learning_rate": 1.577207174745315e-06, + "logits/chosen": -0.6032698154449463, + "logits/rejected": -0.6414476633071899, + "logps/chosen": -67.11659240722656, + "logps/rejected": -117.06080627441406, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1577811241149902, + "rewards/margins": 7.008617877960205, + "rewards/rejected": -3.8508362770080566, + "step": 14789 + }, + { + "epoch": 3.7, + "grad_norm": 7.051849842071533, + "learning_rate": 1.5766342651547084e-06, + "logits/chosen": -0.5592968463897705, + "logits/rejected": -0.6062777638435364, + "logps/chosen": -50.98204040527344, + "logps/rejected": -116.29580688476562, + "loss": 0.6107, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1255710124969482, + "rewards/margins": 7.084659576416016, + "rewards/rejected": -3.9590885639190674, + "step": 14790 + }, + { + "epoch": 3.7, + "grad_norm": 4.868363380432129, + "learning_rate": 1.5760614401591102e-06, + "logits/chosen": -0.5259878039360046, + "logits/rejected": -0.6369582414627075, + "logps/chosen": -67.04104614257812, + "logps/rejected": -98.75331115722656, + "loss": 0.6887, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0101165771484375, + "rewards/margins": 6.889335632324219, + "rewards/rejected": -3.8792190551757812, + "step": 14791 + }, + { + "epoch": 3.7, + "grad_norm": 6.916166305541992, + "learning_rate": 1.5754886997726738e-06, + "logits/chosen": -0.5146335363388062, + "logits/rejected": -0.5802306532859802, + "logps/chosen": -52.96544647216797, + "logps/rejected": -100.54390716552734, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.995466470718384, + "rewards/margins": 6.670326232910156, + "rewards/rejected": -3.6748597621917725, + "step": 14792 + }, + { + "epoch": 3.7, + "grad_norm": 4.353733539581299, + "learning_rate": 1.5749160440095561e-06, + "logits/chosen": -0.5781861543655396, + "logits/rejected": -0.6829687356948853, + "logps/chosen": -57.82859420776367, + "logps/rejected": -101.77311706542969, + "loss": 0.5567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.144207715988159, + "rewards/margins": 8.028779983520508, + "rewards/rejected": -4.884572505950928, + "step": 14793 + }, + { + "epoch": 3.7, + "grad_norm": 6.391414165496826, + "learning_rate": 1.5743434728839058e-06, + "logits/chosen": -0.6107597947120667, + "logits/rejected": -0.6470253467559814, + "logps/chosen": -58.24432373046875, + "logps/rejected": -93.76463317871094, + "loss": 0.7101, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.821889877319336, + "rewards/margins": 6.594403266906738, + "rewards/rejected": -3.772512912750244, + "step": 14794 + }, + { + "epoch": 3.7, + "grad_norm": 6.146510124206543, + "learning_rate": 1.5737709864098704e-06, + "logits/chosen": -0.5046248435974121, + "logits/rejected": -0.5769387483596802, + "logps/chosen": -65.4675521850586, + "logps/rejected": -105.47117614746094, + "loss": 0.6363, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9887495040893555, + "rewards/margins": 6.38339900970459, + "rewards/rejected": -3.3946495056152344, + "step": 14795 + }, + { + "epoch": 3.7, + "grad_norm": 3.6368672847747803, + "learning_rate": 1.5731985846016e-06, + "logits/chosen": -0.5324850082397461, + "logits/rejected": -0.5933123826980591, + "logps/chosen": -47.78046417236328, + "logps/rejected": -108.002685546875, + "loss": 0.5702, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5664212703704834, + "rewards/margins": 8.362089157104492, + "rewards/rejected": -4.795668601989746, + "step": 14796 + }, + { + "epoch": 3.7, + "grad_norm": 4.355311870574951, + "learning_rate": 1.5726262674732357e-06, + "logits/chosen": -0.5524818897247314, + "logits/rejected": -0.6417281031608582, + "logps/chosen": -56.143402099609375, + "logps/rejected": -91.66932678222656, + "loss": 0.6782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9553871154785156, + "rewards/margins": 5.993107795715332, + "rewards/rejected": -3.037720203399658, + "step": 14797 + }, + { + "epoch": 3.7, + "grad_norm": 4.701535224914551, + "learning_rate": 1.5720540350389235e-06, + "logits/chosen": -0.541276752948761, + "logits/rejected": -0.6486514806747437, + "logps/chosen": -56.09357452392578, + "logps/rejected": -111.28300476074219, + "loss": 0.6279, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9843955039978027, + "rewards/margins": 8.523701667785645, + "rewards/rejected": -5.539305686950684, + "step": 14798 + }, + { + "epoch": 3.7, + "grad_norm": 13.470259666442871, + "learning_rate": 1.5714818873128007e-06, + "logits/chosen": -0.5466156005859375, + "logits/rejected": -0.6067167520523071, + "logps/chosen": -56.97606658935547, + "logps/rejected": -110.2060775756836, + "loss": 0.6584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1389143466949463, + "rewards/margins": 7.084356307983398, + "rewards/rejected": -3.9454424381256104, + "step": 14799 + }, + { + "epoch": 3.7, + "grad_norm": 4.011157035827637, + "learning_rate": 1.5709098243090088e-06, + "logits/chosen": -0.5395784974098206, + "logits/rejected": -0.6160440444946289, + "logps/chosen": -53.677486419677734, + "logps/rejected": -128.44747924804688, + "loss": 0.599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.191187620162964, + "rewards/margins": 8.280755996704102, + "rewards/rejected": -5.089567184448242, + "step": 14800 + }, + { + "epoch": 3.7, + "grad_norm": 6.058903694152832, + "learning_rate": 1.5703378460416824e-06, + "logits/chosen": -0.5000147819519043, + "logits/rejected": -0.5793550610542297, + "logps/chosen": -51.112056732177734, + "logps/rejected": -106.81729888916016, + "loss": 0.6259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0337181091308594, + "rewards/margins": 7.713634490966797, + "rewards/rejected": -4.6799163818359375, + "step": 14801 + }, + { + "epoch": 3.7, + "grad_norm": 6.083541393280029, + "learning_rate": 1.5697659525249543e-06, + "logits/chosen": -0.4442858397960663, + "logits/rejected": -0.47664567828178406, + "logps/chosen": -52.652076721191406, + "logps/rejected": -94.5239486694336, + "loss": 0.6952, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4410316944122314, + "rewards/margins": 5.204253196716309, + "rewards/rejected": -1.7632218599319458, + "step": 14802 + }, + { + "epoch": 3.7, + "grad_norm": 4.410938739776611, + "learning_rate": 1.569194143772959e-06, + "logits/chosen": -0.5138870477676392, + "logits/rejected": -0.6203961968421936, + "logps/chosen": -51.25831985473633, + "logps/rejected": -107.65898132324219, + "loss": 0.6103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3493432998657227, + "rewards/margins": 7.378046989440918, + "rewards/rejected": -4.028703212738037, + "step": 14803 + }, + { + "epoch": 3.7, + "grad_norm": 6.105734825134277, + "learning_rate": 1.5686224197998246e-06, + "logits/chosen": -0.6072590947151184, + "logits/rejected": -0.6525514125823975, + "logps/chosen": -45.572364807128906, + "logps/rejected": -108.02944946289062, + "loss": 0.7208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9348134994506836, + "rewards/margins": 5.955684661865234, + "rewards/rejected": -3.0208709239959717, + "step": 14804 + }, + { + "epoch": 3.7, + "grad_norm": 19.200929641723633, + "learning_rate": 1.5680507806196815e-06, + "logits/chosen": -0.5763987302780151, + "logits/rejected": -0.6047794222831726, + "logps/chosen": -51.9230842590332, + "logps/rejected": -105.53765869140625, + "loss": 0.7898, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0775065422058105, + "rewards/margins": 6.556777000427246, + "rewards/rejected": -3.4792704582214355, + "step": 14805 + }, + { + "epoch": 3.7, + "grad_norm": 2.5229899883270264, + "learning_rate": 1.5674792262466544e-06, + "logits/chosen": -0.5658590793609619, + "logits/rejected": -0.6062238812446594, + "logps/chosen": -44.44764709472656, + "logps/rejected": -111.62382507324219, + "loss": 0.5776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.51383638381958, + "rewards/margins": 7.373019218444824, + "rewards/rejected": -3.859182357788086, + "step": 14806 + }, + { + "epoch": 3.7, + "grad_norm": 4.2319536209106445, + "learning_rate": 1.5669077566948643e-06, + "logits/chosen": -0.543056845664978, + "logits/rejected": -0.6724944114685059, + "logps/chosen": -52.14251708984375, + "logps/rejected": -95.8442153930664, + "loss": 0.6058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.347240924835205, + "rewards/margins": 7.125432968139648, + "rewards/rejected": -3.7781925201416016, + "step": 14807 + }, + { + "epoch": 3.7, + "grad_norm": 8.647088050842285, + "learning_rate": 1.5663363719784368e-06, + "logits/chosen": -0.5488592982292175, + "logits/rejected": -0.624414324760437, + "logps/chosen": -53.763675689697266, + "logps/rejected": -103.37419891357422, + "loss": 0.6152, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8791894912719727, + "rewards/margins": 7.045839786529541, + "rewards/rejected": -4.16664981842041, + "step": 14808 + }, + { + "epoch": 3.7, + "grad_norm": 5.844645023345947, + "learning_rate": 1.5657650721114903e-06, + "logits/chosen": -0.570745050907135, + "logits/rejected": -0.6830437779426575, + "logps/chosen": -55.66267395019531, + "logps/rejected": -87.33159637451172, + "loss": 0.6468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8616089820861816, + "rewards/margins": 6.625617980957031, + "rewards/rejected": -3.7640089988708496, + "step": 14809 + }, + { + "epoch": 3.7, + "grad_norm": 6.295675754547119, + "learning_rate": 1.56519385710814e-06, + "logits/chosen": -0.5125772953033447, + "logits/rejected": -0.5860207676887512, + "logps/chosen": -55.606651306152344, + "logps/rejected": -96.770263671875, + "loss": 0.7347, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0349671840667725, + "rewards/margins": 5.551084518432617, + "rewards/rejected": -2.5161170959472656, + "step": 14810 + }, + { + "epoch": 3.71, + "grad_norm": 6.854610919952393, + "learning_rate": 1.5646227269825026e-06, + "logits/chosen": -0.5907130241394043, + "logits/rejected": -0.68048495054245, + "logps/chosen": -56.78129196166992, + "logps/rejected": -84.1793441772461, + "loss": 0.6844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098914623260498, + "rewards/margins": 5.993185043334961, + "rewards/rejected": -2.894270896911621, + "step": 14811 + }, + { + "epoch": 3.71, + "grad_norm": 4.9546799659729, + "learning_rate": 1.5640516817486951e-06, + "logits/chosen": -0.5187255144119263, + "logits/rejected": -0.5950483679771423, + "logps/chosen": -53.63973617553711, + "logps/rejected": -104.402587890625, + "loss": 0.6148, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.387328624725342, + "rewards/margins": 7.346441745758057, + "rewards/rejected": -3.959113597869873, + "step": 14812 + }, + { + "epoch": 3.71, + "grad_norm": 16.2301082611084, + "learning_rate": 1.5634807214208214e-06, + "logits/chosen": -0.47217950224876404, + "logits/rejected": -0.5894778370857239, + "logps/chosen": -63.30610656738281, + "logps/rejected": -112.95391845703125, + "loss": 0.6855, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.908576011657715, + "rewards/margins": 7.269786357879639, + "rewards/rejected": -4.361210346221924, + "step": 14813 + }, + { + "epoch": 3.71, + "grad_norm": 3.764956474304199, + "learning_rate": 1.5629098460129948e-06, + "logits/chosen": -0.495012104511261, + "logits/rejected": -0.6195509433746338, + "logps/chosen": -52.71794128417969, + "logps/rejected": -106.791748046875, + "loss": 0.5518, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8287875652313232, + "rewards/margins": 7.837989807128906, + "rewards/rejected": -5.00920295715332, + "step": 14814 + }, + { + "epoch": 3.71, + "grad_norm": 7.992599010467529, + "learning_rate": 1.5623390555393247e-06, + "logits/chosen": -0.6412792801856995, + "logits/rejected": -0.6877564787864685, + "logps/chosen": -46.245025634765625, + "logps/rejected": -102.78028869628906, + "loss": 0.5535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0448765754699707, + "rewards/margins": 6.015894889831543, + "rewards/rejected": -2.971017599105835, + "step": 14815 + }, + { + "epoch": 3.71, + "grad_norm": 5.687021255493164, + "learning_rate": 1.5617683500139102e-06, + "logits/chosen": -0.656722903251648, + "logits/rejected": -0.7349952459335327, + "logps/chosen": -59.90829849243164, + "logps/rejected": -95.17375183105469, + "loss": 0.6172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.995684862136841, + "rewards/margins": 6.651124954223633, + "rewards/rejected": -3.655440092086792, + "step": 14816 + }, + { + "epoch": 3.71, + "grad_norm": 8.377350807189941, + "learning_rate": 1.561197729450857e-06, + "logits/chosen": -0.580179750919342, + "logits/rejected": -0.6124727129936218, + "logps/chosen": -58.24427795410156, + "logps/rejected": -119.82820129394531, + "loss": 0.6436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.134871006011963, + "rewards/margins": 7.305150985717773, + "rewards/rejected": -4.170279502868652, + "step": 14817 + }, + { + "epoch": 3.71, + "grad_norm": 3.9703409671783447, + "learning_rate": 1.5606271938642665e-06, + "logits/chosen": -0.6028825640678406, + "logits/rejected": -0.6804267764091492, + "logps/chosen": -50.619850158691406, + "logps/rejected": -118.90911865234375, + "loss": 0.5802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1455955505371094, + "rewards/margins": 8.137770652770996, + "rewards/rejected": -4.9921746253967285, + "step": 14818 + }, + { + "epoch": 3.71, + "grad_norm": 6.480317115783691, + "learning_rate": 1.5600567432682367e-06, + "logits/chosen": -0.6212008595466614, + "logits/rejected": -0.697965681552887, + "logps/chosen": -69.92395782470703, + "logps/rejected": -96.67770385742188, + "loss": 0.834, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8260555267333984, + "rewards/margins": 6.127040386199951, + "rewards/rejected": -3.3009843826293945, + "step": 14819 + }, + { + "epoch": 3.71, + "grad_norm": 8.183895111083984, + "learning_rate": 1.559486377676862e-06, + "logits/chosen": -0.5546441078186035, + "logits/rejected": -0.6130632162094116, + "logps/chosen": -58.260986328125, + "logps/rejected": -103.52802276611328, + "loss": 0.8181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.683433771133423, + "rewards/margins": 5.558849811553955, + "rewards/rejected": -2.8754162788391113, + "step": 14820 + }, + { + "epoch": 3.71, + "grad_norm": 16.85284423828125, + "learning_rate": 1.55891609710424e-06, + "logits/chosen": -0.46372050046920776, + "logits/rejected": -0.5598949193954468, + "logps/chosen": -59.867332458496094, + "logps/rejected": -122.83516693115234, + "loss": 0.6368, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.045062780380249, + "rewards/margins": 6.3782477378845215, + "rewards/rejected": -3.3331851959228516, + "step": 14821 + }, + { + "epoch": 3.71, + "grad_norm": 3.434645414352417, + "learning_rate": 1.5583459015644597e-06, + "logits/chosen": -0.5609714388847351, + "logits/rejected": -0.6161047220230103, + "logps/chosen": -45.73652648925781, + "logps/rejected": -110.72261810302734, + "loss": 0.552, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1617953777313232, + "rewards/margins": 7.5934529304504395, + "rewards/rejected": -4.431657314300537, + "step": 14822 + }, + { + "epoch": 3.71, + "grad_norm": 19.026920318603516, + "learning_rate": 1.5577757910716146e-06, + "logits/chosen": -0.5614712834358215, + "logits/rejected": -0.6644942164421082, + "logps/chosen": -59.879127502441406, + "logps/rejected": -93.4337387084961, + "loss": 0.7229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1064364910125732, + "rewards/margins": 4.976377964019775, + "rewards/rejected": -1.8699415922164917, + "step": 14823 + }, + { + "epoch": 3.71, + "grad_norm": 4.174508571624756, + "learning_rate": 1.557205765639791e-06, + "logits/chosen": -0.504474401473999, + "logits/rejected": -0.5914244055747986, + "logps/chosen": -46.70307540893555, + "logps/rejected": -86.6236572265625, + "loss": 0.6107, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9145278930664062, + "rewards/margins": 6.457028388977051, + "rewards/rejected": -3.5425000190734863, + "step": 14824 + }, + { + "epoch": 3.71, + "grad_norm": 10.814973831176758, + "learning_rate": 1.5566358252830732e-06, + "logits/chosen": -0.5890741944313049, + "logits/rejected": -0.6517155170440674, + "logps/chosen": -58.77435302734375, + "logps/rejected": -88.0037841796875, + "loss": 0.6758, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4420394897460938, + "rewards/margins": 6.119058609008789, + "rewards/rejected": -2.677018880844116, + "step": 14825 + }, + { + "epoch": 3.71, + "grad_norm": 3.4402096271514893, + "learning_rate": 1.5560659700155477e-06, + "logits/chosen": -0.5148283839225769, + "logits/rejected": -0.6250708103179932, + "logps/chosen": -66.70687866210938, + "logps/rejected": -88.69263458251953, + "loss": 0.5947, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.130439281463623, + "rewards/margins": 6.626102447509766, + "rewards/rejected": -3.4956629276275635, + "step": 14826 + }, + { + "epoch": 3.71, + "grad_norm": 6.066176891326904, + "learning_rate": 1.5554961998512963e-06, + "logits/chosen": -0.5417444109916687, + "logits/rejected": -0.6542983651161194, + "logps/chosen": -51.32268142700195, + "logps/rejected": -120.14950561523438, + "loss": 0.5936, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172210693359375, + "rewards/margins": 8.038530349731445, + "rewards/rejected": -4.866320610046387, + "step": 14827 + }, + { + "epoch": 3.71, + "grad_norm": 4.742952346801758, + "learning_rate": 1.5549265148043952e-06, + "logits/chosen": -0.642415463924408, + "logits/rejected": -0.6982260942459106, + "logps/chosen": -57.376834869384766, + "logps/rejected": -94.68997192382812, + "loss": 0.6352, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8392224311828613, + "rewards/margins": 6.848731994628906, + "rewards/rejected": -4.009509563446045, + "step": 14828 + }, + { + "epoch": 3.71, + "grad_norm": 3.5502119064331055, + "learning_rate": 1.5543569148889243e-06, + "logits/chosen": -0.5329889059066772, + "logits/rejected": -0.6379481554031372, + "logps/chosen": -58.38794708251953, + "logps/rejected": -130.28213500976562, + "loss": 0.5436, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8942227363586426, + "rewards/margins": 7.919347286224365, + "rewards/rejected": -5.025125026702881, + "step": 14829 + }, + { + "epoch": 3.71, + "grad_norm": 7.187287330627441, + "learning_rate": 1.5537874001189623e-06, + "logits/chosen": -0.5683690905570984, + "logits/rejected": -0.6081506013870239, + "logps/chosen": -58.606536865234375, + "logps/rejected": -97.55194091796875, + "loss": 0.7547, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.996953010559082, + "rewards/margins": 5.592494964599609, + "rewards/rejected": -2.5955421924591064, + "step": 14830 + }, + { + "epoch": 3.71, + "grad_norm": 6.265554904937744, + "learning_rate": 1.5532179705085766e-06, + "logits/chosen": -0.5884183645248413, + "logits/rejected": -0.6502779126167297, + "logps/chosen": -61.160377502441406, + "logps/rejected": -122.25934600830078, + "loss": 0.622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03499436378479, + "rewards/margins": 6.607755184173584, + "rewards/rejected": -3.572761058807373, + "step": 14831 + }, + { + "epoch": 3.71, + "grad_norm": 7.767406463623047, + "learning_rate": 1.5526486260718404e-06, + "logits/chosen": -0.5406376719474792, + "logits/rejected": -0.6230897307395935, + "logps/chosen": -55.32868194580078, + "logps/rejected": -107.11869812011719, + "loss": 0.673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.594573497772217, + "rewards/margins": 7.271940231323242, + "rewards/rejected": -4.677367210388184, + "step": 14832 + }, + { + "epoch": 3.71, + "grad_norm": 6.717025279998779, + "learning_rate": 1.5520793668228246e-06, + "logits/chosen": -0.47231459617614746, + "logits/rejected": -0.6234424114227295, + "logps/chosen": -65.35967254638672, + "logps/rejected": -99.06707763671875, + "loss": 0.6734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.000685453414917, + "rewards/margins": 7.895974159240723, + "rewards/rejected": -4.895289421081543, + "step": 14833 + }, + { + "epoch": 3.71, + "grad_norm": 6.182069778442383, + "learning_rate": 1.551510192775596e-06, + "logits/chosen": -0.5279110670089722, + "logits/rejected": -0.5918390154838562, + "logps/chosen": -57.32390594482422, + "logps/rejected": -103.72268676757812, + "loss": 0.6996, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9119455814361572, + "rewards/margins": 6.356064319610596, + "rewards/rejected": -3.4441189765930176, + "step": 14834 + }, + { + "epoch": 3.71, + "grad_norm": 3.9632720947265625, + "learning_rate": 1.5509411039442162e-06, + "logits/chosen": -0.5721961259841919, + "logits/rejected": -0.655229926109314, + "logps/chosen": -55.511783599853516, + "logps/rejected": -115.83476257324219, + "loss": 0.6715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1033213138580322, + "rewards/margins": 7.761320114135742, + "rewards/rejected": -4.657999515533447, + "step": 14835 + }, + { + "epoch": 3.71, + "grad_norm": 5.758712291717529, + "learning_rate": 1.550372100342752e-06, + "logits/chosen": -0.6149933338165283, + "logits/rejected": -0.6775972247123718, + "logps/chosen": -47.43956756591797, + "logps/rejected": -113.5612564086914, + "loss": 0.6336, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0220143795013428, + "rewards/margins": 8.055032730102539, + "rewards/rejected": -5.033019065856934, + "step": 14836 + }, + { + "epoch": 3.71, + "grad_norm": 12.13146686553955, + "learning_rate": 1.5498031819852628e-06, + "logits/chosen": -0.5552855134010315, + "logits/rejected": -0.6676959991455078, + "logps/chosen": -54.05394744873047, + "logps/rejected": -106.64332580566406, + "loss": 0.5527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.07472825050354, + "rewards/margins": 7.756155014038086, + "rewards/rejected": -4.681427001953125, + "step": 14837 + }, + { + "epoch": 3.71, + "grad_norm": 9.830792427062988, + "learning_rate": 1.5492343488858046e-06, + "logits/chosen": -0.5113801956176758, + "logits/rejected": -0.5764809846878052, + "logps/chosen": -55.29142379760742, + "logps/rejected": -99.6203384399414, + "loss": 0.6509, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0310568809509277, + "rewards/margins": 6.611338138580322, + "rewards/rejected": -3.580280303955078, + "step": 14838 + }, + { + "epoch": 3.71, + "grad_norm": 4.999561309814453, + "learning_rate": 1.5486656010584378e-06, + "logits/chosen": -0.5964540243148804, + "logits/rejected": -0.6649376153945923, + "logps/chosen": -53.6627197265625, + "logps/rejected": -104.2658462524414, + "loss": 0.6994, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.080470085144043, + "rewards/margins": 7.115215301513672, + "rewards/rejected": -4.034745216369629, + "step": 14839 + }, + { + "epoch": 3.71, + "grad_norm": 6.249963760375977, + "learning_rate": 1.548096938517215e-06, + "logits/chosen": -0.5635818243026733, + "logits/rejected": -0.6253809928894043, + "logps/chosen": -54.466094970703125, + "logps/rejected": -131.0990447998047, + "loss": 0.5437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052471160888672, + "rewards/margins": 8.38172435760498, + "rewards/rejected": -5.329254627227783, + "step": 14840 + }, + { + "epoch": 3.71, + "grad_norm": 3.0857629776000977, + "learning_rate": 1.547528361276187e-06, + "logits/chosen": -0.5222254395484924, + "logits/rejected": -0.6027209758758545, + "logps/chosen": -63.8815803527832, + "logps/rejected": -106.81578826904297, + "loss": 0.6162, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6975889205932617, + "rewards/margins": 6.930096626281738, + "rewards/rejected": -4.232508182525635, + "step": 14841 + }, + { + "epoch": 3.71, + "grad_norm": 2.709252119064331, + "learning_rate": 1.5469598693494053e-06, + "logits/chosen": -0.4849337935447693, + "logits/rejected": -0.5992213487625122, + "logps/chosen": -53.0491828918457, + "logps/rejected": -113.42367553710938, + "loss": 0.51, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.197356700897217, + "rewards/margins": 8.069925308227539, + "rewards/rejected": -4.8725690841674805, + "step": 14842 + }, + { + "epoch": 3.71, + "grad_norm": 18.456878662109375, + "learning_rate": 1.5463914627509202e-06, + "logits/chosen": -0.43772420287132263, + "logits/rejected": -0.5399131178855896, + "logps/chosen": -71.80057525634766, + "logps/rejected": -102.89800262451172, + "loss": 0.6705, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8340859413146973, + "rewards/margins": 6.300496578216553, + "rewards/rejected": -3.4664103984832764, + "step": 14843 + }, + { + "epoch": 3.71, + "grad_norm": 5.06511926651001, + "learning_rate": 1.5458231414947749e-06, + "logits/chosen": -0.5974932909011841, + "logits/rejected": -0.7195325493812561, + "logps/chosen": -50.25809860229492, + "logps/rejected": -93.4782485961914, + "loss": 0.5124, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.452423572540283, + "rewards/margins": 8.68797779083252, + "rewards/rejected": -5.235554218292236, + "step": 14844 + }, + { + "epoch": 3.71, + "grad_norm": 4.428607940673828, + "learning_rate": 1.5452549055950123e-06, + "logits/chosen": -0.5652535557746887, + "logits/rejected": -0.6252995729446411, + "logps/chosen": -47.91053009033203, + "logps/rejected": -113.89250946044922, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.369093894958496, + "rewards/margins": 7.419589996337891, + "rewards/rejected": -4.050496578216553, + "step": 14845 + }, + { + "epoch": 3.71, + "grad_norm": 6.4702677726745605, + "learning_rate": 1.544686755065677e-06, + "logits/chosen": -0.5521432161331177, + "logits/rejected": -0.6684989333152771, + "logps/chosen": -55.516727447509766, + "logps/rejected": -104.06339263916016, + "loss": 0.603, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9355201721191406, + "rewards/margins": 7.456479072570801, + "rewards/rejected": -4.520959377288818, + "step": 14846 + }, + { + "epoch": 3.71, + "grad_norm": 7.108248710632324, + "learning_rate": 1.5441186899208077e-06, + "logits/chosen": -0.5380483269691467, + "logits/rejected": -0.6680058240890503, + "logps/chosen": -61.60533142089844, + "logps/rejected": -101.66536712646484, + "loss": 0.5759, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.704314947128296, + "rewards/margins": 8.07277774810791, + "rewards/rejected": -5.36846399307251, + "step": 14847 + }, + { + "epoch": 3.71, + "grad_norm": 3.270364999771118, + "learning_rate": 1.5435507101744395e-06, + "logits/chosen": -0.513106107711792, + "logits/rejected": -0.6065167784690857, + "logps/chosen": -73.11498260498047, + "logps/rejected": -99.33428192138672, + "loss": 0.6769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3393914699554443, + "rewards/margins": 7.65527868270874, + "rewards/rejected": -4.3158860206604, + "step": 14848 + }, + { + "epoch": 3.71, + "grad_norm": 13.19908332824707, + "learning_rate": 1.5429828158406113e-06, + "logits/chosen": -0.49410590529441833, + "logits/rejected": -0.6389719247817993, + "logps/chosen": -58.146183013916016, + "logps/rejected": -76.63381958007812, + "loss": 0.619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119563341140747, + "rewards/margins": 6.987131595611572, + "rewards/rejected": -3.867568254470825, + "step": 14849 + }, + { + "epoch": 3.71, + "grad_norm": 4.500457763671875, + "learning_rate": 1.5424150069333533e-06, + "logits/chosen": -0.4938003420829773, + "logits/rejected": -0.5595812201499939, + "logps/chosen": -62.13093948364258, + "logps/rejected": -129.21115112304688, + "loss": 0.7128, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.982461452484131, + "rewards/margins": 8.638856887817383, + "rewards/rejected": -5.65639591217041, + "step": 14850 + }, + { + "epoch": 3.72, + "grad_norm": 3.6083550453186035, + "learning_rate": 1.5418472834666992e-06, + "logits/chosen": -0.48648306727409363, + "logits/rejected": -0.5392102599143982, + "logps/chosen": -57.03378677368164, + "logps/rejected": -116.03104400634766, + "loss": 0.6502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8581721782684326, + "rewards/margins": 6.682466506958008, + "rewards/rejected": -3.824294090270996, + "step": 14851 + }, + { + "epoch": 3.72, + "grad_norm": 60.839385986328125, + "learning_rate": 1.5412796454546775e-06, + "logits/chosen": -0.5992151498794556, + "logits/rejected": -0.5931780338287354, + "logps/chosen": -76.70719146728516, + "logps/rejected": -111.9447021484375, + "loss": 0.9012, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.9401588439941406, + "rewards/margins": 6.570763111114502, + "rewards/rejected": -3.6306042671203613, + "step": 14852 + }, + { + "epoch": 3.72, + "grad_norm": 7.461197376251221, + "learning_rate": 1.540712092911313e-06, + "logits/chosen": -0.5646698474884033, + "logits/rejected": -0.6122042536735535, + "logps/chosen": -68.03443908691406, + "logps/rejected": -119.05530548095703, + "loss": 0.6252, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8206844329833984, + "rewards/margins": 6.972607612609863, + "rewards/rejected": -4.151923656463623, + "step": 14853 + }, + { + "epoch": 3.72, + "grad_norm": 4.702103614807129, + "learning_rate": 1.5401446258506319e-06, + "logits/chosen": -0.5397397875785828, + "logits/rejected": -0.6089906692504883, + "logps/chosen": -55.72399139404297, + "logps/rejected": -113.43927001953125, + "loss": 0.6072, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3448233604431152, + "rewards/margins": 6.995515823364258, + "rewards/rejected": -3.650692939758301, + "step": 14854 + }, + { + "epoch": 3.72, + "grad_norm": 3.7446811199188232, + "learning_rate": 1.5395772442866608e-06, + "logits/chosen": -0.5130068063735962, + "logits/rejected": -0.6025207042694092, + "logps/chosen": -52.698699951171875, + "logps/rejected": -109.89556884765625, + "loss": 0.5809, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0227999687194824, + "rewards/margins": 7.683748722076416, + "rewards/rejected": -4.66094970703125, + "step": 14855 + }, + { + "epoch": 3.72, + "grad_norm": 6.491155624389648, + "learning_rate": 1.5390099482334131e-06, + "logits/chosen": -0.5281741619110107, + "logits/rejected": -0.5821999311447144, + "logps/chosen": -52.941612243652344, + "logps/rejected": -97.02616119384766, + "loss": 0.6003, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3512954711914062, + "rewards/margins": 7.037796974182129, + "rewards/rejected": -3.6865015029907227, + "step": 14856 + }, + { + "epoch": 3.72, + "grad_norm": 4.707222938537598, + "learning_rate": 1.538442737704911e-06, + "logits/chosen": -0.5619943141937256, + "logits/rejected": -0.6315175294876099, + "logps/chosen": -55.718379974365234, + "logps/rejected": -112.54534912109375, + "loss": 0.66, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.703768491744995, + "rewards/margins": 7.3103766441345215, + "rewards/rejected": -4.6066083908081055, + "step": 14857 + }, + { + "epoch": 3.72, + "grad_norm": 16.63031578063965, + "learning_rate": 1.537875612715174e-06, + "logits/chosen": -0.5101616978645325, + "logits/rejected": -0.5841970443725586, + "logps/chosen": -60.190826416015625, + "logps/rejected": -85.55010986328125, + "loss": 0.7955, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.115750789642334, + "rewards/margins": 6.176483154296875, + "rewards/rejected": -3.060732841491699, + "step": 14858 + }, + { + "epoch": 3.72, + "grad_norm": 2.7889926433563232, + "learning_rate": 1.5373085732782094e-06, + "logits/chosen": -0.5487130284309387, + "logits/rejected": -0.6126213073730469, + "logps/chosen": -52.13503646850586, + "logps/rejected": -110.19212341308594, + "loss": 0.554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0006179809570312, + "rewards/margins": 7.873785495758057, + "rewards/rejected": -4.873167514801025, + "step": 14859 + }, + { + "epoch": 3.72, + "grad_norm": 4.658679485321045, + "learning_rate": 1.5367416194080337e-06, + "logits/chosen": -0.5211902856826782, + "logits/rejected": -0.5968981981277466, + "logps/chosen": -47.53755187988281, + "logps/rejected": -110.41230773925781, + "loss": 0.5737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3165934085845947, + "rewards/margins": 7.145221710205078, + "rewards/rejected": -3.8286283016204834, + "step": 14860 + }, + { + "epoch": 3.72, + "grad_norm": 3.721717596054077, + "learning_rate": 1.5361747511186575e-06, + "logits/chosen": -0.5988860726356506, + "logits/rejected": -0.7118768692016602, + "logps/chosen": -47.49269104003906, + "logps/rejected": -114.11526489257812, + "loss": 0.5524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.878530740737915, + "rewards/margins": 8.429549217224121, + "rewards/rejected": -5.551018714904785, + "step": 14861 + }, + { + "epoch": 3.72, + "grad_norm": 3.9430434703826904, + "learning_rate": 1.5356079684240877e-06, + "logits/chosen": -0.5022251605987549, + "logits/rejected": -0.602597177028656, + "logps/chosen": -56.43233871459961, + "logps/rejected": -87.16908264160156, + "loss": 0.6682, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0272927284240723, + "rewards/margins": 6.426012992858887, + "rewards/rejected": -3.3987202644348145, + "step": 14862 + }, + { + "epoch": 3.72, + "grad_norm": 4.762149333953857, + "learning_rate": 1.5350412713383278e-06, + "logits/chosen": -0.4926432967185974, + "logits/rejected": -0.5961437225341797, + "logps/chosen": -59.470481872558594, + "logps/rejected": -92.63146209716797, + "loss": 0.6115, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.305445671081543, + "rewards/margins": 7.12222146987915, + "rewards/rejected": -3.8167762756347656, + "step": 14863 + }, + { + "epoch": 3.72, + "grad_norm": 5.120061874389648, + "learning_rate": 1.5344746598753856e-06, + "logits/chosen": -0.6010634303092957, + "logits/rejected": -0.6988916397094727, + "logps/chosen": -56.229339599609375, + "logps/rejected": -93.87210845947266, + "loss": 0.6231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.194298267364502, + "rewards/margins": 6.17683744430542, + "rewards/rejected": -2.982539415359497, + "step": 14864 + }, + { + "epoch": 3.72, + "grad_norm": 7.798384189605713, + "learning_rate": 1.5339081340492606e-06, + "logits/chosen": -0.49020373821258545, + "logits/rejected": -0.5485714673995972, + "logps/chosen": -41.1529426574707, + "logps/rejected": -96.28880310058594, + "loss": 0.5044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0618906021118164, + "rewards/margins": 7.066623687744141, + "rewards/rejected": -4.004733085632324, + "step": 14865 + }, + { + "epoch": 3.72, + "grad_norm": 7.508886337280273, + "learning_rate": 1.533341693873951e-06, + "logits/chosen": -0.5416135191917419, + "logits/rejected": -0.6278351545333862, + "logps/chosen": -45.42815017700195, + "logps/rejected": -103.96566772460938, + "loss": 0.5635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.192497491836548, + "rewards/margins": 6.907383918762207, + "rewards/rejected": -3.7148869037628174, + "step": 14866 + }, + { + "epoch": 3.72, + "grad_norm": 6.4405999183654785, + "learning_rate": 1.5327753393634565e-06, + "logits/chosen": -0.5740712285041809, + "logits/rejected": -0.6376951932907104, + "logps/chosen": -41.79689025878906, + "logps/rejected": -98.7001724243164, + "loss": 0.5893, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3757071495056152, + "rewards/margins": 7.652690887451172, + "rewards/rejected": -4.276984214782715, + "step": 14867 + }, + { + "epoch": 3.72, + "grad_norm": 4.7952775955200195, + "learning_rate": 1.53220907053177e-06, + "logits/chosen": -0.5332903265953064, + "logits/rejected": -0.6049689054489136, + "logps/chosen": -60.94788360595703, + "logps/rejected": -95.20152282714844, + "loss": 0.6511, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9876017570495605, + "rewards/margins": 6.716238021850586, + "rewards/rejected": -3.7286365032196045, + "step": 14868 + }, + { + "epoch": 3.72, + "grad_norm": 7.80812931060791, + "learning_rate": 1.5316428873928875e-06, + "logits/chosen": -0.6030913591384888, + "logits/rejected": -0.7297778129577637, + "logps/chosen": -62.883155822753906, + "logps/rejected": -102.07041931152344, + "loss": 0.7325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.323061943054199, + "rewards/margins": 7.653815269470215, + "rewards/rejected": -4.330752849578857, + "step": 14869 + }, + { + "epoch": 3.72, + "grad_norm": 3.1594974994659424, + "learning_rate": 1.5310767899607982e-06, + "logits/chosen": -0.5583179593086243, + "logits/rejected": -0.5606040358543396, + "logps/chosen": -49.012855529785156, + "logps/rejected": -127.12380981445312, + "loss": 0.5809, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4105958938598633, + "rewards/margins": 6.991980075836182, + "rewards/rejected": -3.58138370513916, + "step": 14870 + }, + { + "epoch": 3.72, + "grad_norm": 4.2618021965026855, + "learning_rate": 1.5305107782494895e-06, + "logits/chosen": -0.5565447807312012, + "logits/rejected": -0.63591468334198, + "logps/chosen": -54.23035430908203, + "logps/rejected": -110.71138000488281, + "loss": 0.6916, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9883198738098145, + "rewards/margins": 6.891724586486816, + "rewards/rejected": -3.9034039974212646, + "step": 14871 + }, + { + "epoch": 3.72, + "grad_norm": 3.521707057952881, + "learning_rate": 1.5299448522729515e-06, + "logits/chosen": -0.4559432864189148, + "logits/rejected": -0.5911007523536682, + "logps/chosen": -53.8326416015625, + "logps/rejected": -97.37415313720703, + "loss": 0.6071, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0142292976379395, + "rewards/margins": 6.228657245635986, + "rewards/rejected": -3.214428424835205, + "step": 14872 + }, + { + "epoch": 3.72, + "grad_norm": 2.8505918979644775, + "learning_rate": 1.5293790120451668e-06, + "logits/chosen": -0.5038348436355591, + "logits/rejected": -0.6079160571098328, + "logps/chosen": -61.36353302001953, + "logps/rejected": -99.95501708984375, + "loss": 0.5869, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.284275531768799, + "rewards/margins": 6.629334926605225, + "rewards/rejected": -3.3450589179992676, + "step": 14873 + }, + { + "epoch": 3.72, + "grad_norm": 4.844226837158203, + "learning_rate": 1.5288132575801174e-06, + "logits/chosen": -0.508855938911438, + "logits/rejected": -0.6584490537643433, + "logps/chosen": -64.49879455566406, + "logps/rejected": -84.74722290039062, + "loss": 0.6717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9269278049468994, + "rewards/margins": 6.560091972351074, + "rewards/rejected": -3.633164167404175, + "step": 14874 + }, + { + "epoch": 3.72, + "grad_norm": 13.144594192504883, + "learning_rate": 1.5282475888917837e-06, + "logits/chosen": -0.5058066844940186, + "logits/rejected": -0.6054356098175049, + "logps/chosen": -53.79782485961914, + "logps/rejected": -84.08220672607422, + "loss": 0.7606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1372692584991455, + "rewards/margins": 5.572879791259766, + "rewards/rejected": -2.435610294342041, + "step": 14875 + }, + { + "epoch": 3.72, + "grad_norm": 6.382131576538086, + "learning_rate": 1.5276820059941465e-06, + "logits/chosen": -0.5022503733634949, + "logits/rejected": -0.5961589813232422, + "logps/chosen": -62.42979431152344, + "logps/rejected": -87.63470458984375, + "loss": 0.7327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2957606315612793, + "rewards/margins": 6.534936904907227, + "rewards/rejected": -3.2391765117645264, + "step": 14876 + }, + { + "epoch": 3.72, + "grad_norm": 5.559033393859863, + "learning_rate": 1.5271165089011807e-06, + "logits/chosen": -0.5830603837966919, + "logits/rejected": -0.6080846190452576, + "logps/chosen": -47.582801818847656, + "logps/rejected": -110.02313232421875, + "loss": 0.7074, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1571338176727295, + "rewards/margins": 6.59823751449585, + "rewards/rejected": -3.44110369682312, + "step": 14877 + }, + { + "epoch": 3.72, + "grad_norm": 3.5376925468444824, + "learning_rate": 1.5265510976268583e-06, + "logits/chosen": -0.5513089299201965, + "logits/rejected": -0.6195774078369141, + "logps/chosen": -47.685176849365234, + "logps/rejected": -116.31672668457031, + "loss": 0.5558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5316200256347656, + "rewards/margins": 8.224686622619629, + "rewards/rejected": -4.6930670738220215, + "step": 14878 + }, + { + "epoch": 3.72, + "grad_norm": 12.425676345825195, + "learning_rate": 1.525985772185154e-06, + "logits/chosen": -0.5009557008743286, + "logits/rejected": -0.5791137218475342, + "logps/chosen": -57.86372756958008, + "logps/rejected": -92.90367889404297, + "loss": 0.7751, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.305373191833496, + "rewards/margins": 5.407418251037598, + "rewards/rejected": -3.1020445823669434, + "step": 14879 + }, + { + "epoch": 3.72, + "grad_norm": 8.616647720336914, + "learning_rate": 1.5254205325900368e-06, + "logits/chosen": -0.6235940456390381, + "logits/rejected": -0.6649787425994873, + "logps/chosen": -48.560752868652344, + "logps/rejected": -109.81045532226562, + "loss": 0.6318, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.798111915588379, + "rewards/margins": 7.449160575866699, + "rewards/rejected": -4.6510491371154785, + "step": 14880 + }, + { + "epoch": 3.72, + "grad_norm": 4.7157206535339355, + "learning_rate": 1.524855378855472e-06, + "logits/chosen": -0.5281175374984741, + "logits/rejected": -0.5983734130859375, + "logps/chosen": -74.35820770263672, + "logps/rejected": -104.36457061767578, + "loss": 0.689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1636877059936523, + "rewards/margins": 6.501443386077881, + "rewards/rejected": -3.3377556800842285, + "step": 14881 + }, + { + "epoch": 3.72, + "grad_norm": 3.1589515209198, + "learning_rate": 1.5242903109954288e-06, + "logits/chosen": -0.5002744197845459, + "logits/rejected": -0.5654721260070801, + "logps/chosen": -52.92706298828125, + "logps/rejected": -110.38400268554688, + "loss": 0.5649, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.108717918395996, + "rewards/margins": 7.710525035858154, + "rewards/rejected": -4.601806640625, + "step": 14882 + }, + { + "epoch": 3.72, + "grad_norm": 3.990385055541992, + "learning_rate": 1.523725329023869e-06, + "logits/chosen": -0.5986754298210144, + "logits/rejected": -0.6895780563354492, + "logps/chosen": -51.7295036315918, + "logps/rejected": -96.97257995605469, + "loss": 0.6142, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.886889934539795, + "rewards/margins": 6.517732620239258, + "rewards/rejected": -3.6308422088623047, + "step": 14883 + }, + { + "epoch": 3.72, + "grad_norm": 4.539226055145264, + "learning_rate": 1.5231604329547522e-06, + "logits/chosen": -0.5863417983055115, + "logits/rejected": -0.6581050753593445, + "logps/chosen": -51.551307678222656, + "logps/rejected": -99.77864074707031, + "loss": 0.607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.300325870513916, + "rewards/margins": 6.859166145324707, + "rewards/rejected": -3.558840274810791, + "step": 14884 + }, + { + "epoch": 3.72, + "grad_norm": 3.056443214416504, + "learning_rate": 1.5225956228020411e-06, + "logits/chosen": -0.516925573348999, + "logits/rejected": -0.6094127297401428, + "logps/chosen": -57.178611755371094, + "logps/rejected": -100.08503723144531, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0572988986968994, + "rewards/margins": 6.582075595855713, + "rewards/rejected": -3.5247771739959717, + "step": 14885 + }, + { + "epoch": 3.72, + "grad_norm": 4.26314640045166, + "learning_rate": 1.5220308985796889e-06, + "logits/chosen": -0.5292211174964905, + "logits/rejected": -0.6046568751335144, + "logps/chosen": -46.01298141479492, + "logps/rejected": -112.2153091430664, + "loss": 0.5717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1066651344299316, + "rewards/margins": 7.886712074279785, + "rewards/rejected": -4.780047416687012, + "step": 14886 + }, + { + "epoch": 3.72, + "grad_norm": 14.873382568359375, + "learning_rate": 1.5214662603016544e-06, + "logits/chosen": -0.5010287761688232, + "logits/rejected": -0.5754465460777283, + "logps/chosen": -57.43023681640625, + "logps/rejected": -104.31768035888672, + "loss": 0.5802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9581446647644043, + "rewards/margins": 7.040188789367676, + "rewards/rejected": -4.08204460144043, + "step": 14887 + }, + { + "epoch": 3.72, + "grad_norm": 10.70747184753418, + "learning_rate": 1.5209017079818866e-06, + "logits/chosen": -0.5113565921783447, + "logits/rejected": -0.5956531763076782, + "logps/chosen": -57.99323272705078, + "logps/rejected": -107.06708526611328, + "loss": 0.7059, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9219987392425537, + "rewards/margins": 6.000115394592285, + "rewards/rejected": -3.0781164169311523, + "step": 14888 + }, + { + "epoch": 3.72, + "grad_norm": 3.891721248626709, + "learning_rate": 1.5203372416343404e-06, + "logits/chosen": -0.5472009181976318, + "logits/rejected": -0.6053221225738525, + "logps/chosen": -49.91275405883789, + "logps/rejected": -97.08940887451172, + "loss": 0.629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1983609199523926, + "rewards/margins": 5.86544942855835, + "rewards/rejected": -2.667088270187378, + "step": 14889 + }, + { + "epoch": 3.72, + "grad_norm": 4.9446916580200195, + "learning_rate": 1.5197728612729612e-06, + "logits/chosen": -0.5516837239265442, + "logits/rejected": -0.6381500363349915, + "logps/chosen": -54.752159118652344, + "logps/rejected": -92.18265533447266, + "loss": 0.7058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.218064069747925, + "rewards/margins": 6.871268272399902, + "rewards/rejected": -3.6532037258148193, + "step": 14890 + }, + { + "epoch": 3.73, + "grad_norm": 2.728759527206421, + "learning_rate": 1.5192085669116952e-06, + "logits/chosen": -0.5085155367851257, + "logits/rejected": -0.6092315912246704, + "logps/chosen": -52.594268798828125, + "logps/rejected": -107.4794692993164, + "loss": 0.5468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.242298126220703, + "rewards/margins": 7.604838848114014, + "rewards/rejected": -4.362541675567627, + "step": 14891 + }, + { + "epoch": 3.73, + "grad_norm": 3.2083404064178467, + "learning_rate": 1.5186443585644894e-06, + "logits/chosen": -0.5691598653793335, + "logits/rejected": -0.6328787803649902, + "logps/chosen": -57.310943603515625, + "logps/rejected": -102.57674407958984, + "loss": 0.6747, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2914488315582275, + "rewards/margins": 6.872888565063477, + "rewards/rejected": -3.581439733505249, + "step": 14892 + }, + { + "epoch": 3.73, + "grad_norm": 4.950045585632324, + "learning_rate": 1.518080236245283e-06, + "logits/chosen": -0.5422307848930359, + "logits/rejected": -0.632695734500885, + "logps/chosen": -56.353233337402344, + "logps/rejected": -98.47891235351562, + "loss": 0.5747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3370704650878906, + "rewards/margins": 6.686304092407227, + "rewards/rejected": -3.3492343425750732, + "step": 14893 + }, + { + "epoch": 3.73, + "grad_norm": 2.777672529220581, + "learning_rate": 1.5175161999680194e-06, + "logits/chosen": -0.546420156955719, + "logits/rejected": -0.6304338574409485, + "logps/chosen": -54.19810485839844, + "logps/rejected": -107.89222717285156, + "loss": 0.6613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7757742404937744, + "rewards/margins": 7.114614009857178, + "rewards/rejected": -4.338840007781982, + "step": 14894 + }, + { + "epoch": 3.73, + "grad_norm": 4.393343448638916, + "learning_rate": 1.5169522497466339e-06, + "logits/chosen": -0.503174901008606, + "logits/rejected": -0.5422797203063965, + "logps/chosen": -50.24665069580078, + "logps/rejected": -113.61335754394531, + "loss": 0.6218, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.046067953109741, + "rewards/margins": 5.838497638702393, + "rewards/rejected": -2.7924299240112305, + "step": 14895 + }, + { + "epoch": 3.73, + "grad_norm": 6.0814385414123535, + "learning_rate": 1.5163883855950617e-06, + "logits/chosen": -0.5897229313850403, + "logits/rejected": -0.6740767955780029, + "logps/chosen": -46.017208099365234, + "logps/rejected": -104.73204803466797, + "loss": 0.6428, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.002552032470703, + "rewards/margins": 7.714303016662598, + "rewards/rejected": -4.7117509841918945, + "step": 14896 + }, + { + "epoch": 3.73, + "grad_norm": 5.240316867828369, + "learning_rate": 1.5158246075272399e-06, + "logits/chosen": -0.6196340322494507, + "logits/rejected": -0.7039666175842285, + "logps/chosen": -50.63621520996094, + "logps/rejected": -102.70731353759766, + "loss": 0.6581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.435798406600952, + "rewards/margins": 6.716962814331055, + "rewards/rejected": -3.2811644077301025, + "step": 14897 + }, + { + "epoch": 3.73, + "grad_norm": 6.042705059051514, + "learning_rate": 1.5152609155570975e-06, + "logits/chosen": -0.5430623292922974, + "logits/rejected": -0.6121068000793457, + "logps/chosen": -48.92369079589844, + "logps/rejected": -120.18963623046875, + "loss": 0.5723, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1187641620635986, + "rewards/margins": 8.172175407409668, + "rewards/rejected": -5.05341100692749, + "step": 14898 + }, + { + "epoch": 3.73, + "grad_norm": 5.905939102172852, + "learning_rate": 1.5146973096985624e-06, + "logits/chosen": -0.6077953577041626, + "logits/rejected": -0.702201247215271, + "logps/chosen": -67.67434692382812, + "logps/rejected": -82.87867736816406, + "loss": 0.7063, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.106736898422241, + "rewards/margins": 6.536094665527344, + "rewards/rejected": -3.4293580055236816, + "step": 14899 + }, + { + "epoch": 3.73, + "grad_norm": 5.182638168334961, + "learning_rate": 1.5141337899655645e-06, + "logits/chosen": -0.587526798248291, + "logits/rejected": -0.6653380393981934, + "logps/chosen": -53.974891662597656, + "logps/rejected": -86.42420196533203, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.217442750930786, + "rewards/margins": 5.590549945831299, + "rewards/rejected": -2.3731069564819336, + "step": 14900 + }, + { + "epoch": 3.73, + "grad_norm": 3.584446430206299, + "learning_rate": 1.5135703563720317e-06, + "logits/chosen": -0.5554285049438477, + "logits/rejected": -0.6413863301277161, + "logps/chosen": -52.65911865234375, + "logps/rejected": -108.97746276855469, + "loss": 0.6466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.145670175552368, + "rewards/margins": 7.587181568145752, + "rewards/rejected": -4.441512107849121, + "step": 14901 + }, + { + "epoch": 3.73, + "grad_norm": 3.3510255813598633, + "learning_rate": 1.51300700893188e-06, + "logits/chosen": -0.5600181818008423, + "logits/rejected": -0.5788872241973877, + "logps/chosen": -50.840293884277344, + "logps/rejected": -126.84181213378906, + "loss": 0.6243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0416367053985596, + "rewards/margins": 7.3156609535217285, + "rewards/rejected": -4.27402400970459, + "step": 14902 + }, + { + "epoch": 3.73, + "grad_norm": 3.4531304836273193, + "learning_rate": 1.5124437476590343e-06, + "logits/chosen": -0.5803957581520081, + "logits/rejected": -0.6654490828514099, + "logps/chosen": -54.57538604736328, + "logps/rejected": -95.22802734375, + "loss": 0.6046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.646289348602295, + "rewards/margins": 7.721210956573486, + "rewards/rejected": -4.074921131134033, + "step": 14903 + }, + { + "epoch": 3.73, + "grad_norm": 6.215298652648926, + "learning_rate": 1.5118805725674152e-06, + "logits/chosen": -0.5467225909233093, + "logits/rejected": -0.6575325727462769, + "logps/chosen": -53.31536102294922, + "logps/rejected": -105.94210052490234, + "loss": 0.6841, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.971432685852051, + "rewards/margins": 7.528862476348877, + "rewards/rejected": -4.557429313659668, + "step": 14904 + }, + { + "epoch": 3.73, + "grad_norm": 2.440605401992798, + "learning_rate": 1.5113174836709365e-06, + "logits/chosen": -0.5551749467849731, + "logits/rejected": -0.6232611536979675, + "logps/chosen": -57.763404846191406, + "logps/rejected": -100.86058044433594, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.158674478530884, + "rewards/margins": 6.964574813842773, + "rewards/rejected": -3.8058993816375732, + "step": 14905 + }, + { + "epoch": 3.73, + "grad_norm": 15.227049827575684, + "learning_rate": 1.5107544809835122e-06, + "logits/chosen": -0.5651941895484924, + "logits/rejected": -0.6468244791030884, + "logps/chosen": -57.2715950012207, + "logps/rejected": -87.51203918457031, + "loss": 0.6439, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.901218891143799, + "rewards/margins": 6.192543029785156, + "rewards/rejected": -3.2913239002227783, + "step": 14906 + }, + { + "epoch": 3.73, + "grad_norm": 4.727510929107666, + "learning_rate": 1.5101915645190578e-06, + "logits/chosen": -0.557981550693512, + "logits/rejected": -0.5967692732810974, + "logps/chosen": -72.16552734375, + "logps/rejected": -89.49942016601562, + "loss": 0.7457, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.118380546569824, + "rewards/margins": 5.339688777923584, + "rewards/rejected": -2.2213082313537598, + "step": 14907 + }, + { + "epoch": 3.73, + "grad_norm": 8.719230651855469, + "learning_rate": 1.5096287342914818e-06, + "logits/chosen": -0.5592004060745239, + "logits/rejected": -0.6277507543563843, + "logps/chosen": -52.223548889160156, + "logps/rejected": -104.86568450927734, + "loss": 0.5903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.701213836669922, + "rewards/margins": 6.745896339416504, + "rewards/rejected": -4.044682502746582, + "step": 14908 + }, + { + "epoch": 3.73, + "grad_norm": 6.252103328704834, + "learning_rate": 1.5090659903146903e-06, + "logits/chosen": -0.5213305950164795, + "logits/rejected": -0.6405565142631531, + "logps/chosen": -54.4520263671875, + "logps/rejected": -94.69842529296875, + "loss": 0.6168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1376798152923584, + "rewards/margins": 7.731233596801758, + "rewards/rejected": -4.59355354309082, + "step": 14909 + }, + { + "epoch": 3.73, + "grad_norm": 7.962435722351074, + "learning_rate": 1.5085033326025933e-06, + "logits/chosen": -0.4597846269607544, + "logits/rejected": -0.4895471930503845, + "logps/chosen": -61.5090217590332, + "logps/rejected": -129.68545532226562, + "loss": 0.5944, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2622036933898926, + "rewards/margins": 7.652905464172363, + "rewards/rejected": -4.390702247619629, + "step": 14910 + }, + { + "epoch": 3.73, + "grad_norm": 4.830517768859863, + "learning_rate": 1.5079407611690906e-06, + "logits/chosen": -0.48226475715637207, + "logits/rejected": -0.57587730884552, + "logps/chosen": -68.02365112304688, + "logps/rejected": -117.69550323486328, + "loss": 0.566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.423004627227783, + "rewards/margins": 8.194781303405762, + "rewards/rejected": -4.771775722503662, + "step": 14911 + }, + { + "epoch": 3.73, + "grad_norm": 5.413400650024414, + "learning_rate": 1.507378276028088e-06, + "logits/chosen": -0.6596342921257019, + "logits/rejected": -0.7450475096702576, + "logps/chosen": -54.64731216430664, + "logps/rejected": -123.46549224853516, + "loss": 0.5611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.709773540496826, + "rewards/margins": 7.938824653625488, + "rewards/rejected": -5.229051113128662, + "step": 14912 + }, + { + "epoch": 3.73, + "grad_norm": 28.08377456665039, + "learning_rate": 1.5068158771934833e-06, + "logits/chosen": -0.5478111505508423, + "logits/rejected": -0.6533792018890381, + "logps/chosen": -57.21195983886719, + "logps/rejected": -116.43211364746094, + "loss": 0.7268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83152174949646, + "rewards/margins": 8.97027587890625, + "rewards/rejected": -6.138753890991211, + "step": 14913 + }, + { + "epoch": 3.73, + "grad_norm": 6.388268947601318, + "learning_rate": 1.5062535646791715e-06, + "logits/chosen": -0.5878509879112244, + "logits/rejected": -0.6819091439247131, + "logps/chosen": -51.40095901489258, + "logps/rejected": -103.75287628173828, + "loss": 0.5681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2369236946105957, + "rewards/margins": 8.154130935668945, + "rewards/rejected": -4.917207717895508, + "step": 14914 + }, + { + "epoch": 3.73, + "grad_norm": 7.05708646774292, + "learning_rate": 1.5056913384990523e-06, + "logits/chosen": -0.5705189108848572, + "logits/rejected": -0.628940999507904, + "logps/chosen": -71.06429290771484, + "logps/rejected": -105.38705444335938, + "loss": 0.6708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.147291898727417, + "rewards/margins": 5.950124740600586, + "rewards/rejected": -2.802832841873169, + "step": 14915 + }, + { + "epoch": 3.73, + "grad_norm": 4.726726055145264, + "learning_rate": 1.5051291986670169e-06, + "logits/chosen": -0.5162939429283142, + "logits/rejected": -0.5630883574485779, + "logps/chosen": -73.03292083740234, + "logps/rejected": -97.65978240966797, + "loss": 0.7024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0289790630340576, + "rewards/margins": 5.777188301086426, + "rewards/rejected": -2.7482094764709473, + "step": 14916 + }, + { + "epoch": 3.73, + "grad_norm": 3.515075922012329, + "learning_rate": 1.5045671451969545e-06, + "logits/chosen": -0.5532714128494263, + "logits/rejected": -0.6401084065437317, + "logps/chosen": -47.126773834228516, + "logps/rejected": -104.5688705444336, + "loss": 0.5918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4507482051849365, + "rewards/margins": 8.96300983428955, + "rewards/rejected": -5.512261390686035, + "step": 14917 + }, + { + "epoch": 3.73, + "grad_norm": 6.611538410186768, + "learning_rate": 1.504005178102756e-06, + "logits/chosen": -0.48841309547424316, + "logits/rejected": -0.5787437558174133, + "logps/chosen": -53.00910186767578, + "logps/rejected": -120.59346771240234, + "loss": 0.5089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.875430107116699, + "rewards/margins": 8.868685722351074, + "rewards/rejected": -5.993255138397217, + "step": 14918 + }, + { + "epoch": 3.73, + "grad_norm": 5.472172260284424, + "learning_rate": 1.5034432973983093e-06, + "logits/chosen": -0.6027467846870422, + "logits/rejected": -0.6897290945053101, + "logps/chosen": -53.908267974853516, + "logps/rejected": -103.42243194580078, + "loss": 0.6446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.134012460708618, + "rewards/margins": 7.390921592712402, + "rewards/rejected": -4.256908893585205, + "step": 14919 + }, + { + "epoch": 3.73, + "grad_norm": 4.806021213531494, + "learning_rate": 1.5028815030974985e-06, + "logits/chosen": -0.6081199645996094, + "logits/rejected": -0.6627405881881714, + "logps/chosen": -51.34568786621094, + "logps/rejected": -103.17755889892578, + "loss": 0.5912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.21744966506958, + "rewards/margins": 6.560688018798828, + "rewards/rejected": -3.3432388305664062, + "step": 14920 + }, + { + "epoch": 3.73, + "grad_norm": 23.894737243652344, + "learning_rate": 1.5023197952142033e-06, + "logits/chosen": -0.4974784851074219, + "logits/rejected": -0.5723440647125244, + "logps/chosen": -62.32789993286133, + "logps/rejected": -93.0267333984375, + "loss": 0.9807, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.5919456481933594, + "rewards/margins": 5.339090347290039, + "rewards/rejected": -2.7471446990966797, + "step": 14921 + }, + { + "epoch": 3.73, + "grad_norm": 8.763969421386719, + "learning_rate": 1.5017581737623082e-06, + "logits/chosen": -0.48474156856536865, + "logits/rejected": -0.5258724689483643, + "logps/chosen": -50.71521759033203, + "logps/rejected": -104.97943878173828, + "loss": 0.6647, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.989206314086914, + "rewards/margins": 6.096436977386475, + "rewards/rejected": -3.1072306632995605, + "step": 14922 + }, + { + "epoch": 3.73, + "grad_norm": 7.9978108406066895, + "learning_rate": 1.5011966387556891e-06, + "logits/chosen": -0.5180882215499878, + "logits/rejected": -0.534214198589325, + "logps/chosen": -50.46855926513672, + "logps/rejected": -111.63478088378906, + "loss": 0.6701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.046081066131592, + "rewards/margins": 5.7133989334106445, + "rewards/rejected": -2.667318344116211, + "step": 14923 + }, + { + "epoch": 3.73, + "grad_norm": 5.388645648956299, + "learning_rate": 1.500635190208221e-06, + "logits/chosen": -0.6228121519088745, + "logits/rejected": -0.6800707578659058, + "logps/chosen": -56.53351593017578, + "logps/rejected": -111.90107727050781, + "loss": 0.6305, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3177366256713867, + "rewards/margins": 6.966129302978516, + "rewards/rejected": -3.648392915725708, + "step": 14924 + }, + { + "epoch": 3.73, + "grad_norm": 4.11916971206665, + "learning_rate": 1.5000738281337818e-06, + "logits/chosen": -0.5048702359199524, + "logits/rejected": -0.5650748014450073, + "logps/chosen": -50.17369842529297, + "logps/rejected": -110.95211791992188, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1621108055114746, + "rewards/margins": 8.141311645507812, + "rewards/rejected": -4.979201316833496, + "step": 14925 + }, + { + "epoch": 3.73, + "grad_norm": 3.261110782623291, + "learning_rate": 1.4995125525462401e-06, + "logits/chosen": -0.5999388694763184, + "logits/rejected": -0.6792833209037781, + "logps/chosen": -51.76378631591797, + "logps/rejected": -106.77507019042969, + "loss": 0.681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.922518253326416, + "rewards/margins": 7.183008193969727, + "rewards/rejected": -4.260490417480469, + "step": 14926 + }, + { + "epoch": 3.73, + "grad_norm": 3.4066085815429688, + "learning_rate": 1.4989513634594654e-06, + "logits/chosen": -0.6032856702804565, + "logits/rejected": -0.6699373126029968, + "logps/chosen": -55.67107391357422, + "logps/rejected": -108.78599548339844, + "loss": 0.6057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.495743751525879, + "rewards/margins": 7.704355239868164, + "rewards/rejected": -4.208611488342285, + "step": 14927 + }, + { + "epoch": 3.73, + "grad_norm": 4.3170084953308105, + "learning_rate": 1.498390260887328e-06, + "logits/chosen": -0.530972421169281, + "logits/rejected": -0.6110406517982483, + "logps/chosen": -53.045467376708984, + "logps/rejected": -83.99481964111328, + "loss": 0.5598, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.193690538406372, + "rewards/margins": 6.439939498901367, + "rewards/rejected": -3.246248483657837, + "step": 14928 + }, + { + "epoch": 3.73, + "grad_norm": 5.217682838439941, + "learning_rate": 1.4978292448436898e-06, + "logits/chosen": -0.6121114492416382, + "logits/rejected": -0.7110398411750793, + "logps/chosen": -47.832923889160156, + "logps/rejected": -96.46772766113281, + "loss": 0.5751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8224802017211914, + "rewards/margins": 7.051827907562256, + "rewards/rejected": -4.2293477058410645, + "step": 14929 + }, + { + "epoch": 3.73, + "grad_norm": 3.4241418838500977, + "learning_rate": 1.4972683153424177e-06, + "logits/chosen": -0.6074535250663757, + "logits/rejected": -0.6551260948181152, + "logps/chosen": -59.259395599365234, + "logps/rejected": -126.81465148925781, + "loss": 0.6264, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2370738983154297, + "rewards/margins": 7.657646179199219, + "rewards/rejected": -4.420572280883789, + "step": 14930 + }, + { + "epoch": 3.74, + "grad_norm": 7.428300857543945, + "learning_rate": 1.49670747239737e-06, + "logits/chosen": -0.4672544300556183, + "logits/rejected": -0.5629361867904663, + "logps/chosen": -51.60567092895508, + "logps/rejected": -112.30973815917969, + "loss": 0.6261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1015853881835938, + "rewards/margins": 6.625588893890381, + "rewards/rejected": -3.5240039825439453, + "step": 14931 + }, + { + "epoch": 3.74, + "grad_norm": 5.51727819442749, + "learning_rate": 1.496146716022408e-06, + "logits/chosen": -0.5149720311164856, + "logits/rejected": -0.572740912437439, + "logps/chosen": -56.1336784362793, + "logps/rejected": -98.06088256835938, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.425586223602295, + "rewards/margins": 7.098650932312012, + "rewards/rejected": -3.673064947128296, + "step": 14932 + }, + { + "epoch": 3.74, + "grad_norm": 4.475484848022461, + "learning_rate": 1.4955860462313875e-06, + "logits/chosen": -0.6574012041091919, + "logits/rejected": -0.6523136496543884, + "logps/chosen": -70.63330841064453, + "logps/rejected": -103.49301147460938, + "loss": 0.559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.06072998046875, + "rewards/margins": 7.295846462249756, + "rewards/rejected": -4.235116004943848, + "step": 14933 + }, + { + "epoch": 3.74, + "grad_norm": 3.477243661880493, + "learning_rate": 1.4950254630381622e-06, + "logits/chosen": -0.5835510492324829, + "logits/rejected": -0.6867557764053345, + "logps/chosen": -52.7073860168457, + "logps/rejected": -94.64663696289062, + "loss": 0.5991, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.169914484024048, + "rewards/margins": 6.901348114013672, + "rewards/rejected": -3.7314343452453613, + "step": 14934 + }, + { + "epoch": 3.74, + "grad_norm": 15.394369125366211, + "learning_rate": 1.494464966456587e-06, + "logits/chosen": -0.5790831446647644, + "logits/rejected": -0.6147710084915161, + "logps/chosen": -50.825679779052734, + "logps/rejected": -117.35275268554688, + "loss": 0.5858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1637330055236816, + "rewards/margins": 6.4318413734436035, + "rewards/rejected": -3.268108606338501, + "step": 14935 + }, + { + "epoch": 3.74, + "grad_norm": 5.014277458190918, + "learning_rate": 1.4939045565005095e-06, + "logits/chosen": -0.559260368347168, + "logits/rejected": -0.6331153512001038, + "logps/chosen": -49.909423828125, + "logps/rejected": -97.05227661132812, + "loss": 0.6208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.72104811668396, + "rewards/margins": 6.360200881958008, + "rewards/rejected": -3.639153003692627, + "step": 14936 + }, + { + "epoch": 3.74, + "grad_norm": 5.513842582702637, + "learning_rate": 1.4933442331837816e-06, + "logits/chosen": -0.5692552328109741, + "logits/rejected": -0.6312117576599121, + "logps/chosen": -59.300559997558594, + "logps/rejected": -89.27002716064453, + "loss": 0.7376, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2144083976745605, + "rewards/margins": 6.264353275299072, + "rewards/rejected": -3.0499446392059326, + "step": 14937 + }, + { + "epoch": 3.74, + "grad_norm": 2.847370147705078, + "learning_rate": 1.4927839965202473e-06, + "logits/chosen": -0.5219122767448425, + "logits/rejected": -0.6239473819732666, + "logps/chosen": -54.063594818115234, + "logps/rejected": -109.42362213134766, + "loss": 0.5985, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.100234031677246, + "rewards/margins": 8.081727027893066, + "rewards/rejected": -4.981492519378662, + "step": 14938 + }, + { + "epoch": 3.74, + "grad_norm": 6.16312837600708, + "learning_rate": 1.4922238465237492e-06, + "logits/chosen": -0.48933646082878113, + "logits/rejected": -0.595850944519043, + "logps/chosen": -51.13113021850586, + "logps/rejected": -101.69129943847656, + "loss": 0.6148, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.097343683242798, + "rewards/margins": 7.305112838745117, + "rewards/rejected": -4.20776891708374, + "step": 14939 + }, + { + "epoch": 3.74, + "grad_norm": 5.785913944244385, + "learning_rate": 1.4916637832081326e-06, + "logits/chosen": -0.5730093717575073, + "logits/rejected": -0.6497591733932495, + "logps/chosen": -48.779632568359375, + "logps/rejected": -102.06393432617188, + "loss": 0.6384, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.154433488845825, + "rewards/margins": 5.937758445739746, + "rewards/rejected": -2.783324956893921, + "step": 14940 + }, + { + "epoch": 3.74, + "grad_norm": 16.789714813232422, + "learning_rate": 1.4911038065872357e-06, + "logits/chosen": -0.6122770309448242, + "logits/rejected": -0.7039979100227356, + "logps/chosen": -46.55507278442383, + "logps/rejected": -100.02124786376953, + "loss": 0.5074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.302677631378174, + "rewards/margins": 7.209059238433838, + "rewards/rejected": -3.9063820838928223, + "step": 14941 + }, + { + "epoch": 3.74, + "grad_norm": 5.664409160614014, + "learning_rate": 1.4905439166748943e-06, + "logits/chosen": -0.5849928855895996, + "logits/rejected": -0.6588030457496643, + "logps/chosen": -49.83770751953125, + "logps/rejected": -92.96536254882812, + "loss": 0.6667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.188403367996216, + "rewards/margins": 6.055881500244141, + "rewards/rejected": -2.8674778938293457, + "step": 14942 + }, + { + "epoch": 3.74, + "grad_norm": 3.6919236183166504, + "learning_rate": 1.4899841134849452e-06, + "logits/chosen": -0.6103734374046326, + "logits/rejected": -0.6530734300613403, + "logps/chosen": -45.82162094116211, + "logps/rejected": -107.87287139892578, + "loss": 0.5995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.233639717102051, + "rewards/margins": 7.068511009216309, + "rewards/rejected": -3.834871530532837, + "step": 14943 + }, + { + "epoch": 3.74, + "grad_norm": 4.036376476287842, + "learning_rate": 1.4894243970312256e-06, + "logits/chosen": -0.554905891418457, + "logits/rejected": -0.651167631149292, + "logps/chosen": -52.60433578491211, + "logps/rejected": -108.33577728271484, + "loss": 0.6101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.376760721206665, + "rewards/margins": 6.304001331329346, + "rewards/rejected": -2.9272406101226807, + "step": 14944 + }, + { + "epoch": 3.74, + "grad_norm": 2.7013561725616455, + "learning_rate": 1.4888647673275598e-06, + "logits/chosen": -0.4860488772392273, + "logits/rejected": -0.5847737193107605, + "logps/chosen": -56.254783630371094, + "logps/rejected": -110.11316680908203, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0049352645874023, + "rewards/margins": 6.889913082122803, + "rewards/rejected": -3.8849782943725586, + "step": 14945 + }, + { + "epoch": 3.74, + "grad_norm": 5.254696846008301, + "learning_rate": 1.4883052243877805e-06, + "logits/chosen": -0.4953998923301697, + "logits/rejected": -0.5921688079833984, + "logps/chosen": -64.90103912353516, + "logps/rejected": -121.40514373779297, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8893661499023438, + "rewards/margins": 6.712536811828613, + "rewards/rejected": -3.8231704235076904, + "step": 14946 + }, + { + "epoch": 3.74, + "grad_norm": 5.032862663269043, + "learning_rate": 1.4877457682257173e-06, + "logits/chosen": -0.547366738319397, + "logits/rejected": -0.6492509245872498, + "logps/chosen": -61.35554504394531, + "logps/rejected": -109.18049621582031, + "loss": 0.6271, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.255768299102783, + "rewards/margins": 7.649895191192627, + "rewards/rejected": -4.394126892089844, + "step": 14947 + }, + { + "epoch": 3.74, + "grad_norm": 3.9143452644348145, + "learning_rate": 1.4871863988551887e-06, + "logits/chosen": -0.4984336793422699, + "logits/rejected": -0.5874440670013428, + "logps/chosen": -56.82046127319336, + "logps/rejected": -106.36370849609375, + "loss": 0.579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.152865171432495, + "rewards/margins": 7.515652656555176, + "rewards/rejected": -4.362788200378418, + "step": 14948 + }, + { + "epoch": 3.74, + "grad_norm": 5.321296215057373, + "learning_rate": 1.4866271162900203e-06, + "logits/chosen": -0.5430489778518677, + "logits/rejected": -0.6336554288864136, + "logps/chosen": -66.114501953125, + "logps/rejected": -111.51333618164062, + "loss": 0.7801, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.003343105316162, + "rewards/margins": 5.975336074829102, + "rewards/rejected": -2.9719934463500977, + "step": 14949 + }, + { + "epoch": 3.74, + "grad_norm": 7.623297214508057, + "learning_rate": 1.4860679205440348e-06, + "logits/chosen": -0.5854379534721375, + "logits/rejected": -0.6661218404769897, + "logps/chosen": -48.05620574951172, + "logps/rejected": -102.1572036743164, + "loss": 0.6449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.796525478363037, + "rewards/margins": 6.127806186676025, + "rewards/rejected": -3.331280469894409, + "step": 14950 + }, + { + "epoch": 3.74, + "grad_norm": 7.951473236083984, + "learning_rate": 1.485508811631049e-06, + "logits/chosen": -0.5561730861663818, + "logits/rejected": -0.6233685612678528, + "logps/chosen": -64.37860107421875, + "logps/rejected": -108.95431518554688, + "loss": 0.6849, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.64847469329834, + "rewards/margins": 7.0757269859313965, + "rewards/rejected": -4.427251815795898, + "step": 14951 + }, + { + "epoch": 3.74, + "grad_norm": 5.537724018096924, + "learning_rate": 1.4849497895648762e-06, + "logits/chosen": -0.5877687931060791, + "logits/rejected": -0.7235403656959534, + "logps/chosen": -57.55643844604492, + "logps/rejected": -90.04940795898438, + "loss": 0.5373, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7034592628479004, + "rewards/margins": 7.207374572753906, + "rewards/rejected": -4.503914833068848, + "step": 14952 + }, + { + "epoch": 3.74, + "grad_norm": 8.201251029968262, + "learning_rate": 1.4843908543593349e-06, + "logits/chosen": -0.5108377933502197, + "logits/rejected": -0.5861557722091675, + "logps/chosen": -49.8405876159668, + "logps/rejected": -94.29527282714844, + "loss": 0.7022, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.630465269088745, + "rewards/margins": 5.499031066894531, + "rewards/rejected": -2.868565559387207, + "step": 14953 + }, + { + "epoch": 3.74, + "grad_norm": 3.051478624343872, + "learning_rate": 1.4838320060282351e-06, + "logits/chosen": -0.4871543049812317, + "logits/rejected": -0.5391737818717957, + "logps/chosen": -42.34721374511719, + "logps/rejected": -91.41718292236328, + "loss": 0.5151, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8726730346679688, + "rewards/margins": 5.668995380401611, + "rewards/rejected": -2.7963221073150635, + "step": 14954 + }, + { + "epoch": 3.74, + "grad_norm": 3.2031383514404297, + "learning_rate": 1.4832732445853849e-06, + "logits/chosen": -0.5572394132614136, + "logits/rejected": -0.6488156914710999, + "logps/chosen": -48.45545196533203, + "logps/rejected": -109.1610336303711, + "loss": 0.6484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7905561923980713, + "rewards/margins": 8.27092456817627, + "rewards/rejected": -5.480368137359619, + "step": 14955 + }, + { + "epoch": 3.74, + "grad_norm": 5.176820755004883, + "learning_rate": 1.4827145700445943e-06, + "logits/chosen": -0.6438810229301453, + "logits/rejected": -0.7045488953590393, + "logps/chosen": -52.4310302734375, + "logps/rejected": -109.28073120117188, + "loss": 0.6346, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.897979736328125, + "rewards/margins": 7.860584735870361, + "rewards/rejected": -4.962604522705078, + "step": 14956 + }, + { + "epoch": 3.74, + "grad_norm": 4.43428897857666, + "learning_rate": 1.4821559824196669e-06, + "logits/chosen": -0.5735572576522827, + "logits/rejected": -0.6394321918487549, + "logps/chosen": -60.054683685302734, + "logps/rejected": -114.75804138183594, + "loss": 0.6442, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.254254102706909, + "rewards/margins": 7.15944242477417, + "rewards/rejected": -3.9051880836486816, + "step": 14957 + }, + { + "epoch": 3.74, + "grad_norm": 5.959815979003906, + "learning_rate": 1.4815974817244084e-06, + "logits/chosen": -0.5236875414848328, + "logits/rejected": -0.5952541828155518, + "logps/chosen": -54.16202926635742, + "logps/rejected": -109.32015991210938, + "loss": 0.6019, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1203718185424805, + "rewards/margins": 6.567294120788574, + "rewards/rejected": -3.4469218254089355, + "step": 14958 + }, + { + "epoch": 3.74, + "grad_norm": 2.8418147563934326, + "learning_rate": 1.4810390679726183e-06, + "logits/chosen": -0.569271445274353, + "logits/rejected": -0.6427462100982666, + "logps/chosen": -54.21907043457031, + "logps/rejected": -107.58838653564453, + "loss": 0.633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1453027725219727, + "rewards/margins": 7.478170871734619, + "rewards/rejected": -4.332867622375488, + "step": 14959 + }, + { + "epoch": 3.74, + "grad_norm": 7.855808258056641, + "learning_rate": 1.4804807411780942e-06, + "logits/chosen": -0.5381054282188416, + "logits/rejected": -0.5474463701248169, + "logps/chosen": -52.65610885620117, + "logps/rejected": -134.87364196777344, + "loss": 0.6318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.003211736679077, + "rewards/margins": 7.9758405685424805, + "rewards/rejected": -4.972629547119141, + "step": 14960 + }, + { + "epoch": 3.74, + "grad_norm": 4.398216247558594, + "learning_rate": 1.4799225013546343e-06, + "logits/chosen": -0.523213803768158, + "logits/rejected": -0.6184496283531189, + "logps/chosen": -61.36178970336914, + "logps/rejected": -104.4761962890625, + "loss": 0.6962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0025877952575684, + "rewards/margins": 5.874756336212158, + "rewards/rejected": -2.8721680641174316, + "step": 14961 + }, + { + "epoch": 3.74, + "grad_norm": 2.8813648223876953, + "learning_rate": 1.4793643485160354e-06, + "logits/chosen": -0.54184889793396, + "logits/rejected": -0.5832460522651672, + "logps/chosen": -48.05709457397461, + "logps/rejected": -107.33683776855469, + "loss": 0.554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0506844520568848, + "rewards/margins": 6.619329452514648, + "rewards/rejected": -3.5686442852020264, + "step": 14962 + }, + { + "epoch": 3.74, + "grad_norm": 5.60479211807251, + "learning_rate": 1.478806282676088e-06, + "logits/chosen": -0.6141259670257568, + "logits/rejected": -0.5969761610031128, + "logps/chosen": -47.15816116333008, + "logps/rejected": -126.27033233642578, + "loss": 0.7226, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.166757583618164, + "rewards/margins": 6.642696857452393, + "rewards/rejected": -3.4759392738342285, + "step": 14963 + }, + { + "epoch": 3.74, + "grad_norm": 3.9716596603393555, + "learning_rate": 1.4782483038485812e-06, + "logits/chosen": -0.5562431216239929, + "logits/rejected": -0.6963980793952942, + "logps/chosen": -59.37967300415039, + "logps/rejected": -95.23587799072266, + "loss": 0.6015, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.221663475036621, + "rewards/margins": 6.830206871032715, + "rewards/rejected": -3.608543634414673, + "step": 14964 + }, + { + "epoch": 3.74, + "grad_norm": 25.15091323852539, + "learning_rate": 1.4776904120473062e-06, + "logits/chosen": -0.567221462726593, + "logits/rejected": -0.6454218626022339, + "logps/chosen": -54.155879974365234, + "logps/rejected": -107.05699157714844, + "loss": 0.7215, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.163167715072632, + "rewards/margins": 7.689253330230713, + "rewards/rejected": -4.526084899902344, + "step": 14965 + }, + { + "epoch": 3.74, + "grad_norm": 4.695411205291748, + "learning_rate": 1.4771326072860475e-06, + "logits/chosen": -0.5083043575286865, + "logits/rejected": -0.5616824626922607, + "logps/chosen": -50.828582763671875, + "logps/rejected": -99.28759765625, + "loss": 0.5875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2461998462677, + "rewards/margins": 6.3462066650390625, + "rewards/rejected": -3.100006341934204, + "step": 14966 + }, + { + "epoch": 3.74, + "grad_norm": 8.701476097106934, + "learning_rate": 1.4765748895785881e-06, + "logits/chosen": -0.5420219302177429, + "logits/rejected": -0.5847535133361816, + "logps/chosen": -51.805519104003906, + "logps/rejected": -108.47268676757812, + "loss": 0.6402, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4886653423309326, + "rewards/margins": 7.226004600524902, + "rewards/rejected": -3.737339735031128, + "step": 14967 + }, + { + "epoch": 3.74, + "grad_norm": 7.566315174102783, + "learning_rate": 1.4760172589387118e-06, + "logits/chosen": -0.5686476826667786, + "logits/rejected": -0.6547384858131409, + "logps/chosen": -57.4656867980957, + "logps/rejected": -87.48771667480469, + "loss": 0.6744, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2228972911834717, + "rewards/margins": 6.428369045257568, + "rewards/rejected": -3.2054710388183594, + "step": 14968 + }, + { + "epoch": 3.74, + "grad_norm": 7.135870456695557, + "learning_rate": 1.4754597153801975e-06, + "logits/chosen": -0.604245662689209, + "logits/rejected": -0.6549227237701416, + "logps/chosen": -49.78214645385742, + "logps/rejected": -109.3228530883789, + "loss": 0.7624, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.034464120864868, + "rewards/margins": 7.262754917144775, + "rewards/rejected": -4.228291034698486, + "step": 14969 + }, + { + "epoch": 3.74, + "grad_norm": 4.619007587432861, + "learning_rate": 1.474902258916821e-06, + "logits/chosen": -0.5080751776695251, + "logits/rejected": -0.6120469570159912, + "logps/chosen": -53.941829681396484, + "logps/rejected": -87.5457992553711, + "loss": 0.6247, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.233896017074585, + "rewards/margins": 7.039274215698242, + "rewards/rejected": -3.805377960205078, + "step": 14970 + }, + { + "epoch": 3.75, + "grad_norm": 5.374202728271484, + "learning_rate": 1.4743448895623607e-06, + "logits/chosen": -0.5508348345756531, + "logits/rejected": -0.6491920948028564, + "logps/chosen": -54.20128631591797, + "logps/rejected": -93.94761657714844, + "loss": 0.5935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.391066789627075, + "rewards/margins": 6.953578948974609, + "rewards/rejected": -3.562511920928955, + "step": 14971 + }, + { + "epoch": 3.75, + "grad_norm": 11.917860984802246, + "learning_rate": 1.4737876073305885e-06, + "logits/chosen": -0.5204799175262451, + "logits/rejected": -0.6091579794883728, + "logps/chosen": -55.58570861816406, + "logps/rejected": -125.14900970458984, + "loss": 0.6657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1063928604125977, + "rewards/margins": 7.004933834075928, + "rewards/rejected": -3.8985414505004883, + "step": 14972 + }, + { + "epoch": 3.75, + "grad_norm": 4.478848934173584, + "learning_rate": 1.4732304122352737e-06, + "logits/chosen": -0.5168567299842834, + "logits/rejected": -0.5362247824668884, + "logps/chosen": -59.17572021484375, + "logps/rejected": -126.20500946044922, + "loss": 0.6407, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.428715229034424, + "rewards/margins": 7.869050025939941, + "rewards/rejected": -4.440335750579834, + "step": 14973 + }, + { + "epoch": 3.75, + "grad_norm": 5.913604736328125, + "learning_rate": 1.472673304290186e-06, + "logits/chosen": -0.5015941262245178, + "logits/rejected": -0.5853347182273865, + "logps/chosen": -59.417747497558594, + "logps/rejected": -110.70626068115234, + "loss": 0.6284, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3823959827423096, + "rewards/margins": 7.658122539520264, + "rewards/rejected": -4.275725841522217, + "step": 14974 + }, + { + "epoch": 3.75, + "grad_norm": 1.4276005029678345, + "learning_rate": 1.4721162835090947e-06, + "logits/chosen": -0.5314916968345642, + "logits/rejected": -0.6465733647346497, + "logps/chosen": -43.10495376586914, + "logps/rejected": -99.12210845947266, + "loss": 0.433, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3463802337646484, + "rewards/margins": 8.326530456542969, + "rewards/rejected": -4.98015022277832, + "step": 14975 + }, + { + "epoch": 3.75, + "grad_norm": 3.286083459854126, + "learning_rate": 1.4715593499057622e-06, + "logits/chosen": -0.6094383001327515, + "logits/rejected": -0.7200862169265747, + "logps/chosen": -47.11331558227539, + "logps/rejected": -105.17001342773438, + "loss": 0.5767, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0771167278289795, + "rewards/margins": 8.482138633728027, + "rewards/rejected": -5.405022144317627, + "step": 14976 + }, + { + "epoch": 3.75, + "grad_norm": 4.015704154968262, + "learning_rate": 1.47100250349395e-06, + "logits/chosen": -0.6255552172660828, + "logits/rejected": -0.7094061374664307, + "logps/chosen": -48.27553939819336, + "logps/rejected": -97.79154205322266, + "loss": 0.6401, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.384445905685425, + "rewards/margins": 7.899796962738037, + "rewards/rejected": -4.515350818634033, + "step": 14977 + }, + { + "epoch": 3.75, + "grad_norm": 4.41621732711792, + "learning_rate": 1.470445744287421e-06, + "logits/chosen": -0.612761378288269, + "logits/rejected": -0.6247924566268921, + "logps/chosen": -55.33081817626953, + "logps/rejected": -113.92753601074219, + "loss": 0.6333, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.097693920135498, + "rewards/margins": 6.339436054229736, + "rewards/rejected": -3.2417421340942383, + "step": 14978 + }, + { + "epoch": 3.75, + "grad_norm": 4.363362789154053, + "learning_rate": 1.469889072299932e-06, + "logits/chosen": -0.4879775941371918, + "logits/rejected": -0.5542770624160767, + "logps/chosen": -50.6778450012207, + "logps/rejected": -106.17440795898438, + "loss": 0.6005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3784971237182617, + "rewards/margins": 7.019140720367432, + "rewards/rejected": -3.64064359664917, + "step": 14979 + }, + { + "epoch": 3.75, + "grad_norm": 7.081836700439453, + "learning_rate": 1.4693324875452369e-06, + "logits/chosen": -0.6059759259223938, + "logits/rejected": -0.7170306444168091, + "logps/chosen": -54.49662780761719, + "logps/rejected": -100.23817443847656, + "loss": 0.7393, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.572474241256714, + "rewards/margins": 7.430648326873779, + "rewards/rejected": -4.8581743240356445, + "step": 14980 + }, + { + "epoch": 3.75, + "grad_norm": 6.335628509521484, + "learning_rate": 1.468775990037093e-06, + "logits/chosen": -0.5054564476013184, + "logits/rejected": -0.5839765071868896, + "logps/chosen": -49.07297897338867, + "logps/rejected": -109.8636245727539, + "loss": 0.5769, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1148428916931152, + "rewards/margins": 7.9120402336120605, + "rewards/rejected": -4.7971978187561035, + "step": 14981 + }, + { + "epoch": 3.75, + "grad_norm": 14.562941551208496, + "learning_rate": 1.468219579789249e-06, + "logits/chosen": -0.5090106129646301, + "logits/rejected": -0.5954310894012451, + "logps/chosen": -57.40636444091797, + "logps/rejected": -111.17337036132812, + "loss": 0.662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0149292945861816, + "rewards/margins": 7.035715103149414, + "rewards/rejected": -4.020785808563232, + "step": 14982 + }, + { + "epoch": 3.75, + "grad_norm": 11.563579559326172, + "learning_rate": 1.4676632568154575e-06, + "logits/chosen": -0.52065110206604, + "logits/rejected": -0.5795845985412598, + "logps/chosen": -57.362586975097656, + "logps/rejected": -104.90573120117188, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089470148086548, + "rewards/margins": 6.471487045288086, + "rewards/rejected": -3.38201642036438, + "step": 14983 + }, + { + "epoch": 3.75, + "grad_norm": 5.323122024536133, + "learning_rate": 1.4671070211294635e-06, + "logits/chosen": -0.5656673908233643, + "logits/rejected": -0.6975946426391602, + "logps/chosen": -63.969364166259766, + "logps/rejected": -94.78660583496094, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.199251651763916, + "rewards/margins": 7.909397125244141, + "rewards/rejected": -4.710145473480225, + "step": 14984 + }, + { + "epoch": 3.75, + "grad_norm": 3.9856717586517334, + "learning_rate": 1.4665508727450118e-06, + "logits/chosen": -0.5637986660003662, + "logits/rejected": -0.6888167858123779, + "logps/chosen": -57.44670867919922, + "logps/rejected": -120.83473205566406, + "loss": 0.6428, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.840923547744751, + "rewards/margins": 8.339054107666016, + "rewards/rejected": -5.4981303215026855, + "step": 14985 + }, + { + "epoch": 3.75, + "grad_norm": 12.377382278442383, + "learning_rate": 1.465994811675846e-06, + "logits/chosen": -0.540023148059845, + "logits/rejected": -0.6369971632957458, + "logps/chosen": -51.768375396728516, + "logps/rejected": -90.49341583251953, + "loss": 0.7142, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2723746299743652, + "rewards/margins": 5.988415718078613, + "rewards/rejected": -2.71604061126709, + "step": 14986 + }, + { + "epoch": 3.75, + "grad_norm": 14.218358993530273, + "learning_rate": 1.4654388379357109e-06, + "logits/chosen": -0.5674153566360474, + "logits/rejected": -0.5940207839012146, + "logps/chosen": -57.00061798095703, + "logps/rejected": -132.64556884765625, + "loss": 0.7548, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.885343551635742, + "rewards/margins": 6.20107364654541, + "rewards/rejected": -3.315730094909668, + "step": 14987 + }, + { + "epoch": 3.75, + "grad_norm": 3.925649642944336, + "learning_rate": 1.4648829515383378e-06, + "logits/chosen": -0.5051242709159851, + "logits/rejected": -0.5960453748703003, + "logps/chosen": -62.47693634033203, + "logps/rejected": -104.31085205078125, + "loss": 0.5728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1333541870117188, + "rewards/margins": 7.0834808349609375, + "rewards/rejected": -3.9501266479492188, + "step": 14988 + }, + { + "epoch": 3.75, + "grad_norm": 5.5356903076171875, + "learning_rate": 1.4643271524974679e-06, + "logits/chosen": -0.5698499083518982, + "logits/rejected": -0.6053622961044312, + "logps/chosen": -60.05015182495117, + "logps/rejected": -104.49686431884766, + "loss": 0.6414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8828742504119873, + "rewards/margins": 6.4178466796875, + "rewards/rejected": -3.534972667694092, + "step": 14989 + }, + { + "epoch": 3.75, + "grad_norm": 5.641829013824463, + "learning_rate": 1.4637714408268372e-06, + "logits/chosen": -0.5016846656799316, + "logits/rejected": -0.5945158004760742, + "logps/chosen": -59.7968635559082, + "logps/rejected": -104.24620819091797, + "loss": 0.6819, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.858461856842041, + "rewards/margins": 6.695042610168457, + "rewards/rejected": -3.836580514907837, + "step": 14990 + }, + { + "epoch": 3.75, + "grad_norm": 5.943961143493652, + "learning_rate": 1.4632158165401728e-06, + "logits/chosen": -0.5034812688827515, + "logits/rejected": -0.5995931029319763, + "logps/chosen": -62.48930358886719, + "logps/rejected": -99.34223937988281, + "loss": 0.589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2262818813323975, + "rewards/margins": 7.888592720031738, + "rewards/rejected": -4.66231107711792, + "step": 14991 + }, + { + "epoch": 3.75, + "grad_norm": 4.009613990783691, + "learning_rate": 1.4626602796512074e-06, + "logits/chosen": -0.5874525904655457, + "logits/rejected": -0.6898766756057739, + "logps/chosen": -54.39181137084961, + "logps/rejected": -107.9604263305664, + "loss": 0.5762, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7147819995880127, + "rewards/margins": 7.849159240722656, + "rewards/rejected": -4.134377479553223, + "step": 14992 + }, + { + "epoch": 3.75, + "grad_norm": 3.095583200454712, + "learning_rate": 1.46210483017367e-06, + "logits/chosen": -0.5696820616722107, + "logits/rejected": -0.6589334011077881, + "logps/chosen": -58.378841400146484, + "logps/rejected": -109.21173095703125, + "loss": 0.6246, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9353578090667725, + "rewards/margins": 7.359620571136475, + "rewards/rejected": -4.424263000488281, + "step": 14993 + }, + { + "epoch": 3.75, + "grad_norm": 3.8511409759521484, + "learning_rate": 1.461549468121286e-06, + "logits/chosen": -0.49465543031692505, + "logits/rejected": -0.6596501469612122, + "logps/chosen": -76.36188507080078, + "logps/rejected": -107.91641998291016, + "loss": 0.6811, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0994839668273926, + "rewards/margins": 7.819954872131348, + "rewards/rejected": -4.720471382141113, + "step": 14994 + }, + { + "epoch": 3.75, + "grad_norm": 3.7594528198242188, + "learning_rate": 1.4609941935077764e-06, + "logits/chosen": -0.5650083422660828, + "logits/rejected": -0.6669410467147827, + "logps/chosen": -67.84363555908203, + "logps/rejected": -95.1487045288086, + "loss": 0.606, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.363459587097168, + "rewards/margins": 7.686318397521973, + "rewards/rejected": -4.3228583335876465, + "step": 14995 + }, + { + "epoch": 3.75, + "grad_norm": 3.9831643104553223, + "learning_rate": 1.4604390063468666e-06, + "logits/chosen": -0.5856679677963257, + "logits/rejected": -0.5979648232460022, + "logps/chosen": -48.579185485839844, + "logps/rejected": -109.55133819580078, + "loss": 0.6156, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.36759614944458, + "rewards/margins": 6.867093086242676, + "rewards/rejected": -3.499497413635254, + "step": 14996 + }, + { + "epoch": 3.75, + "grad_norm": 4.608013153076172, + "learning_rate": 1.4598839066522734e-06, + "logits/chosen": -0.5876575112342834, + "logits/rejected": -0.679935872554779, + "logps/chosen": -52.14281463623047, + "logps/rejected": -91.75948333740234, + "loss": 0.6192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0255398750305176, + "rewards/margins": 6.594688415527344, + "rewards/rejected": -3.5691487789154053, + "step": 14997 + }, + { + "epoch": 3.75, + "grad_norm": 3.757481575012207, + "learning_rate": 1.4593288944377127e-06, + "logits/chosen": -0.5132073163986206, + "logits/rejected": -0.598767101764679, + "logps/chosen": -50.373477935791016, + "logps/rejected": -111.02210998535156, + "loss": 0.5153, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0664288997650146, + "rewards/margins": 7.932889938354492, + "rewards/rejected": -4.866461277008057, + "step": 14998 + }, + { + "epoch": 3.75, + "grad_norm": 19.038330078125, + "learning_rate": 1.4587739697169029e-06, + "logits/chosen": -0.569521963596344, + "logits/rejected": -0.6557264924049377, + "logps/chosen": -60.250648498535156, + "logps/rejected": -114.82453918457031, + "loss": 0.8939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9852447509765625, + "rewards/margins": 8.576658248901367, + "rewards/rejected": -5.591413497924805, + "step": 14999 + }, + { + "epoch": 3.75, + "grad_norm": 3.0559659004211426, + "learning_rate": 1.4582191325035526e-06, + "logits/chosen": -0.528924822807312, + "logits/rejected": -0.6442509889602661, + "logps/chosen": -57.01188278198242, + "logps/rejected": -100.32112121582031, + "loss": 0.5859, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8771040439605713, + "rewards/margins": 7.21908712387085, + "rewards/rejected": -4.341982364654541, + "step": 15000 + }, + { + "epoch": 3.75, + "grad_norm": 6.805330753326416, + "learning_rate": 1.4576643828113773e-06, + "logits/chosen": -0.5295836925506592, + "logits/rejected": -0.6383122801780701, + "logps/chosen": -69.82268524169922, + "logps/rejected": -98.47283935546875, + "loss": 0.8063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.964322566986084, + "rewards/margins": 6.4675397872924805, + "rewards/rejected": -3.5032174587249756, + "step": 15001 + }, + { + "epoch": 3.75, + "grad_norm": 14.813911437988281, + "learning_rate": 1.4571097206540823e-06, + "logits/chosen": -0.511557400226593, + "logits/rejected": -0.5464489459991455, + "logps/chosen": -52.70163345336914, + "logps/rejected": -120.63875579833984, + "loss": 0.6349, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100025177001953, + "rewards/margins": 7.407907485961914, + "rewards/rejected": -4.307882308959961, + "step": 15002 + }, + { + "epoch": 3.75, + "grad_norm": 6.187306880950928, + "learning_rate": 1.456555146045373e-06, + "logits/chosen": -0.6095811724662781, + "logits/rejected": -0.7023717761039734, + "logps/chosen": -48.44563293457031, + "logps/rejected": -108.24557495117188, + "loss": 0.6092, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.950059652328491, + "rewards/margins": 7.572746276855469, + "rewards/rejected": -4.622686862945557, + "step": 15003 + }, + { + "epoch": 3.75, + "grad_norm": 2.8395657539367676, + "learning_rate": 1.4560006589989567e-06, + "logits/chosen": -0.5335336923599243, + "logits/rejected": -0.6625495553016663, + "logps/chosen": -54.81047058105469, + "logps/rejected": -97.02420806884766, + "loss": 0.5704, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1910789012908936, + "rewards/margins": 8.494003295898438, + "rewards/rejected": -5.302924633026123, + "step": 15004 + }, + { + "epoch": 3.75, + "grad_norm": 7.806646347045898, + "learning_rate": 1.4554462595285319e-06, + "logits/chosen": -0.5295200943946838, + "logits/rejected": -0.6157440543174744, + "logps/chosen": -49.86547088623047, + "logps/rejected": -90.22492980957031, + "loss": 0.6783, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.004838466644287, + "rewards/margins": 5.600803375244141, + "rewards/rejected": -2.5959651470184326, + "step": 15005 + }, + { + "epoch": 3.75, + "grad_norm": 5.268581867218018, + "learning_rate": 1.4548919476478018e-06, + "logits/chosen": -0.5383225679397583, + "logits/rejected": -0.6319065690040588, + "logps/chosen": -65.82472229003906, + "logps/rejected": -109.70500183105469, + "loss": 0.713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.958463191986084, + "rewards/margins": 7.522033214569092, + "rewards/rejected": -4.563570022583008, + "step": 15006 + }, + { + "epoch": 3.75, + "grad_norm": 7.354860782623291, + "learning_rate": 1.4543377233704608e-06, + "logits/chosen": -0.4694070518016815, + "logits/rejected": -0.557640790939331, + "logps/chosen": -53.4304084777832, + "logps/rejected": -125.57759094238281, + "loss": 0.5806, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.987734079360962, + "rewards/margins": 7.899318218231201, + "rewards/rejected": -4.911584377288818, + "step": 15007 + }, + { + "epoch": 3.75, + "grad_norm": 4.125625133514404, + "learning_rate": 1.4537835867102078e-06, + "logits/chosen": -0.6460726857185364, + "logits/rejected": -0.6907038688659668, + "logps/chosen": -53.32407760620117, + "logps/rejected": -119.93388366699219, + "loss": 0.6576, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4187116622924805, + "rewards/margins": 6.7590203285217285, + "rewards/rejected": -3.340308666229248, + "step": 15008 + }, + { + "epoch": 3.75, + "grad_norm": 6.247084140777588, + "learning_rate": 1.4532295376807343e-06, + "logits/chosen": -0.5428916811943054, + "logits/rejected": -0.6156345009803772, + "logps/chosen": -69.16362762451172, + "logps/rejected": -107.23294067382812, + "loss": 0.7144, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.963197946548462, + "rewards/margins": 6.662839889526367, + "rewards/rejected": -3.6996424198150635, + "step": 15009 + }, + { + "epoch": 3.75, + "grad_norm": 6.57875919342041, + "learning_rate": 1.4526755762957301e-06, + "logits/chosen": -0.5202625393867493, + "logits/rejected": -0.6003093719482422, + "logps/chosen": -54.210723876953125, + "logps/rejected": -97.20679473876953, + "loss": 0.7266, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2382888793945312, + "rewards/margins": 6.362312316894531, + "rewards/rejected": -3.124023914337158, + "step": 15010 + }, + { + "epoch": 3.76, + "grad_norm": 3.8724513053894043, + "learning_rate": 1.4521217025688866e-06, + "logits/chosen": -0.551770806312561, + "logits/rejected": -0.6342086791992188, + "logps/chosen": -61.720157623291016, + "logps/rejected": -83.8510513305664, + "loss": 0.6961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1823196411132812, + "rewards/margins": 6.859917640686035, + "rewards/rejected": -3.6775975227355957, + "step": 15011 + }, + { + "epoch": 3.76, + "grad_norm": 5.536527633666992, + "learning_rate": 1.4515679165138897e-06, + "logits/chosen": -0.5019874572753906, + "logits/rejected": -0.561836838722229, + "logps/chosen": -52.71733093261719, + "logps/rejected": -107.7756118774414, + "loss": 0.6848, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3751955032348633, + "rewards/margins": 5.838622570037842, + "rewards/rejected": -2.4634268283843994, + "step": 15012 + }, + { + "epoch": 3.76, + "grad_norm": 7.153169631958008, + "learning_rate": 1.451014218144422e-06, + "logits/chosen": -0.48380792140960693, + "logits/rejected": -0.5510025024414062, + "logps/chosen": -61.433807373046875, + "logps/rejected": -106.76890563964844, + "loss": 0.7093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.596222400665283, + "rewards/margins": 5.933249473571777, + "rewards/rejected": -3.337026834487915, + "step": 15013 + }, + { + "epoch": 3.76, + "grad_norm": 4.440451622009277, + "learning_rate": 1.45046060747417e-06, + "logits/chosen": -0.5527622699737549, + "logits/rejected": -0.6260027289390564, + "logps/chosen": -58.86479949951172, + "logps/rejected": -105.23439025878906, + "loss": 0.6158, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.607949733734131, + "rewards/margins": 7.029945373535156, + "rewards/rejected": -3.4219956398010254, + "step": 15014 + }, + { + "epoch": 3.76, + "grad_norm": 11.2825927734375, + "learning_rate": 1.4499070845168112e-06, + "logits/chosen": -0.5867702960968018, + "logits/rejected": -0.6558127999305725, + "logps/chosen": -48.97090148925781, + "logps/rejected": -105.04292297363281, + "loss": 0.7527, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.169593095779419, + "rewards/margins": 5.704081058502197, + "rewards/rejected": -2.5344879627227783, + "step": 15015 + }, + { + "epoch": 3.76, + "grad_norm": 5.956045627593994, + "learning_rate": 1.4493536492860233e-06, + "logits/chosen": -0.5773393511772156, + "logits/rejected": -0.6562577486038208, + "logps/chosen": -44.892215728759766, + "logps/rejected": -84.44164276123047, + "loss": 0.5906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.076235294342041, + "rewards/margins": 6.920079231262207, + "rewards/rejected": -3.843843460083008, + "step": 15016 + }, + { + "epoch": 3.76, + "grad_norm": 4.645730972290039, + "learning_rate": 1.4488003017954833e-06, + "logits/chosen": -0.5497803092002869, + "logits/rejected": -0.6284738183021545, + "logps/chosen": -72.26219940185547, + "logps/rejected": -111.79850006103516, + "loss": 0.6486, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9571733474731445, + "rewards/margins": 7.003330707550049, + "rewards/rejected": -4.046157360076904, + "step": 15017 + }, + { + "epoch": 3.76, + "grad_norm": 16.073579788208008, + "learning_rate": 1.4482470420588662e-06, + "logits/chosen": -0.556990921497345, + "logits/rejected": -0.5705944299697876, + "logps/chosen": -56.04893493652344, + "logps/rejected": -107.56433868408203, + "loss": 0.744, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4401519298553467, + "rewards/margins": 6.649397850036621, + "rewards/rejected": -3.2092463970184326, + "step": 15018 + }, + { + "epoch": 3.76, + "grad_norm": 2.5010900497436523, + "learning_rate": 1.447693870089843e-06, + "logits/chosen": -0.5532252192497253, + "logits/rejected": -0.6601671576499939, + "logps/chosen": -57.8465461730957, + "logps/rejected": -92.3497314453125, + "loss": 0.5547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2631425857543945, + "rewards/margins": 6.662735462188721, + "rewards/rejected": -3.3995935916900635, + "step": 15019 + }, + { + "epoch": 3.76, + "grad_norm": 5.770397186279297, + "learning_rate": 1.4471407859020809e-06, + "logits/chosen": -0.5665420889854431, + "logits/rejected": -0.6624480485916138, + "logps/chosen": -63.7403678894043, + "logps/rejected": -124.77607727050781, + "loss": 0.7102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1392617225646973, + "rewards/margins": 8.118546485900879, + "rewards/rejected": -4.979284286499023, + "step": 15020 + }, + { + "epoch": 3.76, + "grad_norm": 26.838560104370117, + "learning_rate": 1.4465877895092505e-06, + "logits/chosen": -0.5468793511390686, + "logits/rejected": -0.6435860991477966, + "logps/chosen": -62.488365173339844, + "logps/rejected": -100.33114624023438, + "loss": 0.6876, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7231175899505615, + "rewards/margins": 6.157609462738037, + "rewards/rejected": -3.4344916343688965, + "step": 15021 + }, + { + "epoch": 3.76, + "grad_norm": 7.235170841217041, + "learning_rate": 1.4460348809250158e-06, + "logits/chosen": -0.5354970693588257, + "logits/rejected": -0.5983157753944397, + "logps/chosen": -51.123046875, + "logps/rejected": -132.47314453125, + "loss": 0.6903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.422636032104492, + "rewards/margins": 7.665704250335693, + "rewards/rejected": -4.243067741394043, + "step": 15022 + }, + { + "epoch": 3.76, + "grad_norm": 10.6709566116333, + "learning_rate": 1.4454820601630376e-06, + "logits/chosen": -0.4986647963523865, + "logits/rejected": -0.5867615342140198, + "logps/chosen": -49.46134948730469, + "logps/rejected": -96.1756591796875, + "loss": 0.6183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.979710578918457, + "rewards/margins": 6.332553863525391, + "rewards/rejected": -3.3528432846069336, + "step": 15023 + }, + { + "epoch": 3.76, + "grad_norm": 6.507706165313721, + "learning_rate": 1.4449293272369796e-06, + "logits/chosen": -0.5730622410774231, + "logits/rejected": -0.6420652270317078, + "logps/chosen": -50.46002197265625, + "logps/rejected": -107.07569885253906, + "loss": 0.6457, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.53743577003479, + "rewards/margins": 8.099418640136719, + "rewards/rejected": -4.561982154846191, + "step": 15024 + }, + { + "epoch": 3.76, + "grad_norm": 6.466487407684326, + "learning_rate": 1.4443766821604981e-06, + "logits/chosen": -0.6226333975791931, + "logits/rejected": -0.62455815076828, + "logps/chosen": -45.652225494384766, + "logps/rejected": -102.28248596191406, + "loss": 0.694, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0030713081359863, + "rewards/margins": 5.444331169128418, + "rewards/rejected": -2.44126033782959, + "step": 15025 + }, + { + "epoch": 3.76, + "grad_norm": 10.158407211303711, + "learning_rate": 1.4438241249472523e-06, + "logits/chosen": -0.5152555108070374, + "logits/rejected": -0.6352598667144775, + "logps/chosen": -63.83554458618164, + "logps/rejected": -114.32489013671875, + "loss": 0.7022, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.612135171890259, + "rewards/margins": 7.106267929077148, + "rewards/rejected": -4.494132995605469, + "step": 15026 + }, + { + "epoch": 3.76, + "grad_norm": 12.4237699508667, + "learning_rate": 1.4432716556108944e-06, + "logits/chosen": -0.5272073149681091, + "logits/rejected": -0.6182827353477478, + "logps/chosen": -56.1066780090332, + "logps/rejected": -102.97163391113281, + "loss": 0.6499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.079878091812134, + "rewards/margins": 6.520081996917725, + "rewards/rejected": -3.440204620361328, + "step": 15027 + }, + { + "epoch": 3.76, + "grad_norm": 8.092039108276367, + "learning_rate": 1.442719274165076e-06, + "logits/chosen": -0.5617896318435669, + "logits/rejected": -0.6859012246131897, + "logps/chosen": -58.31028747558594, + "logps/rejected": -87.21316528320312, + "loss": 0.7775, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9463346004486084, + "rewards/margins": 6.66981315612793, + "rewards/rejected": -3.7234785556793213, + "step": 15028 + }, + { + "epoch": 3.76, + "grad_norm": 6.305783748626709, + "learning_rate": 1.4421669806234495e-06, + "logits/chosen": -0.6197332739830017, + "logits/rejected": -0.6842668652534485, + "logps/chosen": -51.383888244628906, + "logps/rejected": -100.50493621826172, + "loss": 0.7006, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.084214448928833, + "rewards/margins": 6.4147138595581055, + "rewards/rejected": -3.3304998874664307, + "step": 15029 + }, + { + "epoch": 3.76, + "grad_norm": 6.282209873199463, + "learning_rate": 1.4416147749996612e-06, + "logits/chosen": -0.5416102409362793, + "logits/rejected": -0.5950912833213806, + "logps/chosen": -53.53805160522461, + "logps/rejected": -98.88272094726562, + "loss": 0.7059, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3999099731445312, + "rewards/margins": 6.843362331390381, + "rewards/rejected": -3.4434523582458496, + "step": 15030 + }, + { + "epoch": 3.76, + "grad_norm": 2.7100908756256104, + "learning_rate": 1.441062657307355e-06, + "logits/chosen": -0.5183848738670349, + "logits/rejected": -0.5623782873153687, + "logps/chosen": -59.88412094116211, + "logps/rejected": -120.8238525390625, + "loss": 0.6121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8353195190429688, + "rewards/margins": 7.986663341522217, + "rewards/rejected": -5.151344299316406, + "step": 15031 + }, + { + "epoch": 3.76, + "grad_norm": 2.5929863452911377, + "learning_rate": 1.440510627560176e-06, + "logits/chosen": -0.5408458113670349, + "logits/rejected": -0.5788573622703552, + "logps/chosen": -42.33903884887695, + "logps/rejected": -116.97988891601562, + "loss": 0.5794, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0891425609588623, + "rewards/margins": 7.935436248779297, + "rewards/rejected": -4.846293926239014, + "step": 15032 + }, + { + "epoch": 3.76, + "grad_norm": 17.842056274414062, + "learning_rate": 1.4399586857717685e-06, + "logits/chosen": -0.5215082764625549, + "logits/rejected": -0.6171002984046936, + "logps/chosen": -53.24341583251953, + "logps/rejected": -99.96844482421875, + "loss": 0.6714, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3590850830078125, + "rewards/margins": 6.737938404083252, + "rewards/rejected": -3.3788533210754395, + "step": 15033 + }, + { + "epoch": 3.76, + "grad_norm": 2.1211278438568115, + "learning_rate": 1.4394068319557653e-06, + "logits/chosen": -0.5482949614524841, + "logits/rejected": -0.6380086541175842, + "logps/chosen": -53.73940658569336, + "logps/rejected": -103.58773040771484, + "loss": 0.5686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4616973400115967, + "rewards/margins": 7.691895008087158, + "rewards/rejected": -4.230197906494141, + "step": 15034 + }, + { + "epoch": 3.76, + "grad_norm": 5.111935615539551, + "learning_rate": 1.4388550661258066e-06, + "logits/chosen": -0.5228458642959595, + "logits/rejected": -0.666199266910553, + "logps/chosen": -56.164634704589844, + "logps/rejected": -107.23431396484375, + "loss": 0.6324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.175832748413086, + "rewards/margins": 8.737055778503418, + "rewards/rejected": -5.56122350692749, + "step": 15035 + }, + { + "epoch": 3.76, + "grad_norm": 3.022311210632324, + "learning_rate": 1.4383033882955284e-06, + "logits/chosen": -0.5691591501235962, + "logits/rejected": -0.6881076097488403, + "logps/chosen": -58.25074005126953, + "logps/rejected": -102.28455352783203, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5001754760742188, + "rewards/margins": 8.581839561462402, + "rewards/rejected": -5.081664085388184, + "step": 15036 + }, + { + "epoch": 3.76, + "grad_norm": 7.911903381347656, + "learning_rate": 1.4377517984785623e-06, + "logits/chosen": -0.5639093518257141, + "logits/rejected": -0.6552099585533142, + "logps/chosen": -64.95881652832031, + "logps/rejected": -117.91744995117188, + "loss": 0.6688, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8954601287841797, + "rewards/margins": 7.690230369567871, + "rewards/rejected": -4.79477071762085, + "step": 15037 + }, + { + "epoch": 3.76, + "grad_norm": 6.695455074310303, + "learning_rate": 1.4372002966885361e-06, + "logits/chosen": -0.5097569227218628, + "logits/rejected": -0.5513051152229309, + "logps/chosen": -51.17076873779297, + "logps/rejected": -98.39595794677734, + "loss": 0.5572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2484283447265625, + "rewards/margins": 5.524198532104492, + "rewards/rejected": -2.2757694721221924, + "step": 15038 + }, + { + "epoch": 3.76, + "grad_norm": 10.813687324523926, + "learning_rate": 1.4366488829390824e-06, + "logits/chosen": -0.5620959401130676, + "logits/rejected": -0.6614376306533813, + "logps/chosen": -51.016719818115234, + "logps/rejected": -101.1630630493164, + "loss": 0.6053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2784676551818848, + "rewards/margins": 7.008655548095703, + "rewards/rejected": -3.7301881313323975, + "step": 15039 + }, + { + "epoch": 3.76, + "grad_norm": 2.1949222087860107, + "learning_rate": 1.4360975572438246e-06, + "logits/chosen": -0.5609275102615356, + "logits/rejected": -0.7087169289588928, + "logps/chosen": -58.836551666259766, + "logps/rejected": -89.3867416381836, + "loss": 0.5574, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3647642135620117, + "rewards/margins": 7.860915184020996, + "rewards/rejected": -4.496150970458984, + "step": 15040 + }, + { + "epoch": 3.76, + "grad_norm": 4.362638473510742, + "learning_rate": 1.4355463196163849e-06, + "logits/chosen": -0.5757616758346558, + "logits/rejected": -0.5982206463813782, + "logps/chosen": -49.80408477783203, + "logps/rejected": -116.04100799560547, + "loss": 0.6729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.941659450531006, + "rewards/margins": 6.335290908813477, + "rewards/rejected": -3.393631935119629, + "step": 15041 + }, + { + "epoch": 3.76, + "grad_norm": 4.185311317443848, + "learning_rate": 1.434995170070389e-06, + "logits/chosen": -0.47997939586639404, + "logits/rejected": -0.552730143070221, + "logps/chosen": -47.322105407714844, + "logps/rejected": -101.68877410888672, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1249842643737793, + "rewards/margins": 6.179405212402344, + "rewards/rejected": -3.0544211864471436, + "step": 15042 + }, + { + "epoch": 3.76, + "grad_norm": 3.8079075813293457, + "learning_rate": 1.4344441086194527e-06, + "logits/chosen": -0.5166686177253723, + "logits/rejected": -0.5791885852813721, + "logps/chosen": -60.44523620605469, + "logps/rejected": -101.50347900390625, + "loss": 0.7115, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2762508392333984, + "rewards/margins": 7.776830196380615, + "rewards/rejected": -4.500578880310059, + "step": 15043 + }, + { + "epoch": 3.76, + "grad_norm": 25.203880310058594, + "learning_rate": 1.4338931352771967e-06, + "logits/chosen": -0.4961562752723694, + "logits/rejected": -0.5672788619995117, + "logps/chosen": -53.15212631225586, + "logps/rejected": -106.63717651367188, + "loss": 0.6777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.925727605819702, + "rewards/margins": 6.575295925140381, + "rewards/rejected": -3.649568796157837, + "step": 15044 + }, + { + "epoch": 3.76, + "grad_norm": 6.148006439208984, + "learning_rate": 1.433342250057234e-06, + "logits/chosen": -0.531938910484314, + "logits/rejected": -0.5903440713882446, + "logps/chosen": -42.06468963623047, + "logps/rejected": -112.34706115722656, + "loss": 0.5886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049516439437866, + "rewards/margins": 8.226275444030762, + "rewards/rejected": -5.176759243011475, + "step": 15045 + }, + { + "epoch": 3.76, + "grad_norm": 3.978175401687622, + "learning_rate": 1.4327914529731769e-06, + "logits/chosen": -0.5955528020858765, + "logits/rejected": -0.7026747465133667, + "logps/chosen": -49.785301208496094, + "logps/rejected": -107.50917053222656, + "loss": 0.5864, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2797558307647705, + "rewards/margins": 7.419611930847168, + "rewards/rejected": -4.13985538482666, + "step": 15046 + }, + { + "epoch": 3.76, + "grad_norm": 5.584909915924072, + "learning_rate": 1.4322407440386387e-06, + "logits/chosen": -0.5478719472885132, + "logits/rejected": -0.5956246256828308, + "logps/chosen": -56.29338455200195, + "logps/rejected": -117.10611724853516, + "loss": 0.6823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3367347717285156, + "rewards/margins": 7.527888298034668, + "rewards/rejected": -4.191153526306152, + "step": 15047 + }, + { + "epoch": 3.76, + "grad_norm": 5.623090744018555, + "learning_rate": 1.4316901232672265e-06, + "logits/chosen": -0.6286904811859131, + "logits/rejected": -0.6918175220489502, + "logps/chosen": -46.7371940612793, + "logps/rejected": -104.86372375488281, + "loss": 0.6155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0795249938964844, + "rewards/margins": 6.909695625305176, + "rewards/rejected": -3.830170154571533, + "step": 15048 + }, + { + "epoch": 3.76, + "grad_norm": 3.253657102584839, + "learning_rate": 1.431139590672545e-06, + "logits/chosen": -0.4914431869983673, + "logits/rejected": -0.6041486263275146, + "logps/chosen": -49.20985412597656, + "logps/rejected": -107.45475769042969, + "loss": 0.5746, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7675511837005615, + "rewards/margins": 7.762074947357178, + "rewards/rejected": -4.994524002075195, + "step": 15049 + }, + { + "epoch": 3.76, + "grad_norm": 7.281345844268799, + "learning_rate": 1.4305891462682004e-06, + "logits/chosen": -0.5582857728004456, + "logits/rejected": -0.6424723267555237, + "logps/chosen": -62.23389434814453, + "logps/rejected": -101.9007568359375, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9637367725372314, + "rewards/margins": 6.166878700256348, + "rewards/rejected": -3.2031421661376953, + "step": 15050 + }, + { + "epoch": 3.77, + "grad_norm": 3.0544917583465576, + "learning_rate": 1.4300387900677965e-06, + "logits/chosen": -0.51976478099823, + "logits/rejected": -0.564063310623169, + "logps/chosen": -55.911354064941406, + "logps/rejected": -106.04066467285156, + "loss": 0.5657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.054047107696533, + "rewards/margins": 7.196807861328125, + "rewards/rejected": -4.142760753631592, + "step": 15051 + }, + { + "epoch": 3.77, + "grad_norm": 8.779973983764648, + "learning_rate": 1.4294885220849308e-06, + "logits/chosen": -0.5766915678977966, + "logits/rejected": -0.6451552510261536, + "logps/chosen": -54.92511749267578, + "logps/rejected": -116.18079376220703, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.235994577407837, + "rewards/margins": 6.429283142089844, + "rewards/rejected": -3.193288803100586, + "step": 15052 + }, + { + "epoch": 3.77, + "grad_norm": 4.08827018737793, + "learning_rate": 1.4289383423331998e-06, + "logits/chosen": -0.5124040246009827, + "logits/rejected": -0.5828156471252441, + "logps/chosen": -59.53482437133789, + "logps/rejected": -99.41290283203125, + "loss": 0.5988, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9334964752197266, + "rewards/margins": 7.076180458068848, + "rewards/rejected": -4.142683506011963, + "step": 15053 + }, + { + "epoch": 3.77, + "grad_norm": 4.455854892730713, + "learning_rate": 1.4283882508262026e-06, + "logits/chosen": -0.48408275842666626, + "logits/rejected": -0.5546956062316895, + "logps/chosen": -48.134498596191406, + "logps/rejected": -98.9097671508789, + "loss": 0.5735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1516647338867188, + "rewards/margins": 6.846574783325195, + "rewards/rejected": -3.6949098110198975, + "step": 15054 + }, + { + "epoch": 3.77, + "grad_norm": 7.7245192527771, + "learning_rate": 1.4278382475775304e-06, + "logits/chosen": -0.4312698245048523, + "logits/rejected": -0.5371413230895996, + "logps/chosen": -50.63561248779297, + "logps/rejected": -109.03611755371094, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.31634521484375, + "rewards/margins": 7.542426109313965, + "rewards/rejected": -4.226080894470215, + "step": 15055 + }, + { + "epoch": 3.77, + "grad_norm": 7.420539379119873, + "learning_rate": 1.4272883326007737e-06, + "logits/chosen": -0.6302963495254517, + "logits/rejected": -0.7272793054580688, + "logps/chosen": -43.67243957519531, + "logps/rejected": -93.39353942871094, + "loss": 0.5739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1798315048217773, + "rewards/margins": 7.349460124969482, + "rewards/rejected": -4.169628143310547, + "step": 15056 + }, + { + "epoch": 3.77, + "grad_norm": 5.393477439880371, + "learning_rate": 1.4267385059095235e-06, + "logits/chosen": -0.5479338765144348, + "logits/rejected": -0.6744225025177002, + "logps/chosen": -65.7586669921875, + "logps/rejected": -126.83963775634766, + "loss": 0.6329, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.886814594268799, + "rewards/margins": 8.549822807312012, + "rewards/rejected": -5.663007736206055, + "step": 15057 + }, + { + "epoch": 3.77, + "grad_norm": 8.509916305541992, + "learning_rate": 1.4261887675173652e-06, + "logits/chosen": -0.5841224789619446, + "logits/rejected": -0.6222055554389954, + "logps/chosen": -64.51470947265625, + "logps/rejected": -119.35089874267578, + "loss": 0.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.286283016204834, + "rewards/margins": 8.169441223144531, + "rewards/rejected": -4.883158206939697, + "step": 15058 + }, + { + "epoch": 3.77, + "grad_norm": 8.47026252746582, + "learning_rate": 1.4256391174378824e-06, + "logits/chosen": -0.5831145644187927, + "logits/rejected": -0.5939762592315674, + "logps/chosen": -48.5284423828125, + "logps/rejected": -104.8000259399414, + "loss": 0.7107, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9351460933685303, + "rewards/margins": 5.937928676605225, + "rewards/rejected": -3.0027823448181152, + "step": 15059 + }, + { + "epoch": 3.77, + "grad_norm": 2.441661834716797, + "learning_rate": 1.4250895556846589e-06, + "logits/chosen": -0.5732786059379578, + "logits/rejected": -0.6460046768188477, + "logps/chosen": -52.15963363647461, + "logps/rejected": -106.34241485595703, + "loss": 0.5382, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0694210529327393, + "rewards/margins": 7.922488212585449, + "rewards/rejected": -4.853067874908447, + "step": 15060 + }, + { + "epoch": 3.77, + "grad_norm": 10.941551208496094, + "learning_rate": 1.424540082271278e-06, + "logits/chosen": -0.5529136657714844, + "logits/rejected": -0.615691065788269, + "logps/chosen": -53.2169189453125, + "logps/rejected": -97.48860168457031, + "loss": 0.7665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2822141647338867, + "rewards/margins": 5.440320014953613, + "rewards/rejected": -2.1581056118011475, + "step": 15061 + }, + { + "epoch": 3.77, + "grad_norm": 36.65499496459961, + "learning_rate": 1.4239906972113121e-06, + "logits/chosen": -0.5406613945960999, + "logits/rejected": -0.5481762886047363, + "logps/chosen": -52.14426040649414, + "logps/rejected": -108.68891143798828, + "loss": 0.684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1608049869537354, + "rewards/margins": 6.662989616394043, + "rewards/rejected": -3.5021848678588867, + "step": 15062 + }, + { + "epoch": 3.77, + "grad_norm": 7.928838729858398, + "learning_rate": 1.4234414005183389e-06, + "logits/chosen": -0.5721696615219116, + "logits/rejected": -0.6391116380691528, + "logps/chosen": -55.57749938964844, + "logps/rejected": -102.38359069824219, + "loss": 0.6903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2910783290863037, + "rewards/margins": 7.630921363830566, + "rewards/rejected": -4.339842796325684, + "step": 15063 + }, + { + "epoch": 3.77, + "grad_norm": 3.9405975341796875, + "learning_rate": 1.4228921922059347e-06, + "logits/chosen": -0.601546049118042, + "logits/rejected": -0.6691221594810486, + "logps/chosen": -53.914642333984375, + "logps/rejected": -105.27497100830078, + "loss": 0.5931, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1415555477142334, + "rewards/margins": 6.981955528259277, + "rewards/rejected": -3.8403995037078857, + "step": 15064 + }, + { + "epoch": 3.77, + "grad_norm": 32.33220672607422, + "learning_rate": 1.4223430722876696e-06, + "logits/chosen": -0.5239682197570801, + "logits/rejected": -0.642387866973877, + "logps/chosen": -56.85668182373047, + "logps/rejected": -112.6548080444336, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.248047351837158, + "rewards/margins": 7.879462718963623, + "rewards/rejected": -4.631415367126465, + "step": 15065 + }, + { + "epoch": 3.77, + "grad_norm": 3.2463698387145996, + "learning_rate": 1.4217940407771109e-06, + "logits/chosen": -0.43994128704071045, + "logits/rejected": -0.45914819836616516, + "logps/chosen": -54.83743667602539, + "logps/rejected": -111.15167236328125, + "loss": 0.5788, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4020578861236572, + "rewards/margins": 6.231543064117432, + "rewards/rejected": -2.8294849395751953, + "step": 15066 + }, + { + "epoch": 3.77, + "grad_norm": 6.935669422149658, + "learning_rate": 1.4212450976878283e-06, + "logits/chosen": -0.4935552775859833, + "logits/rejected": -0.5444846749305725, + "logps/chosen": -49.30745315551758, + "logps/rejected": -120.88887023925781, + "loss": 0.5313, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1111505031585693, + "rewards/margins": 7.288295745849609, + "rewards/rejected": -4.177145957946777, + "step": 15067 + }, + { + "epoch": 3.77, + "grad_norm": 3.714020013809204, + "learning_rate": 1.4206962430333848e-06, + "logits/chosen": -0.5637853145599365, + "logits/rejected": -0.624747097492218, + "logps/chosen": -47.65609359741211, + "logps/rejected": -118.11669921875, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.215850353240967, + "rewards/margins": 8.091926574707031, + "rewards/rejected": -4.876076698303223, + "step": 15068 + }, + { + "epoch": 3.77, + "grad_norm": 3.3045156002044678, + "learning_rate": 1.420147476827346e-06, + "logits/chosen": -0.5323827862739563, + "logits/rejected": -0.5929701924324036, + "logps/chosen": -44.38450622558594, + "logps/rejected": -114.17719268798828, + "loss": 0.573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.235595226287842, + "rewards/margins": 7.4598236083984375, + "rewards/rejected": -4.224228382110596, + "step": 15069 + }, + { + "epoch": 3.77, + "grad_norm": 6.490671634674072, + "learning_rate": 1.4195987990832704e-06, + "logits/chosen": -0.563400387763977, + "logits/rejected": -0.6416334509849548, + "logps/chosen": -46.81078338623047, + "logps/rejected": -109.62342834472656, + "loss": 0.5347, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093747615814209, + "rewards/margins": 8.01755428314209, + "rewards/rejected": -4.923806190490723, + "step": 15070 + }, + { + "epoch": 3.77, + "grad_norm": 8.53499984741211, + "learning_rate": 1.4190502098147153e-06, + "logits/chosen": -0.5084589123725891, + "logits/rejected": -0.5834489464759827, + "logps/chosen": -52.71952819824219, + "logps/rejected": -116.76542663574219, + "loss": 0.6364, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3348562717437744, + "rewards/margins": 8.083949089050293, + "rewards/rejected": -4.749093055725098, + "step": 15071 + }, + { + "epoch": 3.77, + "grad_norm": 10.395636558532715, + "learning_rate": 1.41850170903524e-06, + "logits/chosen": -0.5181611776351929, + "logits/rejected": -0.5959015488624573, + "logps/chosen": -53.17435073852539, + "logps/rejected": -100.51065826416016, + "loss": 0.6777, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2135379314422607, + "rewards/margins": 6.654282569885254, + "rewards/rejected": -3.440744161605835, + "step": 15072 + }, + { + "epoch": 3.77, + "grad_norm": 6.218686103820801, + "learning_rate": 1.417953296758397e-06, + "logits/chosen": -0.5242671966552734, + "logits/rejected": -0.5605955123901367, + "logps/chosen": -54.84613800048828, + "logps/rejected": -98.37032318115234, + "loss": 0.7613, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.995424747467041, + "rewards/margins": 5.400434494018555, + "rewards/rejected": -2.405010223388672, + "step": 15073 + }, + { + "epoch": 3.77, + "grad_norm": 13.568607330322266, + "learning_rate": 1.4174049729977362e-06, + "logits/chosen": -0.5301713347434998, + "logits/rejected": -0.5945622324943542, + "logps/chosen": -56.80210876464844, + "logps/rejected": -98.53643035888672, + "loss": 0.815, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8109278678894043, + "rewards/margins": 6.03859806060791, + "rewards/rejected": -3.227670192718506, + "step": 15074 + }, + { + "epoch": 3.77, + "grad_norm": 4.708736896514893, + "learning_rate": 1.4168567377668096e-06, + "logits/chosen": -0.5442760586738586, + "logits/rejected": -0.6040091514587402, + "logps/chosen": -52.85948181152344, + "logps/rejected": -104.16294860839844, + "loss": 0.6376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1012086868286133, + "rewards/margins": 7.195833206176758, + "rewards/rejected": -4.0946245193481445, + "step": 15075 + }, + { + "epoch": 3.77, + "grad_norm": 6.456573486328125, + "learning_rate": 1.416308591079167e-06, + "logits/chosen": -0.5142064690589905, + "logits/rejected": -0.61982661485672, + "logps/chosen": -53.414039611816406, + "logps/rejected": -102.27461242675781, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874033212661743, + "rewards/margins": 6.400289535522461, + "rewards/rejected": -3.526256561279297, + "step": 15076 + }, + { + "epoch": 3.77, + "grad_norm": 3.849026679992676, + "learning_rate": 1.4157605329483477e-06, + "logits/chosen": -0.4797735810279846, + "logits/rejected": -0.5389273166656494, + "logps/chosen": -60.467262268066406, + "logps/rejected": -100.63584899902344, + "loss": 0.5805, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.509296417236328, + "rewards/margins": 6.5345683097839355, + "rewards/rejected": -3.0252718925476074, + "step": 15077 + }, + { + "epoch": 3.77, + "grad_norm": 6.855994701385498, + "learning_rate": 1.4152125633878983e-06, + "logits/chosen": -0.5729895830154419, + "logits/rejected": -0.6717925071716309, + "logps/chosen": -59.08348083496094, + "logps/rejected": -92.75581359863281, + "loss": 0.6925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.976632595062256, + "rewards/margins": 8.007086753845215, + "rewards/rejected": -5.030453681945801, + "step": 15078 + }, + { + "epoch": 3.77, + "grad_norm": 2.654785394668579, + "learning_rate": 1.4146646824113624e-06, + "logits/chosen": -0.5482586026191711, + "logits/rejected": -0.6492753624916077, + "logps/chosen": -65.5279541015625, + "logps/rejected": -122.10470581054688, + "loss": 0.594, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081717014312744, + "rewards/margins": 7.743691444396973, + "rewards/rejected": -4.6619744300842285, + "step": 15079 + }, + { + "epoch": 3.77, + "grad_norm": 10.4839506149292, + "learning_rate": 1.4141168900322728e-06, + "logits/chosen": -0.47820067405700684, + "logits/rejected": -0.5418924689292908, + "logps/chosen": -65.04075622558594, + "logps/rejected": -118.82083892822266, + "loss": 0.6071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9320502281188965, + "rewards/margins": 6.517971038818359, + "rewards/rejected": -3.585920810699463, + "step": 15080 + }, + { + "epoch": 3.77, + "grad_norm": 2.764753580093384, + "learning_rate": 1.4135691862641681e-06, + "logits/chosen": -0.4894994795322418, + "logits/rejected": -0.6258274912834167, + "logps/chosen": -53.135826110839844, + "logps/rejected": -86.09251403808594, + "loss": 0.5674, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0792226791381836, + "rewards/margins": 7.507447242736816, + "rewards/rejected": -4.428224086761475, + "step": 15081 + }, + { + "epoch": 3.77, + "grad_norm": 3.293853998184204, + "learning_rate": 1.4130215711205858e-06, + "logits/chosen": -0.4951094388961792, + "logits/rejected": -0.6098549365997314, + "logps/chosen": -47.00468444824219, + "logps/rejected": -106.05734252929688, + "loss": 0.5193, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3954148292541504, + "rewards/margins": 7.531041622161865, + "rewards/rejected": -4.135626316070557, + "step": 15082 + }, + { + "epoch": 3.77, + "grad_norm": 4.478565216064453, + "learning_rate": 1.412474044615056e-06, + "logits/chosen": -0.47006916999816895, + "logits/rejected": -0.6022230386734009, + "logps/chosen": -59.67580795288086, + "logps/rejected": -89.74909973144531, + "loss": 0.6089, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069342613220215, + "rewards/margins": 6.646620273590088, + "rewards/rejected": -3.577277660369873, + "step": 15083 + }, + { + "epoch": 3.77, + "grad_norm": 5.1030802726745605, + "learning_rate": 1.4119266067611065e-06, + "logits/chosen": -0.5489584803581238, + "logits/rejected": -0.6585860252380371, + "logps/chosen": -61.42422103881836, + "logps/rejected": -105.45024108886719, + "loss": 0.7349, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.06253719329834, + "rewards/margins": 7.532473564147949, + "rewards/rejected": -4.469936370849609, + "step": 15084 + }, + { + "epoch": 3.77, + "grad_norm": 9.921797752380371, + "learning_rate": 1.4113792575722684e-06, + "logits/chosen": -0.6048332452774048, + "logits/rejected": -0.6548938751220703, + "logps/chosen": -58.4302978515625, + "logps/rejected": -90.58890533447266, + "loss": 0.8481, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.868433713912964, + "rewards/margins": 5.712753772735596, + "rewards/rejected": -2.84432053565979, + "step": 15085 + }, + { + "epoch": 3.77, + "grad_norm": 3.7653417587280273, + "learning_rate": 1.410831997062066e-06, + "logits/chosen": -0.5681780576705933, + "logits/rejected": -0.6318565607070923, + "logps/chosen": -42.809959411621094, + "logps/rejected": -101.27201843261719, + "loss": 0.6168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194514513015747, + "rewards/margins": 6.639938831329346, + "rewards/rejected": -3.4454238414764404, + "step": 15086 + }, + { + "epoch": 3.77, + "grad_norm": 4.7422566413879395, + "learning_rate": 1.410284825244021e-06, + "logits/chosen": -0.4936094880104065, + "logits/rejected": -0.5925659537315369, + "logps/chosen": -74.62164306640625, + "logps/rejected": -100.01746368408203, + "loss": 0.7211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.463803291320801, + "rewards/margins": 5.550727844238281, + "rewards/rejected": -3.0869245529174805, + "step": 15087 + }, + { + "epoch": 3.77, + "grad_norm": 7.30779504776001, + "learning_rate": 1.4097377421316583e-06, + "logits/chosen": -0.544933021068573, + "logits/rejected": -0.6623547077178955, + "logps/chosen": -63.02891159057617, + "logps/rejected": -84.50241088867188, + "loss": 0.615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9706060886383057, + "rewards/margins": 5.636716842651367, + "rewards/rejected": -2.6661112308502197, + "step": 15088 + }, + { + "epoch": 3.77, + "grad_norm": 3.900233268737793, + "learning_rate": 1.4091907477384924e-06, + "logits/chosen": -0.5516045093536377, + "logits/rejected": -0.6214510798454285, + "logps/chosen": -63.68735122680664, + "logps/rejected": -107.60870361328125, + "loss": 0.668, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1701908111572266, + "rewards/margins": 6.566823959350586, + "rewards/rejected": -3.3966331481933594, + "step": 15089 + }, + { + "epoch": 3.77, + "grad_norm": 4.314528465270996, + "learning_rate": 1.408643842078044e-06, + "logits/chosen": -0.5949097275733948, + "logits/rejected": -0.6544274687767029, + "logps/chosen": -55.74913024902344, + "logps/rejected": -120.17060852050781, + "loss": 0.6713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.27298641204834, + "rewards/margins": 6.928882598876953, + "rewards/rejected": -3.655895471572876, + "step": 15090 + }, + { + "epoch": 3.78, + "grad_norm": 5.38701868057251, + "learning_rate": 1.408097025163827e-06, + "logits/chosen": -0.6092571020126343, + "logits/rejected": -0.6426807641983032, + "logps/chosen": -43.88075637817383, + "logps/rejected": -111.76313781738281, + "loss": 0.5809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0626420974731445, + "rewards/margins": 6.882089614868164, + "rewards/rejected": -3.819448232650757, + "step": 15091 + }, + { + "epoch": 3.78, + "grad_norm": 4.292644023895264, + "learning_rate": 1.4075502970093507e-06, + "logits/chosen": -0.5236921310424805, + "logits/rejected": -0.6429418325424194, + "logps/chosen": -56.46868896484375, + "logps/rejected": -98.13450622558594, + "loss": 0.6122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205228328704834, + "rewards/margins": 6.9482245445251465, + "rewards/rejected": -3.7429957389831543, + "step": 15092 + }, + { + "epoch": 3.78, + "grad_norm": 8.496261596679688, + "learning_rate": 1.4070036576281282e-06, + "logits/chosen": -0.521528959274292, + "logits/rejected": -0.6237936019897461, + "logps/chosen": -63.146366119384766, + "logps/rejected": -98.69509887695312, + "loss": 0.7306, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.033252239227295, + "rewards/margins": 6.939037322998047, + "rewards/rejected": -3.905784845352173, + "step": 15093 + }, + { + "epoch": 3.78, + "grad_norm": 4.6480889320373535, + "learning_rate": 1.406457107033668e-06, + "logits/chosen": -0.6031442284584045, + "logits/rejected": -0.6653808951377869, + "logps/chosen": -57.65779495239258, + "logps/rejected": -107.43291473388672, + "loss": 0.6129, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.335233211517334, + "rewards/margins": 6.765443801879883, + "rewards/rejected": -3.430210590362549, + "step": 15094 + }, + { + "epoch": 3.78, + "grad_norm": 3.8613779544830322, + "learning_rate": 1.4059106452394754e-06, + "logits/chosen": -0.5414335131645203, + "logits/rejected": -0.5833653807640076, + "logps/chosen": -60.17159652709961, + "logps/rejected": -117.61000061035156, + "loss": 0.6959, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2192628383636475, + "rewards/margins": 6.774578094482422, + "rewards/rejected": -3.5553154945373535, + "step": 15095 + }, + { + "epoch": 3.78, + "grad_norm": 4.179892063140869, + "learning_rate": 1.4053642722590522e-06, + "logits/chosen": -0.5255837440490723, + "logits/rejected": -0.5684680938720703, + "logps/chosen": -51.14647674560547, + "logps/rejected": -121.67578125, + "loss": 0.6466, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.032538414001465, + "rewards/margins": 7.3209381103515625, + "rewards/rejected": -4.288399696350098, + "step": 15096 + }, + { + "epoch": 3.78, + "grad_norm": 5.824451923370361, + "learning_rate": 1.404817988105902e-06, + "logits/chosen": -0.5263586640357971, + "logits/rejected": -0.5863698124885559, + "logps/chosen": -61.905006408691406, + "logps/rejected": -94.35794067382812, + "loss": 0.7233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1456263065338135, + "rewards/margins": 5.9271626472473145, + "rewards/rejected": -2.781536340713501, + "step": 15097 + }, + { + "epoch": 3.78, + "grad_norm": 5.231668472290039, + "learning_rate": 1.404271792793524e-06, + "logits/chosen": -0.5617282390594482, + "logits/rejected": -0.6463843584060669, + "logps/chosen": -58.25122833251953, + "logps/rejected": -94.39710998535156, + "loss": 0.6518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.072174072265625, + "rewards/margins": 6.317594051361084, + "rewards/rejected": -3.245419979095459, + "step": 15098 + }, + { + "epoch": 3.78, + "grad_norm": 3.482160806655884, + "learning_rate": 1.4037256863354132e-06, + "logits/chosen": -0.5539998412132263, + "logits/rejected": -0.6015562415122986, + "logps/chosen": -47.48849105834961, + "logps/rejected": -111.84971618652344, + "loss": 0.5833, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1300854682922363, + "rewards/margins": 7.094172477722168, + "rewards/rejected": -3.9640870094299316, + "step": 15099 + }, + { + "epoch": 3.78, + "grad_norm": 6.020658493041992, + "learning_rate": 1.4031796687450666e-06, + "logits/chosen": -0.5261191725730896, + "logits/rejected": -0.5724714994430542, + "logps/chosen": -53.789306640625, + "logps/rejected": -109.70101928710938, + "loss": 0.6919, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.978780746459961, + "rewards/margins": 6.295184135437012, + "rewards/rejected": -3.3164031505584717, + "step": 15100 + }, + { + "epoch": 3.78, + "grad_norm": 3.0536913871765137, + "learning_rate": 1.4026337400359768e-06, + "logits/chosen": -0.5615453124046326, + "logits/rejected": -0.6647632122039795, + "logps/chosen": -52.216064453125, + "logps/rejected": -93.1903076171875, + "loss": 0.53, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.111955404281616, + "rewards/margins": 6.847540378570557, + "rewards/rejected": -3.7355852127075195, + "step": 15101 + }, + { + "epoch": 3.78, + "grad_norm": 13.484600067138672, + "learning_rate": 1.402087900221632e-06, + "logits/chosen": -0.5582633018493652, + "logits/rejected": -0.6433109641075134, + "logps/chosen": -48.17652893066406, + "logps/rejected": -103.25858306884766, + "loss": 0.6487, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.208205461502075, + "rewards/margins": 6.321559906005859, + "rewards/rejected": -3.113354206085205, + "step": 15102 + }, + { + "epoch": 3.78, + "grad_norm": 7.61531925201416, + "learning_rate": 1.4015421493155235e-06, + "logits/chosen": -0.5641838908195496, + "logits/rejected": -0.6497765779495239, + "logps/chosen": -44.03864288330078, + "logps/rejected": -100.51435852050781, + "loss": 0.622, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.957887887954712, + "rewards/margins": 5.8800859451293945, + "rewards/rejected": -2.9221975803375244, + "step": 15103 + }, + { + "epoch": 3.78, + "grad_norm": 3.408796548843384, + "learning_rate": 1.400996487331136e-06, + "logits/chosen": -0.513592004776001, + "logits/rejected": -0.6289703249931335, + "logps/chosen": -57.162818908691406, + "logps/rejected": -98.72122192382812, + "loss": 0.5608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.099400520324707, + "rewards/margins": 6.839625835418701, + "rewards/rejected": -3.7402257919311523, + "step": 15104 + }, + { + "epoch": 3.78, + "grad_norm": 7.043081760406494, + "learning_rate": 1.4004509142819516e-06, + "logits/chosen": -0.5026519894599915, + "logits/rejected": -0.6151420474052429, + "logps/chosen": -55.600250244140625, + "logps/rejected": -104.20057678222656, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7452192306518555, + "rewards/margins": 7.45433235168457, + "rewards/rejected": -4.709114074707031, + "step": 15105 + }, + { + "epoch": 3.78, + "grad_norm": 4.374678611755371, + "learning_rate": 1.399905430181454e-06, + "logits/chosen": -0.5434810519218445, + "logits/rejected": -0.6152925491333008, + "logps/chosen": -44.621124267578125, + "logps/rejected": -96.50409698486328, + "loss": 0.576, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0123252868652344, + "rewards/margins": 6.962514877319336, + "rewards/rejected": -3.9501898288726807, + "step": 15106 + }, + { + "epoch": 3.78, + "grad_norm": 2.440660238265991, + "learning_rate": 1.3993600350431242e-06, + "logits/chosen": -0.5602717399597168, + "logits/rejected": -0.6312954425811768, + "logps/chosen": -55.557090759277344, + "logps/rejected": -119.58184814453125, + "loss": 0.6042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.912834405899048, + "rewards/margins": 8.253531455993652, + "rewards/rejected": -5.340696334838867, + "step": 15107 + }, + { + "epoch": 3.78, + "grad_norm": 16.881467819213867, + "learning_rate": 1.398814728880437e-06, + "logits/chosen": -0.47343388199806213, + "logits/rejected": -0.5094261169433594, + "logps/chosen": -46.801151275634766, + "logps/rejected": -90.61776733398438, + "loss": 0.7262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.149034023284912, + "rewards/margins": 4.3947224617004395, + "rewards/rejected": -1.2456883192062378, + "step": 15108 + }, + { + "epoch": 3.78, + "grad_norm": 7.872156143188477, + "learning_rate": 1.398269511706868e-06, + "logits/chosen": -0.5051140785217285, + "logits/rejected": -0.5746971964836121, + "logps/chosen": -61.47258377075195, + "logps/rejected": -130.91607666015625, + "loss": 0.6309, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0660078525543213, + "rewards/margins": 8.256294250488281, + "rewards/rejected": -5.190286159515381, + "step": 15109 + }, + { + "epoch": 3.78, + "grad_norm": 3.4052035808563232, + "learning_rate": 1.3977243835358907e-06, + "logits/chosen": -0.5607945919036865, + "logits/rejected": -0.6226504445075989, + "logps/chosen": -49.337284088134766, + "logps/rejected": -126.82109069824219, + "loss": 0.5349, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0943071842193604, + "rewards/margins": 8.22401237487793, + "rewards/rejected": -5.129705429077148, + "step": 15110 + }, + { + "epoch": 3.78, + "grad_norm": 6.742591857910156, + "learning_rate": 1.3971793443809755e-06, + "logits/chosen": -0.5239551067352295, + "logits/rejected": -0.6240580677986145, + "logps/chosen": -59.85761260986328, + "logps/rejected": -93.90322875976562, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.382766008377075, + "rewards/margins": 6.401576042175293, + "rewards/rejected": -3.018810272216797, + "step": 15111 + }, + { + "epoch": 3.78, + "grad_norm": 2.192888021469116, + "learning_rate": 1.3966343942555898e-06, + "logits/chosen": -0.5724319219589233, + "logits/rejected": -0.6764945387840271, + "logps/chosen": -52.054569244384766, + "logps/rejected": -112.42587280273438, + "loss": 0.5233, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0673115253448486, + "rewards/margins": 8.263459205627441, + "rewards/rejected": -5.196147441864014, + "step": 15112 + }, + { + "epoch": 3.78, + "grad_norm": 3.7037250995635986, + "learning_rate": 1.3960895331732023e-06, + "logits/chosen": -0.5022022128105164, + "logits/rejected": -0.5444040298461914, + "logps/chosen": -54.326637268066406, + "logps/rejected": -101.68756866455078, + "loss": 0.5432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.929886817932129, + "rewards/margins": 5.8423614501953125, + "rewards/rejected": -2.9124748706817627, + "step": 15113 + }, + { + "epoch": 3.78, + "grad_norm": 5.3680620193481445, + "learning_rate": 1.3955447611472745e-06, + "logits/chosen": -0.5716601610183716, + "logits/rejected": -0.6372310519218445, + "logps/chosen": -49.770381927490234, + "logps/rejected": -95.28841400146484, + "loss": 0.6817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0045461654663086, + "rewards/margins": 6.087018013000488, + "rewards/rejected": -3.0824716091156006, + "step": 15114 + }, + { + "epoch": 3.78, + "grad_norm": 2.8905797004699707, + "learning_rate": 1.3950000781912705e-06, + "logits/chosen": -0.5610783696174622, + "logits/rejected": -0.6522146463394165, + "logps/chosen": -53.98625183105469, + "logps/rejected": -107.02587890625, + "loss": 0.5689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.319887638092041, + "rewards/margins": 7.827603816986084, + "rewards/rejected": -4.507716655731201, + "step": 15115 + }, + { + "epoch": 3.78, + "grad_norm": 8.325519561767578, + "learning_rate": 1.3944554843186492e-06, + "logits/chosen": -0.5547399520874023, + "logits/rejected": -0.6728066205978394, + "logps/chosen": -69.59329223632812, + "logps/rejected": -85.89083862304688, + "loss": 0.6087, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.622962236404419, + "rewards/margins": 6.82017707824707, + "rewards/rejected": -4.197215557098389, + "step": 15116 + }, + { + "epoch": 3.78, + "grad_norm": 3.01342511177063, + "learning_rate": 1.3939109795428662e-06, + "logits/chosen": -0.5379013419151306, + "logits/rejected": -0.6062573194503784, + "logps/chosen": -54.072425842285156, + "logps/rejected": -107.3609619140625, + "loss": 0.5925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1830618381500244, + "rewards/margins": 7.212409973144531, + "rewards/rejected": -4.029348373413086, + "step": 15117 + }, + { + "epoch": 3.78, + "grad_norm": 16.15793800354004, + "learning_rate": 1.3933665638773792e-06, + "logits/chosen": -0.5203092694282532, + "logits/rejected": -0.6088458299636841, + "logps/chosen": -64.43632507324219, + "logps/rejected": -122.18266296386719, + "loss": 0.6411, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.416135311126709, + "rewards/margins": 8.459760665893555, + "rewards/rejected": -5.043625354766846, + "step": 15118 + }, + { + "epoch": 3.78, + "grad_norm": 5.995395660400391, + "learning_rate": 1.392822237335643e-06, + "logits/chosen": -0.6038469672203064, + "logits/rejected": -0.722455620765686, + "logps/chosen": -47.69280242919922, + "logps/rejected": -102.70635986328125, + "loss": 0.5895, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9807755947113037, + "rewards/margins": 8.254476547241211, + "rewards/rejected": -5.273700714111328, + "step": 15119 + }, + { + "epoch": 3.78, + "grad_norm": 43.85347366333008, + "learning_rate": 1.3922779999311032e-06, + "logits/chosen": -0.5220780968666077, + "logits/rejected": -0.637718915939331, + "logps/chosen": -59.12385559082031, + "logps/rejected": -102.42738342285156, + "loss": 0.7145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1680915355682373, + "rewards/margins": 8.107243537902832, + "rewards/rejected": -4.939151763916016, + "step": 15120 + }, + { + "epoch": 3.78, + "grad_norm": 7.763908386230469, + "learning_rate": 1.3917338516772116e-06, + "logits/chosen": -0.5469175577163696, + "logits/rejected": -0.6366288065910339, + "logps/chosen": -56.096675872802734, + "logps/rejected": -98.37911224365234, + "loss": 0.6653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.942986488342285, + "rewards/margins": 6.388893127441406, + "rewards/rejected": -3.445906400680542, + "step": 15121 + }, + { + "epoch": 3.78, + "grad_norm": 3.4941091537475586, + "learning_rate": 1.3911897925874173e-06, + "logits/chosen": -0.5217641592025757, + "logits/rejected": -0.622031569480896, + "logps/chosen": -51.795501708984375, + "logps/rejected": -86.32108306884766, + "loss": 0.6217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4900240898132324, + "rewards/margins": 6.335350036621094, + "rewards/rejected": -2.8453261852264404, + "step": 15122 + }, + { + "epoch": 3.78, + "grad_norm": 1.6472831964492798, + "learning_rate": 1.3906458226751584e-06, + "logits/chosen": -0.5563416481018066, + "logits/rejected": -0.6304526925086975, + "logps/chosen": -44.51438522338867, + "logps/rejected": -104.39456176757812, + "loss": 0.5602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.064774513244629, + "rewards/margins": 8.652681350708008, + "rewards/rejected": -5.587907314300537, + "step": 15123 + }, + { + "epoch": 3.78, + "grad_norm": 4.179614067077637, + "learning_rate": 1.3901019419538808e-06, + "logits/chosen": -0.5734589099884033, + "logits/rejected": -0.6242983937263489, + "logps/chosen": -48.963035583496094, + "logps/rejected": -99.68267822265625, + "loss": 0.6143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.127607822418213, + "rewards/margins": 5.337164878845215, + "rewards/rejected": -2.2095565795898438, + "step": 15124 + }, + { + "epoch": 3.78, + "grad_norm": 4.637085437774658, + "learning_rate": 1.3895581504370248e-06, + "logits/chosen": -0.5565955638885498, + "logits/rejected": -0.6031005382537842, + "logps/chosen": -42.69514083862305, + "logps/rejected": -124.47953796386719, + "loss": 0.563, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5633702278137207, + "rewards/margins": 7.61069917678833, + "rewards/rejected": -4.047328948974609, + "step": 15125 + }, + { + "epoch": 3.78, + "grad_norm": 4.7758893966674805, + "learning_rate": 1.3890144481380275e-06, + "logits/chosen": -0.5833406448364258, + "logits/rejected": -0.6726670265197754, + "logps/chosen": -53.626983642578125, + "logps/rejected": -114.88821411132812, + "loss": 0.6933, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7740187644958496, + "rewards/margins": 7.904663562774658, + "rewards/rejected": -5.130644798278809, + "step": 15126 + }, + { + "epoch": 3.78, + "grad_norm": 5.7287797927856445, + "learning_rate": 1.3884708350703225e-06, + "logits/chosen": -0.5432319641113281, + "logits/rejected": -0.6143217086791992, + "logps/chosen": -50.446189880371094, + "logps/rejected": -104.0045394897461, + "loss": 0.606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2225918769836426, + "rewards/margins": 7.341231346130371, + "rewards/rejected": -4.118639945983887, + "step": 15127 + }, + { + "epoch": 3.78, + "grad_norm": 15.254652976989746, + "learning_rate": 1.3879273112473463e-06, + "logits/chosen": -0.6718630194664001, + "logits/rejected": -0.7863926887512207, + "logps/chosen": -55.2494010925293, + "logps/rejected": -90.73250579833984, + "loss": 0.6216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5115866661071777, + "rewards/margins": 7.248844623565674, + "rewards/rejected": -3.737257719039917, + "step": 15128 + }, + { + "epoch": 3.78, + "grad_norm": 2.4863452911376953, + "learning_rate": 1.3873838766825275e-06, + "logits/chosen": -0.46121400594711304, + "logits/rejected": -0.6003918051719666, + "logps/chosen": -57.59475326538086, + "logps/rejected": -112.73865509033203, + "loss": 0.5432, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2031302452087402, + "rewards/margins": 8.393601417541504, + "rewards/rejected": -5.1904706954956055, + "step": 15129 + }, + { + "epoch": 3.78, + "grad_norm": 5.918398857116699, + "learning_rate": 1.3868405313892946e-06, + "logits/chosen": -0.6363363862037659, + "logits/rejected": -0.693943977355957, + "logps/chosen": -46.21769332885742, + "logps/rejected": -98.49518585205078, + "loss": 0.5685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3230395317077637, + "rewards/margins": 7.293128967285156, + "rewards/rejected": -3.9700896739959717, + "step": 15130 + }, + { + "epoch": 3.79, + "grad_norm": 3.5968055725097656, + "learning_rate": 1.386297275381077e-06, + "logits/chosen": -0.5320709943771362, + "logits/rejected": -0.625219464302063, + "logps/chosen": -47.037750244140625, + "logps/rejected": -91.07427978515625, + "loss": 0.5073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.999964952468872, + "rewards/margins": 6.653066635131836, + "rewards/rejected": -3.6531007289886475, + "step": 15131 + }, + { + "epoch": 3.79, + "grad_norm": 2.818392753601074, + "learning_rate": 1.3857541086712962e-06, + "logits/chosen": -0.5622443556785583, + "logits/rejected": -0.672193169593811, + "logps/chosen": -57.515541076660156, + "logps/rejected": -116.14353942871094, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1698226928710938, + "rewards/margins": 8.226028442382812, + "rewards/rejected": -5.056204795837402, + "step": 15132 + }, + { + "epoch": 3.79, + "grad_norm": 4.545724868774414, + "learning_rate": 1.3852110312733763e-06, + "logits/chosen": -0.6858876347541809, + "logits/rejected": -0.7841426134109497, + "logps/chosen": -54.709716796875, + "logps/rejected": -91.30590057373047, + "loss": 0.5932, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7995848655700684, + "rewards/margins": 7.798629283905029, + "rewards/rejected": -4.999044418334961, + "step": 15133 + }, + { + "epoch": 3.79, + "grad_norm": 4.06602144241333, + "learning_rate": 1.384668043200737e-06, + "logits/chosen": -0.574672520160675, + "logits/rejected": -0.6724839210510254, + "logps/chosen": -81.56465911865234, + "logps/rejected": -111.20767974853516, + "loss": 0.6912, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8046700954437256, + "rewards/margins": 8.294382095336914, + "rewards/rejected": -5.489711761474609, + "step": 15134 + }, + { + "epoch": 3.79, + "grad_norm": 7.034040927886963, + "learning_rate": 1.3841251444667947e-06, + "logits/chosen": -0.5854013562202454, + "logits/rejected": -0.6022154092788696, + "logps/chosen": -46.20461654663086, + "logps/rejected": -127.32737731933594, + "loss": 0.6025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.090407609939575, + "rewards/margins": 7.949937343597412, + "rewards/rejected": -4.859530448913574, + "step": 15135 + }, + { + "epoch": 3.79, + "grad_norm": 9.668323516845703, + "learning_rate": 1.3835823350849669e-06, + "logits/chosen": -0.5132550597190857, + "logits/rejected": -0.5994914770126343, + "logps/chosen": -57.919891357421875, + "logps/rejected": -109.84855651855469, + "loss": 0.6105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2172183990478516, + "rewards/margins": 7.078977584838867, + "rewards/rejected": -3.8617591857910156, + "step": 15136 + }, + { + "epoch": 3.79, + "grad_norm": 3.621215581893921, + "learning_rate": 1.3830396150686653e-06, + "logits/chosen": -0.5884141325950623, + "logits/rejected": -0.6310599446296692, + "logps/chosen": -45.66057205200195, + "logps/rejected": -100.33988952636719, + "loss": 0.6829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9428465366363525, + "rewards/margins": 6.2935943603515625, + "rewards/rejected": -3.350747585296631, + "step": 15137 + }, + { + "epoch": 3.79, + "grad_norm": 5.619396686553955, + "learning_rate": 1.382496984431303e-06, + "logits/chosen": -0.617693305015564, + "logits/rejected": -0.712187647819519, + "logps/chosen": -45.28852844238281, + "logps/rejected": -106.21561431884766, + "loss": 0.5818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.969119071960449, + "rewards/margins": 7.475784778594971, + "rewards/rejected": -4.5066657066345215, + "step": 15138 + }, + { + "epoch": 3.79, + "grad_norm": 10.365673065185547, + "learning_rate": 1.381954443186287e-06, + "logits/chosen": -0.5050144195556641, + "logits/rejected": -0.6274099946022034, + "logps/chosen": -49.70722198486328, + "logps/rejected": -99.969970703125, + "loss": 0.6491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0820746421813965, + "rewards/margins": 7.64207649230957, + "rewards/rejected": -4.560002326965332, + "step": 15139 + }, + { + "epoch": 3.79, + "grad_norm": 4.186159610748291, + "learning_rate": 1.3814119913470258e-06, + "logits/chosen": -0.6078035235404968, + "logits/rejected": -0.6686939001083374, + "logps/chosen": -52.50202941894531, + "logps/rejected": -112.2181396484375, + "loss": 0.6346, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0057480335235596, + "rewards/margins": 7.8213958740234375, + "rewards/rejected": -4.815648555755615, + "step": 15140 + }, + { + "epoch": 3.79, + "grad_norm": 7.014262676239014, + "learning_rate": 1.3808696289269242e-06, + "logits/chosen": -0.5947525501251221, + "logits/rejected": -0.6719000935554504, + "logps/chosen": -56.16374969482422, + "logps/rejected": -103.30821228027344, + "loss": 0.6291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0417425632476807, + "rewards/margins": 6.153164386749268, + "rewards/rejected": -3.111422061920166, + "step": 15141 + }, + { + "epoch": 3.79, + "grad_norm": 13.278023719787598, + "learning_rate": 1.3803273559393809e-06, + "logits/chosen": -0.544543445110321, + "logits/rejected": -0.6394327282905579, + "logps/chosen": -53.2929573059082, + "logps/rejected": -92.1950912475586, + "loss": 0.6882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9378392696380615, + "rewards/margins": 5.866549491882324, + "rewards/rejected": -2.9287097454071045, + "step": 15142 + }, + { + "epoch": 3.79, + "grad_norm": 2.746142625808716, + "learning_rate": 1.379785172397801e-06, + "logits/chosen": -0.5856285691261292, + "logits/rejected": -0.6386822462081909, + "logps/chosen": -51.30223083496094, + "logps/rejected": -111.31961822509766, + "loss": 0.5717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.040858268737793, + "rewards/margins": 6.797050952911377, + "rewards/rejected": -3.756192922592163, + "step": 15143 + }, + { + "epoch": 3.79, + "grad_norm": 6.403435707092285, + "learning_rate": 1.3792430783155798e-06, + "logits/chosen": -0.5432857275009155, + "logits/rejected": -0.5867701768875122, + "logps/chosen": -57.19841766357422, + "logps/rejected": -111.10830688476562, + "loss": 0.7058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1564559936523438, + "rewards/margins": 5.863864421844482, + "rewards/rejected": -2.7074084281921387, + "step": 15144 + }, + { + "epoch": 3.79, + "grad_norm": 9.565526962280273, + "learning_rate": 1.3787010737061119e-06, + "logits/chosen": -0.48039764165878296, + "logits/rejected": -0.5418071150779724, + "logps/chosen": -48.41039276123047, + "logps/rejected": -95.574951171875, + "loss": 0.6282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3141849040985107, + "rewards/margins": 6.011354446411133, + "rewards/rejected": -2.6971688270568848, + "step": 15145 + }, + { + "epoch": 3.79, + "grad_norm": 3.7458436489105225, + "learning_rate": 1.378159158582794e-06, + "logits/chosen": -0.5133423805236816, + "logits/rejected": -0.5811251401901245, + "logps/chosen": -48.22065734863281, + "logps/rejected": -118.28746795654297, + "loss": 0.5753, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3399741649627686, + "rewards/margins": 7.6815385818481445, + "rewards/rejected": -4.341563701629639, + "step": 15146 + }, + { + "epoch": 3.79, + "grad_norm": 6.49794864654541, + "learning_rate": 1.3776173329590153e-06, + "logits/chosen": -0.5003851056098938, + "logits/rejected": -0.6091727614402771, + "logps/chosen": -48.6446647644043, + "logps/rejected": -107.98697662353516, + "loss": 0.6051, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.177588939666748, + "rewards/margins": 7.7786102294921875, + "rewards/rejected": -4.6010212898254395, + "step": 15147 + }, + { + "epoch": 3.79, + "grad_norm": 3.317704439163208, + "learning_rate": 1.377075596848164e-06, + "logits/chosen": -0.46731042861938477, + "logits/rejected": -0.59104323387146, + "logps/chosen": -58.26138687133789, + "logps/rejected": -103.31201934814453, + "loss": 0.558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.449033260345459, + "rewards/margins": 7.947912216186523, + "rewards/rejected": -4.4988789558410645, + "step": 15148 + }, + { + "epoch": 3.79, + "grad_norm": 2.532820701599121, + "learning_rate": 1.3765339502636282e-06, + "logits/chosen": -0.5693594813346863, + "logits/rejected": -0.6305547952651978, + "logps/chosen": -47.45738983154297, + "logps/rejected": -93.95992279052734, + "loss": 0.56, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.350109815597534, + "rewards/margins": 6.883144855499268, + "rewards/rejected": -3.5330357551574707, + "step": 15149 + }, + { + "epoch": 3.79, + "grad_norm": 4.02461576461792, + "learning_rate": 1.375992393218794e-06, + "logits/chosen": -0.5495809316635132, + "logits/rejected": -0.6652133464813232, + "logps/chosen": -60.56381607055664, + "logps/rejected": -110.25733947753906, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9121670722961426, + "rewards/margins": 7.811717987060547, + "rewards/rejected": -4.899550914764404, + "step": 15150 + }, + { + "epoch": 3.79, + "grad_norm": 5.906637668609619, + "learning_rate": 1.375450925727042e-06, + "logits/chosen": -0.5995245575904846, + "logits/rejected": -0.6946903467178345, + "logps/chosen": -52.87052536010742, + "logps/rejected": -88.07188415527344, + "loss": 0.6238, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2521867752075195, + "rewards/margins": 6.682196617126465, + "rewards/rejected": -3.4300103187561035, + "step": 15151 + }, + { + "epoch": 3.79, + "grad_norm": 5.36332368850708, + "learning_rate": 1.3749095478017522e-06, + "logits/chosen": -0.480756551027298, + "logits/rejected": -0.5806570053100586, + "logps/chosen": -55.86442184448242, + "logps/rejected": -99.870361328125, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9556801319122314, + "rewards/margins": 7.7248640060424805, + "rewards/rejected": -4.76918363571167, + "step": 15152 + }, + { + "epoch": 3.79, + "grad_norm": 4.360050201416016, + "learning_rate": 1.374368259456304e-06, + "logits/chosen": -0.6345539093017578, + "logits/rejected": -0.6851049661636353, + "logps/chosen": -49.740379333496094, + "logps/rejected": -110.10595703125, + "loss": 0.6966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1735689640045166, + "rewards/margins": 7.999256134033203, + "rewards/rejected": -4.825687408447266, + "step": 15153 + }, + { + "epoch": 3.79, + "grad_norm": 5.734450340270996, + "learning_rate": 1.3738270607040733e-06, + "logits/chosen": -0.47359293699264526, + "logits/rejected": -0.5301523804664612, + "logps/chosen": -64.8536376953125, + "logps/rejected": -123.54842376708984, + "loss": 0.7034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1689279079437256, + "rewards/margins": 6.326389312744141, + "rewards/rejected": -3.157461404800415, + "step": 15154 + }, + { + "epoch": 3.79, + "grad_norm": 8.348965644836426, + "learning_rate": 1.3732859515584306e-06, + "logits/chosen": -0.5360738635063171, + "logits/rejected": -0.5401247143745422, + "logps/chosen": -62.50506591796875, + "logps/rejected": -98.47683715820312, + "loss": 0.7204, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7365591526031494, + "rewards/margins": 5.377660751342773, + "rewards/rejected": -2.641101360321045, + "step": 15155 + }, + { + "epoch": 3.79, + "grad_norm": 41.59769058227539, + "learning_rate": 1.3727449320327513e-06, + "logits/chosen": -0.5395956039428711, + "logits/rejected": -0.5756807923316956, + "logps/chosen": -58.496761322021484, + "logps/rejected": -99.85697937011719, + "loss": 0.8961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.859785556793213, + "rewards/margins": 6.387648582458496, + "rewards/rejected": -3.5278635025024414, + "step": 15156 + }, + { + "epoch": 3.79, + "grad_norm": 13.524027824401855, + "learning_rate": 1.3722040021404015e-06, + "logits/chosen": -0.5330359935760498, + "logits/rejected": -0.6238766312599182, + "logps/chosen": -60.53886413574219, + "logps/rejected": -99.74396514892578, + "loss": 0.7577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9294090270996094, + "rewards/margins": 6.561802387237549, + "rewards/rejected": -3.6323931217193604, + "step": 15157 + }, + { + "epoch": 3.79, + "grad_norm": 11.435319900512695, + "learning_rate": 1.3716631618947512e-06, + "logits/chosen": -0.5375893712043762, + "logits/rejected": -0.5943801403045654, + "logps/chosen": -56.018577575683594, + "logps/rejected": -110.73802185058594, + "loss": 0.7001, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9294841289520264, + "rewards/margins": 6.514434337615967, + "rewards/rejected": -3.584949493408203, + "step": 15158 + }, + { + "epoch": 3.79, + "grad_norm": 5.724212646484375, + "learning_rate": 1.3711224113091632e-06, + "logits/chosen": -0.4775957465171814, + "logits/rejected": -0.5887660980224609, + "logps/chosen": -53.090206146240234, + "logps/rejected": -96.81592559814453, + "loss": 0.6036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0993995666503906, + "rewards/margins": 7.290702819824219, + "rewards/rejected": -4.191303253173828, + "step": 15159 + }, + { + "epoch": 3.79, + "grad_norm": 11.423931121826172, + "learning_rate": 1.3705817503969988e-06, + "logits/chosen": -0.504362940788269, + "logits/rejected": -0.5724166035652161, + "logps/chosen": -70.15785217285156, + "logps/rejected": -109.38346862792969, + "loss": 0.7041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.040613889694214, + "rewards/margins": 6.469280242919922, + "rewards/rejected": -3.428666591644287, + "step": 15160 + }, + { + "epoch": 3.79, + "grad_norm": 5.2318267822265625, + "learning_rate": 1.3700411791716212e-06, + "logits/chosen": -0.6196860074996948, + "logits/rejected": -0.697992205619812, + "logps/chosen": -63.62510681152344, + "logps/rejected": -114.9692153930664, + "loss": 0.723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011434555053711, + "rewards/margins": 7.641343116760254, + "rewards/rejected": -4.629907608032227, + "step": 15161 + }, + { + "epoch": 3.79, + "grad_norm": 3.4469659328460693, + "learning_rate": 1.369500697646387e-06, + "logits/chosen": -0.5161928534507751, + "logits/rejected": -0.6639254093170166, + "logps/chosen": -55.69948196411133, + "logps/rejected": -101.43238830566406, + "loss": 0.5299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8827552795410156, + "rewards/margins": 7.333028793334961, + "rewards/rejected": -4.450274467468262, + "step": 15162 + }, + { + "epoch": 3.79, + "grad_norm": 7.556210517883301, + "learning_rate": 1.3689603058346513e-06, + "logits/chosen": -0.6455255746841431, + "logits/rejected": -0.6795400977134705, + "logps/chosen": -49.765228271484375, + "logps/rejected": -115.344970703125, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.227633237838745, + "rewards/margins": 6.05653190612793, + "rewards/rejected": -2.8288989067077637, + "step": 15163 + }, + { + "epoch": 3.79, + "grad_norm": 7.511141300201416, + "learning_rate": 1.368420003749768e-06, + "logits/chosen": -0.579738438129425, + "logits/rejected": -0.6705929636955261, + "logps/chosen": -43.39340591430664, + "logps/rejected": -99.59561920166016, + "loss": 0.547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1548655033111572, + "rewards/margins": 6.823232173919678, + "rewards/rejected": -3.6683661937713623, + "step": 15164 + }, + { + "epoch": 3.79, + "grad_norm": 3.9909403324127197, + "learning_rate": 1.3678797914050917e-06, + "logits/chosen": -0.47692275047302246, + "logits/rejected": -0.5661486387252808, + "logps/chosen": -47.535316467285156, + "logps/rejected": -94.88516235351562, + "loss": 0.5547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3886702060699463, + "rewards/margins": 7.450557708740234, + "rewards/rejected": -4.061887741088867, + "step": 15165 + }, + { + "epoch": 3.79, + "grad_norm": 5.102200508117676, + "learning_rate": 1.3673396688139668e-06, + "logits/chosen": -0.6055245995521545, + "logits/rejected": -0.683261513710022, + "logps/chosen": -52.59027862548828, + "logps/rejected": -122.05304718017578, + "loss": 0.6203, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2605819702148438, + "rewards/margins": 8.163232803344727, + "rewards/rejected": -4.902650833129883, + "step": 15166 + }, + { + "epoch": 3.79, + "grad_norm": 3.843029022216797, + "learning_rate": 1.3667996359897424e-06, + "logits/chosen": -0.5275452733039856, + "logits/rejected": -0.6922376155853271, + "logps/chosen": -54.27036666870117, + "logps/rejected": -104.92974853515625, + "loss": 0.5585, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1216347217559814, + "rewards/margins": 8.61138916015625, + "rewards/rejected": -5.489755153656006, + "step": 15167 + }, + { + "epoch": 3.79, + "grad_norm": 26.012271881103516, + "learning_rate": 1.3662596929457661e-06, + "logits/chosen": -0.5288590788841248, + "logits/rejected": -0.6303231120109558, + "logps/chosen": -63.190338134765625, + "logps/rejected": -97.89530944824219, + "loss": 0.8389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0194642543792725, + "rewards/margins": 6.174267768859863, + "rewards/rejected": -3.15480375289917, + "step": 15168 + }, + { + "epoch": 3.79, + "grad_norm": 4.33379602432251, + "learning_rate": 1.3657198396953753e-06, + "logits/chosen": -0.5044151544570923, + "logits/rejected": -0.6295465230941772, + "logps/chosen": -58.522544860839844, + "logps/rejected": -93.16084289550781, + "loss": 0.6071, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.076981782913208, + "rewards/margins": 6.105876445770264, + "rewards/rejected": -3.0288944244384766, + "step": 15169 + }, + { + "epoch": 3.79, + "grad_norm": 10.973873138427734, + "learning_rate": 1.3651800762519125e-06, + "logits/chosen": -0.5327907800674438, + "logits/rejected": -0.5781450867652893, + "logps/chosen": -53.274078369140625, + "logps/rejected": -98.60374450683594, + "loss": 0.6913, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0957250595092773, + "rewards/margins": 6.015883445739746, + "rewards/rejected": -2.9201583862304688, + "step": 15170 + }, + { + "epoch": 3.8, + "grad_norm": 4.827425003051758, + "learning_rate": 1.3646404026287175e-06, + "logits/chosen": -0.4920746684074402, + "logits/rejected": -0.5754854679107666, + "logps/chosen": -54.82783508300781, + "logps/rejected": -116.49456787109375, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2216479778289795, + "rewards/margins": 7.291008949279785, + "rewards/rejected": -4.069361209869385, + "step": 15171 + }, + { + "epoch": 3.8, + "grad_norm": 8.40369987487793, + "learning_rate": 1.3641008188391253e-06, + "logits/chosen": -0.5196276903152466, + "logits/rejected": -0.5296176075935364, + "logps/chosen": -53.778076171875, + "logps/rejected": -123.63529968261719, + "loss": 0.6869, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.394028425216675, + "rewards/margins": 6.19157600402832, + "rewards/rejected": -2.7975478172302246, + "step": 15172 + }, + { + "epoch": 3.8, + "grad_norm": 5.329948425292969, + "learning_rate": 1.3635613248964673e-06, + "logits/chosen": -0.5167809128761292, + "logits/rejected": -0.5933666229248047, + "logps/chosen": -49.47721481323242, + "logps/rejected": -102.29412078857422, + "loss": 0.621, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.982609272003174, + "rewards/margins": 6.801025390625, + "rewards/rejected": -3.8184163570404053, + "step": 15173 + }, + { + "epoch": 3.8, + "grad_norm": 3.7964046001434326, + "learning_rate": 1.3630219208140787e-06, + "logits/chosen": -0.5711274147033691, + "logits/rejected": -0.6443759799003601, + "logps/chosen": -53.888267517089844, + "logps/rejected": -114.1625747680664, + "loss": 0.5732, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023529052734375, + "rewards/margins": 7.477108001708984, + "rewards/rejected": -4.453578948974609, + "step": 15174 + }, + { + "epoch": 3.8, + "grad_norm": 5.124274253845215, + "learning_rate": 1.3624826066052854e-06, + "logits/chosen": -0.46478891372680664, + "logits/rejected": -0.4964717626571655, + "logps/chosen": -57.98883056640625, + "logps/rejected": -95.57560729980469, + "loss": 0.6502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.665719985961914, + "rewards/margins": 5.5738348960876465, + "rewards/rejected": -2.908114433288574, + "step": 15175 + }, + { + "epoch": 3.8, + "grad_norm": 5.829147815704346, + "learning_rate": 1.361943382283417e-06, + "logits/chosen": -0.5998188853263855, + "logits/rejected": -0.6096277236938477, + "logps/chosen": -45.64590072631836, + "logps/rejected": -97.90489959716797, + "loss": 0.719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9244093894958496, + "rewards/margins": 6.014043807983398, + "rewards/rejected": -3.0896341800689697, + "step": 15176 + }, + { + "epoch": 3.8, + "grad_norm": 4.297771453857422, + "learning_rate": 1.3614042478617978e-06, + "logits/chosen": -0.4935203492641449, + "logits/rejected": -0.5893900394439697, + "logps/chosen": -54.51128005981445, + "logps/rejected": -122.0466537475586, + "loss": 0.6607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1942319869995117, + "rewards/margins": 7.466701030731201, + "rewards/rejected": -4.2724690437316895, + "step": 15177 + }, + { + "epoch": 3.8, + "grad_norm": 2.789083957672119, + "learning_rate": 1.3608652033537483e-06, + "logits/chosen": -0.6464267373085022, + "logits/rejected": -0.7000936269760132, + "logps/chosen": -47.076568603515625, + "logps/rejected": -123.87102508544922, + "loss": 0.6113, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2079741954803467, + "rewards/margins": 9.15329647064209, + "rewards/rejected": -5.945322036743164, + "step": 15178 + }, + { + "epoch": 3.8, + "grad_norm": 3.4199793338775635, + "learning_rate": 1.3603262487725915e-06, + "logits/chosen": -0.5492579340934753, + "logits/rejected": -0.631983757019043, + "logps/chosen": -50.84767532348633, + "logps/rejected": -114.97338104248047, + "loss": 0.557, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4026689529418945, + "rewards/margins": 7.946116924285889, + "rewards/rejected": -4.543447494506836, + "step": 15179 + }, + { + "epoch": 3.8, + "grad_norm": 3.4961252212524414, + "learning_rate": 1.3597873841316434e-06, + "logits/chosen": -0.5544816255569458, + "logits/rejected": -0.6449823975563049, + "logps/chosen": -51.266780853271484, + "logps/rejected": -87.3166732788086, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2489161491394043, + "rewards/margins": 6.564572334289551, + "rewards/rejected": -3.3156566619873047, + "step": 15180 + }, + { + "epoch": 3.8, + "grad_norm": 3.793405771255493, + "learning_rate": 1.359248609444222e-06, + "logits/chosen": -0.506931483745575, + "logits/rejected": -0.5570655465126038, + "logps/chosen": -53.74066925048828, + "logps/rejected": -114.86897277832031, + "loss": 0.5884, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1135499477386475, + "rewards/margins": 6.3477020263671875, + "rewards/rejected": -3.2341525554656982, + "step": 15181 + }, + { + "epoch": 3.8, + "grad_norm": 9.620355606079102, + "learning_rate": 1.3587099247236385e-06, + "logits/chosen": -0.5520286560058594, + "logits/rejected": -0.614605724811554, + "logps/chosen": -48.36949920654297, + "logps/rejected": -105.70758056640625, + "loss": 0.5713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1174633502960205, + "rewards/margins": 7.078147888183594, + "rewards/rejected": -3.9606852531433105, + "step": 15182 + }, + { + "epoch": 3.8, + "grad_norm": 10.961969375610352, + "learning_rate": 1.3581713299832073e-06, + "logits/chosen": -0.4344986379146576, + "logits/rejected": -0.5519118309020996, + "logps/chosen": -59.90926742553711, + "logps/rejected": -93.59235382080078, + "loss": 0.6729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.27040433883667, + "rewards/margins": 6.916952610015869, + "rewards/rejected": -3.646547794342041, + "step": 15183 + }, + { + "epoch": 3.8, + "grad_norm": 11.924161911010742, + "learning_rate": 1.3576328252362363e-06, + "logits/chosen": -0.5006787180900574, + "logits/rejected": -0.5861918926239014, + "logps/chosen": -46.563499450683594, + "logps/rejected": -104.56605529785156, + "loss": 0.5848, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.134617567062378, + "rewards/margins": 7.690939426422119, + "rewards/rejected": -4.5563225746154785, + "step": 15184 + }, + { + "epoch": 3.8, + "grad_norm": 7.244015216827393, + "learning_rate": 1.3570944104960305e-06, + "logits/chosen": -0.4984177350997925, + "logits/rejected": -0.6174628138542175, + "logps/chosen": -51.81272888183594, + "logps/rejected": -96.81494903564453, + "loss": 0.7319, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9701666831970215, + "rewards/margins": 6.498211860656738, + "rewards/rejected": -3.528045177459717, + "step": 15185 + }, + { + "epoch": 3.8, + "grad_norm": 7.366407871246338, + "learning_rate": 1.3565560857758986e-06, + "logits/chosen": -0.6061892509460449, + "logits/rejected": -0.6580096483230591, + "logps/chosen": -54.689449310302734, + "logps/rejected": -91.32938385009766, + "loss": 0.6096, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.703406572341919, + "rewards/margins": 5.115493297576904, + "rewards/rejected": -2.412086248397827, + "step": 15186 + }, + { + "epoch": 3.8, + "grad_norm": 3.3067331314086914, + "learning_rate": 1.3560178510891408e-06, + "logits/chosen": -0.5559606552124023, + "logits/rejected": -0.6657006740570068, + "logps/chosen": -45.28318786621094, + "logps/rejected": -114.79507446289062, + "loss": 0.5636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.226490020751953, + "rewards/margins": 8.442602157592773, + "rewards/rejected": -5.216111183166504, + "step": 15187 + }, + { + "epoch": 3.8, + "grad_norm": 3.0375256538391113, + "learning_rate": 1.3554797064490565e-06, + "logits/chosen": -0.6075537204742432, + "logits/rejected": -0.6737046837806702, + "logps/chosen": -43.59221649169922, + "logps/rejected": -101.19528198242188, + "loss": 0.5508, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.117595911026001, + "rewards/margins": 6.521768093109131, + "rewards/rejected": -3.404172420501709, + "step": 15188 + }, + { + "epoch": 3.8, + "grad_norm": 4.216041564941406, + "learning_rate": 1.3549416518689467e-06, + "logits/chosen": -0.5992881655693054, + "logits/rejected": -0.6035791635513306, + "logps/chosen": -61.81801223754883, + "logps/rejected": -109.9968032836914, + "loss": 0.6745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1196744441986084, + "rewards/margins": 6.210672855377197, + "rewards/rejected": -3.090998888015747, + "step": 15189 + }, + { + "epoch": 3.8, + "grad_norm": 5.413936138153076, + "learning_rate": 1.3544036873621054e-06, + "logits/chosen": -0.650417685508728, + "logits/rejected": -0.7003068327903748, + "logps/chosen": -40.787010192871094, + "logps/rejected": -96.5953369140625, + "loss": 0.5612, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8565874099731445, + "rewards/margins": 7.206378936767578, + "rewards/rejected": -4.349791526794434, + "step": 15190 + }, + { + "epoch": 3.8, + "grad_norm": 26.650808334350586, + "learning_rate": 1.3538658129418252e-06, + "logits/chosen": -0.5746616125106812, + "logits/rejected": -0.6472070217132568, + "logps/chosen": -53.891571044921875, + "logps/rejected": -97.01216888427734, + "loss": 0.6067, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.997239589691162, + "rewards/margins": 6.957918167114258, + "rewards/rejected": -3.960679054260254, + "step": 15191 + }, + { + "epoch": 3.8, + "grad_norm": 3.7662394046783447, + "learning_rate": 1.3533280286213995e-06, + "logits/chosen": -0.5609021782875061, + "logits/rejected": -0.6410033702850342, + "logps/chosen": -56.077735900878906, + "logps/rejected": -121.11000061035156, + "loss": 0.6934, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1240224838256836, + "rewards/margins": 8.528548240661621, + "rewards/rejected": -5.4045257568359375, + "step": 15192 + }, + { + "epoch": 3.8, + "grad_norm": 4.107939720153809, + "learning_rate": 1.3527903344141196e-06, + "logits/chosen": -0.5021617412567139, + "logits/rejected": -0.5871019959449768, + "logps/chosen": -73.4620361328125, + "logps/rejected": -113.34272766113281, + "loss": 0.6272, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7839176654815674, + "rewards/margins": 7.136514186859131, + "rewards/rejected": -4.352596282958984, + "step": 15193 + }, + { + "epoch": 3.8, + "grad_norm": 7.857887268066406, + "learning_rate": 1.3522527303332666e-06, + "logits/chosen": -0.4789127707481384, + "logits/rejected": -0.5540444254875183, + "logps/chosen": -54.31366729736328, + "logps/rejected": -111.63984680175781, + "loss": 0.6267, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965346574783325, + "rewards/margins": 7.062492370605469, + "rewards/rejected": -4.097146034240723, + "step": 15194 + }, + { + "epoch": 3.8, + "grad_norm": 11.29170036315918, + "learning_rate": 1.3517152163921289e-06, + "logits/chosen": -0.5444739460945129, + "logits/rejected": -0.5832840204238892, + "logps/chosen": -52.81651306152344, + "logps/rejected": -113.56167602539062, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.239741086959839, + "rewards/margins": 6.406723499298096, + "rewards/rejected": -3.166982650756836, + "step": 15195 + }, + { + "epoch": 3.8, + "grad_norm": 4.573662281036377, + "learning_rate": 1.3511777926039904e-06, + "logits/chosen": -0.5086797475814819, + "logits/rejected": -0.6128127574920654, + "logps/chosen": -62.085365295410156, + "logps/rejected": -119.28341674804688, + "loss": 0.6682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.272279977798462, + "rewards/margins": 8.207462310791016, + "rewards/rejected": -4.935182571411133, + "step": 15196 + }, + { + "epoch": 3.8, + "grad_norm": 5.851758003234863, + "learning_rate": 1.3506404589821292e-06, + "logits/chosen": -0.5543192028999329, + "logits/rejected": -0.5989528298377991, + "logps/chosen": -43.88134765625, + "logps/rejected": -85.3560562133789, + "loss": 0.6783, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9457192420959473, + "rewards/margins": 6.097931385040283, + "rewards/rejected": -3.1522116661071777, + "step": 15197 + }, + { + "epoch": 3.8, + "grad_norm": 3.2954440116882324, + "learning_rate": 1.3501032155398225e-06, + "logits/chosen": -0.49078667163848877, + "logits/rejected": -0.5945695638656616, + "logps/chosen": -60.77924346923828, + "logps/rejected": -109.36058807373047, + "loss": 0.5661, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032364845275879, + "rewards/margins": 7.484801292419434, + "rewards/rejected": -4.452436447143555, + "step": 15198 + }, + { + "epoch": 3.8, + "grad_norm": 36.69361114501953, + "learning_rate": 1.3495660622903494e-06, + "logits/chosen": -0.5444400310516357, + "logits/rejected": -0.6255476474761963, + "logps/chosen": -56.284996032714844, + "logps/rejected": -118.9204330444336, + "loss": 0.6962, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6079354286193848, + "rewards/margins": 7.091044902801514, + "rewards/rejected": -4.483109474182129, + "step": 15199 + }, + { + "epoch": 3.8, + "grad_norm": 7.187231540679932, + "learning_rate": 1.3490289992469795e-06, + "logits/chosen": -0.5773399472236633, + "logits/rejected": -0.6543294787406921, + "logps/chosen": -56.345947265625, + "logps/rejected": -101.75611114501953, + "loss": 0.6575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.695284843444824, + "rewards/margins": 6.83022928237915, + "rewards/rejected": -4.134944915771484, + "step": 15200 + }, + { + "epoch": 3.8, + "grad_norm": 6.92532205581665, + "learning_rate": 1.3484920264229883e-06, + "logits/chosen": -0.5627191066741943, + "logits/rejected": -0.6866726279258728, + "logps/chosen": -50.787837982177734, + "logps/rejected": -111.38678741455078, + "loss": 0.5537, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0729918479919434, + "rewards/margins": 8.365984916687012, + "rewards/rejected": -5.29299259185791, + "step": 15201 + }, + { + "epoch": 3.8, + "grad_norm": 10.865266799926758, + "learning_rate": 1.3479551438316435e-06, + "logits/chosen": -0.5936819911003113, + "logits/rejected": -0.6740586161613464, + "logps/chosen": -61.8615837097168, + "logps/rejected": -107.22380065917969, + "loss": 0.6592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.015165090560913, + "rewards/margins": 7.230264663696289, + "rewards/rejected": -4.215099811553955, + "step": 15202 + }, + { + "epoch": 3.8, + "grad_norm": 6.033239364624023, + "learning_rate": 1.3474183514862094e-06, + "logits/chosen": -0.5354593396186829, + "logits/rejected": -0.6084132194519043, + "logps/chosen": -50.04594802856445, + "logps/rejected": -133.04649353027344, + "loss": 0.5412, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1956799030303955, + "rewards/margins": 8.54468059539795, + "rewards/rejected": -5.349001407623291, + "step": 15203 + }, + { + "epoch": 3.8, + "grad_norm": 6.963613986968994, + "learning_rate": 1.3468816493999548e-06, + "logits/chosen": -0.5213252902030945, + "logits/rejected": -0.6492686867713928, + "logps/chosen": -64.95848846435547, + "logps/rejected": -88.4260482788086, + "loss": 0.6258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.918111801147461, + "rewards/margins": 6.590029716491699, + "rewards/rejected": -3.671917200088501, + "step": 15204 + }, + { + "epoch": 3.8, + "grad_norm": 3.928044080734253, + "learning_rate": 1.3463450375861404e-06, + "logits/chosen": -0.5287437438964844, + "logits/rejected": -0.5883564949035645, + "logps/chosen": -55.37083435058594, + "logps/rejected": -119.31414794921875, + "loss": 0.6125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.918757677078247, + "rewards/margins": 7.375582218170166, + "rewards/rejected": -4.45682430267334, + "step": 15205 + }, + { + "epoch": 3.8, + "grad_norm": 4.874700546264648, + "learning_rate": 1.3458085160580242e-06, + "logits/chosen": -0.5531091690063477, + "logits/rejected": -0.6588980555534363, + "logps/chosen": -51.34572982788086, + "logps/rejected": -84.96940612792969, + "loss": 0.5831, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8111987113952637, + "rewards/margins": 6.317412376403809, + "rewards/rejected": -3.506213903427124, + "step": 15206 + }, + { + "epoch": 3.8, + "grad_norm": 3.1654043197631836, + "learning_rate": 1.3452720848288664e-06, + "logits/chosen": -0.49943697452545166, + "logits/rejected": -0.5870893597602844, + "logps/chosen": -52.72479248046875, + "logps/rejected": -103.2990951538086, + "loss": 0.572, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1327078342437744, + "rewards/margins": 8.169958114624023, + "rewards/rejected": -5.037250518798828, + "step": 15207 + }, + { + "epoch": 3.8, + "grad_norm": 3.469632863998413, + "learning_rate": 1.3447357439119262e-06, + "logits/chosen": -0.5442068576812744, + "logits/rejected": -0.6067293882369995, + "logps/chosen": -51.816280364990234, + "logps/rejected": -94.28899383544922, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.900118112564087, + "rewards/margins": 6.017280101776123, + "rewards/rejected": -3.1171624660491943, + "step": 15208 + }, + { + "epoch": 3.8, + "grad_norm": 4.635465621948242, + "learning_rate": 1.3441994933204505e-06, + "logits/chosen": -0.4016891419887543, + "logits/rejected": -0.5033913254737854, + "logps/chosen": -57.10575866699219, + "logps/rejected": -104.65216064453125, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9366390705108643, + "rewards/margins": 7.251762390136719, + "rewards/rejected": -4.315123081207275, + "step": 15209 + }, + { + "epoch": 3.8, + "grad_norm": 3.3381266593933105, + "learning_rate": 1.343663333067694e-06, + "logits/chosen": -0.5995329022407532, + "logits/rejected": -0.6717008948326111, + "logps/chosen": -47.99583435058594, + "logps/rejected": -101.13740539550781, + "loss": 0.6154, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.298283338546753, + "rewards/margins": 6.387988090515137, + "rewards/rejected": -3.0897042751312256, + "step": 15210 + }, + { + "epoch": 3.81, + "grad_norm": 10.652876853942871, + "learning_rate": 1.3431272631669079e-06, + "logits/chosen": -0.47339290380477905, + "logits/rejected": -0.5385584831237793, + "logps/chosen": -62.77641296386719, + "logps/rejected": -118.13406372070312, + "loss": 0.7198, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.033407211303711, + "rewards/margins": 7.683314323425293, + "rewards/rejected": -4.649906158447266, + "step": 15211 + }, + { + "epoch": 3.81, + "grad_norm": 3.864872694015503, + "learning_rate": 1.3425912836313342e-06, + "logits/chosen": -0.6172090768814087, + "logits/rejected": -0.7303091287612915, + "logps/chosen": -53.009925842285156, + "logps/rejected": -85.13722229003906, + "loss": 0.6203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3047900199890137, + "rewards/margins": 6.918820858001709, + "rewards/rejected": -3.6140310764312744, + "step": 15212 + }, + { + "epoch": 3.81, + "grad_norm": 5.544244766235352, + "learning_rate": 1.3420553944742203e-06, + "logits/chosen": -0.5452966094017029, + "logits/rejected": -0.6383675336837769, + "logps/chosen": -67.8967514038086, + "logps/rejected": -104.64236450195312, + "loss": 0.7178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.604802131652832, + "rewards/margins": 5.515983581542969, + "rewards/rejected": -2.9111814498901367, + "step": 15213 + }, + { + "epoch": 3.81, + "grad_norm": 7.60048246383667, + "learning_rate": 1.3415195957088095e-06, + "logits/chosen": -0.5465061664581299, + "logits/rejected": -0.634739100933075, + "logps/chosen": -53.03071975708008, + "logps/rejected": -96.72811889648438, + "loss": 0.5594, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.230468273162842, + "rewards/margins": 7.351563453674316, + "rewards/rejected": -4.121095180511475, + "step": 15214 + }, + { + "epoch": 3.81, + "grad_norm": 7.195150375366211, + "learning_rate": 1.340983887348341e-06, + "logits/chosen": -0.5447648167610168, + "logits/rejected": -0.6796281337738037, + "logps/chosen": -61.61129379272461, + "logps/rejected": -97.62721252441406, + "loss": 0.5596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.778102397918701, + "rewards/margins": 6.992597579956055, + "rewards/rejected": -4.214495658874512, + "step": 15215 + }, + { + "epoch": 3.81, + "grad_norm": 2.8869969844818115, + "learning_rate": 1.340448269406051e-06, + "logits/chosen": -0.5412939190864563, + "logits/rejected": -0.6479816436767578, + "logps/chosen": -58.04095458984375, + "logps/rejected": -110.1440200805664, + "loss": 0.5553, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.281782865524292, + "rewards/margins": 7.070265769958496, + "rewards/rejected": -3.788483142852783, + "step": 15216 + }, + { + "epoch": 3.81, + "grad_norm": 7.1689348220825195, + "learning_rate": 1.3399127418951785e-06, + "logits/chosen": -0.5278966426849365, + "logits/rejected": -0.5691267251968384, + "logps/chosen": -47.0461311340332, + "logps/rejected": -110.98880767822266, + "loss": 0.7179, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2986183166503906, + "rewards/margins": 6.138605117797852, + "rewards/rejected": -2.8399863243103027, + "step": 15217 + }, + { + "epoch": 3.81, + "grad_norm": 32.5384407043457, + "learning_rate": 1.3393773048289548e-06, + "logits/chosen": -0.5240072011947632, + "logits/rejected": -0.6058452725410461, + "logps/chosen": -59.21279525756836, + "logps/rejected": -102.01668548583984, + "loss": 0.7786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8735086917877197, + "rewards/margins": 6.257326126098633, + "rewards/rejected": -3.383817672729492, + "step": 15218 + }, + { + "epoch": 3.81, + "grad_norm": 5.821505069732666, + "learning_rate": 1.3388419582206103e-06, + "logits/chosen": -0.5018871426582336, + "logits/rejected": -0.5919877290725708, + "logps/chosen": -61.98862838745117, + "logps/rejected": -91.69230651855469, + "loss": 0.6569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4290175437927246, + "rewards/margins": 6.10084342956543, + "rewards/rejected": -2.671826124191284, + "step": 15219 + }, + { + "epoch": 3.81, + "grad_norm": 4.758641242980957, + "learning_rate": 1.3383067020833768e-06, + "logits/chosen": -0.5953234434127808, + "logits/rejected": -0.6428889632225037, + "logps/chosen": -51.35900115966797, + "logps/rejected": -111.54985809326172, + "loss": 0.6082, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052454948425293, + "rewards/margins": 7.283425807952881, + "rewards/rejected": -4.23097038269043, + "step": 15220 + }, + { + "epoch": 3.81, + "grad_norm": 21.58845329284668, + "learning_rate": 1.3377715364304772e-06, + "logits/chosen": -0.629865288734436, + "logits/rejected": -0.6720535159111023, + "logps/chosen": -42.24679946899414, + "logps/rejected": -104.0383529663086, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.130017042160034, + "rewards/margins": 6.561160087585449, + "rewards/rejected": -3.431143283843994, + "step": 15221 + }, + { + "epoch": 3.81, + "grad_norm": 10.249157905578613, + "learning_rate": 1.3372364612751403e-06, + "logits/chosen": -0.5776411294937134, + "logits/rejected": -0.618993878364563, + "logps/chosen": -48.633514404296875, + "logps/rejected": -129.2180938720703, + "loss": 0.6163, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1296868324279785, + "rewards/margins": 7.521824836730957, + "rewards/rejected": -4.3921380043029785, + "step": 15222 + }, + { + "epoch": 3.81, + "grad_norm": 9.939061164855957, + "learning_rate": 1.3367014766305863e-06, + "logits/chosen": -0.5527116656303406, + "logits/rejected": -0.5760309100151062, + "logps/chosen": -50.16178894042969, + "logps/rejected": -106.85079956054688, + "loss": 0.7259, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.932076930999756, + "rewards/margins": 5.638704776763916, + "rewards/rejected": -2.70662784576416, + "step": 15223 + }, + { + "epoch": 3.81, + "grad_norm": 12.709665298461914, + "learning_rate": 1.336166582510034e-06, + "logits/chosen": -0.52155601978302, + "logits/rejected": -0.6035987138748169, + "logps/chosen": -58.67756652832031, + "logps/rejected": -92.81648254394531, + "loss": 0.7213, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8165643215179443, + "rewards/margins": 6.88893985748291, + "rewards/rejected": -4.072375297546387, + "step": 15224 + }, + { + "epoch": 3.81, + "grad_norm": 26.96057891845703, + "learning_rate": 1.335631778926702e-06, + "logits/chosen": -0.5626636147499084, + "logits/rejected": -0.658751904964447, + "logps/chosen": -52.36920166015625, + "logps/rejected": -109.23768615722656, + "loss": 0.6728, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.196993827819824, + "rewards/margins": 6.447784900665283, + "rewards/rejected": -3.25079083442688, + "step": 15225 + }, + { + "epoch": 3.81, + "grad_norm": 2.953341245651245, + "learning_rate": 1.3350970658938084e-06, + "logits/chosen": -0.5091729164123535, + "logits/rejected": -0.6304160356521606, + "logps/chosen": -64.20388793945312, + "logps/rejected": -116.3126220703125, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.592226266860962, + "rewards/margins": 9.076772689819336, + "rewards/rejected": -5.484546184539795, + "step": 15226 + }, + { + "epoch": 3.81, + "grad_norm": 10.712336540222168, + "learning_rate": 1.3345624434245642e-06, + "logits/chosen": -0.48478519916534424, + "logits/rejected": -0.5441557168960571, + "logps/chosen": -58.798255920410156, + "logps/rejected": -100.2898178100586, + "loss": 0.6957, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.906937599182129, + "rewards/margins": 5.921194553375244, + "rewards/rejected": -3.0142571926116943, + "step": 15227 + }, + { + "epoch": 3.81, + "grad_norm": 4.635056018829346, + "learning_rate": 1.3340279115321791e-06, + "logits/chosen": -0.5521658658981323, + "logits/rejected": -0.6451157331466675, + "logps/chosen": -56.78020477294922, + "logps/rejected": -118.44856262207031, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.690049648284912, + "rewards/margins": 8.54849624633789, + "rewards/rejected": -4.858447074890137, + "step": 15228 + }, + { + "epoch": 3.81, + "grad_norm": 3.9024946689605713, + "learning_rate": 1.333493470229865e-06, + "logits/chosen": -0.5163614749908447, + "logits/rejected": -0.6188132762908936, + "logps/chosen": -49.71031951904297, + "logps/rejected": -104.45575714111328, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.986100912094116, + "rewards/margins": 7.728700637817383, + "rewards/rejected": -4.742599010467529, + "step": 15229 + }, + { + "epoch": 3.81, + "grad_norm": 8.259503364562988, + "learning_rate": 1.3329591195308273e-06, + "logits/chosen": -0.5224124193191528, + "logits/rejected": -0.6195527911186218, + "logps/chosen": -52.96025466918945, + "logps/rejected": -116.20915985107422, + "loss": 0.5445, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0824623107910156, + "rewards/margins": 7.953775405883789, + "rewards/rejected": -4.871313095092773, + "step": 15230 + }, + { + "epoch": 3.81, + "grad_norm": 4.98124361038208, + "learning_rate": 1.3324248594482687e-06, + "logits/chosen": -0.538364827632904, + "logits/rejected": -0.5887118577957153, + "logps/chosen": -47.29779815673828, + "logps/rejected": -101.89610290527344, + "loss": 0.6277, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2687480449676514, + "rewards/margins": 6.127750396728516, + "rewards/rejected": -2.8590025901794434, + "step": 15231 + }, + { + "epoch": 3.81, + "grad_norm": 5.980525970458984, + "learning_rate": 1.3318906899953943e-06, + "logits/chosen": -0.47393858432769775, + "logits/rejected": -0.6028973460197449, + "logps/chosen": -60.12767028808594, + "logps/rejected": -93.8344955444336, + "loss": 0.74, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1399240493774414, + "rewards/margins": 6.118773460388184, + "rewards/rejected": -2.978849172592163, + "step": 15232 + }, + { + "epoch": 3.81, + "grad_norm": 12.909699440002441, + "learning_rate": 1.3313566111854025e-06, + "logits/chosen": -0.42078107595443726, + "logits/rejected": -0.5503353476524353, + "logps/chosen": -69.73333740234375, + "logps/rejected": -102.82359313964844, + "loss": 0.7012, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6799252033233643, + "rewards/margins": 6.265817165374756, + "rewards/rejected": -3.5858922004699707, + "step": 15233 + }, + { + "epoch": 3.81, + "grad_norm": 4.175999164581299, + "learning_rate": 1.3308226230314898e-06, + "logits/chosen": -0.5632528066635132, + "logits/rejected": -0.5820204019546509, + "logps/chosen": -53.51021957397461, + "logps/rejected": -127.32827758789062, + "loss": 0.6798, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2145650386810303, + "rewards/margins": 7.770544528961182, + "rewards/rejected": -4.555979251861572, + "step": 15234 + }, + { + "epoch": 3.81, + "grad_norm": 22.575849533081055, + "learning_rate": 1.3302887255468528e-06, + "logits/chosen": -0.4908730089664459, + "logits/rejected": -0.5990385413169861, + "logps/chosen": -59.38938903808594, + "logps/rejected": -94.09913635253906, + "loss": 0.9192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7421422004699707, + "rewards/margins": 6.259171962738037, + "rewards/rejected": -3.5170300006866455, + "step": 15235 + }, + { + "epoch": 3.81, + "grad_norm": 7.175635814666748, + "learning_rate": 1.3297549187446873e-06, + "logits/chosen": -0.5662280321121216, + "logits/rejected": -0.651360809803009, + "logps/chosen": -50.21196365356445, + "logps/rejected": -81.45170593261719, + "loss": 0.6678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.894834280014038, + "rewards/margins": 5.471868515014648, + "rewards/rejected": -2.5770342350006104, + "step": 15236 + }, + { + "epoch": 3.81, + "grad_norm": 9.492425918579102, + "learning_rate": 1.3292212026381785e-06, + "logits/chosen": -0.47043168544769287, + "logits/rejected": -0.5671465396881104, + "logps/chosen": -50.68289566040039, + "logps/rejected": -116.77788543701172, + "loss": 0.5831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8738086223602295, + "rewards/margins": 7.783349514007568, + "rewards/rejected": -4.90954065322876, + "step": 15237 + }, + { + "epoch": 3.81, + "grad_norm": 5.0514044761657715, + "learning_rate": 1.3286875772405184e-06, + "logits/chosen": -0.48427119851112366, + "logits/rejected": -0.5462310314178467, + "logps/chosen": -71.742431640625, + "logps/rejected": -134.46493530273438, + "loss": 0.6735, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.847733497619629, + "rewards/margins": 7.567325592041016, + "rewards/rejected": -4.719593048095703, + "step": 15238 + }, + { + "epoch": 3.81, + "grad_norm": 3.494778871536255, + "learning_rate": 1.3281540425648947e-06, + "logits/chosen": -0.6085498332977295, + "logits/rejected": -0.7143185138702393, + "logps/chosen": -58.449737548828125, + "logps/rejected": -99.92054748535156, + "loss": 0.5901, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.171921491622925, + "rewards/margins": 9.002184867858887, + "rewards/rejected": -5.830263614654541, + "step": 15239 + }, + { + "epoch": 3.81, + "grad_norm": 3.2302026748657227, + "learning_rate": 1.3276205986244905e-06, + "logits/chosen": -0.5313644409179688, + "logits/rejected": -0.632677435874939, + "logps/chosen": -51.938865661621094, + "logps/rejected": -86.17865753173828, + "loss": 0.5561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4971401691436768, + "rewards/margins": 7.151597023010254, + "rewards/rejected": -3.6544559001922607, + "step": 15240 + }, + { + "epoch": 3.81, + "grad_norm": 4.933383464813232, + "learning_rate": 1.3270872454324857e-06, + "logits/chosen": -0.5835672616958618, + "logits/rejected": -0.6831799745559692, + "logps/chosen": -56.2212028503418, + "logps/rejected": -100.07308197021484, + "loss": 0.7015, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.851398468017578, + "rewards/margins": 7.581360340118408, + "rewards/rejected": -4.729962348937988, + "step": 15241 + }, + { + "epoch": 3.81, + "grad_norm": 10.014156341552734, + "learning_rate": 1.326553983002063e-06, + "logits/chosen": -0.5129991769790649, + "logits/rejected": -0.5419797897338867, + "logps/chosen": -67.81876373291016, + "logps/rejected": -123.83366394042969, + "loss": 0.6933, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.095921754837036, + "rewards/margins": 6.921328067779541, + "rewards/rejected": -3.825406074523926, + "step": 15242 + }, + { + "epoch": 3.81, + "grad_norm": 4.451178073883057, + "learning_rate": 1.326020811346399e-06, + "logits/chosen": -0.6165907979011536, + "logits/rejected": -0.6126081347465515, + "logps/chosen": -75.00860595703125, + "logps/rejected": -96.1915283203125, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3537447452545166, + "rewards/margins": 6.196410179138184, + "rewards/rejected": -2.842665672302246, + "step": 15243 + }, + { + "epoch": 3.81, + "grad_norm": 3.7681291103363037, + "learning_rate": 1.3254877304786668e-06, + "logits/chosen": -0.4821700155735016, + "logits/rejected": -0.5727959871292114, + "logps/chosen": -47.101375579833984, + "logps/rejected": -89.4736099243164, + "loss": 0.6109, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939021348953247, + "rewards/margins": 5.860098838806152, + "rewards/rejected": -2.9210779666900635, + "step": 15244 + }, + { + "epoch": 3.81, + "grad_norm": 5.4551239013671875, + "learning_rate": 1.3249547404120427e-06, + "logits/chosen": -0.5490761995315552, + "logits/rejected": -0.640078067779541, + "logps/chosen": -53.417381286621094, + "logps/rejected": -90.1046142578125, + "loss": 0.6611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0569567680358887, + "rewards/margins": 6.061209678649902, + "rewards/rejected": -3.0042526721954346, + "step": 15245 + }, + { + "epoch": 3.81, + "grad_norm": 11.682846069335938, + "learning_rate": 1.3244218411596942e-06, + "logits/chosen": -0.5020656585693359, + "logits/rejected": -0.5900872349739075, + "logps/chosen": -59.13087463378906, + "logps/rejected": -101.90660095214844, + "loss": 0.6982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.837006092071533, + "rewards/margins": 7.182132720947266, + "rewards/rejected": -4.345126152038574, + "step": 15246 + }, + { + "epoch": 3.81, + "grad_norm": 4.924350261688232, + "learning_rate": 1.3238890327347936e-06, + "logits/chosen": -0.6878175735473633, + "logits/rejected": -0.7312665581703186, + "logps/chosen": -46.93686294555664, + "logps/rejected": -103.5470199584961, + "loss": 0.586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2995307445526123, + "rewards/margins": 6.1862335205078125, + "rewards/rejected": -2.8867027759552, + "step": 15247 + }, + { + "epoch": 3.81, + "grad_norm": 3.267561674118042, + "learning_rate": 1.3233563151505048e-06, + "logits/chosen": -0.5708017945289612, + "logits/rejected": -0.691034197807312, + "logps/chosen": -59.43147277832031, + "logps/rejected": -100.0457763671875, + "loss": 0.5281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3108344078063965, + "rewards/margins": 8.111209869384766, + "rewards/rejected": -4.800374984741211, + "step": 15248 + }, + { + "epoch": 3.81, + "grad_norm": 5.7607855796813965, + "learning_rate": 1.3228236884199907e-06, + "logits/chosen": -0.5268096327781677, + "logits/rejected": -0.60202556848526, + "logps/chosen": -56.370601654052734, + "logps/rejected": -111.33534240722656, + "loss": 0.6418, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.887521266937256, + "rewards/margins": 6.294228553771973, + "rewards/rejected": -3.406708002090454, + "step": 15249 + }, + { + "epoch": 3.81, + "grad_norm": 5.893881320953369, + "learning_rate": 1.3222911525564142e-06, + "logits/chosen": -0.5373473763465881, + "logits/rejected": -0.6265779733657837, + "logps/chosen": -50.23326110839844, + "logps/rejected": -88.76726531982422, + "loss": 0.6134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2031326293945312, + "rewards/margins": 6.407269477844238, + "rewards/rejected": -3.204136848449707, + "step": 15250 + }, + { + "epoch": 3.82, + "grad_norm": 4.59520959854126, + "learning_rate": 1.3217587075729382e-06, + "logits/chosen": -0.4866722822189331, + "logits/rejected": -0.5937144756317139, + "logps/chosen": -72.49217224121094, + "logps/rejected": -108.11833190917969, + "loss": 0.6645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.409406900405884, + "rewards/margins": 6.873697280883789, + "rewards/rejected": -3.464289903640747, + "step": 15251 + }, + { + "epoch": 3.82, + "grad_norm": 8.632303237915039, + "learning_rate": 1.321226353482714e-06, + "logits/chosen": -0.44481968879699707, + "logits/rejected": -0.5692132711410522, + "logps/chosen": -66.2152099609375, + "logps/rejected": -96.68302917480469, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0419983863830566, + "rewards/margins": 6.974940299987793, + "rewards/rejected": -3.9329416751861572, + "step": 15252 + }, + { + "epoch": 3.82, + "grad_norm": 8.579107284545898, + "learning_rate": 1.3206940902988996e-06, + "logits/chosen": -0.5421248078346252, + "logits/rejected": -0.5298763513565063, + "logps/chosen": -55.86268997192383, + "logps/rejected": -108.97012329101562, + "loss": 0.7132, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2006702423095703, + "rewards/margins": 5.507707118988037, + "rewards/rejected": -2.307037115097046, + "step": 15253 + }, + { + "epoch": 3.82, + "grad_norm": 2.845238447189331, + "learning_rate": 1.3201619180346504e-06, + "logits/chosen": -0.5036381483078003, + "logits/rejected": -0.6006078720092773, + "logps/chosen": -51.3768310546875, + "logps/rejected": -90.51348876953125, + "loss": 0.5131, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8163537979125977, + "rewards/margins": 6.63338565826416, + "rewards/rejected": -3.8170318603515625, + "step": 15254 + }, + { + "epoch": 3.82, + "grad_norm": 12.426277160644531, + "learning_rate": 1.319629836703112e-06, + "logits/chosen": -0.5738291144371033, + "logits/rejected": -0.6328569054603577, + "logps/chosen": -50.8869514465332, + "logps/rejected": -97.17841339111328, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6896936893463135, + "rewards/margins": 5.803279399871826, + "rewards/rejected": -3.1135854721069336, + "step": 15255 + }, + { + "epoch": 3.82, + "grad_norm": 12.112409591674805, + "learning_rate": 1.3190978463174343e-06, + "logits/chosen": -0.487737238407135, + "logits/rejected": -0.5612087845802307, + "logps/chosen": -59.37024688720703, + "logps/rejected": -101.52408599853516, + "loss": 0.6294, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.971320867538452, + "rewards/margins": 5.659940719604492, + "rewards/rejected": -2.688619613647461, + "step": 15256 + }, + { + "epoch": 3.82, + "grad_norm": 10.04532527923584, + "learning_rate": 1.3185659468907651e-06, + "logits/chosen": -0.5782337188720703, + "logits/rejected": -0.652387261390686, + "logps/chosen": -51.049503326416016, + "logps/rejected": -97.21477508544922, + "loss": 0.6337, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.00753116607666, + "rewards/margins": 5.797028541564941, + "rewards/rejected": -2.789497137069702, + "step": 15257 + }, + { + "epoch": 3.82, + "grad_norm": 3.0415735244750977, + "learning_rate": 1.3180341384362477e-06, + "logits/chosen": -0.5209420919418335, + "logits/rejected": -0.5738242864608765, + "logps/chosen": -66.2415771484375, + "logps/rejected": -127.0583724975586, + "loss": 0.6204, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3402161598205566, + "rewards/margins": 8.50059700012207, + "rewards/rejected": -5.160380840301514, + "step": 15258 + }, + { + "epoch": 3.82, + "grad_norm": 3.7902026176452637, + "learning_rate": 1.3175024209670217e-06, + "logits/chosen": -0.5404678583145142, + "logits/rejected": -0.5995960235595703, + "logps/chosen": -46.8655891418457, + "logps/rejected": -107.3494873046875, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2208802700042725, + "rewards/margins": 7.13831090927124, + "rewards/rejected": -3.9174306392669678, + "step": 15259 + }, + { + "epoch": 3.82, + "grad_norm": 12.52382755279541, + "learning_rate": 1.316970794496229e-06, + "logits/chosen": -0.47592198848724365, + "logits/rejected": -0.5934545397758484, + "logps/chosen": -62.6988639831543, + "logps/rejected": -106.7398452758789, + "loss": 0.6726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9058821201324463, + "rewards/margins": 7.324281692504883, + "rewards/rejected": -4.418399333953857, + "step": 15260 + }, + { + "epoch": 3.82, + "grad_norm": 3.938709020614624, + "learning_rate": 1.3164392590370056e-06, + "logits/chosen": -0.5610083937644958, + "logits/rejected": -0.615157425403595, + "logps/chosen": -53.28803253173828, + "logps/rejected": -117.89413452148438, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.134554147720337, + "rewards/margins": 6.791835308074951, + "rewards/rejected": -3.6572811603546143, + "step": 15261 + }, + { + "epoch": 3.82, + "grad_norm": 2.6383793354034424, + "learning_rate": 1.3159078146024845e-06, + "logits/chosen": -0.565919041633606, + "logits/rejected": -0.6636573076248169, + "logps/chosen": -45.40831756591797, + "logps/rejected": -104.42794799804688, + "loss": 0.539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225011110305786, + "rewards/margins": 7.481112003326416, + "rewards/rejected": -4.256101131439209, + "step": 15262 + }, + { + "epoch": 3.82, + "grad_norm": 8.943077087402344, + "learning_rate": 1.3153764612058013e-06, + "logits/chosen": -0.6301013231277466, + "logits/rejected": -0.6927697062492371, + "logps/chosen": -47.78738021850586, + "logps/rejected": -105.63206481933594, + "loss": 0.7138, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.316000461578369, + "rewards/margins": 6.772246360778809, + "rewards/rejected": -3.4562461376190186, + "step": 15263 + }, + { + "epoch": 3.82, + "grad_norm": 5.801017761230469, + "learning_rate": 1.314845198860083e-06, + "logits/chosen": -0.5191072225570679, + "logits/rejected": -0.6316066384315491, + "logps/chosen": -59.243675231933594, + "logps/rejected": -93.51964569091797, + "loss": 0.5848, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.210367441177368, + "rewards/margins": 6.841262340545654, + "rewards/rejected": -3.630894660949707, + "step": 15264 + }, + { + "epoch": 3.82, + "grad_norm": 5.434272766113281, + "learning_rate": 1.3143140275784617e-06, + "logits/chosen": -0.5444139838218689, + "logits/rejected": -0.60041743516922, + "logps/chosen": -67.74921417236328, + "logps/rejected": -121.06095886230469, + "loss": 0.6193, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0616559982299805, + "rewards/margins": 7.134608745574951, + "rewards/rejected": -4.072952747344971, + "step": 15265 + }, + { + "epoch": 3.82, + "grad_norm": 12.768488883972168, + "learning_rate": 1.313782947374061e-06, + "logits/chosen": -0.547230064868927, + "logits/rejected": -0.6439337730407715, + "logps/chosen": -60.39720916748047, + "logps/rejected": -120.32715606689453, + "loss": 0.6069, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0602517127990723, + "rewards/margins": 8.430198669433594, + "rewards/rejected": -5.36994743347168, + "step": 15266 + }, + { + "epoch": 3.82, + "grad_norm": 7.034701824188232, + "learning_rate": 1.3132519582600023e-06, + "logits/chosen": -0.5983322858810425, + "logits/rejected": -0.6663727760314941, + "logps/chosen": -56.44209289550781, + "logps/rejected": -105.95529174804688, + "loss": 0.6881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990661382675171, + "rewards/margins": 7.892269611358643, + "rewards/rejected": -4.901607513427734, + "step": 15267 + }, + { + "epoch": 3.82, + "grad_norm": 8.67383098602295, + "learning_rate": 1.312721060249411e-06, + "logits/chosen": -0.5306326150894165, + "logits/rejected": -0.6256317496299744, + "logps/chosen": -57.50189208984375, + "logps/rejected": -102.16558837890625, + "loss": 0.6975, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.225560188293457, + "rewards/margins": 6.4834303855896, + "rewards/rejected": -3.2578704357147217, + "step": 15268 + }, + { + "epoch": 3.82, + "grad_norm": 7.10961389541626, + "learning_rate": 1.3121902533554032e-06, + "logits/chosen": -0.6012232303619385, + "logits/rejected": -0.6853941679000854, + "logps/chosen": -41.226566314697266, + "logps/rejected": -79.87309265136719, + "loss": 0.6437, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.144333839416504, + "rewards/margins": 5.928534507751465, + "rewards/rejected": -2.78420090675354, + "step": 15269 + }, + { + "epoch": 3.82, + "grad_norm": 6.619191646575928, + "learning_rate": 1.3116595375910979e-06, + "logits/chosen": -0.49747413396835327, + "logits/rejected": -0.5851157903671265, + "logps/chosen": -53.65904998779297, + "logps/rejected": -98.5863265991211, + "loss": 0.6619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3162829875946045, + "rewards/margins": 6.681821823120117, + "rewards/rejected": -3.3655385971069336, + "step": 15270 + }, + { + "epoch": 3.82, + "grad_norm": 4.745371341705322, + "learning_rate": 1.311128912969607e-06, + "logits/chosen": -0.4789124131202698, + "logits/rejected": -0.5480976700782776, + "logps/chosen": -60.64333724975586, + "logps/rejected": -110.48197937011719, + "loss": 0.619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0787312984466553, + "rewards/margins": 7.478148460388184, + "rewards/rejected": -4.399416923522949, + "step": 15271 + }, + { + "epoch": 3.82, + "grad_norm": 7.1835246086120605, + "learning_rate": 1.3105983795040462e-06, + "logits/chosen": -0.5325660109519958, + "logits/rejected": -0.604013204574585, + "logps/chosen": -54.419921875, + "logps/rejected": -106.86036682128906, + "loss": 0.7125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2538318634033203, + "rewards/margins": 6.878353595733643, + "rewards/rejected": -3.624521493911743, + "step": 15272 + }, + { + "epoch": 3.82, + "grad_norm": 7.73950719833374, + "learning_rate": 1.310067937207523e-06, + "logits/chosen": -0.5167248249053955, + "logits/rejected": -0.6099919676780701, + "logps/chosen": -63.143638610839844, + "logps/rejected": -106.9753189086914, + "loss": 0.8043, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.001077175140381, + "rewards/margins": 6.362910270690918, + "rewards/rejected": -3.361832618713379, + "step": 15273 + }, + { + "epoch": 3.82, + "grad_norm": 8.531073570251465, + "learning_rate": 1.3095375860931447e-06, + "logits/chosen": -0.5596972107887268, + "logits/rejected": -0.6254416704177856, + "logps/chosen": -53.39437484741211, + "logps/rejected": -106.8205795288086, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.933063268661499, + "rewards/margins": 7.715676307678223, + "rewards/rejected": -4.782613277435303, + "step": 15274 + }, + { + "epoch": 3.82, + "grad_norm": 23.45377540588379, + "learning_rate": 1.30900732617402e-06, + "logits/chosen": -0.5472530126571655, + "logits/rejected": -0.6739307045936584, + "logps/chosen": -59.14986038208008, + "logps/rejected": -91.2462158203125, + "loss": 0.5746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.331618547439575, + "rewards/margins": 7.650456428527832, + "rewards/rejected": -4.318837642669678, + "step": 15275 + }, + { + "epoch": 3.82, + "grad_norm": 17.748268127441406, + "learning_rate": 1.3084771574632494e-06, + "logits/chosen": -0.4286729693412781, + "logits/rejected": -0.540541410446167, + "logps/chosen": -60.578956604003906, + "logps/rejected": -88.68508911132812, + "loss": 0.7262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.783803701400757, + "rewards/margins": 5.980918884277344, + "rewards/rejected": -3.197115898132324, + "step": 15276 + }, + { + "epoch": 3.82, + "grad_norm": 5.519032955169678, + "learning_rate": 1.3079470799739335e-06, + "logits/chosen": -0.47991618514060974, + "logits/rejected": -0.5465399622917175, + "logps/chosen": -52.481300354003906, + "logps/rejected": -116.12010192871094, + "loss": 0.5713, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0046732425689697, + "rewards/margins": 7.313345909118652, + "rewards/rejected": -4.308672904968262, + "step": 15277 + }, + { + "epoch": 3.82, + "grad_norm": 5.461208343505859, + "learning_rate": 1.3074170937191738e-06, + "logits/chosen": -0.5160707831382751, + "logits/rejected": -0.5745243430137634, + "logps/chosen": -52.96357345581055, + "logps/rejected": -105.07933044433594, + "loss": 0.603, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.478969097137451, + "rewards/margins": 6.807770252227783, + "rewards/rejected": -3.3288018703460693, + "step": 15278 + }, + { + "epoch": 3.82, + "grad_norm": 8.673130989074707, + "learning_rate": 1.3068871987120657e-06, + "logits/chosen": -0.5665668249130249, + "logits/rejected": -0.6628611087799072, + "logps/chosen": -66.55294036865234, + "logps/rejected": -120.93260192871094, + "loss": 0.6445, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.638486385345459, + "rewards/margins": 6.979298114776611, + "rewards/rejected": -4.340811729431152, + "step": 15279 + }, + { + "epoch": 3.82, + "grad_norm": 6.415421962738037, + "learning_rate": 1.306357394965701e-06, + "logits/chosen": -0.6341455578804016, + "logits/rejected": -0.699142575263977, + "logps/chosen": -52.020790100097656, + "logps/rejected": -117.82989501953125, + "loss": 0.6892, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7982723712921143, + "rewards/margins": 7.044755458831787, + "rewards/rejected": -4.24648380279541, + "step": 15280 + }, + { + "epoch": 3.82, + "grad_norm": 3.870976686477661, + "learning_rate": 1.3058276824931742e-06, + "logits/chosen": -0.6100443005561829, + "logits/rejected": -0.6501867175102234, + "logps/chosen": -43.34259796142578, + "logps/rejected": -108.8544921875, + "loss": 0.5716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9940338134765625, + "rewards/margins": 7.1922736167907715, + "rewards/rejected": -4.198239803314209, + "step": 15281 + }, + { + "epoch": 3.82, + "grad_norm": 6.889926910400391, + "learning_rate": 1.3052980613075755e-06, + "logits/chosen": -0.5678993463516235, + "logits/rejected": -0.6421598196029663, + "logps/chosen": -55.7459602355957, + "logps/rejected": -110.03984069824219, + "loss": 0.6655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.929807662963867, + "rewards/margins": 7.272583484649658, + "rewards/rejected": -4.342775344848633, + "step": 15282 + }, + { + "epoch": 3.82, + "grad_norm": 11.629180908203125, + "learning_rate": 1.304768531421992e-06, + "logits/chosen": -0.5423141717910767, + "logits/rejected": -0.6487146615982056, + "logps/chosen": -58.324317932128906, + "logps/rejected": -109.91409301757812, + "loss": 0.5799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.645923137664795, + "rewards/margins": 7.3327813148498535, + "rewards/rejected": -4.6868577003479, + "step": 15283 + }, + { + "epoch": 3.82, + "grad_norm": 4.483933448791504, + "learning_rate": 1.3042390928495074e-06, + "logits/chosen": -0.5255556702613831, + "logits/rejected": -0.5842452645301819, + "logps/chosen": -49.4003791809082, + "logps/rejected": -108.38899993896484, + "loss": 0.5646, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2570950984954834, + "rewards/margins": 6.573105812072754, + "rewards/rejected": -3.3160109519958496, + "step": 15284 + }, + { + "epoch": 3.82, + "grad_norm": 3.2706050872802734, + "learning_rate": 1.303709745603207e-06, + "logits/chosen": -0.5222912430763245, + "logits/rejected": -0.5969972014427185, + "logps/chosen": -52.69573211669922, + "logps/rejected": -109.65762329101562, + "loss": 0.617, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0493857860565186, + "rewards/margins": 7.8687615394592285, + "rewards/rejected": -4.819375038146973, + "step": 15285 + }, + { + "epoch": 3.82, + "grad_norm": 5.660987377166748, + "learning_rate": 1.3031804896961704e-06, + "logits/chosen": -0.5638816356658936, + "logits/rejected": -0.6195845603942871, + "logps/chosen": -53.628143310546875, + "logps/rejected": -104.64937591552734, + "loss": 0.6287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0894126892089844, + "rewards/margins": 7.099999904632568, + "rewards/rejected": -4.010587692260742, + "step": 15286 + }, + { + "epoch": 3.82, + "grad_norm": 4.894214630126953, + "learning_rate": 1.3026513251414742e-06, + "logits/chosen": -0.5876737833023071, + "logits/rejected": -0.6510158777236938, + "logps/chosen": -56.81427764892578, + "logps/rejected": -110.62625122070312, + "loss": 0.6327, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9882516860961914, + "rewards/margins": 7.507023334503174, + "rewards/rejected": -4.518771648406982, + "step": 15287 + }, + { + "epoch": 3.82, + "grad_norm": 5.579558849334717, + "learning_rate": 1.3021222519521982e-06, + "logits/chosen": -0.6099421381950378, + "logits/rejected": -0.7455763220787048, + "logps/chosen": -59.41273880004883, + "logps/rejected": -115.88434600830078, + "loss": 0.6433, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4659266471862793, + "rewards/margins": 7.92435359954834, + "rewards/rejected": -5.458427429199219, + "step": 15288 + }, + { + "epoch": 3.82, + "grad_norm": 8.797216415405273, + "learning_rate": 1.301593270141413e-06, + "logits/chosen": -0.5462564826011658, + "logits/rejected": -0.595701277256012, + "logps/chosen": -57.76030731201172, + "logps/rejected": -121.90618133544922, + "loss": 0.6982, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1895525455474854, + "rewards/margins": 7.603978157043457, + "rewards/rejected": -4.414425373077393, + "step": 15289 + }, + { + "epoch": 3.82, + "grad_norm": 5.1626434326171875, + "learning_rate": 1.3010643797221934e-06, + "logits/chosen": -0.6055282950401306, + "logits/rejected": -0.6933878660202026, + "logps/chosen": -57.524356842041016, + "logps/rejected": -98.69329833984375, + "loss": 0.6736, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011317253112793, + "rewards/margins": 7.1853437423706055, + "rewards/rejected": -4.174027442932129, + "step": 15290 + }, + { + "epoch": 3.83, + "grad_norm": 6.69431209564209, + "learning_rate": 1.3005355807076075e-06, + "logits/chosen": -0.5921542048454285, + "logits/rejected": -0.647644579410553, + "logps/chosen": -56.81431579589844, + "logps/rejected": -112.88233947753906, + "loss": 0.6382, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9210305213928223, + "rewards/margins": 7.470729351043701, + "rewards/rejected": -4.549698829650879, + "step": 15291 + }, + { + "epoch": 3.83, + "grad_norm": 5.669000625610352, + "learning_rate": 1.3000068731107206e-06, + "logits/chosen": -0.5341753363609314, + "logits/rejected": -0.6003252863883972, + "logps/chosen": -55.412376403808594, + "logps/rejected": -142.5448760986328, + "loss": 0.7178, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.09594464302063, + "rewards/margins": 8.469191551208496, + "rewards/rejected": -5.373247146606445, + "step": 15292 + }, + { + "epoch": 3.83, + "grad_norm": 66.4809799194336, + "learning_rate": 1.2994782569446013e-06, + "logits/chosen": -0.5385251045227051, + "logits/rejected": -0.607469916343689, + "logps/chosen": -63.586708068847656, + "logps/rejected": -107.06275939941406, + "loss": 0.6869, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.273712635040283, + "rewards/margins": 7.01906681060791, + "rewards/rejected": -3.745353937149048, + "step": 15293 + }, + { + "epoch": 3.83, + "grad_norm": 4.684711456298828, + "learning_rate": 1.2989497322223099e-06, + "logits/chosen": -0.5352143049240112, + "logits/rejected": -0.6252046227455139, + "logps/chosen": -44.35364532470703, + "logps/rejected": -98.3606948852539, + "loss": 0.5525, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1629889011383057, + "rewards/margins": 8.291491508483887, + "rewards/rejected": -5.12850284576416, + "step": 15294 + }, + { + "epoch": 3.83, + "grad_norm": 4.329860687255859, + "learning_rate": 1.2984212989569055e-06, + "logits/chosen": -0.5377220511436462, + "logits/rejected": -0.6512466073036194, + "logps/chosen": -47.998817443847656, + "logps/rejected": -95.33879089355469, + "loss": 0.5489, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.234280824661255, + "rewards/margins": 6.789680004119873, + "rewards/rejected": -3.555398941040039, + "step": 15295 + }, + { + "epoch": 3.83, + "grad_norm": 8.853446960449219, + "learning_rate": 1.2978929571614486e-06, + "logits/chosen": -0.5083904266357422, + "logits/rejected": -0.5995687246322632, + "logps/chosen": -48.11528015136719, + "logps/rejected": -115.20866394042969, + "loss": 0.6018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0152931213378906, + "rewards/margins": 8.27534294128418, + "rewards/rejected": -5.260049343109131, + "step": 15296 + }, + { + "epoch": 3.83, + "grad_norm": 10.225807189941406, + "learning_rate": 1.297364706848997e-06, + "logits/chosen": -0.4855571389198303, + "logits/rejected": -0.591054379940033, + "logps/chosen": -52.773048400878906, + "logps/rejected": -100.26020812988281, + "loss": 0.7288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0983810424804688, + "rewards/margins": 6.391343116760254, + "rewards/rejected": -3.292962074279785, + "step": 15297 + }, + { + "epoch": 3.83, + "grad_norm": 3.754265546798706, + "learning_rate": 1.296836548032599e-06, + "logits/chosen": -0.5564494729042053, + "logits/rejected": -0.5667033195495605, + "logps/chosen": -55.113128662109375, + "logps/rejected": -132.18287658691406, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.212136745452881, + "rewards/margins": 7.551273345947266, + "rewards/rejected": -4.339137077331543, + "step": 15298 + }, + { + "epoch": 3.83, + "grad_norm": 9.497579574584961, + "learning_rate": 1.2963084807253084e-06, + "logits/chosen": -0.5371180176734924, + "logits/rejected": -0.6205056309700012, + "logps/chosen": -63.80781555175781, + "logps/rejected": -102.34442138671875, + "loss": 0.7248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0668299198150635, + "rewards/margins": 6.529598712921143, + "rewards/rejected": -3.462769031524658, + "step": 15299 + }, + { + "epoch": 3.83, + "grad_norm": 9.050894737243652, + "learning_rate": 1.2957805049401767e-06, + "logits/chosen": -0.4825143814086914, + "logits/rejected": -0.5661588907241821, + "logps/chosen": -54.73188018798828, + "logps/rejected": -101.70002746582031, + "loss": 0.5889, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.023695468902588, + "rewards/margins": 7.066384792327881, + "rewards/rejected": -4.042689323425293, + "step": 15300 + }, + { + "epoch": 3.83, + "grad_norm": 4.147991180419922, + "learning_rate": 1.2952526206902482e-06, + "logits/chosen": -0.4568023085594177, + "logits/rejected": -0.6108808517456055, + "logps/chosen": -67.14959716796875, + "logps/rejected": -106.85427856445312, + "loss": 0.5687, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0012011528015137, + "rewards/margins": 7.527026176452637, + "rewards/rejected": -4.525825023651123, + "step": 15301 + }, + { + "epoch": 3.83, + "grad_norm": 3.899315595626831, + "learning_rate": 1.294724827988567e-06, + "logits/chosen": -0.5953276753425598, + "logits/rejected": -0.6508612036705017, + "logps/chosen": -50.664527893066406, + "logps/rejected": -112.1072769165039, + "loss": 0.5558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0244035720825195, + "rewards/margins": 6.988684177398682, + "rewards/rejected": -3.964280605316162, + "step": 15302 + }, + { + "epoch": 3.83, + "grad_norm": 3.774531364440918, + "learning_rate": 1.294197126848178e-06, + "logits/chosen": -0.45421114563941956, + "logits/rejected": -0.5258229374885559, + "logps/chosen": -56.78250503540039, + "logps/rejected": -98.00983428955078, + "loss": 0.575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.049801826477051, + "rewards/margins": 6.535000801086426, + "rewards/rejected": -3.4851999282836914, + "step": 15303 + }, + { + "epoch": 3.83, + "grad_norm": 11.186327934265137, + "learning_rate": 1.293669517282119e-06, + "logits/chosen": -0.4332754611968994, + "logits/rejected": -0.5657703876495361, + "logps/chosen": -65.67581176757812, + "logps/rejected": -109.25394439697266, + "loss": 0.6156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0808820724487305, + "rewards/margins": 7.02764368057251, + "rewards/rejected": -3.9467613697052, + "step": 15304 + }, + { + "epoch": 3.83, + "grad_norm": 38.60897445678711, + "learning_rate": 1.2931419993034273e-06, + "logits/chosen": -0.41887256503105164, + "logits/rejected": -0.5607143640518188, + "logps/chosen": -63.835636138916016, + "logps/rejected": -100.95587158203125, + "loss": 0.8201, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6562063694000244, + "rewards/margins": 5.596949577331543, + "rewards/rejected": -2.9407427310943604, + "step": 15305 + }, + { + "epoch": 3.83, + "grad_norm": 3.3218400478363037, + "learning_rate": 1.292614572925141e-06, + "logits/chosen": -0.5677940845489502, + "logits/rejected": -0.6705095767974854, + "logps/chosen": -51.75421905517578, + "logps/rejected": -91.475341796875, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.32902193069458, + "rewards/margins": 8.010306358337402, + "rewards/rejected": -4.681283950805664, + "step": 15306 + }, + { + "epoch": 3.83, + "grad_norm": 5.162595272064209, + "learning_rate": 1.2920872381602906e-06, + "logits/chosen": -0.5641871094703674, + "logits/rejected": -0.6232825517654419, + "logps/chosen": -47.31541442871094, + "logps/rejected": -109.12872314453125, + "loss": 0.6029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.730105400085449, + "rewards/margins": 8.13186264038086, + "rewards/rejected": -4.40175724029541, + "step": 15307 + }, + { + "epoch": 3.83, + "grad_norm": 5.1046342849731445, + "learning_rate": 1.2915599950219099e-06, + "logits/chosen": -0.47199636697769165, + "logits/rejected": -0.5509587526321411, + "logps/chosen": -53.19021987915039, + "logps/rejected": -99.60887145996094, + "loss": 0.6057, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2870306968688965, + "rewards/margins": 6.1243896484375, + "rewards/rejected": -2.8373587131500244, + "step": 15308 + }, + { + "epoch": 3.83, + "grad_norm": 4.108495712280273, + "learning_rate": 1.2910328435230264e-06, + "logits/chosen": -0.5264294147491455, + "logits/rejected": -0.5768917798995972, + "logps/chosen": -56.919132232666016, + "logps/rejected": -100.40534210205078, + "loss": 0.6423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1553940773010254, + "rewards/margins": 6.374664783477783, + "rewards/rejected": -3.219270944595337, + "step": 15309 + }, + { + "epoch": 3.83, + "grad_norm": 9.566067695617676, + "learning_rate": 1.2905057836766644e-06, + "logits/chosen": -0.5725395679473877, + "logits/rejected": -0.6629980802536011, + "logps/chosen": -55.14263153076172, + "logps/rejected": -116.19829559326172, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8117117881774902, + "rewards/margins": 8.118145942687988, + "rewards/rejected": -5.306432723999023, + "step": 15310 + }, + { + "epoch": 3.83, + "grad_norm": 8.397760391235352, + "learning_rate": 1.289978815495852e-06, + "logits/chosen": -0.5775856375694275, + "logits/rejected": -0.6348602175712585, + "logps/chosen": -60.54491424560547, + "logps/rejected": -144.17259216308594, + "loss": 0.6104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.803633213043213, + "rewards/margins": 7.15859842300415, + "rewards/rejected": -4.354964733123779, + "step": 15311 + }, + { + "epoch": 3.83, + "grad_norm": 2.8912880420684814, + "learning_rate": 1.2894519389936072e-06, + "logits/chosen": -0.4904220700263977, + "logits/rejected": -0.603744626045227, + "logps/chosen": -68.75331115722656, + "logps/rejected": -95.67028045654297, + "loss": 0.6491, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2885327339172363, + "rewards/margins": 5.903912544250488, + "rewards/rejected": -2.615379810333252, + "step": 15312 + }, + { + "epoch": 3.83, + "grad_norm": 4.419078826904297, + "learning_rate": 1.2889251541829539e-06, + "logits/chosen": -0.46441900730133057, + "logits/rejected": -0.5437350273132324, + "logps/chosen": -59.93902587890625, + "logps/rejected": -112.16411590576172, + "loss": 0.6549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9804041385650635, + "rewards/margins": 6.93480110168457, + "rewards/rejected": -3.954397201538086, + "step": 15313 + }, + { + "epoch": 3.83, + "grad_norm": 5.192051887512207, + "learning_rate": 1.2883984610769051e-06, + "logits/chosen": -0.5938451290130615, + "logits/rejected": -0.6840291619300842, + "logps/chosen": -50.13672637939453, + "logps/rejected": -88.24668884277344, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0962376594543457, + "rewards/margins": 5.7494964599609375, + "rewards/rejected": -2.6532585620880127, + "step": 15314 + }, + { + "epoch": 3.83, + "grad_norm": 7.299132823944092, + "learning_rate": 1.2878718596884803e-06, + "logits/chosen": -0.5479354858398438, + "logits/rejected": -0.6300867795944214, + "logps/chosen": -56.584571838378906, + "logps/rejected": -92.49862670898438, + "loss": 0.5793, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1523163318634033, + "rewards/margins": 5.8018798828125, + "rewards/rejected": -2.6495633125305176, + "step": 15315 + }, + { + "epoch": 3.83, + "grad_norm": 22.015581130981445, + "learning_rate": 1.2873453500306898e-06, + "logits/chosen": -0.4414803087711334, + "logits/rejected": -0.5059575438499451, + "logps/chosen": -52.21583557128906, + "logps/rejected": -111.69965362548828, + "loss": 0.5721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2741618156433105, + "rewards/margins": 6.80242395401001, + "rewards/rejected": -3.528261423110962, + "step": 15316 + }, + { + "epoch": 3.83, + "grad_norm": 3.7287473678588867, + "learning_rate": 1.2868189321165437e-06, + "logits/chosen": -0.5825484395027161, + "logits/rejected": -0.6431208848953247, + "logps/chosen": -47.00047302246094, + "logps/rejected": -90.2059555053711, + "loss": 0.616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1336588859558105, + "rewards/margins": 6.577653408050537, + "rewards/rejected": -3.4439949989318848, + "step": 15317 + }, + { + "epoch": 3.83, + "grad_norm": 9.75757884979248, + "learning_rate": 1.2862926059590525e-06, + "logits/chosen": -0.5259705185890198, + "logits/rejected": -0.5993515253067017, + "logps/chosen": -46.60776138305664, + "logps/rejected": -108.23762512207031, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.617616653442383, + "rewards/margins": 7.2460246086120605, + "rewards/rejected": -4.628408432006836, + "step": 15318 + }, + { + "epoch": 3.83, + "grad_norm": 7.120155334472656, + "learning_rate": 1.285766371571221e-06, + "logits/chosen": -0.5778118968009949, + "logits/rejected": -0.6359869837760925, + "logps/chosen": -52.24433135986328, + "logps/rejected": -100.64422607421875, + "loss": 0.6778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.492709159851074, + "rewards/margins": 6.554384231567383, + "rewards/rejected": -4.06167459487915, + "step": 15319 + }, + { + "epoch": 3.83, + "grad_norm": 5.289046287536621, + "learning_rate": 1.2852402289660515e-06, + "logits/chosen": -0.45680075883865356, + "logits/rejected": -0.555037260055542, + "logps/chosen": -50.37588882446289, + "logps/rejected": -98.90165710449219, + "loss": 0.5353, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.022006034851074, + "rewards/margins": 6.609267234802246, + "rewards/rejected": -3.587261199951172, + "step": 15320 + }, + { + "epoch": 3.83, + "grad_norm": 5.337905406951904, + "learning_rate": 1.2847141781565486e-06, + "logits/chosen": -0.47133055329322815, + "logits/rejected": -0.5531033277511597, + "logps/chosen": -51.41596984863281, + "logps/rejected": -111.20729064941406, + "loss": 0.5855, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.445617198944092, + "rewards/margins": 7.39572811126709, + "rewards/rejected": -3.950110673904419, + "step": 15321 + }, + { + "epoch": 3.83, + "grad_norm": 6.990301609039307, + "learning_rate": 1.2841882191557103e-06, + "logits/chosen": -0.5125386118888855, + "logits/rejected": -0.5796034336090088, + "logps/chosen": -47.008731842041016, + "logps/rejected": -105.60936737060547, + "loss": 0.6164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.852017402648926, + "rewards/margins": 6.13826847076416, + "rewards/rejected": -3.2862515449523926, + "step": 15322 + }, + { + "epoch": 3.83, + "grad_norm": 3.9172472953796387, + "learning_rate": 1.2836623519765318e-06, + "logits/chosen": -0.4869564175605774, + "logits/rejected": -0.6052988767623901, + "logps/chosen": -60.2408561706543, + "logps/rejected": -84.26883697509766, + "loss": 0.6381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3485121726989746, + "rewards/margins": 7.066542148590088, + "rewards/rejected": -3.718029499053955, + "step": 15323 + }, + { + "epoch": 3.83, + "grad_norm": 3.435966968536377, + "learning_rate": 1.2831365766320098e-06, + "logits/chosen": -0.5142614841461182, + "logits/rejected": -0.6223615407943726, + "logps/chosen": -60.68108367919922, + "logps/rejected": -88.45906829833984, + "loss": 0.5983, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3864216804504395, + "rewards/margins": 5.842924118041992, + "rewards/rejected": -2.4565024375915527, + "step": 15324 + }, + { + "epoch": 3.83, + "grad_norm": 4.497045516967773, + "learning_rate": 1.2826108931351388e-06, + "logits/chosen": -0.5904550552368164, + "logits/rejected": -0.6812691688537598, + "logps/chosen": -59.15045928955078, + "logps/rejected": -106.50401306152344, + "loss": 0.6262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4077308177948, + "rewards/margins": 8.271232604980469, + "rewards/rejected": -4.863502025604248, + "step": 15325 + }, + { + "epoch": 3.83, + "grad_norm": 3.812457323074341, + "learning_rate": 1.2820853014989042e-06, + "logits/chosen": -0.5270910859107971, + "logits/rejected": -0.6040399074554443, + "logps/chosen": -55.7752685546875, + "logps/rejected": -106.52325439453125, + "loss": 0.6059, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.310535430908203, + "rewards/margins": 7.977820873260498, + "rewards/rejected": -4.667285919189453, + "step": 15326 + }, + { + "epoch": 3.83, + "grad_norm": 11.047454833984375, + "learning_rate": 1.2815598017362962e-06, + "logits/chosen": -0.436151385307312, + "logits/rejected": -0.508994460105896, + "logps/chosen": -57.49959945678711, + "logps/rejected": -110.15150451660156, + "loss": 0.6542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.263737201690674, + "rewards/margins": 6.480767250061035, + "rewards/rejected": -3.2170307636260986, + "step": 15327 + }, + { + "epoch": 3.83, + "grad_norm": 8.804916381835938, + "learning_rate": 1.281034393860302e-06, + "logits/chosen": -0.5885038375854492, + "logits/rejected": -0.6691600680351257, + "logps/chosen": -59.783084869384766, + "logps/rejected": -97.16231536865234, + "loss": 0.6675, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9502124786376953, + "rewards/margins": 5.706872463226318, + "rewards/rejected": -2.756659507751465, + "step": 15328 + }, + { + "epoch": 3.83, + "grad_norm": 3.7244608402252197, + "learning_rate": 1.2805090778839035e-06, + "logits/chosen": -0.5171515345573425, + "logits/rejected": -0.618242084980011, + "logps/chosen": -64.02082824707031, + "logps/rejected": -89.6214599609375, + "loss": 0.6497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8543789386749268, + "rewards/margins": 6.159703731536865, + "rewards/rejected": -3.3053245544433594, + "step": 15329 + }, + { + "epoch": 3.83, + "grad_norm": 9.19516372680664, + "learning_rate": 1.2799838538200804e-06, + "logits/chosen": -0.5316317677497864, + "logits/rejected": -0.5729560852050781, + "logps/chosen": -54.124080657958984, + "logps/rejected": -107.6119155883789, + "loss": 0.6314, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.15106201171875, + "rewards/margins": 6.218700885772705, + "rewards/rejected": -3.067638874053955, + "step": 15330 + }, + { + "epoch": 3.84, + "grad_norm": 10.911480903625488, + "learning_rate": 1.279458721681815e-06, + "logits/chosen": -0.4817654490470886, + "logits/rejected": -0.552864670753479, + "logps/chosen": -64.94161224365234, + "logps/rejected": -109.83734130859375, + "loss": 0.6791, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9256954193115234, + "rewards/margins": 6.085543632507324, + "rewards/rejected": -3.15984845161438, + "step": 15331 + }, + { + "epoch": 3.84, + "grad_norm": 3.9506819248199463, + "learning_rate": 1.2789336814820802e-06, + "logits/chosen": -0.5250479578971863, + "logits/rejected": -0.6405333876609802, + "logps/chosen": -63.343345642089844, + "logps/rejected": -103.10011291503906, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0059139728546143, + "rewards/margins": 7.939122200012207, + "rewards/rejected": -4.933209419250488, + "step": 15332 + }, + { + "epoch": 3.84, + "grad_norm": 4.581100940704346, + "learning_rate": 1.278408733233853e-06, + "logits/chosen": -0.5225813388824463, + "logits/rejected": -0.5855141878128052, + "logps/chosen": -54.80223083496094, + "logps/rejected": -104.93524169921875, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3491950035095215, + "rewards/margins": 5.540226459503174, + "rewards/rejected": -2.1910321712493896, + "step": 15333 + }, + { + "epoch": 3.84, + "grad_norm": 6.482041358947754, + "learning_rate": 1.277883876950105e-06, + "logits/chosen": -0.5608681440353394, + "logits/rejected": -0.6272419691085815, + "logps/chosen": -52.9516716003418, + "logps/rejected": -115.23062133789062, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.44271183013916, + "rewards/margins": 7.283914566040039, + "rewards/rejected": -3.841202735900879, + "step": 15334 + }, + { + "epoch": 3.84, + "grad_norm": 3.0605340003967285, + "learning_rate": 1.2773591126438039e-06, + "logits/chosen": -0.5534078478813171, + "logits/rejected": -0.618241548538208, + "logps/chosen": -55.004066467285156, + "logps/rejected": -105.78317260742188, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3658242225646973, + "rewards/margins": 6.456750869750977, + "rewards/rejected": -3.0909266471862793, + "step": 15335 + }, + { + "epoch": 3.84, + "grad_norm": 4.730307102203369, + "learning_rate": 1.2768344403279203e-06, + "logits/chosen": -0.5310854911804199, + "logits/rejected": -0.6110712289810181, + "logps/chosen": -63.31920623779297, + "logps/rejected": -95.65882110595703, + "loss": 0.7084, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.96844482421875, + "rewards/margins": 6.016318321228027, + "rewards/rejected": -3.047873020172119, + "step": 15336 + }, + { + "epoch": 3.84, + "grad_norm": 32.55007553100586, + "learning_rate": 1.2763098600154178e-06, + "logits/chosen": -0.4957699179649353, + "logits/rejected": -0.5532866716384888, + "logps/chosen": -40.86935043334961, + "logps/rejected": -103.14979553222656, + "loss": 0.5925, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.313828945159912, + "rewards/margins": 7.307833194732666, + "rewards/rejected": -3.994004487991333, + "step": 15337 + }, + { + "epoch": 3.84, + "grad_norm": 3.5303001403808594, + "learning_rate": 1.2757853717192575e-06, + "logits/chosen": -0.5357197523117065, + "logits/rejected": -0.6073230504989624, + "logps/chosen": -51.514076232910156, + "logps/rejected": -106.72512817382812, + "loss": 0.5906, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.904275894165039, + "rewards/margins": 7.785835266113281, + "rewards/rejected": -4.8815598487854, + "step": 15338 + }, + { + "epoch": 3.84, + "grad_norm": 6.552249908447266, + "learning_rate": 1.275260975452402e-06, + "logits/chosen": -0.5302374362945557, + "logits/rejected": -0.5522090196609497, + "logps/chosen": -45.666236877441406, + "logps/rejected": -95.37522888183594, + "loss": 0.678, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9553725719451904, + "rewards/margins": 5.6670403480529785, + "rewards/rejected": -2.711667776107788, + "step": 15339 + }, + { + "epoch": 3.84, + "grad_norm": 6.580324649810791, + "learning_rate": 1.274736671227813e-06, + "logits/chosen": -0.5455232858657837, + "logits/rejected": -0.6656046509742737, + "logps/chosen": -63.21636199951172, + "logps/rejected": -100.85987854003906, + "loss": 0.6452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3892147541046143, + "rewards/margins": 7.1598405838012695, + "rewards/rejected": -3.7706260681152344, + "step": 15340 + }, + { + "epoch": 3.84, + "grad_norm": 4.059333801269531, + "learning_rate": 1.2742124590584398e-06, + "logits/chosen": -0.5385391116142273, + "logits/rejected": -0.6158698797225952, + "logps/chosen": -58.883094787597656, + "logps/rejected": -109.25440979003906, + "loss": 0.6536, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.976212978363037, + "rewards/margins": 7.823639869689941, + "rewards/rejected": -4.847426891326904, + "step": 15341 + }, + { + "epoch": 3.84, + "grad_norm": 15.350400924682617, + "learning_rate": 1.2736883389572396e-06, + "logits/chosen": -0.5264447927474976, + "logits/rejected": -0.6480724811553955, + "logps/chosen": -58.93098068237305, + "logps/rejected": -89.3269271850586, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9889674186706543, + "rewards/margins": 5.945878982543945, + "rewards/rejected": -2.956911325454712, + "step": 15342 + }, + { + "epoch": 3.84, + "grad_norm": 3.0475969314575195, + "learning_rate": 1.273164310937166e-06, + "logits/chosen": -0.6429861783981323, + "logits/rejected": -0.7413108348846436, + "logps/chosen": -58.380470275878906, + "logps/rejected": -87.29234313964844, + "loss": 0.6119, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9085726737976074, + "rewards/margins": 6.13979959487915, + "rewards/rejected": -3.231226921081543, + "step": 15343 + }, + { + "epoch": 3.84, + "grad_norm": 9.03341293334961, + "learning_rate": 1.272640375011166e-06, + "logits/chosen": -0.5993781089782715, + "logits/rejected": -0.6346929669380188, + "logps/chosen": -61.141353607177734, + "logps/rejected": -91.48247528076172, + "loss": 0.7408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2373762130737305, + "rewards/margins": 5.466911315917969, + "rewards/rejected": -2.229534864425659, + "step": 15344 + }, + { + "epoch": 3.84, + "grad_norm": 4.382144451141357, + "learning_rate": 1.2721165311921856e-06, + "logits/chosen": -0.5531294941902161, + "logits/rejected": -0.6395457983016968, + "logps/chosen": -49.36178207397461, + "logps/rejected": -95.1445083618164, + "loss": 0.6268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1985397338867188, + "rewards/margins": 6.836732387542725, + "rewards/rejected": -3.6381919384002686, + "step": 15345 + }, + { + "epoch": 3.84, + "grad_norm": 7.9152421951293945, + "learning_rate": 1.271592779493172e-06, + "logits/chosen": -0.4495338201522827, + "logits/rejected": -0.5587366819381714, + "logps/chosen": -58.74677276611328, + "logps/rejected": -89.0728759765625, + "loss": 0.7318, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1260757446289062, + "rewards/margins": 6.98060417175293, + "rewards/rejected": -3.8545281887054443, + "step": 15346 + }, + { + "epoch": 3.84, + "grad_norm": 2.888674259185791, + "learning_rate": 1.2710691199270669e-06, + "logits/chosen": -0.5153917074203491, + "logits/rejected": -0.6119872331619263, + "logps/chosen": -57.374305725097656, + "logps/rejected": -106.9251708984375, + "loss": 0.5978, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.274156332015991, + "rewards/margins": 8.20457935333252, + "rewards/rejected": -4.930423736572266, + "step": 15347 + }, + { + "epoch": 3.84, + "grad_norm": 4.000186920166016, + "learning_rate": 1.2705455525068088e-06, + "logits/chosen": -0.5221942663192749, + "logits/rejected": -0.6573303937911987, + "logps/chosen": -67.88591003417969, + "logps/rejected": -116.37661743164062, + "loss": 0.6459, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119385242462158, + "rewards/margins": 8.389293670654297, + "rewards/rejected": -5.2699079513549805, + "step": 15348 + }, + { + "epoch": 3.84, + "grad_norm": 2.48061466217041, + "learning_rate": 1.270022077245338e-06, + "logits/chosen": -0.44777530431747437, + "logits/rejected": -0.5582968592643738, + "logps/chosen": -57.22097396850586, + "logps/rejected": -111.91806030273438, + "loss": 0.5356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094526529312134, + "rewards/margins": 8.728504180908203, + "rewards/rejected": -5.633977890014648, + "step": 15349 + }, + { + "epoch": 3.84, + "grad_norm": 5.1354193687438965, + "learning_rate": 1.2694986941555892e-06, + "logits/chosen": -0.4734317660331726, + "logits/rejected": -0.5338342785835266, + "logps/chosen": -67.60047912597656, + "logps/rejected": -117.76847839355469, + "loss": 0.6434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2257869243621826, + "rewards/margins": 6.656333923339844, + "rewards/rejected": -3.4305477142333984, + "step": 15350 + }, + { + "epoch": 3.84, + "grad_norm": 3.831177234649658, + "learning_rate": 1.2689754032504942e-06, + "logits/chosen": -0.5488926768302917, + "logits/rejected": -0.6336804032325745, + "logps/chosen": -51.01185607910156, + "logps/rejected": -123.51881408691406, + "loss": 0.6345, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8933725357055664, + "rewards/margins": 8.257832527160645, + "rewards/rejected": -5.36445951461792, + "step": 15351 + }, + { + "epoch": 3.84, + "grad_norm": 9.228204727172852, + "learning_rate": 1.2684522045429865e-06, + "logits/chosen": -0.5374518632888794, + "logits/rejected": -0.6214215755462646, + "logps/chosen": -50.31498718261719, + "logps/rejected": -81.4979019165039, + "loss": 0.6674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3288254737854004, + "rewards/margins": 6.036204814910889, + "rewards/rejected": -2.7073793411254883, + "step": 15352 + }, + { + "epoch": 3.84, + "grad_norm": 2.929786443710327, + "learning_rate": 1.2679290980459929e-06, + "logits/chosen": -0.5246102213859558, + "logits/rejected": -0.6137470006942749, + "logps/chosen": -51.15037536621094, + "logps/rejected": -98.9527816772461, + "loss": 0.6022, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3344924449920654, + "rewards/margins": 7.314019203186035, + "rewards/rejected": -3.979526996612549, + "step": 15353 + }, + { + "epoch": 3.84, + "grad_norm": 5.727557182312012, + "learning_rate": 1.2674060837724422e-06, + "logits/chosen": -0.5743114352226257, + "logits/rejected": -0.5982660055160522, + "logps/chosen": -49.060428619384766, + "logps/rejected": -112.4011001586914, + "loss": 0.6687, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.447349786758423, + "rewards/margins": 7.792544841766357, + "rewards/rejected": -4.3451948165893555, + "step": 15354 + }, + { + "epoch": 3.84, + "grad_norm": 7.707801818847656, + "learning_rate": 1.2668831617352563e-06, + "logits/chosen": -0.46835654973983765, + "logits/rejected": -0.5272361636161804, + "logps/chosen": -53.6698112487793, + "logps/rejected": -116.49887084960938, + "loss": 0.6245, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0374441146850586, + "rewards/margins": 7.429169654846191, + "rewards/rejected": -4.391725540161133, + "step": 15355 + }, + { + "epoch": 3.84, + "grad_norm": 7.00925874710083, + "learning_rate": 1.2663603319473588e-06, + "logits/chosen": -0.6159914135932922, + "logits/rejected": -0.6837949752807617, + "logps/chosen": -51.021156311035156, + "logps/rejected": -113.15406799316406, + "loss": 0.6858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.268338918685913, + "rewards/margins": 7.711733341217041, + "rewards/rejected": -4.443394660949707, + "step": 15356 + }, + { + "epoch": 3.84, + "grad_norm": 5.737011909484863, + "learning_rate": 1.2658375944216678e-06, + "logits/chosen": -0.5533299446105957, + "logits/rejected": -0.6356295347213745, + "logps/chosen": -50.092376708984375, + "logps/rejected": -109.49516296386719, + "loss": 0.5629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0422933101654053, + "rewards/margins": 7.1729416847229, + "rewards/rejected": -4.130648612976074, + "step": 15357 + }, + { + "epoch": 3.84, + "grad_norm": 3.777209997177124, + "learning_rate": 1.265314949171103e-06, + "logits/chosen": -0.593491792678833, + "logits/rejected": -0.7095903754234314, + "logps/chosen": -42.960304260253906, + "logps/rejected": -94.06819152832031, + "loss": 0.523, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1160967350006104, + "rewards/margins": 8.281314849853516, + "rewards/rejected": -5.165217399597168, + "step": 15358 + }, + { + "epoch": 3.84, + "grad_norm": 12.857373237609863, + "learning_rate": 1.2647923962085784e-06, + "logits/chosen": -0.5706686973571777, + "logits/rejected": -0.6381518244743347, + "logps/chosen": -58.40632629394531, + "logps/rejected": -109.57962036132812, + "loss": 0.6499, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0939908027648926, + "rewards/margins": 5.885154724121094, + "rewards/rejected": -2.791163682937622, + "step": 15359 + }, + { + "epoch": 3.84, + "grad_norm": 10.248010635375977, + "learning_rate": 1.264269935547005e-06, + "logits/chosen": -0.6079685688018799, + "logits/rejected": -0.6932958364486694, + "logps/chosen": -57.499114990234375, + "logps/rejected": -97.33167266845703, + "loss": 0.7005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1415040493011475, + "rewards/margins": 5.882101058959961, + "rewards/rejected": -2.7405970096588135, + "step": 15360 + }, + { + "epoch": 3.84, + "grad_norm": 11.091596603393555, + "learning_rate": 1.2637475671992966e-06, + "logits/chosen": -0.4764443039894104, + "logits/rejected": -0.5460494756698608, + "logps/chosen": -68.377197265625, + "logps/rejected": -123.76380920410156, + "loss": 0.9315, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8708419799804688, + "rewards/margins": 6.397751331329346, + "rewards/rejected": -3.526909828186035, + "step": 15361 + }, + { + "epoch": 3.84, + "grad_norm": 6.854429244995117, + "learning_rate": 1.26322529117836e-06, + "logits/chosen": -0.5936626195907593, + "logits/rejected": -0.6850564479827881, + "logps/chosen": -50.045082092285156, + "logps/rejected": -120.09791564941406, + "loss": 0.6181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.941984176635742, + "rewards/margins": 8.314411163330078, + "rewards/rejected": -5.372425556182861, + "step": 15362 + }, + { + "epoch": 3.84, + "grad_norm": 4.2820024490356445, + "learning_rate": 1.2627031074970992e-06, + "logits/chosen": -0.514836311340332, + "logits/rejected": -0.6043718457221985, + "logps/chosen": -48.43584442138672, + "logps/rejected": -94.10188293457031, + "loss": 0.5758, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1138432025909424, + "rewards/margins": 6.278486728668213, + "rewards/rejected": -3.1646437644958496, + "step": 15363 + }, + { + "epoch": 3.84, + "grad_norm": 14.148866653442383, + "learning_rate": 1.2621810161684216e-06, + "logits/chosen": -0.5598663091659546, + "logits/rejected": -0.6123315691947937, + "logps/chosen": -47.87062454223633, + "logps/rejected": -107.56352233886719, + "loss": 0.7181, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7866153717041016, + "rewards/margins": 7.304845333099365, + "rewards/rejected": -4.518229961395264, + "step": 15364 + }, + { + "epoch": 3.84, + "grad_norm": 3.590627670288086, + "learning_rate": 1.2616590172052268e-06, + "logits/chosen": -0.5836331844329834, + "logits/rejected": -0.6553637981414795, + "logps/chosen": -52.82569122314453, + "logps/rejected": -105.32167053222656, + "loss": 0.5925, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.278313636779785, + "rewards/margins": 7.74446964263916, + "rewards/rejected": -4.466155529022217, + "step": 15365 + }, + { + "epoch": 3.84, + "grad_norm": 4.915276527404785, + "learning_rate": 1.2611371106204123e-06, + "logits/chosen": -0.5923687219619751, + "logits/rejected": -0.6515847444534302, + "logps/chosen": -52.2066764831543, + "logps/rejected": -118.48297119140625, + "loss": 0.6072, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0182857513427734, + "rewards/margins": 7.508068084716797, + "rewards/rejected": -4.489782810211182, + "step": 15366 + }, + { + "epoch": 3.84, + "grad_norm": 2.4484217166900635, + "learning_rate": 1.2606152964268765e-06, + "logits/chosen": -0.5690621733665466, + "logits/rejected": -0.6005016565322876, + "logps/chosen": -46.100746154785156, + "logps/rejected": -114.39802551269531, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.015434503555298, + "rewards/margins": 7.868907928466797, + "rewards/rejected": -4.853473663330078, + "step": 15367 + }, + { + "epoch": 3.84, + "grad_norm": 5.854111671447754, + "learning_rate": 1.260093574637517e-06, + "logits/chosen": -0.6025949716567993, + "logits/rejected": -0.6909381151199341, + "logps/chosen": -50.10826110839844, + "logps/rejected": -104.58982849121094, + "loss": 0.6181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.383671760559082, + "rewards/margins": 7.845009803771973, + "rewards/rejected": -4.461338043212891, + "step": 15368 + }, + { + "epoch": 3.84, + "grad_norm": 5.936516761779785, + "learning_rate": 1.2595719452652206e-06, + "logits/chosen": -0.5840110778808594, + "logits/rejected": -0.622723400592804, + "logps/chosen": -50.552982330322266, + "logps/rejected": -111.53132629394531, + "loss": 0.6777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2390966415405273, + "rewards/margins": 7.145943641662598, + "rewards/rejected": -3.9068474769592285, + "step": 15369 + }, + { + "epoch": 3.84, + "grad_norm": 10.97758960723877, + "learning_rate": 1.2590504083228799e-06, + "logits/chosen": -0.5494632124900818, + "logits/rejected": -0.6764479279518127, + "logps/chosen": -58.759544372558594, + "logps/rejected": -104.58113098144531, + "loss": 0.8086, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.751089096069336, + "rewards/margins": 8.70694351196289, + "rewards/rejected": -5.955854892730713, + "step": 15370 + }, + { + "epoch": 3.85, + "grad_norm": 8.476178169250488, + "learning_rate": 1.2585289638233838e-06, + "logits/chosen": -0.5331739187240601, + "logits/rejected": -0.5979259610176086, + "logps/chosen": -60.89446258544922, + "logps/rejected": -112.07072448730469, + "loss": 0.7655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6761085987091064, + "rewards/margins": 7.164632797241211, + "rewards/rejected": -4.488523483276367, + "step": 15371 + }, + { + "epoch": 3.85, + "grad_norm": 20.649248123168945, + "learning_rate": 1.258007611779617e-06, + "logits/chosen": -0.5542245507240295, + "logits/rejected": -0.5816170573234558, + "logps/chosen": -66.45043182373047, + "logps/rejected": -107.96405792236328, + "loss": 0.8479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6675262451171875, + "rewards/margins": 5.464949131011963, + "rewards/rejected": -2.7974228858947754, + "step": 15372 + }, + { + "epoch": 3.85, + "grad_norm": 4.388514041900635, + "learning_rate": 1.2574863522044605e-06, + "logits/chosen": -0.5810485482215881, + "logits/rejected": -0.6607100367546082, + "logps/chosen": -48.224422454833984, + "logps/rejected": -86.72806549072266, + "loss": 0.6981, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.754293203353882, + "rewards/margins": 6.63619327545166, + "rewards/rejected": -3.8819007873535156, + "step": 15373 + }, + { + "epoch": 3.85, + "grad_norm": 4.23783540725708, + "learning_rate": 1.256965185110799e-06, + "logits/chosen": -0.4563569724559784, + "logits/rejected": -0.4882272779941559, + "logps/chosen": -43.548622131347656, + "logps/rejected": -104.48175048828125, + "loss": 0.5295, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3474056720733643, + "rewards/margins": 6.18134069442749, + "rewards/rejected": -2.8339345455169678, + "step": 15374 + }, + { + "epoch": 3.85, + "grad_norm": 22.826309204101562, + "learning_rate": 1.2564441105115082e-06, + "logits/chosen": -0.5349283218383789, + "logits/rejected": -0.5853568911552429, + "logps/chosen": -49.005123138427734, + "logps/rejected": -131.0336456298828, + "loss": 0.5964, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5354695320129395, + "rewards/margins": 8.382219314575195, + "rewards/rejected": -4.8467488288879395, + "step": 15375 + }, + { + "epoch": 3.85, + "grad_norm": 24.083274841308594, + "learning_rate": 1.2559231284194644e-06, + "logits/chosen": -0.5415597558021545, + "logits/rejected": -0.5845218896865845, + "logps/chosen": -49.861976623535156, + "logps/rejected": -102.84713745117188, + "loss": 0.634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0066845417022705, + "rewards/margins": 6.689891338348389, + "rewards/rejected": -3.683206558227539, + "step": 15376 + }, + { + "epoch": 3.85, + "grad_norm": 4.846799850463867, + "learning_rate": 1.255402238847544e-06, + "logits/chosen": -0.5471879243850708, + "logits/rejected": -0.6233639717102051, + "logps/chosen": -51.622474670410156, + "logps/rejected": -104.0778579711914, + "loss": 0.6065, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9622642993927, + "rewards/margins": 7.294594764709473, + "rewards/rejected": -4.332330226898193, + "step": 15377 + }, + { + "epoch": 3.85, + "grad_norm": 3.2194459438323975, + "learning_rate": 1.2548814418086153e-06, + "logits/chosen": -0.5933236479759216, + "logits/rejected": -0.688989520072937, + "logps/chosen": -52.88825225830078, + "logps/rejected": -106.23235321044922, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.958542585372925, + "rewards/margins": 8.186964988708496, + "rewards/rejected": -5.228422164916992, + "step": 15378 + }, + { + "epoch": 3.85, + "grad_norm": 23.571361541748047, + "learning_rate": 1.2543607373155513e-06, + "logits/chosen": -0.5791126489639282, + "logits/rejected": -0.6412575244903564, + "logps/chosen": -60.95296096801758, + "logps/rejected": -110.16627502441406, + "loss": 0.7231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.826582431793213, + "rewards/margins": 6.596651554107666, + "rewards/rejected": -3.7700693607330322, + "step": 15379 + }, + { + "epoch": 3.85, + "grad_norm": 8.529967308044434, + "learning_rate": 1.2538401253812177e-06, + "logits/chosen": -0.5631868243217468, + "logits/rejected": -0.632685661315918, + "logps/chosen": -62.713226318359375, + "logps/rejected": -105.29780578613281, + "loss": 0.6874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.096999168395996, + "rewards/margins": 6.1923136711120605, + "rewards/rejected": -3.0953145027160645, + "step": 15380 + }, + { + "epoch": 3.85, + "grad_norm": 6.854603290557861, + "learning_rate": 1.2533196060184777e-06, + "logits/chosen": -0.5126223564147949, + "logits/rejected": -0.5921849608421326, + "logps/chosen": -56.38768005371094, + "logps/rejected": -106.74136352539062, + "loss": 0.5626, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.16241455078125, + "rewards/margins": 6.9514479637146, + "rewards/rejected": -3.7890334129333496, + "step": 15381 + }, + { + "epoch": 3.85, + "grad_norm": 3.6033501625061035, + "learning_rate": 1.2527991792401967e-06, + "logits/chosen": -0.5337838530540466, + "logits/rejected": -0.651052713394165, + "logps/chosen": -48.54119110107422, + "logps/rejected": -124.84098052978516, + "loss": 0.5226, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0741958618164062, + "rewards/margins": 7.715608596801758, + "rewards/rejected": -4.641412258148193, + "step": 15382 + }, + { + "epoch": 3.85, + "grad_norm": 3.7398743629455566, + "learning_rate": 1.2522788450592337e-06, + "logits/chosen": -0.6044726371765137, + "logits/rejected": -0.6783799529075623, + "logps/chosen": -48.603111267089844, + "logps/rejected": -121.87139129638672, + "loss": 0.629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1772255897521973, + "rewards/margins": 7.477739334106445, + "rewards/rejected": -4.300513744354248, + "step": 15383 + }, + { + "epoch": 3.85, + "grad_norm": 6.238080978393555, + "learning_rate": 1.2517586034884448e-06, + "logits/chosen": -0.5921907424926758, + "logits/rejected": -0.6339483261108398, + "logps/chosen": -53.97245407104492, + "logps/rejected": -113.3493423461914, + "loss": 0.6277, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3397533893585205, + "rewards/margins": 7.666845321655273, + "rewards/rejected": -4.327091693878174, + "step": 15384 + }, + { + "epoch": 3.85, + "grad_norm": 2.8163230419158936, + "learning_rate": 1.2512384545406886e-06, + "logits/chosen": -0.499118447303772, + "logits/rejected": -0.5937132239341736, + "logps/chosen": -57.50707244873047, + "logps/rejected": -112.11434173583984, + "loss": 0.5519, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3663313388824463, + "rewards/margins": 7.797406196594238, + "rewards/rejected": -4.431074619293213, + "step": 15385 + }, + { + "epoch": 3.85, + "grad_norm": 10.146076202392578, + "learning_rate": 1.2507183982288196e-06, + "logits/chosen": -0.618462085723877, + "logits/rejected": -0.6506462693214417, + "logps/chosen": -46.512638092041016, + "logps/rejected": -124.71670532226562, + "loss": 0.5965, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.034487247467041, + "rewards/margins": 7.543497085571289, + "rewards/rejected": -4.50900936126709, + "step": 15386 + }, + { + "epoch": 3.85, + "grad_norm": 5.732306957244873, + "learning_rate": 1.2501984345656843e-06, + "logits/chosen": -0.579119086265564, + "logits/rejected": -0.6590455770492554, + "logps/chosen": -59.093711853027344, + "logps/rejected": -99.81436920166016, + "loss": 0.6948, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5027849674224854, + "rewards/margins": 6.858616828918457, + "rewards/rejected": -4.355832099914551, + "step": 15387 + }, + { + "epoch": 3.85, + "grad_norm": 39.60313034057617, + "learning_rate": 1.249678563564134e-06, + "logits/chosen": -0.5904597043991089, + "logits/rejected": -0.7066208720207214, + "logps/chosen": -52.52351379394531, + "logps/rejected": -99.85538482666016, + "loss": 0.5712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.237035036087036, + "rewards/margins": 7.735912322998047, + "rewards/rejected": -4.498877048492432, + "step": 15388 + }, + { + "epoch": 3.85, + "grad_norm": 8.886972427368164, + "learning_rate": 1.2491587852370173e-06, + "logits/chosen": -0.5774505138397217, + "logits/rejected": -0.6606305837631226, + "logps/chosen": -53.61330795288086, + "logps/rejected": -100.34136962890625, + "loss": 0.7304, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8420934677124023, + "rewards/margins": 7.084356307983398, + "rewards/rejected": -4.242263317108154, + "step": 15389 + }, + { + "epoch": 3.85, + "grad_norm": 4.648041725158691, + "learning_rate": 1.2486390995971765e-06, + "logits/chosen": -0.5694359540939331, + "logits/rejected": -0.6496418118476868, + "logps/chosen": -60.691307067871094, + "logps/rejected": -128.94088745117188, + "loss": 0.6575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.071873664855957, + "rewards/margins": 7.855299949645996, + "rewards/rejected": -4.783426284790039, + "step": 15390 + }, + { + "epoch": 3.85, + "grad_norm": 2.9660234451293945, + "learning_rate": 1.248119506657452e-06, + "logits/chosen": -0.5305605530738831, + "logits/rejected": -0.6160199642181396, + "logps/chosen": -49.703514099121094, + "logps/rejected": -121.78469848632812, + "loss": 0.5739, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.238036632537842, + "rewards/margins": 8.420198440551758, + "rewards/rejected": -5.182161808013916, + "step": 15391 + }, + { + "epoch": 3.85, + "grad_norm": 3.182213306427002, + "learning_rate": 1.2476000064306866e-06, + "logits/chosen": -0.5186336636543274, + "logits/rejected": -0.6331117153167725, + "logps/chosen": -56.229652404785156, + "logps/rejected": -99.20427703857422, + "loss": 0.5546, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.268411636352539, + "rewards/margins": 7.070847511291504, + "rewards/rejected": -3.8024356365203857, + "step": 15392 + }, + { + "epoch": 3.85, + "grad_norm": 3.9588329792022705, + "learning_rate": 1.2470805989297163e-06, + "logits/chosen": -0.5807482600212097, + "logits/rejected": -0.6876676678657532, + "logps/chosen": -59.88739013671875, + "logps/rejected": -96.63761901855469, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1733665466308594, + "rewards/margins": 6.833819389343262, + "rewards/rejected": -3.6604528427124023, + "step": 15393 + }, + { + "epoch": 3.85, + "grad_norm": 5.685559272766113, + "learning_rate": 1.2465612841673746e-06, + "logits/chosen": -0.5693131685256958, + "logits/rejected": -0.6445867419242859, + "logps/chosen": -50.51369094848633, + "logps/rejected": -80.10431671142578, + "loss": 0.6916, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9776408672332764, + "rewards/margins": 5.981752395629883, + "rewards/rejected": -3.0041115283966064, + "step": 15394 + }, + { + "epoch": 3.85, + "grad_norm": 7.598000526428223, + "learning_rate": 1.2460420621564973e-06, + "logits/chosen": -0.6386099457740784, + "logits/rejected": -0.7384626865386963, + "logps/chosen": -42.40130615234375, + "logps/rejected": -86.97295379638672, + "loss": 0.6486, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.826085329055786, + "rewards/margins": 6.689161777496338, + "rewards/rejected": -3.8630762100219727, + "step": 15395 + }, + { + "epoch": 3.85, + "grad_norm": 10.367751121520996, + "learning_rate": 1.2455229329099122e-06, + "logits/chosen": -0.566436767578125, + "logits/rejected": -0.6706386208534241, + "logps/chosen": -60.24925231933594, + "logps/rejected": -107.12081909179688, + "loss": 0.6571, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.01107120513916, + "rewards/margins": 7.719693660736084, + "rewards/rejected": -4.708622455596924, + "step": 15396 + }, + { + "epoch": 3.85, + "grad_norm": 4.902719497680664, + "learning_rate": 1.2450038964404509e-06, + "logits/chosen": -0.5083470940589905, + "logits/rejected": -0.590111255645752, + "logps/chosen": -63.37413024902344, + "logps/rejected": -119.79615783691406, + "loss": 0.6408, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068905830383301, + "rewards/margins": 7.504589080810547, + "rewards/rejected": -4.435682773590088, + "step": 15397 + }, + { + "epoch": 3.85, + "grad_norm": 4.98137092590332, + "learning_rate": 1.2444849527609354e-06, + "logits/chosen": -0.5450636148452759, + "logits/rejected": -0.6668686866760254, + "logps/chosen": -53.0250244140625, + "logps/rejected": -90.9169921875, + "loss": 0.6048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.653693675994873, + "rewards/margins": 6.907679080963135, + "rewards/rejected": -4.2539849281311035, + "step": 15398 + }, + { + "epoch": 3.85, + "grad_norm": 11.507723808288574, + "learning_rate": 1.2439661018841925e-06, + "logits/chosen": -0.5386336445808411, + "logits/rejected": -0.6035891771316528, + "logps/chosen": -61.12297058105469, + "logps/rejected": -101.45294952392578, + "loss": 0.7074, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.053969383239746, + "rewards/margins": 7.01178503036499, + "rewards/rejected": -3.957815170288086, + "step": 15399 + }, + { + "epoch": 3.85, + "grad_norm": 13.272363662719727, + "learning_rate": 1.2434473438230426e-06, + "logits/chosen": -0.54729163646698, + "logits/rejected": -0.615333080291748, + "logps/chosen": -62.058685302734375, + "logps/rejected": -113.9827651977539, + "loss": 0.617, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.301162004470825, + "rewards/margins": 7.2345476150512695, + "rewards/rejected": -3.9333858489990234, + "step": 15400 + }, + { + "epoch": 3.85, + "grad_norm": 4.389343738555908, + "learning_rate": 1.2429286785903028e-06, + "logits/chosen": -0.5087318420410156, + "logits/rejected": -0.5929067134857178, + "logps/chosen": -52.82685089111328, + "logps/rejected": -116.46519470214844, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7826874256134033, + "rewards/margins": 7.091517925262451, + "rewards/rejected": -4.308831214904785, + "step": 15401 + }, + { + "epoch": 3.85, + "grad_norm": 4.211142063140869, + "learning_rate": 1.2424101061987931e-06, + "logits/chosen": -0.5929412841796875, + "logits/rejected": -0.6676333546638489, + "logps/chosen": -58.9156379699707, + "logps/rejected": -103.7159423828125, + "loss": 0.7032, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.028308868408203, + "rewards/margins": 7.6334638595581055, + "rewards/rejected": -4.605154514312744, + "step": 15402 + }, + { + "epoch": 3.85, + "grad_norm": 9.595830917358398, + "learning_rate": 1.241891626661325e-06, + "logits/chosen": -0.5943984389305115, + "logits/rejected": -0.7063480615615845, + "logps/chosen": -57.09463119506836, + "logps/rejected": -103.17828369140625, + "loss": 0.6567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9913954734802246, + "rewards/margins": 8.043425559997559, + "rewards/rejected": -5.052030086517334, + "step": 15403 + }, + { + "epoch": 3.85, + "grad_norm": 4.942209720611572, + "learning_rate": 1.2413732399907136e-06, + "logits/chosen": -0.5124858021736145, + "logits/rejected": -0.6658863425254822, + "logps/chosen": -55.5748176574707, + "logps/rejected": -93.0535888671875, + "loss": 0.6231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8956825733184814, + "rewards/margins": 7.7311110496521, + "rewards/rejected": -4.835428237915039, + "step": 15404 + }, + { + "epoch": 3.85, + "grad_norm": 2.216261386871338, + "learning_rate": 1.240854946199767e-06, + "logits/chosen": -0.5268491506576538, + "logits/rejected": -0.6541441082954407, + "logps/chosen": -60.246089935302734, + "logps/rejected": -105.4310531616211, + "loss": 0.5735, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7790517807006836, + "rewards/margins": 8.48528003692627, + "rewards/rejected": -5.706228256225586, + "step": 15405 + }, + { + "epoch": 3.85, + "grad_norm": 5.440539836883545, + "learning_rate": 1.2403367453012922e-06, + "logits/chosen": -0.5560334920883179, + "logits/rejected": -0.6034751534461975, + "logps/chosen": -57.230430603027344, + "logps/rejected": -116.54338836669922, + "loss": 0.6076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2939677238464355, + "rewards/margins": 7.538960933685303, + "rewards/rejected": -4.244992733001709, + "step": 15406 + }, + { + "epoch": 3.85, + "grad_norm": 5.430430889129639, + "learning_rate": 1.239818637308096e-06, + "logits/chosen": -0.5062726140022278, + "logits/rejected": -0.5874418020248413, + "logps/chosen": -52.611305236816406, + "logps/rejected": -122.00306701660156, + "loss": 0.5897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.20328426361084, + "rewards/margins": 8.472508430480957, + "rewards/rejected": -5.269225120544434, + "step": 15407 + }, + { + "epoch": 3.85, + "grad_norm": 13.051349639892578, + "learning_rate": 1.2393006222329818e-06, + "logits/chosen": -0.5330384373664856, + "logits/rejected": -0.5978983640670776, + "logps/chosen": -40.56370162963867, + "logps/rejected": -92.65856170654297, + "loss": 0.6244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3514623641967773, + "rewards/margins": 5.850884437561035, + "rewards/rejected": -2.499422073364258, + "step": 15408 + }, + { + "epoch": 3.85, + "grad_norm": 9.058454513549805, + "learning_rate": 1.238782700088747e-06, + "logits/chosen": -0.5378681421279907, + "logits/rejected": -0.6767506003379822, + "logps/chosen": -60.763832092285156, + "logps/rejected": -106.71807861328125, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8281681537628174, + "rewards/margins": 7.620951175689697, + "rewards/rejected": -4.792782783508301, + "step": 15409 + }, + { + "epoch": 3.85, + "grad_norm": 3.4678852558135986, + "learning_rate": 1.2382648708881927e-06, + "logits/chosen": -0.5686919689178467, + "logits/rejected": -0.6742028594017029, + "logps/chosen": -64.81230163574219, + "logps/rejected": -92.50531005859375, + "loss": 0.6492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7008142471313477, + "rewards/margins": 7.097033977508545, + "rewards/rejected": -4.396219730377197, + "step": 15410 + }, + { + "epoch": 3.86, + "grad_norm": 4.546957969665527, + "learning_rate": 1.2377471346441178e-06, + "logits/chosen": -0.5519420504570007, + "logits/rejected": -0.5990104079246521, + "logps/chosen": -48.57646560668945, + "logps/rejected": -101.75397491455078, + "loss": 0.6146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.189314842224121, + "rewards/margins": 7.640353202819824, + "rewards/rejected": -4.451037883758545, + "step": 15411 + }, + { + "epoch": 3.86, + "grad_norm": 3.5193300247192383, + "learning_rate": 1.2372294913693095e-06, + "logits/chosen": -0.5123465061187744, + "logits/rejected": -0.5495913028717041, + "logps/chosen": -55.368228912353516, + "logps/rejected": -120.60047149658203, + "loss": 0.6408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.083122730255127, + "rewards/margins": 7.588409900665283, + "rewards/rejected": -4.505286693572998, + "step": 15412 + }, + { + "epoch": 3.86, + "grad_norm": 7.146882057189941, + "learning_rate": 1.2367119410765626e-06, + "logits/chosen": -0.555705726146698, + "logits/rejected": -0.6287707090377808, + "logps/chosen": -62.204795837402344, + "logps/rejected": -104.45919036865234, + "loss": 0.5849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150658130645752, + "rewards/margins": 6.675697326660156, + "rewards/rejected": -3.5250391960144043, + "step": 15413 + }, + { + "epoch": 3.86, + "grad_norm": 9.246442794799805, + "learning_rate": 1.2361944837786682e-06, + "logits/chosen": -0.4996306300163269, + "logits/rejected": -0.5698271989822388, + "logps/chosen": -51.905128479003906, + "logps/rejected": -107.94914245605469, + "loss": 0.6519, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7482376098632812, + "rewards/margins": 6.907498836517334, + "rewards/rejected": -4.159261226654053, + "step": 15414 + }, + { + "epoch": 3.86, + "grad_norm": 2.714317798614502, + "learning_rate": 1.2356771194884115e-06, + "logits/chosen": -0.5792420506477356, + "logits/rejected": -0.6267159581184387, + "logps/chosen": -58.475616455078125, + "logps/rejected": -107.97410583496094, + "loss": 0.6317, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.869724750518799, + "rewards/margins": 7.351899147033691, + "rewards/rejected": -4.482173919677734, + "step": 15415 + }, + { + "epoch": 3.86, + "grad_norm": 11.486517906188965, + "learning_rate": 1.2351598482185757e-06, + "logits/chosen": -0.6053480505943298, + "logits/rejected": -0.6374159455299377, + "logps/chosen": -53.86206817626953, + "logps/rejected": -120.83668518066406, + "loss": 0.6186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13213849067688, + "rewards/margins": 7.475847244262695, + "rewards/rejected": -4.3437089920043945, + "step": 15416 + }, + { + "epoch": 3.86, + "grad_norm": 8.879212379455566, + "learning_rate": 1.234642669981946e-06, + "logits/chosen": -0.5741504430770874, + "logits/rejected": -0.6533915996551514, + "logps/chosen": -55.483299255371094, + "logps/rejected": -104.45826721191406, + "loss": 0.6242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9703381061553955, + "rewards/margins": 6.886678695678711, + "rewards/rejected": -3.9163408279418945, + "step": 15417 + }, + { + "epoch": 3.86, + "grad_norm": 5.298353672027588, + "learning_rate": 1.2341255847913003e-06, + "logits/chosen": -0.5204423069953918, + "logits/rejected": -0.5997652411460876, + "logps/chosen": -61.19972610473633, + "logps/rejected": -100.04592895507812, + "loss": 0.6521, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143092632293701, + "rewards/margins": 6.316814422607422, + "rewards/rejected": -3.1737213134765625, + "step": 15418 + }, + { + "epoch": 3.86, + "grad_norm": 6.574195384979248, + "learning_rate": 1.233608592659416e-06, + "logits/chosen": -0.5674755573272705, + "logits/rejected": -0.665821373462677, + "logps/chosen": -49.88048553466797, + "logps/rejected": -97.19397735595703, + "loss": 0.6102, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.172924041748047, + "rewards/margins": 7.111295700073242, + "rewards/rejected": -3.9383721351623535, + "step": 15419 + }, + { + "epoch": 3.86, + "grad_norm": 3.06028413772583, + "learning_rate": 1.2330916935990712e-06, + "logits/chosen": -0.538953959941864, + "logits/rejected": -0.606887936592102, + "logps/chosen": -71.72183227539062, + "logps/rejected": -131.16453552246094, + "loss": 0.6732, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.861086845397949, + "rewards/margins": 8.721750259399414, + "rewards/rejected": -5.860663414001465, + "step": 15420 + }, + { + "epoch": 3.86, + "grad_norm": 2.8062949180603027, + "learning_rate": 1.2325748876230354e-06, + "logits/chosen": -0.6098531484603882, + "logits/rejected": -0.661164402961731, + "logps/chosen": -58.77880096435547, + "logps/rejected": -113.42622375488281, + "loss": 0.614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5713117122650146, + "rewards/margins": 7.022347450256348, + "rewards/rejected": -3.451035499572754, + "step": 15421 + }, + { + "epoch": 3.86, + "grad_norm": 3.9806158542633057, + "learning_rate": 1.232058174744083e-06, + "logits/chosen": -0.39466434717178345, + "logits/rejected": -0.5314502120018005, + "logps/chosen": -59.410072326660156, + "logps/rejected": -112.70736694335938, + "loss": 0.5558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2153408527374268, + "rewards/margins": 8.492926597595215, + "rewards/rejected": -5.277585983276367, + "step": 15422 + }, + { + "epoch": 3.86, + "grad_norm": 3.2854435443878174, + "learning_rate": 1.231541554974981e-06, + "logits/chosen": -0.5650140047073364, + "logits/rejected": -0.6198909282684326, + "logps/chosen": -48.021339416503906, + "logps/rejected": -94.96086120605469, + "loss": 0.6465, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0147767066955566, + "rewards/margins": 6.992371082305908, + "rewards/rejected": -3.9775943756103516, + "step": 15423 + }, + { + "epoch": 3.86, + "grad_norm": 3.978163480758667, + "learning_rate": 1.2310250283284937e-06, + "logits/chosen": -0.5330546498298645, + "logits/rejected": -0.6174018979072571, + "logps/chosen": -52.55150604248047, + "logps/rejected": -109.73358154296875, + "loss": 0.6068, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2252230644226074, + "rewards/margins": 6.785933017730713, + "rewards/rejected": -3.5607097148895264, + "step": 15424 + }, + { + "epoch": 3.86, + "grad_norm": 2.94179630279541, + "learning_rate": 1.2305085948173884e-06, + "logits/chosen": -0.48624947667121887, + "logits/rejected": -0.5672778487205505, + "logps/chosen": -58.37909698486328, + "logps/rejected": -110.06906127929688, + "loss": 0.5795, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2110848426818848, + "rewards/margins": 7.723171234130859, + "rewards/rejected": -4.512085914611816, + "step": 15425 + }, + { + "epoch": 3.86, + "grad_norm": 4.855257034301758, + "learning_rate": 1.229992254454425e-06, + "logits/chosen": -0.5621252655982971, + "logits/rejected": -0.6603153944015503, + "logps/chosen": -42.75172424316406, + "logps/rejected": -104.10823822021484, + "loss": 0.5106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1015233993530273, + "rewards/margins": 7.796989917755127, + "rewards/rejected": -4.6954665184021, + "step": 15426 + }, + { + "epoch": 3.86, + "grad_norm": 2.8817927837371826, + "learning_rate": 1.229476007252362e-06, + "logits/chosen": -0.48709574341773987, + "logits/rejected": -0.5800842046737671, + "logps/chosen": -55.11502456665039, + "logps/rejected": -98.14827728271484, + "loss": 0.5479, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1477818489074707, + "rewards/margins": 7.533021450042725, + "rewards/rejected": -4.385239601135254, + "step": 15427 + }, + { + "epoch": 3.86, + "grad_norm": 2.8665647506713867, + "learning_rate": 1.2289598532239572e-06, + "logits/chosen": -0.5257003903388977, + "logits/rejected": -0.6081191301345825, + "logps/chosen": -47.83442306518555, + "logps/rejected": -103.61042785644531, + "loss": 0.5613, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1100316047668457, + "rewards/margins": 8.015167236328125, + "rewards/rejected": -4.9051361083984375, + "step": 15428 + }, + { + "epoch": 3.86, + "grad_norm": 6.29478645324707, + "learning_rate": 1.228443792381968e-06, + "logits/chosen": -0.5259057283401489, + "logits/rejected": -0.597095251083374, + "logps/chosen": -56.997257232666016, + "logps/rejected": -98.68098449707031, + "loss": 0.6132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.680480480194092, + "rewards/margins": 6.581828594207764, + "rewards/rejected": -3.901348114013672, + "step": 15429 + }, + { + "epoch": 3.86, + "grad_norm": 3.9157843589782715, + "learning_rate": 1.2279278247391418e-06, + "logits/chosen": -0.522820770740509, + "logits/rejected": -0.6178030967712402, + "logps/chosen": -54.9223747253418, + "logps/rejected": -83.68333435058594, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2251644134521484, + "rewards/margins": 5.9855146408081055, + "rewards/rejected": -2.760349988937378, + "step": 15430 + }, + { + "epoch": 3.86, + "grad_norm": 4.424674034118652, + "learning_rate": 1.227411950308231e-06, + "logits/chosen": -0.5750990509986877, + "logits/rejected": -0.6422760486602783, + "logps/chosen": -46.56785202026367, + "logps/rejected": -107.36399841308594, + "loss": 0.6322, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9999148845672607, + "rewards/margins": 6.5410895347595215, + "rewards/rejected": -3.541175127029419, + "step": 15431 + }, + { + "epoch": 3.86, + "grad_norm": 3.006314992904663, + "learning_rate": 1.226896169101985e-06, + "logits/chosen": -0.4640404284000397, + "logits/rejected": -0.5956306457519531, + "logps/chosen": -60.975990295410156, + "logps/rejected": -100.37838745117188, + "loss": 0.5865, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0571813583374023, + "rewards/margins": 7.59239387512207, + "rewards/rejected": -4.535212516784668, + "step": 15432 + }, + { + "epoch": 3.86, + "grad_norm": 4.569037914276123, + "learning_rate": 1.2263804811331481e-06, + "logits/chosen": -0.5281183123588562, + "logits/rejected": -0.5788076519966125, + "logps/chosen": -58.13566589355469, + "logps/rejected": -109.31688690185547, + "loss": 0.6375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1639485359191895, + "rewards/margins": 6.918033599853516, + "rewards/rejected": -3.7540857791900635, + "step": 15433 + }, + { + "epoch": 3.86, + "grad_norm": 6.6004204750061035, + "learning_rate": 1.2258648864144618e-06, + "logits/chosen": -0.6023056507110596, + "logits/rejected": -0.675752580165863, + "logps/chosen": -54.89093780517578, + "logps/rejected": -114.25140380859375, + "loss": 0.6542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0582938194274902, + "rewards/margins": 7.503094673156738, + "rewards/rejected": -4.444801330566406, + "step": 15434 + }, + { + "epoch": 3.86, + "grad_norm": 3.7173123359680176, + "learning_rate": 1.2253493849586695e-06, + "logits/chosen": -0.573004961013794, + "logits/rejected": -0.6539390683174133, + "logps/chosen": -49.12950134277344, + "logps/rejected": -100.70057678222656, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3489866256713867, + "rewards/margins": 7.119654655456543, + "rewards/rejected": -3.770667314529419, + "step": 15435 + }, + { + "epoch": 3.86, + "grad_norm": 5.173252105712891, + "learning_rate": 1.2248339767785094e-06, + "logits/chosen": -0.5292232632637024, + "logits/rejected": -0.6472339630126953, + "logps/chosen": -50.74226760864258, + "logps/rejected": -92.19816589355469, + "loss": 0.6076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7122347354888916, + "rewards/margins": 7.723948001861572, + "rewards/rejected": -5.011713027954102, + "step": 15436 + }, + { + "epoch": 3.86, + "grad_norm": 6.259222507476807, + "learning_rate": 1.2243186618867153e-06, + "logits/chosen": -0.49729102849960327, + "logits/rejected": -0.5828105211257935, + "logps/chosen": -69.19284057617188, + "logps/rejected": -122.55995178222656, + "loss": 0.6521, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7425456047058105, + "rewards/margins": 8.053305625915527, + "rewards/rejected": -5.310760021209717, + "step": 15437 + }, + { + "epoch": 3.86, + "grad_norm": 4.35972785949707, + "learning_rate": 1.2238034402960247e-06, + "logits/chosen": -0.4852116107940674, + "logits/rejected": -0.5602669715881348, + "logps/chosen": -70.58052825927734, + "logps/rejected": -117.31495666503906, + "loss": 0.7452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.74666166305542, + "rewards/margins": 7.0250773429870605, + "rewards/rejected": -4.278415679931641, + "step": 15438 + }, + { + "epoch": 3.86, + "grad_norm": 3.9923365116119385, + "learning_rate": 1.2232883120191664e-06, + "logits/chosen": -0.48051413893699646, + "logits/rejected": -0.5937938690185547, + "logps/chosen": -62.489295959472656, + "logps/rejected": -100.59803009033203, + "loss": 0.5916, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2708370685577393, + "rewards/margins": 7.349860191345215, + "rewards/rejected": -4.079023361206055, + "step": 15439 + }, + { + "epoch": 3.86, + "grad_norm": 7.738951206207275, + "learning_rate": 1.2227732770688722e-06, + "logits/chosen": -0.5478664636611938, + "logits/rejected": -0.6408591270446777, + "logps/chosen": -61.70745849609375, + "logps/rejected": -99.3369369506836, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.922165870666504, + "rewards/margins": 7.50297212600708, + "rewards/rejected": -4.580806255340576, + "step": 15440 + }, + { + "epoch": 3.86, + "grad_norm": 6.424508571624756, + "learning_rate": 1.2222583354578682e-06, + "logits/chosen": -0.5187667608261108, + "logits/rejected": -0.583577036857605, + "logps/chosen": -63.229637145996094, + "logps/rejected": -102.90342712402344, + "loss": 0.6363, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.277348756790161, + "rewards/margins": 7.16484260559082, + "rewards/rejected": -3.8874940872192383, + "step": 15441 + }, + { + "epoch": 3.86, + "grad_norm": 5.71323299407959, + "learning_rate": 1.2217434871988776e-06, + "logits/chosen": -0.5964150428771973, + "logits/rejected": -0.6320855021476746, + "logps/chosen": -57.994102478027344, + "logps/rejected": -115.25740051269531, + "loss": 0.6984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.316132068634033, + "rewards/margins": 6.395030975341797, + "rewards/rejected": -3.078899383544922, + "step": 15442 + }, + { + "epoch": 3.86, + "grad_norm": 2.9748263359069824, + "learning_rate": 1.2212287323046251e-06, + "logits/chosen": -0.5533554553985596, + "logits/rejected": -0.6428471803665161, + "logps/chosen": -41.71601486206055, + "logps/rejected": -129.95431518554688, + "loss": 0.4917, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.209319829940796, + "rewards/margins": 8.687980651855469, + "rewards/rejected": -5.478660583496094, + "step": 15443 + }, + { + "epoch": 3.86, + "grad_norm": 5.601294040679932, + "learning_rate": 1.2207140707878285e-06, + "logits/chosen": -0.5732467174530029, + "logits/rejected": -0.6131618618965149, + "logps/chosen": -54.95967483520508, + "logps/rejected": -123.94815826416016, + "loss": 0.6884, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.155970573425293, + "rewards/margins": 6.926169395446777, + "rewards/rejected": -3.770198345184326, + "step": 15444 + }, + { + "epoch": 3.86, + "grad_norm": 3.78318190574646, + "learning_rate": 1.220199502661209e-06, + "logits/chosen": -0.5126709938049316, + "logits/rejected": -0.6142065525054932, + "logps/chosen": -66.35220336914062, + "logps/rejected": -81.98636627197266, + "loss": 0.6603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.680236339569092, + "rewards/margins": 5.083475589752197, + "rewards/rejected": -2.403240203857422, + "step": 15445 + }, + { + "epoch": 3.86, + "grad_norm": 11.18641471862793, + "learning_rate": 1.2196850279374784e-06, + "logits/chosen": -0.4832006096839905, + "logits/rejected": -0.569761335849762, + "logps/chosen": -54.08556365966797, + "logps/rejected": -104.41545867919922, + "loss": 0.5535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3157777786254883, + "rewards/margins": 6.748598575592041, + "rewards/rejected": -3.4328207969665527, + "step": 15446 + }, + { + "epoch": 3.86, + "grad_norm": 7.101632118225098, + "learning_rate": 1.2191706466293528e-06, + "logits/chosen": -0.5719822645187378, + "logits/rejected": -0.6942264437675476, + "logps/chosen": -60.12741470336914, + "logps/rejected": -85.19740295410156, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068328619003296, + "rewards/margins": 7.321460723876953, + "rewards/rejected": -4.2531328201293945, + "step": 15447 + }, + { + "epoch": 3.86, + "grad_norm": 5.779207229614258, + "learning_rate": 1.2186563587495426e-06, + "logits/chosen": -0.6020088791847229, + "logits/rejected": -0.6463311314582825, + "logps/chosen": -52.35222625732422, + "logps/rejected": -119.72545623779297, + "loss": 0.5317, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9969043731689453, + "rewards/margins": 7.976823329925537, + "rewards/rejected": -4.979918479919434, + "step": 15448 + }, + { + "epoch": 3.86, + "grad_norm": 33.44136428833008, + "learning_rate": 1.2181421643107538e-06, + "logits/chosen": -0.4559203088283539, + "logits/rejected": -0.5145747661590576, + "logps/chosen": -61.784645080566406, + "logps/rejected": -103.33641815185547, + "loss": 0.7045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.129288673400879, + "rewards/margins": 5.936765193939209, + "rewards/rejected": -2.807476043701172, + "step": 15449 + }, + { + "epoch": 3.86, + "grad_norm": 4.043837547302246, + "learning_rate": 1.2176280633256965e-06, + "logits/chosen": -0.5968406796455383, + "logits/rejected": -0.6665615439414978, + "logps/chosen": -58.822998046875, + "logps/rejected": -97.38134002685547, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2831859588623047, + "rewards/margins": 7.345506191253662, + "rewards/rejected": -4.062319755554199, + "step": 15450 + }, + { + "epoch": 3.87, + "grad_norm": 3.5750365257263184, + "learning_rate": 1.2171140558070727e-06, + "logits/chosen": -0.6141926050186157, + "logits/rejected": -0.6512405276298523, + "logps/chosen": -50.89470672607422, + "logps/rejected": -101.59810638427734, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0763680934906006, + "rewards/margins": 6.7768073081970215, + "rewards/rejected": -3.700439453125, + "step": 15451 + }, + { + "epoch": 3.87, + "grad_norm": 4.666755676269531, + "learning_rate": 1.2166001417675826e-06, + "logits/chosen": -0.47859370708465576, + "logits/rejected": -0.5448219776153564, + "logps/chosen": -59.32149124145508, + "logps/rejected": -111.22845458984375, + "loss": 0.6441, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8062803745269775, + "rewards/margins": 6.423867225646973, + "rewards/rejected": -3.6175873279571533, + "step": 15452 + }, + { + "epoch": 3.87, + "grad_norm": 3.312018871307373, + "learning_rate": 1.2160863212199276e-06, + "logits/chosen": -0.5189066529273987, + "logits/rejected": -0.6217597723007202, + "logps/chosen": -55.70698547363281, + "logps/rejected": -101.1551742553711, + "loss": 0.6006, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.167635679244995, + "rewards/margins": 7.610019683837891, + "rewards/rejected": -4.442383766174316, + "step": 15453 + }, + { + "epoch": 3.87, + "grad_norm": 4.911255836486816, + "learning_rate": 1.215572594176807e-06, + "logits/chosen": -0.5143938064575195, + "logits/rejected": -0.6278514862060547, + "logps/chosen": -50.924503326416016, + "logps/rejected": -110.64825439453125, + "loss": 0.5626, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.316922187805176, + "rewards/margins": 7.685629844665527, + "rewards/rejected": -4.36870813369751, + "step": 15454 + }, + { + "epoch": 3.87, + "grad_norm": 4.864859104156494, + "learning_rate": 1.2150589606509101e-06, + "logits/chosen": -0.5322072505950928, + "logits/rejected": -0.5970773100852966, + "logps/chosen": -57.95988464355469, + "logps/rejected": -91.38922119140625, + "loss": 0.6703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3623719215393066, + "rewards/margins": 6.255171775817871, + "rewards/rejected": -2.8928000926971436, + "step": 15455 + }, + { + "epoch": 3.87, + "grad_norm": 4.0405497550964355, + "learning_rate": 1.2145454206549322e-06, + "logits/chosen": -0.6646655797958374, + "logits/rejected": -0.7296157479286194, + "logps/chosen": -53.0057373046875, + "logps/rejected": -103.11029052734375, + "loss": 0.6668, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9944493770599365, + "rewards/margins": 6.532592296600342, + "rewards/rejected": -3.5381436347961426, + "step": 15456 + }, + { + "epoch": 3.87, + "grad_norm": 7.346266269683838, + "learning_rate": 1.214031974201566e-06, + "logits/chosen": -0.4957734942436218, + "logits/rejected": -0.6093454360961914, + "logps/chosen": -58.095489501953125, + "logps/rejected": -116.39945983886719, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1453919410705566, + "rewards/margins": 7.330817222595215, + "rewards/rejected": -4.1854248046875, + "step": 15457 + }, + { + "epoch": 3.87, + "grad_norm": 4.733504295349121, + "learning_rate": 1.2135186213034938e-06, + "logits/chosen": -0.5082066059112549, + "logits/rejected": -0.5755116939544678, + "logps/chosen": -85.08873748779297, + "logps/rejected": -111.61627197265625, + "loss": 0.6896, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3093502521514893, + "rewards/margins": 7.120368480682373, + "rewards/rejected": -3.8110175132751465, + "step": 15458 + }, + { + "epoch": 3.87, + "grad_norm": 4.408658027648926, + "learning_rate": 1.2130053619734045e-06, + "logits/chosen": -0.48070788383483887, + "logits/rejected": -0.5479565858840942, + "logps/chosen": -56.253150939941406, + "logps/rejected": -116.7020263671875, + "loss": 0.5802, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.016026735305786, + "rewards/margins": 7.434656143188477, + "rewards/rejected": -4.418629169464111, + "step": 15459 + }, + { + "epoch": 3.87, + "grad_norm": 3.2602994441986084, + "learning_rate": 1.2124921962239817e-06, + "logits/chosen": -0.6543916463851929, + "logits/rejected": -0.741502583026886, + "logps/chosen": -57.91218948364258, + "logps/rejected": -100.75143432617188, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0069193840026855, + "rewards/margins": 7.006558895111084, + "rewards/rejected": -3.9996399879455566, + "step": 15460 + }, + { + "epoch": 3.87, + "grad_norm": 4.684680938720703, + "learning_rate": 1.2119791240679052e-06, + "logits/chosen": -0.5547289252281189, + "logits/rejected": -0.6436570882797241, + "logps/chosen": -53.06072998046875, + "logps/rejected": -115.1455307006836, + "loss": 0.6105, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.225191354751587, + "rewards/margins": 7.825146675109863, + "rewards/rejected": -4.599954605102539, + "step": 15461 + }, + { + "epoch": 3.87, + "grad_norm": 2.5195250511169434, + "learning_rate": 1.2114661455178523e-06, + "logits/chosen": -0.5735313892364502, + "logits/rejected": -0.6669318675994873, + "logps/chosen": -51.862064361572266, + "logps/rejected": -86.22378540039062, + "loss": 0.5399, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1686434745788574, + "rewards/margins": 6.531891345977783, + "rewards/rejected": -3.3632471561431885, + "step": 15462 + }, + { + "epoch": 3.87, + "grad_norm": 19.412796020507812, + "learning_rate": 1.2109532605865021e-06, + "logits/chosen": -0.5036898255348206, + "logits/rejected": -0.5747915506362915, + "logps/chosen": -77.885986328125, + "logps/rejected": -99.0876693725586, + "loss": 0.8184, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.875541925430298, + "rewards/margins": 6.123344421386719, + "rewards/rejected": -3.247803211212158, + "step": 15463 + }, + { + "epoch": 3.87, + "grad_norm": 8.553492546081543, + "learning_rate": 1.2104404692865257e-06, + "logits/chosen": -0.44720202684402466, + "logits/rejected": -0.4788482189178467, + "logps/chosen": -53.776031494140625, + "logps/rejected": -105.1502914428711, + "loss": 0.6822, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0826187133789062, + "rewards/margins": 5.680396556854248, + "rewards/rejected": -2.597777843475342, + "step": 15464 + }, + { + "epoch": 3.87, + "grad_norm": 6.065098762512207, + "learning_rate": 1.2099277716305973e-06, + "logits/chosen": -0.5592299699783325, + "logits/rejected": -0.6446666717529297, + "logps/chosen": -66.31123352050781, + "logps/rejected": -103.70252990722656, + "loss": 0.692, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0850589275360107, + "rewards/margins": 6.984133720397949, + "rewards/rejected": -3.8990747928619385, + "step": 15465 + }, + { + "epoch": 3.87, + "grad_norm": 18.956239700317383, + "learning_rate": 1.2094151676313848e-06, + "logits/chosen": -0.513411283493042, + "logits/rejected": -0.6044921875, + "logps/chosen": -56.904296875, + "logps/rejected": -96.41688537597656, + "loss": 0.6781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1989657878875732, + "rewards/margins": 7.101878643035889, + "rewards/rejected": -3.9029133319854736, + "step": 15466 + }, + { + "epoch": 3.87, + "grad_norm": 5.589298725128174, + "learning_rate": 1.2089026573015545e-06, + "logits/chosen": -0.6139202117919922, + "logits/rejected": -0.6770660877227783, + "logps/chosen": -47.15411376953125, + "logps/rejected": -125.53622436523438, + "loss": 0.5394, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9512760639190674, + "rewards/margins": 9.092562675476074, + "rewards/rejected": -6.141287326812744, + "step": 15467 + }, + { + "epoch": 3.87, + "grad_norm": 5.909462928771973, + "learning_rate": 1.208390240653773e-06, + "logits/chosen": -0.5653641819953918, + "logits/rejected": -0.6446759104728699, + "logps/chosen": -45.01156997680664, + "logps/rejected": -81.7794418334961, + "loss": 0.6955, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3963119983673096, + "rewards/margins": 6.09589958190918, + "rewards/rejected": -2.699587345123291, + "step": 15468 + }, + { + "epoch": 3.87, + "grad_norm": 5.7993316650390625, + "learning_rate": 1.2078779177007016e-06, + "logits/chosen": -0.6221427321434021, + "logits/rejected": -0.6811193823814392, + "logps/chosen": -60.58799743652344, + "logps/rejected": -97.10613250732422, + "loss": 0.8291, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1043121814727783, + "rewards/margins": 5.984259128570557, + "rewards/rejected": -2.879946708679199, + "step": 15469 + }, + { + "epoch": 3.87, + "grad_norm": 15.47253704071045, + "learning_rate": 1.207365688454999e-06, + "logits/chosen": -0.5467201471328735, + "logits/rejected": -0.636522650718689, + "logps/chosen": -55.81161117553711, + "logps/rejected": -106.05530548095703, + "loss": 0.6815, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.117683172225952, + "rewards/margins": 7.7447357177734375, + "rewards/rejected": -4.627053260803223, + "step": 15470 + }, + { + "epoch": 3.87, + "grad_norm": 5.653203964233398, + "learning_rate": 1.206853552929324e-06, + "logits/chosen": -0.5460619330406189, + "logits/rejected": -0.6349713802337646, + "logps/chosen": -45.961158752441406, + "logps/rejected": -103.51920318603516, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.345609664916992, + "rewards/margins": 7.483458995819092, + "rewards/rejected": -4.137848854064941, + "step": 15471 + }, + { + "epoch": 3.87, + "grad_norm": 8.637187957763672, + "learning_rate": 1.2063415111363352e-06, + "logits/chosen": -0.515347957611084, + "logits/rejected": -0.6287721395492554, + "logps/chosen": -64.01811218261719, + "logps/rejected": -99.28286743164062, + "loss": 0.7076, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9967000484466553, + "rewards/margins": 5.823183059692383, + "rewards/rejected": -2.8264827728271484, + "step": 15472 + }, + { + "epoch": 3.87, + "grad_norm": 6.609061241149902, + "learning_rate": 1.20582956308868e-06, + "logits/chosen": -0.5918762683868408, + "logits/rejected": -0.6773879528045654, + "logps/chosen": -59.50681686401367, + "logps/rejected": -99.73947143554688, + "loss": 0.7182, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2110233306884766, + "rewards/margins": 6.312588214874268, + "rewards/rejected": -3.101565361022949, + "step": 15473 + }, + { + "epoch": 3.87, + "grad_norm": 2.856846332550049, + "learning_rate": 1.2053177087990125e-06, + "logits/chosen": -0.4075099229812622, + "logits/rejected": -0.5129702091217041, + "logps/chosen": -57.620018005371094, + "logps/rejected": -139.82290649414062, + "loss": 0.5163, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.367812395095825, + "rewards/margins": 8.624103546142578, + "rewards/rejected": -5.256290435791016, + "step": 15474 + }, + { + "epoch": 3.87, + "grad_norm": 3.9013872146606445, + "learning_rate": 1.2048059482799813e-06, + "logits/chosen": -0.5354269742965698, + "logits/rejected": -0.6068872213363647, + "logps/chosen": -49.80840301513672, + "logps/rejected": -115.20925903320312, + "loss": 0.5826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1218161582946777, + "rewards/margins": 8.157817840576172, + "rewards/rejected": -5.036002159118652, + "step": 15475 + }, + { + "epoch": 3.87, + "grad_norm": 17.197124481201172, + "learning_rate": 1.2042942815442322e-06, + "logits/chosen": -0.45770782232284546, + "logits/rejected": -0.5855571627616882, + "logps/chosen": -64.76763153076172, + "logps/rejected": -90.13641357421875, + "loss": 0.8527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7794220447540283, + "rewards/margins": 5.766396522521973, + "rewards/rejected": -2.9869742393493652, + "step": 15476 + }, + { + "epoch": 3.87, + "grad_norm": 6.7407145500183105, + "learning_rate": 1.2037827086044073e-06, + "logits/chosen": -0.5751825571060181, + "logits/rejected": -0.6275165677070618, + "logps/chosen": -55.64269256591797, + "logps/rejected": -113.76233673095703, + "loss": 0.695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.133023977279663, + "rewards/margins": 8.038200378417969, + "rewards/rejected": -4.905175685882568, + "step": 15477 + }, + { + "epoch": 3.87, + "grad_norm": 34.614627838134766, + "learning_rate": 1.2032712294731508e-06, + "logits/chosen": -0.5738189220428467, + "logits/rejected": -0.6373755931854248, + "logps/chosen": -69.10615539550781, + "logps/rejected": -94.19071197509766, + "loss": 0.7987, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0127131938934326, + "rewards/margins": 5.946612358093262, + "rewards/rejected": -2.9338996410369873, + "step": 15478 + }, + { + "epoch": 3.87, + "grad_norm": 4.519959449768066, + "learning_rate": 1.2027598441631006e-06, + "logits/chosen": -0.5768100619316101, + "logits/rejected": -0.6937830448150635, + "logps/chosen": -63.20181655883789, + "logps/rejected": -114.88429260253906, + "loss": 0.5733, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8043835163116455, + "rewards/margins": 8.006146430969238, + "rewards/rejected": -5.201763153076172, + "step": 15479 + }, + { + "epoch": 3.87, + "grad_norm": 6.3848772048950195, + "learning_rate": 1.202248552686892e-06, + "logits/chosen": -0.5527521371841431, + "logits/rejected": -0.5816659927368164, + "logps/chosen": -55.321537017822266, + "logps/rejected": -104.9526596069336, + "loss": 0.6773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9047610759735107, + "rewards/margins": 5.354857921600342, + "rewards/rejected": -2.450096845626831, + "step": 15480 + }, + { + "epoch": 3.87, + "grad_norm": 1.7874724864959717, + "learning_rate": 1.2017373550571626e-06, + "logits/chosen": -0.5986299514770508, + "logits/rejected": -0.6552228331565857, + "logps/chosen": -43.029022216796875, + "logps/rejected": -110.4759521484375, + "loss": 0.5211, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081988573074341, + "rewards/margins": 8.415098190307617, + "rewards/rejected": -5.333109378814697, + "step": 15481 + }, + { + "epoch": 3.87, + "grad_norm": 7.351468086242676, + "learning_rate": 1.2012262512865435e-06, + "logits/chosen": -0.5982190370559692, + "logits/rejected": -0.6443260312080383, + "logps/chosen": -55.44385528564453, + "logps/rejected": -106.7584457397461, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0324530601501465, + "rewards/margins": 6.037453651428223, + "rewards/rejected": -3.0050010681152344, + "step": 15482 + }, + { + "epoch": 3.87, + "grad_norm": 3.7988240718841553, + "learning_rate": 1.2007152413876627e-06, + "logits/chosen": -0.5525242686271667, + "logits/rejected": -0.6435020565986633, + "logps/chosen": -48.57542037963867, + "logps/rejected": -124.8749008178711, + "loss": 0.5849, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1629204750061035, + "rewards/margins": 8.381246566772461, + "rewards/rejected": -5.218326568603516, + "step": 15483 + }, + { + "epoch": 3.87, + "grad_norm": 3.896878719329834, + "learning_rate": 1.200204325373151e-06, + "logits/chosen": -0.4979753792285919, + "logits/rejected": -0.6061709523200989, + "logps/chosen": -49.918338775634766, + "logps/rejected": -97.34542846679688, + "loss": 0.605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0868868827819824, + "rewards/margins": 6.846510887145996, + "rewards/rejected": -3.7596237659454346, + "step": 15484 + }, + { + "epoch": 3.87, + "grad_norm": 5.304416179656982, + "learning_rate": 1.19969350325563e-06, + "logits/chosen": -0.5886765718460083, + "logits/rejected": -0.6398263573646545, + "logps/chosen": -41.99349594116211, + "logps/rejected": -123.9572525024414, + "loss": 0.5642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119960308074951, + "rewards/margins": 8.5136137008667, + "rewards/rejected": -5.3936543464660645, + "step": 15485 + }, + { + "epoch": 3.87, + "grad_norm": 28.538715362548828, + "learning_rate": 1.199182775047727e-06, + "logits/chosen": -0.4841367304325104, + "logits/rejected": -0.5607098340988159, + "logps/chosen": -60.92277526855469, + "logps/rejected": -100.51014709472656, + "loss": 0.6828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.746912956237793, + "rewards/margins": 6.903356075286865, + "rewards/rejected": -4.156442642211914, + "step": 15486 + }, + { + "epoch": 3.87, + "grad_norm": 7.132336616516113, + "learning_rate": 1.1986721407620582e-06, + "logits/chosen": -0.5068855881690979, + "logits/rejected": -0.6268173456192017, + "logps/chosen": -56.822052001953125, + "logps/rejected": -107.77084350585938, + "loss": 0.5266, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9669618606567383, + "rewards/margins": 7.698714256286621, + "rewards/rejected": -4.731752395629883, + "step": 15487 + }, + { + "epoch": 3.87, + "grad_norm": 5.6048502922058105, + "learning_rate": 1.1981616004112456e-06, + "logits/chosen": -0.5854383111000061, + "logits/rejected": -0.6153322458267212, + "logps/chosen": -54.75462341308594, + "logps/rejected": -125.75707244873047, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0374860763549805, + "rewards/margins": 8.519781112670898, + "rewards/rejected": -5.482295513153076, + "step": 15488 + }, + { + "epoch": 3.87, + "grad_norm": 5.196719169616699, + "learning_rate": 1.197651154007904e-06, + "logits/chosen": -0.48955947160720825, + "logits/rejected": -0.5812127590179443, + "logps/chosen": -62.128849029541016, + "logps/rejected": -108.21113586425781, + "loss": 0.5825, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1350369453430176, + "rewards/margins": 7.827403545379639, + "rewards/rejected": -4.692366600036621, + "step": 15489 + }, + { + "epoch": 3.88, + "grad_norm": 8.759605407714844, + "learning_rate": 1.1971408015646451e-06, + "logits/chosen": -0.5342729687690735, + "logits/rejected": -0.6592133641242981, + "logps/chosen": -60.52075958251953, + "logps/rejected": -105.69577026367188, + "loss": 0.58, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.875443458557129, + "rewards/margins": 7.616447448730469, + "rewards/rejected": -4.74100399017334, + "step": 15490 + }, + { + "epoch": 3.88, + "grad_norm": 3.025350570678711, + "learning_rate": 1.1966305430940834e-06, + "logits/chosen": -0.577305793762207, + "logits/rejected": -0.6529635190963745, + "logps/chosen": -42.6414909362793, + "logps/rejected": -116.24603271484375, + "loss": 0.5291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.494898796081543, + "rewards/margins": 8.296236991882324, + "rewards/rejected": -4.801338195800781, + "step": 15491 + }, + { + "epoch": 3.88, + "grad_norm": 5.857380390167236, + "learning_rate": 1.196120378608825e-06, + "logits/chosen": -0.5777034759521484, + "logits/rejected": -0.6770694255828857, + "logps/chosen": -44.243167877197266, + "logps/rejected": -98.20628356933594, + "loss": 0.5672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.278200626373291, + "rewards/margins": 7.012661457061768, + "rewards/rejected": -3.7344601154327393, + "step": 15492 + }, + { + "epoch": 3.88, + "grad_norm": 15.93410587310791, + "learning_rate": 1.1956103081214797e-06, + "logits/chosen": -0.49507004022598267, + "logits/rejected": -0.5986016988754272, + "logps/chosen": -53.008644104003906, + "logps/rejected": -103.6434555053711, + "loss": 0.7032, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.176551103591919, + "rewards/margins": 7.215043544769287, + "rewards/rejected": -4.0384931564331055, + "step": 15493 + }, + { + "epoch": 3.88, + "grad_norm": 3.6285789012908936, + "learning_rate": 1.1951003316446497e-06, + "logits/chosen": -0.4872879683971405, + "logits/rejected": -0.5183694362640381, + "logps/chosen": -53.03996276855469, + "logps/rejected": -114.25859069824219, + "loss": 0.6031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8224384784698486, + "rewards/margins": 7.063007354736328, + "rewards/rejected": -4.240569114685059, + "step": 15494 + }, + { + "epoch": 3.88, + "grad_norm": 11.534250259399414, + "learning_rate": 1.1945904491909366e-06, + "logits/chosen": -0.5586291551589966, + "logits/rejected": -0.6581593751907349, + "logps/chosen": -53.47526550292969, + "logps/rejected": -123.47518157958984, + "loss": 0.7138, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0281331539154053, + "rewards/margins": 8.649775505065918, + "rewards/rejected": -5.621642589569092, + "step": 15495 + }, + { + "epoch": 3.88, + "grad_norm": 3.7828049659729004, + "learning_rate": 1.1940806607729422e-06, + "logits/chosen": -0.46570679545402527, + "logits/rejected": -0.5717236995697021, + "logps/chosen": -52.85555648803711, + "logps/rejected": -118.49893188476562, + "loss": 0.5962, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.270627975463867, + "rewards/margins": 8.090608596801758, + "rewards/rejected": -4.819980621337891, + "step": 15496 + }, + { + "epoch": 3.88, + "grad_norm": 19.771957397460938, + "learning_rate": 1.1935709664032629e-06, + "logits/chosen": -0.5599496960639954, + "logits/rejected": -0.6111518740653992, + "logps/chosen": -42.939544677734375, + "logps/rejected": -101.13391876220703, + "loss": 0.6087, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.104835271835327, + "rewards/margins": 6.76817512512207, + "rewards/rejected": -3.6633405685424805, + "step": 15497 + }, + { + "epoch": 3.88, + "grad_norm": 3.6503050327301025, + "learning_rate": 1.1930613660944928e-06, + "logits/chosen": -0.5784379839897156, + "logits/rejected": -0.6320173144340515, + "logps/chosen": -47.78965759277344, + "logps/rejected": -120.66612243652344, + "loss": 0.6282, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.209704637527466, + "rewards/margins": 7.617776393890381, + "rewards/rejected": -4.408071994781494, + "step": 15498 + }, + { + "epoch": 3.88, + "grad_norm": 5.098639965057373, + "learning_rate": 1.1925518598592251e-06, + "logits/chosen": -0.511232316493988, + "logits/rejected": -0.600299596786499, + "logps/chosen": -58.23720932006836, + "logps/rejected": -114.86735534667969, + "loss": 0.6431, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.997645378112793, + "rewards/margins": 6.554162502288818, + "rewards/rejected": -3.5565173625946045, + "step": 15499 + }, + { + "epoch": 3.88, + "grad_norm": 8.815903663635254, + "learning_rate": 1.1920424477100534e-06, + "logits/chosen": -0.551744282245636, + "logits/rejected": -0.6732239127159119, + "logps/chosen": -58.566802978515625, + "logps/rejected": -101.91918182373047, + "loss": 0.6065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0430033206939697, + "rewards/margins": 7.211082458496094, + "rewards/rejected": -4.168079376220703, + "step": 15500 + }, + { + "epoch": 3.88, + "grad_norm": 4.128093242645264, + "learning_rate": 1.1915331296595606e-06, + "logits/chosen": -0.5203929543495178, + "logits/rejected": -0.6053788065910339, + "logps/chosen": -54.21609878540039, + "logps/rejected": -123.04259490966797, + "loss": 0.6559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0625338554382324, + "rewards/margins": 7.569573402404785, + "rewards/rejected": -4.5070390701293945, + "step": 15501 + }, + { + "epoch": 3.88, + "grad_norm": 5.719676494598389, + "learning_rate": 1.191023905720335e-06, + "logits/chosen": -0.5377699136734009, + "logits/rejected": -0.6025722026824951, + "logps/chosen": -52.05109405517578, + "logps/rejected": -109.50543212890625, + "loss": 0.6798, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8189613819122314, + "rewards/margins": 6.3364338874816895, + "rewards/rejected": -3.517472267150879, + "step": 15502 + }, + { + "epoch": 3.88, + "grad_norm": 2.359844923019409, + "learning_rate": 1.1905147759049613e-06, + "logits/chosen": -0.5553095936775208, + "logits/rejected": -0.6572771072387695, + "logps/chosen": -44.966773986816406, + "logps/rejected": -94.09745788574219, + "loss": 0.535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4447436332702637, + "rewards/margins": 7.606648921966553, + "rewards/rejected": -4.161904811859131, + "step": 15503 + }, + { + "epoch": 3.88, + "grad_norm": 4.367923259735107, + "learning_rate": 1.1900057402260196e-06, + "logits/chosen": -0.5199505686759949, + "logits/rejected": -0.6153028011322021, + "logps/chosen": -56.416603088378906, + "logps/rejected": -105.62966918945312, + "loss": 0.6745, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.164395570755005, + "rewards/margins": 6.775182723999023, + "rewards/rejected": -3.6107873916625977, + "step": 15504 + }, + { + "epoch": 3.88, + "grad_norm": 7.662238121032715, + "learning_rate": 1.1894967986960877e-06, + "logits/chosen": -0.45772314071655273, + "logits/rejected": -0.5431278347969055, + "logps/chosen": -64.985107421875, + "logps/rejected": -113.83531188964844, + "loss": 0.7378, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.83853816986084, + "rewards/margins": 6.891755104064941, + "rewards/rejected": -4.05321741104126, + "step": 15505 + }, + { + "epoch": 3.88, + "grad_norm": 7.435196399688721, + "learning_rate": 1.1889879513277435e-06, + "logits/chosen": -0.47929275035858154, + "logits/rejected": -0.5650480389595032, + "logps/chosen": -57.39674377441406, + "logps/rejected": -103.31520080566406, + "loss": 0.6523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.7461342811584473, + "rewards/margins": 6.454503536224365, + "rewards/rejected": -2.708369731903076, + "step": 15506 + }, + { + "epoch": 3.88, + "grad_norm": 4.122817039489746, + "learning_rate": 1.1884791981335609e-06, + "logits/chosen": -0.5276426076889038, + "logits/rejected": -0.6005302667617798, + "logps/chosen": -57.05078125, + "logps/rejected": -117.02230834960938, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2527167797088623, + "rewards/margins": 7.564146518707275, + "rewards/rejected": -4.311429023742676, + "step": 15507 + }, + { + "epoch": 3.88, + "grad_norm": 6.387255668640137, + "learning_rate": 1.1879705391261097e-06, + "logits/chosen": -0.48843279480934143, + "logits/rejected": -0.5839068293571472, + "logps/chosen": -63.01073455810547, + "logps/rejected": -99.8736343383789, + "loss": 0.8515, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8997409343719482, + "rewards/margins": 6.225711822509766, + "rewards/rejected": -3.3259716033935547, + "step": 15508 + }, + { + "epoch": 3.88, + "grad_norm": 5.2781853675842285, + "learning_rate": 1.1874619743179632e-06, + "logits/chosen": -0.48887401819229126, + "logits/rejected": -0.5830244421958923, + "logps/chosen": -55.34789276123047, + "logps/rejected": -118.68370056152344, + "loss": 0.625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2197651863098145, + "rewards/margins": 7.8226776123046875, + "rewards/rejected": -4.602912902832031, + "step": 15509 + }, + { + "epoch": 3.88, + "grad_norm": 7.32895565032959, + "learning_rate": 1.1869535037216846e-06, + "logits/chosen": -0.5384665131568909, + "logits/rejected": -0.6665134429931641, + "logps/chosen": -65.14520263671875, + "logps/rejected": -109.83648681640625, + "loss": 0.6525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.893167018890381, + "rewards/margins": 7.053640365600586, + "rewards/rejected": -4.160473346710205, + "step": 15510 + }, + { + "epoch": 3.88, + "grad_norm": 4.880385398864746, + "learning_rate": 1.1864451273498423e-06, + "logits/chosen": -0.559620201587677, + "logits/rejected": -0.688122034072876, + "logps/chosen": -52.51599884033203, + "logps/rejected": -89.38705444335938, + "loss": 0.6009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.186652660369873, + "rewards/margins": 7.18829345703125, + "rewards/rejected": -4.001640796661377, + "step": 15511 + }, + { + "epoch": 3.88, + "grad_norm": 3.1601438522338867, + "learning_rate": 1.1859368452149972e-06, + "logits/chosen": -0.46906694769859314, + "logits/rejected": -0.4878045916557312, + "logps/chosen": -70.97126007080078, + "logps/rejected": -135.23841857910156, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0967495441436768, + "rewards/margins": 7.498579025268555, + "rewards/rejected": -4.401829242706299, + "step": 15512 + }, + { + "epoch": 3.88, + "grad_norm": 4.911108016967773, + "learning_rate": 1.185428657329708e-06, + "logits/chosen": -0.5102104544639587, + "logits/rejected": -0.5794544816017151, + "logps/chosen": -48.952388763427734, + "logps/rejected": -111.34455871582031, + "loss": 0.5429, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.973416805267334, + "rewards/margins": 7.376152992248535, + "rewards/rejected": -4.402736186981201, + "step": 15513 + }, + { + "epoch": 3.88, + "grad_norm": 5.167623519897461, + "learning_rate": 1.1849205637065352e-06, + "logits/chosen": -0.5727243423461914, + "logits/rejected": -0.649549126625061, + "logps/chosen": -54.9078483581543, + "logps/rejected": -88.72190856933594, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9735639095306396, + "rewards/margins": 6.5236287117004395, + "rewards/rejected": -3.5500648021698, + "step": 15514 + }, + { + "epoch": 3.88, + "grad_norm": 4.266493797302246, + "learning_rate": 1.1844125643580335e-06, + "logits/chosen": -0.5280985832214355, + "logits/rejected": -0.6081660389900208, + "logps/chosen": -54.473323822021484, + "logps/rejected": -102.32257080078125, + "loss": 0.6562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.356961727142334, + "rewards/margins": 7.56602668762207, + "rewards/rejected": -4.2090654373168945, + "step": 15515 + }, + { + "epoch": 3.88, + "grad_norm": 17.35466194152832, + "learning_rate": 1.183904659296754e-06, + "logits/chosen": -0.5972073078155518, + "logits/rejected": -0.6888059973716736, + "logps/chosen": -60.876529693603516, + "logps/rejected": -91.73236083984375, + "loss": 0.7794, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7029147148132324, + "rewards/margins": 5.436697006225586, + "rewards/rejected": -2.7337820529937744, + "step": 15516 + }, + { + "epoch": 3.88, + "grad_norm": 5.9875688552856445, + "learning_rate": 1.1833968485352498e-06, + "logits/chosen": -0.6057272553443909, + "logits/rejected": -0.6750332117080688, + "logps/chosen": -51.171539306640625, + "logps/rejected": -93.56497192382812, + "loss": 0.6653, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0416877269744873, + "rewards/margins": 6.083175182342529, + "rewards/rejected": -3.041487216949463, + "step": 15517 + }, + { + "epoch": 3.88, + "grad_norm": 7.544384956359863, + "learning_rate": 1.1828891320860703e-06, + "logits/chosen": -0.5055170655250549, + "logits/rejected": -0.5877372026443481, + "logps/chosen": -51.116607666015625, + "logps/rejected": -94.67157745361328, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.341634511947632, + "rewards/margins": 6.1247124671936035, + "rewards/rejected": -2.7830772399902344, + "step": 15518 + }, + { + "epoch": 3.88, + "grad_norm": 17.166671752929688, + "learning_rate": 1.1823815099617602e-06, + "logits/chosen": -0.5036455392837524, + "logits/rejected": -0.5705075860023499, + "logps/chosen": -55.47649383544922, + "logps/rejected": -98.41426849365234, + "loss": 0.7307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8856027126312256, + "rewards/margins": 5.62380838394165, + "rewards/rejected": -2.738205671310425, + "step": 15519 + }, + { + "epoch": 3.88, + "grad_norm": 3.3668324947357178, + "learning_rate": 1.1818739821748627e-06, + "logits/chosen": -0.5808084011077881, + "logits/rejected": -0.6921137571334839, + "logps/chosen": -59.59596252441406, + "logps/rejected": -110.98065948486328, + "loss": 0.5992, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3482470512390137, + "rewards/margins": 7.447089195251465, + "rewards/rejected": -4.098841667175293, + "step": 15520 + }, + { + "epoch": 3.88, + "grad_norm": 9.552129745483398, + "learning_rate": 1.1813665487379211e-06, + "logits/chosen": -0.5339801907539368, + "logits/rejected": -0.5866391658782959, + "logps/chosen": -49.030311584472656, + "logps/rejected": -119.70271301269531, + "loss": 0.5702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9534661769866943, + "rewards/margins": 7.516341686248779, + "rewards/rejected": -4.562875747680664, + "step": 15521 + }, + { + "epoch": 3.88, + "grad_norm": 15.553550720214844, + "learning_rate": 1.1808592096634736e-06, + "logits/chosen": -0.6361446380615234, + "logits/rejected": -0.6655739545822144, + "logps/chosen": -38.57068634033203, + "logps/rejected": -97.9493637084961, + "loss": 0.5769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1789023876190186, + "rewards/margins": 6.592645168304443, + "rewards/rejected": -3.413743257522583, + "step": 15522 + }, + { + "epoch": 3.88, + "grad_norm": 4.024151802062988, + "learning_rate": 1.180351964964056e-06, + "logits/chosen": -0.5026460886001587, + "logits/rejected": -0.5908030867576599, + "logps/chosen": -44.45473861694336, + "logps/rejected": -90.13165283203125, + "loss": 0.5221, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3455727100372314, + "rewards/margins": 7.409278869628906, + "rewards/rejected": -4.063705921173096, + "step": 15523 + }, + { + "epoch": 3.88, + "grad_norm": 12.272401809692383, + "learning_rate": 1.1798448146522057e-06, + "logits/chosen": -0.5224735736846924, + "logits/rejected": -0.6134597063064575, + "logps/chosen": -52.70143127441406, + "logps/rejected": -105.61993408203125, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860414981842041, + "rewards/margins": 6.985454082489014, + "rewards/rejected": -4.125039100646973, + "step": 15524 + }, + { + "epoch": 3.88, + "grad_norm": 4.195279598236084, + "learning_rate": 1.1793377587404536e-06, + "logits/chosen": -0.500969409942627, + "logits/rejected": -0.5669930577278137, + "logps/chosen": -56.68476104736328, + "logps/rejected": -112.15399932861328, + "loss": 0.6275, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.070323944091797, + "rewards/margins": 6.280558109283447, + "rewards/rejected": -3.2102339267730713, + "step": 15525 + }, + { + "epoch": 3.88, + "grad_norm": 23.26026153564453, + "learning_rate": 1.1788307972413276e-06, + "logits/chosen": -0.47501999139785767, + "logits/rejected": -0.6068766713142395, + "logps/chosen": -56.20423889160156, + "logps/rejected": -93.36778259277344, + "loss": 0.6447, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6231160163879395, + "rewards/margins": 6.798966407775879, + "rewards/rejected": -4.175849914550781, + "step": 15526 + }, + { + "epoch": 3.88, + "grad_norm": 4.9188103675842285, + "learning_rate": 1.1783239301673582e-06, + "logits/chosen": -0.6047185659408569, + "logits/rejected": -0.6588767766952515, + "logps/chosen": -47.39539337158203, + "logps/rejected": -116.93899536132812, + "loss": 0.5715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1778879165649414, + "rewards/margins": 7.666374683380127, + "rewards/rejected": -4.488487243652344, + "step": 15527 + }, + { + "epoch": 3.88, + "grad_norm": 6.746121406555176, + "learning_rate": 1.1778171575310681e-06, + "logits/chosen": -0.4494244456291199, + "logits/rejected": -0.6109058856964111, + "logps/chosen": -56.762760162353516, + "logps/rejected": -103.03311920166016, + "loss": 0.6288, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.938927173614502, + "rewards/margins": 7.213609218597412, + "rewards/rejected": -4.27468204498291, + "step": 15528 + }, + { + "epoch": 3.88, + "grad_norm": 6.73244047164917, + "learning_rate": 1.177310479344983e-06, + "logits/chosen": -0.5078492760658264, + "logits/rejected": -0.6179767847061157, + "logps/chosen": -49.249656677246094, + "logps/rejected": -100.33258056640625, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2000484466552734, + "rewards/margins": 7.397767543792725, + "rewards/rejected": -4.197718620300293, + "step": 15529 + }, + { + "epoch": 3.89, + "grad_norm": 6.289591312408447, + "learning_rate": 1.1768038956216205e-06, + "logits/chosen": -0.535190224647522, + "logits/rejected": -0.6173940896987915, + "logps/chosen": -65.71691131591797, + "logps/rejected": -116.29667663574219, + "loss": 0.6776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.870055675506592, + "rewards/margins": 7.446514129638672, + "rewards/rejected": -4.576458930969238, + "step": 15530 + }, + { + "epoch": 3.89, + "grad_norm": 2.0592424869537354, + "learning_rate": 1.1762974063735017e-06, + "logits/chosen": -0.5787147283554077, + "logits/rejected": -0.5972224473953247, + "logps/chosen": -57.09184646606445, + "logps/rejected": -135.14454650878906, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5854384899139404, + "rewards/margins": 8.581840515136719, + "rewards/rejected": -4.996401786804199, + "step": 15531 + }, + { + "epoch": 3.89, + "grad_norm": 6.8823089599609375, + "learning_rate": 1.175791011613141e-06, + "logits/chosen": -0.5322669148445129, + "logits/rejected": -0.6021853685379028, + "logps/chosen": -49.18212890625, + "logps/rejected": -109.84799194335938, + "loss": 0.6106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8373608589172363, + "rewards/margins": 6.976309776306152, + "rewards/rejected": -4.138948917388916, + "step": 15532 + }, + { + "epoch": 3.89, + "grad_norm": 15.949956893920898, + "learning_rate": 1.1752847113530509e-06, + "logits/chosen": -0.5450908541679382, + "logits/rejected": -0.6504409909248352, + "logps/chosen": -55.255916595458984, + "logps/rejected": -84.85350036621094, + "loss": 0.821, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.793245792388916, + "rewards/margins": 5.479240894317627, + "rewards/rejected": -2.6859946250915527, + "step": 15533 + }, + { + "epoch": 3.89, + "grad_norm": 2.6456005573272705, + "learning_rate": 1.1747785056057448e-06, + "logits/chosen": -0.5064582824707031, + "logits/rejected": -0.6238626837730408, + "logps/chosen": -52.347328186035156, + "logps/rejected": -82.13926696777344, + "loss": 0.5818, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.066617965698242, + "rewards/margins": 7.0677995681762695, + "rewards/rejected": -4.001181125640869, + "step": 15534 + }, + { + "epoch": 3.89, + "grad_norm": 4.672940254211426, + "learning_rate": 1.1742723943837292e-06, + "logits/chosen": -0.5596243739128113, + "logits/rejected": -0.6275997757911682, + "logps/chosen": -55.55071258544922, + "logps/rejected": -104.91800689697266, + "loss": 0.5903, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1900012493133545, + "rewards/margins": 6.635427474975586, + "rewards/rejected": -3.445425271987915, + "step": 15535 + }, + { + "epoch": 3.89, + "grad_norm": 6.369540214538574, + "learning_rate": 1.1737663776995135e-06, + "logits/chosen": -0.5197015404701233, + "logits/rejected": -0.6047170758247375, + "logps/chosen": -56.02325439453125, + "logps/rejected": -116.14590454101562, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1388816833496094, + "rewards/margins": 8.063773155212402, + "rewards/rejected": -4.924891471862793, + "step": 15536 + }, + { + "epoch": 3.89, + "grad_norm": 2.392627716064453, + "learning_rate": 1.1732604555656002e-06, + "logits/chosen": -0.4976511001586914, + "logits/rejected": -0.6405588984489441, + "logps/chosen": -62.788543701171875, + "logps/rejected": -91.85083770751953, + "loss": 0.6227, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.167201280593872, + "rewards/margins": 6.739485263824463, + "rewards/rejected": -3.5722837448120117, + "step": 15537 + }, + { + "epoch": 3.89, + "grad_norm": 4.180401802062988, + "learning_rate": 1.1727546279944902e-06, + "logits/chosen": -0.5882756114006042, + "logits/rejected": -0.7102140188217163, + "logps/chosen": -47.23119354248047, + "logps/rejected": -85.10015106201172, + "loss": 0.6524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2568247318267822, + "rewards/margins": 7.31690788269043, + "rewards/rejected": -4.060083389282227, + "step": 15538 + }, + { + "epoch": 3.89, + "grad_norm": 20.198226928710938, + "learning_rate": 1.172248894998685e-06, + "logits/chosen": -0.5236457586288452, + "logits/rejected": -0.602651834487915, + "logps/chosen": -55.37549591064453, + "logps/rejected": -101.01380920410156, + "loss": 0.6925, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9942688941955566, + "rewards/margins": 7.064345359802246, + "rewards/rejected": -4.0700764656066895, + "step": 15539 + }, + { + "epoch": 3.89, + "grad_norm": 36.22179412841797, + "learning_rate": 1.1717432565906817e-06, + "logits/chosen": -0.5511751770973206, + "logits/rejected": -0.6723352074623108, + "logps/chosen": -53.9811897277832, + "logps/rejected": -88.52789306640625, + "loss": 0.6855, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.54252552986145, + "rewards/margins": 6.810391426086426, + "rewards/rejected": -4.267866134643555, + "step": 15540 + }, + { + "epoch": 3.89, + "grad_norm": 4.047689437866211, + "learning_rate": 1.1712377127829722e-06, + "logits/chosen": -0.4981409013271332, + "logits/rejected": -0.6134226322174072, + "logps/chosen": -63.89628982543945, + "logps/rejected": -95.78919982910156, + "loss": 0.6389, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7924270629882812, + "rewards/margins": 5.951142311096191, + "rewards/rejected": -3.1587157249450684, + "step": 15541 + }, + { + "epoch": 3.89, + "grad_norm": 3.3907859325408936, + "learning_rate": 1.1707322635880519e-06, + "logits/chosen": -0.5218929052352905, + "logits/rejected": -0.5524013042449951, + "logps/chosen": -48.87081527709961, + "logps/rejected": -113.40093994140625, + "loss": 0.5704, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3098063468933105, + "rewards/margins": 7.320094585418701, + "rewards/rejected": -4.010288238525391, + "step": 15542 + }, + { + "epoch": 3.89, + "grad_norm": 2.6179089546203613, + "learning_rate": 1.1702269090184131e-06, + "logits/chosen": -0.5368845462799072, + "logits/rejected": -0.6462107300758362, + "logps/chosen": -45.10703659057617, + "logps/rejected": -119.11175537109375, + "loss": 0.4991, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.177476644515991, + "rewards/margins": 8.864418029785156, + "rewards/rejected": -5.686941146850586, + "step": 15543 + }, + { + "epoch": 3.89, + "grad_norm": 3.637228012084961, + "learning_rate": 1.1697216490865383e-06, + "logits/chosen": -0.5541399717330933, + "logits/rejected": -0.6497166156768799, + "logps/chosen": -51.83760452270508, + "logps/rejected": -111.69113159179688, + "loss": 0.5783, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0243072509765625, + "rewards/margins": 7.702914714813232, + "rewards/rejected": -4.678606986999512, + "step": 15544 + }, + { + "epoch": 3.89, + "grad_norm": 16.08521842956543, + "learning_rate": 1.1692164838049152e-06, + "logits/chosen": -0.5147128105163574, + "logits/rejected": -0.5467898845672607, + "logps/chosen": -45.583824157714844, + "logps/rejected": -120.94144439697266, + "loss": 0.574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1168723106384277, + "rewards/margins": 6.678488254547119, + "rewards/rejected": -3.5616157054901123, + "step": 15545 + }, + { + "epoch": 3.89, + "grad_norm": 4.346341609954834, + "learning_rate": 1.168711413186029e-06, + "logits/chosen": -0.5135836601257324, + "logits/rejected": -0.6246542930603027, + "logps/chosen": -50.41728973388672, + "logps/rejected": -108.315185546875, + "loss": 0.5074, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0870680809020996, + "rewards/margins": 7.325829029083252, + "rewards/rejected": -4.238760948181152, + "step": 15546 + }, + { + "epoch": 3.89, + "grad_norm": 4.25486946105957, + "learning_rate": 1.1682064372423584e-06, + "logits/chosen": -0.5189937949180603, + "logits/rejected": -0.5962228178977966, + "logps/chosen": -49.568115234375, + "logps/rejected": -101.71969604492188, + "loss": 0.5519, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.277348518371582, + "rewards/margins": 6.859777450561523, + "rewards/rejected": -3.5824294090270996, + "step": 15547 + }, + { + "epoch": 3.89, + "grad_norm": 5.80999755859375, + "learning_rate": 1.1677015559863818e-06, + "logits/chosen": -0.6063492894172668, + "logits/rejected": -0.6849933862686157, + "logps/chosen": -51.17173385620117, + "logps/rejected": -106.28598022460938, + "loss": 0.6569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.062466621398926, + "rewards/margins": 6.638694763183594, + "rewards/rejected": -3.576228141784668, + "step": 15548 + }, + { + "epoch": 3.89, + "grad_norm": 3.061896324157715, + "learning_rate": 1.167196769430577e-06, + "logits/chosen": -0.5083532333374023, + "logits/rejected": -0.5877768993377686, + "logps/chosen": -49.39814376831055, + "logps/rejected": -102.64928436279297, + "loss": 0.5284, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.338763475418091, + "rewards/margins": 8.090332984924316, + "rewards/rejected": -4.751570701599121, + "step": 15549 + }, + { + "epoch": 3.89, + "grad_norm": 3.4822349548339844, + "learning_rate": 1.1666920775874169e-06, + "logits/chosen": -0.48400670289993286, + "logits/rejected": -0.6128773093223572, + "logps/chosen": -57.04547119140625, + "logps/rejected": -105.22378540039062, + "loss": 0.5783, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.961012363433838, + "rewards/margins": 6.784646511077881, + "rewards/rejected": -3.823634386062622, + "step": 15550 + }, + { + "epoch": 3.89, + "grad_norm": 3.6339643001556396, + "learning_rate": 1.1661874804693713e-06, + "logits/chosen": -0.53676837682724, + "logits/rejected": -0.5717411041259766, + "logps/chosen": -47.59598159790039, + "logps/rejected": -126.5588150024414, + "loss": 0.5775, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.01871395111084, + "rewards/margins": 8.167131423950195, + "rewards/rejected": -5.148417949676514, + "step": 15551 + }, + { + "epoch": 3.89, + "grad_norm": 18.001930236816406, + "learning_rate": 1.1656829780889122e-06, + "logits/chosen": -0.5024031400680542, + "logits/rejected": -0.5621562004089355, + "logps/chosen": -57.604881286621094, + "logps/rejected": -118.71124267578125, + "loss": 0.7665, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6709723472595215, + "rewards/margins": 6.801360130310059, + "rewards/rejected": -4.130387306213379, + "step": 15552 + }, + { + "epoch": 3.89, + "grad_norm": 4.955105781555176, + "learning_rate": 1.1651785704585039e-06, + "logits/chosen": -0.5483638644218445, + "logits/rejected": -0.5731750726699829, + "logps/chosen": -40.88115692138672, + "logps/rejected": -111.27774047851562, + "loss": 0.5488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0931880474090576, + "rewards/margins": 6.605559349060059, + "rewards/rejected": -3.5123722553253174, + "step": 15553 + }, + { + "epoch": 3.89, + "grad_norm": 15.613824844360352, + "learning_rate": 1.1646742575906134e-06, + "logits/chosen": -0.6459758281707764, + "logits/rejected": -0.6521073579788208, + "logps/chosen": -41.565528869628906, + "logps/rejected": -118.8159408569336, + "loss": 0.6168, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.13775372505188, + "rewards/margins": 6.356461048126221, + "rewards/rejected": -3.218707323074341, + "step": 15554 + }, + { + "epoch": 3.89, + "grad_norm": 6.363983631134033, + "learning_rate": 1.1641700394977012e-06, + "logits/chosen": -0.4928838908672333, + "logits/rejected": -0.5739992260932922, + "logps/chosen": -50.009342193603516, + "logps/rejected": -107.2794418334961, + "loss": 0.6072, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.828514337539673, + "rewards/margins": 7.336513996124268, + "rewards/rejected": -4.507999897003174, + "step": 15555 + }, + { + "epoch": 3.89, + "grad_norm": 7.079568862915039, + "learning_rate": 1.1636659161922259e-06, + "logits/chosen": -0.5643417835235596, + "logits/rejected": -0.6481623649597168, + "logps/chosen": -52.31118392944336, + "logps/rejected": -94.45960998535156, + "loss": 0.6579, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.685807943344116, + "rewards/margins": 6.378241539001465, + "rewards/rejected": -3.6924335956573486, + "step": 15556 + }, + { + "epoch": 3.89, + "grad_norm": 3.6767468452453613, + "learning_rate": 1.163161887686648e-06, + "logits/chosen": -0.5476071834564209, + "logits/rejected": -0.6158214211463928, + "logps/chosen": -45.402000427246094, + "logps/rejected": -107.21604919433594, + "loss": 0.5794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.253037452697754, + "rewards/margins": 7.366533279418945, + "rewards/rejected": -4.113496780395508, + "step": 15557 + }, + { + "epoch": 3.89, + "grad_norm": 7.661406993865967, + "learning_rate": 1.1626579539934207e-06, + "logits/chosen": -0.5993818640708923, + "logits/rejected": -0.6747516989707947, + "logps/chosen": -51.31154251098633, + "logps/rejected": -134.0108184814453, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2237582206726074, + "rewards/margins": 8.691459655761719, + "rewards/rejected": -5.467700958251953, + "step": 15558 + }, + { + "epoch": 3.89, + "grad_norm": 82.70487976074219, + "learning_rate": 1.162154115124996e-06, + "logits/chosen": -0.5842651724815369, + "logits/rejected": -0.6540124416351318, + "logps/chosen": -56.590187072753906, + "logps/rejected": -88.05548858642578, + "loss": 0.7247, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8812601566314697, + "rewards/margins": 6.560420036315918, + "rewards/rejected": -3.679159641265869, + "step": 15559 + }, + { + "epoch": 3.89, + "grad_norm": 2.6581168174743652, + "learning_rate": 1.1616503710938249e-06, + "logits/chosen": -0.5235787034034729, + "logits/rejected": -0.6120628118515015, + "logps/chosen": -55.803855895996094, + "logps/rejected": -142.2129364013672, + "loss": 0.5297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.179772138595581, + "rewards/margins": 9.411417007446289, + "rewards/rejected": -6.231644153594971, + "step": 15560 + }, + { + "epoch": 3.89, + "grad_norm": 13.88192081451416, + "learning_rate": 1.1611467219123584e-06, + "logits/chosen": -0.6134637594223022, + "logits/rejected": -0.6869062185287476, + "logps/chosen": -59.08790588378906, + "logps/rejected": -95.32565307617188, + "loss": 0.6874, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9166884422302246, + "rewards/margins": 7.024853706359863, + "rewards/rejected": -4.108165740966797, + "step": 15561 + }, + { + "epoch": 3.89, + "grad_norm": 5.355652809143066, + "learning_rate": 1.160643167593037e-06, + "logits/chosen": -0.5401201248168945, + "logits/rejected": -0.6202885508537292, + "logps/chosen": -50.49747848510742, + "logps/rejected": -95.0787582397461, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0412001609802246, + "rewards/margins": 6.685131549835205, + "rewards/rejected": -3.6439309120178223, + "step": 15562 + }, + { + "epoch": 3.89, + "grad_norm": 3.2922723293304443, + "learning_rate": 1.1601397081483063e-06, + "logits/chosen": -0.6452580094337463, + "logits/rejected": -0.7152068614959717, + "logps/chosen": -41.848876953125, + "logps/rejected": -126.00553894042969, + "loss": 0.5218, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.39432430267334, + "rewards/margins": 8.21716022491455, + "rewards/rejected": -4.822835445404053, + "step": 15563 + }, + { + "epoch": 3.89, + "grad_norm": 11.14496898651123, + "learning_rate": 1.1596363435906095e-06, + "logits/chosen": -0.5287057161331177, + "logits/rejected": -0.6072377562522888, + "logps/chosen": -62.527496337890625, + "logps/rejected": -129.06646728515625, + "loss": 0.6509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7381155490875244, + "rewards/margins": 7.957136154174805, + "rewards/rejected": -5.219020366668701, + "step": 15564 + }, + { + "epoch": 3.89, + "grad_norm": 13.319318771362305, + "learning_rate": 1.159133073932383e-06, + "logits/chosen": -0.5697898268699646, + "logits/rejected": -0.6578400135040283, + "logps/chosen": -63.469417572021484, + "logps/rejected": -107.01100158691406, + "loss": 0.6521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.209373712539673, + "rewards/margins": 7.506733417510986, + "rewards/rejected": -4.297359943389893, + "step": 15565 + }, + { + "epoch": 3.89, + "grad_norm": 27.123470306396484, + "learning_rate": 1.1586298991860624e-06, + "logits/chosen": -0.46535101532936096, + "logits/rejected": -0.6075785160064697, + "logps/chosen": -71.32278442382812, + "logps/rejected": -84.95264434814453, + "loss": 0.8086, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.217815399169922, + "rewards/margins": 6.2235822677612305, + "rewards/rejected": -3.0057668685913086, + "step": 15566 + }, + { + "epoch": 3.89, + "grad_norm": 6.42883825302124, + "learning_rate": 1.1581268193640844e-06, + "logits/chosen": -0.5103433132171631, + "logits/rejected": -0.5614721179008484, + "logps/chosen": -53.09193420410156, + "logps/rejected": -117.28795623779297, + "loss": 0.6297, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4665353298187256, + "rewards/margins": 8.138023376464844, + "rewards/rejected": -4.671487808227539, + "step": 15567 + }, + { + "epoch": 3.89, + "grad_norm": 26.95769691467285, + "learning_rate": 1.1576238344788792e-06, + "logits/chosen": -0.5419622659683228, + "logits/rejected": -0.6483632922172546, + "logps/chosen": -48.9243049621582, + "logps/rejected": -98.93895721435547, + "loss": 0.6124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9686877727508545, + "rewards/margins": 6.457984924316406, + "rewards/rejected": -3.4892969131469727, + "step": 15568 + }, + { + "epoch": 3.89, + "grad_norm": 8.0504732131958, + "learning_rate": 1.1571209445428743e-06, + "logits/chosen": -0.4845488965511322, + "logits/rejected": -0.5993548631668091, + "logps/chosen": -51.626800537109375, + "logps/rejected": -97.03680419921875, + "loss": 0.6074, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098741054534912, + "rewards/margins": 6.151890754699707, + "rewards/rejected": -3.053149700164795, + "step": 15569 + }, + { + "epoch": 3.9, + "grad_norm": 6.018797874450684, + "learning_rate": 1.1566181495684997e-06, + "logits/chosen": -0.6214633584022522, + "logits/rejected": -0.6838581562042236, + "logps/chosen": -48.45224380493164, + "logps/rejected": -105.47967529296875, + "loss": 0.6048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2424533367156982, + "rewards/margins": 6.582001686096191, + "rewards/rejected": -3.339548349380493, + "step": 15570 + }, + { + "epoch": 3.9, + "grad_norm": 3.0993659496307373, + "learning_rate": 1.1561154495681775e-06, + "logits/chosen": -0.5419099926948547, + "logits/rejected": -0.6240099668502808, + "logps/chosen": -44.22771072387695, + "logps/rejected": -105.94465637207031, + "loss": 0.5776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.154951810836792, + "rewards/margins": 8.05420970916748, + "rewards/rejected": -4.899258136749268, + "step": 15571 + }, + { + "epoch": 3.9, + "grad_norm": 17.353029251098633, + "learning_rate": 1.1556128445543325e-06, + "logits/chosen": -0.5482654571533203, + "logits/rejected": -0.622089684009552, + "logps/chosen": -63.317840576171875, + "logps/rejected": -86.96119689941406, + "loss": 0.6975, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1335997581481934, + "rewards/margins": 5.732192516326904, + "rewards/rejected": -2.598592519760132, + "step": 15572 + }, + { + "epoch": 3.9, + "grad_norm": 9.725232124328613, + "learning_rate": 1.155110334539381e-06, + "logits/chosen": -0.49788469076156616, + "logits/rejected": -0.6126594543457031, + "logps/chosen": -52.21385192871094, + "logps/rejected": -89.03704833984375, + "loss": 0.7226, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8650364875793457, + "rewards/margins": 5.944263458251953, + "rewards/rejected": -3.0792272090911865, + "step": 15573 + }, + { + "epoch": 3.9, + "grad_norm": 8.681488037109375, + "learning_rate": 1.1546079195357451e-06, + "logits/chosen": -0.5693301558494568, + "logits/rejected": -0.660672664642334, + "logps/chosen": -59.421875, + "logps/rejected": -108.71085357666016, + "loss": 0.6549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.377337694168091, + "rewards/margins": 7.195492267608643, + "rewards/rejected": -3.81815505027771, + "step": 15574 + }, + { + "epoch": 3.9, + "grad_norm": 18.981781005859375, + "learning_rate": 1.154105599555837e-06, + "logits/chosen": -0.49496254324913025, + "logits/rejected": -0.5851672291755676, + "logps/chosen": -48.34974670410156, + "logps/rejected": -84.05960083007812, + "loss": 0.5771, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.197162628173828, + "rewards/margins": 6.1233229637146, + "rewards/rejected": -2.9261603355407715, + "step": 15575 + }, + { + "epoch": 3.9, + "grad_norm": 5.547186374664307, + "learning_rate": 1.1536033746120694e-06, + "logits/chosen": -0.5634512901306152, + "logits/rejected": -0.62135249376297, + "logps/chosen": -53.53382110595703, + "logps/rejected": -109.8149185180664, + "loss": 0.6976, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.115553140640259, + "rewards/margins": 6.190138816833496, + "rewards/rejected": -3.0745859146118164, + "step": 15576 + }, + { + "epoch": 3.9, + "grad_norm": 3.6238443851470947, + "learning_rate": 1.1531012447168544e-06, + "logits/chosen": -0.5394724607467651, + "logits/rejected": -0.5912126898765564, + "logps/chosen": -57.326839447021484, + "logps/rejected": -115.13699340820312, + "loss": 0.6474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.011894464492798, + "rewards/margins": 7.680677890777588, + "rewards/rejected": -4.668783664703369, + "step": 15577 + }, + { + "epoch": 3.9, + "grad_norm": 3.4012346267700195, + "learning_rate": 1.1525992098825984e-06, + "logits/chosen": -0.5990791320800781, + "logits/rejected": -0.6697184443473816, + "logps/chosen": -46.85139465332031, + "logps/rejected": -101.92919921875, + "loss": 0.5106, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1445095539093018, + "rewards/margins": 7.099430561065674, + "rewards/rejected": -3.954921007156372, + "step": 15578 + }, + { + "epoch": 3.9, + "grad_norm": 3.1810624599456787, + "learning_rate": 1.1520972701217097e-06, + "logits/chosen": -0.575777530670166, + "logits/rejected": -0.660057544708252, + "logps/chosen": -58.698577880859375, + "logps/rejected": -106.36990356445312, + "loss": 0.6144, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168212890625, + "rewards/margins": 8.89107894897461, + "rewards/rejected": -5.722867012023926, + "step": 15579 + }, + { + "epoch": 3.9, + "grad_norm": 8.572978019714355, + "learning_rate": 1.15159542544659e-06, + "logits/chosen": -0.5431286096572876, + "logits/rejected": -0.5669670104980469, + "logps/chosen": -76.77057647705078, + "logps/rejected": -91.10316467285156, + "loss": 0.7564, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.476377010345459, + "rewards/margins": 5.474173069000244, + "rewards/rejected": -1.9977960586547852, + "step": 15580 + }, + { + "epoch": 3.9, + "grad_norm": 4.001669406890869, + "learning_rate": 1.1510936758696395e-06, + "logits/chosen": -0.5723608732223511, + "logits/rejected": -0.6480216383934021, + "logps/chosen": -47.002681732177734, + "logps/rejected": -98.61732482910156, + "loss": 0.5437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1627750396728516, + "rewards/margins": 7.44951868057251, + "rewards/rejected": -4.2867431640625, + "step": 15581 + }, + { + "epoch": 3.9, + "grad_norm": 6.5149030685424805, + "learning_rate": 1.150592021403259e-06, + "logits/chosen": -0.5862069129943848, + "logits/rejected": -0.6193971633911133, + "logps/chosen": -67.99048614501953, + "logps/rejected": -97.97038269042969, + "loss": 0.7921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.165454387664795, + "rewards/margins": 5.358992099761963, + "rewards/rejected": -2.193537712097168, + "step": 15582 + }, + { + "epoch": 3.9, + "grad_norm": 5.575909614562988, + "learning_rate": 1.1500904620598447e-06, + "logits/chosen": -0.5766005516052246, + "logits/rejected": -0.6416569948196411, + "logps/chosen": -56.886253356933594, + "logps/rejected": -103.17888641357422, + "loss": 0.6839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100391387939453, + "rewards/margins": 6.4219160079956055, + "rewards/rejected": -3.3215250968933105, + "step": 15583 + }, + { + "epoch": 3.9, + "grad_norm": 3.2276391983032227, + "learning_rate": 1.1495889978517882e-06, + "logits/chosen": -0.6217853426933289, + "logits/rejected": -0.6603758931159973, + "logps/chosen": -51.474239349365234, + "logps/rejected": -115.28482055664062, + "loss": 0.5972, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.329586982727051, + "rewards/margins": 8.096931457519531, + "rewards/rejected": -4.767344951629639, + "step": 15584 + }, + { + "epoch": 3.9, + "grad_norm": 18.055240631103516, + "learning_rate": 1.1490876287914832e-06, + "logits/chosen": -0.5383865237236023, + "logits/rejected": -0.6279692053794861, + "logps/chosen": -71.2032699584961, + "logps/rejected": -98.76632690429688, + "loss": 0.715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2268474102020264, + "rewards/margins": 6.844852924346924, + "rewards/rejected": -3.6180057525634766, + "step": 15585 + }, + { + "epoch": 3.9, + "grad_norm": 23.153610229492188, + "learning_rate": 1.1485863548913217e-06, + "logits/chosen": -0.5508226752281189, + "logits/rejected": -0.6211785078048706, + "logps/chosen": -60.39126968383789, + "logps/rejected": -125.00074768066406, + "loss": 0.659, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8604576587677, + "rewards/margins": 7.967591285705566, + "rewards/rejected": -5.107132911682129, + "step": 15586 + }, + { + "epoch": 3.9, + "grad_norm": 3.949342966079712, + "learning_rate": 1.1480851761636852e-06, + "logits/chosen": -0.5920230746269226, + "logits/rejected": -0.67609041929245, + "logps/chosen": -57.34752655029297, + "logps/rejected": -101.9202651977539, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9425413608551025, + "rewards/margins": 8.038161277770996, + "rewards/rejected": -5.0956196784973145, + "step": 15587 + }, + { + "epoch": 3.9, + "grad_norm": 8.550926208496094, + "learning_rate": 1.1475840926209615e-06, + "logits/chosen": -0.5179994106292725, + "logits/rejected": -0.6658743023872375, + "logps/chosen": -59.14449691772461, + "logps/rejected": -111.78425598144531, + "loss": 0.6671, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.100484609603882, + "rewards/margins": 7.948737621307373, + "rewards/rejected": -4.848252296447754, + "step": 15588 + }, + { + "epoch": 3.9, + "grad_norm": 6.753267288208008, + "learning_rate": 1.1470831042755349e-06, + "logits/chosen": -0.5277702808380127, + "logits/rejected": -0.5628176331520081, + "logps/chosen": -49.43666076660156, + "logps/rejected": -125.15744018554688, + "loss": 0.6725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0171279907226562, + "rewards/margins": 7.290775299072266, + "rewards/rejected": -4.273646831512451, + "step": 15589 + }, + { + "epoch": 3.9, + "grad_norm": 3.09279727935791, + "learning_rate": 1.1465822111397796e-06, + "logits/chosen": -0.5661168098449707, + "logits/rejected": -0.6767191886901855, + "logps/chosen": -54.56620788574219, + "logps/rejected": -99.34221649169922, + "loss": 0.6197, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.947049140930176, + "rewards/margins": 7.049415111541748, + "rewards/rejected": -4.102365493774414, + "step": 15590 + }, + { + "epoch": 3.9, + "grad_norm": 12.196946144104004, + "learning_rate": 1.1460814132260773e-06, + "logits/chosen": -0.5163177251815796, + "logits/rejected": -0.6104640960693359, + "logps/chosen": -48.71454620361328, + "logps/rejected": -88.89354705810547, + "loss": 0.5836, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2083218097686768, + "rewards/margins": 7.3125386238098145, + "rewards/rejected": -4.104216575622559, + "step": 15591 + }, + { + "epoch": 3.9, + "grad_norm": 4.867567539215088, + "learning_rate": 1.1455807105468036e-06, + "logits/chosen": -0.5676413178443909, + "logits/rejected": -0.6485792398452759, + "logps/chosen": -59.37907409667969, + "logps/rejected": -104.35182189941406, + "loss": 0.6321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.203916549682617, + "rewards/margins": 6.607034683227539, + "rewards/rejected": -3.4031178951263428, + "step": 15592 + }, + { + "epoch": 3.9, + "grad_norm": 8.979421615600586, + "learning_rate": 1.14508010311433e-06, + "logits/chosen": -0.5553568601608276, + "logits/rejected": -0.6760657429695129, + "logps/chosen": -58.91596221923828, + "logps/rejected": -101.21414947509766, + "loss": 0.641, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.12477445602417, + "rewards/margins": 6.477848529815674, + "rewards/rejected": -3.353074312210083, + "step": 15593 + }, + { + "epoch": 3.9, + "grad_norm": 7.008403301239014, + "learning_rate": 1.1445795909410262e-06, + "logits/chosen": -0.5470731854438782, + "logits/rejected": -0.6097579002380371, + "logps/chosen": -50.26067352294922, + "logps/rejected": -104.6849594116211, + "loss": 0.6524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7984113693237305, + "rewards/margins": 6.4006547927856445, + "rewards/rejected": -3.602243661880493, + "step": 15594 + }, + { + "epoch": 3.9, + "grad_norm": 5.314952850341797, + "learning_rate": 1.1440791740392632e-06, + "logits/chosen": -0.4834716320037842, + "logits/rejected": -0.5488497614860535, + "logps/chosen": -51.2359619140625, + "logps/rejected": -110.66089630126953, + "loss": 0.6167, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5393643379211426, + "rewards/margins": 7.67014217376709, + "rewards/rejected": -4.130777359008789, + "step": 15595 + }, + { + "epoch": 3.9, + "grad_norm": 3.626652479171753, + "learning_rate": 1.143578852421403e-06, + "logits/chosen": -0.5471685528755188, + "logits/rejected": -0.6581575274467468, + "logps/chosen": -60.746543884277344, + "logps/rejected": -108.18497467041016, + "loss": 0.6461, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0547823905944824, + "rewards/margins": 8.497726440429688, + "rewards/rejected": -5.442943572998047, + "step": 15596 + }, + { + "epoch": 3.9, + "grad_norm": 2.4578397274017334, + "learning_rate": 1.1430786260998134e-06, + "logits/chosen": -0.5982514023780823, + "logits/rejected": -0.6515111327171326, + "logps/chosen": -48.690704345703125, + "logps/rejected": -112.2824935913086, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2481439113616943, + "rewards/margins": 8.028164863586426, + "rewards/rejected": -4.7800211906433105, + "step": 15597 + }, + { + "epoch": 3.9, + "grad_norm": 4.952280521392822, + "learning_rate": 1.142578495086853e-06, + "logits/chosen": -0.5403362512588501, + "logits/rejected": -0.6341696381568909, + "logps/chosen": -55.04320526123047, + "logps/rejected": -113.25676727294922, + "loss": 0.5932, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0784173011779785, + "rewards/margins": 7.670303821563721, + "rewards/rejected": -4.591886520385742, + "step": 15598 + }, + { + "epoch": 3.9, + "grad_norm": 4.859085559844971, + "learning_rate": 1.1420784593948797e-06, + "logits/chosen": -0.55517578125, + "logits/rejected": -0.6339337825775146, + "logps/chosen": -46.96889114379883, + "logps/rejected": -108.71913146972656, + "loss": 0.6114, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0465593338012695, + "rewards/margins": 7.106685161590576, + "rewards/rejected": -4.060125827789307, + "step": 15599 + }, + { + "epoch": 3.9, + "grad_norm": 6.2123236656188965, + "learning_rate": 1.141578519036252e-06, + "logits/chosen": -0.474858820438385, + "logits/rejected": -0.6120859980583191, + "logps/chosen": -54.38566970825195, + "logps/rejected": -86.08153533935547, + "loss": 0.6241, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0228195190429688, + "rewards/margins": 6.904097557067871, + "rewards/rejected": -3.881277561187744, + "step": 15600 + }, + { + "epoch": 3.9, + "grad_norm": 8.957279205322266, + "learning_rate": 1.1410786740233238e-06, + "logits/chosen": -0.49333155155181885, + "logits/rejected": -0.6141530275344849, + "logps/chosen": -55.050106048583984, + "logps/rejected": -116.89676666259766, + "loss": 0.6875, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1313605308532715, + "rewards/margins": 8.230266571044922, + "rewards/rejected": -5.098906517028809, + "step": 15601 + }, + { + "epoch": 3.9, + "grad_norm": 4.603004455566406, + "learning_rate": 1.1405789243684444e-06, + "logits/chosen": -0.5400844812393188, + "logits/rejected": -0.6454175710678101, + "logps/chosen": -53.51823043823242, + "logps/rejected": -99.53929138183594, + "loss": 0.5976, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182009220123291, + "rewards/margins": 6.596757411956787, + "rewards/rejected": -3.414748191833496, + "step": 15602 + }, + { + "epoch": 3.9, + "grad_norm": 4.1927289962768555, + "learning_rate": 1.1400792700839648e-06, + "logits/chosen": -0.49049514532089233, + "logits/rejected": -0.5769036412239075, + "logps/chosen": -59.83835220336914, + "logps/rejected": -106.49097442626953, + "loss": 0.6198, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1776552200317383, + "rewards/margins": 6.53879451751709, + "rewards/rejected": -3.3611395359039307, + "step": 15603 + }, + { + "epoch": 3.9, + "grad_norm": 7.784324645996094, + "learning_rate": 1.139579711182235e-06, + "logits/chosen": -0.5497883558273315, + "logits/rejected": -0.5957454442977905, + "logps/chosen": -55.28862762451172, + "logps/rejected": -101.15096282958984, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3615915775299072, + "rewards/margins": 7.050607681274414, + "rewards/rejected": -3.689016342163086, + "step": 15604 + }, + { + "epoch": 3.9, + "grad_norm": 4.750233173370361, + "learning_rate": 1.1390802476755935e-06, + "logits/chosen": -0.5442081689834595, + "logits/rejected": -0.5618561506271362, + "logps/chosen": -49.70332336425781, + "logps/rejected": -96.93296813964844, + "loss": 0.6612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1946663856506348, + "rewards/margins": 5.846536159515381, + "rewards/rejected": -2.651869773864746, + "step": 15605 + }, + { + "epoch": 3.9, + "grad_norm": 4.135789394378662, + "learning_rate": 1.1385808795763864e-06, + "logits/chosen": -0.6276130080223083, + "logits/rejected": -0.6994050741195679, + "logps/chosen": -58.84962463378906, + "logps/rejected": -108.36470794677734, + "loss": 0.6166, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.84782075881958, + "rewards/margins": 6.794124603271484, + "rewards/rejected": -3.946303606033325, + "step": 15606 + }, + { + "epoch": 3.9, + "grad_norm": 7.489171028137207, + "learning_rate": 1.1380816068969536e-06, + "logits/chosen": -0.514671266078949, + "logits/rejected": -0.6053704023361206, + "logps/chosen": -72.2815170288086, + "logps/rejected": -107.46562957763672, + "loss": 0.7581, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6558854579925537, + "rewards/margins": 6.673707962036133, + "rewards/rejected": -4.017822742462158, + "step": 15607 + }, + { + "epoch": 3.9, + "grad_norm": 4.773118495941162, + "learning_rate": 1.1375824296496324e-06, + "logits/chosen": -0.5064436793327332, + "logits/rejected": -0.5849684476852417, + "logps/chosen": -55.228858947753906, + "logps/rejected": -91.54110717773438, + "loss": 0.6799, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.75411319732666, + "rewards/margins": 5.526664733886719, + "rewards/rejected": -2.7725515365600586, + "step": 15608 + }, + { + "epoch": 3.9, + "grad_norm": 5.645214080810547, + "learning_rate": 1.1370833478467563e-06, + "logits/chosen": -0.46798259019851685, + "logits/rejected": -0.5881251692771912, + "logps/chosen": -55.929443359375, + "logps/rejected": -85.59498596191406, + "loss": 0.6254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0021731853485107, + "rewards/margins": 5.335504055023193, + "rewards/rejected": -2.3333308696746826, + "step": 15609 + }, + { + "epoch": 3.91, + "grad_norm": 3.94663667678833, + "learning_rate": 1.1365843615006606e-06, + "logits/chosen": -0.4351659119129181, + "logits/rejected": -0.5518988370895386, + "logps/chosen": -72.18180847167969, + "logps/rejected": -111.8086166381836, + "loss": 0.7058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4749698638916016, + "rewards/margins": 6.9347243309021, + "rewards/rejected": -4.459754467010498, + "step": 15610 + }, + { + "epoch": 3.91, + "grad_norm": 4.680552959442139, + "learning_rate": 1.1360854706236746e-06, + "logits/chosen": -0.5692921876907349, + "logits/rejected": -0.6489626169204712, + "logps/chosen": -46.2914924621582, + "logps/rejected": -125.96505737304688, + "loss": 0.58, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2159481048583984, + "rewards/margins": 8.057518005371094, + "rewards/rejected": -4.841569423675537, + "step": 15611 + }, + { + "epoch": 3.91, + "grad_norm": 4.202523231506348, + "learning_rate": 1.1355866752281253e-06, + "logits/chosen": -0.6048690676689148, + "logits/rejected": -0.6920182108879089, + "logps/chosen": -53.04174041748047, + "logps/rejected": -100.93912506103516, + "loss": 0.6173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.123041868209839, + "rewards/margins": 6.45186710357666, + "rewards/rejected": -3.328824758529663, + "step": 15612 + }, + { + "epoch": 3.91, + "grad_norm": 4.638640403747559, + "learning_rate": 1.135087975326341e-06, + "logits/chosen": -0.44201335310935974, + "logits/rejected": -0.5988703370094299, + "logps/chosen": -63.969932556152344, + "logps/rejected": -100.38679504394531, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8429043292999268, + "rewards/margins": 7.714603424072266, + "rewards/rejected": -4.87169885635376, + "step": 15613 + }, + { + "epoch": 3.91, + "grad_norm": 4.417984485626221, + "learning_rate": 1.1345893709306439e-06, + "logits/chosen": -0.5829088687896729, + "logits/rejected": -0.675707995891571, + "logps/chosen": -56.79339599609375, + "logps/rejected": -87.42180633544922, + "loss": 0.6301, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9448490142822266, + "rewards/margins": 7.146951675415039, + "rewards/rejected": -4.202103614807129, + "step": 15614 + }, + { + "epoch": 3.91, + "grad_norm": 6.630209922790527, + "learning_rate": 1.134090862053353e-06, + "logits/chosen": -0.523104190826416, + "logits/rejected": -0.642363429069519, + "logps/chosen": -56.13124465942383, + "logps/rejected": -108.45862579345703, + "loss": 0.6643, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1027204990386963, + "rewards/margins": 7.9810309410095215, + "rewards/rejected": -4.878310203552246, + "step": 15615 + }, + { + "epoch": 3.91, + "grad_norm": 9.433476448059082, + "learning_rate": 1.1335924487067907e-06, + "logits/chosen": -0.5912938714027405, + "logits/rejected": -0.6745890974998474, + "logps/chosen": -56.00690841674805, + "logps/rejected": -105.29447937011719, + "loss": 0.6777, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4765918254852295, + "rewards/margins": 7.8914055824279785, + "rewards/rejected": -4.41481351852417, + "step": 15616 + }, + { + "epoch": 3.91, + "grad_norm": 15.321671485900879, + "learning_rate": 1.1330941309032695e-06, + "logits/chosen": -0.4644877314567566, + "logits/rejected": -0.5730750560760498, + "logps/chosen": -69.36300659179688, + "logps/rejected": -108.38198852539062, + "loss": 0.7983, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.468074321746826, + "rewards/margins": 5.666032314300537, + "rewards/rejected": -3.1979575157165527, + "step": 15617 + }, + { + "epoch": 3.91, + "grad_norm": 3.5782532691955566, + "learning_rate": 1.1325959086551075e-06, + "logits/chosen": -0.6147139072418213, + "logits/rejected": -0.6842038631439209, + "logps/chosen": -44.91039276123047, + "logps/rejected": -105.77279663085938, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031491994857788, + "rewards/margins": 6.734045028686523, + "rewards/rejected": -3.7025527954101562, + "step": 15618 + }, + { + "epoch": 3.91, + "grad_norm": 3.188662052154541, + "learning_rate": 1.132097781974612e-06, + "logits/chosen": -0.5129895806312561, + "logits/rejected": -0.5854179263114929, + "logps/chosen": -51.727264404296875, + "logps/rejected": -97.00619506835938, + "loss": 0.5808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3978071212768555, + "rewards/margins": 7.033964157104492, + "rewards/rejected": -3.6361563205718994, + "step": 15619 + }, + { + "epoch": 3.91, + "grad_norm": 5.318897247314453, + "learning_rate": 1.1315997508740967e-06, + "logits/chosen": -0.5744887590408325, + "logits/rejected": -0.6381235718727112, + "logps/chosen": -55.57837677001953, + "logps/rejected": -107.01467895507812, + "loss": 0.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8783512115478516, + "rewards/margins": 7.063615798950195, + "rewards/rejected": -4.1852641105651855, + "step": 15620 + }, + { + "epoch": 3.91, + "grad_norm": 6.980652809143066, + "learning_rate": 1.1311018153658655e-06, + "logits/chosen": -0.5036913752555847, + "logits/rejected": -0.5663737058639526, + "logps/chosen": -50.8762092590332, + "logps/rejected": -106.9841079711914, + "loss": 0.6748, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0707826614379883, + "rewards/margins": 5.9431867599487305, + "rewards/rejected": -2.8724045753479004, + "step": 15621 + }, + { + "epoch": 3.91, + "grad_norm": 3.9506571292877197, + "learning_rate": 1.1306039754622222e-06, + "logits/chosen": -0.619574785232544, + "logits/rejected": -0.6779746413230896, + "logps/chosen": -55.61235046386719, + "logps/rejected": -103.27587890625, + "loss": 0.6762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5361952781677246, + "rewards/margins": 6.547641277313232, + "rewards/rejected": -3.011445999145508, + "step": 15622 + }, + { + "epoch": 3.91, + "grad_norm": 11.726524353027344, + "learning_rate": 1.1301062311754718e-06, + "logits/chosen": -0.4814811646938324, + "logits/rejected": -0.5778025388717651, + "logps/chosen": -58.793365478515625, + "logps/rejected": -101.1611328125, + "loss": 0.6014, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1040940284729004, + "rewards/margins": 7.559267997741699, + "rewards/rejected": -4.455173015594482, + "step": 15623 + }, + { + "epoch": 3.91, + "grad_norm": 6.193434715270996, + "learning_rate": 1.1296085825179115e-06, + "logits/chosen": -0.5634512901306152, + "logits/rejected": -0.6396241188049316, + "logps/chosen": -59.94975662231445, + "logps/rejected": -85.3386001586914, + "loss": 0.6831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0209951400756836, + "rewards/margins": 5.334966659545898, + "rewards/rejected": -2.313971519470215, + "step": 15624 + }, + { + "epoch": 3.91, + "grad_norm": 7.256833076477051, + "learning_rate": 1.129111029501841e-06, + "logits/chosen": -0.6323755383491516, + "logits/rejected": -0.6718823909759521, + "logps/chosen": -54.978416442871094, + "logps/rejected": -112.20682525634766, + "loss": 0.6419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098050355911255, + "rewards/margins": 6.602428436279297, + "rewards/rejected": -3.5043787956237793, + "step": 15625 + }, + { + "epoch": 3.91, + "grad_norm": 11.060283660888672, + "learning_rate": 1.1286135721395542e-06, + "logits/chosen": -0.607559859752655, + "logits/rejected": -0.7043685913085938, + "logps/chosen": -52.12691879272461, + "logps/rejected": -93.38605499267578, + "loss": 0.7555, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8141133785247803, + "rewards/margins": 6.911391258239746, + "rewards/rejected": -4.097277641296387, + "step": 15626 + }, + { + "epoch": 3.91, + "grad_norm": 7.004688739776611, + "learning_rate": 1.128116210443343e-06, + "logits/chosen": -0.6116878390312195, + "logits/rejected": -0.7234773635864258, + "logps/chosen": -57.1087532043457, + "logps/rejected": -105.13971710205078, + "loss": 0.7024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.254690170288086, + "rewards/margins": 6.934813976287842, + "rewards/rejected": -3.680124521255493, + "step": 15627 + }, + { + "epoch": 3.91, + "grad_norm": 7.026979923248291, + "learning_rate": 1.1276189444254976e-06, + "logits/chosen": -0.5525497198104858, + "logits/rejected": -0.614378035068512, + "logps/chosen": -50.88397216796875, + "logps/rejected": -112.99929809570312, + "loss": 0.6851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0052599906921387, + "rewards/margins": 7.254369258880615, + "rewards/rejected": -4.249108791351318, + "step": 15628 + }, + { + "epoch": 3.91, + "grad_norm": 2.120969533920288, + "learning_rate": 1.1271217740983104e-06, + "logits/chosen": -0.5395007133483887, + "logits/rejected": -0.6411176919937134, + "logps/chosen": -45.21717071533203, + "logps/rejected": -93.27522277832031, + "loss": 0.4766, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3922622203826904, + "rewards/margins": 7.752429008483887, + "rewards/rejected": -4.360167026519775, + "step": 15629 + }, + { + "epoch": 3.91, + "grad_norm": 2.5339927673339844, + "learning_rate": 1.1266246994740605e-06, + "logits/chosen": -0.5335727334022522, + "logits/rejected": -0.6466984152793884, + "logps/chosen": -50.21932601928711, + "logps/rejected": -95.69173431396484, + "loss": 0.5604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0806655883789062, + "rewards/margins": 7.897815704345703, + "rewards/rejected": -4.817150115966797, + "step": 15630 + }, + { + "epoch": 3.91, + "grad_norm": 4.778560638427734, + "learning_rate": 1.1261277205650346e-06, + "logits/chosen": -0.6183115839958191, + "logits/rejected": -0.6645324230194092, + "logps/chosen": -41.18354034423828, + "logps/rejected": -123.9358139038086, + "loss": 0.5441, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9873204231262207, + "rewards/margins": 7.575989246368408, + "rewards/rejected": -4.588669300079346, + "step": 15631 + }, + { + "epoch": 3.91, + "grad_norm": 13.008277893066406, + "learning_rate": 1.1256308373835152e-06, + "logits/chosen": -0.572689414024353, + "logits/rejected": -0.6629942059516907, + "logps/chosen": -55.35770034790039, + "logps/rejected": -88.97283172607422, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8970978260040283, + "rewards/margins": 5.798543453216553, + "rewards/rejected": -2.901445150375366, + "step": 15632 + }, + { + "epoch": 3.91, + "grad_norm": 4.966614246368408, + "learning_rate": 1.1251340499417763e-06, + "logits/chosen": -0.6050898432731628, + "logits/rejected": -0.6477962732315063, + "logps/chosen": -52.29680633544922, + "logps/rejected": -102.47843170166016, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0879483222961426, + "rewards/margins": 6.4474639892578125, + "rewards/rejected": -3.359516143798828, + "step": 15633 + }, + { + "epoch": 3.91, + "grad_norm": 2.8592801094055176, + "learning_rate": 1.124637358252097e-06, + "logits/chosen": -0.518843948841095, + "logits/rejected": -0.5721865892410278, + "logps/chosen": -52.98987579345703, + "logps/rejected": -120.46472930908203, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0492336750030518, + "rewards/margins": 8.152839660644531, + "rewards/rejected": -5.103605270385742, + "step": 15634 + }, + { + "epoch": 3.91, + "grad_norm": 3.488827705383301, + "learning_rate": 1.1241407623267514e-06, + "logits/chosen": -0.552523672580719, + "logits/rejected": -0.637545108795166, + "logps/chosen": -54.760833740234375, + "logps/rejected": -94.0706787109375, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0012407302856445, + "rewards/margins": 6.258574962615967, + "rewards/rejected": -3.2573344707489014, + "step": 15635 + }, + { + "epoch": 3.91, + "grad_norm": 6.653567314147949, + "learning_rate": 1.1236442621780103e-06, + "logits/chosen": -0.604125440120697, + "logits/rejected": -0.684502124786377, + "logps/chosen": -49.44701385498047, + "logps/rejected": -116.08965301513672, + "loss": 0.5542, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0338046550750732, + "rewards/margins": 8.6530122756958, + "rewards/rejected": -5.619207859039307, + "step": 15636 + }, + { + "epoch": 3.91, + "grad_norm": 13.00210189819336, + "learning_rate": 1.123147857818141e-06, + "logits/chosen": -0.5666916966438293, + "logits/rejected": -0.6546685695648193, + "logps/chosen": -58.837974548339844, + "logps/rejected": -108.13653564453125, + "loss": 0.7047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.883822441101074, + "rewards/margins": 7.968484878540039, + "rewards/rejected": -5.084661960601807, + "step": 15637 + }, + { + "epoch": 3.91, + "grad_norm": 5.125468730926514, + "learning_rate": 1.1226515492594131e-06, + "logits/chosen": -0.568902313709259, + "logits/rejected": -0.6445081830024719, + "logps/chosen": -49.97926330566406, + "logps/rejected": -99.16181182861328, + "loss": 0.7288, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.283079147338867, + "rewards/margins": 7.161259651184082, + "rewards/rejected": -3.878180742263794, + "step": 15638 + }, + { + "epoch": 3.91, + "grad_norm": 7.9422760009765625, + "learning_rate": 1.1221553365140903e-06, + "logits/chosen": -0.6035908460617065, + "logits/rejected": -0.6624190807342529, + "logps/chosen": -48.7507438659668, + "logps/rejected": -92.09446716308594, + "loss": 0.6559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2089860439300537, + "rewards/margins": 6.834962844848633, + "rewards/rejected": -3.625976800918579, + "step": 15639 + }, + { + "epoch": 3.91, + "grad_norm": 4.922652244567871, + "learning_rate": 1.1216592195944314e-06, + "logits/chosen": -0.5580089092254639, + "logits/rejected": -0.6199755668640137, + "logps/chosen": -55.683284759521484, + "logps/rejected": -100.37500762939453, + "loss": 0.6822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1565470695495605, + "rewards/margins": 6.528447151184082, + "rewards/rejected": -3.3719005584716797, + "step": 15640 + }, + { + "epoch": 3.91, + "grad_norm": 3.771939277648926, + "learning_rate": 1.1211631985127002e-06, + "logits/chosen": -0.47282490134239197, + "logits/rejected": -0.5605161786079407, + "logps/chosen": -65.64156341552734, + "logps/rejected": -98.36786651611328, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0483431816101074, + "rewards/margins": 6.349806308746338, + "rewards/rejected": -3.3014628887176514, + "step": 15641 + }, + { + "epoch": 3.91, + "grad_norm": 3.9095113277435303, + "learning_rate": 1.1206672732811507e-06, + "logits/chosen": -0.5675566792488098, + "logits/rejected": -0.6060342192649841, + "logps/chosen": -61.36619186401367, + "logps/rejected": -131.28451538085938, + "loss": 0.6698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9448909759521484, + "rewards/margins": 7.532199859619141, + "rewards/rejected": -4.58730936050415, + "step": 15642 + }, + { + "epoch": 3.91, + "grad_norm": 2.5427639484405518, + "learning_rate": 1.1201714439120403e-06, + "logits/chosen": -0.5556314587593079, + "logits/rejected": -0.6679282188415527, + "logps/chosen": -56.359554290771484, + "logps/rejected": -107.02967834472656, + "loss": 0.5717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295742988586426, + "rewards/margins": 8.870577812194824, + "rewards/rejected": -5.574834823608398, + "step": 15643 + }, + { + "epoch": 3.91, + "grad_norm": 10.112308502197266, + "learning_rate": 1.119675710417621e-06, + "logits/chosen": -0.48370641469955444, + "logits/rejected": -0.5985444784164429, + "logps/chosen": -66.15665435791016, + "logps/rejected": -108.29147338867188, + "loss": 0.6773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6828269958496094, + "rewards/margins": 6.805207252502441, + "rewards/rejected": -4.122380256652832, + "step": 15644 + }, + { + "epoch": 3.91, + "grad_norm": 4.067427635192871, + "learning_rate": 1.11918007281014e-06, + "logits/chosen": -0.548068642616272, + "logits/rejected": -0.6082930564880371, + "logps/chosen": -50.42494201660156, + "logps/rejected": -84.45323944091797, + "loss": 0.7014, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.120870351791382, + "rewards/margins": 5.726354598999023, + "rewards/rejected": -2.6054844856262207, + "step": 15645 + }, + { + "epoch": 3.91, + "grad_norm": 6.179793834686279, + "learning_rate": 1.118684531101849e-06, + "logits/chosen": -0.5574464797973633, + "logits/rejected": -0.6561655402183533, + "logps/chosen": -67.67015075683594, + "logps/rejected": -118.11096954345703, + "loss": 0.6046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1002347469329834, + "rewards/margins": 8.170550346374512, + "rewards/rejected": -5.070315361022949, + "step": 15646 + }, + { + "epoch": 3.91, + "grad_norm": 11.901664733886719, + "learning_rate": 1.1181890853049914e-06, + "logits/chosen": -0.5705363750457764, + "logits/rejected": -0.6392691731452942, + "logps/chosen": -56.580528259277344, + "logps/rejected": -105.53855895996094, + "loss": 0.6374, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8371548652648926, + "rewards/margins": 6.786747455596924, + "rewards/rejected": -3.9495928287506104, + "step": 15647 + }, + { + "epoch": 3.91, + "grad_norm": 4.997758388519287, + "learning_rate": 1.117693735431809e-06, + "logits/chosen": -0.6295406818389893, + "logits/rejected": -0.7457143068313599, + "logps/chosen": -53.01558303833008, + "logps/rejected": -97.94999694824219, + "loss": 0.6274, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930563449859619, + "rewards/margins": 8.3079833984375, + "rewards/rejected": -5.377419471740723, + "step": 15648 + }, + { + "epoch": 3.91, + "grad_norm": 3.2130722999572754, + "learning_rate": 1.1171984814945442e-06, + "logits/chosen": -0.5945023894309998, + "logits/rejected": -0.6042089462280273, + "logps/chosen": -59.202484130859375, + "logps/rejected": -123.56598663330078, + "loss": 0.7248, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1695291996002197, + "rewards/margins": 6.302183151245117, + "rewards/rejected": -3.1326537132263184, + "step": 15649 + }, + { + "epoch": 3.92, + "grad_norm": 5.436195373535156, + "learning_rate": 1.116703323505436e-06, + "logits/chosen": -0.5604979395866394, + "logits/rejected": -0.6590248346328735, + "logps/chosen": -62.8398323059082, + "logps/rejected": -82.68949890136719, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.210622787475586, + "rewards/margins": 6.28742790222168, + "rewards/rejected": -3.076805591583252, + "step": 15650 + }, + { + "epoch": 3.92, + "grad_norm": 4.290350914001465, + "learning_rate": 1.1162082614767188e-06, + "logits/chosen": -0.4960954189300537, + "logits/rejected": -0.5787907838821411, + "logps/chosen": -54.03233337402344, + "logps/rejected": -90.07255554199219, + "loss": 0.5654, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3099899291992188, + "rewards/margins": 7.015599250793457, + "rewards/rejected": -3.705609083175659, + "step": 15651 + }, + { + "epoch": 3.92, + "grad_norm": 4.4170331954956055, + "learning_rate": 1.1157132954206251e-06, + "logits/chosen": -0.4388732612133026, + "logits/rejected": -0.4810020327568054, + "logps/chosen": -62.17645263671875, + "logps/rejected": -117.3590316772461, + "loss": 0.6332, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0626649856567383, + "rewards/margins": 7.273885250091553, + "rewards/rejected": -4.211219787597656, + "step": 15652 + }, + { + "epoch": 3.92, + "grad_norm": 4.820291996002197, + "learning_rate": 1.1152184253493892e-06, + "logits/chosen": -0.5188563466072083, + "logits/rejected": -0.5933566093444824, + "logps/chosen": -47.05412292480469, + "logps/rejected": -88.84944915771484, + "loss": 0.615, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.961200475692749, + "rewards/margins": 6.063365459442139, + "rewards/rejected": -3.1021647453308105, + "step": 15653 + }, + { + "epoch": 3.92, + "grad_norm": 3.665433168411255, + "learning_rate": 1.1147236512752379e-06, + "logits/chosen": -0.5945799946784973, + "logits/rejected": -0.6545380353927612, + "logps/chosen": -44.019439697265625, + "logps/rejected": -104.43743896484375, + "loss": 0.5272, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1924290657043457, + "rewards/margins": 6.904932022094727, + "rewards/rejected": -3.7125024795532227, + "step": 15654 + }, + { + "epoch": 3.92, + "grad_norm": 12.072595596313477, + "learning_rate": 1.1142289732103962e-06, + "logits/chosen": -0.6193811893463135, + "logits/rejected": -0.7416988611221313, + "logps/chosen": -55.25968933105469, + "logps/rejected": -93.68803405761719, + "loss": 0.7638, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9078593254089355, + "rewards/margins": 6.774535655975342, + "rewards/rejected": -3.8666763305664062, + "step": 15655 + }, + { + "epoch": 3.92, + "grad_norm": 5.902554035186768, + "learning_rate": 1.1137343911670912e-06, + "logits/chosen": -0.6376179456710815, + "logits/rejected": -0.7078495025634766, + "logps/chosen": -45.04684829711914, + "logps/rejected": -95.9728012084961, + "loss": 0.5828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1257588863372803, + "rewards/margins": 7.076162815093994, + "rewards/rejected": -3.950404167175293, + "step": 15656 + }, + { + "epoch": 3.92, + "grad_norm": 4.1351189613342285, + "learning_rate": 1.1132399051575426e-06, + "logits/chosen": -0.6103230714797974, + "logits/rejected": -0.694420576095581, + "logps/chosen": -52.94511032104492, + "logps/rejected": -99.75218200683594, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9674625396728516, + "rewards/margins": 7.180880546569824, + "rewards/rejected": -4.213418006896973, + "step": 15657 + }, + { + "epoch": 3.92, + "grad_norm": 7.359975814819336, + "learning_rate": 1.1127455151939693e-06, + "logits/chosen": -0.6040334701538086, + "logits/rejected": -0.6274278163909912, + "logps/chosen": -48.97288513183594, + "logps/rejected": -121.46941375732422, + "loss": 0.6863, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7092206478118896, + "rewards/margins": 6.379851818084717, + "rewards/rejected": -3.6706314086914062, + "step": 15658 + }, + { + "epoch": 3.92, + "grad_norm": 3.0934903621673584, + "learning_rate": 1.11225122128859e-06, + "logits/chosen": -0.5332581996917725, + "logits/rejected": -0.6203646063804626, + "logps/chosen": -63.935813903808594, + "logps/rejected": -93.4530029296875, + "loss": 0.6686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2897145748138428, + "rewards/margins": 7.196959495544434, + "rewards/rejected": -3.9072446823120117, + "step": 15659 + }, + { + "epoch": 3.92, + "grad_norm": 5.465213298797607, + "learning_rate": 1.1117570234536174e-06, + "logits/chosen": -0.5236586332321167, + "logits/rejected": -0.6312339305877686, + "logps/chosen": -54.095184326171875, + "logps/rejected": -120.02513122558594, + "loss": 0.6713, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.82627010345459, + "rewards/margins": 7.506598949432373, + "rewards/rejected": -4.680328369140625, + "step": 15660 + }, + { + "epoch": 3.92, + "grad_norm": 4.3639092445373535, + "learning_rate": 1.1112629217012648e-06, + "logits/chosen": -0.4799503684043884, + "logits/rejected": -0.5848926305770874, + "logps/chosen": -45.37217330932617, + "logps/rejected": -104.37491607666016, + "loss": 0.5464, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.261837959289551, + "rewards/margins": 8.506860733032227, + "rewards/rejected": -5.245022296905518, + "step": 15661 + }, + { + "epoch": 3.92, + "grad_norm": 3.2834062576293945, + "learning_rate": 1.1107689160437413e-06, + "logits/chosen": -0.49143901467323303, + "logits/rejected": -0.570234477519989, + "logps/chosen": -50.178871154785156, + "logps/rejected": -119.75767517089844, + "loss": 0.5593, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.12811541557312, + "rewards/margins": 8.242244720458984, + "rewards/rejected": -5.114129543304443, + "step": 15662 + }, + { + "epoch": 3.92, + "grad_norm": 6.020195960998535, + "learning_rate": 1.1102750064932554e-06, + "logits/chosen": -0.5684998631477356, + "logits/rejected": -0.6498810052871704, + "logps/chosen": -58.56651306152344, + "logps/rejected": -99.91604614257812, + "loss": 0.6303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1981940269470215, + "rewards/margins": 6.790800094604492, + "rewards/rejected": -3.5926053524017334, + "step": 15663 + }, + { + "epoch": 3.92, + "grad_norm": 5.348343849182129, + "learning_rate": 1.1097811930620117e-06, + "logits/chosen": -0.5653226375579834, + "logits/rejected": -0.5892617702484131, + "logps/chosen": -48.794410705566406, + "logps/rejected": -117.80779266357422, + "loss": 0.5934, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.377380847930908, + "rewards/margins": 6.580029487609863, + "rewards/rejected": -3.202648162841797, + "step": 15664 + }, + { + "epoch": 3.92, + "grad_norm": 3.0237340927124023, + "learning_rate": 1.1092874757622114e-06, + "logits/chosen": -0.49082911014556885, + "logits/rejected": -0.5836154222488403, + "logps/chosen": -44.02432632446289, + "logps/rejected": -97.3012924194336, + "loss": 0.5642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3345234394073486, + "rewards/margins": 6.692220211029053, + "rewards/rejected": -3.357697010040283, + "step": 15665 + }, + { + "epoch": 3.92, + "grad_norm": 23.014822006225586, + "learning_rate": 1.1087938546060568e-06, + "logits/chosen": -0.5301991701126099, + "logits/rejected": -0.5752062201499939, + "logps/chosen": -64.19721984863281, + "logps/rejected": -127.14643859863281, + "loss": 0.7023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087336301803589, + "rewards/margins": 8.097896575927734, + "rewards/rejected": -5.010559558868408, + "step": 15666 + }, + { + "epoch": 3.92, + "grad_norm": 3.963486671447754, + "learning_rate": 1.1083003296057438e-06, + "logits/chosen": -0.502364456653595, + "logits/rejected": -0.5162118673324585, + "logps/chosen": -57.846343994140625, + "logps/rejected": -116.1263198852539, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.297384262084961, + "rewards/margins": 6.763792514801025, + "rewards/rejected": -3.4664082527160645, + "step": 15667 + }, + { + "epoch": 3.92, + "grad_norm": 7.994905471801758, + "learning_rate": 1.1078069007734699e-06, + "logits/chosen": -0.5305598974227905, + "logits/rejected": -0.6012060642242432, + "logps/chosen": -46.63458251953125, + "logps/rejected": -110.4513168334961, + "loss": 0.5573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.488279342651367, + "rewards/margins": 7.223033905029297, + "rewards/rejected": -3.734755277633667, + "step": 15668 + }, + { + "epoch": 3.92, + "grad_norm": 3.4240915775299072, + "learning_rate": 1.1073135681214282e-06, + "logits/chosen": -0.557510256767273, + "logits/rejected": -0.6234726905822754, + "logps/chosen": -56.95839309692383, + "logps/rejected": -114.848388671875, + "loss": 0.573, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.23913836479187, + "rewards/margins": 7.30361270904541, + "rewards/rejected": -4.064474105834961, + "step": 15669 + }, + { + "epoch": 3.92, + "grad_norm": 8.462224960327148, + "learning_rate": 1.1068203316618064e-06, + "logits/chosen": -0.5317196846008301, + "logits/rejected": -0.6202431321144104, + "logps/chosen": -70.101318359375, + "logps/rejected": -93.11740112304688, + "loss": 0.7172, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.946418046951294, + "rewards/margins": 6.082509994506836, + "rewards/rejected": -3.136091947555542, + "step": 15670 + }, + { + "epoch": 3.92, + "grad_norm": 4.804278373718262, + "learning_rate": 1.1063271914067975e-06, + "logits/chosen": -0.5310430526733398, + "logits/rejected": -0.560707688331604, + "logps/chosen": -48.44167709350586, + "logps/rejected": -85.01299285888672, + "loss": 0.6374, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.005592107772827, + "rewards/margins": 5.783184051513672, + "rewards/rejected": -2.7775917053222656, + "step": 15671 + }, + { + "epoch": 3.92, + "grad_norm": 5.792605876922607, + "learning_rate": 1.1058341473685846e-06, + "logits/chosen": -0.45485520362854004, + "logits/rejected": -0.5221613645553589, + "logps/chosen": -56.36271667480469, + "logps/rejected": -104.7085952758789, + "loss": 0.6734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9830217361450195, + "rewards/margins": 6.807154178619385, + "rewards/rejected": -3.824132204055786, + "step": 15672 + }, + { + "epoch": 3.92, + "grad_norm": 2.872716188430786, + "learning_rate": 1.105341199559351e-06, + "logits/chosen": -0.5853802561759949, + "logits/rejected": -0.68391352891922, + "logps/chosen": -66.30878448486328, + "logps/rejected": -116.41480255126953, + "loss": 0.6017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8726625442504883, + "rewards/margins": 8.113877296447754, + "rewards/rejected": -5.241214752197266, + "step": 15673 + }, + { + "epoch": 3.92, + "grad_norm": 47.84946823120117, + "learning_rate": 1.1048483479912786e-06, + "logits/chosen": -0.5199374556541443, + "logits/rejected": -0.5471999645233154, + "logps/chosen": -54.03368377685547, + "logps/rejected": -106.77011108398438, + "loss": 0.7616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.012147903442383, + "rewards/margins": 6.695998191833496, + "rewards/rejected": -3.683850049972534, + "step": 15674 + }, + { + "epoch": 3.92, + "grad_norm": 2.431171178817749, + "learning_rate": 1.1043555926765492e-06, + "logits/chosen": -0.5704438090324402, + "logits/rejected": -0.6365381479263306, + "logps/chosen": -50.72785568237305, + "logps/rejected": -128.01068115234375, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2573156356811523, + "rewards/margins": 8.59081745147705, + "rewards/rejected": -5.333501815795898, + "step": 15675 + }, + { + "epoch": 3.92, + "grad_norm": 12.967793464660645, + "learning_rate": 1.1038629336273342e-06, + "logits/chosen": -0.5167375802993774, + "logits/rejected": -0.5776508450508118, + "logps/chosen": -46.19413375854492, + "logps/rejected": -113.33838653564453, + "loss": 0.6597, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9929401874542236, + "rewards/margins": 7.795649528503418, + "rewards/rejected": -4.802708625793457, + "step": 15676 + }, + { + "epoch": 3.92, + "grad_norm": 6.984264850616455, + "learning_rate": 1.1033703708558103e-06, + "logits/chosen": -0.5103588104248047, + "logits/rejected": -0.6014965772628784, + "logps/chosen": -74.56171417236328, + "logps/rejected": -110.44538879394531, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1682777404785156, + "rewards/margins": 7.071480751037598, + "rewards/rejected": -3.903203248977661, + "step": 15677 + }, + { + "epoch": 3.92, + "grad_norm": 3.028130054473877, + "learning_rate": 1.10287790437415e-06, + "logits/chosen": -0.571195125579834, + "logits/rejected": -0.6087424159049988, + "logps/chosen": -56.50265884399414, + "logps/rejected": -114.11543273925781, + "loss": 0.6232, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6089909076690674, + "rewards/margins": 7.015931606292725, + "rewards/rejected": -4.406940460205078, + "step": 15678 + }, + { + "epoch": 3.92, + "grad_norm": 3.398653745651245, + "learning_rate": 1.1023855341945227e-06, + "logits/chosen": -0.510915219783783, + "logits/rejected": -0.6486937999725342, + "logps/chosen": -59.67581558227539, + "logps/rejected": -112.22802734375, + "loss": 0.6035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4733707904815674, + "rewards/margins": 8.688278198242188, + "rewards/rejected": -5.214907169342041, + "step": 15679 + }, + { + "epoch": 3.92, + "grad_norm": 4.89467716217041, + "learning_rate": 1.1018932603290927e-06, + "logits/chosen": -0.6035071611404419, + "logits/rejected": -0.6858944892883301, + "logps/chosen": -44.46822738647461, + "logps/rejected": -107.04438781738281, + "loss": 0.5291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0764594078063965, + "rewards/margins": 7.211778163909912, + "rewards/rejected": -4.135318279266357, + "step": 15680 + }, + { + "epoch": 3.92, + "grad_norm": 22.60584259033203, + "learning_rate": 1.101401082790028e-06, + "logits/chosen": -0.5290811657905579, + "logits/rejected": -0.5525768995285034, + "logps/chosen": -65.32086181640625, + "logps/rejected": -103.97027587890625, + "loss": 0.757, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.944955825805664, + "rewards/margins": 5.784939765930176, + "rewards/rejected": -2.8399832248687744, + "step": 15681 + }, + { + "epoch": 3.92, + "grad_norm": 7.608678340911865, + "learning_rate": 1.1009090015894886e-06, + "logits/chosen": -0.6007500886917114, + "logits/rejected": -0.7271316051483154, + "logps/chosen": -60.465782165527344, + "logps/rejected": -94.25900268554688, + "loss": 0.6411, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2118875980377197, + "rewards/margins": 7.13283634185791, + "rewards/rejected": -3.9209487438201904, + "step": 15682 + }, + { + "epoch": 3.92, + "grad_norm": 6.23366117477417, + "learning_rate": 1.1004170167396344e-06, + "logits/chosen": -0.5308226943016052, + "logits/rejected": -0.6605833768844604, + "logps/chosen": -70.59280395507812, + "logps/rejected": -97.79389953613281, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.68550443649292, + "rewards/margins": 7.013710975646973, + "rewards/rejected": -4.328206539154053, + "step": 15683 + }, + { + "epoch": 3.92, + "grad_norm": 8.942232131958008, + "learning_rate": 1.0999251282526248e-06, + "logits/chosen": -0.5324493646621704, + "logits/rejected": -0.6133196353912354, + "logps/chosen": -65.70068359375, + "logps/rejected": -92.91525268554688, + "loss": 0.7331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.033402442932129, + "rewards/margins": 6.711559295654297, + "rewards/rejected": -3.6781561374664307, + "step": 15684 + }, + { + "epoch": 3.92, + "grad_norm": 9.0839204788208, + "learning_rate": 1.0994333361406117e-06, + "logits/chosen": -0.5981014370918274, + "logits/rejected": -0.6619121432304382, + "logps/chosen": -55.989620208740234, + "logps/rejected": -106.27275848388672, + "loss": 0.715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7049121856689453, + "rewards/margins": 6.522686958312988, + "rewards/rejected": -3.817775011062622, + "step": 15685 + }, + { + "epoch": 3.92, + "grad_norm": 6.70070743560791, + "learning_rate": 1.0989416404157515e-06, + "logits/chosen": -0.5271145701408386, + "logits/rejected": -0.6024487614631653, + "logps/chosen": -66.39664459228516, + "logps/rejected": -122.00875854492188, + "loss": 0.5961, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.235219955444336, + "rewards/margins": 6.975905418395996, + "rewards/rejected": -3.740685224533081, + "step": 15686 + }, + { + "epoch": 3.92, + "grad_norm": 5.587795257568359, + "learning_rate": 1.0984500410901928e-06, + "logits/chosen": -0.6000420451164246, + "logits/rejected": -0.6900162696838379, + "logps/chosen": -49.973751068115234, + "logps/rejected": -108.87896728515625, + "loss": 0.5516, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.902596950531006, + "rewards/margins": 7.7611470222473145, + "rewards/rejected": -4.858551025390625, + "step": 15687 + }, + { + "epoch": 3.92, + "grad_norm": 4.821514129638672, + "learning_rate": 1.0979585381760821e-06, + "logits/chosen": -0.5466112494468689, + "logits/rejected": -0.6342377066612244, + "logps/chosen": -53.29393768310547, + "logps/rejected": -103.24972534179688, + "loss": 0.6328, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.290356397628784, + "rewards/margins": 7.949615478515625, + "rewards/rejected": -4.65925931930542, + "step": 15688 + }, + { + "epoch": 3.92, + "grad_norm": 6.274806499481201, + "learning_rate": 1.0974671316855673e-06, + "logits/chosen": -0.6110928654670715, + "logits/rejected": -0.687647819519043, + "logps/chosen": -51.274375915527344, + "logps/rejected": -88.4769058227539, + "loss": 0.6327, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.232194662094116, + "rewards/margins": 6.736761093139648, + "rewards/rejected": -3.504565954208374, + "step": 15689 + }, + { + "epoch": 3.93, + "grad_norm": 5.532407283782959, + "learning_rate": 1.0969758216307908e-06, + "logits/chosen": -0.5215083956718445, + "logits/rejected": -0.6136659383773804, + "logps/chosen": -50.35038757324219, + "logps/rejected": -119.00502014160156, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.703429698944092, + "rewards/margins": 8.444026947021484, + "rewards/rejected": -5.740597724914551, + "step": 15690 + }, + { + "epoch": 3.93, + "grad_norm": 7.380120277404785, + "learning_rate": 1.0964846080238917e-06, + "logits/chosen": -0.5408686399459839, + "logits/rejected": -0.5625344514846802, + "logps/chosen": -50.61741256713867, + "logps/rejected": -113.59707641601562, + "loss": 0.6377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9134020805358887, + "rewards/margins": 7.351241588592529, + "rewards/rejected": -4.437839031219482, + "step": 15691 + }, + { + "epoch": 3.93, + "grad_norm": 4.341063022613525, + "learning_rate": 1.0959934908770104e-06, + "logits/chosen": -0.5169442892074585, + "logits/rejected": -0.6069350838661194, + "logps/chosen": -54.8259162902832, + "logps/rejected": -115.84857177734375, + "loss": 0.5452, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164747714996338, + "rewards/margins": 7.6268463134765625, + "rewards/rejected": -4.462098121643066, + "step": 15692 + }, + { + "epoch": 3.93, + "grad_norm": 4.602824687957764, + "learning_rate": 1.0955024702022831e-06, + "logits/chosen": -0.5357547402381897, + "logits/rejected": -0.6180423498153687, + "logps/chosen": -65.30992126464844, + "logps/rejected": -130.3902130126953, + "loss": 0.5744, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7337615489959717, + "rewards/margins": 7.927133083343506, + "rewards/rejected": -5.193371772766113, + "step": 15693 + }, + { + "epoch": 3.93, + "grad_norm": 17.14271354675293, + "learning_rate": 1.0950115460118433e-06, + "logits/chosen": -0.5494863390922546, + "logits/rejected": -0.6217170357704163, + "logps/chosen": -52.536598205566406, + "logps/rejected": -88.15856170654297, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.833460807800293, + "rewards/margins": 6.755680084228516, + "rewards/rejected": -3.9222192764282227, + "step": 15694 + }, + { + "epoch": 3.93, + "grad_norm": 2.6504874229431152, + "learning_rate": 1.0945207183178201e-06, + "logits/chosen": -0.5455238819122314, + "logits/rejected": -0.5889199376106262, + "logps/chosen": -54.795326232910156, + "logps/rejected": -104.85765838623047, + "loss": 0.6513, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1109957695007324, + "rewards/margins": 6.5540337562561035, + "rewards/rejected": -3.44303822517395, + "step": 15695 + }, + { + "epoch": 3.93, + "grad_norm": 6.974733352661133, + "learning_rate": 1.0940299871323452e-06, + "logits/chosen": -0.5231245756149292, + "logits/rejected": -0.5624802708625793, + "logps/chosen": -55.1107063293457, + "logps/rejected": -105.57711791992188, + "loss": 0.694, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1243510246276855, + "rewards/margins": 6.868638515472412, + "rewards/rejected": -3.7442874908447266, + "step": 15696 + }, + { + "epoch": 3.93, + "grad_norm": 4.339298725128174, + "learning_rate": 1.0935393524675442e-06, + "logits/chosen": -0.48506394028663635, + "logits/rejected": -0.6033264398574829, + "logps/chosen": -48.3109016418457, + "logps/rejected": -99.62185668945312, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.377915620803833, + "rewards/margins": 8.316601753234863, + "rewards/rejected": -4.938685894012451, + "step": 15697 + }, + { + "epoch": 3.93, + "grad_norm": 2.8319032192230225, + "learning_rate": 1.0930488143355394e-06, + "logits/chosen": -0.5571364164352417, + "logits/rejected": -0.6429664492607117, + "logps/chosen": -57.79608154296875, + "logps/rejected": -102.37763977050781, + "loss": 0.5709, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.337574005126953, + "rewards/margins": 7.821599960327148, + "rewards/rejected": -4.484025955200195, + "step": 15698 + }, + { + "epoch": 3.93, + "grad_norm": 11.328118324279785, + "learning_rate": 1.0925583727484556e-06, + "logits/chosen": -0.5350642204284668, + "logits/rejected": -0.6410431861877441, + "logps/chosen": -52.19284439086914, + "logps/rejected": -96.15960693359375, + "loss": 0.5676, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.415647506713867, + "rewards/margins": 7.108375549316406, + "rewards/rejected": -3.69272780418396, + "step": 15699 + }, + { + "epoch": 3.93, + "grad_norm": 28.969058990478516, + "learning_rate": 1.0920680277184104e-06, + "logits/chosen": -0.5087650418281555, + "logits/rejected": -0.5698766708374023, + "logps/chosen": -58.57011032104492, + "logps/rejected": -105.86700439453125, + "loss": 0.7293, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.968595027923584, + "rewards/margins": 6.337927341461182, + "rewards/rejected": -3.3693325519561768, + "step": 15700 + }, + { + "epoch": 3.93, + "grad_norm": 1.827587366104126, + "learning_rate": 1.0915777792575194e-06, + "logits/chosen": -0.5098561644554138, + "logits/rejected": -0.6049185395240784, + "logps/chosen": -62.41549301147461, + "logps/rejected": -104.97312927246094, + "loss": 0.5523, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1484532356262207, + "rewards/margins": 8.262763023376465, + "rewards/rejected": -5.114309787750244, + "step": 15701 + }, + { + "epoch": 3.93, + "grad_norm": 5.506968021392822, + "learning_rate": 1.0910876273779003e-06, + "logits/chosen": -0.4420337677001953, + "logits/rejected": -0.5633158087730408, + "logps/chosen": -60.173858642578125, + "logps/rejected": -112.90787506103516, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.189758777618408, + "rewards/margins": 9.078619956970215, + "rewards/rejected": -5.888861656188965, + "step": 15702 + }, + { + "epoch": 3.93, + "grad_norm": 5.344860076904297, + "learning_rate": 1.0905975720916617e-06, + "logits/chosen": -0.5508242249488831, + "logits/rejected": -0.6630011796951294, + "logps/chosen": -66.19063568115234, + "logps/rejected": -95.69908905029297, + "loss": 0.6976, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100405216217041, + "rewards/margins": 6.982611179351807, + "rewards/rejected": -3.8822054862976074, + "step": 15703 + }, + { + "epoch": 3.93, + "grad_norm": 8.31047534942627, + "learning_rate": 1.0901076134109172e-06, + "logits/chosen": -0.5275818109512329, + "logits/rejected": -0.6422910690307617, + "logps/chosen": -72.56805419921875, + "logps/rejected": -128.43380737304688, + "loss": 0.7428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1908929347991943, + "rewards/margins": 7.69256067276001, + "rewards/rejected": -4.501667499542236, + "step": 15704 + }, + { + "epoch": 3.93, + "grad_norm": 20.87468719482422, + "learning_rate": 1.0896177513477712e-06, + "logits/chosen": -0.5828156471252441, + "logits/rejected": -0.6358094215393066, + "logps/chosen": -49.53145217895508, + "logps/rejected": -116.3838119506836, + "loss": 0.6077, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.241401195526123, + "rewards/margins": 7.898194789886475, + "rewards/rejected": -4.656793594360352, + "step": 15705 + }, + { + "epoch": 3.93, + "grad_norm": 2.7327489852905273, + "learning_rate": 1.0891279859143306e-06, + "logits/chosen": -0.524293839931488, + "logits/rejected": -0.5391069054603577, + "logps/chosen": -42.00389099121094, + "logps/rejected": -113.79611206054688, + "loss": 0.5636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.538534641265869, + "rewards/margins": 7.975433349609375, + "rewards/rejected": -4.436898708343506, + "step": 15706 + }, + { + "epoch": 3.93, + "grad_norm": 5.914847373962402, + "learning_rate": 1.0886383171226978e-06, + "logits/chosen": -0.5456748604774475, + "logits/rejected": -0.6341215968132019, + "logps/chosen": -52.48381042480469, + "logps/rejected": -98.28485107421875, + "loss": 0.6386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.362689256668091, + "rewards/margins": 7.12519645690918, + "rewards/rejected": -3.7625069618225098, + "step": 15707 + }, + { + "epoch": 3.93, + "grad_norm": 3.347522258758545, + "learning_rate": 1.0881487449849704e-06, + "logits/chosen": -0.5811198353767395, + "logits/rejected": -0.6840068697929382, + "logps/chosen": -53.7943000793457, + "logps/rejected": -136.45310974121094, + "loss": 0.5471, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2886765003204346, + "rewards/margins": 9.729804039001465, + "rewards/rejected": -6.441127777099609, + "step": 15708 + }, + { + "epoch": 3.93, + "grad_norm": 3.812772750854492, + "learning_rate": 1.0876592695132504e-06, + "logits/chosen": -0.5719367861747742, + "logits/rejected": -0.6225650310516357, + "logps/chosen": -46.59751892089844, + "logps/rejected": -105.41905975341797, + "loss": 0.6168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.358351230621338, + "rewards/margins": 7.048704624176025, + "rewards/rejected": -3.6903531551361084, + "step": 15709 + }, + { + "epoch": 3.93, + "grad_norm": 5.703127861022949, + "learning_rate": 1.0871698907196292e-06, + "logits/chosen": -0.5732746720314026, + "logits/rejected": -0.713062584400177, + "logps/chosen": -53.83871841430664, + "logps/rejected": -122.46017456054688, + "loss": 0.574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2322371006011963, + "rewards/margins": 8.800596237182617, + "rewards/rejected": -5.568359851837158, + "step": 15710 + }, + { + "epoch": 3.93, + "grad_norm": 6.3383026123046875, + "learning_rate": 1.086680608616204e-06, + "logits/chosen": -0.5226543545722961, + "logits/rejected": -0.6106317043304443, + "logps/chosen": -53.32221984863281, + "logps/rejected": -100.48554229736328, + "loss": 0.5814, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.040156364440918, + "rewards/margins": 6.644510269165039, + "rewards/rejected": -3.604353904724121, + "step": 15711 + }, + { + "epoch": 3.93, + "grad_norm": 29.277019500732422, + "learning_rate": 1.0861914232150627e-06, + "logits/chosen": -0.49828648567199707, + "logits/rejected": -0.5916684865951538, + "logps/chosen": -49.85160827636719, + "logps/rejected": -109.25702667236328, + "loss": 0.6822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8825483322143555, + "rewards/margins": 6.734755039215088, + "rewards/rejected": -3.8522064685821533, + "step": 15712 + }, + { + "epoch": 3.93, + "grad_norm": 2.7570602893829346, + "learning_rate": 1.085702334528293e-06, + "logits/chosen": -0.5329886078834534, + "logits/rejected": -0.6038183569908142, + "logps/chosen": -60.193267822265625, + "logps/rejected": -95.48037719726562, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.952075719833374, + "rewards/margins": 6.468561172485352, + "rewards/rejected": -3.5164856910705566, + "step": 15713 + }, + { + "epoch": 3.93, + "grad_norm": 8.744433403015137, + "learning_rate": 1.085213342567984e-06, + "logits/chosen": -0.5960705280303955, + "logits/rejected": -0.6457113027572632, + "logps/chosen": -51.295265197753906, + "logps/rejected": -89.76034545898438, + "loss": 0.6842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0883209705352783, + "rewards/margins": 5.042981147766113, + "rewards/rejected": -1.9546610116958618, + "step": 15714 + }, + { + "epoch": 3.93, + "grad_norm": 4.451537609100342, + "learning_rate": 1.0847244473462165e-06, + "logits/chosen": -0.5066279172897339, + "logits/rejected": -0.5649404525756836, + "logps/chosen": -55.50472640991211, + "logps/rejected": -105.35750579833984, + "loss": 0.6436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2184646129608154, + "rewards/margins": 6.269271373748779, + "rewards/rejected": -3.050806999206543, + "step": 15715 + }, + { + "epoch": 3.93, + "grad_norm": 2.8710598945617676, + "learning_rate": 1.0842356488750715e-06, + "logits/chosen": -0.5518075823783875, + "logits/rejected": -0.634277880191803, + "logps/chosen": -59.37147903442383, + "logps/rejected": -113.79844665527344, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.411116600036621, + "rewards/margins": 7.533888816833496, + "rewards/rejected": -4.122772693634033, + "step": 15716 + }, + { + "epoch": 3.93, + "grad_norm": 7.858954429626465, + "learning_rate": 1.0837469471666284e-06, + "logits/chosen": -0.4983295798301697, + "logits/rejected": -0.5937310457229614, + "logps/chosen": -60.78013610839844, + "logps/rejected": -104.82691955566406, + "loss": 0.6784, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2775285243988037, + "rewards/margins": 6.710997104644775, + "rewards/rejected": -3.433468818664551, + "step": 15717 + }, + { + "epoch": 3.93, + "grad_norm": 9.93868350982666, + "learning_rate": 1.0832583422329662e-06, + "logits/chosen": -0.6444429159164429, + "logits/rejected": -0.6467357873916626, + "logps/chosen": -72.83869171142578, + "logps/rejected": -104.15377807617188, + "loss": 0.5999, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2589845657348633, + "rewards/margins": 6.739272117614746, + "rewards/rejected": -3.4802873134613037, + "step": 15718 + }, + { + "epoch": 3.93, + "grad_norm": 5.143263339996338, + "learning_rate": 1.0827698340861541e-06, + "logits/chosen": -0.5365851521492004, + "logits/rejected": -0.5956104397773743, + "logps/chosen": -51.02907180786133, + "logps/rejected": -100.8100357055664, + "loss": 0.6145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939906358718872, + "rewards/margins": 6.261679172515869, + "rewards/rejected": -3.321772575378418, + "step": 15719 + }, + { + "epoch": 3.93, + "grad_norm": 3.7181427478790283, + "learning_rate": 1.082281422738266e-06, + "logits/chosen": -0.5397674441337585, + "logits/rejected": -0.5882764458656311, + "logps/chosen": -55.86998748779297, + "logps/rejected": -116.53932189941406, + "loss": 0.6299, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.315563201904297, + "rewards/margins": 6.922971725463867, + "rewards/rejected": -3.6074090003967285, + "step": 15720 + }, + { + "epoch": 3.93, + "grad_norm": 5.857784748077393, + "learning_rate": 1.0817931082013733e-06, + "logits/chosen": -0.6308721899986267, + "logits/rejected": -0.6722901463508606, + "logps/chosen": -45.03557205200195, + "logps/rejected": -101.59860229492188, + "loss": 0.6637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1916096210479736, + "rewards/margins": 7.2401957511901855, + "rewards/rejected": -4.048585891723633, + "step": 15721 + }, + { + "epoch": 3.93, + "grad_norm": 9.272231101989746, + "learning_rate": 1.0813048904875385e-06, + "logits/chosen": -0.5515414476394653, + "logits/rejected": -0.6191151142120361, + "logps/chosen": -55.5604133605957, + "logps/rejected": -88.62809753417969, + "loss": 0.715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8742096424102783, + "rewards/margins": 6.086717128753662, + "rewards/rejected": -3.212507486343384, + "step": 15722 + }, + { + "epoch": 3.93, + "grad_norm": 4.898706912994385, + "learning_rate": 1.0808167696088272e-06, + "logits/chosen": -0.5754886269569397, + "logits/rejected": -0.6783941984176636, + "logps/chosen": -57.11037063598633, + "logps/rejected": -114.7645034790039, + "loss": 0.6526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8168721199035645, + "rewards/margins": 7.340327262878418, + "rewards/rejected": -4.5234551429748535, + "step": 15723 + }, + { + "epoch": 3.93, + "grad_norm": 3.4065968990325928, + "learning_rate": 1.0803287455773038e-06, + "logits/chosen": -0.5256890058517456, + "logits/rejected": -0.569094181060791, + "logps/chosen": -50.45088195800781, + "logps/rejected": -115.85911560058594, + "loss": 0.5997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.515354633331299, + "rewards/margins": 7.869694232940674, + "rewards/rejected": -4.354340553283691, + "step": 15724 + }, + { + "epoch": 3.93, + "grad_norm": 5.2495832443237305, + "learning_rate": 1.0798408184050262e-06, + "logits/chosen": -0.5691392421722412, + "logits/rejected": -0.6673600673675537, + "logps/chosen": -61.58905792236328, + "logps/rejected": -114.06055450439453, + "loss": 0.6227, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.904208183288574, + "rewards/margins": 7.936275959014893, + "rewards/rejected": -5.032067775726318, + "step": 15725 + }, + { + "epoch": 3.93, + "grad_norm": 4.425454139709473, + "learning_rate": 1.0793529881040505e-06, + "logits/chosen": -0.5610882043838501, + "logits/rejected": -0.598918616771698, + "logps/chosen": -39.41720199584961, + "logps/rejected": -108.48977661132812, + "loss": 0.5739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.130636215209961, + "rewards/margins": 7.215780258178711, + "rewards/rejected": -4.08514404296875, + "step": 15726 + }, + { + "epoch": 3.93, + "grad_norm": 5.968224048614502, + "learning_rate": 1.0788652546864337e-06, + "logits/chosen": -0.5166226029396057, + "logits/rejected": -0.6317105889320374, + "logps/chosen": -55.98164749145508, + "logps/rejected": -86.40750122070312, + "loss": 0.5733, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.256687641143799, + "rewards/margins": 7.486694812774658, + "rewards/rejected": -4.230007648468018, + "step": 15727 + }, + { + "epoch": 3.93, + "grad_norm": 8.238524436950684, + "learning_rate": 1.0783776181642274e-06, + "logits/chosen": -0.4946695566177368, + "logits/rejected": -0.6074110269546509, + "logps/chosen": -57.43626403808594, + "logps/rejected": -113.64576721191406, + "loss": 0.6523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.126450300216675, + "rewards/margins": 7.491477966308594, + "rewards/rejected": -4.365027904510498, + "step": 15728 + }, + { + "epoch": 3.93, + "grad_norm": 7.351884841918945, + "learning_rate": 1.0778900785494801e-06, + "logits/chosen": -0.5650158524513245, + "logits/rejected": -0.6350207328796387, + "logps/chosen": -52.079444885253906, + "logps/rejected": -98.77344512939453, + "loss": 0.6767, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2953202724456787, + "rewards/margins": 7.1700592041015625, + "rewards/rejected": -3.8747384548187256, + "step": 15729 + }, + { + "epoch": 3.94, + "grad_norm": 3.7110095024108887, + "learning_rate": 1.0774026358542416e-06, + "logits/chosen": -0.533290684223175, + "logits/rejected": -0.6391655206680298, + "logps/chosen": -50.256412506103516, + "logps/rejected": -95.43826293945312, + "loss": 0.6368, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0632362365722656, + "rewards/margins": 6.791945457458496, + "rewards/rejected": -3.7287094593048096, + "step": 15730 + }, + { + "epoch": 3.94, + "grad_norm": 6.93951940536499, + "learning_rate": 1.0769152900905554e-06, + "logits/chosen": -0.6707174777984619, + "logits/rejected": -0.7286849021911621, + "logps/chosen": -64.37997436523438, + "logps/rejected": -105.29721069335938, + "loss": 0.7035, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.460240602493286, + "rewards/margins": 6.800261974334717, + "rewards/rejected": -3.3400213718414307, + "step": 15731 + }, + { + "epoch": 3.94, + "grad_norm": 4.35860013961792, + "learning_rate": 1.0764280412704665e-06, + "logits/chosen": -0.5857553482055664, + "logits/rejected": -0.6660876274108887, + "logps/chosen": -60.860870361328125, + "logps/rejected": -94.35797882080078, + "loss": 0.6588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.441929578781128, + "rewards/margins": 7.663420677185059, + "rewards/rejected": -4.22149133682251, + "step": 15732 + }, + { + "epoch": 3.94, + "grad_norm": 3.1325623989105225, + "learning_rate": 1.0759408894060135e-06, + "logits/chosen": -0.5401443839073181, + "logits/rejected": -0.6763478517532349, + "logps/chosen": -58.26567840576172, + "logps/rejected": -98.04071044921875, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148902654647827, + "rewards/margins": 7.969508171081543, + "rewards/rejected": -4.820605278015137, + "step": 15733 + }, + { + "epoch": 3.94, + "grad_norm": 7.979530334472656, + "learning_rate": 1.0754538345092336e-06, + "logits/chosen": -0.5296075344085693, + "logits/rejected": -0.6298742890357971, + "logps/chosen": -64.58612060546875, + "logps/rejected": -113.33609008789062, + "loss": 0.643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0383453369140625, + "rewards/margins": 7.666152477264404, + "rewards/rejected": -4.6278076171875, + "step": 15734 + }, + { + "epoch": 3.94, + "grad_norm": 7.693889617919922, + "learning_rate": 1.0749668765921645e-06, + "logits/chosen": -0.4904845952987671, + "logits/rejected": -0.6250056028366089, + "logps/chosen": -69.41912078857422, + "logps/rejected": -107.82084655761719, + "loss": 0.6076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7102317810058594, + "rewards/margins": 7.7506256103515625, + "rewards/rejected": -5.040394306182861, + "step": 15735 + }, + { + "epoch": 3.94, + "grad_norm": 6.876379489898682, + "learning_rate": 1.0744800156668395e-06, + "logits/chosen": -0.4744710922241211, + "logits/rejected": -0.5560205578804016, + "logps/chosen": -68.25132751464844, + "logps/rejected": -123.71224975585938, + "loss": 0.6381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7560465335845947, + "rewards/margins": 8.096966743469238, + "rewards/rejected": -5.3409199714660645, + "step": 15736 + }, + { + "epoch": 3.94, + "grad_norm": 3.3714523315429688, + "learning_rate": 1.0739932517452888e-06, + "logits/chosen": -0.6207103133201599, + "logits/rejected": -0.6291120648384094, + "logps/chosen": -41.71319580078125, + "logps/rejected": -114.54336547851562, + "loss": 0.5315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3665060997009277, + "rewards/margins": 7.313959121704102, + "rewards/rejected": -3.947453498840332, + "step": 15737 + }, + { + "epoch": 3.94, + "grad_norm": 8.70803451538086, + "learning_rate": 1.0735065848395394e-06, + "logits/chosen": -0.5683379173278809, + "logits/rejected": -0.6471959352493286, + "logps/chosen": -52.8353157043457, + "logps/rejected": -100.58224487304688, + "loss": 0.6582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8709702491760254, + "rewards/margins": 6.98486852645874, + "rewards/rejected": -4.113898277282715, + "step": 15738 + }, + { + "epoch": 3.94, + "grad_norm": 13.496835708618164, + "learning_rate": 1.0730200149616199e-06, + "logits/chosen": -0.6102061867713928, + "logits/rejected": -0.6948701739311218, + "logps/chosen": -59.62664031982422, + "logps/rejected": -102.39478302001953, + "loss": 0.8519, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.067336320877075, + "rewards/margins": 6.588709831237793, + "rewards/rejected": -3.521373748779297, + "step": 15739 + }, + { + "epoch": 3.94, + "grad_norm": 7.3155975341796875, + "learning_rate": 1.0725335421235527e-06, + "logits/chosen": -0.594306230545044, + "logits/rejected": -0.6580476760864258, + "logps/chosen": -45.81317901611328, + "logps/rejected": -98.42216491699219, + "loss": 0.5723, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.234435796737671, + "rewards/margins": 7.810646057128906, + "rewards/rejected": -4.576210021972656, + "step": 15740 + }, + { + "epoch": 3.94, + "grad_norm": 4.52398157119751, + "learning_rate": 1.0720471663373583e-06, + "logits/chosen": -0.6232349276542664, + "logits/rejected": -0.6007919907569885, + "logps/chosen": -56.15210723876953, + "logps/rejected": -130.93556213378906, + "loss": 0.5864, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1384713649749756, + "rewards/margins": 7.144090175628662, + "rewards/rejected": -4.005618572235107, + "step": 15741 + }, + { + "epoch": 3.94, + "grad_norm": 4.107247829437256, + "learning_rate": 1.0715608876150574e-06, + "logits/chosen": -0.47560685873031616, + "logits/rejected": -0.559506356716156, + "logps/chosen": -59.126068115234375, + "logps/rejected": -115.27452087402344, + "loss": 0.6197, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.174283504486084, + "rewards/margins": 6.713819980621338, + "rewards/rejected": -3.539536476135254, + "step": 15742 + }, + { + "epoch": 3.94, + "grad_norm": 4.0923943519592285, + "learning_rate": 1.0710747059686654e-06, + "logits/chosen": -0.5704017877578735, + "logits/rejected": -0.6505269408226013, + "logps/chosen": -51.79984664916992, + "logps/rejected": -119.03797912597656, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.080737352371216, + "rewards/margins": 8.655214309692383, + "rewards/rejected": -5.574476718902588, + "step": 15743 + }, + { + "epoch": 3.94, + "grad_norm": 3.61114501953125, + "learning_rate": 1.070588621410195e-06, + "logits/chosen": -0.5540509223937988, + "logits/rejected": -0.6101527810096741, + "logps/chosen": -70.30012512207031, + "logps/rejected": -112.5701675415039, + "loss": 0.7952, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.130121946334839, + "rewards/margins": 6.89879035949707, + "rewards/rejected": -3.7686686515808105, + "step": 15744 + }, + { + "epoch": 3.94, + "grad_norm": 16.624614715576172, + "learning_rate": 1.0701026339516606e-06, + "logits/chosen": -0.5512492656707764, + "logits/rejected": -0.6295515298843384, + "logps/chosen": -61.02085876464844, + "logps/rejected": -111.72998809814453, + "loss": 0.6397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8776021003723145, + "rewards/margins": 6.929443359375, + "rewards/rejected": -4.051840782165527, + "step": 15745 + }, + { + "epoch": 3.94, + "grad_norm": 15.619778633117676, + "learning_rate": 1.0696167436050703e-06, + "logits/chosen": -0.4693199694156647, + "logits/rejected": -0.5723813772201538, + "logps/chosen": -56.41827392578125, + "logps/rejected": -117.33123779296875, + "loss": 0.6644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.722529888153076, + "rewards/margins": 7.6246867179870605, + "rewards/rejected": -4.902156829833984, + "step": 15746 + }, + { + "epoch": 3.94, + "grad_norm": 1.9223670959472656, + "learning_rate": 1.0691309503824293e-06, + "logits/chosen": -0.5077678561210632, + "logits/rejected": -0.6831191778182983, + "logps/chosen": -61.219661712646484, + "logps/rejected": -96.32622528076172, + "loss": 0.5586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0311713218688965, + "rewards/margins": 8.72534465789795, + "rewards/rejected": -5.694173812866211, + "step": 15747 + }, + { + "epoch": 3.94, + "grad_norm": 25.054100036621094, + "learning_rate": 1.0686452542957437e-06, + "logits/chosen": -0.581354558467865, + "logits/rejected": -0.6716203689575195, + "logps/chosen": -52.63715362548828, + "logps/rejected": -101.79100036621094, + "loss": 0.7749, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.882061004638672, + "rewards/margins": 6.19531774520874, + "rewards/rejected": -3.3132574558258057, + "step": 15748 + }, + { + "epoch": 3.94, + "grad_norm": 31.301589965820312, + "learning_rate": 1.0681596553570166e-06, + "logits/chosen": -0.4911213517189026, + "logits/rejected": -0.5446022748947144, + "logps/chosen": -57.91118621826172, + "logps/rejected": -116.63719177246094, + "loss": 0.7545, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.019897222518921, + "rewards/margins": 6.390920639038086, + "rewards/rejected": -3.371023416519165, + "step": 15749 + }, + { + "epoch": 3.94, + "grad_norm": 4.484364986419678, + "learning_rate": 1.067674153578247e-06, + "logits/chosen": -0.559657096862793, + "logits/rejected": -0.6106321811676025, + "logps/chosen": -50.721351623535156, + "logps/rejected": -117.06607818603516, + "loss": 0.5192, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3921828269958496, + "rewards/margins": 7.4845428466796875, + "rewards/rejected": -4.09235954284668, + "step": 15750 + }, + { + "epoch": 3.94, + "grad_norm": 5.916884899139404, + "learning_rate": 1.06718874897143e-06, + "logits/chosen": -0.4845803678035736, + "logits/rejected": -0.5966799259185791, + "logps/chosen": -56.33718490600586, + "logps/rejected": -94.58403015136719, + "loss": 0.5841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0711841583251953, + "rewards/margins": 6.967181205749512, + "rewards/rejected": -3.895996570587158, + "step": 15751 + }, + { + "epoch": 3.94, + "grad_norm": 4.473735809326172, + "learning_rate": 1.0667034415485633e-06, + "logits/chosen": -0.5532783269882202, + "logits/rejected": -0.662903368473053, + "logps/chosen": -68.16496276855469, + "logps/rejected": -90.81716918945312, + "loss": 0.7003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2106282711029053, + "rewards/margins": 6.731362342834473, + "rewards/rejected": -3.5207343101501465, + "step": 15752 + }, + { + "epoch": 3.94, + "grad_norm": 5.125369548797607, + "learning_rate": 1.0662182313216385e-06, + "logits/chosen": -0.5292547941207886, + "logits/rejected": -0.5994212627410889, + "logps/chosen": -55.10560607910156, + "logps/rejected": -98.8309326171875, + "loss": 0.7098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.444643497467041, + "rewards/margins": 6.443586349487305, + "rewards/rejected": -2.9989426136016846, + "step": 15753 + }, + { + "epoch": 3.94, + "grad_norm": 4.285665035247803, + "learning_rate": 1.0657331183026437e-06, + "logits/chosen": -0.5653182864189148, + "logits/rejected": -0.6592807173728943, + "logps/chosen": -54.89419937133789, + "logps/rejected": -88.01702117919922, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.994025945663452, + "rewards/margins": 6.899748802185059, + "rewards/rejected": -3.9057223796844482, + "step": 15754 + }, + { + "epoch": 3.94, + "grad_norm": 7.906140327453613, + "learning_rate": 1.06524810250357e-06, + "logits/chosen": -0.5214784741401672, + "logits/rejected": -0.5269242525100708, + "logps/chosen": -67.74090576171875, + "logps/rejected": -111.80671691894531, + "loss": 0.7526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.229020833969116, + "rewards/margins": 5.204259872436523, + "rewards/rejected": -1.9752389192581177, + "step": 15755 + }, + { + "epoch": 3.94, + "grad_norm": 4.3129496574401855, + "learning_rate": 1.0647631839363993e-06, + "logits/chosen": -0.512015163898468, + "logits/rejected": -0.567499577999115, + "logps/chosen": -50.27107620239258, + "logps/rejected": -95.9355239868164, + "loss": 0.5454, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.198923110961914, + "rewards/margins": 6.553553581237793, + "rewards/rejected": -3.3546299934387207, + "step": 15756 + }, + { + "epoch": 3.94, + "grad_norm": 3.526320219039917, + "learning_rate": 1.064278362613117e-06, + "logits/chosen": -0.4945279359817505, + "logits/rejected": -0.5392565727233887, + "logps/chosen": -55.35807800292969, + "logps/rejected": -107.79423522949219, + "loss": 0.6237, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.402036428451538, + "rewards/margins": 6.4305806159973145, + "rewards/rejected": -3.0285439491271973, + "step": 15757 + }, + { + "epoch": 3.94, + "grad_norm": 7.549359321594238, + "learning_rate": 1.0637936385457032e-06, + "logits/chosen": -0.5197939872741699, + "logits/rejected": -0.5984359979629517, + "logps/chosen": -68.18132019042969, + "logps/rejected": -110.91267395019531, + "loss": 0.7413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6226820945739746, + "rewards/margins": 7.494660377502441, + "rewards/rejected": -4.871978282928467, + "step": 15758 + }, + { + "epoch": 3.94, + "grad_norm": 6.939568996429443, + "learning_rate": 1.0633090117461336e-06, + "logits/chosen": -0.603766918182373, + "logits/rejected": -0.7125741243362427, + "logps/chosen": -47.238765716552734, + "logps/rejected": -102.98084259033203, + "loss": 0.5934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139784812927246, + "rewards/margins": 8.010421752929688, + "rewards/rejected": -4.870636463165283, + "step": 15759 + }, + { + "epoch": 3.94, + "grad_norm": 4.313156604766846, + "learning_rate": 1.0628244822263855e-06, + "logits/chosen": -0.5501194000244141, + "logits/rejected": -0.6415835618972778, + "logps/chosen": -49.62776565551758, + "logps/rejected": -100.75042724609375, + "loss": 0.5717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0354204177856445, + "rewards/margins": 7.094909191131592, + "rewards/rejected": -4.0594892501831055, + "step": 15760 + }, + { + "epoch": 3.94, + "grad_norm": 6.066031455993652, + "learning_rate": 1.0623400499984355e-06, + "logits/chosen": -0.4495176076889038, + "logits/rejected": -0.5014142394065857, + "logps/chosen": -67.27142333984375, + "logps/rejected": -91.71841430664062, + "loss": 0.7769, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.091017484664917, + "rewards/margins": 4.733272552490234, + "rewards/rejected": -1.6422553062438965, + "step": 15761 + }, + { + "epoch": 3.94, + "grad_norm": 7.533059120178223, + "learning_rate": 1.061855715074248e-06, + "logits/chosen": -0.5454537868499756, + "logits/rejected": -0.5392829179763794, + "logps/chosen": -42.304141998291016, + "logps/rejected": -106.5970458984375, + "loss": 0.6408, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.149606227874756, + "rewards/margins": 6.301661968231201, + "rewards/rejected": -3.1520559787750244, + "step": 15762 + }, + { + "epoch": 3.94, + "grad_norm": 11.655600547790527, + "learning_rate": 1.0613714774657945e-06, + "logits/chosen": -0.5598145723342896, + "logits/rejected": -0.6180679798126221, + "logps/chosen": -57.26802444458008, + "logps/rejected": -114.71080017089844, + "loss": 0.7784, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0022215843200684, + "rewards/margins": 6.718190670013428, + "rewards/rejected": -3.7159695625305176, + "step": 15763 + }, + { + "epoch": 3.94, + "grad_norm": 5.766234397888184, + "learning_rate": 1.0608873371850443e-06, + "logits/chosen": -0.688418447971344, + "logits/rejected": -0.7394333481788635, + "logps/chosen": -52.51150894165039, + "logps/rejected": -105.35552215576172, + "loss": 0.742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.108450412750244, + "rewards/margins": 6.616174221038818, + "rewards/rejected": -3.507723569869995, + "step": 15764 + }, + { + "epoch": 3.94, + "grad_norm": 3.5726373195648193, + "learning_rate": 1.0604032942439545e-06, + "logits/chosen": -0.5565279126167297, + "logits/rejected": -0.6558615565299988, + "logps/chosen": -49.346134185791016, + "logps/rejected": -108.63427734375, + "loss": 0.6137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.72279953956604, + "rewards/margins": 6.572173118591309, + "rewards/rejected": -3.8493738174438477, + "step": 15765 + }, + { + "epoch": 3.94, + "grad_norm": 5.873712062835693, + "learning_rate": 1.0599193486544895e-06, + "logits/chosen": -0.6062952280044556, + "logits/rejected": -0.674451470375061, + "logps/chosen": -54.01654815673828, + "logps/rejected": -90.94890594482422, + "loss": 0.6969, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.813445568084717, + "rewards/margins": 6.030758857727051, + "rewards/rejected": -3.217312812805176, + "step": 15766 + }, + { + "epoch": 3.94, + "grad_norm": 3.7424702644348145, + "learning_rate": 1.0594355004286105e-06, + "logits/chosen": -0.5107647180557251, + "logits/rejected": -0.6294386386871338, + "logps/chosen": -54.76313018798828, + "logps/rejected": -92.0899887084961, + "loss": 0.5495, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.957000732421875, + "rewards/margins": 6.724287986755371, + "rewards/rejected": -3.7672882080078125, + "step": 15767 + }, + { + "epoch": 3.94, + "grad_norm": 11.807816505432129, + "learning_rate": 1.0589517495782708e-06, + "logits/chosen": -0.5665644407272339, + "logits/rejected": -0.6219701170921326, + "logps/chosen": -50.455474853515625, + "logps/rejected": -101.15159606933594, + "loss": 0.6559, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.32016658782959, + "rewards/margins": 6.4603118896484375, + "rewards/rejected": -3.1401455402374268, + "step": 15768 + }, + { + "epoch": 3.94, + "grad_norm": 4.143202781677246, + "learning_rate": 1.0584680961154241e-06, + "logits/chosen": -0.5434863567352295, + "logits/rejected": -0.6241956949234009, + "logps/chosen": -51.37758255004883, + "logps/rejected": -110.83878326416016, + "loss": 0.5567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068936824798584, + "rewards/margins": 7.995181560516357, + "rewards/rejected": -4.926244258880615, + "step": 15769 + }, + { + "epoch": 3.95, + "grad_norm": 11.758469581604004, + "learning_rate": 1.057984540052025e-06, + "logits/chosen": -0.5294625163078308, + "logits/rejected": -0.5784881114959717, + "logps/chosen": -55.3610725402832, + "logps/rejected": -116.7111587524414, + "loss": 0.6434, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.715012550354004, + "rewards/margins": 7.252460956573486, + "rewards/rejected": -4.537448406219482, + "step": 15770 + }, + { + "epoch": 3.95, + "grad_norm": 5.051446437835693, + "learning_rate": 1.0575010814000202e-06, + "logits/chosen": -0.5229164361953735, + "logits/rejected": -0.6438249349594116, + "logps/chosen": -54.455780029296875, + "logps/rejected": -101.11412048339844, + "loss": 0.6018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1324641704559326, + "rewards/margins": 7.685939788818359, + "rewards/rejected": -4.553475856781006, + "step": 15771 + }, + { + "epoch": 3.95, + "grad_norm": 10.899613380432129, + "learning_rate": 1.0570177201713562e-06, + "logits/chosen": -0.5078353881835938, + "logits/rejected": -0.6053369045257568, + "logps/chosen": -57.45184326171875, + "logps/rejected": -88.3773193359375, + "loss": 0.56, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3961801528930664, + "rewards/margins": 7.051440238952637, + "rewards/rejected": -3.6552605628967285, + "step": 15772 + }, + { + "epoch": 3.95, + "grad_norm": 6.626653671264648, + "learning_rate": 1.0565344563779794e-06, + "logits/chosen": -0.5542147159576416, + "logits/rejected": -0.6405966877937317, + "logps/chosen": -56.42252731323242, + "logps/rejected": -119.03158569335938, + "loss": 0.6556, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0549731254577637, + "rewards/margins": 8.524286270141602, + "rewards/rejected": -5.46931266784668, + "step": 15773 + }, + { + "epoch": 3.95, + "grad_norm": 7.86307430267334, + "learning_rate": 1.0560512900318292e-06, + "logits/chosen": -0.5029356479644775, + "logits/rejected": -0.5722943544387817, + "logps/chosen": -56.646728515625, + "logps/rejected": -96.37754821777344, + "loss": 0.6798, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0095486640930176, + "rewards/margins": 6.336795330047607, + "rewards/rejected": -3.3272464275360107, + "step": 15774 + }, + { + "epoch": 3.95, + "grad_norm": 3.4912145137786865, + "learning_rate": 1.0555682211448487e-06, + "logits/chosen": -0.5338091850280762, + "logits/rejected": -0.6073143482208252, + "logps/chosen": -54.50402069091797, + "logps/rejected": -115.86209106445312, + "loss": 0.564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089026927947998, + "rewards/margins": 7.2953925132751465, + "rewards/rejected": -4.206366539001465, + "step": 15775 + }, + { + "epoch": 3.95, + "grad_norm": 9.901884078979492, + "learning_rate": 1.055085249728972e-06, + "logits/chosen": -0.48851075768470764, + "logits/rejected": -0.5466588735580444, + "logps/chosen": -58.35315704345703, + "logps/rejected": -124.55380249023438, + "loss": 0.6654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.991091728210449, + "rewards/margins": 6.995429039001465, + "rewards/rejected": -4.004337310791016, + "step": 15776 + }, + { + "epoch": 3.95, + "grad_norm": 2.958559989929199, + "learning_rate": 1.0546023757961338e-06, + "logits/chosen": -0.5811298489570618, + "logits/rejected": -0.6508176922798157, + "logps/chosen": -49.3770637512207, + "logps/rejected": -107.67720031738281, + "loss": 0.656, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3196816444396973, + "rewards/margins": 7.240835666656494, + "rewards/rejected": -3.921154022216797, + "step": 15777 + }, + { + "epoch": 3.95, + "grad_norm": 8.571986198425293, + "learning_rate": 1.0541195993582687e-06, + "logits/chosen": -0.5125650763511658, + "logits/rejected": -0.6276605725288391, + "logps/chosen": -50.126102447509766, + "logps/rejected": -107.18345642089844, + "loss": 0.6736, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.999788761138916, + "rewards/margins": 6.727385997772217, + "rewards/rejected": -3.7275967597961426, + "step": 15778 + }, + { + "epoch": 3.95, + "grad_norm": 8.998568534851074, + "learning_rate": 1.0536369204273055e-06, + "logits/chosen": -0.5647301077842712, + "logits/rejected": -0.6284791231155396, + "logps/chosen": -50.81303787231445, + "logps/rejected": -110.95927429199219, + "loss": 0.6413, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.154111862182617, + "rewards/margins": 7.690066337585449, + "rewards/rejected": -4.535955429077148, + "step": 15779 + }, + { + "epoch": 3.95, + "grad_norm": 7.326425552368164, + "learning_rate": 1.05315433901517e-06, + "logits/chosen": -0.5615047216415405, + "logits/rejected": -0.6842275261878967, + "logps/chosen": -51.53924560546875, + "logps/rejected": -108.08988189697266, + "loss": 0.6964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.776954412460327, + "rewards/margins": 7.563748359680176, + "rewards/rejected": -4.786794185638428, + "step": 15780 + }, + { + "epoch": 3.95, + "grad_norm": 9.22508430480957, + "learning_rate": 1.0526718551337888e-06, + "logits/chosen": -0.5324583649635315, + "logits/rejected": -0.6333106756210327, + "logps/chosen": -50.69762420654297, + "logps/rejected": -105.57926177978516, + "loss": 0.6188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0392136573791504, + "rewards/margins": 7.177100658416748, + "rewards/rejected": -4.137887477874756, + "step": 15781 + }, + { + "epoch": 3.95, + "grad_norm": 4.333479404449463, + "learning_rate": 1.0521894687950863e-06, + "logits/chosen": -0.5942942500114441, + "logits/rejected": -0.6464765071868896, + "logps/chosen": -56.2210807800293, + "logps/rejected": -101.98725891113281, + "loss": 0.6525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8966586589813232, + "rewards/margins": 6.2604169845581055, + "rewards/rejected": -3.3637585639953613, + "step": 15782 + }, + { + "epoch": 3.95, + "grad_norm": 3.5971107482910156, + "learning_rate": 1.0517071800109807e-06, + "logits/chosen": -0.5009409189224243, + "logits/rejected": -0.5643332600593567, + "logps/chosen": -51.3223876953125, + "logps/rejected": -88.6108627319336, + "loss": 0.5901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.507427215576172, + "rewards/margins": 6.134282112121582, + "rewards/rejected": -2.626854658126831, + "step": 15783 + }, + { + "epoch": 3.95, + "grad_norm": 6.694357872009277, + "learning_rate": 1.0512249887933889e-06, + "logits/chosen": -0.5180844068527222, + "logits/rejected": -0.6009474396705627, + "logps/chosen": -61.94017791748047, + "logps/rejected": -112.84001159667969, + "loss": 0.6817, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1924357414245605, + "rewards/margins": 7.0771379470825195, + "rewards/rejected": -3.8847012519836426, + "step": 15784 + }, + { + "epoch": 3.95, + "grad_norm": 12.196556091308594, + "learning_rate": 1.0507428951542293e-06, + "logits/chosen": -0.528933584690094, + "logits/rejected": -0.5778871774673462, + "logps/chosen": -54.18229675292969, + "logps/rejected": -104.0068359375, + "loss": 0.7858, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9865660667419434, + "rewards/margins": 6.138298034667969, + "rewards/rejected": -3.151731014251709, + "step": 15785 + }, + { + "epoch": 3.95, + "grad_norm": 15.962535858154297, + "learning_rate": 1.0502608991054135e-06, + "logits/chosen": -0.5489292144775391, + "logits/rejected": -0.590722382068634, + "logps/chosen": -45.4996337890625, + "logps/rejected": -123.6116714477539, + "loss": 0.6064, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2007856369018555, + "rewards/margins": 7.662180423736572, + "rewards/rejected": -4.461394309997559, + "step": 15786 + }, + { + "epoch": 3.95, + "grad_norm": 8.669533729553223, + "learning_rate": 1.0497790006588503e-06, + "logits/chosen": -0.4869570732116699, + "logits/rejected": -0.5807307958602905, + "logps/chosen": -65.41133117675781, + "logps/rejected": -107.1008529663086, + "loss": 0.7543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0283262729644775, + "rewards/margins": 4.897575855255127, + "rewards/rejected": -1.869249939918518, + "step": 15787 + }, + { + "epoch": 3.95, + "grad_norm": 2.836651086807251, + "learning_rate": 1.0492971998264513e-06, + "logits/chosen": -0.5501865148544312, + "logits/rejected": -0.6402818560600281, + "logps/chosen": -80.33372497558594, + "logps/rejected": -110.95132446289062, + "loss": 0.6944, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.055744171142578, + "rewards/margins": 7.664485931396484, + "rewards/rejected": -4.608741283416748, + "step": 15788 + }, + { + "epoch": 3.95, + "grad_norm": 4.338041305541992, + "learning_rate": 1.0488154966201203e-06, + "logits/chosen": -0.555436372756958, + "logits/rejected": -0.5635589361190796, + "logps/chosen": -54.078548431396484, + "logps/rejected": -125.23223876953125, + "loss": 0.6499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0016472339630127, + "rewards/margins": 6.7040605545043945, + "rewards/rejected": -3.7024128437042236, + "step": 15789 + }, + { + "epoch": 3.95, + "grad_norm": 15.716733932495117, + "learning_rate": 1.0483338910517599e-06, + "logits/chosen": -0.5844231247901917, + "logits/rejected": -0.6042315363883972, + "logps/chosen": -50.52710723876953, + "logps/rejected": -94.4573974609375, + "loss": 0.6771, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2175159454345703, + "rewards/margins": 6.449146747589111, + "rewards/rejected": -3.231630563735962, + "step": 15790 + }, + { + "epoch": 3.95, + "grad_norm": 3.9682154655456543, + "learning_rate": 1.0478523831332727e-06, + "logits/chosen": -0.5642368197441101, + "logits/rejected": -0.658355712890625, + "logps/chosen": -55.53342056274414, + "logps/rejected": -120.84048461914062, + "loss": 0.6233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6207451820373535, + "rewards/margins": 7.863444805145264, + "rewards/rejected": -4.242699146270752, + "step": 15791 + }, + { + "epoch": 3.95, + "grad_norm": 4.744462490081787, + "learning_rate": 1.0473709728765574e-06, + "logits/chosen": -0.4859294295310974, + "logits/rejected": -0.5827960968017578, + "logps/chosen": -57.51997756958008, + "logps/rejected": -108.75830078125, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055128812789917, + "rewards/margins": 6.472954750061035, + "rewards/rejected": -3.417825698852539, + "step": 15792 + }, + { + "epoch": 3.95, + "grad_norm": 29.50508689880371, + "learning_rate": 1.0468896602935103e-06, + "logits/chosen": -0.5261818766593933, + "logits/rejected": -0.5584041476249695, + "logps/chosen": -57.74045944213867, + "logps/rejected": -111.13582611083984, + "loss": 0.8023, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6514229774475098, + "rewards/margins": 5.496521949768066, + "rewards/rejected": -2.845099449157715, + "step": 15793 + }, + { + "epoch": 3.95, + "grad_norm": 7.746756076812744, + "learning_rate": 1.046408445396022e-06, + "logits/chosen": -0.5764592885971069, + "logits/rejected": -0.6981140375137329, + "logps/chosen": -51.221649169921875, + "logps/rejected": -99.61385345458984, + "loss": 0.595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2880959510803223, + "rewards/margins": 7.479284763336182, + "rewards/rejected": -4.191188812255859, + "step": 15794 + }, + { + "epoch": 3.95, + "grad_norm": 2.6133322715759277, + "learning_rate": 1.0459273281959886e-06, + "logits/chosen": -0.49964043498039246, + "logits/rejected": -0.6006665229797363, + "logps/chosen": -49.799190521240234, + "logps/rejected": -99.3541488647461, + "loss": 0.5496, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3999788761138916, + "rewards/margins": 7.215758323669434, + "rewards/rejected": -3.815778970718384, + "step": 15795 + }, + { + "epoch": 3.95, + "grad_norm": 7.414892196655273, + "learning_rate": 1.0454463087052962e-06, + "logits/chosen": -0.4642105996608734, + "logits/rejected": -0.5318195819854736, + "logps/chosen": -56.79568862915039, + "logps/rejected": -112.71575927734375, + "loss": 0.652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.191981792449951, + "rewards/margins": 6.892298698425293, + "rewards/rejected": -3.7003173828125, + "step": 15796 + }, + { + "epoch": 3.95, + "grad_norm": 5.580471038818359, + "learning_rate": 1.0449653869358305e-06, + "logits/chosen": -0.5827786326408386, + "logits/rejected": -0.639025092124939, + "logps/chosen": -56.505470275878906, + "logps/rejected": -98.83641052246094, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.146705389022827, + "rewards/margins": 6.0761590003967285, + "rewards/rejected": -2.929453134536743, + "step": 15797 + }, + { + "epoch": 3.95, + "grad_norm": 44.469139099121094, + "learning_rate": 1.0444845628994783e-06, + "logits/chosen": -0.5666184425354004, + "logits/rejected": -0.6049593687057495, + "logps/chosen": -49.753665924072266, + "logps/rejected": -86.70446014404297, + "loss": 0.7915, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.2059121131896973, + "rewards/margins": 4.63511848449707, + "rewards/rejected": -1.4292066097259521, + "step": 15798 + }, + { + "epoch": 3.95, + "grad_norm": 3.74063777923584, + "learning_rate": 1.0440038366081184e-06, + "logits/chosen": -0.5200014710426331, + "logits/rejected": -0.6186884045600891, + "logps/chosen": -55.993263244628906, + "logps/rejected": -97.44783020019531, + "loss": 0.5756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.206655502319336, + "rewards/margins": 6.927711009979248, + "rewards/rejected": -3.721055507659912, + "step": 15799 + }, + { + "epoch": 3.95, + "grad_norm": 10.412017822265625, + "learning_rate": 1.043523208073633e-06, + "logits/chosen": -0.5517427921295166, + "logits/rejected": -0.5965030789375305, + "logps/chosen": -58.64138412475586, + "logps/rejected": -104.78262329101562, + "loss": 0.7523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8068063259124756, + "rewards/margins": 7.162962913513184, + "rewards/rejected": -4.356155872344971, + "step": 15800 + }, + { + "epoch": 3.95, + "grad_norm": 6.805183410644531, + "learning_rate": 1.0430426773078972e-06, + "logits/chosen": -0.5574963092803955, + "logits/rejected": -0.6362842917442322, + "logps/chosen": -64.69357299804688, + "logps/rejected": -111.61865997314453, + "loss": 0.7724, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.955350637435913, + "rewards/margins": 6.991146564483643, + "rewards/rejected": -4.03579568862915, + "step": 15801 + }, + { + "epoch": 3.95, + "grad_norm": 6.224837779998779, + "learning_rate": 1.042562244322784e-06, + "logits/chosen": -0.4913312792778015, + "logits/rejected": -0.5715187788009644, + "logps/chosen": -59.9531135559082, + "logps/rejected": -102.4240493774414, + "loss": 0.7899, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.706763505935669, + "rewards/margins": 6.354017734527588, + "rewards/rejected": -3.64725399017334, + "step": 15802 + }, + { + "epoch": 3.95, + "grad_norm": 4.522034645080566, + "learning_rate": 1.0420819091301687e-06, + "logits/chosen": -0.5647860765457153, + "logits/rejected": -0.6485665440559387, + "logps/chosen": -48.99815368652344, + "logps/rejected": -104.64154052734375, + "loss": 0.5575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.01609206199646, + "rewards/margins": 7.228909969329834, + "rewards/rejected": -4.212817668914795, + "step": 15803 + }, + { + "epoch": 3.95, + "grad_norm": 3.7083311080932617, + "learning_rate": 1.0416016717419193e-06, + "logits/chosen": -0.4352496266365051, + "logits/rejected": -0.5469983220100403, + "logps/chosen": -68.65423583984375, + "logps/rejected": -98.87796020507812, + "loss": 0.6293, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1213889122009277, + "rewards/margins": 6.826522350311279, + "rewards/rejected": -3.7051331996917725, + "step": 15804 + }, + { + "epoch": 3.95, + "grad_norm": 5.795853137969971, + "learning_rate": 1.041121532169902e-06, + "logits/chosen": -0.575741171836853, + "logits/rejected": -0.679212212562561, + "logps/chosen": -54.68981170654297, + "logps/rejected": -91.87690734863281, + "loss": 0.6081, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4618306159973145, + "rewards/margins": 6.8300700187683105, + "rewards/rejected": -3.3682398796081543, + "step": 15805 + }, + { + "epoch": 3.95, + "grad_norm": 3.300511598587036, + "learning_rate": 1.0406414904259815e-06, + "logits/chosen": -0.5485318899154663, + "logits/rejected": -0.6095213890075684, + "logps/chosen": -47.75971221923828, + "logps/rejected": -91.77045440673828, + "loss": 0.5957, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3991944789886475, + "rewards/margins": 6.577003479003906, + "rewards/rejected": -3.177809715270996, + "step": 15806 + }, + { + "epoch": 3.95, + "grad_norm": 4.374466419219971, + "learning_rate": 1.040161546522025e-06, + "logits/chosen": -0.5798559784889221, + "logits/rejected": -0.6617623567581177, + "logps/chosen": -46.75347900390625, + "logps/rejected": -93.75362396240234, + "loss": 0.5633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.178844928741455, + "rewards/margins": 6.834819793701172, + "rewards/rejected": -3.655975341796875, + "step": 15807 + }, + { + "epoch": 3.95, + "grad_norm": 3.8625316619873047, + "learning_rate": 1.039681700469885e-06, + "logits/chosen": -0.5409208536148071, + "logits/rejected": -0.6521961688995361, + "logps/chosen": -53.42782211303711, + "logps/rejected": -90.70127868652344, + "loss": 0.586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1502270698547363, + "rewards/margins": 7.4390153884887695, + "rewards/rejected": -4.288787841796875, + "step": 15808 + }, + { + "epoch": 3.95, + "grad_norm": 6.312325477600098, + "learning_rate": 1.039201952281423e-06, + "logits/chosen": -0.4771152138710022, + "logits/rejected": -0.5410608053207397, + "logps/chosen": -54.01327896118164, + "logps/rejected": -97.25004577636719, + "loss": 0.712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0837442874908447, + "rewards/margins": 6.001600742340088, + "rewards/rejected": -2.9178566932678223, + "step": 15809 + }, + { + "epoch": 3.96, + "grad_norm": 12.85963249206543, + "learning_rate": 1.0387223019684945e-06, + "logits/chosen": -0.5213507413864136, + "logits/rejected": -0.6433597803115845, + "logps/chosen": -62.565673828125, + "logps/rejected": -113.57490539550781, + "loss": 0.6245, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8822436332702637, + "rewards/margins": 7.381346702575684, + "rewards/rejected": -4.499102592468262, + "step": 15810 + }, + { + "epoch": 3.96, + "grad_norm": 6.3067827224731445, + "learning_rate": 1.038242749542952e-06, + "logits/chosen": -0.5008684396743774, + "logits/rejected": -0.595296323299408, + "logps/chosen": -63.599056243896484, + "logps/rejected": -98.90575408935547, + "loss": 0.701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946362018585205, + "rewards/margins": 6.446759223937988, + "rewards/rejected": -3.500396728515625, + "step": 15811 + }, + { + "epoch": 3.96, + "grad_norm": 2.6871337890625, + "learning_rate": 1.0377632950166433e-06, + "logits/chosen": -0.493831992149353, + "logits/rejected": -0.6318249106407166, + "logps/chosen": -65.58589935302734, + "logps/rejected": -93.34529876708984, + "loss": 0.64, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1507673263549805, + "rewards/margins": 7.415622711181641, + "rewards/rejected": -4.264855861663818, + "step": 15812 + }, + { + "epoch": 3.96, + "grad_norm": 4.470536708831787, + "learning_rate": 1.0372839384014195e-06, + "logits/chosen": -0.6204541325569153, + "logits/rejected": -0.7229176163673401, + "logps/chosen": -52.660438537597656, + "logps/rejected": -95.80674743652344, + "loss": 0.6542, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.197737693786621, + "rewards/margins": 7.018469333648682, + "rewards/rejected": -3.8207316398620605, + "step": 15813 + }, + { + "epoch": 3.96, + "grad_norm": 6.891371726989746, + "learning_rate": 1.0368046797091242e-06, + "logits/chosen": -0.6215533018112183, + "logits/rejected": -0.6647549867630005, + "logps/chosen": -56.246978759765625, + "logps/rejected": -129.22158813476562, + "loss": 0.6826, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.239557981491089, + "rewards/margins": 7.898010730743408, + "rewards/rejected": -4.658452987670898, + "step": 15814 + }, + { + "epoch": 3.96, + "grad_norm": 11.740632057189941, + "learning_rate": 1.0363255189515998e-06, + "logits/chosen": -0.579367995262146, + "logits/rejected": -0.62601637840271, + "logps/chosen": -52.904869079589844, + "logps/rejected": -107.79058837890625, + "loss": 0.7435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8169307708740234, + "rewards/margins": 7.422058582305908, + "rewards/rejected": -4.605128288269043, + "step": 15815 + }, + { + "epoch": 3.96, + "grad_norm": 5.72139835357666, + "learning_rate": 1.0358464561406888e-06, + "logits/chosen": -0.5752589106559753, + "logits/rejected": -0.6417348384857178, + "logps/chosen": -55.31181716918945, + "logps/rejected": -102.24090576171875, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0807809829711914, + "rewards/margins": 6.665948867797852, + "rewards/rejected": -3.585167646408081, + "step": 15816 + }, + { + "epoch": 3.96, + "grad_norm": 6.431675434112549, + "learning_rate": 1.0353674912882273e-06, + "logits/chosen": -0.5301733613014221, + "logits/rejected": -0.6373148560523987, + "logps/chosen": -52.8170166015625, + "logps/rejected": -117.50464630126953, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.09832763671875, + "rewards/margins": 7.939734935760498, + "rewards/rejected": -4.84140682220459, + "step": 15817 + }, + { + "epoch": 3.96, + "grad_norm": 4.96852445602417, + "learning_rate": 1.034888624406053e-06, + "logits/chosen": -0.5043162107467651, + "logits/rejected": -0.5906746983528137, + "logps/chosen": -62.99432373046875, + "logps/rejected": -104.99150848388672, + "loss": 0.6791, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904256582260132, + "rewards/margins": 6.640629768371582, + "rewards/rejected": -3.7363739013671875, + "step": 15818 + }, + { + "epoch": 3.96, + "grad_norm": 6.634697914123535, + "learning_rate": 1.0344098555059983e-06, + "logits/chosen": -0.5343427658081055, + "logits/rejected": -0.6095681190490723, + "logps/chosen": -56.80874252319336, + "logps/rejected": -97.25724792480469, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.084629535675049, + "rewards/margins": 6.759558200836182, + "rewards/rejected": -3.6749279499053955, + "step": 15819 + }, + { + "epoch": 3.96, + "grad_norm": 12.246448516845703, + "learning_rate": 1.0339311845998929e-06, + "logits/chosen": -0.5146878957748413, + "logits/rejected": -0.6032769680023193, + "logps/chosen": -54.08890914916992, + "logps/rejected": -95.510498046875, + "loss": 0.6651, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3937149047851562, + "rewards/margins": 7.395735740661621, + "rewards/rejected": -4.002021789550781, + "step": 15820 + }, + { + "epoch": 3.96, + "grad_norm": 7.135803699493408, + "learning_rate": 1.033452611699568e-06, + "logits/chosen": -0.5423951745033264, + "logits/rejected": -0.6720800399780273, + "logps/chosen": -67.14176940917969, + "logps/rejected": -93.9846420288086, + "loss": 0.7504, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2374701499938965, + "rewards/margins": 6.431241035461426, + "rewards/rejected": -3.19377064704895, + "step": 15821 + }, + { + "epoch": 3.96, + "grad_norm": 2.535676956176758, + "learning_rate": 1.0329741368168477e-06, + "logits/chosen": -0.5582048296928406, + "logits/rejected": -0.6824743747711182, + "logps/chosen": -79.57801055908203, + "logps/rejected": -102.75800323486328, + "loss": 0.6317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0280935764312744, + "rewards/margins": 7.6743927001953125, + "rewards/rejected": -4.646299362182617, + "step": 15822 + }, + { + "epoch": 3.96, + "grad_norm": 6.6645307540893555, + "learning_rate": 1.0324957599635549e-06, + "logits/chosen": -0.5980321764945984, + "logits/rejected": -0.6781670451164246, + "logps/chosen": -64.86183166503906, + "logps/rejected": -102.02397918701172, + "loss": 0.6308, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2050557136535645, + "rewards/margins": 7.064150810241699, + "rewards/rejected": -3.859095573425293, + "step": 15823 + }, + { + "epoch": 3.96, + "grad_norm": 3.268073320388794, + "learning_rate": 1.0320174811515116e-06, + "logits/chosen": -0.5459356307983398, + "logits/rejected": -0.6028170585632324, + "logps/chosen": -51.32283401489258, + "logps/rejected": -104.93952941894531, + "loss": 0.6522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3516016006469727, + "rewards/margins": 7.259410381317139, + "rewards/rejected": -3.907809019088745, + "step": 15824 + }, + { + "epoch": 3.96, + "grad_norm": 3.447824716567993, + "learning_rate": 1.0315393003925384e-06, + "logits/chosen": -0.544328510761261, + "logits/rejected": -0.6556157469749451, + "logps/chosen": -59.62586212158203, + "logps/rejected": -95.818603515625, + "loss": 0.5807, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9437718391418457, + "rewards/margins": 7.894279479980469, + "rewards/rejected": -4.950507164001465, + "step": 15825 + }, + { + "epoch": 3.96, + "grad_norm": 16.37224769592285, + "learning_rate": 1.0310612176984502e-06, + "logits/chosen": -0.5538157820701599, + "logits/rejected": -0.6335631012916565, + "logps/chosen": -55.89235305786133, + "logps/rejected": -92.98370361328125, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.541217565536499, + "rewards/margins": 7.115151405334473, + "rewards/rejected": -3.5739338397979736, + "step": 15826 + }, + { + "epoch": 3.96, + "grad_norm": 3.561288833618164, + "learning_rate": 1.0305832330810596e-06, + "logits/chosen": -0.5011498928070068, + "logits/rejected": -0.5492637753486633, + "logps/chosen": -46.415916442871094, + "logps/rejected": -115.76275634765625, + "loss": 0.565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.130117177963257, + "rewards/margins": 6.329283714294434, + "rewards/rejected": -3.1991662979125977, + "step": 15827 + }, + { + "epoch": 3.96, + "grad_norm": 4.124868392944336, + "learning_rate": 1.0301053465521804e-06, + "logits/chosen": -0.5769177675247192, + "logits/rejected": -0.6342903971672058, + "logps/chosen": -53.840824127197266, + "logps/rejected": -122.67974090576172, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194693088531494, + "rewards/margins": 7.322598457336426, + "rewards/rejected": -4.12790584564209, + "step": 15828 + }, + { + "epoch": 3.96, + "grad_norm": 1.5092082023620605, + "learning_rate": 1.029627558123621e-06, + "logits/chosen": -0.5466164946556091, + "logits/rejected": -0.7106555700302124, + "logps/chosen": -47.031681060791016, + "logps/rejected": -117.57516479492188, + "loss": 0.5169, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.074535846710205, + "rewards/margins": 9.9271821975708, + "rewards/rejected": -6.852646350860596, + "step": 15829 + }, + { + "epoch": 3.96, + "grad_norm": 6.4172043800354, + "learning_rate": 1.029149867807186e-06, + "logits/chosen": -0.4555444121360779, + "logits/rejected": -0.5605713725090027, + "logps/chosen": -55.413246154785156, + "logps/rejected": -96.14008331298828, + "loss": 0.6626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1701772212982178, + "rewards/margins": 7.008078575134277, + "rewards/rejected": -3.8379013538360596, + "step": 15830 + }, + { + "epoch": 3.96, + "grad_norm": 5.538599967956543, + "learning_rate": 1.0286722756146832e-06, + "logits/chosen": -0.5641053915023804, + "logits/rejected": -0.6555773615837097, + "logps/chosen": -58.17951965332031, + "logps/rejected": -92.27986145019531, + "loss": 0.6846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.152233123779297, + "rewards/margins": 6.467243194580078, + "rewards/rejected": -3.315009832382202, + "step": 15831 + }, + { + "epoch": 3.96, + "grad_norm": 7.408750057220459, + "learning_rate": 1.0281947815579118e-06, + "logits/chosen": -0.5455283522605896, + "logits/rejected": -0.611122727394104, + "logps/chosen": -47.69820022583008, + "logps/rejected": -114.88943481445312, + "loss": 0.5945, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.020059585571289, + "rewards/margins": 7.750560760498047, + "rewards/rejected": -4.730501174926758, + "step": 15832 + }, + { + "epoch": 3.96, + "grad_norm": 4.9662251472473145, + "learning_rate": 1.0277173856486716e-06, + "logits/chosen": -0.5029043555259705, + "logits/rejected": -0.567348837852478, + "logps/chosen": -64.6122817993164, + "logps/rejected": -113.16168975830078, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.253833770751953, + "rewards/margins": 7.8216376304626465, + "rewards/rejected": -4.567804336547852, + "step": 15833 + }, + { + "epoch": 3.96, + "grad_norm": 6.376359462738037, + "learning_rate": 1.0272400878987603e-06, + "logits/chosen": -0.52032470703125, + "logits/rejected": -0.5700759887695312, + "logps/chosen": -56.00926208496094, + "logps/rejected": -95.36113739013672, + "loss": 0.7746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4271440505981445, + "rewards/margins": 6.756882190704346, + "rewards/rejected": -3.329738140106201, + "step": 15834 + }, + { + "epoch": 3.96, + "grad_norm": 6.828248023986816, + "learning_rate": 1.0267628883199725e-06, + "logits/chosen": -0.4512990117073059, + "logits/rejected": -0.5680122375488281, + "logps/chosen": -62.5712776184082, + "logps/rejected": -107.45661926269531, + "loss": 0.655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2087562084198, + "rewards/margins": 6.707642555236816, + "rewards/rejected": -3.4988863468170166, + "step": 15835 + }, + { + "epoch": 3.96, + "grad_norm": 12.839516639709473, + "learning_rate": 1.0262857869240989e-06, + "logits/chosen": -0.5423719882965088, + "logits/rejected": -0.6067199110984802, + "logps/chosen": -50.543357849121094, + "logps/rejected": -134.6546173095703, + "loss": 0.6287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2919869422912598, + "rewards/margins": 7.9103193283081055, + "rewards/rejected": -4.618332386016846, + "step": 15836 + }, + { + "epoch": 3.96, + "grad_norm": 4.2214884757995605, + "learning_rate": 1.025808783722929e-06, + "logits/chosen": -0.5658776760101318, + "logits/rejected": -0.6299989223480225, + "logps/chosen": -61.621559143066406, + "logps/rejected": -98.65242004394531, + "loss": 0.6765, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9567110538482666, + "rewards/margins": 7.00283670425415, + "rewards/rejected": -4.046125411987305, + "step": 15837 + }, + { + "epoch": 3.96, + "grad_norm": 1.5140447616577148, + "learning_rate": 1.0253318787282535e-06, + "logits/chosen": -0.5056234002113342, + "logits/rejected": -0.6487623453140259, + "logps/chosen": -49.18275451660156, + "logps/rejected": -102.44853973388672, + "loss": 0.4976, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3864619731903076, + "rewards/margins": 8.526729583740234, + "rewards/rejected": -5.1402668952941895, + "step": 15838 + }, + { + "epoch": 3.96, + "grad_norm": 3.50590443611145, + "learning_rate": 1.0248550719518546e-06, + "logits/chosen": -0.5683198571205139, + "logits/rejected": -0.6600571870803833, + "logps/chosen": -50.66472625732422, + "logps/rejected": -90.81721496582031, + "loss": 0.5865, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.167858362197876, + "rewards/margins": 7.104966163635254, + "rewards/rejected": -3.937107563018799, + "step": 15839 + }, + { + "epoch": 3.96, + "grad_norm": 35.83720779418945, + "learning_rate": 1.0243783634055137e-06, + "logits/chosen": -0.4999579191207886, + "logits/rejected": -0.6353597044944763, + "logps/chosen": -48.2908935546875, + "logps/rejected": -98.72224426269531, + "loss": 0.692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8749828338623047, + "rewards/margins": 8.373025894165039, + "rewards/rejected": -5.498043537139893, + "step": 15840 + }, + { + "epoch": 3.96, + "grad_norm": 6.668633460998535, + "learning_rate": 1.0239017531010137e-06, + "logits/chosen": -0.5645685195922852, + "logits/rejected": -0.6174156665802002, + "logps/chosen": -52.10023880004883, + "logps/rejected": -108.38856506347656, + "loss": 0.6198, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.976954460144043, + "rewards/margins": 7.192447662353516, + "rewards/rejected": -4.215493202209473, + "step": 15841 + }, + { + "epoch": 3.96, + "grad_norm": 7.9565558433532715, + "learning_rate": 1.0234252410501289e-06, + "logits/chosen": -0.6056941151618958, + "logits/rejected": -0.7038479447364807, + "logps/chosen": -61.7077751159668, + "logps/rejected": -122.64383697509766, + "loss": 0.6022, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.150057554244995, + "rewards/margins": 9.104312896728516, + "rewards/rejected": -5.9542555809021, + "step": 15842 + }, + { + "epoch": 3.96, + "grad_norm": 4.161569595336914, + "learning_rate": 1.0229488272646376e-06, + "logits/chosen": -0.5275620222091675, + "logits/rejected": -0.6596190929412842, + "logps/chosen": -49.780357360839844, + "logps/rejected": -88.72607421875, + "loss": 0.5662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1086812019348145, + "rewards/margins": 7.7045087814331055, + "rewards/rejected": -4.595827102661133, + "step": 15843 + }, + { + "epoch": 3.96, + "grad_norm": 6.285635471343994, + "learning_rate": 1.0224725117563111e-06, + "logits/chosen": -0.48410382866859436, + "logits/rejected": -0.6237761974334717, + "logps/chosen": -65.2081527709961, + "logps/rejected": -118.6351547241211, + "loss": 0.5503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2984540462493896, + "rewards/margins": 8.830036163330078, + "rewards/rejected": -5.531581401824951, + "step": 15844 + }, + { + "epoch": 3.96, + "grad_norm": 8.395102500915527, + "learning_rate": 1.0219962945369177e-06, + "logits/chosen": -0.63420170545578, + "logits/rejected": -0.6972503066062927, + "logps/chosen": -46.88590621948242, + "logps/rejected": -98.8941421508789, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.921072244644165, + "rewards/margins": 7.261255264282227, + "rewards/rejected": -4.340183258056641, + "step": 15845 + }, + { + "epoch": 3.96, + "grad_norm": 2.140328884124756, + "learning_rate": 1.0215201756182285e-06, + "logits/chosen": -0.5216050148010254, + "logits/rejected": -0.6396527290344238, + "logps/chosen": -44.52849578857422, + "logps/rejected": -106.80118560791016, + "loss": 0.5397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0199947357177734, + "rewards/margins": 8.709287643432617, + "rewards/rejected": -5.689291954040527, + "step": 15846 + }, + { + "epoch": 3.96, + "grad_norm": 2.660370349884033, + "learning_rate": 1.0210441550120076e-06, + "logits/chosen": -0.4899864196777344, + "logits/rejected": -0.5544878840446472, + "logps/chosen": -59.64716339111328, + "logps/rejected": -104.58658599853516, + "loss": 0.611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.986829996109009, + "rewards/margins": 7.252252101898193, + "rewards/rejected": -4.2654218673706055, + "step": 15847 + }, + { + "epoch": 3.96, + "grad_norm": 4.0250563621521, + "learning_rate": 1.0205682327300164e-06, + "logits/chosen": -0.60223388671875, + "logits/rejected": -0.7415831089019775, + "logps/chosen": -62.489131927490234, + "logps/rejected": -83.9222183227539, + "loss": 0.5952, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8192646503448486, + "rewards/margins": 7.130776882171631, + "rewards/rejected": -4.311511993408203, + "step": 15848 + }, + { + "epoch": 3.96, + "grad_norm": 3.6986641883850098, + "learning_rate": 1.0200924087840168e-06, + "logits/chosen": -0.4924702048301697, + "logits/rejected": -0.5936084985733032, + "logps/chosen": -56.38622283935547, + "logps/rejected": -91.54024505615234, + "loss": 0.6354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.135279417037964, + "rewards/margins": 6.555671691894531, + "rewards/rejected": -3.4203922748565674, + "step": 15849 + }, + { + "epoch": 3.97, + "grad_norm": 13.525047302246094, + "learning_rate": 1.0196166831857702e-06, + "logits/chosen": -0.5283080339431763, + "logits/rejected": -0.6577238440513611, + "logps/chosen": -62.97844314575195, + "logps/rejected": -111.0865249633789, + "loss": 0.6908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7715470790863037, + "rewards/margins": 6.767230033874512, + "rewards/rejected": -3.995682954788208, + "step": 15850 + }, + { + "epoch": 3.97, + "grad_norm": 13.083980560302734, + "learning_rate": 1.0191410559470261e-06, + "logits/chosen": -0.4839540719985962, + "logits/rejected": -0.6471704244613647, + "logps/chosen": -60.329750061035156, + "logps/rejected": -102.38267517089844, + "loss": 0.6103, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8265440464019775, + "rewards/margins": 8.128654479980469, + "rewards/rejected": -5.3021111488342285, + "step": 15851 + }, + { + "epoch": 3.97, + "grad_norm": 11.450940132141113, + "learning_rate": 1.018665527079541e-06, + "logits/chosen": -0.5279092192649841, + "logits/rejected": -0.6086139678955078, + "logps/chosen": -69.32963562011719, + "logps/rejected": -113.14472961425781, + "loss": 0.7406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8478140830993652, + "rewards/margins": 7.002737998962402, + "rewards/rejected": -4.154923915863037, + "step": 15852 + }, + { + "epoch": 3.97, + "grad_norm": 7.051183223724365, + "learning_rate": 1.0181900965950675e-06, + "logits/chosen": -0.5507675409317017, + "logits/rejected": -0.6436240077018738, + "logps/chosen": -62.22882843017578, + "logps/rejected": -110.09906005859375, + "loss": 0.6291, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3665544986724854, + "rewards/margins": 8.083340644836426, + "rewards/rejected": -4.7167863845825195, + "step": 15853 + }, + { + "epoch": 3.97, + "grad_norm": 3.4877545833587646, + "learning_rate": 1.0177147645053498e-06, + "logits/chosen": -0.38556745648384094, + "logits/rejected": -0.45412054657936096, + "logps/chosen": -58.815155029296875, + "logps/rejected": -116.18850708007812, + "loss": 0.6553, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.272346019744873, + "rewards/margins": 7.431801795959473, + "rewards/rejected": -4.1594557762146, + "step": 15854 + }, + { + "epoch": 3.97, + "grad_norm": 36.27128219604492, + "learning_rate": 1.0172395308221355e-06, + "logits/chosen": -0.5071171522140503, + "logits/rejected": -0.6324268579483032, + "logps/chosen": -62.68436813354492, + "logps/rejected": -100.28726196289062, + "loss": 0.7028, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8113303184509277, + "rewards/margins": 7.262197017669678, + "rewards/rejected": -4.45086669921875, + "step": 15855 + }, + { + "epoch": 3.97, + "grad_norm": 5.090072154998779, + "learning_rate": 1.0167643955571705e-06, + "logits/chosen": -0.5268494486808777, + "logits/rejected": -0.6126167178153992, + "logps/chosen": -49.411094665527344, + "logps/rejected": -88.26414489746094, + "loss": 0.7897, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.112778425216675, + "rewards/margins": 5.68831205368042, + "rewards/rejected": -2.5755341053009033, + "step": 15856 + }, + { + "epoch": 3.97, + "grad_norm": 5.588434219360352, + "learning_rate": 1.016289358722194e-06, + "logits/chosen": -0.5506182909011841, + "logits/rejected": -0.6149861216545105, + "logps/chosen": -51.485633850097656, + "logps/rejected": -107.17189025878906, + "loss": 0.5624, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.465834856033325, + "rewards/margins": 7.722612380981445, + "rewards/rejected": -4.256777763366699, + "step": 15857 + }, + { + "epoch": 3.97, + "grad_norm": 4.361303806304932, + "learning_rate": 1.0158144203289432e-06, + "logits/chosen": -0.4779401421546936, + "logits/rejected": -0.5820754766464233, + "logps/chosen": -51.29929733276367, + "logps/rejected": -108.03184509277344, + "loss": 0.5783, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089966297149658, + "rewards/margins": 8.21970272064209, + "rewards/rejected": -5.129736423492432, + "step": 15858 + }, + { + "epoch": 3.97, + "grad_norm": 5.263432502746582, + "learning_rate": 1.0153395803891568e-06, + "logits/chosen": -0.438772976398468, + "logits/rejected": -0.5399462580680847, + "logps/chosen": -52.839176177978516, + "logps/rejected": -99.68609619140625, + "loss": 0.5872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2975616455078125, + "rewards/margins": 7.284113883972168, + "rewards/rejected": -3.9865522384643555, + "step": 15859 + }, + { + "epoch": 3.97, + "grad_norm": 7.434573650360107, + "learning_rate": 1.0148648389145683e-06, + "logits/chosen": -0.4745332896709442, + "logits/rejected": -0.5662965774536133, + "logps/chosen": -60.64318084716797, + "logps/rejected": -120.46990966796875, + "loss": 0.7052, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5400495529174805, + "rewards/margins": 7.1722412109375, + "rewards/rejected": -3.6321914196014404, + "step": 15860 + }, + { + "epoch": 3.97, + "grad_norm": 10.555343627929688, + "learning_rate": 1.0143901959169067e-06, + "logits/chosen": -0.46578502655029297, + "logits/rejected": -0.5509974360466003, + "logps/chosen": -56.34931945800781, + "logps/rejected": -92.47489929199219, + "loss": 0.6751, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.987682819366455, + "rewards/margins": 6.312381744384766, + "rewards/rejected": -3.3246984481811523, + "step": 15861 + }, + { + "epoch": 3.97, + "grad_norm": 7.720592498779297, + "learning_rate": 1.013915651407904e-06, + "logits/chosen": -0.4608628749847412, + "logits/rejected": -0.5492449402809143, + "logps/chosen": -54.239166259765625, + "logps/rejected": -87.24333190917969, + "loss": 0.7815, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.804194927215576, + "rewards/margins": 5.080550670623779, + "rewards/rejected": -2.2763559818267822, + "step": 15862 + }, + { + "epoch": 3.97, + "grad_norm": 5.062201023101807, + "learning_rate": 1.0134412053992843e-06, + "logits/chosen": -0.4753952622413635, + "logits/rejected": -0.5509018898010254, + "logps/chosen": -54.48197937011719, + "logps/rejected": -98.56443786621094, + "loss": 0.6052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5330538749694824, + "rewards/margins": 7.164061069488525, + "rewards/rejected": -3.6310079097747803, + "step": 15863 + }, + { + "epoch": 3.97, + "grad_norm": 15.696032524108887, + "learning_rate": 1.012966857902774e-06, + "logits/chosen": -0.5062075853347778, + "logits/rejected": -0.5890055894851685, + "logps/chosen": -68.8550796508789, + "logps/rejected": -122.0784912109375, + "loss": 0.6962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9575693607330322, + "rewards/margins": 7.402658939361572, + "rewards/rejected": -4.445089817047119, + "step": 15864 + }, + { + "epoch": 3.97, + "grad_norm": 4.6373066902160645, + "learning_rate": 1.012492608930093e-06, + "logits/chosen": -0.49284833669662476, + "logits/rejected": -0.5586069822311401, + "logps/chosen": -48.063018798828125, + "logps/rejected": -89.35398864746094, + "loss": 0.5922, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2768473625183105, + "rewards/margins": 6.32346248626709, + "rewards/rejected": -3.0466153621673584, + "step": 15865 + }, + { + "epoch": 3.97, + "grad_norm": 6.782588481903076, + "learning_rate": 1.0120184584929599e-06, + "logits/chosen": -0.501198410987854, + "logits/rejected": -0.594482958316803, + "logps/chosen": -48.3378791809082, + "logps/rejected": -87.45199584960938, + "loss": 0.6093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0586376190185547, + "rewards/margins": 5.817924976348877, + "rewards/rejected": -2.759287118911743, + "step": 15866 + }, + { + "epoch": 3.97, + "grad_norm": 4.857716083526611, + "learning_rate": 1.0115444066030927e-06, + "logits/chosen": -0.4854223430156708, + "logits/rejected": -0.5329554677009583, + "logps/chosen": -62.74842834472656, + "logps/rejected": -100.0632095336914, + "loss": 0.6266, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9888622760772705, + "rewards/margins": 5.303325176239014, + "rewards/rejected": -2.314462661743164, + "step": 15867 + }, + { + "epoch": 3.97, + "grad_norm": 2.7533395290374756, + "learning_rate": 1.0110704532722065e-06, + "logits/chosen": -0.5166285037994385, + "logits/rejected": -0.652339518070221, + "logps/chosen": -53.419334411621094, + "logps/rejected": -107.28254699707031, + "loss": 0.5534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0988211631774902, + "rewards/margins": 7.578288555145264, + "rewards/rejected": -4.479467391967773, + "step": 15868 + }, + { + "epoch": 3.97, + "grad_norm": 2.310309648513794, + "learning_rate": 1.0105965985120126e-06, + "logits/chosen": -0.4651511311531067, + "logits/rejected": -0.6064264178276062, + "logps/chosen": -64.23355102539062, + "logps/rejected": -91.41061401367188, + "loss": 0.5798, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2289233207702637, + "rewards/margins": 7.200674533843994, + "rewards/rejected": -3.9717509746551514, + "step": 15869 + }, + { + "epoch": 3.97, + "grad_norm": 3.6846840381622314, + "learning_rate": 1.0101228423342186e-06, + "logits/chosen": -0.4973592758178711, + "logits/rejected": -0.5952625870704651, + "logps/chosen": -55.79414367675781, + "logps/rejected": -116.33157348632812, + "loss": 0.5479, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.319678783416748, + "rewards/margins": 7.656030654907227, + "rewards/rejected": -4.336352348327637, + "step": 15870 + }, + { + "epoch": 3.97, + "grad_norm": 8.531811714172363, + "learning_rate": 1.009649184750534e-06, + "logits/chosen": -0.4978795647621155, + "logits/rejected": -0.6059041619300842, + "logps/chosen": -64.18805694580078, + "logps/rejected": -100.6734848022461, + "loss": 0.7091, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.692457437515259, + "rewards/margins": 6.787087440490723, + "rewards/rejected": -4.094629764556885, + "step": 15871 + }, + { + "epoch": 3.97, + "grad_norm": 2.7000443935394287, + "learning_rate": 1.0091756257726626e-06, + "logits/chosen": -0.5299719572067261, + "logits/rejected": -0.5639681220054626, + "logps/chosen": -53.0703239440918, + "logps/rejected": -99.65120697021484, + "loss": 0.6106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9603521823883057, + "rewards/margins": 7.009707927703857, + "rewards/rejected": -4.049355983734131, + "step": 15872 + }, + { + "epoch": 3.97, + "grad_norm": 11.326057434082031, + "learning_rate": 1.0087021654123046e-06, + "logits/chosen": -0.5647041201591492, + "logits/rejected": -0.6297385692596436, + "logps/chosen": -53.61738586425781, + "logps/rejected": -100.24610900878906, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.041926145553589, + "rewards/margins": 6.949738502502441, + "rewards/rejected": -3.907811403274536, + "step": 15873 + }, + { + "epoch": 3.97, + "grad_norm": 28.417713165283203, + "learning_rate": 1.0082288036811633e-06, + "logits/chosen": -0.5928273797035217, + "logits/rejected": -0.6316056251525879, + "logps/chosen": -44.77581024169922, + "logps/rejected": -87.51596069335938, + "loss": 0.5976, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.253335475921631, + "rewards/margins": 6.046425819396973, + "rewards/rejected": -2.793090343475342, + "step": 15874 + }, + { + "epoch": 3.97, + "grad_norm": 8.058076858520508, + "learning_rate": 1.007755540590934e-06, + "logits/chosen": -0.5144490003585815, + "logits/rejected": -0.6124849319458008, + "logps/chosen": -58.41890335083008, + "logps/rejected": -96.75727844238281, + "loss": 0.5262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5056023597717285, + "rewards/margins": 6.921571731567383, + "rewards/rejected": -3.415968894958496, + "step": 15875 + }, + { + "epoch": 3.97, + "grad_norm": 15.69489574432373, + "learning_rate": 1.00728237615331e-06, + "logits/chosen": -0.5061013102531433, + "logits/rejected": -0.5909904837608337, + "logps/chosen": -50.17948913574219, + "logps/rejected": -93.62116241455078, + "loss": 0.8434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.309143543243408, + "rewards/margins": 5.817759037017822, + "rewards/rejected": -2.508615255355835, + "step": 15876 + }, + { + "epoch": 3.97, + "grad_norm": 17.924903869628906, + "learning_rate": 1.006809310379987e-06, + "logits/chosen": -0.5999844074249268, + "logits/rejected": -0.6303830146789551, + "logps/chosen": -58.63158416748047, + "logps/rejected": -96.7883071899414, + "loss": 0.6678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0605249404907227, + "rewards/margins": 5.676230430603027, + "rewards/rejected": -2.6157054901123047, + "step": 15877 + }, + { + "epoch": 3.97, + "grad_norm": 4.743592739105225, + "learning_rate": 1.0063363432826534e-06, + "logits/chosen": -0.5342612266540527, + "logits/rejected": -0.6363494992256165, + "logps/chosen": -54.96519088745117, + "logps/rejected": -85.37506866455078, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1492388248443604, + "rewards/margins": 6.749817371368408, + "rewards/rejected": -3.6005778312683105, + "step": 15878 + }, + { + "epoch": 3.97, + "grad_norm": 5.421112537384033, + "learning_rate": 1.005863474872995e-06, + "logits/chosen": -0.4911980628967285, + "logits/rejected": -0.6054913401603699, + "logps/chosen": -60.11432647705078, + "logps/rejected": -105.81562805175781, + "loss": 0.6456, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4566092491149902, + "rewards/margins": 7.82287073135376, + "rewards/rejected": -4.366260528564453, + "step": 15879 + }, + { + "epoch": 3.97, + "grad_norm": 2.184932231903076, + "learning_rate": 1.0053907051626987e-06, + "logits/chosen": -0.5996456742286682, + "logits/rejected": -0.675227165222168, + "logps/chosen": -49.39992904663086, + "logps/rejected": -96.502197265625, + "loss": 0.5373, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.243361711502075, + "rewards/margins": 7.4573655128479, + "rewards/rejected": -4.214003562927246, + "step": 15880 + }, + { + "epoch": 3.97, + "grad_norm": 4.756528854370117, + "learning_rate": 1.0049180341634478e-06, + "logits/chosen": -0.5720556378364563, + "logits/rejected": -0.64607834815979, + "logps/chosen": -53.08285140991211, + "logps/rejected": -109.5391845703125, + "loss": 0.6925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.334493398666382, + "rewards/margins": 7.334765911102295, + "rewards/rejected": -4.000273704528809, + "step": 15881 + }, + { + "epoch": 3.97, + "grad_norm": 5.178255558013916, + "learning_rate": 1.004445461886922e-06, + "logits/chosen": -0.5746526718139648, + "logits/rejected": -0.6296901702880859, + "logps/chosen": -49.07421112060547, + "logps/rejected": -99.19636535644531, + "loss": 0.64, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9698238372802734, + "rewards/margins": 6.812943935394287, + "rewards/rejected": -3.8431200981140137, + "step": 15882 + }, + { + "epoch": 3.97, + "grad_norm": 3.803107976913452, + "learning_rate": 1.0039729883447974e-06, + "logits/chosen": -0.45060059428215027, + "logits/rejected": -0.5558164715766907, + "logps/chosen": -51.18006134033203, + "logps/rejected": -106.89863586425781, + "loss": 0.5637, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1688201427459717, + "rewards/margins": 7.9472503662109375, + "rewards/rejected": -4.77842903137207, + "step": 15883 + }, + { + "epoch": 3.97, + "grad_norm": 4.563066005706787, + "learning_rate": 1.0035006135487518e-06, + "logits/chosen": -0.5873944759368896, + "logits/rejected": -0.5952439904212952, + "logps/chosen": -53.23210144042969, + "logps/rejected": -111.855224609375, + "loss": 0.6782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1892144680023193, + "rewards/margins": 6.505896091461182, + "rewards/rejected": -3.3166821002960205, + "step": 15884 + }, + { + "epoch": 3.97, + "grad_norm": 3.4438774585723877, + "learning_rate": 1.0030283375104572e-06, + "logits/chosen": -0.4686424136161804, + "logits/rejected": -0.5419472455978394, + "logps/chosen": -58.56377029418945, + "logps/rejected": -130.610107421875, + "loss": 0.5725, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1226420402526855, + "rewards/margins": 8.274139404296875, + "rewards/rejected": -5.151496410369873, + "step": 15885 + }, + { + "epoch": 3.97, + "grad_norm": 5.062403202056885, + "learning_rate": 1.0025561602415818e-06, + "logits/chosen": -0.46394333243370056, + "logits/rejected": -0.5626701712608337, + "logps/chosen": -51.41999435424805, + "logps/rejected": -117.83427429199219, + "loss": 0.541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.281771659851074, + "rewards/margins": 8.533462524414062, + "rewards/rejected": -5.251691818237305, + "step": 15886 + }, + { + "epoch": 3.97, + "grad_norm": 4.912059783935547, + "learning_rate": 1.0020840817537976e-06, + "logits/chosen": -0.5779840350151062, + "logits/rejected": -0.6582103967666626, + "logps/chosen": -56.56626892089844, + "logps/rejected": -97.34211730957031, + "loss": 0.7178, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8915514945983887, + "rewards/margins": 6.696299076080322, + "rewards/rejected": -3.804748058319092, + "step": 15887 + }, + { + "epoch": 3.97, + "grad_norm": 2.7046525478363037, + "learning_rate": 1.001612102058766e-06, + "logits/chosen": -0.609257698059082, + "logits/rejected": -0.7120453119277954, + "logps/chosen": -56.6434326171875, + "logps/rejected": -86.19808959960938, + "loss": 0.6142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3015177249908447, + "rewards/margins": 7.5463104248046875, + "rewards/rejected": -4.244792938232422, + "step": 15888 + }, + { + "epoch": 3.97, + "grad_norm": 34.557987213134766, + "learning_rate": 1.0011402211681542e-06, + "logits/chosen": -0.5369532704353333, + "logits/rejected": -0.6242051720619202, + "logps/chosen": -59.12187576293945, + "logps/rejected": -107.12882995605469, + "loss": 0.7365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.990424156188965, + "rewards/margins": 7.91005802154541, + "rewards/rejected": -4.9196343421936035, + "step": 15889 + }, + { + "epoch": 3.98, + "grad_norm": 4.195663928985596, + "learning_rate": 1.0006684390936206e-06, + "logits/chosen": -0.6059494614601135, + "logits/rejected": -0.6795742511749268, + "logps/chosen": -50.542144775390625, + "logps/rejected": -105.05237579345703, + "loss": 0.6531, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.434774398803711, + "rewards/margins": 7.020631313323975, + "rewards/rejected": -3.585857391357422, + "step": 15890 + }, + { + "epoch": 3.98, + "grad_norm": 6.558688640594482, + "learning_rate": 1.0001967558468228e-06, + "logits/chosen": -0.5414804220199585, + "logits/rejected": -0.5997389554977417, + "logps/chosen": -52.94026184082031, + "logps/rejected": -96.23196411132812, + "loss": 0.6962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2568907737731934, + "rewards/margins": 5.020143985748291, + "rewards/rejected": -1.7632527351379395, + "step": 15891 + }, + { + "epoch": 3.98, + "grad_norm": 2.040205240249634, + "learning_rate": 9.997251714394175e-07, + "logits/chosen": -0.5400782227516174, + "logits/rejected": -0.574927806854248, + "logps/chosen": -50.35839080810547, + "logps/rejected": -123.37950134277344, + "loss": 0.5435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.304856777191162, + "rewards/margins": 7.674137115478516, + "rewards/rejected": -4.369280815124512, + "step": 15892 + }, + { + "epoch": 3.98, + "grad_norm": 8.359186172485352, + "learning_rate": 9.992536858830604e-07, + "logits/chosen": -0.5281678438186646, + "logits/rejected": -0.6372804641723633, + "logps/chosen": -51.7638053894043, + "logps/rejected": -97.34595489501953, + "loss": 0.5606, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0283074378967285, + "rewards/margins": 6.933564186096191, + "rewards/rejected": -3.905256509780884, + "step": 15893 + }, + { + "epoch": 3.98, + "grad_norm": 3.89272403717041, + "learning_rate": 9.987822991893975e-07, + "logits/chosen": -0.538567304611206, + "logits/rejected": -0.6192500591278076, + "logps/chosen": -51.821800231933594, + "logps/rejected": -111.38945007324219, + "loss": 0.5573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.950174331665039, + "rewards/margins": 6.803202152252197, + "rewards/rejected": -3.8530282974243164, + "step": 15894 + }, + { + "epoch": 3.98, + "grad_norm": 5.683559417724609, + "learning_rate": 9.9831101137008e-07, + "logits/chosen": -0.5214958786964417, + "logits/rejected": -0.6001754403114319, + "logps/chosen": -46.37394714355469, + "logps/rejected": -116.13287353515625, + "loss": 0.5491, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9825682640075684, + "rewards/margins": 7.5738115310668945, + "rewards/rejected": -4.591243267059326, + "step": 15895 + }, + { + "epoch": 3.98, + "grad_norm": 11.051496505737305, + "learning_rate": 9.978398224367563e-07, + "logits/chosen": -0.485586017370224, + "logits/rejected": -0.5525038838386536, + "logps/chosen": -45.89112854003906, + "logps/rejected": -116.4612045288086, + "loss": 0.5794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0445353984832764, + "rewards/margins": 7.848052501678467, + "rewards/rejected": -4.803517818450928, + "step": 15896 + }, + { + "epoch": 3.98, + "grad_norm": 12.355741500854492, + "learning_rate": 9.97368732401065e-07, + "logits/chosen": -0.5779978632926941, + "logits/rejected": -0.6926109790802002, + "logps/chosen": -52.02428436279297, + "logps/rejected": -91.49653625488281, + "loss": 0.6393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9346067905426025, + "rewards/margins": 6.776278495788574, + "rewards/rejected": -3.8416714668273926, + "step": 15897 + }, + { + "epoch": 3.98, + "grad_norm": 22.745540618896484, + "learning_rate": 9.968977412746495e-07, + "logits/chosen": -0.5539799928665161, + "logits/rejected": -0.6599383354187012, + "logps/chosen": -69.30449676513672, + "logps/rejected": -109.92022705078125, + "loss": 0.6818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.628288745880127, + "rewards/margins": 6.857353210449219, + "rewards/rejected": -4.229064464569092, + "step": 15898 + }, + { + "epoch": 3.98, + "grad_norm": 3.777555227279663, + "learning_rate": 9.964268490691503e-07, + "logits/chosen": -0.554900586605072, + "logits/rejected": -0.6069920659065247, + "logps/chosen": -46.053955078125, + "logps/rejected": -117.29683685302734, + "loss": 0.516, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25068998336792, + "rewards/margins": 6.986602306365967, + "rewards/rejected": -3.7359120845794678, + "step": 15899 + }, + { + "epoch": 3.98, + "grad_norm": 4.523924827575684, + "learning_rate": 9.959560557962017e-07, + "logits/chosen": -0.44574156403541565, + "logits/rejected": -0.5090569257736206, + "logps/chosen": -60.29289245605469, + "logps/rejected": -111.5156478881836, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0315749645233154, + "rewards/margins": 7.339954853057861, + "rewards/rejected": -4.308379650115967, + "step": 15900 + }, + { + "epoch": 3.98, + "grad_norm": 5.087145805358887, + "learning_rate": 9.95485361467437e-07, + "logits/chosen": -0.4875062108039856, + "logits/rejected": -0.5837529897689819, + "logps/chosen": -46.02202606201172, + "logps/rejected": -97.42860412597656, + "loss": 0.5822, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2815451622009277, + "rewards/margins": 7.446538925170898, + "rewards/rejected": -4.1649932861328125, + "step": 15901 + }, + { + "epoch": 3.98, + "grad_norm": 6.640341281890869, + "learning_rate": 9.950147660944893e-07, + "logits/chosen": -0.596975564956665, + "logits/rejected": -0.6373804211616516, + "logps/chosen": -45.37691116333008, + "logps/rejected": -112.98619079589844, + "loss": 0.6087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.258802890777588, + "rewards/margins": 6.925806045532227, + "rewards/rejected": -3.6670031547546387, + "step": 15902 + }, + { + "epoch": 3.98, + "grad_norm": 3.7217490673065186, + "learning_rate": 9.945442696889867e-07, + "logits/chosen": -0.4364742636680603, + "logits/rejected": -0.5173034071922302, + "logps/chosen": -75.08001708984375, + "logps/rejected": -116.70071411132812, + "loss": 0.5955, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8461742401123047, + "rewards/margins": 6.7146430015563965, + "rewards/rejected": -3.86846923828125, + "step": 15903 + }, + { + "epoch": 3.98, + "grad_norm": 2.4466185569763184, + "learning_rate": 9.940738722625542e-07, + "logits/chosen": -0.47332802414894104, + "logits/rejected": -0.5984935164451599, + "logps/chosen": -55.59824752807617, + "logps/rejected": -94.7660903930664, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9126038551330566, + "rewards/margins": 7.558644771575928, + "rewards/rejected": -4.646040916442871, + "step": 15904 + }, + { + "epoch": 3.98, + "grad_norm": 3.4221038818359375, + "learning_rate": 9.93603573826819e-07, + "logits/chosen": -0.5848737359046936, + "logits/rejected": -0.6288363933563232, + "logps/chosen": -59.94463348388672, + "logps/rejected": -110.67626953125, + "loss": 0.6454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0437235832214355, + "rewards/margins": 7.1497416496276855, + "rewards/rejected": -4.10601806640625, + "step": 15905 + }, + { + "epoch": 3.98, + "grad_norm": 21.76181983947754, + "learning_rate": 9.931333743933992e-07, + "logits/chosen": -0.5726902484893799, + "logits/rejected": -0.6382827758789062, + "logps/chosen": -58.56243133544922, + "logps/rejected": -107.43016052246094, + "loss": 0.7852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.073004722595215, + "rewards/margins": 7.275214195251465, + "rewards/rejected": -4.202209949493408, + "step": 15906 + }, + { + "epoch": 3.98, + "grad_norm": 7.594696521759033, + "learning_rate": 9.926632739739173e-07, + "logits/chosen": -0.4797085225582123, + "logits/rejected": -0.588648796081543, + "logps/chosen": -69.27129364013672, + "logps/rejected": -94.54805755615234, + "loss": 0.7719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.117889642715454, + "rewards/margins": 6.747834205627441, + "rewards/rejected": -3.6299448013305664, + "step": 15907 + }, + { + "epoch": 3.98, + "grad_norm": 3.1099696159362793, + "learning_rate": 9.921932725799875e-07, + "logits/chosen": -0.5647615194320679, + "logits/rejected": -0.6522936820983887, + "logps/chosen": -50.243682861328125, + "logps/rejected": -106.55191802978516, + "loss": 0.4953, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4247021675109863, + "rewards/margins": 7.508039474487305, + "rewards/rejected": -4.083337306976318, + "step": 15908 + }, + { + "epoch": 3.98, + "grad_norm": 3.924529552459717, + "learning_rate": 9.917233702232243e-07, + "logits/chosen": -0.4965735971927643, + "logits/rejected": -0.5750545263290405, + "logps/chosen": -56.40449523925781, + "logps/rejected": -115.36180114746094, + "loss": 0.6094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.861544370651245, + "rewards/margins": 7.714493751525879, + "rewards/rejected": -4.852949142456055, + "step": 15909 + }, + { + "epoch": 3.98, + "grad_norm": 2.7406959533691406, + "learning_rate": 9.912535669152412e-07, + "logits/chosen": -0.543473482131958, + "logits/rejected": -0.6127285957336426, + "logps/chosen": -60.263763427734375, + "logps/rejected": -115.6544418334961, + "loss": 0.6306, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135176658630371, + "rewards/margins": 7.530295372009277, + "rewards/rejected": -4.395118713378906, + "step": 15910 + }, + { + "epoch": 3.98, + "grad_norm": 5.045440196990967, + "learning_rate": 9.907838626676447e-07, + "logits/chosen": -0.4917275607585907, + "logits/rejected": -0.6122184991836548, + "logps/chosen": -53.92197036743164, + "logps/rejected": -91.06561279296875, + "loss": 0.7184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.930739402770996, + "rewards/margins": 6.72291374206543, + "rewards/rejected": -3.7921745777130127, + "step": 15911 + }, + { + "epoch": 3.98, + "grad_norm": 3.526271104812622, + "learning_rate": 9.903142574920448e-07, + "logits/chosen": -0.536189079284668, + "logits/rejected": -0.6418616771697998, + "logps/chosen": -51.99944305419922, + "logps/rejected": -90.89136505126953, + "loss": 0.5927, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0408225059509277, + "rewards/margins": 7.324493885040283, + "rewards/rejected": -4.283670902252197, + "step": 15912 + }, + { + "epoch": 3.98, + "grad_norm": 4.681820869445801, + "learning_rate": 9.898447514000431e-07, + "logits/chosen": -0.5174251794815063, + "logits/rejected": -0.6045981645584106, + "logps/chosen": -56.05769729614258, + "logps/rejected": -112.22724914550781, + "loss": 0.6004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0081472396850586, + "rewards/margins": 7.045641899108887, + "rewards/rejected": -4.037494659423828, + "step": 15913 + }, + { + "epoch": 3.98, + "grad_norm": 5.0909857749938965, + "learning_rate": 9.893753444032444e-07, + "logits/chosen": -0.51731276512146, + "logits/rejected": -0.5923595428466797, + "logps/chosen": -62.18699645996094, + "logps/rejected": -107.36689758300781, + "loss": 0.6904, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.093419075012207, + "rewards/margins": 7.297938346862793, + "rewards/rejected": -4.204519748687744, + "step": 15914 + }, + { + "epoch": 3.98, + "grad_norm": 4.788808822631836, + "learning_rate": 9.88906036513247e-07, + "logits/chosen": -0.6167402267456055, + "logits/rejected": -0.6945956945419312, + "logps/chosen": -55.67164993286133, + "logps/rejected": -106.60655975341797, + "loss": 0.6062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0693960189819336, + "rewards/margins": 7.343421459197998, + "rewards/rejected": -4.2740254402160645, + "step": 15915 + }, + { + "epoch": 3.98, + "grad_norm": 4.892789840698242, + "learning_rate": 9.884368277416462e-07, + "logits/chosen": -0.5315028429031372, + "logits/rejected": -0.5765882134437561, + "logps/chosen": -42.46611785888672, + "logps/rejected": -108.63957214355469, + "loss": 0.5383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.17185378074646, + "rewards/margins": 6.680246353149414, + "rewards/rejected": -3.508391857147217, + "step": 15916 + }, + { + "epoch": 3.98, + "grad_norm": 5.1715264320373535, + "learning_rate": 9.879677181000396e-07, + "logits/chosen": -0.6259675025939941, + "logits/rejected": -0.6588805913925171, + "logps/chosen": -36.251930236816406, + "logps/rejected": -122.02362060546875, + "loss": 0.5844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4337880611419678, + "rewards/margins": 8.328117370605469, + "rewards/rejected": -4.89432954788208, + "step": 15917 + }, + { + "epoch": 3.98, + "grad_norm": 4.572150707244873, + "learning_rate": 9.874987076000182e-07, + "logits/chosen": -0.4637783467769623, + "logits/rejected": -0.5625306367874146, + "logps/chosen": -56.212806701660156, + "logps/rejected": -104.9349365234375, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4517555236816406, + "rewards/margins": 8.00355339050293, + "rewards/rejected": -4.5517988204956055, + "step": 15918 + }, + { + "epoch": 3.98, + "grad_norm": 33.95309066772461, + "learning_rate": 9.870297962531704e-07, + "logits/chosen": -0.5117828845977783, + "logits/rejected": -0.6154526472091675, + "logps/chosen": -69.50606536865234, + "logps/rejected": -95.94366455078125, + "loss": 0.6432, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.911834478378296, + "rewards/margins": 6.70883846282959, + "rewards/rejected": -3.7970032691955566, + "step": 15919 + }, + { + "epoch": 3.98, + "grad_norm": 6.611413955688477, + "learning_rate": 9.865609840710861e-07, + "logits/chosen": -0.5151118040084839, + "logits/rejected": -0.5749644637107849, + "logps/chosen": -53.386375427246094, + "logps/rejected": -105.37613677978516, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.765876293182373, + "rewards/margins": 6.77255916595459, + "rewards/rejected": -4.006682395935059, + "step": 15920 + }, + { + "epoch": 3.98, + "grad_norm": 28.190208435058594, + "learning_rate": 9.860922710653486e-07, + "logits/chosen": -0.5381545424461365, + "logits/rejected": -0.6174795031547546, + "logps/chosen": -51.8513069152832, + "logps/rejected": -116.47693634033203, + "loss": 0.6501, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9031004905700684, + "rewards/margins": 7.422074794769287, + "rewards/rejected": -4.518974304199219, + "step": 15921 + }, + { + "epoch": 3.98, + "grad_norm": 5.373639106750488, + "learning_rate": 9.856236572475391e-07, + "logits/chosen": -0.5147950053215027, + "logits/rejected": -0.5561617612838745, + "logps/chosen": -54.72077178955078, + "logps/rejected": -117.49575805664062, + "loss": 0.6406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2023072242736816, + "rewards/margins": 7.647890567779541, + "rewards/rejected": -4.445582866668701, + "step": 15922 + }, + { + "epoch": 3.98, + "grad_norm": 5.1963276863098145, + "learning_rate": 9.851551426292388e-07, + "logits/chosen": -0.5892699360847473, + "logits/rejected": -0.6772583723068237, + "logps/chosen": -53.872310638427734, + "logps/rejected": -114.93717193603516, + "loss": 0.6769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.896873712539673, + "rewards/margins": 7.441441535949707, + "rewards/rejected": -4.544568061828613, + "step": 15923 + }, + { + "epoch": 3.98, + "grad_norm": 5.40798282623291, + "learning_rate": 9.846867272220267e-07, + "logits/chosen": -0.5843144059181213, + "logits/rejected": -0.6502653360366821, + "logps/chosen": -48.61378860473633, + "logps/rejected": -111.09490966796875, + "loss": 0.5812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0176281929016113, + "rewards/margins": 7.52095890045166, + "rewards/rejected": -4.503330230712891, + "step": 15924 + }, + { + "epoch": 3.98, + "grad_norm": 8.030747413635254, + "learning_rate": 9.84218411037477e-07, + "logits/chosen": -0.5233370065689087, + "logits/rejected": -0.5134108662605286, + "logps/chosen": -64.79570007324219, + "logps/rejected": -131.5640411376953, + "loss": 0.7738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.970639228820801, + "rewards/margins": 7.0941619873046875, + "rewards/rejected": -4.123522758483887, + "step": 15925 + }, + { + "epoch": 3.98, + "grad_norm": 2.8202273845672607, + "learning_rate": 9.837501940871597e-07, + "logits/chosen": -0.5641257166862488, + "logits/rejected": -0.5975860953330994, + "logps/chosen": -39.587318420410156, + "logps/rejected": -111.01586151123047, + "loss": 0.552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2919487953186035, + "rewards/margins": 7.939933776855469, + "rewards/rejected": -4.647984504699707, + "step": 15926 + }, + { + "epoch": 3.98, + "grad_norm": 11.064735412597656, + "learning_rate": 9.832820763826484e-07, + "logits/chosen": -0.5946427583694458, + "logits/rejected": -0.6999710202217102, + "logps/chosen": -59.33475112915039, + "logps/rejected": -108.45208740234375, + "loss": 0.5775, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.965226173400879, + "rewards/margins": 7.976653099060059, + "rewards/rejected": -5.01142692565918, + "step": 15927 + }, + { + "epoch": 3.98, + "grad_norm": 6.201157569885254, + "learning_rate": 9.8281405793551e-07, + "logits/chosen": -0.5580025911331177, + "logits/rejected": -0.6211829781532288, + "logps/chosen": -45.01433181762695, + "logps/rejected": -102.618408203125, + "loss": 0.5831, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.524660110473633, + "rewards/margins": 6.730459213256836, + "rewards/rejected": -3.205798864364624, + "step": 15928 + }, + { + "epoch": 3.98, + "grad_norm": 10.287454605102539, + "learning_rate": 9.82346138757307e-07, + "logits/chosen": -0.5632014870643616, + "logits/rejected": -0.6581488847732544, + "logps/chosen": -55.692230224609375, + "logps/rejected": -101.45860290527344, + "loss": 0.6775, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.85296630859375, + "rewards/margins": 6.263921737670898, + "rewards/rejected": -3.4109556674957275, + "step": 15929 + }, + { + "epoch": 3.99, + "grad_norm": 4.865208148956299, + "learning_rate": 9.818783188596059e-07, + "logits/chosen": -0.5597338676452637, + "logits/rejected": -0.5941867828369141, + "logps/chosen": -49.299468994140625, + "logps/rejected": -117.21966552734375, + "loss": 0.6236, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4253547191619873, + "rewards/margins": 6.8049211502075195, + "rewards/rejected": -3.379565715789795, + "step": 15930 + }, + { + "epoch": 3.99, + "grad_norm": 4.961761474609375, + "learning_rate": 9.81410598253964e-07, + "logits/chosen": -0.5357860922813416, + "logits/rejected": -0.6309840679168701, + "logps/chosen": -52.57822036743164, + "logps/rejected": -91.35221862792969, + "loss": 0.6192, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.275010108947754, + "rewards/margins": 6.894208908081055, + "rewards/rejected": -3.6191985607147217, + "step": 15931 + }, + { + "epoch": 3.99, + "grad_norm": 7.615108489990234, + "learning_rate": 9.809429769519418e-07, + "logits/chosen": -0.6270083785057068, + "logits/rejected": -0.6453813314437866, + "logps/chosen": -59.14261245727539, + "logps/rejected": -114.4835433959961, + "loss": 0.7373, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2105984687805176, + "rewards/margins": 5.9415998458862305, + "rewards/rejected": -2.731001853942871, + "step": 15932 + }, + { + "epoch": 3.99, + "grad_norm": 21.246997833251953, + "learning_rate": 9.804754549650942e-07, + "logits/chosen": -0.568178653717041, + "logits/rejected": -0.6144625544548035, + "logps/chosen": -41.3649787902832, + "logps/rejected": -108.31401062011719, + "loss": 0.5556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3372199535369873, + "rewards/margins": 7.626008033752441, + "rewards/rejected": -4.288788318634033, + "step": 15933 + }, + { + "epoch": 3.99, + "grad_norm": 5.59820556640625, + "learning_rate": 9.800080323049715e-07, + "logits/chosen": -0.575133204460144, + "logits/rejected": -0.6541658043861389, + "logps/chosen": -54.01933288574219, + "logps/rejected": -110.05663299560547, + "loss": 0.683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0467543601989746, + "rewards/margins": 7.611402988433838, + "rewards/rejected": -4.564648628234863, + "step": 15934 + }, + { + "epoch": 3.99, + "grad_norm": 10.201444625854492, + "learning_rate": 9.795407089831276e-07, + "logits/chosen": -0.5373557806015015, + "logits/rejected": -0.6110684275627136, + "logps/chosen": -50.33247375488281, + "logps/rejected": -101.594482421875, + "loss": 0.626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.473544120788574, + "rewards/margins": 5.708568572998047, + "rewards/rejected": -2.2350244522094727, + "step": 15935 + }, + { + "epoch": 3.99, + "grad_norm": 6.241282939910889, + "learning_rate": 9.79073485011109e-07, + "logits/chosen": -0.5786851048469543, + "logits/rejected": -0.645309329032898, + "logps/chosen": -58.59307098388672, + "logps/rejected": -81.65186309814453, + "loss": 0.6955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.418738842010498, + "rewards/margins": 5.685741424560547, + "rewards/rejected": -2.267003059387207, + "step": 15936 + }, + { + "epoch": 3.99, + "grad_norm": 5.129996299743652, + "learning_rate": 9.786063604004603e-07, + "logits/chosen": -0.47395092248916626, + "logits/rejected": -0.5408076047897339, + "logps/chosen": -54.959712982177734, + "logps/rejected": -92.29386138916016, + "loss": 0.6245, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2798783779144287, + "rewards/margins": 5.554927825927734, + "rewards/rejected": -2.2750494480133057, + "step": 15937 + }, + { + "epoch": 3.99, + "grad_norm": 5.927367687225342, + "learning_rate": 9.781393351627261e-07, + "logits/chosen": -0.48437708616256714, + "logits/rejected": -0.5572546720504761, + "logps/chosen": -43.3455810546875, + "logps/rejected": -117.26306915283203, + "loss": 0.6233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.190194845199585, + "rewards/margins": 8.57189655303955, + "rewards/rejected": -5.381702423095703, + "step": 15938 + }, + { + "epoch": 3.99, + "grad_norm": 4.7356367111206055, + "learning_rate": 9.776724093094486e-07, + "logits/chosen": -0.5457161664962769, + "logits/rejected": -0.6137121319770813, + "logps/chosen": -51.52400207519531, + "logps/rejected": -98.29536437988281, + "loss": 0.6286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.367713451385498, + "rewards/margins": 6.422565460205078, + "rewards/rejected": -3.0548527240753174, + "step": 15939 + }, + { + "epoch": 3.99, + "grad_norm": 7.290774822235107, + "learning_rate": 9.77205582852162e-07, + "logits/chosen": -0.5869145393371582, + "logits/rejected": -0.679489254951477, + "logps/chosen": -60.3681755065918, + "logps/rejected": -89.99079895019531, + "loss": 0.6513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0869548320770264, + "rewards/margins": 6.709768295288086, + "rewards/rejected": -3.6228132247924805, + "step": 15940 + }, + { + "epoch": 3.99, + "grad_norm": 5.177866458892822, + "learning_rate": 9.767388558024044e-07, + "logits/chosen": -0.5703171491622925, + "logits/rejected": -0.6023591160774231, + "logps/chosen": -47.86458206176758, + "logps/rejected": -97.140380859375, + "loss": 0.617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.132707118988037, + "rewards/margins": 5.814753532409668, + "rewards/rejected": -2.682046413421631, + "step": 15941 + }, + { + "epoch": 3.99, + "grad_norm": 4.272948265075684, + "learning_rate": 9.76272228171712e-07, + "logits/chosen": -0.5250553488731384, + "logits/rejected": -0.5986616611480713, + "logps/chosen": -48.29877471923828, + "logps/rejected": -104.11527252197266, + "loss": 0.5616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8638017177581787, + "rewards/margins": 6.362853527069092, + "rewards/rejected": -3.499051809310913, + "step": 15942 + }, + { + "epoch": 3.99, + "grad_norm": 3.395585536956787, + "learning_rate": 9.758056999716093e-07, + "logits/chosen": -0.5498902201652527, + "logits/rejected": -0.6369020342826843, + "logps/chosen": -58.01276397705078, + "logps/rejected": -108.31490325927734, + "loss": 0.6326, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.793452262878418, + "rewards/margins": 7.142553806304932, + "rewards/rejected": -4.349101543426514, + "step": 15943 + }, + { + "epoch": 3.99, + "grad_norm": 3.516317129135132, + "learning_rate": 9.753392712136283e-07, + "logits/chosen": -0.48717617988586426, + "logits/rejected": -0.5320053696632385, + "logps/chosen": -54.66604232788086, + "logps/rejected": -132.66537475585938, + "loss": 0.5372, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2963976860046387, + "rewards/margins": 8.48343563079834, + "rewards/rejected": -5.187038421630859, + "step": 15944 + }, + { + "epoch": 3.99, + "grad_norm": 5.060702323913574, + "learning_rate": 9.748729419092962e-07, + "logits/chosen": -0.486422598361969, + "logits/rejected": -0.5598998069763184, + "logps/chosen": -50.76204299926758, + "logps/rejected": -88.5255355834961, + "loss": 0.6195, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.155531167984009, + "rewards/margins": 6.00274658203125, + "rewards/rejected": -2.847214460372925, + "step": 15945 + }, + { + "epoch": 3.99, + "grad_norm": 3.0617730617523193, + "learning_rate": 9.744067120701351e-07, + "logits/chosen": -0.5588197112083435, + "logits/rejected": -0.6341046690940857, + "logps/chosen": -55.90064239501953, + "logps/rejected": -87.09991455078125, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2363672256469727, + "rewards/margins": 6.201167583465576, + "rewards/rejected": -2.9648001194000244, + "step": 15946 + }, + { + "epoch": 3.99, + "grad_norm": 49.69501876831055, + "learning_rate": 9.73940581707664e-07, + "logits/chosen": -0.5175119638442993, + "logits/rejected": -0.6363651156425476, + "logps/chosen": -59.56685256958008, + "logps/rejected": -105.55880737304688, + "loss": 0.6622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.016251802444458, + "rewards/margins": 6.88543701171875, + "rewards/rejected": -3.869185447692871, + "step": 15947 + }, + { + "epoch": 3.99, + "grad_norm": 5.037932395935059, + "learning_rate": 9.734745508334053e-07, + "logits/chosen": -0.5097192525863647, + "logits/rejected": -0.5666214227676392, + "logps/chosen": -57.13829803466797, + "logps/rejected": -111.20548248291016, + "loss": 0.6492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.352360248565674, + "rewards/margins": 7.482213020324707, + "rewards/rejected": -4.129853248596191, + "step": 15948 + }, + { + "epoch": 3.99, + "grad_norm": 6.147233009338379, + "learning_rate": 9.730086194588723e-07, + "logits/chosen": -0.572625994682312, + "logits/rejected": -0.6528182029724121, + "logps/chosen": -45.95945739746094, + "logps/rejected": -116.1903076171875, + "loss": 0.5889, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.170314073562622, + "rewards/margins": 7.621992111206055, + "rewards/rejected": -4.451677322387695, + "step": 15949 + }, + { + "epoch": 3.99, + "grad_norm": 15.217695236206055, + "learning_rate": 9.7254278759558e-07, + "logits/chosen": -0.5890927314758301, + "logits/rejected": -0.6581465601921082, + "logps/chosen": -59.36018371582031, + "logps/rejected": -98.38758087158203, + "loss": 0.6186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.029971122741699, + "rewards/margins": 6.608436107635498, + "rewards/rejected": -3.578465223312378, + "step": 15950 + }, + { + "epoch": 3.99, + "grad_norm": 3.9437315464019775, + "learning_rate": 9.720770552550402e-07, + "logits/chosen": -0.5289259552955627, + "logits/rejected": -0.6591362953186035, + "logps/chosen": -60.033531188964844, + "logps/rejected": -99.83067321777344, + "loss": 0.6551, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9089837074279785, + "rewards/margins": 7.929743766784668, + "rewards/rejected": -5.0207600593566895, + "step": 15951 + }, + { + "epoch": 3.99, + "grad_norm": 6.566840648651123, + "learning_rate": 9.716114224487583e-07, + "logits/chosen": -0.46023258566856384, + "logits/rejected": -0.5357791185379028, + "logps/chosen": -62.6124267578125, + "logps/rejected": -101.41876220703125, + "loss": 0.6405, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1696605682373047, + "rewards/margins": 5.686507225036621, + "rewards/rejected": -2.5168466567993164, + "step": 15952 + }, + { + "epoch": 3.99, + "grad_norm": 3.704559087753296, + "learning_rate": 9.711458891882453e-07, + "logits/chosen": -0.49531862139701843, + "logits/rejected": -0.6501961946487427, + "logps/chosen": -62.8976936340332, + "logps/rejected": -112.50041198730469, + "loss": 0.544, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7913005352020264, + "rewards/margins": 7.72125244140625, + "rewards/rejected": -4.9299516677856445, + "step": 15953 + }, + { + "epoch": 3.99, + "grad_norm": 4.594649314880371, + "learning_rate": 9.70680455485002e-07, + "logits/chosen": -0.5313626527786255, + "logits/rejected": -0.6556247472763062, + "logps/chosen": -51.90534210205078, + "logps/rejected": -107.00518035888672, + "loss": 0.6153, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1747548580169678, + "rewards/margins": 7.353468418121338, + "rewards/rejected": -4.178712844848633, + "step": 15954 + }, + { + "epoch": 3.99, + "grad_norm": 4.65157413482666, + "learning_rate": 9.70215121350529e-07, + "logits/chosen": -0.5555679798126221, + "logits/rejected": -0.6534659266471863, + "logps/chosen": -54.384605407714844, + "logps/rejected": -87.41473388671875, + "loss": 0.6243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9545295238494873, + "rewards/margins": 6.404112815856934, + "rewards/rejected": -3.4495837688446045, + "step": 15955 + }, + { + "epoch": 3.99, + "grad_norm": 4.330230236053467, + "learning_rate": 9.697498867963267e-07, + "logits/chosen": -0.5613777041435242, + "logits/rejected": -0.6529709100723267, + "logps/chosen": -47.63642883300781, + "logps/rejected": -99.95919036865234, + "loss": 0.5378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1969082355499268, + "rewards/margins": 7.077265739440918, + "rewards/rejected": -3.8803577423095703, + "step": 15956 + }, + { + "epoch": 3.99, + "grad_norm": 10.057411193847656, + "learning_rate": 9.692847518338932e-07, + "logits/chosen": -0.5162825584411621, + "logits/rejected": -0.5609570145606995, + "logps/chosen": -55.849700927734375, + "logps/rejected": -93.19125366210938, + "loss": 0.6463, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1171553134918213, + "rewards/margins": 5.837244510650635, + "rewards/rejected": -2.7200894355773926, + "step": 15957 + }, + { + "epoch": 3.99, + "grad_norm": 6.969101428985596, + "learning_rate": 9.688197164747203e-07, + "logits/chosen": -0.49242064356803894, + "logits/rejected": -0.5780749917030334, + "logps/chosen": -54.63463592529297, + "logps/rejected": -97.3761978149414, + "loss": 0.657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9172911643981934, + "rewards/margins": 7.1925129890441895, + "rewards/rejected": -4.275222301483154, + "step": 15958 + }, + { + "epoch": 3.99, + "grad_norm": 15.060888290405273, + "learning_rate": 9.683547807302989e-07, + "logits/chosen": -0.45067107677459717, + "logits/rejected": -0.5173429846763611, + "logps/chosen": -62.163848876953125, + "logps/rejected": -85.05329895019531, + "loss": 0.6958, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.774991512298584, + "rewards/margins": 5.555738925933838, + "rewards/rejected": -2.7807469367980957, + "step": 15959 + }, + { + "epoch": 3.99, + "grad_norm": 14.170783996582031, + "learning_rate": 9.678899446121205e-07, + "logits/chosen": -0.5477445125579834, + "logits/rejected": -0.5934286713600159, + "logps/chosen": -45.350284576416016, + "logps/rejected": -105.7005844116211, + "loss": 0.644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.101541042327881, + "rewards/margins": 6.231424331665039, + "rewards/rejected": -3.129883289337158, + "step": 15960 + }, + { + "epoch": 3.99, + "grad_norm": 5.8245954513549805, + "learning_rate": 9.674252081316704e-07, + "logits/chosen": -0.5503235459327698, + "logits/rejected": -0.6257526874542236, + "logps/chosen": -54.636329650878906, + "logps/rejected": -101.2955551147461, + "loss": 0.5881, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.000335454940796, + "rewards/margins": 6.2618255615234375, + "rewards/rejected": -3.2614901065826416, + "step": 15961 + }, + { + "epoch": 3.99, + "grad_norm": 10.835545539855957, + "learning_rate": 9.66960571300431e-07, + "logits/chosen": -0.5196540951728821, + "logits/rejected": -0.5664422512054443, + "logps/chosen": -54.929691314697266, + "logps/rejected": -124.14201354980469, + "loss": 0.6183, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9528520107269287, + "rewards/margins": 7.9921369552612305, + "rewards/rejected": -5.039285182952881, + "step": 15962 + }, + { + "epoch": 3.99, + "grad_norm": 4.149049758911133, + "learning_rate": 9.664960341298868e-07, + "logits/chosen": -0.43608129024505615, + "logits/rejected": -0.551180899143219, + "logps/chosen": -72.25170135498047, + "logps/rejected": -117.6248550415039, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1609716415405273, + "rewards/margins": 6.886235237121582, + "rewards/rejected": -3.7252633571624756, + "step": 15963 + }, + { + "epoch": 3.99, + "grad_norm": 2.0838687419891357, + "learning_rate": 9.66031596631516e-07, + "logits/chosen": -0.45805373787879944, + "logits/rejected": -0.5868173837661743, + "logps/chosen": -65.73265075683594, + "logps/rejected": -103.21147155761719, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.988055467605591, + "rewards/margins": 7.770455360412598, + "rewards/rejected": -4.782399654388428, + "step": 15964 + }, + { + "epoch": 3.99, + "grad_norm": 3.536665201187134, + "learning_rate": 9.655672588167942e-07, + "logits/chosen": -0.49495503306388855, + "logits/rejected": -0.5951298475265503, + "logps/chosen": -54.233543395996094, + "logps/rejected": -91.2990951538086, + "loss": 0.549, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.969942569732666, + "rewards/margins": 7.255275726318359, + "rewards/rejected": -4.285333156585693, + "step": 15965 + }, + { + "epoch": 3.99, + "grad_norm": 2.973135232925415, + "learning_rate": 9.651030206971962e-07, + "logits/chosen": -0.553580641746521, + "logits/rejected": -0.6566305756568909, + "logps/chosen": -60.36431121826172, + "logps/rejected": -100.96846771240234, + "loss": 0.5866, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.388643503189087, + "rewards/margins": 7.556102275848389, + "rewards/rejected": -4.1674580574035645, + "step": 15966 + }, + { + "epoch": 3.99, + "grad_norm": 13.483588218688965, + "learning_rate": 9.646388822841974e-07, + "logits/chosen": -0.5501141548156738, + "logits/rejected": -0.6223451495170593, + "logps/chosen": -51.21400833129883, + "logps/rejected": -97.9139404296875, + "loss": 0.7525, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.101280450820923, + "rewards/margins": 7.04284143447876, + "rewards/rejected": -3.941560983657837, + "step": 15967 + }, + { + "epoch": 3.99, + "grad_norm": 5.4697442054748535, + "learning_rate": 9.641748435892606e-07, + "logits/chosen": -0.4617340564727783, + "logits/rejected": -0.5771583914756775, + "logps/chosen": -48.67039108276367, + "logps/rejected": -108.5562973022461, + "loss": 0.6241, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2214484214782715, + "rewards/margins": 7.985832691192627, + "rewards/rejected": -4.764384746551514, + "step": 15968 + }, + { + "epoch": 3.99, + "grad_norm": 5.96333122253418, + "learning_rate": 9.637109046238569e-07, + "logits/chosen": -0.47992396354675293, + "logits/rejected": -0.5205281972885132, + "logps/chosen": -58.248985290527344, + "logps/rejected": -122.33743286132812, + "loss": 0.5888, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8741447925567627, + "rewards/margins": 7.446459770202637, + "rewards/rejected": -4.572315216064453, + "step": 15969 + }, + { + "epoch": 4.0, + "grad_norm": 8.203930854797363, + "learning_rate": 9.632470653994509e-07, + "logits/chosen": -0.5389465689659119, + "logits/rejected": -0.6215804815292358, + "logps/chosen": -63.204673767089844, + "logps/rejected": -104.39647674560547, + "loss": 0.7074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037950277328491, + "rewards/margins": 6.583009719848633, + "rewards/rejected": -3.5450594425201416, + "step": 15970 + }, + { + "epoch": 4.0, + "grad_norm": 3.793613910675049, + "learning_rate": 9.627833259275033e-07, + "logits/chosen": -0.5115904211997986, + "logits/rejected": -0.5831723809242249, + "logps/chosen": -57.19972229003906, + "logps/rejected": -117.57720947265625, + "loss": 0.5614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3353042602539062, + "rewards/margins": 7.314188480377197, + "rewards/rejected": -3.97888445854187, + "step": 15971 + }, + { + "epoch": 4.0, + "grad_norm": 5.791696548461914, + "learning_rate": 9.623196862194727e-07, + "logits/chosen": -0.5389478206634521, + "logits/rejected": -0.6766849756240845, + "logps/chosen": -59.66300964355469, + "logps/rejected": -102.43814086914062, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.720705509185791, + "rewards/margins": 7.361323833465576, + "rewards/rejected": -4.640618324279785, + "step": 15972 + }, + { + "epoch": 4.0, + "grad_norm": 5.130208492279053, + "learning_rate": 9.618561462868191e-07, + "logits/chosen": -0.4611901640892029, + "logits/rejected": -0.5401140451431274, + "logps/chosen": -58.572959899902344, + "logps/rejected": -100.77796936035156, + "loss": 0.6297, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1140735149383545, + "rewards/margins": 6.589937210083008, + "rewards/rejected": -3.4758641719818115, + "step": 15973 + }, + { + "epoch": 4.0, + "grad_norm": 2.709829568862915, + "learning_rate": 9.61392706140994e-07, + "logits/chosen": -0.5164250135421753, + "logits/rejected": -0.5797299146652222, + "logps/chosen": -61.6558837890625, + "logps/rejected": -135.31283569335938, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.527822494506836, + "rewards/margins": 8.890049934387207, + "rewards/rejected": -5.362227439880371, + "step": 15974 + }, + { + "epoch": 4.0, + "grad_norm": 5.002143859863281, + "learning_rate": 9.609293657934514e-07, + "logits/chosen": -0.6086726784706116, + "logits/rejected": -0.6852232217788696, + "logps/chosen": -50.177486419677734, + "logps/rejected": -82.19317626953125, + "loss": 0.6315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.207838535308838, + "rewards/margins": 6.348689556121826, + "rewards/rejected": -3.1408510208129883, + "step": 15975 + }, + { + "epoch": 4.0, + "grad_norm": 4.358623027801514, + "learning_rate": 9.604661252556407e-07, + "logits/chosen": -0.49355465173721313, + "logits/rejected": -0.6010679006576538, + "logps/chosen": -56.134849548339844, + "logps/rejected": -91.83577728271484, + "loss": 0.5577, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25545597076416, + "rewards/margins": 6.365983009338379, + "rewards/rejected": -3.110527753829956, + "step": 15976 + }, + { + "epoch": 4.0, + "grad_norm": 8.502941131591797, + "learning_rate": 9.600029845390074e-07, + "logits/chosen": -0.5386401414871216, + "logits/rejected": -0.6482468843460083, + "logps/chosen": -57.188446044921875, + "logps/rejected": -93.97224426269531, + "loss": 0.6406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0010194778442383, + "rewards/margins": 7.398708820343018, + "rewards/rejected": -4.397689342498779, + "step": 15977 + }, + { + "epoch": 4.0, + "grad_norm": 4.3280253410339355, + "learning_rate": 9.59539943654999e-07, + "logits/chosen": -0.538479208946228, + "logits/rejected": -0.6184719800949097, + "logps/chosen": -47.40326690673828, + "logps/rejected": -102.87211608886719, + "loss": 0.577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.081807851791382, + "rewards/margins": 6.558639049530029, + "rewards/rejected": -3.4768316745758057, + "step": 15978 + }, + { + "epoch": 4.0, + "grad_norm": 4.729049205780029, + "learning_rate": 9.590770026150559e-07, + "logits/chosen": -0.520749568939209, + "logits/rejected": -0.5384451746940613, + "logps/chosen": -56.56440353393555, + "logps/rejected": -124.69660949707031, + "loss": 0.6823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2028088569641113, + "rewards/margins": 6.2557268142700195, + "rewards/rejected": -3.052917718887329, + "step": 15979 + }, + { + "epoch": 4.0, + "grad_norm": 3.0621137619018555, + "learning_rate": 9.586141614306172e-07, + "logits/chosen": -0.5400403141975403, + "logits/rejected": -0.6490234732627869, + "logps/chosen": -49.743614196777344, + "logps/rejected": -98.72980499267578, + "loss": 0.6155, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.347398281097412, + "rewards/margins": 8.428909301757812, + "rewards/rejected": -5.081510543823242, + "step": 15980 + }, + { + "epoch": 4.0, + "grad_norm": 8.912595748901367, + "learning_rate": 9.58151420113121e-07, + "logits/chosen": -0.5517511963844299, + "logits/rejected": -0.6221619248390198, + "logps/chosen": -50.54415512084961, + "logps/rejected": -98.556640625, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1186881065368652, + "rewards/margins": 6.9093708992004395, + "rewards/rejected": -3.790682554244995, + "step": 15981 + }, + { + "epoch": 4.0, + "grad_norm": 6.881345748901367, + "learning_rate": 9.576887786740048e-07, + "logits/chosen": -0.5053680539131165, + "logits/rejected": -0.5826867818832397, + "logps/chosen": -45.37133026123047, + "logps/rejected": -85.6967544555664, + "loss": 0.5847, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7875113487243652, + "rewards/margins": 5.980173587799072, + "rewards/rejected": -3.192662000656128, + "step": 15982 + }, + { + "epoch": 4.0, + "grad_norm": 3.465912342071533, + "learning_rate": 9.57226237124696e-07, + "logits/chosen": -0.5624377131462097, + "logits/rejected": -0.5928454399108887, + "logps/chosen": -46.798152923583984, + "logps/rejected": -116.44611358642578, + "loss": 0.5571, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1268513202667236, + "rewards/margins": 6.974684238433838, + "rewards/rejected": -3.847832679748535, + "step": 15983 + }, + { + "epoch": 4.0, + "grad_norm": 11.577455520629883, + "learning_rate": 9.567637954766269e-07, + "logits/chosen": -0.5184138417243958, + "logits/rejected": -0.6523247957229614, + "logps/chosen": -55.55091857910156, + "logps/rejected": -102.84195709228516, + "loss": 0.6747, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8198015689849854, + "rewards/margins": 6.549385070800781, + "rewards/rejected": -3.729583263397217, + "step": 15984 + }, + { + "epoch": 4.0, + "grad_norm": 5.470223903656006, + "learning_rate": 9.563014537412275e-07, + "logits/chosen": -0.6298361420631409, + "logits/rejected": -0.6364310383796692, + "logps/chosen": -44.538570404052734, + "logps/rejected": -142.73577880859375, + "loss": 0.6115, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.23325252532959, + "rewards/margins": 7.227320194244385, + "rewards/rejected": -3.994067430496216, + "step": 15985 + }, + { + "epoch": 4.0, + "grad_norm": 3.9433884620666504, + "learning_rate": 9.558392119299175e-07, + "logits/chosen": -0.4869903326034546, + "logits/rejected": -0.5726510286331177, + "logps/chosen": -55.97480773925781, + "logps/rejected": -105.05349731445312, + "loss": 0.5961, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0423645973205566, + "rewards/margins": 6.902920722961426, + "rewards/rejected": -3.860556125640869, + "step": 15986 + }, + { + "epoch": 4.0, + "grad_norm": 4.091956615447998, + "learning_rate": 9.553770700541227e-07, + "logits/chosen": -0.5396695733070374, + "logits/rejected": -0.5333274006843567, + "logps/chosen": -49.28852844238281, + "logps/rejected": -128.50698852539062, + "loss": 0.599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2696805000305176, + "rewards/margins": 8.073712348937988, + "rewards/rejected": -4.804032325744629, + "step": 15987 + }, + { + "epoch": 4.0, + "grad_norm": 3.718966007232666, + "learning_rate": 9.549150281252633e-07, + "logits/chosen": -0.4787873327732086, + "logits/rejected": -0.5920910835266113, + "logps/chosen": -55.85419464111328, + "logps/rejected": -127.57035827636719, + "loss": 0.5943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.421038866043091, + "rewards/margins": 9.49055004119873, + "rewards/rejected": -6.069511890411377, + "step": 15988 + }, + { + "epoch": 4.0, + "grad_norm": 5.154825210571289, + "learning_rate": 9.544530861547563e-07, + "logits/chosen": -0.5331178903579712, + "logits/rejected": -0.6286640167236328, + "logps/chosen": -57.295562744140625, + "logps/rejected": -97.71961212158203, + "loss": 0.6506, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.204599380493164, + "rewards/margins": 6.541574954986572, + "rewards/rejected": -3.3369765281677246, + "step": 15989 + }, + { + "epoch": 4.0, + "grad_norm": 2.4448726177215576, + "learning_rate": 9.53991244154015e-07, + "logits/chosen": -0.5192006826400757, + "logits/rejected": -0.5512743592262268, + "logps/chosen": -46.075218200683594, + "logps/rejected": -129.1924591064453, + "loss": 0.5755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.18172287940979, + "rewards/margins": 9.00664234161377, + "rewards/rejected": -5.824919700622559, + "step": 15990 + }, + { + "epoch": 4.0, + "grad_norm": 4.480286598205566, + "learning_rate": 9.535295021344554e-07, + "logits/chosen": -0.6188132166862488, + "logits/rejected": -0.705296516418457, + "logps/chosen": -48.785911560058594, + "logps/rejected": -111.58903503417969, + "loss": 0.5898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2427992820739746, + "rewards/margins": 7.267548561096191, + "rewards/rejected": -4.024749755859375, + "step": 15991 + }, + { + "epoch": 4.0, + "grad_norm": 51.158329010009766, + "learning_rate": 9.530678601074855e-07, + "logits/chosen": -0.5305361747741699, + "logits/rejected": -0.6035678386688232, + "logps/chosen": -71.46212768554688, + "logps/rejected": -102.44901275634766, + "loss": 0.7575, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2465336322784424, + "rewards/margins": 6.970245361328125, + "rewards/rejected": -3.7237119674682617, + "step": 15992 + }, + { + "epoch": 4.0, + "grad_norm": 4.445451259613037, + "learning_rate": 9.526063180845124e-07, + "logits/chosen": -0.5231773853302002, + "logits/rejected": -0.5758567452430725, + "logps/chosen": -50.212303161621094, + "logps/rejected": -112.26130676269531, + "loss": 0.6438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0390307903289795, + "rewards/margins": 6.214761734008789, + "rewards/rejected": -3.1757309436798096, + "step": 15993 + }, + { + "epoch": 4.0, + "grad_norm": 5.3681254386901855, + "learning_rate": 9.521448760769431e-07, + "logits/chosen": -0.5301424264907837, + "logits/rejected": -0.5905410647392273, + "logps/chosen": -60.13134002685547, + "logps/rejected": -100.83551025390625, + "loss": 0.6843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0744152069091797, + "rewards/margins": 6.847209930419922, + "rewards/rejected": -3.7727949619293213, + "step": 15994 + }, + { + "epoch": 4.0, + "grad_norm": 4.9153666496276855, + "learning_rate": 9.516835340961783e-07, + "logits/chosen": -0.5617011785507202, + "logits/rejected": -0.593360185623169, + "logps/chosen": -52.610015869140625, + "logps/rejected": -110.48081970214844, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.075253486633301, + "rewards/margins": 7.0845794677734375, + "rewards/rejected": -4.0093255043029785, + "step": 15995 + }, + { + "epoch": 4.0, + "grad_norm": 5.340830326080322, + "learning_rate": 9.512222921536202e-07, + "logits/chosen": -0.6143156290054321, + "logits/rejected": -0.665575385093689, + "logps/chosen": -47.90668487548828, + "logps/rejected": -112.81619262695312, + "loss": 0.5772, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182924509048462, + "rewards/margins": 6.934488296508789, + "rewards/rejected": -3.751563549041748, + "step": 15996 + }, + { + "epoch": 4.0, + "grad_norm": 4.393012523651123, + "learning_rate": 9.507611502606662e-07, + "logits/chosen": -0.4196164309978485, + "logits/rejected": -0.6016994118690491, + "logps/chosen": -64.73673248291016, + "logps/rejected": -96.40880584716797, + "loss": 0.5812, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1519927978515625, + "rewards/margins": 7.331627368927002, + "rewards/rejected": -4.179635047912598, + "step": 15997 + }, + { + "epoch": 4.0, + "grad_norm": 4.2291460037231445, + "learning_rate": 9.503001084287094e-07, + "logits/chosen": -0.5621203780174255, + "logits/rejected": -0.6482265591621399, + "logps/chosen": -54.95001220703125, + "logps/rejected": -117.194091796875, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273242950439453, + "rewards/margins": 8.736244201660156, + "rewards/rejected": -5.463000774383545, + "step": 15998 + }, + { + "epoch": 4.0, + "grad_norm": 2.5051991939544678, + "learning_rate": 9.498391666691448e-07, + "logits/chosen": -0.49620285630226135, + "logits/rejected": -0.5638942718505859, + "logps/chosen": -49.763450622558594, + "logps/rejected": -106.79954528808594, + "loss": 0.5458, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4152767658233643, + "rewards/margins": 7.696681022644043, + "rewards/rejected": -4.2814040184021, + "step": 15999 + }, + { + "epoch": 4.0, + "grad_norm": 18.592378616333008, + "learning_rate": 9.493783249933636e-07, + "logits/chosen": -0.4633573293685913, + "logits/rejected": -0.5204333662986755, + "logps/chosen": -60.07577896118164, + "logps/rejected": -103.8283462524414, + "loss": 0.7184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.012009382247925, + "rewards/margins": 6.247693061828613, + "rewards/rejected": -3.2356841564178467, + "step": 16000 + }, + { + "epoch": 4.0, + "grad_norm": 2.0411205291748047, + "learning_rate": 9.489175834127523e-07, + "logits/chosen": -0.6354334354400635, + "logits/rejected": -0.7419214248657227, + "logps/chosen": -48.23735046386719, + "logps/rejected": -91.79615783691406, + "loss": 0.5093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2947700023651123, + "rewards/margins": 7.932159900665283, + "rewards/rejected": -4.637390613555908, + "step": 16001 + }, + { + "epoch": 4.0, + "grad_norm": 5.865440368652344, + "learning_rate": 9.484569419386952e-07, + "logits/chosen": -0.5730680227279663, + "logits/rejected": -0.6286019682884216, + "logps/chosen": -46.02599334716797, + "logps/rejected": -99.58686065673828, + "loss": 0.6437, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100543737411499, + "rewards/margins": 6.799977779388428, + "rewards/rejected": -3.699434518814087, + "step": 16002 + }, + { + "epoch": 4.0, + "grad_norm": 6.648226737976074, + "learning_rate": 9.479964005825775e-07, + "logits/chosen": -0.5793702602386475, + "logits/rejected": -0.640451192855835, + "logps/chosen": -53.00160217285156, + "logps/rejected": -95.86588287353516, + "loss": 0.7328, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7605106830596924, + "rewards/margins": 5.26610803604126, + "rewards/rejected": -2.505596876144409, + "step": 16003 + }, + { + "epoch": 4.0, + "grad_norm": 6.690134048461914, + "learning_rate": 9.475359593557793e-07, + "logits/chosen": -0.5312028527259827, + "logits/rejected": -0.5834405422210693, + "logps/chosen": -47.93700408935547, + "logps/rejected": -121.58876037597656, + "loss": 0.6207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.813955068588257, + "rewards/margins": 6.714592933654785, + "rewards/rejected": -3.90063738822937, + "step": 16004 + }, + { + "epoch": 4.0, + "grad_norm": 6.465432167053223, + "learning_rate": 9.470756182696761e-07, + "logits/chosen": -0.5039388537406921, + "logits/rejected": -0.6017638444900513, + "logps/chosen": -49.88866424560547, + "logps/rejected": -97.86285400390625, + "loss": 0.567, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.726590156555176, + "rewards/margins": 7.6463942527771, + "rewards/rejected": -4.919804573059082, + "step": 16005 + }, + { + "epoch": 4.0, + "grad_norm": 4.806429862976074, + "learning_rate": 9.466153773356468e-07, + "logits/chosen": -0.5853152871131897, + "logits/rejected": -0.6906729340553284, + "logps/chosen": -53.954986572265625, + "logps/rejected": -97.07152557373047, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9504952430725098, + "rewards/margins": 6.727204322814941, + "rewards/rejected": -3.7767088413238525, + "step": 16006 + }, + { + "epoch": 4.0, + "grad_norm": 2.9665446281433105, + "learning_rate": 9.461552365650629e-07, + "logits/chosen": -0.5707131028175354, + "logits/rejected": -0.6412004828453064, + "logps/chosen": -63.51042175292969, + "logps/rejected": -99.10490417480469, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.561521053314209, + "rewards/margins": 7.372297763824463, + "rewards/rejected": -3.8107759952545166, + "step": 16007 + }, + { + "epoch": 4.0, + "grad_norm": 5.051449775695801, + "learning_rate": 9.456951959692934e-07, + "logits/chosen": -0.5627345442771912, + "logits/rejected": -0.6610655784606934, + "logps/chosen": -57.38495635986328, + "logps/rejected": -117.93265533447266, + "loss": 0.6011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.180420160293579, + "rewards/margins": 7.928252696990967, + "rewards/rejected": -4.747832298278809, + "step": 16008 + }, + { + "epoch": 4.0, + "grad_norm": 5.253895282745361, + "learning_rate": 9.452352555597083e-07, + "logits/chosen": -0.4652077555656433, + "logits/rejected": -0.5729431509971619, + "logps/chosen": -61.32691192626953, + "logps/rejected": -107.73200225830078, + "loss": 0.6458, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9417757987976074, + "rewards/margins": 7.779002666473389, + "rewards/rejected": -4.837226867675781, + "step": 16009 + }, + { + "epoch": 4.01, + "grad_norm": 4.428560256958008, + "learning_rate": 9.447754153476752e-07, + "logits/chosen": -0.5102366805076599, + "logits/rejected": -0.5789403915405273, + "logps/chosen": -48.09388732910156, + "logps/rejected": -115.0573501586914, + "loss": 0.6024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095600128173828, + "rewards/margins": 7.595884323120117, + "rewards/rejected": -4.500284671783447, + "step": 16010 + }, + { + "epoch": 4.01, + "grad_norm": 6.064742088317871, + "learning_rate": 9.44315675344552e-07, + "logits/chosen": -0.48643386363983154, + "logits/rejected": -0.54527747631073, + "logps/chosen": -55.77573013305664, + "logps/rejected": -127.181396484375, + "loss": 0.6308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.397947311401367, + "rewards/margins": 7.951726913452148, + "rewards/rejected": -4.553779602050781, + "step": 16011 + }, + { + "epoch": 4.01, + "grad_norm": 4.674803733825684, + "learning_rate": 9.438560355617026e-07, + "logits/chosen": -0.5117509961128235, + "logits/rejected": -0.5662009716033936, + "logps/chosen": -50.370365142822266, + "logps/rejected": -106.18476104736328, + "loss": 0.6446, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9337522983551025, + "rewards/margins": 6.3920698165893555, + "rewards/rejected": -3.4583170413970947, + "step": 16012 + }, + { + "epoch": 4.01, + "grad_norm": 9.179862022399902, + "learning_rate": 9.433964960104863e-07, + "logits/chosen": -0.5638477206230164, + "logits/rejected": -0.6497130990028381, + "logps/chosen": -52.89295196533203, + "logps/rejected": -109.28949737548828, + "loss": 0.6813, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.144862174987793, + "rewards/margins": 8.135835647583008, + "rewards/rejected": -4.990972518920898, + "step": 16013 + }, + { + "epoch": 4.01, + "grad_norm": 6.151362895965576, + "learning_rate": 9.429370567022577e-07, + "logits/chosen": -0.568230152130127, + "logits/rejected": -0.6666780710220337, + "logps/chosen": -71.1186752319336, + "logps/rejected": -118.81714630126953, + "loss": 0.6923, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.668114185333252, + "rewards/margins": 7.771244525909424, + "rewards/rejected": -4.103131294250488, + "step": 16014 + }, + { + "epoch": 4.01, + "grad_norm": 7.319626331329346, + "learning_rate": 9.42477717648368e-07, + "logits/chosen": -0.5177427530288696, + "logits/rejected": -0.567681610584259, + "logps/chosen": -48.6861686706543, + "logps/rejected": -119.43114471435547, + "loss": 0.6693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2744672298431396, + "rewards/margins": 7.694656848907471, + "rewards/rejected": -4.42018985748291, + "step": 16015 + }, + { + "epoch": 4.01, + "grad_norm": 5.3575215339660645, + "learning_rate": 9.42018478860171e-07, + "logits/chosen": -0.5293852090835571, + "logits/rejected": -0.5890715718269348, + "logps/chosen": -56.15939712524414, + "logps/rejected": -111.6170654296875, + "loss": 0.6496, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1629137992858887, + "rewards/margins": 7.044780731201172, + "rewards/rejected": -3.881866455078125, + "step": 16016 + }, + { + "epoch": 4.01, + "grad_norm": 3.225689172744751, + "learning_rate": 9.415593403490137e-07, + "logits/chosen": -0.4629589915275574, + "logits/rejected": -0.5543746948242188, + "logps/chosen": -58.3690185546875, + "logps/rejected": -112.27671813964844, + "loss": 0.6193, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0343618392944336, + "rewards/margins": 7.430149555206299, + "rewards/rejected": -4.395787715911865, + "step": 16017 + }, + { + "epoch": 4.01, + "grad_norm": 17.361202239990234, + "learning_rate": 9.411003021262405e-07, + "logits/chosen": -0.5561131238937378, + "logits/rejected": -0.6416156888008118, + "logps/chosen": -49.2299690246582, + "logps/rejected": -84.09876251220703, + "loss": 0.6567, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.191201686859131, + "rewards/margins": 6.289095878601074, + "rewards/rejected": -3.097893238067627, + "step": 16018 + }, + { + "epoch": 4.01, + "grad_norm": 2.4155185222625732, + "learning_rate": 9.406413642031975e-07, + "logits/chosen": -0.5278717875480652, + "logits/rejected": -0.5663768649101257, + "logps/chosen": -49.579132080078125, + "logps/rejected": -118.25174713134766, + "loss": 0.5346, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4947562217712402, + "rewards/margins": 7.597543716430664, + "rewards/rejected": -4.102787494659424, + "step": 16019 + }, + { + "epoch": 4.01, + "grad_norm": 4.710920333862305, + "learning_rate": 9.401825265912224e-07, + "logits/chosen": -0.45084869861602783, + "logits/rejected": -0.572227418422699, + "logps/chosen": -71.85971069335938, + "logps/rejected": -107.0212631225586, + "loss": 0.5667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.132732391357422, + "rewards/margins": 7.538784027099609, + "rewards/rejected": -4.406051158905029, + "step": 16020 + }, + { + "epoch": 4.01, + "grad_norm": 12.611298561096191, + "learning_rate": 9.39723789301657e-07, + "logits/chosen": -0.5677422881126404, + "logits/rejected": -0.6691691875457764, + "logps/chosen": -48.93145751953125, + "logps/rejected": -104.74380493164062, + "loss": 0.6735, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6979565620422363, + "rewards/margins": 7.459335803985596, + "rewards/rejected": -4.761379718780518, + "step": 16021 + }, + { + "epoch": 4.01, + "grad_norm": 4.737045764923096, + "learning_rate": 9.392651523458351e-07, + "logits/chosen": -0.5062556862831116, + "logits/rejected": -0.6450985074043274, + "logps/chosen": -68.51644897460938, + "logps/rejected": -94.82386779785156, + "loss": 0.66, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1163082122802734, + "rewards/margins": 6.183250427246094, + "rewards/rejected": -3.066941976547241, + "step": 16022 + }, + { + "epoch": 4.01, + "grad_norm": 5.543783187866211, + "learning_rate": 9.388066157350895e-07, + "logits/chosen": -0.5602452158927917, + "logits/rejected": -0.5742182731628418, + "logps/chosen": -54.37940979003906, + "logps/rejected": -101.60272216796875, + "loss": 0.6955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.191479444503784, + "rewards/margins": 5.708052158355713, + "rewards/rejected": -2.5165722370147705, + "step": 16023 + }, + { + "epoch": 4.01, + "grad_norm": 3.2932684421539307, + "learning_rate": 9.383481794807519e-07, + "logits/chosen": -0.5794776082038879, + "logits/rejected": -0.7181019186973572, + "logps/chosen": -62.03213882446289, + "logps/rejected": -113.10489654541016, + "loss": 0.6318, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8998191356658936, + "rewards/margins": 6.904013633728027, + "rewards/rejected": -4.004195690155029, + "step": 16024 + }, + { + "epoch": 4.01, + "grad_norm": 4.681517124176025, + "learning_rate": 9.378898435941542e-07, + "logits/chosen": -0.5207849144935608, + "logits/rejected": -0.6415461301803589, + "logps/chosen": -73.27183532714844, + "logps/rejected": -88.52667236328125, + "loss": 0.6532, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.090736150741577, + "rewards/margins": 6.519078254699707, + "rewards/rejected": -3.428342342376709, + "step": 16025 + }, + { + "epoch": 4.01, + "grad_norm": 4.82661771774292, + "learning_rate": 9.374316080866158e-07, + "logits/chosen": -0.6150022745132446, + "logits/rejected": -0.6567002534866333, + "logps/chosen": -67.54816436767578, + "logps/rejected": -112.7315444946289, + "loss": 0.7088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.871212959289551, + "rewards/margins": 6.037043571472168, + "rewards/rejected": -3.165830612182617, + "step": 16026 + }, + { + "epoch": 4.01, + "grad_norm": 3.9749562740325928, + "learning_rate": 9.369734729694635e-07, + "logits/chosen": -0.5484306812286377, + "logits/rejected": -0.6032525300979614, + "logps/chosen": -48.08106994628906, + "logps/rejected": -87.32829284667969, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2903966903686523, + "rewards/margins": 5.528398513793945, + "rewards/rejected": -2.238001823425293, + "step": 16027 + }, + { + "epoch": 4.01, + "grad_norm": 5.094621658325195, + "learning_rate": 9.365154382540215e-07, + "logits/chosen": -0.5134593844413757, + "logits/rejected": -0.5611522197723389, + "logps/chosen": -55.27566146850586, + "logps/rejected": -110.08572387695312, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.335753917694092, + "rewards/margins": 7.038740158081055, + "rewards/rejected": -3.702986240386963, + "step": 16028 + }, + { + "epoch": 4.01, + "grad_norm": 3.183689594268799, + "learning_rate": 9.360575039516017e-07, + "logits/chosen": -0.5825459361076355, + "logits/rejected": -0.636425256729126, + "logps/chosen": -52.49955749511719, + "logps/rejected": -126.730224609375, + "loss": 0.6069, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.472419261932373, + "rewards/margins": 7.2924699783325195, + "rewards/rejected": -3.8200507164001465, + "step": 16029 + }, + { + "epoch": 4.01, + "grad_norm": 11.3594331741333, + "learning_rate": 9.355996700735242e-07, + "logits/chosen": -0.4784144163131714, + "logits/rejected": -0.5798272490501404, + "logps/chosen": -58.21543884277344, + "logps/rejected": -94.67596435546875, + "loss": 0.6019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164907932281494, + "rewards/margins": 6.72156286239624, + "rewards/rejected": -3.556654930114746, + "step": 16030 + }, + { + "epoch": 4.01, + "grad_norm": 7.306909561157227, + "learning_rate": 9.351419366311032e-07, + "logits/chosen": -0.5729271769523621, + "logits/rejected": -0.6264947056770325, + "logps/chosen": -55.72945022583008, + "logps/rejected": -123.47714233398438, + "loss": 0.6205, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8610169887542725, + "rewards/margins": 8.040980339050293, + "rewards/rejected": -5.1799635887146, + "step": 16031 + }, + { + "epoch": 4.01, + "grad_norm": 5.32718563079834, + "learning_rate": 9.346843036356484e-07, + "logits/chosen": -0.5776349306106567, + "logits/rejected": -0.6348620653152466, + "logps/chosen": -61.35847473144531, + "logps/rejected": -123.03400421142578, + "loss": 0.6485, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023669958114624, + "rewards/margins": 7.45017147064209, + "rewards/rejected": -4.426501274108887, + "step": 16032 + }, + { + "epoch": 4.01, + "grad_norm": 3.5961122512817383, + "learning_rate": 9.342267710984671e-07, + "logits/chosen": -0.6090990900993347, + "logits/rejected": -0.7130720615386963, + "logps/chosen": -51.56941223144531, + "logps/rejected": -102.88169860839844, + "loss": 0.6141, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.393127202987671, + "rewards/margins": 6.496518611907959, + "rewards/rejected": -3.103391170501709, + "step": 16033 + }, + { + "epoch": 4.01, + "grad_norm": 4.661505699157715, + "learning_rate": 9.337693390308683e-07, + "logits/chosen": -0.5420118570327759, + "logits/rejected": -0.6608111262321472, + "logps/chosen": -57.27688217163086, + "logps/rejected": -94.55248260498047, + "loss": 0.6145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4576175212860107, + "rewards/margins": 7.0115556716918945, + "rewards/rejected": -3.5539374351501465, + "step": 16034 + }, + { + "epoch": 4.01, + "grad_norm": 4.680257797241211, + "learning_rate": 9.333120074441543e-07, + "logits/chosen": -0.5177477598190308, + "logits/rejected": -0.5815707445144653, + "logps/chosen": -51.49886703491211, + "logps/rejected": -86.73124694824219, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2430357933044434, + "rewards/margins": 5.788005352020264, + "rewards/rejected": -2.5449695587158203, + "step": 16035 + }, + { + "epoch": 4.01, + "grad_norm": 12.228655815124512, + "learning_rate": 9.328547763496243e-07, + "logits/chosen": -0.48905062675476074, + "logits/rejected": -0.541942834854126, + "logps/chosen": -52.26295471191406, + "logps/rejected": -110.60350036621094, + "loss": 0.9607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0794527530670166, + "rewards/margins": 7.167695999145508, + "rewards/rejected": -4.088243007659912, + "step": 16036 + }, + { + "epoch": 4.01, + "grad_norm": 5.557107448577881, + "learning_rate": 9.323976457585804e-07, + "logits/chosen": -0.6524524688720703, + "logits/rejected": -0.6957871913909912, + "logps/chosen": -44.36832046508789, + "logps/rejected": -102.08404541015625, + "loss": 0.6587, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.763072967529297, + "rewards/margins": 6.888484954833984, + "rewards/rejected": -4.125411510467529, + "step": 16037 + }, + { + "epoch": 4.01, + "grad_norm": 6.13334321975708, + "learning_rate": 9.319406156823157e-07, + "logits/chosen": -0.559630274772644, + "logits/rejected": -0.6179283261299133, + "logps/chosen": -67.0930404663086, + "logps/rejected": -116.7115249633789, + "loss": 0.8645, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.574268341064453, + "rewards/margins": 6.947775840759277, + "rewards/rejected": -4.373507022857666, + "step": 16038 + }, + { + "epoch": 4.01, + "grad_norm": 2.947547435760498, + "learning_rate": 9.314836861321269e-07, + "logits/chosen": -0.5106292366981506, + "logits/rejected": -0.5922061204910278, + "logps/chosen": -52.29532241821289, + "logps/rejected": -99.035888671875, + "loss": 0.5496, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.479703187942505, + "rewards/margins": 7.3815789222717285, + "rewards/rejected": -3.9018757343292236, + "step": 16039 + }, + { + "epoch": 4.01, + "grad_norm": 9.794868469238281, + "learning_rate": 9.310268571193037e-07, + "logits/chosen": -0.5959951281547546, + "logits/rejected": -0.6532597541809082, + "logps/chosen": -61.480308532714844, + "logps/rejected": -117.57406616210938, + "loss": 0.7011, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0158698558807373, + "rewards/margins": 7.959314346313477, + "rewards/rejected": -4.943444728851318, + "step": 16040 + }, + { + "epoch": 4.01, + "grad_norm": 11.732120513916016, + "learning_rate": 9.305701286551338e-07, + "logits/chosen": -0.5286811590194702, + "logits/rejected": -0.6248655319213867, + "logps/chosen": -50.71883010864258, + "logps/rejected": -99.38438415527344, + "loss": 0.5612, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0514373779296875, + "rewards/margins": 7.121758460998535, + "rewards/rejected": -4.070321559906006, + "step": 16041 + }, + { + "epoch": 4.01, + "grad_norm": 3.252657413482666, + "learning_rate": 9.301135007509055e-07, + "logits/chosen": -0.5742537975311279, + "logits/rejected": -0.648343026638031, + "logps/chosen": -42.678123474121094, + "logps/rejected": -94.38409423828125, + "loss": 0.5955, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1040730476379395, + "rewards/margins": 7.565731048583984, + "rewards/rejected": -4.461658477783203, + "step": 16042 + }, + { + "epoch": 4.01, + "grad_norm": 3.8616018295288086, + "learning_rate": 9.296569734179001e-07, + "logits/chosen": -0.4986813962459564, + "logits/rejected": -0.5559126734733582, + "logps/chosen": -52.19961166381836, + "logps/rejected": -104.87281036376953, + "loss": 0.5513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.827608585357666, + "rewards/margins": 7.674169063568115, + "rewards/rejected": -4.846559524536133, + "step": 16043 + }, + { + "epoch": 4.01, + "grad_norm": 2.7682266235351562, + "learning_rate": 9.292005466674014e-07, + "logits/chosen": -0.48112988471984863, + "logits/rejected": -0.5810087323188782, + "logps/chosen": -51.80036163330078, + "logps/rejected": -132.04771423339844, + "loss": 0.5726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2382771968841553, + "rewards/margins": 9.399999618530273, + "rewards/rejected": -6.1617231369018555, + "step": 16044 + }, + { + "epoch": 4.01, + "grad_norm": 4.4153265953063965, + "learning_rate": 9.287442205106861e-07, + "logits/chosen": -0.5442816615104675, + "logits/rejected": -0.5792341232299805, + "logps/chosen": -54.028343200683594, + "logps/rejected": -118.15337371826172, + "loss": 0.6183, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.016201972961426, + "rewards/margins": 7.121460914611816, + "rewards/rejected": -4.105258941650391, + "step": 16045 + }, + { + "epoch": 4.01, + "grad_norm": 4.022507667541504, + "learning_rate": 9.282879949590323e-07, + "logits/chosen": -0.5206066966056824, + "logits/rejected": -0.5727933645248413, + "logps/chosen": -51.55553436279297, + "logps/rejected": -112.2273178100586, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2799456119537354, + "rewards/margins": 5.81048059463501, + "rewards/rejected": -2.5305352210998535, + "step": 16046 + }, + { + "epoch": 4.01, + "grad_norm": 6.732400417327881, + "learning_rate": 9.278318700237138e-07, + "logits/chosen": -0.5763610601425171, + "logits/rejected": -0.630687952041626, + "logps/chosen": -68.92339324951172, + "logps/rejected": -99.50025939941406, + "loss": 0.75, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.049741268157959, + "rewards/margins": 5.761677265167236, + "rewards/rejected": -2.7119367122650146, + "step": 16047 + }, + { + "epoch": 4.01, + "grad_norm": 6.718915939331055, + "learning_rate": 9.273758457159992e-07, + "logits/chosen": -0.6030313968658447, + "logits/rejected": -0.6568748950958252, + "logps/chosen": -43.10713577270508, + "logps/rejected": -89.53263854980469, + "loss": 0.5572, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9855058193206787, + "rewards/margins": 6.012601375579834, + "rewards/rejected": -3.0270957946777344, + "step": 16048 + }, + { + "epoch": 4.01, + "grad_norm": 4.4004387855529785, + "learning_rate": 9.269199220471608e-07, + "logits/chosen": -0.5296652317047119, + "logits/rejected": -0.617895245552063, + "logps/chosen": -49.608482360839844, + "logps/rejected": -88.60449981689453, + "loss": 0.6568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3780174255371094, + "rewards/margins": 6.426767349243164, + "rewards/rejected": -3.0487499237060547, + "step": 16049 + }, + { + "epoch": 4.02, + "grad_norm": 4.095227241516113, + "learning_rate": 9.264640990284634e-07, + "logits/chosen": -0.5069887042045593, + "logits/rejected": -0.5890929698944092, + "logps/chosen": -58.48578643798828, + "logps/rejected": -94.20525360107422, + "loss": 0.6226, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1735572814941406, + "rewards/margins": 6.939779281616211, + "rewards/rejected": -3.7662220001220703, + "step": 16050 + }, + { + "epoch": 4.02, + "grad_norm": 3.5179476737976074, + "learning_rate": 9.260083766711703e-07, + "logits/chosen": -0.48159053921699524, + "logits/rejected": -0.5763659477233887, + "logps/chosen": -54.71744918823242, + "logps/rejected": -117.19776153564453, + "loss": 0.5651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.373293876647949, + "rewards/margins": 7.367191791534424, + "rewards/rejected": -3.993898868560791, + "step": 16051 + }, + { + "epoch": 4.02, + "grad_norm": 6.079131126403809, + "learning_rate": 9.255527549865439e-07, + "logits/chosen": -0.48198872804641724, + "logits/rejected": -0.5232690572738647, + "logps/chosen": -57.10942840576172, + "logps/rejected": -110.43878173828125, + "loss": 0.6268, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.214759349822998, + "rewards/margins": 6.088813304901123, + "rewards/rejected": -2.874053716659546, + "step": 16052 + }, + { + "epoch": 4.02, + "grad_norm": 5.03210973739624, + "learning_rate": 9.250972339858433e-07, + "logits/chosen": -0.5887182354927063, + "logits/rejected": -0.6816798448562622, + "logps/chosen": -50.066471099853516, + "logps/rejected": -93.07492065429688, + "loss": 0.5909, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1970582008361816, + "rewards/margins": 6.763667106628418, + "rewards/rejected": -3.5666093826293945, + "step": 16053 + }, + { + "epoch": 4.02, + "grad_norm": 5.86172342300415, + "learning_rate": 9.246418136803225e-07, + "logits/chosen": -0.5205019116401672, + "logits/rejected": -0.6294143199920654, + "logps/chosen": -59.646759033203125, + "logps/rejected": -92.04540252685547, + "loss": 0.6126, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1949706077575684, + "rewards/margins": 6.442892074584961, + "rewards/rejected": -3.2479217052459717, + "step": 16054 + }, + { + "epoch": 4.02, + "grad_norm": 3.7464394569396973, + "learning_rate": 9.241864940812372e-07, + "logits/chosen": -0.4916776716709137, + "logits/rejected": -0.6307982802391052, + "logps/chosen": -65.8082275390625, + "logps/rejected": -106.74950408935547, + "loss": 0.5978, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0604095458984375, + "rewards/margins": 7.062899112701416, + "rewards/rejected": -4.00248908996582, + "step": 16055 + }, + { + "epoch": 4.02, + "grad_norm": 3.849763870239258, + "learning_rate": 9.237312751998406e-07, + "logits/chosen": -0.5800446271896362, + "logits/rejected": -0.6332288980484009, + "logps/chosen": -55.7846794128418, + "logps/rejected": -117.09329986572266, + "loss": 0.5796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.268458366394043, + "rewards/margins": 7.948826789855957, + "rewards/rejected": -4.680367469787598, + "step": 16056 + }, + { + "epoch": 4.02, + "grad_norm": 5.136307239532471, + "learning_rate": 9.232761570473798e-07, + "logits/chosen": -0.5419528484344482, + "logits/rejected": -0.6200251579284668, + "logps/chosen": -49.173744201660156, + "logps/rejected": -125.07159423828125, + "loss": 0.5975, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977748394012451, + "rewards/margins": 7.919935703277588, + "rewards/rejected": -4.942187786102295, + "step": 16057 + }, + { + "epoch": 4.02, + "grad_norm": 3.6075100898742676, + "learning_rate": 9.228211396351e-07, + "logits/chosen": -0.5159603357315063, + "logits/rejected": -0.6319807171821594, + "logps/chosen": -54.91619873046875, + "logps/rejected": -97.66307830810547, + "loss": 0.5819, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4431681632995605, + "rewards/margins": 7.6580424308776855, + "rewards/rejected": -4.214874267578125, + "step": 16058 + }, + { + "epoch": 4.02, + "grad_norm": 4.072953701019287, + "learning_rate": 9.223662229742475e-07, + "logits/chosen": -0.5652652382850647, + "logits/rejected": -0.643363893032074, + "logps/chosen": -53.95864486694336, + "logps/rejected": -105.9271240234375, + "loss": 0.5946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.39247727394104, + "rewards/margins": 8.297041893005371, + "rewards/rejected": -4.90456485748291, + "step": 16059 + }, + { + "epoch": 4.02, + "grad_norm": 4.070415496826172, + "learning_rate": 9.219114070760632e-07, + "logits/chosen": -0.5495326519012451, + "logits/rejected": -0.648961067199707, + "logps/chosen": -60.045509338378906, + "logps/rejected": -102.91848754882812, + "loss": 0.6385, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9651222229003906, + "rewards/margins": 7.264542579650879, + "rewards/rejected": -4.299421310424805, + "step": 16060 + }, + { + "epoch": 4.02, + "grad_norm": 3.4875104427337646, + "learning_rate": 9.214566919517842e-07, + "logits/chosen": -0.6707594990730286, + "logits/rejected": -0.6858949065208435, + "logps/chosen": -42.459747314453125, + "logps/rejected": -106.48504638671875, + "loss": 0.5896, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.395519971847534, + "rewards/margins": 8.049593925476074, + "rewards/rejected": -4.6540727615356445, + "step": 16061 + }, + { + "epoch": 4.02, + "grad_norm": 4.815586566925049, + "learning_rate": 9.210020776126499e-07, + "logits/chosen": -0.4982878267765045, + "logits/rejected": -0.6001427173614502, + "logps/chosen": -50.60398483276367, + "logps/rejected": -103.97429656982422, + "loss": 0.6128, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.079535961151123, + "rewards/margins": 6.994632720947266, + "rewards/rejected": -3.91509747505188, + "step": 16062 + }, + { + "epoch": 4.02, + "grad_norm": 5.693872928619385, + "learning_rate": 9.205475640698913e-07, + "logits/chosen": -0.5910825133323669, + "logits/rejected": -0.6488538384437561, + "logps/chosen": -45.16686248779297, + "logps/rejected": -141.40806579589844, + "loss": 0.5846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1164681911468506, + "rewards/margins": 9.3926362991333, + "rewards/rejected": -6.276167392730713, + "step": 16063 + }, + { + "epoch": 4.02, + "grad_norm": 3.5603079795837402, + "learning_rate": 9.200931513347433e-07, + "logits/chosen": -0.5606745481491089, + "logits/rejected": -0.6378547549247742, + "logps/chosen": -52.604591369628906, + "logps/rejected": -108.0428466796875, + "loss": 0.6358, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0991780757904053, + "rewards/margins": 7.305243968963623, + "rewards/rejected": -4.206065654754639, + "step": 16064 + }, + { + "epoch": 4.02, + "grad_norm": 2.421853542327881, + "learning_rate": 9.19638839418433e-07, + "logits/chosen": -0.6094836592674255, + "logits/rejected": -0.691323459148407, + "logps/chosen": -47.47767639160156, + "logps/rejected": -100.17638397216797, + "loss": 0.6053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.222341299057007, + "rewards/margins": 7.664233207702637, + "rewards/rejected": -4.441891193389893, + "step": 16065 + }, + { + "epoch": 4.02, + "grad_norm": 2.1556122303009033, + "learning_rate": 9.191846283321853e-07, + "logits/chosen": -0.5425156354904175, + "logits/rejected": -0.6239134073257446, + "logps/chosen": -49.166072845458984, + "logps/rejected": -120.7363052368164, + "loss": 0.5127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0181796550750732, + "rewards/margins": 7.922945976257324, + "rewards/rejected": -4.90476655960083, + "step": 16066 + }, + { + "epoch": 4.02, + "grad_norm": 4.437564373016357, + "learning_rate": 9.187305180872275e-07, + "logits/chosen": -0.597761869430542, + "logits/rejected": -0.687830924987793, + "logps/chosen": -55.53533172607422, + "logps/rejected": -108.28748321533203, + "loss": 0.6241, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.737323760986328, + "rewards/margins": 7.642905235290527, + "rewards/rejected": -4.905581474304199, + "step": 16067 + }, + { + "epoch": 4.02, + "grad_norm": 3.9869065284729004, + "learning_rate": 9.182765086947798e-07, + "logits/chosen": -0.5662099123001099, + "logits/rejected": -0.6131089329719543, + "logps/chosen": -41.40874099731445, + "logps/rejected": -114.2821273803711, + "loss": 0.515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2695674896240234, + "rewards/margins": 7.522175312042236, + "rewards/rejected": -4.252607822418213, + "step": 16068 + }, + { + "epoch": 4.02, + "grad_norm": 4.049201488494873, + "learning_rate": 9.178226001660595e-07, + "logits/chosen": -0.5295270681381226, + "logits/rejected": -0.6355270743370056, + "logps/chosen": -49.66670608520508, + "logps/rejected": -83.3191909790039, + "loss": 0.6005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.948042392730713, + "rewards/margins": 6.916656017303467, + "rewards/rejected": -3.968614101409912, + "step": 16069 + }, + { + "epoch": 4.02, + "grad_norm": 4.4697160720825195, + "learning_rate": 9.173687925122849e-07, + "logits/chosen": -0.5492997169494629, + "logits/rejected": -0.6119661331176758, + "logps/chosen": -57.08739471435547, + "logps/rejected": -131.971435546875, + "loss": 0.6292, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.766693115234375, + "rewards/margins": 8.259332656860352, + "rewards/rejected": -5.492639541625977, + "step": 16070 + }, + { + "epoch": 4.02, + "grad_norm": 10.830642700195312, + "learning_rate": 9.169150857446723e-07, + "logits/chosen": -0.5525614023208618, + "logits/rejected": -0.6200423240661621, + "logps/chosen": -50.88550567626953, + "logps/rejected": -106.0699691772461, + "loss": 0.5758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3056206703186035, + "rewards/margins": 7.522968292236328, + "rewards/rejected": -4.217347621917725, + "step": 16071 + }, + { + "epoch": 4.02, + "grad_norm": 5.008965969085693, + "learning_rate": 9.164614798744287e-07, + "logits/chosen": -0.500109076499939, + "logits/rejected": -0.5837188363075256, + "logps/chosen": -56.272918701171875, + "logps/rejected": -95.74299621582031, + "loss": 0.5855, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2514028549194336, + "rewards/margins": 7.047235488891602, + "rewards/rejected": -3.795832872390747, + "step": 16072 + }, + { + "epoch": 4.02, + "grad_norm": 4.897258758544922, + "learning_rate": 9.160079749127654e-07, + "logits/chosen": -0.573768138885498, + "logits/rejected": -0.6333452463150024, + "logps/chosen": -51.92787551879883, + "logps/rejected": -111.35386657714844, + "loss": 0.5304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1913137435913086, + "rewards/margins": 7.012616157531738, + "rewards/rejected": -3.8213019371032715, + "step": 16073 + }, + { + "epoch": 4.02, + "grad_norm": 8.903870582580566, + "learning_rate": 9.155545708708902e-07, + "logits/chosen": -0.5557774305343628, + "logits/rejected": -0.6404892802238464, + "logps/chosen": -49.44353103637695, + "logps/rejected": -106.85932159423828, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8736562728881836, + "rewards/margins": 7.0073981285095215, + "rewards/rejected": -4.133741855621338, + "step": 16074 + }, + { + "epoch": 4.02, + "grad_norm": 4.940117359161377, + "learning_rate": 9.151012677600058e-07, + "logits/chosen": -0.5297653675079346, + "logits/rejected": -0.6129410862922668, + "logps/chosen": -53.74739456176758, + "logps/rejected": -107.45233154296875, + "loss": 0.5926, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.042954444885254, + "rewards/margins": 8.032119750976562, + "rewards/rejected": -4.98916482925415, + "step": 16075 + }, + { + "epoch": 4.02, + "grad_norm": 5.762118816375732, + "learning_rate": 9.146480655913126e-07, + "logits/chosen": -0.5257202386856079, + "logits/rejected": -0.6302810311317444, + "logps/chosen": -48.45524597167969, + "logps/rejected": -98.49169921875, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1119649410247803, + "rewards/margins": 6.57628870010376, + "rewards/rejected": -3.4643232822418213, + "step": 16076 + }, + { + "epoch": 4.02, + "grad_norm": 5.197466850280762, + "learning_rate": 9.141949643760123e-07, + "logits/chosen": -0.5592637062072754, + "logits/rejected": -0.64476478099823, + "logps/chosen": -53.08136749267578, + "logps/rejected": -86.32707214355469, + "loss": 0.6282, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1307907104492188, + "rewards/margins": 6.153561592102051, + "rewards/rejected": -3.022770643234253, + "step": 16077 + }, + { + "epoch": 4.02, + "grad_norm": 5.02576208114624, + "learning_rate": 9.137419641253003e-07, + "logits/chosen": -0.5459449291229248, + "logits/rejected": -0.5888784527778625, + "logps/chosen": -48.68484878540039, + "logps/rejected": -106.94822692871094, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.053061008453369, + "rewards/margins": 6.574233055114746, + "rewards/rejected": -3.521172523498535, + "step": 16078 + }, + { + "epoch": 4.02, + "grad_norm": 7.343229293823242, + "learning_rate": 9.132890648503701e-07, + "logits/chosen": -0.5563380718231201, + "logits/rejected": -0.6490054130554199, + "logps/chosen": -51.112548828125, + "logps/rejected": -94.74097442626953, + "loss": 0.6065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2078585624694824, + "rewards/margins": 6.919485092163086, + "rewards/rejected": -3.7116267681121826, + "step": 16079 + }, + { + "epoch": 4.02, + "grad_norm": 5.1159210205078125, + "learning_rate": 9.128362665624147e-07, + "logits/chosen": -0.5270001292228699, + "logits/rejected": -0.6052186489105225, + "logps/chosen": -56.69267272949219, + "logps/rejected": -111.6932373046875, + "loss": 0.6874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1878230571746826, + "rewards/margins": 6.837493896484375, + "rewards/rejected": -3.6496710777282715, + "step": 16080 + }, + { + "epoch": 4.02, + "grad_norm": 3.161837339401245, + "learning_rate": 9.123835692726208e-07, + "logits/chosen": -0.5077365636825562, + "logits/rejected": -0.5273701548576355, + "logps/chosen": -53.662750244140625, + "logps/rejected": -123.79472351074219, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.565337657928467, + "rewards/margins": 6.902921676635742, + "rewards/rejected": -3.337583541870117, + "step": 16081 + }, + { + "epoch": 4.02, + "grad_norm": 7.467218399047852, + "learning_rate": 9.119309729921782e-07, + "logits/chosen": -0.5350930094718933, + "logits/rejected": -0.6334271430969238, + "logps/chosen": -55.17374038696289, + "logps/rejected": -105.5828857421875, + "loss": 0.5928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.271827220916748, + "rewards/margins": 7.267101287841797, + "rewards/rejected": -3.995274305343628, + "step": 16082 + }, + { + "epoch": 4.02, + "grad_norm": 2.9776830673217773, + "learning_rate": 9.1147847773227e-07, + "logits/chosen": -0.5377631783485413, + "logits/rejected": -0.6657333970069885, + "logps/chosen": -86.01801300048828, + "logps/rejected": -113.79653930664062, + "loss": 0.7104, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172539710998535, + "rewards/margins": 8.1420259475708, + "rewards/rejected": -4.969486236572266, + "step": 16083 + }, + { + "epoch": 4.02, + "grad_norm": 7.973119258880615, + "learning_rate": 9.110260835040758e-07, + "logits/chosen": -0.5483345985412598, + "logits/rejected": -0.6316305994987488, + "logps/chosen": -54.53961181640625, + "logps/rejected": -113.17190551757812, + "loss": 0.5709, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4842169284820557, + "rewards/margins": 7.684859275817871, + "rewards/rejected": -4.2006425857543945, + "step": 16084 + }, + { + "epoch": 4.02, + "grad_norm": 9.775038719177246, + "learning_rate": 9.105737903187773e-07, + "logits/chosen": -0.6182111501693726, + "logits/rejected": -0.7058509588241577, + "logps/chosen": -54.54450988769531, + "logps/rejected": -123.5476303100586, + "loss": 0.7013, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.936560869216919, + "rewards/margins": 6.975398063659668, + "rewards/rejected": -4.03883695602417, + "step": 16085 + }, + { + "epoch": 4.02, + "grad_norm": 2.605464220046997, + "learning_rate": 9.101215981875488e-07, + "logits/chosen": -0.5486941337585449, + "logits/rejected": -0.6399244666099548, + "logps/chosen": -60.59652328491211, + "logps/rejected": -134.39308166503906, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6703715324401855, + "rewards/margins": 8.499953269958496, + "rewards/rejected": -5.829581260681152, + "step": 16086 + }, + { + "epoch": 4.02, + "grad_norm": 2.93477725982666, + "learning_rate": 9.096695071215677e-07, + "logits/chosen": -0.5535901188850403, + "logits/rejected": -0.6495233178138733, + "logps/chosen": -54.865020751953125, + "logps/rejected": -96.29210662841797, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0958809852600098, + "rewards/margins": 6.90030574798584, + "rewards/rejected": -3.804424524307251, + "step": 16087 + }, + { + "epoch": 4.02, + "grad_norm": 5.98118257522583, + "learning_rate": 9.092175171320016e-07, + "logits/chosen": -0.5557277798652649, + "logits/rejected": -0.6051868200302124, + "logps/chosen": -69.62337493896484, + "logps/rejected": -106.2902603149414, + "loss": 0.6279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2299163341522217, + "rewards/margins": 6.924942493438721, + "rewards/rejected": -3.69502592086792, + "step": 16088 + }, + { + "epoch": 4.02, + "grad_norm": 3.095257520675659, + "learning_rate": 9.087656282300239e-07, + "logits/chosen": -0.5681589841842651, + "logits/rejected": -0.6706146001815796, + "logps/chosen": -55.473876953125, + "logps/rejected": -97.9849624633789, + "loss": 0.5991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9762518405914307, + "rewards/margins": 6.8538641929626465, + "rewards/rejected": -3.877612829208374, + "step": 16089 + }, + { + "epoch": 4.03, + "grad_norm": 2.6726584434509277, + "learning_rate": 9.083138404267988e-07, + "logits/chosen": -0.5305823683738708, + "logits/rejected": -0.5904474854469299, + "logps/chosen": -47.551795959472656, + "logps/rejected": -109.43794250488281, + "loss": 0.5678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3528008460998535, + "rewards/margins": 7.434910774230957, + "rewards/rejected": -4.082110404968262, + "step": 16090 + }, + { + "epoch": 4.03, + "grad_norm": 4.537035942077637, + "learning_rate": 9.078621537334897e-07, + "logits/chosen": -0.4904278516769409, + "logits/rejected": -0.5812851190567017, + "logps/chosen": -68.42832946777344, + "logps/rejected": -118.75848388671875, + "loss": 0.5947, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.171546697616577, + "rewards/margins": 8.170454978942871, + "rewards/rejected": -4.998908996582031, + "step": 16091 + }, + { + "epoch": 4.03, + "grad_norm": 3.2531251907348633, + "learning_rate": 9.074105681612605e-07, + "logits/chosen": -0.4607204496860504, + "logits/rejected": -0.576445460319519, + "logps/chosen": -61.455142974853516, + "logps/rejected": -117.09610748291016, + "loss": 0.5431, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0928292274475098, + "rewards/margins": 8.0961332321167, + "rewards/rejected": -5.0033040046691895, + "step": 16092 + }, + { + "epoch": 4.03, + "grad_norm": 3.6494953632354736, + "learning_rate": 9.069590837212694e-07, + "logits/chosen": -0.5495617389678955, + "logits/rejected": -0.6248016357421875, + "logps/chosen": -46.0999641418457, + "logps/rejected": -109.89620971679688, + "loss": 0.5965, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.977142333984375, + "rewards/margins": 7.014575481414795, + "rewards/rejected": -4.037433624267578, + "step": 16093 + }, + { + "epoch": 4.03, + "grad_norm": 4.274814128875732, + "learning_rate": 9.065077004246714e-07, + "logits/chosen": -0.5394387245178223, + "logits/rejected": -0.6576359272003174, + "logps/chosen": -61.68723678588867, + "logps/rejected": -86.91133117675781, + "loss": 0.6392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9520230293273926, + "rewards/margins": 5.886984825134277, + "rewards/rejected": -2.9349617958068848, + "step": 16094 + }, + { + "epoch": 4.03, + "grad_norm": 4.615832805633545, + "learning_rate": 9.060564182826237e-07, + "logits/chosen": -0.585171103477478, + "logits/rejected": -0.6894663572311401, + "logps/chosen": -48.29594421386719, + "logps/rejected": -103.78932189941406, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.113931894302368, + "rewards/margins": 7.228355407714844, + "rewards/rejected": -4.114424228668213, + "step": 16095 + }, + { + "epoch": 4.03, + "grad_norm": 3.5929107666015625, + "learning_rate": 9.056052373062762e-07, + "logits/chosen": -0.5573100447654724, + "logits/rejected": -0.6824440360069275, + "logps/chosen": -52.75536346435547, + "logps/rejected": -113.97189331054688, + "loss": 0.6101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2298367023468018, + "rewards/margins": 8.559473991394043, + "rewards/rejected": -5.32963752746582, + "step": 16096 + }, + { + "epoch": 4.03, + "grad_norm": 3.8788187503814697, + "learning_rate": 9.051541575067768e-07, + "logits/chosen": -0.5514676570892334, + "logits/rejected": -0.6301663517951965, + "logps/chosen": -58.71826171875, + "logps/rejected": -100.10441589355469, + "loss": 0.644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.991086483001709, + "rewards/margins": 5.694865703582764, + "rewards/rejected": -2.7037792205810547, + "step": 16097 + }, + { + "epoch": 4.03, + "grad_norm": 5.239315509796143, + "learning_rate": 9.047031788952737e-07, + "logits/chosen": -0.5383352041244507, + "logits/rejected": -0.63603675365448, + "logps/chosen": -47.51641082763672, + "logps/rejected": -96.89779663085938, + "loss": 0.597, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9633655548095703, + "rewards/margins": 6.677286148071289, + "rewards/rejected": -3.7139205932617188, + "step": 16098 + }, + { + "epoch": 4.03, + "grad_norm": 4.5521063804626465, + "learning_rate": 9.042523014829141e-07, + "logits/chosen": -0.4946311116218567, + "logits/rejected": -0.60159832239151, + "logps/chosen": -53.26075744628906, + "logps/rejected": -121.16169738769531, + "loss": 0.5429, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.113913059234619, + "rewards/margins": 9.119709968566895, + "rewards/rejected": -6.005796909332275, + "step": 16099 + }, + { + "epoch": 4.03, + "grad_norm": 2.1949198246002197, + "learning_rate": 9.038015252808335e-07, + "logits/chosen": -0.6066203117370605, + "logits/rejected": -0.7040748000144958, + "logps/chosen": -55.855224609375, + "logps/rejected": -103.38087463378906, + "loss": 0.5676, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.507370948791504, + "rewards/margins": 8.35239315032959, + "rewards/rejected": -4.845022201538086, + "step": 16100 + }, + { + "epoch": 4.03, + "grad_norm": 3.6889679431915283, + "learning_rate": 9.033508503001742e-07, + "logits/chosen": -0.5174744129180908, + "logits/rejected": -0.6133556365966797, + "logps/chosen": -52.95067596435547, + "logps/rejected": -110.52141571044922, + "loss": 0.5772, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.898894786834717, + "rewards/margins": 7.230397701263428, + "rewards/rejected": -4.331502914428711, + "step": 16101 + }, + { + "epoch": 4.03, + "grad_norm": 4.184577465057373, + "learning_rate": 9.029002765520739e-07, + "logits/chosen": -0.43409454822540283, + "logits/rejected": -0.48752450942993164, + "logps/chosen": -43.3449592590332, + "logps/rejected": -106.8291244506836, + "loss": 0.5493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1664116382598877, + "rewards/margins": 6.39804220199585, + "rewards/rejected": -3.2316298484802246, + "step": 16102 + }, + { + "epoch": 4.03, + "grad_norm": 8.828210830688477, + "learning_rate": 9.024498040476659e-07, + "logits/chosen": -0.4867396354675293, + "logits/rejected": -0.5424368381500244, + "logps/chosen": -66.43450927734375, + "logps/rejected": -108.17406463623047, + "loss": 0.7756, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.75132155418396, + "rewards/margins": 6.266392707824707, + "rewards/rejected": -3.515070676803589, + "step": 16103 + }, + { + "epoch": 4.03, + "grad_norm": 5.661516189575195, + "learning_rate": 9.019994327980808e-07, + "logits/chosen": -0.476439893245697, + "logits/rejected": -0.5622684359550476, + "logps/chosen": -58.31101989746094, + "logps/rejected": -78.80917358398438, + "loss": 0.7076, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9169797897338867, + "rewards/margins": 5.627196311950684, + "rewards/rejected": -2.7102162837982178, + "step": 16104 + }, + { + "epoch": 4.03, + "grad_norm": 4.668848037719727, + "learning_rate": 9.015491628144491e-07, + "logits/chosen": -0.49918827414512634, + "logits/rejected": -0.5922845602035522, + "logps/chosen": -46.698753356933594, + "logps/rejected": -112.22098541259766, + "loss": 0.5618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.307788372039795, + "rewards/margins": 8.427613258361816, + "rewards/rejected": -5.11982536315918, + "step": 16105 + }, + { + "epoch": 4.03, + "grad_norm": 49.9988899230957, + "learning_rate": 9.010989941078957e-07, + "logits/chosen": -0.5655515193939209, + "logits/rejected": -0.6289884448051453, + "logps/chosen": -55.11784362792969, + "logps/rejected": -98.33303833007812, + "loss": 0.6425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176311492919922, + "rewards/margins": 6.375064849853516, + "rewards/rejected": -3.1987533569335938, + "step": 16106 + }, + { + "epoch": 4.03, + "grad_norm": 5.682502269744873, + "learning_rate": 9.006489266895474e-07, + "logits/chosen": -0.5953029990196228, + "logits/rejected": -0.6869786381721497, + "logps/chosen": -51.83153533935547, + "logps/rejected": -106.62354278564453, + "loss": 0.6547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.248051643371582, + "rewards/margins": 7.994750022888184, + "rewards/rejected": -4.746697902679443, + "step": 16107 + }, + { + "epoch": 4.03, + "grad_norm": 9.223262786865234, + "learning_rate": 9.001989605705241e-07, + "logits/chosen": -0.609281063079834, + "logits/rejected": -0.6822044849395752, + "logps/chosen": -58.51835632324219, + "logps/rejected": -92.78649139404297, + "loss": 0.582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9985923767089844, + "rewards/margins": 6.646223068237305, + "rewards/rejected": -3.647630453109741, + "step": 16108 + }, + { + "epoch": 4.03, + "grad_norm": 3.0082390308380127, + "learning_rate": 8.997490957619443e-07, + "logits/chosen": -0.506887674331665, + "logits/rejected": -0.5281850099563599, + "logps/chosen": -46.82642364501953, + "logps/rejected": -125.74264526367188, + "loss": 0.5313, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.424571990966797, + "rewards/margins": 8.017229080200195, + "rewards/rejected": -4.59265661239624, + "step": 16109 + }, + { + "epoch": 4.03, + "grad_norm": 8.744152069091797, + "learning_rate": 8.992993322749266e-07, + "logits/chosen": -0.5708339810371399, + "logits/rejected": -0.6222457885742188, + "logps/chosen": -55.266761779785156, + "logps/rejected": -129.60922241210938, + "loss": 0.609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.921875476837158, + "rewards/margins": 7.41115665435791, + "rewards/rejected": -4.489281177520752, + "step": 16110 + }, + { + "epoch": 4.03, + "grad_norm": 7.255592346191406, + "learning_rate": 8.988496701205845e-07, + "logits/chosen": -0.4927945137023926, + "logits/rejected": -0.6218140721321106, + "logps/chosen": -69.81673431396484, + "logps/rejected": -98.69259643554688, + "loss": 0.6007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290194272994995, + "rewards/margins": 7.621809482574463, + "rewards/rejected": -4.331614971160889, + "step": 16111 + }, + { + "epoch": 4.03, + "grad_norm": 2.4871222972869873, + "learning_rate": 8.984001093100275e-07, + "logits/chosen": -0.49611300230026245, + "logits/rejected": -0.6198937892913818, + "logps/chosen": -63.582801818847656, + "logps/rejected": -103.28042602539062, + "loss": 0.6053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164533853530884, + "rewards/margins": 7.9846014976501465, + "rewards/rejected": -4.820067882537842, + "step": 16112 + }, + { + "epoch": 4.03, + "grad_norm": 4.053172588348389, + "learning_rate": 8.979506498543666e-07, + "logits/chosen": -0.48011109232902527, + "logits/rejected": -0.5949897170066833, + "logps/chosen": -55.16902160644531, + "logps/rejected": -103.31880187988281, + "loss": 0.5752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1636321544647217, + "rewards/margins": 7.129136085510254, + "rewards/rejected": -3.9655039310455322, + "step": 16113 + }, + { + "epoch": 4.03, + "grad_norm": 14.021586418151855, + "learning_rate": 8.975012917647108e-07, + "logits/chosen": -0.5575892329216003, + "logits/rejected": -0.6512925624847412, + "logps/chosen": -48.99047088623047, + "logps/rejected": -96.22073364257812, + "loss": 0.6196, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8421578407287598, + "rewards/margins": 6.720048904418945, + "rewards/rejected": -3.877892017364502, + "step": 16114 + }, + { + "epoch": 4.03, + "grad_norm": 6.93196964263916, + "learning_rate": 8.970520350521595e-07, + "logits/chosen": -0.5872301459312439, + "logits/rejected": -0.6448806524276733, + "logps/chosen": -47.37045669555664, + "logps/rejected": -106.80850219726562, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3410580158233643, + "rewards/margins": 6.855566501617432, + "rewards/rejected": -3.514508008956909, + "step": 16115 + }, + { + "epoch": 4.03, + "grad_norm": 4.950698375701904, + "learning_rate": 8.966028797278159e-07, + "logits/chosen": -0.5430111289024353, + "logits/rejected": -0.6188004612922668, + "logps/chosen": -60.07009506225586, + "logps/rejected": -132.48419189453125, + "loss": 0.6181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.171032667160034, + "rewards/margins": 8.444161415100098, + "rewards/rejected": -5.273128986358643, + "step": 16116 + }, + { + "epoch": 4.03, + "grad_norm": 4.813366889953613, + "learning_rate": 8.961538258027825e-07, + "logits/chosen": -0.6009165644645691, + "logits/rejected": -0.6633297801017761, + "logps/chosen": -54.73699951171875, + "logps/rejected": -100.30306243896484, + "loss": 0.7428, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6766436100006104, + "rewards/margins": 7.089155673980713, + "rewards/rejected": -4.412511348724365, + "step": 16117 + }, + { + "epoch": 4.03, + "grad_norm": 7.97105598449707, + "learning_rate": 8.957048732881507e-07, + "logits/chosen": -0.5235711336135864, + "logits/rejected": -0.6181963682174683, + "logps/chosen": -61.72480392456055, + "logps/rejected": -129.977294921875, + "loss": 0.6661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.052952766418457, + "rewards/margins": 7.855347633361816, + "rewards/rejected": -4.802394390106201, + "step": 16118 + }, + { + "epoch": 4.03, + "grad_norm": 5.041590213775635, + "learning_rate": 8.952560221950168e-07, + "logits/chosen": -0.534906804561615, + "logits/rejected": -0.6060803532600403, + "logps/chosen": -51.586692810058594, + "logps/rejected": -86.76268005371094, + "loss": 0.6729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9478330612182617, + "rewards/margins": 5.477352142333984, + "rewards/rejected": -2.5295190811157227, + "step": 16119 + }, + { + "epoch": 4.03, + "grad_norm": 3.884589195251465, + "learning_rate": 8.948072725344742e-07, + "logits/chosen": -0.4675446152687073, + "logits/rejected": -0.5990691184997559, + "logps/chosen": -62.68985366821289, + "logps/rejected": -104.8410415649414, + "loss": 0.575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0307912826538086, + "rewards/margins": 8.585404396057129, + "rewards/rejected": -5.55461311340332, + "step": 16120 + }, + { + "epoch": 4.03, + "grad_norm": 2.545762538909912, + "learning_rate": 8.9435862431761e-07, + "logits/chosen": -0.5789994597434998, + "logits/rejected": -0.6471977233886719, + "logps/chosen": -60.93161392211914, + "logps/rejected": -97.79686737060547, + "loss": 0.6124, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.196319341659546, + "rewards/margins": 7.0832343101501465, + "rewards/rejected": -3.8869149684906006, + "step": 16121 + }, + { + "epoch": 4.03, + "grad_norm": 3.0946788787841797, + "learning_rate": 8.939100775555099e-07, + "logits/chosen": -0.5883664488792419, + "logits/rejected": -0.6838032007217407, + "logps/chosen": -53.53057098388672, + "logps/rejected": -107.23443603515625, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1983883380889893, + "rewards/margins": 7.861491680145264, + "rewards/rejected": -4.6631035804748535, + "step": 16122 + }, + { + "epoch": 4.03, + "grad_norm": 5.432537078857422, + "learning_rate": 8.934616322592604e-07, + "logits/chosen": -0.5921307802200317, + "logits/rejected": -0.6271674633026123, + "logps/chosen": -58.94154739379883, + "logps/rejected": -145.4884796142578, + "loss": 0.5897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9986519813537598, + "rewards/margins": 7.86976432800293, + "rewards/rejected": -4.87111234664917, + "step": 16123 + }, + { + "epoch": 4.03, + "grad_norm": 9.332700729370117, + "learning_rate": 8.930132884399422e-07, + "logits/chosen": -0.5655015707015991, + "logits/rejected": -0.6319512128829956, + "logps/chosen": -62.42828369140625, + "logps/rejected": -110.0165786743164, + "loss": 0.7146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119243860244751, + "rewards/margins": 7.341671466827393, + "rewards/rejected": -4.222426891326904, + "step": 16124 + }, + { + "epoch": 4.03, + "grad_norm": 5.444565773010254, + "learning_rate": 8.925650461086327e-07, + "logits/chosen": -0.5440219640731812, + "logits/rejected": -0.6059651374816895, + "logps/chosen": -43.16082000732422, + "logps/rejected": -93.95477294921875, + "loss": 0.6424, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.162567138671875, + "rewards/margins": 6.357792854309082, + "rewards/rejected": -3.195225954055786, + "step": 16125 + }, + { + "epoch": 4.03, + "grad_norm": 24.334877014160156, + "learning_rate": 8.921169052764111e-07, + "logits/chosen": -0.5587279200553894, + "logits/rejected": -0.597858726978302, + "logps/chosen": -54.608001708984375, + "logps/rejected": -102.33904266357422, + "loss": 0.6578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2145912647247314, + "rewards/margins": 6.3123369216918945, + "rewards/rejected": -3.097745656967163, + "step": 16126 + }, + { + "epoch": 4.03, + "grad_norm": 3.025725841522217, + "learning_rate": 8.916688659543487e-07, + "logits/chosen": -0.5816758871078491, + "logits/rejected": -0.6432474255561829, + "logps/chosen": -47.02170181274414, + "logps/rejected": -117.22698974609375, + "loss": 0.5658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.87307071685791, + "rewards/margins": 7.356306552886963, + "rewards/rejected": -4.483235836029053, + "step": 16127 + }, + { + "epoch": 4.03, + "grad_norm": 2.6711323261260986, + "learning_rate": 8.912209281535195e-07, + "logits/chosen": -0.5936528444290161, + "logits/rejected": -0.7082991600036621, + "logps/chosen": -65.87260437011719, + "logps/rejected": -106.77783203125, + "loss": 0.6633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5791120529174805, + "rewards/margins": 7.86949348449707, + "rewards/rejected": -4.29038143157959, + "step": 16128 + }, + { + "epoch": 4.03, + "grad_norm": 6.354142189025879, + "learning_rate": 8.907730918849905e-07, + "logits/chosen": -0.5808244943618774, + "logits/rejected": -0.6740878820419312, + "logps/chosen": -55.56468200683594, + "logps/rejected": -88.35681915283203, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1939783096313477, + "rewards/margins": 6.668676376342773, + "rewards/rejected": -3.474698066711426, + "step": 16129 + }, + { + "epoch": 4.04, + "grad_norm": 9.030562400817871, + "learning_rate": 8.903253571598303e-07, + "logits/chosen": -0.6016002893447876, + "logits/rejected": -0.6740589737892151, + "logps/chosen": -58.94194793701172, + "logps/rejected": -112.5032958984375, + "loss": 0.717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1210663318634033, + "rewards/margins": 7.373577117919922, + "rewards/rejected": -4.252511024475098, + "step": 16130 + }, + { + "epoch": 4.04, + "grad_norm": 2.4614293575286865, + "learning_rate": 8.89877723989101e-07, + "logits/chosen": -0.5306933522224426, + "logits/rejected": -0.5897753834724426, + "logps/chosen": -56.67183303833008, + "logps/rejected": -116.82049560546875, + "loss": 0.5945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1173079013824463, + "rewards/margins": 7.504306793212891, + "rewards/rejected": -4.386998653411865, + "step": 16131 + }, + { + "epoch": 4.04, + "grad_norm": 2.820638656616211, + "learning_rate": 8.894301923838655e-07, + "logits/chosen": -0.5400965213775635, + "logits/rejected": -0.6253865361213684, + "logps/chosen": -47.306156158447266, + "logps/rejected": -106.77032470703125, + "loss": 0.5168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3272314071655273, + "rewards/margins": 7.435909271240234, + "rewards/rejected": -4.108677387237549, + "step": 16132 + }, + { + "epoch": 4.04, + "grad_norm": 7.078181743621826, + "learning_rate": 8.889827623551822e-07, + "logits/chosen": -0.5161604285240173, + "logits/rejected": -0.615207314491272, + "logps/chosen": -56.056766510009766, + "logps/rejected": -112.56791687011719, + "loss": 0.6791, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.103802442550659, + "rewards/margins": 7.3618340492248535, + "rewards/rejected": -4.258031845092773, + "step": 16133 + }, + { + "epoch": 4.04, + "grad_norm": 4.996687412261963, + "learning_rate": 8.885354339141067e-07, + "logits/chosen": -0.508222758769989, + "logits/rejected": -0.5729507207870483, + "logps/chosen": -55.82129669189453, + "logps/rejected": -120.19171905517578, + "loss": 0.6915, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1063804626464844, + "rewards/margins": 7.29155158996582, + "rewards/rejected": -4.185171127319336, + "step": 16134 + }, + { + "epoch": 4.04, + "grad_norm": 3.8511085510253906, + "learning_rate": 8.880882070716945e-07, + "logits/chosen": -0.5372269153594971, + "logits/rejected": -0.5805036425590515, + "logps/chosen": -55.477291107177734, + "logps/rejected": -139.1767120361328, + "loss": 0.5494, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.893946409225464, + "rewards/margins": 8.279458045959473, + "rewards/rejected": -5.3855109214782715, + "step": 16135 + }, + { + "epoch": 4.04, + "grad_norm": 3.746570110321045, + "learning_rate": 8.876410818389963e-07, + "logits/chosen": -0.5326719880104065, + "logits/rejected": -0.585016131401062, + "logps/chosen": -44.49481964111328, + "logps/rejected": -110.56198120117188, + "loss": 0.5608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2568068504333496, + "rewards/margins": 7.575273513793945, + "rewards/rejected": -4.318467140197754, + "step": 16136 + }, + { + "epoch": 4.04, + "grad_norm": 18.781070709228516, + "learning_rate": 8.871940582270605e-07, + "logits/chosen": -0.4968554377555847, + "logits/rejected": -0.5822274684906006, + "logps/chosen": -56.69356155395508, + "logps/rejected": -89.98171997070312, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7671942710876465, + "rewards/margins": 7.349788188934326, + "rewards/rejected": -3.5825936794281006, + "step": 16137 + }, + { + "epoch": 4.04, + "grad_norm": 4.189792633056641, + "learning_rate": 8.867471362469349e-07, + "logits/chosen": -0.4885779619216919, + "logits/rejected": -0.5794196128845215, + "logps/chosen": -67.71708679199219, + "logps/rejected": -115.09862518310547, + "loss": 0.6198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.797464370727539, + "rewards/margins": 7.823751449584961, + "rewards/rejected": -5.0262861251831055, + "step": 16138 + }, + { + "epoch": 4.04, + "grad_norm": 3.4194717407226562, + "learning_rate": 8.863003159096622e-07, + "logits/chosen": -0.4610978364944458, + "logits/rejected": -0.568734884262085, + "logps/chosen": -67.07395935058594, + "logps/rejected": -114.06490325927734, + "loss": 0.6156, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3965933322906494, + "rewards/margins": 7.9550652503967285, + "rewards/rejected": -4.558472633361816, + "step": 16139 + }, + { + "epoch": 4.04, + "grad_norm": 3.9344589710235596, + "learning_rate": 8.858535972262839e-07, + "logits/chosen": -0.615414559841156, + "logits/rejected": -0.6616017818450928, + "logps/chosen": -50.15150451660156, + "logps/rejected": -125.99222564697266, + "loss": 0.513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9543652534484863, + "rewards/margins": 7.472379684448242, + "rewards/rejected": -4.5180134773254395, + "step": 16140 + }, + { + "epoch": 4.04, + "grad_norm": 4.7971272468566895, + "learning_rate": 8.854069802078385e-07, + "logits/chosen": -0.5927304029464722, + "logits/rejected": -0.6788452863693237, + "logps/chosen": -57.37962341308594, + "logps/rejected": -115.49021911621094, + "loss": 0.5669, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.125770092010498, + "rewards/margins": 7.285223960876465, + "rewards/rejected": -4.159453392028809, + "step": 16141 + }, + { + "epoch": 4.04, + "grad_norm": 3.2337799072265625, + "learning_rate": 8.849604648653659e-07, + "logits/chosen": -0.5454257726669312, + "logits/rejected": -0.6222245693206787, + "logps/chosen": -55.08360290527344, + "logps/rejected": -109.08851623535156, + "loss": 0.6059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.353933572769165, + "rewards/margins": 7.66265344619751, + "rewards/rejected": -4.308720111846924, + "step": 16142 + }, + { + "epoch": 4.04, + "grad_norm": 4.2507195472717285, + "learning_rate": 8.845140512098943e-07, + "logits/chosen": -0.560154914855957, + "logits/rejected": -0.6005366444587708, + "logps/chosen": -47.46548843383789, + "logps/rejected": -123.61649322509766, + "loss": 0.6394, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.257301092147827, + "rewards/margins": 7.95444393157959, + "rewards/rejected": -4.697142124176025, + "step": 16143 + }, + { + "epoch": 4.04, + "grad_norm": 3.1252803802490234, + "learning_rate": 8.840677392524583e-07, + "logits/chosen": -0.5546329021453857, + "logits/rejected": -0.6527127027511597, + "logps/chosen": -52.35383605957031, + "logps/rejected": -117.61227416992188, + "loss": 0.5682, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.835108757019043, + "rewards/margins": 8.375882148742676, + "rewards/rejected": -5.540773391723633, + "step": 16144 + }, + { + "epoch": 4.04, + "grad_norm": 3.1093976497650146, + "learning_rate": 8.836215290040873e-07, + "logits/chosen": -0.5784934759140015, + "logits/rejected": -0.6474586129188538, + "logps/chosen": -53.61359786987305, + "logps/rejected": -112.69330596923828, + "loss": 0.6301, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.218506336212158, + "rewards/margins": 8.060957908630371, + "rewards/rejected": -4.842451572418213, + "step": 16145 + }, + { + "epoch": 4.04, + "grad_norm": 3.762397289276123, + "learning_rate": 8.83175420475807e-07, + "logits/chosen": -0.5493147373199463, + "logits/rejected": -0.6232612133026123, + "logps/chosen": -51.7236213684082, + "logps/rejected": -111.63441467285156, + "loss": 0.5276, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4939069747924805, + "rewards/margins": 6.825726509094238, + "rewards/rejected": -3.331820249557495, + "step": 16146 + }, + { + "epoch": 4.04, + "grad_norm": 5.4718523025512695, + "learning_rate": 8.827294136786396e-07, + "logits/chosen": -0.4807793200016022, + "logits/rejected": -0.5933157801628113, + "logps/chosen": -62.08210754394531, + "logps/rejected": -96.00703430175781, + "loss": 0.6885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8844993114471436, + "rewards/margins": 5.0622735023498535, + "rewards/rejected": -2.177774429321289, + "step": 16147 + }, + { + "epoch": 4.04, + "grad_norm": 5.52601432800293, + "learning_rate": 8.822835086236086e-07, + "logits/chosen": -0.5838768482208252, + "logits/rejected": -0.621161162853241, + "logps/chosen": -60.24589538574219, + "logps/rejected": -104.6893310546875, + "loss": 0.7131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2417402267456055, + "rewards/margins": 7.746624946594238, + "rewards/rejected": -4.504884719848633, + "step": 16148 + }, + { + "epoch": 4.04, + "grad_norm": 15.847976684570312, + "learning_rate": 8.818377053217325e-07, + "logits/chosen": -0.5639930367469788, + "logits/rejected": -0.625990092754364, + "logps/chosen": -46.46751022338867, + "logps/rejected": -133.38516235351562, + "loss": 0.5906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0445902347564697, + "rewards/margins": 7.913747787475586, + "rewards/rejected": -4.869156837463379, + "step": 16149 + }, + { + "epoch": 4.04, + "grad_norm": 6.061367511749268, + "learning_rate": 8.813920037840257e-07, + "logits/chosen": -0.6025614738464355, + "logits/rejected": -0.6846910715103149, + "logps/chosen": -57.88127899169922, + "logps/rejected": -100.60240936279297, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2644410133361816, + "rewards/margins": 7.051393508911133, + "rewards/rejected": -3.786952257156372, + "step": 16150 + }, + { + "epoch": 4.04, + "grad_norm": 8.84986686706543, + "learning_rate": 8.809464040215038e-07, + "logits/chosen": -0.49731963872909546, + "logits/rejected": -0.5752593278884888, + "logps/chosen": -51.75416946411133, + "logps/rejected": -109.41636657714844, + "loss": 0.6379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0529074668884277, + "rewards/margins": 6.905105113983154, + "rewards/rejected": -3.8521976470947266, + "step": 16151 + }, + { + "epoch": 4.04, + "grad_norm": 12.04826831817627, + "learning_rate": 8.805009060451769e-07, + "logits/chosen": -0.5504778027534485, + "logits/rejected": -0.6085498929023743, + "logps/chosen": -49.17595672607422, + "logps/rejected": -126.60375213623047, + "loss": 0.561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0393457412719727, + "rewards/margins": 7.912825107574463, + "rewards/rejected": -4.873478889465332, + "step": 16152 + }, + { + "epoch": 4.04, + "grad_norm": 13.43246841430664, + "learning_rate": 8.800555098660551e-07, + "logits/chosen": -0.5008993744850159, + "logits/rejected": -0.5808617472648621, + "logps/chosen": -62.578163146972656, + "logps/rejected": -108.21040344238281, + "loss": 0.6234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0049290657043457, + "rewards/margins": 7.130925178527832, + "rewards/rejected": -4.125996112823486, + "step": 16153 + }, + { + "epoch": 4.04, + "grad_norm": 3.4931564331054688, + "learning_rate": 8.796102154951441e-07, + "logits/chosen": -0.5037250518798828, + "logits/rejected": -0.5446252226829529, + "logps/chosen": -47.42845153808594, + "logps/rejected": -135.36895751953125, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.053154945373535, + "rewards/margins": 8.407547950744629, + "rewards/rejected": -5.354393005371094, + "step": 16154 + }, + { + "epoch": 4.04, + "grad_norm": 3.049260139465332, + "learning_rate": 8.791650229434462e-07, + "logits/chosen": -0.5952552556991577, + "logits/rejected": -0.6261700391769409, + "logps/chosen": -63.74614715576172, + "logps/rejected": -104.40386199951172, + "loss": 0.6352, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3909029960632324, + "rewards/margins": 6.009088516235352, + "rewards/rejected": -2.618185043334961, + "step": 16155 + }, + { + "epoch": 4.04, + "grad_norm": 6.613412380218506, + "learning_rate": 8.787199322219647e-07, + "logits/chosen": -0.5833191871643066, + "logits/rejected": -0.6810007095336914, + "logps/chosen": -48.55399703979492, + "logps/rejected": -84.68429565429688, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9516515731811523, + "rewards/margins": 6.930233478546143, + "rewards/rejected": -3.978581190109253, + "step": 16156 + }, + { + "epoch": 4.04, + "grad_norm": 1.7604743242263794, + "learning_rate": 8.782749433416971e-07, + "logits/chosen": -0.4931548237800598, + "logits/rejected": -0.6219059228897095, + "logps/chosen": -51.95394515991211, + "logps/rejected": -92.8480224609375, + "loss": 0.5066, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2303614616394043, + "rewards/margins": 7.569452285766602, + "rewards/rejected": -4.339090347290039, + "step": 16157 + }, + { + "epoch": 4.04, + "grad_norm": 1.9513591527938843, + "learning_rate": 8.778300563136388e-07, + "logits/chosen": -0.5809878706932068, + "logits/rejected": -0.6157690286636353, + "logps/chosen": -43.77590560913086, + "logps/rejected": -125.69876098632812, + "loss": 0.564, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.175877094268799, + "rewards/margins": 8.187671661376953, + "rewards/rejected": -5.011794090270996, + "step": 16158 + }, + { + "epoch": 4.04, + "grad_norm": 8.911877632141113, + "learning_rate": 8.77385271148784e-07, + "logits/chosen": -0.5766254663467407, + "logits/rejected": -0.6382852792739868, + "logps/chosen": -53.762935638427734, + "logps/rejected": -98.26274108886719, + "loss": 0.6394, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5696065425872803, + "rewards/margins": 6.4487624168396, + "rewards/rejected": -2.8791556358337402, + "step": 16159 + }, + { + "epoch": 4.04, + "grad_norm": 4.525817394256592, + "learning_rate": 8.769405878581271e-07, + "logits/chosen": -0.5250290632247925, + "logits/rejected": -0.5810959339141846, + "logps/chosen": -53.44471740722656, + "logps/rejected": -99.03778076171875, + "loss": 0.6412, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4451353549957275, + "rewards/margins": 6.370832443237305, + "rewards/rejected": -2.925696849822998, + "step": 16160 + }, + { + "epoch": 4.04, + "grad_norm": 3.836108684539795, + "learning_rate": 8.764960064526512e-07, + "logits/chosen": -0.5197517275810242, + "logits/rejected": -0.5691195130348206, + "logps/chosen": -49.946739196777344, + "logps/rejected": -100.2388916015625, + "loss": 0.6992, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4575047492980957, + "rewards/margins": 6.645707607269287, + "rewards/rejected": -3.1882028579711914, + "step": 16161 + }, + { + "epoch": 4.04, + "grad_norm": 6.6181511878967285, + "learning_rate": 8.760515269433445e-07, + "logits/chosen": -0.5375840663909912, + "logits/rejected": -0.6420021057128906, + "logps/chosen": -49.3447265625, + "logps/rejected": -95.09651947021484, + "loss": 0.5981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0682411193847656, + "rewards/margins": 7.321318626403809, + "rewards/rejected": -4.253077983856201, + "step": 16162 + }, + { + "epoch": 4.04, + "grad_norm": 5.084682941436768, + "learning_rate": 8.756071493411927e-07, + "logits/chosen": -0.5473780632019043, + "logits/rejected": -0.636506199836731, + "logps/chosen": -51.8857421875, + "logps/rejected": -99.1180419921875, + "loss": 0.604, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1184136867523193, + "rewards/margins": 6.5055036544799805, + "rewards/rejected": -3.387089967727661, + "step": 16163 + }, + { + "epoch": 4.04, + "grad_norm": 4.8415727615356445, + "learning_rate": 8.751628736571749e-07, + "logits/chosen": -0.5028284192085266, + "logits/rejected": -0.5653934478759766, + "logps/chosen": -64.6211929321289, + "logps/rejected": -126.66465759277344, + "loss": 0.6629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.152705669403076, + "rewards/margins": 6.609402656555176, + "rewards/rejected": -3.456697702407837, + "step": 16164 + }, + { + "epoch": 4.04, + "grad_norm": 3.6640336513519287, + "learning_rate": 8.747186999022683e-07, + "logits/chosen": -0.552871584892273, + "logits/rejected": -0.6607604026794434, + "logps/chosen": -52.39468002319336, + "logps/rejected": -97.93440246582031, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.044747829437256, + "rewards/margins": 7.648749351501465, + "rewards/rejected": -4.604001998901367, + "step": 16165 + }, + { + "epoch": 4.04, + "grad_norm": 7.259915351867676, + "learning_rate": 8.742746280874526e-07, + "logits/chosen": -0.5658829808235168, + "logits/rejected": -0.6621502637863159, + "logps/chosen": -75.17321014404297, + "logps/rejected": -103.48046112060547, + "loss": 0.7345, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.875831365585327, + "rewards/margins": 6.103362083435059, + "rewards/rejected": -3.2275309562683105, + "step": 16166 + }, + { + "epoch": 4.04, + "grad_norm": 7.184856414794922, + "learning_rate": 8.738306582236983e-07, + "logits/chosen": -0.48867616057395935, + "logits/rejected": -0.6276385188102722, + "logps/chosen": -70.02751159667969, + "logps/rejected": -90.3701400756836, + "loss": 0.6304, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0177955627441406, + "rewards/margins": 6.738160610198975, + "rewards/rejected": -3.720364570617676, + "step": 16167 + }, + { + "epoch": 4.04, + "grad_norm": 6.522279739379883, + "learning_rate": 8.733867903219762e-07, + "logits/chosen": -0.5810754299163818, + "logits/rejected": -0.6454903483390808, + "logps/chosen": -52.72674560546875, + "logps/rejected": -108.68279266357422, + "loss": 0.6873, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2500569820404053, + "rewards/margins": 7.568761348724365, + "rewards/rejected": -4.318704605102539, + "step": 16168 + }, + { + "epoch": 4.04, + "grad_norm": 4.1497883796691895, + "learning_rate": 8.729430243932574e-07, + "logits/chosen": -0.5278357267379761, + "logits/rejected": -0.5806879997253418, + "logps/chosen": -45.70528030395508, + "logps/rejected": -104.06721496582031, + "loss": 0.5254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1150918006896973, + "rewards/margins": 6.765878200531006, + "rewards/rejected": -3.6507863998413086, + "step": 16169 + }, + { + "epoch": 4.05, + "grad_norm": 9.621260643005371, + "learning_rate": 8.724993604485044e-07, + "logits/chosen": -0.6308348178863525, + "logits/rejected": -0.6475350856781006, + "logps/chosen": -52.432403564453125, + "logps/rejected": -91.59259033203125, + "loss": 0.6873, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4569780826568604, + "rewards/margins": 5.768143177032471, + "rewards/rejected": -2.3111653327941895, + "step": 16170 + }, + { + "epoch": 4.05, + "grad_norm": 2.2618319988250732, + "learning_rate": 8.720557984986843e-07, + "logits/chosen": -0.46346402168273926, + "logits/rejected": -0.5196086168289185, + "logps/chosen": -47.711727142333984, + "logps/rejected": -96.82414245605469, + "loss": 0.5754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1381871700286865, + "rewards/margins": 6.4427809715271, + "rewards/rejected": -3.304593563079834, + "step": 16171 + }, + { + "epoch": 4.05, + "grad_norm": 3.699871778488159, + "learning_rate": 8.716123385547554e-07, + "logits/chosen": -0.619113564491272, + "logits/rejected": -0.7214449644088745, + "logps/chosen": -56.728111267089844, + "logps/rejected": -103.10635375976562, + "loss": 0.5692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.280611276626587, + "rewards/margins": 7.487598419189453, + "rewards/rejected": -4.206987380981445, + "step": 16172 + }, + { + "epoch": 4.05, + "grad_norm": 3.6889333724975586, + "learning_rate": 8.711689806276757e-07, + "logits/chosen": -0.5987403988838196, + "logits/rejected": -0.6719368100166321, + "logps/chosen": -62.167381286621094, + "logps/rejected": -100.04316711425781, + "loss": 0.6913, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.330655813217163, + "rewards/margins": 6.915340423583984, + "rewards/rejected": -3.5846850872039795, + "step": 16173 + }, + { + "epoch": 4.05, + "grad_norm": 4.4630231857299805, + "learning_rate": 8.70725724728404e-07, + "logits/chosen": -0.5531175136566162, + "logits/rejected": -0.6731665134429932, + "logps/chosen": -50.62825012207031, + "logps/rejected": -107.43033599853516, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037583351135254, + "rewards/margins": 8.416055679321289, + "rewards/rejected": -5.378472328186035, + "step": 16174 + }, + { + "epoch": 4.05, + "grad_norm": 30.73834228515625, + "learning_rate": 8.702825708678897e-07, + "logits/chosen": -0.5443218946456909, + "logits/rejected": -0.6178417205810547, + "logps/chosen": -52.51786804199219, + "logps/rejected": -108.5994873046875, + "loss": 0.6975, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.124924659729004, + "rewards/margins": 6.909461975097656, + "rewards/rejected": -3.7845377922058105, + "step": 16175 + }, + { + "epoch": 4.05, + "grad_norm": 5.247925281524658, + "learning_rate": 8.698395190570874e-07, + "logits/chosen": -0.43543559312820435, + "logits/rejected": -0.5785048604011536, + "logps/chosen": -53.02128601074219, + "logps/rejected": -106.50366973876953, + "loss": 0.5413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.365182876586914, + "rewards/margins": 7.754608154296875, + "rewards/rejected": -4.389424800872803, + "step": 16176 + }, + { + "epoch": 4.05, + "grad_norm": 5.586559772491455, + "learning_rate": 8.693965693069422e-07, + "logits/chosen": -0.6287093758583069, + "logits/rejected": -0.7231518030166626, + "logps/chosen": -48.34077072143555, + "logps/rejected": -105.7021713256836, + "loss": 0.5351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4600915908813477, + "rewards/margins": 7.799816131591797, + "rewards/rejected": -4.339725494384766, + "step": 16177 + }, + { + "epoch": 4.05, + "grad_norm": 5.093551158905029, + "learning_rate": 8.689537216284022e-07, + "logits/chosen": -0.5528279542922974, + "logits/rejected": -0.63211989402771, + "logps/chosen": -58.59884262084961, + "logps/rejected": -143.9261474609375, + "loss": 0.5663, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.20237398147583, + "rewards/margins": 9.480989456176758, + "rewards/rejected": -6.278615474700928, + "step": 16178 + }, + { + "epoch": 4.05, + "grad_norm": 2.7363247871398926, + "learning_rate": 8.685109760324101e-07, + "logits/chosen": -0.6402171850204468, + "logits/rejected": -0.7067470550537109, + "logps/chosen": -47.13871765136719, + "logps/rejected": -122.06610870361328, + "loss": 0.5587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3419790267944336, + "rewards/margins": 8.156224250793457, + "rewards/rejected": -4.814245223999023, + "step": 16179 + }, + { + "epoch": 4.05, + "grad_norm": 14.340744018554688, + "learning_rate": 8.68068332529905e-07, + "logits/chosen": -0.5177767276763916, + "logits/rejected": -0.6265901327133179, + "logps/chosen": -55.94122314453125, + "logps/rejected": -108.83786010742188, + "loss": 0.645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8915202617645264, + "rewards/margins": 6.333959579467773, + "rewards/rejected": -3.442439556121826, + "step": 16180 + }, + { + "epoch": 4.05, + "grad_norm": 3.4379937648773193, + "learning_rate": 8.676257911318276e-07, + "logits/chosen": -0.5251661539077759, + "logits/rejected": -0.5775812268257141, + "logps/chosen": -55.32461929321289, + "logps/rejected": -103.30719757080078, + "loss": 0.5739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1995577812194824, + "rewards/margins": 5.953221797943115, + "rewards/rejected": -2.753664493560791, + "step": 16181 + }, + { + "epoch": 4.05, + "grad_norm": 8.792500495910645, + "learning_rate": 8.671833518491119e-07, + "logits/chosen": -0.5155231952667236, + "logits/rejected": -0.566760778427124, + "logps/chosen": -48.31295394897461, + "logps/rejected": -95.61285400390625, + "loss": 0.5691, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.428046464920044, + "rewards/margins": 6.066240310668945, + "rewards/rejected": -2.6381940841674805, + "step": 16182 + }, + { + "epoch": 4.05, + "grad_norm": 4.3603644371032715, + "learning_rate": 8.667410146926902e-07, + "logits/chosen": -0.5954146385192871, + "logits/rejected": -0.6370205879211426, + "logps/chosen": -53.290771484375, + "logps/rejected": -121.84721374511719, + "loss": 0.6348, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1273505687713623, + "rewards/margins": 7.277037620544434, + "rewards/rejected": -4.14968729019165, + "step": 16183 + }, + { + "epoch": 4.05, + "grad_norm": 3.919830560684204, + "learning_rate": 8.662987796734945e-07, + "logits/chosen": -0.5461253523826599, + "logits/rejected": -0.628148078918457, + "logps/chosen": -56.86408996582031, + "logps/rejected": -105.51973724365234, + "loss": 0.5978, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3553099632263184, + "rewards/margins": 7.78585147857666, + "rewards/rejected": -4.430541515350342, + "step": 16184 + }, + { + "epoch": 4.05, + "grad_norm": 6.749035835266113, + "learning_rate": 8.658566468024554e-07, + "logits/chosen": -0.6332145929336548, + "logits/rejected": -0.6832583546638489, + "logps/chosen": -59.224700927734375, + "logps/rejected": -128.29037475585938, + "loss": 0.7257, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8881421089172363, + "rewards/margins": 9.095236778259277, + "rewards/rejected": -6.207094192504883, + "step": 16185 + }, + { + "epoch": 4.05, + "grad_norm": 4.110963821411133, + "learning_rate": 8.654146160904936e-07, + "logits/chosen": -0.6189164519309998, + "logits/rejected": -0.685626745223999, + "logps/chosen": -56.483314514160156, + "logps/rejected": -113.61222076416016, + "loss": 0.56, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.243107795715332, + "rewards/margins": 7.5089802742004395, + "rewards/rejected": -4.265872001647949, + "step": 16186 + }, + { + "epoch": 4.05, + "grad_norm": 7.480495929718018, + "learning_rate": 8.649726875485337e-07, + "logits/chosen": -0.5327677726745605, + "logits/rejected": -0.5492357015609741, + "logps/chosen": -62.68648910522461, + "logps/rejected": -93.86595916748047, + "loss": 0.7339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2386679649353027, + "rewards/margins": 5.128925323486328, + "rewards/rejected": -1.8902572393417358, + "step": 16187 + }, + { + "epoch": 4.05, + "grad_norm": 7.913378715515137, + "learning_rate": 8.645308611874987e-07, + "logits/chosen": -0.5258902311325073, + "logits/rejected": -0.5769079923629761, + "logps/chosen": -63.05116653442383, + "logps/rejected": -123.92173767089844, + "loss": 0.6805, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.679633617401123, + "rewards/margins": 7.534185409545898, + "rewards/rejected": -4.854552268981934, + "step": 16188 + }, + { + "epoch": 4.05, + "grad_norm": 4.313901901245117, + "learning_rate": 8.640891370183047e-07, + "logits/chosen": -0.5645315647125244, + "logits/rejected": -0.6610024571418762, + "logps/chosen": -58.16493225097656, + "logps/rejected": -102.51144409179688, + "loss": 0.677, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9328322410583496, + "rewards/margins": 7.553134441375732, + "rewards/rejected": -4.620302200317383, + "step": 16189 + }, + { + "epoch": 4.05, + "grad_norm": 5.286943435668945, + "learning_rate": 8.636475150518658e-07, + "logits/chosen": -0.5421061515808105, + "logits/rejected": -0.6046752333641052, + "logps/chosen": -61.62257385253906, + "logps/rejected": -109.13102722167969, + "loss": 0.6545, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.892951011657715, + "rewards/margins": 6.150196552276611, + "rewards/rejected": -3.257246255874634, + "step": 16190 + }, + { + "epoch": 4.05, + "grad_norm": 5.096631050109863, + "learning_rate": 8.632059952990979e-07, + "logits/chosen": -0.5231912136077881, + "logits/rejected": -0.5953117609024048, + "logps/chosen": -54.34748458862305, + "logps/rejected": -96.32711029052734, + "loss": 0.6746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.103780746459961, + "rewards/margins": 6.158172607421875, + "rewards/rejected": -3.054391622543335, + "step": 16191 + }, + { + "epoch": 4.05, + "grad_norm": 4.724056243896484, + "learning_rate": 8.627645777709098e-07, + "logits/chosen": -0.5154010057449341, + "logits/rejected": -0.6132678389549255, + "logps/chosen": -70.26244354248047, + "logps/rejected": -108.52938842773438, + "loss": 0.6377, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.991560935974121, + "rewards/margins": 6.6614155769348145, + "rewards/rejected": -3.6698551177978516, + "step": 16192 + }, + { + "epoch": 4.05, + "grad_norm": 4.330060005187988, + "learning_rate": 8.623232624782085e-07, + "logits/chosen": -0.5216862559318542, + "logits/rejected": -0.5899741649627686, + "logps/chosen": -55.96620178222656, + "logps/rejected": -100.03681945800781, + "loss": 0.6354, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.282421112060547, + "rewards/margins": 6.635112285614014, + "rewards/rejected": -3.3526906967163086, + "step": 16193 + }, + { + "epoch": 4.05, + "grad_norm": 3.816863775253296, + "learning_rate": 8.618820494319013e-07, + "logits/chosen": -0.5480977892875671, + "logits/rejected": -0.6001107692718506, + "logps/chosen": -59.56528854370117, + "logps/rejected": -111.92095184326172, + "loss": 0.6309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9970107078552246, + "rewards/margins": 6.319316864013672, + "rewards/rejected": -3.3223061561584473, + "step": 16194 + }, + { + "epoch": 4.05, + "grad_norm": 7.937013149261475, + "learning_rate": 8.61440938642889e-07, + "logits/chosen": -0.609127402305603, + "logits/rejected": -0.6856207251548767, + "logps/chosen": -46.93730926513672, + "logps/rejected": -84.27820587158203, + "loss": 0.6307, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1701395511627197, + "rewards/margins": 6.469517707824707, + "rewards/rejected": -3.29937744140625, + "step": 16195 + }, + { + "epoch": 4.05, + "grad_norm": 2.6046531200408936, + "learning_rate": 8.609999301220746e-07, + "logits/chosen": -0.5304415822029114, + "logits/rejected": -0.640222430229187, + "logps/chosen": -45.11613082885742, + "logps/rejected": -106.73728942871094, + "loss": 0.501, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.154623508453369, + "rewards/margins": 7.08544921875, + "rewards/rejected": -3.93082594871521, + "step": 16196 + }, + { + "epoch": 4.05, + "grad_norm": 5.704887390136719, + "learning_rate": 8.60559023880354e-07, + "logits/chosen": -0.5066987872123718, + "logits/rejected": -0.5387916564941406, + "logps/chosen": -55.76490020751953, + "logps/rejected": -98.31207275390625, + "loss": 0.7975, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2963948249816895, + "rewards/margins": 5.0356974601745605, + "rewards/rejected": -1.7393025159835815, + "step": 16197 + }, + { + "epoch": 4.05, + "grad_norm": 2.8177096843719482, + "learning_rate": 8.601182199286218e-07, + "logits/chosen": -0.5338967442512512, + "logits/rejected": -0.63276606798172, + "logps/chosen": -53.19789123535156, + "logps/rejected": -102.61183166503906, + "loss": 0.549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0597081184387207, + "rewards/margins": 7.399504661560059, + "rewards/rejected": -4.339796543121338, + "step": 16198 + }, + { + "epoch": 4.05, + "grad_norm": 9.454172134399414, + "learning_rate": 8.596775182777728e-07, + "logits/chosen": -0.5706228613853455, + "logits/rejected": -0.6327759623527527, + "logps/chosen": -53.73967742919922, + "logps/rejected": -105.139892578125, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1971945762634277, + "rewards/margins": 7.15725040435791, + "rewards/rejected": -3.960055351257324, + "step": 16199 + }, + { + "epoch": 4.05, + "grad_norm": 8.455758094787598, + "learning_rate": 8.592369189386968e-07, + "logits/chosen": -0.5510882139205933, + "logits/rejected": -0.6301674246788025, + "logps/chosen": -44.75041198730469, + "logps/rejected": -93.430419921875, + "loss": 0.6175, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120220422744751, + "rewards/margins": 7.387338638305664, + "rewards/rejected": -4.267118453979492, + "step": 16200 + }, + { + "epoch": 4.05, + "grad_norm": 4.465207099914551, + "learning_rate": 8.587964219222789e-07, + "logits/chosen": -0.5126460194587708, + "logits/rejected": -0.5532587766647339, + "logps/chosen": -69.29948425292969, + "logps/rejected": -113.49718475341797, + "loss": 0.6412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9771194458007812, + "rewards/margins": 7.105739593505859, + "rewards/rejected": -4.12861967086792, + "step": 16201 + }, + { + "epoch": 4.05, + "grad_norm": 3.3958969116210938, + "learning_rate": 8.583560272394065e-07, + "logits/chosen": -0.6306537985801697, + "logits/rejected": -0.7268330454826355, + "logps/chosen": -47.4072380065918, + "logps/rejected": -95.18138122558594, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.118124485015869, + "rewards/margins": 6.990841865539551, + "rewards/rejected": -3.8727169036865234, + "step": 16202 + }, + { + "epoch": 4.05, + "grad_norm": 4.3091816902160645, + "learning_rate": 8.579157349009643e-07, + "logits/chosen": -0.5699820518493652, + "logits/rejected": -0.6280986666679382, + "logps/chosen": -53.764923095703125, + "logps/rejected": -106.75010681152344, + "loss": 0.6272, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.236215591430664, + "rewards/margins": 7.828596591949463, + "rewards/rejected": -4.592380523681641, + "step": 16203 + }, + { + "epoch": 4.05, + "grad_norm": 12.096345901489258, + "learning_rate": 8.574755449178274e-07, + "logits/chosen": -0.5133334398269653, + "logits/rejected": -0.5862877368927002, + "logps/chosen": -50.30194091796875, + "logps/rejected": -88.86862182617188, + "loss": 0.6093, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.317234992980957, + "rewards/margins": 6.924098014831543, + "rewards/rejected": -3.606862783432007, + "step": 16204 + }, + { + "epoch": 4.05, + "grad_norm": 2.308962106704712, + "learning_rate": 8.57035457300876e-07, + "logits/chosen": -0.5159319639205933, + "logits/rejected": -0.6165143847465515, + "logps/chosen": -54.68046951293945, + "logps/rejected": -125.24714660644531, + "loss": 0.5804, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3917646408081055, + "rewards/margins": 8.931983947753906, + "rewards/rejected": -5.540219783782959, + "step": 16205 + }, + { + "epoch": 4.05, + "grad_norm": 8.659820556640625, + "learning_rate": 8.565954720609864e-07, + "logits/chosen": -0.5747670531272888, + "logits/rejected": -0.6414780020713806, + "logps/chosen": -53.81028366088867, + "logps/rejected": -115.30398559570312, + "loss": 0.6691, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0247578620910645, + "rewards/margins": 8.064460754394531, + "rewards/rejected": -5.039703369140625, + "step": 16206 + }, + { + "epoch": 4.05, + "grad_norm": 16.47747039794922, + "learning_rate": 8.5615558920903e-07, + "logits/chosen": -0.5547996759414673, + "logits/rejected": -0.6320884823799133, + "logps/chosen": -55.1988525390625, + "logps/rejected": -111.16165924072266, + "loss": 0.6945, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.043802499771118, + "rewards/margins": 7.71329402923584, + "rewards/rejected": -4.669491291046143, + "step": 16207 + }, + { + "epoch": 4.05, + "grad_norm": 5.218960285186768, + "learning_rate": 8.557158087558753e-07, + "logits/chosen": -0.5290775299072266, + "logits/rejected": -0.5862582921981812, + "logps/chosen": -51.54246139526367, + "logps/rejected": -95.53001403808594, + "loss": 0.6989, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1337838172912598, + "rewards/margins": 5.215165615081787, + "rewards/rejected": -2.081381320953369, + "step": 16208 + }, + { + "epoch": 4.05, + "grad_norm": 7.972501277923584, + "learning_rate": 8.552761307123924e-07, + "logits/chosen": -0.5293983817100525, + "logits/rejected": -0.6796063184738159, + "logps/chosen": -61.69465255737305, + "logps/rejected": -95.33628845214844, + "loss": 0.7923, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7151546478271484, + "rewards/margins": 7.109354019165039, + "rewards/rejected": -4.394199848175049, + "step": 16209 + }, + { + "epoch": 4.06, + "grad_norm": 21.005762100219727, + "learning_rate": 8.548365550894444e-07, + "logits/chosen": -0.5713917016983032, + "logits/rejected": -0.6416549682617188, + "logps/chosen": -48.163822174072266, + "logps/rejected": -101.21736907958984, + "loss": 0.6946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1564159393310547, + "rewards/margins": 6.655173301696777, + "rewards/rejected": -3.4987568855285645, + "step": 16210 + }, + { + "epoch": 4.06, + "grad_norm": 5.991433143615723, + "learning_rate": 8.543970818978931e-07, + "logits/chosen": -0.47415879368782043, + "logits/rejected": -0.5418887138366699, + "logps/chosen": -74.29959869384766, + "logps/rejected": -92.59538269042969, + "loss": 0.7888, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0673930644989014, + "rewards/margins": 6.023514270782471, + "rewards/rejected": -2.9561214447021484, + "step": 16211 + }, + { + "epoch": 4.06, + "grad_norm": 4.580175876617432, + "learning_rate": 8.539577111486008e-07, + "logits/chosen": -0.5589499473571777, + "logits/rejected": -0.6220904588699341, + "logps/chosen": -55.00794219970703, + "logps/rejected": -123.32678985595703, + "loss": 0.6238, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1445577144622803, + "rewards/margins": 7.930225372314453, + "rewards/rejected": -4.7856669425964355, + "step": 16212 + }, + { + "epoch": 4.06, + "grad_norm": 3.4213266372680664, + "learning_rate": 8.535184428524218e-07, + "logits/chosen": -0.5797162055969238, + "logits/rejected": -0.6749246120452881, + "logps/chosen": -53.19340896606445, + "logps/rejected": -145.6746063232422, + "loss": 0.5317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.10823392868042, + "rewards/margins": 9.65471363067627, + "rewards/rejected": -6.546479225158691, + "step": 16213 + }, + { + "epoch": 4.06, + "grad_norm": 6.656684875488281, + "learning_rate": 8.53079277020214e-07, + "logits/chosen": -0.49575579166412354, + "logits/rejected": -0.5924507975578308, + "logps/chosen": -62.41053771972656, + "logps/rejected": -101.72610473632812, + "loss": 0.683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8012795448303223, + "rewards/margins": 6.620400905609131, + "rewards/rejected": -3.8191215991973877, + "step": 16214 + }, + { + "epoch": 4.06, + "grad_norm": 41.82727813720703, + "learning_rate": 8.526402136628281e-07, + "logits/chosen": -0.5643365979194641, + "logits/rejected": -0.6067154407501221, + "logps/chosen": -52.34360122680664, + "logps/rejected": -102.59402465820312, + "loss": 0.7705, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9097495079040527, + "rewards/margins": 7.03727388381958, + "rewards/rejected": -4.127523899078369, + "step": 16215 + }, + { + "epoch": 4.06, + "grad_norm": 10.868517875671387, + "learning_rate": 8.522012527911122e-07, + "logits/chosen": -0.6068174839019775, + "logits/rejected": -0.6956087350845337, + "logps/chosen": -59.039154052734375, + "logps/rejected": -107.70941925048828, + "loss": 0.6052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.94681978225708, + "rewards/margins": 7.271478176116943, + "rewards/rejected": -4.324658393859863, + "step": 16216 + }, + { + "epoch": 4.06, + "grad_norm": 3.170675754547119, + "learning_rate": 8.51762394415917e-07, + "logits/chosen": -0.575955867767334, + "logits/rejected": -0.6551212072372437, + "logps/chosen": -43.8250846862793, + "logps/rejected": -89.9158935546875, + "loss": 0.5939, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1310348510742188, + "rewards/margins": 6.877895832061768, + "rewards/rejected": -3.746860980987549, + "step": 16217 + }, + { + "epoch": 4.06, + "grad_norm": 6.393641948699951, + "learning_rate": 8.513236385480834e-07, + "logits/chosen": -0.5318687558174133, + "logits/rejected": -0.5711970329284668, + "logps/chosen": -52.21110153198242, + "logps/rejected": -98.1651382446289, + "loss": 0.6299, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.058361768722534, + "rewards/margins": 6.713049411773682, + "rewards/rejected": -3.6546878814697266, + "step": 16218 + }, + { + "epoch": 4.06, + "grad_norm": 5.1688313484191895, + "learning_rate": 8.508849851984569e-07, + "logits/chosen": -0.5521132946014404, + "logits/rejected": -0.608655571937561, + "logps/chosen": -40.76229476928711, + "logps/rejected": -93.72466278076172, + "loss": 0.5708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.391244888305664, + "rewards/margins": 6.666581153869629, + "rewards/rejected": -3.275336503982544, + "step": 16219 + }, + { + "epoch": 4.06, + "grad_norm": 2.4521565437316895, + "learning_rate": 8.504464343778746e-07, + "logits/chosen": -0.5752173066139221, + "logits/rejected": -0.6640016436576843, + "logps/chosen": -52.61515426635742, + "logps/rejected": -129.76034545898438, + "loss": 0.5305, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.027069568634033, + "rewards/margins": 9.7252197265625, + "rewards/rejected": -6.698150634765625, + "step": 16220 + }, + { + "epoch": 4.06, + "grad_norm": 1.753381371498108, + "learning_rate": 8.500079860971755e-07, + "logits/chosen": -0.5229684114456177, + "logits/rejected": -0.6369830965995789, + "logps/chosen": -49.770652770996094, + "logps/rejected": -101.23355102539062, + "loss": 0.4661, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4484119415283203, + "rewards/margins": 6.906692981719971, + "rewards/rejected": -3.4582810401916504, + "step": 16221 + }, + { + "epoch": 4.06, + "grad_norm": 6.958922386169434, + "learning_rate": 8.495696403671938e-07, + "logits/chosen": -0.5823591351509094, + "logits/rejected": -0.6561694145202637, + "logps/chosen": -56.64479446411133, + "logps/rejected": -103.76806640625, + "loss": 0.6819, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1746675968170166, + "rewards/margins": 6.870904922485352, + "rewards/rejected": -3.696237564086914, + "step": 16222 + }, + { + "epoch": 4.06, + "grad_norm": 11.277095794677734, + "learning_rate": 8.491313971987592e-07, + "logits/chosen": -0.5223023295402527, + "logits/rejected": -0.5659079551696777, + "logps/chosen": -63.818870544433594, + "logps/rejected": -127.03793334960938, + "loss": 0.6326, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069179058074951, + "rewards/margins": 7.332009315490723, + "rewards/rejected": -4.262829780578613, + "step": 16223 + }, + { + "epoch": 4.06, + "grad_norm": 3.985513925552368, + "learning_rate": 8.48693256602704e-07, + "logits/chosen": -0.5020087361335754, + "logits/rejected": -0.5484523773193359, + "logps/chosen": -57.61783981323242, + "logps/rejected": -97.5979232788086, + "loss": 0.6447, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.82942533493042, + "rewards/margins": 6.164877891540527, + "rewards/rejected": -3.3354527950286865, + "step": 16224 + }, + { + "epoch": 4.06, + "grad_norm": 7.083352088928223, + "learning_rate": 8.48255218589854e-07, + "logits/chosen": -0.5085792541503906, + "logits/rejected": -0.5657036304473877, + "logps/chosen": -52.043861389160156, + "logps/rejected": -115.19029235839844, + "loss": 0.5281, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.228191375732422, + "rewards/margins": 7.43928337097168, + "rewards/rejected": -4.211091995239258, + "step": 16225 + }, + { + "epoch": 4.06, + "grad_norm": 22.966745376586914, + "learning_rate": 8.478172831710324e-07, + "logits/chosen": -0.6232182383537292, + "logits/rejected": -0.7001884579658508, + "logps/chosen": -47.92771911621094, + "logps/rejected": -96.97886657714844, + "loss": 0.6065, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0650076866149902, + "rewards/margins": 6.575923442840576, + "rewards/rejected": -3.510915994644165, + "step": 16226 + }, + { + "epoch": 4.06, + "grad_norm": 5.189046859741211, + "learning_rate": 8.473794503570632e-07, + "logits/chosen": -0.5089404582977295, + "logits/rejected": -0.5932636857032776, + "logps/chosen": -47.4010009765625, + "logps/rejected": -96.40184783935547, + "loss": 0.6279, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2475574016571045, + "rewards/margins": 7.068775177001953, + "rewards/rejected": -3.8212180137634277, + "step": 16227 + }, + { + "epoch": 4.06, + "grad_norm": 6.117433071136475, + "learning_rate": 8.469417201587648e-07, + "logits/chosen": -0.5500373840332031, + "logits/rejected": -0.636273980140686, + "logps/chosen": -56.96063232421875, + "logps/rejected": -103.54817962646484, + "loss": 0.5791, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205491065979004, + "rewards/margins": 7.073084831237793, + "rewards/rejected": -3.867593765258789, + "step": 16228 + }, + { + "epoch": 4.06, + "grad_norm": 6.970223903656006, + "learning_rate": 8.465040925869527e-07, + "logits/chosen": -0.5171167850494385, + "logits/rejected": -0.5823655724525452, + "logps/chosen": -45.62495422363281, + "logps/rejected": -83.95726776123047, + "loss": 0.6412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.193594217300415, + "rewards/margins": 6.035738945007324, + "rewards/rejected": -2.842144727706909, + "step": 16229 + }, + { + "epoch": 4.06, + "grad_norm": 3.817553997039795, + "learning_rate": 8.460665676524427e-07, + "logits/chosen": -0.5718414783477783, + "logits/rejected": -0.6600696444511414, + "logps/chosen": -55.829463958740234, + "logps/rejected": -88.54154968261719, + "loss": 0.6455, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.227200746536255, + "rewards/margins": 6.113584041595459, + "rewards/rejected": -2.886383295059204, + "step": 16230 + }, + { + "epoch": 4.06, + "grad_norm": 3.0550413131713867, + "learning_rate": 8.456291453660481e-07, + "logits/chosen": -0.516630232334137, + "logits/rejected": -0.6005744934082031, + "logps/chosen": -56.80170440673828, + "logps/rejected": -87.7686996459961, + "loss": 0.641, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194429397583008, + "rewards/margins": 6.625975131988525, + "rewards/rejected": -3.4315452575683594, + "step": 16231 + }, + { + "epoch": 4.06, + "grad_norm": 15.33246898651123, + "learning_rate": 8.451918257385738e-07, + "logits/chosen": -0.536457896232605, + "logits/rejected": -0.5873882174491882, + "logps/chosen": -56.734859466552734, + "logps/rejected": -110.09123992919922, + "loss": 0.7147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1726884841918945, + "rewards/margins": 6.536145210266113, + "rewards/rejected": -3.3634562492370605, + "step": 16232 + }, + { + "epoch": 4.06, + "grad_norm": 3.1873891353607178, + "learning_rate": 8.447546087808284e-07, + "logits/chosen": -0.5278061032295227, + "logits/rejected": -0.6388072967529297, + "logps/chosen": -48.31548309326172, + "logps/rejected": -98.13988494873047, + "loss": 0.5206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5546467304229736, + "rewards/margins": 7.655061721801758, + "rewards/rejected": -4.100414752960205, + "step": 16233 + }, + { + "epoch": 4.06, + "grad_norm": 3.6547112464904785, + "learning_rate": 8.443174945036181e-07, + "logits/chosen": -0.5463132858276367, + "logits/rejected": -0.6090688109397888, + "logps/chosen": -55.45137405395508, + "logps/rejected": -108.01828002929688, + "loss": 0.5288, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0131938457489014, + "rewards/margins": 6.873559951782227, + "rewards/rejected": -3.8603670597076416, + "step": 16234 + }, + { + "epoch": 4.06, + "grad_norm": 4.4621477127075195, + "learning_rate": 8.438804829177427e-07, + "logits/chosen": -0.5499042272567749, + "logits/rejected": -0.6423614621162415, + "logps/chosen": -47.03284454345703, + "logps/rejected": -102.53201293945312, + "loss": 0.6039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1790366172790527, + "rewards/margins": 7.161778450012207, + "rewards/rejected": -3.9827418327331543, + "step": 16235 + }, + { + "epoch": 4.06, + "grad_norm": 2.0562455654144287, + "learning_rate": 8.434435740339997e-07, + "logits/chosen": -0.44186559319496155, + "logits/rejected": -0.5538619160652161, + "logps/chosen": -59.80146026611328, + "logps/rejected": -107.13191223144531, + "loss": 0.5498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7601048946380615, + "rewards/margins": 6.946173667907715, + "rewards/rejected": -4.186069011688232, + "step": 16236 + }, + { + "epoch": 4.06, + "grad_norm": 4.8548736572265625, + "learning_rate": 8.430067678631881e-07, + "logits/chosen": -0.5019059777259827, + "logits/rejected": -0.6211073398590088, + "logps/chosen": -44.330928802490234, + "logps/rejected": -86.76838684082031, + "loss": 0.5973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4898054599761963, + "rewards/margins": 7.360898971557617, + "rewards/rejected": -3.871093273162842, + "step": 16237 + }, + { + "epoch": 4.06, + "grad_norm": 3.3811609745025635, + "learning_rate": 8.425700644161e-07, + "logits/chosen": -0.5804651975631714, + "logits/rejected": -0.6320559978485107, + "logps/chosen": -45.10652160644531, + "logps/rejected": -118.00921630859375, + "loss": 0.5518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.370802402496338, + "rewards/margins": 7.6175031661987305, + "rewards/rejected": -4.246700763702393, + "step": 16238 + }, + { + "epoch": 4.06, + "grad_norm": 3.7988476753234863, + "learning_rate": 8.421334637035288e-07, + "logits/chosen": -0.5796487331390381, + "logits/rejected": -0.6536357998847961, + "logps/chosen": -53.993797302246094, + "logps/rejected": -96.79624938964844, + "loss": 0.5751, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.361645221710205, + "rewards/margins": 6.874246120452881, + "rewards/rejected": -3.512601375579834, + "step": 16239 + }, + { + "epoch": 4.06, + "grad_norm": 3.6516940593719482, + "learning_rate": 8.416969657362622e-07, + "logits/chosen": -0.5217867493629456, + "logits/rejected": -0.6454877853393555, + "logps/chosen": -48.858375549316406, + "logps/rejected": -96.05484008789062, + "loss": 0.5315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.218024730682373, + "rewards/margins": 6.472951889038086, + "rewards/rejected": -3.2549266815185547, + "step": 16240 + }, + { + "epoch": 4.06, + "grad_norm": 2.918118715286255, + "learning_rate": 8.412605705250854e-07, + "logits/chosen": -0.5974417924880981, + "logits/rejected": -0.6713823080062866, + "logps/chosen": -45.403411865234375, + "logps/rejected": -104.33219146728516, + "loss": 0.5581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.441631555557251, + "rewards/margins": 7.075057029724121, + "rewards/rejected": -3.6334259510040283, + "step": 16241 + }, + { + "epoch": 4.06, + "grad_norm": 15.125813484191895, + "learning_rate": 8.408242780807846e-07, + "logits/chosen": -0.5219481587409973, + "logits/rejected": -0.6197916865348816, + "logps/chosen": -55.93239974975586, + "logps/rejected": -93.69245910644531, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.858199119567871, + "rewards/margins": 6.194703102111816, + "rewards/rejected": -3.336503744125366, + "step": 16242 + }, + { + "epoch": 4.06, + "grad_norm": 5.319316864013672, + "learning_rate": 8.403880884141396e-07, + "logits/chosen": -0.5862927436828613, + "logits/rejected": -0.6670116186141968, + "logps/chosen": -62.7923469543457, + "logps/rejected": -107.57524871826172, + "loss": 0.6013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.685513734817505, + "rewards/margins": 7.814915657043457, + "rewards/rejected": -5.129401683807373, + "step": 16243 + }, + { + "epoch": 4.06, + "grad_norm": 9.278560638427734, + "learning_rate": 8.399520015359281e-07, + "logits/chosen": -0.5281199812889099, + "logits/rejected": -0.607545793056488, + "logps/chosen": -53.48486328125, + "logps/rejected": -95.7759780883789, + "loss": 0.6438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.279081344604492, + "rewards/margins": 6.847191333770752, + "rewards/rejected": -3.5681097507476807, + "step": 16244 + }, + { + "epoch": 4.06, + "grad_norm": 3.1086721420288086, + "learning_rate": 8.395160174569272e-07, + "logits/chosen": -0.5494385957717896, + "logits/rejected": -0.6287516951560974, + "logps/chosen": -59.05277633666992, + "logps/rejected": -103.34054565429688, + "loss": 0.6215, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1754150390625, + "rewards/margins": 7.102076530456543, + "rewards/rejected": -3.9266610145568848, + "step": 16245 + }, + { + "epoch": 4.06, + "grad_norm": 4.699803829193115, + "learning_rate": 8.390801361879136e-07, + "logits/chosen": -0.4803731441497803, + "logits/rejected": -0.5388876795768738, + "logps/chosen": -55.64805603027344, + "logps/rejected": -115.75416564941406, + "loss": 0.6933, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1066160202026367, + "rewards/margins": 7.046977519989014, + "rewards/rejected": -3.9403610229492188, + "step": 16246 + }, + { + "epoch": 4.06, + "grad_norm": 4.044780731201172, + "learning_rate": 8.386443577396531e-07, + "logits/chosen": -0.532031774520874, + "logits/rejected": -0.608985424041748, + "logps/chosen": -48.449832916259766, + "logps/rejected": -125.13089752197266, + "loss": 0.5374, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.267298460006714, + "rewards/margins": 9.023053169250488, + "rewards/rejected": -5.755754470825195, + "step": 16247 + }, + { + "epoch": 4.06, + "grad_norm": 12.047228813171387, + "learning_rate": 8.382086821229169e-07, + "logits/chosen": -0.5636824369430542, + "logits/rejected": -0.6053322553634644, + "logps/chosen": -59.2698974609375, + "logps/rejected": -99.63134002685547, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9493393898010254, + "rewards/margins": 5.909557342529297, + "rewards/rejected": -2.9602179527282715, + "step": 16248 + }, + { + "epoch": 4.06, + "grad_norm": 8.307024002075195, + "learning_rate": 8.37773109348472e-07, + "logits/chosen": -0.4211435914039612, + "logits/rejected": -0.5265886783599854, + "logps/chosen": -63.75867462158203, + "logps/rejected": -94.55480194091797, + "loss": 0.6716, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2836241722106934, + "rewards/margins": 6.537132263183594, + "rewards/rejected": -3.2535083293914795, + "step": 16249 + }, + { + "epoch": 4.07, + "grad_norm": 2.813018321990967, + "learning_rate": 8.373376394270804e-07, + "logits/chosen": -0.49295395612716675, + "logits/rejected": -0.5434694886207581, + "logps/chosen": -61.56837463378906, + "logps/rejected": -122.24684143066406, + "loss": 0.5881, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.293715000152588, + "rewards/margins": 7.45212459564209, + "rewards/rejected": -4.158409118652344, + "step": 16250 + }, + { + "epoch": 4.07, + "grad_norm": 5.588411808013916, + "learning_rate": 8.36902272369502e-07, + "logits/chosen": -0.5289714336395264, + "logits/rejected": -0.6159122586250305, + "logps/chosen": -54.707801818847656, + "logps/rejected": -113.07772827148438, + "loss": 0.6349, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4044649600982666, + "rewards/margins": 8.061152458190918, + "rewards/rejected": -4.656688213348389, + "step": 16251 + }, + { + "epoch": 4.07, + "grad_norm": 2.309140682220459, + "learning_rate": 8.364670081864984e-07, + "logits/chosen": -0.4779790937900543, + "logits/rejected": -0.6155292987823486, + "logps/chosen": -60.265892028808594, + "logps/rejected": -114.27543640136719, + "loss": 0.5434, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3990318775177, + "rewards/margins": 8.378456115722656, + "rewards/rejected": -4.979425430297852, + "step": 16252 + }, + { + "epoch": 4.07, + "grad_norm": 6.223079681396484, + "learning_rate": 8.360318468888229e-07, + "logits/chosen": -0.5457521080970764, + "logits/rejected": -0.6019009351730347, + "logps/chosen": -46.14795684814453, + "logps/rejected": -106.9649887084961, + "loss": 0.638, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.454892873764038, + "rewards/margins": 7.132423400878906, + "rewards/rejected": -3.6775307655334473, + "step": 16253 + }, + { + "epoch": 4.07, + "grad_norm": 4.577897071838379, + "learning_rate": 8.355967884872285e-07, + "logits/chosen": -0.5439885854721069, + "logits/rejected": -0.6164270639419556, + "logps/chosen": -54.830440521240234, + "logps/rejected": -117.22686767578125, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.099496364593506, + "rewards/margins": 7.3796234130859375, + "rewards/rejected": -4.280127048492432, + "step": 16254 + }, + { + "epoch": 4.07, + "grad_norm": 3.7349531650543213, + "learning_rate": 8.351618329924682e-07, + "logits/chosen": -0.5462610721588135, + "logits/rejected": -0.5793390274047852, + "logps/chosen": -50.83761215209961, + "logps/rejected": -119.29251098632812, + "loss": 0.5963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0091395378112793, + "rewards/margins": 6.730897903442383, + "rewards/rejected": -3.7217586040496826, + "step": 16255 + }, + { + "epoch": 4.07, + "grad_norm": 1.8749831914901733, + "learning_rate": 8.347269804152891e-07, + "logits/chosen": -0.48506787419319153, + "logits/rejected": -0.6005297303199768, + "logps/chosen": -51.772422790527344, + "logps/rejected": -121.25985717773438, + "loss": 0.5137, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1333577632904053, + "rewards/margins": 9.589210510253906, + "rewards/rejected": -6.4558515548706055, + "step": 16256 + }, + { + "epoch": 4.07, + "grad_norm": 4.476556301116943, + "learning_rate": 8.342922307664353e-07, + "logits/chosen": -0.5296186804771423, + "logits/rejected": -0.616711437702179, + "logps/chosen": -56.616416931152344, + "logps/rejected": -113.22592163085938, + "loss": 0.6733, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0941343307495117, + "rewards/margins": 7.568578720092773, + "rewards/rejected": -4.474444389343262, + "step": 16257 + }, + { + "epoch": 4.07, + "grad_norm": 6.429366111755371, + "learning_rate": 8.338575840566526e-07, + "logits/chosen": -0.5828313231468201, + "logits/rejected": -0.6214402914047241, + "logps/chosen": -53.283912658691406, + "logps/rejected": -126.86943817138672, + "loss": 0.5971, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273069381713867, + "rewards/margins": 8.278353691101074, + "rewards/rejected": -5.005283832550049, + "step": 16258 + }, + { + "epoch": 4.07, + "grad_norm": 4.42492151260376, + "learning_rate": 8.334230402966787e-07, + "logits/chosen": -0.5341739654541016, + "logits/rejected": -0.6194273233413696, + "logps/chosen": -56.28057098388672, + "logps/rejected": -118.11658477783203, + "loss": 0.6383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1524925231933594, + "rewards/margins": 7.934341907501221, + "rewards/rejected": -4.781849384307861, + "step": 16259 + }, + { + "epoch": 4.07, + "grad_norm": 3.2136199474334717, + "learning_rate": 8.329885994972548e-07, + "logits/chosen": -0.4617832601070404, + "logits/rejected": -0.5845581293106079, + "logps/chosen": -53.68843078613281, + "logps/rejected": -95.61963653564453, + "loss": 0.4698, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3032922744750977, + "rewards/margins": 7.32113790512085, + "rewards/rejected": -4.017845153808594, + "step": 16260 + }, + { + "epoch": 4.07, + "grad_norm": 8.534187316894531, + "learning_rate": 8.325542616691134e-07, + "logits/chosen": -0.5112664103507996, + "logits/rejected": -0.5858632922172546, + "logps/chosen": -50.327667236328125, + "logps/rejected": -103.6183853149414, + "loss": 0.5609, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2839906215667725, + "rewards/margins": 7.131435871124268, + "rewards/rejected": -3.847444772720337, + "step": 16261 + }, + { + "epoch": 4.07, + "grad_norm": 5.560378074645996, + "learning_rate": 8.321200268229906e-07, + "logits/chosen": -0.5297357439994812, + "logits/rejected": -0.5766346454620361, + "logps/chosen": -54.44588088989258, + "logps/rejected": -123.3665542602539, + "loss": 0.577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8773834705352783, + "rewards/margins": 6.913600444793701, + "rewards/rejected": -4.0362162590026855, + "step": 16262 + }, + { + "epoch": 4.07, + "grad_norm": 2.3983957767486572, + "learning_rate": 8.316858949696133e-07, + "logits/chosen": -0.5561642646789551, + "logits/rejected": -0.6588031053543091, + "logps/chosen": -55.41032791137695, + "logps/rejected": -95.09521484375, + "loss": 0.5383, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140124797821045, + "rewards/margins": 7.552203178405762, + "rewards/rejected": -4.412079334259033, + "step": 16263 + }, + { + "epoch": 4.07, + "grad_norm": 8.458516120910645, + "learning_rate": 8.312518661197127e-07, + "logits/chosen": -0.4722999036312103, + "logits/rejected": -0.5911852121353149, + "logps/chosen": -54.16571044921875, + "logps/rejected": -99.88131713867188, + "loss": 0.617, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.206345796585083, + "rewards/margins": 7.486027717590332, + "rewards/rejected": -4.27968168258667, + "step": 16264 + }, + { + "epoch": 4.07, + "grad_norm": 3.61120867729187, + "learning_rate": 8.308179402840127e-07, + "logits/chosen": -0.6317411661148071, + "logits/rejected": -0.6878406405448914, + "logps/chosen": -49.73234558105469, + "logps/rejected": -110.12361145019531, + "loss": 0.624, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6553871631622314, + "rewards/margins": 6.9763641357421875, + "rewards/rejected": -4.320977210998535, + "step": 16265 + }, + { + "epoch": 4.07, + "grad_norm": 3.851287364959717, + "learning_rate": 8.303841174732347e-07, + "logits/chosen": -0.5221136212348938, + "logits/rejected": -0.6634188294410706, + "logps/chosen": -58.09040451049805, + "logps/rejected": -84.7270278930664, + "loss": 0.5966, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049207925796509, + "rewards/margins": 6.781536102294922, + "rewards/rejected": -3.732327938079834, + "step": 16266 + }, + { + "epoch": 4.07, + "grad_norm": 7.297608852386475, + "learning_rate": 8.29950397698101e-07, + "logits/chosen": -0.5961733460426331, + "logits/rejected": -0.6761115193367004, + "logps/chosen": -45.68564224243164, + "logps/rejected": -88.71416473388672, + "loss": 0.5752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3127846717834473, + "rewards/margins": 6.743683338165283, + "rewards/rejected": -3.4308993816375732, + "step": 16267 + }, + { + "epoch": 4.07, + "grad_norm": 3.643094301223755, + "learning_rate": 8.295167809693284e-07, + "logits/chosen": -0.5210643410682678, + "logits/rejected": -0.6082280874252319, + "logps/chosen": -48.64744567871094, + "logps/rejected": -107.06840515136719, + "loss": 0.5945, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.340397596359253, + "rewards/margins": 7.562251091003418, + "rewards/rejected": -4.221853733062744, + "step": 16268 + }, + { + "epoch": 4.07, + "grad_norm": 7.3856425285339355, + "learning_rate": 8.290832672976312e-07, + "logits/chosen": -0.5799490213394165, + "logits/rejected": -0.7284429669380188, + "logps/chosen": -61.90658187866211, + "logps/rejected": -90.4543228149414, + "loss": 0.7355, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.8412628173828125, + "rewards/margins": 5.3266448974609375, + "rewards/rejected": -2.485382080078125, + "step": 16269 + }, + { + "epoch": 4.07, + "grad_norm": 4.167906284332275, + "learning_rate": 8.28649856693724e-07, + "logits/chosen": -0.4991346597671509, + "logits/rejected": -0.5991393327713013, + "logps/chosen": -47.401702880859375, + "logps/rejected": -103.11609649658203, + "loss": 0.6119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0159201622009277, + "rewards/margins": 7.2945098876953125, + "rewards/rejected": -4.278590202331543, + "step": 16270 + }, + { + "epoch": 4.07, + "grad_norm": 2.329951763153076, + "learning_rate": 8.282165491683159e-07, + "logits/chosen": -0.5797164440155029, + "logits/rejected": -0.6813887357711792, + "logps/chosen": -46.77145004272461, + "logps/rejected": -134.24411010742188, + "loss": 0.5262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2848687171936035, + "rewards/margins": 10.047996520996094, + "rewards/rejected": -6.763127326965332, + "step": 16271 + }, + { + "epoch": 4.07, + "grad_norm": 10.152003288269043, + "learning_rate": 8.277833447321127e-07, + "logits/chosen": -0.5865310430526733, + "logits/rejected": -0.6503956913948059, + "logps/chosen": -52.875160217285156, + "logps/rejected": -94.13623046875, + "loss": 0.6262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8156113624572754, + "rewards/margins": 6.115114212036133, + "rewards/rejected": -3.2995028495788574, + "step": 16272 + }, + { + "epoch": 4.07, + "grad_norm": 4.615334987640381, + "learning_rate": 8.273502433958202e-07, + "logits/chosen": -0.5686166286468506, + "logits/rejected": -0.6555154919624329, + "logps/chosen": -54.15420150756836, + "logps/rejected": -94.10476684570312, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1750905513763428, + "rewards/margins": 6.938083648681641, + "rewards/rejected": -3.7629923820495605, + "step": 16273 + }, + { + "epoch": 4.07, + "grad_norm": 2.7025399208068848, + "learning_rate": 8.269172451701446e-07, + "logits/chosen": -0.5542334318161011, + "logits/rejected": -0.6007077693939209, + "logps/chosen": -49.17533874511719, + "logps/rejected": -111.13314819335938, + "loss": 0.5688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.069737672805786, + "rewards/margins": 7.9336256980896, + "rewards/rejected": -4.863887786865234, + "step": 16274 + }, + { + "epoch": 4.07, + "grad_norm": 15.422468185424805, + "learning_rate": 8.264843500657799e-07, + "logits/chosen": -0.5039215087890625, + "logits/rejected": -0.5688145756721497, + "logps/chosen": -75.01788330078125, + "logps/rejected": -109.77558135986328, + "loss": 0.677, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.952056884765625, + "rewards/margins": 6.946830749511719, + "rewards/rejected": -3.994774103164673, + "step": 16275 + }, + { + "epoch": 4.07, + "grad_norm": 4.225553035736084, + "learning_rate": 8.260515580934264e-07, + "logits/chosen": -0.5601211786270142, + "logits/rejected": -0.6380945444107056, + "logps/chosen": -46.981719970703125, + "logps/rejected": -128.26876831054688, + "loss": 0.5419, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.773519992828369, + "rewards/margins": 8.378296852111816, + "rewards/rejected": -5.6047773361206055, + "step": 16276 + }, + { + "epoch": 4.07, + "grad_norm": 3.284186601638794, + "learning_rate": 8.256188692637795e-07, + "logits/chosen": -0.48488521575927734, + "logits/rejected": -0.5607600212097168, + "logps/chosen": -67.28341674804688, + "logps/rejected": -103.272705078125, + "loss": 0.667, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0381762981414795, + "rewards/margins": 6.150908946990967, + "rewards/rejected": -3.1127331256866455, + "step": 16277 + }, + { + "epoch": 4.07, + "grad_norm": 2.2002251148223877, + "learning_rate": 8.251862835875302e-07, + "logits/chosen": -0.5707252025604248, + "logits/rejected": -0.6576611995697021, + "logps/chosen": -57.704872131347656, + "logps/rejected": -106.25250244140625, + "loss": 0.5995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2944083213806152, + "rewards/margins": 7.654798984527588, + "rewards/rejected": -4.360391139984131, + "step": 16278 + }, + { + "epoch": 4.07, + "grad_norm": 4.851527214050293, + "learning_rate": 8.247538010753676e-07, + "logits/chosen": -0.5909814238548279, + "logits/rejected": -0.6222715973854065, + "logps/chosen": -53.63351058959961, + "logps/rejected": -123.89197540283203, + "loss": 0.7017, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.812251329421997, + "rewards/margins": 6.574578285217285, + "rewards/rejected": -3.76232647895813, + "step": 16279 + }, + { + "epoch": 4.07, + "grad_norm": 3.6132712364196777, + "learning_rate": 8.243214217379803e-07, + "logits/chosen": -0.5890355110168457, + "logits/rejected": -0.6238098740577698, + "logps/chosen": -48.70918273925781, + "logps/rejected": -121.55862426757812, + "loss": 0.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4822092056274414, + "rewards/margins": 7.023480415344238, + "rewards/rejected": -3.5412707328796387, + "step": 16280 + }, + { + "epoch": 4.07, + "grad_norm": 4.000880718231201, + "learning_rate": 8.23889145586052e-07, + "logits/chosen": -0.555300772190094, + "logits/rejected": -0.6287586688995361, + "logps/chosen": -50.25565719604492, + "logps/rejected": -118.84660339355469, + "loss": 0.5412, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0755648612976074, + "rewards/margins": 7.903116226196289, + "rewards/rejected": -4.827552318572998, + "step": 16281 + }, + { + "epoch": 4.07, + "grad_norm": 5.6016411781311035, + "learning_rate": 8.234569726302643e-07, + "logits/chosen": -0.5457034111022949, + "logits/rejected": -0.6178139448165894, + "logps/chosen": -52.90846633911133, + "logps/rejected": -106.54744720458984, + "loss": 0.5326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.259754180908203, + "rewards/margins": 6.046918869018555, + "rewards/rejected": -2.7871649265289307, + "step": 16282 + }, + { + "epoch": 4.07, + "grad_norm": 5.639341831207275, + "learning_rate": 8.230249028812981e-07, + "logits/chosen": -0.5357394218444824, + "logits/rejected": -0.5978562235832214, + "logps/chosen": -51.18097686767578, + "logps/rejected": -96.50183868408203, + "loss": 0.5989, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9650893211364746, + "rewards/margins": 6.519231796264648, + "rewards/rejected": -3.5541419982910156, + "step": 16283 + }, + { + "epoch": 4.07, + "grad_norm": 11.448062896728516, + "learning_rate": 8.225929363498281e-07, + "logits/chosen": -0.5128471851348877, + "logits/rejected": -0.5874419212341309, + "logps/chosen": -72.99800872802734, + "logps/rejected": -99.740966796875, + "loss": 0.6747, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0202252864837646, + "rewards/margins": 5.96112060546875, + "rewards/rejected": -2.9408950805664062, + "step": 16284 + }, + { + "epoch": 4.07, + "grad_norm": 3.550708770751953, + "learning_rate": 8.221610730465307e-07, + "logits/chosen": -0.5824727416038513, + "logits/rejected": -0.670280933380127, + "logps/chosen": -58.419158935546875, + "logps/rejected": -116.78264617919922, + "loss": 0.6679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0735902786254883, + "rewards/margins": 7.367657661437988, + "rewards/rejected": -4.2940673828125, + "step": 16285 + }, + { + "epoch": 4.07, + "grad_norm": 6.701996326446533, + "learning_rate": 8.217293129820775e-07, + "logits/chosen": -0.519288182258606, + "logits/rejected": -0.6021228432655334, + "logps/chosen": -58.87263488769531, + "logps/rejected": -92.07134246826172, + "loss": 0.6622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7547049522399902, + "rewards/margins": 6.315239906311035, + "rewards/rejected": -3.560535192489624, + "step": 16286 + }, + { + "epoch": 4.07, + "grad_norm": 17.001752853393555, + "learning_rate": 8.21297656167136e-07, + "logits/chosen": -0.5154127478599548, + "logits/rejected": -0.5952287912368774, + "logps/chosen": -49.3531494140625, + "logps/rejected": -107.82234954833984, + "loss": 0.5189, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.313652992248535, + "rewards/margins": 7.278507709503174, + "rewards/rejected": -3.9648547172546387, + "step": 16287 + }, + { + "epoch": 4.07, + "grad_norm": 4.781513214111328, + "learning_rate": 8.208661026123749e-07, + "logits/chosen": -0.4972290098667145, + "logits/rejected": -0.6003082394599915, + "logps/chosen": -55.81636428833008, + "logps/rejected": -104.22174072265625, + "loss": 0.5568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.445181369781494, + "rewards/margins": 7.394246578216553, + "rewards/rejected": -3.9490652084350586, + "step": 16288 + }, + { + "epoch": 4.07, + "grad_norm": 6.095459938049316, + "learning_rate": 8.204346523284579e-07, + "logits/chosen": -0.5161163806915283, + "logits/rejected": -0.572853684425354, + "logps/chosen": -59.58758544921875, + "logps/rejected": -126.85420227050781, + "loss": 0.6386, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7041635513305664, + "rewards/margins": 7.247431755065918, + "rewards/rejected": -4.543267726898193, + "step": 16289 + }, + { + "epoch": 4.08, + "grad_norm": 15.310190200805664, + "learning_rate": 8.200033053260448e-07, + "logits/chosen": -0.600512683391571, + "logits/rejected": -0.6997494697570801, + "logps/chosen": -47.84580993652344, + "logps/rejected": -93.32827758789062, + "loss": 0.6196, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3472118377685547, + "rewards/margins": 6.194545745849609, + "rewards/rejected": -2.847334384918213, + "step": 16290 + }, + { + "epoch": 4.08, + "grad_norm": 5.354835033416748, + "learning_rate": 8.19572061615796e-07, + "logits/chosen": -0.5209407806396484, + "logits/rejected": -0.6244929432868958, + "logps/chosen": -56.520328521728516, + "logps/rejected": -130.77076721191406, + "loss": 0.5658, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.054854393005371, + "rewards/margins": 8.77325439453125, + "rewards/rejected": -5.718400001525879, + "step": 16291 + }, + { + "epoch": 4.08, + "grad_norm": 7.025359630584717, + "learning_rate": 8.191409212083707e-07, + "logits/chosen": -0.5855309367179871, + "logits/rejected": -0.6626849174499512, + "logps/chosen": -54.041343688964844, + "logps/rejected": -85.98739624023438, + "loss": 0.6524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9749302864074707, + "rewards/margins": 6.497799396514893, + "rewards/rejected": -3.522869110107422, + "step": 16292 + }, + { + "epoch": 4.08, + "grad_norm": 3.546461343765259, + "learning_rate": 8.187098841144181e-07, + "logits/chosen": -0.5601789355278015, + "logits/rejected": -0.5969516038894653, + "logps/chosen": -53.53397750854492, + "logps/rejected": -137.62997436523438, + "loss": 0.5614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.456465721130371, + "rewards/margins": 8.917329788208008, + "rewards/rejected": -5.460864067077637, + "step": 16293 + }, + { + "epoch": 4.08, + "grad_norm": 5.140791893005371, + "learning_rate": 8.182789503445909e-07, + "logits/chosen": -0.5691256523132324, + "logits/rejected": -0.6674679517745972, + "logps/chosen": -51.65849304199219, + "logps/rejected": -94.88632202148438, + "loss": 0.6405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.822115421295166, + "rewards/margins": 6.145082950592041, + "rewards/rejected": -3.322968006134033, + "step": 16294 + }, + { + "epoch": 4.08, + "grad_norm": 6.172056674957275, + "learning_rate": 8.178481199095406e-07, + "logits/chosen": -0.5703576803207397, + "logits/rejected": -0.6524119973182678, + "logps/chosen": -61.27188491821289, + "logps/rejected": -98.48075866699219, + "loss": 0.6129, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.320786476135254, + "rewards/margins": 6.86848258972168, + "rewards/rejected": -3.547696352005005, + "step": 16295 + }, + { + "epoch": 4.08, + "grad_norm": 1.7786775827407837, + "learning_rate": 8.174173928199108e-07, + "logits/chosen": -0.5036846399307251, + "logits/rejected": -0.5640230774879456, + "logps/chosen": -47.1553840637207, + "logps/rejected": -109.67916870117188, + "loss": 0.526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2061946392059326, + "rewards/margins": 7.665907382965088, + "rewards/rejected": -4.459712982177734, + "step": 16296 + }, + { + "epoch": 4.08, + "grad_norm": 6.284891128540039, + "learning_rate": 8.169867690863453e-07, + "logits/chosen": -0.5704074501991272, + "logits/rejected": -0.6643451452255249, + "logps/chosen": -52.15437316894531, + "logps/rejected": -94.42721557617188, + "loss": 0.6179, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3035190105438232, + "rewards/margins": 6.254423141479492, + "rewards/rejected": -2.950904130935669, + "step": 16297 + }, + { + "epoch": 4.08, + "grad_norm": 7.928180694580078, + "learning_rate": 8.165562487194872e-07, + "logits/chosen": -0.5962037444114685, + "logits/rejected": -0.6269968748092651, + "logps/chosen": -55.294315338134766, + "logps/rejected": -114.0795669555664, + "loss": 0.6097, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.211174488067627, + "rewards/margins": 6.891043663024902, + "rewards/rejected": -3.6798689365386963, + "step": 16298 + }, + { + "epoch": 4.08, + "grad_norm": 8.80554485321045, + "learning_rate": 8.161258317299742e-07, + "logits/chosen": -0.5068051815032959, + "logits/rejected": -0.5989450812339783, + "logps/chosen": -69.45134735107422, + "logps/rejected": -111.85262298583984, + "loss": 0.7719, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.883354663848877, + "rewards/margins": 6.2839226722717285, + "rewards/rejected": -3.4005675315856934, + "step": 16299 + }, + { + "epoch": 4.08, + "grad_norm": 6.131483554840088, + "learning_rate": 8.156955181284404e-07, + "logits/chosen": -0.534824788570404, + "logits/rejected": -0.5767168402671814, + "logps/chosen": -66.04996490478516, + "logps/rejected": -115.3890380859375, + "loss": 0.6768, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3510048389434814, + "rewards/margins": 6.823063373565674, + "rewards/rejected": -3.4720585346221924, + "step": 16300 + }, + { + "epoch": 4.08, + "grad_norm": 7.832431793212891, + "learning_rate": 8.152653079255224e-07, + "logits/chosen": -0.5188426375389099, + "logits/rejected": -0.5686826705932617, + "logps/chosen": -59.115638732910156, + "logps/rejected": -122.38438415527344, + "loss": 0.6378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.86513090133667, + "rewards/margins": 7.580848217010498, + "rewards/rejected": -4.715717315673828, + "step": 16301 + }, + { + "epoch": 4.08, + "grad_norm": 5.481777191162109, + "learning_rate": 8.148352011318483e-07, + "logits/chosen": -0.5731267929077148, + "logits/rejected": -0.6182616949081421, + "logps/chosen": -60.665428161621094, + "logps/rejected": -117.15885925292969, + "loss": 0.6757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.329242467880249, + "rewards/margins": 7.613170146942139, + "rewards/rejected": -4.283926963806152, + "step": 16302 + }, + { + "epoch": 4.08, + "grad_norm": 4.880429267883301, + "learning_rate": 8.144051977580497e-07, + "logits/chosen": -0.5237336158752441, + "logits/rejected": -0.6169342994689941, + "logps/chosen": -52.8473014831543, + "logps/rejected": -84.83111572265625, + "loss": 0.6417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.038059711456299, + "rewards/margins": 6.058292865753174, + "rewards/rejected": -3.020233154296875, + "step": 16303 + }, + { + "epoch": 4.08, + "grad_norm": 5.834183692932129, + "learning_rate": 8.139752978147486e-07, + "logits/chosen": -0.5500962734222412, + "logits/rejected": -0.6226038336753845, + "logps/chosen": -48.34605407714844, + "logps/rejected": -92.5811767578125, + "loss": 0.6181, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.964442253112793, + "rewards/margins": 6.3467535972595215, + "rewards/rejected": -3.3823113441467285, + "step": 16304 + }, + { + "epoch": 4.08, + "grad_norm": 7.632512092590332, + "learning_rate": 8.13545501312572e-07, + "logits/chosen": -0.5302804708480835, + "logits/rejected": -0.6272910833358765, + "logps/chosen": -55.384117126464844, + "logps/rejected": -102.65155029296875, + "loss": 0.644, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7400851249694824, + "rewards/margins": 7.629837512969971, + "rewards/rejected": -4.88975191116333, + "step": 16305 + }, + { + "epoch": 4.08, + "grad_norm": 2.3912463188171387, + "learning_rate": 8.131158082621393e-07, + "logits/chosen": -0.5555877089500427, + "logits/rejected": -0.6114748120307922, + "logps/chosen": -52.45780944824219, + "logps/rejected": -132.67640686035156, + "loss": 0.4746, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.267390727996826, + "rewards/margins": 7.669018745422363, + "rewards/rejected": -4.401628017425537, + "step": 16306 + }, + { + "epoch": 4.08, + "grad_norm": 3.619041919708252, + "learning_rate": 8.126862186740664e-07, + "logits/chosen": -0.5768600106239319, + "logits/rejected": -0.6643254160881042, + "logps/chosen": -52.132686614990234, + "logps/rejected": -93.26371765136719, + "loss": 0.5761, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4318747520446777, + "rewards/margins": 5.825674057006836, + "rewards/rejected": -2.393799066543579, + "step": 16307 + }, + { + "epoch": 4.08, + "grad_norm": 4.256883144378662, + "learning_rate": 8.122567325589725e-07, + "logits/chosen": -0.5767205953598022, + "logits/rejected": -0.6424570083618164, + "logps/chosen": -43.580711364746094, + "logps/rejected": -99.14562225341797, + "loss": 0.5002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.320988178253174, + "rewards/margins": 7.209158420562744, + "rewards/rejected": -3.8881702423095703, + "step": 16308 + }, + { + "epoch": 4.08, + "grad_norm": 3.6504108905792236, + "learning_rate": 8.118273499274682e-07, + "logits/chosen": -0.6566832065582275, + "logits/rejected": -0.7389875650405884, + "logps/chosen": -35.2523193359375, + "logps/rejected": -96.9307632446289, + "loss": 0.5118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0899596214294434, + "rewards/margins": 7.813403129577637, + "rewards/rejected": -4.723443031311035, + "step": 16309 + }, + { + "epoch": 4.08, + "grad_norm": 6.251439571380615, + "learning_rate": 8.113980707901653e-07, + "logits/chosen": -0.5199366807937622, + "logits/rejected": -0.5740048289299011, + "logps/chosen": -64.37274169921875, + "logps/rejected": -115.20164489746094, + "loss": 0.7007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.322774887084961, + "rewards/margins": 7.132607460021973, + "rewards/rejected": -3.8098323345184326, + "step": 16310 + }, + { + "epoch": 4.08, + "grad_norm": 8.027545928955078, + "learning_rate": 8.109688951576717e-07, + "logits/chosen": -0.5914281010627747, + "logits/rejected": -0.7042843103408813, + "logps/chosen": -63.038021087646484, + "logps/rejected": -93.75111389160156, + "loss": 0.7715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6966962814331055, + "rewards/margins": 6.604236125946045, + "rewards/rejected": -3.9075393676757812, + "step": 16311 + }, + { + "epoch": 4.08, + "grad_norm": 7.597062587738037, + "learning_rate": 8.105398230405908e-07, + "logits/chosen": -0.52210932970047, + "logits/rejected": -0.6368563175201416, + "logps/chosen": -70.49462127685547, + "logps/rejected": -118.98951721191406, + "loss": 0.7187, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.609330654144287, + "rewards/margins": 7.746062278747559, + "rewards/rejected": -5.13673210144043, + "step": 16312 + }, + { + "epoch": 4.08, + "grad_norm": 3.210033893585205, + "learning_rate": 8.101108544495284e-07, + "logits/chosen": -0.5268339514732361, + "logits/rejected": -0.5725179314613342, + "logps/chosen": -53.53933334350586, + "logps/rejected": -113.7930908203125, + "loss": 0.5783, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.097537040710449, + "rewards/margins": 6.397750377655029, + "rewards/rejected": -3.3002140522003174, + "step": 16313 + }, + { + "epoch": 4.08, + "grad_norm": 6.37912654876709, + "learning_rate": 8.096819893950835e-07, + "logits/chosen": -0.62445068359375, + "logits/rejected": -0.7516932487487793, + "logps/chosen": -59.30239486694336, + "logps/rejected": -95.95661163330078, + "loss": 0.5918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1296041011810303, + "rewards/margins": 7.453665256500244, + "rewards/rejected": -4.324060916900635, + "step": 16314 + }, + { + "epoch": 4.08, + "grad_norm": 5.6858744621276855, + "learning_rate": 8.092532278878523e-07, + "logits/chosen": -0.5241798758506775, + "logits/rejected": -0.6105495095252991, + "logps/chosen": -59.76333999633789, + "logps/rejected": -98.40235137939453, + "loss": 0.7101, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.010043144226074, + "rewards/margins": 6.211556434631348, + "rewards/rejected": -3.2015128135681152, + "step": 16315 + }, + { + "epoch": 4.08, + "grad_norm": 1.946524739265442, + "learning_rate": 8.088245699384312e-07, + "logits/chosen": -0.517512321472168, + "logits/rejected": -0.573201060295105, + "logps/chosen": -45.805397033691406, + "logps/rejected": -122.41634368896484, + "loss": 0.4999, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2650411128997803, + "rewards/margins": 8.53287124633789, + "rewards/rejected": -5.267829895019531, + "step": 16316 + }, + { + "epoch": 4.08, + "grad_norm": 10.020853042602539, + "learning_rate": 8.083960155574156e-07, + "logits/chosen": -0.5302018523216248, + "logits/rejected": -0.5921480059623718, + "logps/chosen": -49.58005142211914, + "logps/rejected": -112.0603256225586, + "loss": 0.5689, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2888689041137695, + "rewards/margins": 7.75759220123291, + "rewards/rejected": -4.468722820281982, + "step": 16317 + }, + { + "epoch": 4.08, + "grad_norm": 9.511678695678711, + "learning_rate": 8.079675647553902e-07, + "logits/chosen": -0.47747886180877686, + "logits/rejected": -0.5618881583213806, + "logps/chosen": -57.43031311035156, + "logps/rejected": -96.17002868652344, + "loss": 0.7073, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0629942417144775, + "rewards/margins": 6.407320976257324, + "rewards/rejected": -3.344326972961426, + "step": 16318 + }, + { + "epoch": 4.08, + "grad_norm": 2.67851185798645, + "learning_rate": 8.075392175429453e-07, + "logits/chosen": -0.6085421442985535, + "logits/rejected": -0.6878663301467896, + "logps/chosen": -53.870643615722656, + "logps/rejected": -97.91543579101562, + "loss": 0.619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.163163423538208, + "rewards/margins": 6.126107692718506, + "rewards/rejected": -2.9629437923431396, + "step": 16319 + }, + { + "epoch": 4.08, + "grad_norm": 4.9281487464904785, + "learning_rate": 8.071109739306665e-07, + "logits/chosen": -0.5994893312454224, + "logits/rejected": -0.6729527115821838, + "logps/chosen": -54.206764221191406, + "logps/rejected": -123.65911865234375, + "loss": 0.5519, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.247751235961914, + "rewards/margins": 8.612468719482422, + "rewards/rejected": -5.364718437194824, + "step": 16320 + }, + { + "epoch": 4.08, + "grad_norm": 6.130060195922852, + "learning_rate": 8.066828339291355e-07, + "logits/chosen": -0.5991693735122681, + "logits/rejected": -0.6901863813400269, + "logps/chosen": -64.00057983398438, + "logps/rejected": -98.18022918701172, + "loss": 0.6569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2146339416503906, + "rewards/margins": 7.072469711303711, + "rewards/rejected": -3.857835292816162, + "step": 16321 + }, + { + "epoch": 4.08, + "grad_norm": 4.739243507385254, + "learning_rate": 8.06254797548931e-07, + "logits/chosen": -0.5094935894012451, + "logits/rejected": -0.6366275548934937, + "logps/chosen": -69.36322784423828, + "logps/rejected": -105.93689727783203, + "loss": 0.6364, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9153382778167725, + "rewards/margins": 6.823403358459473, + "rewards/rejected": -3.9080653190612793, + "step": 16322 + }, + { + "epoch": 4.08, + "grad_norm": 7.193336486816406, + "learning_rate": 8.058268648006318e-07, + "logits/chosen": -0.5532740950584412, + "logits/rejected": -0.6397367119789124, + "logps/chosen": -53.49073791503906, + "logps/rejected": -98.1076889038086, + "loss": 0.6993, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.109097957611084, + "rewards/margins": 5.957972526550293, + "rewards/rejected": -2.848874807357788, + "step": 16323 + }, + { + "epoch": 4.08, + "grad_norm": 5.72605562210083, + "learning_rate": 8.053990356948122e-07, + "logits/chosen": -0.47323381900787354, + "logits/rejected": -0.5222790837287903, + "logps/chosen": -53.506591796875, + "logps/rejected": -96.29112243652344, + "loss": 0.6854, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0605812072753906, + "rewards/margins": 5.83703088760376, + "rewards/rejected": -2.776449680328369, + "step": 16324 + }, + { + "epoch": 4.08, + "grad_norm": 2.6149168014526367, + "learning_rate": 8.049713102420426e-07, + "logits/chosen": -0.6094913482666016, + "logits/rejected": -0.6898950338363647, + "logps/chosen": -59.315711975097656, + "logps/rejected": -101.5992202758789, + "loss": 0.611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.192396402359009, + "rewards/margins": 7.929120063781738, + "rewards/rejected": -4.73672342300415, + "step": 16325 + }, + { + "epoch": 4.08, + "grad_norm": 2.873042345046997, + "learning_rate": 8.045436884528957e-07, + "logits/chosen": -0.6319377422332764, + "logits/rejected": -0.7342810034751892, + "logps/chosen": -54.901676177978516, + "logps/rejected": -113.02902221679688, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.107494592666626, + "rewards/margins": 8.42846965789795, + "rewards/rejected": -5.320974349975586, + "step": 16326 + }, + { + "epoch": 4.08, + "grad_norm": 3.3170855045318604, + "learning_rate": 8.041161703379352e-07, + "logits/chosen": -0.573565661907196, + "logits/rejected": -0.616269052028656, + "logps/chosen": -49.15000915527344, + "logps/rejected": -102.01581573486328, + "loss": 0.6779, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0542900562286377, + "rewards/margins": 7.2686686515808105, + "rewards/rejected": -4.214378833770752, + "step": 16327 + }, + { + "epoch": 4.08, + "grad_norm": 4.172554969787598, + "learning_rate": 8.036887559077277e-07, + "logits/chosen": -0.5448160171508789, + "logits/rejected": -0.6231675744056702, + "logps/chosen": -61.272804260253906, + "logps/rejected": -110.8750991821289, + "loss": 0.5984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1647226810455322, + "rewards/margins": 6.956465721130371, + "rewards/rejected": -3.791743278503418, + "step": 16328 + }, + { + "epoch": 4.08, + "grad_norm": 7.321341514587402, + "learning_rate": 8.032614451728355e-07, + "logits/chosen": -0.5213841795921326, + "logits/rejected": -0.59135901927948, + "logps/chosen": -67.23004913330078, + "logps/rejected": -108.94961547851562, + "loss": 0.6945, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.905703067779541, + "rewards/margins": 6.786205291748047, + "rewards/rejected": -3.880502223968506, + "step": 16329 + }, + { + "epoch": 4.09, + "grad_norm": 13.700316429138184, + "learning_rate": 8.028342381438148e-07, + "logits/chosen": -0.5674774050712585, + "logits/rejected": -0.6729205250740051, + "logps/chosen": -58.4015007019043, + "logps/rejected": -94.6329116821289, + "loss": 0.7891, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0659000873565674, + "rewards/margins": 6.5042829513549805, + "rewards/rejected": -3.438383102416992, + "step": 16330 + }, + { + "epoch": 4.09, + "grad_norm": 5.099987506866455, + "learning_rate": 8.024071348312257e-07, + "logits/chosen": -0.5114718675613403, + "logits/rejected": -0.6177601218223572, + "logps/chosen": -59.145137786865234, + "logps/rejected": -92.78660583496094, + "loss": 0.6423, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9442758560180664, + "rewards/margins": 5.111150741577148, + "rewards/rejected": -2.166874408721924, + "step": 16331 + }, + { + "epoch": 4.09, + "grad_norm": 10.69154167175293, + "learning_rate": 8.019801352456214e-07, + "logits/chosen": -0.6473848819732666, + "logits/rejected": -0.7403299808502197, + "logps/chosen": -57.342742919921875, + "logps/rejected": -107.8270492553711, + "loss": 0.6707, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.212714433670044, + "rewards/margins": 7.796121120452881, + "rewards/rejected": -4.583407402038574, + "step": 16332 + }, + { + "epoch": 4.09, + "grad_norm": 3.4000210762023926, + "learning_rate": 8.015532393975517e-07, + "logits/chosen": -0.4862878620624542, + "logits/rejected": -0.5977378487586975, + "logps/chosen": -55.576416015625, + "logps/rejected": -106.12117767333984, + "loss": 0.5226, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1651804447174072, + "rewards/margins": 7.5991997718811035, + "rewards/rejected": -4.434019088745117, + "step": 16333 + }, + { + "epoch": 4.09, + "grad_norm": 6.1623759269714355, + "learning_rate": 8.011264472975672e-07, + "logits/chosen": -0.4733213484287262, + "logits/rejected": -0.5748059153556824, + "logps/chosen": -59.54792022705078, + "logps/rejected": -108.97147369384766, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.899562120437622, + "rewards/margins": 7.313815116882324, + "rewards/rejected": -4.414252758026123, + "step": 16334 + }, + { + "epoch": 4.09, + "grad_norm": 2.9795479774475098, + "learning_rate": 8.006997589562165e-07, + "logits/chosen": -0.5084401965141296, + "logits/rejected": -0.6465629935264587, + "logps/chosen": -60.12981414794922, + "logps/rejected": -93.90545654296875, + "loss": 0.5961, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.77119779586792, + "rewards/margins": 7.508817195892334, + "rewards/rejected": -4.737619400024414, + "step": 16335 + }, + { + "epoch": 4.09, + "grad_norm": 5.691465377807617, + "learning_rate": 8.002731743840392e-07, + "logits/chosen": -0.4886443614959717, + "logits/rejected": -0.5694015026092529, + "logps/chosen": -50.17897415161133, + "logps/rejected": -123.20039367675781, + "loss": 0.5231, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188521146774292, + "rewards/margins": 8.330684661865234, + "rewards/rejected": -5.142163276672363, + "step": 16336 + }, + { + "epoch": 4.09, + "grad_norm": 5.377397537231445, + "learning_rate": 7.998466935915788e-07, + "logits/chosen": -0.584509015083313, + "logits/rejected": -0.6575961709022522, + "logps/chosen": -42.489280700683594, + "logps/rejected": -105.64338684082031, + "loss": 0.4873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2315895557403564, + "rewards/margins": 8.0292329788208, + "rewards/rejected": -4.797643661499023, + "step": 16337 + }, + { + "epoch": 4.09, + "grad_norm": 5.435596466064453, + "learning_rate": 7.994203165893755e-07, + "logits/chosen": -0.5165616273880005, + "logits/rejected": -0.6154400110244751, + "logps/chosen": -61.60713195800781, + "logps/rejected": -109.8545913696289, + "loss": 0.6854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2071993350982666, + "rewards/margins": 7.505460262298584, + "rewards/rejected": -4.2982611656188965, + "step": 16338 + }, + { + "epoch": 4.09, + "grad_norm": 4.93205451965332, + "learning_rate": 7.989940433879639e-07, + "logits/chosen": -0.6071076989173889, + "logits/rejected": -0.7335383296012878, + "logps/chosen": -50.66225051879883, + "logps/rejected": -95.92839050292969, + "loss": 0.6194, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9645447731018066, + "rewards/margins": 7.9632344245910645, + "rewards/rejected": -4.998689651489258, + "step": 16339 + }, + { + "epoch": 4.09, + "grad_norm": 4.728170394897461, + "learning_rate": 7.985678739978764e-07, + "logits/chosen": -0.5983951687812805, + "logits/rejected": -0.6309813857078552, + "logps/chosen": -43.449729919433594, + "logps/rejected": -99.41455078125, + "loss": 0.5862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.117950439453125, + "rewards/margins": 6.414856433868408, + "rewards/rejected": -3.2969064712524414, + "step": 16340 + }, + { + "epoch": 4.09, + "grad_norm": 4.433178424835205, + "learning_rate": 7.981418084296472e-07, + "logits/chosen": -0.5472152829170227, + "logits/rejected": -0.6619610786437988, + "logps/chosen": -50.06309509277344, + "logps/rejected": -108.76355743408203, + "loss": 0.5713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120572805404663, + "rewards/margins": 8.385719299316406, + "rewards/rejected": -5.265146255493164, + "step": 16341 + }, + { + "epoch": 4.09, + "grad_norm": 3.6884212493896484, + "learning_rate": 7.977158466938029e-07, + "logits/chosen": -0.5780551433563232, + "logits/rejected": -0.6403356790542603, + "logps/chosen": -59.41182327270508, + "logps/rejected": -103.0195541381836, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.00024151802063, + "rewards/margins": 7.199151039123535, + "rewards/rejected": -4.198909282684326, + "step": 16342 + }, + { + "epoch": 4.09, + "grad_norm": 5.860002517700195, + "learning_rate": 7.972899888008684e-07, + "logits/chosen": -0.4948069155216217, + "logits/rejected": -0.5546097755432129, + "logps/chosen": -61.38858413696289, + "logps/rejected": -114.00355529785156, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143674850463867, + "rewards/margins": 6.75537109375, + "rewards/rejected": -3.6116950511932373, + "step": 16343 + }, + { + "epoch": 4.09, + "grad_norm": 11.386307716369629, + "learning_rate": 7.968642347613703e-07, + "logits/chosen": -0.4822666347026825, + "logits/rejected": -0.5748614072799683, + "logps/chosen": -66.25150299072266, + "logps/rejected": -118.47015380859375, + "loss": 0.6384, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7879281044006348, + "rewards/margins": 6.707538604736328, + "rewards/rejected": -3.919609785079956, + "step": 16344 + }, + { + "epoch": 4.09, + "grad_norm": 4.593112468719482, + "learning_rate": 7.964385845858258e-07, + "logits/chosen": -0.5751938223838806, + "logits/rejected": -0.6590795516967773, + "logps/chosen": -57.19255065917969, + "logps/rejected": -97.3346176147461, + "loss": 0.6876, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1719002723693848, + "rewards/margins": 7.348690032958984, + "rewards/rejected": -4.176789283752441, + "step": 16345 + }, + { + "epoch": 4.09, + "grad_norm": 7.355050086975098, + "learning_rate": 7.960130382847564e-07, + "logits/chosen": -0.5625542402267456, + "logits/rejected": -0.5936381220817566, + "logps/chosen": -50.79719924926758, + "logps/rejected": -106.97312927246094, + "loss": 0.6204, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.267390489578247, + "rewards/margins": 6.1608076095581055, + "rewards/rejected": -2.8934175968170166, + "step": 16346 + }, + { + "epoch": 4.09, + "grad_norm": 3.5119755268096924, + "learning_rate": 7.955875958686754e-07, + "logits/chosen": -0.5507653951644897, + "logits/rejected": -0.6725387573242188, + "logps/chosen": -51.42677307128906, + "logps/rejected": -90.67874145507812, + "loss": 0.6011, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3467915058135986, + "rewards/margins": 6.797374725341797, + "rewards/rejected": -3.4505834579467773, + "step": 16347 + }, + { + "epoch": 4.09, + "grad_norm": 4.095570087432861, + "learning_rate": 7.951622573480977e-07, + "logits/chosen": -0.5114635229110718, + "logits/rejected": -0.5903557538986206, + "logps/chosen": -53.90102005004883, + "logps/rejected": -105.25086212158203, + "loss": 0.5737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168407678604126, + "rewards/margins": 7.400611400604248, + "rewards/rejected": -4.232203483581543, + "step": 16348 + }, + { + "epoch": 4.09, + "grad_norm": 3.3230533599853516, + "learning_rate": 7.947370227335332e-07, + "logits/chosen": -0.515051543712616, + "logits/rejected": -0.6038336157798767, + "logps/chosen": -49.01202392578125, + "logps/rejected": -101.12434387207031, + "loss": 0.5859, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.194253921508789, + "rewards/margins": 6.988141059875488, + "rewards/rejected": -3.793887138366699, + "step": 16349 + }, + { + "epoch": 4.09, + "grad_norm": 2.846252918243408, + "learning_rate": 7.943118920354892e-07, + "logits/chosen": -0.5047316551208496, + "logits/rejected": -0.5885014533996582, + "logps/chosen": -57.24750900268555, + "logps/rejected": -113.49226379394531, + "loss": 0.5328, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.227250099182129, + "rewards/margins": 8.134963035583496, + "rewards/rejected": -4.907712459564209, + "step": 16350 + }, + { + "epoch": 4.09, + "grad_norm": 3.689701795578003, + "learning_rate": 7.938868652644721e-07, + "logits/chosen": -0.5425777435302734, + "logits/rejected": -0.6292107105255127, + "logps/chosen": -56.08988952636719, + "logps/rejected": -97.12144470214844, + "loss": 0.5823, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.071788787841797, + "rewards/margins": 6.613803386688232, + "rewards/rejected": -3.5420143604278564, + "step": 16351 + }, + { + "epoch": 4.09, + "grad_norm": 6.26364278793335, + "learning_rate": 7.934619424309836e-07, + "logits/chosen": -0.5065656304359436, + "logits/rejected": -0.575972318649292, + "logps/chosen": -67.65496826171875, + "logps/rejected": -103.98933410644531, + "loss": 0.6758, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8521132469177246, + "rewards/margins": 6.674236297607422, + "rewards/rejected": -3.8221232891082764, + "step": 16352 + }, + { + "epoch": 4.09, + "grad_norm": 5.374841213226318, + "learning_rate": 7.930371235455259e-07, + "logits/chosen": -0.57472163438797, + "logits/rejected": -0.6174437403678894, + "logps/chosen": -57.077186584472656, + "logps/rejected": -105.09288024902344, + "loss": 0.6136, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0509843826293945, + "rewards/margins": 5.741569519042969, + "rewards/rejected": -2.690584659576416, + "step": 16353 + }, + { + "epoch": 4.09, + "grad_norm": 4.491715908050537, + "learning_rate": 7.926124086185954e-07, + "logits/chosen": -0.4552460312843323, + "logits/rejected": -0.5674903392791748, + "logps/chosen": -56.88164138793945, + "logps/rejected": -99.22808074951172, + "loss": 0.5815, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799825668334961, + "rewards/margins": 6.656078338623047, + "rewards/rejected": -3.856252908706665, + "step": 16354 + }, + { + "epoch": 4.09, + "grad_norm": 2.901667356491089, + "learning_rate": 7.921877976606868e-07, + "logits/chosen": -0.505698025226593, + "logits/rejected": -0.6221278309822083, + "logps/chosen": -57.931331634521484, + "logps/rejected": -104.62980651855469, + "loss": 0.5447, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0505340099334717, + "rewards/margins": 7.4542436599731445, + "rewards/rejected": -4.403709888458252, + "step": 16355 + }, + { + "epoch": 4.09, + "grad_norm": 7.83721923828125, + "learning_rate": 7.917632906822942e-07, + "logits/chosen": -0.5926167964935303, + "logits/rejected": -0.6243543028831482, + "logps/chosen": -51.86128616333008, + "logps/rejected": -101.03367614746094, + "loss": 0.612, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.74477481842041, + "rewards/margins": 6.965261459350586, + "rewards/rejected": -4.220485687255859, + "step": 16356 + }, + { + "epoch": 4.09, + "grad_norm": 6.360658645629883, + "learning_rate": 7.913388876939065e-07, + "logits/chosen": -0.48916909098625183, + "logits/rejected": -0.5571943521499634, + "logps/chosen": -64.59198760986328, + "logps/rejected": -104.5113525390625, + "loss": 0.6498, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150158405303955, + "rewards/margins": 7.1895365715026855, + "rewards/rejected": -4.039377212524414, + "step": 16357 + }, + { + "epoch": 4.09, + "grad_norm": 14.478378295898438, + "learning_rate": 7.909145887060104e-07, + "logits/chosen": -0.6050729155540466, + "logits/rejected": -0.6590873599052429, + "logps/chosen": -44.14162826538086, + "logps/rejected": -125.30353546142578, + "loss": 0.5656, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0174362659454346, + "rewards/margins": 8.260433197021484, + "rewards/rejected": -5.242997169494629, + "step": 16358 + }, + { + "epoch": 4.09, + "grad_norm": 6.617910385131836, + "learning_rate": 7.90490393729092e-07, + "logits/chosen": -0.5706446170806885, + "logits/rejected": -0.6766625642776489, + "logps/chosen": -50.88439178466797, + "logps/rejected": -114.2621078491211, + "loss": 0.5998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2468438148498535, + "rewards/margins": 8.430307388305664, + "rewards/rejected": -5.183464050292969, + "step": 16359 + }, + { + "epoch": 4.09, + "grad_norm": 3.8684141635894775, + "learning_rate": 7.900663027736361e-07, + "logits/chosen": -0.5650815963745117, + "logits/rejected": -0.6369228363037109, + "logps/chosen": -48.35691833496094, + "logps/rejected": -100.13584899902344, + "loss": 0.5532, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1548705101013184, + "rewards/margins": 6.659793853759766, + "rewards/rejected": -3.5049233436584473, + "step": 16360 + }, + { + "epoch": 4.09, + "grad_norm": 3.230069398880005, + "learning_rate": 7.896423158501166e-07, + "logits/chosen": -0.4661998450756073, + "logits/rejected": -0.5352010726928711, + "logps/chosen": -45.87654113769531, + "logps/rejected": -119.7966079711914, + "loss": 0.5088, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1836471557617188, + "rewards/margins": 8.512415885925293, + "rewards/rejected": -5.328769207000732, + "step": 16361 + }, + { + "epoch": 4.09, + "grad_norm": 4.183671474456787, + "learning_rate": 7.892184329690144e-07, + "logits/chosen": -0.6006456613540649, + "logits/rejected": -0.660790205001831, + "logps/chosen": -49.798484802246094, + "logps/rejected": -109.51055908203125, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9108023643493652, + "rewards/margins": 7.715359210968018, + "rewards/rejected": -4.804556369781494, + "step": 16362 + }, + { + "epoch": 4.09, + "grad_norm": 4.220805644989014, + "learning_rate": 7.887946541408053e-07, + "logits/chosen": -0.5375178456306458, + "logits/rejected": -0.607724130153656, + "logps/chosen": -71.87089538574219, + "logps/rejected": -125.16171264648438, + "loss": 0.7685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1163082122802734, + "rewards/margins": 7.045787811279297, + "rewards/rejected": -3.9294795989990234, + "step": 16363 + }, + { + "epoch": 4.09, + "grad_norm": 2.3236207962036133, + "learning_rate": 7.883709793759575e-07, + "logits/chosen": -0.5689991116523743, + "logits/rejected": -0.6196455955505371, + "logps/chosen": -55.35776901245117, + "logps/rejected": -118.39763641357422, + "loss": 0.5742, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6360881328582764, + "rewards/margins": 8.154653549194336, + "rewards/rejected": -4.518566131591797, + "step": 16364 + }, + { + "epoch": 4.09, + "grad_norm": 4.100929260253906, + "learning_rate": 7.879474086849425e-07, + "logits/chosen": -0.5162546634674072, + "logits/rejected": -0.6017548441886902, + "logps/chosen": -45.32876968383789, + "logps/rejected": -98.47589111328125, + "loss": 0.5922, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.442857027053833, + "rewards/margins": 7.669643878936768, + "rewards/rejected": -4.226787090301514, + "step": 16365 + }, + { + "epoch": 4.09, + "grad_norm": 4.731595039367676, + "learning_rate": 7.875239420782283e-07, + "logits/chosen": -0.5546858310699463, + "logits/rejected": -0.6236433982849121, + "logps/chosen": -52.05985641479492, + "logps/rejected": -114.14043426513672, + "loss": 0.6525, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0725574493408203, + "rewards/margins": 8.49797534942627, + "rewards/rejected": -5.425418376922607, + "step": 16366 + }, + { + "epoch": 4.09, + "grad_norm": 4.885508060455322, + "learning_rate": 7.871005795662784e-07, + "logits/chosen": -0.5071520209312439, + "logits/rejected": -0.5666204690933228, + "logps/chosen": -50.847572326660156, + "logps/rejected": -123.18995666503906, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.004179000854492, + "rewards/margins": 8.600957870483398, + "rewards/rejected": -5.596778869628906, + "step": 16367 + }, + { + "epoch": 4.09, + "grad_norm": 5.428661823272705, + "learning_rate": 7.866773211595524e-07, + "logits/chosen": -0.5079535245895386, + "logits/rejected": -0.5847263932228088, + "logps/chosen": -52.309471130371094, + "logps/rejected": -107.00379180908203, + "loss": 0.5656, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934865951538086, + "rewards/margins": 7.831605911254883, + "rewards/rejected": -4.896740436553955, + "step": 16368 + }, + { + "epoch": 4.09, + "grad_norm": 2.8054494857788086, + "learning_rate": 7.86254166868513e-07, + "logits/chosen": -0.5279552936553955, + "logits/rejected": -0.6624883413314819, + "logps/chosen": -52.99299621582031, + "logps/rejected": -86.937255859375, + "loss": 0.5723, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2429444789886475, + "rewards/margins": 7.189682483673096, + "rewards/rejected": -3.946737766265869, + "step": 16369 + }, + { + "epoch": 4.1, + "grad_norm": 4.9982476234436035, + "learning_rate": 7.858311167036137e-07, + "logits/chosen": -0.5573244094848633, + "logits/rejected": -0.5704883337020874, + "logps/chosen": -47.64519119262695, + "logps/rejected": -106.90499877929688, + "loss": 0.7127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3534698486328125, + "rewards/margins": 5.966683387756348, + "rewards/rejected": -2.613212823867798, + "step": 16370 + }, + { + "epoch": 4.1, + "grad_norm": 3.1251137256622314, + "learning_rate": 7.854081706753114e-07, + "logits/chosen": -0.5193274617195129, + "logits/rejected": -0.6131919026374817, + "logps/chosen": -58.24944305419922, + "logps/rejected": -87.56360626220703, + "loss": 0.5935, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.148817300796509, + "rewards/margins": 7.1709418296813965, + "rewards/rejected": -4.022124767303467, + "step": 16371 + }, + { + "epoch": 4.1, + "grad_norm": 3.7262938022613525, + "learning_rate": 7.849853287940556e-07, + "logits/chosen": -0.5793517231941223, + "logits/rejected": -0.6377372741699219, + "logps/chosen": -55.428157806396484, + "logps/rejected": -97.76591491699219, + "loss": 0.6856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.992316961288452, + "rewards/margins": 6.259615898132324, + "rewards/rejected": -3.267298936843872, + "step": 16372 + }, + { + "epoch": 4.1, + "grad_norm": 2.3821890354156494, + "learning_rate": 7.845625910702942e-07, + "logits/chosen": -0.552411675453186, + "logits/rejected": -0.6163740754127502, + "logps/chosen": -51.18788528442383, + "logps/rejected": -121.02010345458984, + "loss": 0.5852, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.952305316925049, + "rewards/margins": 6.9702839851379395, + "rewards/rejected": -4.017979145050049, + "step": 16373 + }, + { + "epoch": 4.1, + "grad_norm": 5.022200584411621, + "learning_rate": 7.841399575144765e-07, + "logits/chosen": -0.5711643695831299, + "logits/rejected": -0.659464418888092, + "logps/chosen": -53.48989486694336, + "logps/rejected": -90.60887145996094, + "loss": 0.6657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.66732120513916, + "rewards/margins": 7.217718601226807, + "rewards/rejected": -3.5503973960876465, + "step": 16374 + }, + { + "epoch": 4.1, + "grad_norm": 6.028695583343506, + "learning_rate": 7.837174281370447e-07, + "logits/chosen": -0.5173481702804565, + "logits/rejected": -0.5651150345802307, + "logps/chosen": -56.61232376098633, + "logps/rejected": -110.0317611694336, + "loss": 0.6355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8389453887939453, + "rewards/margins": 6.308190822601318, + "rewards/rejected": -3.469245195388794, + "step": 16375 + }, + { + "epoch": 4.1, + "grad_norm": 17.977001190185547, + "learning_rate": 7.832950029484387e-07, + "logits/chosen": -0.5400494337081909, + "logits/rejected": -0.6054813861846924, + "logps/chosen": -57.9671745300293, + "logps/rejected": -130.38189697265625, + "loss": 0.6816, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1877639293670654, + "rewards/margins": 9.034286499023438, + "rewards/rejected": -5.846522331237793, + "step": 16376 + }, + { + "epoch": 4.1, + "grad_norm": 4.854177951812744, + "learning_rate": 7.828726819590981e-07, + "logits/chosen": -0.5711043477058411, + "logits/rejected": -0.6503186225891113, + "logps/chosen": -65.8970947265625, + "logps/rejected": -103.96847534179688, + "loss": 0.7283, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9502615928649902, + "rewards/margins": 7.73659086227417, + "rewards/rejected": -4.78632926940918, + "step": 16377 + }, + { + "epoch": 4.1, + "grad_norm": 10.546392440795898, + "learning_rate": 7.824504651794617e-07, + "logits/chosen": -0.5672819018363953, + "logits/rejected": -0.568243145942688, + "logps/chosen": -46.64582824707031, + "logps/rejected": -119.7625503540039, + "loss": 0.5309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.892176389694214, + "rewards/margins": 7.481595993041992, + "rewards/rejected": -4.589419364929199, + "step": 16378 + }, + { + "epoch": 4.1, + "grad_norm": 3.3080756664276123, + "learning_rate": 7.820283526199579e-07, + "logits/chosen": -0.6175423860549927, + "logits/rejected": -0.6969249248504639, + "logps/chosen": -55.62849426269531, + "logps/rejected": -111.55125427246094, + "loss": 0.5378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8175668716430664, + "rewards/margins": 7.8904290199279785, + "rewards/rejected": -5.072862148284912, + "step": 16379 + }, + { + "epoch": 4.1, + "grad_norm": 7.490943908691406, + "learning_rate": 7.816063442910193e-07, + "logits/chosen": -0.45844119787216187, + "logits/rejected": -0.5821645259857178, + "logps/chosen": -60.33078384399414, + "logps/rejected": -96.20603942871094, + "loss": 0.7158, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1484243869781494, + "rewards/margins": 6.50629997253418, + "rewards/rejected": -3.357875108718872, + "step": 16380 + }, + { + "epoch": 4.1, + "grad_norm": 3.9868199825286865, + "learning_rate": 7.811844402030766e-07, + "logits/chosen": -0.5122637748718262, + "logits/rejected": -0.6173770427703857, + "logps/chosen": -63.887413024902344, + "logps/rejected": -120.56999206542969, + "loss": 0.6409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9428787231445312, + "rewards/margins": 8.691120147705078, + "rewards/rejected": -5.748241901397705, + "step": 16381 + }, + { + "epoch": 4.1, + "grad_norm": 7.7594218254089355, + "learning_rate": 7.807626403665536e-07, + "logits/chosen": -0.5841741561889648, + "logits/rejected": -0.627467930316925, + "logps/chosen": -54.566314697265625, + "logps/rejected": -87.57905578613281, + "loss": 0.7232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9086265563964844, + "rewards/margins": 5.897500514984131, + "rewards/rejected": -2.9888739585876465, + "step": 16382 + }, + { + "epoch": 4.1, + "grad_norm": 6.713076591491699, + "learning_rate": 7.803409447918725e-07, + "logits/chosen": -0.593693733215332, + "logits/rejected": -0.6696621775627136, + "logps/chosen": -64.22808074951172, + "logps/rejected": -110.34033966064453, + "loss": 0.7055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.169273614883423, + "rewards/margins": 6.508021831512451, + "rewards/rejected": -3.33874773979187, + "step": 16383 + }, + { + "epoch": 4.1, + "grad_norm": 10.506324768066406, + "learning_rate": 7.799193534894556e-07, + "logits/chosen": -0.5845059752464294, + "logits/rejected": -0.6556288003921509, + "logps/chosen": -55.5396728515625, + "logps/rejected": -103.16011810302734, + "loss": 0.7283, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8524844646453857, + "rewards/margins": 6.312974452972412, + "rewards/rejected": -3.4604897499084473, + "step": 16384 + }, + { + "epoch": 4.1, + "grad_norm": 8.863924980163574, + "learning_rate": 7.794978664697206e-07, + "logits/chosen": -0.5497599244117737, + "logits/rejected": -0.5876703858375549, + "logps/chosen": -61.138702392578125, + "logps/rejected": -148.25790405273438, + "loss": 0.7449, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.791069746017456, + "rewards/margins": 8.305262565612793, + "rewards/rejected": -5.514193058013916, + "step": 16385 + }, + { + "epoch": 4.1, + "grad_norm": 5.610477447509766, + "learning_rate": 7.790764837430815e-07, + "logits/chosen": -0.5601896047592163, + "logits/rejected": -0.616419792175293, + "logps/chosen": -50.088157653808594, + "logps/rejected": -97.64453887939453, + "loss": 0.7186, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3468515872955322, + "rewards/margins": 6.277099132537842, + "rewards/rejected": -2.9302473068237305, + "step": 16386 + }, + { + "epoch": 4.1, + "grad_norm": 4.833560943603516, + "learning_rate": 7.786552053199525e-07, + "logits/chosen": -0.615985631942749, + "logits/rejected": -0.6807898283004761, + "logps/chosen": -50.033721923828125, + "logps/rejected": -120.29863739013672, + "loss": 0.6462, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290839195251465, + "rewards/margins": 6.414299964904785, + "rewards/rejected": -3.123460531234741, + "step": 16387 + }, + { + "epoch": 4.1, + "grad_norm": 6.1893134117126465, + "learning_rate": 7.782340312107439e-07, + "logits/chosen": -0.554794192314148, + "logits/rejected": -0.5852809548377991, + "logps/chosen": -71.21903228759766, + "logps/rejected": -113.20222473144531, + "loss": 0.7544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.051879644393921, + "rewards/margins": 6.918004512786865, + "rewards/rejected": -3.8661253452301025, + "step": 16388 + }, + { + "epoch": 4.1, + "grad_norm": 12.173276901245117, + "learning_rate": 7.778129614258617e-07, + "logits/chosen": -0.5972508788108826, + "logits/rejected": -0.6275676488876343, + "logps/chosen": -58.25785446166992, + "logps/rejected": -111.60971069335938, + "loss": 0.7199, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1938066482543945, + "rewards/margins": 7.49759578704834, + "rewards/rejected": -4.303788661956787, + "step": 16389 + }, + { + "epoch": 4.1, + "grad_norm": 4.445295333862305, + "learning_rate": 7.773919959757132e-07, + "logits/chosen": -0.5386816263198853, + "logits/rejected": -0.6409047245979309, + "logps/chosen": -67.21177673339844, + "logps/rejected": -124.46653747558594, + "loss": 0.7217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.375279188156128, + "rewards/margins": 7.685461044311523, + "rewards/rejected": -4.310181617736816, + "step": 16390 + }, + { + "epoch": 4.1, + "grad_norm": 2.783015489578247, + "learning_rate": 7.769711348706982e-07, + "logits/chosen": -0.4379122257232666, + "logits/rejected": -0.5634956359863281, + "logps/chosen": -59.31531524658203, + "logps/rejected": -109.81233215332031, + "loss": 0.572, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.941624879837036, + "rewards/margins": 8.035983085632324, + "rewards/rejected": -5.094357967376709, + "step": 16391 + }, + { + "epoch": 4.1, + "grad_norm": 5.165018558502197, + "learning_rate": 7.765503781212203e-07, + "logits/chosen": -0.5255822539329529, + "logits/rejected": -0.5879581570625305, + "logps/chosen": -53.93222427368164, + "logps/rejected": -117.19281005859375, + "loss": 0.6777, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0262808799743652, + "rewards/margins": 8.131513595581055, + "rewards/rejected": -5.105231761932373, + "step": 16392 + }, + { + "epoch": 4.1, + "grad_norm": 7.652928352355957, + "learning_rate": 7.761297257376732e-07, + "logits/chosen": -0.5371686220169067, + "logits/rejected": -0.5917391180992126, + "logps/chosen": -58.49254608154297, + "logps/rejected": -103.134765625, + "loss": 0.6109, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8655316829681396, + "rewards/margins": 5.882242202758789, + "rewards/rejected": -3.0167105197906494, + "step": 16393 + }, + { + "epoch": 4.1, + "grad_norm": 2.1479432582855225, + "learning_rate": 7.757091777304543e-07, + "logits/chosen": -0.5418399572372437, + "logits/rejected": -0.6200426816940308, + "logps/chosen": -56.5386848449707, + "logps/rejected": -118.1444091796875, + "loss": 0.5841, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2459867000579834, + "rewards/margins": 8.898225784301758, + "rewards/rejected": -5.652238845825195, + "step": 16394 + }, + { + "epoch": 4.1, + "grad_norm": 3.9135162830352783, + "learning_rate": 7.752887341099546e-07, + "logits/chosen": -0.5188279747962952, + "logits/rejected": -0.5902666449546814, + "logps/chosen": -58.474632263183594, + "logps/rejected": -96.63119506835938, + "loss": 0.5861, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.144073009490967, + "rewards/margins": 6.529165267944336, + "rewards/rejected": -3.3850929737091064, + "step": 16395 + }, + { + "epoch": 4.1, + "grad_norm": 4.067214012145996, + "learning_rate": 7.748683948865631e-07, + "logits/chosen": -0.5612297058105469, + "logits/rejected": -0.6425167322158813, + "logps/chosen": -55.29045104980469, + "logps/rejected": -110.65852355957031, + "loss": 0.6053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9331138134002686, + "rewards/margins": 7.351853370666504, + "rewards/rejected": -4.418739318847656, + "step": 16396 + }, + { + "epoch": 4.1, + "grad_norm": 8.004685401916504, + "learning_rate": 7.744481600706689e-07, + "logits/chosen": -0.5165799856185913, + "logits/rejected": -0.6124929785728455, + "logps/chosen": -57.827049255371094, + "logps/rejected": -97.65514373779297, + "loss": 0.6952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7014102935791016, + "rewards/margins": 6.38111686706543, + "rewards/rejected": -3.6797068119049072, + "step": 16397 + }, + { + "epoch": 4.1, + "grad_norm": 2.4969396591186523, + "learning_rate": 7.740280296726538e-07, + "logits/chosen": -0.5528370141983032, + "logits/rejected": -0.6535983681678772, + "logps/chosen": -56.453975677490234, + "logps/rejected": -112.39321899414062, + "loss": 0.5604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3140783309936523, + "rewards/margins": 8.081656455993652, + "rewards/rejected": -4.767577648162842, + "step": 16398 + }, + { + "epoch": 4.1, + "grad_norm": 11.473854064941406, + "learning_rate": 7.736080037029026e-07, + "logits/chosen": -0.6405685544013977, + "logits/rejected": -0.7399445176124573, + "logps/chosen": -58.312007904052734, + "logps/rejected": -92.27133178710938, + "loss": 0.7779, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.045194625854492, + "rewards/margins": 6.0781331062316895, + "rewards/rejected": -3.0329389572143555, + "step": 16399 + }, + { + "epoch": 4.1, + "grad_norm": 4.673350811004639, + "learning_rate": 7.731880821717924e-07, + "logits/chosen": -0.6243577003479004, + "logits/rejected": -0.7210413217544556, + "logps/chosen": -54.495819091796875, + "logps/rejected": -91.51808166503906, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1747264862060547, + "rewards/margins": 6.364102363586426, + "rewards/rejected": -3.18937611579895, + "step": 16400 + }, + { + "epoch": 4.1, + "grad_norm": 4.084051609039307, + "learning_rate": 7.727682650896995e-07, + "logits/chosen": -0.5779318809509277, + "logits/rejected": -0.6642650365829468, + "logps/chosen": -56.80010223388672, + "logps/rejected": -120.09109497070312, + "loss": 0.5999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8777592182159424, + "rewards/margins": 8.20156478881836, + "rewards/rejected": -5.32380485534668, + "step": 16401 + }, + { + "epoch": 4.1, + "grad_norm": 2.3851587772369385, + "learning_rate": 7.723485524669994e-07, + "logits/chosen": -0.5212681293487549, + "logits/rejected": -0.5783931016921997, + "logps/chosen": -50.12349319458008, + "logps/rejected": -105.06249237060547, + "loss": 0.5349, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2043683528900146, + "rewards/margins": 7.407882213592529, + "rewards/rejected": -4.203513145446777, + "step": 16402 + }, + { + "epoch": 4.1, + "grad_norm": 4.219554424285889, + "learning_rate": 7.719289443140653e-07, + "logits/chosen": -0.531926691532135, + "logits/rejected": -0.643953263759613, + "logps/chosen": -62.90931701660156, + "logps/rejected": -100.50926971435547, + "loss": 0.642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3320887088775635, + "rewards/margins": 7.01178503036499, + "rewards/rejected": -3.679696559906006, + "step": 16403 + }, + { + "epoch": 4.1, + "grad_norm": 2.1032817363739014, + "learning_rate": 7.715094406412621e-07, + "logits/chosen": -0.5429414510726929, + "logits/rejected": -0.5954478979110718, + "logps/chosen": -49.924922943115234, + "logps/rejected": -119.15992736816406, + "loss": 0.5808, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1357812881469727, + "rewards/margins": 7.736627578735352, + "rewards/rejected": -4.600846290588379, + "step": 16404 + }, + { + "epoch": 4.1, + "grad_norm": 3.7468583583831787, + "learning_rate": 7.710900414589578e-07, + "logits/chosen": -0.6371240019798279, + "logits/rejected": -0.7100028395652771, + "logps/chosen": -49.77484130859375, + "logps/rejected": -105.19564056396484, + "loss": 0.5803, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0956218242645264, + "rewards/margins": 7.878210067749023, + "rewards/rejected": -4.782588005065918, + "step": 16405 + }, + { + "epoch": 4.1, + "grad_norm": 18.251941680908203, + "learning_rate": 7.706707467775193e-07, + "logits/chosen": -0.5369232892990112, + "logits/rejected": -0.6206216216087341, + "logps/chosen": -55.005611419677734, + "logps/rejected": -114.79734802246094, + "loss": 0.6573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7664175033569336, + "rewards/margins": 7.733431816101074, + "rewards/rejected": -4.967013835906982, + "step": 16406 + }, + { + "epoch": 4.1, + "grad_norm": 2.89640212059021, + "learning_rate": 7.702515566073027e-07, + "logits/chosen": -0.5589773654937744, + "logits/rejected": -0.6530287265777588, + "logps/chosen": -47.98699188232422, + "logps/rejected": -114.12345886230469, + "loss": 0.5298, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068868637084961, + "rewards/margins": 8.090897560119629, + "rewards/rejected": -5.02202844619751, + "step": 16407 + }, + { + "epoch": 4.1, + "grad_norm": 4.4789557456970215, + "learning_rate": 7.698324709586696e-07, + "logits/chosen": -0.539743185043335, + "logits/rejected": -0.6861700415611267, + "logps/chosen": -68.42366790771484, + "logps/rejected": -122.27375030517578, + "loss": 0.5872, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2037339210510254, + "rewards/margins": 8.371604919433594, + "rewards/rejected": -5.167871475219727, + "step": 16408 + }, + { + "epoch": 4.1, + "grad_norm": 18.174612045288086, + "learning_rate": 7.694134898419759e-07, + "logits/chosen": -0.6388225555419922, + "logits/rejected": -0.7048251628875732, + "logps/chosen": -54.41847229003906, + "logps/rejected": -90.81490325927734, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860300302505493, + "rewards/margins": 5.2883453369140625, + "rewards/rejected": -2.4280450344085693, + "step": 16409 + }, + { + "epoch": 4.11, + "grad_norm": 11.779475212097168, + "learning_rate": 7.689946132675752e-07, + "logits/chosen": -0.5799311399459839, + "logits/rejected": -0.622467041015625, + "logps/chosen": -50.206722259521484, + "logps/rejected": -101.36775207519531, + "loss": 0.7395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.709878921508789, + "rewards/margins": 6.565361022949219, + "rewards/rejected": -3.8554818630218506, + "step": 16410 + }, + { + "epoch": 4.11, + "grad_norm": 2.7374260425567627, + "learning_rate": 7.685758412458167e-07, + "logits/chosen": -0.5713543891906738, + "logits/rejected": -0.6904715299606323, + "logps/chosen": -53.72527313232422, + "logps/rejected": -108.90889739990234, + "loss": 0.5614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2552640438079834, + "rewards/margins": 8.166218757629395, + "rewards/rejected": -4.91095495223999, + "step": 16411 + }, + { + "epoch": 4.11, + "grad_norm": 6.913616180419922, + "learning_rate": 7.681571737870513e-07, + "logits/chosen": -0.5440977215766907, + "logits/rejected": -0.6137512922286987, + "logps/chosen": -62.714508056640625, + "logps/rejected": -124.45538330078125, + "loss": 0.6084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9379935264587402, + "rewards/margins": 7.5671067237854, + "rewards/rejected": -4.62911319732666, + "step": 16412 + }, + { + "epoch": 4.11, + "grad_norm": 6.702451705932617, + "learning_rate": 7.677386109016227e-07, + "logits/chosen": -0.5459449291229248, + "logits/rejected": -0.6470698118209839, + "logps/chosen": -52.9386100769043, + "logps/rejected": -89.87126159667969, + "loss": 0.5901, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8668227195739746, + "rewards/margins": 6.751868724822998, + "rewards/rejected": -3.8850462436676025, + "step": 16413 + }, + { + "epoch": 4.11, + "grad_norm": 5.882294178009033, + "learning_rate": 7.673201525998741e-07, + "logits/chosen": -0.5914738774299622, + "logits/rejected": -0.6703677773475647, + "logps/chosen": -60.32160949707031, + "logps/rejected": -129.42507934570312, + "loss": 0.6398, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.92635178565979, + "rewards/margins": 8.410587310791016, + "rewards/rejected": -5.484235763549805, + "step": 16414 + }, + { + "epoch": 4.11, + "grad_norm": 3.500232458114624, + "learning_rate": 7.669017988921474e-07, + "logits/chosen": -0.6156225800514221, + "logits/rejected": -0.6565000414848328, + "logps/chosen": -43.22821044921875, + "logps/rejected": -105.36444091796875, + "loss": 0.6081, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.314518451690674, + "rewards/margins": 6.990022659301758, + "rewards/rejected": -3.675504207611084, + "step": 16415 + }, + { + "epoch": 4.11, + "grad_norm": 3.6284124851226807, + "learning_rate": 7.664835497887785e-07, + "logits/chosen": -0.4610242247581482, + "logits/rejected": -0.5537547469139099, + "logps/chosen": -56.49165725708008, + "logps/rejected": -109.42931365966797, + "loss": 0.5887, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.159700393676758, + "rewards/margins": 7.433712482452393, + "rewards/rejected": -4.274012565612793, + "step": 16416 + }, + { + "epoch": 4.11, + "grad_norm": 4.827010154724121, + "learning_rate": 7.660654053001054e-07, + "logits/chosen": -0.5095844268798828, + "logits/rejected": -0.6113365292549133, + "logps/chosen": -58.14736557006836, + "logps/rejected": -121.43241882324219, + "loss": 0.5594, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.682823896408081, + "rewards/margins": 7.775284767150879, + "rewards/rejected": -5.092460632324219, + "step": 16417 + }, + { + "epoch": 4.11, + "grad_norm": 31.11276626586914, + "learning_rate": 7.656473654364599e-07, + "logits/chosen": -0.6131500005722046, + "logits/rejected": -0.6591571569442749, + "logps/chosen": -53.48780059814453, + "logps/rejected": -125.86114501953125, + "loss": 0.6959, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.899643659591675, + "rewards/margins": 6.642788410186768, + "rewards/rejected": -3.74314546585083, + "step": 16418 + }, + { + "epoch": 4.11, + "grad_norm": 6.284783363342285, + "learning_rate": 7.652294302081703e-07, + "logits/chosen": -0.591995894908905, + "logits/rejected": -0.5874298214912415, + "logps/chosen": -61.118125915527344, + "logps/rejected": -103.70014953613281, + "loss": 0.7501, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7168712615966797, + "rewards/margins": 5.104367256164551, + "rewards/rejected": -2.3874964714050293, + "step": 16419 + }, + { + "epoch": 4.11, + "grad_norm": 1.8015226125717163, + "learning_rate": 7.648115996255673e-07, + "logits/chosen": -0.55707848072052, + "logits/rejected": -0.643571674823761, + "logps/chosen": -54.72970962524414, + "logps/rejected": -103.19346618652344, + "loss": 0.5168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.24070405960083, + "rewards/margins": 7.727855682373047, + "rewards/rejected": -4.487151145935059, + "step": 16420 + }, + { + "epoch": 4.11, + "grad_norm": 12.3562650680542, + "learning_rate": 7.643938736989742e-07, + "logits/chosen": -0.47036945819854736, + "logits/rejected": -0.5886494517326355, + "logps/chosen": -59.48394775390625, + "logps/rejected": -105.7801513671875, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9501001834869385, + "rewards/margins": 6.661802291870117, + "rewards/rejected": -3.7117016315460205, + "step": 16421 + }, + { + "epoch": 4.11, + "grad_norm": 6.419870853424072, + "learning_rate": 7.639762524387123e-07, + "logits/chosen": -0.5843935012817383, + "logits/rejected": -0.6566904187202454, + "logps/chosen": -48.84428405761719, + "logps/rejected": -108.76260375976562, + "loss": 0.6231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.987581491470337, + "rewards/margins": 6.070298194885254, + "rewards/rejected": -3.0827174186706543, + "step": 16422 + }, + { + "epoch": 4.11, + "grad_norm": 2.7474958896636963, + "learning_rate": 7.635587358551027e-07, + "logits/chosen": -0.6132829785346985, + "logits/rejected": -0.7052678465843201, + "logps/chosen": -62.27671813964844, + "logps/rejected": -91.31037902832031, + "loss": 0.6013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3082826137542725, + "rewards/margins": 7.217730522155762, + "rewards/rejected": -3.9094483852386475, + "step": 16423 + }, + { + "epoch": 4.11, + "grad_norm": 7.525461673736572, + "learning_rate": 7.631413239584645e-07, + "logits/chosen": -0.5215490460395813, + "logits/rejected": -0.624332845211029, + "logps/chosen": -54.4781608581543, + "logps/rejected": -101.21281433105469, + "loss": 0.6111, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.888381004333496, + "rewards/margins": 6.600159645080566, + "rewards/rejected": -3.7117788791656494, + "step": 16424 + }, + { + "epoch": 4.11, + "grad_norm": 3.679239273071289, + "learning_rate": 7.627240167591105e-07, + "logits/chosen": -0.4834262728691101, + "logits/rejected": -0.5819460153579712, + "logps/chosen": -64.02347564697266, + "logps/rejected": -117.11480712890625, + "loss": 0.6163, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306457996368408, + "rewards/margins": 8.286917686462402, + "rewards/rejected": -4.980459213256836, + "step": 16425 + }, + { + "epoch": 4.11, + "grad_norm": 3.954369068145752, + "learning_rate": 7.623068142673518e-07, + "logits/chosen": -0.5542355179786682, + "logits/rejected": -0.6314418315887451, + "logps/chosen": -46.17662048339844, + "logps/rejected": -103.85328674316406, + "loss": 0.5559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0072691440582275, + "rewards/margins": 7.640326499938965, + "rewards/rejected": -4.633057117462158, + "step": 16426 + }, + { + "epoch": 4.11, + "grad_norm": 5.554870128631592, + "learning_rate": 7.618897164935002e-07, + "logits/chosen": -0.5450884699821472, + "logits/rejected": -0.5857503414154053, + "logps/chosen": -56.07026290893555, + "logps/rejected": -95.32423400878906, + "loss": 0.6718, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.223778247833252, + "rewards/margins": 6.254408359527588, + "rewards/rejected": -3.0306296348571777, + "step": 16427 + }, + { + "epoch": 4.11, + "grad_norm": 6.477357387542725, + "learning_rate": 7.61472723447862e-07, + "logits/chosen": -0.5639420747756958, + "logits/rejected": -0.6437303423881531, + "logps/chosen": -47.43663024902344, + "logps/rejected": -104.48236846923828, + "loss": 0.6328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1105868816375732, + "rewards/margins": 6.623750686645508, + "rewards/rejected": -3.5131642818450928, + "step": 16428 + }, + { + "epoch": 4.11, + "grad_norm": 3.1936395168304443, + "learning_rate": 7.610558351407393e-07, + "logits/chosen": -0.5588288307189941, + "logits/rejected": -0.62297523021698, + "logps/chosen": -44.64894485473633, + "logps/rejected": -104.2453384399414, + "loss": 0.5119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2096540927886963, + "rewards/margins": 7.841061115264893, + "rewards/rejected": -4.631407260894775, + "step": 16429 + }, + { + "epoch": 4.11, + "grad_norm": 4.339686393737793, + "learning_rate": 7.606390515824375e-07, + "logits/chosen": -0.5719215273857117, + "logits/rejected": -0.6669681072235107, + "logps/chosen": -55.1633186340332, + "logps/rejected": -92.30005645751953, + "loss": 0.6395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1087584495544434, + "rewards/margins": 6.8362836837768555, + "rewards/rejected": -3.727525472640991, + "step": 16430 + }, + { + "epoch": 4.11, + "grad_norm": 11.478761672973633, + "learning_rate": 7.602223727832536e-07, + "logits/chosen": -0.515347957611084, + "logits/rejected": -0.6204466819763184, + "logps/chosen": -57.7087516784668, + "logps/rejected": -95.85826873779297, + "loss": 0.6657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8395442962646484, + "rewards/margins": 7.003796577453613, + "rewards/rejected": -4.164251804351807, + "step": 16431 + }, + { + "epoch": 4.11, + "grad_norm": 3.798358678817749, + "learning_rate": 7.598057987534829e-07, + "logits/chosen": -0.5696296095848083, + "logits/rejected": -0.6539509296417236, + "logps/chosen": -61.759925842285156, + "logps/rejected": -121.2864761352539, + "loss": 0.6291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3682491779327393, + "rewards/margins": 8.03262996673584, + "rewards/rejected": -4.664380073547363, + "step": 16432 + }, + { + "epoch": 4.11, + "grad_norm": 2.791537284851074, + "learning_rate": 7.593893295034227e-07, + "logits/chosen": -0.5667154788970947, + "logits/rejected": -0.6459742784500122, + "logps/chosen": -52.084415435791016, + "logps/rejected": -126.36343383789062, + "loss": 0.5734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.171173095703125, + "rewards/margins": 8.785456657409668, + "rewards/rejected": -5.614283561706543, + "step": 16433 + }, + { + "epoch": 4.11, + "grad_norm": 12.285547256469727, + "learning_rate": 7.589729650433613e-07, + "logits/chosen": -0.5806237459182739, + "logits/rejected": -0.6616337299346924, + "logps/chosen": -55.6664924621582, + "logps/rejected": -104.43053436279297, + "loss": 0.6951, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.758880853652954, + "rewards/margins": 6.815216541290283, + "rewards/rejected": -4.056334972381592, + "step": 16434 + }, + { + "epoch": 4.11, + "grad_norm": 2.8159711360931396, + "learning_rate": 7.585567053835902e-07, + "logits/chosen": -0.533250629901886, + "logits/rejected": -0.604682207107544, + "logps/chosen": -46.12809753417969, + "logps/rejected": -103.45883178710938, + "loss": 0.5693, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0130696296691895, + "rewards/margins": 7.038228511810303, + "rewards/rejected": -4.025158882141113, + "step": 16435 + }, + { + "epoch": 4.11, + "grad_norm": 5.567004203796387, + "learning_rate": 7.581405505343931e-07, + "logits/chosen": -0.4442988932132721, + "logits/rejected": -0.5314159393310547, + "logps/chosen": -58.67729187011719, + "logps/rejected": -114.93069458007812, + "loss": 0.6379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.179769992828369, + "rewards/margins": 7.033067226409912, + "rewards/rejected": -3.853297472000122, + "step": 16436 + }, + { + "epoch": 4.11, + "grad_norm": 3.832387685775757, + "learning_rate": 7.577245005060562e-07, + "logits/chosen": -0.4946832060813904, + "logits/rejected": -0.5710011124610901, + "logps/chosen": -53.83460235595703, + "logps/rejected": -113.03665161132812, + "loss": 0.5763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3804354667663574, + "rewards/margins": 7.77649450302124, + "rewards/rejected": -4.396059036254883, + "step": 16437 + }, + { + "epoch": 4.11, + "grad_norm": 3.06472110748291, + "learning_rate": 7.573085553088589e-07, + "logits/chosen": -0.5143840312957764, + "logits/rejected": -0.5944308638572693, + "logps/chosen": -66.26136016845703, + "logps/rejected": -108.19335174560547, + "loss": 0.6374, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.956235408782959, + "rewards/margins": 7.149385929107666, + "rewards/rejected": -4.193150520324707, + "step": 16438 + }, + { + "epoch": 4.11, + "grad_norm": 10.388118743896484, + "learning_rate": 7.568927149530786e-07, + "logits/chosen": -0.52448570728302, + "logits/rejected": -0.5848100781440735, + "logps/chosen": -62.19756317138672, + "logps/rejected": -129.4874267578125, + "loss": 0.7186, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8364923000335693, + "rewards/margins": 7.934144496917725, + "rewards/rejected": -5.097652435302734, + "step": 16439 + }, + { + "epoch": 4.11, + "grad_norm": 5.475322723388672, + "learning_rate": 7.56476979448994e-07, + "logits/chosen": -0.5880926847457886, + "logits/rejected": -0.6276981234550476, + "logps/chosen": -49.041175842285156, + "logps/rejected": -96.54431915283203, + "loss": 0.7266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0472803115844727, + "rewards/margins": 5.401801109313965, + "rewards/rejected": -2.3545212745666504, + "step": 16440 + }, + { + "epoch": 4.11, + "grad_norm": 3.9466519355773926, + "learning_rate": 7.560613488068758e-07, + "logits/chosen": -0.4791633188724518, + "logits/rejected": -0.6284195184707642, + "logps/chosen": -67.31977844238281, + "logps/rejected": -116.98410034179688, + "loss": 0.6236, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.976099967956543, + "rewards/margins": 8.006080627441406, + "rewards/rejected": -5.029979705810547, + "step": 16441 + }, + { + "epoch": 4.11, + "grad_norm": 5.948079586029053, + "learning_rate": 7.556458230369962e-07, + "logits/chosen": -0.5006396174430847, + "logits/rejected": -0.6226610541343689, + "logps/chosen": -58.21323013305664, + "logps/rejected": -101.61170959472656, + "loss": 0.5661, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2598679065704346, + "rewards/margins": 7.277296543121338, + "rewards/rejected": -4.017428398132324, + "step": 16442 + }, + { + "epoch": 4.11, + "grad_norm": 5.64738655090332, + "learning_rate": 7.552304021496238e-07, + "logits/chosen": -0.535113513469696, + "logits/rejected": -0.616348922252655, + "logps/chosen": -52.307369232177734, + "logps/rejected": -92.60797119140625, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.814877510070801, + "rewards/margins": 6.047905921936035, + "rewards/rejected": -3.2330284118652344, + "step": 16443 + }, + { + "epoch": 4.11, + "grad_norm": 2.661878824234009, + "learning_rate": 7.548150861550213e-07, + "logits/chosen": -0.5451350212097168, + "logits/rejected": -0.6169357299804688, + "logps/chosen": -45.628570556640625, + "logps/rejected": -119.47328186035156, + "loss": 0.5543, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1125621795654297, + "rewards/margins": 8.364797592163086, + "rewards/rejected": -5.252235412597656, + "step": 16444 + }, + { + "epoch": 4.11, + "grad_norm": 5.720695972442627, + "learning_rate": 7.543998750634546e-07, + "logits/chosen": -0.5654655694961548, + "logits/rejected": -0.6247508525848389, + "logps/chosen": -58.72525405883789, + "logps/rejected": -108.70398712158203, + "loss": 0.6583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8861474990844727, + "rewards/margins": 6.714888572692871, + "rewards/rejected": -3.8287415504455566, + "step": 16445 + }, + { + "epoch": 4.11, + "grad_norm": 11.341100692749023, + "learning_rate": 7.539847688851826e-07, + "logits/chosen": -0.5270195603370667, + "logits/rejected": -0.560897171497345, + "logps/chosen": -51.1848030090332, + "logps/rejected": -97.4593734741211, + "loss": 0.7471, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.117957592010498, + "rewards/margins": 5.075949192047119, + "rewards/rejected": -1.957991600036621, + "step": 16446 + }, + { + "epoch": 4.11, + "grad_norm": 5.4590678215026855, + "learning_rate": 7.535697676304626e-07, + "logits/chosen": -0.6068477034568787, + "logits/rejected": -0.6740624904632568, + "logps/chosen": -59.2851676940918, + "logps/rejected": -88.5418472290039, + "loss": 0.6972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.400808095932007, + "rewards/margins": 6.693575859069824, + "rewards/rejected": -3.2927682399749756, + "step": 16447 + }, + { + "epoch": 4.11, + "grad_norm": 7.7135233879089355, + "learning_rate": 7.531548713095499e-07, + "logits/chosen": -0.5438849925994873, + "logits/rejected": -0.6484313011169434, + "logps/chosen": -47.94117736816406, + "logps/rejected": -99.57073974609375, + "loss": 0.5435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2510900497436523, + "rewards/margins": 7.355603218078613, + "rewards/rejected": -4.104513645172119, + "step": 16448 + }, + { + "epoch": 4.11, + "grad_norm": 8.103094100952148, + "learning_rate": 7.527400799327001e-07, + "logits/chosen": -0.554839551448822, + "logits/rejected": -0.6436711549758911, + "logps/chosen": -58.616939544677734, + "logps/rejected": -97.55931091308594, + "loss": 0.638, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069957971572876, + "rewards/margins": 6.361207962036133, + "rewards/rejected": -3.291249990463257, + "step": 16449 + }, + { + "epoch": 4.12, + "grad_norm": 3.892808198928833, + "learning_rate": 7.523253935101577e-07, + "logits/chosen": -0.482073038816452, + "logits/rejected": -0.48455509543418884, + "logps/chosen": -56.28507614135742, + "logps/rejected": -120.45359802246094, + "loss": 0.6392, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.653262138366699, + "rewards/margins": 6.673645973205566, + "rewards/rejected": -3.020383358001709, + "step": 16450 + }, + { + "epoch": 4.12, + "grad_norm": 5.650324821472168, + "learning_rate": 7.519108120521734e-07, + "logits/chosen": -0.48671647906303406, + "logits/rejected": -0.5539581179618835, + "logps/chosen": -48.03608703613281, + "logps/rejected": -122.32342529296875, + "loss": 0.6356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.149965763092041, + "rewards/margins": 6.665666580200195, + "rewards/rejected": -3.515700578689575, + "step": 16451 + }, + { + "epoch": 4.12, + "grad_norm": 4.726722240447998, + "learning_rate": 7.514963355689924e-07, + "logits/chosen": -0.5018609166145325, + "logits/rejected": -0.5799826979637146, + "logps/chosen": -56.681304931640625, + "logps/rejected": -102.18948364257812, + "loss": 0.6098, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.171881675720215, + "rewards/margins": 6.331902503967285, + "rewards/rejected": -3.1600213050842285, + "step": 16452 + }, + { + "epoch": 4.12, + "grad_norm": 4.358827114105225, + "learning_rate": 7.510819640708561e-07, + "logits/chosen": -0.5330077409744263, + "logits/rejected": -0.5705428123474121, + "logps/chosen": -51.38950729370117, + "logps/rejected": -122.94156646728516, + "loss": 0.5882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7822673320770264, + "rewards/margins": 6.604236602783203, + "rewards/rejected": -3.8219692707061768, + "step": 16453 + }, + { + "epoch": 4.12, + "grad_norm": 7.227928638458252, + "learning_rate": 7.506676975680022e-07, + "logits/chosen": -0.6307088732719421, + "logits/rejected": -0.7250577211380005, + "logps/chosen": -56.230224609375, + "logps/rejected": -99.10890197753906, + "loss": 0.6626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1152586936950684, + "rewards/margins": 6.0904741287231445, + "rewards/rejected": -2.975215435028076, + "step": 16454 + }, + { + "epoch": 4.12, + "grad_norm": 3.8566622734069824, + "learning_rate": 7.502535360706703e-07, + "logits/chosen": -0.6161337494850159, + "logits/rejected": -0.7029082775115967, + "logps/chosen": -51.88750457763672, + "logps/rejected": -91.68412017822266, + "loss": 0.6472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9995129108428955, + "rewards/margins": 6.8852338790893555, + "rewards/rejected": -3.885720729827881, + "step": 16455 + }, + { + "epoch": 4.12, + "grad_norm": 5.201844692230225, + "learning_rate": 7.498394795890945e-07, + "logits/chosen": -0.467392235994339, + "logits/rejected": -0.5737525820732117, + "logps/chosen": -57.031219482421875, + "logps/rejected": -119.14491271972656, + "loss": 0.5614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2512829303741455, + "rewards/margins": 7.266537189483643, + "rewards/rejected": -4.015254497528076, + "step": 16456 + }, + { + "epoch": 4.12, + "grad_norm": 2.650501012802124, + "learning_rate": 7.494255281335039e-07, + "logits/chosen": -0.556492030620575, + "logits/rejected": -0.663037896156311, + "logps/chosen": -47.40733337402344, + "logps/rejected": -103.32121276855469, + "loss": 0.5057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6585609912872314, + "rewards/margins": 8.334929466247559, + "rewards/rejected": -5.676368236541748, + "step": 16457 + }, + { + "epoch": 4.12, + "grad_norm": 2.538341522216797, + "learning_rate": 7.490116817141313e-07, + "logits/chosen": -0.5637028217315674, + "logits/rejected": -0.6671608686447144, + "logps/chosen": -46.16534423828125, + "logps/rejected": -89.42791748046875, + "loss": 0.5393, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.21157169342041, + "rewards/margins": 6.762085437774658, + "rewards/rejected": -3.55051326751709, + "step": 16458 + }, + { + "epoch": 4.12, + "grad_norm": 3.1060571670532227, + "learning_rate": 7.485979403412003e-07, + "logits/chosen": -0.6526791453361511, + "logits/rejected": -0.7571921348571777, + "logps/chosen": -44.60298538208008, + "logps/rejected": -96.7330551147461, + "loss": 0.5244, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.042405605316162, + "rewards/margins": 7.546167373657227, + "rewards/rejected": -4.5037617683410645, + "step": 16459 + }, + { + "epoch": 4.12, + "grad_norm": 3.5931010246276855, + "learning_rate": 7.481843040249376e-07, + "logits/chosen": -0.5044178366661072, + "logits/rejected": -0.610324501991272, + "logps/chosen": -56.99989318847656, + "logps/rejected": -97.25778198242188, + "loss": 0.5864, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0215904712677, + "rewards/margins": 7.138162612915039, + "rewards/rejected": -4.116572856903076, + "step": 16460 + }, + { + "epoch": 4.12, + "grad_norm": 6.209488868713379, + "learning_rate": 7.477707727755629e-07, + "logits/chosen": -0.47793763875961304, + "logits/rejected": -0.5891053080558777, + "logps/chosen": -60.099708557128906, + "logps/rejected": -125.21490478515625, + "loss": 0.6574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.91282057762146, + "rewards/margins": 9.213942527770996, + "rewards/rejected": -6.301121234893799, + "step": 16461 + }, + { + "epoch": 4.12, + "grad_norm": 14.39331340789795, + "learning_rate": 7.47357346603294e-07, + "logits/chosen": -0.5049042105674744, + "logits/rejected": -0.605842649936676, + "logps/chosen": -68.71354675292969, + "logps/rejected": -115.41935729980469, + "loss": 0.6077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.52073335647583, + "rewards/margins": 7.972740650177002, + "rewards/rejected": -5.45200777053833, + "step": 16462 + }, + { + "epoch": 4.12, + "grad_norm": 8.375652313232422, + "learning_rate": 7.469440255183497e-07, + "logits/chosen": -0.5196336507797241, + "logits/rejected": -0.6149113774299622, + "logps/chosen": -52.26792907714844, + "logps/rejected": -89.07080841064453, + "loss": 0.6401, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1404590606689453, + "rewards/margins": 6.525365829467773, + "rewards/rejected": -3.384906530380249, + "step": 16463 + }, + { + "epoch": 4.12, + "grad_norm": 6.487356185913086, + "learning_rate": 7.465308095309421e-07, + "logits/chosen": -0.670693039894104, + "logits/rejected": -0.7244044542312622, + "logps/chosen": -54.31787872314453, + "logps/rejected": -108.2198486328125, + "loss": 0.5911, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1710591316223145, + "rewards/margins": 7.140654563903809, + "rewards/rejected": -3.969594955444336, + "step": 16464 + }, + { + "epoch": 4.12, + "grad_norm": 11.525816917419434, + "learning_rate": 7.461176986512814e-07, + "logits/chosen": -0.49439528584480286, + "logits/rejected": -0.5733762979507446, + "logps/chosen": -62.60600280761719, + "logps/rejected": -90.34259796142578, + "loss": 0.7469, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.867525100708008, + "rewards/margins": 6.003638744354248, + "rewards/rejected": -3.1361136436462402, + "step": 16465 + }, + { + "epoch": 4.12, + "grad_norm": 3.1992392539978027, + "learning_rate": 7.457046928895766e-07, + "logits/chosen": -0.5682755708694458, + "logits/rejected": -0.6825429201126099, + "logps/chosen": -56.00080490112305, + "logps/rejected": -93.1808090209961, + "loss": 0.5977, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.134965419769287, + "rewards/margins": 7.684894561767578, + "rewards/rejected": -4.549929141998291, + "step": 16466 + }, + { + "epoch": 4.12, + "grad_norm": 3.8200154304504395, + "learning_rate": 7.45291792256036e-07, + "logits/chosen": -0.6400734186172485, + "logits/rejected": -0.6691809892654419, + "logps/chosen": -83.70594024658203, + "logps/rejected": -106.68357849121094, + "loss": 0.6229, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0199708938598633, + "rewards/margins": 7.881484031677246, + "rewards/rejected": -4.861513137817383, + "step": 16467 + }, + { + "epoch": 4.12, + "grad_norm": 7.114534854888916, + "learning_rate": 7.448789967608599e-07, + "logits/chosen": -0.5244215130805969, + "logits/rejected": -0.561433732509613, + "logps/chosen": -75.19624328613281, + "logps/rejected": -116.06346893310547, + "loss": 0.7501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.881831407546997, + "rewards/margins": 5.987964630126953, + "rewards/rejected": -3.106133222579956, + "step": 16468 + }, + { + "epoch": 4.12, + "grad_norm": 3.867342710494995, + "learning_rate": 7.444663064142488e-07, + "logits/chosen": -0.5913230180740356, + "logits/rejected": -0.6795054078102112, + "logps/chosen": -43.70305633544922, + "logps/rejected": -130.77757263183594, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4262137413024902, + "rewards/margins": 8.243477821350098, + "rewards/rejected": -4.817263603210449, + "step": 16469 + }, + { + "epoch": 4.12, + "grad_norm": 4.290648937225342, + "learning_rate": 7.440537212264026e-07, + "logits/chosen": -0.5789029002189636, + "logits/rejected": -0.6356408596038818, + "logps/chosen": -61.372962951660156, + "logps/rejected": -96.99333953857422, + "loss": 0.7028, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2292535305023193, + "rewards/margins": 6.1643290519714355, + "rewards/rejected": -2.9350757598876953, + "step": 16470 + }, + { + "epoch": 4.12, + "grad_norm": 3.850963592529297, + "learning_rate": 7.436412412075161e-07, + "logits/chosen": -0.575543999671936, + "logits/rejected": -0.6836770176887512, + "logps/chosen": -56.4659309387207, + "logps/rejected": -94.94711303710938, + "loss": 0.5394, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0683257579803467, + "rewards/margins": 6.956350803375244, + "rewards/rejected": -3.8880255222320557, + "step": 16471 + }, + { + "epoch": 4.12, + "grad_norm": 5.917359828948975, + "learning_rate": 7.432288663677806e-07, + "logits/chosen": -0.4945148825645447, + "logits/rejected": -0.6072397828102112, + "logps/chosen": -59.87184143066406, + "logps/rejected": -134.8143310546875, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9438986778259277, + "rewards/margins": 7.622112274169922, + "rewards/rejected": -4.678213119506836, + "step": 16472 + }, + { + "epoch": 4.12, + "grad_norm": 4.877294063568115, + "learning_rate": 7.428165967173889e-07, + "logits/chosen": -0.5230293273925781, + "logits/rejected": -0.609458863735199, + "logps/chosen": -61.25387954711914, + "logps/rejected": -106.63787078857422, + "loss": 0.6256, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.93161678314209, + "rewards/margins": 6.141345977783203, + "rewards/rejected": -3.2097291946411133, + "step": 16473 + }, + { + "epoch": 4.12, + "grad_norm": 6.217321395874023, + "learning_rate": 7.424044322665269e-07, + "logits/chosen": -0.5298467874526978, + "logits/rejected": -0.6638282537460327, + "logps/chosen": -57.76463317871094, + "logps/rejected": -123.12202453613281, + "loss": 0.5744, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931553363800049, + "rewards/margins": 8.118534088134766, + "rewards/rejected": -5.186980247497559, + "step": 16474 + }, + { + "epoch": 4.12, + "grad_norm": 5.880768775939941, + "learning_rate": 7.419923730253786e-07, + "logits/chosen": -0.47860226035118103, + "logits/rejected": -0.5725208520889282, + "logps/chosen": -49.7537727355957, + "logps/rejected": -120.59705352783203, + "loss": 0.6995, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1125714778900146, + "rewards/margins": 7.403336048126221, + "rewards/rejected": -4.290764808654785, + "step": 16475 + }, + { + "epoch": 4.12, + "grad_norm": 4.151822090148926, + "learning_rate": 7.415804190041292e-07, + "logits/chosen": -0.5486716032028198, + "logits/rejected": -0.5874236822128296, + "logps/chosen": -49.35499954223633, + "logps/rejected": -117.79528045654297, + "loss": 0.6294, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2862510681152344, + "rewards/margins": 6.35744047164917, + "rewards/rejected": -3.0711896419525146, + "step": 16476 + }, + { + "epoch": 4.12, + "grad_norm": 3.014030694961548, + "learning_rate": 7.411685702129562e-07, + "logits/chosen": -0.5030529499053955, + "logits/rejected": -0.6038594245910645, + "logps/chosen": -59.464317321777344, + "logps/rejected": -103.66658782958984, + "loss": 0.5844, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.913804054260254, + "rewards/margins": 8.07542610168457, + "rewards/rejected": -5.161622047424316, + "step": 16477 + }, + { + "epoch": 4.12, + "grad_norm": 8.365155220031738, + "learning_rate": 7.407568266620386e-07, + "logits/chosen": -0.5590907335281372, + "logits/rejected": -0.6463404893875122, + "logps/chosen": -56.84440612792969, + "logps/rejected": -116.23715209960938, + "loss": 0.6795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6874799728393555, + "rewards/margins": 7.102640151977539, + "rewards/rejected": -4.415160179138184, + "step": 16478 + }, + { + "epoch": 4.12, + "grad_norm": 2.640641450881958, + "learning_rate": 7.403451883615492e-07, + "logits/chosen": -0.5107569694519043, + "logits/rejected": -0.6049602031707764, + "logps/chosen": -66.64492797851562, + "logps/rejected": -103.4630355834961, + "loss": 0.648, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0491302013397217, + "rewards/margins": 7.166089057922363, + "rewards/rejected": -4.1169586181640625, + "step": 16479 + }, + { + "epoch": 4.12, + "grad_norm": 5.090885639190674, + "learning_rate": 7.399336553216618e-07, + "logits/chosen": -0.59697425365448, + "logits/rejected": -0.6899932026863098, + "logps/chosen": -49.75886535644531, + "logps/rejected": -96.17739868164062, + "loss": 0.5925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8048696517944336, + "rewards/margins": 6.136289596557617, + "rewards/rejected": -3.3314197063446045, + "step": 16480 + }, + { + "epoch": 4.12, + "grad_norm": 10.433841705322266, + "learning_rate": 7.395222275525449e-07, + "logits/chosen": -0.5094990730285645, + "logits/rejected": -0.6082038879394531, + "logps/chosen": -55.53460693359375, + "logps/rejected": -107.47309112548828, + "loss": 0.5257, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.81620454788208, + "rewards/margins": 6.787748336791992, + "rewards/rejected": -3.971543550491333, + "step": 16481 + }, + { + "epoch": 4.12, + "grad_norm": 9.513684272766113, + "learning_rate": 7.391109050643647e-07, + "logits/chosen": -0.5947831869125366, + "logits/rejected": -0.654147207736969, + "logps/chosen": -52.13347625732422, + "logps/rejected": -110.51883697509766, + "loss": 0.6387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.009101390838623, + "rewards/margins": 6.993436813354492, + "rewards/rejected": -3.9843361377716064, + "step": 16482 + }, + { + "epoch": 4.12, + "grad_norm": 8.742352485656738, + "learning_rate": 7.386996878672869e-07, + "logits/chosen": -0.4372895061969757, + "logits/rejected": -0.5103577375411987, + "logps/chosen": -58.01194763183594, + "logps/rejected": -94.80322265625, + "loss": 0.6928, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.787203311920166, + "rewards/margins": 4.751336574554443, + "rewards/rejected": -1.9641337394714355, + "step": 16483 + }, + { + "epoch": 4.12, + "grad_norm": 7.803264617919922, + "learning_rate": 7.382885759714715e-07, + "logits/chosen": -0.5201614499092102, + "logits/rejected": -0.6167802214622498, + "logps/chosen": -64.96117401123047, + "logps/rejected": -109.37640380859375, + "loss": 0.63, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9342594146728516, + "rewards/margins": 8.112286567687988, + "rewards/rejected": -5.1780266761779785, + "step": 16484 + }, + { + "epoch": 4.12, + "grad_norm": 3.9248440265655518, + "learning_rate": 7.378775693870793e-07, + "logits/chosen": -0.5461571216583252, + "logits/rejected": -0.5671582221984863, + "logps/chosen": -46.76894760131836, + "logps/rejected": -119.74021911621094, + "loss": 0.6221, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.317506790161133, + "rewards/margins": 6.742959976196289, + "rewards/rejected": -3.4254531860351562, + "step": 16485 + }, + { + "epoch": 4.12, + "grad_norm": 6.924482822418213, + "learning_rate": 7.374666681242659e-07, + "logits/chosen": -0.6064959764480591, + "logits/rejected": -0.6927101016044617, + "logps/chosen": -47.54533004760742, + "logps/rejected": -92.72392272949219, + "loss": 0.6397, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1452159881591797, + "rewards/margins": 7.226625919342041, + "rewards/rejected": -4.081409454345703, + "step": 16486 + }, + { + "epoch": 4.12, + "grad_norm": 6.327210426330566, + "learning_rate": 7.370558721931837e-07, + "logits/chosen": -0.5513722896575928, + "logits/rejected": -0.6355181932449341, + "logps/chosen": -67.69602966308594, + "logps/rejected": -120.61907196044922, + "loss": 0.7043, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6209065914154053, + "rewards/margins": 7.042670249938965, + "rewards/rejected": -4.421764373779297, + "step": 16487 + }, + { + "epoch": 4.12, + "grad_norm": 7.964548587799072, + "learning_rate": 7.366451816039865e-07, + "logits/chosen": -0.5783483386039734, + "logits/rejected": -0.652307391166687, + "logps/chosen": -44.735939025878906, + "logps/rejected": -110.55445861816406, + "loss": 0.6152, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4638500213623047, + "rewards/margins": 8.197412490844727, + "rewards/rejected": -4.73356294631958, + "step": 16488 + }, + { + "epoch": 4.12, + "grad_norm": 3.2222819328308105, + "learning_rate": 7.36234596366821e-07, + "logits/chosen": -0.4832981824874878, + "logits/rejected": -0.5841363668441772, + "logps/chosen": -57.5570068359375, + "logps/rejected": -120.8650894165039, + "loss": 0.5781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7999627590179443, + "rewards/margins": 7.488426208496094, + "rewards/rejected": -4.688462734222412, + "step": 16489 + }, + { + "epoch": 4.13, + "grad_norm": 13.226713180541992, + "learning_rate": 7.358241164918334e-07, + "logits/chosen": -0.5888577699661255, + "logits/rejected": -0.6615204215049744, + "logps/chosen": -44.92216873168945, + "logps/rejected": -103.38333129882812, + "loss": 0.6158, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.237396001815796, + "rewards/margins": 7.354415416717529, + "rewards/rejected": -4.1170196533203125, + "step": 16490 + }, + { + "epoch": 4.13, + "grad_norm": 7.433551788330078, + "learning_rate": 7.354137419891666e-07, + "logits/chosen": -0.593746542930603, + "logits/rejected": -0.6544989347457886, + "logps/chosen": -65.13177490234375, + "logps/rejected": -121.77161407470703, + "loss": 0.7144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.199345588684082, + "rewards/margins": 8.23188304901123, + "rewards/rejected": -5.032537937164307, + "step": 16491 + }, + { + "epoch": 4.13, + "grad_norm": 11.337844848632812, + "learning_rate": 7.350034728689653e-07, + "logits/chosen": -0.5313261151313782, + "logits/rejected": -0.6284036040306091, + "logps/chosen": -70.12738800048828, + "logps/rejected": -109.74775695800781, + "loss": 0.7581, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7735583782196045, + "rewards/margins": 6.214871883392334, + "rewards/rejected": -3.4413132667541504, + "step": 16492 + }, + { + "epoch": 4.13, + "grad_norm": 3.8872005939483643, + "learning_rate": 7.345933091413621e-07, + "logits/chosen": -0.5555710792541504, + "logits/rejected": -0.6672353744506836, + "logps/chosen": -58.54343795776367, + "logps/rejected": -114.27741241455078, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.159327268600464, + "rewards/margins": 8.076313972473145, + "rewards/rejected": -4.916986465454102, + "step": 16493 + }, + { + "epoch": 4.13, + "grad_norm": 3.8440663814544678, + "learning_rate": 7.341832508164948e-07, + "logits/chosen": -0.5851048231124878, + "logits/rejected": -0.6457653045654297, + "logps/chosen": -64.9964370727539, + "logps/rejected": -99.11738586425781, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934185743331909, + "rewards/margins": 6.185546875, + "rewards/rejected": -3.2513608932495117, + "step": 16494 + }, + { + "epoch": 4.13, + "grad_norm": 2.775329113006592, + "learning_rate": 7.337732979044998e-07, + "logits/chosen": -0.5235669016838074, + "logits/rejected": -0.6239209771156311, + "logps/chosen": -51.905487060546875, + "logps/rejected": -98.67825317382812, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3752198219299316, + "rewards/margins": 7.2439866065979, + "rewards/rejected": -3.868767738342285, + "step": 16495 + }, + { + "epoch": 4.13, + "grad_norm": 5.124736309051514, + "learning_rate": 7.333634504155012e-07, + "logits/chosen": -0.5037960410118103, + "logits/rejected": -0.5507843494415283, + "logps/chosen": -44.55358123779297, + "logps/rejected": -111.06704711914062, + "loss": 0.5104, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1348717212677, + "rewards/margins": 7.7829976081848145, + "rewards/rejected": -4.648126602172852, + "step": 16496 + }, + { + "epoch": 4.13, + "grad_norm": 2.2897775173187256, + "learning_rate": 7.329537083596305e-07, + "logits/chosen": -0.5293832421302795, + "logits/rejected": -0.6136583089828491, + "logps/chosen": -56.59798812866211, + "logps/rejected": -143.18264770507812, + "loss": 0.4968, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.359915018081665, + "rewards/margins": 10.009082794189453, + "rewards/rejected": -6.649167060852051, + "step": 16497 + }, + { + "epoch": 4.13, + "grad_norm": 7.179948806762695, + "learning_rate": 7.325440717470134e-07, + "logits/chosen": -0.5619902014732361, + "logits/rejected": -0.6511661410331726, + "logps/chosen": -62.9408073425293, + "logps/rejected": -107.69178771972656, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4284236431121826, + "rewards/margins": 8.022187232971191, + "rewards/rejected": -4.59376335144043, + "step": 16498 + }, + { + "epoch": 4.13, + "grad_norm": 5.5479559898376465, + "learning_rate": 7.321345405877711e-07, + "logits/chosen": -0.6019902229309082, + "logits/rejected": -0.6659100651741028, + "logps/chosen": -48.81250762939453, + "logps/rejected": -125.03602600097656, + "loss": 0.6118, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0763845443725586, + "rewards/margins": 7.442931175231934, + "rewards/rejected": -4.366546630859375, + "step": 16499 + }, + { + "epoch": 4.13, + "grad_norm": 7.842352867126465, + "learning_rate": 7.317251148920223e-07, + "logits/chosen": -0.49965304136276245, + "logits/rejected": -0.5906782150268555, + "logps/chosen": -61.36555480957031, + "logps/rejected": -95.35589599609375, + "loss": 0.6191, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115123748779297, + "rewards/margins": 6.767524242401123, + "rewards/rejected": -3.652399778366089, + "step": 16500 + }, + { + "epoch": 4.13, + "grad_norm": 4.953773021697998, + "learning_rate": 7.313157946698879e-07, + "logits/chosen": -0.49964797496795654, + "logits/rejected": -0.5953284502029419, + "logps/chosen": -52.07131576538086, + "logps/rejected": -99.77610778808594, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2057549953460693, + "rewards/margins": 6.280874252319336, + "rewards/rejected": -3.0751192569732666, + "step": 16501 + }, + { + "epoch": 4.13, + "grad_norm": 3.8633649349212646, + "learning_rate": 7.309065799314801e-07, + "logits/chosen": -0.5460444688796997, + "logits/rejected": -0.6089130640029907, + "logps/chosen": -49.71226501464844, + "logps/rejected": -114.26498413085938, + "loss": 0.5653, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9769904613494873, + "rewards/margins": 8.641129493713379, + "rewards/rejected": -5.6641387939453125, + "step": 16502 + }, + { + "epoch": 4.13, + "grad_norm": 4.409076690673828, + "learning_rate": 7.3049747068691e-07, + "logits/chosen": -0.4586677551269531, + "logits/rejected": -0.5390755534172058, + "logps/chosen": -61.00849533081055, + "logps/rejected": -106.07478332519531, + "loss": 0.6481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5484657287597656, + "rewards/margins": 7.238264560699463, + "rewards/rejected": -3.68979811668396, + "step": 16503 + }, + { + "epoch": 4.13, + "grad_norm": 6.192152500152588, + "learning_rate": 7.300884669462905e-07, + "logits/chosen": -0.5373167991638184, + "logits/rejected": -0.5942510366439819, + "logps/chosen": -39.53821563720703, + "logps/rejected": -106.42312622070312, + "loss": 0.6001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.953073024749756, + "rewards/margins": 6.706210136413574, + "rewards/rejected": -3.7531371116638184, + "step": 16504 + }, + { + "epoch": 4.13, + "grad_norm": 54.03258514404297, + "learning_rate": 7.29679568719725e-07, + "logits/chosen": -0.49623292684555054, + "logits/rejected": -0.607349157333374, + "logps/chosen": -65.1535873413086, + "logps/rejected": -89.8348388671875, + "loss": 0.7228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7954280376434326, + "rewards/margins": 6.282505512237549, + "rewards/rejected": -3.4870777130126953, + "step": 16505 + }, + { + "epoch": 4.13, + "grad_norm": 5.628083229064941, + "learning_rate": 7.292707760173212e-07, + "logits/chosen": -0.625007152557373, + "logits/rejected": -0.6713752746582031, + "logps/chosen": -46.36479568481445, + "logps/rejected": -122.73492431640625, + "loss": 0.5706, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.011725664138794, + "rewards/margins": 7.7965240478515625, + "rewards/rejected": -4.784799098968506, + "step": 16506 + }, + { + "epoch": 4.13, + "grad_norm": 4.20181131362915, + "learning_rate": 7.288620888491782e-07, + "logits/chosen": -0.5009191036224365, + "logits/rejected": -0.617976725101471, + "logps/chosen": -58.3842887878418, + "logps/rejected": -103.28428649902344, + "loss": 0.5741, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2328858375549316, + "rewards/margins": 7.782886981964111, + "rewards/rejected": -4.5500006675720215, + "step": 16507 + }, + { + "epoch": 4.13, + "grad_norm": 3.858570098876953, + "learning_rate": 7.284535072253957e-07, + "logits/chosen": -0.5011289119720459, + "logits/rejected": -0.5587762594223022, + "logps/chosen": -60.827598571777344, + "logps/rejected": -125.74471282958984, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.361201286315918, + "rewards/margins": 8.390320777893066, + "rewards/rejected": -5.029118537902832, + "step": 16508 + }, + { + "epoch": 4.13, + "grad_norm": 3.651369333267212, + "learning_rate": 7.2804503115607e-07, + "logits/chosen": -0.5655887126922607, + "logits/rejected": -0.6531286239624023, + "logps/chosen": -49.64619827270508, + "logps/rejected": -144.75808715820312, + "loss": 0.5607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3804891109466553, + "rewards/margins": 9.889233589172363, + "rewards/rejected": -6.508745193481445, + "step": 16509 + }, + { + "epoch": 4.13, + "grad_norm": 8.461264610290527, + "learning_rate": 7.276366606512981e-07, + "logits/chosen": -0.4697076678276062, + "logits/rejected": -0.5642260909080505, + "logps/chosen": -57.23148727416992, + "logps/rejected": -106.43994140625, + "loss": 0.6578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0136971473693848, + "rewards/margins": 6.973489761352539, + "rewards/rejected": -3.9597928524017334, + "step": 16510 + }, + { + "epoch": 4.13, + "grad_norm": 8.744962692260742, + "learning_rate": 7.272283957211656e-07, + "logits/chosen": -0.6327089071273804, + "logits/rejected": -0.6856687664985657, + "logps/chosen": -51.01456832885742, + "logps/rejected": -109.82343292236328, + "loss": 0.6684, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3730149269104004, + "rewards/margins": 6.485047340393066, + "rewards/rejected": -3.112032175064087, + "step": 16511 + }, + { + "epoch": 4.13, + "grad_norm": 3.7772812843322754, + "learning_rate": 7.268202363757648e-07, + "logits/chosen": -0.532021164894104, + "logits/rejected": -0.6154651641845703, + "logps/chosen": -50.86359405517578, + "logps/rejected": -112.07585906982422, + "loss": 0.5453, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.908508777618408, + "rewards/margins": 7.494208812713623, + "rewards/rejected": -4.585700035095215, + "step": 16512 + }, + { + "epoch": 4.13, + "grad_norm": 4.587902069091797, + "learning_rate": 7.264121826251819e-07, + "logits/chosen": -0.5605629682540894, + "logits/rejected": -0.6077067852020264, + "logps/chosen": -53.66773986816406, + "logps/rejected": -112.4913330078125, + "loss": 0.5803, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.390885591506958, + "rewards/margins": 7.092309951782227, + "rewards/rejected": -3.7014243602752686, + "step": 16513 + }, + { + "epoch": 4.13, + "grad_norm": 3.0916340351104736, + "learning_rate": 7.260042344794999e-07, + "logits/chosen": -0.5079510807991028, + "logits/rejected": -0.608847975730896, + "logps/chosen": -64.47994995117188, + "logps/rejected": -121.80342102050781, + "loss": 0.6321, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9182167053222656, + "rewards/margins": 8.683549880981445, + "rewards/rejected": -5.76533317565918, + "step": 16514 + }, + { + "epoch": 4.13, + "grad_norm": 4.041347026824951, + "learning_rate": 7.255963919487979e-07, + "logits/chosen": -0.5520105361938477, + "logits/rejected": -0.6014242172241211, + "logps/chosen": -61.67429733276367, + "logps/rejected": -101.55657958984375, + "loss": 0.6632, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.983956813812256, + "rewards/margins": 6.48006010055542, + "rewards/rejected": -3.4961040019989014, + "step": 16515 + }, + { + "epoch": 4.13, + "grad_norm": 8.013797760009766, + "learning_rate": 7.251886550431564e-07, + "logits/chosen": -0.6175646185874939, + "logits/rejected": -0.6770281791687012, + "logps/chosen": -49.56259536743164, + "logps/rejected": -94.04137420654297, + "loss": 0.5354, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6041901111602783, + "rewards/margins": 7.3293585777282715, + "rewards/rejected": -3.725167989730835, + "step": 16516 + }, + { + "epoch": 4.13, + "grad_norm": 2.01676869392395, + "learning_rate": 7.247810237726504e-07, + "logits/chosen": -0.5302329659461975, + "logits/rejected": -0.6008929014205933, + "logps/chosen": -43.83733367919922, + "logps/rejected": -114.22187805175781, + "loss": 0.4809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.979452610015869, + "rewards/margins": 8.403543472290039, + "rewards/rejected": -5.42409086227417, + "step": 16517 + }, + { + "epoch": 4.13, + "grad_norm": 2.7818427085876465, + "learning_rate": 7.243734981473515e-07, + "logits/chosen": -0.5320454835891724, + "logits/rejected": -0.623261034488678, + "logps/chosen": -46.823326110839844, + "logps/rejected": -97.28028869628906, + "loss": 0.5473, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0216987133026123, + "rewards/margins": 8.258514404296875, + "rewards/rejected": -5.236815452575684, + "step": 16518 + }, + { + "epoch": 4.13, + "grad_norm": 2.8896584510803223, + "learning_rate": 7.239660781773328e-07, + "logits/chosen": -0.6081475615501404, + "logits/rejected": -0.6707900166511536, + "logps/chosen": -48.892765045166016, + "logps/rejected": -98.4864730834961, + "loss": 0.5363, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2501113414764404, + "rewards/margins": 6.544018268585205, + "rewards/rejected": -3.2939066886901855, + "step": 16519 + }, + { + "epoch": 4.13, + "grad_norm": 25.178817749023438, + "learning_rate": 7.235587638726599e-07, + "logits/chosen": -0.5620042681694031, + "logits/rejected": -0.6455371379852295, + "logps/chosen": -41.1151237487793, + "logps/rejected": -89.43595886230469, + "loss": 0.5566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1342196464538574, + "rewards/margins": 7.465447425842285, + "rewards/rejected": -4.3312273025512695, + "step": 16520 + }, + { + "epoch": 4.13, + "grad_norm": 3.1019794940948486, + "learning_rate": 7.231515552433976e-07, + "logits/chosen": -0.49264058470726013, + "logits/rejected": -0.5584269762039185, + "logps/chosen": -49.159385681152344, + "logps/rejected": -115.84636688232422, + "loss": 0.518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2830848693847656, + "rewards/margins": 8.054862022399902, + "rewards/rejected": -4.771777629852295, + "step": 16521 + }, + { + "epoch": 4.13, + "grad_norm": 4.725474834442139, + "learning_rate": 7.227444522996096e-07, + "logits/chosen": -0.5558875799179077, + "logits/rejected": -0.6438021063804626, + "logps/chosen": -60.95283126831055, + "logps/rejected": -104.20037078857422, + "loss": 0.6445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1167287826538086, + "rewards/margins": 6.541952133178711, + "rewards/rejected": -3.4252243041992188, + "step": 16522 + }, + { + "epoch": 4.13, + "grad_norm": 6.924704551696777, + "learning_rate": 7.223374550513567e-07, + "logits/chosen": -0.4982898235321045, + "logits/rejected": -0.5933621525764465, + "logps/chosen": -53.736602783203125, + "logps/rejected": -90.70591735839844, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9098873138427734, + "rewards/margins": 6.653385162353516, + "rewards/rejected": -3.743497848510742, + "step": 16523 + }, + { + "epoch": 4.13, + "grad_norm": 8.689972877502441, + "learning_rate": 7.219305635086949e-07, + "logits/chosen": -0.5116919875144958, + "logits/rejected": -0.5772230625152588, + "logps/chosen": -44.22645950317383, + "logps/rejected": -116.02696228027344, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.97430419921875, + "rewards/margins": 6.58615779876709, + "rewards/rejected": -3.611853837966919, + "step": 16524 + }, + { + "epoch": 4.13, + "grad_norm": 4.260871410369873, + "learning_rate": 7.215237776816786e-07, + "logits/chosen": -0.5371114015579224, + "logits/rejected": -0.6585828065872192, + "logps/chosen": -52.733272552490234, + "logps/rejected": -87.38002014160156, + "loss": 0.5663, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3386101722717285, + "rewards/margins": 6.777398109436035, + "rewards/rejected": -3.4387869834899902, + "step": 16525 + }, + { + "epoch": 4.13, + "grad_norm": 16.858747482299805, + "learning_rate": 7.211170975803611e-07, + "logits/chosen": -0.5528392791748047, + "logits/rejected": -0.6206398010253906, + "logps/chosen": -52.93339538574219, + "logps/rejected": -98.77513885498047, + "loss": 0.6436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.21409010887146, + "rewards/margins": 6.544973373413086, + "rewards/rejected": -3.330883026123047, + "step": 16526 + }, + { + "epoch": 4.13, + "grad_norm": 3.379746913909912, + "learning_rate": 7.207105232147921e-07, + "logits/chosen": -0.5363048911094666, + "logits/rejected": -0.6198208332061768, + "logps/chosen": -48.61888122558594, + "logps/rejected": -114.25232696533203, + "loss": 0.5299, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2662312984466553, + "rewards/margins": 7.878539085388184, + "rewards/rejected": -4.612307548522949, + "step": 16527 + }, + { + "epoch": 4.13, + "grad_norm": 4.144565582275391, + "learning_rate": 7.203040545950158e-07, + "logits/chosen": -0.534928560256958, + "logits/rejected": -0.6034278273582458, + "logps/chosen": -50.651832580566406, + "logps/rejected": -107.40448760986328, + "loss": 0.592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.142916202545166, + "rewards/margins": 7.28436803817749, + "rewards/rejected": -4.141451835632324, + "step": 16528 + }, + { + "epoch": 4.13, + "grad_norm": 4.5378241539001465, + "learning_rate": 7.198976917310801e-07, + "logits/chosen": -0.5080583691596985, + "logits/rejected": -0.5803164839744568, + "logps/chosen": -49.429100036621094, + "logps/rejected": -107.97174072265625, + "loss": 0.619, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.820815086364746, + "rewards/margins": 6.569179534912109, + "rewards/rejected": -3.7483644485473633, + "step": 16529 + }, + { + "epoch": 4.14, + "grad_norm": 6.953816890716553, + "learning_rate": 7.194914346330234e-07, + "logits/chosen": -0.5238826274871826, + "logits/rejected": -0.587304949760437, + "logps/chosen": -72.18157958984375, + "logps/rejected": -114.38874816894531, + "loss": 0.6645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1884799003601074, + "rewards/margins": 6.854720115661621, + "rewards/rejected": -3.6662402153015137, + "step": 16530 + }, + { + "epoch": 4.14, + "grad_norm": 3.6137595176696777, + "learning_rate": 7.19085283310888e-07, + "logits/chosen": -0.49567973613739014, + "logits/rejected": -0.6209104061126709, + "logps/chosen": -56.127784729003906, + "logps/rejected": -103.98917388916016, + "loss": 0.5821, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9829983711242676, + "rewards/margins": 7.9945454597473145, + "rewards/rejected": -5.011547565460205, + "step": 16531 + }, + { + "epoch": 4.14, + "grad_norm": 5.189240455627441, + "learning_rate": 7.186792377747082e-07, + "logits/chosen": -0.5139151811599731, + "logits/rejected": -0.649724006652832, + "logps/chosen": -58.46260070800781, + "logps/rejected": -100.56689453125, + "loss": 0.6241, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.247286319732666, + "rewards/margins": 7.523401260375977, + "rewards/rejected": -4.2761149406433105, + "step": 16532 + }, + { + "epoch": 4.14, + "grad_norm": 2.8276565074920654, + "learning_rate": 7.182732980345175e-07, + "logits/chosen": -0.5625989437103271, + "logits/rejected": -0.6541289687156677, + "logps/chosen": -55.94304656982422, + "logps/rejected": -100.72576904296875, + "loss": 0.6124, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.315495491027832, + "rewards/margins": 7.969089508056641, + "rewards/rejected": -4.653594970703125, + "step": 16533 + }, + { + "epoch": 4.14, + "grad_norm": 11.30039119720459, + "learning_rate": 7.178674641003474e-07, + "logits/chosen": -0.5774810314178467, + "logits/rejected": -0.6540431380271912, + "logps/chosen": -61.55979537963867, + "logps/rejected": -102.789306640625, + "loss": 0.639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0996975898742676, + "rewards/margins": 6.141323089599609, + "rewards/rejected": -3.0416247844696045, + "step": 16534 + }, + { + "epoch": 4.14, + "grad_norm": 3.932729482650757, + "learning_rate": 7.174617359822295e-07, + "logits/chosen": -0.4911969304084778, + "logits/rejected": -0.5582236647605896, + "logps/chosen": -63.01247787475586, + "logps/rejected": -100.48680877685547, + "loss": 0.5859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1692862510681152, + "rewards/margins": 6.298355579376221, + "rewards/rejected": -3.1290698051452637, + "step": 16535 + }, + { + "epoch": 4.14, + "grad_norm": 3.657398223876953, + "learning_rate": 7.170561136901844e-07, + "logits/chosen": -0.5140166878700256, + "logits/rejected": -0.6170904040336609, + "logps/chosen": -54.36750030517578, + "logps/rejected": -117.58381652832031, + "loss": 0.5545, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2065391540527344, + "rewards/margins": 8.21702766418457, + "rewards/rejected": -5.010488033294678, + "step": 16536 + }, + { + "epoch": 4.14, + "grad_norm": 3.769449472427368, + "learning_rate": 7.166505972342391e-07, + "logits/chosen": -0.6755145192146301, + "logits/rejected": -0.6788040399551392, + "logps/chosen": -50.37313461303711, + "logps/rejected": -127.68083953857422, + "loss": 0.6336, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2365634441375732, + "rewards/margins": 7.171188831329346, + "rewards/rejected": -3.9346261024475098, + "step": 16537 + }, + { + "epoch": 4.14, + "grad_norm": 3.6480681896209717, + "learning_rate": 7.162451866244153e-07, + "logits/chosen": -0.5300700664520264, + "logits/rejected": -0.5989157557487488, + "logps/chosen": -49.93144989013672, + "logps/rejected": -100.79277801513672, + "loss": 0.5916, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.207291603088379, + "rewards/margins": 7.1182637214660645, + "rewards/rejected": -3.910972833633423, + "step": 16538 + }, + { + "epoch": 4.14, + "grad_norm": 14.333098411560059, + "learning_rate": 7.158398818707268e-07, + "logits/chosen": -0.5586746335029602, + "logits/rejected": -0.6247410774230957, + "logps/chosen": -54.621604919433594, + "logps/rejected": -94.47830963134766, + "loss": 0.7062, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.09311580657959, + "rewards/margins": 6.1962409019470215, + "rewards/rejected": -3.1031246185302734, + "step": 16539 + }, + { + "epoch": 4.14, + "grad_norm": 2.3598296642303467, + "learning_rate": 7.154346829831921e-07, + "logits/chosen": -0.5302703380584717, + "logits/rejected": -0.5950948596000671, + "logps/chosen": -49.94675064086914, + "logps/rejected": -107.69010925292969, + "loss": 0.5547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0817277431488037, + "rewards/margins": 7.697910308837891, + "rewards/rejected": -4.61618185043335, + "step": 16540 + }, + { + "epoch": 4.14, + "grad_norm": 15.033602714538574, + "learning_rate": 7.150295899718252e-07, + "logits/chosen": -0.49503782391548157, + "logits/rejected": -0.6029330492019653, + "logps/chosen": -48.432044982910156, + "logps/rejected": -96.64803314208984, + "loss": 0.5655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0947113037109375, + "rewards/margins": 7.684055328369141, + "rewards/rejected": -4.589343547821045, + "step": 16541 + }, + { + "epoch": 4.14, + "grad_norm": 17.818172454833984, + "learning_rate": 7.146246028466342e-07, + "logits/chosen": -0.5170804262161255, + "logits/rejected": -0.5919398665428162, + "logps/chosen": -58.51430892944336, + "logps/rejected": -102.9449462890625, + "loss": 0.8341, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0005476474761963, + "rewards/margins": 6.696683406829834, + "rewards/rejected": -3.696135997772217, + "step": 16542 + }, + { + "epoch": 4.14, + "grad_norm": 2.795949697494507, + "learning_rate": 7.142197216176261e-07, + "logits/chosen": -0.5448856353759766, + "logits/rejected": -0.6568265557289124, + "logps/chosen": -53.80747604370117, + "logps/rejected": -115.35560607910156, + "loss": 0.5374, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.037827968597412, + "rewards/margins": 8.75567626953125, + "rewards/rejected": -5.717848777770996, + "step": 16543 + }, + { + "epoch": 4.14, + "grad_norm": 6.6141815185546875, + "learning_rate": 7.138149462948085e-07, + "logits/chosen": -0.542218804359436, + "logits/rejected": -0.6304913759231567, + "logps/chosen": -68.48818969726562, + "logps/rejected": -117.77484130859375, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.300508737564087, + "rewards/margins": 8.172956466674805, + "rewards/rejected": -4.8724470138549805, + "step": 16544 + }, + { + "epoch": 4.14, + "grad_norm": 4.551846027374268, + "learning_rate": 7.134102768881829e-07, + "logits/chosen": -0.6452959775924683, + "logits/rejected": -0.7397226691246033, + "logps/chosen": -59.35562515258789, + "logps/rejected": -102.10791778564453, + "loss": 0.5805, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0111963748931885, + "rewards/margins": 7.0092363357543945, + "rewards/rejected": -3.998039722442627, + "step": 16545 + }, + { + "epoch": 4.14, + "grad_norm": 4.960016250610352, + "learning_rate": 7.130057134077472e-07, + "logits/chosen": -0.552548348903656, + "logits/rejected": -0.5873574018478394, + "logps/chosen": -53.314605712890625, + "logps/rejected": -105.22103881835938, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9019644260406494, + "rewards/margins": 7.100702285766602, + "rewards/rejected": -4.198738098144531, + "step": 16546 + }, + { + "epoch": 4.14, + "grad_norm": 2.084833860397339, + "learning_rate": 7.126012558635021e-07, + "logits/chosen": -0.5608818531036377, + "logits/rejected": -0.654880166053772, + "logps/chosen": -66.10292053222656, + "logps/rejected": -90.83393859863281, + "loss": 0.629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3354671001434326, + "rewards/margins": 7.339613437652588, + "rewards/rejected": -4.004146099090576, + "step": 16547 + }, + { + "epoch": 4.14, + "grad_norm": 6.566831111907959, + "learning_rate": 7.121969042654387e-07, + "logits/chosen": -0.5724442601203918, + "logits/rejected": -0.6661931276321411, + "logps/chosen": -50.80864334106445, + "logps/rejected": -117.48171997070312, + "loss": 0.5893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.063021183013916, + "rewards/margins": 8.30278491973877, + "rewards/rejected": -5.239764213562012, + "step": 16548 + }, + { + "epoch": 4.14, + "grad_norm": 4.293820381164551, + "learning_rate": 7.117926586235513e-07, + "logits/chosen": -0.5851337909698486, + "logits/rejected": -0.6868208050727844, + "logps/chosen": -62.24423599243164, + "logps/rejected": -91.92058563232422, + "loss": 0.6218, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8945558071136475, + "rewards/margins": 5.97225284576416, + "rewards/rejected": -3.077697277069092, + "step": 16549 + }, + { + "epoch": 4.14, + "grad_norm": 4.756053447723389, + "learning_rate": 7.113885189478286e-07, + "logits/chosen": -0.465069442987442, + "logits/rejected": -0.5668126344680786, + "logps/chosen": -65.5066909790039, + "logps/rejected": -92.72024536132812, + "loss": 0.7293, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.871572732925415, + "rewards/margins": 5.901449680328369, + "rewards/rejected": -3.029876708984375, + "step": 16550 + }, + { + "epoch": 4.14, + "grad_norm": 3.735459089279175, + "learning_rate": 7.109844852482561e-07, + "logits/chosen": -0.5616961717605591, + "logits/rejected": -0.6487545967102051, + "logps/chosen": -48.40924072265625, + "logps/rejected": -101.19145202636719, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4530587196350098, + "rewards/margins": 7.988701343536377, + "rewards/rejected": -4.535643100738525, + "step": 16551 + }, + { + "epoch": 4.14, + "grad_norm": 5.278431415557861, + "learning_rate": 7.105805575348201e-07, + "logits/chosen": -0.5482056140899658, + "logits/rejected": -0.6169608235359192, + "logps/chosen": -57.382606506347656, + "logps/rejected": -112.45059204101562, + "loss": 0.6262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2306766510009766, + "rewards/margins": 8.436992645263672, + "rewards/rejected": -5.206316947937012, + "step": 16552 + }, + { + "epoch": 4.14, + "grad_norm": 3.937143087387085, + "learning_rate": 7.10176735817501e-07, + "logits/chosen": -0.5927315354347229, + "logits/rejected": -0.6671007871627808, + "logps/chosen": -46.6092529296875, + "logps/rejected": -95.71179962158203, + "loss": 0.6026, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0270631313323975, + "rewards/margins": 7.421457290649414, + "rewards/rejected": -4.3943939208984375, + "step": 16553 + }, + { + "epoch": 4.14, + "grad_norm": 3.9467129707336426, + "learning_rate": 7.097730201062764e-07, + "logits/chosen": -0.5519660711288452, + "logits/rejected": -0.656563401222229, + "logps/chosen": -67.57913208007812, + "logps/rejected": -119.73072814941406, + "loss": 0.6375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.00490403175354, + "rewards/margins": 8.262341499328613, + "rewards/rejected": -5.257437705993652, + "step": 16554 + }, + { + "epoch": 4.14, + "grad_norm": 6.361236095428467, + "learning_rate": 7.093694104111237e-07, + "logits/chosen": -0.5339471697807312, + "logits/rejected": -0.5906975865364075, + "logps/chosen": -50.63716125488281, + "logps/rejected": -93.73621368408203, + "loss": 0.6923, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9057912826538086, + "rewards/margins": 5.979445934295654, + "rewards/rejected": -3.0736541748046875, + "step": 16555 + }, + { + "epoch": 4.14, + "grad_norm": 4.385092258453369, + "learning_rate": 7.089659067420179e-07, + "logits/chosen": -0.5587483644485474, + "logits/rejected": -0.6283179521560669, + "logps/chosen": -60.38458251953125, + "logps/rejected": -115.78717041015625, + "loss": 0.6685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.482611894607544, + "rewards/margins": 8.241532325744629, + "rewards/rejected": -4.758920192718506, + "step": 16556 + }, + { + "epoch": 4.14, + "grad_norm": 5.317382335662842, + "learning_rate": 7.085625091089288e-07, + "logits/chosen": -0.5382431745529175, + "logits/rejected": -0.6306125521659851, + "logps/chosen": -62.14976501464844, + "logps/rejected": -98.77238464355469, + "loss": 0.7145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7198286056518555, + "rewards/margins": 6.537193775177002, + "rewards/rejected": -3.81736421585083, + "step": 16557 + }, + { + "epoch": 4.14, + "grad_norm": 5.098907947540283, + "learning_rate": 7.081592175218233e-07, + "logits/chosen": -0.5751979947090149, + "logits/rejected": -0.5952562093734741, + "logps/chosen": -59.07655334472656, + "logps/rejected": -127.3887939453125, + "loss": 0.6456, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1998164653778076, + "rewards/margins": 8.672022819519043, + "rewards/rejected": -5.472207069396973, + "step": 16558 + }, + { + "epoch": 4.14, + "grad_norm": 8.782886505126953, + "learning_rate": 7.077560319906696e-07, + "logits/chosen": -0.5343537330627441, + "logits/rejected": -0.5653346180915833, + "logps/chosen": -75.80976867675781, + "logps/rejected": -128.4117431640625, + "loss": 0.7217, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6950039863586426, + "rewards/margins": 6.991939067840576, + "rewards/rejected": -4.296934127807617, + "step": 16559 + }, + { + "epoch": 4.14, + "grad_norm": 8.345118522644043, + "learning_rate": 7.073529525254302e-07, + "logits/chosen": -0.6509274244308472, + "logits/rejected": -0.7200824022293091, + "logps/chosen": -46.81156539916992, + "logps/rejected": -104.47207641601562, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4418694972991943, + "rewards/margins": 7.366288661956787, + "rewards/rejected": -3.9244189262390137, + "step": 16560 + }, + { + "epoch": 4.14, + "grad_norm": 8.271177291870117, + "learning_rate": 7.069499791360634e-07, + "logits/chosen": -0.5765575170516968, + "logits/rejected": -0.611004114151001, + "logps/chosen": -61.28150177001953, + "logps/rejected": -99.25357055664062, + "loss": 0.7523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2135252952575684, + "rewards/margins": 5.6810302734375, + "rewards/rejected": -2.4675047397613525, + "step": 16561 + }, + { + "epoch": 4.14, + "grad_norm": 5.350466251373291, + "learning_rate": 7.065471118325306e-07, + "logits/chosen": -0.5602251887321472, + "logits/rejected": -0.6440631151199341, + "logps/chosen": -39.54171371459961, + "logps/rejected": -130.7478790283203, + "loss": 0.5747, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2150111198425293, + "rewards/margins": 8.72879409790039, + "rewards/rejected": -5.5137834548950195, + "step": 16562 + }, + { + "epoch": 4.14, + "grad_norm": 1.9484069347381592, + "learning_rate": 7.06144350624785e-07, + "logits/chosen": -0.5897526741027832, + "logits/rejected": -0.7113146781921387, + "logps/chosen": -69.01620483398438, + "logps/rejected": -101.03428649902344, + "loss": 0.5897, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6951849460601807, + "rewards/margins": 7.649501323699951, + "rewards/rejected": -4.954317569732666, + "step": 16563 + }, + { + "epoch": 4.14, + "grad_norm": 8.2998628616333, + "learning_rate": 7.057416955227791e-07, + "logits/chosen": -0.5632941126823425, + "logits/rejected": -0.6246477961540222, + "logps/chosen": -55.4849853515625, + "logps/rejected": -124.54966735839844, + "loss": 0.7089, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1650068759918213, + "rewards/margins": 7.600680351257324, + "rewards/rejected": -4.43567419052124, + "step": 16564 + }, + { + "epoch": 4.14, + "grad_norm": 7.469822883605957, + "learning_rate": 7.053391465364639e-07, + "logits/chosen": -0.5817652940750122, + "logits/rejected": -0.652803897857666, + "logps/chosen": -58.22127151489258, + "logps/rejected": -111.94182586669922, + "loss": 0.7129, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.073424816131592, + "rewards/margins": 7.775184154510498, + "rewards/rejected": -4.701758861541748, + "step": 16565 + }, + { + "epoch": 4.14, + "grad_norm": 9.46871566772461, + "learning_rate": 7.049367036757859e-07, + "logits/chosen": -0.5303283333778381, + "logits/rejected": -0.6080105304718018, + "logps/chosen": -51.97690200805664, + "logps/rejected": -107.14273071289062, + "loss": 0.602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0992119312286377, + "rewards/margins": 7.0579047203063965, + "rewards/rejected": -3.958693027496338, + "step": 16566 + }, + { + "epoch": 4.14, + "grad_norm": 4.883347988128662, + "learning_rate": 7.04534366950691e-07, + "logits/chosen": -0.4865086078643799, + "logits/rejected": -0.5868527293205261, + "logps/chosen": -62.97551345825195, + "logps/rejected": -105.59619140625, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0522396564483643, + "rewards/margins": 5.966476917266846, + "rewards/rejected": -2.9142367839813232, + "step": 16567 + }, + { + "epoch": 4.14, + "grad_norm": 19.60321617126465, + "learning_rate": 7.041321363711201e-07, + "logits/chosen": -0.5405203104019165, + "logits/rejected": -0.6084499359130859, + "logps/chosen": -55.16411209106445, + "logps/rejected": -111.45799255371094, + "loss": 0.5763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1435179710388184, + "rewards/margins": 7.251144886016846, + "rewards/rejected": -4.107626438140869, + "step": 16568 + }, + { + "epoch": 4.14, + "grad_norm": 1.934831142425537, + "learning_rate": 7.037300119470142e-07, + "logits/chosen": -0.584697961807251, + "logits/rejected": -0.6475663185119629, + "logps/chosen": -52.700138092041016, + "logps/rejected": -127.27365112304688, + "loss": 0.5645, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9691731929779053, + "rewards/margins": 9.026790618896484, + "rewards/rejected": -6.057616710662842, + "step": 16569 + }, + { + "epoch": 4.15, + "grad_norm": 30.35779571533203, + "learning_rate": 7.033279936883097e-07, + "logits/chosen": -0.6010414361953735, + "logits/rejected": -0.6764056086540222, + "logps/chosen": -54.65156555175781, + "logps/rejected": -108.13064575195312, + "loss": 0.6809, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.151430130004883, + "rewards/margins": 7.84148645401001, + "rewards/rejected": -4.690056324005127, + "step": 16570 + }, + { + "epoch": 4.15, + "grad_norm": 2.2895052433013916, + "learning_rate": 7.029260816049393e-07, + "logits/chosen": -0.5596154928207397, + "logits/rejected": -0.572169303894043, + "logps/chosen": -50.96348571777344, + "logps/rejected": -127.89518737792969, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.201140880584717, + "rewards/margins": 7.8404951095581055, + "rewards/rejected": -4.6393537521362305, + "step": 16571 + }, + { + "epoch": 4.15, + "grad_norm": 27.724000930786133, + "learning_rate": 7.025242757068368e-07, + "logits/chosen": -0.5613675713539124, + "logits/rejected": -0.5976566672325134, + "logps/chosen": -60.783538818359375, + "logps/rejected": -128.81951904296875, + "loss": 0.7396, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2066822052001953, + "rewards/margins": 7.660543441772461, + "rewards/rejected": -4.453861236572266, + "step": 16572 + }, + { + "epoch": 4.15, + "grad_norm": 3.1929121017456055, + "learning_rate": 7.021225760039297e-07, + "logits/chosen": -0.5351006388664246, + "logits/rejected": -0.6385804414749146, + "logps/chosen": -63.22807312011719, + "logps/rejected": -91.68852233886719, + "loss": 0.5667, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7834596633911133, + "rewards/margins": 7.379683971405029, + "rewards/rejected": -4.596224308013916, + "step": 16573 + }, + { + "epoch": 4.15, + "grad_norm": 2.5534496307373047, + "learning_rate": 7.017209825061461e-07, + "logits/chosen": -0.5604255199432373, + "logits/rejected": -0.6118578314781189, + "logps/chosen": -44.83934020996094, + "logps/rejected": -104.469482421875, + "loss": 0.544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1804089546203613, + "rewards/margins": 7.307922840118408, + "rewards/rejected": -4.127513885498047, + "step": 16574 + }, + { + "epoch": 4.15, + "grad_norm": 3.5970840454101562, + "learning_rate": 7.013194952234092e-07, + "logits/chosen": -0.5964967012405396, + "logits/rejected": -0.6861244440078735, + "logps/chosen": -51.436622619628906, + "logps/rejected": -106.31134033203125, + "loss": 0.6064, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1267900466918945, + "rewards/margins": 7.627875328063965, + "rewards/rejected": -4.501084327697754, + "step": 16575 + }, + { + "epoch": 4.15, + "grad_norm": 3.2305643558502197, + "learning_rate": 7.009181141656385e-07, + "logits/chosen": -0.4992256760597229, + "logits/rejected": -0.6328847408294678, + "logps/chosen": -58.583290100097656, + "logps/rejected": -116.26597595214844, + "loss": 0.5334, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2438113689422607, + "rewards/margins": 8.45311450958252, + "rewards/rejected": -5.2093024253845215, + "step": 16576 + }, + { + "epoch": 4.15, + "grad_norm": 4.8139262199401855, + "learning_rate": 7.005168393427553e-07, + "logits/chosen": -0.5581876039505005, + "logits/rejected": -0.6526869535446167, + "logps/chosen": -48.640541076660156, + "logps/rejected": -95.60494232177734, + "loss": 0.6188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1556813716888428, + "rewards/margins": 7.548031806945801, + "rewards/rejected": -4.392349720001221, + "step": 16577 + }, + { + "epoch": 4.15, + "grad_norm": 4.527629375457764, + "learning_rate": 7.001156707646739e-07, + "logits/chosen": -0.48762649297714233, + "logits/rejected": -0.5652113556861877, + "logps/chosen": -47.762245178222656, + "logps/rejected": -85.32848358154297, + "loss": 0.5954, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1650476455688477, + "rewards/margins": 6.217199802398682, + "rewards/rejected": -3.052152633666992, + "step": 16578 + }, + { + "epoch": 4.15, + "grad_norm": 2.946129083633423, + "learning_rate": 6.997146084413065e-07, + "logits/chosen": -0.5658628940582275, + "logits/rejected": -0.617624044418335, + "logps/chosen": -53.41641616821289, + "logps/rejected": -119.85177612304688, + "loss": 0.6011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.299314022064209, + "rewards/margins": 7.681146621704102, + "rewards/rejected": -4.381833076477051, + "step": 16579 + }, + { + "epoch": 4.15, + "grad_norm": 3.841475248336792, + "learning_rate": 6.993136523825655e-07, + "logits/chosen": -0.6415867805480957, + "logits/rejected": -0.7695018649101257, + "logps/chosen": -52.919612884521484, + "logps/rejected": -103.24102020263672, + "loss": 0.5948, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.699923038482666, + "rewards/margins": 7.780819892883301, + "rewards/rejected": -5.080896854400635, + "step": 16580 + }, + { + "epoch": 4.15, + "grad_norm": 7.357598304748535, + "learning_rate": 6.989128025983604e-07, + "logits/chosen": -0.5154301524162292, + "logits/rejected": -0.6036214232444763, + "logps/chosen": -53.19533920288086, + "logps/rejected": -93.2987289428711, + "loss": 0.6187, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161163806915283, + "rewards/margins": 7.370997905731201, + "rewards/rejected": -4.209834575653076, + "step": 16581 + }, + { + "epoch": 4.15, + "grad_norm": 2.4414381980895996, + "learning_rate": 6.985120590985928e-07, + "logits/chosen": -0.5269536375999451, + "logits/rejected": -0.614366888999939, + "logps/chosen": -53.268741607666016, + "logps/rejected": -120.40725708007812, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3844282627105713, + "rewards/margins": 7.477105617523193, + "rewards/rejected": -4.092677593231201, + "step": 16582 + }, + { + "epoch": 4.15, + "grad_norm": 12.037043571472168, + "learning_rate": 6.981114218931678e-07, + "logits/chosen": -0.5851716995239258, + "logits/rejected": -0.6243584156036377, + "logps/chosen": -49.34939956665039, + "logps/rejected": -113.7340087890625, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4013514518737793, + "rewards/margins": 6.704370975494385, + "rewards/rejected": -4.303020000457764, + "step": 16583 + }, + { + "epoch": 4.15, + "grad_norm": 4.3921308517456055, + "learning_rate": 6.97710890991986e-07, + "logits/chosen": -0.5464184284210205, + "logits/rejected": -0.6314224004745483, + "logps/chosen": -43.26581573486328, + "logps/rejected": -98.295654296875, + "loss": 0.5538, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.187382936477661, + "rewards/margins": 6.9349894523620605, + "rewards/rejected": -3.7476065158843994, + "step": 16584 + }, + { + "epoch": 4.15, + "grad_norm": 3.9372756481170654, + "learning_rate": 6.97310466404944e-07, + "logits/chosen": -0.5339865684509277, + "logits/rejected": -0.6342943906784058, + "logps/chosen": -55.65690231323242, + "logps/rejected": -113.14293670654297, + "loss": 0.6057, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.567913055419922, + "rewards/margins": 7.691354274749756, + "rewards/rejected": -5.123441696166992, + "step": 16585 + }, + { + "epoch": 4.15, + "grad_norm": 18.535751342773438, + "learning_rate": 6.969101481419361e-07, + "logits/chosen": -0.46752315759658813, + "logits/rejected": -0.6067626476287842, + "logps/chosen": -58.832332611083984, + "logps/rejected": -112.42457580566406, + "loss": 0.607, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1744465827941895, + "rewards/margins": 8.737852096557617, + "rewards/rejected": -5.5634050369262695, + "step": 16586 + }, + { + "epoch": 4.15, + "grad_norm": 2.2682182788848877, + "learning_rate": 6.965099362128564e-07, + "logits/chosen": -0.5271511077880859, + "logits/rejected": -0.6296720504760742, + "logps/chosen": -50.430877685546875, + "logps/rejected": -90.23816680908203, + "loss": 0.5136, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2159998416900635, + "rewards/margins": 7.043585777282715, + "rewards/rejected": -3.8275864124298096, + "step": 16587 + }, + { + "epoch": 4.15, + "grad_norm": 2.2450764179229736, + "learning_rate": 6.961098306275938e-07, + "logits/chosen": -0.5316139459609985, + "logits/rejected": -0.6051087379455566, + "logps/chosen": -53.264774322509766, + "logps/rejected": -123.75687408447266, + "loss": 0.5997, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3039801120758057, + "rewards/margins": 8.025181770324707, + "rewards/rejected": -4.721201419830322, + "step": 16588 + }, + { + "epoch": 4.15, + "grad_norm": 5.939836025238037, + "learning_rate": 6.957098313960337e-07, + "logits/chosen": -0.5201593041419983, + "logits/rejected": -0.5790970325469971, + "logps/chosen": -52.62388610839844, + "logps/rejected": -125.49348449707031, + "loss": 0.6091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1769301891326904, + "rewards/margins": 8.485452651977539, + "rewards/rejected": -5.3085222244262695, + "step": 16589 + }, + { + "epoch": 4.15, + "grad_norm": 7.671715259552002, + "learning_rate": 6.953099385280632e-07, + "logits/chosen": -0.6038644313812256, + "logits/rejected": -0.6488220691680908, + "logps/chosen": -61.75701141357422, + "logps/rejected": -91.16148376464844, + "loss": 0.7907, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1675446033477783, + "rewards/margins": 5.715763092041016, + "rewards/rejected": -2.5482187271118164, + "step": 16590 + }, + { + "epoch": 4.15, + "grad_norm": 4.167182922363281, + "learning_rate": 6.949101520335616e-07, + "logits/chosen": -0.4998086094856262, + "logits/rejected": -0.588615357875824, + "logps/chosen": -52.055545806884766, + "logps/rejected": -105.92762756347656, + "loss": 0.5859, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.567551374435425, + "rewards/margins": 7.725912570953369, + "rewards/rejected": -4.158360958099365, + "step": 16591 + }, + { + "epoch": 4.15, + "grad_norm": 8.553227424621582, + "learning_rate": 6.945104719224105e-07, + "logits/chosen": -0.5897085666656494, + "logits/rejected": -0.5981603264808655, + "logps/chosen": -56.032684326171875, + "logps/rejected": -115.71804809570312, + "loss": 0.7399, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2897391319274902, + "rewards/margins": 6.4765448570251465, + "rewards/rejected": -3.186805486679077, + "step": 16592 + }, + { + "epoch": 4.15, + "grad_norm": 4.6687397956848145, + "learning_rate": 6.941108982044853e-07, + "logits/chosen": -0.5065131187438965, + "logits/rejected": -0.6141509413719177, + "logps/chosen": -55.63071060180664, + "logps/rejected": -84.53914642333984, + "loss": 0.6981, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.261211633682251, + "rewards/margins": 6.27001428604126, + "rewards/rejected": -3.0088021755218506, + "step": 16593 + }, + { + "epoch": 4.15, + "grad_norm": 3.5647706985473633, + "learning_rate": 6.937114308896581e-07, + "logits/chosen": -0.5554619431495667, + "logits/rejected": -0.6310780644416809, + "logps/chosen": -44.213409423828125, + "logps/rejected": -111.84837341308594, + "loss": 0.5506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2501635551452637, + "rewards/margins": 8.22662353515625, + "rewards/rejected": -4.976460933685303, + "step": 16594 + }, + { + "epoch": 4.15, + "grad_norm": 2.501267910003662, + "learning_rate": 6.933120699878032e-07, + "logits/chosen": -0.5573149919509888, + "logits/rejected": -0.6536254286766052, + "logps/chosen": -59.74317932128906, + "logps/rejected": -92.05391693115234, + "loss": 0.5876, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0760202407836914, + "rewards/margins": 7.372837066650391, + "rewards/rejected": -4.296816349029541, + "step": 16595 + }, + { + "epoch": 4.15, + "grad_norm": 3.6791579723358154, + "learning_rate": 6.929128155087878e-07, + "logits/chosen": -0.5680652856826782, + "logits/rejected": -0.6688714623451233, + "logps/chosen": -55.13641357421875, + "logps/rejected": -101.02359771728516, + "loss": 0.6071, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148343324661255, + "rewards/margins": 6.508910655975342, + "rewards/rejected": -3.3605682849884033, + "step": 16596 + }, + { + "epoch": 4.15, + "grad_norm": 4.625962734222412, + "learning_rate": 6.925136674624772e-07, + "logits/chosen": -0.485676646232605, + "logits/rejected": -0.5890235304832458, + "logps/chosen": -51.47913360595703, + "logps/rejected": -98.29940795898438, + "loss": 0.5793, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1861796379089355, + "rewards/margins": 7.349861145019531, + "rewards/rejected": -4.163681507110596, + "step": 16597 + }, + { + "epoch": 4.15, + "grad_norm": 6.300489902496338, + "learning_rate": 6.921146258587353e-07, + "logits/chosen": -0.5708218812942505, + "logits/rejected": -0.6682931184768677, + "logps/chosen": -52.837615966796875, + "logps/rejected": -109.38764953613281, + "loss": 0.5919, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9601385593414307, + "rewards/margins": 7.340617656707764, + "rewards/rejected": -4.380478858947754, + "step": 16598 + }, + { + "epoch": 4.15, + "grad_norm": 2.8087730407714844, + "learning_rate": 6.917156907074241e-07, + "logits/chosen": -0.5650335550308228, + "logits/rejected": -0.6287744045257568, + "logps/chosen": -39.60124588012695, + "logps/rejected": -98.7181396484375, + "loss": 0.6061, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.174872398376465, + "rewards/margins": 6.757174491882324, + "rewards/rejected": -3.5823020935058594, + "step": 16599 + }, + { + "epoch": 4.15, + "grad_norm": 19.190505981445312, + "learning_rate": 6.913168620184008e-07, + "logits/chosen": -0.5680095553398132, + "logits/rejected": -0.6192010045051575, + "logps/chosen": -68.45541381835938, + "logps/rejected": -117.79000854492188, + "loss": 0.7421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0837512016296387, + "rewards/margins": 6.162634372711182, + "rewards/rejected": -3.078882932662964, + "step": 16600 + }, + { + "epoch": 4.15, + "grad_norm": 3.2191827297210693, + "learning_rate": 6.909181398015191e-07, + "logits/chosen": -0.5802042484283447, + "logits/rejected": -0.6774367690086365, + "logps/chosen": -58.589290618896484, + "logps/rejected": -127.95944213867188, + "loss": 0.6107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0381574630737305, + "rewards/margins": 9.083536148071289, + "rewards/rejected": -6.045378684997559, + "step": 16601 + }, + { + "epoch": 4.15, + "grad_norm": 3.5156469345092773, + "learning_rate": 6.905195240666351e-07, + "logits/chosen": -0.5609323382377625, + "logits/rejected": -0.6300182342529297, + "logps/chosen": -48.43472671508789, + "logps/rejected": -112.26151275634766, + "loss": 0.6133, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0378856658935547, + "rewards/margins": 7.375714302062988, + "rewards/rejected": -4.337828159332275, + "step": 16602 + }, + { + "epoch": 4.15, + "grad_norm": 3.365952730178833, + "learning_rate": 6.901210148235976e-07, + "logits/chosen": -0.6352107524871826, + "logits/rejected": -0.7227504253387451, + "logps/chosen": -41.2330322265625, + "logps/rejected": -87.3265380859375, + "loss": 0.5795, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.229527473449707, + "rewards/margins": 7.0066423416137695, + "rewards/rejected": -3.777114152908325, + "step": 16603 + }, + { + "epoch": 4.15, + "grad_norm": 4.484745025634766, + "learning_rate": 6.897226120822525e-07, + "logits/chosen": -0.45842283964157104, + "logits/rejected": -0.557701587677002, + "logps/chosen": -50.95470428466797, + "logps/rejected": -100.5200424194336, + "loss": 0.5908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.00850510597229, + "rewards/margins": 7.5131659507751465, + "rewards/rejected": -4.504660606384277, + "step": 16604 + }, + { + "epoch": 4.15, + "grad_norm": 5.831976413726807, + "learning_rate": 6.893243158524476e-07, + "logits/chosen": -0.5508168339729309, + "logits/rejected": -0.6209571361541748, + "logps/chosen": -54.772300720214844, + "logps/rejected": -96.20697021484375, + "loss": 0.6616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.29066801071167, + "rewards/margins": 6.199378490447998, + "rewards/rejected": -2.908710479736328, + "step": 16605 + }, + { + "epoch": 4.15, + "grad_norm": 4.03079891204834, + "learning_rate": 6.889261261440239e-07, + "logits/chosen": -0.5221766233444214, + "logits/rejected": -0.5741623044013977, + "logps/chosen": -55.158164978027344, + "logps/rejected": -95.7906723022461, + "loss": 0.6247, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2778284549713135, + "rewards/margins": 6.154633045196533, + "rewards/rejected": -2.8768045902252197, + "step": 16606 + }, + { + "epoch": 4.15, + "grad_norm": 16.299768447875977, + "learning_rate": 6.885280429668201e-07, + "logits/chosen": -0.5401188731193542, + "logits/rejected": -0.6497845649719238, + "logps/chosen": -51.43684005737305, + "logps/rejected": -113.84864044189453, + "loss": 0.6387, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.5285515785217285, + "rewards/margins": 8.043180465698242, + "rewards/rejected": -4.514629364013672, + "step": 16607 + }, + { + "epoch": 4.15, + "grad_norm": 5.122636318206787, + "learning_rate": 6.88130066330675e-07, + "logits/chosen": -0.6015530824661255, + "logits/rejected": -0.6916685700416565, + "logps/chosen": -47.2206916809082, + "logps/rejected": -93.84681701660156, + "loss": 0.6015, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017806053161621, + "rewards/margins": 7.959123134613037, + "rewards/rejected": -4.941318035125732, + "step": 16608 + }, + { + "epoch": 4.15, + "grad_norm": 5.983414173126221, + "learning_rate": 6.877321962454225e-07, + "logits/chosen": -0.5338215231895447, + "logits/rejected": -0.6331754326820374, + "logps/chosen": -57.81254959106445, + "logps/rejected": -121.79837799072266, + "loss": 0.6005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.208502769470215, + "rewards/margins": 7.840517520904541, + "rewards/rejected": -4.632014751434326, + "step": 16609 + }, + { + "epoch": 4.16, + "grad_norm": 4.889910697937012, + "learning_rate": 6.87334432720893e-07, + "logits/chosen": -0.462802916765213, + "logits/rejected": -0.5121541023254395, + "logps/chosen": -73.09135437011719, + "logps/rejected": -143.74183654785156, + "loss": 0.6356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.213829278945923, + "rewards/margins": 8.833234786987305, + "rewards/rejected": -5.619405269622803, + "step": 16610 + }, + { + "epoch": 4.16, + "grad_norm": 3.6025431156158447, + "learning_rate": 6.869367757669171e-07, + "logits/chosen": -0.5974015593528748, + "logits/rejected": -0.667866587638855, + "logps/chosen": -87.73306274414062, + "logps/rejected": -119.10099792480469, + "loss": 0.6097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1572601795196533, + "rewards/margins": 7.808172225952148, + "rewards/rejected": -4.650912284851074, + "step": 16611 + }, + { + "epoch": 4.16, + "grad_norm": 3.273660659790039, + "learning_rate": 6.865392253933218e-07, + "logits/chosen": -0.46943986415863037, + "logits/rejected": -0.5287888646125793, + "logps/chosen": -64.02847290039062, + "logps/rejected": -110.17689514160156, + "loss": 0.6442, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270477294921875, + "rewards/margins": 6.970459461212158, + "rewards/rejected": -3.699981927871704, + "step": 16612 + }, + { + "epoch": 4.16, + "grad_norm": 6.581147193908691, + "learning_rate": 6.861417816099302e-07, + "logits/chosen": -0.5641428232192993, + "logits/rejected": -0.6493914127349854, + "logps/chosen": -51.570831298828125, + "logps/rejected": -99.64239501953125, + "loss": 0.5592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0299649238586426, + "rewards/margins": 7.573756217956543, + "rewards/rejected": -4.543790817260742, + "step": 16613 + }, + { + "epoch": 4.16, + "grad_norm": 3.511388063430786, + "learning_rate": 6.857444444265626e-07, + "logits/chosen": -0.5712379813194275, + "logits/rejected": -0.6998828053474426, + "logps/chosen": -57.83176803588867, + "logps/rejected": -107.65383911132812, + "loss": 0.5727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4456593990325928, + "rewards/margins": 7.768290996551514, + "rewards/rejected": -4.3226318359375, + "step": 16614 + }, + { + "epoch": 4.16, + "grad_norm": 8.573177337646484, + "learning_rate": 6.853472138530398e-07, + "logits/chosen": -0.5845032930374146, + "logits/rejected": -0.6827890276908875, + "logps/chosen": -50.729740142822266, + "logps/rejected": -117.31433868408203, + "loss": 0.6562, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.173760175704956, + "rewards/margins": 8.404047012329102, + "rewards/rejected": -5.230287551879883, + "step": 16615 + }, + { + "epoch": 4.16, + "grad_norm": 5.340972900390625, + "learning_rate": 6.849500898991757e-07, + "logits/chosen": -0.5516610741615295, + "logits/rejected": -0.588988721370697, + "logps/chosen": -50.58577346801758, + "logps/rejected": -121.04768371582031, + "loss": 0.5965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8113605976104736, + "rewards/margins": 7.16553258895874, + "rewards/rejected": -4.354172706604004, + "step": 16616 + }, + { + "epoch": 4.16, + "grad_norm": 2.635160207748413, + "learning_rate": 6.845530725747851e-07, + "logits/chosen": -0.5416396856307983, + "logits/rejected": -0.6181212067604065, + "logps/chosen": -47.42784118652344, + "logps/rejected": -97.78070831298828, + "loss": 0.553, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4747583866119385, + "rewards/margins": 6.965604305267334, + "rewards/rejected": -3.4908459186553955, + "step": 16617 + }, + { + "epoch": 4.16, + "grad_norm": 17.93675994873047, + "learning_rate": 6.841561618896786e-07, + "logits/chosen": -0.560844898223877, + "logits/rejected": -0.590075671672821, + "logps/chosen": -49.40847396850586, + "logps/rejected": -108.11660766601562, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0999207496643066, + "rewards/margins": 6.1846537590026855, + "rewards/rejected": -3.084733247756958, + "step": 16618 + }, + { + "epoch": 4.16, + "grad_norm": 3.155076503753662, + "learning_rate": 6.837593578536628e-07, + "logits/chosen": -0.5960169434547424, + "logits/rejected": -0.6294909715652466, + "logps/chosen": -38.497779846191406, + "logps/rejected": -108.77394104003906, + "loss": 0.5065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8476102352142334, + "rewards/margins": 7.2218217849731445, + "rewards/rejected": -3.3742117881774902, + "step": 16619 + }, + { + "epoch": 4.16, + "grad_norm": 2.6279966831207275, + "learning_rate": 6.833626604765454e-07, + "logits/chosen": -0.6980912685394287, + "logits/rejected": -0.7502371072769165, + "logps/chosen": -43.49736785888672, + "logps/rejected": -117.91324615478516, + "loss": 0.5856, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.275745153427124, + "rewards/margins": 9.357839584350586, + "rewards/rejected": -6.082094192504883, + "step": 16620 + }, + { + "epoch": 4.16, + "grad_norm": 5.475236892700195, + "learning_rate": 6.829660697681278e-07, + "logits/chosen": -0.6150662899017334, + "logits/rejected": -0.670911431312561, + "logps/chosen": -41.003395080566406, + "logps/rejected": -104.24476623535156, + "loss": 0.6193, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.233124256134033, + "rewards/margins": 6.170361518859863, + "rewards/rejected": -2.937236785888672, + "step": 16621 + }, + { + "epoch": 4.16, + "grad_norm": 3.3616678714752197, + "learning_rate": 6.825695857382092e-07, + "logits/chosen": -0.6014354228973389, + "logits/rejected": -0.7093163132667542, + "logps/chosen": -56.768218994140625, + "logps/rejected": -119.21307373046875, + "loss": 0.5886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4443416595458984, + "rewards/margins": 8.43770694732666, + "rewards/rejected": -4.993365287780762, + "step": 16622 + }, + { + "epoch": 4.16, + "grad_norm": 3.3416733741760254, + "learning_rate": 6.821732083965888e-07, + "logits/chosen": -0.5354673266410828, + "logits/rejected": -0.6105220913887024, + "logps/chosen": -53.29985809326172, + "logps/rejected": -96.50892639160156, + "loss": 0.6182, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.199723720550537, + "rewards/margins": 6.858357906341553, + "rewards/rejected": -3.6586344242095947, + "step": 16623 + }, + { + "epoch": 4.16, + "grad_norm": 4.493473052978516, + "learning_rate": 6.817769377530631e-07, + "logits/chosen": -0.5221346616744995, + "logits/rejected": -0.6140402555465698, + "logps/chosen": -55.71478271484375, + "logps/rejected": -111.6105728149414, + "loss": 0.6122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.815639019012451, + "rewards/margins": 7.156510353088379, + "rewards/rejected": -4.3408708572387695, + "step": 16624 + }, + { + "epoch": 4.16, + "grad_norm": 2.9728403091430664, + "learning_rate": 6.813807738174199e-07, + "logits/chosen": -0.6015096306800842, + "logits/rejected": -0.6470604538917542, + "logps/chosen": -50.64715576171875, + "logps/rejected": -110.88811492919922, + "loss": 0.622, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6689252853393555, + "rewards/margins": 6.9170403480529785, + "rewards/rejected": -4.248115062713623, + "step": 16625 + }, + { + "epoch": 4.16, + "grad_norm": 2.6282496452331543, + "learning_rate": 6.809847165994515e-07, + "logits/chosen": -0.5288031101226807, + "logits/rejected": -0.5572214722633362, + "logps/chosen": -49.2653923034668, + "logps/rejected": -126.12521362304688, + "loss": 0.5315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4289565086364746, + "rewards/margins": 8.40529727935791, + "rewards/rejected": -4.976340293884277, + "step": 16626 + }, + { + "epoch": 4.16, + "grad_norm": 53.784217834472656, + "learning_rate": 6.805887661089467e-07, + "logits/chosen": -0.5486021637916565, + "logits/rejected": -0.6432000398635864, + "logps/chosen": -56.90266799926758, + "logps/rejected": -119.68965148925781, + "loss": 0.8018, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.940704107284546, + "rewards/margins": 6.497547149658203, + "rewards/rejected": -3.5568430423736572, + "step": 16627 + }, + { + "epoch": 4.16, + "grad_norm": 5.307334899902344, + "learning_rate": 6.801929223556858e-07, + "logits/chosen": -0.47101670503616333, + "logits/rejected": -0.5936464071273804, + "logps/chosen": -70.48921966552734, + "logps/rejected": -102.31513214111328, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0246522426605225, + "rewards/margins": 7.233639717102051, + "rewards/rejected": -4.208987236022949, + "step": 16628 + }, + { + "epoch": 4.16, + "grad_norm": 5.76077938079834, + "learning_rate": 6.797971853494522e-07, + "logits/chosen": -0.5799911022186279, + "logits/rejected": -0.6646620035171509, + "logps/chosen": -66.66190338134766, + "logps/rejected": -114.94795989990234, + "loss": 0.5674, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6518654823303223, + "rewards/margins": 6.903512954711914, + "rewards/rejected": -4.251646995544434, + "step": 16629 + }, + { + "epoch": 4.16, + "grad_norm": 2.781005382537842, + "learning_rate": 6.794015551000271e-07, + "logits/chosen": -0.45182183384895325, + "logits/rejected": -0.5392730236053467, + "logps/chosen": -48.85427474975586, + "logps/rejected": -97.25456237792969, + "loss": 0.5435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3926868438720703, + "rewards/margins": 7.483325004577637, + "rewards/rejected": -4.090638160705566, + "step": 16630 + }, + { + "epoch": 4.16, + "grad_norm": 2.9295434951782227, + "learning_rate": 6.790060316171848e-07, + "logits/chosen": -0.543831467628479, + "logits/rejected": -0.6060411334037781, + "logps/chosen": -51.956172943115234, + "logps/rejected": -116.2662353515625, + "loss": 0.5798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3946869373321533, + "rewards/margins": 7.282474517822266, + "rewards/rejected": -3.887787342071533, + "step": 16631 + }, + { + "epoch": 4.16, + "grad_norm": 5.060517311096191, + "learning_rate": 6.78610614910698e-07, + "logits/chosen": -0.6274478435516357, + "logits/rejected": -0.6602842807769775, + "logps/chosen": -45.72404098510742, + "logps/rejected": -114.27994537353516, + "loss": 0.5715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0961015224456787, + "rewards/margins": 7.855268478393555, + "rewards/rejected": -4.759166717529297, + "step": 16632 + }, + { + "epoch": 4.16, + "grad_norm": 2.8681466579437256, + "learning_rate": 6.782153049903411e-07, + "logits/chosen": -0.558301568031311, + "logits/rejected": -0.6482720971107483, + "logps/chosen": -49.7888069152832, + "logps/rejected": -102.15119934082031, + "loss": 0.5418, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4950642585754395, + "rewards/margins": 7.845388889312744, + "rewards/rejected": -4.350325107574463, + "step": 16633 + }, + { + "epoch": 4.16, + "grad_norm": 3.8966572284698486, + "learning_rate": 6.778201018658803e-07, + "logits/chosen": -0.5896333456039429, + "logits/rejected": -0.6287789940834045, + "logps/chosen": -57.95658493041992, + "logps/rejected": -108.63109588623047, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4631149768829346, + "rewards/margins": 7.460181713104248, + "rewards/rejected": -3.997067451477051, + "step": 16634 + }, + { + "epoch": 4.16, + "grad_norm": 2.651784658432007, + "learning_rate": 6.77425005547081e-07, + "logits/chosen": -0.5661734938621521, + "logits/rejected": -0.6391632556915283, + "logps/chosen": -57.18564987182617, + "logps/rejected": -90.0830307006836, + "loss": 0.7049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.559004545211792, + "rewards/margins": 6.933709144592285, + "rewards/rejected": -3.374704360961914, + "step": 16635 + }, + { + "epoch": 4.16, + "grad_norm": 4.445462226867676, + "learning_rate": 6.770300160437088e-07, + "logits/chosen": -0.5582655072212219, + "logits/rejected": -0.6037132143974304, + "logps/chosen": -54.730831146240234, + "logps/rejected": -111.93905639648438, + "loss": 0.7324, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.854531764984131, + "rewards/margins": 7.032863140106201, + "rewards/rejected": -4.17833137512207, + "step": 16636 + }, + { + "epoch": 4.16, + "grad_norm": 7.782927513122559, + "learning_rate": 6.766351333655219e-07, + "logits/chosen": -0.5615242719650269, + "logits/rejected": -0.6475931406021118, + "logps/chosen": -50.91402816772461, + "logps/rejected": -105.81268310546875, + "loss": 0.5739, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0385589599609375, + "rewards/margins": 7.411884307861328, + "rewards/rejected": -4.373325824737549, + "step": 16637 + }, + { + "epoch": 4.16, + "grad_norm": 4.338449954986572, + "learning_rate": 6.762403575222809e-07, + "logits/chosen": -0.6298765540122986, + "logits/rejected": -0.687303900718689, + "logps/chosen": -46.32981872558594, + "logps/rejected": -107.58383178710938, + "loss": 0.5647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.312969446182251, + "rewards/margins": 7.467885971069336, + "rewards/rejected": -4.154916286468506, + "step": 16638 + }, + { + "epoch": 4.16, + "grad_norm": 5.468535900115967, + "learning_rate": 6.758456885237391e-07, + "logits/chosen": -0.517445981502533, + "logits/rejected": -0.590093731880188, + "logps/chosen": -51.913570404052734, + "logps/rejected": -95.25314331054688, + "loss": 0.6171, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093510627746582, + "rewards/margins": 6.78316068649292, + "rewards/rejected": -3.689650535583496, + "step": 16639 + }, + { + "epoch": 4.16, + "grad_norm": 4.29091739654541, + "learning_rate": 6.754511263796487e-07, + "logits/chosen": -0.6077131032943726, + "logits/rejected": -0.7070095539093018, + "logps/chosen": -48.56588363647461, + "logps/rejected": -106.51748657226562, + "loss": 0.5763, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2831978797912598, + "rewards/margins": 8.533550262451172, + "rewards/rejected": -5.250353813171387, + "step": 16640 + }, + { + "epoch": 4.16, + "grad_norm": 3.9159836769104004, + "learning_rate": 6.750566710997608e-07, + "logits/chosen": -0.5584667921066284, + "logits/rejected": -0.6008517146110535, + "logps/chosen": -51.86359786987305, + "logps/rejected": -120.22075653076172, + "loss": 0.5133, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.538363218307495, + "rewards/margins": 7.4625091552734375, + "rewards/rejected": -3.9241461753845215, + "step": 16641 + }, + { + "epoch": 4.16, + "grad_norm": 11.546252250671387, + "learning_rate": 6.746623226938237e-07, + "logits/chosen": -0.4578838348388672, + "logits/rejected": -0.5689530968666077, + "logps/chosen": -47.94095993041992, + "logps/rejected": -106.74918365478516, + "loss": 0.5035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1862552165985107, + "rewards/margins": 6.840800762176514, + "rewards/rejected": -3.654546022415161, + "step": 16642 + }, + { + "epoch": 4.16, + "grad_norm": 6.134631633758545, + "learning_rate": 6.742680811715819e-07, + "logits/chosen": -0.5892959237098694, + "logits/rejected": -0.6848769187927246, + "logps/chosen": -59.810672760009766, + "logps/rejected": -96.18775939941406, + "loss": 0.6794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0512888431549072, + "rewards/margins": 7.213278293609619, + "rewards/rejected": -4.161989212036133, + "step": 16643 + }, + { + "epoch": 4.16, + "grad_norm": 6.584635257720947, + "learning_rate": 6.738739465427752e-07, + "logits/chosen": -0.5114601850509644, + "logits/rejected": -0.6183745265007019, + "logps/chosen": -53.05010223388672, + "logps/rejected": -101.12621307373047, + "loss": 0.6278, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9660468101501465, + "rewards/margins": 7.439033508300781, + "rewards/rejected": -4.472986698150635, + "step": 16644 + }, + { + "epoch": 4.16, + "grad_norm": 2.48982572555542, + "learning_rate": 6.734799188171459e-07, + "logits/chosen": -0.5693979263305664, + "logits/rejected": -0.6550182104110718, + "logps/chosen": -49.202598571777344, + "logps/rejected": -113.51757049560547, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1749722957611084, + "rewards/margins": 8.391654968261719, + "rewards/rejected": -5.216683387756348, + "step": 16645 + }, + { + "epoch": 4.16, + "grad_norm": 4.132091045379639, + "learning_rate": 6.730859980044297e-07, + "logits/chosen": -0.559831976890564, + "logits/rejected": -0.6955663561820984, + "logps/chosen": -63.561309814453125, + "logps/rejected": -108.45967102050781, + "loss": 0.6763, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2436585426330566, + "rewards/margins": 9.134928703308105, + "rewards/rejected": -5.891269683837891, + "step": 16646 + }, + { + "epoch": 4.16, + "grad_norm": 3.654167413711548, + "learning_rate": 6.726921841143597e-07, + "logits/chosen": -0.580180823802948, + "logits/rejected": -0.6545776128768921, + "logps/chosen": -43.625877380371094, + "logps/rejected": -116.71080017089844, + "loss": 0.5049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2359814643859863, + "rewards/margins": 7.694502830505371, + "rewards/rejected": -4.458522319793701, + "step": 16647 + }, + { + "epoch": 4.16, + "grad_norm": 4.940796375274658, + "learning_rate": 6.722984771566693e-07, + "logits/chosen": -0.4823843240737915, + "logits/rejected": -0.5805564522743225, + "logps/chosen": -62.54765701293945, + "logps/rejected": -104.90025329589844, + "loss": 0.6134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1591861248016357, + "rewards/margins": 6.336743354797363, + "rewards/rejected": -3.1775577068328857, + "step": 16648 + }, + { + "epoch": 4.16, + "grad_norm": 4.650238990783691, + "learning_rate": 6.719048771410874e-07, + "logits/chosen": -0.5289669632911682, + "logits/rejected": -0.6364657282829285, + "logps/chosen": -55.3861083984375, + "logps/rejected": -112.32341003417969, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4549005031585693, + "rewards/margins": 7.905807971954346, + "rewards/rejected": -4.4509077072143555, + "step": 16649 + }, + { + "epoch": 4.17, + "grad_norm": 2.264939308166504, + "learning_rate": 6.71511384077338e-07, + "logits/chosen": -0.5300235152244568, + "logits/rejected": -0.6025246381759644, + "logps/chosen": -52.506080627441406, + "logps/rejected": -104.39491271972656, + "loss": 0.5896, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270204544067383, + "rewards/margins": 8.181663513183594, + "rewards/rejected": -4.911459922790527, + "step": 16650 + }, + { + "epoch": 4.17, + "grad_norm": 2.9802041053771973, + "learning_rate": 6.711179979751475e-07, + "logits/chosen": -0.5110460519790649, + "logits/rejected": -0.5726831555366516, + "logps/chosen": -64.46742248535156, + "logps/rejected": -107.18572998046875, + "loss": 0.6648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.415668249130249, + "rewards/margins": 7.567506790161133, + "rewards/rejected": -4.151839256286621, + "step": 16651 + }, + { + "epoch": 4.17, + "grad_norm": 2.43674373626709, + "learning_rate": 6.707247188442356e-07, + "logits/chosen": -0.5659166574478149, + "logits/rejected": -0.6283743381500244, + "logps/chosen": -46.72909927368164, + "logps/rejected": -109.31227111816406, + "loss": 0.5385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1093125343322754, + "rewards/margins": 7.428926467895508, + "rewards/rejected": -4.319613456726074, + "step": 16652 + }, + { + "epoch": 4.17, + "grad_norm": 4.092193603515625, + "learning_rate": 6.703315466943199e-07, + "logits/chosen": -0.5252836346626282, + "logits/rejected": -0.6558434963226318, + "logps/chosen": -57.67243957519531, + "logps/rejected": -89.85416412353516, + "loss": 0.7145, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.333852529525757, + "rewards/margins": 6.719452857971191, + "rewards/rejected": -3.3856000900268555, + "step": 16653 + }, + { + "epoch": 4.17, + "grad_norm": 8.689693450927734, + "learning_rate": 6.699384815351167e-07, + "logits/chosen": -0.6089925169944763, + "logits/rejected": -0.6626930236816406, + "logps/chosen": -57.386009216308594, + "logps/rejected": -106.84571075439453, + "loss": 0.6781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.958153247833252, + "rewards/margins": 6.166153907775879, + "rewards/rejected": -3.208001136779785, + "step": 16654 + }, + { + "epoch": 4.17, + "grad_norm": 26.872028350830078, + "learning_rate": 6.695455233763404e-07, + "logits/chosen": -0.5185704827308655, + "logits/rejected": -0.5869086384773254, + "logps/chosen": -58.9940071105957, + "logps/rejected": -96.02397918701172, + "loss": 0.7513, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8115601539611816, + "rewards/margins": 6.234428882598877, + "rewards/rejected": -3.4228696823120117, + "step": 16655 + }, + { + "epoch": 4.17, + "grad_norm": 2.954678773880005, + "learning_rate": 6.691526722277003e-07, + "logits/chosen": -0.4812166392803192, + "logits/rejected": -0.5478769540786743, + "logps/chosen": -50.010250091552734, + "logps/rejected": -124.7156982421875, + "loss": 0.5097, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.368483781814575, + "rewards/margins": 7.668540000915527, + "rewards/rejected": -4.300055980682373, + "step": 16656 + }, + { + "epoch": 4.17, + "grad_norm": 6.734416484832764, + "learning_rate": 6.687599280989032e-07, + "logits/chosen": -0.6056868433952332, + "logits/rejected": -0.6669210195541382, + "logps/chosen": -55.9615592956543, + "logps/rejected": -115.6265640258789, + "loss": 0.7255, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7630362510681152, + "rewards/margins": 8.173211097717285, + "rewards/rejected": -5.410175323486328, + "step": 16657 + }, + { + "epoch": 4.17, + "grad_norm": 4.558076858520508, + "learning_rate": 6.683672909996559e-07, + "logits/chosen": -0.5141546130180359, + "logits/rejected": -0.592740535736084, + "logps/chosen": -55.6866340637207, + "logps/rejected": -103.01747131347656, + "loss": 0.6046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.07336163520813, + "rewards/margins": 7.063451766967773, + "rewards/rejected": -3.9900901317596436, + "step": 16658 + }, + { + "epoch": 4.17, + "grad_norm": 1.9141563177108765, + "learning_rate": 6.679747609396603e-07, + "logits/chosen": -0.5388849973678589, + "logits/rejected": -0.6227262020111084, + "logps/chosen": -43.57618713378906, + "logps/rejected": -108.86365509033203, + "loss": 0.4854, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0732829570770264, + "rewards/margins": 8.532172203063965, + "rewards/rejected": -5.458889007568359, + "step": 16659 + }, + { + "epoch": 4.17, + "grad_norm": 4.888642311096191, + "learning_rate": 6.675823379286151e-07, + "logits/chosen": -0.5153089165687561, + "logits/rejected": -0.6184877753257751, + "logps/chosen": -45.54117965698242, + "logps/rejected": -105.00801849365234, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.260636806488037, + "rewards/margins": 7.462621688842773, + "rewards/rejected": -4.201984882354736, + "step": 16660 + }, + { + "epoch": 4.17, + "grad_norm": 3.0804200172424316, + "learning_rate": 6.6719002197622e-07, + "logits/chosen": -0.514188289642334, + "logits/rejected": -0.5981022715568542, + "logps/chosen": -51.277557373046875, + "logps/rejected": -110.44462585449219, + "loss": 0.6381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9242959022521973, + "rewards/margins": 7.743506908416748, + "rewards/rejected": -4.819210052490234, + "step": 16661 + }, + { + "epoch": 4.17, + "grad_norm": 5.602231025695801, + "learning_rate": 6.667978130921665e-07, + "logits/chosen": -0.5649803876876831, + "logits/rejected": -0.6143640875816345, + "logps/chosen": -50.508880615234375, + "logps/rejected": -101.88227844238281, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.184641122817993, + "rewards/margins": 6.656497478485107, + "rewards/rejected": -3.4718565940856934, + "step": 16662 + }, + { + "epoch": 4.17, + "grad_norm": 7.644747734069824, + "learning_rate": 6.664057112861489e-07, + "logits/chosen": -0.5342984795570374, + "logits/rejected": -0.6507457494735718, + "logps/chosen": -55.330474853515625, + "logps/rejected": -98.27156066894531, + "loss": 0.784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1551032066345215, + "rewards/margins": 7.615973472595215, + "rewards/rejected": -4.460870265960693, + "step": 16663 + }, + { + "epoch": 4.17, + "grad_norm": 3.2241263389587402, + "learning_rate": 6.660137165678559e-07, + "logits/chosen": -0.5746558308601379, + "logits/rejected": -0.6277506947517395, + "logps/chosen": -54.493160247802734, + "logps/rejected": -116.15144348144531, + "loss": 0.615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.092623710632324, + "rewards/margins": 5.95673942565918, + "rewards/rejected": -2.8641152381896973, + "step": 16664 + }, + { + "epoch": 4.17, + "grad_norm": 4.411528587341309, + "learning_rate": 6.656218289469729e-07, + "logits/chosen": -0.5769219398498535, + "logits/rejected": -0.640357255935669, + "logps/chosen": -49.01522445678711, + "logps/rejected": -112.90496826171875, + "loss": 0.5202, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8447232246398926, + "rewards/margins": 8.002852439880371, + "rewards/rejected": -5.158129692077637, + "step": 16665 + }, + { + "epoch": 4.17, + "grad_norm": 7.149940490722656, + "learning_rate": 6.652300484331848e-07, + "logits/chosen": -0.5760191679000854, + "logits/rejected": -0.6066420674324036, + "logps/chosen": -54.46919250488281, + "logps/rejected": -109.4619140625, + "loss": 0.7204, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0094618797302246, + "rewards/margins": 6.268527030944824, + "rewards/rejected": -3.2590644359588623, + "step": 16666 + }, + { + "epoch": 4.17, + "grad_norm": 5.039179801940918, + "learning_rate": 6.648383750361748e-07, + "logits/chosen": -0.5349007844924927, + "logits/rejected": -0.6255306005477905, + "logps/chosen": -56.393531799316406, + "logps/rejected": -105.11277770996094, + "loss": 0.6473, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.025391101837158, + "rewards/margins": 6.895479679107666, + "rewards/rejected": -3.870089054107666, + "step": 16667 + }, + { + "epoch": 4.17, + "grad_norm": 9.260462760925293, + "learning_rate": 6.64446808765617e-07, + "logits/chosen": -0.4819638133049011, + "logits/rejected": -0.5491099953651428, + "logps/chosen": -49.393287658691406, + "logps/rejected": -87.00572204589844, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.310288906097412, + "rewards/margins": 6.412148475646973, + "rewards/rejected": -3.1018590927124023, + "step": 16668 + }, + { + "epoch": 4.17, + "grad_norm": 6.740200042724609, + "learning_rate": 6.640553496311908e-07, + "logits/chosen": -0.5815228223800659, + "logits/rejected": -0.6415690183639526, + "logps/chosen": -52.7235221862793, + "logps/rejected": -111.17874908447266, + "loss": 0.627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3097727298736572, + "rewards/margins": 7.196037292480469, + "rewards/rejected": -3.886263847351074, + "step": 16669 + }, + { + "epoch": 4.17, + "grad_norm": 3.1118264198303223, + "learning_rate": 6.636639976425707e-07, + "logits/chosen": -0.553970217704773, + "logits/rejected": -0.6443464756011963, + "logps/chosen": -47.34980010986328, + "logps/rejected": -115.07554626464844, + "loss": 0.5536, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.025820732116699, + "rewards/margins": 7.499734401702881, + "rewards/rejected": -4.473913669586182, + "step": 16670 + }, + { + "epoch": 4.17, + "grad_norm": 4.144894123077393, + "learning_rate": 6.632727528094235e-07, + "logits/chosen": -0.535712480545044, + "logits/rejected": -0.6608402132987976, + "logps/chosen": -54.020503997802734, + "logps/rejected": -96.59293365478516, + "loss": 0.5923, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.965646982192993, + "rewards/margins": 7.159819602966309, + "rewards/rejected": -4.194173336029053, + "step": 16671 + }, + { + "epoch": 4.17, + "grad_norm": 5.810588836669922, + "learning_rate": 6.628816151414191e-07, + "logits/chosen": -0.5709882974624634, + "logits/rejected": -0.6768268346786499, + "logps/chosen": -52.104583740234375, + "logps/rejected": -97.55193328857422, + "loss": 0.6639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.022113084793091, + "rewards/margins": 7.162985324859619, + "rewards/rejected": -4.140872001647949, + "step": 16672 + }, + { + "epoch": 4.17, + "grad_norm": 3.1747703552246094, + "learning_rate": 6.624905846482243e-07, + "logits/chosen": -0.5846608281135559, + "logits/rejected": -0.6326510906219482, + "logps/chosen": -41.783409118652344, + "logps/rejected": -110.71098327636719, + "loss": 0.5646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4260525703430176, + "rewards/margins": 8.049209594726562, + "rewards/rejected": -4.623157024383545, + "step": 16673 + }, + { + "epoch": 4.17, + "grad_norm": 3.217972755432129, + "learning_rate": 6.620996613395009e-07, + "logits/chosen": -0.4598924219608307, + "logits/rejected": -0.548360288143158, + "logps/chosen": -61.53376388549805, + "logps/rejected": -106.64208984375, + "loss": 0.664, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8254008293151855, + "rewards/margins": 6.855497360229492, + "rewards/rejected": -4.03009557723999, + "step": 16674 + }, + { + "epoch": 4.17, + "grad_norm": 5.2002034187316895, + "learning_rate": 6.617088452249076e-07, + "logits/chosen": -0.5825662612915039, + "logits/rejected": -0.6272766590118408, + "logps/chosen": -65.60665893554688, + "logps/rejected": -135.79849243164062, + "loss": 0.7243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.758527994155884, + "rewards/margins": 6.882957458496094, + "rewards/rejected": -4.124428749084473, + "step": 16675 + }, + { + "epoch": 4.17, + "grad_norm": 6.652989864349365, + "learning_rate": 6.613181363141041e-07, + "logits/chosen": -0.5131301283836365, + "logits/rejected": -0.6031792163848877, + "logps/chosen": -56.13335418701172, + "logps/rejected": -124.75787353515625, + "loss": 0.6281, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.207430601119995, + "rewards/margins": 7.729269027709961, + "rewards/rejected": -4.521838665008545, + "step": 16676 + }, + { + "epoch": 4.17, + "grad_norm": 6.752265453338623, + "learning_rate": 6.609275346167443e-07, + "logits/chosen": -0.5811258554458618, + "logits/rejected": -0.6493009924888611, + "logps/chosen": -50.55057907104492, + "logps/rejected": -106.97665405273438, + "loss": 0.6399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2028422355651855, + "rewards/margins": 7.399781703948975, + "rewards/rejected": -4.196939945220947, + "step": 16677 + }, + { + "epoch": 4.17, + "grad_norm": 5.366381645202637, + "learning_rate": 6.605370401424799e-07, + "logits/chosen": -0.5248607397079468, + "logits/rejected": -0.5800517797470093, + "logps/chosen": -65.99847412109375, + "logps/rejected": -105.88630676269531, + "loss": 0.6946, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.190493106842041, + "rewards/margins": 7.311964511871338, + "rewards/rejected": -4.1214704513549805, + "step": 16678 + }, + { + "epoch": 4.17, + "grad_norm": 3.3544490337371826, + "learning_rate": 6.601466529009615e-07, + "logits/chosen": -0.5039141774177551, + "logits/rejected": -0.5971344709396362, + "logps/chosen": -47.58934020996094, + "logps/rejected": -108.8216552734375, + "loss": 0.5652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.115687370300293, + "rewards/margins": 8.231772422790527, + "rewards/rejected": -5.116084575653076, + "step": 16679 + }, + { + "epoch": 4.17, + "grad_norm": 4.5596795082092285, + "learning_rate": 6.597563729018341e-07, + "logits/chosen": -0.44945406913757324, + "logits/rejected": -0.570283055305481, + "logps/chosen": -78.1969985961914, + "logps/rejected": -96.66102600097656, + "loss": 0.7005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7016005516052246, + "rewards/margins": 5.861112594604492, + "rewards/rejected": -3.1595115661621094, + "step": 16680 + }, + { + "epoch": 4.17, + "grad_norm": 2.398817300796509, + "learning_rate": 6.593662001547446e-07, + "logits/chosen": -0.509371817111969, + "logits/rejected": -0.6128758788108826, + "logps/chosen": -54.146114349365234, + "logps/rejected": -110.85389709472656, + "loss": 0.5387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1462416648864746, + "rewards/margins": 7.6673383712768555, + "rewards/rejected": -4.521096706390381, + "step": 16681 + }, + { + "epoch": 4.17, + "grad_norm": 5.043103218078613, + "learning_rate": 6.589761346693329e-07, + "logits/chosen": -0.5481007695198059, + "logits/rejected": -0.6114507913589478, + "logps/chosen": -48.11143112182617, + "logps/rejected": -100.75093078613281, + "loss": 0.5446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.040846109390259, + "rewards/margins": 6.123934745788574, + "rewards/rejected": -3.0830888748168945, + "step": 16682 + }, + { + "epoch": 4.17, + "grad_norm": 4.552803039550781, + "learning_rate": 6.585861764552376e-07, + "logits/chosen": -0.5383713841438293, + "logits/rejected": -0.62871915102005, + "logps/chosen": -51.451683044433594, + "logps/rejected": -99.83075714111328, + "loss": 0.6087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.155858039855957, + "rewards/margins": 6.377532005310059, + "rewards/rejected": -3.2216739654541016, + "step": 16683 + }, + { + "epoch": 4.17, + "grad_norm": 6.854371547698975, + "learning_rate": 6.581963255220963e-07, + "logits/chosen": -0.4862837791442871, + "logits/rejected": -0.5255711078643799, + "logps/chosen": -53.88227081298828, + "logps/rejected": -101.596923828125, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8486692905426025, + "rewards/margins": 5.8927106857299805, + "rewards/rejected": -3.0440409183502197, + "step": 16684 + }, + { + "epoch": 4.17, + "grad_norm": 5.24702787399292, + "learning_rate": 6.578065818795415e-07, + "logits/chosen": -0.5680862069129944, + "logits/rejected": -0.6509791612625122, + "logps/chosen": -59.542266845703125, + "logps/rejected": -107.75379180908203, + "loss": 0.7195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.289487838745117, + "rewards/margins": 7.20660400390625, + "rewards/rejected": -3.917116165161133, + "step": 16685 + }, + { + "epoch": 4.17, + "grad_norm": 8.506890296936035, + "learning_rate": 6.574169455372048e-07, + "logits/chosen": -0.5695964097976685, + "logits/rejected": -0.6528472900390625, + "logps/chosen": -53.90634536743164, + "logps/rejected": -102.2428970336914, + "loss": 0.6129, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0355918407440186, + "rewards/margins": 7.243143081665039, + "rewards/rejected": -4.2075514793396, + "step": 16686 + }, + { + "epoch": 4.17, + "grad_norm": 12.153691291809082, + "learning_rate": 6.570274165047142e-07, + "logits/chosen": -0.5694841146469116, + "logits/rejected": -0.6468327045440674, + "logps/chosen": -62.697444915771484, + "logps/rejected": -135.6935272216797, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0133988857269287, + "rewards/margins": 7.691469192504883, + "rewards/rejected": -4.678070068359375, + "step": 16687 + }, + { + "epoch": 4.17, + "grad_norm": 4.244734764099121, + "learning_rate": 6.566379947916956e-07, + "logits/chosen": -0.5632146596908569, + "logits/rejected": -0.6600632071495056, + "logps/chosen": -54.469825744628906, + "logps/rejected": -99.13778686523438, + "loss": 0.6013, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.394892454147339, + "rewards/margins": 7.120556831359863, + "rewards/rejected": -3.7256641387939453, + "step": 16688 + }, + { + "epoch": 4.17, + "grad_norm": 4.236939907073975, + "learning_rate": 6.562486804077728e-07, + "logits/chosen": -0.5156373977661133, + "logits/rejected": -0.6442559361457825, + "logps/chosen": -54.420433044433594, + "logps/rejected": -92.7820816040039, + "loss": 0.581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.978362560272217, + "rewards/margins": 7.696288585662842, + "rewards/rejected": -4.717925071716309, + "step": 16689 + }, + { + "epoch": 4.18, + "grad_norm": 25.897085189819336, + "learning_rate": 6.55859473362564e-07, + "logits/chosen": -0.6023532152175903, + "logits/rejected": -0.7120765447616577, + "logps/chosen": -59.450557708740234, + "logps/rejected": -99.15145111083984, + "loss": 0.7743, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9162588119506836, + "rewards/margins": 7.010537624359131, + "rewards/rejected": -4.094278812408447, + "step": 16690 + }, + { + "epoch": 4.18, + "grad_norm": 10.396002769470215, + "learning_rate": 6.554703736656892e-07, + "logits/chosen": -0.5453768968582153, + "logits/rejected": -0.5797949433326721, + "logps/chosen": -44.02156448364258, + "logps/rejected": -99.76526641845703, + "loss": 0.7148, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.860013961791992, + "rewards/margins": 6.492620468139648, + "rewards/rejected": -3.6326072216033936, + "step": 16691 + }, + { + "epoch": 4.18, + "grad_norm": 8.452186584472656, + "learning_rate": 6.550813813267625e-07, + "logits/chosen": -0.5108455419540405, + "logits/rejected": -0.5959848761558533, + "logps/chosen": -56.58278274536133, + "logps/rejected": -115.34565734863281, + "loss": 0.5889, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.392756223678589, + "rewards/margins": 7.798466682434082, + "rewards/rejected": -4.405710220336914, + "step": 16692 + }, + { + "epoch": 4.18, + "grad_norm": 2.5747148990631104, + "learning_rate": 6.546924963553953e-07, + "logits/chosen": -0.6063568592071533, + "logits/rejected": -0.7128915190696716, + "logps/chosen": -57.108489990234375, + "logps/rejected": -91.75503540039062, + "loss": 0.5208, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.700018882751465, + "rewards/margins": 8.249176979064941, + "rewards/rejected": -4.549158096313477, + "step": 16693 + }, + { + "epoch": 4.18, + "grad_norm": 7.5443034172058105, + "learning_rate": 6.543037187611995e-07, + "logits/chosen": -0.557075023651123, + "logits/rejected": -0.6459276676177979, + "logps/chosen": -54.240562438964844, + "logps/rejected": -96.37582397460938, + "loss": 0.7447, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.420412063598633, + "rewards/margins": 6.740633010864258, + "rewards/rejected": -4.320219993591309, + "step": 16694 + }, + { + "epoch": 4.18, + "grad_norm": 3.5589795112609863, + "learning_rate": 6.53915048553781e-07, + "logits/chosen": -0.7042503952980042, + "logits/rejected": -0.7732342481613159, + "logps/chosen": -42.06663513183594, + "logps/rejected": -110.68405151367188, + "loss": 0.5635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.982673168182373, + "rewards/margins": 7.147418022155762, + "rewards/rejected": -4.164744853973389, + "step": 16695 + }, + { + "epoch": 4.18, + "grad_norm": 18.569433212280273, + "learning_rate": 6.535264857427437e-07, + "logits/chosen": -0.4556483328342438, + "logits/rejected": -0.5941144824028015, + "logps/chosen": -68.43270874023438, + "logps/rejected": -96.80414581298828, + "loss": 0.7134, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.260164499282837, + "rewards/margins": 6.983786582946777, + "rewards/rejected": -3.7236223220825195, + "step": 16696 + }, + { + "epoch": 4.18, + "grad_norm": 6.409154415130615, + "learning_rate": 6.531380303376894e-07, + "logits/chosen": -0.5487698316574097, + "logits/rejected": -0.6150451302528381, + "logps/chosen": -48.16847610473633, + "logps/rejected": -120.00016784667969, + "loss": 0.5969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1860268115997314, + "rewards/margins": 7.979328632354736, + "rewards/rejected": -4.793302059173584, + "step": 16697 + }, + { + "epoch": 4.18, + "grad_norm": 12.406195640563965, + "learning_rate": 6.527496823482193e-07, + "logits/chosen": -0.6249666213989258, + "logits/rejected": -0.6432375311851501, + "logps/chosen": -60.32145690917969, + "logps/rejected": -123.64907836914062, + "loss": 0.6695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.856020212173462, + "rewards/margins": 6.6464524269104, + "rewards/rejected": -3.7904324531555176, + "step": 16698 + }, + { + "epoch": 4.18, + "grad_norm": 32.08244705200195, + "learning_rate": 6.523614417839286e-07, + "logits/chosen": -0.5422009825706482, + "logits/rejected": -0.6286131143569946, + "logps/chosen": -58.954246520996094, + "logps/rejected": -92.89299011230469, + "loss": 0.8539, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6843209266662598, + "rewards/margins": 5.384613990783691, + "rewards/rejected": -2.7002930641174316, + "step": 16699 + }, + { + "epoch": 4.18, + "grad_norm": 4.215346336364746, + "learning_rate": 6.519733086544094e-07, + "logits/chosen": -0.5604186654090881, + "logits/rejected": -0.639495313167572, + "logps/chosen": -56.40243148803711, + "logps/rejected": -122.953857421875, + "loss": 0.6344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.886962652206421, + "rewards/margins": 7.431149005889893, + "rewards/rejected": -4.544186115264893, + "step": 16700 + }, + { + "epoch": 4.18, + "grad_norm": 3.2751049995422363, + "learning_rate": 6.515852829692565e-07, + "logits/chosen": -0.5530918836593628, + "logits/rejected": -0.6557434797286987, + "logps/chosen": -52.4862174987793, + "logps/rejected": -104.04204559326172, + "loss": 0.559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2487874031066895, + "rewards/margins": 7.4865217208862305, + "rewards/rejected": -4.237734317779541, + "step": 16701 + }, + { + "epoch": 4.18, + "grad_norm": 3.6932260990142822, + "learning_rate": 6.51197364738056e-07, + "logits/chosen": -0.4634625017642975, + "logits/rejected": -0.554201066493988, + "logps/chosen": -56.48403549194336, + "logps/rejected": -107.1360855102539, + "loss": 0.5738, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9255483150482178, + "rewards/margins": 7.104032039642334, + "rewards/rejected": -4.178483963012695, + "step": 16702 + }, + { + "epoch": 4.18, + "grad_norm": 2.99330735206604, + "learning_rate": 6.508095539703929e-07, + "logits/chosen": -0.5815165638923645, + "logits/rejected": -0.6794967651367188, + "logps/chosen": -51.18648910522461, + "logps/rejected": -118.15350341796875, + "loss": 0.5643, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.098964214324951, + "rewards/margins": 8.624902725219727, + "rewards/rejected": -5.525939464569092, + "step": 16703 + }, + { + "epoch": 4.18, + "grad_norm": 6.444582939147949, + "learning_rate": 6.504218506758531e-07, + "logits/chosen": -0.5180213451385498, + "logits/rejected": -0.5872117877006531, + "logps/chosen": -57.89404296875, + "logps/rejected": -106.25111389160156, + "loss": 0.6326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.007204532623291, + "rewards/margins": 6.903781890869141, + "rewards/rejected": -3.8965768814086914, + "step": 16704 + }, + { + "epoch": 4.18, + "grad_norm": 7.721399784088135, + "learning_rate": 6.500342548640142e-07, + "logits/chosen": -0.5618877410888672, + "logits/rejected": -0.6509438157081604, + "logps/chosen": -58.38532638549805, + "logps/rejected": -101.3707046508789, + "loss": 0.6589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.747262477874756, + "rewards/margins": 6.376238822937012, + "rewards/rejected": -3.628976821899414, + "step": 16705 + }, + { + "epoch": 4.18, + "grad_norm": 3.2608466148376465, + "learning_rate": 6.496467665444572e-07, + "logits/chosen": -0.4981665015220642, + "logits/rejected": -0.5274096727371216, + "logps/chosen": -49.72200012207031, + "logps/rejected": -117.03946685791016, + "loss": 0.5593, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2057809829711914, + "rewards/margins": 7.140064716339111, + "rewards/rejected": -3.934283494949341, + "step": 16706 + }, + { + "epoch": 4.18, + "grad_norm": 6.332653045654297, + "learning_rate": 6.49259385726756e-07, + "logits/chosen": -0.520263135433197, + "logits/rejected": -0.6082648634910583, + "logps/chosen": -53.285037994384766, + "logps/rejected": -111.18153381347656, + "loss": 0.6415, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2239134311676025, + "rewards/margins": 7.295956134796143, + "rewards/rejected": -4.072042465209961, + "step": 16707 + }, + { + "epoch": 4.18, + "grad_norm": 7.103244781494141, + "learning_rate": 6.488721124204817e-07, + "logits/chosen": -0.4822644591331482, + "logits/rejected": -0.551125168800354, + "logps/chosen": -53.29290771484375, + "logps/rejected": -94.7607650756836, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.12325119972229, + "rewards/margins": 7.041234493255615, + "rewards/rejected": -3.917983055114746, + "step": 16708 + }, + { + "epoch": 4.18, + "grad_norm": 2.966574192047119, + "learning_rate": 6.484849466352067e-07, + "logits/chosen": -0.5760331153869629, + "logits/rejected": -0.6290246248245239, + "logps/chosen": -58.005367279052734, + "logps/rejected": -103.43809509277344, + "loss": 0.6274, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3025243282318115, + "rewards/margins": 7.168820381164551, + "rewards/rejected": -3.86629581451416, + "step": 16709 + }, + { + "epoch": 4.18, + "grad_norm": 4.311488151550293, + "learning_rate": 6.480978883804972e-07, + "logits/chosen": -0.4992648959159851, + "logits/rejected": -0.5699691772460938, + "logps/chosen": -62.621856689453125, + "logps/rejected": -104.73450469970703, + "loss": 0.6811, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0670745372772217, + "rewards/margins": 6.7951788902282715, + "rewards/rejected": -3.728104591369629, + "step": 16710 + }, + { + "epoch": 4.18, + "grad_norm": 4.432705879211426, + "learning_rate": 6.477109376659169e-07, + "logits/chosen": -0.5939887166023254, + "logits/rejected": -0.6895372271537781, + "logps/chosen": -54.5567626953125, + "logps/rejected": -92.93463897705078, + "loss": 0.5814, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089188575744629, + "rewards/margins": 7.043508052825928, + "rewards/rejected": -3.954319477081299, + "step": 16711 + }, + { + "epoch": 4.18, + "grad_norm": 5.634825229644775, + "learning_rate": 6.473240945010284e-07, + "logits/chosen": -0.5137789845466614, + "logits/rejected": -0.5704509019851685, + "logps/chosen": -47.62981033325195, + "logps/rejected": -107.62609100341797, + "loss": 0.5495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.103142261505127, + "rewards/margins": 6.9812397956848145, + "rewards/rejected": -3.8780977725982666, + "step": 16712 + }, + { + "epoch": 4.18, + "grad_norm": 3.7344017028808594, + "learning_rate": 6.469373588953931e-07, + "logits/chosen": -0.6240226030349731, + "logits/rejected": -0.682785153388977, + "logps/chosen": -51.00756072998047, + "logps/rejected": -100.11786651611328, + "loss": 0.5857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3783063888549805, + "rewards/margins": 6.595834732055664, + "rewards/rejected": -3.2175278663635254, + "step": 16713 + }, + { + "epoch": 4.18, + "grad_norm": 5.327778339385986, + "learning_rate": 6.465507308585639e-07, + "logits/chosen": -0.5960496664047241, + "logits/rejected": -0.6530513167381287, + "logps/chosen": -44.28520202636719, + "logps/rejected": -106.9469223022461, + "loss": 0.5723, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3467636108398438, + "rewards/margins": 6.415092945098877, + "rewards/rejected": -3.068328857421875, + "step": 16714 + }, + { + "epoch": 4.18, + "grad_norm": 6.342721462249756, + "learning_rate": 6.46164210400097e-07, + "logits/chosen": -0.6066005825996399, + "logits/rejected": -0.7103980779647827, + "logps/chosen": -56.0248908996582, + "logps/rejected": -116.6080322265625, + "loss": 0.6679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1660337448120117, + "rewards/margins": 8.051584243774414, + "rewards/rejected": -4.885550498962402, + "step": 16715 + }, + { + "epoch": 4.18, + "grad_norm": 3.166541576385498, + "learning_rate": 6.457777975295448e-07, + "logits/chosen": -0.47897353768348694, + "logits/rejected": -0.5547211170196533, + "logps/chosen": -55.04205322265625, + "logps/rejected": -138.71209716796875, + "loss": 0.5092, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0089049339294434, + "rewards/margins": 9.404779434204102, + "rewards/rejected": -6.395875453948975, + "step": 16716 + }, + { + "epoch": 4.18, + "grad_norm": 4.8602614402771, + "learning_rate": 6.45391492256453e-07, + "logits/chosen": -0.534054160118103, + "logits/rejected": -0.5865070223808289, + "logps/chosen": -51.58124542236328, + "logps/rejected": -127.49384307861328, + "loss": 0.5896, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9254937171936035, + "rewards/margins": 7.77071475982666, + "rewards/rejected": -4.845221042633057, + "step": 16717 + }, + { + "epoch": 4.18, + "grad_norm": 2.8457424640655518, + "learning_rate": 6.450052945903689e-07, + "logits/chosen": -0.5011793971061707, + "logits/rejected": -0.6232421398162842, + "logps/chosen": -69.63134765625, + "logps/rejected": -106.26189422607422, + "loss": 0.6098, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904947280883789, + "rewards/margins": 7.085588455200195, + "rewards/rejected": -4.180641174316406, + "step": 16718 + }, + { + "epoch": 4.18, + "grad_norm": 3.867880344390869, + "learning_rate": 6.446192045408378e-07, + "logits/chosen": -0.588850736618042, + "logits/rejected": -0.6848195791244507, + "logps/chosen": -69.78047180175781, + "logps/rejected": -96.92056274414062, + "loss": 0.6517, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1593427658081055, + "rewards/margins": 6.4220733642578125, + "rewards/rejected": -3.262730598449707, + "step": 16719 + }, + { + "epoch": 4.18, + "grad_norm": 3.9337568283081055, + "learning_rate": 6.442332221173981e-07, + "logits/chosen": -0.46994516253471375, + "logits/rejected": -0.5349660515785217, + "logps/chosen": -55.243675231933594, + "logps/rejected": -123.23542785644531, + "loss": 0.5959, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1082701683044434, + "rewards/margins": 7.875988006591797, + "rewards/rejected": -4.767717361450195, + "step": 16720 + }, + { + "epoch": 4.18, + "grad_norm": 2.8129653930664062, + "learning_rate": 6.438473473295875e-07, + "logits/chosen": -0.613135039806366, + "logits/rejected": -0.6363705396652222, + "logps/chosen": -83.56102752685547, + "logps/rejected": -106.80640411376953, + "loss": 0.5665, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.346302032470703, + "rewards/margins": 6.966740131378174, + "rewards/rejected": -3.6204373836517334, + "step": 16721 + }, + { + "epoch": 4.18, + "grad_norm": 15.369610786437988, + "learning_rate": 6.434615801869437e-07, + "logits/chosen": -0.5507397055625916, + "logits/rejected": -0.6212773323059082, + "logps/chosen": -63.47208786010742, + "logps/rejected": -115.854736328125, + "loss": 0.6885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6723477840423584, + "rewards/margins": 7.3888959884643555, + "rewards/rejected": -4.716548442840576, + "step": 16722 + }, + { + "epoch": 4.18, + "grad_norm": 3.640615940093994, + "learning_rate": 6.430759206989967e-07, + "logits/chosen": -0.6242414712905884, + "logits/rejected": -0.6388744115829468, + "logps/chosen": -52.73474884033203, + "logps/rejected": -113.8718032836914, + "loss": 0.6375, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0219550132751465, + "rewards/margins": 6.742110729217529, + "rewards/rejected": -3.720155954360962, + "step": 16723 + }, + { + "epoch": 4.18, + "grad_norm": 3.196471691131592, + "learning_rate": 6.426903688752795e-07, + "logits/chosen": -0.5376446843147278, + "logits/rejected": -0.6357702016830444, + "logps/chosen": -51.42615509033203, + "logps/rejected": -111.10135650634766, + "loss": 0.5833, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.28518009185791, + "rewards/margins": 7.54502534866333, + "rewards/rejected": -4.25984525680542, + "step": 16724 + }, + { + "epoch": 4.18, + "grad_norm": 3.638162851333618, + "learning_rate": 6.423049247253176e-07, + "logits/chosen": -0.47300082445144653, + "logits/rejected": -0.5394276976585388, + "logps/chosen": -51.35608673095703, + "logps/rejected": -110.47086334228516, + "loss": 0.5606, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0484421253204346, + "rewards/margins": 7.307032585144043, + "rewards/rejected": -4.258591175079346, + "step": 16725 + }, + { + "epoch": 4.18, + "grad_norm": 4.7046942710876465, + "learning_rate": 6.419195882586349e-07, + "logits/chosen": -0.4159913659095764, + "logits/rejected": -0.5570735931396484, + "logps/chosen": -81.80565643310547, + "logps/rejected": -117.8979721069336, + "loss": 0.7098, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6958649158477783, + "rewards/margins": 7.259084701538086, + "rewards/rejected": -4.5632195472717285, + "step": 16726 + }, + { + "epoch": 4.18, + "grad_norm": 7.483184814453125, + "learning_rate": 6.415343594847556e-07, + "logits/chosen": -0.5783727765083313, + "logits/rejected": -0.6283436417579651, + "logps/chosen": -56.552921295166016, + "logps/rejected": -104.61502838134766, + "loss": 0.7345, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.317274808883667, + "rewards/margins": 7.482524871826172, + "rewards/rejected": -4.165250778198242, + "step": 16727 + }, + { + "epoch": 4.18, + "grad_norm": 2.1840453147888184, + "learning_rate": 6.411492384131984e-07, + "logits/chosen": -0.4807426929473877, + "logits/rejected": -0.5933980941772461, + "logps/chosen": -61.89284896850586, + "logps/rejected": -115.11935424804688, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026857376098633, + "rewards/margins": 7.828978538513184, + "rewards/rejected": -4.802121162414551, + "step": 16728 + }, + { + "epoch": 4.18, + "grad_norm": 3.929142713546753, + "learning_rate": 6.40764225053479e-07, + "logits/chosen": -0.46897536516189575, + "logits/rejected": -0.5155589580535889, + "logps/chosen": -52.87760925292969, + "logps/rejected": -117.85157012939453, + "loss": 0.5678, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9262523651123047, + "rewards/margins": 7.369943141937256, + "rewards/rejected": -4.443690299987793, + "step": 16729 + }, + { + "epoch": 4.19, + "grad_norm": 3.9868855476379395, + "learning_rate": 6.40379319415112e-07, + "logits/chosen": -0.7067868113517761, + "logits/rejected": -0.8045310974121094, + "logps/chosen": -49.91669845581055, + "logps/rejected": -96.1256103515625, + "loss": 0.644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.035792112350464, + "rewards/margins": 6.503622055053711, + "rewards/rejected": -3.4678304195404053, + "step": 16730 + }, + { + "epoch": 4.19, + "grad_norm": 5.244261264801025, + "learning_rate": 6.399945215076097e-07, + "logits/chosen": -0.5145934820175171, + "logits/rejected": -0.6103088855743408, + "logps/chosen": -66.24656677246094, + "logps/rejected": -96.38710021972656, + "loss": 0.7081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2885372638702393, + "rewards/margins": 6.579207897186279, + "rewards/rejected": -3.290670394897461, + "step": 16731 + }, + { + "epoch": 4.19, + "grad_norm": 5.621415138244629, + "learning_rate": 6.396098313404808e-07, + "logits/chosen": -0.5674950480461121, + "logits/rejected": -0.6469065546989441, + "logps/chosen": -60.11725616455078, + "logps/rejected": -99.09168243408203, + "loss": 0.6363, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0560755729675293, + "rewards/margins": 6.991313457489014, + "rewards/rejected": -3.935237169265747, + "step": 16732 + }, + { + "epoch": 4.19, + "grad_norm": 2.565725088119507, + "learning_rate": 6.392252489232298e-07, + "logits/chosen": -0.4648594856262207, + "logits/rejected": -0.5409327149391174, + "logps/chosen": -56.109718322753906, + "logps/rejected": -118.70170593261719, + "loss": 0.5549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3747894763946533, + "rewards/margins": 8.196627616882324, + "rewards/rejected": -4.821837902069092, + "step": 16733 + }, + { + "epoch": 4.19, + "grad_norm": 3.9657375812530518, + "learning_rate": 6.388407742653624e-07, + "logits/chosen": -0.5540604591369629, + "logits/rejected": -0.6670429110527039, + "logps/chosen": -53.267974853515625, + "logps/rejected": -98.40548706054688, + "loss": 0.5797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2935526371002197, + "rewards/margins": 7.791154861450195, + "rewards/rejected": -4.497602462768555, + "step": 16734 + }, + { + "epoch": 4.19, + "grad_norm": 6.188791275024414, + "learning_rate": 6.384564073763783e-07, + "logits/chosen": -0.5376938581466675, + "logits/rejected": -0.5961433053016663, + "logps/chosen": -65.0533447265625, + "logps/rejected": -100.57099151611328, + "loss": 0.717, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0488791465759277, + "rewards/margins": 5.6478590965271, + "rewards/rejected": -2.598979949951172, + "step": 16735 + }, + { + "epoch": 4.19, + "grad_norm": 5.711155891418457, + "learning_rate": 6.380721482657737e-07, + "logits/chosen": -0.5340995192527771, + "logits/rejected": -0.660973072052002, + "logps/chosen": -54.421844482421875, + "logps/rejected": -85.18937683105469, + "loss": 0.6153, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9179763793945312, + "rewards/margins": 6.938685417175293, + "rewards/rejected": -4.020709037780762, + "step": 16736 + }, + { + "epoch": 4.19, + "grad_norm": 2.2384443283081055, + "learning_rate": 6.376879969430478e-07, + "logits/chosen": -0.5176393389701843, + "logits/rejected": -0.5926903486251831, + "logps/chosen": -61.47968673706055, + "logps/rejected": -146.1683807373047, + "loss": 0.5757, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.58632230758667, + "rewards/margins": 9.728984832763672, + "rewards/rejected": -6.142662048339844, + "step": 16737 + }, + { + "epoch": 4.19, + "grad_norm": 4.813199520111084, + "learning_rate": 6.373039534176912e-07, + "logits/chosen": -0.5914331674575806, + "logits/rejected": -0.7029256820678711, + "logps/chosen": -58.70006561279297, + "logps/rejected": -90.51931762695312, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182926654815674, + "rewards/margins": 6.886979103088379, + "rewards/rejected": -3.704052209854126, + "step": 16738 + }, + { + "epoch": 4.19, + "grad_norm": 3.3801867961883545, + "learning_rate": 6.36920017699193e-07, + "logits/chosen": -0.6187751293182373, + "logits/rejected": -0.7073673009872437, + "logps/chosen": -53.19914627075195, + "logps/rejected": -99.0520248413086, + "loss": 0.6559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1561713218688965, + "rewards/margins": 7.491591930389404, + "rewards/rejected": -4.335421085357666, + "step": 16739 + }, + { + "epoch": 4.19, + "grad_norm": 4.949608325958252, + "learning_rate": 6.365361897970418e-07, + "logits/chosen": -0.542711615562439, + "logits/rejected": -0.5992769002914429, + "logps/chosen": -69.01876068115234, + "logps/rejected": -116.02351379394531, + "loss": 0.6436, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2089786529541016, + "rewards/margins": 7.950079917907715, + "rewards/rejected": -4.741101264953613, + "step": 16740 + }, + { + "epoch": 4.19, + "grad_norm": 23.073158264160156, + "learning_rate": 6.36152469720725e-07, + "logits/chosen": -0.48152250051498413, + "logits/rejected": -0.5348447561264038, + "logps/chosen": -50.1036262512207, + "logps/rejected": -95.01173400878906, + "loss": 0.6053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9692678451538086, + "rewards/margins": 5.769449710845947, + "rewards/rejected": -2.8001816272735596, + "step": 16741 + }, + { + "epoch": 4.19, + "grad_norm": 6.985607147216797, + "learning_rate": 6.3576885747972e-07, + "logits/chosen": -0.522612452507019, + "logits/rejected": -0.6472191214561462, + "logps/chosen": -76.75337219238281, + "logps/rejected": -105.22228240966797, + "loss": 0.7732, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.244257688522339, + "rewards/margins": 6.071055889129639, + "rewards/rejected": -2.8267979621887207, + "step": 16742 + }, + { + "epoch": 4.19, + "grad_norm": 2.945505142211914, + "learning_rate": 6.353853530835086e-07, + "logits/chosen": -0.5269952416419983, + "logits/rejected": -0.6110358238220215, + "logps/chosen": -49.36387634277344, + "logps/rejected": -108.60479736328125, + "loss": 0.4939, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3755276203155518, + "rewards/margins": 8.052093505859375, + "rewards/rejected": -4.676565647125244, + "step": 16743 + }, + { + "epoch": 4.19, + "grad_norm": 2.766085147857666, + "learning_rate": 6.350019565415688e-07, + "logits/chosen": -0.5418421626091003, + "logits/rejected": -0.6045772433280945, + "logps/chosen": -53.428470611572266, + "logps/rejected": -106.64945220947266, + "loss": 0.5773, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9451873302459717, + "rewards/margins": 7.635398864746094, + "rewards/rejected": -4.690211296081543, + "step": 16744 + }, + { + "epoch": 4.19, + "grad_norm": 4.747675895690918, + "learning_rate": 6.346186678633731e-07, + "logits/chosen": -0.5322839021682739, + "logits/rejected": -0.6176046133041382, + "logps/chosen": -44.306949615478516, + "logps/rejected": -103.66368103027344, + "loss": 0.5533, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2780158519744873, + "rewards/margins": 7.258580684661865, + "rewards/rejected": -3.980564832687378, + "step": 16745 + }, + { + "epoch": 4.19, + "grad_norm": 5.723739147186279, + "learning_rate": 6.342354870583922e-07, + "logits/chosen": -0.5570191740989685, + "logits/rejected": -0.6317530274391174, + "logps/chosen": -52.31056594848633, + "logps/rejected": -98.9829330444336, + "loss": 0.6796, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.681734561920166, + "rewards/margins": 7.382719039916992, + "rewards/rejected": -4.700984001159668, + "step": 16746 + }, + { + "epoch": 4.19, + "grad_norm": 3.5735859870910645, + "learning_rate": 6.338524141360974e-07, + "logits/chosen": -0.6070443391799927, + "logits/rejected": -0.6754865050315857, + "logps/chosen": -49.76902389526367, + "logps/rejected": -94.90497589111328, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1480813026428223, + "rewards/margins": 6.593971252441406, + "rewards/rejected": -3.445890426635742, + "step": 16747 + }, + { + "epoch": 4.19, + "grad_norm": 9.25268840789795, + "learning_rate": 6.334694491059529e-07, + "logits/chosen": -0.5646277070045471, + "logits/rejected": -0.6111394762992859, + "logps/chosen": -48.939361572265625, + "logps/rejected": -111.36367797851562, + "loss": 0.6203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.692265272140503, + "rewards/margins": 7.309171199798584, + "rewards/rejected": -3.616905689239502, + "step": 16748 + }, + { + "epoch": 4.19, + "grad_norm": 5.719738483428955, + "learning_rate": 6.330865919774232e-07, + "logits/chosen": -0.5817157626152039, + "logits/rejected": -0.6155604124069214, + "logps/chosen": -54.195457458496094, + "logps/rejected": -109.78485107421875, + "loss": 0.6496, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.961035966873169, + "rewards/margins": 5.996147155761719, + "rewards/rejected": -3.03511118888855, + "step": 16749 + }, + { + "epoch": 4.19, + "grad_norm": 8.031352043151855, + "learning_rate": 6.327038427599685e-07, + "logits/chosen": -0.5446348190307617, + "logits/rejected": -0.6420871615409851, + "logps/chosen": -54.40195083618164, + "logps/rejected": -104.7187271118164, + "loss": 0.6309, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.061110258102417, + "rewards/margins": 6.666748046875, + "rewards/rejected": -3.605637788772583, + "step": 16750 + }, + { + "epoch": 4.19, + "grad_norm": 6.683813571929932, + "learning_rate": 6.323212014630464e-07, + "logits/chosen": -0.5273921489715576, + "logits/rejected": -0.6261028051376343, + "logps/chosen": -61.85696029663086, + "logps/rejected": -95.01121520996094, + "loss": 0.6807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3138599395751953, + "rewards/margins": 6.598299980163574, + "rewards/rejected": -3.2844393253326416, + "step": 16751 + }, + { + "epoch": 4.19, + "grad_norm": 3.4115495681762695, + "learning_rate": 6.319386680961142e-07, + "logits/chosen": -0.5683202147483826, + "logits/rejected": -0.5893057584762573, + "logps/chosen": -45.83846664428711, + "logps/rejected": -129.995849609375, + "loss": 0.4953, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4904489517211914, + "rewards/margins": 7.933099746704102, + "rewards/rejected": -4.44265079498291, + "step": 16752 + }, + { + "epoch": 4.19, + "grad_norm": 3.9528591632843018, + "learning_rate": 6.315562426686233e-07, + "logits/chosen": -0.5427595973014832, + "logits/rejected": -0.6568845510482788, + "logps/chosen": -62.275123596191406, + "logps/rejected": -110.12705993652344, + "loss": 0.5677, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.047703266143799, + "rewards/margins": 7.292097568511963, + "rewards/rejected": -4.244394302368164, + "step": 16753 + }, + { + "epoch": 4.19, + "grad_norm": 5.429779529571533, + "learning_rate": 6.311739251900234e-07, + "logits/chosen": -0.515146791934967, + "logits/rejected": -0.5924318432807922, + "logps/chosen": -47.358062744140625, + "logps/rejected": -96.99956512451172, + "loss": 0.6522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.109504222869873, + "rewards/margins": 7.037655353546143, + "rewards/rejected": -3.9281506538391113, + "step": 16754 + }, + { + "epoch": 4.19, + "grad_norm": 7.626863956451416, + "learning_rate": 6.307917156697624e-07, + "logits/chosen": -0.62308669090271, + "logits/rejected": -0.7289418578147888, + "logps/chosen": -50.32197952270508, + "logps/rejected": -99.76512145996094, + "loss": 0.574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.010532855987549, + "rewards/margins": 7.535465240478516, + "rewards/rejected": -4.524931907653809, + "step": 16755 + }, + { + "epoch": 4.19, + "grad_norm": 8.194368362426758, + "learning_rate": 6.304096141172878e-07, + "logits/chosen": -0.5269872546195984, + "logits/rejected": -0.6571007966995239, + "logps/chosen": -54.402198791503906, + "logps/rejected": -102.73681640625, + "loss": 0.5993, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9660751819610596, + "rewards/margins": 6.777309894561768, + "rewards/rejected": -3.811234951019287, + "step": 16756 + }, + { + "epoch": 4.19, + "grad_norm": 7.228208541870117, + "learning_rate": 6.30027620542037e-07, + "logits/chosen": -0.5895721912384033, + "logits/rejected": -0.6163796782493591, + "logps/chosen": -47.377891540527344, + "logps/rejected": -123.78911590576172, + "loss": 0.5413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0239145755767822, + "rewards/margins": 7.960629463195801, + "rewards/rejected": -4.936714172363281, + "step": 16757 + }, + { + "epoch": 4.19, + "grad_norm": 5.286334991455078, + "learning_rate": 6.296457349534518e-07, + "logits/chosen": -0.5860726833343506, + "logits/rejected": -0.6530323624610901, + "logps/chosen": -52.548709869384766, + "logps/rejected": -103.29246520996094, + "loss": 0.6288, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9023327827453613, + "rewards/margins": 6.004499435424805, + "rewards/rejected": -3.1021666526794434, + "step": 16758 + }, + { + "epoch": 4.19, + "grad_norm": 4.387292861938477, + "learning_rate": 6.29263957360971e-07, + "logits/chosen": -0.5946992635726929, + "logits/rejected": -0.6808607578277588, + "logps/chosen": -44.91884231567383, + "logps/rejected": -135.6810760498047, + "loss": 0.5647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3976449966430664, + "rewards/margins": 8.942005157470703, + "rewards/rejected": -5.544360637664795, + "step": 16759 + }, + { + "epoch": 4.19, + "grad_norm": 6.015694618225098, + "learning_rate": 6.288822877740247e-07, + "logits/chosen": -0.5979546308517456, + "logits/rejected": -0.6636387705802917, + "logps/chosen": -63.42009735107422, + "logps/rejected": -98.87306213378906, + "loss": 0.6889, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098745107650757, + "rewards/margins": 6.364619255065918, + "rewards/rejected": -3.265873908996582, + "step": 16760 + }, + { + "epoch": 4.19, + "grad_norm": 4.362228870391846, + "learning_rate": 6.285007262020465e-07, + "logits/chosen": -0.5398253798484802, + "logits/rejected": -0.6194165945053101, + "logps/chosen": -56.83582305908203, + "logps/rejected": -105.66043090820312, + "loss": 0.6861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.231839895248413, + "rewards/margins": 7.102374076843262, + "rewards/rejected": -3.8705334663391113, + "step": 16761 + }, + { + "epoch": 4.19, + "grad_norm": 2.924995183944702, + "learning_rate": 6.281192726544666e-07, + "logits/chosen": -0.5098192095756531, + "logits/rejected": -0.6241837739944458, + "logps/chosen": -56.99977493286133, + "logps/rejected": -116.25914001464844, + "loss": 0.5608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0611326694488525, + "rewards/margins": 8.421303749084473, + "rewards/rejected": -5.360171318054199, + "step": 16762 + }, + { + "epoch": 4.19, + "grad_norm": 5.1805853843688965, + "learning_rate": 6.277379271407091e-07, + "logits/chosen": -0.5852974057197571, + "logits/rejected": -0.5988575220108032, + "logps/chosen": -41.366127014160156, + "logps/rejected": -109.95875549316406, + "loss": 0.5499, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3035664558410645, + "rewards/margins": 7.250397682189941, + "rewards/rejected": -3.946831703186035, + "step": 16763 + }, + { + "epoch": 4.19, + "grad_norm": 2.2125253677368164, + "learning_rate": 6.273566896701965e-07, + "logits/chosen": -0.5919175744056702, + "logits/rejected": -0.6612616777420044, + "logps/chosen": -46.76354217529297, + "logps/rejected": -95.22879791259766, + "loss": 0.5327, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4407663345336914, + "rewards/margins": 7.842050552368164, + "rewards/rejected": -4.401284217834473, + "step": 16764 + }, + { + "epoch": 4.19, + "grad_norm": 7.222161293029785, + "learning_rate": 6.269755602523531e-07, + "logits/chosen": -0.4786125421524048, + "logits/rejected": -0.5998748540878296, + "logps/chosen": -62.646156311035156, + "logps/rejected": -98.04634857177734, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3136935234069824, + "rewards/margins": 8.03506088256836, + "rewards/rejected": -4.721367359161377, + "step": 16765 + }, + { + "epoch": 4.19, + "grad_norm": 5.424219131469727, + "learning_rate": 6.265945388965944e-07, + "logits/chosen": -0.5441679954528809, + "logits/rejected": -0.5885931253433228, + "logps/chosen": -51.98174285888672, + "logps/rejected": -109.86762237548828, + "loss": 0.5582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3791606426239014, + "rewards/margins": 7.4261603355407715, + "rewards/rejected": -4.046999454498291, + "step": 16766 + }, + { + "epoch": 4.19, + "grad_norm": 2.555758476257324, + "learning_rate": 6.262136256123353e-07, + "logits/chosen": -0.5430875420570374, + "logits/rejected": -0.6028904318809509, + "logps/chosen": -51.71018600463867, + "logps/rejected": -113.45569610595703, + "loss": 0.5749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3488049507141113, + "rewards/margins": 7.2631120681762695, + "rewards/rejected": -3.9143075942993164, + "step": 16767 + }, + { + "epoch": 4.19, + "grad_norm": 5.308467864990234, + "learning_rate": 6.258328204089909e-07, + "logits/chosen": -0.5394930243492126, + "logits/rejected": -0.6131808757781982, + "logps/chosen": -52.48247528076172, + "logps/rejected": -95.65947723388672, + "loss": 0.6757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.825336456298828, + "rewards/margins": 6.920414447784424, + "rewards/rejected": -4.095077991485596, + "step": 16768 + }, + { + "epoch": 4.19, + "grad_norm": 3.980668783187866, + "learning_rate": 6.254521232959687e-07, + "logits/chosen": -0.5552003383636475, + "logits/rejected": -0.6040903925895691, + "logps/chosen": -53.34027862548828, + "logps/rejected": -99.39080810546875, + "loss": 0.6215, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0754122734069824, + "rewards/margins": 6.661890983581543, + "rewards/rejected": -3.5864789485931396, + "step": 16769 + }, + { + "epoch": 4.2, + "grad_norm": 22.176225662231445, + "learning_rate": 6.250715342826791e-07, + "logits/chosen": -0.4960339665412903, + "logits/rejected": -0.6208624243736267, + "logps/chosen": -58.139244079589844, + "logps/rejected": -93.36210632324219, + "loss": 0.6544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074122190475464, + "rewards/margins": 6.615960121154785, + "rewards/rejected": -3.5418381690979004, + "step": 16770 + }, + { + "epoch": 4.2, + "grad_norm": 5.156944751739502, + "learning_rate": 6.24691053378525e-07, + "logits/chosen": -0.5278801918029785, + "logits/rejected": -0.6108157634735107, + "logps/chosen": -54.236515045166016, + "logps/rejected": -107.92735290527344, + "loss": 0.5968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1456570625305176, + "rewards/margins": 7.0998854637146, + "rewards/rejected": -3.954228162765503, + "step": 16771 + }, + { + "epoch": 4.2, + "grad_norm": 2.8486244678497314, + "learning_rate": 6.243106805929073e-07, + "logits/chosen": -0.5725119709968567, + "logits/rejected": -0.6490199565887451, + "logps/chosen": -49.12422180175781, + "logps/rejected": -105.57518768310547, + "loss": 0.5796, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1572647094726562, + "rewards/margins": 7.227937698364258, + "rewards/rejected": -4.070672988891602, + "step": 16772 + }, + { + "epoch": 4.2, + "grad_norm": 10.017953872680664, + "learning_rate": 6.239304159352272e-07, + "logits/chosen": -0.5252742171287537, + "logits/rejected": -0.6004130840301514, + "logps/chosen": -59.065738677978516, + "logps/rejected": -107.42962646484375, + "loss": 0.638, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2802321910858154, + "rewards/margins": 6.743973731994629, + "rewards/rejected": -3.4637417793273926, + "step": 16773 + }, + { + "epoch": 4.2, + "grad_norm": 2.2925758361816406, + "learning_rate": 6.235502594148824e-07, + "logits/chosen": -0.552664041519165, + "logits/rejected": -0.6234540343284607, + "logps/chosen": -55.12796401977539, + "logps/rejected": -96.94984436035156, + "loss": 0.5614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2032082080841064, + "rewards/margins": 7.6620025634765625, + "rewards/rejected": -4.458794116973877, + "step": 16774 + }, + { + "epoch": 4.2, + "grad_norm": 2.321768283843994, + "learning_rate": 6.231702110412657e-07, + "logits/chosen": -0.5441189408302307, + "logits/rejected": -0.6258631348609924, + "logps/chosen": -55.25446701049805, + "logps/rejected": -103.2027816772461, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0256333351135254, + "rewards/margins": 7.399597644805908, + "rewards/rejected": -4.373964309692383, + "step": 16775 + }, + { + "epoch": 4.2, + "grad_norm": 4.441594123840332, + "learning_rate": 6.227902708237671e-07, + "logits/chosen": -0.529285728931427, + "logits/rejected": -0.5774881839752197, + "logps/chosen": -57.98731231689453, + "logps/rejected": -121.75393676757812, + "loss": 0.6174, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.925457715988159, + "rewards/margins": 7.388502597808838, + "rewards/rejected": -4.463045120239258, + "step": 16776 + }, + { + "epoch": 4.2, + "grad_norm": 6.320392608642578, + "learning_rate": 6.224104387717778e-07, + "logits/chosen": -0.5947738289833069, + "logits/rejected": -0.6531873941421509, + "logps/chosen": -51.40298843383789, + "logps/rejected": -100.08307647705078, + "loss": 0.6284, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0407354831695557, + "rewards/margins": 6.693782806396484, + "rewards/rejected": -3.653047561645508, + "step": 16777 + }, + { + "epoch": 4.2, + "grad_norm": 23.681819915771484, + "learning_rate": 6.220307148946831e-07, + "logits/chosen": -0.5407484769821167, + "logits/rejected": -0.5696821212768555, + "logps/chosen": -47.455413818359375, + "logps/rejected": -106.74019622802734, + "loss": 0.6384, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2451372146606445, + "rewards/margins": 5.895929336547852, + "rewards/rejected": -2.650791883468628, + "step": 16778 + }, + { + "epoch": 4.2, + "grad_norm": 6.343268871307373, + "learning_rate": 6.216510992018649e-07, + "logits/chosen": -0.4919731020927429, + "logits/rejected": -0.6012269854545593, + "logps/chosen": -64.11616516113281, + "logps/rejected": -108.6508560180664, + "loss": 0.657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.883021354675293, + "rewards/margins": 7.408540725708008, + "rewards/rejected": -4.525519847869873, + "step": 16779 + }, + { + "epoch": 4.2, + "grad_norm": 4.740669250488281, + "learning_rate": 6.212715917027057e-07, + "logits/chosen": -0.5374452471733093, + "logits/rejected": -0.6595134139060974, + "logps/chosen": -56.973140716552734, + "logps/rejected": -98.3225326538086, + "loss": 0.5961, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.090121030807495, + "rewards/margins": 7.581140995025635, + "rewards/rejected": -4.491019248962402, + "step": 16780 + }, + { + "epoch": 4.2, + "grad_norm": 5.776226043701172, + "learning_rate": 6.20892192406583e-07, + "logits/chosen": -0.5919468998908997, + "logits/rejected": -0.6682634949684143, + "logps/chosen": -55.57215881347656, + "logps/rejected": -105.68488311767578, + "loss": 0.6851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9744811058044434, + "rewards/margins": 6.8058013916015625, + "rewards/rejected": -3.831320285797119, + "step": 16781 + }, + { + "epoch": 4.2, + "grad_norm": 8.936054229736328, + "learning_rate": 6.205129013228716e-07, + "logits/chosen": -0.5533732175827026, + "logits/rejected": -0.5923691391944885, + "logps/chosen": -48.64110565185547, + "logps/rejected": -112.93416595458984, + "loss": 0.5997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3164525032043457, + "rewards/margins": 7.92825174331665, + "rewards/rejected": -4.611799716949463, + "step": 16782 + }, + { + "epoch": 4.2, + "grad_norm": 8.778157234191895, + "learning_rate": 6.201337184609452e-07, + "logits/chosen": -0.5158321261405945, + "logits/rejected": -0.5621378421783447, + "logps/chosen": -61.13441467285156, + "logps/rejected": -106.63664245605469, + "loss": 0.6292, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.423635959625244, + "rewards/margins": 7.269433975219727, + "rewards/rejected": -3.845797300338745, + "step": 16783 + }, + { + "epoch": 4.2, + "grad_norm": 5.039434432983398, + "learning_rate": 6.197546438301733e-07, + "logits/chosen": -0.5264066457748413, + "logits/rejected": -0.5988094806671143, + "logps/chosen": -63.786468505859375, + "logps/rejected": -106.35213470458984, + "loss": 0.6884, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.364478588104248, + "rewards/margins": 6.223343372344971, + "rewards/rejected": -2.8588643074035645, + "step": 16784 + }, + { + "epoch": 4.2, + "grad_norm": 2.770414352416992, + "learning_rate": 6.193756774399224e-07, + "logits/chosen": -0.5126963257789612, + "logits/rejected": -0.6126265525817871, + "logps/chosen": -62.14250946044922, + "logps/rejected": -119.31377410888672, + "loss": 0.5563, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164559841156006, + "rewards/margins": 7.784191131591797, + "rewards/rejected": -4.619631290435791, + "step": 16785 + }, + { + "epoch": 4.2, + "grad_norm": 2.346083879470825, + "learning_rate": 6.189968192995577e-07, + "logits/chosen": -0.5799977779388428, + "logits/rejected": -0.6311884522438049, + "logps/chosen": -46.892967224121094, + "logps/rejected": -120.48847198486328, + "loss": 0.5081, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1246280670166016, + "rewards/margins": 8.633159637451172, + "rewards/rejected": -5.50853157043457, + "step": 16786 + }, + { + "epoch": 4.2, + "grad_norm": 6.108994483947754, + "learning_rate": 6.186180694184424e-07, + "logits/chosen": -0.5816890001296997, + "logits/rejected": -0.6487421989440918, + "logps/chosen": -51.53606414794922, + "logps/rejected": -108.24332427978516, + "loss": 0.6723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.208894729614258, + "rewards/margins": 7.737679481506348, + "rewards/rejected": -4.52878475189209, + "step": 16787 + }, + { + "epoch": 4.2, + "grad_norm": 4.947154521942139, + "learning_rate": 6.182394278059351e-07, + "logits/chosen": -0.5107436180114746, + "logits/rejected": -0.6179124116897583, + "logps/chosen": -55.36514663696289, + "logps/rejected": -98.98347473144531, + "loss": 0.6231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9685208797454834, + "rewards/margins": 7.088493347167969, + "rewards/rejected": -4.119972229003906, + "step": 16788 + }, + { + "epoch": 4.2, + "grad_norm": 3.5089869499206543, + "learning_rate": 6.17860894471391e-07, + "logits/chosen": -0.5544817447662354, + "logits/rejected": -0.6081922650337219, + "logps/chosen": -51.57984161376953, + "logps/rejected": -113.14591217041016, + "loss": 0.6044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1894664764404297, + "rewards/margins": 7.649953365325928, + "rewards/rejected": -4.460485935211182, + "step": 16789 + }, + { + "epoch": 4.2, + "grad_norm": 1.828602910041809, + "learning_rate": 6.17482469424166e-07, + "logits/chosen": -0.594893217086792, + "logits/rejected": -0.7238936424255371, + "logps/chosen": -64.7119140625, + "logps/rejected": -111.44891357421875, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9911794662475586, + "rewards/margins": 9.45789909362793, + "rewards/rejected": -6.466719627380371, + "step": 16790 + }, + { + "epoch": 4.2, + "grad_norm": 9.698287010192871, + "learning_rate": 6.17104152673611e-07, + "logits/chosen": -0.5514487028121948, + "logits/rejected": -0.6295045614242554, + "logps/chosen": -63.33021926879883, + "logps/rejected": -91.37928009033203, + "loss": 0.8129, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.233290910720825, + "rewards/margins": 5.764316082000732, + "rewards/rejected": -2.531024694442749, + "step": 16791 + }, + { + "epoch": 4.2, + "grad_norm": 6.801873207092285, + "learning_rate": 6.167259442290729e-07, + "logits/chosen": -0.5845808386802673, + "logits/rejected": -0.6994988322257996, + "logps/chosen": -60.40882873535156, + "logps/rejected": -116.17642974853516, + "loss": 0.5881, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1252405643463135, + "rewards/margins": 7.965842247009277, + "rewards/rejected": -4.840601921081543, + "step": 16792 + }, + { + "epoch": 4.2, + "grad_norm": 2.0332260131835938, + "learning_rate": 6.163478440999005e-07, + "logits/chosen": -0.5802639722824097, + "logits/rejected": -0.7123437523841858, + "logps/chosen": -54.255287170410156, + "logps/rejected": -118.14362335205078, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077519416809082, + "rewards/margins": 9.520106315612793, + "rewards/rejected": -6.442587375640869, + "step": 16793 + }, + { + "epoch": 4.2, + "grad_norm": 4.6960859298706055, + "learning_rate": 6.159698522954344e-07, + "logits/chosen": -0.5222557783126831, + "logits/rejected": -0.5675776600837708, + "logps/chosen": -54.15314483642578, + "logps/rejected": -137.68389892578125, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9284634590148926, + "rewards/margins": 8.622030258178711, + "rewards/rejected": -5.693565845489502, + "step": 16794 + }, + { + "epoch": 4.2, + "grad_norm": 3.448986053466797, + "learning_rate": 6.155919688250167e-07, + "logits/chosen": -0.5974629521369934, + "logits/rejected": -0.686083197593689, + "logps/chosen": -61.21638488769531, + "logps/rejected": -101.7110824584961, + "loss": 0.6125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.08211088180542, + "rewards/margins": 7.1824517250061035, + "rewards/rejected": -4.100340843200684, + "step": 16795 + }, + { + "epoch": 4.2, + "grad_norm": 4.393859386444092, + "learning_rate": 6.152141936979855e-07, + "logits/chosen": -0.5838736891746521, + "logits/rejected": -0.6301559209823608, + "logps/chosen": -56.346160888671875, + "logps/rejected": -123.62116241455078, + "loss": 0.5877, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172546625137329, + "rewards/margins": 7.77446174621582, + "rewards/rejected": -4.60191535949707, + "step": 16796 + }, + { + "epoch": 4.2, + "grad_norm": 4.104441165924072, + "learning_rate": 6.148365269236745e-07, + "logits/chosen": -0.6397057771682739, + "logits/rejected": -0.7013561725616455, + "logps/chosen": -39.12384033203125, + "logps/rejected": -112.71600341796875, + "loss": 0.4782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0889205932617188, + "rewards/margins": 7.356429576873779, + "rewards/rejected": -4.2675089836120605, + "step": 16797 + }, + { + "epoch": 4.2, + "grad_norm": 2.8217101097106934, + "learning_rate": 6.144589685114171e-07, + "logits/chosen": -0.5273118615150452, + "logits/rejected": -0.5953318476676941, + "logps/chosen": -50.10421371459961, + "logps/rejected": -107.79055786132812, + "loss": 0.6171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.515756130218506, + "rewards/margins": 7.044033050537109, + "rewards/rejected": -3.528277635574341, + "step": 16798 + }, + { + "epoch": 4.2, + "grad_norm": 4.105891704559326, + "learning_rate": 6.140815184705451e-07, + "logits/chosen": -0.554452657699585, + "logits/rejected": -0.656906247138977, + "logps/chosen": -56.98100662231445, + "logps/rejected": -94.56806945800781, + "loss": 0.6379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.656850576400757, + "rewards/margins": 7.092093467712402, + "rewards/rejected": -3.4352431297302246, + "step": 16799 + }, + { + "epoch": 4.2, + "grad_norm": 9.084123611450195, + "learning_rate": 6.137041768103819e-07, + "logits/chosen": -0.6366202235221863, + "logits/rejected": -0.712666928768158, + "logps/chosen": -56.84798812866211, + "logps/rejected": -114.59675598144531, + "loss": 0.5979, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.572664499282837, + "rewards/margins": 7.993659973144531, + "rewards/rejected": -4.420995712280273, + "step": 16800 + }, + { + "epoch": 4.2, + "grad_norm": 8.770730018615723, + "learning_rate": 6.13326943540254e-07, + "logits/chosen": -0.49046653509140015, + "logits/rejected": -0.543212354183197, + "logps/chosen": -59.75223159790039, + "logps/rejected": -96.099853515625, + "loss": 0.67, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3936052322387695, + "rewards/margins": 7.1188507080078125, + "rewards/rejected": -3.725245475769043, + "step": 16801 + }, + { + "epoch": 4.2, + "grad_norm": 3.7650582790374756, + "learning_rate": 6.129498186694855e-07, + "logits/chosen": -0.5633059740066528, + "logits/rejected": -0.6858923435211182, + "logps/chosen": -42.369667053222656, + "logps/rejected": -87.70025634765625, + "loss": 0.5094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.419217348098755, + "rewards/margins": 7.383110523223877, + "rewards/rejected": -3.963892698287964, + "step": 16802 + }, + { + "epoch": 4.2, + "grad_norm": 8.47021198272705, + "learning_rate": 6.125728022073907e-07, + "logits/chosen": -0.5755324959754944, + "logits/rejected": -0.6671244502067566, + "logps/chosen": -51.86592483520508, + "logps/rejected": -101.62696075439453, + "loss": 0.6121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7966251373291016, + "rewards/margins": 6.61402702331543, + "rewards/rejected": -3.817401885986328, + "step": 16803 + }, + { + "epoch": 4.2, + "grad_norm": 7.515117645263672, + "learning_rate": 6.121958941632889e-07, + "logits/chosen": -0.5188112258911133, + "logits/rejected": -0.6276484131813049, + "logps/chosen": -52.37672805786133, + "logps/rejected": -93.48811340332031, + "loss": 0.6405, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.850121259689331, + "rewards/margins": 6.078438758850098, + "rewards/rejected": -3.228318214416504, + "step": 16804 + }, + { + "epoch": 4.2, + "grad_norm": 3.6627864837646484, + "learning_rate": 6.118190945464947e-07, + "logits/chosen": -0.49113842844963074, + "logits/rejected": -0.5836139917373657, + "logps/chosen": -51.96561813354492, + "logps/rejected": -76.3582534790039, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3534138202667236, + "rewards/margins": 5.748654842376709, + "rewards/rejected": -2.3952412605285645, + "step": 16805 + }, + { + "epoch": 4.2, + "grad_norm": 3.258357286453247, + "learning_rate": 6.11442403366318e-07, + "logits/chosen": -0.555046021938324, + "logits/rejected": -0.5846553444862366, + "logps/chosen": -49.643001556396484, + "logps/rejected": -110.76925659179688, + "loss": 0.5619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3522791862487793, + "rewards/margins": 6.2708916664123535, + "rewards/rejected": -2.918612241744995, + "step": 16806 + }, + { + "epoch": 4.2, + "grad_norm": 4.283286094665527, + "learning_rate": 6.110658206320663e-07, + "logits/chosen": -0.5859407782554626, + "logits/rejected": -0.619130551815033, + "logps/chosen": -50.79798126220703, + "logps/rejected": -108.18376159667969, + "loss": 0.531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.312059164047241, + "rewards/margins": 6.7602691650390625, + "rewards/rejected": -3.448209524154663, + "step": 16807 + }, + { + "epoch": 4.2, + "grad_norm": 7.661229133605957, + "learning_rate": 6.106893463530478e-07, + "logits/chosen": -0.580573320388794, + "logits/rejected": -0.6304130554199219, + "logps/chosen": -56.21432876586914, + "logps/rejected": -114.01436614990234, + "loss": 0.7071, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0444772243499756, + "rewards/margins": 6.185238361358643, + "rewards/rejected": -3.140760660171509, + "step": 16808 + }, + { + "epoch": 4.2, + "grad_norm": 11.543906211853027, + "learning_rate": 6.103129805385639e-07, + "logits/chosen": -0.5798414945602417, + "logits/rejected": -0.6554606556892395, + "logps/chosen": -49.190860748291016, + "logps/rejected": -97.63251495361328, + "loss": 0.5824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8050646781921387, + "rewards/margins": 6.318391799926758, + "rewards/rejected": -3.513326644897461, + "step": 16809 + }, + { + "epoch": 4.21, + "grad_norm": 5.402731418609619, + "learning_rate": 6.099367231979142e-07, + "logits/chosen": -0.6364772915840149, + "logits/rejected": -0.7099757790565491, + "logps/chosen": -41.95347595214844, + "logps/rejected": -111.1115951538086, + "loss": 0.6037, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.324415445327759, + "rewards/margins": 8.671354293823242, + "rewards/rejected": -5.3469390869140625, + "step": 16810 + }, + { + "epoch": 4.21, + "grad_norm": 4.631683826446533, + "learning_rate": 6.095605743403987e-07, + "logits/chosen": -0.5880765318870544, + "logits/rejected": -0.6505365967750549, + "logps/chosen": -41.178619384765625, + "logps/rejected": -104.20439147949219, + "loss": 0.5706, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1136059761047363, + "rewards/margins": 7.79771614074707, + "rewards/rejected": -4.684110164642334, + "step": 16811 + }, + { + "epoch": 4.21, + "grad_norm": 5.495625972747803, + "learning_rate": 6.091845339753105e-07, + "logits/chosen": -0.5640738010406494, + "logits/rejected": -0.6427631974220276, + "logps/chosen": -48.158050537109375, + "logps/rejected": -94.37843322753906, + "loss": 0.6179, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.964550495147705, + "rewards/margins": 7.141101837158203, + "rewards/rejected": -4.176551342010498, + "step": 16812 + }, + { + "epoch": 4.21, + "grad_norm": 4.194469928741455, + "learning_rate": 6.088086021119438e-07, + "logits/chosen": -0.5359782576560974, + "logits/rejected": -0.6343372464179993, + "logps/chosen": -53.71529006958008, + "logps/rejected": -91.37750244140625, + "loss": 0.5457, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1437268257141113, + "rewards/margins": 6.845754146575928, + "rewards/rejected": -3.7020275592803955, + "step": 16813 + }, + { + "epoch": 4.21, + "grad_norm": 4.89098596572876, + "learning_rate": 6.084327787595872e-07, + "logits/chosen": -0.5880604982376099, + "logits/rejected": -0.6581502556800842, + "logps/chosen": -56.35350036621094, + "logps/rejected": -121.48941040039062, + "loss": 0.54, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5054938793182373, + "rewards/margins": 8.411009788513184, + "rewards/rejected": -4.905516147613525, + "step": 16814 + }, + { + "epoch": 4.21, + "grad_norm": 5.891386032104492, + "learning_rate": 6.080570639275263e-07, + "logits/chosen": -0.5197088718414307, + "logits/rejected": -0.5789421200752258, + "logps/chosen": -46.46357727050781, + "logps/rejected": -108.88725280761719, + "loss": 0.7081, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.656954288482666, + "rewards/margins": 7.127984046936035, + "rewards/rejected": -4.471029281616211, + "step": 16815 + }, + { + "epoch": 4.21, + "grad_norm": 16.906166076660156, + "learning_rate": 6.076814576250484e-07, + "logits/chosen": -0.5966508984565735, + "logits/rejected": -0.6728602647781372, + "logps/chosen": -52.12713623046875, + "logps/rejected": -94.59635925292969, + "loss": 0.5738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1642279624938965, + "rewards/margins": 6.956210136413574, + "rewards/rejected": -3.791982650756836, + "step": 16816 + }, + { + "epoch": 4.21, + "grad_norm": 4.814339637756348, + "learning_rate": 6.073059598614323e-07, + "logits/chosen": -0.5227665901184082, + "logits/rejected": -0.5692691802978516, + "logps/chosen": -54.74340057373047, + "logps/rejected": -99.51168823242188, + "loss": 0.6334, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7458930015563965, + "rewards/margins": 6.352705478668213, + "rewards/rejected": -3.6068127155303955, + "step": 16817 + }, + { + "epoch": 4.21, + "grad_norm": 5.742656707763672, + "learning_rate": 6.069305706459589e-07, + "logits/chosen": -0.5459873676300049, + "logits/rejected": -0.5701403021812439, + "logps/chosen": -55.00594711303711, + "logps/rejected": -106.9452133178711, + "loss": 0.6852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2407901287078857, + "rewards/margins": 6.156213283538818, + "rewards/rejected": -2.9154224395751953, + "step": 16818 + }, + { + "epoch": 4.21, + "grad_norm": 4.415740966796875, + "learning_rate": 6.06555289987903e-07, + "logits/chosen": -0.5593224763870239, + "logits/rejected": -0.682668924331665, + "logps/chosen": -53.91612243652344, + "logps/rejected": -95.48854064941406, + "loss": 0.6297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1429994106292725, + "rewards/margins": 7.398705005645752, + "rewards/rejected": -4.2557053565979, + "step": 16819 + }, + { + "epoch": 4.21, + "grad_norm": 5.101531505584717, + "learning_rate": 6.061801178965399e-07, + "logits/chosen": -0.5039979815483093, + "logits/rejected": -0.5607478022575378, + "logps/chosen": -58.9443473815918, + "logps/rejected": -112.996337890625, + "loss": 0.6822, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2898874282836914, + "rewards/margins": 7.578975677490234, + "rewards/rejected": -4.289087772369385, + "step": 16820 + }, + { + "epoch": 4.21, + "grad_norm": 3.587972640991211, + "learning_rate": 6.0580505438114e-07, + "logits/chosen": -0.5228476524353027, + "logits/rejected": -0.6172699928283691, + "logps/chosen": -47.72869873046875, + "logps/rejected": -99.46876525878906, + "loss": 0.5499, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.098670721054077, + "rewards/margins": 7.734986782073975, + "rewards/rejected": -4.636315822601318, + "step": 16821 + }, + { + "epoch": 4.21, + "grad_norm": 4.733515739440918, + "learning_rate": 6.054300994509698e-07, + "logits/chosen": -0.595067024230957, + "logits/rejected": -0.6789611577987671, + "logps/chosen": -53.881736755371094, + "logps/rejected": -108.46522521972656, + "loss": 0.6505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5304641723632812, + "rewards/margins": 7.11212158203125, + "rewards/rejected": -3.5816566944122314, + "step": 16822 + }, + { + "epoch": 4.21, + "grad_norm": 2.2962050437927246, + "learning_rate": 6.05055253115297e-07, + "logits/chosen": -0.5689553618431091, + "logits/rejected": -0.6531139612197876, + "logps/chosen": -50.80868911743164, + "logps/rejected": -103.31401062011719, + "loss": 0.5547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9891738891601562, + "rewards/margins": 7.22507905960083, + "rewards/rejected": -4.235905170440674, + "step": 16823 + }, + { + "epoch": 4.21, + "grad_norm": 3.9592387676239014, + "learning_rate": 6.046805153833835e-07, + "logits/chosen": -0.5723073482513428, + "logits/rejected": -0.6192684173583984, + "logps/chosen": -46.915611267089844, + "logps/rejected": -118.44603729248047, + "loss": 0.6474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.989760398864746, + "rewards/margins": 8.76926326751709, + "rewards/rejected": -5.779502868652344, + "step": 16824 + }, + { + "epoch": 4.21, + "grad_norm": 4.092177867889404, + "learning_rate": 6.043058862644885e-07, + "logits/chosen": -0.4749760031700134, + "logits/rejected": -0.5376487970352173, + "logps/chosen": -48.82684326171875, + "logps/rejected": -102.81245422363281, + "loss": 0.5681, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1857411861419678, + "rewards/margins": 6.915122985839844, + "rewards/rejected": -3.7293810844421387, + "step": 16825 + }, + { + "epoch": 4.21, + "grad_norm": 2.1037206649780273, + "learning_rate": 6.039313657678719e-07, + "logits/chosen": -0.6074721813201904, + "logits/rejected": -0.6705070734024048, + "logps/chosen": -38.164939880371094, + "logps/rejected": -102.24913024902344, + "loss": 0.4712, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1821084022521973, + "rewards/margins": 7.718374729156494, + "rewards/rejected": -4.536266803741455, + "step": 16826 + }, + { + "epoch": 4.21, + "grad_norm": 2.602134943008423, + "learning_rate": 6.035569539027869e-07, + "logits/chosen": -0.5308293700218201, + "logits/rejected": -0.6487451791763306, + "logps/chosen": -51.453224182128906, + "logps/rejected": -113.57485961914062, + "loss": 0.5423, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9028823375701904, + "rewards/margins": 7.356658935546875, + "rewards/rejected": -4.4537763595581055, + "step": 16827 + }, + { + "epoch": 4.21, + "grad_norm": 4.656032085418701, + "learning_rate": 6.031826506784849e-07, + "logits/chosen": -0.5932396650314331, + "logits/rejected": -0.6302884817123413, + "logps/chosen": -45.24071502685547, + "logps/rejected": -121.00874328613281, + "loss": 0.5323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.071357250213623, + "rewards/margins": 7.762354850769043, + "rewards/rejected": -4.69099760055542, + "step": 16828 + }, + { + "epoch": 4.21, + "grad_norm": 3.799171209335327, + "learning_rate": 6.028084561042163e-07, + "logits/chosen": -0.5706352591514587, + "logits/rejected": -0.6041247844696045, + "logps/chosen": -49.540672302246094, + "logps/rejected": -113.29085540771484, + "loss": 0.5773, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9232630729675293, + "rewards/margins": 6.780597686767578, + "rewards/rejected": -3.857334613800049, + "step": 16829 + }, + { + "epoch": 4.21, + "grad_norm": 2.3693745136260986, + "learning_rate": 6.024343701892287e-07, + "logits/chosen": -0.5333117246627808, + "logits/rejected": -0.5867854952812195, + "logps/chosen": -52.37921905517578, + "logps/rejected": -117.85547637939453, + "loss": 0.5385, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0125620365142822, + "rewards/margins": 7.034670829772949, + "rewards/rejected": -4.022108554840088, + "step": 16830 + }, + { + "epoch": 4.21, + "grad_norm": 9.144301414489746, + "learning_rate": 6.020603929427654e-07, + "logits/chosen": -0.5196459889411926, + "logits/rejected": -0.6298548579216003, + "logps/chosen": -58.00849914550781, + "logps/rejected": -104.07662963867188, + "loss": 0.6102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8742735385894775, + "rewards/margins": 7.4888410568237305, + "rewards/rejected": -4.614567756652832, + "step": 16831 + }, + { + "epoch": 4.21, + "grad_norm": 6.042243480682373, + "learning_rate": 6.016865243740661e-07, + "logits/chosen": -0.5303141474723816, + "logits/rejected": -0.6034038662910461, + "logps/chosen": -60.71324920654297, + "logps/rejected": -88.28048706054688, + "loss": 0.7232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9931752681732178, + "rewards/margins": 5.308913707733154, + "rewards/rejected": -2.3157386779785156, + "step": 16832 + }, + { + "epoch": 4.21, + "grad_norm": 3.1657416820526123, + "learning_rate": 6.013127644923727e-07, + "logits/chosen": -0.4880172610282898, + "logits/rejected": -0.579318106174469, + "logps/chosen": -53.24798583984375, + "logps/rejected": -113.28717803955078, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4807541370391846, + "rewards/margins": 7.811278343200684, + "rewards/rejected": -4.33052396774292, + "step": 16833 + }, + { + "epoch": 4.21, + "grad_norm": 4.41988468170166, + "learning_rate": 6.009391133069193e-07, + "logits/chosen": -0.5318565368652344, + "logits/rejected": -0.6068754196166992, + "logps/chosen": -65.89032745361328, + "logps/rejected": -107.78788757324219, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00771427154541, + "rewards/margins": 7.286520957946777, + "rewards/rejected": -4.278806686401367, + "step": 16834 + }, + { + "epoch": 4.21, + "grad_norm": 6.1966047286987305, + "learning_rate": 6.005655708269386e-07, + "logits/chosen": -0.5377724170684814, + "logits/rejected": -0.6447493433952332, + "logps/chosen": -59.93989562988281, + "logps/rejected": -111.55706787109375, + "loss": 0.6583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2495288848876953, + "rewards/margins": 7.908471584320068, + "rewards/rejected": -4.658943176269531, + "step": 16835 + }, + { + "epoch": 4.21, + "grad_norm": 1.4129019975662231, + "learning_rate": 6.001921370616626e-07, + "logits/chosen": -0.4448762834072113, + "logits/rejected": -0.5806891918182373, + "logps/chosen": -56.66529846191406, + "logps/rejected": -113.3834457397461, + "loss": 0.5312, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052117347717285, + "rewards/margins": 9.146634101867676, + "rewards/rejected": -6.094516277313232, + "step": 16836 + }, + { + "epoch": 4.21, + "grad_norm": 5.918139457702637, + "learning_rate": 5.998188120203185e-07, + "logits/chosen": -0.4974876642227173, + "logits/rejected": -0.5790396332740784, + "logps/chosen": -57.56172561645508, + "logps/rejected": -93.67086791992188, + "loss": 0.5885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.412874460220337, + "rewards/margins": 7.173162937164307, + "rewards/rejected": -3.7602875232696533, + "step": 16837 + }, + { + "epoch": 4.21, + "grad_norm": 7.896069049835205, + "learning_rate": 5.99445595712132e-07, + "logits/chosen": -0.5274709463119507, + "logits/rejected": -0.6204198598861694, + "logps/chosen": -61.703697204589844, + "logps/rejected": -98.77535247802734, + "loss": 0.7262, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8000123500823975, + "rewards/margins": 5.842668056488037, + "rewards/rejected": -3.0426559448242188, + "step": 16838 + }, + { + "epoch": 4.21, + "grad_norm": 5.972043037414551, + "learning_rate": 5.990724881463262e-07, + "logits/chosen": -0.566943883895874, + "logits/rejected": -0.662344753742218, + "logps/chosen": -53.31560516357422, + "logps/rejected": -95.61124420166016, + "loss": 0.628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.278747797012329, + "rewards/margins": 6.876760482788086, + "rewards/rejected": -3.5980124473571777, + "step": 16839 + }, + { + "epoch": 4.21, + "grad_norm": 3.3682005405426025, + "learning_rate": 5.986994893321191e-07, + "logits/chosen": -0.654878556728363, + "logits/rejected": -0.7307188510894775, + "logps/chosen": -58.21900939941406, + "logps/rejected": -111.67903137207031, + "loss": 0.6534, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.314342737197876, + "rewards/margins": 8.487622261047363, + "rewards/rejected": -5.17327880859375, + "step": 16840 + }, + { + "epoch": 4.21, + "grad_norm": 6.37495231628418, + "learning_rate": 5.983265992787296e-07, + "logits/chosen": -0.5467609167098999, + "logits/rejected": -0.6318246126174927, + "logps/chosen": -53.06098175048828, + "logps/rejected": -113.8072738647461, + "loss": 0.5613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.952366590499878, + "rewards/margins": 8.07905387878418, + "rewards/rejected": -5.126688480377197, + "step": 16841 + }, + { + "epoch": 4.21, + "grad_norm": 2.8172905445098877, + "learning_rate": 5.97953817995372e-07, + "logits/chosen": -0.5615653991699219, + "logits/rejected": -0.6412890553474426, + "logps/chosen": -52.91305923461914, + "logps/rejected": -121.96988677978516, + "loss": 0.5899, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1492228507995605, + "rewards/margins": 8.80100154876709, + "rewards/rejected": -5.651778697967529, + "step": 16842 + }, + { + "epoch": 4.21, + "grad_norm": 6.820895671844482, + "learning_rate": 5.975811454912567e-07, + "logits/chosen": -0.5192581415176392, + "logits/rejected": -0.6181051135063171, + "logps/chosen": -47.487056732177734, + "logps/rejected": -97.61394500732422, + "loss": 0.5426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.134352207183838, + "rewards/margins": 7.388121604919434, + "rewards/rejected": -4.253769397735596, + "step": 16843 + }, + { + "epoch": 4.21, + "grad_norm": 5.7958574295043945, + "learning_rate": 5.972085817755941e-07, + "logits/chosen": -0.5016531944274902, + "logits/rejected": -0.5835943222045898, + "logps/chosen": -56.39954376220703, + "logps/rejected": -110.2991714477539, + "loss": 0.6153, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.261148452758789, + "rewards/margins": 7.451364040374756, + "rewards/rejected": -4.190216064453125, + "step": 16844 + }, + { + "epoch": 4.21, + "grad_norm": 2.4893627166748047, + "learning_rate": 5.968361268575923e-07, + "logits/chosen": -0.4923841655254364, + "logits/rejected": -0.5681197047233582, + "logps/chosen": -55.45541763305664, + "logps/rejected": -100.71910095214844, + "loss": 0.5857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3223297595977783, + "rewards/margins": 6.312912464141846, + "rewards/rejected": -2.9905827045440674, + "step": 16845 + }, + { + "epoch": 4.21, + "grad_norm": 3.1430857181549072, + "learning_rate": 5.964637807464507e-07, + "logits/chosen": -0.5118310451507568, + "logits/rejected": -0.6328033804893494, + "logps/chosen": -58.003440856933594, + "logps/rejected": -104.37003326416016, + "loss": 0.6097, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.946462631225586, + "rewards/margins": 7.60135555267334, + "rewards/rejected": -4.654892921447754, + "step": 16846 + }, + { + "epoch": 4.21, + "grad_norm": 3.1622629165649414, + "learning_rate": 5.960915434513736e-07, + "logits/chosen": -0.5940028429031372, + "logits/rejected": -0.6585474610328674, + "logps/chosen": -49.65565872192383, + "logps/rejected": -100.5001220703125, + "loss": 0.5673, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0971291065216064, + "rewards/margins": 7.128450393676758, + "rewards/rejected": -4.0313215255737305, + "step": 16847 + }, + { + "epoch": 4.21, + "grad_norm": 2.2398743629455566, + "learning_rate": 5.957194149815604e-07, + "logits/chosen": -0.5380409359931946, + "logits/rejected": -0.6239379048347473, + "logps/chosen": -46.97411346435547, + "logps/rejected": -119.63013458251953, + "loss": 0.5608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2526283264160156, + "rewards/margins": 8.647749900817871, + "rewards/rejected": -5.3951215744018555, + "step": 16848 + }, + { + "epoch": 4.21, + "grad_norm": 5.299468040466309, + "learning_rate": 5.953473953462036e-07, + "logits/chosen": -0.526752233505249, + "logits/rejected": -0.6028855443000793, + "logps/chosen": -52.09300231933594, + "logps/rejected": -116.75245666503906, + "loss": 0.5819, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1451234817504883, + "rewards/margins": 7.603282928466797, + "rewards/rejected": -4.458159923553467, + "step": 16849 + }, + { + "epoch": 4.22, + "grad_norm": 5.497061252593994, + "learning_rate": 5.949754845544969e-07, + "logits/chosen": -0.5626974701881409, + "logits/rejected": -0.533418595790863, + "logps/chosen": -55.18020248413086, + "logps/rejected": -124.7348861694336, + "loss": 0.6356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3168110847473145, + "rewards/margins": 6.318811416625977, + "rewards/rejected": -3.002000331878662, + "step": 16850 + }, + { + "epoch": 4.22, + "grad_norm": 10.58015251159668, + "learning_rate": 5.946036826156327e-07, + "logits/chosen": -0.5707657933235168, + "logits/rejected": -0.6159713864326477, + "logps/chosen": -55.37137222290039, + "logps/rejected": -96.8478775024414, + "loss": 0.6625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0563549995422363, + "rewards/margins": 5.984971523284912, + "rewards/rejected": -2.9286160469055176, + "step": 16851 + }, + { + "epoch": 4.22, + "grad_norm": 4.928633213043213, + "learning_rate": 5.942319895387971e-07, + "logits/chosen": -0.5861437320709229, + "logits/rejected": -0.6772527694702148, + "logps/chosen": -44.85744094848633, + "logps/rejected": -96.14036560058594, + "loss": 0.6104, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1556737422943115, + "rewards/margins": 8.085883140563965, + "rewards/rejected": -4.930209159851074, + "step": 16852 + }, + { + "epoch": 4.22, + "grad_norm": 3.806870937347412, + "learning_rate": 5.938604053331743e-07, + "logits/chosen": -0.5499563217163086, + "logits/rejected": -0.6708031892776489, + "logps/chosen": -56.58745193481445, + "logps/rejected": -111.73774719238281, + "loss": 0.5735, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.832526683807373, + "rewards/margins": 7.215837478637695, + "rewards/rejected": -4.383310794830322, + "step": 16853 + }, + { + "epoch": 4.22, + "grad_norm": 3.7203712463378906, + "learning_rate": 5.934889300079489e-07, + "logits/chosen": -0.5799196362495422, + "logits/rejected": -0.7018716931343079, + "logps/chosen": -49.609439849853516, + "logps/rejected": -110.7535629272461, + "loss": 0.5608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9877851009368896, + "rewards/margins": 7.397489070892334, + "rewards/rejected": -4.409704208374023, + "step": 16854 + }, + { + "epoch": 4.22, + "grad_norm": 2.4815547466278076, + "learning_rate": 5.931175635722974e-07, + "logits/chosen": -0.512069582939148, + "logits/rejected": -0.5844742059707642, + "logps/chosen": -52.83388900756836, + "logps/rejected": -107.8611068725586, + "loss": 0.5771, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1471362113952637, + "rewards/margins": 7.479123592376709, + "rewards/rejected": -4.331987380981445, + "step": 16855 + }, + { + "epoch": 4.22, + "grad_norm": 3.416060209274292, + "learning_rate": 5.927463060353994e-07, + "logits/chosen": -0.5262225866317749, + "logits/rejected": -0.584906816482544, + "logps/chosen": -59.80195236206055, + "logps/rejected": -121.82955932617188, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.285984992980957, + "rewards/margins": 8.170602798461914, + "rewards/rejected": -4.884618282318115, + "step": 16856 + }, + { + "epoch": 4.22, + "grad_norm": 4.375586986541748, + "learning_rate": 5.923751574064285e-07, + "logits/chosen": -0.5779792666435242, + "logits/rejected": -0.6831375956535339, + "logps/chosen": -57.4077033996582, + "logps/rejected": -121.62606048583984, + "loss": 0.643, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.045189380645752, + "rewards/margins": 7.231575012207031, + "rewards/rejected": -4.186385631561279, + "step": 16857 + }, + { + "epoch": 4.22, + "grad_norm": 10.050924301147461, + "learning_rate": 5.92004117694554e-07, + "logits/chosen": -0.5388941168785095, + "logits/rejected": -0.5704344511032104, + "logps/chosen": -56.482173919677734, + "logps/rejected": -126.65565490722656, + "loss": 0.5968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.905527114868164, + "rewards/margins": 7.263846397399902, + "rewards/rejected": -4.358319282531738, + "step": 16858 + }, + { + "epoch": 4.22, + "grad_norm": 5.264479160308838, + "learning_rate": 5.916331869089475e-07, + "logits/chosen": -0.5351345539093018, + "logits/rejected": -0.574417233467102, + "logps/chosen": -51.519866943359375, + "logps/rejected": -109.984619140625, + "loss": 0.5991, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7020492553710938, + "rewards/margins": 6.764281272888184, + "rewards/rejected": -4.06223201751709, + "step": 16859 + }, + { + "epoch": 4.22, + "grad_norm": 4.558798789978027, + "learning_rate": 5.912623650587729e-07, + "logits/chosen": -0.6185218095779419, + "logits/rejected": -0.6896014213562012, + "logps/chosen": -75.17570495605469, + "logps/rejected": -99.42545318603516, + "loss": 0.7407, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9636380672454834, + "rewards/margins": 6.316003322601318, + "rewards/rejected": -3.352365016937256, + "step": 16860 + }, + { + "epoch": 4.22, + "grad_norm": 2.9627280235290527, + "learning_rate": 5.908916521531955e-07, + "logits/chosen": -0.4839259386062622, + "logits/rejected": -0.5824503898620605, + "logps/chosen": -55.188018798828125, + "logps/rejected": -103.63201904296875, + "loss": 0.5773, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2763233184814453, + "rewards/margins": 7.5720391273498535, + "rewards/rejected": -4.295716285705566, + "step": 16861 + }, + { + "epoch": 4.22, + "grad_norm": 2.2402124404907227, + "learning_rate": 5.90521048201374e-07, + "logits/chosen": -0.5137392282485962, + "logits/rejected": -0.5857954025268555, + "logps/chosen": -49.382659912109375, + "logps/rejected": -102.54261779785156, + "loss": 0.5132, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.417841911315918, + "rewards/margins": 6.905979156494141, + "rewards/rejected": -3.4881367683410645, + "step": 16862 + }, + { + "epoch": 4.22, + "grad_norm": 6.350591659545898, + "learning_rate": 5.901505532124691e-07, + "logits/chosen": -0.5985348224639893, + "logits/rejected": -0.6826659440994263, + "logps/chosen": -50.635093688964844, + "logps/rejected": -94.76861572265625, + "loss": 0.5918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2775723934173584, + "rewards/margins": 6.174283504486084, + "rewards/rejected": -2.8967106342315674, + "step": 16863 + }, + { + "epoch": 4.22, + "grad_norm": 6.420272350311279, + "learning_rate": 5.897801671956338e-07, + "logits/chosen": -0.5577428340911865, + "logits/rejected": -0.5980195999145508, + "logps/chosen": -55.482913970947266, + "logps/rejected": -109.43136596679688, + "loss": 0.5603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.151575803756714, + "rewards/margins": 6.227139472961426, + "rewards/rejected": -3.07556414604187, + "step": 16864 + }, + { + "epoch": 4.22, + "grad_norm": 6.65200662612915, + "learning_rate": 5.894098901600209e-07, + "logits/chosen": -0.528204619884491, + "logits/rejected": -0.6161056756973267, + "logps/chosen": -58.78325653076172, + "logps/rejected": -98.58118438720703, + "loss": 0.6513, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.085991382598877, + "rewards/margins": 6.459677219390869, + "rewards/rejected": -3.373685359954834, + "step": 16865 + }, + { + "epoch": 4.22, + "grad_norm": 3.7562174797058105, + "learning_rate": 5.89039722114782e-07, + "logits/chosen": -0.5788651704788208, + "logits/rejected": -0.6359265446662903, + "logps/chosen": -50.69920349121094, + "logps/rejected": -99.63545227050781, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0377087593078613, + "rewards/margins": 6.996213912963867, + "rewards/rejected": -3.9585044384002686, + "step": 16866 + }, + { + "epoch": 4.22, + "grad_norm": 10.438531875610352, + "learning_rate": 5.886696630690631e-07, + "logits/chosen": -0.5961686968803406, + "logits/rejected": -0.6938927173614502, + "logps/chosen": -54.9403190612793, + "logps/rejected": -109.1700210571289, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9281060695648193, + "rewards/margins": 8.098273277282715, + "rewards/rejected": -5.170166492462158, + "step": 16867 + }, + { + "epoch": 4.22, + "grad_norm": 4.78233528137207, + "learning_rate": 5.882997130320079e-07, + "logits/chosen": -0.7028594613075256, + "logits/rejected": -0.7610486745834351, + "logps/chosen": -43.933677673339844, + "logps/rejected": -126.99881744384766, + "loss": 0.551, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2388174533843994, + "rewards/margins": 9.12469482421875, + "rewards/rejected": -5.885878562927246, + "step": 16868 + }, + { + "epoch": 4.22, + "grad_norm": 18.486921310424805, + "learning_rate": 5.879298720127607e-07, + "logits/chosen": -0.5766618251800537, + "logits/rejected": -0.6611183285713196, + "logps/chosen": -49.869232177734375, + "logps/rejected": -120.78984832763672, + "loss": 0.7006, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1995656490325928, + "rewards/margins": 6.919914722442627, + "rewards/rejected": -3.720349073410034, + "step": 16869 + }, + { + "epoch": 4.22, + "grad_norm": 3.960226058959961, + "learning_rate": 5.87560140020459e-07, + "logits/chosen": -0.5634317398071289, + "logits/rejected": -0.6055009365081787, + "logps/chosen": -54.60443115234375, + "logps/rejected": -127.11671447753906, + "loss": 0.6423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1956429481506348, + "rewards/margins": 7.887317180633545, + "rewards/rejected": -4.69167423248291, + "step": 16870 + }, + { + "epoch": 4.22, + "grad_norm": 4.907484531402588, + "learning_rate": 5.871905170642383e-07, + "logits/chosen": -0.549401044845581, + "logits/rejected": -0.6179159283638, + "logps/chosen": -43.214935302734375, + "logps/rejected": -103.292724609375, + "loss": 0.5018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.502837657928467, + "rewards/margins": 8.105876922607422, + "rewards/rejected": -4.603039264678955, + "step": 16871 + }, + { + "epoch": 4.22, + "grad_norm": 6.903292179107666, + "learning_rate": 5.868210031532334e-07, + "logits/chosen": -0.5895723104476929, + "logits/rejected": -0.6680097579956055, + "logps/chosen": -50.1244010925293, + "logps/rejected": -100.12300109863281, + "loss": 0.5666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.319944381713867, + "rewards/margins": 7.313661098480225, + "rewards/rejected": -3.9937167167663574, + "step": 16872 + }, + { + "epoch": 4.22, + "grad_norm": 16.794885635375977, + "learning_rate": 5.864515982965785e-07, + "logits/chosen": -0.60509192943573, + "logits/rejected": -0.6669076681137085, + "logps/chosen": -66.934814453125, + "logps/rejected": -105.62261962890625, + "loss": 0.7099, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.663628339767456, + "rewards/margins": 6.522610187530518, + "rewards/rejected": -3.8589816093444824, + "step": 16873 + }, + { + "epoch": 4.22, + "grad_norm": 4.103189468383789, + "learning_rate": 5.860823025033963e-07, + "logits/chosen": -0.5569210648536682, + "logits/rejected": -0.6416184902191162, + "logps/chosen": -56.64820098876953, + "logps/rejected": -109.17874908447266, + "loss": 0.6118, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.991989850997925, + "rewards/margins": 7.548135757446289, + "rewards/rejected": -4.556144714355469, + "step": 16874 + }, + { + "epoch": 4.22, + "grad_norm": 10.021173477172852, + "learning_rate": 5.857131157828156e-07, + "logits/chosen": -0.6266628503799438, + "logits/rejected": -0.725378155708313, + "logps/chosen": -58.012786865234375, + "logps/rejected": -102.85494995117188, + "loss": 0.6524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.079479694366455, + "rewards/margins": 6.231206893920898, + "rewards/rejected": -3.1517271995544434, + "step": 16875 + }, + { + "epoch": 4.22, + "grad_norm": 37.249359130859375, + "learning_rate": 5.853440381439596e-07, + "logits/chosen": -0.5762194395065308, + "logits/rejected": -0.6649990677833557, + "logps/chosen": -54.17645263671875, + "logps/rejected": -121.39505767822266, + "loss": 0.6372, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9465956687927246, + "rewards/margins": 8.327589988708496, + "rewards/rejected": -5.380995273590088, + "step": 16876 + }, + { + "epoch": 4.22, + "grad_norm": 1.8958324193954468, + "learning_rate": 5.849750695959483e-07, + "logits/chosen": -0.5432174205780029, + "logits/rejected": -0.5943338871002197, + "logps/chosen": -43.424652099609375, + "logps/rejected": -130.29867553710938, + "loss": 0.5098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3985514640808105, + "rewards/margins": 8.54062271118164, + "rewards/rejected": -5.142070293426514, + "step": 16877 + }, + { + "epoch": 4.22, + "grad_norm": 6.5929951667785645, + "learning_rate": 5.846062101478983e-07, + "logits/chosen": -0.6304062604904175, + "logits/rejected": -0.7412444353103638, + "logps/chosen": -61.681270599365234, + "logps/rejected": -105.01793670654297, + "loss": 0.638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2540624141693115, + "rewards/margins": 8.242273330688477, + "rewards/rejected": -4.988211154937744, + "step": 16878 + }, + { + "epoch": 4.22, + "grad_norm": 4.851074695587158, + "learning_rate": 5.842374598089262e-07, + "logits/chosen": -0.5114128589630127, + "logits/rejected": -0.6151154637336731, + "logps/chosen": -52.48008728027344, + "logps/rejected": -123.05213165283203, + "loss": 0.5505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2559263706207275, + "rewards/margins": 9.119382858276367, + "rewards/rejected": -5.8634562492370605, + "step": 16879 + }, + { + "epoch": 4.22, + "grad_norm": 4.683154582977295, + "learning_rate": 5.838688185881419e-07, + "logits/chosen": -0.5215896964073181, + "logits/rejected": -0.6002232432365417, + "logps/chosen": -50.16464614868164, + "logps/rejected": -94.36395263671875, + "loss": 0.4951, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1039881706237793, + "rewards/margins": 6.454463005065918, + "rewards/rejected": -3.3504745960235596, + "step": 16880 + }, + { + "epoch": 4.22, + "grad_norm": 6.410794258117676, + "learning_rate": 5.835002864946576e-07, + "logits/chosen": -0.513073742389679, + "logits/rejected": -0.5597847104072571, + "logps/chosen": -59.055179595947266, + "logps/rejected": -125.73455810546875, + "loss": 0.7636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.895301580429077, + "rewards/margins": 6.888530254364014, + "rewards/rejected": -3.9932291507720947, + "step": 16881 + }, + { + "epoch": 4.22, + "grad_norm": 3.561359167098999, + "learning_rate": 5.831318635375788e-07, + "logits/chosen": -0.5116280317306519, + "logits/rejected": -0.5725180506706238, + "logps/chosen": -45.3942985534668, + "logps/rejected": -129.48277282714844, + "loss": 0.4999, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.174823045730591, + "rewards/margins": 9.558361053466797, + "rewards/rejected": -6.383537292480469, + "step": 16882 + }, + { + "epoch": 4.22, + "grad_norm": 3.9697940349578857, + "learning_rate": 5.827635497260086e-07, + "logits/chosen": -0.6135698556900024, + "logits/rejected": -0.7129790186882019, + "logps/chosen": -55.68632507324219, + "logps/rejected": -117.33008575439453, + "loss": 0.6191, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.11991024017334, + "rewards/margins": 8.430082321166992, + "rewards/rejected": -5.310172080993652, + "step": 16883 + }, + { + "epoch": 4.22, + "grad_norm": 3.215003728866577, + "learning_rate": 5.823953450690506e-07, + "logits/chosen": -0.5506500005722046, + "logits/rejected": -0.6651908159255981, + "logps/chosen": -53.6328125, + "logps/rejected": -106.43885040283203, + "loss": 0.5265, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8018546104431152, + "rewards/margins": 7.697521686553955, + "rewards/rejected": -4.895667552947998, + "step": 16884 + }, + { + "epoch": 4.22, + "grad_norm": 12.341059684753418, + "learning_rate": 5.820272495758023e-07, + "logits/chosen": -0.5297448039054871, + "logits/rejected": -0.574547529220581, + "logps/chosen": -75.28148651123047, + "logps/rejected": -149.79942321777344, + "loss": 0.628, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.031980037689209, + "rewards/margins": 8.989668846130371, + "rewards/rejected": -5.957688331604004, + "step": 16885 + }, + { + "epoch": 4.22, + "grad_norm": 2.1507463455200195, + "learning_rate": 5.816592632553586e-07, + "logits/chosen": -0.5897440910339355, + "logits/rejected": -0.689337432384491, + "logps/chosen": -49.935340881347656, + "logps/rejected": -115.86585235595703, + "loss": 0.5601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3983092308044434, + "rewards/margins": 9.250375747680664, + "rewards/rejected": -5.852067947387695, + "step": 16886 + }, + { + "epoch": 4.22, + "grad_norm": 11.155770301818848, + "learning_rate": 5.812913861168146e-07, + "logits/chosen": -0.5861584544181824, + "logits/rejected": -0.6278960704803467, + "logps/chosen": -58.12919998168945, + "logps/rejected": -116.01226043701172, + "loss": 0.8874, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.891855239868164, + "rewards/margins": 5.648340225219727, + "rewards/rejected": -2.7564845085144043, + "step": 16887 + }, + { + "epoch": 4.22, + "grad_norm": 7.829804420471191, + "learning_rate": 5.809236181692618e-07, + "logits/chosen": -0.4987042546272278, + "logits/rejected": -0.5071264505386353, + "logps/chosen": -64.2052993774414, + "logps/rejected": -113.9158706665039, + "loss": 0.7262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.108905792236328, + "rewards/margins": 6.253793716430664, + "rewards/rejected": -3.1448872089385986, + "step": 16888 + }, + { + "epoch": 4.22, + "grad_norm": 2.808100938796997, + "learning_rate": 5.805559594217853e-07, + "logits/chosen": -0.5851700305938721, + "logits/rejected": -0.6722366809844971, + "logps/chosen": -42.31249237060547, + "logps/rejected": -116.6832275390625, + "loss": 0.506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3248002529144287, + "rewards/margins": 8.383662223815918, + "rewards/rejected": -5.058862209320068, + "step": 16889 + }, + { + "epoch": 4.23, + "grad_norm": 6.670212268829346, + "learning_rate": 5.801884098834715e-07, + "logits/chosen": -0.530128002166748, + "logits/rejected": -0.6538036465644836, + "logps/chosen": -51.40653610229492, + "logps/rejected": -105.8180160522461, + "loss": 0.5651, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1589086055755615, + "rewards/margins": 8.61082935333252, + "rewards/rejected": -5.451920509338379, + "step": 16890 + }, + { + "epoch": 4.23, + "grad_norm": 3.7992300987243652, + "learning_rate": 5.798209695634049e-07, + "logits/chosen": -0.6204918622970581, + "logits/rejected": -0.6391559839248657, + "logps/chosen": -40.812828063964844, + "logps/rejected": -107.43071746826172, + "loss": 0.5495, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1539244651794434, + "rewards/margins": 6.848796844482422, + "rewards/rejected": -3.6948723793029785, + "step": 16891 + }, + { + "epoch": 4.23, + "grad_norm": 7.102022647857666, + "learning_rate": 5.794536384706623e-07, + "logits/chosen": -0.5213304758071899, + "logits/rejected": -0.6149378418922424, + "logps/chosen": -67.60862731933594, + "logps/rejected": -103.65042877197266, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.016477108001709, + "rewards/margins": 6.604331970214844, + "rewards/rejected": -3.5878543853759766, + "step": 16892 + }, + { + "epoch": 4.23, + "grad_norm": 3.557623863220215, + "learning_rate": 5.790864166143217e-07, + "logits/chosen": -0.5238791108131409, + "logits/rejected": -0.6141228079795837, + "logps/chosen": -47.06135559082031, + "logps/rejected": -114.11376953125, + "loss": 0.5314, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.035118818283081, + "rewards/margins": 7.997758388519287, + "rewards/rejected": -4.962639808654785, + "step": 16893 + }, + { + "epoch": 4.23, + "grad_norm": 4.537546634674072, + "learning_rate": 5.787193040034589e-07, + "logits/chosen": -0.4708593487739563, + "logits/rejected": -0.5234618186950684, + "logps/chosen": -59.90089416503906, + "logps/rejected": -95.91293334960938, + "loss": 0.6505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.914031505584717, + "rewards/margins": 6.239211082458496, + "rewards/rejected": -3.325178623199463, + "step": 16894 + }, + { + "epoch": 4.23, + "grad_norm": 3.653743267059326, + "learning_rate": 5.783523006471454e-07, + "logits/chosen": -0.5594319701194763, + "logits/rejected": -0.6580781936645508, + "logps/chosen": -53.83877944946289, + "logps/rejected": -114.74533081054688, + "loss": 0.5678, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7684714794158936, + "rewards/margins": 7.254705429077148, + "rewards/rejected": -4.486234664916992, + "step": 16895 + }, + { + "epoch": 4.23, + "grad_norm": 3.5003793239593506, + "learning_rate": 5.779854065544482e-07, + "logits/chosen": -0.6069767475128174, + "logits/rejected": -0.6804521679878235, + "logps/chosen": -54.08050537109375, + "logps/rejected": -96.81362915039062, + "loss": 0.5027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.873591423034668, + "rewards/margins": 6.914737701416016, + "rewards/rejected": -4.041146278381348, + "step": 16896 + }, + { + "epoch": 4.23, + "grad_norm": 3.7664616107940674, + "learning_rate": 5.77618621734436e-07, + "logits/chosen": -0.6936194896697998, + "logits/rejected": -0.7366913557052612, + "logps/chosen": -46.779720306396484, + "logps/rejected": -104.49546813964844, + "loss": 0.5659, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1738691329956055, + "rewards/margins": 6.837678909301758, + "rewards/rejected": -3.6638095378875732, + "step": 16897 + }, + { + "epoch": 4.23, + "grad_norm": 3.0577752590179443, + "learning_rate": 5.772519461961718e-07, + "logits/chosen": -0.5438026785850525, + "logits/rejected": -0.6163855791091919, + "logps/chosen": -52.68404006958008, + "logps/rejected": -107.1212158203125, + "loss": 0.6044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.166177988052368, + "rewards/margins": 7.765102863311768, + "rewards/rejected": -4.598924160003662, + "step": 16898 + }, + { + "epoch": 4.23, + "grad_norm": 26.7305850982666, + "learning_rate": 5.768853799487151e-07, + "logits/chosen": -0.5667040944099426, + "logits/rejected": -0.6351649761199951, + "logps/chosen": -51.601871490478516, + "logps/rejected": -115.50465393066406, + "loss": 0.6333, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143951654434204, + "rewards/margins": 7.340972423553467, + "rewards/rejected": -4.197020530700684, + "step": 16899 + }, + { + "epoch": 4.23, + "grad_norm": 5.65669059753418, + "learning_rate": 5.765189230011265e-07, + "logits/chosen": -0.5854135751724243, + "logits/rejected": -0.6753888130187988, + "logps/chosen": -56.586883544921875, + "logps/rejected": -108.62347412109375, + "loss": 0.5452, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7575740814208984, + "rewards/margins": 8.271693229675293, + "rewards/rejected": -5.5141191482543945, + "step": 16900 + }, + { + "epoch": 4.23, + "grad_norm": 6.789384365081787, + "learning_rate": 5.761525753624592e-07, + "logits/chosen": -0.5998285412788391, + "logits/rejected": -0.6925064921379089, + "logps/chosen": -66.95306396484375, + "logps/rejected": -104.03759765625, + "loss": 0.6396, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0165419578552246, + "rewards/margins": 7.815422058105469, + "rewards/rejected": -4.798879623413086, + "step": 16901 + }, + { + "epoch": 4.23, + "grad_norm": 5.369110584259033, + "learning_rate": 5.757863370417682e-07, + "logits/chosen": -0.5984207987785339, + "logits/rejected": -0.6426388621330261, + "logps/chosen": -45.039268493652344, + "logps/rejected": -102.43241119384766, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.96002197265625, + "rewards/margins": 7.079606056213379, + "rewards/rejected": -4.119584083557129, + "step": 16902 + }, + { + "epoch": 4.23, + "grad_norm": 5.132857322692871, + "learning_rate": 5.754202080481025e-07, + "logits/chosen": -0.5042356848716736, + "logits/rejected": -0.5980678200721741, + "logps/chosen": -51.596588134765625, + "logps/rejected": -91.18732452392578, + "loss": 0.6397, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1051104068756104, + "rewards/margins": 6.714985370635986, + "rewards/rejected": -3.609875202178955, + "step": 16903 + }, + { + "epoch": 4.23, + "grad_norm": 4.4242844581604, + "learning_rate": 5.750541883905086e-07, + "logits/chosen": -0.5713094472885132, + "logits/rejected": -0.6470591425895691, + "logps/chosen": -58.13766098022461, + "logps/rejected": -111.25450134277344, + "loss": 0.6179, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2158193588256836, + "rewards/margins": 7.128481388092041, + "rewards/rejected": -3.912661552429199, + "step": 16904 + }, + { + "epoch": 4.23, + "grad_norm": 4.783470153808594, + "learning_rate": 5.746882780780322e-07, + "logits/chosen": -0.6045598387718201, + "logits/rejected": -0.7022421956062317, + "logps/chosen": -37.600494384765625, + "logps/rejected": -99.45647430419922, + "loss": 0.5176, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.204948902130127, + "rewards/margins": 7.466421604156494, + "rewards/rejected": -4.261472702026367, + "step": 16905 + }, + { + "epoch": 4.23, + "grad_norm": 5.926851749420166, + "learning_rate": 5.74322477119717e-07, + "logits/chosen": -0.5103089809417725, + "logits/rejected": -0.5620150566101074, + "logps/chosen": -58.7328987121582, + "logps/rejected": -120.9937973022461, + "loss": 0.779, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9993786811828613, + "rewards/margins": 7.267881393432617, + "rewards/rejected": -4.268502235412598, + "step": 16906 + }, + { + "epoch": 4.23, + "grad_norm": 3.4597954750061035, + "learning_rate": 5.739567855246003e-07, + "logits/chosen": -0.5429419279098511, + "logits/rejected": -0.6443183422088623, + "logps/chosen": -56.04709243774414, + "logps/rejected": -125.08531188964844, + "loss": 0.5532, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.875230073928833, + "rewards/margins": 8.779579162597656, + "rewards/rejected": -5.904350280761719, + "step": 16907 + }, + { + "epoch": 4.23, + "grad_norm": 2.422757863998413, + "learning_rate": 5.735912033017183e-07, + "logits/chosen": -0.516506016254425, + "logits/rejected": -0.6505733132362366, + "logps/chosen": -61.48208999633789, + "logps/rejected": -111.81275939941406, + "loss": 0.5771, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.129870653152466, + "rewards/margins": 8.564898490905762, + "rewards/rejected": -5.435028076171875, + "step": 16908 + }, + { + "epoch": 4.23, + "grad_norm": 6.600182056427002, + "learning_rate": 5.732257304601063e-07, + "logits/chosen": -0.6346044540405273, + "logits/rejected": -0.6621986627578735, + "logps/chosen": -52.706993103027344, + "logps/rejected": -116.78221893310547, + "loss": 0.6436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.245528221130371, + "rewards/margins": 7.434115886688232, + "rewards/rejected": -4.1885881423950195, + "step": 16909 + }, + { + "epoch": 4.23, + "grad_norm": 5.55273962020874, + "learning_rate": 5.728603670087957e-07, + "logits/chosen": -0.4818643629550934, + "logits/rejected": -0.6171314716339111, + "logps/chosen": -54.5153694152832, + "logps/rejected": -97.27546691894531, + "loss": 0.6303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.106266975402832, + "rewards/margins": 6.920137405395508, + "rewards/rejected": -3.813871145248413, + "step": 16910 + }, + { + "epoch": 4.23, + "grad_norm": 13.678643226623535, + "learning_rate": 5.724951129568129e-07, + "logits/chosen": -0.584480881690979, + "logits/rejected": -0.6610623598098755, + "logps/chosen": -49.37994384765625, + "logps/rejected": -122.6475601196289, + "loss": 0.5237, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.937981605529785, + "rewards/margins": 8.289885520935059, + "rewards/rejected": -5.351903438568115, + "step": 16911 + }, + { + "epoch": 4.23, + "grad_norm": 4.173101902008057, + "learning_rate": 5.721299683131865e-07, + "logits/chosen": -0.5758249163627625, + "logits/rejected": -0.5937949419021606, + "logps/chosen": -54.99541473388672, + "logps/rejected": -110.35030364990234, + "loss": 0.6795, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.455709218978882, + "rewards/margins": 6.503400802612305, + "rewards/rejected": -3.04769229888916, + "step": 16912 + }, + { + "epoch": 4.23, + "grad_norm": 3.443499803543091, + "learning_rate": 5.717649330869384e-07, + "logits/chosen": -0.5720151662826538, + "logits/rejected": -0.6574194431304932, + "logps/chosen": -43.60897445678711, + "logps/rejected": -124.5435791015625, + "loss": 0.5557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3030624389648438, + "rewards/margins": 8.176949501037598, + "rewards/rejected": -4.873887538909912, + "step": 16913 + }, + { + "epoch": 4.23, + "grad_norm": 3.5371127128601074, + "learning_rate": 5.714000072870874e-07, + "logits/chosen": -0.6050941944122314, + "logits/rejected": -0.713356614112854, + "logps/chosen": -72.40890502929688, + "logps/rejected": -110.12893676757812, + "loss": 0.6801, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0251078605651855, + "rewards/margins": 7.134554862976074, + "rewards/rejected": -4.109447002410889, + "step": 16914 + }, + { + "epoch": 4.23, + "grad_norm": 6.428924560546875, + "learning_rate": 5.710351909226531e-07, + "logits/chosen": -0.6111119985580444, + "logits/rejected": -0.7038332223892212, + "logps/chosen": -50.34403991699219, + "logps/rejected": -103.53240203857422, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1428050994873047, + "rewards/margins": 7.696630001068115, + "rewards/rejected": -4.553824424743652, + "step": 16915 + }, + { + "epoch": 4.23, + "grad_norm": 1.6412380933761597, + "learning_rate": 5.706704840026517e-07, + "logits/chosen": -0.5599216818809509, + "logits/rejected": -0.6578879356384277, + "logps/chosen": -53.58917999267578, + "logps/rejected": -103.82917022705078, + "loss": 0.5263, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.386085033416748, + "rewards/margins": 8.3994779586792, + "rewards/rejected": -5.013392448425293, + "step": 16916 + }, + { + "epoch": 4.23, + "grad_norm": 2.8950207233428955, + "learning_rate": 5.703058865360922e-07, + "logits/chosen": -0.4914143979549408, + "logits/rejected": -0.6401086449623108, + "logps/chosen": -62.961204528808594, + "logps/rejected": -99.20459747314453, + "loss": 0.5415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923128604888916, + "rewards/margins": 7.965888977050781, + "rewards/rejected": -5.042759895324707, + "step": 16917 + }, + { + "epoch": 4.23, + "grad_norm": 2.873386859893799, + "learning_rate": 5.699413985319857e-07, + "logits/chosen": -0.5289916396141052, + "logits/rejected": -0.6616146564483643, + "logps/chosen": -46.32652282714844, + "logps/rejected": -107.9681167602539, + "loss": 0.5533, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3152108192443848, + "rewards/margins": 9.072173118591309, + "rewards/rejected": -5.756962299346924, + "step": 16918 + }, + { + "epoch": 4.23, + "grad_norm": 7.020948886871338, + "learning_rate": 5.695770199993406e-07, + "logits/chosen": -0.49634629487991333, + "logits/rejected": -0.5640095472335815, + "logps/chosen": -59.76974105834961, + "logps/rejected": -104.2474365234375, + "loss": 0.6418, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1874470710754395, + "rewards/margins": 5.370355606079102, + "rewards/rejected": -2.182908773422241, + "step": 16919 + }, + { + "epoch": 4.23, + "grad_norm": 5.754435062408447, + "learning_rate": 5.692127509471595e-07, + "logits/chosen": -0.45522594451904297, + "logits/rejected": -0.6006001234054565, + "logps/chosen": -69.65596771240234, + "logps/rejected": -120.37443542480469, + "loss": 0.6454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.926348924636841, + "rewards/margins": 6.6835832595825195, + "rewards/rejected": -3.7572338581085205, + "step": 16920 + }, + { + "epoch": 4.23, + "grad_norm": 4.804325580596924, + "learning_rate": 5.688485913844433e-07, + "logits/chosen": -0.5546371936798096, + "logits/rejected": -0.5499317646026611, + "logps/chosen": -52.66197967529297, + "logps/rejected": -138.6640167236328, + "loss": 0.673, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.017117500305176, + "rewards/margins": 8.233269691467285, + "rewards/rejected": -5.216151714324951, + "step": 16921 + }, + { + "epoch": 4.23, + "grad_norm": 5.897287368774414, + "learning_rate": 5.684845413201923e-07, + "logits/chosen": -0.4622567594051361, + "logits/rejected": -0.5860997438430786, + "logps/chosen": -79.52640533447266, + "logps/rejected": -110.22454833984375, + "loss": 0.7095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6848323345184326, + "rewards/margins": 7.261327743530273, + "rewards/rejected": -4.576495170593262, + "step": 16922 + }, + { + "epoch": 4.23, + "grad_norm": 6.995874881744385, + "learning_rate": 5.681206007634016e-07, + "logits/chosen": -0.4705398380756378, + "logits/rejected": -0.5474269390106201, + "logps/chosen": -48.866607666015625, + "logps/rejected": -100.15377807617188, + "loss": 0.6266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1444640159606934, + "rewards/margins": 6.733277320861816, + "rewards/rejected": -3.588813543319702, + "step": 16923 + }, + { + "epoch": 4.23, + "grad_norm": 5.040289402008057, + "learning_rate": 5.677567697230646e-07, + "logits/chosen": -0.5140053629875183, + "logits/rejected": -0.606737494468689, + "logps/chosen": -60.88223648071289, + "logps/rejected": -108.43973541259766, + "loss": 0.6166, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.94134783744812, + "rewards/margins": 6.887284755706787, + "rewards/rejected": -3.9459376335144043, + "step": 16924 + }, + { + "epoch": 4.23, + "grad_norm": 6.488990306854248, + "learning_rate": 5.673930482081725e-07, + "logits/chosen": -0.5635372400283813, + "logits/rejected": -0.5478364825248718, + "logps/chosen": -54.30755615234375, + "logps/rejected": -134.69020080566406, + "loss": 0.67, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1310410499572754, + "rewards/margins": 7.369034290313721, + "rewards/rejected": -4.237993240356445, + "step": 16925 + }, + { + "epoch": 4.23, + "grad_norm": 6.61823844909668, + "learning_rate": 5.670294362277123e-07, + "logits/chosen": -0.5049984455108643, + "logits/rejected": -0.6038135290145874, + "logps/chosen": -49.3795280456543, + "logps/rejected": -103.94923400878906, + "loss": 0.6556, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0651068687438965, + "rewards/margins": 7.810825824737549, + "rewards/rejected": -4.745718955993652, + "step": 16926 + }, + { + "epoch": 4.23, + "grad_norm": 3.9167938232421875, + "learning_rate": 5.666659337906705e-07, + "logits/chosen": -0.5462263822555542, + "logits/rejected": -0.6351649761199951, + "logps/chosen": -55.87338638305664, + "logps/rejected": -98.07829284667969, + "loss": 0.7093, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0469117164611816, + "rewards/margins": 7.157297611236572, + "rewards/rejected": -4.110384941101074, + "step": 16927 + }, + { + "epoch": 4.23, + "grad_norm": 2.922295570373535, + "learning_rate": 5.663025409060291e-07, + "logits/chosen": -0.5357062816619873, + "logits/rejected": -0.6456074714660645, + "logps/chosen": -58.30510330200195, + "logps/rejected": -105.11392974853516, + "loss": 0.5933, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4028799533843994, + "rewards/margins": 7.999270439147949, + "rewards/rejected": -4.596390247344971, + "step": 16928 + }, + { + "epoch": 4.23, + "grad_norm": 5.641586780548096, + "learning_rate": 5.659392575827671e-07, + "logits/chosen": -0.5846786499023438, + "logits/rejected": -0.6361466646194458, + "logps/chosen": -45.746124267578125, + "logps/rejected": -115.80219268798828, + "loss": 0.6243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2517192363739014, + "rewards/margins": 7.657263278961182, + "rewards/rejected": -4.405544281005859, + "step": 16929 + }, + { + "epoch": 4.24, + "grad_norm": 2.5534162521362305, + "learning_rate": 5.655760838298619e-07, + "logits/chosen": -0.5894826650619507, + "logits/rejected": -0.7077093720436096, + "logps/chosen": -65.36031341552734, + "logps/rejected": -94.60704803466797, + "loss": 0.5944, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.238321304321289, + "rewards/margins": 7.803768157958984, + "rewards/rejected": -4.565446376800537, + "step": 16930 + }, + { + "epoch": 4.24, + "grad_norm": 3.8878698348999023, + "learning_rate": 5.652130196562905e-07, + "logits/chosen": -0.5555152893066406, + "logits/rejected": -0.6250336170196533, + "logps/chosen": -53.58445358276367, + "logps/rejected": -116.83366394042969, + "loss": 0.6268, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3102376461029053, + "rewards/margins": 8.151894569396973, + "rewards/rejected": -4.841657638549805, + "step": 16931 + }, + { + "epoch": 4.24, + "grad_norm": 11.362235069274902, + "learning_rate": 5.648500650710203e-07, + "logits/chosen": -0.5674515962600708, + "logits/rejected": -0.6373782753944397, + "logps/chosen": -58.05990219116211, + "logps/rejected": -105.96569061279297, + "loss": 0.6153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1473944187164307, + "rewards/margins": 7.100462913513184, + "rewards/rejected": -3.9530680179595947, + "step": 16932 + }, + { + "epoch": 4.24, + "grad_norm": 6.393433570861816, + "learning_rate": 5.644872200830226e-07, + "logits/chosen": -0.580425500869751, + "logits/rejected": -0.6804034113883972, + "logps/chosen": -64.7370834350586, + "logps/rejected": -109.52806091308594, + "loss": 0.6039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1978812217712402, + "rewards/margins": 6.361275672912598, + "rewards/rejected": -3.1633946895599365, + "step": 16933 + }, + { + "epoch": 4.24, + "grad_norm": 5.837739944458008, + "learning_rate": 5.641244847012656e-07, + "logits/chosen": -0.5277503728866577, + "logits/rejected": -0.5859661102294922, + "logps/chosen": -50.78108215332031, + "logps/rejected": -107.70619201660156, + "loss": 0.6165, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2989096641540527, + "rewards/margins": 6.869904041290283, + "rewards/rejected": -3.5709943771362305, + "step": 16934 + }, + { + "epoch": 4.24, + "grad_norm": 7.0164265632629395, + "learning_rate": 5.637618589347088e-07, + "logits/chosen": -0.6128546595573425, + "logits/rejected": -0.6555459499359131, + "logps/chosen": -45.64502716064453, + "logps/rejected": -94.25259399414062, + "loss": 0.6121, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9058566093444824, + "rewards/margins": 6.2545599937438965, + "rewards/rejected": -3.348703145980835, + "step": 16935 + }, + { + "epoch": 4.24, + "grad_norm": 10.021832466125488, + "learning_rate": 5.633993427923151e-07, + "logits/chosen": -0.5542822480201721, + "logits/rejected": -0.6355461478233337, + "logps/chosen": -59.474945068359375, + "logps/rejected": -114.8830795288086, + "loss": 0.8283, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9190454483032227, + "rewards/margins": 7.715821743011475, + "rewards/rejected": -4.7967753410339355, + "step": 16936 + }, + { + "epoch": 4.24, + "grad_norm": 10.899397850036621, + "learning_rate": 5.630369362830435e-07, + "logits/chosen": -0.5136671662330627, + "logits/rejected": -0.6295892596244812, + "logps/chosen": -67.27326965332031, + "logps/rejected": -112.31339263916016, + "loss": 0.6716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.837970733642578, + "rewards/margins": 7.026241779327393, + "rewards/rejected": -4.1882710456848145, + "step": 16937 + }, + { + "epoch": 4.24, + "grad_norm": 5.416808128356934, + "learning_rate": 5.626746394158483e-07, + "logits/chosen": -0.580560564994812, + "logits/rejected": -0.6624599099159241, + "logps/chosen": -52.36180114746094, + "logps/rejected": -104.96647644042969, + "loss": 0.6648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.213632106781006, + "rewards/margins": 6.299839019775391, + "rewards/rejected": -3.0862069129943848, + "step": 16938 + }, + { + "epoch": 4.24, + "grad_norm": 2.32222843170166, + "learning_rate": 5.62312452199682e-07, + "logits/chosen": -0.5919654965400696, + "logits/rejected": -0.6866981983184814, + "logps/chosen": -59.31324005126953, + "logps/rejected": -110.45069885253906, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.349465847015381, + "rewards/margins": 8.478486061096191, + "rewards/rejected": -5.129019260406494, + "step": 16939 + }, + { + "epoch": 4.24, + "grad_norm": 4.527287483215332, + "learning_rate": 5.619503746434956e-07, + "logits/chosen": -0.506775438785553, + "logits/rejected": -0.6265145540237427, + "logps/chosen": -64.7779541015625, + "logps/rejected": -115.72744750976562, + "loss": 0.5652, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158132791519165, + "rewards/margins": 8.427777290344238, + "rewards/rejected": -5.269644737243652, + "step": 16940 + }, + { + "epoch": 4.24, + "grad_norm": 4.845571041107178, + "learning_rate": 5.615884067562367e-07, + "logits/chosen": -0.5127763152122498, + "logits/rejected": -0.5737508535385132, + "logps/chosen": -57.69050598144531, + "logps/rejected": -112.11660766601562, + "loss": 0.5678, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.154757022857666, + "rewards/margins": 6.798962116241455, + "rewards/rejected": -3.6442055702209473, + "step": 16941 + }, + { + "epoch": 4.24, + "grad_norm": 5.856681823730469, + "learning_rate": 5.612265485468476e-07, + "logits/chosen": -0.5916934013366699, + "logits/rejected": -0.6626484990119934, + "logps/chosen": -55.31623840332031, + "logps/rejected": -115.0033187866211, + "loss": 0.6317, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9686191082000732, + "rewards/margins": 7.136880874633789, + "rewards/rejected": -4.168261528015137, + "step": 16942 + }, + { + "epoch": 4.24, + "grad_norm": 8.413586616516113, + "learning_rate": 5.608648000242728e-07, + "logits/chosen": -0.5087692141532898, + "logits/rejected": -0.48354852199554443, + "logps/chosen": -54.04450225830078, + "logps/rejected": -108.83277893066406, + "loss": 0.6721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4023327827453613, + "rewards/margins": 6.339784145355225, + "rewards/rejected": -2.9374513626098633, + "step": 16943 + }, + { + "epoch": 4.24, + "grad_norm": 3.7481517791748047, + "learning_rate": 5.605031611974493e-07, + "logits/chosen": -0.5158255696296692, + "logits/rejected": -0.6679866313934326, + "logps/chosen": -64.9814453125, + "logps/rejected": -94.24226379394531, + "loss": 0.5882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4082608222961426, + "rewards/margins": 6.622002124786377, + "rewards/rejected": -3.2137415409088135, + "step": 16944 + }, + { + "epoch": 4.24, + "grad_norm": 6.431117057800293, + "learning_rate": 5.601416320753161e-07, + "logits/chosen": -0.6004552841186523, + "logits/rejected": -0.6734566688537598, + "logps/chosen": -59.03185272216797, + "logps/rejected": -123.53215789794922, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.359799861907959, + "rewards/margins": 9.252095222473145, + "rewards/rejected": -5.8922953605651855, + "step": 16945 + }, + { + "epoch": 4.24, + "grad_norm": 3.124619722366333, + "learning_rate": 5.597802126668056e-07, + "logits/chosen": -0.5606136918067932, + "logits/rejected": -0.6470800638198853, + "logps/chosen": -51.419288635253906, + "logps/rejected": -96.66030883789062, + "loss": 0.5777, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.027848720550537, + "rewards/margins": 6.677346229553223, + "rewards/rejected": -3.6494972705841064, + "step": 16946 + }, + { + "epoch": 4.24, + "grad_norm": 4.265054225921631, + "learning_rate": 5.594189029808478e-07, + "logits/chosen": -0.5453670620918274, + "logits/rejected": -0.5717746019363403, + "logps/chosen": -39.1771240234375, + "logps/rejected": -114.79734802246094, + "loss": 0.6999, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2132487297058105, + "rewards/margins": 7.3214545249938965, + "rewards/rejected": -4.108206748962402, + "step": 16947 + }, + { + "epoch": 4.24, + "grad_norm": 5.012471675872803, + "learning_rate": 5.590577030263733e-07, + "logits/chosen": -0.6208509802818298, + "logits/rejected": -0.7065115571022034, + "logps/chosen": -48.65919494628906, + "logps/rejected": -108.96659851074219, + "loss": 0.6197, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3985719680786133, + "rewards/margins": 8.157622337341309, + "rewards/rejected": -4.759049892425537, + "step": 16948 + }, + { + "epoch": 4.24, + "grad_norm": 10.522173881530762, + "learning_rate": 5.586966128123051e-07, + "logits/chosen": -0.5684155821800232, + "logits/rejected": -0.6504191756248474, + "logps/chosen": -53.994407653808594, + "logps/rejected": -89.86964416503906, + "loss": 0.5137, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0111162662506104, + "rewards/margins": 7.745981693267822, + "rewards/rejected": -4.734866142272949, + "step": 16949 + }, + { + "epoch": 4.24, + "grad_norm": 4.895866394042969, + "learning_rate": 5.583356323475692e-07, + "logits/chosen": -0.45689377188682556, + "logits/rejected": -0.5793710947036743, + "logps/chosen": -67.67157745361328, + "logps/rejected": -104.7842788696289, + "loss": 0.655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1969852447509766, + "rewards/margins": 6.519896984100342, + "rewards/rejected": -3.3229117393493652, + "step": 16950 + }, + { + "epoch": 4.24, + "grad_norm": 1.9336872100830078, + "learning_rate": 5.579747616410824e-07, + "logits/chosen": -0.5502167344093323, + "logits/rejected": -0.6513402462005615, + "logps/chosen": -49.78357696533203, + "logps/rejected": -102.65484619140625, + "loss": 0.5618, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9860148429870605, + "rewards/margins": 7.671863079071045, + "rewards/rejected": -4.685847759246826, + "step": 16951 + }, + { + "epoch": 4.24, + "grad_norm": 9.81701946258545, + "learning_rate": 5.576140007017656e-07, + "logits/chosen": -0.49399858713150024, + "logits/rejected": -0.5566737651824951, + "logps/chosen": -56.64482116699219, + "logps/rejected": -97.57535552978516, + "loss": 0.6392, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9943795204162598, + "rewards/margins": 6.444045543670654, + "rewards/rejected": -3.4496660232543945, + "step": 16952 + }, + { + "epoch": 4.24, + "grad_norm": 4.267953872680664, + "learning_rate": 5.572533495385318e-07, + "logits/chosen": -0.4469963610172272, + "logits/rejected": -0.5377793312072754, + "logps/chosen": -64.58003997802734, + "logps/rejected": -99.10389709472656, + "loss": 0.6574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.428410768508911, + "rewards/margins": 7.024853706359863, + "rewards/rejected": -3.596442699432373, + "step": 16953 + }, + { + "epoch": 4.24, + "grad_norm": 4.30832576751709, + "learning_rate": 5.568928081602926e-07, + "logits/chosen": -0.5534114837646484, + "logits/rejected": -0.6023707985877991, + "logps/chosen": -56.637447357177734, + "logps/rejected": -101.40653228759766, + "loss": 0.6973, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2864465713500977, + "rewards/margins": 6.637968063354492, + "rewards/rejected": -3.3515219688415527, + "step": 16954 + }, + { + "epoch": 4.24, + "grad_norm": 9.828912734985352, + "learning_rate": 5.565323765759584e-07, + "logits/chosen": -0.5658140778541565, + "logits/rejected": -0.6953732371330261, + "logps/chosen": -52.871055603027344, + "logps/rejected": -98.43783569335938, + "loss": 0.6679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0395026206970215, + "rewards/margins": 7.587979316711426, + "rewards/rejected": -4.5484771728515625, + "step": 16955 + }, + { + "epoch": 4.24, + "grad_norm": 4.275726795196533, + "learning_rate": 5.56172054794436e-07, + "logits/chosen": -0.6060112118721008, + "logits/rejected": -0.6420817971229553, + "logps/chosen": -57.07255554199219, + "logps/rejected": -113.33565521240234, + "loss": 0.5663, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.223630905151367, + "rewards/margins": 6.939848899841309, + "rewards/rejected": -3.7162187099456787, + "step": 16956 + }, + { + "epoch": 4.24, + "grad_norm": 3.8180859088897705, + "learning_rate": 5.558118428246273e-07, + "logits/chosen": -0.5145859718322754, + "logits/rejected": -0.6211228966712952, + "logps/chosen": -59.30293273925781, + "logps/rejected": -106.93001556396484, + "loss": 0.5321, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.017648458480835, + "rewards/margins": 7.586731433868408, + "rewards/rejected": -4.5690836906433105, + "step": 16957 + }, + { + "epoch": 4.24, + "grad_norm": 6.181593894958496, + "learning_rate": 5.554517406754362e-07, + "logits/chosen": -0.5477280020713806, + "logits/rejected": -0.6150225400924683, + "logps/chosen": -64.0141372680664, + "logps/rejected": -117.29264068603516, + "loss": 0.7142, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9934349060058594, + "rewards/margins": 7.149155139923096, + "rewards/rejected": -4.155719757080078, + "step": 16958 + }, + { + "epoch": 4.24, + "grad_norm": 5.181203365325928, + "learning_rate": 5.5509174835576e-07, + "logits/chosen": -0.4843920171260834, + "logits/rejected": -0.5589462518692017, + "logps/chosen": -65.1540298461914, + "logps/rejected": -95.25701904296875, + "loss": 0.6997, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.275089979171753, + "rewards/margins": 6.5458245277404785, + "rewards/rejected": -3.2707345485687256, + "step": 16959 + }, + { + "epoch": 4.24, + "grad_norm": 3.313896417617798, + "learning_rate": 5.547318658744938e-07, + "logits/chosen": -0.5520480871200562, + "logits/rejected": -0.6349221467971802, + "logps/chosen": -42.93077087402344, + "logps/rejected": -106.47968292236328, + "loss": 0.5378, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172678232192993, + "rewards/margins": 7.627799034118652, + "rewards/rejected": -4.455120086669922, + "step": 16960 + }, + { + "epoch": 4.24, + "grad_norm": 5.3859992027282715, + "learning_rate": 5.543720932405311e-07, + "logits/chosen": -0.5564409494400024, + "logits/rejected": -0.6642265319824219, + "logps/chosen": -56.024742126464844, + "logps/rejected": -109.86849975585938, + "loss": 0.6592, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.895871639251709, + "rewards/margins": 6.979508399963379, + "rewards/rejected": -4.083637237548828, + "step": 16961 + }, + { + "epoch": 4.24, + "grad_norm": 5.966983795166016, + "learning_rate": 5.540124304627636e-07, + "logits/chosen": -0.605445384979248, + "logits/rejected": -0.6674102544784546, + "logps/chosen": -52.68160629272461, + "logps/rejected": -126.04987335205078, + "loss": 0.6515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0844902992248535, + "rewards/margins": 7.547957897186279, + "rewards/rejected": -4.463468074798584, + "step": 16962 + }, + { + "epoch": 4.24, + "grad_norm": 4.487372398376465, + "learning_rate": 5.536528775500776e-07, + "logits/chosen": -0.5809441804885864, + "logits/rejected": -0.7015885710716248, + "logps/chosen": -61.057376861572266, + "logps/rejected": -102.35516357421875, + "loss": 0.5883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.144152879714966, + "rewards/margins": 7.095928192138672, + "rewards/rejected": -3.951775312423706, + "step": 16963 + }, + { + "epoch": 4.24, + "grad_norm": 4.9259514808654785, + "learning_rate": 5.532934345113578e-07, + "logits/chosen": -0.5484668016433716, + "logits/rejected": -0.6285901069641113, + "logps/chosen": -54.20595169067383, + "logps/rejected": -86.95124816894531, + "loss": 0.5969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4021475315093994, + "rewards/margins": 6.129196643829346, + "rewards/rejected": -2.727048873901367, + "step": 16964 + }, + { + "epoch": 4.24, + "grad_norm": 5.792520523071289, + "learning_rate": 5.529341013554878e-07, + "logits/chosen": -0.4944276213645935, + "logits/rejected": -0.587212860584259, + "logps/chosen": -58.424888610839844, + "logps/rejected": -95.61265563964844, + "loss": 0.6542, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.965059995651245, + "rewards/margins": 6.651875972747803, + "rewards/rejected": -3.686816453933716, + "step": 16965 + }, + { + "epoch": 4.24, + "grad_norm": 7.104158878326416, + "learning_rate": 5.525748780913465e-07, + "logits/chosen": -0.5642212629318237, + "logits/rejected": -0.6652205586433411, + "logps/chosen": -50.48347473144531, + "logps/rejected": -88.60555267333984, + "loss": 0.5724, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.371480703353882, + "rewards/margins": 7.189604759216309, + "rewards/rejected": -3.8181240558624268, + "step": 16966 + }, + { + "epoch": 4.24, + "grad_norm": 5.3978495597839355, + "learning_rate": 5.522157647278092e-07, + "logits/chosen": -0.49189212918281555, + "logits/rejected": -0.5834258198738098, + "logps/chosen": -65.78211975097656, + "logps/rejected": -121.74281311035156, + "loss": 0.6599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0619473457336426, + "rewards/margins": 8.537525177001953, + "rewards/rejected": -5.475578784942627, + "step": 16967 + }, + { + "epoch": 4.24, + "grad_norm": 3.712110757827759, + "learning_rate": 5.518567612737524e-07, + "logits/chosen": -0.5342202186584473, + "logits/rejected": -0.6277849674224854, + "logps/chosen": -48.38013458251953, + "logps/rejected": -131.26217651367188, + "loss": 0.5979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4524807929992676, + "rewards/margins": 8.498614311218262, + "rewards/rejected": -5.0461344718933105, + "step": 16968 + }, + { + "epoch": 4.24, + "grad_norm": 3.601414203643799, + "learning_rate": 5.51497867738045e-07, + "logits/chosen": -0.5657558441162109, + "logits/rejected": -0.6141955852508545, + "logps/chosen": -57.841495513916016, + "logps/rejected": -114.3471450805664, + "loss": 0.6826, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.425941228866577, + "rewards/margins": 7.020386695861816, + "rewards/rejected": -3.59444522857666, + "step": 16969 + }, + { + "epoch": 4.25, + "grad_norm": 2.901216745376587, + "learning_rate": 5.511390841295583e-07, + "logits/chosen": -0.4917091131210327, + "logits/rejected": -0.5823188424110413, + "logps/chosen": -51.04798126220703, + "logps/rejected": -115.51506042480469, + "loss": 0.5271, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5630218982696533, + "rewards/margins": 8.169023513793945, + "rewards/rejected": -4.606001377105713, + "step": 16970 + }, + { + "epoch": 4.25, + "grad_norm": 5.864878177642822, + "learning_rate": 5.507804104571562e-07, + "logits/chosen": -0.47338753938674927, + "logits/rejected": -0.5581279397010803, + "logps/chosen": -58.3786506652832, + "logps/rejected": -105.60997009277344, + "loss": 0.629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9202470779418945, + "rewards/margins": 6.4595112800598145, + "rewards/rejected": -3.5392637252807617, + "step": 16971 + }, + { + "epoch": 4.25, + "grad_norm": 3.315924882888794, + "learning_rate": 5.504218467297024e-07, + "logits/chosen": -0.559177041053772, + "logits/rejected": -0.6361075043678284, + "logps/chosen": -53.02044677734375, + "logps/rejected": -105.83988952636719, + "loss": 0.5138, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385745048522949, + "rewards/margins": 7.601115703582764, + "rewards/rejected": -4.215371131896973, + "step": 16972 + }, + { + "epoch": 4.25, + "grad_norm": 4.891780376434326, + "learning_rate": 5.500633929560579e-07, + "logits/chosen": -0.5000597238540649, + "logits/rejected": -0.5386945009231567, + "logps/chosen": -51.200538635253906, + "logps/rejected": -127.8636703491211, + "loss": 0.5792, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.280414581298828, + "rewards/margins": 7.384233474731445, + "rewards/rejected": -4.103818893432617, + "step": 16973 + }, + { + "epoch": 4.25, + "grad_norm": 4.727351665496826, + "learning_rate": 5.497050491450807e-07, + "logits/chosen": -0.5768635869026184, + "logits/rejected": -0.6467975378036499, + "logps/chosen": -46.09911346435547, + "logps/rejected": -101.841796875, + "loss": 0.5588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0661542415618896, + "rewards/margins": 7.096427917480469, + "rewards/rejected": -4.030273914337158, + "step": 16974 + }, + { + "epoch": 4.25, + "grad_norm": 5.59345817565918, + "learning_rate": 5.493468153056236e-07, + "logits/chosen": -0.5180575847625732, + "logits/rejected": -0.6339973211288452, + "logps/chosen": -62.21234130859375, + "logps/rejected": -89.62078094482422, + "loss": 0.7823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2140989303588867, + "rewards/margins": 6.6999945640563965, + "rewards/rejected": -3.485895872116089, + "step": 16975 + }, + { + "epoch": 4.25, + "grad_norm": 3.4808318614959717, + "learning_rate": 5.489886914465409e-07, + "logits/chosen": -0.5641212463378906, + "logits/rejected": -0.6480647921562195, + "logps/chosen": -60.24763870239258, + "logps/rejected": -110.66445922851562, + "loss": 0.6962, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1155765056610107, + "rewards/margins": 7.324962139129639, + "rewards/rejected": -4.209385871887207, + "step": 16976 + }, + { + "epoch": 4.25, + "grad_norm": 8.46926212310791, + "learning_rate": 5.486306775766842e-07, + "logits/chosen": -0.5505364537239075, + "logits/rejected": -0.6056609749794006, + "logps/chosen": -52.18768310546875, + "logps/rejected": -121.23108673095703, + "loss": 0.7309, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.136185646057129, + "rewards/margins": 8.179559707641602, + "rewards/rejected": -5.043374538421631, + "step": 16977 + }, + { + "epoch": 4.25, + "grad_norm": 3.6753952503204346, + "learning_rate": 5.48272773704896e-07, + "logits/chosen": -0.6298588514328003, + "logits/rejected": -0.7098503708839417, + "logps/chosen": -64.39353942871094, + "logps/rejected": -100.99259948730469, + "loss": 0.6793, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.171487808227539, + "rewards/margins": 7.677583694458008, + "rewards/rejected": -4.506095886230469, + "step": 16978 + }, + { + "epoch": 4.25, + "grad_norm": 5.885716915130615, + "learning_rate": 5.479149798400224e-07, + "logits/chosen": -0.547169029712677, + "logits/rejected": -0.5861175656318665, + "logps/chosen": -64.46581268310547, + "logps/rejected": -121.99214172363281, + "loss": 0.7176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1350150108337402, + "rewards/margins": 6.629543781280518, + "rewards/rejected": -3.49452805519104, + "step": 16979 + }, + { + "epoch": 4.25, + "grad_norm": 4.336661338806152, + "learning_rate": 5.475572959909064e-07, + "logits/chosen": -0.5652725100517273, + "logits/rejected": -0.6517958045005798, + "logps/chosen": -68.00387573242188, + "logps/rejected": -122.64400482177734, + "loss": 0.5975, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.021932601928711, + "rewards/margins": 7.9030656814575195, + "rewards/rejected": -4.881133556365967, + "step": 16980 + }, + { + "epoch": 4.25, + "grad_norm": 7.5103440284729, + "learning_rate": 5.471997221663855e-07, + "logits/chosen": -0.565909743309021, + "logits/rejected": -0.6173791885375977, + "logps/chosen": -53.43092346191406, + "logps/rejected": -111.90965270996094, + "loss": 0.7222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.19077730178833, + "rewards/margins": 8.097689628601074, + "rewards/rejected": -4.906912803649902, + "step": 16981 + }, + { + "epoch": 4.25, + "grad_norm": 3.9160399436950684, + "learning_rate": 5.468422583752941e-07, + "logits/chosen": -0.5504082441329956, + "logits/rejected": -0.614599347114563, + "logps/chosen": -53.47738265991211, + "logps/rejected": -122.61721801757812, + "loss": 0.6201, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.001581907272339, + "rewards/margins": 8.070503234863281, + "rewards/rejected": -5.06892204284668, + "step": 16982 + }, + { + "epoch": 4.25, + "grad_norm": 4.266577243804932, + "learning_rate": 5.464849046264686e-07, + "logits/chosen": -0.5378782749176025, + "logits/rejected": -0.5632840394973755, + "logps/chosen": -58.20090866088867, + "logps/rejected": -119.34479522705078, + "loss": 0.6368, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.026899576187134, + "rewards/margins": 7.714556694030762, + "rewards/rejected": -4.687656402587891, + "step": 16983 + }, + { + "epoch": 4.25, + "grad_norm": 14.879754066467285, + "learning_rate": 5.461276609287381e-07, + "logits/chosen": -0.5183013677597046, + "logits/rejected": -0.6081992983818054, + "logps/chosen": -54.59233093261719, + "logps/rejected": -94.28678894042969, + "loss": 0.6022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1117467880249023, + "rewards/margins": 6.800998687744141, + "rewards/rejected": -3.6892521381378174, + "step": 16984 + }, + { + "epoch": 4.25, + "grad_norm": 4.867260932922363, + "learning_rate": 5.457705272909292e-07, + "logits/chosen": -0.5277405381202698, + "logits/rejected": -0.6295274496078491, + "logps/chosen": -51.73444747924805, + "logps/rejected": -111.7671890258789, + "loss": 0.528, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1732590198516846, + "rewards/margins": 7.457949161529541, + "rewards/rejected": -4.284689903259277, + "step": 16985 + }, + { + "epoch": 4.25, + "grad_norm": 2.6294634342193604, + "learning_rate": 5.454135037218694e-07, + "logits/chosen": -0.5856207609176636, + "logits/rejected": -0.6425837874412537, + "logps/chosen": -42.950645446777344, + "logps/rejected": -96.47330474853516, + "loss": 0.5968, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3662188053131104, + "rewards/margins": 6.80940055847168, + "rewards/rejected": -3.4431822299957275, + "step": 16986 + }, + { + "epoch": 4.25, + "grad_norm": 3.558814764022827, + "learning_rate": 5.450565902303789e-07, + "logits/chosen": -0.578096866607666, + "logits/rejected": -0.6780382394790649, + "logps/chosen": -54.59362030029297, + "logps/rejected": -101.31145477294922, + "loss": 0.611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3731765747070312, + "rewards/margins": 7.670924186706543, + "rewards/rejected": -4.297747611999512, + "step": 16987 + }, + { + "epoch": 4.25, + "grad_norm": 6.978712558746338, + "learning_rate": 5.446997868252796e-07, + "logits/chosen": -0.5580601096153259, + "logits/rejected": -0.5962355136871338, + "logps/chosen": -53.46787643432617, + "logps/rejected": -103.01689910888672, + "loss": 0.5628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.47743821144104, + "rewards/margins": 6.680495738983154, + "rewards/rejected": -3.2030575275421143, + "step": 16988 + }, + { + "epoch": 4.25, + "grad_norm": 3.4993081092834473, + "learning_rate": 5.443430935153876e-07, + "logits/chosen": -0.547226071357727, + "logits/rejected": -0.6349773406982422, + "logps/chosen": -59.63111114501953, + "logps/rejected": -113.43534851074219, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4882726669311523, + "rewards/margins": 6.986802101135254, + "rewards/rejected": -3.4985289573669434, + "step": 16989 + }, + { + "epoch": 4.25, + "grad_norm": 7.328575611114502, + "learning_rate": 5.43986510309516e-07, + "logits/chosen": -0.5421865582466125, + "logits/rejected": -0.5949035286903381, + "logps/chosen": -61.19936752319336, + "logps/rejected": -112.66252136230469, + "loss": 0.5805, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1394119262695312, + "rewards/margins": 7.290914058685303, + "rewards/rejected": -4.1515021324157715, + "step": 16990 + }, + { + "epoch": 4.25, + "grad_norm": 2.216275930404663, + "learning_rate": 5.436300372164782e-07, + "logits/chosen": -0.532904326915741, + "logits/rejected": -0.6095306873321533, + "logps/chosen": -53.739219665527344, + "logps/rejected": -114.02326965332031, + "loss": 0.5502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118626832962036, + "rewards/margins": 8.201616287231445, + "rewards/rejected": -5.082989692687988, + "step": 16991 + }, + { + "epoch": 4.25, + "grad_norm": 9.260557174682617, + "learning_rate": 5.432736742450817e-07, + "logits/chosen": -0.5196899175643921, + "logits/rejected": -0.5593632459640503, + "logps/chosen": -45.10022735595703, + "logps/rejected": -126.76473999023438, + "loss": 0.6376, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.308347463607788, + "rewards/margins": 6.9168853759765625, + "rewards/rejected": -3.6085379123687744, + "step": 16992 + }, + { + "epoch": 4.25, + "grad_norm": 3.0657923221588135, + "learning_rate": 5.429174214041333e-07, + "logits/chosen": -0.6154937148094177, + "logits/rejected": -0.7007268667221069, + "logps/chosen": -53.763065338134766, + "logps/rejected": -107.37014770507812, + "loss": 0.6077, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.313230276107788, + "rewards/margins": 7.569395542144775, + "rewards/rejected": -4.256165504455566, + "step": 16993 + }, + { + "epoch": 4.25, + "grad_norm": 14.734087944030762, + "learning_rate": 5.425612787024359e-07, + "logits/chosen": -0.5952389240264893, + "logits/rejected": -0.6966529488563538, + "logps/chosen": -51.9040641784668, + "logps/rejected": -102.0340805053711, + "loss": 0.7139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9840619564056396, + "rewards/margins": 7.297372817993164, + "rewards/rejected": -4.313310623168945, + "step": 16994 + }, + { + "epoch": 4.25, + "grad_norm": 4.1978230476379395, + "learning_rate": 5.422052461487909e-07, + "logits/chosen": -0.57366943359375, + "logits/rejected": -0.6709268093109131, + "logps/chosen": -53.25489807128906, + "logps/rejected": -88.67286682128906, + "loss": 0.5825, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2111685276031494, + "rewards/margins": 7.524805545806885, + "rewards/rejected": -4.313636302947998, + "step": 16995 + }, + { + "epoch": 4.25, + "grad_norm": 4.003293991088867, + "learning_rate": 5.418493237519962e-07, + "logits/chosen": -0.6197519302368164, + "logits/rejected": -0.6754754781723022, + "logps/chosen": -65.76526641845703, + "logps/rejected": -116.07306671142578, + "loss": 0.6151, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0082638263702393, + "rewards/margins": 7.923730850219727, + "rewards/rejected": -4.915466785430908, + "step": 16996 + }, + { + "epoch": 4.25, + "grad_norm": 8.211628913879395, + "learning_rate": 5.414935115208453e-07, + "logits/chosen": -0.5518056154251099, + "logits/rejected": -0.5749163031578064, + "logps/chosen": -40.998905181884766, + "logps/rejected": -109.89208221435547, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0436794757843018, + "rewards/margins": 7.015382289886475, + "rewards/rejected": -3.97170352935791, + "step": 16997 + }, + { + "epoch": 4.25, + "grad_norm": 6.785293102264404, + "learning_rate": 5.411378094641329e-07, + "logits/chosen": -0.5876728296279907, + "logits/rejected": -0.6814090609550476, + "logps/chosen": -71.1833724975586, + "logps/rejected": -96.49232482910156, + "loss": 0.7598, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0045418739318848, + "rewards/margins": 6.052223205566406, + "rewards/rejected": -3.0476810932159424, + "step": 16998 + }, + { + "epoch": 4.25, + "grad_norm": 6.026978492736816, + "learning_rate": 5.40782217590648e-07, + "logits/chosen": -0.6082005500793457, + "logits/rejected": -0.6432743072509766, + "logps/chosen": -46.965858459472656, + "logps/rejected": -105.71261596679688, + "loss": 0.6473, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.08738374710083, + "rewards/margins": 7.004211902618408, + "rewards/rejected": -3.91682767868042, + "step": 16999 + }, + { + "epoch": 4.25, + "grad_norm": 13.294208526611328, + "learning_rate": 5.404267359091769e-07, + "logits/chosen": -0.5485202074050903, + "logits/rejected": -0.6424843668937683, + "logps/chosen": -87.90382385253906, + "logps/rejected": -86.16024780273438, + "loss": 0.8448, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.863504648208618, + "rewards/margins": 6.113337993621826, + "rewards/rejected": -3.249833345413208, + "step": 17000 + }, + { + "epoch": 4.25, + "grad_norm": 2.7703614234924316, + "learning_rate": 5.40071364428505e-07, + "logits/chosen": -0.5863306522369385, + "logits/rejected": -0.6485222578048706, + "logps/chosen": -62.974159240722656, + "logps/rejected": -106.386474609375, + "loss": 0.6331, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0245072841644287, + "rewards/margins": 7.376955032348633, + "rewards/rejected": -4.352447032928467, + "step": 17001 + }, + { + "epoch": 4.25, + "grad_norm": 8.450738906860352, + "learning_rate": 5.397161031574138e-07, + "logits/chosen": -0.5407359004020691, + "logits/rejected": -0.6822800636291504, + "logps/chosen": -51.1969108581543, + "logps/rejected": -96.3436050415039, + "loss": 0.5447, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1393141746520996, + "rewards/margins": 8.23511028289795, + "rewards/rejected": -5.09579610824585, + "step": 17002 + }, + { + "epoch": 4.25, + "grad_norm": 3.8285553455352783, + "learning_rate": 5.393609521046811e-07, + "logits/chosen": -0.5748528838157654, + "logits/rejected": -0.6371404528617859, + "logps/chosen": -45.238224029541016, + "logps/rejected": -131.59251403808594, + "loss": 0.5316, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.057107925415039, + "rewards/margins": 8.213521003723145, + "rewards/rejected": -5.156412601470947, + "step": 17003 + }, + { + "epoch": 4.25, + "grad_norm": 5.680673599243164, + "learning_rate": 5.390059112790835e-07, + "logits/chosen": -0.517978310585022, + "logits/rejected": -0.6512521505355835, + "logps/chosen": -57.01732635498047, + "logps/rejected": -105.46002960205078, + "loss": 0.6555, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0052459239959717, + "rewards/margins": 8.180689811706543, + "rewards/rejected": -5.175443649291992, + "step": 17004 + }, + { + "epoch": 4.25, + "grad_norm": 5.825105667114258, + "learning_rate": 5.386509806893969e-07, + "logits/chosen": -0.6152264475822449, + "logits/rejected": -0.6423822641372681, + "logps/chosen": -45.00442123413086, + "logps/rejected": -114.18414306640625, + "loss": 0.5737, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1055469512939453, + "rewards/margins": 7.030168056488037, + "rewards/rejected": -3.924621105194092, + "step": 17005 + }, + { + "epoch": 4.25, + "grad_norm": 5.202925682067871, + "learning_rate": 5.382961603443875e-07, + "logits/chosen": -0.6322859525680542, + "logits/rejected": -0.6968841552734375, + "logps/chosen": -50.43903350830078, + "logps/rejected": -102.64305114746094, + "loss": 0.6263, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.653215169906616, + "rewards/margins": 6.891953945159912, + "rewards/rejected": -4.238739013671875, + "step": 17006 + }, + { + "epoch": 4.25, + "grad_norm": 6.075902938842773, + "learning_rate": 5.379414502528263e-07, + "logits/chosen": -0.5031023621559143, + "logits/rejected": -0.6376277804374695, + "logps/chosen": -63.29291534423828, + "logps/rejected": -101.19583129882812, + "loss": 0.6311, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3422694206237793, + "rewards/margins": 7.6160430908203125, + "rewards/rejected": -4.273774147033691, + "step": 17007 + }, + { + "epoch": 4.25, + "grad_norm": 4.819268226623535, + "learning_rate": 5.375868504234782e-07, + "logits/chosen": -0.5808674097061157, + "logits/rejected": -0.6210929155349731, + "logps/chosen": -56.836769104003906, + "logps/rejected": -129.38803100585938, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1480586528778076, + "rewards/margins": 8.594711303710938, + "rewards/rejected": -5.446652889251709, + "step": 17008 + }, + { + "epoch": 4.25, + "grad_norm": 4.4161834716796875, + "learning_rate": 5.372323608651059e-07, + "logits/chosen": -0.5761794447898865, + "logits/rejected": -0.6695875525474548, + "logps/chosen": -59.8602294921875, + "logps/rejected": -122.94610595703125, + "loss": 0.5953, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.322059154510498, + "rewards/margins": 9.1713285446167, + "rewards/rejected": -5.849270820617676, + "step": 17009 + }, + { + "epoch": 4.26, + "grad_norm": 2.777559280395508, + "learning_rate": 5.368779815864678e-07, + "logits/chosen": -0.6026684641838074, + "logits/rejected": -0.7095577716827393, + "logps/chosen": -72.85980987548828, + "logps/rejected": -104.942138671875, + "loss": 0.6546, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.804161548614502, + "rewards/margins": 8.074359893798828, + "rewards/rejected": -5.270198345184326, + "step": 17010 + }, + { + "epoch": 4.26, + "grad_norm": 6.308774948120117, + "learning_rate": 5.36523712596323e-07, + "logits/chosen": -0.5506983399391174, + "logits/rejected": -0.6311328411102295, + "logps/chosen": -44.667694091796875, + "logps/rejected": -99.67332458496094, + "loss": 0.5552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.330958843231201, + "rewards/margins": 7.159348964691162, + "rewards/rejected": -3.82839035987854, + "step": 17011 + }, + { + "epoch": 4.26, + "grad_norm": 5.709482669830322, + "learning_rate": 5.361695539034234e-07, + "logits/chosen": -0.5232610702514648, + "logits/rejected": -0.5757787227630615, + "logps/chosen": -57.48143005371094, + "logps/rejected": -122.39159393310547, + "loss": 0.6702, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2349047660827637, + "rewards/margins": 7.047364711761475, + "rewards/rejected": -3.8124594688415527, + "step": 17012 + }, + { + "epoch": 4.26, + "grad_norm": 2.9773919582366943, + "learning_rate": 5.358155055165232e-07, + "logits/chosen": -0.5337703824043274, + "logits/rejected": -0.6435263156890869, + "logps/chosen": -58.291259765625, + "logps/rejected": -113.07267761230469, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.221506357192993, + "rewards/margins": 8.721321105957031, + "rewards/rejected": -5.499814987182617, + "step": 17013 + }, + { + "epoch": 4.26, + "grad_norm": 22.471710205078125, + "learning_rate": 5.354615674443702e-07, + "logits/chosen": -0.5260499119758606, + "logits/rejected": -0.5882689952850342, + "logps/chosen": -55.804752349853516, + "logps/rejected": -100.45092010498047, + "loss": 0.59, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.250819683074951, + "rewards/margins": 6.394463062286377, + "rewards/rejected": -3.143644094467163, + "step": 17014 + }, + { + "epoch": 4.26, + "grad_norm": 6.7087626457214355, + "learning_rate": 5.3510773969571e-07, + "logits/chosen": -0.5590167045593262, + "logits/rejected": -0.5827902555465698, + "logps/chosen": -59.93841552734375, + "logps/rejected": -109.20203399658203, + "loss": 0.6556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874616861343384, + "rewards/margins": 6.084810256958008, + "rewards/rejected": -3.2101926803588867, + "step": 17015 + }, + { + "epoch": 4.26, + "grad_norm": 3.567866563796997, + "learning_rate": 5.347540222792874e-07, + "logits/chosen": -0.5697183609008789, + "logits/rejected": -0.6532964706420898, + "logps/chosen": -52.58515930175781, + "logps/rejected": -113.83673095703125, + "loss": 0.5616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1440601348876953, + "rewards/margins": 7.957367897033691, + "rewards/rejected": -4.813308238983154, + "step": 17016 + }, + { + "epoch": 4.26, + "grad_norm": 3.7557260990142822, + "learning_rate": 5.344004152038423e-07, + "logits/chosen": -0.5217546224594116, + "logits/rejected": -0.5729310512542725, + "logps/chosen": -51.47998046875, + "logps/rejected": -128.5445556640625, + "loss": 0.5491, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.296781539916992, + "rewards/margins": 8.039896011352539, + "rewards/rejected": -4.743114471435547, + "step": 17017 + }, + { + "epoch": 4.26, + "grad_norm": 3.170398712158203, + "learning_rate": 5.340469184781116e-07, + "logits/chosen": -0.49012911319732666, + "logits/rejected": -0.5468198657035828, + "logps/chosen": -46.993377685546875, + "logps/rejected": -138.74810791015625, + "loss": 0.5475, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.370840549468994, + "rewards/margins": 8.95986270904541, + "rewards/rejected": -5.589022636413574, + "step": 17018 + }, + { + "epoch": 4.26, + "grad_norm": 3.5360124111175537, + "learning_rate": 5.336935321108322e-07, + "logits/chosen": -0.5209780335426331, + "logits/rejected": -0.6240786910057068, + "logps/chosen": -53.87508773803711, + "logps/rejected": -102.26708984375, + "loss": 0.6231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9355385303497314, + "rewards/margins": 7.669515132904053, + "rewards/rejected": -4.733975410461426, + "step": 17019 + }, + { + "epoch": 4.26, + "grad_norm": 4.160617828369141, + "learning_rate": 5.333402561107376e-07, + "logits/chosen": -0.5725985169410706, + "logits/rejected": -0.6277175545692444, + "logps/chosen": -44.2371711730957, + "logps/rejected": -102.1865463256836, + "loss": 0.6417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.178781747817993, + "rewards/margins": 6.578346252441406, + "rewards/rejected": -3.399564266204834, + "step": 17020 + }, + { + "epoch": 4.26, + "grad_norm": 2.797750473022461, + "learning_rate": 5.329870904865548e-07, + "logits/chosen": -0.49193185567855835, + "logits/rejected": -0.6207464933395386, + "logps/chosen": -62.9979133605957, + "logps/rejected": -123.27546691894531, + "loss": 0.5486, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3410701751708984, + "rewards/margins": 9.945755958557129, + "rewards/rejected": -6.6046857833862305, + "step": 17021 + }, + { + "epoch": 4.26, + "grad_norm": 1.749578595161438, + "learning_rate": 5.326340352470116e-07, + "logits/chosen": -0.5605223178863525, + "logits/rejected": -0.6499946117401123, + "logps/chosen": -55.3555908203125, + "logps/rejected": -127.27848815917969, + "loss": 0.5438, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.066330671310425, + "rewards/margins": 9.332886695861816, + "rewards/rejected": -6.266556262969971, + "step": 17022 + }, + { + "epoch": 4.26, + "grad_norm": 5.552408695220947, + "learning_rate": 5.322810904008346e-07, + "logits/chosen": -0.543728768825531, + "logits/rejected": -0.5895246267318726, + "logps/chosen": -57.23045349121094, + "logps/rejected": -99.80767822265625, + "loss": 0.5845, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1846840381622314, + "rewards/margins": 6.081714630126953, + "rewards/rejected": -2.897030830383301, + "step": 17023 + }, + { + "epoch": 4.26, + "grad_norm": 2.3092501163482666, + "learning_rate": 5.31928255956744e-07, + "logits/chosen": -0.5495686531066895, + "logits/rejected": -0.6649632453918457, + "logps/chosen": -58.014713287353516, + "logps/rejected": -100.73377990722656, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118267059326172, + "rewards/margins": 8.254853248596191, + "rewards/rejected": -5.1365861892700195, + "step": 17024 + }, + { + "epoch": 4.26, + "grad_norm": 4.540933609008789, + "learning_rate": 5.315755319234572e-07, + "logits/chosen": -0.47784677147865295, + "logits/rejected": -0.5725381970405579, + "logps/chosen": -51.853431701660156, + "logps/rejected": -112.08527374267578, + "loss": 0.5228, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3631792068481445, + "rewards/margins": 7.550238609313965, + "rewards/rejected": -4.1870598793029785, + "step": 17025 + }, + { + "epoch": 4.26, + "grad_norm": 4.478732585906982, + "learning_rate": 5.312229183096934e-07, + "logits/chosen": -0.5660022497177124, + "logits/rejected": -0.6339855790138245, + "logps/chosen": -43.97100830078125, + "logps/rejected": -101.88025665283203, + "loss": 0.5502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.011338472366333, + "rewards/margins": 7.253084182739258, + "rewards/rejected": -4.241746425628662, + "step": 17026 + }, + { + "epoch": 4.26, + "grad_norm": 3.378875970840454, + "learning_rate": 5.308704151241639e-07, + "logits/chosen": -0.5801022052764893, + "logits/rejected": -0.6414446830749512, + "logps/chosen": -44.877567291259766, + "logps/rejected": -113.88531494140625, + "loss": 0.5347, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2844204902648926, + "rewards/margins": 8.444643020629883, + "rewards/rejected": -5.16022253036499, + "step": 17027 + }, + { + "epoch": 4.26, + "grad_norm": 2.2444167137145996, + "learning_rate": 5.305180223755796e-07, + "logits/chosen": -0.5552024841308594, + "logits/rejected": -0.6390950679779053, + "logps/chosen": -59.907108306884766, + "logps/rejected": -120.2780990600586, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.202202796936035, + "rewards/margins": 8.154294967651367, + "rewards/rejected": -4.952091693878174, + "step": 17028 + }, + { + "epoch": 4.26, + "grad_norm": 7.167212963104248, + "learning_rate": 5.301657400726495e-07, + "logits/chosen": -0.5518161654472351, + "logits/rejected": -0.6481728553771973, + "logps/chosen": -57.621490478515625, + "logps/rejected": -116.4180908203125, + "loss": 0.6, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0171854496002197, + "rewards/margins": 7.737354755401611, + "rewards/rejected": -4.720169544219971, + "step": 17029 + }, + { + "epoch": 4.26, + "grad_norm": 2.4631881713867188, + "learning_rate": 5.298135682240785e-07, + "logits/chosen": -0.5281204581260681, + "logits/rejected": -0.6444463729858398, + "logps/chosen": -64.50658416748047, + "logps/rejected": -99.89404296875, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.341435670852661, + "rewards/margins": 7.450293064117432, + "rewards/rejected": -4.108858108520508, + "step": 17030 + }, + { + "epoch": 4.26, + "grad_norm": 5.811748027801514, + "learning_rate": 5.294615068385678e-07, + "logits/chosen": -0.5880661010742188, + "logits/rejected": -0.6357418298721313, + "logps/chosen": -50.901634216308594, + "logps/rejected": -129.18377685546875, + "loss": 0.6307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0219216346740723, + "rewards/margins": 7.433284759521484, + "rewards/rejected": -4.411363124847412, + "step": 17031 + }, + { + "epoch": 4.26, + "grad_norm": 4.2891106605529785, + "learning_rate": 5.291095559248193e-07, + "logits/chosen": -0.49829235672950745, + "logits/rejected": -0.6090748310089111, + "logps/chosen": -62.85212707519531, + "logps/rejected": -118.5797348022461, + "loss": 0.6137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9370648860931396, + "rewards/margins": 7.554912090301514, + "rewards/rejected": -4.6178483963012695, + "step": 17032 + }, + { + "epoch": 4.26, + "grad_norm": 5.964933395385742, + "learning_rate": 5.287577154915285e-07, + "logits/chosen": -0.6075210571289062, + "logits/rejected": -0.7039713263511658, + "logps/chosen": -58.22135543823242, + "logps/rejected": -96.13278198242188, + "loss": 0.7285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9000723361968994, + "rewards/margins": 7.160098075866699, + "rewards/rejected": -4.260025501251221, + "step": 17033 + }, + { + "epoch": 4.26, + "grad_norm": 3.399871826171875, + "learning_rate": 5.284059855473911e-07, + "logits/chosen": -0.5497868061065674, + "logits/rejected": -0.6095520257949829, + "logps/chosen": -52.89676284790039, + "logps/rejected": -113.96768188476562, + "loss": 0.5794, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.025606632232666, + "rewards/margins": 7.348673343658447, + "rewards/rejected": -4.323066711425781, + "step": 17034 + }, + { + "epoch": 4.26, + "grad_norm": 2.3694233894348145, + "learning_rate": 5.280543661010973e-07, + "logits/chosen": -0.652934730052948, + "logits/rejected": -0.72684246301651, + "logps/chosen": -35.907814025878906, + "logps/rejected": -93.99832916259766, + "loss": 0.4558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2859082221984863, + "rewards/margins": 6.693124294281006, + "rewards/rejected": -3.4072158336639404, + "step": 17035 + }, + { + "epoch": 4.26, + "grad_norm": 9.434873580932617, + "learning_rate": 5.277028571613368e-07, + "logits/chosen": -0.5043689012527466, + "logits/rejected": -0.5369203090667725, + "logps/chosen": -50.82614517211914, + "logps/rejected": -129.37869262695312, + "loss": 0.6336, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2657299041748047, + "rewards/margins": 7.518223762512207, + "rewards/rejected": -4.252494812011719, + "step": 17036 + }, + { + "epoch": 4.26, + "grad_norm": 2.8818869590759277, + "learning_rate": 5.273514587367956e-07, + "logits/chosen": -0.5124915838241577, + "logits/rejected": -0.5893154144287109, + "logps/chosen": -53.03677749633789, + "logps/rejected": -113.39752960205078, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2554306983947754, + "rewards/margins": 7.595765590667725, + "rewards/rejected": -4.340334415435791, + "step": 17037 + }, + { + "epoch": 4.26, + "grad_norm": 4.2598795890808105, + "learning_rate": 5.270001708361578e-07, + "logits/chosen": -0.5921968221664429, + "logits/rejected": -0.6546413898468018, + "logps/chosen": -55.089195251464844, + "logps/rejected": -104.42253875732422, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2651045322418213, + "rewards/margins": 6.342985153198242, + "rewards/rejected": -3.077880859375, + "step": 17038 + }, + { + "epoch": 4.26, + "grad_norm": 4.741854190826416, + "learning_rate": 5.266489934681035e-07, + "logits/chosen": -0.5080910325050354, + "logits/rejected": -0.6031408309936523, + "logps/chosen": -60.66950225830078, + "logps/rejected": -116.3031997680664, + "loss": 0.6013, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.163634777069092, + "rewards/margins": 7.757374286651611, + "rewards/rejected": -4.5937395095825195, + "step": 17039 + }, + { + "epoch": 4.26, + "grad_norm": 5.570171356201172, + "learning_rate": 5.262979266413099e-07, + "logits/chosen": -0.4536479115486145, + "logits/rejected": -0.5101732015609741, + "logps/chosen": -53.97638702392578, + "logps/rejected": -117.66937255859375, + "loss": 0.581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2762701511383057, + "rewards/margins": 7.487510681152344, + "rewards/rejected": -4.211239814758301, + "step": 17040 + }, + { + "epoch": 4.26, + "grad_norm": 5.340259552001953, + "learning_rate": 5.259469703644532e-07, + "logits/chosen": -0.5619488954544067, + "logits/rejected": -0.6545404195785522, + "logps/chosen": -53.09793472290039, + "logps/rejected": -111.3857650756836, + "loss": 0.6549, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.15987491607666, + "rewards/margins": 8.022991180419922, + "rewards/rejected": -4.8631157875061035, + "step": 17041 + }, + { + "epoch": 4.26, + "grad_norm": 5.868357181549072, + "learning_rate": 5.255961246462065e-07, + "logits/chosen": -0.5632555484771729, + "logits/rejected": -0.6627669334411621, + "logps/chosen": -57.34825134277344, + "logps/rejected": -101.32626342773438, + "loss": 0.69, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.770937442779541, + "rewards/margins": 5.981271266937256, + "rewards/rejected": -3.2103335857391357, + "step": 17042 + }, + { + "epoch": 4.26, + "grad_norm": 5.980073928833008, + "learning_rate": 5.252453894952375e-07, + "logits/chosen": -0.6063582301139832, + "logits/rejected": -0.7042698860168457, + "logps/chosen": -54.65874481201172, + "logps/rejected": -99.20336151123047, + "loss": 0.5482, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0418004989624023, + "rewards/margins": 7.172393798828125, + "rewards/rejected": -4.130593776702881, + "step": 17043 + }, + { + "epoch": 4.26, + "grad_norm": 29.366209030151367, + "learning_rate": 5.248947649202152e-07, + "logits/chosen": -0.5682637691497803, + "logits/rejected": -0.6504703760147095, + "logps/chosen": -55.74016189575195, + "logps/rejected": -119.09928131103516, + "loss": 0.7044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2689313888549805, + "rewards/margins": 7.806164741516113, + "rewards/rejected": -4.537232398986816, + "step": 17044 + }, + { + "epoch": 4.26, + "grad_norm": 7.439853191375732, + "learning_rate": 5.245442509298038e-07, + "logits/chosen": -0.5800027251243591, + "logits/rejected": -0.6884517669677734, + "logps/chosen": -60.7720947265625, + "logps/rejected": -96.73284912109375, + "loss": 0.6316, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2121458053588867, + "rewards/margins": 7.470468044281006, + "rewards/rejected": -4.258322715759277, + "step": 17045 + }, + { + "epoch": 4.26, + "grad_norm": 12.96960163116455, + "learning_rate": 5.241938475326624e-07, + "logits/chosen": -0.5487412810325623, + "logits/rejected": -0.6024068593978882, + "logps/chosen": -54.61648178100586, + "logps/rejected": -102.18709564208984, + "loss": 0.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1364121437072754, + "rewards/margins": 6.456396102905273, + "rewards/rejected": -3.319983720779419, + "step": 17046 + }, + { + "epoch": 4.26, + "grad_norm": 8.10153579711914, + "learning_rate": 5.238435547374526e-07, + "logits/chosen": -0.6402396559715271, + "logits/rejected": -0.7169158458709717, + "logps/chosen": -74.48077392578125, + "logps/rejected": -122.94355773925781, + "loss": 0.6577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.810600996017456, + "rewards/margins": 7.703712463378906, + "rewards/rejected": -4.8931121826171875, + "step": 17047 + }, + { + "epoch": 4.26, + "grad_norm": 2.889587879180908, + "learning_rate": 5.234933725528312e-07, + "logits/chosen": -0.5408955812454224, + "logits/rejected": -0.592987060546875, + "logps/chosen": -48.926395416259766, + "logps/rejected": -110.09394836425781, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5638723373413086, + "rewards/margins": 7.46979284286499, + "rewards/rejected": -3.9059205055236816, + "step": 17048 + }, + { + "epoch": 4.26, + "grad_norm": 3.057938575744629, + "learning_rate": 5.231433009874476e-07, + "logits/chosen": -0.45484843850135803, + "logits/rejected": -0.5381758213043213, + "logps/chosen": -54.55564880371094, + "logps/rejected": -108.70984649658203, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0530457496643066, + "rewards/margins": 6.952727317810059, + "rewards/rejected": -3.899681568145752, + "step": 17049 + }, + { + "epoch": 4.27, + "grad_norm": 11.308274269104004, + "learning_rate": 5.227933400499552e-07, + "logits/chosen": -0.50441575050354, + "logits/rejected": -0.5788156986236572, + "logps/chosen": -62.72737121582031, + "logps/rejected": -128.717529296875, + "loss": 0.6225, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143137216567993, + "rewards/margins": 8.524327278137207, + "rewards/rejected": -5.381190776824951, + "step": 17050 + }, + { + "epoch": 4.27, + "grad_norm": 5.5749688148498535, + "learning_rate": 5.224434897490027e-07, + "logits/chosen": -0.5052492618560791, + "logits/rejected": -0.5744736194610596, + "logps/chosen": -49.150142669677734, + "logps/rejected": -102.40595245361328, + "loss": 0.5881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1255879402160645, + "rewards/margins": 6.430713653564453, + "rewards/rejected": -3.3051257133483887, + "step": 17051 + }, + { + "epoch": 4.27, + "grad_norm": 2.651228666305542, + "learning_rate": 5.220937500932338e-07, + "logits/chosen": -0.5296065807342529, + "logits/rejected": -0.606824517250061, + "logps/chosen": -49.01642990112305, + "logps/rejected": -113.74938201904297, + "loss": 0.5942, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.419632911682129, + "rewards/margins": 8.23536491394043, + "rewards/rejected": -4.815732002258301, + "step": 17052 + }, + { + "epoch": 4.27, + "grad_norm": 4.20850133895874, + "learning_rate": 5.217441210912904e-07, + "logits/chosen": -0.5119860172271729, + "logits/rejected": -0.6359561085700989, + "logps/chosen": -71.71142578125, + "logps/rejected": -106.51016235351562, + "loss": 0.6428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8301525115966797, + "rewards/margins": 6.936286449432373, + "rewards/rejected": -4.106134414672852, + "step": 17053 + }, + { + "epoch": 4.27, + "grad_norm": 4.809596538543701, + "learning_rate": 5.213946027518146e-07, + "logits/chosen": -0.6166338324546814, + "logits/rejected": -0.692552387714386, + "logps/chosen": -43.868873596191406, + "logps/rejected": -125.56194305419922, + "loss": 0.5087, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2380130290985107, + "rewards/margins": 8.416274070739746, + "rewards/rejected": -5.178261756896973, + "step": 17054 + }, + { + "epoch": 4.27, + "grad_norm": 5.72588586807251, + "learning_rate": 5.210451950834411e-07, + "logits/chosen": -0.5006927847862244, + "logits/rejected": -0.5761614441871643, + "logps/chosen": -48.55409240722656, + "logps/rejected": -94.99073028564453, + "loss": 0.5818, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0181922912597656, + "rewards/margins": 5.863286972045898, + "rewards/rejected": -2.845094680786133, + "step": 17055 + }, + { + "epoch": 4.27, + "grad_norm": 5.541719436645508, + "learning_rate": 5.20695898094804e-07, + "logits/chosen": -0.5281930565834045, + "logits/rejected": -0.6050543189048767, + "logps/chosen": -60.69505310058594, + "logps/rejected": -112.31591033935547, + "loss": 0.5981, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4605777263641357, + "rewards/margins": 8.208927154541016, + "rewards/rejected": -4.748349666595459, + "step": 17056 + }, + { + "epoch": 4.27, + "grad_norm": 4.865194797515869, + "learning_rate": 5.203467117945365e-07, + "logits/chosen": -0.5382004380226135, + "logits/rejected": -0.6728851199150085, + "logps/chosen": -76.94395446777344, + "logps/rejected": -104.70175170898438, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8904623985290527, + "rewards/margins": 7.163080215454102, + "rewards/rejected": -4.272618293762207, + "step": 17057 + }, + { + "epoch": 4.27, + "grad_norm": 14.702088356018066, + "learning_rate": 5.199976361912662e-07, + "logits/chosen": -0.6238198280334473, + "logits/rejected": -0.6796165108680725, + "logps/chosen": -55.141910552978516, + "logps/rejected": -109.00692749023438, + "loss": 0.5999, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.537025213241577, + "rewards/margins": 7.325618267059326, + "rewards/rejected": -3.788593053817749, + "step": 17058 + }, + { + "epoch": 4.27, + "grad_norm": 3.7598178386688232, + "learning_rate": 5.1964867129362e-07, + "logits/chosen": -0.5569852590560913, + "logits/rejected": -0.6567975282669067, + "logps/chosen": -56.45880889892578, + "logps/rejected": -122.52861022949219, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.417987585067749, + "rewards/margins": 9.432929039001465, + "rewards/rejected": -6.014941215515137, + "step": 17059 + }, + { + "epoch": 4.27, + "grad_norm": 2.5738508701324463, + "learning_rate": 5.192998171102204e-07, + "logits/chosen": -0.5877172350883484, + "logits/rejected": -0.6414281129837036, + "logps/chosen": -46.42554473876953, + "logps/rejected": -111.06546020507812, + "loss": 0.5579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2916526794433594, + "rewards/margins": 7.671303749084473, + "rewards/rejected": -4.3796515464782715, + "step": 17060 + }, + { + "epoch": 4.27, + "grad_norm": 6.160717010498047, + "learning_rate": 5.189510736496872e-07, + "logits/chosen": -0.5474907755851746, + "logits/rejected": -0.6232055425643921, + "logps/chosen": -60.97501754760742, + "logps/rejected": -106.09086608886719, + "loss": 0.6397, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5543291568756104, + "rewards/margins": 6.089692115783691, + "rewards/rejected": -3.535362958908081, + "step": 17061 + }, + { + "epoch": 4.27, + "grad_norm": 3.127046585083008, + "learning_rate": 5.186024409206408e-07, + "logits/chosen": -0.486052542924881, + "logits/rejected": -0.582250714302063, + "logps/chosen": -51.97863006591797, + "logps/rejected": -123.77037811279297, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.029928207397461, + "rewards/margins": 8.720907211303711, + "rewards/rejected": -5.69097900390625, + "step": 17062 + }, + { + "epoch": 4.27, + "grad_norm": 3.1013622283935547, + "learning_rate": 5.182539189316943e-07, + "logits/chosen": -0.5377710461616516, + "logits/rejected": -0.5951405763626099, + "logps/chosen": -49.660709381103516, + "logps/rejected": -108.24112701416016, + "loss": 0.5708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1826114654541016, + "rewards/margins": 7.354115962982178, + "rewards/rejected": -4.171504974365234, + "step": 17063 + }, + { + "epoch": 4.27, + "grad_norm": 7.712810516357422, + "learning_rate": 5.179055076914597e-07, + "logits/chosen": -0.6045671701431274, + "logits/rejected": -0.6851390600204468, + "logps/chosen": -44.904502868652344, + "logps/rejected": -107.4530258178711, + "loss": 0.5727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105377197265625, + "rewards/margins": 7.763428688049316, + "rewards/rejected": -4.658051013946533, + "step": 17064 + }, + { + "epoch": 4.27, + "grad_norm": 6.004859447479248, + "learning_rate": 5.175572072085472e-07, + "logits/chosen": -0.45530030131340027, + "logits/rejected": -0.5631685256958008, + "logps/chosen": -70.7970199584961, + "logps/rejected": -93.50141906738281, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2869510650634766, + "rewards/margins": 6.654129505157471, + "rewards/rejected": -3.367178440093994, + "step": 17065 + }, + { + "epoch": 4.27, + "grad_norm": 3.700502872467041, + "learning_rate": 5.172090174915656e-07, + "logits/chosen": -0.5434963703155518, + "logits/rejected": -0.6661278605461121, + "logps/chosen": -59.912166595458984, + "logps/rejected": -105.69326782226562, + "loss": 0.6168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.162158250808716, + "rewards/margins": 7.397717475891113, + "rewards/rejected": -4.235559463500977, + "step": 17066 + }, + { + "epoch": 4.27, + "grad_norm": 10.448848724365234, + "learning_rate": 5.168609385491157e-07, + "logits/chosen": -0.5848842859268188, + "logits/rejected": -0.6572203040122986, + "logps/chosen": -55.29391860961914, + "logps/rejected": -128.24822998046875, + "loss": 0.6312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.136014223098755, + "rewards/margins": 7.570491790771484, + "rewards/rejected": -4.434477806091309, + "step": 17067 + }, + { + "epoch": 4.27, + "grad_norm": 2.614436388015747, + "learning_rate": 5.165129703898003e-07, + "logits/chosen": -0.588063657283783, + "logits/rejected": -0.6729287505149841, + "logps/chosen": -64.56263732910156, + "logps/rejected": -127.89389038085938, + "loss": 0.6084, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4841864109039307, + "rewards/margins": 8.688211441040039, + "rewards/rejected": -5.2040252685546875, + "step": 17068 + }, + { + "epoch": 4.27, + "grad_norm": 4.582353115081787, + "learning_rate": 5.161651130222195e-07, + "logits/chosen": -0.5540206432342529, + "logits/rejected": -0.6494286060333252, + "logps/chosen": -60.75852966308594, + "logps/rejected": -98.93456268310547, + "loss": 0.5891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8982653617858887, + "rewards/margins": 6.634627342224121, + "rewards/rejected": -3.736361026763916, + "step": 17069 + }, + { + "epoch": 4.27, + "grad_norm": 5.557376384735107, + "learning_rate": 5.158173664549677e-07, + "logits/chosen": -0.5606713891029358, + "logits/rejected": -0.6643388271331787, + "logps/chosen": -46.72471237182617, + "logps/rejected": -90.341064453125, + "loss": 0.5221, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.006308078765869, + "rewards/margins": 6.706114292144775, + "rewards/rejected": -3.6998074054718018, + "step": 17070 + }, + { + "epoch": 4.27, + "grad_norm": 4.4854326248168945, + "learning_rate": 5.154697306966372e-07, + "logits/chosen": -0.5702034831047058, + "logits/rejected": -0.6379045844078064, + "logps/chosen": -47.20201110839844, + "logps/rejected": -114.6499252319336, + "loss": 0.5715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4262986183166504, + "rewards/margins": 8.12130069732666, + "rewards/rejected": -4.69500207901001, + "step": 17071 + }, + { + "epoch": 4.27, + "grad_norm": 4.366748332977295, + "learning_rate": 5.151222057558209e-07, + "logits/chosen": -0.5033262968063354, + "logits/rejected": -0.614953339099884, + "logps/chosen": -71.77423858642578, + "logps/rejected": -108.35722351074219, + "loss": 0.6496, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.833240509033203, + "rewards/margins": 6.851601600646973, + "rewards/rejected": -4.018360614776611, + "step": 17072 + }, + { + "epoch": 4.27, + "grad_norm": 5.525829792022705, + "learning_rate": 5.14774791641105e-07, + "logits/chosen": -0.5131151080131531, + "logits/rejected": -0.6464163661003113, + "logps/chosen": -60.47971725463867, + "logps/rejected": -108.0015869140625, + "loss": 0.5261, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.557399272918701, + "rewards/margins": 8.030350685119629, + "rewards/rejected": -5.4729509353637695, + "step": 17073 + }, + { + "epoch": 4.27, + "grad_norm": 13.104263305664062, + "learning_rate": 5.144274883610739e-07, + "logits/chosen": -0.5408788919448853, + "logits/rejected": -0.5958172082901001, + "logps/chosen": -56.37644958496094, + "logps/rejected": -128.42092895507812, + "loss": 0.6703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3248374462127686, + "rewards/margins": 6.979063034057617, + "rewards/rejected": -3.6542258262634277, + "step": 17074 + }, + { + "epoch": 4.27, + "grad_norm": 2.1288700103759766, + "learning_rate": 5.140802959243119e-07, + "logits/chosen": -0.5787214636802673, + "logits/rejected": -0.6610276699066162, + "logps/chosen": -59.128074645996094, + "logps/rejected": -104.4661865234375, + "loss": 0.5997, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1383373737335205, + "rewards/margins": 7.673160552978516, + "rewards/rejected": -4.534822940826416, + "step": 17075 + }, + { + "epoch": 4.27, + "grad_norm": 17.431474685668945, + "learning_rate": 5.137332143393958e-07, + "logits/chosen": -0.5909791588783264, + "logits/rejected": -0.7300349473953247, + "logps/chosen": -56.926456451416016, + "logps/rejected": -105.47404479980469, + "loss": 0.5842, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.716259717941284, + "rewards/margins": 7.285499572753906, + "rewards/rejected": -4.569239616394043, + "step": 17076 + }, + { + "epoch": 4.27, + "grad_norm": 6.195041179656982, + "learning_rate": 5.13386243614905e-07, + "logits/chosen": -0.5173049569129944, + "logits/rejected": -0.631068766117096, + "logps/chosen": -54.38360595703125, + "logps/rejected": -98.99270629882812, + "loss": 0.7321, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.154618740081787, + "rewards/margins": 7.213802814483643, + "rewards/rejected": -4.0591840744018555, + "step": 17077 + }, + { + "epoch": 4.27, + "grad_norm": 3.686321973800659, + "learning_rate": 5.130393837594111e-07, + "logits/chosen": -0.5304761528968811, + "logits/rejected": -0.6389884352684021, + "logps/chosen": -53.0428466796875, + "logps/rejected": -89.14142608642578, + "loss": 0.632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2834293842315674, + "rewards/margins": 6.765895366668701, + "rewards/rejected": -3.4824657440185547, + "step": 17078 + }, + { + "epoch": 4.27, + "grad_norm": 4.8994140625, + "learning_rate": 5.126926347814876e-07, + "logits/chosen": -0.525986909866333, + "logits/rejected": -0.5828008055686951, + "logps/chosen": -54.891693115234375, + "logps/rejected": -113.30558776855469, + "loss": 0.5867, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2660903930664062, + "rewards/margins": 7.214162826538086, + "rewards/rejected": -3.9480721950531006, + "step": 17079 + }, + { + "epoch": 4.27, + "grad_norm": 6.4320173263549805, + "learning_rate": 5.123459966897021e-07, + "logits/chosen": -0.5757880806922913, + "logits/rejected": -0.6795997619628906, + "logps/chosen": -61.660743713378906, + "logps/rejected": -97.66315460205078, + "loss": 0.7645, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3844492435455322, + "rewards/margins": 7.166158676147461, + "rewards/rejected": -3.781709671020508, + "step": 17080 + }, + { + "epoch": 4.27, + "grad_norm": 5.7490105628967285, + "learning_rate": 5.119994694926194e-07, + "logits/chosen": -0.5847591757774353, + "logits/rejected": -0.6591044068336487, + "logps/chosen": -54.77768325805664, + "logps/rejected": -114.91358947753906, + "loss": 0.6477, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032377243041992, + "rewards/margins": 7.348149299621582, + "rewards/rejected": -4.31577205657959, + "step": 17081 + }, + { + "epoch": 4.27, + "grad_norm": 3.0895228385925293, + "learning_rate": 5.11653053198804e-07, + "logits/chosen": -0.4713343381881714, + "logits/rejected": -0.5902649760246277, + "logps/chosen": -61.4969367980957, + "logps/rejected": -94.74067687988281, + "loss": 0.6446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.004450798034668, + "rewards/margins": 7.637091159820557, + "rewards/rejected": -4.632640361785889, + "step": 17082 + }, + { + "epoch": 4.27, + "grad_norm": 6.134993553161621, + "learning_rate": 5.113067478168155e-07, + "logits/chosen": -0.5698240399360657, + "logits/rejected": -0.597361147403717, + "logps/chosen": -51.03669357299805, + "logps/rejected": -112.22403717041016, + "loss": 0.6235, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.092407703399658, + "rewards/margins": 6.947266578674316, + "rewards/rejected": -3.854858875274658, + "step": 17083 + }, + { + "epoch": 4.27, + "grad_norm": 3.7655434608459473, + "learning_rate": 5.10960553355212e-07, + "logits/chosen": -0.5407780408859253, + "logits/rejected": -0.6648241281509399, + "logps/chosen": -66.56804656982422, + "logps/rejected": -100.70291137695312, + "loss": 0.6157, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.956244468688965, + "rewards/margins": 7.861172676086426, + "rewards/rejected": -4.904927730560303, + "step": 17084 + }, + { + "epoch": 4.27, + "grad_norm": 5.246946811676025, + "learning_rate": 5.106144698225479e-07, + "logits/chosen": -0.5528532862663269, + "logits/rejected": -0.6661524772644043, + "logps/chosen": -57.12192916870117, + "logps/rejected": -96.23696899414062, + "loss": 0.6573, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8712100982666016, + "rewards/margins": 7.569859027862549, + "rewards/rejected": -4.6986494064331055, + "step": 17085 + }, + { + "epoch": 4.27, + "grad_norm": 3.082523822784424, + "learning_rate": 5.102684972273747e-07, + "logits/chosen": -0.5865093469619751, + "logits/rejected": -0.6458644866943359, + "logps/chosen": -57.340213775634766, + "logps/rejected": -133.30880737304688, + "loss": 0.5741, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.111196994781494, + "rewards/margins": 8.749711990356445, + "rewards/rejected": -5.638514995574951, + "step": 17086 + }, + { + "epoch": 4.27, + "grad_norm": 3.6772537231445312, + "learning_rate": 5.099226355782433e-07, + "logits/chosen": -0.49501317739486694, + "logits/rejected": -0.5883735418319702, + "logps/chosen": -56.31031799316406, + "logps/rejected": -94.10598754882812, + "loss": 0.614, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0807790756225586, + "rewards/margins": 7.438640594482422, + "rewards/rejected": -4.357861518859863, + "step": 17087 + }, + { + "epoch": 4.27, + "grad_norm": 3.606135845184326, + "learning_rate": 5.095768848836996e-07, + "logits/chosen": -0.6029793620109558, + "logits/rejected": -0.6663161516189575, + "logps/chosen": -48.49552536010742, + "logps/rejected": -102.05331420898438, + "loss": 0.6144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1782984733581543, + "rewards/margins": 6.50658655166626, + "rewards/rejected": -3.3282876014709473, + "step": 17088 + }, + { + "epoch": 4.27, + "grad_norm": 3.353520393371582, + "learning_rate": 5.092312451522862e-07, + "logits/chosen": -0.5870966911315918, + "logits/rejected": -0.7071621417999268, + "logps/chosen": -42.46944046020508, + "logps/rejected": -84.34357452392578, + "loss": 0.5219, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.049565315246582, + "rewards/margins": 8.033284187316895, + "rewards/rejected": -4.9837188720703125, + "step": 17089 + }, + { + "epoch": 4.28, + "grad_norm": 5.109275817871094, + "learning_rate": 5.088857163925453e-07, + "logits/chosen": -0.5968227386474609, + "logits/rejected": -0.684502363204956, + "logps/chosen": -57.800331115722656, + "logps/rejected": -110.06671142578125, + "loss": 0.6342, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2104763984680176, + "rewards/margins": 6.550134658813477, + "rewards/rejected": -3.339658737182617, + "step": 17090 + }, + { + "epoch": 4.28, + "grad_norm": 3.952057123184204, + "learning_rate": 5.085402986130173e-07, + "logits/chosen": -0.5301229953765869, + "logits/rejected": -0.607695996761322, + "logps/chosen": -56.56557846069336, + "logps/rejected": -104.0174789428711, + "loss": 0.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0866951942443848, + "rewards/margins": 7.158611297607422, + "rewards/rejected": -4.071916103363037, + "step": 17091 + }, + { + "epoch": 4.28, + "grad_norm": 6.582822322845459, + "learning_rate": 5.081949918222339e-07, + "logits/chosen": -0.43259960412979126, + "logits/rejected": -0.48129165172576904, + "logps/chosen": -59.286502838134766, + "logps/rejected": -107.51558685302734, + "loss": 0.6394, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1469264030456543, + "rewards/margins": 5.847002029418945, + "rewards/rejected": -2.700075149536133, + "step": 17092 + }, + { + "epoch": 4.28, + "grad_norm": 5.426305294036865, + "learning_rate": 5.078497960287298e-07, + "logits/chosen": -0.49807584285736084, + "logits/rejected": -0.6242508888244629, + "logps/chosen": -59.80207061767578, + "logps/rejected": -102.1498794555664, + "loss": 0.6502, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1346986293792725, + "rewards/margins": 7.015957832336426, + "rewards/rejected": -3.8812592029571533, + "step": 17093 + }, + { + "epoch": 4.28, + "grad_norm": 5.183628082275391, + "learning_rate": 5.075047112410364e-07, + "logits/chosen": -0.5680760741233826, + "logits/rejected": -0.6708781123161316, + "logps/chosen": -45.5622444152832, + "logps/rejected": -88.81214141845703, + "loss": 0.633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2669427394866943, + "rewards/margins": 6.842926025390625, + "rewards/rejected": -3.5759830474853516, + "step": 17094 + }, + { + "epoch": 4.28, + "grad_norm": 6.670342922210693, + "learning_rate": 5.071597374676801e-07, + "logits/chosen": -0.5264829993247986, + "logits/rejected": -0.599616289138794, + "logps/chosen": -68.84368133544922, + "logps/rejected": -119.27316284179688, + "loss": 0.6992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.948319911956787, + "rewards/margins": 7.200666904449463, + "rewards/rejected": -4.252346992492676, + "step": 17095 + }, + { + "epoch": 4.28, + "grad_norm": 4.155229568481445, + "learning_rate": 5.068148747171842e-07, + "logits/chosen": -0.5237880945205688, + "logits/rejected": -0.6220400333404541, + "logps/chosen": -63.40178680419922, + "logps/rejected": -114.28206634521484, + "loss": 0.5887, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.788783311843872, + "rewards/margins": 8.046388626098633, + "rewards/rejected": -5.25760555267334, + "step": 17096 + }, + { + "epoch": 4.28, + "grad_norm": 5.110450267791748, + "learning_rate": 5.064701229980734e-07, + "logits/chosen": -0.5732834339141846, + "logits/rejected": -0.700132429599762, + "logps/chosen": -49.39265060424805, + "logps/rejected": -106.02179718017578, + "loss": 0.5408, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.134199619293213, + "rewards/margins": 8.18724536895752, + "rewards/rejected": -5.053045749664307, + "step": 17097 + }, + { + "epoch": 4.28, + "grad_norm": 3.315791606903076, + "learning_rate": 5.061254823188649e-07, + "logits/chosen": -0.619096040725708, + "logits/rejected": -0.7281530499458313, + "logps/chosen": -46.782901763916016, + "logps/rejected": -103.94041442871094, + "loss": 0.5967, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.055830478668213, + "rewards/margins": 7.703590393066406, + "rewards/rejected": -4.647760391235352, + "step": 17098 + }, + { + "epoch": 4.28, + "grad_norm": 8.384339332580566, + "learning_rate": 5.057809526880746e-07, + "logits/chosen": -0.5431137681007385, + "logits/rejected": -0.6412226557731628, + "logps/chosen": -61.29430389404297, + "logps/rejected": -101.7963638305664, + "loss": 0.751, + "rewards/accuracies": 0.84375, + "rewards/chosen": 3.3119125366210938, + "rewards/margins": 5.862970352172852, + "rewards/rejected": -2.551058530807495, + "step": 17099 + }, + { + "epoch": 4.28, + "grad_norm": 3.7946810722351074, + "learning_rate": 5.054365341142187e-07, + "logits/chosen": -0.5869808197021484, + "logits/rejected": -0.6330892443656921, + "logps/chosen": -41.88821792602539, + "logps/rejected": -115.81919860839844, + "loss": 0.5181, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948162317276001, + "rewards/margins": 8.08135986328125, + "rewards/rejected": -5.133197784423828, + "step": 17100 + }, + { + "epoch": 4.28, + "grad_norm": 23.809173583984375, + "learning_rate": 5.050922266058056e-07, + "logits/chosen": -0.45781052112579346, + "logits/rejected": -0.5079127550125122, + "logps/chosen": -63.77898406982422, + "logps/rejected": -110.54875946044922, + "loss": 0.7215, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.053199529647827, + "rewards/margins": 5.781252384185791, + "rewards/rejected": -2.728053331375122, + "step": 17101 + }, + { + "epoch": 4.28, + "grad_norm": 5.653079032897949, + "learning_rate": 5.047480301713453e-07, + "logits/chosen": -0.5458736419677734, + "logits/rejected": -0.6419479846954346, + "logps/chosen": -63.09611892700195, + "logps/rejected": -117.39755249023438, + "loss": 0.6315, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9186601638793945, + "rewards/margins": 8.608654022216797, + "rewards/rejected": -5.689993381500244, + "step": 17102 + }, + { + "epoch": 4.28, + "grad_norm": 2.776926279067993, + "learning_rate": 5.044039448193427e-07, + "logits/chosen": -0.5850811004638672, + "logits/rejected": -0.6422630548477173, + "logps/chosen": -39.82503128051758, + "logps/rejected": -113.05828094482422, + "loss": 0.5239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7116925716400146, + "rewards/margins": 7.9262614250183105, + "rewards/rejected": -4.214568614959717, + "step": 17103 + }, + { + "epoch": 4.28, + "grad_norm": 4.090481758117676, + "learning_rate": 5.040599705582993e-07, + "logits/chosen": -0.49761080741882324, + "logits/rejected": -0.5713372230529785, + "logps/chosen": -66.52420806884766, + "logps/rejected": -111.29474639892578, + "loss": 0.5952, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1449437141418457, + "rewards/margins": 7.246016979217529, + "rewards/rejected": -4.101073741912842, + "step": 17104 + }, + { + "epoch": 4.28, + "grad_norm": 6.367162227630615, + "learning_rate": 5.037161073967173e-07, + "logits/chosen": -0.5081165432929993, + "logits/rejected": -0.6241403222084045, + "logps/chosen": -63.188316345214844, + "logps/rejected": -98.706298828125, + "loss": 0.6811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.161991834640503, + "rewards/margins": 7.1453351974487305, + "rewards/rejected": -3.9833431243896484, + "step": 17105 + }, + { + "epoch": 4.28, + "grad_norm": 5.427957057952881, + "learning_rate": 5.033723553430925e-07, + "logits/chosen": -0.5374612808227539, + "logits/rejected": -0.578229546546936, + "logps/chosen": -64.86013793945312, + "logps/rejected": -122.3580093383789, + "loss": 0.7003, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4155778884887695, + "rewards/margins": 6.835546970367432, + "rewards/rejected": -3.4199697971343994, + "step": 17106 + }, + { + "epoch": 4.28, + "grad_norm": 2.5995965003967285, + "learning_rate": 5.030287144059187e-07, + "logits/chosen": -0.5727469325065613, + "logits/rejected": -0.6016020774841309, + "logps/chosen": -52.18974685668945, + "logps/rejected": -131.33287048339844, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2318873405456543, + "rewards/margins": 9.713224411010742, + "rewards/rejected": -6.48133659362793, + "step": 17107 + }, + { + "epoch": 4.28, + "grad_norm": 6.027554512023926, + "learning_rate": 5.026851845936887e-07, + "logits/chosen": -0.4982733428478241, + "logits/rejected": -0.6429827213287354, + "logps/chosen": -75.29561614990234, + "logps/rejected": -90.27034759521484, + "loss": 0.7538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.395538806915283, + "rewards/margins": 7.294694900512695, + "rewards/rejected": -3.899155616760254, + "step": 17108 + }, + { + "epoch": 4.28, + "grad_norm": 3.5959906578063965, + "learning_rate": 5.023417659148927e-07, + "logits/chosen": -0.5340756773948669, + "logits/rejected": -0.613749623298645, + "logps/chosen": -52.99603271484375, + "logps/rejected": -111.94744873046875, + "loss": 0.5921, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.162916898727417, + "rewards/margins": 7.256587982177734, + "rewards/rejected": -4.093670845031738, + "step": 17109 + }, + { + "epoch": 4.28, + "grad_norm": 2.6928579807281494, + "learning_rate": 5.019984583780141e-07, + "logits/chosen": -0.5645356178283691, + "logits/rejected": -0.6193065643310547, + "logps/chosen": -51.312564849853516, + "logps/rejected": -102.202392578125, + "loss": 0.5947, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.112226724624634, + "rewards/margins": 6.871810436248779, + "rewards/rejected": -3.7595837116241455, + "step": 17110 + }, + { + "epoch": 4.28, + "grad_norm": 11.351099967956543, + "learning_rate": 5.016552619915383e-07, + "logits/chosen": -0.5051378011703491, + "logits/rejected": -0.5729831457138062, + "logps/chosen": -50.467529296875, + "logps/rejected": -89.36663055419922, + "loss": 0.6878, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.020594596862793, + "rewards/margins": 5.930131435394287, + "rewards/rejected": -2.909536600112915, + "step": 17111 + }, + { + "epoch": 4.28, + "grad_norm": 3.0912532806396484, + "learning_rate": 5.013121767639462e-07, + "logits/chosen": -0.5866332650184631, + "logits/rejected": -0.6811829805374146, + "logps/chosen": -56.07080078125, + "logps/rejected": -101.90817260742188, + "loss": 0.595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3381853103637695, + "rewards/margins": 7.775603771209717, + "rewards/rejected": -4.437418460845947, + "step": 17112 + }, + { + "epoch": 4.28, + "grad_norm": 4.369786262512207, + "learning_rate": 5.00969202703715e-07, + "logits/chosen": -0.5643167495727539, + "logits/rejected": -0.5914828181266785, + "logps/chosen": -51.37652587890625, + "logps/rejected": -124.75885009765625, + "loss": 0.5161, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1467947959899902, + "rewards/margins": 6.895556449890137, + "rewards/rejected": -3.7487616539001465, + "step": 17113 + }, + { + "epoch": 4.28, + "grad_norm": 2.5248680114746094, + "learning_rate": 5.006263398193195e-07, + "logits/chosen": -0.4711798429489136, + "logits/rejected": -0.5209316611289978, + "logps/chosen": -61.197349548339844, + "logps/rejected": -112.77973937988281, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.235032081604004, + "rewards/margins": 6.901726245880127, + "rewards/rejected": -3.666693925857544, + "step": 17114 + }, + { + "epoch": 4.28, + "grad_norm": 6.612767696380615, + "learning_rate": 5.002835881192336e-07, + "logits/chosen": -0.5849791765213013, + "logits/rejected": -0.664164125919342, + "logps/chosen": -45.07883071899414, + "logps/rejected": -94.61554718017578, + "loss": 0.5755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.249600410461426, + "rewards/margins": 6.965799331665039, + "rewards/rejected": -3.716198682785034, + "step": 17115 + }, + { + "epoch": 4.28, + "grad_norm": 5.870656967163086, + "learning_rate": 4.999409476119266e-07, + "logits/chosen": -0.54729163646698, + "logits/rejected": -0.6094766855239868, + "logps/chosen": -47.58221435546875, + "logps/rejected": -108.95413208007812, + "loss": 0.6086, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1480019092559814, + "rewards/margins": 8.244786262512207, + "rewards/rejected": -5.096785545349121, + "step": 17116 + }, + { + "epoch": 4.28, + "grad_norm": 2.792330741882324, + "learning_rate": 4.995984183058644e-07, + "logits/chosen": -0.5347212553024292, + "logits/rejected": -0.6626812815666199, + "logps/chosen": -51.85075378417969, + "logps/rejected": -100.5965576171875, + "loss": 0.5272, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9576663970947266, + "rewards/margins": 7.606125831604004, + "rewards/rejected": -4.648459434509277, + "step": 17117 + }, + { + "epoch": 4.28, + "grad_norm": 6.956307888031006, + "learning_rate": 4.992560002095132e-07, + "logits/chosen": -0.4751316010951996, + "logits/rejected": -0.5226536393165588, + "logps/chosen": -60.33095932006836, + "logps/rejected": -123.03264617919922, + "loss": 0.5981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.421548366546631, + "rewards/margins": 7.83070182800293, + "rewards/rejected": -4.409153938293457, + "step": 17118 + }, + { + "epoch": 4.28, + "grad_norm": 5.602781772613525, + "learning_rate": 4.989136933313321e-07, + "logits/chosen": -0.555748462677002, + "logits/rejected": -0.6249991059303284, + "logps/chosen": -58.6240234375, + "logps/rejected": -116.72415161132812, + "loss": 0.6771, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7103989124298096, + "rewards/margins": 6.014342784881592, + "rewards/rejected": -3.303943634033203, + "step": 17119 + }, + { + "epoch": 4.28, + "grad_norm": 34.20043182373047, + "learning_rate": 4.985714976797823e-07, + "logits/chosen": -0.5704401135444641, + "logits/rejected": -0.6151766777038574, + "logps/chosen": -54.41870880126953, + "logps/rejected": -90.46553039550781, + "loss": 0.9562, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6496458053588867, + "rewards/margins": 5.667224407196045, + "rewards/rejected": -3.017578601837158, + "step": 17120 + }, + { + "epoch": 4.28, + "grad_norm": 6.16093635559082, + "learning_rate": 4.982294132633186e-07, + "logits/chosen": -0.5775740146636963, + "logits/rejected": -0.6206772923469543, + "logps/chosen": -52.31508255004883, + "logps/rejected": -125.57457733154297, + "loss": 0.6666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.970564365386963, + "rewards/margins": 7.949779033660889, + "rewards/rejected": -4.979215145111084, + "step": 17121 + }, + { + "epoch": 4.28, + "grad_norm": 7.295985698699951, + "learning_rate": 4.978874400903932e-07, + "logits/chosen": -0.5218614339828491, + "logits/rejected": -0.5931335687637329, + "logps/chosen": -56.685935974121094, + "logps/rejected": -104.4853286743164, + "loss": 0.59, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9048757553100586, + "rewards/margins": 6.520968914031982, + "rewards/rejected": -3.6160929203033447, + "step": 17122 + }, + { + "epoch": 4.28, + "grad_norm": 12.675678253173828, + "learning_rate": 4.975455781694594e-07, + "logits/chosen": -0.5355467200279236, + "logits/rejected": -0.5590924024581909, + "logps/chosen": -54.56094741821289, + "logps/rejected": -116.25638580322266, + "loss": 0.6762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098391532897949, + "rewards/margins": 7.262019157409668, + "rewards/rejected": -4.163627624511719, + "step": 17123 + }, + { + "epoch": 4.28, + "grad_norm": 6.525111675262451, + "learning_rate": 4.972038275089619e-07, + "logits/chosen": -0.5720165967941284, + "logits/rejected": -0.6290815472602844, + "logps/chosen": -42.22313690185547, + "logps/rejected": -111.35273742675781, + "loss": 0.5316, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.370694875717163, + "rewards/margins": 6.2393341064453125, + "rewards/rejected": -2.8686389923095703, + "step": 17124 + }, + { + "epoch": 4.28, + "grad_norm": 10.510934829711914, + "learning_rate": 4.968621881173486e-07, + "logits/chosen": -0.5964309573173523, + "logits/rejected": -0.6996132731437683, + "logps/chosen": -52.927001953125, + "logps/rejected": -106.76648712158203, + "loss": 0.5838, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.828770160675049, + "rewards/margins": 6.625197410583496, + "rewards/rejected": -3.7964260578155518, + "step": 17125 + }, + { + "epoch": 4.28, + "grad_norm": 6.423187732696533, + "learning_rate": 4.965206600030592e-07, + "logits/chosen": -0.5934441089630127, + "logits/rejected": -0.708634078502655, + "logps/chosen": -68.8468017578125, + "logps/rejected": -91.09425354003906, + "loss": 0.7079, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.925675868988037, + "rewards/margins": 6.335967540740967, + "rewards/rejected": -3.410291910171509, + "step": 17126 + }, + { + "epoch": 4.28, + "grad_norm": 3.7052338123321533, + "learning_rate": 4.961792431745355e-07, + "logits/chosen": -0.5708983540534973, + "logits/rejected": -0.6324806809425354, + "logps/chosen": -56.54918670654297, + "logps/rejected": -104.96566772460938, + "loss": 0.6055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9101293087005615, + "rewards/margins": 6.673685073852539, + "rewards/rejected": -3.7635555267333984, + "step": 17127 + }, + { + "epoch": 4.28, + "grad_norm": 7.5831170082092285, + "learning_rate": 4.958379376402129e-07, + "logits/chosen": -0.5955126285552979, + "logits/rejected": -0.7167177796363831, + "logps/chosen": -46.174537658691406, + "logps/rejected": -98.24777221679688, + "loss": 0.5571, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0812573432922363, + "rewards/margins": 7.1813554763793945, + "rewards/rejected": -4.100098133087158, + "step": 17128 + }, + { + "epoch": 4.29, + "grad_norm": 8.890135765075684, + "learning_rate": 4.95496743408525e-07, + "logits/chosen": -0.5754193663597107, + "logits/rejected": -0.6658806800842285, + "logps/chosen": -50.61616897583008, + "logps/rejected": -98.46641540527344, + "loss": 0.5588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2167019844055176, + "rewards/margins": 6.85626220703125, + "rewards/rejected": -3.6395602226257324, + "step": 17129 + }, + { + "epoch": 4.29, + "grad_norm": 3.1540160179138184, + "learning_rate": 4.951556604879049e-07, + "logits/chosen": -0.549746036529541, + "logits/rejected": -0.6544324159622192, + "logps/chosen": -51.8054313659668, + "logps/rejected": -94.95564270019531, + "loss": 0.531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3618602752685547, + "rewards/margins": 7.498249053955078, + "rewards/rejected": -4.136388778686523, + "step": 17130 + }, + { + "epoch": 4.29, + "grad_norm": 5.181049823760986, + "learning_rate": 4.948146888867794e-07, + "logits/chosen": -0.6001974940299988, + "logits/rejected": -0.6759646534919739, + "logps/chosen": -45.642967224121094, + "logps/rejected": -102.5224609375, + "loss": 0.5735, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1168768405914307, + "rewards/margins": 7.072680473327637, + "rewards/rejected": -3.955803632736206, + "step": 17131 + }, + { + "epoch": 4.29, + "grad_norm": 8.42024040222168, + "learning_rate": 4.944738286135742e-07, + "logits/chosen": -0.49627363681793213, + "logits/rejected": -0.6058931350708008, + "logps/chosen": -68.27009582519531, + "logps/rejected": -123.05552673339844, + "loss": 0.7164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.932663917541504, + "rewards/margins": 8.41457748413086, + "rewards/rejected": -5.4819135665893555, + "step": 17132 + }, + { + "epoch": 4.29, + "grad_norm": 4.790369033813477, + "learning_rate": 4.941330796767129e-07, + "logits/chosen": -0.6329507231712341, + "logits/rejected": -0.6737240552902222, + "logps/chosen": -47.86396408081055, + "logps/rejected": -104.41830444335938, + "loss": 0.8001, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2160158157348633, + "rewards/margins": 6.566669940948486, + "rewards/rejected": -3.350654363632202, + "step": 17133 + }, + { + "epoch": 4.29, + "grad_norm": 3.094691038131714, + "learning_rate": 4.93792442084618e-07, + "logits/chosen": -0.6063922047615051, + "logits/rejected": -0.6370925903320312, + "logps/chosen": -39.71794891357422, + "logps/rejected": -114.942626953125, + "loss": 0.5016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.375462293624878, + "rewards/margins": 7.516301155090332, + "rewards/rejected": -4.140838623046875, + "step": 17134 + }, + { + "epoch": 4.29, + "grad_norm": 5.694883346557617, + "learning_rate": 4.93451915845703e-07, + "logits/chosen": -0.5563186407089233, + "logits/rejected": -0.6052637100219727, + "logps/chosen": -56.950721740722656, + "logps/rejected": -113.54667663574219, + "loss": 0.6267, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0516903400421143, + "rewards/margins": 6.938177585601807, + "rewards/rejected": -3.8864870071411133, + "step": 17135 + }, + { + "epoch": 4.29, + "grad_norm": 9.431683540344238, + "learning_rate": 4.931115009683846e-07, + "logits/chosen": -0.5695791244506836, + "logits/rejected": -0.6235164999961853, + "logps/chosen": -44.97407150268555, + "logps/rejected": -114.67469024658203, + "loss": 0.6681, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.948626756668091, + "rewards/margins": 6.687900543212891, + "rewards/rejected": -3.7392737865448, + "step": 17136 + }, + { + "epoch": 4.29, + "grad_norm": 3.059465169906616, + "learning_rate": 4.92771197461076e-07, + "logits/chosen": -0.5416865348815918, + "logits/rejected": -0.6513161063194275, + "logps/chosen": -62.65447235107422, + "logps/rejected": -96.44145965576172, + "loss": 0.5865, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.175632953643799, + "rewards/margins": 7.278752326965332, + "rewards/rejected": -4.103118896484375, + "step": 17137 + }, + { + "epoch": 4.29, + "grad_norm": 2.895195722579956, + "learning_rate": 4.92431005332184e-07, + "logits/chosen": -0.5282415151596069, + "logits/rejected": -0.5507537722587585, + "logps/chosen": -44.18551254272461, + "logps/rejected": -106.10372924804688, + "loss": 0.5778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2634758949279785, + "rewards/margins": 6.300778865814209, + "rewards/rejected": -3.0373027324676514, + "step": 17138 + }, + { + "epoch": 4.29, + "grad_norm": 6.239084720611572, + "learning_rate": 4.920909245901162e-07, + "logits/chosen": -0.5206671357154846, + "logits/rejected": -0.5749350190162659, + "logps/chosen": -58.25457763671875, + "logps/rejected": -91.09747314453125, + "loss": 0.6187, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0941145420074463, + "rewards/margins": 4.827498912811279, + "rewards/rejected": -1.7333848476409912, + "step": 17139 + }, + { + "epoch": 4.29, + "grad_norm": 9.140976905822754, + "learning_rate": 4.917509552432776e-07, + "logits/chosen": -0.5388849377632141, + "logits/rejected": -0.5920127630233765, + "logps/chosen": -52.70057678222656, + "logps/rejected": -112.91659545898438, + "loss": 0.5941, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1030635833740234, + "rewards/margins": 7.089841842651367, + "rewards/rejected": -3.9867780208587646, + "step": 17140 + }, + { + "epoch": 4.29, + "grad_norm": 3.2496633529663086, + "learning_rate": 4.914110973000674e-07, + "logits/chosen": -0.567313551902771, + "logits/rejected": -0.6413809657096863, + "logps/chosen": -59.296451568603516, + "logps/rejected": -114.4515609741211, + "loss": 0.5744, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273906707763672, + "rewards/margins": 7.907987117767334, + "rewards/rejected": -4.634080410003662, + "step": 17141 + }, + { + "epoch": 4.29, + "grad_norm": 23.64259147644043, + "learning_rate": 4.910713507688841e-07, + "logits/chosen": -0.6091532707214355, + "logits/rejected": -0.6872855424880981, + "logps/chosen": -44.16979217529297, + "logps/rejected": -104.37883758544922, + "loss": 0.6452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0032403469085693, + "rewards/margins": 7.830801963806152, + "rewards/rejected": -4.827561378479004, + "step": 17142 + }, + { + "epoch": 4.29, + "grad_norm": 4.563237190246582, + "learning_rate": 4.907317156581242e-07, + "logits/chosen": -0.5222553610801697, + "logits/rejected": -0.6295897960662842, + "logps/chosen": -53.5968132019043, + "logps/rejected": -95.33885955810547, + "loss": 0.6214, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.818880081176758, + "rewards/margins": 6.742618560791016, + "rewards/rejected": -3.923738956451416, + "step": 17143 + }, + { + "epoch": 4.29, + "grad_norm": 10.429431915283203, + "learning_rate": 4.903921919761795e-07, + "logits/chosen": -0.5916315317153931, + "logits/rejected": -0.6001330614089966, + "logps/chosen": -47.33446502685547, + "logps/rejected": -109.77555084228516, + "loss": 0.5934, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5935542583465576, + "rewards/margins": 6.352476119995117, + "rewards/rejected": -2.7589213848114014, + "step": 17144 + }, + { + "epoch": 4.29, + "grad_norm": 5.673194885253906, + "learning_rate": 4.900527797314408e-07, + "logits/chosen": -0.5319792032241821, + "logits/rejected": -0.6153594255447388, + "logps/chosen": -52.29240417480469, + "logps/rejected": -99.55742645263672, + "loss": 0.6964, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.047072410583496, + "rewards/margins": 6.747407913208008, + "rewards/rejected": -3.700335741043091, + "step": 17145 + }, + { + "epoch": 4.29, + "grad_norm": 5.137815475463867, + "learning_rate": 4.897134789322955e-07, + "logits/chosen": -0.4588901400566101, + "logits/rejected": -0.5473378300666809, + "logps/chosen": -57.3140869140625, + "logps/rejected": -110.84803771972656, + "loss": 0.656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1501405239105225, + "rewards/margins": 6.551416397094727, + "rewards/rejected": -3.401276111602783, + "step": 17146 + }, + { + "epoch": 4.29, + "grad_norm": 5.850433826446533, + "learning_rate": 4.893742895871262e-07, + "logits/chosen": -0.5300508141517639, + "logits/rejected": -0.5781968235969543, + "logps/chosen": -59.93949890136719, + "logps/rejected": -127.23698425292969, + "loss": 0.7048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.014096975326538, + "rewards/margins": 7.738740921020508, + "rewards/rejected": -4.724643707275391, + "step": 17147 + }, + { + "epoch": 4.29, + "grad_norm": 6.481734275817871, + "learning_rate": 4.890352117043168e-07, + "logits/chosen": -0.5306228995323181, + "logits/rejected": -0.6189078092575073, + "logps/chosen": -59.412967681884766, + "logps/rejected": -107.20953369140625, + "loss": 0.5857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.187742233276367, + "rewards/margins": 7.583784103393555, + "rewards/rejected": -4.396041393280029, + "step": 17148 + }, + { + "epoch": 4.29, + "grad_norm": 4.663182258605957, + "learning_rate": 4.88696245292245e-07, + "logits/chosen": -0.6132041215896606, + "logits/rejected": -0.7358100414276123, + "logps/chosen": -49.34633255004883, + "logps/rejected": -102.56312561035156, + "loss": 0.5642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9286739826202393, + "rewards/margins": 7.6157450675964355, + "rewards/rejected": -4.687071323394775, + "step": 17149 + }, + { + "epoch": 4.29, + "grad_norm": 5.569618225097656, + "learning_rate": 4.88357390359287e-07, + "logits/chosen": -0.5756709575653076, + "logits/rejected": -0.6286450028419495, + "logps/chosen": -55.20204544067383, + "logps/rejected": -115.62825012207031, + "loss": 0.6768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.889594078063965, + "rewards/margins": 6.973413467407227, + "rewards/rejected": -4.083819389343262, + "step": 17150 + }, + { + "epoch": 4.29, + "grad_norm": 7.7055768966674805, + "learning_rate": 4.880186469138165e-07, + "logits/chosen": -0.6252297163009644, + "logits/rejected": -0.7043349742889404, + "logps/chosen": -59.74116134643555, + "logps/rejected": -108.08663940429688, + "loss": 0.7413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4413259029388428, + "rewards/margins": 7.374588966369629, + "rewards/rejected": -3.933262348175049, + "step": 17151 + }, + { + "epoch": 4.29, + "grad_norm": 3.27638578414917, + "learning_rate": 4.876800149642063e-07, + "logits/chosen": -0.5321233868598938, + "logits/rejected": -0.6007357239723206, + "logps/chosen": -49.862274169921875, + "logps/rejected": -110.9477310180664, + "loss": 0.5802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.392026901245117, + "rewards/margins": 8.068686485290527, + "rewards/rejected": -4.67665958404541, + "step": 17152 + }, + { + "epoch": 4.29, + "grad_norm": 5.4158525466918945, + "learning_rate": 4.87341494518821e-07, + "logits/chosen": -0.5704547166824341, + "logits/rejected": -0.6335252523422241, + "logps/chosen": -42.57106018066406, + "logps/rejected": -130.77426147460938, + "loss": 0.5529, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0693771839141846, + "rewards/margins": 7.449909210205078, + "rewards/rejected": -4.3805317878723145, + "step": 17153 + }, + { + "epoch": 4.29, + "grad_norm": 3.218914747238159, + "learning_rate": 4.87003085586027e-07, + "logits/chosen": -0.4956516921520233, + "logits/rejected": -0.6082101464271545, + "logps/chosen": -52.447044372558594, + "logps/rejected": -88.38296508789062, + "loss": 0.5366, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3974313735961914, + "rewards/margins": 7.3159284591674805, + "rewards/rejected": -3.918497323989868, + "step": 17154 + }, + { + "epoch": 4.29, + "grad_norm": 2.6736855506896973, + "learning_rate": 4.866647881741876e-07, + "logits/chosen": -0.5736604928970337, + "logits/rejected": -0.6421650052070618, + "logps/chosen": -66.14525604248047, + "logps/rejected": -108.62793731689453, + "loss": 0.5558, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8822336196899414, + "rewards/margins": 6.76191520690918, + "rewards/rejected": -3.8796818256378174, + "step": 17155 + }, + { + "epoch": 4.29, + "grad_norm": 2.7414472103118896, + "learning_rate": 4.863266022916619e-07, + "logits/chosen": -0.6121600270271301, + "logits/rejected": -0.6819879412651062, + "logps/chosen": -44.47321319580078, + "logps/rejected": -105.2630615234375, + "loss": 0.5097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.11014723777771, + "rewards/margins": 7.754010200500488, + "rewards/rejected": -4.643863201141357, + "step": 17156 + }, + { + "epoch": 4.29, + "grad_norm": 6.810426712036133, + "learning_rate": 4.859885279468063e-07, + "logits/chosen": -0.5061362385749817, + "logits/rejected": -0.4966883063316345, + "logps/chosen": -63.232147216796875, + "logps/rejected": -132.74729919433594, + "loss": 0.6879, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.119817018508911, + "rewards/margins": 5.994668960571289, + "rewards/rejected": -2.874852418899536, + "step": 17157 + }, + { + "epoch": 4.29, + "grad_norm": 6.917740821838379, + "learning_rate": 4.856505651479759e-07, + "logits/chosen": -0.62729412317276, + "logits/rejected": -0.6122661232948303, + "logps/chosen": -73.5529556274414, + "logps/rejected": -112.58659362792969, + "loss": 0.6319, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3605189323425293, + "rewards/margins": 7.440327167510986, + "rewards/rejected": -4.079808712005615, + "step": 17158 + }, + { + "epoch": 4.29, + "grad_norm": 5.0221967697143555, + "learning_rate": 4.853127139035224e-07, + "logits/chosen": -0.5623382329940796, + "logits/rejected": -0.5904672145843506, + "logps/chosen": -44.04985427856445, + "logps/rejected": -104.6625747680664, + "loss": 0.5944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4041590690612793, + "rewards/margins": 6.768860340118408, + "rewards/rejected": -3.364701271057129, + "step": 17159 + }, + { + "epoch": 4.29, + "grad_norm": 2.6056101322174072, + "learning_rate": 4.849749742217924e-07, + "logits/chosen": -0.561231255531311, + "logits/rejected": -0.6441105604171753, + "logps/chosen": -53.67274856567383, + "logps/rejected": -107.5675048828125, + "loss": 0.5411, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1339609622955322, + "rewards/margins": 7.773833751678467, + "rewards/rejected": -4.6398725509643555, + "step": 17160 + }, + { + "epoch": 4.29, + "grad_norm": 6.682713031768799, + "learning_rate": 4.846373461111343e-07, + "logits/chosen": -0.5697150826454163, + "logits/rejected": -0.6332473754882812, + "logps/chosen": -61.52758026123047, + "logps/rejected": -107.35845947265625, + "loss": 0.7591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9003958702087402, + "rewards/margins": 6.727329254150391, + "rewards/rejected": -3.8269333839416504, + "step": 17161 + }, + { + "epoch": 4.29, + "grad_norm": 12.602014541625977, + "learning_rate": 4.842998295798895e-07, + "logits/chosen": -0.5023323893547058, + "logits/rejected": -0.5543906092643738, + "logps/chosen": -59.87008285522461, + "logps/rejected": -121.49119567871094, + "loss": 0.7515, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.273487091064453, + "rewards/margins": 7.1140031814575195, + "rewards/rejected": -3.8405160903930664, + "step": 17162 + }, + { + "epoch": 4.29, + "grad_norm": 6.3941426277160645, + "learning_rate": 4.839624246363988e-07, + "logits/chosen": -0.6149952411651611, + "logits/rejected": -0.6803475618362427, + "logps/chosen": -47.503150939941406, + "logps/rejected": -118.53101348876953, + "loss": 0.629, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.085015296936035, + "rewards/margins": 7.002058029174805, + "rewards/rejected": -3.9170427322387695, + "step": 17163 + }, + { + "epoch": 4.29, + "grad_norm": 6.265228748321533, + "learning_rate": 4.836251312890006e-07, + "logits/chosen": -0.5012975931167603, + "logits/rejected": -0.5998712182044983, + "logps/chosen": -51.6855354309082, + "logps/rejected": -104.35986328125, + "loss": 0.6667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7396483421325684, + "rewards/margins": 6.894490718841553, + "rewards/rejected": -4.154841899871826, + "step": 17164 + }, + { + "epoch": 4.29, + "grad_norm": 5.965298652648926, + "learning_rate": 4.832879495460286e-07, + "logits/chosen": -0.6137482523918152, + "logits/rejected": -0.6780893206596375, + "logps/chosen": -50.926631927490234, + "logps/rejected": -107.47509765625, + "loss": 0.6689, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.744995594024658, + "rewards/margins": 6.332976341247559, + "rewards/rejected": -3.5879805088043213, + "step": 17165 + }, + { + "epoch": 4.29, + "grad_norm": 4.981812477111816, + "learning_rate": 4.829508794158166e-07, + "logits/chosen": -0.5345296263694763, + "logits/rejected": -0.6384708881378174, + "logps/chosen": -54.498756408691406, + "logps/rejected": -91.23623657226562, + "loss": 0.5657, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.054776668548584, + "rewards/margins": 6.363419532775879, + "rewards/rejected": -3.308643102645874, + "step": 17166 + }, + { + "epoch": 4.29, + "grad_norm": 3.6952314376831055, + "learning_rate": 4.826139209066915e-07, + "logits/chosen": -0.4648391008377075, + "logits/rejected": -0.5255970358848572, + "logps/chosen": -67.6341552734375, + "logps/rejected": -110.95478057861328, + "loss": 0.6525, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.837613105773926, + "rewards/margins": 6.011519432067871, + "rewards/rejected": -3.1739070415496826, + "step": 17167 + }, + { + "epoch": 4.29, + "grad_norm": 3.5184128284454346, + "learning_rate": 4.822770740269827e-07, + "logits/chosen": -0.5076741576194763, + "logits/rejected": -0.5664147138595581, + "logps/chosen": -46.34496307373047, + "logps/rejected": -108.97130584716797, + "loss": 0.6134, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0219194889068604, + "rewards/margins": 7.370345115661621, + "rewards/rejected": -4.34842586517334, + "step": 17168 + }, + { + "epoch": 4.3, + "grad_norm": 3.12559175491333, + "learning_rate": 4.819403387850124e-07, + "logits/chosen": -0.5551040172576904, + "logits/rejected": -0.6359055638313293, + "logps/chosen": -50.616024017333984, + "logps/rejected": -112.39855194091797, + "loss": 0.5634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9664883613586426, + "rewards/margins": 7.882609844207764, + "rewards/rejected": -4.916121006011963, + "step": 17169 + }, + { + "epoch": 4.3, + "grad_norm": 3.218655824661255, + "learning_rate": 4.816037151891012e-07, + "logits/chosen": -0.6238681077957153, + "logits/rejected": -0.6731523275375366, + "logps/chosen": -44.833702087402344, + "logps/rejected": -116.69304656982422, + "loss": 0.5596, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.238762855529785, + "rewards/margins": 8.017288208007812, + "rewards/rejected": -4.7785258293151855, + "step": 17170 + }, + { + "epoch": 4.3, + "grad_norm": 3.7265355587005615, + "learning_rate": 4.812672032475691e-07, + "logits/chosen": -0.6367440223693848, + "logits/rejected": -0.7107207179069519, + "logps/chosen": -42.134361267089844, + "logps/rejected": -102.63038635253906, + "loss": 0.5182, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3553202152252197, + "rewards/margins": 7.485600471496582, + "rewards/rejected": -4.130280017852783, + "step": 17171 + }, + { + "epoch": 4.3, + "grad_norm": 3.469292640686035, + "learning_rate": 4.809308029687293e-07, + "logits/chosen": -0.590301513671875, + "logits/rejected": -0.6579101085662842, + "logps/chosen": -64.0870132446289, + "logps/rejected": -117.50764465332031, + "loss": 0.6122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1839237213134766, + "rewards/margins": 8.179582595825195, + "rewards/rejected": -4.995657920837402, + "step": 17172 + }, + { + "epoch": 4.3, + "grad_norm": 2.104825496673584, + "learning_rate": 4.805945143608976e-07, + "logits/chosen": -0.6345157623291016, + "logits/rejected": -0.645560622215271, + "logps/chosen": -46.135169982910156, + "logps/rejected": -115.26813507080078, + "loss": 0.5408, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032740592956543, + "rewards/margins": 6.763688087463379, + "rewards/rejected": -3.730947256088257, + "step": 17173 + }, + { + "epoch": 4.3, + "grad_norm": 2.466552257537842, + "learning_rate": 4.802583374323822e-07, + "logits/chosen": -0.46528419852256775, + "logits/rejected": -0.5557695031166077, + "logps/chosen": -53.016448974609375, + "logps/rejected": -114.17230224609375, + "loss": 0.5576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.259084463119507, + "rewards/margins": 7.937798023223877, + "rewards/rejected": -4.678713798522949, + "step": 17174 + }, + { + "epoch": 4.3, + "grad_norm": 6.3775835037231445, + "learning_rate": 4.799222721914898e-07, + "logits/chosen": -0.5767742395401001, + "logits/rejected": -0.6566661596298218, + "logps/chosen": -49.838497161865234, + "logps/rejected": -99.22330474853516, + "loss": 0.6818, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.91818904876709, + "rewards/margins": 7.348079681396484, + "rewards/rejected": -4.4298906326293945, + "step": 17175 + }, + { + "epoch": 4.3, + "grad_norm": 4.175844669342041, + "learning_rate": 4.795863186465272e-07, + "logits/chosen": -0.5475135445594788, + "logits/rejected": -0.624085545539856, + "logps/chosen": -51.6419677734375, + "logps/rejected": -101.36461639404297, + "loss": 0.6072, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2153186798095703, + "rewards/margins": 7.674851417541504, + "rewards/rejected": -4.459532737731934, + "step": 17176 + }, + { + "epoch": 4.3, + "grad_norm": 3.248903512954712, + "learning_rate": 4.792504768057942e-07, + "logits/chosen": -0.45084288716316223, + "logits/rejected": -0.534503161907196, + "logps/chosen": -66.9458236694336, + "logps/rejected": -117.70677947998047, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1558279991149902, + "rewards/margins": 7.640906810760498, + "rewards/rejected": -4.485079288482666, + "step": 17177 + }, + { + "epoch": 4.3, + "grad_norm": 7.487334728240967, + "learning_rate": 4.789147466775895e-07, + "logits/chosen": -0.5605586767196655, + "logits/rejected": -0.6460603475570679, + "logps/chosen": -56.75425338745117, + "logps/rejected": -84.54335021972656, + "loss": 0.6879, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.051147937774658, + "rewards/margins": 5.931710243225098, + "rewards/rejected": -2.8805623054504395, + "step": 17178 + }, + { + "epoch": 4.3, + "grad_norm": 5.530473232269287, + "learning_rate": 4.785791282702107e-07, + "logits/chosen": -0.5413122177124023, + "logits/rejected": -0.6465504169464111, + "logps/chosen": -68.6911392211914, + "logps/rejected": -95.73120880126953, + "loss": 0.6366, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3948705196380615, + "rewards/margins": 6.868561267852783, + "rewards/rejected": -3.4736905097961426, + "step": 17179 + }, + { + "epoch": 4.3, + "grad_norm": 4.754073143005371, + "learning_rate": 4.782436215919522e-07, + "logits/chosen": -0.5550121068954468, + "logits/rejected": -0.6199900507926941, + "logps/chosen": -55.264801025390625, + "logps/rejected": -110.39881896972656, + "loss": 0.6357, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6374928951263428, + "rewards/margins": 7.3910980224609375, + "rewards/rejected": -3.753605365753174, + "step": 17180 + }, + { + "epoch": 4.3, + "grad_norm": 4.9354119300842285, + "learning_rate": 4.77908226651102e-07, + "logits/chosen": -0.5151528120040894, + "logits/rejected": -0.605792760848999, + "logps/chosen": -60.18019485473633, + "logps/rejected": -118.08126831054688, + "loss": 0.5901, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.001805067062378, + "rewards/margins": 6.968747615814209, + "rewards/rejected": -3.966942548751831, + "step": 17181 + }, + { + "epoch": 4.3, + "grad_norm": 4.486373424530029, + "learning_rate": 4.775729434559489e-07, + "logits/chosen": -0.5130572319030762, + "logits/rejected": -0.556754469871521, + "logps/chosen": -53.108951568603516, + "logps/rejected": -126.73805236816406, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0084383487701416, + "rewards/margins": 7.776703834533691, + "rewards/rejected": -4.768265247344971, + "step": 17182 + }, + { + "epoch": 4.3, + "grad_norm": 2.676762580871582, + "learning_rate": 4.772377720147797e-07, + "logits/chosen": -0.5345539450645447, + "logits/rejected": -0.6249762177467346, + "logps/chosen": -57.11437225341797, + "logps/rejected": -112.65815734863281, + "loss": 0.6285, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0799577236175537, + "rewards/margins": 7.491157531738281, + "rewards/rejected": -4.411200046539307, + "step": 17183 + }, + { + "epoch": 4.3, + "grad_norm": 7.973464012145996, + "learning_rate": 4.76902712335876e-07, + "logits/chosen": -0.6167144775390625, + "logits/rejected": -0.6820608377456665, + "logps/chosen": -62.509124755859375, + "logps/rejected": -100.5335464477539, + "loss": 0.628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2612252235412598, + "rewards/margins": 6.562673091888428, + "rewards/rejected": -3.301448345184326, + "step": 17184 + }, + { + "epoch": 4.3, + "grad_norm": 5.891883850097656, + "learning_rate": 4.765677644275163e-07, + "logits/chosen": -0.5963810086250305, + "logits/rejected": -0.6559062600135803, + "logps/chosen": -56.911216735839844, + "logps/rejected": -95.36398315429688, + "loss": 0.712, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.177685499191284, + "rewards/margins": 6.787476539611816, + "rewards/rejected": -3.6097912788391113, + "step": 17185 + }, + { + "epoch": 4.3, + "grad_norm": 3.8564975261688232, + "learning_rate": 4.762329282979794e-07, + "logits/chosen": -0.5917686223983765, + "logits/rejected": -0.6948013305664062, + "logps/chosen": -50.084754943847656, + "logps/rejected": -106.82173919677734, + "loss": 0.6121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.381500244140625, + "rewards/margins": 7.9821391105651855, + "rewards/rejected": -4.6006388664245605, + "step": 17186 + }, + { + "epoch": 4.3, + "grad_norm": 3.896606922149658, + "learning_rate": 4.758982039555382e-07, + "logits/chosen": -0.47699859738349915, + "logits/rejected": -0.5720890164375305, + "logps/chosen": -63.47797775268555, + "logps/rejected": -99.74417114257812, + "loss": 0.6564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3676376342773438, + "rewards/margins": 6.475298881530762, + "rewards/rejected": -3.107661485671997, + "step": 17187 + }, + { + "epoch": 4.3, + "grad_norm": 6.521575927734375, + "learning_rate": 4.7556359140846386e-07, + "logits/chosen": -0.5085421800613403, + "logits/rejected": -0.580585777759552, + "logps/chosen": -61.02549743652344, + "logps/rejected": -91.77919006347656, + "loss": 0.6537, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.228266954421997, + "rewards/margins": 5.415356636047363, + "rewards/rejected": -2.187089681625366, + "step": 17188 + }, + { + "epoch": 4.3, + "grad_norm": 3.411275863647461, + "learning_rate": 4.752290906650264e-07, + "logits/chosen": -0.563960611820221, + "logits/rejected": -0.6231747269630432, + "logps/chosen": -61.14088439941406, + "logps/rejected": -117.36256408691406, + "loss": 0.5917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3122811317443848, + "rewards/margins": 7.673172473907471, + "rewards/rejected": -4.360891342163086, + "step": 17189 + }, + { + "epoch": 4.3, + "grad_norm": 4.096463203430176, + "learning_rate": 4.748947017334904e-07, + "logits/chosen": -0.5702559947967529, + "logits/rejected": -0.6764501333236694, + "logps/chosen": -58.374671936035156, + "logps/rejected": -109.12921142578125, + "loss": 0.5683, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5206897258758545, + "rewards/margins": 8.40703296661377, + "rewards/rejected": -4.886343002319336, + "step": 17190 + }, + { + "epoch": 4.3, + "grad_norm": 5.935420036315918, + "learning_rate": 4.745604246221197e-07, + "logits/chosen": -0.4543909728527069, + "logits/rejected": -0.5618083477020264, + "logps/chosen": -65.1204833984375, + "logps/rejected": -93.60743713378906, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9109385013580322, + "rewards/margins": 6.219468116760254, + "rewards/rejected": -3.308529853820801, + "step": 17191 + }, + { + "epoch": 4.3, + "grad_norm": 4.857524394989014, + "learning_rate": 4.7422625933917445e-07, + "logits/chosen": -0.4916670620441437, + "logits/rejected": -0.5595299005508423, + "logps/chosen": -52.233421325683594, + "logps/rejected": -118.6558609008789, + "loss": 0.5906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.252183437347412, + "rewards/margins": 7.881908416748047, + "rewards/rejected": -4.629724025726318, + "step": 17192 + }, + { + "epoch": 4.3, + "grad_norm": 3.4931137561798096, + "learning_rate": 4.738922058929113e-07, + "logits/chosen": -0.5823020935058594, + "logits/rejected": -0.6448832154273987, + "logps/chosen": -48.894386291503906, + "logps/rejected": -105.68295288085938, + "loss": 0.5667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.49298095703125, + "rewards/margins": 7.387395858764648, + "rewards/rejected": -3.8944146633148193, + "step": 17193 + }, + { + "epoch": 4.3, + "grad_norm": 4.644961357116699, + "learning_rate": 4.7355826429158703e-07, + "logits/chosen": -0.4487488567829132, + "logits/rejected": -0.5321097373962402, + "logps/chosen": -60.69554138183594, + "logps/rejected": -111.09405517578125, + "loss": 0.6428, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9061074256896973, + "rewards/margins": 7.150628566741943, + "rewards/rejected": -4.244521141052246, + "step": 17194 + }, + { + "epoch": 4.3, + "grad_norm": 2.286470413208008, + "learning_rate": 4.732244345434528e-07, + "logits/chosen": -0.5277103185653687, + "logits/rejected": -0.6075917482376099, + "logps/chosen": -49.36058044433594, + "logps/rejected": -103.92341613769531, + "loss": 0.5424, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9693777561187744, + "rewards/margins": 7.536190509796143, + "rewards/rejected": -4.5668134689331055, + "step": 17195 + }, + { + "epoch": 4.3, + "grad_norm": 7.288760662078857, + "learning_rate": 4.7289071665675647e-07, + "logits/chosen": -0.5131563544273376, + "logits/rejected": -0.618205726146698, + "logps/chosen": -55.97241973876953, + "logps/rejected": -98.93003845214844, + "loss": 0.6401, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1288692951202393, + "rewards/margins": 6.911115646362305, + "rewards/rejected": -3.7822463512420654, + "step": 17196 + }, + { + "epoch": 4.3, + "grad_norm": 15.269232749938965, + "learning_rate": 4.7255711063974587e-07, + "logits/chosen": -0.6041082143783569, + "logits/rejected": -0.7313517332077026, + "logps/chosen": -55.14081954956055, + "logps/rejected": -118.45556640625, + "loss": 0.6871, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.713555097579956, + "rewards/margins": 8.439970970153809, + "rewards/rejected": -5.726415157318115, + "step": 17197 + }, + { + "epoch": 4.3, + "grad_norm": 1.7781438827514648, + "learning_rate": 4.722236165006655e-07, + "logits/chosen": -0.527744472026825, + "logits/rejected": -0.609108567237854, + "logps/chosen": -49.056434631347656, + "logps/rejected": -109.53308868408203, + "loss": 0.5449, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1537532806396484, + "rewards/margins": 8.620635032653809, + "rewards/rejected": -5.46688175201416, + "step": 17198 + }, + { + "epoch": 4.3, + "grad_norm": 4.880141258239746, + "learning_rate": 4.7189023424775547e-07, + "logits/chosen": -0.571112871170044, + "logits/rejected": -0.644734263420105, + "logps/chosen": -50.49416732788086, + "logps/rejected": -95.11238098144531, + "loss": 0.6142, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1653618812561035, + "rewards/margins": 6.418067932128906, + "rewards/rejected": -3.2527060508728027, + "step": 17199 + }, + { + "epoch": 4.3, + "grad_norm": 3.3364572525024414, + "learning_rate": 4.715569638892531e-07, + "logits/chosen": -0.4822606146335602, + "logits/rejected": -0.5681519508361816, + "logps/chosen": -60.51856231689453, + "logps/rejected": -114.8375244140625, + "loss": 0.6116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.392326593399048, + "rewards/margins": 7.826476573944092, + "rewards/rejected": -4.434150218963623, + "step": 17200 + }, + { + "epoch": 4.3, + "grad_norm": 5.584160327911377, + "learning_rate": 4.712238054333956e-07, + "logits/chosen": -0.5643677115440369, + "logits/rejected": -0.6700338125228882, + "logps/chosen": -59.39918518066406, + "logps/rejected": -100.55277252197266, + "loss": 0.7117, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.42808198928833, + "rewards/margins": 7.047654151916504, + "rewards/rejected": -3.619572401046753, + "step": 17201 + }, + { + "epoch": 4.3, + "grad_norm": 5.663692474365234, + "learning_rate": 4.708907588884154e-07, + "logits/chosen": -0.5791788697242737, + "logits/rejected": -0.645156979560852, + "logps/chosen": -58.3798828125, + "logps/rejected": -120.06294250488281, + "loss": 0.6256, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.649871587753296, + "rewards/margins": 6.654362678527832, + "rewards/rejected": -4.004489898681641, + "step": 17202 + }, + { + "epoch": 4.3, + "grad_norm": 6.288808822631836, + "learning_rate": 4.705578242625408e-07, + "logits/chosen": -0.5887793898582458, + "logits/rejected": -0.6618026494979858, + "logps/chosen": -55.89396667480469, + "logps/rejected": -110.68611145019531, + "loss": 0.7016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0642473697662354, + "rewards/margins": 6.841776371002197, + "rewards/rejected": -3.777528762817383, + "step": 17203 + }, + { + "epoch": 4.3, + "grad_norm": 2.268517017364502, + "learning_rate": 4.702250015640009e-07, + "logits/chosen": -0.5328686833381653, + "logits/rejected": -0.613177478313446, + "logps/chosen": -51.35245895385742, + "logps/rejected": -111.93986511230469, + "loss": 0.5193, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.299952268600464, + "rewards/margins": 7.371681213378906, + "rewards/rejected": -4.071728706359863, + "step": 17204 + }, + { + "epoch": 4.3, + "grad_norm": 4.676119804382324, + "learning_rate": 4.6989229080101897e-07, + "logits/chosen": -0.5059297680854797, + "logits/rejected": -0.636425793170929, + "logps/chosen": -61.11333465576172, + "logps/rejected": -98.92567443847656, + "loss": 0.5474, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.068014621734619, + "rewards/margins": 7.6358866691589355, + "rewards/rejected": -4.567871570587158, + "step": 17205 + }, + { + "epoch": 4.3, + "grad_norm": 2.810375452041626, + "learning_rate": 4.695596919818168e-07, + "logits/chosen": -0.5766321420669556, + "logits/rejected": -0.6573368310928345, + "logps/chosen": -50.1424446105957, + "logps/rejected": -109.80331420898438, + "loss": 0.5092, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.335296630859375, + "rewards/margins": 8.488052368164062, + "rewards/rejected": -5.1527557373046875, + "step": 17206 + }, + { + "epoch": 4.3, + "grad_norm": 3.5315754413604736, + "learning_rate": 4.6922720511461404e-07, + "logits/chosen": -0.4858980178833008, + "logits/rejected": -0.5798826217651367, + "logps/chosen": -61.35469436645508, + "logps/rejected": -125.31526184082031, + "loss": 0.6034, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1211838722229004, + "rewards/margins": 8.584095001220703, + "rewards/rejected": -5.462911605834961, + "step": 17207 + }, + { + "epoch": 4.3, + "grad_norm": 8.066465377807617, + "learning_rate": 4.6889483020762515e-07, + "logits/chosen": -0.48500046133995056, + "logits/rejected": -0.5349777936935425, + "logps/chosen": -49.08326721191406, + "logps/rejected": -109.67823791503906, + "loss": 0.6921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089470148086548, + "rewards/margins": 5.767118453979492, + "rewards/rejected": -2.6776480674743652, + "step": 17208 + }, + { + "epoch": 4.31, + "grad_norm": 6.684508800506592, + "learning_rate": 4.6856256726906526e-07, + "logits/chosen": -0.515110194683075, + "logits/rejected": -0.6068439483642578, + "logps/chosen": -60.36812210083008, + "logps/rejected": -94.95508575439453, + "loss": 0.681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0040578842163086, + "rewards/margins": 6.840394973754883, + "rewards/rejected": -3.8363358974456787, + "step": 17209 + }, + { + "epoch": 4.31, + "grad_norm": 6.2035136222839355, + "learning_rate": 4.682304163071433e-07, + "logits/chosen": -0.5542963147163391, + "logits/rejected": -0.6488609910011292, + "logps/chosen": -60.11756896972656, + "logps/rejected": -90.2784194946289, + "loss": 0.7959, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.910914897918701, + "rewards/margins": 5.020008563995361, + "rewards/rejected": -2.109093427658081, + "step": 17210 + }, + { + "epoch": 4.31, + "grad_norm": 4.53764009475708, + "learning_rate": 4.6789837733006935e-07, + "logits/chosen": -0.6016151905059814, + "logits/rejected": -0.6966632008552551, + "logps/chosen": -52.136714935302734, + "logps/rejected": -117.23945617675781, + "loss": 0.6181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.804302215576172, + "rewards/margins": 9.018928527832031, + "rewards/rejected": -6.214626312255859, + "step": 17211 + }, + { + "epoch": 4.31, + "grad_norm": 5.6961774826049805, + "learning_rate": 4.675664503460464e-07, + "logits/chosen": -0.5853286385536194, + "logits/rejected": -0.6708399653434753, + "logps/chosen": -50.24421691894531, + "logps/rejected": -105.05736541748047, + "loss": 0.5981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0654172897338867, + "rewards/margins": 6.8261189460754395, + "rewards/rejected": -3.7607014179229736, + "step": 17212 + }, + { + "epoch": 4.31, + "grad_norm": 12.18039608001709, + "learning_rate": 4.672346353632773e-07, + "logits/chosen": -0.566631019115448, + "logits/rejected": -0.6583774089813232, + "logps/chosen": -58.716064453125, + "logps/rejected": -107.68050384521484, + "loss": 0.675, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.119472026824951, + "rewards/margins": 7.855778694152832, + "rewards/rejected": -4.736306190490723, + "step": 17213 + }, + { + "epoch": 4.31, + "grad_norm": 3.6141085624694824, + "learning_rate": 4.6690293238996155e-07, + "logits/chosen": -0.573323667049408, + "logits/rejected": -0.6751729249954224, + "logps/chosen": -55.657814025878906, + "logps/rejected": -106.26685333251953, + "loss": 0.5704, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.724302291870117, + "rewards/margins": 7.897618293762207, + "rewards/rejected": -5.173316478729248, + "step": 17214 + }, + { + "epoch": 4.31, + "grad_norm": 3.534517288208008, + "learning_rate": 4.6657134143429594e-07, + "logits/chosen": -0.5206462740898132, + "logits/rejected": -0.5718967914581299, + "logps/chosen": -48.667938232421875, + "logps/rejected": -121.47549438476562, + "loss": 0.5408, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052398681640625, + "rewards/margins": 7.31072998046875, + "rewards/rejected": -4.258330821990967, + "step": 17215 + }, + { + "epoch": 4.31, + "grad_norm": 2.2508819103240967, + "learning_rate": 4.6623986250447504e-07, + "logits/chosen": -0.4893842339515686, + "logits/rejected": -0.5387893319129944, + "logps/chosen": -49.12865447998047, + "logps/rejected": -114.32886505126953, + "loss": 0.48, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.911475896835327, + "rewards/margins": 7.4128007888793945, + "rewards/rejected": -4.501324653625488, + "step": 17216 + }, + { + "epoch": 4.31, + "grad_norm": 4.171233177185059, + "learning_rate": 4.65908495608689e-07, + "logits/chosen": -0.6042202711105347, + "logits/rejected": -0.66581791639328, + "logps/chosen": -53.62574005126953, + "logps/rejected": -107.7393798828125, + "loss": 0.6161, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8954591751098633, + "rewards/margins": 7.012756824493408, + "rewards/rejected": -4.117298126220703, + "step": 17217 + }, + { + "epoch": 4.31, + "grad_norm": 5.3664445877075195, + "learning_rate": 4.6557724075512623e-07, + "logits/chosen": -0.510603666305542, + "logits/rejected": -0.6216782331466675, + "logps/chosen": -56.78931427001953, + "logps/rejected": -105.0966567993164, + "loss": 0.5441, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0262866020202637, + "rewards/margins": 7.685328006744385, + "rewards/rejected": -4.659041404724121, + "step": 17218 + }, + { + "epoch": 4.31, + "grad_norm": 3.277045726776123, + "learning_rate": 4.6524609795197407e-07, + "logits/chosen": -0.5570729970932007, + "logits/rejected": -0.6316643953323364, + "logps/chosen": -55.91438293457031, + "logps/rejected": -101.00359344482422, + "loss": 0.6139, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.165217399597168, + "rewards/margins": 7.550243377685547, + "rewards/rejected": -4.385025501251221, + "step": 17219 + }, + { + "epoch": 4.31, + "grad_norm": 4.2666425704956055, + "learning_rate": 4.6491506720741376e-07, + "logits/chosen": -0.4898146390914917, + "logits/rejected": -0.5291151404380798, + "logps/chosen": -59.885677337646484, + "logps/rejected": -112.07821655273438, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0625038146972656, + "rewards/margins": 6.496440410614014, + "rewards/rejected": -3.43393611907959, + "step": 17220 + }, + { + "epoch": 4.31, + "grad_norm": 4.047080993652344, + "learning_rate": 4.645841485296249e-07, + "logits/chosen": -0.650066077709198, + "logits/rejected": -0.7089422941207886, + "logps/chosen": -51.25213623046875, + "logps/rejected": -109.71266174316406, + "loss": 0.6625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.961422920227051, + "rewards/margins": 7.214465618133545, + "rewards/rejected": -4.253042221069336, + "step": 17221 + }, + { + "epoch": 4.31, + "grad_norm": 4.353343963623047, + "learning_rate": 4.6425334192678585e-07, + "logits/chosen": -0.5964240431785583, + "logits/rejected": -0.6757954955101013, + "logps/chosen": -55.885948181152344, + "logps/rejected": -105.62129211425781, + "loss": 0.6411, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.359574317932129, + "rewards/margins": 7.661076068878174, + "rewards/rejected": -4.301502227783203, + "step": 17222 + }, + { + "epoch": 4.31, + "grad_norm": 6.2403717041015625, + "learning_rate": 4.639226474070735e-07, + "logits/chosen": -0.5484839677810669, + "logits/rejected": -0.6519910097122192, + "logps/chosen": -55.35082244873047, + "logps/rejected": -104.1684341430664, + "loss": 0.655, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2679476737976074, + "rewards/margins": 6.48089075088501, + "rewards/rejected": -3.2129433155059814, + "step": 17223 + }, + { + "epoch": 4.31, + "grad_norm": 10.105761528015137, + "learning_rate": 4.6359206497865515e-07, + "logits/chosen": -0.5426417589187622, + "logits/rejected": -0.6194903254508972, + "logps/chosen": -65.97642517089844, + "logps/rejected": -93.74836730957031, + "loss": 0.7353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9696645736694336, + "rewards/margins": 6.211274147033691, + "rewards/rejected": -3.2416093349456787, + "step": 17224 + }, + { + "epoch": 4.31, + "grad_norm": 3.5580105781555176, + "learning_rate": 4.6326159464970153e-07, + "logits/chosen": -0.6147202253341675, + "logits/rejected": -0.6960886120796204, + "logps/chosen": -46.474365234375, + "logps/rejected": -95.99345397949219, + "loss": 0.5812, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2104897499084473, + "rewards/margins": 7.725383758544922, + "rewards/rejected": -4.514894008636475, + "step": 17225 + }, + { + "epoch": 4.31, + "grad_norm": 2.706751585006714, + "learning_rate": 4.629312364283811e-07, + "logits/chosen": -0.5216655731201172, + "logits/rejected": -0.6525784134864807, + "logps/chosen": -54.62546157836914, + "logps/rejected": -112.2708740234375, + "loss": 0.5149, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3668482303619385, + "rewards/margins": 8.272887229919434, + "rewards/rejected": -4.906038284301758, + "step": 17226 + }, + { + "epoch": 4.31, + "grad_norm": 4.449039936065674, + "learning_rate": 4.626009903228551e-07, + "logits/chosen": -0.5378985404968262, + "logits/rejected": -0.6082484722137451, + "logps/chosen": -53.61271286010742, + "logps/rejected": -118.26592254638672, + "loss": 0.5451, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.294421434402466, + "rewards/margins": 7.365470886230469, + "rewards/rejected": -4.071049213409424, + "step": 17227 + }, + { + "epoch": 4.31, + "grad_norm": 4.583466529846191, + "learning_rate": 4.622708563412842e-07, + "logits/chosen": -0.6084495782852173, + "logits/rejected": -0.6601347923278809, + "logps/chosen": -52.212867736816406, + "logps/rejected": -97.30009460449219, + "loss": 0.672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2988147735595703, + "rewards/margins": 6.228480815887451, + "rewards/rejected": -2.92966628074646, + "step": 17228 + }, + { + "epoch": 4.31, + "grad_norm": 18.7125301361084, + "learning_rate": 4.61940834491828e-07, + "logits/chosen": -0.5471606254577637, + "logits/rejected": -0.6156477332115173, + "logps/chosen": -56.14463806152344, + "logps/rejected": -113.38815307617188, + "loss": 0.6287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.910212993621826, + "rewards/margins": 6.644984722137451, + "rewards/rejected": -3.7347710132598877, + "step": 17229 + }, + { + "epoch": 4.31, + "grad_norm": 2.9549672603607178, + "learning_rate": 4.616109247826406e-07, + "logits/chosen": -0.553064227104187, + "logits/rejected": -0.6687483787536621, + "logps/chosen": -53.14906311035156, + "logps/rejected": -109.99669647216797, + "loss": 0.5725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2700142860412598, + "rewards/margins": 8.291555404663086, + "rewards/rejected": -5.021541118621826, + "step": 17230 + }, + { + "epoch": 4.31, + "grad_norm": 31.312307357788086, + "learning_rate": 4.612811272218731e-07, + "logits/chosen": -0.5567078590393066, + "logits/rejected": -0.6096463203430176, + "logps/chosen": -54.445884704589844, + "logps/rejected": -122.830322265625, + "loss": 0.8663, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0635063648223877, + "rewards/margins": 7.557697296142578, + "rewards/rejected": -4.494192123413086, + "step": 17231 + }, + { + "epoch": 4.31, + "grad_norm": 8.198087692260742, + "learning_rate": 4.6095144181767806e-07, + "logits/chosen": -0.6284998059272766, + "logits/rejected": -0.648996889591217, + "logps/chosen": -50.43960189819336, + "logps/rejected": -98.86648559570312, + "loss": 0.6577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.589322566986084, + "rewards/margins": 6.889675617218018, + "rewards/rejected": -3.3003532886505127, + "step": 17232 + }, + { + "epoch": 4.31, + "grad_norm": 3.939929246902466, + "learning_rate": 4.6062186857819936e-07, + "logits/chosen": -0.5245048403739929, + "logits/rejected": -0.5450419783592224, + "logps/chosen": -50.55215072631836, + "logps/rejected": -103.6192398071289, + "loss": 0.676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0497238636016846, + "rewards/margins": 5.872736930847168, + "rewards/rejected": -2.8230133056640625, + "step": 17233 + }, + { + "epoch": 4.31, + "grad_norm": 2.5513579845428467, + "learning_rate": 4.6029240751158386e-07, + "logits/chosen": -0.5135528445243835, + "logits/rejected": -0.6192814111709595, + "logps/chosen": -47.293155670166016, + "logps/rejected": -100.90557098388672, + "loss": 0.4732, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7892343997955322, + "rewards/margins": 7.009912490844727, + "rewards/rejected": -4.220677375793457, + "step": 17234 + }, + { + "epoch": 4.31, + "grad_norm": 7.188262462615967, + "learning_rate": 4.5996305862597177e-07, + "logits/chosen": -0.5539479851722717, + "logits/rejected": -0.656377911567688, + "logps/chosen": -54.61111831665039, + "logps/rejected": -87.75526428222656, + "loss": 0.6543, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8970253467559814, + "rewards/margins": 6.802379131317139, + "rewards/rejected": -3.9053537845611572, + "step": 17235 + }, + { + "epoch": 4.31, + "grad_norm": 2.7993483543395996, + "learning_rate": 4.596338219295005e-07, + "logits/chosen": -0.5954126119613647, + "logits/rejected": -0.7007803916931152, + "logps/chosen": -50.42068099975586, + "logps/rejected": -87.1099624633789, + "loss": 0.5628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3403689861297607, + "rewards/margins": 6.735958576202393, + "rewards/rejected": -3.395589828491211, + "step": 17236 + }, + { + "epoch": 4.31, + "grad_norm": 6.197394371032715, + "learning_rate": 4.593046974303078e-07, + "logits/chosen": -0.5941339731216431, + "logits/rejected": -0.6326965689659119, + "logps/chosen": -47.14168167114258, + "logps/rejected": -103.83073425292969, + "loss": 0.6013, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0877902507781982, + "rewards/margins": 7.4086408615112305, + "rewards/rejected": -4.320849895477295, + "step": 17237 + }, + { + "epoch": 4.31, + "grad_norm": 4.984918117523193, + "learning_rate": 4.589756851365251e-07, + "logits/chosen": -0.5978299379348755, + "logits/rejected": -0.678124189376831, + "logps/chosen": -53.42725372314453, + "logps/rejected": -110.96173095703125, + "loss": 0.6499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.079455852508545, + "rewards/margins": 7.732493877410889, + "rewards/rejected": -4.65303897857666, + "step": 17238 + }, + { + "epoch": 4.31, + "grad_norm": 8.064079284667969, + "learning_rate": 4.5864678505628315e-07, + "logits/chosen": -0.5141929984092712, + "logits/rejected": -0.601454496383667, + "logps/chosen": -65.56534576416016, + "logps/rejected": -103.90983581542969, + "loss": 0.7412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9782512187957764, + "rewards/margins": 6.580723285675049, + "rewards/rejected": -3.6024723052978516, + "step": 17239 + }, + { + "epoch": 4.31, + "grad_norm": 14.524001121520996, + "learning_rate": 4.583179971977092e-07, + "logits/chosen": -0.4915888011455536, + "logits/rejected": -0.5287632346153259, + "logps/chosen": -55.02507781982422, + "logps/rejected": -103.31687927246094, + "loss": 0.8022, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.749995708465576, + "rewards/margins": 6.74943733215332, + "rewards/rejected": -3.999441623687744, + "step": 17240 + }, + { + "epoch": 4.31, + "grad_norm": 2.971184015274048, + "learning_rate": 4.579893215689302e-07, + "logits/chosen": -0.5780261754989624, + "logits/rejected": -0.6581416130065918, + "logps/chosen": -46.52522659301758, + "logps/rejected": -113.33354949951172, + "loss": 0.5441, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3255882263183594, + "rewards/margins": 8.015509605407715, + "rewards/rejected": -4.689920902252197, + "step": 17241 + }, + { + "epoch": 4.31, + "grad_norm": 2.4331204891204834, + "learning_rate": 4.57660758178064e-07, + "logits/chosen": -0.5222111344337463, + "logits/rejected": -0.6231788396835327, + "logps/chosen": -51.60577392578125, + "logps/rejected": -104.22334289550781, + "loss": 0.5504, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4135448932647705, + "rewards/margins": 7.185478210449219, + "rewards/rejected": -3.7719335556030273, + "step": 17242 + }, + { + "epoch": 4.31, + "grad_norm": 4.123523712158203, + "learning_rate": 4.5733230703323253e-07, + "logits/chosen": -0.5356298685073853, + "logits/rejected": -0.6056544184684753, + "logps/chosen": -52.392005920410156, + "logps/rejected": -108.29043579101562, + "loss": 0.5805, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.036532402038574, + "rewards/margins": 7.030484676361084, + "rewards/rejected": -3.9939515590667725, + "step": 17243 + }, + { + "epoch": 4.31, + "grad_norm": 4.39988374710083, + "learning_rate": 4.57003968142552e-07, + "logits/chosen": -0.5626444220542908, + "logits/rejected": -0.6201784610748291, + "logps/chosen": -52.44261932373047, + "logps/rejected": -92.54039001464844, + "loss": 0.6177, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1869170665740967, + "rewards/margins": 6.5010480880737305, + "rewards/rejected": -3.314131259918213, + "step": 17244 + }, + { + "epoch": 4.31, + "grad_norm": 8.832709312438965, + "learning_rate": 4.5667574151413597e-07, + "logits/chosen": -0.536814272403717, + "logits/rejected": -0.6233255863189697, + "logps/chosen": -61.304481506347656, + "logps/rejected": -104.9079818725586, + "loss": 0.7025, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3531785011291504, + "rewards/margins": 7.135744094848633, + "rewards/rejected": -3.7825653553009033, + "step": 17245 + }, + { + "epoch": 4.31, + "grad_norm": 3.8505959510803223, + "learning_rate": 4.563476271560935e-07, + "logits/chosen": -0.5319430828094482, + "logits/rejected": -0.6021534204483032, + "logps/chosen": -63.97254943847656, + "logps/rejected": -106.21125793457031, + "loss": 0.6918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.954102039337158, + "rewards/margins": 6.792396545410156, + "rewards/rejected": -3.838294267654419, + "step": 17246 + }, + { + "epoch": 4.31, + "grad_norm": 4.746644020080566, + "learning_rate": 4.5601962507653477e-07, + "logits/chosen": -0.5712934732437134, + "logits/rejected": -0.6510542631149292, + "logps/chosen": -45.82908630371094, + "logps/rejected": -88.8143081665039, + "loss": 0.6331, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.359828472137451, + "rewards/margins": 6.192338943481445, + "rewards/rejected": -2.832510471343994, + "step": 17247 + }, + { + "epoch": 4.31, + "grad_norm": 2.480319023132324, + "learning_rate": 4.556917352835649e-07, + "logits/chosen": -0.4700598120689392, + "logits/rejected": -0.5246603488922119, + "logps/chosen": -76.24373626708984, + "logps/rejected": -117.73604583740234, + "loss": 0.6011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9960670471191406, + "rewards/margins": 7.922066688537598, + "rewards/rejected": -4.926000118255615, + "step": 17248 + }, + { + "epoch": 4.32, + "grad_norm": 3.01314377784729, + "learning_rate": 4.5536395778528475e-07, + "logits/chosen": -0.5633657574653625, + "logits/rejected": -0.6418696045875549, + "logps/chosen": -48.19721221923828, + "logps/rejected": -102.38958740234375, + "loss": 0.5388, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.209368944168091, + "rewards/margins": 6.979588985443115, + "rewards/rejected": -3.7702202796936035, + "step": 17249 + }, + { + "epoch": 4.32, + "grad_norm": 6.417053699493408, + "learning_rate": 4.550362925897955e-07, + "logits/chosen": -0.5635420083999634, + "logits/rejected": -0.5994784235954285, + "logps/chosen": -49.47877502441406, + "logps/rejected": -117.84947967529297, + "loss": 0.6489, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2719757556915283, + "rewards/margins": 7.133060932159424, + "rewards/rejected": -3.8610854148864746, + "step": 17250 + }, + { + "epoch": 4.32, + "grad_norm": 2.7636473178863525, + "learning_rate": 4.5470873970519305e-07, + "logits/chosen": -0.6123406291007996, + "logits/rejected": -0.7094895243644714, + "logps/chosen": -46.559959411621094, + "logps/rejected": -98.61365509033203, + "loss": 0.5055, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385228157043457, + "rewards/margins": 7.665448188781738, + "rewards/rejected": -4.280220031738281, + "step": 17251 + }, + { + "epoch": 4.32, + "grad_norm": 6.158541202545166, + "learning_rate": 4.5438129913957353e-07, + "logits/chosen": -0.5201119184494019, + "logits/rejected": -0.5618784427642822, + "logps/chosen": -69.09925079345703, + "logps/rejected": -100.68101501464844, + "loss": 0.6796, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.915436029434204, + "rewards/margins": 5.422088146209717, + "rewards/rejected": -2.506652355194092, + "step": 17252 + }, + { + "epoch": 4.32, + "grad_norm": 9.463324546813965, + "learning_rate": 4.5405397090102555e-07, + "logits/chosen": -0.5924564003944397, + "logits/rejected": -0.658210813999176, + "logps/chosen": -47.237754821777344, + "logps/rejected": -82.26049041748047, + "loss": 0.6624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0141422748565674, + "rewards/margins": 5.39915132522583, + "rewards/rejected": -2.3850090503692627, + "step": 17253 + }, + { + "epoch": 4.32, + "grad_norm": 4.0507049560546875, + "learning_rate": 4.537267549976404e-07, + "logits/chosen": -0.5603436231613159, + "logits/rejected": -0.6345347762107849, + "logps/chosen": -50.01706314086914, + "logps/rejected": -102.51805114746094, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.408874750137329, + "rewards/margins": 7.0514302253723145, + "rewards/rejected": -3.6425559520721436, + "step": 17254 + }, + { + "epoch": 4.32, + "grad_norm": 5.137632846832275, + "learning_rate": 4.533996514375033e-07, + "logits/chosen": -0.602321982383728, + "logits/rejected": -0.6985318660736084, + "logps/chosen": -56.359745025634766, + "logps/rejected": -95.13037872314453, + "loss": 0.5885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3327369689941406, + "rewards/margins": 6.215449333190918, + "rewards/rejected": -2.8827123641967773, + "step": 17255 + }, + { + "epoch": 4.32, + "grad_norm": 3.2227792739868164, + "learning_rate": 4.53072660228695e-07, + "logits/chosen": -0.5878615379333496, + "logits/rejected": -0.6455934643745422, + "logps/chosen": -50.213294982910156, + "logps/rejected": -100.74610900878906, + "loss": 0.5666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4961085319519043, + "rewards/margins": 6.327523708343506, + "rewards/rejected": -2.831415891647339, + "step": 17256 + }, + { + "epoch": 4.32, + "grad_norm": 8.357312202453613, + "learning_rate": 4.5274578137929905e-07, + "logits/chosen": -0.5953492522239685, + "logits/rejected": -0.6044847369194031, + "logps/chosen": -44.38706970214844, + "logps/rejected": -95.21971893310547, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5445165634155273, + "rewards/margins": 5.7476301193237305, + "rewards/rejected": -3.203113317489624, + "step": 17257 + }, + { + "epoch": 4.32, + "grad_norm": 7.240762233734131, + "learning_rate": 4.524190148973906e-07, + "logits/chosen": -0.5343993902206421, + "logits/rejected": -0.6388531923294067, + "logps/chosen": -49.58381652832031, + "logps/rejected": -111.58969116210938, + "loss": 0.6591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9952056407928467, + "rewards/margins": 7.286038398742676, + "rewards/rejected": -4.290832996368408, + "step": 17258 + }, + { + "epoch": 4.32, + "grad_norm": 4.451662540435791, + "learning_rate": 4.5209236079104603e-07, + "logits/chosen": -0.5848662853240967, + "logits/rejected": -0.6663522124290466, + "logps/chosen": -51.41212844848633, + "logps/rejected": -93.8117446899414, + "loss": 0.5512, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1766068935394287, + "rewards/margins": 6.683489799499512, + "rewards/rejected": -3.506883382797241, + "step": 17259 + }, + { + "epoch": 4.32, + "grad_norm": 2.4025959968566895, + "learning_rate": 4.517658190683366e-07, + "logits/chosen": -0.5427948236465454, + "logits/rejected": -0.6414746046066284, + "logps/chosen": -48.830963134765625, + "logps/rejected": -125.94479370117188, + "loss": 0.5473, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194417715072632, + "rewards/margins": 9.687167167663574, + "rewards/rejected": -6.492749214172363, + "step": 17260 + }, + { + "epoch": 4.32, + "grad_norm": 5.987517356872559, + "learning_rate": 4.5143938973733035e-07, + "logits/chosen": -0.523368775844574, + "logits/rejected": -0.5711720585823059, + "logps/chosen": -51.60083770751953, + "logps/rejected": -102.33845520019531, + "loss": 0.528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.216268301010132, + "rewards/margins": 6.889533996582031, + "rewards/rejected": -3.6732661724090576, + "step": 17261 + }, + { + "epoch": 4.32, + "grad_norm": 6.696113109588623, + "learning_rate": 4.511130728060964e-07, + "logits/chosen": -0.5371785163879395, + "logits/rejected": -0.6403939127922058, + "logps/chosen": -67.30470275878906, + "logps/rejected": -104.55534362792969, + "loss": 0.6448, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.685415267944336, + "rewards/margins": 6.473475933074951, + "rewards/rejected": -3.7880609035491943, + "step": 17262 + }, + { + "epoch": 4.32, + "grad_norm": 2.849536418914795, + "learning_rate": 4.50786868282696e-07, + "logits/chosen": -0.48855435848236084, + "logits/rejected": -0.5614721775054932, + "logps/chosen": -53.48114013671875, + "logps/rejected": -110.71573638916016, + "loss": 0.552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1990113258361816, + "rewards/margins": 7.532669544219971, + "rewards/rejected": -4.333658218383789, + "step": 17263 + }, + { + "epoch": 4.32, + "grad_norm": 7.609947204589844, + "learning_rate": 4.504607761751906e-07, + "logits/chosen": -0.5483987331390381, + "logits/rejected": -0.6106401681900024, + "logps/chosen": -72.90946960449219, + "logps/rejected": -127.38592529296875, + "loss": 0.6796, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164679765701294, + "rewards/margins": 8.242790222167969, + "rewards/rejected": -5.0781097412109375, + "step": 17264 + }, + { + "epoch": 4.32, + "grad_norm": 3.2785260677337646, + "learning_rate": 4.5013479649163804e-07, + "logits/chosen": -0.5281540751457214, + "logits/rejected": -0.6149390339851379, + "logps/chosen": -49.509822845458984, + "logps/rejected": -116.35517120361328, + "loss": 0.5332, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.110856533050537, + "rewards/margins": 8.095471382141113, + "rewards/rejected": -4.984614372253418, + "step": 17265 + }, + { + "epoch": 4.32, + "grad_norm": 4.542805194854736, + "learning_rate": 4.498089292400965e-07, + "logits/chosen": -0.5151679515838623, + "logits/rejected": -0.6235852241516113, + "logps/chosen": -64.68206024169922, + "logps/rejected": -93.69945526123047, + "loss": 0.6683, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1695375442504883, + "rewards/margins": 5.714841365814209, + "rewards/rejected": -2.5453040599823, + "step": 17266 + }, + { + "epoch": 4.32, + "grad_norm": 10.685977935791016, + "learning_rate": 4.4948317442861376e-07, + "logits/chosen": -0.5403850078582764, + "logits/rejected": -0.5889075994491577, + "logps/chosen": -55.407527923583984, + "logps/rejected": -121.12704467773438, + "loss": 0.6393, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.178692102432251, + "rewards/margins": 6.362683296203613, + "rewards/rejected": -3.1839914321899414, + "step": 17267 + }, + { + "epoch": 4.32, + "grad_norm": 3.8948566913604736, + "learning_rate": 4.491575320652425e-07, + "logits/chosen": -0.5476555228233337, + "logits/rejected": -0.6025574207305908, + "logps/chosen": -46.31167221069336, + "logps/rejected": -100.0434341430664, + "loss": 0.5464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3774566650390625, + "rewards/margins": 6.7540740966796875, + "rewards/rejected": -3.376617431640625, + "step": 17268 + }, + { + "epoch": 4.32, + "grad_norm": 4.636667728424072, + "learning_rate": 4.4883200215803055e-07, + "logits/chosen": -0.4959303140640259, + "logits/rejected": -0.6245642900466919, + "logps/chosen": -52.38748550415039, + "logps/rejected": -92.41731262207031, + "loss": 0.5688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4916558265686035, + "rewards/margins": 8.083166122436523, + "rewards/rejected": -4.5915093421936035, + "step": 17269 + }, + { + "epoch": 4.32, + "grad_norm": 2.892889976501465, + "learning_rate": 4.485065847150188e-07, + "logits/chosen": -0.6083179712295532, + "logits/rejected": -0.6805707812309265, + "logps/chosen": -53.65241241455078, + "logps/rejected": -96.53851318359375, + "loss": 0.5731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.082540273666382, + "rewards/margins": 7.346377372741699, + "rewards/rejected": -4.263835906982422, + "step": 17270 + }, + { + "epoch": 4.32, + "grad_norm": 3.5246875286102295, + "learning_rate": 4.4818127974425086e-07, + "logits/chosen": -0.6043487787246704, + "logits/rejected": -0.630674421787262, + "logps/chosen": -63.30592727661133, + "logps/rejected": -124.50006103515625, + "loss": 0.716, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.512352228164673, + "rewards/margins": 7.443592071533203, + "rewards/rejected": -3.931239366531372, + "step": 17271 + }, + { + "epoch": 4.32, + "grad_norm": 4.369362831115723, + "learning_rate": 4.478560872537657e-07, + "logits/chosen": -0.612575888633728, + "logits/rejected": -0.7329589128494263, + "logps/chosen": -60.884986877441406, + "logps/rejected": -86.37381744384766, + "loss": 0.5786, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8137366771698, + "rewards/margins": 6.948576927185059, + "rewards/rejected": -4.134840488433838, + "step": 17272 + }, + { + "epoch": 4.32, + "grad_norm": 5.672970294952393, + "learning_rate": 4.4753100725159815e-07, + "logits/chosen": -0.6260401010513306, + "logits/rejected": -0.7112330198287964, + "logps/chosen": -50.12360382080078, + "logps/rejected": -92.31571960449219, + "loss": 0.6164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1927034854888916, + "rewards/margins": 7.072930335998535, + "rewards/rejected": -3.8802266120910645, + "step": 17273 + }, + { + "epoch": 4.32, + "grad_norm": 2.9548397064208984, + "learning_rate": 4.4720603974578114e-07, + "logits/chosen": -0.5902913808822632, + "logits/rejected": -0.6475258469581604, + "logps/chosen": -50.75586700439453, + "logps/rejected": -94.88312530517578, + "loss": 0.5632, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9442508220672607, + "rewards/margins": 6.248660564422607, + "rewards/rejected": -3.3044097423553467, + "step": 17274 + }, + { + "epoch": 4.32, + "grad_norm": 6.645341396331787, + "learning_rate": 4.468811847443466e-07, + "logits/chosen": -0.5313019752502441, + "logits/rejected": -0.6331826448440552, + "logps/chosen": -46.0391845703125, + "logps/rejected": -112.372802734375, + "loss": 0.578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.181544542312622, + "rewards/margins": 8.757545471191406, + "rewards/rejected": -5.576000690460205, + "step": 17275 + }, + { + "epoch": 4.32, + "grad_norm": 4.6933135986328125, + "learning_rate": 4.4655644225532035e-07, + "logits/chosen": -0.587822437286377, + "logits/rejected": -0.6817838549613953, + "logps/chosen": -64.07276916503906, + "logps/rejected": -89.15066528320312, + "loss": 0.7196, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.897829055786133, + "rewards/margins": 6.05568265914917, + "rewards/rejected": -3.157853841781616, + "step": 17276 + }, + { + "epoch": 4.32, + "grad_norm": 5.914195537567139, + "learning_rate": 4.4623181228672706e-07, + "logits/chosen": -0.5750482082366943, + "logits/rejected": -0.6618088483810425, + "logps/chosen": -60.30554962158203, + "logps/rejected": -109.16972351074219, + "loss": 0.6734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3244099617004395, + "rewards/margins": 8.099601745605469, + "rewards/rejected": -4.7751922607421875, + "step": 17277 + }, + { + "epoch": 4.32, + "grad_norm": 10.335238456726074, + "learning_rate": 4.4590729484659035e-07, + "logits/chosen": -0.6006282567977905, + "logits/rejected": -0.696438729763031, + "logps/chosen": -56.598854064941406, + "logps/rejected": -86.1309814453125, + "loss": 0.6775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6720356941223145, + "rewards/margins": 7.450982093811035, + "rewards/rejected": -3.7789463996887207, + "step": 17278 + }, + { + "epoch": 4.32, + "grad_norm": 2.374619483947754, + "learning_rate": 4.455828899429271e-07, + "logits/chosen": -0.5784783959388733, + "logits/rejected": -0.6146308183670044, + "logps/chosen": -57.329139709472656, + "logps/rejected": -128.27146911621094, + "loss": 0.5839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0971598625183105, + "rewards/margins": 7.0602874755859375, + "rewards/rejected": -3.963127613067627, + "step": 17279 + }, + { + "epoch": 4.32, + "grad_norm": 4.663280487060547, + "learning_rate": 4.452585975837559e-07, + "logits/chosen": -0.6049014925956726, + "logits/rejected": -0.6849141120910645, + "logps/chosen": -50.40409851074219, + "logps/rejected": -97.67683410644531, + "loss": 0.6199, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1093826293945312, + "rewards/margins": 7.1687235832214355, + "rewards/rejected": -4.059340953826904, + "step": 17280 + }, + { + "epoch": 4.32, + "grad_norm": 3.1655375957489014, + "learning_rate": 4.4493441777708924e-07, + "logits/chosen": -0.48090311884880066, + "logits/rejected": -0.5922973155975342, + "logps/chosen": -67.05082702636719, + "logps/rejected": -124.8630142211914, + "loss": 0.6211, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0810225009918213, + "rewards/margins": 7.566054821014404, + "rewards/rejected": -4.485032558441162, + "step": 17281 + }, + { + "epoch": 4.32, + "grad_norm": 5.849298477172852, + "learning_rate": 4.4461035053093736e-07, + "logits/chosen": -0.5233349800109863, + "logits/rejected": -0.5739427804946899, + "logps/chosen": -47.67561340332031, + "logps/rejected": -102.18867492675781, + "loss": 0.6742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9611475467681885, + "rewards/margins": 7.079066276550293, + "rewards/rejected": -4.117918968200684, + "step": 17282 + }, + { + "epoch": 4.32, + "grad_norm": 3.209831953048706, + "learning_rate": 4.442863958533095e-07, + "logits/chosen": -0.5897702574729919, + "logits/rejected": -0.6387959122657776, + "logps/chosen": -53.885536193847656, + "logps/rejected": -126.31161499023438, + "loss": 0.567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.240903854370117, + "rewards/margins": 8.109758377075195, + "rewards/rejected": -4.868854522705078, + "step": 17283 + }, + { + "epoch": 4.32, + "grad_norm": 7.230466842651367, + "learning_rate": 4.4396255375221185e-07, + "logits/chosen": -0.636198103427887, + "logits/rejected": -0.7165419459342957, + "logps/chosen": -51.392425537109375, + "logps/rejected": -91.75611877441406, + "loss": 0.588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.223612070083618, + "rewards/margins": 7.149569988250732, + "rewards/rejected": -3.925957679748535, + "step": 17284 + }, + { + "epoch": 4.32, + "grad_norm": 5.157993793487549, + "learning_rate": 4.4363882423564376e-07, + "logits/chosen": -0.49804162979125977, + "logits/rejected": -0.6089857816696167, + "logps/chosen": -66.28275299072266, + "logps/rejected": -119.40971374511719, + "loss": 0.7448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.805734872817993, + "rewards/margins": 6.907418727874756, + "rewards/rejected": -4.101683616638184, + "step": 17285 + }, + { + "epoch": 4.32, + "grad_norm": 5.663863182067871, + "learning_rate": 4.433152073116065e-07, + "logits/chosen": -0.510036051273346, + "logits/rejected": -0.6023195385932922, + "logps/chosen": -53.48836898803711, + "logps/rejected": -88.55615234375, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.127898693084717, + "rewards/margins": 6.633875846862793, + "rewards/rejected": -3.5059773921966553, + "step": 17286 + }, + { + "epoch": 4.32, + "grad_norm": 3.1434707641601562, + "learning_rate": 4.4299170298809814e-07, + "logits/chosen": -0.5705791711807251, + "logits/rejected": -0.6464391946792603, + "logps/chosen": -42.25996398925781, + "logps/rejected": -106.291015625, + "loss": 0.5683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2613303661346436, + "rewards/margins": 8.107129096984863, + "rewards/rejected": -4.845798492431641, + "step": 17287 + }, + { + "epoch": 4.32, + "grad_norm": 3.1443564891815186, + "learning_rate": 4.4266831127311227e-07, + "logits/chosen": -0.5323208570480347, + "logits/rejected": -0.6584757566452026, + "logps/chosen": -59.04100036621094, + "logps/rejected": -95.12733459472656, + "loss": 0.5417, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2063558101654053, + "rewards/margins": 6.416399002075195, + "rewards/rejected": -3.210043430328369, + "step": 17288 + }, + { + "epoch": 4.33, + "grad_norm": 3.953927516937256, + "learning_rate": 4.423450321746381e-07, + "logits/chosen": -0.5129484534263611, + "logits/rejected": -0.6550779342651367, + "logps/chosen": -65.08599853515625, + "logps/rejected": -93.89824676513672, + "loss": 0.6419, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0199975967407227, + "rewards/margins": 6.757690906524658, + "rewards/rejected": -3.7376937866210938, + "step": 17289 + }, + { + "epoch": 4.33, + "grad_norm": 3.7976248264312744, + "learning_rate": 4.4202186570066753e-07, + "logits/chosen": -0.4656979441642761, + "logits/rejected": -0.5806121826171875, + "logps/chosen": -69.15745544433594, + "logps/rejected": -112.82809448242188, + "loss": 0.65, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0257155895233154, + "rewards/margins": 8.077120780944824, + "rewards/rejected": -5.05140495300293, + "step": 17290 + }, + { + "epoch": 4.33, + "grad_norm": 5.444403648376465, + "learning_rate": 4.416988118591847e-07, + "logits/chosen": -0.5702154636383057, + "logits/rejected": -0.6865127086639404, + "logps/chosen": -54.28316116333008, + "logps/rejected": -84.5418701171875, + "loss": 0.6207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9725182056427, + "rewards/margins": 6.723116874694824, + "rewards/rejected": -3.750598907470703, + "step": 17291 + }, + { + "epoch": 4.33, + "grad_norm": 5.820425987243652, + "learning_rate": 4.4137587065817165e-07, + "logits/chosen": -0.5439040064811707, + "logits/rejected": -0.646739661693573, + "logps/chosen": -50.210758209228516, + "logps/rejected": -103.46092987060547, + "loss": 0.6, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.86277437210083, + "rewards/margins": 7.626321792602539, + "rewards/rejected": -4.763546943664551, + "step": 17292 + }, + { + "epoch": 4.33, + "grad_norm": 2.1791465282440186, + "learning_rate": 4.410530421056103e-07, + "logits/chosen": -0.5592228174209595, + "logits/rejected": -0.6857959628105164, + "logps/chosen": -57.869384765625, + "logps/rejected": -104.49665069580078, + "loss": 0.5897, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0915186405181885, + "rewards/margins": 8.474422454833984, + "rewards/rejected": -5.382903575897217, + "step": 17293 + }, + { + "epoch": 4.33, + "grad_norm": 4.417165279388428, + "learning_rate": 4.407303262094775e-07, + "logits/chosen": -0.5074832439422607, + "logits/rejected": -0.5935478210449219, + "logps/chosen": -59.75160598754883, + "logps/rejected": -111.14509582519531, + "loss": 0.5691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0598526000976562, + "rewards/margins": 7.696810245513916, + "rewards/rejected": -4.636957168579102, + "step": 17294 + }, + { + "epoch": 4.33, + "grad_norm": 3.9298360347747803, + "learning_rate": 4.40407722977747e-07, + "logits/chosen": -0.5849540829658508, + "logits/rejected": -0.6655006408691406, + "logps/chosen": -58.147605895996094, + "logps/rejected": -113.5676040649414, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1416211128234863, + "rewards/margins": 8.373560905456543, + "rewards/rejected": -5.231939792633057, + "step": 17295 + }, + { + "epoch": 4.33, + "grad_norm": 5.563135623931885, + "learning_rate": 4.400852324183924e-07, + "logits/chosen": -0.4560648798942566, + "logits/rejected": -0.576633870601654, + "logps/chosen": -65.81654357910156, + "logps/rejected": -98.2673110961914, + "loss": 0.633, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912186622619629, + "rewards/margins": 7.0165276527404785, + "rewards/rejected": -4.104341506958008, + "step": 17296 + }, + { + "epoch": 4.33, + "grad_norm": 3.5392119884490967, + "learning_rate": 4.397628545393806e-07, + "logits/chosen": -0.5191980004310608, + "logits/rejected": -0.6086177825927734, + "logps/chosen": -57.041908264160156, + "logps/rejected": -109.41439819335938, + "loss": 0.6066, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8423542976379395, + "rewards/margins": 7.0582427978515625, + "rewards/rejected": -4.215888500213623, + "step": 17297 + }, + { + "epoch": 4.33, + "grad_norm": 1.7719614505767822, + "learning_rate": 4.3944058934868025e-07, + "logits/chosen": -0.4779495596885681, + "logits/rejected": -0.5830110907554626, + "logps/chosen": -79.36328887939453, + "logps/rejected": -109.08636474609375, + "loss": 0.6158, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2006564140319824, + "rewards/margins": 7.62567138671875, + "rewards/rejected": -4.425015449523926, + "step": 17298 + }, + { + "epoch": 4.33, + "grad_norm": 21.330486297607422, + "learning_rate": 4.391184368542528e-07, + "logits/chosen": -0.64227294921875, + "logits/rejected": -0.700295090675354, + "logps/chosen": -61.329383850097656, + "logps/rejected": -102.13143157958984, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.157681941986084, + "rewards/margins": 6.325221061706543, + "rewards/rejected": -3.16753888130188, + "step": 17299 + }, + { + "epoch": 4.33, + "grad_norm": 3.2567756175994873, + "learning_rate": 4.387963970640613e-07, + "logits/chosen": -0.6011884212493896, + "logits/rejected": -0.6921302676200867, + "logps/chosen": -64.28305053710938, + "logps/rejected": -114.95831298828125, + "loss": 0.61, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4326820373535156, + "rewards/margins": 8.020475387573242, + "rewards/rejected": -4.58779239654541, + "step": 17300 + }, + { + "epoch": 4.33, + "grad_norm": 8.644594192504883, + "learning_rate": 4.384744699860616e-07, + "logits/chosen": -0.56000155210495, + "logits/rejected": -0.5713344812393188, + "logps/chosen": -56.53940963745117, + "logps/rejected": -128.73574829101562, + "loss": 0.7237, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8124072551727295, + "rewards/margins": 6.52790641784668, + "rewards/rejected": -3.71549916267395, + "step": 17301 + }, + { + "epoch": 4.33, + "grad_norm": 5.492056369781494, + "learning_rate": 4.3815265562820954e-07, + "logits/chosen": -0.575982928276062, + "logits/rejected": -0.6409919261932373, + "logps/chosen": -51.610755920410156, + "logps/rejected": -99.7381591796875, + "loss": 0.6381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2410783767700195, + "rewards/margins": 6.990242004394531, + "rewards/rejected": -3.7491629123687744, + "step": 17302 + }, + { + "epoch": 4.33, + "grad_norm": 5.157958507537842, + "learning_rate": 4.378309539984577e-07, + "logits/chosen": -0.5382438898086548, + "logits/rejected": -0.5787158608436584, + "logps/chosen": -54.46733856201172, + "logps/rejected": -103.37642669677734, + "loss": 0.6847, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2336649894714355, + "rewards/margins": 5.9231085777282715, + "rewards/rejected": -2.689443826675415, + "step": 17303 + }, + { + "epoch": 4.33, + "grad_norm": 2.4213743209838867, + "learning_rate": 4.375093651047552e-07, + "logits/chosen": -0.5781853199005127, + "logits/rejected": -0.6735227108001709, + "logps/chosen": -49.28736114501953, + "logps/rejected": -119.63341522216797, + "loss": 0.5035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.123845100402832, + "rewards/margins": 8.981603622436523, + "rewards/rejected": -5.857758522033691, + "step": 17304 + }, + { + "epoch": 4.33, + "grad_norm": 8.82383918762207, + "learning_rate": 4.371878889550496e-07, + "logits/chosen": -0.5165571570396423, + "logits/rejected": -0.6245399713516235, + "logps/chosen": -56.45145034790039, + "logps/rejected": -84.60966491699219, + "loss": 0.655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.707174301147461, + "rewards/margins": 6.308664321899414, + "rewards/rejected": -3.601489782333374, + "step": 17305 + }, + { + "epoch": 4.33, + "grad_norm": 3.41098952293396, + "learning_rate": 4.368665255572846e-07, + "logits/chosen": -0.5353381037712097, + "logits/rejected": -0.5912378430366516, + "logps/chosen": -56.00922775268555, + "logps/rejected": -99.04386138916016, + "loss": 0.5541, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.144073009490967, + "rewards/margins": 5.730320453643799, + "rewards/rejected": -2.586247682571411, + "step": 17306 + }, + { + "epoch": 4.33, + "grad_norm": 6.098256587982178, + "learning_rate": 4.3654527491940046e-07, + "logits/chosen": -0.5625780820846558, + "logits/rejected": -0.6529776453971863, + "logps/chosen": -46.12887954711914, + "logps/rejected": -108.79969024658203, + "loss": 0.6459, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965503215789795, + "rewards/margins": 7.4095540046691895, + "rewards/rejected": -4.4440507888793945, + "step": 17307 + }, + { + "epoch": 4.33, + "grad_norm": 4.372624397277832, + "learning_rate": 4.3622413704933644e-07, + "logits/chosen": -0.4812659025192261, + "logits/rejected": -0.595558226108551, + "logps/chosen": -53.41405487060547, + "logps/rejected": -100.63801574707031, + "loss": 0.5429, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.109315872192383, + "rewards/margins": 7.486196517944336, + "rewards/rejected": -4.376880645751953, + "step": 17308 + }, + { + "epoch": 4.33, + "grad_norm": 6.307508945465088, + "learning_rate": 4.3590311195503054e-07, + "logits/chosen": -0.5435411930084229, + "logits/rejected": -0.6031220555305481, + "logps/chosen": -61.98925018310547, + "logps/rejected": -111.97496795654297, + "loss": 0.6516, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.252202033996582, + "rewards/margins": 7.286293029785156, + "rewards/rejected": -4.034090995788574, + "step": 17309 + }, + { + "epoch": 4.33, + "grad_norm": 11.704668998718262, + "learning_rate": 4.3558219964441094e-07, + "logits/chosen": -0.5587209463119507, + "logits/rejected": -0.6469653248786926, + "logps/chosen": -61.19413757324219, + "logps/rejected": -128.43910217285156, + "loss": 0.652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8370749950408936, + "rewards/margins": 7.132901191711426, + "rewards/rejected": -4.2958269119262695, + "step": 17310 + }, + { + "epoch": 4.33, + "grad_norm": 4.09207820892334, + "learning_rate": 4.352614001254102e-07, + "logits/chosen": -0.5718337893486023, + "logits/rejected": -0.6310059428215027, + "logps/chosen": -48.33669662475586, + "logps/rejected": -117.50459289550781, + "loss": 0.5231, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.865057945251465, + "rewards/margins": 7.316208362579346, + "rewards/rejected": -4.451150417327881, + "step": 17311 + }, + { + "epoch": 4.33, + "grad_norm": 3.2875092029571533, + "learning_rate": 4.349407134059574e-07, + "logits/chosen": -0.48881134390830994, + "logits/rejected": -0.5652956962585449, + "logps/chosen": -60.19065475463867, + "logps/rejected": -110.726806640625, + "loss": 0.6311, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0808122158050537, + "rewards/margins": 6.840813636779785, + "rewards/rejected": -3.7600018978118896, + "step": 17312 + }, + { + "epoch": 4.33, + "grad_norm": 3.7888474464416504, + "learning_rate": 4.346201394939736e-07, + "logits/chosen": -0.5942952632904053, + "logits/rejected": -0.6906589269638062, + "logps/chosen": -50.15412139892578, + "logps/rejected": -94.3580322265625, + "loss": 0.5845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1335959434509277, + "rewards/margins": 6.965456962585449, + "rewards/rejected": -3.8318607807159424, + "step": 17313 + }, + { + "epoch": 4.33, + "grad_norm": 4.755474090576172, + "learning_rate": 4.3429967839738187e-07, + "logits/chosen": -0.5332751870155334, + "logits/rejected": -0.62917560338974, + "logps/chosen": -57.530357360839844, + "logps/rejected": -113.78140258789062, + "loss": 0.6835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.214590549468994, + "rewards/margins": 7.575425148010254, + "rewards/rejected": -4.360835075378418, + "step": 17314 + }, + { + "epoch": 4.33, + "grad_norm": 5.957252025604248, + "learning_rate": 4.3397933012410243e-07, + "logits/chosen": -0.56094890832901, + "logits/rejected": -0.6666647791862488, + "logps/chosen": -79.18978881835938, + "logps/rejected": -88.81932830810547, + "loss": 0.7227, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2223522663116455, + "rewards/margins": 5.8411736488342285, + "rewards/rejected": -2.618821382522583, + "step": 17315 + }, + { + "epoch": 4.33, + "grad_norm": 3.185129404067993, + "learning_rate": 4.336590946820501e-07, + "logits/chosen": -0.5514196157455444, + "logits/rejected": -0.6285220384597778, + "logps/chosen": -58.54645538330078, + "logps/rejected": -105.3926773071289, + "loss": 0.669, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.115527391433716, + "rewards/margins": 7.5019097328186035, + "rewards/rejected": -4.386382579803467, + "step": 17316 + }, + { + "epoch": 4.33, + "grad_norm": 5.319897174835205, + "learning_rate": 4.333389720791381e-07, + "logits/chosen": -0.5004851818084717, + "logits/rejected": -0.5582371950149536, + "logps/chosen": -56.31155776977539, + "logps/rejected": -101.03572082519531, + "loss": 0.708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1277291774749756, + "rewards/margins": 7.043116569519043, + "rewards/rejected": -3.9153871536254883, + "step": 17317 + }, + { + "epoch": 4.33, + "grad_norm": 6.707228660583496, + "learning_rate": 4.330189623232778e-07, + "logits/chosen": -0.5318743586540222, + "logits/rejected": -0.5784049034118652, + "logps/chosen": -57.4156379699707, + "logps/rejected": -106.178955078125, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1174445152282715, + "rewards/margins": 7.191255569458008, + "rewards/rejected": -4.073810577392578, + "step": 17318 + }, + { + "epoch": 4.33, + "grad_norm": 3.8582537174224854, + "learning_rate": 4.3269906542237674e-07, + "logits/chosen": -0.5746065378189087, + "logits/rejected": -0.6452153921127319, + "logps/chosen": -49.42336654663086, + "logps/rejected": -100.61738586425781, + "loss": 0.673, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9047915935516357, + "rewards/margins": 6.735210418701172, + "rewards/rejected": -3.830418586730957, + "step": 17319 + }, + { + "epoch": 4.33, + "grad_norm": 3.4162063598632812, + "learning_rate": 4.3237928138433913e-07, + "logits/chosen": -0.5482997298240662, + "logits/rejected": -0.6590029001235962, + "logps/chosen": -49.077484130859375, + "logps/rejected": -97.00502014160156, + "loss": 0.5696, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1421918869018555, + "rewards/margins": 7.765658855438232, + "rewards/rejected": -4.623467922210693, + "step": 17320 + }, + { + "epoch": 4.33, + "grad_norm": 1.6817868947982788, + "learning_rate": 4.320596102170682e-07, + "logits/chosen": -0.5227280855178833, + "logits/rejected": -0.6248342990875244, + "logps/chosen": -69.2765884399414, + "logps/rejected": -118.76268768310547, + "loss": 0.543, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4162206649780273, + "rewards/margins": 8.635924339294434, + "rewards/rejected": -5.219703197479248, + "step": 17321 + }, + { + "epoch": 4.33, + "grad_norm": 6.288252830505371, + "learning_rate": 4.3174005192846257e-07, + "logits/chosen": -0.6716047525405884, + "logits/rejected": -0.7689223885536194, + "logps/chosen": -39.50632858276367, + "logps/rejected": -108.97998046875, + "loss": 0.5319, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385446548461914, + "rewards/margins": 7.933755874633789, + "rewards/rejected": -4.548309326171875, + "step": 17322 + }, + { + "epoch": 4.33, + "grad_norm": 5.611845016479492, + "learning_rate": 4.3142060652642034e-07, + "logits/chosen": -0.5054697394371033, + "logits/rejected": -0.6116944551467896, + "logps/chosen": -69.91069793701172, + "logps/rejected": -108.02999114990234, + "loss": 0.6313, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4997847080230713, + "rewards/margins": 7.458162784576416, + "rewards/rejected": -4.958377361297607, + "step": 17323 + }, + { + "epoch": 4.33, + "grad_norm": 6.548311233520508, + "learning_rate": 4.311012740188336e-07, + "logits/chosen": -0.5370632410049438, + "logits/rejected": -0.6234486103057861, + "logps/chosen": -48.134674072265625, + "logps/rejected": -108.53254699707031, + "loss": 0.575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4856836795806885, + "rewards/margins": 7.115863800048828, + "rewards/rejected": -3.6301794052124023, + "step": 17324 + }, + { + "epoch": 4.33, + "grad_norm": 6.104543209075928, + "learning_rate": 4.307820544135938e-07, + "logits/chosen": -0.5669267773628235, + "logits/rejected": -0.601093590259552, + "logps/chosen": -52.8717155456543, + "logps/rejected": -96.10757446289062, + "loss": 0.6908, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2218666076660156, + "rewards/margins": 6.044710159301758, + "rewards/rejected": -2.8228440284729004, + "step": 17325 + }, + { + "epoch": 4.33, + "grad_norm": 2.441383123397827, + "learning_rate": 4.304629477185901e-07, + "logits/chosen": -0.5353819131851196, + "logits/rejected": -0.6440202593803406, + "logps/chosen": -54.488304138183594, + "logps/rejected": -96.60173034667969, + "loss": 0.5307, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.322254180908203, + "rewards/margins": 7.196251392364502, + "rewards/rejected": -3.873997211456299, + "step": 17326 + }, + { + "epoch": 4.33, + "grad_norm": 3.581951856613159, + "learning_rate": 4.3014395394170736e-07, + "logits/chosen": -0.5591742992401123, + "logits/rejected": -0.5904121398925781, + "logps/chosen": -53.92799377441406, + "logps/rejected": -120.89663696289062, + "loss": 0.6498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.748471260070801, + "rewards/margins": 7.630438804626465, + "rewards/rejected": -4.881967067718506, + "step": 17327 + }, + { + "epoch": 4.33, + "grad_norm": 4.47470760345459, + "learning_rate": 4.2982507309082713e-07, + "logits/chosen": -0.6090275049209595, + "logits/rejected": -0.6487123966217041, + "logps/chosen": -54.78241729736328, + "logps/rejected": -92.30644989013672, + "loss": 0.6758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5403361320495605, + "rewards/margins": 5.4017462730407715, + "rewards/rejected": -1.861409306526184, + "step": 17328 + }, + { + "epoch": 4.34, + "grad_norm": 14.757704734802246, + "learning_rate": 4.295063051738307e-07, + "logits/chosen": -0.5866104960441589, + "logits/rejected": -0.6633754968643188, + "logps/chosen": -47.64811706542969, + "logps/rejected": -98.80059051513672, + "loss": 0.6191, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.940450668334961, + "rewards/margins": 7.049953460693359, + "rewards/rejected": -4.109501838684082, + "step": 17329 + }, + { + "epoch": 4.34, + "grad_norm": 4.256051063537598, + "learning_rate": 4.2918765019859587e-07, + "logits/chosen": -0.645759105682373, + "logits/rejected": -0.7096558809280396, + "logps/chosen": -53.519012451171875, + "logps/rejected": -99.40440368652344, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5506510734558105, + "rewards/margins": 6.316539764404297, + "rewards/rejected": -2.7658891677856445, + "step": 17330 + }, + { + "epoch": 4.34, + "grad_norm": 10.587103843688965, + "learning_rate": 4.288691081729957e-07, + "logits/chosen": -0.5023328065872192, + "logits/rejected": -0.6154042482376099, + "logps/chosen": -54.32454299926758, + "logps/rejected": -82.4619369506836, + "loss": 0.6505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931591510772705, + "rewards/margins": 5.869132041931152, + "rewards/rejected": -2.937540292739868, + "step": 17331 + }, + { + "epoch": 4.34, + "grad_norm": 5.300501346588135, + "learning_rate": 4.28550679104901e-07, + "logits/chosen": -0.5640300512313843, + "logits/rejected": -0.6477391719818115, + "logps/chosen": -47.78422927856445, + "logps/rejected": -94.78779602050781, + "loss": 0.5376, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0461647510528564, + "rewards/margins": 6.367298126220703, + "rewards/rejected": -3.3211328983306885, + "step": 17332 + }, + { + "epoch": 4.34, + "grad_norm": 8.780359268188477, + "learning_rate": 4.282323630021823e-07, + "logits/chosen": -0.557326078414917, + "logits/rejected": -0.6637554168701172, + "logps/chosen": -58.16501235961914, + "logps/rejected": -103.45061492919922, + "loss": 0.5927, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9654970169067383, + "rewards/margins": 7.546018600463867, + "rewards/rejected": -4.580522060394287, + "step": 17333 + }, + { + "epoch": 4.34, + "grad_norm": 5.822732925415039, + "learning_rate": 4.279141598727049e-07, + "logits/chosen": -0.5213417410850525, + "logits/rejected": -0.6241400241851807, + "logps/chosen": -46.233558654785156, + "logps/rejected": -106.41130828857422, + "loss": 0.5446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2990469932556152, + "rewards/margins": 8.285106658935547, + "rewards/rejected": -4.98606014251709, + "step": 17334 + }, + { + "epoch": 4.34, + "grad_norm": 13.833695411682129, + "learning_rate": 4.275960697243303e-07, + "logits/chosen": -0.5365691781044006, + "logits/rejected": -0.591698169708252, + "logps/chosen": -50.6607666015625, + "logps/rejected": -110.40638732910156, + "loss": 0.6242, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.948293685913086, + "rewards/margins": 6.38956356048584, + "rewards/rejected": -3.441270351409912, + "step": 17335 + }, + { + "epoch": 4.34, + "grad_norm": 2.642033576965332, + "learning_rate": 4.272780925649217e-07, + "logits/chosen": -0.5329894423484802, + "logits/rejected": -0.6049254536628723, + "logps/chosen": -55.35915756225586, + "logps/rejected": -107.01225280761719, + "loss": 0.5344, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3576431274414062, + "rewards/margins": 7.776188850402832, + "rewards/rejected": -4.418545246124268, + "step": 17336 + }, + { + "epoch": 4.34, + "grad_norm": 7.385156631469727, + "learning_rate": 4.2696022840233497e-07, + "logits/chosen": -0.5401380062103271, + "logits/rejected": -0.6547698974609375, + "logps/chosen": -48.65420150756836, + "logps/rejected": -113.10033416748047, + "loss": 0.5125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939063310623169, + "rewards/margins": 8.34900188446045, + "rewards/rejected": -5.409938812255859, + "step": 17337 + }, + { + "epoch": 4.34, + "grad_norm": 3.0964620113372803, + "learning_rate": 4.2664247724442443e-07, + "logits/chosen": -0.49448931217193604, + "logits/rejected": -0.5972532629966736, + "logps/chosen": -53.91606521606445, + "logps/rejected": -91.33365631103516, + "loss": 0.5512, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.86533784866333, + "rewards/margins": 6.796346664428711, + "rewards/rejected": -3.9310081005096436, + "step": 17338 + }, + { + "epoch": 4.34, + "grad_norm": 10.127145767211914, + "learning_rate": 4.263248390990432e-07, + "logits/chosen": -0.5579694509506226, + "logits/rejected": -0.6885953545570374, + "logps/chosen": -55.65468978881836, + "logps/rejected": -97.021484375, + "loss": 0.6368, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0277700424194336, + "rewards/margins": 7.021029949188232, + "rewards/rejected": -3.993260145187378, + "step": 17339 + }, + { + "epoch": 4.34, + "grad_norm": 8.196023941040039, + "learning_rate": 4.2600731397403895e-07, + "logits/chosen": -0.592507541179657, + "logits/rejected": -0.6651453971862793, + "logps/chosen": -59.47145080566406, + "logps/rejected": -112.0560073852539, + "loss": 0.6829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3458058834075928, + "rewards/margins": 7.700003623962402, + "rewards/rejected": -4.3541975021362305, + "step": 17340 + }, + { + "epoch": 4.34, + "grad_norm": 3.2553036212921143, + "learning_rate": 4.2568990187726033e-07, + "logits/chosen": -0.5643486976623535, + "logits/rejected": -0.6279189586639404, + "logps/chosen": -47.160160064697266, + "logps/rejected": -96.70697021484375, + "loss": 0.6061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2016358375549316, + "rewards/margins": 7.41522216796875, + "rewards/rejected": -4.213586330413818, + "step": 17341 + }, + { + "epoch": 4.34, + "grad_norm": 3.0916330814361572, + "learning_rate": 4.253726028165489e-07, + "logits/chosen": -0.5919849872589111, + "logits/rejected": -0.6541587710380554, + "logps/chosen": -52.35514450073242, + "logps/rejected": -106.64286804199219, + "loss": 0.6208, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140657901763916, + "rewards/margins": 7.321542739868164, + "rewards/rejected": -4.180884838104248, + "step": 17342 + }, + { + "epoch": 4.34, + "grad_norm": 3.48103928565979, + "learning_rate": 4.2505541679974724e-07, + "logits/chosen": -0.5198622941970825, + "logits/rejected": -0.6228981018066406, + "logps/chosen": -55.71357727050781, + "logps/rejected": -93.4551010131836, + "loss": 0.6558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.976848840713501, + "rewards/margins": 7.287402153015137, + "rewards/rejected": -4.310553550720215, + "step": 17343 + }, + { + "epoch": 4.34, + "grad_norm": 10.406620979309082, + "learning_rate": 4.2473834383469194e-07, + "logits/chosen": -0.537830114364624, + "logits/rejected": -0.6459440588951111, + "logps/chosen": -46.25519943237305, + "logps/rejected": -98.0090103149414, + "loss": 0.4801, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.418909788131714, + "rewards/margins": 8.27902889251709, + "rewards/rejected": -4.860118865966797, + "step": 17344 + }, + { + "epoch": 4.34, + "grad_norm": 23.640464782714844, + "learning_rate": 4.244213839292183e-07, + "logits/chosen": -0.597838819026947, + "logits/rejected": -0.6973828077316284, + "logps/chosen": -63.45154571533203, + "logps/rejected": -98.65248107910156, + "loss": 0.8658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.807560682296753, + "rewards/margins": 6.559988498687744, + "rewards/rejected": -3.752427339553833, + "step": 17345 + }, + { + "epoch": 4.34, + "grad_norm": 19.40900421142578, + "learning_rate": 4.241045370911595e-07, + "logits/chosen": -0.5526260733604431, + "logits/rejected": -0.6337981820106506, + "logps/chosen": -56.425743103027344, + "logps/rejected": -123.52108764648438, + "loss": 0.7612, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.677152633666992, + "rewards/margins": 7.0335540771484375, + "rewards/rejected": -4.356401443481445, + "step": 17346 + }, + { + "epoch": 4.34, + "grad_norm": 10.747514724731445, + "learning_rate": 4.237878033283438e-07, + "logits/chosen": -0.5402934551239014, + "logits/rejected": -0.639644205570221, + "logps/chosen": -53.65037536621094, + "logps/rejected": -122.50459289550781, + "loss": 0.7268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8877389430999756, + "rewards/margins": 8.969036102294922, + "rewards/rejected": -6.081296920776367, + "step": 17347 + }, + { + "epoch": 4.34, + "grad_norm": 4.733805179595947, + "learning_rate": 4.234711826485999e-07, + "logits/chosen": -0.5462432503700256, + "logits/rejected": -0.6121970415115356, + "logps/chosen": -50.8497200012207, + "logps/rejected": -113.68940734863281, + "loss": 0.6138, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0154683589935303, + "rewards/margins": 8.101858139038086, + "rewards/rejected": -5.086389541625977, + "step": 17348 + }, + { + "epoch": 4.34, + "grad_norm": 10.149415969848633, + "learning_rate": 4.2315467505975097e-07, + "logits/chosen": -0.5957558155059814, + "logits/rejected": -0.6720767021179199, + "logps/chosen": -53.203861236572266, + "logps/rejected": -107.05049133300781, + "loss": 0.7541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.822561740875244, + "rewards/margins": 6.507171630859375, + "rewards/rejected": -3.6846096515655518, + "step": 17349 + }, + { + "epoch": 4.34, + "grad_norm": 9.77344799041748, + "learning_rate": 4.228382805696174e-07, + "logits/chosen": -0.5415163636207581, + "logits/rejected": -0.6056753396987915, + "logps/chosen": -47.649024963378906, + "logps/rejected": -105.08307647705078, + "loss": 0.6714, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0797953605651855, + "rewards/margins": 7.012668609619141, + "rewards/rejected": -3.9328739643096924, + "step": 17350 + }, + { + "epoch": 4.34, + "grad_norm": 13.15526008605957, + "learning_rate": 4.2252199918601913e-07, + "logits/chosen": -0.5015818476676941, + "logits/rejected": -0.6147319674491882, + "logps/chosen": -52.93904495239258, + "logps/rejected": -105.54841613769531, + "loss": 0.6305, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4602742195129395, + "rewards/margins": 8.168388366699219, + "rewards/rejected": -4.708114147186279, + "step": 17351 + }, + { + "epoch": 4.34, + "grad_norm": 4.9653000831604, + "learning_rate": 4.222058309167709e-07, + "logits/chosen": -0.5443021059036255, + "logits/rejected": -0.6199429631233215, + "logps/chosen": -58.81401062011719, + "logps/rejected": -107.73233032226562, + "loss": 0.7154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.999174118041992, + "rewards/margins": 8.298851013183594, + "rewards/rejected": -5.29967737197876, + "step": 17352 + }, + { + "epoch": 4.34, + "grad_norm": 3.85746169090271, + "learning_rate": 4.218897757696849e-07, + "logits/chosen": -0.6137450933456421, + "logits/rejected": -0.6771411299705505, + "logps/chosen": -55.15700149536133, + "logps/rejected": -110.97201538085938, + "loss": 0.6559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1339240074157715, + "rewards/margins": 6.470149040222168, + "rewards/rejected": -3.3362250328063965, + "step": 17353 + }, + { + "epoch": 4.34, + "grad_norm": 3.30269455909729, + "learning_rate": 4.2157383375257264e-07, + "logits/chosen": -0.5806121230125427, + "logits/rejected": -0.6755267977714539, + "logps/chosen": -45.89772415161133, + "logps/rejected": -108.875, + "loss": 0.4694, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.039283037185669, + "rewards/margins": 7.990671634674072, + "rewards/rejected": -4.951388835906982, + "step": 17354 + }, + { + "epoch": 4.34, + "grad_norm": 3.027198314666748, + "learning_rate": 4.2125800487324175e-07, + "logits/chosen": -0.5878778100013733, + "logits/rejected": -0.6903797388076782, + "logps/chosen": -56.857357025146484, + "logps/rejected": -82.43354797363281, + "loss": 0.5738, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1114299297332764, + "rewards/margins": 6.70661735534668, + "rewards/rejected": -3.595187187194824, + "step": 17355 + }, + { + "epoch": 4.34, + "grad_norm": 19.573486328125, + "learning_rate": 4.209422891394943e-07, + "logits/chosen": -0.570083498954773, + "logits/rejected": -0.6578460931777954, + "logps/chosen": -54.13093566894531, + "logps/rejected": -112.21144104003906, + "loss": 0.7036, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.144672393798828, + "rewards/margins": 7.552057266235352, + "rewards/rejected": -4.407385349273682, + "step": 17356 + }, + { + "epoch": 4.34, + "grad_norm": 4.782991886138916, + "learning_rate": 4.206266865591335e-07, + "logits/chosen": -0.5719546675682068, + "logits/rejected": -0.6695365309715271, + "logps/chosen": -67.3126220703125, + "logps/rejected": -106.48286437988281, + "loss": 0.6755, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1505722999572754, + "rewards/margins": 7.223743438720703, + "rewards/rejected": -4.0731706619262695, + "step": 17357 + }, + { + "epoch": 4.34, + "grad_norm": 4.850840091705322, + "learning_rate": 4.2031119713995926e-07, + "logits/chosen": -0.549308180809021, + "logits/rejected": -0.5878198146820068, + "logps/chosen": -64.52530670166016, + "logps/rejected": -121.72576904296875, + "loss": 0.6609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0586700439453125, + "rewards/margins": 7.73846960067749, + "rewards/rejected": -4.679799556732178, + "step": 17358 + }, + { + "epoch": 4.34, + "grad_norm": 3.9579970836639404, + "learning_rate": 4.1999582088976585e-07, + "logits/chosen": -0.5424510836601257, + "logits/rejected": -0.6223963499069214, + "logps/chosen": -58.80162811279297, + "logps/rejected": -93.25938415527344, + "loss": 0.6068, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.133662700653076, + "rewards/margins": 6.998910903930664, + "rewards/rejected": -3.865248680114746, + "step": 17359 + }, + { + "epoch": 4.34, + "grad_norm": 7.189399719238281, + "learning_rate": 4.1968055781634655e-07, + "logits/chosen": -0.5703167915344238, + "logits/rejected": -0.6249504089355469, + "logps/chosen": -57.22863006591797, + "logps/rejected": -113.86111450195312, + "loss": 0.7345, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.116563081741333, + "rewards/margins": 6.530843257904053, + "rewards/rejected": -3.414280414581299, + "step": 17360 + }, + { + "epoch": 4.34, + "grad_norm": 8.861882209777832, + "learning_rate": 4.193654079274939e-07, + "logits/chosen": -0.5581018924713135, + "logits/rejected": -0.649563729763031, + "logps/chosen": -46.042694091796875, + "logps/rejected": -113.68653869628906, + "loss": 0.6543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2256996631622314, + "rewards/margins": 7.944522380828857, + "rewards/rejected": -4.718822956085205, + "step": 17361 + }, + { + "epoch": 4.34, + "grad_norm": 4.063100814819336, + "learning_rate": 4.1905037123099346e-07, + "logits/chosen": -0.5811116099357605, + "logits/rejected": -0.6118664741516113, + "logps/chosen": -48.075103759765625, + "logps/rejected": -110.70248413085938, + "loss": 0.6788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.925381660461426, + "rewards/margins": 6.321568489074707, + "rewards/rejected": -3.396186113357544, + "step": 17362 + }, + { + "epoch": 4.34, + "grad_norm": 6.7527031898498535, + "learning_rate": 4.187354477346306e-07, + "logits/chosen": -0.547128438949585, + "logits/rejected": -0.5756204128265381, + "logps/chosen": -54.01946258544922, + "logps/rejected": -99.1079330444336, + "loss": 0.6728, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6555819511413574, + "rewards/margins": 6.290753364562988, + "rewards/rejected": -2.6351706981658936, + "step": 17363 + }, + { + "epoch": 4.34, + "grad_norm": 12.34989070892334, + "learning_rate": 4.18420637446188e-07, + "logits/chosen": -0.4579368829727173, + "logits/rejected": -0.5756776928901672, + "logps/chosen": -55.79292678833008, + "logps/rejected": -96.27900695800781, + "loss": 0.729, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176337957382202, + "rewards/margins": 6.435009956359863, + "rewards/rejected": -3.2586724758148193, + "step": 17364 + }, + { + "epoch": 4.34, + "grad_norm": 2.627020835876465, + "learning_rate": 4.181059403734439e-07, + "logits/chosen": -0.5387983322143555, + "logits/rejected": -0.6611238718032837, + "logps/chosen": -63.88706588745117, + "logps/rejected": -115.608642578125, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5166919231414795, + "rewards/margins": 8.727014541625977, + "rewards/rejected": -5.210322856903076, + "step": 17365 + }, + { + "epoch": 4.34, + "grad_norm": 3.991079092025757, + "learning_rate": 4.17791356524177e-07, + "logits/chosen": -0.586561918258667, + "logits/rejected": -0.6453982591629028, + "logps/chosen": -51.310184478759766, + "logps/rejected": -104.28146362304688, + "loss": 0.5894, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1495535373687744, + "rewards/margins": 6.776501178741455, + "rewards/rejected": -3.6269469261169434, + "step": 17366 + }, + { + "epoch": 4.34, + "grad_norm": 7.514543056488037, + "learning_rate": 4.17476885906159e-07, + "logits/chosen": -0.5428518652915955, + "logits/rejected": -0.6101132035255432, + "logps/chosen": -56.454254150390625, + "logps/rejected": -96.05303192138672, + "loss": 0.6182, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0775721073150635, + "rewards/margins": 6.453368186950684, + "rewards/rejected": -3.375795841217041, + "step": 17367 + }, + { + "epoch": 4.34, + "grad_norm": 3.6497647762298584, + "learning_rate": 4.171625285271602e-07, + "logits/chosen": -0.6013607382774353, + "logits/rejected": -0.6725196242332458, + "logps/chosen": -47.61956787109375, + "logps/rejected": -97.17106628417969, + "loss": 0.6158, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.998899221420288, + "rewards/margins": 6.749532699584961, + "rewards/rejected": -3.7506330013275146, + "step": 17368 + }, + { + "epoch": 4.35, + "grad_norm": 2.8366870880126953, + "learning_rate": 4.168482843949512e-07, + "logits/chosen": -0.5344592928886414, + "logits/rejected": -0.5905501842498779, + "logps/chosen": -56.633506774902344, + "logps/rejected": -104.97452545166016, + "loss": 0.5592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0366592407226562, + "rewards/margins": 7.400020122528076, + "rewards/rejected": -4.36336088180542, + "step": 17369 + }, + { + "epoch": 4.35, + "grad_norm": 5.106375217437744, + "learning_rate": 4.1653415351729564e-07, + "logits/chosen": -0.493152916431427, + "logits/rejected": -0.5890426635742188, + "logps/chosen": -60.466007232666016, + "logps/rejected": -101.80655670166016, + "loss": 0.7027, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0127711296081543, + "rewards/margins": 5.99271821975708, + "rewards/rejected": -2.979947805404663, + "step": 17370 + }, + { + "epoch": 4.35, + "grad_norm": 3.497220754623413, + "learning_rate": 4.1622013590195575e-07, + "logits/chosen": -0.5444658398628235, + "logits/rejected": -0.6511210203170776, + "logps/chosen": -66.92146301269531, + "logps/rejected": -106.81954193115234, + "loss": 0.6418, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.258363962173462, + "rewards/margins": 7.191147804260254, + "rewards/rejected": -3.932783842086792, + "step": 17371 + }, + { + "epoch": 4.35, + "grad_norm": 10.867176055908203, + "learning_rate": 4.15906231556692e-07, + "logits/chosen": -0.5467876195907593, + "logits/rejected": -0.6004318594932556, + "logps/chosen": -48.05745315551758, + "logps/rejected": -97.1037368774414, + "loss": 0.584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6688356399536133, + "rewards/margins": 7.0736002922058105, + "rewards/rejected": -3.40476393699646, + "step": 17372 + }, + { + "epoch": 4.35, + "grad_norm": 16.341693878173828, + "learning_rate": 4.155924404892614e-07, + "logits/chosen": -0.4921989440917969, + "logits/rejected": -0.5965622067451477, + "logps/chosen": -63.58198165893555, + "logps/rejected": -105.18234252929688, + "loss": 0.6019, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.915172576904297, + "rewards/margins": 6.447883129119873, + "rewards/rejected": -3.5327107906341553, + "step": 17373 + }, + { + "epoch": 4.35, + "grad_norm": 2.1776726245880127, + "learning_rate": 4.1527876270741784e-07, + "logits/chosen": -0.5512771010398865, + "logits/rejected": -0.6821771264076233, + "logps/chosen": -51.797149658203125, + "logps/rejected": -101.93864440917969, + "loss": 0.5288, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.066129684448242, + "rewards/margins": 8.354524612426758, + "rewards/rejected": -5.288395404815674, + "step": 17374 + }, + { + "epoch": 4.35, + "grad_norm": 12.974443435668945, + "learning_rate": 4.1496519821891234e-07, + "logits/chosen": -0.5644688010215759, + "logits/rejected": -0.663057804107666, + "logps/chosen": -54.14219665527344, + "logps/rejected": -90.67768859863281, + "loss": 0.6459, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.196241855621338, + "rewards/margins": 6.235899925231934, + "rewards/rejected": -3.0396580696105957, + "step": 17375 + }, + { + "epoch": 4.35, + "grad_norm": 2.6792259216308594, + "learning_rate": 4.1465174703149423e-07, + "logits/chosen": -0.5353262424468994, + "logits/rejected": -0.6103976368904114, + "logps/chosen": -49.03241729736328, + "logps/rejected": -105.6340103149414, + "loss": 0.5407, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052027463912964, + "rewards/margins": 8.040184020996094, + "rewards/rejected": -4.988156318664551, + "step": 17376 + }, + { + "epoch": 4.35, + "grad_norm": 3.569387912750244, + "learning_rate": 4.1433840915290844e-07, + "logits/chosen": -0.5405533313751221, + "logits/rejected": -0.6146739721298218, + "logps/chosen": -49.938323974609375, + "logps/rejected": -94.32868957519531, + "loss": 0.5587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225912094116211, + "rewards/margins": 6.956861972808838, + "rewards/rejected": -3.7309494018554688, + "step": 17377 + }, + { + "epoch": 4.35, + "grad_norm": 6.529690742492676, + "learning_rate": 4.1402518459089713e-07, + "logits/chosen": -0.5650507211685181, + "logits/rejected": -0.6331899166107178, + "logps/chosen": -51.683998107910156, + "logps/rejected": -98.01081848144531, + "loss": 0.6235, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3278470039367676, + "rewards/margins": 6.331080436706543, + "rewards/rejected": -3.0032331943511963, + "step": 17378 + }, + { + "epoch": 4.35, + "grad_norm": 5.1872100830078125, + "learning_rate": 4.137120733532024e-07, + "logits/chosen": -0.5713452696800232, + "logits/rejected": -0.6185393333435059, + "logps/chosen": -53.4487419128418, + "logps/rejected": -99.65875244140625, + "loss": 0.5968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1544432640075684, + "rewards/margins": 6.354527473449707, + "rewards/rejected": -3.2000842094421387, + "step": 17379 + }, + { + "epoch": 4.35, + "grad_norm": 5.076272010803223, + "learning_rate": 4.1339907544756084e-07, + "logits/chosen": -0.5426645874977112, + "logits/rejected": -0.6242327690124512, + "logps/chosen": -61.675880432128906, + "logps/rejected": -98.66281127929688, + "loss": 0.6151, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1321494579315186, + "rewards/margins": 6.8989129066467285, + "rewards/rejected": -3.766763210296631, + "step": 17380 + }, + { + "epoch": 4.35, + "grad_norm": 3.4800729751586914, + "learning_rate": 4.130861908817052e-07, + "logits/chosen": -0.4817277193069458, + "logits/rejected": -0.5571090579032898, + "logps/chosen": -54.49927520751953, + "logps/rejected": -114.50923156738281, + "loss": 0.5724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2297019958496094, + "rewards/margins": 7.8545122146606445, + "rewards/rejected": -4.624810218811035, + "step": 17381 + }, + { + "epoch": 4.35, + "grad_norm": 8.894774436950684, + "learning_rate": 4.127734196633698e-07, + "logits/chosen": -0.5306745767593384, + "logits/rejected": -0.5812493562698364, + "logps/chosen": -67.86768341064453, + "logps/rejected": -112.12468719482422, + "loss": 0.8236, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0615973472595215, + "rewards/margins": 6.127087116241455, + "rewards/rejected": -3.0654895305633545, + "step": 17382 + }, + { + "epoch": 4.35, + "grad_norm": 4.036936283111572, + "learning_rate": 4.1246076180028296e-07, + "logits/chosen": -0.5632017254829407, + "logits/rejected": -0.6214805841445923, + "logps/chosen": -50.8325080871582, + "logps/rejected": -105.70269775390625, + "loss": 0.5886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.365199327468872, + "rewards/margins": 6.546967506408691, + "rewards/rejected": -3.1817684173583984, + "step": 17383 + }, + { + "epoch": 4.35, + "grad_norm": 24.191505432128906, + "learning_rate": 4.121482173001684e-07, + "logits/chosen": -0.5408704280853271, + "logits/rejected": -0.6344357132911682, + "logps/chosen": -57.77751922607422, + "logps/rejected": -98.05284118652344, + "loss": 0.5976, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8818163871765137, + "rewards/margins": 6.438644886016846, + "rewards/rejected": -3.556828498840332, + "step": 17384 + }, + { + "epoch": 4.35, + "grad_norm": 16.947208404541016, + "learning_rate": 4.118357861707517e-07, + "logits/chosen": -0.534542977809906, + "logits/rejected": -0.6399872899055481, + "logps/chosen": -54.152252197265625, + "logps/rejected": -95.84848022460938, + "loss": 0.6652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.038378953933716, + "rewards/margins": 7.5148725509643555, + "rewards/rejected": -4.476493835449219, + "step": 17385 + }, + { + "epoch": 4.35, + "grad_norm": 7.184230327606201, + "learning_rate": 4.115234684197539e-07, + "logits/chosen": -0.4524196982383728, + "logits/rejected": -0.6049882769584656, + "logps/chosen": -68.14413452148438, + "logps/rejected": -81.14476013183594, + "loss": 0.6909, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.866560220718384, + "rewards/margins": 6.670215606689453, + "rewards/rejected": -3.8036556243896484, + "step": 17386 + }, + { + "epoch": 4.35, + "grad_norm": 5.742925643920898, + "learning_rate": 4.112112640548915e-07, + "logits/chosen": -0.5163490176200867, + "logits/rejected": -0.6264166831970215, + "logps/chosen": -56.966583251953125, + "logps/rejected": -97.80805969238281, + "loss": 0.5963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1297848224639893, + "rewards/margins": 7.609779357910156, + "rewards/rejected": -4.47999382019043, + "step": 17387 + }, + { + "epoch": 4.35, + "grad_norm": 3.097334861755371, + "learning_rate": 4.108991730838785e-07, + "logits/chosen": -0.5161048173904419, + "logits/rejected": -0.5870349407196045, + "logps/chosen": -47.23678970336914, + "logps/rejected": -108.33590698242188, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1683897972106934, + "rewards/margins": 7.244797706604004, + "rewards/rejected": -4.0764079093933105, + "step": 17388 + }, + { + "epoch": 4.35, + "grad_norm": 3.340864896774292, + "learning_rate": 4.105871955144297e-07, + "logits/chosen": -0.5535902976989746, + "logits/rejected": -0.5934909582138062, + "logps/chosen": -82.0817642211914, + "logps/rejected": -103.65292358398438, + "loss": 0.5606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3592488765716553, + "rewards/margins": 7.827173709869385, + "rewards/rejected": -4.467925071716309, + "step": 17389 + }, + { + "epoch": 4.35, + "grad_norm": 6.8540120124816895, + "learning_rate": 4.1027533135425125e-07, + "logits/chosen": -0.5644168853759766, + "logits/rejected": -0.6463046073913574, + "logps/chosen": -53.09010314941406, + "logps/rejected": -138.5176239013672, + "loss": 0.6408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5600979328155518, + "rewards/margins": 9.179468154907227, + "rewards/rejected": -5.619369029998779, + "step": 17390 + }, + { + "epoch": 4.35, + "grad_norm": 2.584657907485962, + "learning_rate": 4.0996358061105244e-07, + "logits/chosen": -0.5635280609130859, + "logits/rejected": -0.6413512229919434, + "logps/chosen": -53.82485580444336, + "logps/rejected": -119.25680541992188, + "loss": 0.6345, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158815383911133, + "rewards/margins": 8.181890487670898, + "rewards/rejected": -5.023075103759766, + "step": 17391 + }, + { + "epoch": 4.35, + "grad_norm": 7.481873512268066, + "learning_rate": 4.0965194329253556e-07, + "logits/chosen": -0.6397547721862793, + "logits/rejected": -0.7220665812492371, + "logps/chosen": -50.85841369628906, + "logps/rejected": -120.81381225585938, + "loss": 0.6706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.16251277923584, + "rewards/margins": 7.540595054626465, + "rewards/rejected": -4.378082275390625, + "step": 17392 + }, + { + "epoch": 4.35, + "grad_norm": 8.46008014678955, + "learning_rate": 4.0934041940640103e-07, + "logits/chosen": -0.49045446515083313, + "logits/rejected": -0.5431163311004639, + "logps/chosen": -47.92246627807617, + "logps/rejected": -102.20346069335938, + "loss": 0.6147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4035186767578125, + "rewards/margins": 6.531826019287109, + "rewards/rejected": -3.1283068656921387, + "step": 17393 + }, + { + "epoch": 4.35, + "grad_norm": 5.344444274902344, + "learning_rate": 4.090290089603482e-07, + "logits/chosen": -0.5130314230918884, + "logits/rejected": -0.6255285143852234, + "logps/chosen": -51.110595703125, + "logps/rejected": -110.7653579711914, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.781822443008423, + "rewards/margins": 8.230142593383789, + "rewards/rejected": -5.448320388793945, + "step": 17394 + }, + { + "epoch": 4.35, + "grad_norm": 1.843483567237854, + "learning_rate": 4.0871771196207223e-07, + "logits/chosen": -0.5168421268463135, + "logits/rejected": -0.623517632484436, + "logps/chosen": -53.11141586303711, + "logps/rejected": -107.95985412597656, + "loss": 0.5353, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.185635566711426, + "rewards/margins": 7.530226707458496, + "rewards/rejected": -4.344590663909912, + "step": 17395 + }, + { + "epoch": 4.35, + "grad_norm": 10.556724548339844, + "learning_rate": 4.084065284192634e-07, + "logits/chosen": -0.5494497418403625, + "logits/rejected": -0.6236875653266907, + "logps/chosen": -55.561065673828125, + "logps/rejected": -114.37145233154297, + "loss": 0.689, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.205981492996216, + "rewards/margins": 7.30291748046875, + "rewards/rejected": -4.096935749053955, + "step": 17396 + }, + { + "epoch": 4.35, + "grad_norm": 6.603940010070801, + "learning_rate": 4.0809545833961404e-07, + "logits/chosen": -0.6129099726676941, + "logits/rejected": -0.7147437334060669, + "logps/chosen": -51.330142974853516, + "logps/rejected": -105.50556182861328, + "loss": 0.5832, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.207258701324463, + "rewards/margins": 7.011778831481934, + "rewards/rejected": -3.8045201301574707, + "step": 17397 + }, + { + "epoch": 4.35, + "grad_norm": 3.8910624980926514, + "learning_rate": 4.077845017308113e-07, + "logits/chosen": -0.4732716679573059, + "logits/rejected": -0.5704458951950073, + "logps/chosen": -49.1484489440918, + "logps/rejected": -106.47787475585938, + "loss": 0.5454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1041994094848633, + "rewards/margins": 6.477046012878418, + "rewards/rejected": -3.372847557067871, + "step": 17398 + }, + { + "epoch": 4.35, + "grad_norm": 2.537004232406616, + "learning_rate": 4.074736586005362e-07, + "logits/chosen": -0.5371214151382446, + "logits/rejected": -0.6784389019012451, + "logps/chosen": -58.392215728759766, + "logps/rejected": -92.9620361328125, + "loss": 0.5758, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7218246459960938, + "rewards/margins": 7.643954277038574, + "rewards/rejected": -4.9221296310424805, + "step": 17399 + }, + { + "epoch": 4.35, + "grad_norm": 2.334754467010498, + "learning_rate": 4.071629289564716e-07, + "logits/chosen": -0.4948042631149292, + "logits/rejected": -0.6035548448562622, + "logps/chosen": -54.32270050048828, + "logps/rejected": -94.57145690917969, + "loss": 0.5419, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4978911876678467, + "rewards/margins": 7.3965277671813965, + "rewards/rejected": -3.898636817932129, + "step": 17400 + }, + { + "epoch": 4.35, + "grad_norm": 24.155860900878906, + "learning_rate": 4.0685231280629843e-07, + "logits/chosen": -0.5171984434127808, + "logits/rejected": -0.6196674704551697, + "logps/chosen": -63.144287109375, + "logps/rejected": -97.29151916503906, + "loss": 0.6592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0019314289093018, + "rewards/margins": 7.843196868896484, + "rewards/rejected": -4.84126615524292, + "step": 17401 + }, + { + "epoch": 4.35, + "grad_norm": 3.162064552307129, + "learning_rate": 4.0654181015768846e-07, + "logits/chosen": -0.5834623575210571, + "logits/rejected": -0.6532481908798218, + "logps/chosen": -50.11264419555664, + "logps/rejected": -107.88431549072266, + "loss": 0.5469, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.394529342651367, + "rewards/margins": 7.4730682373046875, + "rewards/rejected": -4.07853889465332, + "step": 17402 + }, + { + "epoch": 4.35, + "grad_norm": 5.691033363342285, + "learning_rate": 4.0623142101831547e-07, + "logits/chosen": -0.5582792162895203, + "logits/rejected": -0.678632378578186, + "logps/chosen": -56.41313171386719, + "logps/rejected": -110.60916137695312, + "loss": 0.5789, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077605724334717, + "rewards/margins": 7.953249931335449, + "rewards/rejected": -4.875644207000732, + "step": 17403 + }, + { + "epoch": 4.35, + "grad_norm": 5.8025360107421875, + "learning_rate": 4.059211453958517e-07, + "logits/chosen": -0.5587157607078552, + "logits/rejected": -0.6196348071098328, + "logps/chosen": -59.23970031738281, + "logps/rejected": -102.39067840576172, + "loss": 0.5898, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019537925720215, + "rewards/margins": 7.14771032333374, + "rewards/rejected": -4.128172397613525, + "step": 17404 + }, + { + "epoch": 4.35, + "grad_norm": 6.957341194152832, + "learning_rate": 4.0561098329796264e-07, + "logits/chosen": -0.5604124069213867, + "logits/rejected": -0.5976633429527283, + "logps/chosen": -53.61540985107422, + "logps/rejected": -106.02751159667969, + "loss": 0.7715, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.765221118927002, + "rewards/margins": 6.8355889320373535, + "rewards/rejected": -4.070367813110352, + "step": 17405 + }, + { + "epoch": 4.35, + "grad_norm": 7.019801616668701, + "learning_rate": 4.0530093473231224e-07, + "logits/chosen": -0.47281530499458313, + "logits/rejected": -0.5536416172981262, + "logps/chosen": -66.09989929199219, + "logps/rejected": -122.83738708496094, + "loss": 0.7104, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6257736682891846, + "rewards/margins": 7.0179443359375, + "rewards/rejected": -4.392169952392578, + "step": 17406 + }, + { + "epoch": 4.35, + "grad_norm": 4.2461981773376465, + "learning_rate": 4.0499099970656376e-07, + "logits/chosen": -0.5859043598175049, + "logits/rejected": -0.6559028029441833, + "logps/chosen": -39.68800735473633, + "logps/rejected": -88.14606475830078, + "loss": 0.5178, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1021320819854736, + "rewards/margins": 6.189988136291504, + "rewards/rejected": -3.087855577468872, + "step": 17407 + }, + { + "epoch": 4.35, + "grad_norm": 8.072346687316895, + "learning_rate": 4.0468117822837494e-07, + "logits/chosen": -0.5275171995162964, + "logits/rejected": -0.6173895597457886, + "logps/chosen": -53.04655456542969, + "logps/rejected": -95.64700317382812, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0201408863067627, + "rewards/margins": 6.6400604248046875, + "rewards/rejected": -3.6199193000793457, + "step": 17408 + }, + { + "epoch": 4.36, + "grad_norm": 2.4365899562835693, + "learning_rate": 4.043714703054014e-07, + "logits/chosen": -0.4839051365852356, + "logits/rejected": -0.6143962144851685, + "logps/chosen": -57.727699279785156, + "logps/rejected": -99.40498352050781, + "loss": 0.6247, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1752946376800537, + "rewards/margins": 7.343397617340088, + "rewards/rejected": -4.1681036949157715, + "step": 17409 + }, + { + "epoch": 4.36, + "grad_norm": 3.5059263706207275, + "learning_rate": 4.04061875945298e-07, + "logits/chosen": -0.5260021090507507, + "logits/rejected": -0.6239755153656006, + "logps/chosen": -58.866981506347656, + "logps/rejected": -111.11152648925781, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1053271293640137, + "rewards/margins": 7.822856903076172, + "rewards/rejected": -4.717530250549316, + "step": 17410 + }, + { + "epoch": 4.36, + "grad_norm": 2.818923234939575, + "learning_rate": 4.037523951557126e-07, + "logits/chosen": -0.583072304725647, + "logits/rejected": -0.6824246644973755, + "logps/chosen": -61.467472076416016, + "logps/rejected": -109.07050323486328, + "loss": 0.6119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2192084789276123, + "rewards/margins": 7.505282402038574, + "rewards/rejected": -4.286074638366699, + "step": 17411 + }, + { + "epoch": 4.36, + "grad_norm": 3.589813709259033, + "learning_rate": 4.034430279442958e-07, + "logits/chosen": -0.5436756610870361, + "logits/rejected": -0.614909827709198, + "logps/chosen": -60.331207275390625, + "logps/rejected": -112.37377166748047, + "loss": 0.6249, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9017088413238525, + "rewards/margins": 7.423175811767578, + "rewards/rejected": -4.521467208862305, + "step": 17412 + }, + { + "epoch": 4.36, + "grad_norm": 3.1980953216552734, + "learning_rate": 4.031337743186908e-07, + "logits/chosen": -0.5026577115058899, + "logits/rejected": -0.5987294316291809, + "logps/chosen": -52.956207275390625, + "logps/rejected": -107.83290100097656, + "loss": 0.5732, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.885859251022339, + "rewards/margins": 7.229918956756592, + "rewards/rejected": -4.344059467315674, + "step": 17413 + }, + { + "epoch": 4.36, + "grad_norm": 3.901590347290039, + "learning_rate": 4.028246342865383e-07, + "logits/chosen": -0.6336080431938171, + "logits/rejected": -0.6748538017272949, + "logps/chosen": -44.079071044921875, + "logps/rejected": -118.08128356933594, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.099719524383545, + "rewards/margins": 7.713191032409668, + "rewards/rejected": -4.613471508026123, + "step": 17414 + }, + { + "epoch": 4.36, + "grad_norm": 4.639073371887207, + "learning_rate": 4.025156078554793e-07, + "logits/chosen": -0.5420575141906738, + "logits/rejected": -0.657731831073761, + "logps/chosen": -59.241172790527344, + "logps/rejected": -87.80145263671875, + "loss": 0.6081, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.841186046600342, + "rewards/margins": 5.801761627197266, + "rewards/rejected": -2.960575580596924, + "step": 17415 + }, + { + "epoch": 4.36, + "grad_norm": 3.3707845211029053, + "learning_rate": 4.0220669503315046e-07, + "logits/chosen": -0.5824178457260132, + "logits/rejected": -0.688382089138031, + "logps/chosen": -60.40120315551758, + "logps/rejected": -100.82072448730469, + "loss": 0.5874, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1229875087738037, + "rewards/margins": 7.416206359863281, + "rewards/rejected": -4.293218612670898, + "step": 17416 + }, + { + "epoch": 4.36, + "grad_norm": 5.888118267059326, + "learning_rate": 4.0189789582718464e-07, + "logits/chosen": -0.4470099210739136, + "logits/rejected": -0.528204619884491, + "logps/chosen": -53.36481475830078, + "logps/rejected": -117.51709747314453, + "loss": 0.5649, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2420027256011963, + "rewards/margins": 8.108478546142578, + "rewards/rejected": -4.866476058959961, + "step": 17417 + }, + { + "epoch": 4.36, + "grad_norm": 3.980637311935425, + "learning_rate": 4.015892102452118e-07, + "logits/chosen": -0.5630437135696411, + "logits/rejected": -0.6350197196006775, + "logps/chosen": -50.93166732788086, + "logps/rejected": -106.97906494140625, + "loss": 0.592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.166611909866333, + "rewards/margins": 7.962907314300537, + "rewards/rejected": -4.796295642852783, + "step": 17418 + }, + { + "epoch": 4.36, + "grad_norm": 11.75365161895752, + "learning_rate": 4.012806382948614e-07, + "logits/chosen": -0.5193082690238953, + "logits/rejected": -0.5998741388320923, + "logps/chosen": -57.954681396484375, + "logps/rejected": -112.92110443115234, + "loss": 0.7562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0297598838806152, + "rewards/margins": 7.431318759918213, + "rewards/rejected": -4.401558876037598, + "step": 17419 + }, + { + "epoch": 4.36, + "grad_norm": 5.334880352020264, + "learning_rate": 4.009721799837579e-07, + "logits/chosen": -0.537527322769165, + "logits/rejected": -0.6151638627052307, + "logps/chosen": -53.714195251464844, + "logps/rejected": -108.6116943359375, + "loss": 0.638, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.217367172241211, + "rewards/margins": 7.823229789733887, + "rewards/rejected": -4.605862140655518, + "step": 17420 + }, + { + "epoch": 4.36, + "grad_norm": 6.188726902008057, + "learning_rate": 4.0066383531952304e-07, + "logits/chosen": -0.6337230205535889, + "logits/rejected": -0.6858548521995544, + "logps/chosen": -46.646385192871094, + "logps/rejected": -94.54398345947266, + "loss": 0.6638, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8927958011627197, + "rewards/margins": 6.7333221435546875, + "rewards/rejected": -3.8405258655548096, + "step": 17421 + }, + { + "epoch": 4.36, + "grad_norm": 3.537167549133301, + "learning_rate": 4.0035560430977727e-07, + "logits/chosen": -0.6395835876464844, + "logits/rejected": -0.7119563221931458, + "logps/chosen": -40.14038848876953, + "logps/rejected": -102.59517669677734, + "loss": 0.4794, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2807469367980957, + "rewards/margins": 7.413029670715332, + "rewards/rejected": -4.132282733917236, + "step": 17422 + }, + { + "epoch": 4.36, + "grad_norm": 9.460175514221191, + "learning_rate": 4.000474869621368e-07, + "logits/chosen": -0.5957679748535156, + "logits/rejected": -0.6982840895652771, + "logps/chosen": -49.25258255004883, + "logps/rejected": -106.2100830078125, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1965651512145996, + "rewards/margins": 7.858745098114014, + "rewards/rejected": -4.662179946899414, + "step": 17423 + }, + { + "epoch": 4.36, + "grad_norm": 4.327571392059326, + "learning_rate": 3.9973948328421553e-07, + "logits/chosen": -0.617521345615387, + "logits/rejected": -0.6460456848144531, + "logps/chosen": -52.34754180908203, + "logps/rejected": -122.5477294921875, + "loss": 0.5948, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.180720329284668, + "rewards/margins": 7.554758548736572, + "rewards/rejected": -4.3740386962890625, + "step": 17424 + }, + { + "epoch": 4.36, + "grad_norm": 6.002648830413818, + "learning_rate": 3.9943159328362457e-07, + "logits/chosen": -0.6104140877723694, + "logits/rejected": -0.6882010102272034, + "logps/chosen": -48.89113998413086, + "logps/rejected": -128.33349609375, + "loss": 0.5773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2503695487976074, + "rewards/margins": 8.695745468139648, + "rewards/rejected": -5.445374965667725, + "step": 17425 + }, + { + "epoch": 4.36, + "grad_norm": 8.351358413696289, + "learning_rate": 3.9912381696797286e-07, + "logits/chosen": -0.5043423175811768, + "logits/rejected": -0.5941492319107056, + "logps/chosen": -57.97142028808594, + "logps/rejected": -97.94670104980469, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.056752920150757, + "rewards/margins": 6.336657524108887, + "rewards/rejected": -3.279904365539551, + "step": 17426 + }, + { + "epoch": 4.36, + "grad_norm": 3.410287857055664, + "learning_rate": 3.9881615434486485e-07, + "logits/chosen": -0.595123291015625, + "logits/rejected": -0.7155205607414246, + "logps/chosen": -58.68162155151367, + "logps/rejected": -95.06910705566406, + "loss": 0.6082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6990480422973633, + "rewards/margins": 6.232793807983398, + "rewards/rejected": -3.533745765686035, + "step": 17427 + }, + { + "epoch": 4.36, + "grad_norm": 2.3952009677886963, + "learning_rate": 3.9850860542190283e-07, + "logits/chosen": -0.5880415439605713, + "logits/rejected": -0.6754578948020935, + "logps/chosen": -52.87325668334961, + "logps/rejected": -93.04134368896484, + "loss": 0.6176, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9901540279388428, + "rewards/margins": 7.417418479919434, + "rewards/rejected": -4.427264213562012, + "step": 17428 + }, + { + "epoch": 4.36, + "grad_norm": 3.8575565814971924, + "learning_rate": 3.98201170206689e-07, + "logits/chosen": -0.5258427262306213, + "logits/rejected": -0.6280136108398438, + "logps/chosen": -60.28281784057617, + "logps/rejected": -111.31979370117188, + "loss": 0.5848, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.29207444190979, + "rewards/margins": 7.669727802276611, + "rewards/rejected": -4.3776535987854, + "step": 17429 + }, + { + "epoch": 4.36, + "grad_norm": 10.086820602416992, + "learning_rate": 3.9789384870681904e-07, + "logits/chosen": -0.4854487478733063, + "logits/rejected": -0.5465958714485168, + "logps/chosen": -60.40339660644531, + "logps/rejected": -108.92874908447266, + "loss": 0.5738, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1831300258636475, + "rewards/margins": 7.031552791595459, + "rewards/rejected": -3.8484222888946533, + "step": 17430 + }, + { + "epoch": 4.36, + "grad_norm": 5.977726936340332, + "learning_rate": 3.975866409298856e-07, + "logits/chosen": -0.6022523641586304, + "logits/rejected": -0.6732085347175598, + "logps/chosen": -51.860599517822266, + "logps/rejected": -105.74531555175781, + "loss": 0.6295, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1772868633270264, + "rewards/margins": 6.610692501068115, + "rewards/rejected": -3.4334053993225098, + "step": 17431 + }, + { + "epoch": 4.36, + "grad_norm": 3.8086578845977783, + "learning_rate": 3.972795468834834e-07, + "logits/chosen": -0.5879551768302917, + "logits/rejected": -0.6419517993927002, + "logps/chosen": -53.00870132446289, + "logps/rejected": -110.27251434326172, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7972307205200195, + "rewards/margins": 7.161259651184082, + "rewards/rejected": -4.364029407501221, + "step": 17432 + }, + { + "epoch": 4.36, + "grad_norm": 3.3478825092315674, + "learning_rate": 3.9697256657519835e-07, + "logits/chosen": -0.5567755699157715, + "logits/rejected": -0.6382624506950378, + "logps/chosen": -41.57503890991211, + "logps/rejected": -97.05238342285156, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4267609119415283, + "rewards/margins": 7.415599822998047, + "rewards/rejected": -3.988837957382202, + "step": 17433 + }, + { + "epoch": 4.36, + "grad_norm": 4.1751017570495605, + "learning_rate": 3.9666570001261675e-07, + "logits/chosen": -0.5675947666168213, + "logits/rejected": -0.6024398803710938, + "logps/chosen": -51.508663177490234, + "logps/rejected": -105.79659271240234, + "loss": 0.6656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2161483764648438, + "rewards/margins": 6.298470497131348, + "rewards/rejected": -3.082322120666504, + "step": 17434 + }, + { + "epoch": 4.36, + "grad_norm": 5.835269927978516, + "learning_rate": 3.9635894720332303e-07, + "logits/chosen": -0.4792185425758362, + "logits/rejected": -0.572601854801178, + "logps/chosen": -56.355255126953125, + "logps/rejected": -97.36026763916016, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.282299280166626, + "rewards/margins": 7.230632781982422, + "rewards/rejected": -3.9483346939086914, + "step": 17435 + }, + { + "epoch": 4.36, + "grad_norm": 3.222653865814209, + "learning_rate": 3.96052308154895e-07, + "logits/chosen": -0.6221734881401062, + "logits/rejected": -0.7060000896453857, + "logps/chosen": -58.3330192565918, + "logps/rejected": -129.20738220214844, + "loss": 0.5887, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.334157705307007, + "rewards/margins": 9.114715576171875, + "rewards/rejected": -5.7805585861206055, + "step": 17436 + }, + { + "epoch": 4.36, + "grad_norm": 6.773096561431885, + "learning_rate": 3.9574578287491286e-07, + "logits/chosen": -0.511820912361145, + "logits/rejected": -0.5704964399337769, + "logps/chosen": -61.46098709106445, + "logps/rejected": -109.9822769165039, + "loss": 0.6505, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.043370246887207, + "rewards/margins": 7.301209926605225, + "rewards/rejected": -4.257840156555176, + "step": 17437 + }, + { + "epoch": 4.36, + "grad_norm": 2.1482255458831787, + "learning_rate": 3.9543937137094976e-07, + "logits/chosen": -0.5316619873046875, + "logits/rejected": -0.6322011351585388, + "logps/chosen": -43.87543487548828, + "logps/rejected": -107.14461517333984, + "loss": 0.4888, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4491913318634033, + "rewards/margins": 8.086031913757324, + "rewards/rejected": -4.6368408203125, + "step": 17438 + }, + { + "epoch": 4.36, + "grad_norm": 4.133804798126221, + "learning_rate": 3.95133073650576e-07, + "logits/chosen": -0.5252118110656738, + "logits/rejected": -0.635826826095581, + "logps/chosen": -51.39840316772461, + "logps/rejected": -114.81025695800781, + "loss": 0.5233, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.243358850479126, + "rewards/margins": 8.466203689575195, + "rewards/rejected": -5.22284460067749, + "step": 17439 + }, + { + "epoch": 4.36, + "grad_norm": 13.828981399536133, + "learning_rate": 3.948268897213625e-07, + "logits/chosen": -0.5368784666061401, + "logits/rejected": -0.6192920804023743, + "logps/chosen": -45.98265838623047, + "logps/rejected": -108.6564712524414, + "loss": 0.5786, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.386373996734619, + "rewards/margins": 7.679747581481934, + "rewards/rejected": -4.2933735847473145, + "step": 17440 + }, + { + "epoch": 4.36, + "grad_norm": 3.227604627609253, + "learning_rate": 3.945208195908762e-07, + "logits/chosen": -0.5865554809570312, + "logits/rejected": -0.617533802986145, + "logps/chosen": -47.961517333984375, + "logps/rejected": -108.4263916015625, + "loss": 0.589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.33396577835083, + "rewards/margins": 6.931682109832764, + "rewards/rejected": -3.5977160930633545, + "step": 17441 + }, + { + "epoch": 4.36, + "grad_norm": 4.2272748947143555, + "learning_rate": 3.9421486326667704e-07, + "logits/chosen": -0.5157915353775024, + "logits/rejected": -0.5937690138816833, + "logps/chosen": -64.1449203491211, + "logps/rejected": -117.74873352050781, + "loss": 0.6211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.84451961517334, + "rewards/margins": 7.207157611846924, + "rewards/rejected": -4.362637519836426, + "step": 17442 + }, + { + "epoch": 4.36, + "grad_norm": 4.930607795715332, + "learning_rate": 3.9390902075632785e-07, + "logits/chosen": -0.5002292394638062, + "logits/rejected": -0.5944674015045166, + "logps/chosen": -66.14889526367188, + "logps/rejected": -103.2688217163086, + "loss": 0.7154, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.321521759033203, + "rewards/margins": 7.050266742706299, + "rewards/rejected": -3.728745460510254, + "step": 17443 + }, + { + "epoch": 4.36, + "grad_norm": 2.815194845199585, + "learning_rate": 3.9360329206738826e-07, + "logits/chosen": -0.6350964903831482, + "logits/rejected": -0.7262897491455078, + "logps/chosen": -45.8572883605957, + "logps/rejected": -103.31050109863281, + "loss": 0.6012, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1276140213012695, + "rewards/margins": 7.291138172149658, + "rewards/rejected": -4.163524150848389, + "step": 17444 + }, + { + "epoch": 4.36, + "grad_norm": 10.373539924621582, + "learning_rate": 3.932976772074087e-07, + "logits/chosen": -0.5606952905654907, + "logits/rejected": -0.6474369168281555, + "logps/chosen": -65.51495361328125, + "logps/rejected": -119.6623764038086, + "loss": 0.6886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1180579662323, + "rewards/margins": 7.652460098266602, + "rewards/rejected": -4.534402370452881, + "step": 17445 + }, + { + "epoch": 4.36, + "grad_norm": 2.7734758853912354, + "learning_rate": 3.9299217618394335e-07, + "logits/chosen": -0.4852132201194763, + "logits/rejected": -0.5767200589179993, + "logps/chosen": -67.9986801147461, + "logps/rejected": -92.83836364746094, + "loss": 0.6729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1923716068267822, + "rewards/margins": 7.07663631439209, + "rewards/rejected": -3.8842647075653076, + "step": 17446 + }, + { + "epoch": 4.36, + "grad_norm": 5.071746349334717, + "learning_rate": 3.926867890045427e-07, + "logits/chosen": -0.5468161106109619, + "logits/rejected": -0.611615002155304, + "logps/chosen": -53.47538375854492, + "logps/rejected": -118.88922119140625, + "loss": 0.5485, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.240459442138672, + "rewards/margins": 7.85181999206543, + "rewards/rejected": -4.611360549926758, + "step": 17447 + }, + { + "epoch": 4.36, + "grad_norm": 3.5010488033294678, + "learning_rate": 3.9238151567675186e-07, + "logits/chosen": -0.5531227588653564, + "logits/rejected": -0.6982913017272949, + "logps/chosen": -68.39171600341797, + "logps/rejected": -107.5845718383789, + "loss": 0.6618, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7042112350463867, + "rewards/margins": 7.875616073608398, + "rewards/rejected": -5.171404838562012, + "step": 17448 + }, + { + "epoch": 4.37, + "grad_norm": 3.1956539154052734, + "learning_rate": 3.920763562081137e-07, + "logits/chosen": -0.5762094259262085, + "logits/rejected": -0.6454971432685852, + "logps/chosen": -53.3316535949707, + "logps/rejected": -116.73810577392578, + "loss": 0.5698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9558284282684326, + "rewards/margins": 8.388113021850586, + "rewards/rejected": -5.432283401489258, + "step": 17449 + }, + { + "epoch": 4.37, + "grad_norm": 5.785974502563477, + "learning_rate": 3.9177131060617104e-07, + "logits/chosen": -0.5454704761505127, + "logits/rejected": -0.6114579439163208, + "logps/chosen": -48.161094665527344, + "logps/rejected": -98.11602783203125, + "loss": 0.6045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089092969894409, + "rewards/margins": 6.071717262268066, + "rewards/rejected": -2.9826247692108154, + "step": 17450 + }, + { + "epoch": 4.37, + "grad_norm": 5.693896770477295, + "learning_rate": 3.914663788784606e-07, + "logits/chosen": -0.4851375222206116, + "logits/rejected": -0.5423533916473389, + "logps/chosen": -61.255104064941406, + "logps/rejected": -120.99578094482422, + "loss": 0.7149, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.345008373260498, + "rewards/margins": 6.209395885467529, + "rewards/rejected": -2.864386796951294, + "step": 17451 + }, + { + "epoch": 4.37, + "grad_norm": 3.8937559127807617, + "learning_rate": 3.9116156103251703e-07, + "logits/chosen": -0.5208678841590881, + "logits/rejected": -0.5873076915740967, + "logps/chosen": -50.956974029541016, + "logps/rejected": -92.48353576660156, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.352905035018921, + "rewards/margins": 6.478011608123779, + "rewards/rejected": -3.1251065731048584, + "step": 17452 + }, + { + "epoch": 4.37, + "grad_norm": 7.970343112945557, + "learning_rate": 3.9085685707587417e-07, + "logits/chosen": -0.4908091127872467, + "logits/rejected": -0.5630292296409607, + "logps/chosen": -54.73347854614258, + "logps/rejected": -124.414306640625, + "loss": 0.5649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0264780521392822, + "rewards/margins": 7.696882724761963, + "rewards/rejected": -4.670404434204102, + "step": 17453 + }, + { + "epoch": 4.37, + "grad_norm": 4.64161491394043, + "learning_rate": 3.905522670160605e-07, + "logits/chosen": -0.5537841320037842, + "logits/rejected": -0.6402103304862976, + "logps/chosen": -59.737648010253906, + "logps/rejected": -111.32205200195312, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9861361980438232, + "rewards/margins": 7.827147483825684, + "rewards/rejected": -4.841011047363281, + "step": 17454 + }, + { + "epoch": 4.37, + "grad_norm": 6.391563892364502, + "learning_rate": 3.902477908606034e-07, + "logits/chosen": -0.5852270722389221, + "logits/rejected": -0.6672501564025879, + "logps/chosen": -51.95121765136719, + "logps/rejected": -101.91644287109375, + "loss": 0.5807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0478744506835938, + "rewards/margins": 7.028604984283447, + "rewards/rejected": -3.9807302951812744, + "step": 17455 + }, + { + "epoch": 4.37, + "grad_norm": 7.12170934677124, + "learning_rate": 3.899434286170273e-07, + "logits/chosen": -0.48685842752456665, + "logits/rejected": -0.5404148101806641, + "logps/chosen": -56.971092224121094, + "logps/rejected": -101.02007293701172, + "loss": 0.6581, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.983503580093384, + "rewards/margins": 5.601423263549805, + "rewards/rejected": -2.617919445037842, + "step": 17456 + }, + { + "epoch": 4.37, + "grad_norm": 2.7350919246673584, + "learning_rate": 3.896391802928512e-07, + "logits/chosen": -0.5522365570068359, + "logits/rejected": -0.6404942274093628, + "logps/chosen": -49.72634506225586, + "logps/rejected": -110.74269104003906, + "loss": 0.545, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2069926261901855, + "rewards/margins": 8.443857192993164, + "rewards/rejected": -5.23686408996582, + "step": 17457 + }, + { + "epoch": 4.37, + "grad_norm": 3.263019561767578, + "learning_rate": 3.8933504589559577e-07, + "logits/chosen": -0.5754590630531311, + "logits/rejected": -0.6351956725120544, + "logps/chosen": -56.54288101196289, + "logps/rejected": -115.42881774902344, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.23870587348938, + "rewards/margins": 7.338253021240234, + "rewards/rejected": -4.099546432495117, + "step": 17458 + }, + { + "epoch": 4.37, + "grad_norm": 2.4435136318206787, + "learning_rate": 3.8903102543277507e-07, + "logits/chosen": -0.4987552762031555, + "logits/rejected": -0.5900626182556152, + "logps/chosen": -62.53742218017578, + "logps/rejected": -111.2080078125, + "loss": 0.58, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.248623847961426, + "rewards/margins": 7.888950824737549, + "rewards/rejected": -4.640327453613281, + "step": 17459 + }, + { + "epoch": 4.37, + "grad_norm": 5.47802734375, + "learning_rate": 3.8872711891190184e-07, + "logits/chosen": -0.5261677503585815, + "logits/rejected": -0.6116434335708618, + "logps/chosen": -63.78595733642578, + "logps/rejected": -93.03218841552734, + "loss": 0.7372, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.871858596801758, + "rewards/margins": 6.217894554138184, + "rewards/rejected": -3.346036672592163, + "step": 17460 + }, + { + "epoch": 4.37, + "grad_norm": 6.840686798095703, + "learning_rate": 3.884233263404857e-07, + "logits/chosen": -0.6253925561904907, + "logits/rejected": -0.6969190835952759, + "logps/chosen": -41.00141143798828, + "logps/rejected": -131.5849609375, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.959177255630493, + "rewards/margins": 9.227740287780762, + "rewards/rejected": -6.268563747406006, + "step": 17461 + }, + { + "epoch": 4.37, + "grad_norm": 3.1726224422454834, + "learning_rate": 3.881196477260357e-07, + "logits/chosen": -0.5212802290916443, + "logits/rejected": -0.6419789791107178, + "logps/chosen": -50.86954879760742, + "logps/rejected": -107.51368713378906, + "loss": 0.5296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4617233276367188, + "rewards/margins": 8.20314884185791, + "rewards/rejected": -4.74142599105835, + "step": 17462 + }, + { + "epoch": 4.37, + "grad_norm": 5.81286096572876, + "learning_rate": 3.878160830760541e-07, + "logits/chosen": -0.5343369841575623, + "logits/rejected": -0.6110799312591553, + "logps/chosen": -58.77622604370117, + "logps/rejected": -100.94509887695312, + "loss": 0.5908, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7823572158813477, + "rewards/margins": 6.2019171714782715, + "rewards/rejected": -3.419560432434082, + "step": 17463 + }, + { + "epoch": 4.37, + "grad_norm": 5.550711154937744, + "learning_rate": 3.875126323980427e-07, + "logits/chosen": -0.5248483419418335, + "logits/rejected": -0.5938154458999634, + "logps/chosen": -56.03515625, + "logps/rejected": -121.1413345336914, + "loss": 0.6835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9077136516571045, + "rewards/margins": 7.278562545776367, + "rewards/rejected": -4.370848178863525, + "step": 17464 + }, + { + "epoch": 4.37, + "grad_norm": 7.295332431793213, + "learning_rate": 3.872092956995005e-07, + "logits/chosen": -0.5669339895248413, + "logits/rejected": -0.6626901626586914, + "logps/chosen": -45.58527374267578, + "logps/rejected": -105.81531524658203, + "loss": 0.7211, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.044938087463379, + "rewards/margins": 7.574854850769043, + "rewards/rejected": -4.529916763305664, + "step": 17465 + }, + { + "epoch": 4.37, + "grad_norm": 4.736901760101318, + "learning_rate": 3.869060729879232e-07, + "logits/chosen": -0.485703706741333, + "logits/rejected": -0.5849009156227112, + "logps/chosen": -62.79790496826172, + "logps/rejected": -94.77017974853516, + "loss": 0.5938, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.867455005645752, + "rewards/margins": 6.8075456619262695, + "rewards/rejected": -3.9400901794433594, + "step": 17466 + }, + { + "epoch": 4.37, + "grad_norm": 4.3122382164001465, + "learning_rate": 3.8660296427080245e-07, + "logits/chosen": -0.5798114538192749, + "logits/rejected": -0.6529220938682556, + "logps/chosen": -47.34965896606445, + "logps/rejected": -102.03880310058594, + "loss": 0.5394, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930650234222412, + "rewards/margins": 6.775687217712402, + "rewards/rejected": -3.8450372219085693, + "step": 17467 + }, + { + "epoch": 4.37, + "grad_norm": 4.0650200843811035, + "learning_rate": 3.862999695556302e-07, + "logits/chosen": -0.4780430197715759, + "logits/rejected": -0.5672738552093506, + "logps/chosen": -57.05714416503906, + "logps/rejected": -124.2066650390625, + "loss": 0.5437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.173457145690918, + "rewards/margins": 8.352521896362305, + "rewards/rejected": -5.179064750671387, + "step": 17468 + }, + { + "epoch": 4.37, + "grad_norm": 4.3098368644714355, + "learning_rate": 3.859970888498937e-07, + "logits/chosen": -0.4882865846157074, + "logits/rejected": -0.5650547742843628, + "logps/chosen": -55.8546142578125, + "logps/rejected": -93.0168228149414, + "loss": 0.6678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3355565071105957, + "rewards/margins": 6.882380485534668, + "rewards/rejected": -3.5468244552612305, + "step": 17469 + }, + { + "epoch": 4.37, + "grad_norm": 3.7952165603637695, + "learning_rate": 3.856943221610754e-07, + "logits/chosen": -0.5265051126480103, + "logits/rejected": -0.5996102094650269, + "logps/chosen": -57.76344299316406, + "logps/rejected": -104.49443054199219, + "loss": 0.6214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.183971643447876, + "rewards/margins": 7.880487442016602, + "rewards/rejected": -4.696516513824463, + "step": 17470 + }, + { + "epoch": 4.37, + "grad_norm": 4.484805583953857, + "learning_rate": 3.853916694966586e-07, + "logits/chosen": -0.5771360993385315, + "logits/rejected": -0.6495600342750549, + "logps/chosen": -62.99042892456055, + "logps/rejected": -101.78844451904297, + "loss": 0.6138, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4257779121398926, + "rewards/margins": 6.461873531341553, + "rewards/rejected": -3.036095142364502, + "step": 17471 + }, + { + "epoch": 4.37, + "grad_norm": 4.95857048034668, + "learning_rate": 3.8508913086412304e-07, + "logits/chosen": -0.52873694896698, + "logits/rejected": -0.6000060439109802, + "logps/chosen": -50.25457763671875, + "logps/rejected": -115.69054412841797, + "loss": 0.6809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9937210083007812, + "rewards/margins": 7.273927688598633, + "rewards/rejected": -4.280207633972168, + "step": 17472 + }, + { + "epoch": 4.37, + "grad_norm": 3.743875741958618, + "learning_rate": 3.847867062709432e-07, + "logits/chosen": -0.5844632983207703, + "logits/rejected": -0.6224600672721863, + "logps/chosen": -48.170169830322266, + "logps/rejected": -110.77700805664062, + "loss": 0.5925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.391298294067383, + "rewards/margins": 6.821924686431885, + "rewards/rejected": -3.430626392364502, + "step": 17473 + }, + { + "epoch": 4.37, + "grad_norm": 5.197054386138916, + "learning_rate": 3.8448439572459197e-07, + "logits/chosen": -0.5958114266395569, + "logits/rejected": -0.6464195251464844, + "logps/chosen": -50.21413803100586, + "logps/rejected": -119.95321655273438, + "loss": 0.6021, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.895742416381836, + "rewards/margins": 7.248554229736328, + "rewards/rejected": -4.352811813354492, + "step": 17474 + }, + { + "epoch": 4.37, + "grad_norm": 2.593730926513672, + "learning_rate": 3.8418219923254175e-07, + "logits/chosen": -0.5655496716499329, + "logits/rejected": -0.6564552783966064, + "logps/chosen": -53.53086853027344, + "logps/rejected": -122.76025390625, + "loss": 0.5283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.982644557952881, + "rewards/margins": 8.556771278381348, + "rewards/rejected": -5.574126243591309, + "step": 17475 + }, + { + "epoch": 4.37, + "grad_norm": 4.001918315887451, + "learning_rate": 3.8388011680225933e-07, + "logits/chosen": -0.5239067077636719, + "logits/rejected": -0.5666664242744446, + "logps/chosen": -53.20853042602539, + "logps/rejected": -113.9384765625, + "loss": 0.6093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121702194213867, + "rewards/margins": 8.046979904174805, + "rewards/rejected": -4.9252777099609375, + "step": 17476 + }, + { + "epoch": 4.37, + "grad_norm": 5.404750823974609, + "learning_rate": 3.8357814844120766e-07, + "logits/chosen": -0.45887625217437744, + "logits/rejected": -0.5344583988189697, + "logps/chosen": -62.4150390625, + "logps/rejected": -112.61439514160156, + "loss": 0.6362, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2862439155578613, + "rewards/margins": 7.7409868240356445, + "rewards/rejected": -4.454743385314941, + "step": 17477 + }, + { + "epoch": 4.37, + "grad_norm": 9.619250297546387, + "learning_rate": 3.832762941568513e-07, + "logits/chosen": -0.5820303559303284, + "logits/rejected": -0.6587507724761963, + "logps/chosen": -56.09049987792969, + "logps/rejected": -120.42835998535156, + "loss": 0.6142, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.313455581665039, + "rewards/margins": 7.888122081756592, + "rewards/rejected": -4.574666976928711, + "step": 17478 + }, + { + "epoch": 4.37, + "grad_norm": 2.7376668453216553, + "learning_rate": 3.8297455395664763e-07, + "logits/chosen": -0.5669075846672058, + "logits/rejected": -0.6202658414840698, + "logps/chosen": -48.03143310546875, + "logps/rejected": -103.14908599853516, + "loss": 0.583, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0066046714782715, + "rewards/margins": 6.700089454650879, + "rewards/rejected": -3.6934847831726074, + "step": 17479 + }, + { + "epoch": 4.37, + "grad_norm": 9.564698219299316, + "learning_rate": 3.82672927848054e-07, + "logits/chosen": -0.5400429368019104, + "logits/rejected": -0.628656804561615, + "logps/chosen": -59.59947967529297, + "logps/rejected": -92.40586853027344, + "loss": 0.6464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.28468656539917, + "rewards/margins": 5.916760444641113, + "rewards/rejected": -2.6320741176605225, + "step": 17480 + }, + { + "epoch": 4.37, + "grad_norm": 3.394551992416382, + "learning_rate": 3.8237141583852387e-07, + "logits/chosen": -0.4695320427417755, + "logits/rejected": -0.5497197508811951, + "logps/chosen": -53.91096878051758, + "logps/rejected": -107.07512664794922, + "loss": 0.6009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4700071811676025, + "rewards/margins": 6.632308483123779, + "rewards/rejected": -3.1623013019561768, + "step": 17481 + }, + { + "epoch": 4.37, + "grad_norm": 9.248952865600586, + "learning_rate": 3.820700179355064e-07, + "logits/chosen": -0.5087959170341492, + "logits/rejected": -0.5980042219161987, + "logps/chosen": -57.538978576660156, + "logps/rejected": -93.72914123535156, + "loss": 0.6858, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9717984199523926, + "rewards/margins": 6.226540565490723, + "rewards/rejected": -3.25474214553833, + "step": 17482 + }, + { + "epoch": 4.37, + "grad_norm": 4.863162994384766, + "learning_rate": 3.8176873414645154e-07, + "logits/chosen": -0.5115019083023071, + "logits/rejected": -0.5590627193450928, + "logps/chosen": -52.61289978027344, + "logps/rejected": -106.49723815917969, + "loss": 0.5858, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2610156536102295, + "rewards/margins": 6.11306095123291, + "rewards/rejected": -2.8520455360412598, + "step": 17483 + }, + { + "epoch": 4.37, + "grad_norm": 2.710736036300659, + "learning_rate": 3.8146756447880405e-07, + "logits/chosen": -0.581555962562561, + "logits/rejected": -0.6693035364151001, + "logps/chosen": -58.864593505859375, + "logps/rejected": -102.03025817871094, + "loss": 0.5926, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1057071685791016, + "rewards/margins": 6.997501373291016, + "rewards/rejected": -3.891794443130493, + "step": 17484 + }, + { + "epoch": 4.37, + "grad_norm": 4.900416374206543, + "learning_rate": 3.811665089400041e-07, + "logits/chosen": -0.5598586201667786, + "logits/rejected": -0.6594703197479248, + "logps/chosen": -47.920936584472656, + "logps/rejected": -87.25346374511719, + "loss": 0.6143, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.291154146194458, + "rewards/margins": 5.891547203063965, + "rewards/rejected": -2.6003928184509277, + "step": 17485 + }, + { + "epoch": 4.37, + "grad_norm": 3.3563413619995117, + "learning_rate": 3.808655675374928e-07, + "logits/chosen": -0.6086421012878418, + "logits/rejected": -0.6954019665718079, + "logps/chosen": -43.58257293701172, + "logps/rejected": -93.99495697021484, + "loss": 0.5704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4374284744262695, + "rewards/margins": 7.497663497924805, + "rewards/rejected": -4.060235023498535, + "step": 17486 + }, + { + "epoch": 4.37, + "grad_norm": 11.786799430847168, + "learning_rate": 3.805647402787077e-07, + "logits/chosen": -0.5612389445304871, + "logits/rejected": -0.6690143942832947, + "logps/chosen": -67.40571594238281, + "logps/rejected": -109.22608184814453, + "loss": 0.7083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2937848567962646, + "rewards/margins": 7.869282245635986, + "rewards/rejected": -4.575497627258301, + "step": 17487 + }, + { + "epoch": 4.37, + "grad_norm": 19.911142349243164, + "learning_rate": 3.8026402717108e-07, + "logits/chosen": -0.5947837829589844, + "logits/rejected": -0.7089915871620178, + "logps/chosen": -59.31208038330078, + "logps/rejected": -93.73478698730469, + "loss": 0.6202, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1184725761413574, + "rewards/margins": 6.7052483558654785, + "rewards/rejected": -3.586775779724121, + "step": 17488 + }, + { + "epoch": 4.38, + "grad_norm": 5.674837589263916, + "learning_rate": 3.7996342822204213e-07, + "logits/chosen": -0.5334665179252625, + "logits/rejected": -0.6314476728439331, + "logps/chosen": -56.12775802612305, + "logps/rejected": -100.73391723632812, + "loss": 0.6482, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8596084117889404, + "rewards/margins": 6.987086772918701, + "rewards/rejected": -4.12747859954834, + "step": 17489 + }, + { + "epoch": 4.38, + "grad_norm": 9.112363815307617, + "learning_rate": 3.796629434390231e-07, + "logits/chosen": -0.6090880632400513, + "logits/rejected": -0.6691564321517944, + "logps/chosen": -47.89183044433594, + "logps/rejected": -103.42676544189453, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.258423089981079, + "rewards/margins": 6.764792442321777, + "rewards/rejected": -3.5063700675964355, + "step": 17490 + }, + { + "epoch": 4.38, + "grad_norm": 6.64722204208374, + "learning_rate": 3.793625728294459e-07, + "logits/chosen": -0.5735610723495483, + "logits/rejected": -0.6568375825881958, + "logps/chosen": -51.20687484741211, + "logps/rejected": -94.43353271484375, + "loss": 0.6644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.96728515625, + "rewards/margins": 6.588029384613037, + "rewards/rejected": -3.620743751525879, + "step": 17491 + }, + { + "epoch": 4.38, + "grad_norm": 5.625317096710205, + "learning_rate": 3.7906231640073456e-07, + "logits/chosen": -0.6130223274230957, + "logits/rejected": -0.6927920579910278, + "logps/chosen": -65.82893371582031, + "logps/rejected": -129.33847045898438, + "loss": 0.6997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.80419921875, + "rewards/margins": 8.913766860961914, + "rewards/rejected": -6.109567165374756, + "step": 17492 + }, + { + "epoch": 4.38, + "grad_norm": 3.319617986679077, + "learning_rate": 3.787621741603087e-07, + "logits/chosen": -0.5204156041145325, + "logits/rejected": -0.6174752712249756, + "logps/chosen": -57.78846740722656, + "logps/rejected": -103.02163696289062, + "loss": 0.6156, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0543627738952637, + "rewards/margins": 7.326403617858887, + "rewards/rejected": -4.272041320800781, + "step": 17493 + }, + { + "epoch": 4.38, + "grad_norm": 10.594884872436523, + "learning_rate": 3.784621461155852e-07, + "logits/chosen": -0.5731611847877502, + "logits/rejected": -0.6180888414382935, + "logps/chosen": -59.85787582397461, + "logps/rejected": -110.20478820800781, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0963799953460693, + "rewards/margins": 7.938235282897949, + "rewards/rejected": -4.841855525970459, + "step": 17494 + }, + { + "epoch": 4.38, + "grad_norm": 9.184672355651855, + "learning_rate": 3.78162232273977e-07, + "logits/chosen": -0.6108947992324829, + "logits/rejected": -0.652970552444458, + "logps/chosen": -56.83730697631836, + "logps/rejected": -124.27716064453125, + "loss": 0.6574, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.906456232070923, + "rewards/margins": 6.746622562408447, + "rewards/rejected": -3.8401663303375244, + "step": 17495 + }, + { + "epoch": 4.38, + "grad_norm": 2.5314624309539795, + "learning_rate": 3.778624326428965e-07, + "logits/chosen": -0.4562212824821472, + "logits/rejected": -0.5945409536361694, + "logps/chosen": -57.00083541870117, + "logps/rejected": -133.43898010253906, + "loss": 0.5294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.127375364303589, + "rewards/margins": 9.793174743652344, + "rewards/rejected": -6.665800094604492, + "step": 17496 + }, + { + "epoch": 4.38, + "grad_norm": 3.972360134124756, + "learning_rate": 3.775627472297511e-07, + "logits/chosen": -0.5267915725708008, + "logits/rejected": -0.635643720626831, + "logps/chosen": -55.14318084716797, + "logps/rejected": -108.19070434570312, + "loss": 0.5573, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9277944564819336, + "rewards/margins": 8.27362060546875, + "rewards/rejected": -5.345826625823975, + "step": 17497 + }, + { + "epoch": 4.38, + "grad_norm": 3.860689163208008, + "learning_rate": 3.772631760419476e-07, + "logits/chosen": -0.5288175940513611, + "logits/rejected": -0.6201167106628418, + "logps/chosen": -52.48671340942383, + "logps/rejected": -122.27106475830078, + "loss": 0.5299, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3681421279907227, + "rewards/margins": 8.493324279785156, + "rewards/rejected": -5.125181198120117, + "step": 17498 + }, + { + "epoch": 4.38, + "grad_norm": 4.57118558883667, + "learning_rate": 3.76963719086888e-07, + "logits/chosen": -0.5767651796340942, + "logits/rejected": -0.697568953037262, + "logps/chosen": -62.98524475097656, + "logps/rejected": -124.04061126708984, + "loss": 0.644, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.014188766479492, + "rewards/margins": 9.207629203796387, + "rewards/rejected": -6.193439483642578, + "step": 17499 + }, + { + "epoch": 4.38, + "grad_norm": 20.24466896057129, + "learning_rate": 3.7666437637197127e-07, + "logits/chosen": -0.5324928164482117, + "logits/rejected": -0.5969898104667664, + "logps/chosen": -55.40925598144531, + "logps/rejected": -102.31803131103516, + "loss": 0.6488, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.041015386581421, + "rewards/margins": 6.40101957321167, + "rewards/rejected": -3.360003709793091, + "step": 17500 + }, + { + "epoch": 4.38, + "grad_norm": 4.076535224914551, + "learning_rate": 3.76365147904596e-07, + "logits/chosen": -0.5434759855270386, + "logits/rejected": -0.5834670662879944, + "logps/chosen": -55.99562454223633, + "logps/rejected": -115.75755310058594, + "loss": 0.6921, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4456496238708496, + "rewards/margins": 7.7964372634887695, + "rewards/rejected": -4.35078763961792, + "step": 17501 + }, + { + "epoch": 4.38, + "grad_norm": 4.076896667480469, + "learning_rate": 3.760660336921562e-07, + "logits/chosen": -0.5087165236473083, + "logits/rejected": -0.609869658946991, + "logps/chosen": -58.996673583984375, + "logps/rejected": -108.7518539428711, + "loss": 0.5815, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0199978351593018, + "rewards/margins": 7.063858509063721, + "rewards/rejected": -4.04386043548584, + "step": 17502 + }, + { + "epoch": 4.38, + "grad_norm": 2.2019193172454834, + "learning_rate": 3.757670337420422e-07, + "logits/chosen": -0.4311535358428955, + "logits/rejected": -0.5580706596374512, + "logps/chosen": -62.773353576660156, + "logps/rejected": -98.66218566894531, + "loss": 0.5427, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3700742721557617, + "rewards/margins": 7.8928680419921875, + "rewards/rejected": -4.522792816162109, + "step": 17503 + }, + { + "epoch": 4.38, + "grad_norm": 3.6265499591827393, + "learning_rate": 3.754681480616429e-07, + "logits/chosen": -0.5376107096672058, + "logits/rejected": -0.6129706501960754, + "logps/chosen": -53.95191955566406, + "logps/rejected": -97.69256591796875, + "loss": 0.6283, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.071751117706299, + "rewards/margins": 6.302801132202148, + "rewards/rejected": -3.2310502529144287, + "step": 17504 + }, + { + "epoch": 4.38, + "grad_norm": 7.368412971496582, + "learning_rate": 3.7516937665834583e-07, + "logits/chosen": -0.5566762685775757, + "logits/rejected": -0.6063719987869263, + "logps/chosen": -63.7229118347168, + "logps/rejected": -109.43209075927734, + "loss": 0.6792, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.143505096435547, + "rewards/margins": 6.0122833251953125, + "rewards/rejected": -2.8687782287597656, + "step": 17505 + }, + { + "epoch": 4.38, + "grad_norm": 5.8778557777404785, + "learning_rate": 3.7487071953953234e-07, + "logits/chosen": -0.5721245408058167, + "logits/rejected": -0.6421616077423096, + "logps/chosen": -55.12889099121094, + "logps/rejected": -113.26699829101562, + "loss": 0.7547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.207141160964966, + "rewards/margins": 7.570807933807373, + "rewards/rejected": -4.36366605758667, + "step": 17506 + }, + { + "epoch": 4.38, + "grad_norm": 3.5528924465179443, + "learning_rate": 3.7457217671258203e-07, + "logits/chosen": -0.5348526239395142, + "logits/rejected": -0.6102919578552246, + "logps/chosen": -61.557525634765625, + "logps/rejected": -103.1302719116211, + "loss": 0.6733, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0711445808410645, + "rewards/margins": 6.96445369720459, + "rewards/rejected": -3.8933091163635254, + "step": 17507 + }, + { + "epoch": 4.38, + "grad_norm": 6.262067794799805, + "learning_rate": 3.7427374818487407e-07, + "logits/chosen": -0.5055049061775208, + "logits/rejected": -0.6301539540290833, + "logps/chosen": -48.382320404052734, + "logps/rejected": -97.57437133789062, + "loss": 0.5485, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1519408226013184, + "rewards/margins": 7.265172004699707, + "rewards/rejected": -4.1132307052612305, + "step": 17508 + }, + { + "epoch": 4.38, + "grad_norm": 4.431318283081055, + "learning_rate": 3.739754339637819e-07, + "logits/chosen": -0.5933426022529602, + "logits/rejected": -0.6570591926574707, + "logps/chosen": -59.084991455078125, + "logps/rejected": -104.75129699707031, + "loss": 0.6818, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.230955123901367, + "rewards/margins": 7.277660369873047, + "rewards/rejected": -4.04670524597168, + "step": 17509 + }, + { + "epoch": 4.38, + "grad_norm": 3.120302200317383, + "learning_rate": 3.736772340566763e-07, + "logits/chosen": -0.5749472379684448, + "logits/rejected": -0.6634852886199951, + "logps/chosen": -71.96585845947266, + "logps/rejected": -109.3431625366211, + "loss": 0.7099, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.023057222366333, + "rewards/margins": 7.792573928833008, + "rewards/rejected": -4.769516944885254, + "step": 17510 + }, + { + "epoch": 4.38, + "grad_norm": 5.061879634857178, + "learning_rate": 3.7337914847092814e-07, + "logits/chosen": -0.5426056385040283, + "logits/rejected": -0.6126573085784912, + "logps/chosen": -54.473731994628906, + "logps/rejected": -115.13397979736328, + "loss": 0.6353, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3357439041137695, + "rewards/margins": 7.3264923095703125, + "rewards/rejected": -3.990748167037964, + "step": 17511 + }, + { + "epoch": 4.38, + "grad_norm": 4.72787618637085, + "learning_rate": 3.73081177213902e-07, + "logits/chosen": -0.4825834035873413, + "logits/rejected": -0.5818386673927307, + "logps/chosen": -53.96832275390625, + "logps/rejected": -102.55372619628906, + "loss": 0.5494, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8289384841918945, + "rewards/margins": 7.985810279846191, + "rewards/rejected": -4.156871795654297, + "step": 17512 + }, + { + "epoch": 4.38, + "grad_norm": 12.141244888305664, + "learning_rate": 3.7278332029296096e-07, + "logits/chosen": -0.5902038812637329, + "logits/rejected": -0.6567558646202087, + "logps/chosen": -53.20671463012695, + "logps/rejected": -92.92819213867188, + "loss": 0.5995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.219998359680176, + "rewards/margins": 6.854863166809082, + "rewards/rejected": -3.6348648071289062, + "step": 17513 + }, + { + "epoch": 4.38, + "grad_norm": 6.837019443511963, + "learning_rate": 3.7248557771546625e-07, + "logits/chosen": -0.5206679105758667, + "logits/rejected": -0.6519948244094849, + "logps/chosen": -72.03412628173828, + "logps/rejected": -98.68236541748047, + "loss": 0.6674, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.884160280227661, + "rewards/margins": 7.033055782318115, + "rewards/rejected": -4.148895263671875, + "step": 17514 + }, + { + "epoch": 4.38, + "grad_norm": 6.9663987159729, + "learning_rate": 3.721879494887748e-07, + "logits/chosen": -0.5241830348968506, + "logits/rejected": -0.5747730731964111, + "logps/chosen": -49.74888610839844, + "logps/rejected": -113.39398193359375, + "loss": 0.5752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.987708568572998, + "rewards/margins": 7.320754528045654, + "rewards/rejected": -4.333045959472656, + "step": 17515 + }, + { + "epoch": 4.38, + "grad_norm": 9.009521484375, + "learning_rate": 3.7189043562024127e-07, + "logits/chosen": -0.5645270347595215, + "logits/rejected": -0.6378028392791748, + "logps/chosen": -57.50962829589844, + "logps/rejected": -111.90565490722656, + "loss": 0.7616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.999523878097534, + "rewards/margins": 7.72230339050293, + "rewards/rejected": -4.722779750823975, + "step": 17516 + }, + { + "epoch": 4.38, + "grad_norm": 6.723373889923096, + "learning_rate": 3.715930361172171e-07, + "logits/chosen": -0.5186675786972046, + "logits/rejected": -0.6031931042671204, + "logps/chosen": -70.76734924316406, + "logps/rejected": -134.58419799804688, + "loss": 0.6973, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6983144283294678, + "rewards/margins": 7.65838098526001, + "rewards/rejected": -4.960066795349121, + "step": 17517 + }, + { + "epoch": 4.38, + "grad_norm": 4.514462947845459, + "learning_rate": 3.7129575098705293e-07, + "logits/chosen": -0.5682833790779114, + "logits/rejected": -0.6766020059585571, + "logps/chosen": -42.75029754638672, + "logps/rejected": -102.44364929199219, + "loss": 0.5703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.954315662384033, + "rewards/margins": 7.356758117675781, + "rewards/rejected": -4.402442455291748, + "step": 17518 + }, + { + "epoch": 4.38, + "grad_norm": 4.886958122253418, + "learning_rate": 3.709985802370947e-07, + "logits/chosen": -0.5457640886306763, + "logits/rejected": -0.6064081788063049, + "logps/chosen": -57.569854736328125, + "logps/rejected": -101.80806732177734, + "loss": 0.6538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7465505599975586, + "rewards/margins": 6.475216388702393, + "rewards/rejected": -3.728665351867676, + "step": 17519 + }, + { + "epoch": 4.38, + "grad_norm": 12.644996643066406, + "learning_rate": 3.707015238746836e-07, + "logits/chosen": -0.6099129915237427, + "logits/rejected": -0.6431276202201843, + "logps/chosen": -48.026405334472656, + "logps/rejected": -98.27071380615234, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0792434215545654, + "rewards/margins": 5.840509414672852, + "rewards/rejected": -2.761265754699707, + "step": 17520 + }, + { + "epoch": 4.38, + "grad_norm": 4.046334743499756, + "learning_rate": 3.704045819071628e-07, + "logits/chosen": -0.5534167885780334, + "logits/rejected": -0.5991389751434326, + "logps/chosen": -64.337646484375, + "logps/rejected": -122.16979217529297, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9894847869873047, + "rewards/margins": 7.225186824798584, + "rewards/rejected": -4.2357025146484375, + "step": 17521 + }, + { + "epoch": 4.38, + "grad_norm": 5.860609531402588, + "learning_rate": 3.701077543418685e-07, + "logits/chosen": -0.5333473086357117, + "logits/rejected": -0.6615293622016907, + "logps/chosen": -49.54444122314453, + "logps/rejected": -107.9292984008789, + "loss": 0.6071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9196524620056152, + "rewards/margins": 7.821559906005859, + "rewards/rejected": -4.901907920837402, + "step": 17522 + }, + { + "epoch": 4.38, + "grad_norm": 2.5184566974639893, + "learning_rate": 3.698110411861366e-07, + "logits/chosen": -0.511874794960022, + "logits/rejected": -0.655782163143158, + "logps/chosen": -73.1014175415039, + "logps/rejected": -105.8823013305664, + "loss": 0.5653, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.846625566482544, + "rewards/margins": 7.731961727142334, + "rewards/rejected": -4.885336399078369, + "step": 17523 + }, + { + "epoch": 4.38, + "grad_norm": 2.4522337913513184, + "learning_rate": 3.6951444244729904e-07, + "logits/chosen": -0.6630675792694092, + "logits/rejected": -0.7365766167640686, + "logps/chosen": -57.24359893798828, + "logps/rejected": -127.74530792236328, + "loss": 0.6037, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1843745708465576, + "rewards/margins": 7.664424896240234, + "rewards/rejected": -4.480050086975098, + "step": 17524 + }, + { + "epoch": 4.38, + "grad_norm": 3.052593946456909, + "learning_rate": 3.692179581326838e-07, + "logits/chosen": -0.5638892650604248, + "logits/rejected": -0.6920431852340698, + "logps/chosen": -50.52373504638672, + "logps/rejected": -104.55237579345703, + "loss": 0.5442, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2445363998413086, + "rewards/margins": 8.429671287536621, + "rewards/rejected": -5.185134410858154, + "step": 17525 + }, + { + "epoch": 4.38, + "grad_norm": 2.5682241916656494, + "learning_rate": 3.6892158824961943e-07, + "logits/chosen": -0.5686354041099548, + "logits/rejected": -0.6593263149261475, + "logps/chosen": -63.2266845703125, + "logps/rejected": -105.91818237304688, + "loss": 0.5992, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2735564708709717, + "rewards/margins": 7.791265964508057, + "rewards/rejected": -4.5177106857299805, + "step": 17526 + }, + { + "epoch": 4.38, + "grad_norm": 3.247995615005493, + "learning_rate": 3.6862533280542843e-07, + "logits/chosen": -0.611598551273346, + "logits/rejected": -0.7301273345947266, + "logps/chosen": -58.88341522216797, + "logps/rejected": -95.811767578125, + "loss": 0.5903, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0865347385406494, + "rewards/margins": 7.531505584716797, + "rewards/rejected": -4.444970607757568, + "step": 17527 + }, + { + "epoch": 4.38, + "grad_norm": 6.40002965927124, + "learning_rate": 3.6832919180743055e-07, + "logits/chosen": -0.5730249285697937, + "logits/rejected": -0.669540524482727, + "logps/chosen": -62.86355209350586, + "logps/rejected": -145.6363067626953, + "loss": 0.654, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8551039695739746, + "rewards/margins": 8.87846565246582, + "rewards/rejected": -6.023360729217529, + "step": 17528 + }, + { + "epoch": 4.39, + "grad_norm": 3.087538242340088, + "learning_rate": 3.6803316526294485e-07, + "logits/chosen": -0.4835374653339386, + "logits/rejected": -0.5506469011306763, + "logps/chosen": -63.948509216308594, + "logps/rejected": -120.90035247802734, + "loss": 0.643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930539846420288, + "rewards/margins": 7.583466053009033, + "rewards/rejected": -4.652926445007324, + "step": 17529 + }, + { + "epoch": 4.39, + "grad_norm": 4.827174186706543, + "learning_rate": 3.677372531792878e-07, + "logits/chosen": -0.6335852742195129, + "logits/rejected": -0.7315778136253357, + "logps/chosen": -58.63761901855469, + "logps/rejected": -105.84705352783203, + "loss": 0.6184, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.974015951156616, + "rewards/margins": 7.144392967224121, + "rewards/rejected": -4.170376777648926, + "step": 17530 + }, + { + "epoch": 4.39, + "grad_norm": 3.889136552810669, + "learning_rate": 3.67441455563769e-07, + "logits/chosen": -0.5620822906494141, + "logits/rejected": -0.6482316851615906, + "logps/chosen": -62.98698043823242, + "logps/rejected": -111.18569946289062, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4636998176574707, + "rewards/margins": 6.572618007659912, + "rewards/rejected": -3.1089181900024414, + "step": 17531 + }, + { + "epoch": 4.39, + "grad_norm": 5.198609352111816, + "learning_rate": 3.671457724236993e-07, + "logits/chosen": -0.5385565757751465, + "logits/rejected": -0.6013265252113342, + "logps/chosen": -43.875823974609375, + "logps/rejected": -100.85489654541016, + "loss": 0.5549, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105931282043457, + "rewards/margins": 6.667727470397949, + "rewards/rejected": -3.5617966651916504, + "step": 17532 + }, + { + "epoch": 4.39, + "grad_norm": 3.040060520172119, + "learning_rate": 3.668502037663868e-07, + "logits/chosen": -0.5940988659858704, + "logits/rejected": -0.7188411355018616, + "logps/chosen": -47.82746505737305, + "logps/rejected": -106.89336395263672, + "loss": 0.5807, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.963899612426758, + "rewards/margins": 9.019173622131348, + "rewards/rejected": -6.055273056030273, + "step": 17533 + }, + { + "epoch": 4.39, + "grad_norm": 3.6851956844329834, + "learning_rate": 3.6655474959913227e-07, + "logits/chosen": -0.5522264838218689, + "logits/rejected": -0.6327834129333496, + "logps/chosen": -50.42620849609375, + "logps/rejected": -121.01671600341797, + "loss": 0.5491, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3512325286865234, + "rewards/margins": 7.785786151885986, + "rewards/rejected": -4.434553146362305, + "step": 17534 + }, + { + "epoch": 4.39, + "grad_norm": 6.2460432052612305, + "learning_rate": 3.6625940992923826e-07, + "logits/chosen": -0.4786585867404938, + "logits/rejected": -0.5381298661231995, + "logps/chosen": -62.64678192138672, + "logps/rejected": -110.752197265625, + "loss": 0.7307, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8490240573883057, + "rewards/margins": 6.424078941345215, + "rewards/rejected": -3.5750551223754883, + "step": 17535 + }, + { + "epoch": 4.39, + "grad_norm": 2.6609230041503906, + "learning_rate": 3.659641847640039e-07, + "logits/chosen": -0.5648223161697388, + "logits/rejected": -0.6224581003189087, + "logps/chosen": -52.68244171142578, + "logps/rejected": -120.03950500488281, + "loss": 0.5573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1720190048217773, + "rewards/margins": 8.36182975769043, + "rewards/rejected": -5.189810752868652, + "step": 17536 + }, + { + "epoch": 4.39, + "grad_norm": 3.675342559814453, + "learning_rate": 3.656690741107227e-07, + "logits/chosen": -0.46340394020080566, + "logits/rejected": -0.6085913777351379, + "logps/chosen": -48.77505874633789, + "logps/rejected": -124.35430145263672, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0455470085144043, + "rewards/margins": 9.24018669128418, + "rewards/rejected": -6.194640636444092, + "step": 17537 + }, + { + "epoch": 4.39, + "grad_norm": 3.14286470413208, + "learning_rate": 3.6537407797668787e-07, + "logits/chosen": -0.5581023693084717, + "logits/rejected": -0.6358465552330017, + "logps/chosen": -61.80158615112305, + "logps/rejected": -115.8107681274414, + "loss": 0.5671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2165439128875732, + "rewards/margins": 7.90497350692749, + "rewards/rejected": -4.688429832458496, + "step": 17538 + }, + { + "epoch": 4.39, + "grad_norm": 6.909306049346924, + "learning_rate": 3.6507919636918967e-07, + "logits/chosen": -0.4554210901260376, + "logits/rejected": -0.5063934922218323, + "logps/chosen": -72.83440399169922, + "logps/rejected": -102.08184051513672, + "loss": 0.7722, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.070058584213257, + "rewards/margins": 5.402435302734375, + "rewards/rejected": -2.3323769569396973, + "step": 17539 + }, + { + "epoch": 4.39, + "grad_norm": 3.8385121822357178, + "learning_rate": 3.6478442929551496e-07, + "logits/chosen": -0.5767523050308228, + "logits/rejected": -0.6702656149864197, + "logps/chosen": -49.71538543701172, + "logps/rejected": -113.81764221191406, + "loss": 0.5687, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8156399726867676, + "rewards/margins": 8.286357879638672, + "rewards/rejected": -5.470717430114746, + "step": 17540 + }, + { + "epoch": 4.39, + "grad_norm": 3.605555534362793, + "learning_rate": 3.6448977676294575e-07, + "logits/chosen": -0.5643721222877502, + "logits/rejected": -0.6232802867889404, + "logps/chosen": -63.733333587646484, + "logps/rejected": -123.8783187866211, + "loss": 0.646, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.145871162414551, + "rewards/margins": 8.364730834960938, + "rewards/rejected": -5.218859672546387, + "step": 17541 + }, + { + "epoch": 4.39, + "grad_norm": 6.04081916809082, + "learning_rate": 3.6419523877876614e-07, + "logits/chosen": -0.5671663880348206, + "logits/rejected": -0.6058926582336426, + "logps/chosen": -44.502540588378906, + "logps/rejected": -101.02870178222656, + "loss": 0.6433, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.240035057067871, + "rewards/margins": 7.244150161743164, + "rewards/rejected": -4.004116058349609, + "step": 17542 + }, + { + "epoch": 4.39, + "grad_norm": 3.211979866027832, + "learning_rate": 3.639008153502516e-07, + "logits/chosen": -0.5699944496154785, + "logits/rejected": -0.589148998260498, + "logps/chosen": -46.65851593017578, + "logps/rejected": -110.62399291992188, + "loss": 0.589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.209066152572632, + "rewards/margins": 7.617572784423828, + "rewards/rejected": -4.408506393432617, + "step": 17543 + }, + { + "epoch": 4.39, + "grad_norm": 8.7390718460083, + "learning_rate": 3.6360650648468055e-07, + "logits/chosen": -0.5154871344566345, + "logits/rejected": -0.6052665710449219, + "logps/chosen": -55.62540817260742, + "logps/rejected": -114.54956817626953, + "loss": 0.6561, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9661777019500732, + "rewards/margins": 7.322432041168213, + "rewards/rejected": -4.3562541007995605, + "step": 17544 + }, + { + "epoch": 4.39, + "grad_norm": 5.644781112670898, + "learning_rate": 3.6331231218932395e-07, + "logits/chosen": -0.6026825904846191, + "logits/rejected": -0.6785797476768494, + "logps/chosen": -58.07189178466797, + "logps/rejected": -109.72338104248047, + "loss": 0.634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.799586296081543, + "rewards/margins": 7.33930778503418, + "rewards/rejected": -4.539721965789795, + "step": 17545 + }, + { + "epoch": 4.39, + "grad_norm": 5.09269380569458, + "learning_rate": 3.6301823247145094e-07, + "logits/chosen": -0.49432843923568726, + "logits/rejected": -0.5918766856193542, + "logps/chosen": -68.90689086914062, + "logps/rejected": -98.91715240478516, + "loss": 0.7233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9767544269561768, + "rewards/margins": 6.867986679077148, + "rewards/rejected": -3.8912320137023926, + "step": 17546 + }, + { + "epoch": 4.39, + "grad_norm": 3.017487049102783, + "learning_rate": 3.6272426733832966e-07, + "logits/chosen": -0.4815555810928345, + "logits/rejected": -0.5713174343109131, + "logps/chosen": -58.909767150878906, + "logps/rejected": -117.6402816772461, + "loss": 0.5425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1267318725585938, + "rewards/margins": 8.74317455291748, + "rewards/rejected": -5.616442680358887, + "step": 17547 + }, + { + "epoch": 4.39, + "grad_norm": 3.054532527923584, + "learning_rate": 3.624304167972248e-07, + "logits/chosen": -0.5856576561927795, + "logits/rejected": -0.6573068499565125, + "logps/chosen": -47.18910217285156, + "logps/rejected": -124.63986206054688, + "loss": 0.5218, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.016028881072998, + "rewards/margins": 7.653782367706299, + "rewards/rejected": -4.637753486633301, + "step": 17548 + }, + { + "epoch": 4.39, + "grad_norm": 3.9338574409484863, + "learning_rate": 3.621366808553972e-07, + "logits/chosen": -0.5410408973693848, + "logits/rejected": -0.667873740196228, + "logps/chosen": -57.50142288208008, + "logps/rejected": -89.41617584228516, + "loss": 0.6734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1243510246276855, + "rewards/margins": 6.885758399963379, + "rewards/rejected": -3.7614071369171143, + "step": 17549 + }, + { + "epoch": 4.39, + "grad_norm": 3.1563315391540527, + "learning_rate": 3.6184305952010443e-07, + "logits/chosen": -0.5764089822769165, + "logits/rejected": -0.5675414204597473, + "logps/chosen": -75.06156158447266, + "logps/rejected": -117.38832092285156, + "loss": 0.6111, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.207282066345215, + "rewards/margins": 7.967870712280273, + "rewards/rejected": -4.760588645935059, + "step": 17550 + }, + { + "epoch": 4.39, + "grad_norm": 6.0519022941589355, + "learning_rate": 3.61549552798604e-07, + "logits/chosen": -0.5135622620582581, + "logits/rejected": -0.6252909898757935, + "logps/chosen": -45.07862091064453, + "logps/rejected": -94.70940399169922, + "loss": 0.5862, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0730080604553223, + "rewards/margins": 6.827826976776123, + "rewards/rejected": -3.75481915473938, + "step": 17551 + }, + { + "epoch": 4.39, + "grad_norm": 4.568307399749756, + "learning_rate": 3.612561606981474e-07, + "logits/chosen": -0.5608593225479126, + "logits/rejected": -0.6044772863388062, + "logps/chosen": -58.6907844543457, + "logps/rejected": -115.83352661132812, + "loss": 0.6129, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.997110605239868, + "rewards/margins": 7.582915782928467, + "rewards/rejected": -4.585805416107178, + "step": 17552 + }, + { + "epoch": 4.39, + "grad_norm": 3.5716021060943604, + "learning_rate": 3.6096288322598426e-07, + "logits/chosen": -0.6213316917419434, + "logits/rejected": -0.6812210083007812, + "logps/chosen": -54.950862884521484, + "logps/rejected": -111.99568176269531, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3534460067749023, + "rewards/margins": 7.836362838745117, + "rewards/rejected": -4.482917308807373, + "step": 17553 + }, + { + "epoch": 4.39, + "grad_norm": 8.370487213134766, + "learning_rate": 3.606697203893639e-07, + "logits/chosen": -0.49349379539489746, + "logits/rejected": -0.5096798539161682, + "logps/chosen": -61.07128143310547, + "logps/rejected": -107.4356918334961, + "loss": 0.8197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0337533950805664, + "rewards/margins": 6.01519250869751, + "rewards/rejected": -2.9814391136169434, + "step": 17554 + }, + { + "epoch": 4.39, + "grad_norm": 6.256159782409668, + "learning_rate": 3.603766721955293e-07, + "logits/chosen": -0.47622841596603394, + "logits/rejected": -0.5585057735443115, + "logps/chosen": -50.08357620239258, + "logps/rejected": -105.42698669433594, + "loss": 0.6074, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0959599018096924, + "rewards/margins": 7.515862941741943, + "rewards/rejected": -4.419902801513672, + "step": 17555 + }, + { + "epoch": 4.39, + "grad_norm": 4.2055439949035645, + "learning_rate": 3.6008373865172085e-07, + "logits/chosen": -0.5912480354309082, + "logits/rejected": -0.6858657598495483, + "logps/chosen": -40.02694320678711, + "logps/rejected": -100.11671447753906, + "loss": 0.5137, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306720495223999, + "rewards/margins": 8.366497993469238, + "rewards/rejected": -5.05977725982666, + "step": 17556 + }, + { + "epoch": 4.39, + "grad_norm": 3.9408950805664062, + "learning_rate": 3.5979091976517946e-07, + "logits/chosen": -0.5613102912902832, + "logits/rejected": -0.6328252553939819, + "logps/chosen": -45.13664245605469, + "logps/rejected": -110.34490966796875, + "loss": 0.5983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.096529722213745, + "rewards/margins": 6.564244270324707, + "rewards/rejected": -3.4677138328552246, + "step": 17557 + }, + { + "epoch": 4.39, + "grad_norm": 5.940185546875, + "learning_rate": 3.594982155431398e-07, + "logits/chosen": -0.5677622556686401, + "logits/rejected": -0.612411618232727, + "logps/chosen": -47.873844146728516, + "logps/rejected": -114.57791900634766, + "loss": 0.6592, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3058576583862305, + "rewards/margins": 7.091132164001465, + "rewards/rejected": -3.7852742671966553, + "step": 17558 + }, + { + "epoch": 4.39, + "grad_norm": 3.5341875553131104, + "learning_rate": 3.592056259928345e-07, + "logits/chosen": -0.5276913046836853, + "logits/rejected": -0.5515801906585693, + "logps/chosen": -48.4239616394043, + "logps/rejected": -99.7568359375, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1628308296203613, + "rewards/margins": 6.024097442626953, + "rewards/rejected": -2.861266613006592, + "step": 17559 + }, + { + "epoch": 4.39, + "grad_norm": 3.4853250980377197, + "learning_rate": 3.589131511214944e-07, + "logits/chosen": -0.5289459228515625, + "logits/rejected": -0.6324970722198486, + "logps/chosen": -51.401832580566406, + "logps/rejected": -108.65335083007812, + "loss": 0.6091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4390709400177, + "rewards/margins": 7.652508735656738, + "rewards/rejected": -4.213438034057617, + "step": 17560 + }, + { + "epoch": 4.39, + "grad_norm": 7.245344161987305, + "learning_rate": 3.586207909363476e-07, + "logits/chosen": -0.3992899954319, + "logits/rejected": -0.48067328333854675, + "logps/chosen": -58.65924835205078, + "logps/rejected": -122.30609893798828, + "loss": 0.6576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095796585083008, + "rewards/margins": 7.112241744995117, + "rewards/rejected": -4.016445636749268, + "step": 17561 + }, + { + "epoch": 4.39, + "grad_norm": 4.456131458282471, + "learning_rate": 3.5832854544461783e-07, + "logits/chosen": -0.560656726360321, + "logits/rejected": -0.6712526082992554, + "logps/chosen": -47.27031707763672, + "logps/rejected": -107.35221862792969, + "loss": 0.5874, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9960052967071533, + "rewards/margins": 8.005047798156738, + "rewards/rejected": -5.009042263031006, + "step": 17562 + }, + { + "epoch": 4.39, + "grad_norm": 3.6442532539367676, + "learning_rate": 3.580364146535259e-07, + "logits/chosen": -0.5023115277290344, + "logits/rejected": -0.6200683116912842, + "logps/chosen": -45.062461853027344, + "logps/rejected": -108.73487091064453, + "loss": 0.5921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0396323204040527, + "rewards/margins": 7.896467685699463, + "rewards/rejected": -4.85683536529541, + "step": 17563 + }, + { + "epoch": 4.39, + "grad_norm": 15.774211883544922, + "learning_rate": 3.577443985702922e-07, + "logits/chosen": -0.5496695041656494, + "logits/rejected": -0.6385291814804077, + "logps/chosen": -48.238643646240234, + "logps/rejected": -102.16743469238281, + "loss": 0.5644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.167067050933838, + "rewards/margins": 6.3537516593933105, + "rewards/rejected": -3.1866846084594727, + "step": 17564 + }, + { + "epoch": 4.39, + "grad_norm": 3.3807859420776367, + "learning_rate": 3.574524972021326e-07, + "logits/chosen": -0.5419948101043701, + "logits/rejected": -0.6827438473701477, + "logps/chosen": -71.65868377685547, + "logps/rejected": -109.31051635742188, + "loss": 0.6352, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2077934741973877, + "rewards/margins": 7.563548564910889, + "rewards/rejected": -4.35575532913208, + "step": 17565 + }, + { + "epoch": 4.39, + "grad_norm": 6.82733154296875, + "learning_rate": 3.5716071055625854e-07, + "logits/chosen": -0.6199886798858643, + "logits/rejected": -0.6729485988616943, + "logps/chosen": -48.15160369873047, + "logps/rejected": -96.61058807373047, + "loss": 0.7296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2102763652801514, + "rewards/margins": 6.145010948181152, + "rewards/rejected": -2.934735059738159, + "step": 17566 + }, + { + "epoch": 4.39, + "grad_norm": 2.4061126708984375, + "learning_rate": 3.5686903863988267e-07, + "logits/chosen": -0.49250906705856323, + "logits/rejected": -0.5506719350814819, + "logps/chosen": -54.627315521240234, + "logps/rejected": -120.65058135986328, + "loss": 0.5443, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1661324501037598, + "rewards/margins": 7.648713111877441, + "rewards/rejected": -4.482580184936523, + "step": 17567 + }, + { + "epoch": 4.39, + "grad_norm": 3.2850046157836914, + "learning_rate": 3.5657748146021074e-07, + "logits/chosen": -0.44556984305381775, + "logits/rejected": -0.526883065700531, + "logps/chosen": -68.03526306152344, + "logps/rejected": -124.65973663330078, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9165990352630615, + "rewards/margins": 7.542966365814209, + "rewards/rejected": -4.626367568969727, + "step": 17568 + }, + { + "epoch": 4.4, + "grad_norm": 7.3409905433654785, + "learning_rate": 3.5628603902444934e-07, + "logits/chosen": -0.5829657316207886, + "logits/rejected": -0.6178203821182251, + "logps/chosen": -55.33594512939453, + "logps/rejected": -123.47349548339844, + "loss": 0.6161, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2214064598083496, + "rewards/margins": 7.679373264312744, + "rewards/rejected": -4.4579668045043945, + "step": 17569 + }, + { + "epoch": 4.4, + "grad_norm": 4.225645542144775, + "learning_rate": 3.559947113397988e-07, + "logits/chosen": -0.5216038823127747, + "logits/rejected": -0.5641810894012451, + "logps/chosen": -55.53828048706055, + "logps/rejected": -102.3726577758789, + "loss": 0.6303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.934556722640991, + "rewards/margins": 6.039236068725586, + "rewards/rejected": -3.1046786308288574, + "step": 17570 + }, + { + "epoch": 4.4, + "grad_norm": 7.810576438903809, + "learning_rate": 3.5570349841345777e-07, + "logits/chosen": -0.579968273639679, + "logits/rejected": -0.6582901477813721, + "logps/chosen": -46.53969955444336, + "logps/rejected": -105.95752716064453, + "loss": 0.6191, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.099689483642578, + "rewards/margins": 7.87067174911499, + "rewards/rejected": -4.770982265472412, + "step": 17571 + }, + { + "epoch": 4.4, + "grad_norm": 2.8656721115112305, + "learning_rate": 3.5541240025262324e-07, + "logits/chosen": -0.5880743265151978, + "logits/rejected": -0.7004663944244385, + "logps/chosen": -57.19544982910156, + "logps/rejected": -98.40399169921875, + "loss": 0.5433, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.087660551071167, + "rewards/margins": 7.501585483551025, + "rewards/rejected": -4.4139251708984375, + "step": 17572 + }, + { + "epoch": 4.4, + "grad_norm": 5.318374156951904, + "learning_rate": 3.551214168644901e-07, + "logits/chosen": -0.5465102791786194, + "logits/rejected": -0.6241392493247986, + "logps/chosen": -55.07075119018555, + "logps/rejected": -102.5927734375, + "loss": 0.5548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1832330226898193, + "rewards/margins": 7.057896137237549, + "rewards/rejected": -3.8746628761291504, + "step": 17573 + }, + { + "epoch": 4.4, + "grad_norm": 5.969338417053223, + "learning_rate": 3.548305482562459e-07, + "logits/chosen": -0.5184900760650635, + "logits/rejected": -0.5830683708190918, + "logps/chosen": -45.8169059753418, + "logps/rejected": -87.80199432373047, + "loss": 0.6507, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2705633640289307, + "rewards/margins": 6.067718982696533, + "rewards/rejected": -2.7971558570861816, + "step": 17574 + }, + { + "epoch": 4.4, + "grad_norm": 11.923704147338867, + "learning_rate": 3.5453979443507926e-07, + "logits/chosen": -0.5367032289505005, + "logits/rejected": -0.6248644590377808, + "logps/chosen": -46.28645324707031, + "logps/rejected": -87.27014923095703, + "loss": 0.6155, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.11445951461792, + "rewards/margins": 6.802879333496094, + "rewards/rejected": -3.688419818878174, + "step": 17575 + }, + { + "epoch": 4.4, + "grad_norm": 3.184126138687134, + "learning_rate": 3.542491554081773e-07, + "logits/chosen": -0.5669419169425964, + "logits/rejected": -0.6867983937263489, + "logps/chosen": -67.9126205444336, + "logps/rejected": -120.9292221069336, + "loss": 0.6221, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0523552894592285, + "rewards/margins": 8.388625144958496, + "rewards/rejected": -5.336270332336426, + "step": 17576 + }, + { + "epoch": 4.4, + "grad_norm": 5.579156398773193, + "learning_rate": 3.539586311827181e-07, + "logits/chosen": -0.5490894317626953, + "logits/rejected": -0.6382937431335449, + "logps/chosen": -52.82659149169922, + "logps/rejected": -95.1209487915039, + "loss": 0.6206, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.218975782394409, + "rewards/margins": 7.807012557983398, + "rewards/rejected": -4.588037014007568, + "step": 17577 + }, + { + "epoch": 4.4, + "grad_norm": 4.669830322265625, + "learning_rate": 3.5366822176588324e-07, + "logits/chosen": -0.5691094398498535, + "logits/rejected": -0.6305427551269531, + "logps/chosen": -46.66643142700195, + "logps/rejected": -137.82998657226562, + "loss": 0.5696, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4914987087249756, + "rewards/margins": 9.658397674560547, + "rewards/rejected": -6.166898250579834, + "step": 17578 + }, + { + "epoch": 4.4, + "grad_norm": 3.423091411590576, + "learning_rate": 3.5337792716484966e-07, + "logits/chosen": -0.6058409214019775, + "logits/rejected": -0.6340121030807495, + "logps/chosen": -47.20501708984375, + "logps/rejected": -155.32659912109375, + "loss": 0.5147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3845338821411133, + "rewards/margins": 10.934059143066406, + "rewards/rejected": -7.549525260925293, + "step": 17579 + }, + { + "epoch": 4.4, + "grad_norm": 5.786707878112793, + "learning_rate": 3.530877473867894e-07, + "logits/chosen": -0.49955400824546814, + "logits/rejected": -0.5792564153671265, + "logps/chosen": -59.41720199584961, + "logps/rejected": -105.98421478271484, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.059032917022705, + "rewards/margins": 7.6162919998168945, + "rewards/rejected": -4.5572590827941895, + "step": 17580 + }, + { + "epoch": 4.4, + "grad_norm": 32.017276763916016, + "learning_rate": 3.527976824388735e-07, + "logits/chosen": -0.5078322291374207, + "logits/rejected": -0.6164697408676147, + "logps/chosen": -55.336158752441406, + "logps/rejected": -111.93675994873047, + "loss": 0.5984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9863710403442383, + "rewards/margins": 7.898601055145264, + "rewards/rejected": -4.912230014801025, + "step": 17581 + }, + { + "epoch": 4.4, + "grad_norm": 4.002064228057861, + "learning_rate": 3.5250773232826995e-07, + "logits/chosen": -0.6014114618301392, + "logits/rejected": -0.673882246017456, + "logps/chosen": -70.04196166992188, + "logps/rejected": -96.46190643310547, + "loss": 0.6988, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.65625262260437, + "rewards/margins": 5.336670398712158, + "rewards/rejected": -2.680417537689209, + "step": 17582 + }, + { + "epoch": 4.4, + "grad_norm": 3.0715510845184326, + "learning_rate": 3.522178970621437e-07, + "logits/chosen": -0.5874965190887451, + "logits/rejected": -0.6691665649414062, + "logps/chosen": -56.85104751586914, + "logps/rejected": -118.54419708251953, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.28273868560791, + "rewards/margins": 7.991488456726074, + "rewards/rejected": -4.708749771118164, + "step": 17583 + }, + { + "epoch": 4.4, + "grad_norm": 4.920637130737305, + "learning_rate": 3.519281766476562e-07, + "logits/chosen": -0.6244259476661682, + "logits/rejected": -0.6740353107452393, + "logps/chosen": -58.37582778930664, + "logps/rejected": -121.9127197265625, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9364144802093506, + "rewards/margins": 7.817884922027588, + "rewards/rejected": -4.881470203399658, + "step": 17584 + }, + { + "epoch": 4.4, + "grad_norm": 4.721176624298096, + "learning_rate": 3.516385710919684e-07, + "logits/chosen": -0.5589667558670044, + "logits/rejected": -0.6178656816482544, + "logps/chosen": -49.055747985839844, + "logps/rejected": -96.37535858154297, + "loss": 0.5524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0003838539123535, + "rewards/margins": 6.457948684692383, + "rewards/rejected": -3.457564353942871, + "step": 17585 + }, + { + "epoch": 4.4, + "grad_norm": 4.019976615905762, + "learning_rate": 3.513490804022346e-07, + "logits/chosen": -0.5844724774360657, + "logits/rejected": -0.7126144170761108, + "logps/chosen": -55.013916015625, + "logps/rejected": -86.79563903808594, + "loss": 0.6095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0779874324798584, + "rewards/margins": 6.094597816467285, + "rewards/rejected": -3.0166096687316895, + "step": 17586 + }, + { + "epoch": 4.4, + "grad_norm": 3.4001801013946533, + "learning_rate": 3.510597045856101e-07, + "logits/chosen": -0.5589126348495483, + "logits/rejected": -0.6471539735794067, + "logps/chosen": -54.1300048828125, + "logps/rejected": -104.5296630859375, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.129476308822632, + "rewards/margins": 7.128480911254883, + "rewards/rejected": -3.9990053176879883, + "step": 17587 + }, + { + "epoch": 4.4, + "grad_norm": 4.441892623901367, + "learning_rate": 3.5077044364924597e-07, + "logits/chosen": -0.46912550926208496, + "logits/rejected": -0.5527963042259216, + "logps/chosen": -57.56101608276367, + "logps/rejected": -101.31378936767578, + "loss": 0.623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8517234325408936, + "rewards/margins": 6.249200344085693, + "rewards/rejected": -3.3974761962890625, + "step": 17588 + }, + { + "epoch": 4.4, + "grad_norm": 8.793167114257812, + "learning_rate": 3.504812976002875e-07, + "logits/chosen": -0.5399442911148071, + "logits/rejected": -0.5754703879356384, + "logps/chosen": -52.329200744628906, + "logps/rejected": -119.45088958740234, + "loss": 0.6428, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3895606994628906, + "rewards/margins": 6.806585788726807, + "rewards/rejected": -3.417025566101074, + "step": 17589 + }, + { + "epoch": 4.4, + "grad_norm": 7.174174785614014, + "learning_rate": 3.501922664458829e-07, + "logits/chosen": -0.5768510103225708, + "logits/rejected": -0.6128630042076111, + "logps/chosen": -55.553855895996094, + "logps/rejected": -107.15413665771484, + "loss": 0.6625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8474156856536865, + "rewards/margins": 6.128778457641602, + "rewards/rejected": -3.281362533569336, + "step": 17590 + }, + { + "epoch": 4.4, + "grad_norm": 20.384628295898438, + "learning_rate": 3.4990335019317254e-07, + "logits/chosen": -0.5579397678375244, + "logits/rejected": -0.6092559099197388, + "logps/chosen": -54.4990348815918, + "logps/rejected": -91.03327178955078, + "loss": 0.6934, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1156938076019287, + "rewards/margins": 4.916189670562744, + "rewards/rejected": -1.8004958629608154, + "step": 17591 + }, + { + "epoch": 4.4, + "grad_norm": 2.351182222366333, + "learning_rate": 3.496145488492969e-07, + "logits/chosen": -0.5714525580406189, + "logits/rejected": -0.6613781452178955, + "logps/chosen": -49.7762451171875, + "logps/rejected": -90.81777954101562, + "loss": 0.5105, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3085274696350098, + "rewards/margins": 7.349410057067871, + "rewards/rejected": -4.040882587432861, + "step": 17592 + }, + { + "epoch": 4.4, + "grad_norm": 5.873152256011963, + "learning_rate": 3.493258624213919e-07, + "logits/chosen": -0.5228511095046997, + "logits/rejected": -0.6040793061256409, + "logps/chosen": -62.89220428466797, + "logps/rejected": -109.46492767333984, + "loss": 0.5947, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1961820125579834, + "rewards/margins": 6.750831604003906, + "rewards/rejected": -3.554649829864502, + "step": 17593 + }, + { + "epoch": 4.4, + "grad_norm": 4.5480217933654785, + "learning_rate": 3.490372909165918e-07, + "logits/chosen": -0.5788941383361816, + "logits/rejected": -0.7011235356330872, + "logps/chosen": -46.80375289916992, + "logps/rejected": -113.52368927001953, + "loss": 0.616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2261903285980225, + "rewards/margins": 7.866393566131592, + "rewards/rejected": -4.640203952789307, + "step": 17594 + }, + { + "epoch": 4.4, + "grad_norm": 20.05159568786621, + "learning_rate": 3.4874883434202756e-07, + "logits/chosen": -0.5386501550674438, + "logits/rejected": -0.6076789498329163, + "logps/chosen": -54.008541107177734, + "logps/rejected": -103.35840606689453, + "loss": 0.6781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1403937339782715, + "rewards/margins": 6.796171188354492, + "rewards/rejected": -3.655777931213379, + "step": 17595 + }, + { + "epoch": 4.4, + "grad_norm": 2.669935703277588, + "learning_rate": 3.484604927048257e-07, + "logits/chosen": -0.5423829555511475, + "logits/rejected": -0.630709707736969, + "logps/chosen": -42.46286392211914, + "logps/rejected": -100.09929656982422, + "loss": 0.4842, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1815311908721924, + "rewards/margins": 7.673064231872559, + "rewards/rejected": -4.491533279418945, + "step": 17596 + }, + { + "epoch": 4.4, + "grad_norm": 4.442025661468506, + "learning_rate": 3.4817226601211394e-07, + "logits/chosen": -0.5499277114868164, + "logits/rejected": -0.5851007103919983, + "logps/chosen": -53.26369094848633, + "logps/rejected": -118.99162292480469, + "loss": 0.5665, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.27590012550354, + "rewards/margins": 7.2951741218566895, + "rewards/rejected": -4.0192742347717285, + "step": 17597 + }, + { + "epoch": 4.4, + "grad_norm": 6.046992301940918, + "learning_rate": 3.4788415427101316e-07, + "logits/chosen": -0.5847771763801575, + "logits/rejected": -0.6695435047149658, + "logps/chosen": -56.21977233886719, + "logps/rejected": -97.09461975097656, + "loss": 0.628, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9568464756011963, + "rewards/margins": 6.614171028137207, + "rewards/rejected": -3.6573243141174316, + "step": 17598 + }, + { + "epoch": 4.4, + "grad_norm": 7.524412631988525, + "learning_rate": 3.475961574886422e-07, + "logits/chosen": -0.47734498977661133, + "logits/rejected": -0.5961825847625732, + "logps/chosen": -65.55036926269531, + "logps/rejected": -102.16432189941406, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9884228706359863, + "rewards/margins": 6.984795093536377, + "rewards/rejected": -3.9963722229003906, + "step": 17599 + }, + { + "epoch": 4.4, + "grad_norm": 3.329782247543335, + "learning_rate": 3.473082756721197e-07, + "logits/chosen": -0.5909334421157837, + "logits/rejected": -0.6592238545417786, + "logps/chosen": -63.147830963134766, + "logps/rejected": -115.68093872070312, + "loss": 0.6115, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.177433490753174, + "rewards/margins": 7.848794937133789, + "rewards/rejected": -4.671361446380615, + "step": 17600 + }, + { + "epoch": 4.4, + "grad_norm": 11.731093406677246, + "learning_rate": 3.470205088285583e-07, + "logits/chosen": -0.5371357202529907, + "logits/rejected": -0.6392484307289124, + "logps/chosen": -75.90949249267578, + "logps/rejected": -114.3660888671875, + "loss": 0.7435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.509730815887451, + "rewards/margins": 7.101733207702637, + "rewards/rejected": -4.5920023918151855, + "step": 17601 + }, + { + "epoch": 4.4, + "grad_norm": 7.186254501342773, + "learning_rate": 3.4673285696506854e-07, + "logits/chosen": -0.48874494433403015, + "logits/rejected": -0.5836236476898193, + "logps/chosen": -48.53606414794922, + "logps/rejected": -107.32628631591797, + "loss": 0.607, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1611826419830322, + "rewards/margins": 7.3553853034973145, + "rewards/rejected": -4.194202899932861, + "step": 17602 + }, + { + "epoch": 4.4, + "grad_norm": 4.3448309898376465, + "learning_rate": 3.4644532008875963e-07, + "logits/chosen": -0.49158716201782227, + "logits/rejected": -0.5839185118675232, + "logps/chosen": -52.27476119995117, + "logps/rejected": -92.88777160644531, + "loss": 0.5785, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.978726387023926, + "rewards/margins": 6.768668174743652, + "rewards/rejected": -3.7899422645568848, + "step": 17603 + }, + { + "epoch": 4.4, + "grad_norm": 2.5981290340423584, + "learning_rate": 3.4615789820673705e-07, + "logits/chosen": -0.580488920211792, + "logits/rejected": -0.6234263777732849, + "logps/chosen": -53.28287887573242, + "logps/rejected": -131.60305786132812, + "loss": 0.5997, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2419276237487793, + "rewards/margins": 7.532597541809082, + "rewards/rejected": -4.290669918060303, + "step": 17604 + }, + { + "epoch": 4.4, + "grad_norm": 2.4973182678222656, + "learning_rate": 3.458705913261029e-07, + "logits/chosen": -0.5127846002578735, + "logits/rejected": -0.5713634490966797, + "logps/chosen": -76.36085510253906, + "logps/rejected": -135.65821838378906, + "loss": 0.6339, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2067251205444336, + "rewards/margins": 8.341700553894043, + "rewards/rejected": -5.134975910186768, + "step": 17605 + }, + { + "epoch": 4.4, + "grad_norm": 5.193064212799072, + "learning_rate": 3.455833994539559e-07, + "logits/chosen": -0.5143978595733643, + "logits/rejected": -0.6171907186508179, + "logps/chosen": -72.14496612548828, + "logps/rejected": -94.56712341308594, + "loss": 0.731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2130117416381836, + "rewards/margins": 6.154365062713623, + "rewards/rejected": -2.9413535594940186, + "step": 17606 + }, + { + "epoch": 4.4, + "grad_norm": 3.8029556274414062, + "learning_rate": 3.4529632259739487e-07, + "logits/chosen": -0.580881655216217, + "logits/rejected": -0.5910148024559021, + "logps/chosen": -39.91157150268555, + "logps/rejected": -106.6141586303711, + "loss": 0.6044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.657416582107544, + "rewards/margins": 6.320660591125488, + "rewards/rejected": -2.6632442474365234, + "step": 17607 + }, + { + "epoch": 4.4, + "grad_norm": 4.365079402923584, + "learning_rate": 3.450093607635124e-07, + "logits/chosen": -0.5012684464454651, + "logits/rejected": -0.6118376851081848, + "logps/chosen": -50.653995513916016, + "logps/rejected": -100.01525115966797, + "loss": 0.5496, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.88626766204834, + "rewards/margins": 7.36406946182251, + "rewards/rejected": -4.47780179977417, + "step": 17608 + }, + { + "epoch": 4.41, + "grad_norm": 6.527164459228516, + "learning_rate": 3.4472251395939906e-07, + "logits/chosen": -0.6278820037841797, + "logits/rejected": -0.7205323576927185, + "logps/chosen": -51.511417388916016, + "logps/rejected": -103.67633056640625, + "loss": 0.617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.806429862976074, + "rewards/margins": 7.554155349731445, + "rewards/rejected": -4.747725963592529, + "step": 17609 + }, + { + "epoch": 4.41, + "grad_norm": 5.624699592590332, + "learning_rate": 3.444357821921451e-07, + "logits/chosen": -0.5472537279129028, + "logits/rejected": -0.5937867164611816, + "logps/chosen": -37.28316879272461, + "logps/rejected": -99.38094329833984, + "loss": 0.5445, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1996564865112305, + "rewards/margins": 7.114558219909668, + "rewards/rejected": -3.9149019718170166, + "step": 17610 + }, + { + "epoch": 4.41, + "grad_norm": 12.741982460021973, + "learning_rate": 3.441491654688339e-07, + "logits/chosen": -0.48308491706848145, + "logits/rejected": -0.5340516567230225, + "logps/chosen": -53.73762893676758, + "logps/rejected": -99.63099670410156, + "loss": 0.61, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0004632472991943, + "rewards/margins": 6.184011936187744, + "rewards/rejected": -3.18354868888855, + "step": 17611 + }, + { + "epoch": 4.41, + "grad_norm": 32.12660217285156, + "learning_rate": 3.4386266379654977e-07, + "logits/chosen": -0.5697520971298218, + "logits/rejected": -0.6793884038925171, + "logps/chosen": -57.75343322753906, + "logps/rejected": -111.09577941894531, + "loss": 0.7495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0511491298675537, + "rewards/margins": 7.89511251449585, + "rewards/rejected": -4.843963623046875, + "step": 17612 + }, + { + "epoch": 4.41, + "grad_norm": 23.490507125854492, + "learning_rate": 3.435762771823714e-07, + "logits/chosen": -0.5543850660324097, + "logits/rejected": -0.6437219381332397, + "logps/chosen": -44.72208786010742, + "logps/rejected": -83.55482482910156, + "loss": 0.6691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.059556484222412, + "rewards/margins": 6.473500728607178, + "rewards/rejected": -3.4139440059661865, + "step": 17613 + }, + { + "epoch": 4.41, + "grad_norm": 6.589587688446045, + "learning_rate": 3.43290005633376e-07, + "logits/chosen": -0.5226327180862427, + "logits/rejected": -0.6034989356994629, + "logps/chosen": -46.45235061645508, + "logps/rejected": -124.75050354003906, + "loss": 0.5851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3635666370391846, + "rewards/margins": 9.083937644958496, + "rewards/rejected": -5.720370292663574, + "step": 17614 + }, + { + "epoch": 4.41, + "grad_norm": 1.7091468572616577, + "learning_rate": 3.4300384915663786e-07, + "logits/chosen": -0.5045396089553833, + "logits/rejected": -0.6131900548934937, + "logps/chosen": -61.917510986328125, + "logps/rejected": -108.4923324584961, + "loss": 0.5163, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2891931533813477, + "rewards/margins": 7.155038833618164, + "rewards/rejected": -3.865845203399658, + "step": 17615 + }, + { + "epoch": 4.41, + "grad_norm": 4.506745338439941, + "learning_rate": 3.42717807759228e-07, + "logits/chosen": -0.5635517835617065, + "logits/rejected": -0.6654347777366638, + "logps/chosen": -47.3166389465332, + "logps/rejected": -98.43333435058594, + "loss": 0.5709, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164700984954834, + "rewards/margins": 7.7090959548950195, + "rewards/rejected": -4.5443949699401855, + "step": 17616 + }, + { + "epoch": 4.41, + "grad_norm": 8.146777153015137, + "learning_rate": 3.424318814482142e-07, + "logits/chosen": -0.44926461577415466, + "logits/rejected": -0.5108816623687744, + "logps/chosen": -48.572227478027344, + "logps/rejected": -118.22306823730469, + "loss": 0.627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2655012607574463, + "rewards/margins": 7.3419694900512695, + "rewards/rejected": -4.076468467712402, + "step": 17617 + }, + { + "epoch": 4.41, + "grad_norm": 5.562738418579102, + "learning_rate": 3.421460702306623e-07, + "logits/chosen": -0.6515995264053345, + "logits/rejected": -0.7152523994445801, + "logps/chosen": -55.785953521728516, + "logps/rejected": -113.98758697509766, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.229222059249878, + "rewards/margins": 6.912757873535156, + "rewards/rejected": -3.6835365295410156, + "step": 17618 + }, + { + "epoch": 4.41, + "grad_norm": 3.1415181159973145, + "learning_rate": 3.4186037411363725e-07, + "logits/chosen": -0.535119354724884, + "logits/rejected": -0.6963566541671753, + "logps/chosen": -60.13439178466797, + "logps/rejected": -106.99822235107422, + "loss": 0.5588, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9682765007019043, + "rewards/margins": 9.446425437927246, + "rewards/rejected": -6.478148937225342, + "step": 17619 + }, + { + "epoch": 4.41, + "grad_norm": 9.821473121643066, + "learning_rate": 3.415747931041952e-07, + "logits/chosen": -0.5151644945144653, + "logits/rejected": -0.5842245221138, + "logps/chosen": -56.359100341796875, + "logps/rejected": -105.22723388671875, + "loss": 0.6358, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3157355785369873, + "rewards/margins": 8.04776382446289, + "rewards/rejected": -4.732028961181641, + "step": 17620 + }, + { + "epoch": 4.41, + "grad_norm": 3.698934555053711, + "learning_rate": 3.4128932720939533e-07, + "logits/chosen": -0.5962140560150146, + "logits/rejected": -0.6690819263458252, + "logps/chosen": -53.129634857177734, + "logps/rejected": -115.59822845458984, + "loss": 0.6351, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7016355991363525, + "rewards/margins": 8.30634593963623, + "rewards/rejected": -5.604710578918457, + "step": 17621 + }, + { + "epoch": 4.41, + "grad_norm": 7.413287162780762, + "learning_rate": 3.4100397643629266e-07, + "logits/chosen": -0.5802832841873169, + "logits/rejected": -0.7139455080032349, + "logps/chosen": -61.922096252441406, + "logps/rejected": -103.53749084472656, + "loss": 0.608, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4161908626556396, + "rewards/margins": 7.553217887878418, + "rewards/rejected": -4.137027263641357, + "step": 17622 + }, + { + "epoch": 4.41, + "grad_norm": 7.183329105377197, + "learning_rate": 3.4071874079193544e-07, + "logits/chosen": -0.5094202160835266, + "logits/rejected": -0.5859898328781128, + "logps/chosen": -53.66996383666992, + "logps/rejected": -88.89384460449219, + "loss": 0.6665, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.941406488418579, + "rewards/margins": 6.682870864868164, + "rewards/rejected": -3.7414650917053223, + "step": 17623 + }, + { + "epoch": 4.41, + "grad_norm": 3.053516149520874, + "learning_rate": 3.404336202833747e-07, + "logits/chosen": -0.6328799724578857, + "logits/rejected": -0.6998144388198853, + "logps/chosen": -52.69189453125, + "logps/rejected": -116.8056869506836, + "loss": 0.5731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3337900638580322, + "rewards/margins": 8.023219108581543, + "rewards/rejected": -4.68942928314209, + "step": 17624 + }, + { + "epoch": 4.41, + "grad_norm": 2.621490955352783, + "learning_rate": 3.4014861491765526e-07, + "logits/chosen": -0.4728420078754425, + "logits/rejected": -0.5955758094787598, + "logps/chosen": -55.70182800292969, + "logps/rejected": -89.99662780761719, + "loss": 0.5697, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093799114227295, + "rewards/margins": 7.1173996925354, + "rewards/rejected": -4.023601055145264, + "step": 17625 + }, + { + "epoch": 4.41, + "grad_norm": 2.971360445022583, + "learning_rate": 3.398637247018205e-07, + "logits/chosen": -0.5877819657325745, + "logits/rejected": -0.6297851204872131, + "logps/chosen": -66.10633850097656, + "logps/rejected": -112.61626434326172, + "loss": 0.6472, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2083234786987305, + "rewards/margins": 8.374062538146973, + "rewards/rejected": -5.1657395362854, + "step": 17626 + }, + { + "epoch": 4.41, + "grad_norm": 4.993083953857422, + "learning_rate": 3.3957894964290915e-07, + "logits/chosen": -0.5381161570549011, + "logits/rejected": -0.6216128468513489, + "logps/chosen": -60.81337356567383, + "logps/rejected": -93.21094512939453, + "loss": 0.6778, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0398929119110107, + "rewards/margins": 5.983155250549316, + "rewards/rejected": -2.9432616233825684, + "step": 17627 + }, + { + "epoch": 4.41, + "grad_norm": 5.693356990814209, + "learning_rate": 3.3929428974795954e-07, + "logits/chosen": -0.5309341549873352, + "logits/rejected": -0.6329324841499329, + "logps/chosen": -45.81039047241211, + "logps/rejected": -98.2514419555664, + "loss": 0.6369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.365372896194458, + "rewards/margins": 7.616372585296631, + "rewards/rejected": -4.250999450683594, + "step": 17628 + }, + { + "epoch": 4.41, + "grad_norm": 4.447522163391113, + "learning_rate": 3.3900974502400485e-07, + "logits/chosen": -0.5739811658859253, + "logits/rejected": -0.6452787518501282, + "logps/chosen": -42.16442108154297, + "logps/rejected": -113.50515747070312, + "loss": 0.5175, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176454544067383, + "rewards/margins": 7.807884693145752, + "rewards/rejected": -4.631430149078369, + "step": 17629 + }, + { + "epoch": 4.41, + "grad_norm": 4.3046040534973145, + "learning_rate": 3.387253154780773e-07, + "logits/chosen": -0.5224166512489319, + "logits/rejected": -0.5688753724098206, + "logps/chosen": -59.635963439941406, + "logps/rejected": -111.88902282714844, + "loss": 0.6873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1726715564727783, + "rewards/margins": 6.077902793884277, + "rewards/rejected": -2.905231475830078, + "step": 17630 + }, + { + "epoch": 4.41, + "grad_norm": 5.292444705963135, + "learning_rate": 3.384410011172057e-07, + "logits/chosen": -0.5554739832878113, + "logits/rejected": -0.6344595551490784, + "logps/chosen": -53.00008773803711, + "logps/rejected": -109.51982116699219, + "loss": 0.6794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2965991497039795, + "rewards/margins": 7.105621337890625, + "rewards/rejected": -3.8090221881866455, + "step": 17631 + }, + { + "epoch": 4.41, + "grad_norm": 4.680538654327393, + "learning_rate": 3.3815680194841384e-07, + "logits/chosen": -0.5512328743934631, + "logits/rejected": -0.5902957916259766, + "logps/chosen": -50.472145080566406, + "logps/rejected": -99.4933853149414, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0823912620544434, + "rewards/margins": 6.641412258148193, + "rewards/rejected": -3.559021472930908, + "step": 17632 + }, + { + "epoch": 4.41, + "grad_norm": 8.832080841064453, + "learning_rate": 3.3787271797872725e-07, + "logits/chosen": -0.5765644311904907, + "logits/rejected": -0.6821064352989197, + "logps/chosen": -66.89600372314453, + "logps/rejected": -122.74375915527344, + "loss": 0.6624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3015944957733154, + "rewards/margins": 7.550244331359863, + "rewards/rejected": -4.2486491203308105, + "step": 17633 + }, + { + "epoch": 4.41, + "grad_norm": 3.5386533737182617, + "learning_rate": 3.375887492151647e-07, + "logits/chosen": -0.5043153762817383, + "logits/rejected": -0.5417126417160034, + "logps/chosen": -51.830284118652344, + "logps/rejected": -114.1044692993164, + "loss": 0.513, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3289177417755127, + "rewards/margins": 7.55534553527832, + "rewards/rejected": -4.226428031921387, + "step": 17634 + }, + { + "epoch": 4.41, + "grad_norm": 4.678965091705322, + "learning_rate": 3.3730489566474236e-07, + "logits/chosen": -0.5837575793266296, + "logits/rejected": -0.66526859998703, + "logps/chosen": -54.70258331298828, + "logps/rejected": -103.12020874023438, + "loss": 0.605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4509453773498535, + "rewards/margins": 7.485419273376465, + "rewards/rejected": -4.034473419189453, + "step": 17635 + }, + { + "epoch": 4.41, + "grad_norm": 3.4072964191436768, + "learning_rate": 3.3702115733447504e-07, + "logits/chosen": -0.5170499682426453, + "logits/rejected": -0.604173481464386, + "logps/chosen": -51.02048110961914, + "logps/rejected": -106.59321594238281, + "loss": 0.5515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8545970916748047, + "rewards/margins": 6.638933181762695, + "rewards/rejected": -3.7843356132507324, + "step": 17636 + }, + { + "epoch": 4.41, + "grad_norm": 4.478281497955322, + "learning_rate": 3.3673753423137614e-07, + "logits/chosen": -0.48680379986763, + "logits/rejected": -0.5709131956100464, + "logps/chosen": -67.06396484375, + "logps/rejected": -120.0659408569336, + "loss": 0.6329, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4655675888061523, + "rewards/margins": 8.059752464294434, + "rewards/rejected": -4.594184875488281, + "step": 17637 + }, + { + "epoch": 4.41, + "grad_norm": 13.218073844909668, + "learning_rate": 3.364540263624527e-07, + "logits/chosen": -0.47785684466362, + "logits/rejected": -0.5802902579307556, + "logps/chosen": -59.962982177734375, + "logps/rejected": -92.93009185791016, + "loss": 0.633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.101637363433838, + "rewards/margins": 6.25942325592041, + "rewards/rejected": -3.1577863693237305, + "step": 17638 + }, + { + "epoch": 4.41, + "grad_norm": 3.727123737335205, + "learning_rate": 3.361706337347098e-07, + "logits/chosen": -0.5642647743225098, + "logits/rejected": -0.5890248417854309, + "logps/chosen": -60.518489837646484, + "logps/rejected": -116.8943099975586, + "loss": 0.6189, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0799036026000977, + "rewards/margins": 7.0412750244140625, + "rewards/rejected": -3.961371421813965, + "step": 17639 + }, + { + "epoch": 4.41, + "grad_norm": 5.248614311218262, + "learning_rate": 3.3588735635515177e-07, + "logits/chosen": -0.5803470015525818, + "logits/rejected": -0.6868734955787659, + "logps/chosen": -57.10094451904297, + "logps/rejected": -115.30474853515625, + "loss": 0.6211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.301744222640991, + "rewards/margins": 8.812840461730957, + "rewards/rejected": -5.511096000671387, + "step": 17640 + }, + { + "epoch": 4.41, + "grad_norm": 3.9814515113830566, + "learning_rate": 3.356041942307786e-07, + "logits/chosen": -0.4970139265060425, + "logits/rejected": -0.5750187635421753, + "logps/chosen": -60.818695068359375, + "logps/rejected": -124.33026885986328, + "loss": 0.6011, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3350844383239746, + "rewards/margins": 7.4697065353393555, + "rewards/rejected": -4.134621620178223, + "step": 17641 + }, + { + "epoch": 4.41, + "grad_norm": 2.735459804534912, + "learning_rate": 3.353211473685858e-07, + "logits/chosen": -0.5836731791496277, + "logits/rejected": -0.6582301259040833, + "logps/chosen": -53.364871978759766, + "logps/rejected": -127.29586029052734, + "loss": 0.6792, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5009827613830566, + "rewards/margins": 8.111504554748535, + "rewards/rejected": -4.61052131652832, + "step": 17642 + }, + { + "epoch": 4.41, + "grad_norm": 7.639667987823486, + "learning_rate": 3.3503821577556994e-07, + "logits/chosen": -0.5433018207550049, + "logits/rejected": -0.6359211802482605, + "logps/chosen": -59.394779205322266, + "logps/rejected": -89.82716369628906, + "loss": 0.7146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4653406143188477, + "rewards/margins": 5.286515235900879, + "rewards/rejected": -1.8211745023727417, + "step": 17643 + }, + { + "epoch": 4.41, + "grad_norm": 4.932564735412598, + "learning_rate": 3.347553994587216e-07, + "logits/chosen": -0.5311657190322876, + "logits/rejected": -0.6462704539299011, + "logps/chosen": -53.725303649902344, + "logps/rejected": -100.88233947753906, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9180665016174316, + "rewards/margins": 7.415306568145752, + "rewards/rejected": -4.49724006652832, + "step": 17644 + }, + { + "epoch": 4.41, + "grad_norm": 3.7121543884277344, + "learning_rate": 3.3447269842502906e-07, + "logits/chosen": -0.5698365569114685, + "logits/rejected": -0.6182644963264465, + "logps/chosen": -52.22066879272461, + "logps/rejected": -112.3300552368164, + "loss": 0.6185, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.395796775817871, + "rewards/margins": 7.013821601867676, + "rewards/rejected": -3.6180248260498047, + "step": 17645 + }, + { + "epoch": 4.41, + "grad_norm": 12.968816757202148, + "learning_rate": 3.341901126814784e-07, + "logits/chosen": -0.6691989898681641, + "logits/rejected": -0.719281792640686, + "logps/chosen": -54.66007614135742, + "logps/rejected": -115.4155044555664, + "loss": 0.8456, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.955392360687256, + "rewards/margins": 7.0192413330078125, + "rewards/rejected": -4.063849449157715, + "step": 17646 + }, + { + "epoch": 4.41, + "grad_norm": 3.8818063735961914, + "learning_rate": 3.339076422350546e-07, + "logits/chosen": -0.5927868485450745, + "logits/rejected": -0.6841060519218445, + "logps/chosen": -59.43543243408203, + "logps/rejected": -98.51466369628906, + "loss": 0.6528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1517531871795654, + "rewards/margins": 6.372212886810303, + "rewards/rejected": -3.220459461212158, + "step": 17647 + }, + { + "epoch": 4.41, + "grad_norm": 8.990621566772461, + "learning_rate": 3.3362528709273423e-07, + "logits/chosen": -0.5212026238441467, + "logits/rejected": -0.6416721343994141, + "logps/chosen": -56.086265563964844, + "logps/rejected": -94.43095397949219, + "loss": 0.6393, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.217982292175293, + "rewards/margins": 6.793942451477051, + "rewards/rejected": -3.5759596824645996, + "step": 17648 + }, + { + "epoch": 4.42, + "grad_norm": 7.575989723205566, + "learning_rate": 3.3334304726149626e-07, + "logits/chosen": -0.5449302196502686, + "logits/rejected": -0.581794798374176, + "logps/chosen": -63.2384147644043, + "logps/rejected": -95.542236328125, + "loss": 0.7006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.086827278137207, + "rewards/margins": 6.276600360870361, + "rewards/rejected": -3.189772844314575, + "step": 17649 + }, + { + "epoch": 4.42, + "grad_norm": 4.438827037811279, + "learning_rate": 3.3306092274831613e-07, + "logits/chosen": -0.47336435317993164, + "logits/rejected": -0.5882089138031006, + "logps/chosen": -53.41533660888672, + "logps/rejected": -102.07234191894531, + "loss": 0.5962, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3206958770751953, + "rewards/margins": 7.670531749725342, + "rewards/rejected": -4.3498358726501465, + "step": 17650 + }, + { + "epoch": 4.42, + "grad_norm": 3.974904775619507, + "learning_rate": 3.3277891356016445e-07, + "logits/chosen": -0.5133541822433472, + "logits/rejected": -0.6264837980270386, + "logps/chosen": -78.55384826660156, + "logps/rejected": -120.23954010009766, + "loss": 0.6614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.044808864593506, + "rewards/margins": 7.450691223144531, + "rewards/rejected": -4.405882835388184, + "step": 17651 + }, + { + "epoch": 4.42, + "grad_norm": 12.10937213897705, + "learning_rate": 3.3249701970401004e-07, + "logits/chosen": -0.5270944237709045, + "logits/rejected": -0.6156133413314819, + "logps/chosen": -48.54027557373047, + "logps/rejected": -96.62911987304688, + "loss": 0.5504, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.983229875564575, + "rewards/margins": 6.8584489822387695, + "rewards/rejected": -3.8752188682556152, + "step": 17652 + }, + { + "epoch": 4.42, + "grad_norm": 3.3270130157470703, + "learning_rate": 3.3221524118681903e-07, + "logits/chosen": -0.6212040185928345, + "logits/rejected": -0.6501377820968628, + "logps/chosen": -50.58392333984375, + "logps/rejected": -120.94599914550781, + "loss": 0.6032, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1214728355407715, + "rewards/margins": 6.788386344909668, + "rewards/rejected": -3.6669135093688965, + "step": 17653 + }, + { + "epoch": 4.42, + "grad_norm": 6.155774116516113, + "learning_rate": 3.319335780155536e-07, + "logits/chosen": -0.5934492349624634, + "logits/rejected": -0.5854978561401367, + "logps/chosen": -55.102481842041016, + "logps/rejected": -118.28929901123047, + "loss": 0.5846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0343737602233887, + "rewards/margins": 6.685028076171875, + "rewards/rejected": -3.6506543159484863, + "step": 17654 + }, + { + "epoch": 4.42, + "grad_norm": 4.421265602111816, + "learning_rate": 3.316520301971754e-07, + "logits/chosen": -0.5206788182258606, + "logits/rejected": -0.6213926672935486, + "logps/chosen": -62.114505767822266, + "logps/rejected": -129.3359375, + "loss": 0.6073, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.31923770904541, + "rewards/margins": 9.283536911010742, + "rewards/rejected": -5.964299201965332, + "step": 17655 + }, + { + "epoch": 4.42, + "grad_norm": 4.192867279052734, + "learning_rate": 3.313705977386411e-07, + "logits/chosen": -0.49162882566452026, + "logits/rejected": -0.5500288605690002, + "logps/chosen": -50.842411041259766, + "logps/rejected": -106.33161926269531, + "loss": 0.5823, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120173454284668, + "rewards/margins": 6.2013139724731445, + "rewards/rejected": -3.0811405181884766, + "step": 17656 + }, + { + "epoch": 4.42, + "grad_norm": 2.6690430641174316, + "learning_rate": 3.3108928064690407e-07, + "logits/chosen": -0.5479886531829834, + "logits/rejected": -0.6436677575111389, + "logps/chosen": -38.56422424316406, + "logps/rejected": -97.02518463134766, + "loss": 0.4847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2217578887939453, + "rewards/margins": 7.448444843292236, + "rewards/rejected": -4.226686954498291, + "step": 17657 + }, + { + "epoch": 4.42, + "grad_norm": 8.661688804626465, + "learning_rate": 3.308080789289181e-07, + "logits/chosen": -0.4767531156539917, + "logits/rejected": -0.5701078176498413, + "logps/chosen": -62.878028869628906, + "logps/rejected": -111.95636749267578, + "loss": 0.686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.21832537651062, + "rewards/margins": 7.6160969734191895, + "rewards/rejected": -4.397771835327148, + "step": 17658 + }, + { + "epoch": 4.42, + "grad_norm": 6.4741973876953125, + "learning_rate": 3.3052699259163047e-07, + "logits/chosen": -0.5401006937026978, + "logits/rejected": -0.6467501521110535, + "logps/chosen": -52.37550354003906, + "logps/rejected": -113.57637023925781, + "loss": 0.6271, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3733065128326416, + "rewards/margins": 8.549083709716797, + "rewards/rejected": -5.175776481628418, + "step": 17659 + }, + { + "epoch": 4.42, + "grad_norm": 2.3785035610198975, + "learning_rate": 3.302460216419873e-07, + "logits/chosen": -0.5641627907752991, + "logits/rejected": -0.6629099249839783, + "logps/chosen": -50.66460418701172, + "logps/rejected": -109.99955749511719, + "loss": 0.6286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4275832176208496, + "rewards/margins": 7.998698711395264, + "rewards/rejected": -4.571115493774414, + "step": 17660 + }, + { + "epoch": 4.42, + "grad_norm": 11.629786491394043, + "learning_rate": 3.299651660869313e-07, + "logits/chosen": -0.47761160135269165, + "logits/rejected": -0.5521992444992065, + "logps/chosen": -56.741668701171875, + "logps/rejected": -119.61975860595703, + "loss": 0.5803, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3198063373565674, + "rewards/margins": 7.85966682434082, + "rewards/rejected": -4.539859771728516, + "step": 17661 + }, + { + "epoch": 4.42, + "grad_norm": 6.5311126708984375, + "learning_rate": 3.296844259334053e-07, + "logits/chosen": -0.6164986491203308, + "logits/rejected": -0.7425832748413086, + "logps/chosen": -60.388187408447266, + "logps/rejected": -114.4492416381836, + "loss": 0.6856, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9392306804656982, + "rewards/margins": 8.458355903625488, + "rewards/rejected": -5.519125461578369, + "step": 17662 + }, + { + "epoch": 4.42, + "grad_norm": 4.91733980178833, + "learning_rate": 3.294038011883433e-07, + "logits/chosen": -0.46471190452575684, + "logits/rejected": -0.6070191264152527, + "logps/chosen": -55.295936584472656, + "logps/rejected": -103.28790283203125, + "loss": 0.643, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2079992294311523, + "rewards/margins": 8.373615264892578, + "rewards/rejected": -5.165616989135742, + "step": 17663 + }, + { + "epoch": 4.42, + "grad_norm": 6.553837776184082, + "learning_rate": 3.2912329185868065e-07, + "logits/chosen": -0.4936814308166504, + "logits/rejected": -0.5533769130706787, + "logps/chosen": -71.25884246826172, + "logps/rejected": -127.308837890625, + "loss": 0.6857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1728200912475586, + "rewards/margins": 7.580392837524414, + "rewards/rejected": -4.407573223114014, + "step": 17664 + }, + { + "epoch": 4.42, + "grad_norm": 4.2716474533081055, + "learning_rate": 3.288428979513514e-07, + "logits/chosen": -0.5501606464385986, + "logits/rejected": -0.6145491003990173, + "logps/chosen": -57.78694534301758, + "logps/rejected": -120.02071380615234, + "loss": 0.5565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.327538013458252, + "rewards/margins": 8.080155372619629, + "rewards/rejected": -4.752617835998535, + "step": 17665 + }, + { + "epoch": 4.42, + "grad_norm": 10.022517204284668, + "learning_rate": 3.2856261947328103e-07, + "logits/chosen": -0.5866108536720276, + "logits/rejected": -0.6513628363609314, + "logps/chosen": -47.84834289550781, + "logps/rejected": -107.62640380859375, + "loss": 0.558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0560202598571777, + "rewards/margins": 8.056597709655762, + "rewards/rejected": -5.000576972961426, + "step": 17666 + }, + { + "epoch": 4.42, + "grad_norm": 6.345032691955566, + "learning_rate": 3.2828245643139747e-07, + "logits/chosen": -0.608161211013794, + "logits/rejected": -0.6595284342765808, + "logps/chosen": -46.393348693847656, + "logps/rejected": -115.2431640625, + "loss": 0.5777, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8161141872406006, + "rewards/margins": 7.714504241943359, + "rewards/rejected": -4.89838981628418, + "step": 17667 + }, + { + "epoch": 4.42, + "grad_norm": 2.586876392364502, + "learning_rate": 3.280024088326239e-07, + "logits/chosen": -0.5806320905685425, + "logits/rejected": -0.6670634746551514, + "logps/chosen": -61.04909133911133, + "logps/rejected": -117.69286346435547, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34858775138855, + "rewards/margins": 7.841142654418945, + "rewards/rejected": -4.492554187774658, + "step": 17668 + }, + { + "epoch": 4.42, + "grad_norm": 4.196495532989502, + "learning_rate": 3.277224766838805e-07, + "logits/chosen": -0.49373549222946167, + "logits/rejected": -0.5462965369224548, + "logps/chosen": -50.85725402832031, + "logps/rejected": -112.13333892822266, + "loss": 0.5534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.912658214569092, + "rewards/margins": 7.5173749923706055, + "rewards/rejected": -4.6047163009643555, + "step": 17669 + }, + { + "epoch": 4.42, + "grad_norm": 5.131835460662842, + "learning_rate": 3.2744265999208325e-07, + "logits/chosen": -0.5647368431091309, + "logits/rejected": -0.6366599798202515, + "logps/chosen": -54.718467712402344, + "logps/rejected": -108.577880859375, + "loss": 0.6271, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3135485649108887, + "rewards/margins": 7.963379859924316, + "rewards/rejected": -4.649831771850586, + "step": 17670 + }, + { + "epoch": 4.42, + "grad_norm": 3.6974570751190186, + "learning_rate": 3.271629587641484e-07, + "logits/chosen": -0.5244700908660889, + "logits/rejected": -0.6029149293899536, + "logps/chosen": -54.413612365722656, + "logps/rejected": -100.93891906738281, + "loss": 0.6057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8118069171905518, + "rewards/margins": 7.2486066818237305, + "rewards/rejected": -4.4367995262146, + "step": 17671 + }, + { + "epoch": 4.42, + "grad_norm": 4.043921947479248, + "learning_rate": 3.268833730069876e-07, + "logits/chosen": -0.5084562301635742, + "logits/rejected": -0.6118765473365784, + "logps/chosen": -49.45747756958008, + "logps/rejected": -127.21051788330078, + "loss": 0.5427, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225005626678467, + "rewards/margins": 7.926833152770996, + "rewards/rejected": -4.701827049255371, + "step": 17672 + }, + { + "epoch": 4.42, + "grad_norm": 4.814972877502441, + "learning_rate": 3.2660390272750807e-07, + "logits/chosen": -0.5881682634353638, + "logits/rejected": -0.6481797695159912, + "logps/chosen": -50.47189712524414, + "logps/rejected": -116.56434631347656, + "loss": 0.6294, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2139532566070557, + "rewards/margins": 7.991166591644287, + "rewards/rejected": -4.7772135734558105, + "step": 17673 + }, + { + "epoch": 4.42, + "grad_norm": 6.1766767501831055, + "learning_rate": 3.2632454793261827e-07, + "logits/chosen": -0.5321065783500671, + "logits/rejected": -0.5803734064102173, + "logps/chosen": -57.248138427734375, + "logps/rejected": -101.74159240722656, + "loss": 0.632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.184718370437622, + "rewards/margins": 6.8339924812316895, + "rewards/rejected": -3.6492743492126465, + "step": 17674 + }, + { + "epoch": 4.42, + "grad_norm": 9.528226852416992, + "learning_rate": 3.260453086292187e-07, + "logits/chosen": -0.6013952493667603, + "logits/rejected": -0.6875771284103394, + "logps/chosen": -66.64277648925781, + "logps/rejected": -106.29825592041016, + "loss": 0.6978, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.130967140197754, + "rewards/margins": 6.072566986083984, + "rewards/rejected": -2.9416003227233887, + "step": 17675 + }, + { + "epoch": 4.42, + "grad_norm": 4.263554096221924, + "learning_rate": 3.257661848242116e-07, + "logits/chosen": -0.5431074500083923, + "logits/rejected": -0.5931837558746338, + "logps/chosen": -41.32843017578125, + "logps/rejected": -94.34661865234375, + "loss": 0.5957, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.04404354095459, + "rewards/margins": 6.789484024047852, + "rewards/rejected": -3.74544095993042, + "step": 17676 + }, + { + "epoch": 4.42, + "grad_norm": 2.738980531692505, + "learning_rate": 3.2548717652449435e-07, + "logits/chosen": -0.5874574184417725, + "logits/rejected": -0.6767191886901855, + "logps/chosen": -49.957984924316406, + "logps/rejected": -107.24163818359375, + "loss": 0.5409, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2543575763702393, + "rewards/margins": 7.748859882354736, + "rewards/rejected": -4.494502067565918, + "step": 17677 + }, + { + "epoch": 4.42, + "grad_norm": 4.530860900878906, + "learning_rate": 3.252082837369602e-07, + "logits/chosen": -0.5650895237922668, + "logits/rejected": -0.6201640367507935, + "logps/chosen": -54.82817077636719, + "logps/rejected": -119.43051147460938, + "loss": 0.6538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3259992599487305, + "rewards/margins": 6.8728766441345215, + "rewards/rejected": -3.5468766689300537, + "step": 17678 + }, + { + "epoch": 4.42, + "grad_norm": 5.239708423614502, + "learning_rate": 3.2492950646850154e-07, + "logits/chosen": -0.5272765755653381, + "logits/rejected": -0.6374602317810059, + "logps/chosen": -62.335289001464844, + "logps/rejected": -110.28338623046875, + "loss": 0.6088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1606643199920654, + "rewards/margins": 8.454313278198242, + "rewards/rejected": -5.293648719787598, + "step": 17679 + }, + { + "epoch": 4.42, + "grad_norm": 4.254561424255371, + "learning_rate": 3.2465084472600774e-07, + "logits/chosen": -0.5976940989494324, + "logits/rejected": -0.6746407747268677, + "logps/chosen": -52.200408935546875, + "logps/rejected": -96.0712661743164, + "loss": 0.6288, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.163473129272461, + "rewards/margins": 6.452506065368652, + "rewards/rejected": -3.2890326976776123, + "step": 17680 + }, + { + "epoch": 4.42, + "grad_norm": 3.87056565284729, + "learning_rate": 3.24372298516365e-07, + "logits/chosen": -0.5147295594215393, + "logits/rejected": -0.6080385446548462, + "logps/chosen": -63.90658187866211, + "logps/rejected": -113.86717987060547, + "loss": 0.6411, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8543765544891357, + "rewards/margins": 7.712543487548828, + "rewards/rejected": -4.8581671714782715, + "step": 17681 + }, + { + "epoch": 4.42, + "grad_norm": 4.888230800628662, + "learning_rate": 3.2409386784645504e-07, + "logits/chosen": -0.5112963318824768, + "logits/rejected": -0.5812937617301941, + "logps/chosen": -61.07522201538086, + "logps/rejected": -125.04137420654297, + "loss": 0.5803, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6515517234802246, + "rewards/margins": 7.910152435302734, + "rewards/rejected": -5.258600234985352, + "step": 17682 + }, + { + "epoch": 4.42, + "grad_norm": 5.233627796173096, + "learning_rate": 3.238155527231601e-07, + "logits/chosen": -0.5470472574234009, + "logits/rejected": -0.6284949779510498, + "logps/chosen": -57.04063415527344, + "logps/rejected": -102.61688995361328, + "loss": 0.5694, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0223982334136963, + "rewards/margins": 6.554065704345703, + "rewards/rejected": -3.5316667556762695, + "step": 17683 + }, + { + "epoch": 4.42, + "grad_norm": 6.857227325439453, + "learning_rate": 3.2353735315335646e-07, + "logits/chosen": -0.5609108209609985, + "logits/rejected": -0.6206781268119812, + "logps/chosen": -53.22435760498047, + "logps/rejected": -119.52334594726562, + "loss": 0.6539, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.073883056640625, + "rewards/margins": 7.051281929016113, + "rewards/rejected": -3.9773988723754883, + "step": 17684 + }, + { + "epoch": 4.42, + "grad_norm": 2.6056485176086426, + "learning_rate": 3.2325926914391846e-07, + "logits/chosen": -0.4702466130256653, + "logits/rejected": -0.6245881915092468, + "logps/chosen": -55.226036071777344, + "logps/rejected": -99.32475280761719, + "loss": 0.5141, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2658698558807373, + "rewards/margins": 7.518760681152344, + "rewards/rejected": -4.252890586853027, + "step": 17685 + }, + { + "epoch": 4.42, + "grad_norm": 7.854556560516357, + "learning_rate": 3.229813007017185e-07, + "logits/chosen": -0.5415915846824646, + "logits/rejected": -0.6233595609664917, + "logps/chosen": -47.40350341796875, + "logps/rejected": -94.8310775756836, + "loss": 0.6285, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.04542875289917, + "rewards/margins": 5.712299823760986, + "rewards/rejected": -2.6668710708618164, + "step": 17686 + }, + { + "epoch": 4.42, + "grad_norm": 6.24489688873291, + "learning_rate": 3.227034478336255e-07, + "logits/chosen": -0.5798664689064026, + "logits/rejected": -0.6440340876579285, + "logps/chosen": -60.199127197265625, + "logps/rejected": -103.18830871582031, + "loss": 0.6182, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.215646266937256, + "rewards/margins": 6.157064437866211, + "rewards/rejected": -2.941418409347534, + "step": 17687 + }, + { + "epoch": 4.42, + "grad_norm": 6.892159461975098, + "learning_rate": 3.224257105465051e-07, + "logits/chosen": -0.5462358593940735, + "logits/rejected": -0.5975167155265808, + "logps/chosen": -52.84473419189453, + "logps/rejected": -118.51800537109375, + "loss": 0.6311, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0177106857299805, + "rewards/margins": 7.087137222290039, + "rewards/rejected": -4.069427013397217, + "step": 17688 + }, + { + "epoch": 4.43, + "grad_norm": 11.892644882202148, + "learning_rate": 3.2214808884722004e-07, + "logits/chosen": -0.5072878003120422, + "logits/rejected": -0.5975430607795715, + "logps/chosen": -55.48595428466797, + "logps/rejected": -95.46979522705078, + "loss": 0.6307, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3096487522125244, + "rewards/margins": 7.118212699890137, + "rewards/rejected": -3.808563709259033, + "step": 17689 + }, + { + "epoch": 4.43, + "grad_norm": 3.2890987396240234, + "learning_rate": 3.218705827426333e-07, + "logits/chosen": -0.5875844955444336, + "logits/rejected": -0.6270204782485962, + "logps/chosen": -52.16868209838867, + "logps/rejected": -133.8373260498047, + "loss": 0.5611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9739139080047607, + "rewards/margins": 7.859191417694092, + "rewards/rejected": -4.885277271270752, + "step": 17690 + }, + { + "epoch": 4.43, + "grad_norm": 5.706084728240967, + "learning_rate": 3.215931922395982e-07, + "logits/chosen": -0.558824896812439, + "logits/rejected": -0.6319605708122253, + "logps/chosen": -61.367218017578125, + "logps/rejected": -113.04872131347656, + "loss": 0.638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.074032783508301, + "rewards/margins": 7.567115783691406, + "rewards/rejected": -4.4930830001831055, + "step": 17691 + }, + { + "epoch": 4.43, + "grad_norm": 2.4908714294433594, + "learning_rate": 3.2131591734497204e-07, + "logits/chosen": -0.6945863366127014, + "logits/rejected": -0.7872586250305176, + "logps/chosen": -40.7664680480957, + "logps/rejected": -92.19288635253906, + "loss": 0.5703, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9472033977508545, + "rewards/margins": 6.948851585388184, + "rewards/rejected": -4.00164794921875, + "step": 17692 + }, + { + "epoch": 4.43, + "grad_norm": 6.939419269561768, + "learning_rate": 3.210387580656066e-07, + "logits/chosen": -0.5431667566299438, + "logits/rejected": -0.6384976506233215, + "logps/chosen": -48.98451232910156, + "logps/rejected": -103.14014434814453, + "loss": 0.5637, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1534767150878906, + "rewards/margins": 6.744527339935303, + "rewards/rejected": -3.591050624847412, + "step": 17693 + }, + { + "epoch": 4.43, + "grad_norm": 3.227073907852173, + "learning_rate": 3.2076171440835023e-07, + "logits/chosen": -0.5579920411109924, + "logits/rejected": -0.6652868390083313, + "logps/chosen": -42.074623107910156, + "logps/rejected": -116.9063720703125, + "loss": 0.4759, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4560418128967285, + "rewards/margins": 9.222169876098633, + "rewards/rejected": -5.766128063201904, + "step": 17694 + }, + { + "epoch": 4.43, + "grad_norm": 26.955978393554688, + "learning_rate": 3.204847863800481e-07, + "logits/chosen": -0.5283809900283813, + "logits/rejected": -0.5957176089286804, + "logps/chosen": -57.93692398071289, + "logps/rejected": -109.01097106933594, + "loss": 0.7354, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1130635738372803, + "rewards/margins": 6.536190986633301, + "rewards/rejected": -3.4231274127960205, + "step": 17695 + }, + { + "epoch": 4.43, + "grad_norm": 4.623409748077393, + "learning_rate": 3.202079739875452e-07, + "logits/chosen": -0.5886619091033936, + "logits/rejected": -0.6635096073150635, + "logps/chosen": -54.90346908569336, + "logps/rejected": -121.01571655273438, + "loss": 0.7161, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1845479011535645, + "rewards/margins": 8.724656105041504, + "rewards/rejected": -5.540107250213623, + "step": 17696 + }, + { + "epoch": 4.43, + "grad_norm": 4.76540470123291, + "learning_rate": 3.199312772376806e-07, + "logits/chosen": -0.538274347782135, + "logits/rejected": -0.6360337138175964, + "logps/chosen": -50.24555206298828, + "logps/rejected": -111.60491943359375, + "loss": 0.6391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3209445476531982, + "rewards/margins": 8.261491775512695, + "rewards/rejected": -4.940546989440918, + "step": 17697 + }, + { + "epoch": 4.43, + "grad_norm": 2.04803204536438, + "learning_rate": 3.196546961372915e-07, + "logits/chosen": -0.49590736627578735, + "logits/rejected": -0.5184115767478943, + "logps/chosen": -42.8826789855957, + "logps/rejected": -114.23030853271484, + "loss": 0.5249, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1809072494506836, + "rewards/margins": 7.543476104736328, + "rewards/rejected": -4.362569808959961, + "step": 17698 + }, + { + "epoch": 4.43, + "grad_norm": 4.3324384689331055, + "learning_rate": 3.193782306932142e-07, + "logits/chosen": -0.6072059869766235, + "logits/rejected": -0.7178207635879517, + "logps/chosen": -42.09676742553711, + "logps/rejected": -97.33586883544922, + "loss": 0.5847, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2068228721618652, + "rewards/margins": 7.834428310394287, + "rewards/rejected": -4.6276044845581055, + "step": 17699 + }, + { + "epoch": 4.43, + "grad_norm": 5.0050153732299805, + "learning_rate": 3.1910188091227813e-07, + "logits/chosen": -0.507591724395752, + "logits/rejected": -0.643490195274353, + "logps/chosen": -57.39763259887695, + "logps/rejected": -89.65872192382812, + "loss": 0.5565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.255580186843872, + "rewards/margins": 7.222784042358398, + "rewards/rejected": -3.9672038555145264, + "step": 17700 + }, + { + "epoch": 4.43, + "grad_norm": 8.516911506652832, + "learning_rate": 3.18825646801314e-07, + "logits/chosen": -0.5393782258033752, + "logits/rejected": -0.6345868706703186, + "logps/chosen": -58.60211181640625, + "logps/rejected": -115.76515197753906, + "loss": 0.5571, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.758931875228882, + "rewards/margins": 7.624462127685547, + "rewards/rejected": -4.865530014038086, + "step": 17701 + }, + { + "epoch": 4.43, + "grad_norm": 6.084005832672119, + "learning_rate": 3.185495283671475e-07, + "logits/chosen": -0.5859928727149963, + "logits/rejected": -0.6703278422355652, + "logps/chosen": -45.57419204711914, + "logps/rejected": -112.59248352050781, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9884836673736572, + "rewards/margins": 7.4058146476745605, + "rewards/rejected": -4.417329788208008, + "step": 17702 + }, + { + "epoch": 4.43, + "grad_norm": 4.30954647064209, + "learning_rate": 3.1827352561660084e-07, + "logits/chosen": -0.5310032367706299, + "logits/rejected": -0.6438069939613342, + "logps/chosen": -52.80152130126953, + "logps/rejected": -92.6436767578125, + "loss": 0.5435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0125176906585693, + "rewards/margins": 6.698491096496582, + "rewards/rejected": -3.68597412109375, + "step": 17703 + }, + { + "epoch": 4.43, + "grad_norm": 6.043856143951416, + "learning_rate": 3.179976385564953e-07, + "logits/chosen": -0.5380448698997498, + "logits/rejected": -0.6308497786521912, + "logps/chosen": -48.88552474975586, + "logps/rejected": -86.03329467773438, + "loss": 0.5753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177319049835205, + "rewards/margins": 7.2275004386901855, + "rewards/rejected": -4.050180912017822, + "step": 17704 + }, + { + "epoch": 4.43, + "grad_norm": 9.81413459777832, + "learning_rate": 3.1772186719364937e-07, + "logits/chosen": -0.5964179635047913, + "logits/rejected": -0.605837345123291, + "logps/chosen": -55.329410552978516, + "logps/rejected": -120.70243835449219, + "loss": 0.845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9638731479644775, + "rewards/margins": 7.041492462158203, + "rewards/rejected": -4.077619552612305, + "step": 17705 + }, + { + "epoch": 4.43, + "grad_norm": 7.525641918182373, + "learning_rate": 3.174462115348753e-07, + "logits/chosen": -0.5513803362846375, + "logits/rejected": -0.6205988526344299, + "logps/chosen": -55.1114616394043, + "logps/rejected": -108.62612915039062, + "loss": 0.6827, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0283939838409424, + "rewards/margins": 6.8613362312316895, + "rewards/rejected": -3.832941770553589, + "step": 17706 + }, + { + "epoch": 4.43, + "grad_norm": 5.367796897888184, + "learning_rate": 3.1717067158698544e-07, + "logits/chosen": -0.4719882905483246, + "logits/rejected": -0.5561965703964233, + "logps/chosen": -64.87833404541016, + "logps/rejected": -125.65357971191406, + "loss": 0.6716, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.931057929992676, + "rewards/margins": 7.868152618408203, + "rewards/rejected": -4.937093734741211, + "step": 17707 + }, + { + "epoch": 4.43, + "grad_norm": 3.0623421669006348, + "learning_rate": 3.1689524735679045e-07, + "logits/chosen": -0.4842453896999359, + "logits/rejected": -0.6102151870727539, + "logps/chosen": -65.21762084960938, + "logps/rejected": -84.3013687133789, + "loss": 0.5947, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4566943645477295, + "rewards/margins": 6.964571952819824, + "rewards/rejected": -3.507877826690674, + "step": 17708 + }, + { + "epoch": 4.43, + "grad_norm": 8.603241920471191, + "learning_rate": 3.1661993885109434e-07, + "logits/chosen": -0.5550776720046997, + "logits/rejected": -0.6337870359420776, + "logps/chosen": -61.3552131652832, + "logps/rejected": -101.1898193359375, + "loss": 0.7818, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9559624195098877, + "rewards/margins": 5.675801753997803, + "rewards/rejected": -2.719839572906494, + "step": 17709 + }, + { + "epoch": 4.43, + "grad_norm": 3.294344186782837, + "learning_rate": 3.163447460767005e-07, + "logits/chosen": -0.4972113370895386, + "logits/rejected": -0.5884186029434204, + "logps/chosen": -47.586273193359375, + "logps/rejected": -107.51872253417969, + "loss": 0.529, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1184535026550293, + "rewards/margins": 8.040043830871582, + "rewards/rejected": -4.921590328216553, + "step": 17710 + }, + { + "epoch": 4.43, + "grad_norm": 3.3237056732177734, + "learning_rate": 3.1606966904041026e-07, + "logits/chosen": -0.5333324670791626, + "logits/rejected": -0.6272203922271729, + "logps/chosen": -63.42864227294922, + "logps/rejected": -103.61674499511719, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1247761249542236, + "rewards/margins": 7.670348644256592, + "rewards/rejected": -4.545572280883789, + "step": 17711 + }, + { + "epoch": 4.43, + "grad_norm": 13.88824462890625, + "learning_rate": 3.157947077490209e-07, + "logits/chosen": -0.5906915664672852, + "logits/rejected": -0.632203221321106, + "logps/chosen": -53.910701751708984, + "logps/rejected": -108.53943634033203, + "loss": 0.7822, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2857797145843506, + "rewards/margins": 6.327916622161865, + "rewards/rejected": -3.042137384414673, + "step": 17712 + }, + { + "epoch": 4.43, + "grad_norm": 4.32663631439209, + "learning_rate": 3.1551986220932586e-07, + "logits/chosen": -0.5359350442886353, + "logits/rejected": -0.6029615998268127, + "logps/chosen": -39.34489822387695, + "logps/rejected": -95.07429504394531, + "loss": 0.5642, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.372602939605713, + "rewards/margins": 7.809591293334961, + "rewards/rejected": -4.436988830566406, + "step": 17713 + }, + { + "epoch": 4.43, + "grad_norm": 6.467526912689209, + "learning_rate": 3.152451324281186e-07, + "logits/chosen": -0.47360873222351074, + "logits/rejected": -0.5932230353355408, + "logps/chosen": -57.885589599609375, + "logps/rejected": -117.15415954589844, + "loss": 0.6476, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.825157880783081, + "rewards/margins": 6.874396324157715, + "rewards/rejected": -4.049238204956055, + "step": 17714 + }, + { + "epoch": 4.43, + "grad_norm": 4.271967887878418, + "learning_rate": 3.149705184121871e-07, + "logits/chosen": -0.602568507194519, + "logits/rejected": -0.681212842464447, + "logps/chosen": -47.304161071777344, + "logps/rejected": -125.60861206054688, + "loss": 0.568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.324777126312256, + "rewards/margins": 9.73381233215332, + "rewards/rejected": -6.409034252166748, + "step": 17715 + }, + { + "epoch": 4.43, + "grad_norm": 3.7577450275421143, + "learning_rate": 3.146960201683158e-07, + "logits/chosen": -0.5314124822616577, + "logits/rejected": -0.5826005935668945, + "logps/chosen": -60.82719421386719, + "logps/rejected": -109.59715270996094, + "loss": 0.6582, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0374627113342285, + "rewards/margins": 7.240922927856445, + "rewards/rejected": -4.203459739685059, + "step": 17716 + }, + { + "epoch": 4.43, + "grad_norm": 4.160747528076172, + "learning_rate": 3.14421637703291e-07, + "logits/chosen": -0.4669434130191803, + "logits/rejected": -0.6163817048072815, + "logps/chosen": -56.669071197509766, + "logps/rejected": -101.91617584228516, + "loss": 0.5568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2716007232666016, + "rewards/margins": 7.763744354248047, + "rewards/rejected": -4.492143154144287, + "step": 17717 + }, + { + "epoch": 4.43, + "grad_norm": 5.566540241241455, + "learning_rate": 3.141473710238901e-07, + "logits/chosen": -0.6124873757362366, + "logits/rejected": -0.6081601977348328, + "logps/chosen": -63.84497833251953, + "logps/rejected": -100.871337890625, + "loss": 0.7358, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8722150325775146, + "rewards/margins": 4.711641311645508, + "rewards/rejected": -1.8394261598587036, + "step": 17718 + }, + { + "epoch": 4.43, + "grad_norm": 3.3958826065063477, + "learning_rate": 3.1387322013689267e-07, + "logits/chosen": -0.6123697757720947, + "logits/rejected": -0.7152053117752075, + "logps/chosen": -49.97493362426758, + "logps/rejected": -109.61622619628906, + "loss": 0.6398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0077381134033203, + "rewards/margins": 8.087105751037598, + "rewards/rejected": -5.0793681144714355, + "step": 17719 + }, + { + "epoch": 4.43, + "grad_norm": 6.500575542449951, + "learning_rate": 3.135991850490722e-07, + "logits/chosen": -0.5288678407669067, + "logits/rejected": -0.6083967089653015, + "logps/chosen": -58.92851257324219, + "logps/rejected": -117.23255920410156, + "loss": 0.6362, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.873648166656494, + "rewards/margins": 7.293561935424805, + "rewards/rejected": -4.419914245605469, + "step": 17720 + }, + { + "epoch": 4.43, + "grad_norm": 4.894636631011963, + "learning_rate": 3.133252657671998e-07, + "logits/chosen": -0.5755991339683533, + "logits/rejected": -0.6452846527099609, + "logps/chosen": -50.58306884765625, + "logps/rejected": -94.22063446044922, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.041482925415039, + "rewards/margins": 6.356270790100098, + "rewards/rejected": -3.3147878646850586, + "step": 17721 + }, + { + "epoch": 4.43, + "grad_norm": 5.016247272491455, + "learning_rate": 3.130514622980463e-07, + "logits/chosen": -0.42984580993652344, + "logits/rejected": -0.5138075351715088, + "logps/chosen": -74.57984161376953, + "logps/rejected": -118.98519897460938, + "loss": 0.6982, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4374778270721436, + "rewards/margins": 8.348161697387695, + "rewards/rejected": -4.910683631896973, + "step": 17722 + }, + { + "epoch": 4.43, + "grad_norm": 9.694479942321777, + "learning_rate": 3.1277777464837513e-07, + "logits/chosen": -0.5320988893508911, + "logits/rejected": -0.5831586122512817, + "logps/chosen": -51.91017150878906, + "logps/rejected": -115.66410827636719, + "loss": 0.5917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3513200283050537, + "rewards/margins": 7.028884410858154, + "rewards/rejected": -3.6775641441345215, + "step": 17723 + }, + { + "epoch": 4.43, + "grad_norm": 12.339225769042969, + "learning_rate": 3.125042028249514e-07, + "logits/chosen": -0.5317448377609253, + "logits/rejected": -0.621016800403595, + "logps/chosen": -57.51237869262695, + "logps/rejected": -94.54963684082031, + "loss": 0.558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094496965408325, + "rewards/margins": 6.612659931182861, + "rewards/rejected": -3.518162488937378, + "step": 17724 + }, + { + "epoch": 4.43, + "grad_norm": 7.479452133178711, + "learning_rate": 3.122307468345342e-07, + "logits/chosen": -0.5851514339447021, + "logits/rejected": -0.6815650463104248, + "logps/chosen": -52.83932876586914, + "logps/rejected": -98.9217529296875, + "loss": 0.5998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.400804042816162, + "rewards/margins": 7.545162200927734, + "rewards/rejected": -4.144357681274414, + "step": 17725 + }, + { + "epoch": 4.43, + "grad_norm": 9.244017601013184, + "learning_rate": 3.119574066838821e-07, + "logits/chosen": -0.48981812596321106, + "logits/rejected": -0.5478613376617432, + "logps/chosen": -59.85950469970703, + "logps/rejected": -108.00337982177734, + "loss": 0.5933, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.198528289794922, + "rewards/margins": 6.550498008728027, + "rewards/rejected": -3.3519701957702637, + "step": 17726 + }, + { + "epoch": 4.43, + "grad_norm": 3.381558895111084, + "learning_rate": 3.1168418237974893e-07, + "logits/chosen": -0.5159755945205688, + "logits/rejected": -0.6052985787391663, + "logps/chosen": -58.113983154296875, + "logps/rejected": -124.71102905273438, + "loss": 0.5822, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4565916061401367, + "rewards/margins": 8.322705268859863, + "rewards/rejected": -4.866113662719727, + "step": 17727 + }, + { + "epoch": 4.43, + "grad_norm": 1.9950945377349854, + "learning_rate": 3.1141107392888506e-07, + "logits/chosen": -0.5089512467384338, + "logits/rejected": -0.6612541675567627, + "logps/chosen": -49.89225769042969, + "logps/rejected": -95.57132720947266, + "loss": 0.5629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.098355293273926, + "rewards/margins": 8.512020111083984, + "rewards/rejected": -5.413663864135742, + "step": 17728 + }, + { + "epoch": 4.44, + "grad_norm": 3.4129700660705566, + "learning_rate": 3.111380813380421e-07, + "logits/chosen": -0.6850175261497498, + "logits/rejected": -0.7837357521057129, + "logps/chosen": -61.20525360107422, + "logps/rejected": -95.07433319091797, + "loss": 0.6524, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1130659580230713, + "rewards/margins": 5.925619125366211, + "rewards/rejected": -2.8125524520874023, + "step": 17729 + }, + { + "epoch": 4.44, + "grad_norm": 6.52957010269165, + "learning_rate": 3.1086520461396373e-07, + "logits/chosen": -0.5595226287841797, + "logits/rejected": -0.6511995196342468, + "logps/chosen": -60.798091888427734, + "logps/rejected": -122.02204895019531, + "loss": 0.6941, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9797494411468506, + "rewards/margins": 7.544878959655762, + "rewards/rejected": -4.565129280090332, + "step": 17730 + }, + { + "epoch": 4.44, + "grad_norm": 5.309074878692627, + "learning_rate": 3.1059244376339337e-07, + "logits/chosen": -0.6029662489891052, + "logits/rejected": -0.6670517921447754, + "logps/chosen": -45.52729415893555, + "logps/rejected": -130.2176055908203, + "loss": 0.6093, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.392298460006714, + "rewards/margins": 8.981874465942383, + "rewards/rejected": -5.589575290679932, + "step": 17731 + }, + { + "epoch": 4.44, + "grad_norm": 3.45169997215271, + "learning_rate": 3.103197987930723e-07, + "logits/chosen": -0.4545018970966339, + "logits/rejected": -0.5298657417297363, + "logps/chosen": -64.52363586425781, + "logps/rejected": -111.15493774414062, + "loss": 0.5919, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989569664001465, + "rewards/margins": 7.624865531921387, + "rewards/rejected": -4.635295867919922, + "step": 17732 + }, + { + "epoch": 4.44, + "grad_norm": 6.384986877441406, + "learning_rate": 3.100472697097373e-07, + "logits/chosen": -0.593716025352478, + "logits/rejected": -0.68125981092453, + "logps/chosen": -49.89799118041992, + "logps/rejected": -105.0831069946289, + "loss": 0.6848, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.017944574356079, + "rewards/margins": 6.642692565917969, + "rewards/rejected": -3.6247479915618896, + "step": 17733 + }, + { + "epoch": 4.44, + "grad_norm": 3.0487396717071533, + "learning_rate": 3.0977485652012194e-07, + "logits/chosen": -0.5516513586044312, + "logits/rejected": -0.6682264804840088, + "logps/chosen": -56.46105194091797, + "logps/rejected": -119.0002670288086, + "loss": 0.5475, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2604050636291504, + "rewards/margins": 8.70872688293457, + "rewards/rejected": -5.448322296142578, + "step": 17734 + }, + { + "epoch": 4.44, + "grad_norm": 4.073707103729248, + "learning_rate": 3.095025592309586e-07, + "logits/chosen": -0.5344783067703247, + "logits/rejected": -0.6380906105041504, + "logps/chosen": -52.767127990722656, + "logps/rejected": -109.37054443359375, + "loss": 0.6073, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.968947649002075, + "rewards/margins": 6.955331325531006, + "rewards/rejected": -3.9863839149475098, + "step": 17735 + }, + { + "epoch": 4.44, + "grad_norm": 3.166361093521118, + "learning_rate": 3.092303778489769e-07, + "logits/chosen": -0.5811805725097656, + "logits/rejected": -0.6288741230964661, + "logps/chosen": -63.04857635498047, + "logps/rejected": -121.33856201171875, + "loss": 0.6474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.837476968765259, + "rewards/margins": 7.1766357421875, + "rewards/rejected": -4.339158535003662, + "step": 17736 + }, + { + "epoch": 4.44, + "grad_norm": 3.8751986026763916, + "learning_rate": 3.0895831238090145e-07, + "logits/chosen": -0.5831190347671509, + "logits/rejected": -0.6758813261985779, + "logps/chosen": -56.5940055847168, + "logps/rejected": -118.79864501953125, + "loss": 0.6045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4775962829589844, + "rewards/margins": 9.291242599487305, + "rewards/rejected": -5.813645839691162, + "step": 17737 + }, + { + "epoch": 4.44, + "grad_norm": 6.697956085205078, + "learning_rate": 3.086863628334552e-07, + "logits/chosen": -0.5739896297454834, + "logits/rejected": -0.6559842824935913, + "logps/chosen": -62.13361358642578, + "logps/rejected": -111.50942993164062, + "loss": 0.6775, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7598328590393066, + "rewards/margins": 6.879830360412598, + "rewards/rejected": -4.119997501373291, + "step": 17738 + }, + { + "epoch": 4.44, + "grad_norm": 3.8519444465637207, + "learning_rate": 3.0841452921336004e-07, + "logits/chosen": -0.5511195063591003, + "logits/rejected": -0.6079725623130798, + "logps/chosen": -52.21576690673828, + "logps/rejected": -110.81257629394531, + "loss": 0.589, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.28332781791687, + "rewards/margins": 7.528478622436523, + "rewards/rejected": -4.245151042938232, + "step": 17739 + }, + { + "epoch": 4.44, + "grad_norm": 3.720792531967163, + "learning_rate": 3.081428115273316e-07, + "logits/chosen": -0.6039244532585144, + "logits/rejected": -0.6724649667739868, + "logps/chosen": -42.616973876953125, + "logps/rejected": -89.35308837890625, + "loss": 0.5797, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.195751667022705, + "rewards/margins": 5.900515556335449, + "rewards/rejected": -2.7047643661499023, + "step": 17740 + }, + { + "epoch": 4.44, + "grad_norm": 3.450075149536133, + "learning_rate": 3.0787120978208406e-07, + "logits/chosen": -0.5475034713745117, + "logits/rejected": -0.5485461950302124, + "logps/chosen": -47.57220458984375, + "logps/rejected": -137.59246826171875, + "loss": 0.5043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2794291973114014, + "rewards/margins": 7.87116813659668, + "rewards/rejected": -4.591739177703857, + "step": 17741 + }, + { + "epoch": 4.44, + "grad_norm": 3.6990280151367188, + "learning_rate": 3.0759972398433035e-07, + "logits/chosen": -0.49592825770378113, + "logits/rejected": -0.5934948325157166, + "logps/chosen": -67.72343444824219, + "logps/rejected": -100.08814239501953, + "loss": 0.6857, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.316391706466675, + "rewards/margins": 7.079931735992432, + "rewards/rejected": -3.763540029525757, + "step": 17742 + }, + { + "epoch": 4.44, + "grad_norm": 8.307565689086914, + "learning_rate": 3.073283541407779e-07, + "logits/chosen": -0.5462325811386108, + "logits/rejected": -0.614316999912262, + "logps/chosen": -55.451175689697266, + "logps/rejected": -111.3580551147461, + "loss": 0.586, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1294569969177246, + "rewards/margins": 7.699094772338867, + "rewards/rejected": -4.569638252258301, + "step": 17743 + }, + { + "epoch": 4.44, + "grad_norm": 3.8400778770446777, + "learning_rate": 3.0705710025813405e-07, + "logits/chosen": -0.5140379667282104, + "logits/rejected": -0.5905030369758606, + "logps/chosen": -53.91114807128906, + "logps/rejected": -115.36001586914062, + "loss": 0.5762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2917184829711914, + "rewards/margins": 7.522091865539551, + "rewards/rejected": -4.230372428894043, + "step": 17744 + }, + { + "epoch": 4.44, + "grad_norm": 3.272681474685669, + "learning_rate": 3.067859623431008e-07, + "logits/chosen": -0.6025608777999878, + "logits/rejected": -0.6674392819404602, + "logps/chosen": -52.43510055541992, + "logps/rejected": -103.06822967529297, + "loss": 0.6203, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1545064449310303, + "rewards/margins": 7.383961200714111, + "rewards/rejected": -4.229454040527344, + "step": 17745 + }, + { + "epoch": 4.44, + "grad_norm": 3.688105344772339, + "learning_rate": 3.0651494040237706e-07, + "logits/chosen": -0.5363136529922485, + "logits/rejected": -0.6554957032203674, + "logps/chosen": -50.9837532043457, + "logps/rejected": -125.6029281616211, + "loss": 0.5796, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0843803882598877, + "rewards/margins": 9.662979125976562, + "rewards/rejected": -6.578598976135254, + "step": 17746 + }, + { + "epoch": 4.44, + "grad_norm": 6.417537689208984, + "learning_rate": 3.062440344426626e-07, + "logits/chosen": -0.5309782028198242, + "logits/rejected": -0.6201860904693604, + "logps/chosen": -52.83677673339844, + "logps/rejected": -108.85481262207031, + "loss": 0.6458, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3135008811950684, + "rewards/margins": 8.005855560302734, + "rewards/rejected": -4.692355155944824, + "step": 17747 + }, + { + "epoch": 4.44, + "grad_norm": 9.18217945098877, + "learning_rate": 3.059732444706509e-07, + "logits/chosen": -0.5137261748313904, + "logits/rejected": -0.5474187135696411, + "logps/chosen": -56.55602264404297, + "logps/rejected": -107.9862289428711, + "loss": 0.7119, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2554149627685547, + "rewards/margins": 6.879899978637695, + "rewards/rejected": -3.6244850158691406, + "step": 17748 + }, + { + "epoch": 4.44, + "grad_norm": 5.853510856628418, + "learning_rate": 3.0570257049303167e-07, + "logits/chosen": -0.5868033170700073, + "logits/rejected": -0.651310920715332, + "logps/chosen": -58.576717376708984, + "logps/rejected": -107.3389892578125, + "loss": 0.5671, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1974215507507324, + "rewards/margins": 8.137622833251953, + "rewards/rejected": -4.940201282501221, + "step": 17749 + }, + { + "epoch": 4.44, + "grad_norm": 2.7113020420074463, + "learning_rate": 3.054320125164956e-07, + "logits/chosen": -0.5736522078514099, + "logits/rejected": -0.6125953197479248, + "logps/chosen": -44.77569580078125, + "logps/rejected": -125.864990234375, + "loss": 0.5343, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2009143829345703, + "rewards/margins": 8.108308792114258, + "rewards/rejected": -4.907394886016846, + "step": 17750 + }, + { + "epoch": 4.44, + "grad_norm": 4.383045196533203, + "learning_rate": 3.05161570547729e-07, + "logits/chosen": -0.4831610918045044, + "logits/rejected": -0.5704198479652405, + "logps/chosen": -60.55213165283203, + "logps/rejected": -108.0199203491211, + "loss": 0.6352, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8233752250671387, + "rewards/margins": 6.587519645690918, + "rewards/rejected": -3.7641448974609375, + "step": 17751 + }, + { + "epoch": 4.44, + "grad_norm": 6.386303424835205, + "learning_rate": 3.048912445934121e-07, + "logits/chosen": -0.620783269405365, + "logits/rejected": -0.7071845531463623, + "logps/chosen": -46.4940071105957, + "logps/rejected": -101.24887084960938, + "loss": 0.5877, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1228201389312744, + "rewards/margins": 7.641641616821289, + "rewards/rejected": -4.5188212394714355, + "step": 17752 + }, + { + "epoch": 4.44, + "grad_norm": 4.379003524780273, + "learning_rate": 3.0462103466022685e-07, + "logits/chosen": -0.5248536467552185, + "logits/rejected": -0.6244214177131653, + "logps/chosen": -57.93171310424805, + "logps/rejected": -101.45365905761719, + "loss": 0.5633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.827237129211426, + "rewards/margins": 6.233166694641113, + "rewards/rejected": -3.405930519104004, + "step": 17753 + }, + { + "epoch": 4.44, + "grad_norm": 7.087609767913818, + "learning_rate": 3.0435094075485063e-07, + "logits/chosen": -0.6194534301757812, + "logits/rejected": -0.6876984238624573, + "logps/chosen": -52.58537292480469, + "logps/rejected": -104.73268127441406, + "loss": 0.6997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.189882278442383, + "rewards/margins": 7.783636569976807, + "rewards/rejected": -4.593753814697266, + "step": 17754 + }, + { + "epoch": 4.44, + "grad_norm": 9.833572387695312, + "learning_rate": 3.0408096288395697e-07, + "logits/chosen": -0.5719537734985352, + "logits/rejected": -0.6932623386383057, + "logps/chosen": -79.96986389160156, + "logps/rejected": -93.84779357910156, + "loss": 0.8233, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.640843629837036, + "rewards/margins": 6.0003180503845215, + "rewards/rejected": -3.3594744205474854, + "step": 17755 + }, + { + "epoch": 4.44, + "grad_norm": 4.863345146179199, + "learning_rate": 3.038111010542166e-07, + "logits/chosen": -0.5508864521980286, + "logits/rejected": -0.5854703783988953, + "logps/chosen": -52.73527526855469, + "logps/rejected": -105.8149642944336, + "loss": 0.6998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.321119785308838, + "rewards/margins": 6.651551246643066, + "rewards/rejected": -3.3304319381713867, + "step": 17756 + }, + { + "epoch": 4.44, + "grad_norm": 3.5940864086151123, + "learning_rate": 3.0354135527230047e-07, + "logits/chosen": -0.5271148085594177, + "logits/rejected": -0.5882167816162109, + "logps/chosen": -43.41558837890625, + "logps/rejected": -116.48660278320312, + "loss": 0.5131, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.341367721557617, + "rewards/margins": 7.352041244506836, + "rewards/rejected": -4.010673522949219, + "step": 17757 + }, + { + "epoch": 4.44, + "grad_norm": 3.1864399909973145, + "learning_rate": 3.0327172554487247e-07, + "logits/chosen": -0.5677658319473267, + "logits/rejected": -0.5790736675262451, + "logps/chosen": -48.69697570800781, + "logps/rejected": -107.12715148925781, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1797232627868652, + "rewards/margins": 6.998778820037842, + "rewards/rejected": -3.8190560340881348, + "step": 17758 + }, + { + "epoch": 4.44, + "grad_norm": 2.610731363296509, + "learning_rate": 3.030022118785947e-07, + "logits/chosen": -0.495045006275177, + "logits/rejected": -0.6168041825294495, + "logps/chosen": -51.264400482177734, + "logps/rejected": -112.93865966796875, + "loss": 0.5098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3830509185791016, + "rewards/margins": 7.687559604644775, + "rewards/rejected": -4.304508686065674, + "step": 17759 + }, + { + "epoch": 4.44, + "grad_norm": 4.506255626678467, + "learning_rate": 3.0273281428012947e-07, + "logits/chosen": -0.4898492097854614, + "logits/rejected": -0.5851364731788635, + "logps/chosen": -58.779151916503906, + "logps/rejected": -108.26136016845703, + "loss": 0.5834, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1877596378326416, + "rewards/margins": 8.445466041564941, + "rewards/rejected": -5.257706642150879, + "step": 17760 + }, + { + "epoch": 4.44, + "grad_norm": 5.619505405426025, + "learning_rate": 3.024635327561315e-07, + "logits/chosen": -0.5031867027282715, + "logits/rejected": -0.5362738966941833, + "logps/chosen": -47.985694885253906, + "logps/rejected": -107.9139404296875, + "loss": 0.6352, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9163379669189453, + "rewards/margins": 6.45832633972168, + "rewards/rejected": -3.5419888496398926, + "step": 17761 + }, + { + "epoch": 4.44, + "grad_norm": 4.195778846740723, + "learning_rate": 3.0219436731325713e-07, + "logits/chosen": -0.5476043224334717, + "logits/rejected": -0.6230005621910095, + "logps/chosen": -49.324913024902344, + "logps/rejected": -91.5030288696289, + "loss": 0.6022, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.245042324066162, + "rewards/margins": 5.563589572906494, + "rewards/rejected": -2.318547248840332, + "step": 17762 + }, + { + "epoch": 4.44, + "grad_norm": 5.92473030090332, + "learning_rate": 3.0192531795815717e-07, + "logits/chosen": -0.552767276763916, + "logits/rejected": -0.6493356227874756, + "logps/chosen": -58.03743362426758, + "logps/rejected": -109.3931884765625, + "loss": 0.6303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2421650886535645, + "rewards/margins": 7.144810676574707, + "rewards/rejected": -3.9026455879211426, + "step": 17763 + }, + { + "epoch": 4.44, + "grad_norm": 4.554481506347656, + "learning_rate": 3.016563846974785e-07, + "logits/chosen": -0.6550193428993225, + "logits/rejected": -0.6436107158660889, + "logps/chosen": -54.89845275878906, + "logps/rejected": -101.77444458007812, + "loss": 0.6593, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.355886697769165, + "rewards/margins": 5.939072608947754, + "rewards/rejected": -2.5831854343414307, + "step": 17764 + }, + { + "epoch": 4.44, + "grad_norm": 7.334022521972656, + "learning_rate": 3.0138756753786856e-07, + "logits/chosen": -0.5586007833480835, + "logits/rejected": -0.6187083125114441, + "logps/chosen": -53.73210906982422, + "logps/rejected": -121.96049499511719, + "loss": 0.7339, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.887638807296753, + "rewards/margins": 6.95953369140625, + "rewards/rejected": -4.071894645690918, + "step": 17765 + }, + { + "epoch": 4.44, + "grad_norm": 3.0208523273468018, + "learning_rate": 3.011188664859688e-07, + "logits/chosen": -0.5436766743659973, + "logits/rejected": -0.638393223285675, + "logps/chosen": -56.86286163330078, + "logps/rejected": -88.38565826416016, + "loss": 0.6012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2771553993225098, + "rewards/margins": 7.039340972900391, + "rewards/rejected": -3.7621848583221436, + "step": 17766 + }, + { + "epoch": 4.44, + "grad_norm": 14.317115783691406, + "learning_rate": 3.00850281548421e-07, + "logits/chosen": -0.6022407412528992, + "logits/rejected": -0.6740190982818604, + "logps/chosen": -46.458892822265625, + "logps/rejected": -91.919189453125, + "loss": 0.6361, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9365999698638916, + "rewards/margins": 6.66218900680542, + "rewards/rejected": -3.7255890369415283, + "step": 17767 + }, + { + "epoch": 4.44, + "grad_norm": 4.630744934082031, + "learning_rate": 3.0058181273185935e-07, + "logits/chosen": -0.5811884999275208, + "logits/rejected": -0.6400181651115417, + "logps/chosen": -46.985713958740234, + "logps/rejected": -120.36787414550781, + "loss": 0.5923, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.394639015197754, + "rewards/margins": 7.148920059204102, + "rewards/rejected": -3.7542808055877686, + "step": 17768 + }, + { + "epoch": 4.45, + "grad_norm": 20.777389526367188, + "learning_rate": 3.003134600429214e-07, + "logits/chosen": -0.5167145729064941, + "logits/rejected": -0.6297516226768494, + "logps/chosen": -61.58125305175781, + "logps/rejected": -99.9024658203125, + "loss": 0.7782, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0550076961517334, + "rewards/margins": 6.479359149932861, + "rewards/rejected": -3.4243509769439697, + "step": 17769 + }, + { + "epoch": 4.45, + "grad_norm": 4.860335826873779, + "learning_rate": 3.0004522348823617e-07, + "logits/chosen": -0.6474636793136597, + "logits/rejected": -0.7370953559875488, + "logps/chosen": -57.59734344482422, + "logps/rejected": -92.30974578857422, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.177978515625, + "rewards/margins": 7.349820137023926, + "rewards/rejected": -4.171842098236084, + "step": 17770 + }, + { + "epoch": 4.45, + "grad_norm": 7.294169902801514, + "learning_rate": 2.997771030744318e-07, + "logits/chosen": -0.5042145252227783, + "logits/rejected": -0.6271315813064575, + "logps/chosen": -68.20447540283203, + "logps/rejected": -82.62981414794922, + "loss": 0.6601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.261401653289795, + "rewards/margins": 5.622702598571777, + "rewards/rejected": -2.3613011837005615, + "step": 17771 + }, + { + "epoch": 4.45, + "grad_norm": 6.745594501495361, + "learning_rate": 2.9950909880813505e-07, + "logits/chosen": -0.513224720954895, + "logits/rejected": -0.5855756998062134, + "logps/chosen": -62.740299224853516, + "logps/rejected": -125.963623046875, + "loss": 0.6919, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8788206577301025, + "rewards/margins": 6.4602437019348145, + "rewards/rejected": -3.581423282623291, + "step": 17772 + }, + { + "epoch": 4.45, + "grad_norm": 7.137237071990967, + "learning_rate": 2.9924121069596856e-07, + "logits/chosen": -0.558516800403595, + "logits/rejected": -0.6370945572853088, + "logps/chosen": -57.49596405029297, + "logps/rejected": -107.20284271240234, + "loss": 0.6659, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2949678897857666, + "rewards/margins": 6.939898490905762, + "rewards/rejected": -3.644930362701416, + "step": 17773 + }, + { + "epoch": 4.45, + "grad_norm": 2.5026423931121826, + "learning_rate": 2.9897343874455087e-07, + "logits/chosen": -0.5583032369613647, + "logits/rejected": -0.6137544512748718, + "logps/chosen": -49.713294982910156, + "logps/rejected": -111.52749633789062, + "loss": 0.5907, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.184494972229004, + "rewards/margins": 7.21697473526001, + "rewards/rejected": -4.032479763031006, + "step": 17774 + }, + { + "epoch": 4.45, + "grad_norm": 15.471373558044434, + "learning_rate": 2.987057829605006e-07, + "logits/chosen": -0.5339707136154175, + "logits/rejected": -0.6129255294799805, + "logps/chosen": -54.048797607421875, + "logps/rejected": -106.71499633789062, + "loss": 0.6871, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.877411365509033, + "rewards/margins": 6.890868663787842, + "rewards/rejected": -4.013457298278809, + "step": 17775 + }, + { + "epoch": 4.45, + "grad_norm": 3.57505464553833, + "learning_rate": 2.984382433504307e-07, + "logits/chosen": -0.5291392207145691, + "logits/rejected": -0.6069761514663696, + "logps/chosen": -60.31214904785156, + "logps/rejected": -94.86942291259766, + "loss": 0.6906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9981064796447754, + "rewards/margins": 5.797818660736084, + "rewards/rejected": -2.7997119426727295, + "step": 17776 + }, + { + "epoch": 4.45, + "grad_norm": 4.0379743576049805, + "learning_rate": 2.981708199209521e-07, + "logits/chosen": -0.5056122541427612, + "logits/rejected": -0.5746334791183472, + "logps/chosen": -58.6318244934082, + "logps/rejected": -111.77925109863281, + "loss": 0.6389, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.11904239654541, + "rewards/margins": 7.136345386505127, + "rewards/rejected": -4.017303466796875, + "step": 17777 + }, + { + "epoch": 4.45, + "grad_norm": 8.369867324829102, + "learning_rate": 2.9790351267867337e-07, + "logits/chosen": -0.5404677391052246, + "logits/rejected": -0.6220081448554993, + "logps/chosen": -54.876136779785156, + "logps/rejected": -110.35912322998047, + "loss": 0.6616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.002413272857666, + "rewards/margins": 6.943106651306152, + "rewards/rejected": -3.9406938552856445, + "step": 17778 + }, + { + "epoch": 4.45, + "grad_norm": 5.762966156005859, + "learning_rate": 2.9763632163020194e-07, + "logits/chosen": -0.5605815649032593, + "logits/rejected": -0.6559617519378662, + "logps/chosen": -57.0612678527832, + "logps/rejected": -143.1267852783203, + "loss": 0.6681, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.230879068374634, + "rewards/margins": 7.580732345581055, + "rewards/rejected": -4.349853515625, + "step": 17779 + }, + { + "epoch": 4.45, + "grad_norm": 7.856430530548096, + "learning_rate": 2.973692467821371e-07, + "logits/chosen": -0.5652462840080261, + "logits/rejected": -0.6427434682846069, + "logps/chosen": -51.809688568115234, + "logps/rejected": -97.45620727539062, + "loss": 0.5853, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.971985340118408, + "rewards/margins": 6.824730396270752, + "rewards/rejected": -3.8527448177337646, + "step": 17780 + }, + { + "epoch": 4.45, + "grad_norm": 2.430255889892578, + "learning_rate": 2.971022881410801e-07, + "logits/chosen": -0.47890374064445496, + "logits/rejected": -0.6063531041145325, + "logps/chosen": -53.64588928222656, + "logps/rejected": -125.22675323486328, + "loss": 0.5407, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2795896530151367, + "rewards/margins": 9.048511505126953, + "rewards/rejected": -5.768921852111816, + "step": 17781 + }, + { + "epoch": 4.45, + "grad_norm": 2.68754506111145, + "learning_rate": 2.9683544571362854e-07, + "logits/chosen": -0.5896899700164795, + "logits/rejected": -0.6790739297866821, + "logps/chosen": -47.55656433105469, + "logps/rejected": -101.63902282714844, + "loss": 0.5717, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.196834087371826, + "rewards/margins": 7.4509992599487305, + "rewards/rejected": -4.2541656494140625, + "step": 17782 + }, + { + "epoch": 4.45, + "grad_norm": 3.29526948928833, + "learning_rate": 2.9656871950637543e-07, + "logits/chosen": -0.5183441638946533, + "logits/rejected": -0.6420283913612366, + "logps/chosen": -60.917327880859375, + "logps/rejected": -103.45442199707031, + "loss": 0.5378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.93143367767334, + "rewards/margins": 8.030426979064941, + "rewards/rejected": -5.098994255065918, + "step": 17783 + }, + { + "epoch": 4.45, + "grad_norm": 2.4764859676361084, + "learning_rate": 2.9630210952591164e-07, + "logits/chosen": -0.5390706062316895, + "logits/rejected": -0.6314495801925659, + "logps/chosen": -48.15545654296875, + "logps/rejected": -100.4712142944336, + "loss": 0.5526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.489076614379883, + "rewards/margins": 7.969576358795166, + "rewards/rejected": -4.480500221252441, + "step": 17784 + }, + { + "epoch": 4.45, + "grad_norm": 2.4501118659973145, + "learning_rate": 2.9603561577882633e-07, + "logits/chosen": -0.5327772498130798, + "logits/rejected": -0.6403626799583435, + "logps/chosen": -47.861751556396484, + "logps/rejected": -87.94892883300781, + "loss": 0.5302, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3848159313201904, + "rewards/margins": 7.189432144165039, + "rewards/rejected": -3.8046162128448486, + "step": 17785 + }, + { + "epoch": 4.45, + "grad_norm": 2.1148831844329834, + "learning_rate": 2.9576923827170365e-07, + "logits/chosen": -0.6106383204460144, + "logits/rejected": -0.6566996574401855, + "logps/chosen": -47.83929443359375, + "logps/rejected": -126.63551330566406, + "loss": 0.5921, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1730093955993652, + "rewards/margins": 8.291851043701172, + "rewards/rejected": -5.11884069442749, + "step": 17786 + }, + { + "epoch": 4.45, + "grad_norm": 7.873303413391113, + "learning_rate": 2.955029770111273e-07, + "logits/chosen": -0.5879068374633789, + "logits/rejected": -0.6558349132537842, + "logps/chosen": -47.177547454833984, + "logps/rejected": -98.8914794921875, + "loss": 0.5872, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1080384254455566, + "rewards/margins": 7.077742099761963, + "rewards/rejected": -3.9697036743164062, + "step": 17787 + }, + { + "epoch": 4.45, + "grad_norm": 2.317096471786499, + "learning_rate": 2.9523683200367637e-07, + "logits/chosen": -0.5671378970146179, + "logits/rejected": -0.6679974794387817, + "logps/chosen": -52.70890808105469, + "logps/rejected": -99.35726928710938, + "loss": 0.5247, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.37107253074646, + "rewards/margins": 8.135061264038086, + "rewards/rejected": -4.7639875411987305, + "step": 17788 + }, + { + "epoch": 4.45, + "grad_norm": 31.231121063232422, + "learning_rate": 2.9497080325592677e-07, + "logits/chosen": -0.5390833020210266, + "logits/rejected": -0.6142823696136475, + "logps/chosen": -46.75415802001953, + "logps/rejected": -105.1444320678711, + "loss": 0.5847, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.982576847076416, + "rewards/margins": 6.512505054473877, + "rewards/rejected": -3.52992844581604, + "step": 17789 + }, + { + "epoch": 4.45, + "grad_norm": 2.40940523147583, + "learning_rate": 2.947048907744543e-07, + "logits/chosen": -0.557953953742981, + "logits/rejected": -0.6415011882781982, + "logps/chosen": -49.296810150146484, + "logps/rejected": -96.8250503540039, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2473156452178955, + "rewards/margins": 7.067974090576172, + "rewards/rejected": -3.8206586837768555, + "step": 17790 + }, + { + "epoch": 4.45, + "grad_norm": 5.187165260314941, + "learning_rate": 2.944390945658282e-07, + "logits/chosen": -0.5829183459281921, + "logits/rejected": -0.6116788387298584, + "logps/chosen": -50.43232727050781, + "logps/rejected": -108.50444030761719, + "loss": 0.6312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.171696662902832, + "rewards/margins": 6.889337539672852, + "rewards/rejected": -3.7176413536071777, + "step": 17791 + }, + { + "epoch": 4.45, + "grad_norm": 9.549239158630371, + "learning_rate": 2.941734146366165e-07, + "logits/chosen": -0.4776573181152344, + "logits/rejected": -0.5500514507293701, + "logps/chosen": -54.72936248779297, + "logps/rejected": -103.67582702636719, + "loss": 0.59, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.412230968475342, + "rewards/margins": 7.7821946144104, + "rewards/rejected": -4.369963645935059, + "step": 17792 + }, + { + "epoch": 4.45, + "grad_norm": 5.537353038787842, + "learning_rate": 2.9390785099338515e-07, + "logits/chosen": -0.6246806979179382, + "logits/rejected": -0.7261112332344055, + "logps/chosen": -52.119503021240234, + "logps/rejected": -136.253662109375, + "loss": 0.5795, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.198255777359009, + "rewards/margins": 9.106592178344727, + "rewards/rejected": -5.908336639404297, + "step": 17793 + }, + { + "epoch": 4.45, + "grad_norm": 3.039811849594116, + "learning_rate": 2.936424036426982e-07, + "logits/chosen": -0.5846378207206726, + "logits/rejected": -0.6559987664222717, + "logps/chosen": -50.52587127685547, + "logps/rejected": -114.88465118408203, + "loss": 0.5945, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8589487075805664, + "rewards/margins": 7.7629852294921875, + "rewards/rejected": -4.904036521911621, + "step": 17794 + }, + { + "epoch": 4.45, + "grad_norm": 7.040187835693359, + "learning_rate": 2.933770725911117e-07, + "logits/chosen": -0.5503479838371277, + "logits/rejected": -0.6447903513908386, + "logps/chosen": -58.393775939941406, + "logps/rejected": -95.62549591064453, + "loss": 0.6469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6982383728027344, + "rewards/margins": 6.015635013580322, + "rewards/rejected": -3.317397117614746, + "step": 17795 + }, + { + "epoch": 4.45, + "grad_norm": 3.527235507965088, + "learning_rate": 2.931118578451836e-07, + "logits/chosen": -0.5768962502479553, + "logits/rejected": -0.6320536136627197, + "logps/chosen": -51.42249298095703, + "logps/rejected": -91.43870544433594, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.378281593322754, + "rewards/margins": 6.009466171264648, + "rewards/rejected": -2.6311841011047363, + "step": 17796 + }, + { + "epoch": 4.45, + "grad_norm": 4.757827281951904, + "learning_rate": 2.928467594114698e-07, + "logits/chosen": -0.5536082983016968, + "logits/rejected": -0.660394549369812, + "logps/chosen": -61.24738311767578, + "logps/rejected": -89.88569641113281, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2893786430358887, + "rewards/margins": 6.006422519683838, + "rewards/rejected": -2.7170441150665283, + "step": 17797 + }, + { + "epoch": 4.45, + "grad_norm": 3.0407896041870117, + "learning_rate": 2.9258177729651784e-07, + "logits/chosen": -0.6595025658607483, + "logits/rejected": -0.7044974565505981, + "logps/chosen": -50.934444427490234, + "logps/rejected": -92.54537963867188, + "loss": 0.6241, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172940969467163, + "rewards/margins": 7.105865478515625, + "rewards/rejected": -3.9329240322113037, + "step": 17798 + }, + { + "epoch": 4.45, + "grad_norm": 5.115039825439453, + "learning_rate": 2.9231691150687693e-07, + "logits/chosen": -0.5710307359695435, + "logits/rejected": -0.6547898650169373, + "logps/chosen": -52.91891098022461, + "logps/rejected": -100.17568969726562, + "loss": 0.6415, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2037014961242676, + "rewards/margins": 7.3791093826293945, + "rewards/rejected": -4.175407886505127, + "step": 17799 + }, + { + "epoch": 4.45, + "grad_norm": 3.560716390609741, + "learning_rate": 2.920521620490935e-07, + "logits/chosen": -0.6536368131637573, + "logits/rejected": -0.727699875831604, + "logps/chosen": -54.55072784423828, + "logps/rejected": -111.50395202636719, + "loss": 0.583, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0681047439575195, + "rewards/margins": 8.110716819763184, + "rewards/rejected": -5.042612552642822, + "step": 17800 + }, + { + "epoch": 4.45, + "grad_norm": 3.7176027297973633, + "learning_rate": 2.917875289297084e-07, + "logits/chosen": -0.6216623783111572, + "logits/rejected": -0.6838501691818237, + "logps/chosen": -53.40080261230469, + "logps/rejected": -100.08660888671875, + "loss": 0.6331, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4092583656311035, + "rewards/margins": 6.956477642059326, + "rewards/rejected": -3.54721999168396, + "step": 17801 + }, + { + "epoch": 4.45, + "grad_norm": 3.4545481204986572, + "learning_rate": 2.915230121552609e-07, + "logits/chosen": -0.5782105922698975, + "logits/rejected": -0.6105578541755676, + "logps/chosen": -54.1937255859375, + "logps/rejected": -109.79008483886719, + "loss": 0.5975, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1686465740203857, + "rewards/margins": 6.6019673347473145, + "rewards/rejected": -3.433321237564087, + "step": 17802 + }, + { + "epoch": 4.45, + "grad_norm": 13.66128158569336, + "learning_rate": 2.9125861173228854e-07, + "logits/chosen": -0.537630558013916, + "logits/rejected": -0.6577085256576538, + "logps/chosen": -60.55820846557617, + "logps/rejected": -80.99063110351562, + "loss": 0.7299, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9865851402282715, + "rewards/margins": 6.262929439544678, + "rewards/rejected": -3.2763442993164062, + "step": 17803 + }, + { + "epoch": 4.45, + "grad_norm": 9.545409202575684, + "learning_rate": 2.909943276673244e-07, + "logits/chosen": -0.5553464889526367, + "logits/rejected": -0.6457282304763794, + "logps/chosen": -55.24614334106445, + "logps/rejected": -104.10664367675781, + "loss": 0.6368, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9848790168762207, + "rewards/margins": 7.4081268310546875, + "rewards/rejected": -4.423248291015625, + "step": 17804 + }, + { + "epoch": 4.45, + "grad_norm": 5.0443196296691895, + "learning_rate": 2.907301599668982e-07, + "logits/chosen": -0.5103921890258789, + "logits/rejected": -0.5522440075874329, + "logps/chosen": -59.558834075927734, + "logps/rejected": -133.48855590820312, + "loss": 0.5617, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.401925563812256, + "rewards/margins": 8.270454406738281, + "rewards/rejected": -4.868528366088867, + "step": 17805 + }, + { + "epoch": 4.45, + "grad_norm": 4.830974578857422, + "learning_rate": 2.904661086375399e-07, + "logits/chosen": -0.5736799836158752, + "logits/rejected": -0.6498616933822632, + "logps/chosen": -47.319549560546875, + "logps/rejected": -114.1092529296875, + "loss": 0.5601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.178645610809326, + "rewards/margins": 7.851911544799805, + "rewards/rejected": -4.67326545715332, + "step": 17806 + }, + { + "epoch": 4.45, + "grad_norm": 2.357484817504883, + "learning_rate": 2.9020217368577184e-07, + "logits/chosen": -0.5273675918579102, + "logits/rejected": -0.6154621243476868, + "logps/chosen": -57.99957275390625, + "logps/rejected": -110.37712097167969, + "loss": 0.5794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.400078296661377, + "rewards/margins": 8.19371509552002, + "rewards/rejected": -4.793636798858643, + "step": 17807 + }, + { + "epoch": 4.45, + "grad_norm": 26.61710548400879, + "learning_rate": 2.89938355118119e-07, + "logits/chosen": -0.532187283039093, + "logits/rejected": -0.6177328824996948, + "logps/chosen": -52.01716995239258, + "logps/rejected": -104.24478912353516, + "loss": 0.6452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.16119122505188, + "rewards/margins": 6.517413139343262, + "rewards/rejected": -3.356222152709961, + "step": 17808 + }, + { + "epoch": 4.46, + "grad_norm": 9.216626167297363, + "learning_rate": 2.896746529410982e-07, + "logits/chosen": -0.6615625023841858, + "logits/rejected": -0.7492653131484985, + "logps/chosen": -54.75735855102539, + "logps/rejected": -98.71609497070312, + "loss": 0.6153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6257717609405518, + "rewards/margins": 8.44245719909668, + "rewards/rejected": -4.816685199737549, + "step": 17809 + }, + { + "epoch": 4.46, + "grad_norm": 8.493854522705078, + "learning_rate": 2.894110671612282e-07, + "logits/chosen": -0.6164551973342896, + "logits/rejected": -0.668411910533905, + "logps/chosen": -57.820350646972656, + "logps/rejected": -111.86671447753906, + "loss": 0.6113, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3782167434692383, + "rewards/margins": 7.598154067993164, + "rewards/rejected": -4.219937324523926, + "step": 17810 + }, + { + "epoch": 4.46, + "grad_norm": 3.7257485389709473, + "learning_rate": 2.8914759778502e-07, + "logits/chosen": -0.6517694592475891, + "logits/rejected": -0.6894921064376831, + "logps/chosen": -40.51062774658203, + "logps/rejected": -115.52304077148438, + "loss": 0.5283, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.234748601913452, + "rewards/margins": 8.651801109313965, + "rewards/rejected": -5.417051792144775, + "step": 17811 + }, + { + "epoch": 4.46, + "grad_norm": 2.5346479415893555, + "learning_rate": 2.88884244818986e-07, + "logits/chosen": -0.5761098265647888, + "logits/rejected": -0.6445592641830444, + "logps/chosen": -54.99632263183594, + "logps/rejected": -119.63847351074219, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0887722969055176, + "rewards/margins": 8.526522636413574, + "rewards/rejected": -5.437749862670898, + "step": 17812 + }, + { + "epoch": 4.46, + "grad_norm": 6.027565956115723, + "learning_rate": 2.8862100826963335e-07, + "logits/chosen": -0.5279451608657837, + "logits/rejected": -0.598463773727417, + "logps/chosen": -48.72532653808594, + "logps/rejected": -106.73506927490234, + "loss": 0.5259, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3100759983062744, + "rewards/margins": 8.155269622802734, + "rewards/rejected": -4.845193386077881, + "step": 17813 + }, + { + "epoch": 4.46, + "grad_norm": 6.31809139251709, + "learning_rate": 2.8835788814346566e-07, + "logits/chosen": -0.47435101866722107, + "logits/rejected": -0.5733038187026978, + "logps/chosen": -57.277374267578125, + "logps/rejected": -94.0143051147461, + "loss": 0.6762, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.997159242630005, + "rewards/margins": 6.450960636138916, + "rewards/rejected": -3.453801393508911, + "step": 17814 + }, + { + "epoch": 4.46, + "grad_norm": 3.947139263153076, + "learning_rate": 2.880948844469872e-07, + "logits/chosen": -0.6270610690116882, + "logits/rejected": -0.6978949308395386, + "logps/chosen": -53.702579498291016, + "logps/rejected": -109.47034454345703, + "loss": 0.6528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2717297077178955, + "rewards/margins": 8.723627090454102, + "rewards/rejected": -5.451897144317627, + "step": 17815 + }, + { + "epoch": 4.46, + "grad_norm": 5.781947612762451, + "learning_rate": 2.8783199718669606e-07, + "logits/chosen": -0.5992814898490906, + "logits/rejected": -0.6441052556037903, + "logps/chosen": -49.419654846191406, + "logps/rejected": -117.65685272216797, + "loss": 0.6847, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1848068237304688, + "rewards/margins": 6.8835039138793945, + "rewards/rejected": -3.698697090148926, + "step": 17816 + }, + { + "epoch": 4.46, + "grad_norm": 4.129783630371094, + "learning_rate": 2.875692263690877e-07, + "logits/chosen": -0.5223988890647888, + "logits/rejected": -0.5942939519882202, + "logps/chosen": -55.486602783203125, + "logps/rejected": -114.36094665527344, + "loss": 0.6152, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4351344108581543, + "rewards/margins": 7.81419563293457, + "rewards/rejected": -4.379060745239258, + "step": 17817 + }, + { + "epoch": 4.46, + "grad_norm": 5.4521684646606445, + "learning_rate": 2.873065720006568e-07, + "logits/chosen": -0.5170261859893799, + "logits/rejected": -0.593589723110199, + "logps/chosen": -62.246768951416016, + "logps/rejected": -112.85499572753906, + "loss": 0.8031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.789478302001953, + "rewards/margins": 6.245574474334717, + "rewards/rejected": -3.4560964107513428, + "step": 17818 + }, + { + "epoch": 4.46, + "grad_norm": 6.410633563995361, + "learning_rate": 2.870440340878927e-07, + "logits/chosen": -0.5576828718185425, + "logits/rejected": -0.6419637203216553, + "logps/chosen": -56.89291000366211, + "logps/rejected": -111.17402648925781, + "loss": 0.6239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4705514907836914, + "rewards/margins": 7.663938045501709, + "rewards/rejected": -4.193386554718018, + "step": 17819 + }, + { + "epoch": 4.46, + "grad_norm": 5.508309364318848, + "learning_rate": 2.86781612637283e-07, + "logits/chosen": -0.44105076789855957, + "logits/rejected": -0.5474604368209839, + "logps/chosen": -63.787803649902344, + "logps/rejected": -100.97702026367188, + "loss": 0.5901, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.02295184135437, + "rewards/margins": 6.828485012054443, + "rewards/rejected": -3.805532932281494, + "step": 17820 + }, + { + "epoch": 4.46, + "grad_norm": 45.66011428833008, + "learning_rate": 2.86519307655313e-07, + "logits/chosen": -0.5732024908065796, + "logits/rejected": -0.6569985747337341, + "logps/chosen": -50.37875747680664, + "logps/rejected": -101.10637664794922, + "loss": 0.6229, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.127336263656616, + "rewards/margins": 6.759657382965088, + "rewards/rejected": -3.6323206424713135, + "step": 17821 + }, + { + "epoch": 4.46, + "grad_norm": 5.6098408699035645, + "learning_rate": 2.8625711914846587e-07, + "logits/chosen": -0.5327909588813782, + "logits/rejected": -0.614583432674408, + "logps/chosen": -47.2558479309082, + "logps/rejected": -107.36430358886719, + "loss": 0.6038, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1852543354034424, + "rewards/margins": 7.9731831550598145, + "rewards/rejected": -4.787929534912109, + "step": 17822 + }, + { + "epoch": 4.46, + "grad_norm": 3.6603567600250244, + "learning_rate": 2.859950471232176e-07, + "logits/chosen": -0.5392292141914368, + "logits/rejected": -0.6397128701210022, + "logps/chosen": -52.03483200073242, + "logps/rejected": -109.18142700195312, + "loss": 0.6025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0233099460601807, + "rewards/margins": 8.718071937561035, + "rewards/rejected": -5.694761753082275, + "step": 17823 + }, + { + "epoch": 4.46, + "grad_norm": 3.8576157093048096, + "learning_rate": 2.857330915860457e-07, + "logits/chosen": -0.5991604328155518, + "logits/rejected": -0.6492589116096497, + "logps/chosen": -56.178306579589844, + "logps/rejected": -128.9155731201172, + "loss": 0.6183, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2255022525787354, + "rewards/margins": 7.478092670440674, + "rewards/rejected": -4.252591133117676, + "step": 17824 + }, + { + "epoch": 4.46, + "grad_norm": 5.985421180725098, + "learning_rate": 2.854712525434239e-07, + "logits/chosen": -0.572597861289978, + "logits/rejected": -0.6084935665130615, + "logps/chosen": -71.23607635498047, + "logps/rejected": -118.21409606933594, + "loss": 0.7316, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.60090708732605, + "rewards/margins": 7.201037406921387, + "rewards/rejected": -3.600130081176758, + "step": 17825 + }, + { + "epoch": 4.46, + "grad_norm": 6.532032012939453, + "learning_rate": 2.852095300018226e-07, + "logits/chosen": -0.5273137092590332, + "logits/rejected": -0.608667254447937, + "logps/chosen": -51.7133903503418, + "logps/rejected": -124.77397155761719, + "loss": 0.6583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1926074028015137, + "rewards/margins": 8.118587493896484, + "rewards/rejected": -4.925980567932129, + "step": 17826 + }, + { + "epoch": 4.46, + "grad_norm": 5.994617462158203, + "learning_rate": 2.849479239677072e-07, + "logits/chosen": -0.5643994808197021, + "logits/rejected": -0.6373909711837769, + "logps/chosen": -46.85541534423828, + "logps/rejected": -106.322021484375, + "loss": 0.6406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2198691368103027, + "rewards/margins": 7.299968719482422, + "rewards/rejected": -4.080099582672119, + "step": 17827 + }, + { + "epoch": 4.46, + "grad_norm": 3.9424514770507812, + "learning_rate": 2.846864344475453e-07, + "logits/chosen": -0.6490294933319092, + "logits/rejected": -0.723369836807251, + "logps/chosen": -48.647987365722656, + "logps/rejected": -95.44083404541016, + "loss": 0.5685, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.254503011703491, + "rewards/margins": 7.245151519775391, + "rewards/rejected": -3.9906492233276367, + "step": 17828 + }, + { + "epoch": 4.46, + "grad_norm": 3.7651827335357666, + "learning_rate": 2.844250614477967e-07, + "logits/chosen": -0.5552941560745239, + "logits/rejected": -0.6752740740776062, + "logps/chosen": -48.82910919189453, + "logps/rejected": -91.42799377441406, + "loss": 0.5427, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.805729627609253, + "rewards/margins": 7.388898849487305, + "rewards/rejected": -4.583169937133789, + "step": 17829 + }, + { + "epoch": 4.46, + "grad_norm": 3.154313564300537, + "learning_rate": 2.841638049749196e-07, + "logits/chosen": -0.5268393158912659, + "logits/rejected": -0.678084135055542, + "logps/chosen": -51.29822540283203, + "logps/rejected": -111.9912109375, + "loss": 0.5365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.270458221435547, + "rewards/margins": 8.948683738708496, + "rewards/rejected": -5.678225517272949, + "step": 17830 + }, + { + "epoch": 4.46, + "grad_norm": 3.56520676612854, + "learning_rate": 2.8390266503537154e-07, + "logits/chosen": -0.4925956726074219, + "logits/rejected": -0.5598047971725464, + "logps/chosen": -51.25333023071289, + "logps/rejected": -117.48709869384766, + "loss": 0.6088, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3097527027130127, + "rewards/margins": 7.6665825843811035, + "rewards/rejected": -4.35683012008667, + "step": 17831 + }, + { + "epoch": 4.46, + "grad_norm": 5.43211555480957, + "learning_rate": 2.836416416356047e-07, + "logits/chosen": -0.6272473931312561, + "logits/rejected": -0.7124370336532593, + "logps/chosen": -45.95188903808594, + "logps/rejected": -89.88299560546875, + "loss": 0.5335, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.266327381134033, + "rewards/margins": 7.398308753967285, + "rewards/rejected": -4.131981372833252, + "step": 17832 + }, + { + "epoch": 4.46, + "grad_norm": 6.5172600746154785, + "learning_rate": 2.833807347820694e-07, + "logits/chosen": -0.6357899904251099, + "logits/rejected": -0.6891800165176392, + "logps/chosen": -51.1833610534668, + "logps/rejected": -107.94939422607422, + "loss": 0.6161, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.113253116607666, + "rewards/margins": 7.370726585388184, + "rewards/rejected": -4.257473468780518, + "step": 17833 + }, + { + "epoch": 4.46, + "grad_norm": 6.154587268829346, + "learning_rate": 2.8311994448121384e-07, + "logits/chosen": -0.63646000623703, + "logits/rejected": -0.6664166450500488, + "logps/chosen": -47.69646453857422, + "logps/rejected": -110.08998107910156, + "loss": 0.5874, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2638344764709473, + "rewards/margins": 6.61253023147583, + "rewards/rejected": -3.348695755004883, + "step": 17834 + }, + { + "epoch": 4.46, + "grad_norm": 13.017315864562988, + "learning_rate": 2.8285927073948007e-07, + "logits/chosen": -0.5351150035858154, + "logits/rejected": -0.6111366748809814, + "logps/chosen": -60.69068145751953, + "logps/rejected": -111.29220581054688, + "loss": 0.6562, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2130463123321533, + "rewards/margins": 7.911503314971924, + "rewards/rejected": -4.698456287384033, + "step": 17835 + }, + { + "epoch": 4.46, + "grad_norm": 6.244287967681885, + "learning_rate": 2.8259871356331237e-07, + "logits/chosen": -0.4961205720901489, + "logits/rejected": -0.5555317997932434, + "logps/chosen": -49.691795349121094, + "logps/rejected": -98.70417785644531, + "loss": 0.6831, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.105557918548584, + "rewards/margins": 6.076475620269775, + "rewards/rejected": -2.970917224884033, + "step": 17836 + }, + { + "epoch": 4.46, + "grad_norm": 31.557863235473633, + "learning_rate": 2.823382729591478e-07, + "logits/chosen": -0.5521222352981567, + "logits/rejected": -0.6142570376396179, + "logps/chosen": -52.54124069213867, + "logps/rejected": -109.0197525024414, + "loss": 0.7082, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1085705757141113, + "rewards/margins": 7.565069675445557, + "rewards/rejected": -4.456498622894287, + "step": 17837 + }, + { + "epoch": 4.46, + "grad_norm": 4.287449836730957, + "learning_rate": 2.820779489334219e-07, + "logits/chosen": -0.55000239610672, + "logits/rejected": -0.5941904783248901, + "logps/chosen": -64.61483764648438, + "logps/rejected": -115.73736572265625, + "loss": 0.6556, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069434642791748, + "rewards/margins": 7.157947540283203, + "rewards/rejected": -4.088513374328613, + "step": 17838 + }, + { + "epoch": 4.46, + "grad_norm": 9.464929580688477, + "learning_rate": 2.818177414925677e-07, + "logits/chosen": -0.5180912613868713, + "logits/rejected": -0.6285867691040039, + "logps/chosen": -48.60489273071289, + "logps/rejected": -89.63477325439453, + "loss": 0.557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1037216186523438, + "rewards/margins": 7.433633804321289, + "rewards/rejected": -4.329911708831787, + "step": 17839 + }, + { + "epoch": 4.46, + "grad_norm": 5.03175163269043, + "learning_rate": 2.815576506430173e-07, + "logits/chosen": -0.5594073534011841, + "logits/rejected": -0.6388747692108154, + "logps/chosen": -66.44363403320312, + "logps/rejected": -105.17064666748047, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.004216194152832, + "rewards/margins": 6.915181636810303, + "rewards/rejected": -3.91096568107605, + "step": 17840 + }, + { + "epoch": 4.46, + "grad_norm": 3.235647201538086, + "learning_rate": 2.8129767639119454e-07, + "logits/chosen": -0.560550332069397, + "logits/rejected": -0.6584715247154236, + "logps/chosen": -61.300594329833984, + "logps/rejected": -93.02278137207031, + "loss": 0.6858, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948908567428589, + "rewards/margins": 6.522876262664795, + "rewards/rejected": -3.573967456817627, + "step": 17841 + }, + { + "epoch": 4.46, + "grad_norm": 3.59993314743042, + "learning_rate": 2.810378187435253e-07, + "logits/chosen": -0.6318662762641907, + "logits/rejected": -0.6844455003738403, + "logps/chosen": -43.988948822021484, + "logps/rejected": -123.58615112304688, + "loss": 0.5106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.99617862701416, + "rewards/margins": 8.01778507232666, + "rewards/rejected": -5.0216064453125, + "step": 17842 + }, + { + "epoch": 4.46, + "grad_norm": 12.48582935333252, + "learning_rate": 2.8077807770643174e-07, + "logits/chosen": -0.5322172045707703, + "logits/rejected": -0.59193354845047, + "logps/chosen": -55.726715087890625, + "logps/rejected": -107.47857666015625, + "loss": 0.6835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.503222703933716, + "rewards/margins": 5.405028343200684, + "rewards/rejected": -2.9018054008483887, + "step": 17843 + }, + { + "epoch": 4.46, + "grad_norm": 3.13039493560791, + "learning_rate": 2.805184532863314e-07, + "logits/chosen": -0.5549635887145996, + "logits/rejected": -0.6735653281211853, + "logps/chosen": -58.05498123168945, + "logps/rejected": -97.53367614746094, + "loss": 0.6022, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.382046699523926, + "rewards/margins": 7.453125953674316, + "rewards/rejected": -4.071079730987549, + "step": 17844 + }, + { + "epoch": 4.46, + "grad_norm": 4.378281116485596, + "learning_rate": 2.802589454896393e-07, + "logits/chosen": -0.5651783347129822, + "logits/rejected": -0.6594133377075195, + "logps/chosen": -48.37971878051758, + "logps/rejected": -97.21688079833984, + "loss": 0.6967, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0007705688476562, + "rewards/margins": 6.483458518981934, + "rewards/rejected": -3.4826877117156982, + "step": 17845 + }, + { + "epoch": 4.46, + "grad_norm": 8.442092895507812, + "learning_rate": 2.7999955432276957e-07, + "logits/chosen": -0.4571903944015503, + "logits/rejected": -0.5395104289054871, + "logps/chosen": -51.03847885131836, + "logps/rejected": -98.52922058105469, + "loss": 0.6877, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.112471580505371, + "rewards/margins": 6.166266918182373, + "rewards/rejected": -3.053795576095581, + "step": 17846 + }, + { + "epoch": 4.46, + "grad_norm": 6.491267204284668, + "learning_rate": 2.797402797921311e-07, + "logits/chosen": -0.5638942122459412, + "logits/rejected": -0.6988656520843506, + "logps/chosen": -54.57869338989258, + "logps/rejected": -113.30896759033203, + "loss": 0.5364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1204116344451904, + "rewards/margins": 7.766515731811523, + "rewards/rejected": -4.646104335784912, + "step": 17847 + }, + { + "epoch": 4.46, + "grad_norm": 3.0252318382263184, + "learning_rate": 2.7948112190413043e-07, + "logits/chosen": -0.5608428120613098, + "logits/rejected": -0.6394292712211609, + "logps/chosen": -43.7934684753418, + "logps/rejected": -114.83871459960938, + "loss": 0.5188, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3654849529266357, + "rewards/margins": 7.573269844055176, + "rewards/rejected": -4.207784652709961, + "step": 17848 + }, + { + "epoch": 4.47, + "grad_norm": 5.444936275482178, + "learning_rate": 2.7922208066517244e-07, + "logits/chosen": -0.5344945788383484, + "logits/rejected": -0.5981459617614746, + "logps/chosen": -50.312042236328125, + "logps/rejected": -117.59716033935547, + "loss": 0.5467, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0596764087677, + "rewards/margins": 7.6465582847595215, + "rewards/rejected": -4.586881637573242, + "step": 17849 + }, + { + "epoch": 4.47, + "grad_norm": 2.7317910194396973, + "learning_rate": 2.78963156081658e-07, + "logits/chosen": -0.5111849308013916, + "logits/rejected": -0.5860602259635925, + "logps/chosen": -54.1296272277832, + "logps/rejected": -109.01581573486328, + "loss": 0.5725, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9714064598083496, + "rewards/margins": 7.1439995765686035, + "rewards/rejected": -4.172593116760254, + "step": 17850 + }, + { + "epoch": 4.47, + "grad_norm": 5.6423444747924805, + "learning_rate": 2.78704348159986e-07, + "logits/chosen": -0.5427335500717163, + "logits/rejected": -0.6083453893661499, + "logps/chosen": -46.62216567993164, + "logps/rejected": -116.28401184082031, + "loss": 0.6191, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.075977325439453, + "rewards/margins": 7.829921722412109, + "rewards/rejected": -4.753944396972656, + "step": 17851 + }, + { + "epoch": 4.47, + "grad_norm": 4.3226118087768555, + "learning_rate": 2.7844565690655133e-07, + "logits/chosen": -0.6289081573486328, + "logits/rejected": -0.7362736463546753, + "logps/chosen": -46.17136001586914, + "logps/rejected": -114.6439437866211, + "loss": 0.5598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2345054149627686, + "rewards/margins": 8.732895851135254, + "rewards/rejected": -5.498390197753906, + "step": 17852 + }, + { + "epoch": 4.47, + "grad_norm": 3.2183570861816406, + "learning_rate": 2.781870823277455e-07, + "logits/chosen": -0.5429753065109253, + "logits/rejected": -0.6255512237548828, + "logps/chosen": -64.1118392944336, + "logps/rejected": -134.96849060058594, + "loss": 0.6478, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.258727550506592, + "rewards/margins": 9.099774360656738, + "rewards/rejected": -5.841047286987305, + "step": 17853 + }, + { + "epoch": 4.47, + "grad_norm": 4.985060691833496, + "learning_rate": 2.7792862442996005e-07, + "logits/chosen": -0.5987004637718201, + "logits/rejected": -0.6727349162101746, + "logps/chosen": -55.123287200927734, + "logps/rejected": -109.73272705078125, + "loss": 0.6178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.130277633666992, + "rewards/margins": 6.93272590637207, + "rewards/rejected": -3.802447557449341, + "step": 17854 + }, + { + "epoch": 4.47, + "grad_norm": 3.851438522338867, + "learning_rate": 2.7767028321958047e-07, + "logits/chosen": -0.4670115113258362, + "logits/rejected": -0.5511406064033508, + "logps/chosen": -67.3747329711914, + "logps/rejected": -120.45843505859375, + "loss": 0.642, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7130305767059326, + "rewards/margins": 7.605494499206543, + "rewards/rejected": -4.892463207244873, + "step": 17855 + }, + { + "epoch": 4.47, + "grad_norm": 2.280853509902954, + "learning_rate": 2.774120587029916e-07, + "logits/chosen": -0.5384715795516968, + "logits/rejected": -0.6113964319229126, + "logps/chosen": -50.371116638183594, + "logps/rejected": -112.39217376708984, + "loss": 0.5882, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0644078254699707, + "rewards/margins": 7.532468795776367, + "rewards/rejected": -4.468061447143555, + "step": 17856 + }, + { + "epoch": 4.47, + "grad_norm": 8.977933883666992, + "learning_rate": 2.771539508865734e-07, + "logits/chosen": -0.5578143000602722, + "logits/rejected": -0.621997594833374, + "logps/chosen": -54.840877532958984, + "logps/rejected": -114.90766906738281, + "loss": 0.7126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.191070556640625, + "rewards/margins": 8.46201229095459, + "rewards/rejected": -5.270942211151123, + "step": 17857 + }, + { + "epoch": 4.47, + "grad_norm": 5.7928314208984375, + "learning_rate": 2.7689595977670513e-07, + "logits/chosen": -0.5679842233657837, + "logits/rejected": -0.6487739086151123, + "logps/chosen": -61.76274108886719, + "logps/rejected": -107.56547546386719, + "loss": 0.7686, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.069333791732788, + "rewards/margins": 6.98714542388916, + "rewards/rejected": -3.9178109169006348, + "step": 17858 + }, + { + "epoch": 4.47, + "grad_norm": 36.904720306396484, + "learning_rate": 2.7663808537976125e-07, + "logits/chosen": -0.6215115785598755, + "logits/rejected": -0.6735899448394775, + "logps/chosen": -61.42755889892578, + "logps/rejected": -100.76012420654297, + "loss": 0.6395, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.979356288909912, + "rewards/margins": 6.595850944519043, + "rewards/rejected": -3.6164944171905518, + "step": 17859 + }, + { + "epoch": 4.47, + "grad_norm": 10.839424133300781, + "learning_rate": 2.763803277021138e-07, + "logits/chosen": -0.41450339555740356, + "logits/rejected": -0.5500100255012512, + "logps/chosen": -75.92144012451172, + "logps/rejected": -85.25051879882812, + "loss": 0.7592, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.829153537750244, + "rewards/margins": 5.862615585327148, + "rewards/rejected": -3.0334620475769043, + "step": 17860 + }, + { + "epoch": 4.47, + "grad_norm": 2.0431458950042725, + "learning_rate": 2.7612268675013377e-07, + "logits/chosen": -0.5673356652259827, + "logits/rejected": -0.6371458768844604, + "logps/chosen": -45.814720153808594, + "logps/rejected": -111.97200012207031, + "loss": 0.4873, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.268057346343994, + "rewards/margins": 7.880914211273193, + "rewards/rejected": -4.612856864929199, + "step": 17861 + }, + { + "epoch": 4.47, + "grad_norm": 7.331906795501709, + "learning_rate": 2.7586516253018616e-07, + "logits/chosen": -0.5499690771102905, + "logits/rejected": -0.6588202714920044, + "logps/chosen": -54.67701721191406, + "logps/rejected": -99.82439422607422, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.30401349067688, + "rewards/margins": 7.263164043426514, + "rewards/rejected": -3.959150791168213, + "step": 17862 + }, + { + "epoch": 4.47, + "grad_norm": 8.233509063720703, + "learning_rate": 2.756077550486347e-07, + "logits/chosen": -0.5147707462310791, + "logits/rejected": -0.6252636909484863, + "logps/chosen": -45.79153823852539, + "logps/rejected": -105.2418441772461, + "loss": 0.6051, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4015657901763916, + "rewards/margins": 8.075667381286621, + "rewards/rejected": -4.674101829528809, + "step": 17863 + }, + { + "epoch": 4.47, + "grad_norm": 7.890023231506348, + "learning_rate": 2.7535046431184045e-07, + "logits/chosen": -0.49021434783935547, + "logits/rejected": -0.5877421498298645, + "logps/chosen": -56.117103576660156, + "logps/rejected": -107.65574645996094, + "loss": 0.6288, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.193629741668701, + "rewards/margins": 7.742124080657959, + "rewards/rejected": -4.5484938621521, + "step": 17864 + }, + { + "epoch": 4.47, + "grad_norm": 8.560785293579102, + "learning_rate": 2.7509329032616325e-07, + "logits/chosen": -0.6079661250114441, + "logits/rejected": -0.6877291798591614, + "logps/chosen": -56.83028030395508, + "logps/rejected": -99.78617858886719, + "loss": 0.6586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188382148742676, + "rewards/margins": 6.9176411628723145, + "rewards/rejected": -3.7292587757110596, + "step": 17865 + }, + { + "epoch": 4.47, + "grad_norm": 3.944296360015869, + "learning_rate": 2.748362330979548e-07, + "logits/chosen": -0.5375880002975464, + "logits/rejected": -0.6456200480461121, + "logps/chosen": -65.26932525634766, + "logps/rejected": -114.67524719238281, + "loss": 0.5846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4742395877838135, + "rewards/margins": 8.154476165771484, + "rewards/rejected": -4.680236339569092, + "step": 17866 + }, + { + "epoch": 4.47, + "grad_norm": 7.393341064453125, + "learning_rate": 2.745792926335689e-07, + "logits/chosen": -0.624703586101532, + "logits/rejected": -0.7003662586212158, + "logps/chosen": -48.252525329589844, + "logps/rejected": -104.2625503540039, + "loss": 0.7205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.877500534057617, + "rewards/margins": 7.589605331420898, + "rewards/rejected": -4.712104797363281, + "step": 17867 + }, + { + "epoch": 4.47, + "grad_norm": 7.973534107208252, + "learning_rate": 2.743224689393559e-07, + "logits/chosen": -0.5823619365692139, + "logits/rejected": -0.6390469074249268, + "logps/chosen": -45.01976776123047, + "logps/rejected": -92.35504913330078, + "loss": 0.6035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8198940753936768, + "rewards/margins": 6.044867038726807, + "rewards/rejected": -3.224972724914551, + "step": 17868 + }, + { + "epoch": 4.47, + "grad_norm": 4.257143497467041, + "learning_rate": 2.740657620216608e-07, + "logits/chosen": -0.5403210520744324, + "logits/rejected": -0.581145703792572, + "logps/chosen": -49.80229568481445, + "logps/rejected": -105.88562774658203, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2442262172698975, + "rewards/margins": 6.352187156677246, + "rewards/rejected": -3.1079609394073486, + "step": 17869 + }, + { + "epoch": 4.47, + "grad_norm": 1.764318585395813, + "learning_rate": 2.7380917188682745e-07, + "logits/chosen": -0.5824904441833496, + "logits/rejected": -0.6748067736625671, + "logps/chosen": -50.04243850708008, + "logps/rejected": -112.28360748291016, + "loss": 0.513, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.128859281539917, + "rewards/margins": 8.199056625366211, + "rewards/rejected": -5.070197582244873, + "step": 17870 + }, + { + "epoch": 4.47, + "grad_norm": 11.698832511901855, + "learning_rate": 2.7355269854119625e-07, + "logits/chosen": -0.5579560995101929, + "logits/rejected": -0.6050288677215576, + "logps/chosen": -59.691829681396484, + "logps/rejected": -120.17729949951172, + "loss": 0.7005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.947528123855591, + "rewards/margins": 6.432271480560303, + "rewards/rejected": -3.484743356704712, + "step": 17871 + }, + { + "epoch": 4.47, + "grad_norm": 4.641432762145996, + "learning_rate": 2.7329634199110553e-07, + "logits/chosen": -0.5378499031066895, + "logits/rejected": -0.571107029914856, + "logps/chosen": -66.09823608398438, + "logps/rejected": -112.96983337402344, + "loss": 0.6449, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0338847637176514, + "rewards/margins": 6.618741512298584, + "rewards/rejected": -3.5848567485809326, + "step": 17872 + }, + { + "epoch": 4.47, + "grad_norm": 4.517560005187988, + "learning_rate": 2.7304010224288915e-07, + "logits/chosen": -0.5033811926841736, + "logits/rejected": -0.5936252474784851, + "logps/chosen": -54.123443603515625, + "logps/rejected": -104.79988861083984, + "loss": 0.5351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3014090061187744, + "rewards/margins": 6.707106590270996, + "rewards/rejected": -3.4056973457336426, + "step": 17873 + }, + { + "epoch": 4.47, + "grad_norm": 5.713227272033691, + "learning_rate": 2.7278397930288026e-07, + "logits/chosen": -0.5736026763916016, + "logits/rejected": -0.6490446925163269, + "logps/chosen": -53.83492660522461, + "logps/rejected": -102.22437286376953, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1768908500671387, + "rewards/margins": 6.216705799102783, + "rewards/rejected": -3.0398151874542236, + "step": 17874 + }, + { + "epoch": 4.47, + "grad_norm": 4.302087783813477, + "learning_rate": 2.7252797317740663e-07, + "logits/chosen": -0.5606926679611206, + "logits/rejected": -0.6763077974319458, + "logps/chosen": -47.96597671508789, + "logps/rejected": -97.51893615722656, + "loss": 0.5571, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1090035438537598, + "rewards/margins": 6.496705532073975, + "rewards/rejected": -3.387701988220215, + "step": 17875 + }, + { + "epoch": 4.47, + "grad_norm": 7.891915798187256, + "learning_rate": 2.72272083872796e-07, + "logits/chosen": -0.5029124021530151, + "logits/rejected": -0.5806593298912048, + "logps/chosen": -60.88410949707031, + "logps/rejected": -101.08476257324219, + "loss": 0.7298, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7852258682250977, + "rewards/margins": 5.755842208862305, + "rewards/rejected": -2.9706168174743652, + "step": 17876 + }, + { + "epoch": 4.47, + "grad_norm": 32.67631149291992, + "learning_rate": 2.720163113953711e-07, + "logits/chosen": -0.4980868101119995, + "logits/rejected": -0.5961787700653076, + "logps/chosen": -53.9238166809082, + "logps/rejected": -101.50391387939453, + "loss": 0.6778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7749791145324707, + "rewards/margins": 6.770503997802734, + "rewards/rejected": -3.9955248832702637, + "step": 17877 + }, + { + "epoch": 4.47, + "grad_norm": 5.976474285125732, + "learning_rate": 2.717606557514507e-07, + "logits/chosen": -0.4991651475429535, + "logits/rejected": -0.6153380870819092, + "logps/chosen": -53.666481018066406, + "logps/rejected": -107.48367309570312, + "loss": 0.6489, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0969789028167725, + "rewards/margins": 7.562745094299316, + "rewards/rejected": -4.465766429901123, + "step": 17878 + }, + { + "epoch": 4.47, + "grad_norm": 3.0083250999450684, + "learning_rate": 2.7150511694735483e-07, + "logits/chosen": -0.6069272756576538, + "logits/rejected": -0.6964124441146851, + "logps/chosen": -54.61106872558594, + "logps/rejected": -122.43257141113281, + "loss": 0.5836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.07138729095459, + "rewards/margins": 9.327780723571777, + "rewards/rejected": -6.2563934326171875, + "step": 17879 + }, + { + "epoch": 4.47, + "grad_norm": 4.868298530578613, + "learning_rate": 2.712496949893967e-07, + "logits/chosen": -0.49504074454307556, + "logits/rejected": -0.6044050455093384, + "logps/chosen": -73.13423919677734, + "logps/rejected": -100.39644622802734, + "loss": 0.6936, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1818580627441406, + "rewards/margins": 7.203702926635742, + "rewards/rejected": -4.021844863891602, + "step": 17880 + }, + { + "epoch": 4.47, + "grad_norm": 3.9935593605041504, + "learning_rate": 2.7099438988388736e-07, + "logits/chosen": -0.5290650725364685, + "logits/rejected": -0.6280054450035095, + "logps/chosen": -61.90003967285156, + "logps/rejected": -123.61073303222656, + "loss": 0.646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.038994073867798, + "rewards/margins": 7.615472793579102, + "rewards/rejected": -4.576478958129883, + "step": 17881 + }, + { + "epoch": 4.47, + "grad_norm": 4.761059284210205, + "learning_rate": 2.707392016371374e-07, + "logits/chosen": -0.5431749820709229, + "logits/rejected": -0.6109679341316223, + "logps/chosen": -48.98147201538086, + "logps/rejected": -99.84086608886719, + "loss": 0.5558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057793617248535, + "rewards/margins": 7.50315523147583, + "rewards/rejected": -4.445361614227295, + "step": 17882 + }, + { + "epoch": 4.47, + "grad_norm": 23.292404174804688, + "learning_rate": 2.704841302554528e-07, + "logits/chosen": -0.6010496616363525, + "logits/rejected": -0.6436692476272583, + "logps/chosen": -50.06045150756836, + "logps/rejected": -125.82991790771484, + "loss": 0.6209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.218190908432007, + "rewards/margins": 8.11011028289795, + "rewards/rejected": -4.8919196128845215, + "step": 17883 + }, + { + "epoch": 4.47, + "grad_norm": 3.9088993072509766, + "learning_rate": 2.7022917574513475e-07, + "logits/chosen": -0.5271875262260437, + "logits/rejected": -0.6033082008361816, + "logps/chosen": -54.9620361328125, + "logps/rejected": -108.65899658203125, + "loss": 0.6379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1559767723083496, + "rewards/margins": 7.313146114349365, + "rewards/rejected": -4.157169818878174, + "step": 17884 + }, + { + "epoch": 4.47, + "grad_norm": 7.635452747344971, + "learning_rate": 2.6997433811248475e-07, + "logits/chosen": -0.5518338084220886, + "logits/rejected": -0.5926659107208252, + "logps/chosen": -53.58168029785156, + "logps/rejected": -113.41023254394531, + "loss": 0.6842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2191789150238037, + "rewards/margins": 6.720879554748535, + "rewards/rejected": -3.5017011165618896, + "step": 17885 + }, + { + "epoch": 4.47, + "grad_norm": 4.732136249542236, + "learning_rate": 2.697196173638e-07, + "logits/chosen": -0.6018811464309692, + "logits/rejected": -0.7298413515090942, + "logps/chosen": -58.97746658325195, + "logps/rejected": -105.1324234008789, + "loss": 0.6608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0842952728271484, + "rewards/margins": 8.076197624206543, + "rewards/rejected": -4.991901874542236, + "step": 17886 + }, + { + "epoch": 4.47, + "grad_norm": 5.9894819259643555, + "learning_rate": 2.694650135053756e-07, + "logits/chosen": -0.5250344276428223, + "logits/rejected": -0.620635986328125, + "logps/chosen": -51.85799026489258, + "logps/rejected": -98.4578628540039, + "loss": 0.5138, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5460410118103027, + "rewards/margins": 7.261972427368164, + "rewards/rejected": -3.7159314155578613, + "step": 17887 + }, + { + "epoch": 4.47, + "grad_norm": 2.7027831077575684, + "learning_rate": 2.6921052654350135e-07, + "logits/chosen": -0.589836061000824, + "logits/rejected": -0.7189156413078308, + "logps/chosen": -62.514625549316406, + "logps/rejected": -96.1202392578125, + "loss": 0.5988, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.156174421310425, + "rewards/margins": 7.383030891418457, + "rewards/rejected": -4.226856708526611, + "step": 17888 + }, + { + "epoch": 4.48, + "grad_norm": 18.082759857177734, + "learning_rate": 2.6895615648446724e-07, + "logits/chosen": -0.5735048055648804, + "logits/rejected": -0.616582453250885, + "logps/chosen": -52.13411331176758, + "logps/rejected": -107.21620178222656, + "loss": 0.5991, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1663753986358643, + "rewards/margins": 7.171743392944336, + "rewards/rejected": -4.005368232727051, + "step": 17889 + }, + { + "epoch": 4.48, + "grad_norm": 3.6671395301818848, + "learning_rate": 2.6870190333455894e-07, + "logits/chosen": -0.5416801571846008, + "logits/rejected": -0.6098430156707764, + "logps/chosen": -61.47654342651367, + "logps/rejected": -100.1847152709961, + "loss": 0.6137, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.110454559326172, + "rewards/margins": 6.903623104095459, + "rewards/rejected": -3.793168783187866, + "step": 17890 + }, + { + "epoch": 4.48, + "grad_norm": 5.91330623626709, + "learning_rate": 2.6844776710005846e-07, + "logits/chosen": -0.6444764137268066, + "logits/rejected": -0.6792927384376526, + "logps/chosen": -51.69013977050781, + "logps/rejected": -92.62667083740234, + "loss": 0.646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.104463577270508, + "rewards/margins": 6.18564510345459, + "rewards/rejected": -3.0811820030212402, + "step": 17891 + }, + { + "epoch": 4.48, + "grad_norm": 4.7005486488342285, + "learning_rate": 2.6819374778724704e-07, + "logits/chosen": -0.5592304468154907, + "logits/rejected": -0.5896977186203003, + "logps/chosen": -62.4230842590332, + "logps/rejected": -131.45135498046875, + "loss": 0.6867, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3468472957611084, + "rewards/margins": 6.866694927215576, + "rewards/rejected": -3.519847869873047, + "step": 17892 + }, + { + "epoch": 4.48, + "grad_norm": 6.492218017578125, + "learning_rate": 2.679398454024001e-07, + "logits/chosen": -0.5625733733177185, + "logits/rejected": -0.6724422574043274, + "logps/chosen": -52.701446533203125, + "logps/rejected": -107.88207244873047, + "loss": 0.5697, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9264774322509766, + "rewards/margins": 6.8723859786987305, + "rewards/rejected": -3.945909261703491, + "step": 17893 + }, + { + "epoch": 4.48, + "grad_norm": 11.824888229370117, + "learning_rate": 2.6768605995179386e-07, + "logits/chosen": -0.6758551597595215, + "logits/rejected": -0.7398687601089478, + "logps/chosen": -45.243221282958984, + "logps/rejected": -94.02742004394531, + "loss": 0.605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2225708961486816, + "rewards/margins": 6.905531883239746, + "rewards/rejected": -3.682960271835327, + "step": 17894 + }, + { + "epoch": 4.48, + "grad_norm": 18.560043334960938, + "learning_rate": 2.674323914416982e-07, + "logits/chosen": -0.5424290895462036, + "logits/rejected": -0.5932771563529968, + "logps/chosen": -53.27585983276367, + "logps/rejected": -141.10374450683594, + "loss": 0.5918, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.922111988067627, + "rewards/margins": 8.554370880126953, + "rewards/rejected": -5.632258415222168, + "step": 17895 + }, + { + "epoch": 4.48, + "grad_norm": 12.919755935668945, + "learning_rate": 2.6717883987838144e-07, + "logits/chosen": -0.508967399597168, + "logits/rejected": -0.5762565732002258, + "logps/chosen": -60.65053176879883, + "logps/rejected": -136.10076904296875, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7881271839141846, + "rewards/margins": 8.191414833068848, + "rewards/rejected": -5.403286933898926, + "step": 17896 + }, + { + "epoch": 4.48, + "grad_norm": 4.414088726043701, + "learning_rate": 2.6692540526811084e-07, + "logits/chosen": -0.5453643202781677, + "logits/rejected": -0.653235673904419, + "logps/chosen": -58.674461364746094, + "logps/rejected": -89.9554672241211, + "loss": 0.6013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2597286701202393, + "rewards/margins": 7.270805358886719, + "rewards/rejected": -4.0110764503479, + "step": 17897 + }, + { + "epoch": 4.48, + "grad_norm": 18.831026077270508, + "learning_rate": 2.6667208761714634e-07, + "logits/chosen": -0.49722325801849365, + "logits/rejected": -0.544154167175293, + "logps/chosen": -62.97712326049805, + "logps/rejected": -103.73332977294922, + "loss": 0.6601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0146703720092773, + "rewards/margins": 6.499993324279785, + "rewards/rejected": -3.485322952270508, + "step": 17898 + }, + { + "epoch": 4.48, + "grad_norm": 2.5615434646606445, + "learning_rate": 2.664188869317502e-07, + "logits/chosen": -0.554011344909668, + "logits/rejected": -0.658885657787323, + "logps/chosen": -49.35443115234375, + "logps/rejected": -114.66383361816406, + "loss": 0.5183, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.380244255065918, + "rewards/margins": 8.497053146362305, + "rewards/rejected": -5.116808891296387, + "step": 17899 + }, + { + "epoch": 4.48, + "grad_norm": 5.857161045074463, + "learning_rate": 2.661658032181774e-07, + "logits/chosen": -0.53708815574646, + "logits/rejected": -0.6333929896354675, + "logps/chosen": -52.744720458984375, + "logps/rejected": -109.08776092529297, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.131514549255371, + "rewards/margins": 8.234701156616211, + "rewards/rejected": -5.10318660736084, + "step": 17900 + }, + { + "epoch": 4.48, + "grad_norm": 2.4206855297088623, + "learning_rate": 2.659128364826835e-07, + "logits/chosen": -0.5327439308166504, + "logits/rejected": -0.6047634482383728, + "logps/chosen": -44.25144958496094, + "logps/rejected": -106.75408172607422, + "loss": 0.5172, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0478382110595703, + "rewards/margins": 7.375326633453369, + "rewards/rejected": -4.327488899230957, + "step": 17901 + }, + { + "epoch": 4.48, + "grad_norm": 17.230422973632812, + "learning_rate": 2.6565998673151847e-07, + "logits/chosen": -0.5808122754096985, + "logits/rejected": -0.6465765237808228, + "logps/chosen": -53.15114212036133, + "logps/rejected": -105.74446868896484, + "loss": 0.6867, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.001990795135498, + "rewards/margins": 7.025885105133057, + "rewards/rejected": -4.023895263671875, + "step": 17902 + }, + { + "epoch": 4.48, + "grad_norm": 6.479052543640137, + "learning_rate": 2.654072539709307e-07, + "logits/chosen": -0.6103836894035339, + "logits/rejected": -0.6811521053314209, + "logps/chosen": -58.11434555053711, + "logps/rejected": -99.92880249023438, + "loss": 0.7405, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.915182113647461, + "rewards/margins": 6.581302165985107, + "rewards/rejected": -3.6661202907562256, + "step": 17903 + }, + { + "epoch": 4.48, + "grad_norm": 3.732098340988159, + "learning_rate": 2.6515463820716626e-07, + "logits/chosen": -0.5630791187286377, + "logits/rejected": -0.6543111801147461, + "logps/chosen": -64.0678482055664, + "logps/rejected": -108.23272705078125, + "loss": 0.6062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9614078998565674, + "rewards/margins": 6.856165409088135, + "rewards/rejected": -3.8947579860687256, + "step": 17904 + }, + { + "epoch": 4.48, + "grad_norm": 6.1449408531188965, + "learning_rate": 2.649021394464668e-07, + "logits/chosen": -0.5347176790237427, + "logits/rejected": -0.6085106134414673, + "logps/chosen": -50.81704330444336, + "logps/rejected": -117.7274169921875, + "loss": 0.6694, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.007368564605713, + "rewards/margins": 7.629045009613037, + "rewards/rejected": -4.621676445007324, + "step": 17905 + }, + { + "epoch": 4.48, + "grad_norm": 8.06083869934082, + "learning_rate": 2.6464975769507073e-07, + "logits/chosen": -0.495989054441452, + "logits/rejected": -0.5486792325973511, + "logps/chosen": -64.44921112060547, + "logps/rejected": -101.53952026367188, + "loss": 0.7896, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8745510578155518, + "rewards/margins": 5.969367504119873, + "rewards/rejected": -3.094815969467163, + "step": 17906 + }, + { + "epoch": 4.48, + "grad_norm": 12.389395713806152, + "learning_rate": 2.6439749295921745e-07, + "logits/chosen": -0.5178207159042358, + "logits/rejected": -0.6035395264625549, + "logps/chosen": -54.02165222167969, + "logps/rejected": -113.49243927001953, + "loss": 0.6512, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0849099159240723, + "rewards/margins": 6.774394989013672, + "rewards/rejected": -3.6894850730895996, + "step": 17907 + }, + { + "epoch": 4.48, + "grad_norm": 5.1013689041137695, + "learning_rate": 2.641453452451387e-07, + "logits/chosen": -0.49593019485473633, + "logits/rejected": -0.586190402507782, + "logps/chosen": -59.61615753173828, + "logps/rejected": -105.17650604248047, + "loss": 0.545, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9048874378204346, + "rewards/margins": 7.761774063110352, + "rewards/rejected": -4.8568878173828125, + "step": 17908 + }, + { + "epoch": 4.48, + "grad_norm": 3.1502063274383545, + "learning_rate": 2.6389331455906495e-07, + "logits/chosen": -0.47605910897254944, + "logits/rejected": -0.5430552363395691, + "logps/chosen": -54.66670227050781, + "logps/rejected": -116.66043090820312, + "loss": 0.5691, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.399240255355835, + "rewards/margins": 7.86495304107666, + "rewards/rejected": -4.465713024139404, + "step": 17909 + }, + { + "epoch": 4.48, + "grad_norm": 12.568564414978027, + "learning_rate": 2.636414009072252e-07, + "logits/chosen": -0.5825973153114319, + "logits/rejected": -0.6583548784255981, + "logps/chosen": -56.925323486328125, + "logps/rejected": -96.78548431396484, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9665815830230713, + "rewards/margins": 7.415194511413574, + "rewards/rejected": -4.448613166809082, + "step": 17910 + }, + { + "epoch": 4.48, + "grad_norm": 2.0633180141448975, + "learning_rate": 2.63389604295845e-07, + "logits/chosen": -0.6041995882987976, + "logits/rejected": -0.7073991894721985, + "logps/chosen": -57.81212615966797, + "logps/rejected": -108.57245635986328, + "loss": 0.5708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.02150297164917, + "rewards/margins": 8.133846282958984, + "rewards/rejected": -5.1123433113098145, + "step": 17911 + }, + { + "epoch": 4.48, + "grad_norm": 6.4963860511779785, + "learning_rate": 2.6313792473114497e-07, + "logits/chosen": -0.5448235273361206, + "logits/rejected": -0.6056867837905884, + "logps/chosen": -55.746376037597656, + "logps/rejected": -102.47864532470703, + "loss": 0.6228, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.257206439971924, + "rewards/margins": 7.289615631103516, + "rewards/rejected": -4.032409191131592, + "step": 17912 + }, + { + "epoch": 4.48, + "grad_norm": 6.791279315948486, + "learning_rate": 2.6288636221934504e-07, + "logits/chosen": -0.5794991850852966, + "logits/rejected": -0.6606625914573669, + "logps/chosen": -56.51512145996094, + "logps/rejected": -99.40986633300781, + "loss": 0.5945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.156468629837036, + "rewards/margins": 6.26055908203125, + "rewards/rejected": -3.104090929031372, + "step": 17913 + }, + { + "epoch": 4.48, + "grad_norm": 4.039926528930664, + "learning_rate": 2.6263491676666255e-07, + "logits/chosen": -0.5569006204605103, + "logits/rejected": -0.6673508882522583, + "logps/chosen": -57.49552917480469, + "logps/rejected": -99.26985168457031, + "loss": 0.635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.505708694458008, + "rewards/margins": 8.738580703735352, + "rewards/rejected": -5.232872009277344, + "step": 17914 + }, + { + "epoch": 4.48, + "grad_norm": 2.9038610458374023, + "learning_rate": 2.6238358837931024e-07, + "logits/chosen": -0.5239473581314087, + "logits/rejected": -0.5495477914810181, + "logps/chosen": -51.573143005371094, + "logps/rejected": -126.74397277832031, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164860248565674, + "rewards/margins": 8.370638847351074, + "rewards/rejected": -5.205779075622559, + "step": 17915 + }, + { + "epoch": 4.48, + "grad_norm": 65.08084106445312, + "learning_rate": 2.621323770634976e-07, + "logits/chosen": -0.5517913103103638, + "logits/rejected": -0.5704518556594849, + "logps/chosen": -55.33832550048828, + "logps/rejected": -114.1600112915039, + "loss": 0.7314, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0471835136413574, + "rewards/margins": 6.052431106567383, + "rewards/rejected": -3.0052475929260254, + "step": 17916 + }, + { + "epoch": 4.48, + "grad_norm": 8.38977336883545, + "learning_rate": 2.6188128282543414e-07, + "logits/chosen": -0.640421986579895, + "logits/rejected": -0.6870605945587158, + "logps/chosen": -50.62225341796875, + "logps/rejected": -126.1895980834961, + "loss": 0.6475, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.129772186279297, + "rewards/margins": 8.226005554199219, + "rewards/rejected": -5.096232891082764, + "step": 17917 + }, + { + "epoch": 4.48, + "grad_norm": 3.701469659805298, + "learning_rate": 2.616303056713232e-07, + "logits/chosen": -0.5014021396636963, + "logits/rejected": -0.5528438687324524, + "logps/chosen": -45.06548309326172, + "logps/rejected": -108.66633605957031, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1467204093933105, + "rewards/margins": 6.657049179077148, + "rewards/rejected": -3.510328531265259, + "step": 17918 + }, + { + "epoch": 4.48, + "grad_norm": 4.289680004119873, + "learning_rate": 2.613794456073682e-07, + "logits/chosen": -0.5549532771110535, + "logits/rejected": -0.5654920339584351, + "logps/chosen": -51.322044372558594, + "logps/rejected": -122.0097427368164, + "loss": 0.6193, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.152393102645874, + "rewards/margins": 7.000840187072754, + "rewards/rejected": -3.848447322845459, + "step": 17919 + }, + { + "epoch": 4.48, + "grad_norm": 3.0619304180145264, + "learning_rate": 2.6112870263976686e-07, + "logits/chosen": -0.4812052547931671, + "logits/rejected": -0.5486918091773987, + "logps/chosen": -54.63407516479492, + "logps/rejected": -117.57127380371094, + "loss": 0.6293, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.980022430419922, + "rewards/margins": 7.474191188812256, + "rewards/rejected": -4.494168758392334, + "step": 17920 + }, + { + "epoch": 4.48, + "grad_norm": 5.244083404541016, + "learning_rate": 2.6087807677471544e-07, + "logits/chosen": -0.5436162948608398, + "logits/rejected": -0.6020569205284119, + "logps/chosen": -53.87828826904297, + "logps/rejected": -114.27526092529297, + "loss": 0.5873, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.388428211212158, + "rewards/margins": 6.860511779785156, + "rewards/rejected": -3.472083330154419, + "step": 17921 + }, + { + "epoch": 4.48, + "grad_norm": 4.240898609161377, + "learning_rate": 2.606275680184084e-07, + "logits/chosen": -0.6010173559188843, + "logits/rejected": -0.6485849618911743, + "logps/chosen": -45.27540969848633, + "logps/rejected": -94.13705444335938, + "loss": 0.5997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2197675704956055, + "rewards/margins": 5.773960590362549, + "rewards/rejected": -2.5541930198669434, + "step": 17922 + }, + { + "epoch": 4.48, + "grad_norm": 6.810690402984619, + "learning_rate": 2.603771763770346e-07, + "logits/chosen": -0.5259131193161011, + "logits/rejected": -0.5997738242149353, + "logps/chosen": -48.231834411621094, + "logps/rejected": -99.33096313476562, + "loss": 0.6271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.804858446121216, + "rewards/margins": 7.5941925048828125, + "rewards/rejected": -4.789335250854492, + "step": 17923 + }, + { + "epoch": 4.48, + "grad_norm": 7.817254066467285, + "learning_rate": 2.60126901856782e-07, + "logits/chosen": -0.5088711380958557, + "logits/rejected": -0.5881936550140381, + "logps/chosen": -49.60608673095703, + "logps/rejected": -99.27839660644531, + "loss": 0.6099, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1168558597564697, + "rewards/margins": 7.208196640014648, + "rewards/rejected": -4.0913405418396, + "step": 17924 + }, + { + "epoch": 4.48, + "grad_norm": 9.998292922973633, + "learning_rate": 2.5987674446383505e-07, + "logits/chosen": -0.556341826915741, + "logits/rejected": -0.6438618898391724, + "logps/chosen": -60.21966552734375, + "logps/rejected": -109.72026824951172, + "loss": 0.6721, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.009711265563965, + "rewards/margins": 6.760892391204834, + "rewards/rejected": -3.751181125640869, + "step": 17925 + }, + { + "epoch": 4.48, + "grad_norm": 2.9404778480529785, + "learning_rate": 2.596267042043765e-07, + "logits/chosen": -0.5761213898658752, + "logits/rejected": -0.6186297535896301, + "logps/chosen": -63.838314056396484, + "logps/rejected": -125.94882202148438, + "loss": 0.6302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2765932083129883, + "rewards/margins": 7.584747314453125, + "rewards/rejected": -4.3081536293029785, + "step": 17926 + }, + { + "epoch": 4.48, + "grad_norm": 3.250884532928467, + "learning_rate": 2.593767810845832e-07, + "logits/chosen": -0.5365604162216187, + "logits/rejected": -0.6386071443557739, + "logps/chosen": -60.345333099365234, + "logps/rejected": -112.15359497070312, + "loss": 0.5986, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3461062908172607, + "rewards/margins": 7.087285995483398, + "rewards/rejected": -3.7411789894104004, + "step": 17927 + }, + { + "epoch": 4.48, + "grad_norm": 32.14027786254883, + "learning_rate": 2.5912697511063177e-07, + "logits/chosen": -0.5194092392921448, + "logits/rejected": -0.560187578201294, + "logps/chosen": -60.05756759643555, + "logps/rejected": -112.09354400634766, + "loss": 0.7769, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.814761161804199, + "rewards/margins": 5.623923301696777, + "rewards/rejected": -2.809161901473999, + "step": 17928 + }, + { + "epoch": 4.49, + "grad_norm": 21.60649299621582, + "learning_rate": 2.5887728628869625e-07, + "logits/chosen": -0.524215817451477, + "logits/rejected": -0.5553881525993347, + "logps/chosen": -49.535953521728516, + "logps/rejected": -111.89124298095703, + "loss": 0.6913, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2410495281219482, + "rewards/margins": 6.903468608856201, + "rewards/rejected": -3.662419557571411, + "step": 17929 + }, + { + "epoch": 4.49, + "grad_norm": 7.288723945617676, + "learning_rate": 2.5862771462494604e-07, + "logits/chosen": -0.5069589018821716, + "logits/rejected": -0.4887183904647827, + "logps/chosen": -48.05010986328125, + "logps/rejected": -121.65790557861328, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3015050888061523, + "rewards/margins": 6.757534980773926, + "rewards/rejected": -3.4560298919677734, + "step": 17930 + }, + { + "epoch": 4.49, + "grad_norm": 4.86046838760376, + "learning_rate": 2.583782601255469e-07, + "logits/chosen": -0.5600895881652832, + "logits/rejected": -0.6403806209564209, + "logps/chosen": -52.32591247558594, + "logps/rejected": -89.30088806152344, + "loss": 0.6195, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.011986255645752, + "rewards/margins": 6.061995983123779, + "rewards/rejected": -3.0500099658966064, + "step": 17931 + }, + { + "epoch": 4.49, + "grad_norm": 5.50349235534668, + "learning_rate": 2.5812892279666546e-07, + "logits/chosen": -0.5349063277244568, + "logits/rejected": -0.6406528353691101, + "logps/chosen": -56.76424789428711, + "logps/rejected": -123.07417297363281, + "loss": 0.6902, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0298638343811035, + "rewards/margins": 7.392543315887451, + "rewards/rejected": -4.3626790046691895, + "step": 17932 + }, + { + "epoch": 4.49, + "grad_norm": 1.3526647090911865, + "learning_rate": 2.5787970264446185e-07, + "logits/chosen": -0.5502604842185974, + "logits/rejected": -0.6388093829154968, + "logps/chosen": -48.801307678222656, + "logps/rejected": -98.73699951171875, + "loss": 0.5529, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2975621223449707, + "rewards/margins": 7.810959815979004, + "rewards/rejected": -4.513398170471191, + "step": 17933 + }, + { + "epoch": 4.49, + "grad_norm": 3.6148533821105957, + "learning_rate": 2.5763059967509443e-07, + "logits/chosen": -0.5482505559921265, + "logits/rejected": -0.5911933183670044, + "logps/chosen": -65.66543579101562, + "logps/rejected": -108.56056213378906, + "loss": 0.6715, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.766085147857666, + "rewards/margins": 6.862362861633301, + "rewards/rejected": -4.096277236938477, + "step": 17934 + }, + { + "epoch": 4.49, + "grad_norm": 5.786584854125977, + "learning_rate": 2.573816138947194e-07, + "logits/chosen": -0.5359652638435364, + "logits/rejected": -0.6298221349716187, + "logps/chosen": -56.91477966308594, + "logps/rejected": -96.41099548339844, + "loss": 0.6469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3677756786346436, + "rewards/margins": 6.366769313812256, + "rewards/rejected": -2.9989938735961914, + "step": 17935 + }, + { + "epoch": 4.49, + "grad_norm": 6.385553359985352, + "learning_rate": 2.571327453094891e-07, + "logits/chosen": -0.5682306289672852, + "logits/rejected": -0.6301833987236023, + "logps/chosen": -44.506561279296875, + "logps/rejected": -96.71900177001953, + "loss": 0.614, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.886821985244751, + "rewards/margins": 6.415435791015625, + "rewards/rejected": -3.528613328933716, + "step": 17936 + }, + { + "epoch": 4.49, + "grad_norm": 4.002262592315674, + "learning_rate": 2.5688399392555294e-07, + "logits/chosen": -0.5214519500732422, + "logits/rejected": -0.6146172285079956, + "logps/chosen": -63.04244613647461, + "logps/rejected": -104.6690673828125, + "loss": 0.607, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.707129716873169, + "rewards/margins": 7.541059970855713, + "rewards/rejected": -4.833930015563965, + "step": 17937 + }, + { + "epoch": 4.49, + "grad_norm": 6.899618625640869, + "learning_rate": 2.56635359749059e-07, + "logits/chosen": -0.48941147327423096, + "logits/rejected": -0.5799881815910339, + "logps/chosen": -63.10606384277344, + "logps/rejected": -110.49286651611328, + "loss": 0.6353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9647109508514404, + "rewards/margins": 7.406133651733398, + "rewards/rejected": -4.441422939300537, + "step": 17938 + }, + { + "epoch": 4.49, + "grad_norm": 4.857011795043945, + "learning_rate": 2.563868427861499e-07, + "logits/chosen": -0.5775762796401978, + "logits/rejected": -0.6635540723800659, + "logps/chosen": -50.50865936279297, + "logps/rejected": -100.91305541992188, + "loss": 0.5411, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.251899242401123, + "rewards/margins": 6.787983417510986, + "rewards/rejected": -3.536083936691284, + "step": 17939 + }, + { + "epoch": 4.49, + "grad_norm": 4.188796043395996, + "learning_rate": 2.561384430429681e-07, + "logits/chosen": -0.4984143376350403, + "logits/rejected": -0.5761355757713318, + "logps/chosen": -48.055233001708984, + "logps/rejected": -104.93000793457031, + "loss": 0.4684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4962539672851562, + "rewards/margins": 6.642043113708496, + "rewards/rejected": -3.1457886695861816, + "step": 17940 + }, + { + "epoch": 4.49, + "grad_norm": 4.354711532592773, + "learning_rate": 2.558901605256503e-07, + "logits/chosen": -0.5694687366485596, + "logits/rejected": -0.6582632064819336, + "logps/chosen": -48.06252670288086, + "logps/rejected": -116.29408264160156, + "loss": 0.6308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.850329875946045, + "rewards/margins": 8.23111343383789, + "rewards/rejected": -5.380783557891846, + "step": 17941 + }, + { + "epoch": 4.49, + "grad_norm": 8.403300285339355, + "learning_rate": 2.556419952403333e-07, + "logits/chosen": -0.5252825021743774, + "logits/rejected": -0.6078722476959229, + "logps/chosen": -65.06114959716797, + "logps/rejected": -110.99073028564453, + "loss": 0.8017, + "rewards/accuracies": 0.84375, + "rewards/chosen": 2.5280380249023438, + "rewards/margins": 5.775150775909424, + "rewards/rejected": -3.247112989425659, + "step": 17942 + }, + { + "epoch": 4.49, + "grad_norm": 4.986837387084961, + "learning_rate": 2.553939471931494e-07, + "logits/chosen": -0.4844418168067932, + "logits/rejected": -0.542924702167511, + "logps/chosen": -56.73077392578125, + "logps/rejected": -104.51968383789062, + "loss": 0.6337, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.225430727005005, + "rewards/margins": 6.613218307495117, + "rewards/rejected": -3.3877875804901123, + "step": 17943 + }, + { + "epoch": 4.49, + "grad_norm": 3.686617612838745, + "learning_rate": 2.551460163902264e-07, + "logits/chosen": -0.5150748491287231, + "logits/rejected": -0.5538264513015747, + "logps/chosen": -53.1726188659668, + "logps/rejected": -123.25989532470703, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.992205858230591, + "rewards/margins": 7.417103290557861, + "rewards/rejected": -4.424897193908691, + "step": 17944 + }, + { + "epoch": 4.49, + "grad_norm": 2.939781665802002, + "learning_rate": 2.548982028376934e-07, + "logits/chosen": -0.6252625584602356, + "logits/rejected": -0.7094154953956604, + "logps/chosen": -58.324806213378906, + "logps/rejected": -87.31781768798828, + "loss": 0.6381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.390394687652588, + "rewards/margins": 5.965153217315674, + "rewards/rejected": -2.574758768081665, + "step": 17945 + }, + { + "epoch": 4.49, + "grad_norm": 5.7473578453063965, + "learning_rate": 2.546505065416721e-07, + "logits/chosen": -0.5623326301574707, + "logits/rejected": -0.6331303119659424, + "logps/chosen": -46.517574310302734, + "logps/rejected": -112.0395278930664, + "loss": 0.5646, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4093666076660156, + "rewards/margins": 7.731818199157715, + "rewards/rejected": -4.322451591491699, + "step": 17946 + }, + { + "epoch": 4.49, + "grad_norm": 5.3091511726379395, + "learning_rate": 2.5440292750828486e-07, + "logits/chosen": -0.5118061900138855, + "logits/rejected": -0.5846922397613525, + "logps/chosen": -53.57704544067383, + "logps/rejected": -96.5702896118164, + "loss": 0.6576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2364959716796875, + "rewards/margins": 6.338548183441162, + "rewards/rejected": -3.1020514965057373, + "step": 17947 + }, + { + "epoch": 4.49, + "grad_norm": 3.8538475036621094, + "learning_rate": 2.5415546574364845e-07, + "logits/chosen": -0.5378448367118835, + "logits/rejected": -0.56316077709198, + "logps/chosen": -57.7578239440918, + "logps/rejected": -121.83928680419922, + "loss": 0.6383, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.113288402557373, + "rewards/margins": 7.007690906524658, + "rewards/rejected": -3.894402265548706, + "step": 17948 + }, + { + "epoch": 4.49, + "grad_norm": 7.891546726226807, + "learning_rate": 2.5390812125387853e-07, + "logits/chosen": -0.5393204689025879, + "logits/rejected": -0.6176064610481262, + "logps/chosen": -57.58092498779297, + "logps/rejected": -102.43116760253906, + "loss": 0.7276, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8147518634796143, + "rewards/margins": 6.517960548400879, + "rewards/rejected": -3.7032084465026855, + "step": 17949 + }, + { + "epoch": 4.49, + "grad_norm": 3.423858165740967, + "learning_rate": 2.5366089404508685e-07, + "logits/chosen": -0.5101717114448547, + "logits/rejected": -0.5471556782722473, + "logps/chosen": -53.47301483154297, + "logps/rejected": -111.28317260742188, + "loss": 0.6466, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5861589908599854, + "rewards/margins": 7.299256801605225, + "rewards/rejected": -3.7130980491638184, + "step": 17950 + }, + { + "epoch": 4.49, + "grad_norm": 7.323432445526123, + "learning_rate": 2.534137841233836e-07, + "logits/chosen": -0.5921019911766052, + "logits/rejected": -0.6767187118530273, + "logps/chosen": -57.16416931152344, + "logps/rejected": -119.90719604492188, + "loss": 0.6231, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.01800799369812, + "rewards/margins": 8.014420509338379, + "rewards/rejected": -4.996412754058838, + "step": 17951 + }, + { + "epoch": 4.49, + "grad_norm": 3.6536073684692383, + "learning_rate": 2.531667914948738e-07, + "logits/chosen": -0.6760663390159607, + "logits/rejected": -0.7718680500984192, + "logps/chosen": -49.56684875488281, + "logps/rejected": -129.55947875976562, + "loss": 0.5636, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.132779121398926, + "rewards/margins": 9.071324348449707, + "rewards/rejected": -5.938545227050781, + "step": 17952 + }, + { + "epoch": 4.49, + "grad_norm": 3.343843936920166, + "learning_rate": 2.5291991616566093e-07, + "logits/chosen": -0.5202500224113464, + "logits/rejected": -0.6238769292831421, + "logps/chosen": -55.006103515625, + "logps/rejected": -97.22019958496094, + "loss": 0.5579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.151423931121826, + "rewards/margins": 6.563912868499756, + "rewards/rejected": -3.4124882221221924, + "step": 17953 + }, + { + "epoch": 4.49, + "grad_norm": 6.801484107971191, + "learning_rate": 2.526731581418479e-07, + "logits/chosen": -0.545036256313324, + "logits/rejected": -0.6545760035514832, + "logps/chosen": -62.16289520263672, + "logps/rejected": -93.0918960571289, + "loss": 0.66, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3592939376831055, + "rewards/margins": 7.081768989562988, + "rewards/rejected": -3.72247576713562, + "step": 17954 + }, + { + "epoch": 4.49, + "grad_norm": 7.9240312576293945, + "learning_rate": 2.524265174295293e-07, + "logits/chosen": -0.5326507687568665, + "logits/rejected": -0.5766609311103821, + "logps/chosen": -56.33623123168945, + "logps/rejected": -119.1417465209961, + "loss": 0.6783, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.060262680053711, + "rewards/margins": 7.452714920043945, + "rewards/rejected": -4.392453193664551, + "step": 17955 + }, + { + "epoch": 4.49, + "grad_norm": 4.257997512817383, + "learning_rate": 2.521799940348013e-07, + "logits/chosen": -0.499732106924057, + "logits/rejected": -0.5415736436843872, + "logps/chosen": -61.23017501831055, + "logps/rejected": -126.35912322998047, + "loss": 0.6776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3818893432617188, + "rewards/margins": 7.772980213165283, + "rewards/rejected": -4.3910908699035645, + "step": 17956 + }, + { + "epoch": 4.49, + "grad_norm": 3.37052321434021, + "learning_rate": 2.519335879637558e-07, + "logits/chosen": -0.6524278521537781, + "logits/rejected": -0.7236825227737427, + "logps/chosen": -50.05696105957031, + "logps/rejected": -111.0723876953125, + "loss": 0.5961, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9707632064819336, + "rewards/margins": 7.919332027435303, + "rewards/rejected": -4.948568344116211, + "step": 17957 + }, + { + "epoch": 4.49, + "grad_norm": 5.85062837600708, + "learning_rate": 2.5168729922248225e-07, + "logits/chosen": -0.6031290292739868, + "logits/rejected": -0.6960639357566833, + "logps/chosen": -43.82209777832031, + "logps/rejected": -115.83101654052734, + "loss": 0.5612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.154240608215332, + "rewards/margins": 8.369404792785645, + "rewards/rejected": -5.215165138244629, + "step": 17958 + }, + { + "epoch": 4.49, + "grad_norm": 2.737589120864868, + "learning_rate": 2.5144112781706476e-07, + "logits/chosen": -0.5661516785621643, + "logits/rejected": -0.6204553842544556, + "logps/chosen": -61.040958404541016, + "logps/rejected": -90.80006408691406, + "loss": 0.6313, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.858621835708618, + "rewards/margins": 6.525346279144287, + "rewards/rejected": -3.666724920272827, + "step": 17959 + }, + { + "epoch": 4.49, + "grad_norm": 4.453322410583496, + "learning_rate": 2.5119507375358895e-07, + "logits/chosen": -0.5426613688468933, + "logits/rejected": -0.6104001998901367, + "logps/chosen": -57.86096954345703, + "logps/rejected": -124.73287963867188, + "loss": 0.583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3162736892700195, + "rewards/margins": 7.760735511779785, + "rewards/rejected": -4.444461822509766, + "step": 17960 + }, + { + "epoch": 4.49, + "grad_norm": 21.438127517700195, + "learning_rate": 2.5094913703813395e-07, + "logits/chosen": -0.5738427639007568, + "logits/rejected": -0.617012619972229, + "logps/chosen": -58.17079162597656, + "logps/rejected": -108.45492553710938, + "loss": 0.7476, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.227100133895874, + "rewards/margins": 6.708624839782715, + "rewards/rejected": -3.4815242290496826, + "step": 17961 + }, + { + "epoch": 4.49, + "grad_norm": 4.140427112579346, + "learning_rate": 2.5070331767677647e-07, + "logits/chosen": -0.5878918170928955, + "logits/rejected": -0.638064444065094, + "logps/chosen": -59.16419219970703, + "logps/rejected": -126.39955139160156, + "loss": 0.7207, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2859907150268555, + "rewards/margins": 8.040996551513672, + "rewards/rejected": -4.755005359649658, + "step": 17962 + }, + { + "epoch": 4.49, + "grad_norm": 4.4663920402526855, + "learning_rate": 2.504576156755917e-07, + "logits/chosen": -0.5288406610488892, + "logits/rejected": -0.6135838031768799, + "logps/chosen": -63.82148361206055, + "logps/rejected": -102.68221282958984, + "loss": 0.6163, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1617884635925293, + "rewards/margins": 7.424965858459473, + "rewards/rejected": -4.263177871704102, + "step": 17963 + }, + { + "epoch": 4.49, + "grad_norm": 3.484372615814209, + "learning_rate": 2.5021203104065083e-07, + "logits/chosen": -0.5558463931083679, + "logits/rejected": -0.636047899723053, + "logps/chosen": -66.53523254394531, + "logps/rejected": -91.52615356445312, + "loss": 0.6226, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.10640811920166, + "rewards/margins": 6.958409309387207, + "rewards/rejected": -3.8520021438598633, + "step": 17964 + }, + { + "epoch": 4.49, + "grad_norm": 3.9681475162506104, + "learning_rate": 2.499665637780235e-07, + "logits/chosen": -0.4854446053504944, + "logits/rejected": -0.5804802179336548, + "logps/chosen": -64.56124114990234, + "logps/rejected": -107.4468765258789, + "loss": 0.6648, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.341336727142334, + "rewards/margins": 6.907797336578369, + "rewards/rejected": -3.566460609436035, + "step": 17965 + }, + { + "epoch": 4.49, + "grad_norm": 4.895842552185059, + "learning_rate": 2.4972121389377434e-07, + "logits/chosen": -0.6066854000091553, + "logits/rejected": -0.6383370757102966, + "logps/chosen": -46.19599151611328, + "logps/rejected": -114.10374450683594, + "loss": 0.5798, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.540494918823242, + "rewards/margins": 7.486938953399658, + "rewards/rejected": -3.946443557739258, + "step": 17966 + }, + { + "epoch": 4.49, + "grad_norm": 5.977118968963623, + "learning_rate": 2.494759813939662e-07, + "logits/chosen": -0.44604623317718506, + "logits/rejected": -0.5269563794136047, + "logps/chosen": -64.98541259765625, + "logps/rejected": -109.96861267089844, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9866766929626465, + "rewards/margins": 7.428656101226807, + "rewards/rejected": -4.44197940826416, + "step": 17967 + }, + { + "epoch": 4.49, + "grad_norm": 3.532771110534668, + "learning_rate": 2.4923086628466044e-07, + "logits/chosen": -0.5767659544944763, + "logits/rejected": -0.6426649689674377, + "logps/chosen": -65.72774505615234, + "logps/rejected": -110.82957458496094, + "loss": 0.7297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0310187339782715, + "rewards/margins": 7.69373083114624, + "rewards/rejected": -4.662711143493652, + "step": 17968 + }, + { + "epoch": 4.5, + "grad_norm": 5.936094284057617, + "learning_rate": 2.489858685719126e-07, + "logits/chosen": -0.5888288021087646, + "logits/rejected": -0.6484403014183044, + "logps/chosen": -50.43167495727539, + "logps/rejected": -93.88919830322266, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.123831033706665, + "rewards/margins": 6.365574359893799, + "rewards/rejected": -3.241744041442871, + "step": 17969 + }, + { + "epoch": 4.5, + "grad_norm": 4.360585689544678, + "learning_rate": 2.4874098826177694e-07, + "logits/chosen": -0.5557079911231995, + "logits/rejected": -0.578029215335846, + "logps/chosen": -58.55338668823242, + "logps/rejected": -117.90349578857422, + "loss": 0.6472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.320540428161621, + "rewards/margins": 6.8104448318481445, + "rewards/rejected": -3.4899046421051025, + "step": 17970 + }, + { + "epoch": 4.5, + "grad_norm": 7.395333766937256, + "learning_rate": 2.4849622536030516e-07, + "logits/chosen": -0.602403461933136, + "logits/rejected": -0.650816798210144, + "logps/chosen": -56.91722106933594, + "logps/rejected": -107.39314270019531, + "loss": 0.675, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.244474411010742, + "rewards/margins": 6.468961715698242, + "rewards/rejected": -3.2244873046875, + "step": 17971 + }, + { + "epoch": 4.5, + "grad_norm": 6.575145244598389, + "learning_rate": 2.4825157987354685e-07, + "logits/chosen": -0.5175949931144714, + "logits/rejected": -0.6144018173217773, + "logps/chosen": -50.99320602416992, + "logps/rejected": -101.66734313964844, + "loss": 0.5886, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3891983032226562, + "rewards/margins": 8.033315658569336, + "rewards/rejected": -4.644117832183838, + "step": 17972 + }, + { + "epoch": 4.5, + "grad_norm": 7.455745220184326, + "learning_rate": 2.480070518075445e-07, + "logits/chosen": -0.5017455816268921, + "logits/rejected": -0.5977442264556885, + "logps/chosen": -64.10531616210938, + "logps/rejected": -135.32925415039062, + "loss": 0.6635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0475661754608154, + "rewards/margins": 8.579874992370605, + "rewards/rejected": -5.532309532165527, + "step": 17973 + }, + { + "epoch": 4.5, + "grad_norm": 7.3234758377075195, + "learning_rate": 2.477626411683426e-07, + "logits/chosen": -0.6142934560775757, + "logits/rejected": -0.6800017356872559, + "logps/chosen": -51.15263366699219, + "logps/rejected": -112.92691040039062, + "loss": 0.6136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9083502292633057, + "rewards/margins": 7.228493690490723, + "rewards/rejected": -4.320143699645996, + "step": 17974 + }, + { + "epoch": 4.5, + "grad_norm": 2.769829750061035, + "learning_rate": 2.475183479619814e-07, + "logits/chosen": -0.6053416728973389, + "logits/rejected": -0.6385189294815063, + "logps/chosen": -45.489906311035156, + "logps/rejected": -105.25505065917969, + "loss": 0.5987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4608020782470703, + "rewards/margins": 7.102173805236816, + "rewards/rejected": -3.641371488571167, + "step": 17975 + }, + { + "epoch": 4.5, + "grad_norm": 10.044574737548828, + "learning_rate": 2.4727417219449656e-07, + "logits/chosen": -0.6307733058929443, + "logits/rejected": -0.6999521255493164, + "logps/chosen": -47.28886032104492, + "logps/rejected": -107.82405853271484, + "loss": 0.6635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8402016162872314, + "rewards/margins": 8.368197441101074, + "rewards/rejected": -5.527995586395264, + "step": 17976 + }, + { + "epoch": 4.5, + "grad_norm": 4.491708755493164, + "learning_rate": 2.470301138719217e-07, + "logits/chosen": -0.5269798040390015, + "logits/rejected": -0.5730443000793457, + "logps/chosen": -57.12273406982422, + "logps/rejected": -110.53813171386719, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0506980419158936, + "rewards/margins": 6.592787742614746, + "rewards/rejected": -3.5420894622802734, + "step": 17977 + }, + { + "epoch": 4.5, + "grad_norm": 5.414517402648926, + "learning_rate": 2.4678617300028904e-07, + "logits/chosen": -0.5317930579185486, + "logits/rejected": -0.6345564723014832, + "logps/chosen": -50.910911560058594, + "logps/rejected": -104.87498474121094, + "loss": 0.6756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0704293251037598, + "rewards/margins": 7.480745315551758, + "rewards/rejected": -4.4103169441223145, + "step": 17978 + }, + { + "epoch": 4.5, + "grad_norm": 9.105850219726562, + "learning_rate": 2.4654234958562506e-07, + "logits/chosen": -0.5208739638328552, + "logits/rejected": -0.5666649341583252, + "logps/chosen": -61.15044021606445, + "logps/rejected": -108.11540985107422, + "loss": 0.713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.803990125656128, + "rewards/margins": 7.0299458503723145, + "rewards/rejected": -4.225955963134766, + "step": 17979 + }, + { + "epoch": 4.5, + "grad_norm": 6.217188835144043, + "learning_rate": 2.462986436339554e-07, + "logits/chosen": -0.48063021898269653, + "logits/rejected": -0.5986819267272949, + "logps/chosen": -67.87580108642578, + "logps/rejected": -122.02420043945312, + "loss": 0.6846, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.675405979156494, + "rewards/margins": 6.998237133026123, + "rewards/rejected": -4.322831630706787, + "step": 17980 + }, + { + "epoch": 4.5, + "grad_norm": 3.555511236190796, + "learning_rate": 2.4605505515130357e-07, + "logits/chosen": -0.5519341826438904, + "logits/rejected": -0.6519390940666199, + "logps/chosen": -59.04472351074219, + "logps/rejected": -104.2019271850586, + "loss": 0.652, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.952482223510742, + "rewards/margins": 7.520316123962402, + "rewards/rejected": -4.567833423614502, + "step": 17981 + }, + { + "epoch": 4.5, + "grad_norm": 12.680638313293457, + "learning_rate": 2.4581158414368646e-07, + "logits/chosen": -0.5883585810661316, + "logits/rejected": -0.6935120820999146, + "logps/chosen": -53.88239288330078, + "logps/rejected": -78.11726379394531, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2060179710388184, + "rewards/margins": 6.259103775024414, + "rewards/rejected": -3.053086042404175, + "step": 17982 + }, + { + "epoch": 4.5, + "grad_norm": 4.677508354187012, + "learning_rate": 2.455682306171231e-07, + "logits/chosen": -0.49442362785339355, + "logits/rejected": -0.5852238535881042, + "logps/chosen": -48.98389434814453, + "logps/rejected": -99.38819885253906, + "loss": 0.5566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2478294372558594, + "rewards/margins": 7.708828926086426, + "rewards/rejected": -4.460999965667725, + "step": 17983 + }, + { + "epoch": 4.5, + "grad_norm": 3.0463619232177734, + "learning_rate": 2.4532499457762484e-07, + "logits/chosen": -0.5520496368408203, + "logits/rejected": -0.6668307781219482, + "logps/chosen": -62.92009735107422, + "logps/rejected": -103.12384796142578, + "loss": 0.6202, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.968102216720581, + "rewards/margins": 8.222306251525879, + "rewards/rejected": -5.254204273223877, + "step": 17984 + }, + { + "epoch": 4.5, + "grad_norm": 4.410383701324463, + "learning_rate": 2.4508187603120406e-07, + "logits/chosen": -0.5626661777496338, + "logits/rejected": -0.618139386177063, + "logps/chosen": -45.753395080566406, + "logps/rejected": -104.54786682128906, + "loss": 0.6076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0877060890197754, + "rewards/margins": 6.611290454864502, + "rewards/rejected": -3.5235848426818848, + "step": 17985 + }, + { + "epoch": 4.5, + "grad_norm": 5.833958625793457, + "learning_rate": 2.448388749838676e-07, + "logits/chosen": -0.5143975615501404, + "logits/rejected": -0.6102352142333984, + "logps/chosen": -63.414276123046875, + "logps/rejected": -107.8477783203125, + "loss": 0.6341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1285903453826904, + "rewards/margins": 6.532189846038818, + "rewards/rejected": -3.403599739074707, + "step": 17986 + }, + { + "epoch": 4.5, + "grad_norm": 2.191615104675293, + "learning_rate": 2.445959914416196e-07, + "logits/chosen": -0.5617624521255493, + "logits/rejected": -0.6470195055007935, + "logps/chosen": -69.43095397949219, + "logps/rejected": -114.40067291259766, + "loss": 0.6102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3348169326782227, + "rewards/margins": 8.469799995422363, + "rewards/rejected": -5.134983062744141, + "step": 17987 + }, + { + "epoch": 4.5, + "grad_norm": 3.3269286155700684, + "learning_rate": 2.443532254104636e-07, + "logits/chosen": -0.5075889229774475, + "logits/rejected": -0.5880739688873291, + "logps/chosen": -48.64341354370117, + "logps/rejected": -104.55506134033203, + "loss": 0.5828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.51594877243042, + "rewards/margins": 7.804952621459961, + "rewards/rejected": -4.289004325866699, + "step": 17988 + }, + { + "epoch": 4.5, + "grad_norm": 9.336496353149414, + "learning_rate": 2.441105768963969e-07, + "logits/chosen": -0.49125343561172485, + "logits/rejected": -0.6282500624656677, + "logps/chosen": -61.43485641479492, + "logps/rejected": -107.47838592529297, + "loss": 0.6627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.517357110977173, + "rewards/margins": 7.82794189453125, + "rewards/rejected": -5.310585021972656, + "step": 17989 + }, + { + "epoch": 4.5, + "grad_norm": 5.2838263511657715, + "learning_rate": 2.438680459054171e-07, + "logits/chosen": -0.49650493264198303, + "logits/rejected": -0.5771784782409668, + "logps/chosen": -52.576454162597656, + "logps/rejected": -86.90892028808594, + "loss": 0.5693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2794899940490723, + "rewards/margins": 5.524543285369873, + "rewards/rejected": -2.24505352973938, + "step": 17990 + }, + { + "epoch": 4.5, + "grad_norm": 4.234214782714844, + "learning_rate": 2.436256324435171e-07, + "logits/chosen": -0.5632521510124207, + "logits/rejected": -0.650187611579895, + "logps/chosen": -49.887542724609375, + "logps/rejected": -95.14064025878906, + "loss": 0.5949, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1986451148986816, + "rewards/margins": 6.648906707763672, + "rewards/rejected": -3.4502623081207275, + "step": 17991 + }, + { + "epoch": 4.5, + "grad_norm": 21.114755630493164, + "learning_rate": 2.43383336516686e-07, + "logits/chosen": -0.522946834564209, + "logits/rejected": -0.6165202856063843, + "logps/chosen": -63.714942932128906, + "logps/rejected": -113.82807159423828, + "loss": 0.7199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7859156131744385, + "rewards/margins": 6.611125946044922, + "rewards/rejected": -3.8252103328704834, + "step": 17992 + }, + { + "epoch": 4.5, + "grad_norm": 5.563959121704102, + "learning_rate": 2.4314115813091234e-07, + "logits/chosen": -0.5297238230705261, + "logits/rejected": -0.6117820739746094, + "logps/chosen": -50.71132278442383, + "logps/rejected": -86.9945068359375, + "loss": 0.6806, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1488540172576904, + "rewards/margins": 6.461060523986816, + "rewards/rejected": -3.312206745147705, + "step": 17993 + }, + { + "epoch": 4.5, + "grad_norm": 7.544463157653809, + "learning_rate": 2.428990972921808e-07, + "logits/chosen": -0.5219951272010803, + "logits/rejected": -0.5784822106361389, + "logps/chosen": -47.827754974365234, + "logps/rejected": -107.84382629394531, + "loss": 0.5611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9793190956115723, + "rewards/margins": 7.683382511138916, + "rewards/rejected": -4.704063415527344, + "step": 17994 + }, + { + "epoch": 4.5, + "grad_norm": 8.819615364074707, + "learning_rate": 2.4265715400647094e-07, + "logits/chosen": -0.579518735408783, + "logits/rejected": -0.6806098222732544, + "logps/chosen": -54.28927230834961, + "logps/rejected": -108.97673797607422, + "loss": 0.7295, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.760498523712158, + "rewards/margins": 6.3090972900390625, + "rewards/rejected": -3.5485992431640625, + "step": 17995 + }, + { + "epoch": 4.5, + "grad_norm": 22.254350662231445, + "learning_rate": 2.4241532827976367e-07, + "logits/chosen": -0.5427809953689575, + "logits/rejected": -0.625393807888031, + "logps/chosen": -60.3527717590332, + "logps/rejected": -106.61187744140625, + "loss": 0.7405, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2150819301605225, + "rewards/margins": 7.037527084350586, + "rewards/rejected": -3.8224453926086426, + "step": 17996 + }, + { + "epoch": 4.5, + "grad_norm": 6.490735054016113, + "learning_rate": 2.4217362011803527e-07, + "logits/chosen": -0.5663986206054688, + "logits/rejected": -0.600928783416748, + "logps/chosen": -51.32184982299805, + "logps/rejected": -133.13455200195312, + "loss": 0.6456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1835954189300537, + "rewards/margins": 7.5633368492126465, + "rewards/rejected": -4.379741191864014, + "step": 17997 + }, + { + "epoch": 4.5, + "grad_norm": 4.82460880279541, + "learning_rate": 2.4193202952725593e-07, + "logits/chosen": -0.5936539173126221, + "logits/rejected": -0.6605775952339172, + "logps/chosen": -63.5526008605957, + "logps/rejected": -106.23017120361328, + "loss": 0.7499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.737118721008301, + "rewards/margins": 6.7895188331604, + "rewards/rejected": -4.0524001121521, + "step": 17998 + }, + { + "epoch": 4.5, + "grad_norm": 6.667372703552246, + "learning_rate": 2.41690556513397e-07, + "logits/chosen": -0.49709051847457886, + "logits/rejected": -0.579737663269043, + "logps/chosen": -58.91994094848633, + "logps/rejected": -99.4076156616211, + "loss": 0.6489, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.023494005203247, + "rewards/margins": 5.941553592681885, + "rewards/rejected": -2.9180593490600586, + "step": 17999 + }, + { + "epoch": 4.5, + "grad_norm": 9.41193962097168, + "learning_rate": 2.414492010824265e-07, + "logits/chosen": -0.5636712312698364, + "logits/rejected": -0.6509004831314087, + "logps/chosen": -69.24583435058594, + "logps/rejected": -108.83309936523438, + "loss": 0.7321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8028178215026855, + "rewards/margins": 7.272263526916504, + "rewards/rejected": -4.469445705413818, + "step": 18000 + }, + { + "epoch": 4.5, + "grad_norm": 7.501517295837402, + "learning_rate": 2.412079632403075e-07, + "logits/chosen": -0.6009067296981812, + "logits/rejected": -0.6143446564674377, + "logps/chosen": -57.598365783691406, + "logps/rejected": -126.91960144042969, + "loss": 0.7074, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9762916564941406, + "rewards/margins": 7.476664066314697, + "rewards/rejected": -4.500372409820557, + "step": 18001 + }, + { + "epoch": 4.5, + "grad_norm": 18.804187774658203, + "learning_rate": 2.4096684299300065e-07, + "logits/chosen": -0.5084933042526245, + "logits/rejected": -0.5871866941452026, + "logps/chosen": -66.63204193115234, + "logps/rejected": -106.73997497558594, + "loss": 0.8286, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8997206687927246, + "rewards/margins": 6.473995208740234, + "rewards/rejected": -3.5742743015289307, + "step": 18002 + }, + { + "epoch": 4.5, + "grad_norm": 1.9206827878952026, + "learning_rate": 2.407258403464657e-07, + "logits/chosen": -0.5600690245628357, + "logits/rejected": -0.6366103887557983, + "logps/chosen": -54.215545654296875, + "logps/rejected": -100.37818908691406, + "loss": 0.5622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2118024826049805, + "rewards/margins": 7.5806074142456055, + "rewards/rejected": -4.368805408477783, + "step": 18003 + }, + { + "epoch": 4.5, + "grad_norm": 22.47768211364746, + "learning_rate": 2.404849553066574e-07, + "logits/chosen": -0.5420117378234863, + "logits/rejected": -0.625268816947937, + "logps/chosen": -61.76475524902344, + "logps/rejected": -101.92243957519531, + "loss": 0.7232, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.808516263961792, + "rewards/margins": 6.677886962890625, + "rewards/rejected": -3.869370222091675, + "step": 18004 + }, + { + "epoch": 4.5, + "grad_norm": 6.628373146057129, + "learning_rate": 2.4024418787952755e-07, + "logits/chosen": -0.5643811225891113, + "logits/rejected": -0.660666823387146, + "logps/chosen": -58.113731384277344, + "logps/rejected": -102.62425994873047, + "loss": 0.6864, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1044676303863525, + "rewards/margins": 6.317592620849609, + "rewards/rejected": -3.2131245136260986, + "step": 18005 + }, + { + "epoch": 4.5, + "grad_norm": 1.5115907192230225, + "learning_rate": 2.4000353807102747e-07, + "logits/chosen": -0.60178542137146, + "logits/rejected": -0.6874898076057434, + "logps/chosen": -47.793724060058594, + "logps/rejected": -120.36319732666016, + "loss": 0.5464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2559123039245605, + "rewards/margins": 9.553336143493652, + "rewards/rejected": -6.297423362731934, + "step": 18006 + }, + { + "epoch": 4.5, + "grad_norm": 2.1067535877227783, + "learning_rate": 2.39763005887102e-07, + "logits/chosen": -0.5944260358810425, + "logits/rejected": -0.6936652660369873, + "logps/chosen": -55.6654167175293, + "logps/rejected": -110.41055297851562, + "loss": 0.638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0790908336639404, + "rewards/margins": 8.964632034301758, + "rewards/rejected": -5.885540962219238, + "step": 18007 + }, + { + "epoch": 4.5, + "grad_norm": 5.550245761871338, + "learning_rate": 2.3952259133369683e-07, + "logits/chosen": -0.4894724190235138, + "logits/rejected": -0.5860145688056946, + "logps/chosen": -61.031959533691406, + "logps/rejected": -103.19358825683594, + "loss": 0.6236, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3858017921447754, + "rewards/margins": 6.931116580963135, + "rewards/rejected": -3.5453152656555176, + "step": 18008 + }, + { + "epoch": 4.51, + "grad_norm": 4.304075241088867, + "learning_rate": 2.3928229441675166e-07, + "logits/chosen": -0.5684900879859924, + "logits/rejected": -0.6247048377990723, + "logps/chosen": -55.14692687988281, + "logps/rejected": -102.63225555419922, + "loss": 0.5858, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1206343173980713, + "rewards/margins": 7.171873092651367, + "rewards/rejected": -4.051239013671875, + "step": 18009 + }, + { + "epoch": 4.51, + "grad_norm": 5.49207067489624, + "learning_rate": 2.3904211514220397e-07, + "logits/chosen": -0.5139491558074951, + "logits/rejected": -0.6039950251579285, + "logps/chosen": -42.51657485961914, + "logps/rejected": -102.10546875, + "loss": 0.5145, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2880303859710693, + "rewards/margins": 7.899357318878174, + "rewards/rejected": -4.611326217651367, + "step": 18010 + }, + { + "epoch": 4.51, + "grad_norm": 4.159340858459473, + "learning_rate": 2.3880205351599074e-07, + "logits/chosen": -0.583519458770752, + "logits/rejected": -0.6051391363143921, + "logps/chosen": -55.75361633300781, + "logps/rejected": -115.27821350097656, + "loss": 0.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.214200258255005, + "rewards/margins": 6.9098005294799805, + "rewards/rejected": -3.6956005096435547, + "step": 18011 + }, + { + "epoch": 4.51, + "grad_norm": 4.113177299499512, + "learning_rate": 2.385621095440427e-07, + "logits/chosen": -0.6185939311981201, + "logits/rejected": -0.6805056929588318, + "logps/chosen": -51.36515808105469, + "logps/rejected": -104.91342163085938, + "loss": 0.5747, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1839699745178223, + "rewards/margins": 7.415953636169434, + "rewards/rejected": -4.231983661651611, + "step": 18012 + }, + { + "epoch": 4.51, + "grad_norm": 4.853322982788086, + "learning_rate": 2.3832228323228846e-07, + "logits/chosen": -0.4977358281612396, + "logits/rejected": -0.628052830696106, + "logps/chosen": -61.866920471191406, + "logps/rejected": -92.72997283935547, + "loss": 0.5944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.782370090484619, + "rewards/margins": 6.572935104370117, + "rewards/rejected": -3.79056453704834, + "step": 18013 + }, + { + "epoch": 4.51, + "grad_norm": 5.095815181732178, + "learning_rate": 2.3808257458665551e-07, + "logits/chosen": -0.6056120991706848, + "logits/rejected": -0.6402945518493652, + "logps/chosen": -49.687076568603516, + "logps/rejected": -95.36554718017578, + "loss": 0.7143, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.257185935974121, + "rewards/margins": 7.136932849884033, + "rewards/rejected": -3.879746198654175, + "step": 18014 + }, + { + "epoch": 4.51, + "grad_norm": 7.339695930480957, + "learning_rate": 2.3784298361306912e-07, + "logits/chosen": -0.5585556030273438, + "logits/rejected": -0.6065680384635925, + "logps/chosen": -53.091400146484375, + "logps/rejected": -106.03439331054688, + "loss": 0.6904, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.045654296875, + "rewards/margins": 6.442531108856201, + "rewards/rejected": -3.396876335144043, + "step": 18015 + }, + { + "epoch": 4.51, + "grad_norm": 2.722348213195801, + "learning_rate": 2.3760351031744567e-07, + "logits/chosen": -0.4927615523338318, + "logits/rejected": -0.5864282846450806, + "logps/chosen": -63.343666076660156, + "logps/rejected": -104.94284057617188, + "loss": 0.6433, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.505676507949829, + "rewards/margins": 8.552114486694336, + "rewards/rejected": -5.046437740325928, + "step": 18016 + }, + { + "epoch": 4.51, + "grad_norm": 5.557985782623291, + "learning_rate": 2.373641547057054e-07, + "logits/chosen": -0.5307995676994324, + "logits/rejected": -0.6482940912246704, + "logps/chosen": -58.44375991821289, + "logps/rejected": -90.2223129272461, + "loss": 0.6, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.228015899658203, + "rewards/margins": 6.927261829376221, + "rewards/rejected": -3.6992459297180176, + "step": 18017 + }, + { + "epoch": 4.51, + "grad_norm": 3.5100955963134766, + "learning_rate": 2.3712491678376304e-07, + "logits/chosen": -0.5108660459518433, + "logits/rejected": -0.5790278911590576, + "logps/chosen": -55.87950134277344, + "logps/rejected": -110.6512451171875, + "loss": 0.5947, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.895249843597412, + "rewards/margins": 7.517630577087402, + "rewards/rejected": -4.62238073348999, + "step": 18018 + }, + { + "epoch": 4.51, + "grad_norm": 2.8923428058624268, + "learning_rate": 2.3688579655753053e-07, + "logits/chosen": -0.525198757648468, + "logits/rejected": -0.6271636486053467, + "logps/chosen": -53.89190673828125, + "logps/rejected": -111.17137908935547, + "loss": 0.5632, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.251174211502075, + "rewards/margins": 7.529850006103516, + "rewards/rejected": -4.278675556182861, + "step": 18019 + }, + { + "epoch": 4.51, + "grad_norm": 6.053790092468262, + "learning_rate": 2.3664679403291534e-07, + "logits/chosen": -0.6124632954597473, + "logits/rejected": -0.6577017307281494, + "logps/chosen": -51.934364318847656, + "logps/rejected": -106.36577606201172, + "loss": 0.6553, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019132137298584, + "rewards/margins": 6.412798881530762, + "rewards/rejected": -3.393666982650757, + "step": 18020 + }, + { + "epoch": 4.51, + "grad_norm": 4.997313976287842, + "learning_rate": 2.3640790921582502e-07, + "logits/chosen": -0.5857208967208862, + "logits/rejected": -0.6557698249816895, + "logps/chosen": -53.8963623046875, + "logps/rejected": -100.23957824707031, + "loss": 0.6875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8950486183166504, + "rewards/margins": 6.546433925628662, + "rewards/rejected": -3.6513853073120117, + "step": 18021 + }, + { + "epoch": 4.51, + "grad_norm": 10.530454635620117, + "learning_rate": 2.3616914211216203e-07, + "logits/chosen": -0.5606403946876526, + "logits/rejected": -0.601170539855957, + "logps/chosen": -49.195674896240234, + "logps/rejected": -123.66357421875, + "loss": 0.6452, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9796557426452637, + "rewards/margins": 7.939329147338867, + "rewards/rejected": -4.9596734046936035, + "step": 18022 + }, + { + "epoch": 4.51, + "grad_norm": 4.573243618011475, + "learning_rate": 2.3593049272782553e-07, + "logits/chosen": -0.5270711779594421, + "logits/rejected": -0.6122384071350098, + "logps/chosen": -56.228492736816406, + "logps/rejected": -96.8878173828125, + "loss": 0.6083, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9891085624694824, + "rewards/margins": 7.704950332641602, + "rewards/rejected": -4.715841293334961, + "step": 18023 + }, + { + "epoch": 4.51, + "grad_norm": 3.1754701137542725, + "learning_rate": 2.3569196106871474e-07, + "logits/chosen": -0.593632698059082, + "logits/rejected": -0.6744757294654846, + "logps/chosen": -47.05726623535156, + "logps/rejected": -98.94768524169922, + "loss": 0.4769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1888492107391357, + "rewards/margins": 7.526616096496582, + "rewards/rejected": -4.337766647338867, + "step": 18024 + }, + { + "epoch": 4.51, + "grad_norm": 4.177467346191406, + "learning_rate": 2.3545354714072265e-07, + "logits/chosen": -0.505428671836853, + "logits/rejected": -0.593449592590332, + "logps/chosen": -57.238216400146484, + "logps/rejected": -92.83912658691406, + "loss": 0.6329, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0901949405670166, + "rewards/margins": 6.5138678550720215, + "rewards/rejected": -3.4236724376678467, + "step": 18025 + }, + { + "epoch": 4.51, + "grad_norm": 8.102323532104492, + "learning_rate": 2.3521525094974185e-07, + "logits/chosen": -0.538021981716156, + "logits/rejected": -0.6352159976959229, + "logps/chosen": -58.036468505859375, + "logps/rejected": -102.35696411132812, + "loss": 0.5989, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0569400787353516, + "rewards/margins": 6.251217365264893, + "rewards/rejected": -3.1942765712738037, + "step": 18026 + }, + { + "epoch": 4.51, + "grad_norm": 8.116854667663574, + "learning_rate": 2.3497707250165979e-07, + "logits/chosen": -0.5785409808158875, + "logits/rejected": -0.6812566518783569, + "logps/chosen": -58.4998664855957, + "logps/rejected": -82.97803497314453, + "loss": 0.6565, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1054933071136475, + "rewards/margins": 6.850826263427734, + "rewards/rejected": -3.745333433151245, + "step": 18027 + }, + { + "epoch": 4.51, + "grad_norm": 13.957700729370117, + "learning_rate": 2.3473901180236293e-07, + "logits/chosen": -0.5390323400497437, + "logits/rejected": -0.6220584511756897, + "logps/chosen": -63.7717170715332, + "logps/rejected": -107.0599136352539, + "loss": 0.6677, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.641035795211792, + "rewards/margins": 6.940722465515137, + "rewards/rejected": -4.299686431884766, + "step": 18028 + }, + { + "epoch": 4.51, + "grad_norm": 5.0919365882873535, + "learning_rate": 2.3450106885773372e-07, + "logits/chosen": -0.6010825037956238, + "logits/rejected": -0.6276295185089111, + "logps/chosen": -53.92781448364258, + "logps/rejected": -113.93607330322266, + "loss": 0.7129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.174595832824707, + "rewards/margins": 7.408530235290527, + "rewards/rejected": -4.2339348793029785, + "step": 18029 + }, + { + "epoch": 4.51, + "grad_norm": 5.152696132659912, + "learning_rate": 2.3426324367365138e-07, + "logits/chosen": -0.5493475198745728, + "logits/rejected": -0.5928162932395935, + "logps/chosen": -59.48880386352539, + "logps/rejected": -102.80135345458984, + "loss": 0.7049, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0971457958221436, + "rewards/margins": 5.892361164093018, + "rewards/rejected": -2.7952160835266113, + "step": 18030 + }, + { + "epoch": 4.51, + "grad_norm": 7.161851406097412, + "learning_rate": 2.34025536255994e-07, + "logits/chosen": -0.5213569402694702, + "logits/rejected": -0.61090087890625, + "logps/chosen": -59.96955871582031, + "logps/rejected": -102.14522552490234, + "loss": 0.7386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.913848638534546, + "rewards/margins": 6.853118896484375, + "rewards/rejected": -3.939269781112671, + "step": 18031 + }, + { + "epoch": 4.51, + "grad_norm": 4.488552570343018, + "learning_rate": 2.337879466106341e-07, + "logits/chosen": -0.5402082204818726, + "logits/rejected": -0.6099855303764343, + "logps/chosen": -49.16924285888672, + "logps/rejected": -104.02490997314453, + "loss": 0.5717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1795105934143066, + "rewards/margins": 7.154676914215088, + "rewards/rejected": -3.9751667976379395, + "step": 18032 + }, + { + "epoch": 4.51, + "grad_norm": 2.8660314083099365, + "learning_rate": 2.3355047474344418e-07, + "logits/chosen": -0.5239416360855103, + "logits/rejected": -0.6215778589248657, + "logps/chosen": -52.48285675048828, + "logps/rejected": -90.33145904541016, + "loss": 0.5253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.916978359222412, + "rewards/margins": 6.66044807434082, + "rewards/rejected": -3.7434699535369873, + "step": 18033 + }, + { + "epoch": 4.51, + "grad_norm": 6.594851016998291, + "learning_rate": 2.3331312066029178e-07, + "logits/chosen": -0.5597674250602722, + "logits/rejected": -0.6479381322860718, + "logps/chosen": -55.895381927490234, + "logps/rejected": -109.07850646972656, + "loss": 0.6579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.511786460876465, + "rewards/margins": 8.097105026245117, + "rewards/rejected": -4.585317611694336, + "step": 18034 + }, + { + "epoch": 4.51, + "grad_norm": 6.411174297332764, + "learning_rate": 2.330758843670422e-07, + "logits/chosen": -0.5097494125366211, + "logits/rejected": -0.595345139503479, + "logps/chosen": -55.597206115722656, + "logps/rejected": -100.33988952636719, + "loss": 0.7294, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.076692581176758, + "rewards/margins": 6.103561878204346, + "rewards/rejected": -3.0268688201904297, + "step": 18035 + }, + { + "epoch": 4.51, + "grad_norm": 6.951145648956299, + "learning_rate": 2.32838765869558e-07, + "logits/chosen": -0.5293090343475342, + "logits/rejected": -0.6019173264503479, + "logps/chosen": -61.91629409790039, + "logps/rejected": -106.0153579711914, + "loss": 0.6534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0362305641174316, + "rewards/margins": 6.135061740875244, + "rewards/rejected": -3.0988309383392334, + "step": 18036 + }, + { + "epoch": 4.51, + "grad_norm": 6.454556465148926, + "learning_rate": 2.3260176517369837e-07, + "logits/chosen": -0.49201253056526184, + "logits/rejected": -0.5700069069862366, + "logps/chosen": -49.808006286621094, + "logps/rejected": -117.8747787475586, + "loss": 0.5248, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.899488925933838, + "rewards/margins": 8.02592945098877, + "rewards/rejected": -5.126440048217773, + "step": 18037 + }, + { + "epoch": 4.51, + "grad_norm": 4.485660552978516, + "learning_rate": 2.3236488228531974e-07, + "logits/chosen": -0.5564563274383545, + "logits/rejected": -0.6087026596069336, + "logps/chosen": -51.2061767578125, + "logps/rejected": -109.34147644042969, + "loss": 0.6483, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1000592708587646, + "rewards/margins": 6.787232398986816, + "rewards/rejected": -3.68717360496521, + "step": 18038 + }, + { + "epoch": 4.51, + "grad_norm": 3.2864768505096436, + "learning_rate": 2.3212811721027572e-07, + "logits/chosen": -0.5997442007064819, + "logits/rejected": -0.6923015713691711, + "logps/chosen": -59.7109375, + "logps/rejected": -102.34518432617188, + "loss": 0.6378, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9303054809570312, + "rewards/margins": 6.970008373260498, + "rewards/rejected": -4.039702892303467, + "step": 18039 + }, + { + "epoch": 4.51, + "grad_norm": 6.249222278594971, + "learning_rate": 2.3189146995441836e-07, + "logits/chosen": -0.5633015632629395, + "logits/rejected": -0.6765713095664978, + "logps/chosen": -53.7389030456543, + "logps/rejected": -91.51573944091797, + "loss": 0.5515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0267746448516846, + "rewards/margins": 7.01993989944458, + "rewards/rejected": -3.9931652545928955, + "step": 18040 + }, + { + "epoch": 4.51, + "grad_norm": 2.6602821350097656, + "learning_rate": 2.316549405235935e-07, + "logits/chosen": -0.5994283556938171, + "logits/rejected": -0.6793729662895203, + "logps/chosen": -39.96746063232422, + "logps/rejected": -108.05506896972656, + "loss": 0.5205, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2563717365264893, + "rewards/margins": 8.536962509155273, + "rewards/rejected": -5.280590534210205, + "step": 18041 + }, + { + "epoch": 4.51, + "grad_norm": 2.3053340911865234, + "learning_rate": 2.3141852892364647e-07, + "logits/chosen": -0.5777515172958374, + "logits/rejected": -0.668330729007721, + "logps/chosen": -52.5316162109375, + "logps/rejected": -96.4017333984375, + "loss": 0.5883, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9007554054260254, + "rewards/margins": 7.412383079528809, + "rewards/rejected": -4.511628150939941, + "step": 18042 + }, + { + "epoch": 4.51, + "grad_norm": 2.2456700801849365, + "learning_rate": 2.3118223516042094e-07, + "logits/chosen": -0.57424396276474, + "logits/rejected": -0.6849274039268494, + "logps/chosen": -56.477394104003906, + "logps/rejected": -108.38191986083984, + "loss": 0.5552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.229414939880371, + "rewards/margins": 7.727529048919678, + "rewards/rejected": -4.498114585876465, + "step": 18043 + }, + { + "epoch": 4.51, + "grad_norm": 4.638193607330322, + "learning_rate": 2.3094605923975388e-07, + "logits/chosen": -0.5742285251617432, + "logits/rejected": -0.6391245126724243, + "logps/chosen": -47.37215042114258, + "logps/rejected": -107.16136169433594, + "loss": 0.7016, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.218656539916992, + "rewards/margins": 6.848132133483887, + "rewards/rejected": -3.6294753551483154, + "step": 18044 + }, + { + "epoch": 4.51, + "grad_norm": 7.45668363571167, + "learning_rate": 2.3071000116748177e-07, + "logits/chosen": -0.5190554261207581, + "logits/rejected": -0.6482127904891968, + "logps/chosen": -63.229270935058594, + "logps/rejected": -100.55519104003906, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5734949111938477, + "rewards/margins": 6.3464226722717285, + "rewards/rejected": -3.772928237915039, + "step": 18045 + }, + { + "epoch": 4.51, + "grad_norm": 6.557450771331787, + "learning_rate": 2.3047406094943937e-07, + "logits/chosen": -0.5242261290550232, + "logits/rejected": -0.6253365278244019, + "logps/chosen": -60.17778015136719, + "logps/rejected": -91.29306030273438, + "loss": 0.7225, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.843048095703125, + "rewards/margins": 6.744797229766846, + "rewards/rejected": -3.901749610900879, + "step": 18046 + }, + { + "epoch": 4.51, + "grad_norm": 3.0145387649536133, + "learning_rate": 2.3023823859145589e-07, + "logits/chosen": -0.5767441987991333, + "logits/rejected": -0.6594420671463013, + "logps/chosen": -57.770408630371094, + "logps/rejected": -109.0027847290039, + "loss": 0.5173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.368847131729126, + "rewards/margins": 7.2942118644714355, + "rewards/rejected": -3.9253644943237305, + "step": 18047 + }, + { + "epoch": 4.51, + "grad_norm": 4.98638391494751, + "learning_rate": 2.3000253409935834e-07, + "logits/chosen": -0.541024923324585, + "logits/rejected": -0.6007997989654541, + "logps/chosen": -53.69124221801758, + "logps/rejected": -125.37164306640625, + "loss": 0.6385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3673417568206787, + "rewards/margins": 8.533174514770508, + "rewards/rejected": -5.16583251953125, + "step": 18048 + }, + { + "epoch": 4.52, + "grad_norm": 5.120129108428955, + "learning_rate": 2.2976694747897266e-07, + "logits/chosen": -0.4748324453830719, + "logits/rejected": -0.5909423828125, + "logps/chosen": -55.921478271484375, + "logps/rejected": -106.93241119384766, + "loss": 0.6357, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.915442943572998, + "rewards/margins": 7.176333904266357, + "rewards/rejected": -4.260890960693359, + "step": 18049 + }, + { + "epoch": 4.52, + "grad_norm": 6.197268486022949, + "learning_rate": 2.2953147873611915e-07, + "logits/chosen": -0.6284090876579285, + "logits/rejected": -0.7341179847717285, + "logps/chosen": -57.287261962890625, + "logps/rejected": -104.33319091796875, + "loss": 0.658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8775434494018555, + "rewards/margins": 7.236917018890381, + "rewards/rejected": -4.359374046325684, + "step": 18050 + }, + { + "epoch": 4.52, + "grad_norm": 4.500795364379883, + "learning_rate": 2.2929612787661647e-07, + "logits/chosen": -0.5255533456802368, + "logits/rejected": -0.6042565703392029, + "logps/chosen": -46.995574951171875, + "logps/rejected": -96.3414535522461, + "loss": 0.5236, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2328600883483887, + "rewards/margins": 6.935319900512695, + "rewards/rejected": -3.7024595737457275, + "step": 18051 + }, + { + "epoch": 4.52, + "grad_norm": 3.9844601154327393, + "learning_rate": 2.2906089490628113e-07, + "logits/chosen": -0.6262239217758179, + "logits/rejected": -0.7009038925170898, + "logps/chosen": -52.18262481689453, + "logps/rejected": -98.23217010498047, + "loss": 0.6651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2974157333374023, + "rewards/margins": 6.4787187576293945, + "rewards/rejected": -3.181303024291992, + "step": 18052 + }, + { + "epoch": 4.52, + "grad_norm": 19.132883071899414, + "learning_rate": 2.2882577983092514e-07, + "logits/chosen": -0.48907777667045593, + "logits/rejected": -0.5707826614379883, + "logps/chosen": -60.731407165527344, + "logps/rejected": -105.7355728149414, + "loss": 0.6989, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5564403533935547, + "rewards/margins": 7.641444206237793, + "rewards/rejected": -4.085003852844238, + "step": 18053 + }, + { + "epoch": 4.52, + "grad_norm": 4.257443904876709, + "learning_rate": 2.2859078265635992e-07, + "logits/chosen": -0.484511137008667, + "logits/rejected": -0.5480328798294067, + "logps/chosen": -44.70219039916992, + "logps/rejected": -91.07373046875, + "loss": 0.558, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0216574668884277, + "rewards/margins": 5.662495136260986, + "rewards/rejected": -2.6408376693725586, + "step": 18054 + }, + { + "epoch": 4.52, + "grad_norm": 4.9488935470581055, + "learning_rate": 2.2835590338839142e-07, + "logits/chosen": -0.5396804809570312, + "logits/rejected": -0.619318425655365, + "logps/chosen": -65.50572967529297, + "logps/rejected": -107.05104064941406, + "loss": 0.6832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.027434825897217, + "rewards/margins": 6.592848777770996, + "rewards/rejected": -3.5654139518737793, + "step": 18055 + }, + { + "epoch": 4.52, + "grad_norm": 3.464986801147461, + "learning_rate": 2.2812114203282331e-07, + "logits/chosen": -0.47899624705314636, + "logits/rejected": -0.5571226477622986, + "logps/chosen": -51.17765808105469, + "logps/rejected": -124.28108215332031, + "loss": 0.5685, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.15669322013855, + "rewards/margins": 8.777298927307129, + "rewards/rejected": -5.620606899261475, + "step": 18056 + }, + { + "epoch": 4.52, + "grad_norm": 5.266774654388428, + "learning_rate": 2.2788649859545708e-07, + "logits/chosen": -0.6086905002593994, + "logits/rejected": -0.685370683670044, + "logps/chosen": -52.803688049316406, + "logps/rejected": -97.34086608886719, + "loss": 0.6596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.201730728149414, + "rewards/margins": 6.713832855224609, + "rewards/rejected": -3.5121023654937744, + "step": 18057 + }, + { + "epoch": 4.52, + "grad_norm": 6.03307580947876, + "learning_rate": 2.2765197308209307e-07, + "logits/chosen": -0.5698687434196472, + "logits/rejected": -0.6340034604072571, + "logps/chosen": -49.600399017333984, + "logps/rejected": -113.54366302490234, + "loss": 0.5889, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1154110431671143, + "rewards/margins": 6.974608898162842, + "rewards/rejected": -3.859198570251465, + "step": 18058 + }, + { + "epoch": 4.52, + "grad_norm": 8.380008697509766, + "learning_rate": 2.2741756549852335e-07, + "logits/chosen": -0.5818980932235718, + "logits/rejected": -0.6201450228691101, + "logps/chosen": -53.56182098388672, + "logps/rejected": -103.00308990478516, + "loss": 0.8458, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.808213472366333, + "rewards/margins": 7.107322692871094, + "rewards/rejected": -4.299108982086182, + "step": 18059 + }, + { + "epoch": 4.52, + "grad_norm": 8.06897258758545, + "learning_rate": 2.2718327585054156e-07, + "logits/chosen": -0.5218124389648438, + "logits/rejected": -0.6440371870994568, + "logps/chosen": -51.14857482910156, + "logps/rejected": -96.52783203125, + "loss": 0.5906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.24780535697937, + "rewards/margins": 7.505227088928223, + "rewards/rejected": -4.257421016693115, + "step": 18060 + }, + { + "epoch": 4.52, + "grad_norm": 8.392592430114746, + "learning_rate": 2.2694910414393867e-07, + "logits/chosen": -0.5535960793495178, + "logits/rejected": -0.6201497912406921, + "logps/chosen": -53.56790542602539, + "logps/rejected": -100.31441497802734, + "loss": 0.6008, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.136519432067871, + "rewards/margins": 6.704853057861328, + "rewards/rejected": -3.568333625793457, + "step": 18061 + }, + { + "epoch": 4.52, + "grad_norm": 6.456925868988037, + "learning_rate": 2.267150503845006e-07, + "logits/chosen": -0.468112587928772, + "logits/rejected": -0.594147264957428, + "logps/chosen": -67.85711669921875, + "logps/rejected": -108.98477935791016, + "loss": 0.7352, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7587730884552, + "rewards/margins": 6.998417377471924, + "rewards/rejected": -4.2396440505981445, + "step": 18062 + }, + { + "epoch": 4.52, + "grad_norm": 3.2829127311706543, + "learning_rate": 2.2648111457800937e-07, + "logits/chosen": -0.6128174662590027, + "logits/rejected": -0.6236135363578796, + "logps/chosen": -41.34604263305664, + "logps/rejected": -113.96768188476562, + "loss": 0.5906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2907371520996094, + "rewards/margins": 7.894931793212891, + "rewards/rejected": -4.6041951179504395, + "step": 18063 + }, + { + "epoch": 4.52, + "grad_norm": 3.9025518894195557, + "learning_rate": 2.2624729673024814e-07, + "logits/chosen": -0.5428569912910461, + "logits/rejected": -0.6564920544624329, + "logps/chosen": -59.891754150390625, + "logps/rejected": -108.40888214111328, + "loss": 0.6167, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9968831539154053, + "rewards/margins": 7.900345802307129, + "rewards/rejected": -4.903461933135986, + "step": 18064 + }, + { + "epoch": 4.52, + "grad_norm": 3.645667791366577, + "learning_rate": 2.2601359684699398e-07, + "logits/chosen": -0.5680361986160278, + "logits/rejected": -0.6375316381454468, + "logps/chosen": -43.02733612060547, + "logps/rejected": -99.20128631591797, + "loss": 0.5478, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.419222831726074, + "rewards/margins": 6.763102054595947, + "rewards/rejected": -3.343879222869873, + "step": 18065 + }, + { + "epoch": 4.52, + "grad_norm": 8.158973693847656, + "learning_rate": 2.2578001493402058e-07, + "logits/chosen": -0.49862435460090637, + "logits/rejected": -0.6055192947387695, + "logps/chosen": -60.27357482910156, + "logps/rejected": -102.67554473876953, + "loss": 0.6289, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.016254425048828, + "rewards/margins": 7.284218788146973, + "rewards/rejected": -4.2679643630981445, + "step": 18066 + }, + { + "epoch": 4.52, + "grad_norm": 23.204038619995117, + "learning_rate": 2.255465509971022e-07, + "logits/chosen": -0.6057515740394592, + "logits/rejected": -0.671367883682251, + "logps/chosen": -53.224334716796875, + "logps/rejected": -103.34823608398438, + "loss": 0.8386, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8779077529907227, + "rewards/margins": 7.6347479820251465, + "rewards/rejected": -4.756839752197266, + "step": 18067 + }, + { + "epoch": 4.52, + "grad_norm": 5.275785446166992, + "learning_rate": 2.2531320504200648e-07, + "logits/chosen": -0.5459113717079163, + "logits/rejected": -0.6234951019287109, + "logps/chosen": -58.27943420410156, + "logps/rejected": -125.62960052490234, + "loss": 0.6247, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2503106594085693, + "rewards/margins": 7.89901876449585, + "rewards/rejected": -4.648708343505859, + "step": 18068 + }, + { + "epoch": 4.52, + "grad_norm": 3.5260422229766846, + "learning_rate": 2.2507997707449935e-07, + "logits/chosen": -0.5583796501159668, + "logits/rejected": -0.6403113603591919, + "logps/chosen": -47.8548469543457, + "logps/rejected": -118.66639709472656, + "loss": 0.5315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0120418071746826, + "rewards/margins": 7.086030006408691, + "rewards/rejected": -4.073988437652588, + "step": 18069 + }, + { + "epoch": 4.52, + "grad_norm": 4.738791465759277, + "learning_rate": 2.248468671003462e-07, + "logits/chosen": -0.5641577839851379, + "logits/rejected": -0.6636638641357422, + "logps/chosen": -50.136470794677734, + "logps/rejected": -84.22835540771484, + "loss": 0.6095, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0548532009124756, + "rewards/margins": 6.5091047286987305, + "rewards/rejected": -3.454251289367676, + "step": 18070 + }, + { + "epoch": 4.52, + "grad_norm": 3.245802640914917, + "learning_rate": 2.2461387512530463e-07, + "logits/chosen": -0.5268917083740234, + "logits/rejected": -0.6006456613540649, + "logps/chosen": -55.2191276550293, + "logps/rejected": -110.06949615478516, + "loss": 0.6225, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34696626663208, + "rewards/margins": 7.035688400268555, + "rewards/rejected": -3.6887214183807373, + "step": 18071 + }, + { + "epoch": 4.52, + "grad_norm": 3.5666627883911133, + "learning_rate": 2.2438100115513451e-07, + "logits/chosen": -0.548082709312439, + "logits/rejected": -0.6271729469299316, + "logps/chosen": -57.39589309692383, + "logps/rejected": -93.05760955810547, + "loss": 0.6492, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.207834482192993, + "rewards/margins": 7.419472694396973, + "rewards/rejected": -4.211637496948242, + "step": 18072 + }, + { + "epoch": 4.52, + "grad_norm": 3.012587785720825, + "learning_rate": 2.2414824519558842e-07, + "logits/chosen": -0.4627096652984619, + "logits/rejected": -0.5640379190444946, + "logps/chosen": -68.0488052368164, + "logps/rejected": -91.86214447021484, + "loss": 0.5821, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1468467712402344, + "rewards/margins": 6.339797496795654, + "rewards/rejected": -3.1929502487182617, + "step": 18073 + }, + { + "epoch": 4.52, + "grad_norm": 4.988044261932373, + "learning_rate": 2.2391560725242011e-07, + "logits/chosen": -0.5579917430877686, + "logits/rejected": -0.6280252933502197, + "logps/chosen": -58.70362854003906, + "logps/rejected": -123.59156036376953, + "loss": 0.6304, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1221816539764404, + "rewards/margins": 8.215938568115234, + "rewards/rejected": -5.093756198883057, + "step": 18074 + }, + { + "epoch": 4.52, + "grad_norm": 4.0736470222473145, + "learning_rate": 2.236830873313772e-07, + "logits/chosen": -0.5331090688705444, + "logits/rejected": -0.5995153784751892, + "logps/chosen": -62.3892822265625, + "logps/rejected": -122.65184020996094, + "loss": 0.5647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.542686939239502, + "rewards/margins": 8.343217849731445, + "rewards/rejected": -4.800531387329102, + "step": 18075 + }, + { + "epoch": 4.52, + "grad_norm": 5.337796211242676, + "learning_rate": 2.2345068543820458e-07, + "logits/chosen": -0.5544072985649109, + "logits/rejected": -0.6409887671470642, + "logps/chosen": -49.385040283203125, + "logps/rejected": -138.20791625976562, + "loss": 0.5836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.035388231277466, + "rewards/margins": 8.231135368347168, + "rewards/rejected": -5.195747375488281, + "step": 18076 + }, + { + "epoch": 4.52, + "grad_norm": 4.2947492599487305, + "learning_rate": 2.2321840157864704e-07, + "logits/chosen": -0.6306895613670349, + "logits/rejected": -0.6552940607070923, + "logps/chosen": -50.95002365112305, + "logps/rejected": -120.32698059082031, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.049311399459839, + "rewards/margins": 6.980010032653809, + "rewards/rejected": -3.9306983947753906, + "step": 18077 + }, + { + "epoch": 4.52, + "grad_norm": 5.699289798736572, + "learning_rate": 2.229862357584428e-07, + "logits/chosen": -0.567092776298523, + "logits/rejected": -0.6399851441383362, + "logps/chosen": -68.2867660522461, + "logps/rejected": -116.4593505859375, + "loss": 0.6372, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.254513740539551, + "rewards/margins": 7.738411903381348, + "rewards/rejected": -4.483898162841797, + "step": 18078 + }, + { + "epoch": 4.52, + "grad_norm": 3.3747446537017822, + "learning_rate": 2.2275418798333058e-07, + "logits/chosen": -0.577804684638977, + "logits/rejected": -0.6744840145111084, + "logps/chosen": -50.994232177734375, + "logps/rejected": -98.5547103881836, + "loss": 0.5173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1768765449523926, + "rewards/margins": 6.889906406402588, + "rewards/rejected": -3.713029384613037, + "step": 18079 + }, + { + "epoch": 4.52, + "grad_norm": 34.49937057495117, + "learning_rate": 2.2252225825904305e-07, + "logits/chosen": -0.5325483083724976, + "logits/rejected": -0.6207387447357178, + "logps/chosen": -60.7781867980957, + "logps/rejected": -125.51005554199219, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4806969165802, + "rewards/margins": 8.5402250289917, + "rewards/rejected": -6.05952787399292, + "step": 18080 + }, + { + "epoch": 4.52, + "grad_norm": 3.849200487136841, + "learning_rate": 2.2229044659131226e-07, + "logits/chosen": -0.476718008518219, + "logits/rejected": -0.5362215638160706, + "logps/chosen": -59.43244934082031, + "logps/rejected": -121.17241668701172, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0740914344787598, + "rewards/margins": 7.052157402038574, + "rewards/rejected": -3.9780659675598145, + "step": 18081 + }, + { + "epoch": 4.52, + "grad_norm": 24.584033966064453, + "learning_rate": 2.2205875298586587e-07, + "logits/chosen": -0.5910282731056213, + "logits/rejected": -0.6852928400039673, + "logps/chosen": -61.39875030517578, + "logps/rejected": -119.4200439453125, + "loss": 0.5855, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9751834869384766, + "rewards/margins": 8.305429458618164, + "rewards/rejected": -5.330246925354004, + "step": 18082 + }, + { + "epoch": 4.52, + "grad_norm": 2.3926193714141846, + "learning_rate": 2.2182717744843097e-07, + "logits/chosen": -0.6314378976821899, + "logits/rejected": -0.6768429279327393, + "logps/chosen": -43.386592864990234, + "logps/rejected": -110.20482635498047, + "loss": 0.4602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6363401412963867, + "rewards/margins": 8.033058166503906, + "rewards/rejected": -4.396717548370361, + "step": 18083 + }, + { + "epoch": 4.52, + "grad_norm": 8.28221321105957, + "learning_rate": 2.215957199847274e-07, + "logits/chosen": -0.5325127243995667, + "logits/rejected": -0.5938687324523926, + "logps/chosen": -49.628684997558594, + "logps/rejected": -102.846435546875, + "loss": 0.6905, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.443899631500244, + "rewards/margins": 5.593226909637451, + "rewards/rejected": -2.1493277549743652, + "step": 18084 + }, + { + "epoch": 4.52, + "grad_norm": 5.178305149078369, + "learning_rate": 2.2136438060047672e-07, + "logits/chosen": -0.5035919547080994, + "logits/rejected": -0.5947823524475098, + "logps/chosen": -65.48355865478516, + "logps/rejected": -114.82963562011719, + "loss": 0.7024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2720091342926025, + "rewards/margins": 8.29656982421875, + "rewards/rejected": -5.024560451507568, + "step": 18085 + }, + { + "epoch": 4.52, + "grad_norm": 4.19119930267334, + "learning_rate": 2.21133159301396e-07, + "logits/chosen": -0.5769132971763611, + "logits/rejected": -0.6932175159454346, + "logps/chosen": -53.51988983154297, + "logps/rejected": -99.912841796875, + "loss": 0.6093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4055473804473877, + "rewards/margins": 6.768845558166504, + "rewards/rejected": -3.3632984161376953, + "step": 18086 + }, + { + "epoch": 4.52, + "grad_norm": 3.493604898452759, + "learning_rate": 2.2090205609319625e-07, + "logits/chosen": -0.5148437023162842, + "logits/rejected": -0.6117846369743347, + "logps/chosen": -51.89933776855469, + "logps/rejected": -104.67693328857422, + "loss": 0.5446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2226791381835938, + "rewards/margins": 8.017647743225098, + "rewards/rejected": -4.794969081878662, + "step": 18087 + }, + { + "epoch": 4.52, + "grad_norm": 3.103940725326538, + "learning_rate": 2.206710709815907e-07, + "logits/chosen": -0.6210730075836182, + "logits/rejected": -0.7205585241317749, + "logps/chosen": -65.70823669433594, + "logps/rejected": -114.46534729003906, + "loss": 0.6212, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0068612098693848, + "rewards/margins": 8.156842231750488, + "rewards/rejected": -5.149981498718262, + "step": 18088 + }, + { + "epoch": 4.53, + "grad_norm": 9.177495956420898, + "learning_rate": 2.2044020397228639e-07, + "logits/chosen": -0.6691986322402954, + "logits/rejected": -0.7173813581466675, + "logps/chosen": -52.183448791503906, + "logps/rejected": -117.44572448730469, + "loss": 0.5759, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0284173488616943, + "rewards/margins": 7.107802391052246, + "rewards/rejected": -4.0793843269348145, + "step": 18089 + }, + { + "epoch": 4.53, + "grad_norm": 2.6910107135772705, + "learning_rate": 2.202094550709888e-07, + "logits/chosen": -0.6142699718475342, + "logits/rejected": -0.667628824710846, + "logps/chosen": -50.15138244628906, + "logps/rejected": -104.3127670288086, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.170395851135254, + "rewards/margins": 7.612552642822266, + "rewards/rejected": -4.4421563148498535, + "step": 18090 + }, + { + "epoch": 4.53, + "grad_norm": 22.098133087158203, + "learning_rate": 2.199788242833989e-07, + "logits/chosen": -0.5146306753158569, + "logits/rejected": -0.6265419125556946, + "logps/chosen": -55.0621337890625, + "logps/rejected": -99.57441711425781, + "loss": 0.6164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.150383949279785, + "rewards/margins": 7.10060977935791, + "rewards/rejected": -3.9502267837524414, + "step": 18091 + }, + { + "epoch": 4.53, + "grad_norm": 3.6047141551971436, + "learning_rate": 2.1974831161521714e-07, + "logits/chosen": -0.5756320953369141, + "logits/rejected": -0.6435893177986145, + "logps/chosen": -59.059810638427734, + "logps/rejected": -112.26255798339844, + "loss": 0.56, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.910586357116699, + "rewards/margins": 7.397648811340332, + "rewards/rejected": -4.487062931060791, + "step": 18092 + }, + { + "epoch": 4.53, + "grad_norm": 3.3434441089630127, + "learning_rate": 2.1951791707213897e-07, + "logits/chosen": -0.5529803037643433, + "logits/rejected": -0.6448667645454407, + "logps/chosen": -51.90983963012695, + "logps/rejected": -100.7652359008789, + "loss": 0.531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.46335768699646, + "rewards/margins": 7.938406944274902, + "rewards/rejected": -4.475048542022705, + "step": 18093 + }, + { + "epoch": 4.53, + "grad_norm": 21.356760025024414, + "learning_rate": 2.1928764065985763e-07, + "logits/chosen": -0.4819853603839874, + "logits/rejected": -0.5859777927398682, + "logps/chosen": -62.069610595703125, + "logps/rejected": -101.48725891113281, + "loss": 0.7838, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.77891206741333, + "rewards/margins": 6.888778209686279, + "rewards/rejected": -4.109865665435791, + "step": 18094 + }, + { + "epoch": 4.53, + "grad_norm": 9.041935920715332, + "learning_rate": 2.190574823840641e-07, + "logits/chosen": -0.4840790629386902, + "logits/rejected": -0.5567032098770142, + "logps/chosen": -70.86643981933594, + "logps/rejected": -109.47294616699219, + "loss": 0.6335, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8745877742767334, + "rewards/margins": 6.167941570281982, + "rewards/rejected": -3.29335355758667, + "step": 18095 + }, + { + "epoch": 4.53, + "grad_norm": 3.501593828201294, + "learning_rate": 2.188274422504444e-07, + "logits/chosen": -0.5432595014572144, + "logits/rejected": -0.5886929631233215, + "logps/chosen": -48.503570556640625, + "logps/rejected": -112.12391662597656, + "loss": 0.5294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.234726905822754, + "rewards/margins": 7.367813587188721, + "rewards/rejected": -4.133086681365967, + "step": 18096 + }, + { + "epoch": 4.53, + "grad_norm": 4.586040019989014, + "learning_rate": 2.1859752026468505e-07, + "logits/chosen": -0.5957831740379333, + "logits/rejected": -0.7020744681358337, + "logps/chosen": -66.3763427734375, + "logps/rejected": -108.60507202148438, + "loss": 0.632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0864622592926025, + "rewards/margins": 7.22331428527832, + "rewards/rejected": -4.136852264404297, + "step": 18097 + }, + { + "epoch": 4.53, + "grad_norm": 10.15395736694336, + "learning_rate": 2.1836771643246656e-07, + "logits/chosen": -0.5932123064994812, + "logits/rejected": -0.6810594797134399, + "logps/chosen": -66.87189483642578, + "logps/rejected": -109.31387329101562, + "loss": 0.7385, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.015883684158325, + "rewards/margins": 6.5692362785339355, + "rewards/rejected": -3.5533533096313477, + "step": 18098 + }, + { + "epoch": 4.53, + "grad_norm": 12.18350887298584, + "learning_rate": 2.1813803075946715e-07, + "logits/chosen": -0.58524090051651, + "logits/rejected": -0.6450861692428589, + "logps/chosen": -48.169227600097656, + "logps/rejected": -95.92386627197266, + "loss": 0.6695, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.831066131591797, + "rewards/margins": 7.139309883117676, + "rewards/rejected": -4.308244228363037, + "step": 18099 + }, + { + "epoch": 4.53, + "grad_norm": 5.001995086669922, + "learning_rate": 2.1790846325136394e-07, + "logits/chosen": -0.6170086860656738, + "logits/rejected": -0.6276331543922424, + "logps/chosen": -45.307891845703125, + "logps/rejected": -110.738037109375, + "loss": 0.5559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.742844820022583, + "rewards/margins": 7.592235565185547, + "rewards/rejected": -3.849390983581543, + "step": 18100 + }, + { + "epoch": 4.53, + "grad_norm": 4.822690010070801, + "learning_rate": 2.1767901391382906e-07, + "logits/chosen": -0.5504865050315857, + "logits/rejected": -0.6187449097633362, + "logps/chosen": -50.43043899536133, + "logps/rejected": -108.22976684570312, + "loss": 0.6458, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1224863529205322, + "rewards/margins": 7.100685119628906, + "rewards/rejected": -3.978199005126953, + "step": 18101 + }, + { + "epoch": 4.53, + "grad_norm": 5.029607772827148, + "learning_rate": 2.1744968275253185e-07, + "logits/chosen": -0.580588698387146, + "logits/rejected": -0.6750510334968567, + "logps/chosen": -52.584190368652344, + "logps/rejected": -96.45355224609375, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0829365253448486, + "rewards/margins": 7.228955268859863, + "rewards/rejected": -4.146019458770752, + "step": 18102 + }, + { + "epoch": 4.53, + "grad_norm": 5.999867916107178, + "learning_rate": 2.1722046977314003e-07, + "logits/chosen": -0.5900463461875916, + "logits/rejected": -0.6519694328308105, + "logps/chosen": -53.060829162597656, + "logps/rejected": -110.87893676757812, + "loss": 0.6511, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.002471446990967, + "rewards/margins": 6.820037841796875, + "rewards/rejected": -3.817566394805908, + "step": 18103 + }, + { + "epoch": 4.53, + "grad_norm": 8.328104972839355, + "learning_rate": 2.1699137498131794e-07, + "logits/chosen": -0.48487746715545654, + "logits/rejected": -0.5173153877258301, + "logps/chosen": -54.80141830444336, + "logps/rejected": -107.22148132324219, + "loss": 0.6691, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9675509929656982, + "rewards/margins": 6.8440446853637695, + "rewards/rejected": -3.8764939308166504, + "step": 18104 + }, + { + "epoch": 4.53, + "grad_norm": 3.0516107082366943, + "learning_rate": 2.167623983827266e-07, + "logits/chosen": -0.5082618594169617, + "logits/rejected": -0.5918009281158447, + "logps/chosen": -58.69910430908203, + "logps/rejected": -116.5341567993164, + "loss": 0.5153, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.138274908065796, + "rewards/margins": 7.589959621429443, + "rewards/rejected": -4.451684474945068, + "step": 18105 + }, + { + "epoch": 4.53, + "grad_norm": 5.973672866821289, + "learning_rate": 2.1653353998302374e-07, + "logits/chosen": -0.6046835780143738, + "logits/rejected": -0.6522313952445984, + "logps/chosen": -43.587684631347656, + "logps/rejected": -117.86689758300781, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.473820686340332, + "rewards/margins": 7.903960227966309, + "rewards/rejected": -4.430139064788818, + "step": 18106 + }, + { + "epoch": 4.53, + "grad_norm": 8.986477851867676, + "learning_rate": 2.1630479978786533e-07, + "logits/chosen": -0.4927973747253418, + "logits/rejected": -0.5550177097320557, + "logps/chosen": -60.617000579833984, + "logps/rejected": -105.96440887451172, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8925511837005615, + "rewards/margins": 6.126130104064941, + "rewards/rejected": -3.233578681945801, + "step": 18107 + }, + { + "epoch": 4.53, + "grad_norm": 8.191537857055664, + "learning_rate": 2.1607617780290302e-07, + "logits/chosen": -0.6135351657867432, + "logits/rejected": -0.6617602109909058, + "logps/chosen": -47.01593017578125, + "logps/rejected": -110.29661560058594, + "loss": 0.5564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.192704200744629, + "rewards/margins": 7.778127670288086, + "rewards/rejected": -4.585422992706299, + "step": 18108 + }, + { + "epoch": 4.53, + "grad_norm": 16.622907638549805, + "learning_rate": 2.158476740337867e-07, + "logits/chosen": -0.446902871131897, + "logits/rejected": -0.5367322564125061, + "logps/chosen": -54.144439697265625, + "logps/rejected": -98.55657958984375, + "loss": 0.7692, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0232839584350586, + "rewards/margins": 5.711511135101318, + "rewards/rejected": -2.6882271766662598, + "step": 18109 + }, + { + "epoch": 4.53, + "grad_norm": 3.6799206733703613, + "learning_rate": 2.1561928848616354e-07, + "logits/chosen": -0.478825181722641, + "logits/rejected": -0.587565541267395, + "logps/chosen": -54.040470123291016, + "logps/rejected": -104.78773498535156, + "loss": 0.5473, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2360057830810547, + "rewards/margins": 7.417306900024414, + "rewards/rejected": -4.181300640106201, + "step": 18110 + }, + { + "epoch": 4.53, + "grad_norm": 2.2780392169952393, + "learning_rate": 2.1539102116567678e-07, + "logits/chosen": -0.4946371912956238, + "logits/rejected": -0.6086394190788269, + "logps/chosen": -56.443660736083984, + "logps/rejected": -110.48795318603516, + "loss": 0.5148, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.887197971343994, + "rewards/margins": 8.122334480285645, + "rewards/rejected": -5.23513650894165, + "step": 18111 + }, + { + "epoch": 4.53, + "grad_norm": 3.266347646713257, + "learning_rate": 2.1516287207796638e-07, + "logits/chosen": -0.5987461805343628, + "logits/rejected": -0.6549955010414124, + "logps/chosen": -48.01771545410156, + "logps/rejected": -122.22175598144531, + "loss": 0.5985, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7893309593200684, + "rewards/margins": 7.741383075714111, + "rewards/rejected": -4.952052116394043, + "step": 18112 + }, + { + "epoch": 4.53, + "grad_norm": 4.055827617645264, + "learning_rate": 2.1493484122867115e-07, + "logits/chosen": -0.5168880224227905, + "logits/rejected": -0.6353827714920044, + "logps/chosen": -58.148834228515625, + "logps/rejected": -90.29158020019531, + "loss": 0.6265, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.05076265335083, + "rewards/margins": 7.768826484680176, + "rewards/rejected": -4.718063831329346, + "step": 18113 + }, + { + "epoch": 4.53, + "grad_norm": 18.39044189453125, + "learning_rate": 2.147069286234249e-07, + "logits/chosen": -0.5965362787246704, + "logits/rejected": -0.6518856883049011, + "logps/chosen": -54.093589782714844, + "logps/rejected": -118.55924224853516, + "loss": 0.6504, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.721357822418213, + "rewards/margins": 6.3940324783325195, + "rewards/rejected": -3.6726744174957275, + "step": 18114 + }, + { + "epoch": 4.53, + "grad_norm": 4.045774459838867, + "learning_rate": 2.144791342678615e-07, + "logits/chosen": -0.5968462824821472, + "logits/rejected": -0.7091533541679382, + "logps/chosen": -45.240943908691406, + "logps/rejected": -90.36099243164062, + "loss": 0.5404, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0879886150360107, + "rewards/margins": 7.823382377624512, + "rewards/rejected": -4.735393524169922, + "step": 18115 + }, + { + "epoch": 4.53, + "grad_norm": 6.837914943695068, + "learning_rate": 2.1425145816760752e-07, + "logits/chosen": -0.5975051522254944, + "logits/rejected": -0.7011789083480835, + "logps/chosen": -62.16290283203125, + "logps/rejected": -103.57345581054688, + "loss": 0.6659, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.152993679046631, + "rewards/margins": 6.7369465827941895, + "rewards/rejected": -3.5839529037475586, + "step": 18116 + }, + { + "epoch": 4.53, + "grad_norm": 4.394484043121338, + "learning_rate": 2.1402390032829122e-07, + "logits/chosen": -0.5521162748336792, + "logits/rejected": -0.6375182867050171, + "logps/chosen": -58.40580368041992, + "logps/rejected": -95.35413360595703, + "loss": 0.6397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.286374092102051, + "rewards/margins": 6.831267833709717, + "rewards/rejected": -3.544893980026245, + "step": 18117 + }, + { + "epoch": 4.53, + "grad_norm": 6.275394439697266, + "learning_rate": 2.137964607555354e-07, + "logits/chosen": -0.543265163898468, + "logits/rejected": -0.6168389320373535, + "logps/chosen": -48.72956085205078, + "logps/rejected": -130.2290496826172, + "loss": 0.6052, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0544698238372803, + "rewards/margins": 8.763408660888672, + "rewards/rejected": -5.7089385986328125, + "step": 18118 + }, + { + "epoch": 4.53, + "grad_norm": 5.0919671058654785, + "learning_rate": 2.1356913945495884e-07, + "logits/chosen": -0.5640085935592651, + "logits/rejected": -0.6587904095649719, + "logps/chosen": -51.104984283447266, + "logps/rejected": -106.64820098876953, + "loss": 0.5636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.915088653564453, + "rewards/margins": 8.025748252868652, + "rewards/rejected": -5.110659599304199, + "step": 18119 + }, + { + "epoch": 4.53, + "grad_norm": 6.815273761749268, + "learning_rate": 2.1334193643218036e-07, + "logits/chosen": -0.6461464762687683, + "logits/rejected": -0.7136867046356201, + "logps/chosen": -52.12687301635742, + "logps/rejected": -103.9113540649414, + "loss": 0.5753, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6660780906677246, + "rewards/margins": 7.333275318145752, + "rewards/rejected": -3.6671974658966064, + "step": 18120 + }, + { + "epoch": 4.53, + "grad_norm": 4.061889171600342, + "learning_rate": 2.1311485169281332e-07, + "logits/chosen": -0.5815656781196594, + "logits/rejected": -0.6862147450447083, + "logps/chosen": -51.15701675415039, + "logps/rejected": -103.32994842529297, + "loss": 0.5491, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.279160499572754, + "rewards/margins": 7.828887462615967, + "rewards/rejected": -4.549726963043213, + "step": 18121 + }, + { + "epoch": 4.53, + "grad_norm": 5.263529300689697, + "learning_rate": 2.1288788524247094e-07, + "logits/chosen": -0.5772293210029602, + "logits/rejected": -0.6672173738479614, + "logps/chosen": -52.5686149597168, + "logps/rejected": -107.1229248046875, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.175412893295288, + "rewards/margins": 7.3989644050598145, + "rewards/rejected": -4.2235517501831055, + "step": 18122 + }, + { + "epoch": 4.53, + "grad_norm": 5.595513820648193, + "learning_rate": 2.1266103708676045e-07, + "logits/chosen": -0.5592941045761108, + "logits/rejected": -0.6253621578216553, + "logps/chosen": -53.05693817138672, + "logps/rejected": -104.37457275390625, + "loss": 0.6266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0229005813598633, + "rewards/margins": 6.4109954833984375, + "rewards/rejected": -3.388094902038574, + "step": 18123 + }, + { + "epoch": 4.53, + "grad_norm": 9.463604927062988, + "learning_rate": 2.1243430723128733e-07, + "logits/chosen": -0.5883041620254517, + "logits/rejected": -0.6453699469566345, + "logps/chosen": -45.02053451538086, + "logps/rejected": -113.99703216552734, + "loss": 0.6714, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1661975383758545, + "rewards/margins": 6.625226020812988, + "rewards/rejected": -3.459028482437134, + "step": 18124 + }, + { + "epoch": 4.53, + "grad_norm": 5.056805610656738, + "learning_rate": 2.122076956816549e-07, + "logits/chosen": -0.581008791923523, + "logits/rejected": -0.6797452569007874, + "logps/chosen": -56.258636474609375, + "logps/rejected": -105.20513153076172, + "loss": 0.5753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.351626396179199, + "rewards/margins": 7.492623329162598, + "rewards/rejected": -4.140997886657715, + "step": 18125 + }, + { + "epoch": 4.53, + "grad_norm": 4.412419319152832, + "learning_rate": 2.119812024434631e-07, + "logits/chosen": -0.5328391194343567, + "logits/rejected": -0.5803390145301819, + "logps/chosen": -57.45967483520508, + "logps/rejected": -116.04830932617188, + "loss": 0.6489, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0307443141937256, + "rewards/margins": 7.088566303253174, + "rewards/rejected": -4.057822227478027, + "step": 18126 + }, + { + "epoch": 4.53, + "grad_norm": 4.625634670257568, + "learning_rate": 2.1175482752230803e-07, + "logits/chosen": -0.5070939660072327, + "logits/rejected": -0.5608716011047363, + "logps/chosen": -47.030006408691406, + "logps/rejected": -115.09454345703125, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.263758659362793, + "rewards/margins": 7.486930847167969, + "rewards/rejected": -4.223172664642334, + "step": 18127 + }, + { + "epoch": 4.53, + "grad_norm": 10.65234375, + "learning_rate": 2.115285709237841e-07, + "logits/chosen": -0.6040353775024414, + "logits/rejected": -0.6720906496047974, + "logps/chosen": -52.49555969238281, + "logps/rejected": -114.05159759521484, + "loss": 0.608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5629730224609375, + "rewards/margins": 8.429798126220703, + "rewards/rejected": -4.866825103759766, + "step": 18128 + }, + { + "epoch": 4.54, + "grad_norm": 6.796176910400391, + "learning_rate": 2.1130243265348404e-07, + "logits/chosen": -0.6125349402427673, + "logits/rejected": -0.658605694770813, + "logps/chosen": -47.329200744628906, + "logps/rejected": -122.43385314941406, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.012087345123291, + "rewards/margins": 7.580989837646484, + "rewards/rejected": -4.568902492523193, + "step": 18129 + }, + { + "epoch": 4.54, + "grad_norm": 15.89037799835205, + "learning_rate": 2.110764127169923e-07, + "logits/chosen": -0.5427336692810059, + "logits/rejected": -0.6322402954101562, + "logps/chosen": -52.723087310791016, + "logps/rejected": -101.61585235595703, + "loss": 0.6269, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7551872730255127, + "rewards/margins": 6.498257160186768, + "rewards/rejected": -3.743069648742676, + "step": 18130 + }, + { + "epoch": 4.54, + "grad_norm": 5.716980934143066, + "learning_rate": 2.1085051111989663e-07, + "logits/chosen": -0.5004667043685913, + "logits/rejected": -0.549951434135437, + "logps/chosen": -58.440208435058594, + "logps/rejected": -99.86077880859375, + "loss": 0.7261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.56514573097229, + "rewards/margins": 5.909234046936035, + "rewards/rejected": -3.344088077545166, + "step": 18131 + }, + { + "epoch": 4.54, + "grad_norm": 15.757180213928223, + "learning_rate": 2.1062472786777976e-07, + "logits/chosen": -0.5814039707183838, + "logits/rejected": -0.6364716291427612, + "logps/chosen": -59.484371185302734, + "logps/rejected": -106.02368927001953, + "loss": 0.7724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1989974975585938, + "rewards/margins": 5.75309419631958, + "rewards/rejected": -2.5540966987609863, + "step": 18132 + }, + { + "epoch": 4.54, + "grad_norm": 3.883425235748291, + "learning_rate": 2.1039906296621947e-07, + "logits/chosen": -0.5923817753791809, + "logits/rejected": -0.6712295413017273, + "logps/chosen": -45.820247650146484, + "logps/rejected": -92.14385223388672, + "loss": 0.5773, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0909006595611572, + "rewards/margins": 5.951786994934082, + "rewards/rejected": -2.8608860969543457, + "step": 18133 + }, + { + "epoch": 4.54, + "grad_norm": 8.397159576416016, + "learning_rate": 2.1017351642079242e-07, + "logits/chosen": -0.5885679721832275, + "logits/rejected": -0.6254819631576538, + "logps/chosen": -61.88457489013672, + "logps/rejected": -127.72998809814453, + "loss": 0.7423, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.376565933227539, + "rewards/margins": 7.530632019042969, + "rewards/rejected": -4.1540656089782715, + "step": 18134 + }, + { + "epoch": 4.54, + "grad_norm": 4.350442409515381, + "learning_rate": 2.0994808823707303e-07, + "logits/chosen": -0.6542505025863647, + "logits/rejected": -0.68902587890625, + "logps/chosen": -42.80754089355469, + "logps/rejected": -110.85588836669922, + "loss": 0.5845, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5391428470611572, + "rewards/margins": 7.742738723754883, + "rewards/rejected": -4.203596591949463, + "step": 18135 + }, + { + "epoch": 4.54, + "grad_norm": 6.43560266494751, + "learning_rate": 2.0972277842063126e-07, + "logits/chosen": -0.5405155420303345, + "logits/rejected": -0.6152249574661255, + "logps/chosen": -53.6278076171875, + "logps/rejected": -102.35386657714844, + "loss": 0.6875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0923173427581787, + "rewards/margins": 7.6064605712890625, + "rewards/rejected": -4.514142990112305, + "step": 18136 + }, + { + "epoch": 4.54, + "grad_norm": 2.9890828132629395, + "learning_rate": 2.0949758697703437e-07, + "logits/chosen": -0.528070867061615, + "logits/rejected": -0.6366243958473206, + "logps/chosen": -48.485328674316406, + "logps/rejected": -89.98377990722656, + "loss": 0.5733, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.403994560241699, + "rewards/margins": 7.350785255432129, + "rewards/rejected": -3.9467906951904297, + "step": 18137 + }, + { + "epoch": 4.54, + "grad_norm": 6.90796422958374, + "learning_rate": 2.0927251391184789e-07, + "logits/chosen": -0.5740284323692322, + "logits/rejected": -0.6051453351974487, + "logps/chosen": -52.35757827758789, + "logps/rejected": -100.98696899414062, + "loss": 0.6674, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.02046537399292, + "rewards/margins": 5.735530853271484, + "rewards/rejected": -2.7150650024414062, + "step": 18138 + }, + { + "epoch": 4.54, + "grad_norm": 3.006399631500244, + "learning_rate": 2.090475592306329e-07, + "logits/chosen": -0.5707018375396729, + "logits/rejected": -0.6552180051803589, + "logps/chosen": -61.323509216308594, + "logps/rejected": -145.22506713867188, + "loss": 0.5766, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194159746170044, + "rewards/margins": 9.817937850952148, + "rewards/rejected": -6.623777389526367, + "step": 18139 + }, + { + "epoch": 4.54, + "grad_norm": 4.9716901779174805, + "learning_rate": 2.088227229389489e-07, + "logits/chosen": -0.5995715260505676, + "logits/rejected": -0.6554279327392578, + "logps/chosen": -54.4412841796875, + "logps/rejected": -109.07430267333984, + "loss": 0.6746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1398348808288574, + "rewards/margins": 7.06256103515625, + "rewards/rejected": -3.9227254390716553, + "step": 18140 + }, + { + "epoch": 4.54, + "grad_norm": 3.2313389778137207, + "learning_rate": 2.0859800504235196e-07, + "logits/chosen": -0.5929613709449768, + "logits/rejected": -0.6903889179229736, + "logps/chosen": -51.99867248535156, + "logps/rejected": -111.29446411132812, + "loss": 0.5905, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2982723712921143, + "rewards/margins": 7.930088043212891, + "rewards/rejected": -4.6318159103393555, + "step": 18141 + }, + { + "epoch": 4.54, + "grad_norm": 5.710500240325928, + "learning_rate": 2.0837340554639374e-07, + "logits/chosen": -0.49168336391448975, + "logits/rejected": -0.5788193345069885, + "logps/chosen": -66.96380615234375, + "logps/rejected": -108.83961486816406, + "loss": 0.6976, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1712210178375244, + "rewards/margins": 8.071182250976562, + "rewards/rejected": -4.899960994720459, + "step": 18142 + }, + { + "epoch": 4.54, + "grad_norm": 3.2490084171295166, + "learning_rate": 2.0814892445662594e-07, + "logits/chosen": -0.5308243036270142, + "logits/rejected": -0.6253602504730225, + "logps/chosen": -46.505699157714844, + "logps/rejected": -85.25202941894531, + "loss": 0.5537, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5109305381774902, + "rewards/margins": 6.833620548248291, + "rewards/rejected": -3.3226895332336426, + "step": 18143 + }, + { + "epoch": 4.54, + "grad_norm": 3.026461362838745, + "learning_rate": 2.0792456177859467e-07, + "logits/chosen": -0.5001411437988281, + "logits/rejected": -0.6035046577453613, + "logps/chosen": -56.745479583740234, + "logps/rejected": -97.28826904296875, + "loss": 0.5766, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.518517255783081, + "rewards/margins": 8.129866600036621, + "rewards/rejected": -4.611350059509277, + "step": 18144 + }, + { + "epoch": 4.54, + "grad_norm": 4.648390293121338, + "learning_rate": 2.0770031751784436e-07, + "logits/chosen": -0.536455512046814, + "logits/rejected": -0.5989762544631958, + "logps/chosen": -48.828712463378906, + "logps/rejected": -118.01719665527344, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.264751434326172, + "rewards/margins": 7.362026214599609, + "rewards/rejected": -4.0972747802734375, + "step": 18145 + }, + { + "epoch": 4.54, + "grad_norm": 7.327064037322998, + "learning_rate": 2.0747619167991674e-07, + "logits/chosen": -0.5478301644325256, + "logits/rejected": -0.6282500624656677, + "logps/chosen": -63.143470764160156, + "logps/rejected": -103.69755554199219, + "loss": 0.7438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8335742950439453, + "rewards/margins": 5.96193265914917, + "rewards/rejected": -3.1283578872680664, + "step": 18146 + }, + { + "epoch": 4.54, + "grad_norm": 19.524320602416992, + "learning_rate": 2.072521842703501e-07, + "logits/chosen": -0.5246562957763672, + "logits/rejected": -0.5963075160980225, + "logps/chosen": -44.07473373413086, + "logps/rejected": -94.37781524658203, + "loss": 0.5352, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4049460887908936, + "rewards/margins": 7.532220840454102, + "rewards/rejected": -4.127274513244629, + "step": 18147 + }, + { + "epoch": 4.54, + "grad_norm": 17.990793228149414, + "learning_rate": 2.0702829529468004e-07, + "logits/chosen": -0.5085201859474182, + "logits/rejected": -0.5606935024261475, + "logps/chosen": -51.3848991394043, + "logps/rejected": -117.03882598876953, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7833876609802246, + "rewards/margins": 7.18448543548584, + "rewards/rejected": -4.401097774505615, + "step": 18148 + }, + { + "epoch": 4.54, + "grad_norm": 2.5039358139038086, + "learning_rate": 2.0680452475843828e-07, + "logits/chosen": -0.5214734673500061, + "logits/rejected": -0.5917317867279053, + "logps/chosen": -59.80409240722656, + "logps/rejected": -132.75929260253906, + "loss": 0.601, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6214423179626465, + "rewards/margins": 7.845454692840576, + "rewards/rejected": -4.224012851715088, + "step": 18149 + }, + { + "epoch": 4.54, + "grad_norm": 7.24435567855835, + "learning_rate": 2.065808726671553e-07, + "logits/chosen": -0.5478504300117493, + "logits/rejected": -0.6132229566574097, + "logps/chosen": -49.08583068847656, + "logps/rejected": -87.93240356445312, + "loss": 0.6482, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8136558532714844, + "rewards/margins": 6.301517963409424, + "rewards/rejected": -3.4878621101379395, + "step": 18150 + }, + { + "epoch": 4.54, + "grad_norm": 2.234894037246704, + "learning_rate": 2.063573390263579e-07, + "logits/chosen": -0.543018102645874, + "logits/rejected": -0.6329566240310669, + "logps/chosen": -55.14609146118164, + "logps/rejected": -121.24365997314453, + "loss": 0.5425, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1247684955596924, + "rewards/margins": 8.008182525634766, + "rewards/rejected": -4.883414268493652, + "step": 18151 + }, + { + "epoch": 4.54, + "grad_norm": 3.413177251815796, + "learning_rate": 2.0613392384156884e-07, + "logits/chosen": -0.512220561504364, + "logits/rejected": -0.6021997332572937, + "logps/chosen": -68.3882827758789, + "logps/rejected": -110.5809097290039, + "loss": 0.6471, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7629871368408203, + "rewards/margins": 6.9986958503723145, + "rewards/rejected": -4.235708713531494, + "step": 18152 + }, + { + "epoch": 4.54, + "grad_norm": 1.9713674783706665, + "learning_rate": 2.0591062711830977e-07, + "logits/chosen": -0.5690100789070129, + "logits/rejected": -0.6694169640541077, + "logps/chosen": -44.83037567138672, + "logps/rejected": -98.0942153930664, + "loss": 0.4956, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1053085327148438, + "rewards/margins": 7.704927444458008, + "rewards/rejected": -4.599618911743164, + "step": 18153 + }, + { + "epoch": 4.54, + "grad_norm": 15.316526412963867, + "learning_rate": 2.0568744886209803e-07, + "logits/chosen": -0.5712705850601196, + "logits/rejected": -0.6822659969329834, + "logps/chosen": -54.64027786254883, + "logps/rejected": -102.897705078125, + "loss": 0.5509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1923747062683105, + "rewards/margins": 7.5569915771484375, + "rewards/rejected": -4.364616870880127, + "step": 18154 + }, + { + "epoch": 4.54, + "grad_norm": 2.5814075469970703, + "learning_rate": 2.0546438907844857e-07, + "logits/chosen": -0.5188447833061218, + "logits/rejected": -0.5805554389953613, + "logps/chosen": -63.63746643066406, + "logps/rejected": -92.90299987792969, + "loss": 0.6767, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3420097827911377, + "rewards/margins": 6.223639965057373, + "rewards/rejected": -2.8816299438476562, + "step": 18155 + }, + { + "epoch": 4.54, + "grad_norm": 12.469420433044434, + "learning_rate": 2.052414477728748e-07, + "logits/chosen": -0.4591013193130493, + "logits/rejected": -0.5462954640388489, + "logps/chosen": -55.72771072387695, + "logps/rejected": -110.28919219970703, + "loss": 0.6205, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.923032522201538, + "rewards/margins": 7.117216110229492, + "rewards/rejected": -4.194183349609375, + "step": 18156 + }, + { + "epoch": 4.54, + "grad_norm": 4.417202472686768, + "learning_rate": 2.0501862495088454e-07, + "logits/chosen": -0.5768053531646729, + "logits/rejected": -0.6167624592781067, + "logps/chosen": -52.30925750732422, + "logps/rejected": -118.08578491210938, + "loss": 0.6455, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9407835006713867, + "rewards/margins": 7.12679386138916, + "rewards/rejected": -4.186010360717773, + "step": 18157 + }, + { + "epoch": 4.54, + "grad_norm": 3.913003921508789, + "learning_rate": 2.0479592061798336e-07, + "logits/chosen": -0.529387354850769, + "logits/rejected": -0.5725187659263611, + "logps/chosen": -48.293888092041016, + "logps/rejected": -109.37980651855469, + "loss": 0.6339, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5781986713409424, + "rewards/margins": 7.426370620727539, + "rewards/rejected": -3.848172187805176, + "step": 18158 + }, + { + "epoch": 4.54, + "grad_norm": 3.5711517333984375, + "learning_rate": 2.0457333477967523e-07, + "logits/chosen": -0.6021925210952759, + "logits/rejected": -0.6419216394424438, + "logps/chosen": -57.3692741394043, + "logps/rejected": -113.93080139160156, + "loss": 0.5875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2438998222351074, + "rewards/margins": 7.432547569274902, + "rewards/rejected": -4.188647270202637, + "step": 18159 + }, + { + "epoch": 4.54, + "grad_norm": 2.8037025928497314, + "learning_rate": 2.0435086744146181e-07, + "logits/chosen": -0.5511791706085205, + "logits/rejected": -0.6207106113433838, + "logps/chosen": -59.82379150390625, + "logps/rejected": -110.94901275634766, + "loss": 0.5969, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.018441915512085, + "rewards/margins": 7.5782060623168945, + "rewards/rejected": -4.559764385223389, + "step": 18160 + }, + { + "epoch": 4.54, + "grad_norm": 5.390477180480957, + "learning_rate": 2.0412851860883875e-07, + "logits/chosen": -0.5215777158737183, + "logits/rejected": -0.6415618062019348, + "logps/chosen": -57.01991271972656, + "logps/rejected": -110.1645736694336, + "loss": 0.5095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9605295658111572, + "rewards/margins": 7.525887966156006, + "rewards/rejected": -4.565358638763428, + "step": 18161 + }, + { + "epoch": 4.54, + "grad_norm": 3.5354573726654053, + "learning_rate": 2.0390628828730107e-07, + "logits/chosen": -0.593963086605072, + "logits/rejected": -0.6947363018989563, + "logps/chosen": -57.25897216796875, + "logps/rejected": -125.66136169433594, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.449767827987671, + "rewards/margins": 7.897644519805908, + "rewards/rejected": -5.4478759765625, + "step": 18162 + }, + { + "epoch": 4.54, + "grad_norm": 4.943371295928955, + "learning_rate": 2.0368417648234052e-07, + "logits/chosen": -0.6250197887420654, + "logits/rejected": -0.6885780096054077, + "logps/chosen": -57.44532775878906, + "logps/rejected": -115.73361206054688, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.145998001098633, + "rewards/margins": 7.277224540710449, + "rewards/rejected": -4.131226539611816, + "step": 18163 + }, + { + "epoch": 4.54, + "grad_norm": 4.427739143371582, + "learning_rate": 2.0346218319944545e-07, + "logits/chosen": -0.549527108669281, + "logits/rejected": -0.6498483419418335, + "logps/chosen": -64.38941192626953, + "logps/rejected": -106.83557891845703, + "loss": 0.6867, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3125147819519043, + "rewards/margins": 8.323172569274902, + "rewards/rejected": -5.0106587409973145, + "step": 18164 + }, + { + "epoch": 4.54, + "grad_norm": 10.551900863647461, + "learning_rate": 2.0324030844410204e-07, + "logits/chosen": -0.6391013264656067, + "logits/rejected": -0.5919688940048218, + "logps/chosen": -49.39364242553711, + "logps/rejected": -131.85337829589844, + "loss": 0.6555, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8994133472442627, + "rewards/margins": 6.241371154785156, + "rewards/rejected": -3.3419580459594727, + "step": 18165 + }, + { + "epoch": 4.54, + "grad_norm": 3.5234007835388184, + "learning_rate": 2.0301855222179258e-07, + "logits/chosen": -0.5540396571159363, + "logits/rejected": -0.6377925872802734, + "logps/chosen": -54.63255310058594, + "logps/rejected": -104.77145385742188, + "loss": 0.5374, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.959664821624756, + "rewards/margins": 6.924437522888184, + "rewards/rejected": -3.9647717475891113, + "step": 18166 + }, + { + "epoch": 4.54, + "grad_norm": 3.3409860134124756, + "learning_rate": 2.0279691453799654e-07, + "logits/chosen": -0.5783076286315918, + "logits/rejected": -0.6632116436958313, + "logps/chosen": -53.448341369628906, + "logps/rejected": -99.81392669677734, + "loss": 0.6259, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.20080304145813, + "rewards/margins": 6.685589790344238, + "rewards/rejected": -3.4847867488861084, + "step": 18167 + }, + { + "epoch": 4.54, + "grad_norm": 3.938737630844116, + "learning_rate": 2.0257539539819128e-07, + "logits/chosen": -0.5639835000038147, + "logits/rejected": -0.613899827003479, + "logps/chosen": -52.0135612487793, + "logps/rejected": -116.7413101196289, + "loss": 0.5902, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.950178861618042, + "rewards/margins": 7.663510322570801, + "rewards/rejected": -4.713331699371338, + "step": 18168 + }, + { + "epoch": 4.55, + "grad_norm": 4.22367000579834, + "learning_rate": 2.0235399480785123e-07, + "logits/chosen": -0.5540809631347656, + "logits/rejected": -0.6286744475364685, + "logps/chosen": -53.63150405883789, + "logps/rejected": -111.14637756347656, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.723724842071533, + "rewards/margins": 7.761945724487305, + "rewards/rejected": -5.0382208824157715, + "step": 18169 + }, + { + "epoch": 4.55, + "grad_norm": 4.439108848571777, + "learning_rate": 2.021327127724465e-07, + "logits/chosen": -0.5551097393035889, + "logits/rejected": -0.6478566527366638, + "logps/chosen": -46.36965560913086, + "logps/rejected": -80.53118896484375, + "loss": 0.5983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.391538143157959, + "rewards/margins": 5.785247325897217, + "rewards/rejected": -2.393709421157837, + "step": 18170 + }, + { + "epoch": 4.55, + "grad_norm": 4.043075084686279, + "learning_rate": 2.0191154929744549e-07, + "logits/chosen": -0.5041543245315552, + "logits/rejected": -0.5415452718734741, + "logps/chosen": -55.67481994628906, + "logps/rejected": -139.61611938476562, + "loss": 0.6486, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0025675296783447, + "rewards/margins": 7.675765037536621, + "rewards/rejected": -4.6731977462768555, + "step": 18171 + }, + { + "epoch": 4.55, + "grad_norm": 7.073858737945557, + "learning_rate": 2.0169050438831495e-07, + "logits/chosen": -0.5823514461517334, + "logits/rejected": -0.6708996891975403, + "logps/chosen": -52.83779525756836, + "logps/rejected": -99.52580261230469, + "loss": 0.6445, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3175292015075684, + "rewards/margins": 7.701176643371582, + "rewards/rejected": -4.3836469650268555, + "step": 18172 + }, + { + "epoch": 4.55, + "grad_norm": 23.620880126953125, + "learning_rate": 2.0146957805051437e-07, + "logits/chosen": -0.48505812883377075, + "logits/rejected": -0.5737038850784302, + "logps/chosen": -74.65608215332031, + "logps/rejected": -119.53645324707031, + "loss": 0.685, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9474215507507324, + "rewards/margins": 8.071673393249512, + "rewards/rejected": -5.124251365661621, + "step": 18173 + }, + { + "epoch": 4.55, + "grad_norm": 4.800014019012451, + "learning_rate": 2.012487702895044e-07, + "logits/chosen": -0.5818662643432617, + "logits/rejected": -0.655522346496582, + "logps/chosen": -48.34573745727539, + "logps/rejected": -114.71634674072266, + "loss": 0.6012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1401309967041016, + "rewards/margins": 7.263030052185059, + "rewards/rejected": -4.122899055480957, + "step": 18174 + }, + { + "epoch": 4.55, + "grad_norm": 4.197585582733154, + "learning_rate": 2.010280811107429e-07, + "logits/chosen": -0.5105022192001343, + "logits/rejected": -0.6357443928718567, + "logps/chosen": -56.822654724121094, + "logps/rejected": -102.45988464355469, + "loss": 0.6286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2205734252929688, + "rewards/margins": 8.338626861572266, + "rewards/rejected": -5.118052959442139, + "step": 18175 + }, + { + "epoch": 4.55, + "grad_norm": 4.814592361450195, + "learning_rate": 2.0080751051968105e-07, + "logits/chosen": -0.5404868125915527, + "logits/rejected": -0.6542266011238098, + "logps/chosen": -49.0257568359375, + "logps/rejected": -96.9207992553711, + "loss": 0.5023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3595542907714844, + "rewards/margins": 7.58558988571167, + "rewards/rejected": -4.2260355949401855, + "step": 18176 + }, + { + "epoch": 4.55, + "grad_norm": 4.00753927230835, + "learning_rate": 2.005870585217695e-07, + "logits/chosen": -0.6100376844406128, + "logits/rejected": -0.6905741691589355, + "logps/chosen": -48.03364181518555, + "logps/rejected": -116.42262268066406, + "loss": 0.544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1733033657073975, + "rewards/margins": 7.311490535736084, + "rewards/rejected": -4.138188362121582, + "step": 18177 + }, + { + "epoch": 4.55, + "grad_norm": 4.714548110961914, + "learning_rate": 2.003667251224578e-07, + "logits/chosen": -0.4876742959022522, + "logits/rejected": -0.6323086023330688, + "logps/chosen": -65.4968490600586, + "logps/rejected": -117.07186889648438, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9799444675445557, + "rewards/margins": 8.267457008361816, + "rewards/rejected": -5.287512302398682, + "step": 18178 + }, + { + "epoch": 4.55, + "grad_norm": 3.2266786098480225, + "learning_rate": 2.0014651032718934e-07, + "logits/chosen": -0.5248825550079346, + "logits/rejected": -0.6183346509933472, + "logps/chosen": -52.80146026611328, + "logps/rejected": -123.33493041992188, + "loss": 0.552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1444084644317627, + "rewards/margins": 7.535358905792236, + "rewards/rejected": -4.390950679779053, + "step": 18179 + }, + { + "epoch": 4.55, + "grad_norm": 5.111538887023926, + "learning_rate": 1.999264141414048e-07, + "logits/chosen": -0.5677772164344788, + "logits/rejected": -0.6687957644462585, + "logps/chosen": -68.33747100830078, + "logps/rejected": -98.94625091552734, + "loss": 0.6417, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2802188396453857, + "rewards/margins": 6.079866886138916, + "rewards/rejected": -2.799647808074951, + "step": 18180 + }, + { + "epoch": 4.55, + "grad_norm": 2.337625026702881, + "learning_rate": 1.9970643657054535e-07, + "logits/chosen": -0.6262109875679016, + "logits/rejected": -0.7254205942153931, + "logps/chosen": -48.00084686279297, + "logps/rejected": -105.79457092285156, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2747063636779785, + "rewards/margins": 8.711755752563477, + "rewards/rejected": -5.437049865722656, + "step": 18181 + }, + { + "epoch": 4.55, + "grad_norm": 2.020171880722046, + "learning_rate": 1.9948657762004553e-07, + "logits/chosen": -0.5160939693450928, + "logits/rejected": -0.6308091878890991, + "logps/chosen": -57.33564758300781, + "logps/rejected": -101.15054321289062, + "loss": 0.5744, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5075559616088867, + "rewards/margins": 7.791015625, + "rewards/rejected": -4.283459663391113, + "step": 18182 + }, + { + "epoch": 4.55, + "grad_norm": 2.180049419403076, + "learning_rate": 1.992668372953377e-07, + "logits/chosen": -0.616402268409729, + "logits/rejected": -0.6852340698242188, + "logps/chosen": -49.33338165283203, + "logps/rejected": -118.74079895019531, + "loss": 0.6149, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4427735805511475, + "rewards/margins": 8.207328796386719, + "rewards/rejected": -4.764554977416992, + "step": 18183 + }, + { + "epoch": 4.55, + "grad_norm": 6.432596206665039, + "learning_rate": 1.9904721560185358e-07, + "logits/chosen": -0.5283438563346863, + "logits/rejected": -0.6111893653869629, + "logps/chosen": -46.92177200317383, + "logps/rejected": -116.43860626220703, + "loss": 0.5832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1378703117370605, + "rewards/margins": 8.534709930419922, + "rewards/rejected": -5.396839141845703, + "step": 18184 + }, + { + "epoch": 4.55, + "grad_norm": 9.362257957458496, + "learning_rate": 1.9882771254501832e-07, + "logits/chosen": -0.5505150556564331, + "logits/rejected": -0.6020755767822266, + "logps/chosen": -55.6702995300293, + "logps/rejected": -110.24231719970703, + "loss": 0.7216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.180931806564331, + "rewards/margins": 6.2978386878967285, + "rewards/rejected": -3.1169071197509766, + "step": 18185 + }, + { + "epoch": 4.55, + "grad_norm": 4.166068077087402, + "learning_rate": 1.986083281302581e-07, + "logits/chosen": -0.548812985420227, + "logits/rejected": -0.6002770066261292, + "logps/chosen": -51.887752532958984, + "logps/rejected": -117.38562774658203, + "loss": 0.6047, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.299919366836548, + "rewards/margins": 7.969911098480225, + "rewards/rejected": -4.669991493225098, + "step": 18186 + }, + { + "epoch": 4.55, + "grad_norm": 5.228281497955322, + "learning_rate": 1.9838906236299248e-07, + "logits/chosen": -0.5584677457809448, + "logits/rejected": -0.6301720142364502, + "logps/chosen": -64.1429443359375, + "logps/rejected": -93.20207214355469, + "loss": 0.6461, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2407987117767334, + "rewards/margins": 6.310572624206543, + "rewards/rejected": -3.0697739124298096, + "step": 18187 + }, + { + "epoch": 4.55, + "grad_norm": 7.957038879394531, + "learning_rate": 1.9816991524863992e-07, + "logits/chosen": -0.5766589045524597, + "logits/rejected": -0.6870492696762085, + "logps/chosen": -60.41202163696289, + "logps/rejected": -112.01360321044922, + "loss": 0.6956, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1256325244903564, + "rewards/margins": 6.819007396697998, + "rewards/rejected": -3.693375587463379, + "step": 18188 + }, + { + "epoch": 4.55, + "grad_norm": 11.448019981384277, + "learning_rate": 1.979508867926161e-07, + "logits/chosen": -0.5196805000305176, + "logits/rejected": -0.5736483335494995, + "logps/chosen": -59.849403381347656, + "logps/rejected": -116.07101440429688, + "loss": 0.8481, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8859810829162598, + "rewards/margins": 7.195758819580078, + "rewards/rejected": -4.30977725982666, + "step": 18189 + }, + { + "epoch": 4.55, + "grad_norm": 4.275214195251465, + "learning_rate": 1.9773197700033498e-07, + "logits/chosen": -0.5582823157310486, + "logits/rejected": -0.6256132125854492, + "logps/chosen": -48.17125701904297, + "logps/rejected": -95.92094421386719, + "loss": 0.6787, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4381165504455566, + "rewards/margins": 6.911986351013184, + "rewards/rejected": -3.473869800567627, + "step": 18190 + }, + { + "epoch": 4.55, + "grad_norm": 4.341268539428711, + "learning_rate": 1.9751318587720337e-07, + "logits/chosen": -0.4602697789669037, + "logits/rejected": -0.5496727824211121, + "logps/chosen": -74.30047607421875, + "logps/rejected": -108.249267578125, + "loss": 0.6867, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7730791568756104, + "rewards/margins": 6.884793758392334, + "rewards/rejected": -4.111713886260986, + "step": 18191 + }, + { + "epoch": 4.55, + "grad_norm": 3.1071505546569824, + "learning_rate": 1.9729451342862917e-07, + "logits/chosen": -0.5037244558334351, + "logits/rejected": -0.5921978950500488, + "logps/chosen": -58.97766876220703, + "logps/rejected": -95.69419860839844, + "loss": 0.6017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3742775917053223, + "rewards/margins": 7.436746597290039, + "rewards/rejected": -4.062469005584717, + "step": 18192 + }, + { + "epoch": 4.55, + "grad_norm": 17.15694236755371, + "learning_rate": 1.970759596600169e-07, + "logits/chosen": -0.5693035125732422, + "logits/rejected": -0.6466289162635803, + "logps/chosen": -70.1297378540039, + "logps/rejected": -126.83368682861328, + "loss": 0.7236, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9671578407287598, + "rewards/margins": 7.673172950744629, + "rewards/rejected": -4.706015110015869, + "step": 18193 + }, + { + "epoch": 4.55, + "grad_norm": 3.670928478240967, + "learning_rate": 1.9685752457676567e-07, + "logits/chosen": -0.5594281554222107, + "logits/rejected": -0.6409847736358643, + "logps/chosen": -56.978965759277344, + "logps/rejected": -110.05915069580078, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2125697135925293, + "rewards/margins": 7.294755458831787, + "rewards/rejected": -4.0821852684021, + "step": 18194 + }, + { + "epoch": 4.55, + "grad_norm": 1.7649497985839844, + "learning_rate": 1.966392081842733e-07, + "logits/chosen": -0.516446053981781, + "logits/rejected": -0.5999110341072083, + "logps/chosen": -49.95155334472656, + "logps/rejected": -115.4593505859375, + "loss": 0.5319, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.374969482421875, + "rewards/margins": 8.728874206542969, + "rewards/rejected": -5.353904724121094, + "step": 18195 + }, + { + "epoch": 4.55, + "grad_norm": 5.578145503997803, + "learning_rate": 1.9642101048793605e-07, + "logits/chosen": -0.4961259961128235, + "logits/rejected": -0.572956919670105, + "logps/chosen": -51.56637954711914, + "logps/rejected": -109.82150268554688, + "loss": 0.6012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.865856647491455, + "rewards/margins": 6.374868869781494, + "rewards/rejected": -3.509011745452881, + "step": 18196 + }, + { + "epoch": 4.55, + "grad_norm": 5.486403465270996, + "learning_rate": 1.9620293149314463e-07, + "logits/chosen": -0.5420486330986023, + "logits/rejected": -0.6468018889427185, + "logps/chosen": -72.6920166015625, + "logps/rejected": -109.28361511230469, + "loss": 0.6174, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.052245616912842, + "rewards/margins": 7.598986625671387, + "rewards/rejected": -4.546741008758545, + "step": 18197 + }, + { + "epoch": 4.55, + "grad_norm": 2.2120931148529053, + "learning_rate": 1.9598497120528804e-07, + "logits/chosen": -0.5345690250396729, + "logits/rejected": -0.6062940955162048, + "logps/chosen": -64.28858947753906, + "logps/rejected": -131.38613891601562, + "loss": 0.5599, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1580822467803955, + "rewards/margins": 9.309410095214844, + "rewards/rejected": -6.151327610015869, + "step": 18198 + }, + { + "epoch": 4.55, + "grad_norm": 4.959357261657715, + "learning_rate": 1.957671296297531e-07, + "logits/chosen": -0.5993292927742004, + "logits/rejected": -0.660342276096344, + "logps/chosen": -58.30959701538086, + "logps/rejected": -103.27299499511719, + "loss": 0.6816, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.25822114944458, + "rewards/margins": 7.2395830154418945, + "rewards/rejected": -3.9813618659973145, + "step": 18199 + }, + { + "epoch": 4.55, + "grad_norm": 4.555331707000732, + "learning_rate": 1.9554940677192213e-07, + "logits/chosen": -0.5098381042480469, + "logits/rejected": -0.5919499397277832, + "logps/chosen": -53.34607696533203, + "logps/rejected": -99.32755279541016, + "loss": 0.6219, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.248929500579834, + "rewards/margins": 6.165823936462402, + "rewards/rejected": -2.9168949127197266, + "step": 18200 + }, + { + "epoch": 4.55, + "grad_norm": 4.347263813018799, + "learning_rate": 1.9533180263717533e-07, + "logits/chosen": -0.6255196332931519, + "logits/rejected": -0.6439036130905151, + "logps/chosen": -59.69445037841797, + "logps/rejected": -106.15768432617188, + "loss": 0.6437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.001734733581543, + "rewards/margins": 6.299546718597412, + "rewards/rejected": -3.29781174659729, + "step": 18201 + }, + { + "epoch": 4.55, + "grad_norm": 4.283572196960449, + "learning_rate": 1.9511431723089003e-07, + "logits/chosen": -0.5043470859527588, + "logits/rejected": -0.5772802829742432, + "logps/chosen": -48.010986328125, + "logps/rejected": -93.67864227294922, + "loss": 0.5835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0148558616638184, + "rewards/margins": 7.5171990394592285, + "rewards/rejected": -4.50234317779541, + "step": 18202 + }, + { + "epoch": 4.55, + "grad_norm": 4.515061378479004, + "learning_rate": 1.948969505584414e-07, + "logits/chosen": -0.5485474467277527, + "logits/rejected": -0.6765013337135315, + "logps/chosen": -59.6412239074707, + "logps/rejected": -93.75968170166016, + "loss": 0.5438, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0635807514190674, + "rewards/margins": 6.871865749359131, + "rewards/rejected": -3.8082852363586426, + "step": 18203 + }, + { + "epoch": 4.55, + "grad_norm": 3.5647685527801514, + "learning_rate": 1.9467970262519952e-07, + "logits/chosen": -0.5475009083747864, + "logits/rejected": -0.651107668876648, + "logps/chosen": -52.665645599365234, + "logps/rejected": -100.06192779541016, + "loss": 0.4979, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.132336139678955, + "rewards/margins": 6.772953033447266, + "rewards/rejected": -3.6406171321868896, + "step": 18204 + }, + { + "epoch": 4.55, + "grad_norm": 6.482091903686523, + "learning_rate": 1.9446257343653352e-07, + "logits/chosen": -0.5682253241539001, + "logits/rejected": -0.6646970510482788, + "logps/chosen": -53.50886154174805, + "logps/rejected": -107.12205505371094, + "loss": 0.6137, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069147825241089, + "rewards/margins": 7.424487113952637, + "rewards/rejected": -4.355339527130127, + "step": 18205 + }, + { + "epoch": 4.55, + "grad_norm": 4.531668186187744, + "learning_rate": 1.942455629978085e-07, + "logits/chosen": -0.5636901259422302, + "logits/rejected": -0.6545615196228027, + "logps/chosen": -55.52705001831055, + "logps/rejected": -103.10003662109375, + "loss": 0.6203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1254169940948486, + "rewards/margins": 6.217830657958984, + "rewards/rejected": -3.092413902282715, + "step": 18206 + }, + { + "epoch": 4.55, + "grad_norm": 7.028702735900879, + "learning_rate": 1.9402867131438797e-07, + "logits/chosen": -0.48702263832092285, + "logits/rejected": -0.5905038118362427, + "logps/chosen": -62.14752197265625, + "logps/rejected": -111.168212890625, + "loss": 0.5577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.900784492492676, + "rewards/margins": 7.716342926025391, + "rewards/rejected": -4.815558433532715, + "step": 18207 + }, + { + "epoch": 4.55, + "grad_norm": 2.5664291381835938, + "learning_rate": 1.9381189839162983e-07, + "logits/chosen": -0.5335988402366638, + "logits/rejected": -0.6452786326408386, + "logps/chosen": -58.598915100097656, + "logps/rejected": -113.01859283447266, + "loss": 0.5802, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1085524559020996, + "rewards/margins": 9.162262916564941, + "rewards/rejected": -6.053710460662842, + "step": 18208 + }, + { + "epoch": 4.56, + "grad_norm": 6.493095397949219, + "learning_rate": 1.9359524423489263e-07, + "logits/chosen": -0.5687845945358276, + "logits/rejected": -0.6288092732429504, + "logps/chosen": -50.0015869140625, + "logps/rejected": -98.33540344238281, + "loss": 0.5447, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.08866548538208, + "rewards/margins": 5.78619384765625, + "rewards/rejected": -2.69752836227417, + "step": 18209 + }, + { + "epoch": 4.56, + "grad_norm": 3.322293281555176, + "learning_rate": 1.933787088495287e-07, + "logits/chosen": -0.48763227462768555, + "logits/rejected": -0.5431527495384216, + "logps/chosen": -57.954158782958984, + "logps/rejected": -115.21766662597656, + "loss": 0.6342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.369784355163574, + "rewards/margins": 6.988187789916992, + "rewards/rejected": -3.6184041500091553, + "step": 18210 + }, + { + "epoch": 4.56, + "grad_norm": 4.462362289428711, + "learning_rate": 1.9316229224088933e-07, + "logits/chosen": -0.5871785879135132, + "logits/rejected": -0.6661471128463745, + "logps/chosen": -48.23014831542969, + "logps/rejected": -104.75196838378906, + "loss": 0.5866, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8692803382873535, + "rewards/margins": 7.529898166656494, + "rewards/rejected": -4.660617828369141, + "step": 18211 + }, + { + "epoch": 4.56, + "grad_norm": 3.625900983810425, + "learning_rate": 1.9294599441432303e-07, + "logits/chosen": -0.5614004731178284, + "logits/rejected": -0.6763306856155396, + "logps/chosen": -58.199371337890625, + "logps/rejected": -120.31549072265625, + "loss": 0.5639, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.405771017074585, + "rewards/margins": 8.954957008361816, + "rewards/rejected": -5.549185752868652, + "step": 18212 + }, + { + "epoch": 4.56, + "grad_norm": 3.5404772758483887, + "learning_rate": 1.927298153751739e-07, + "logits/chosen": -0.5792707204818726, + "logits/rejected": -0.6220998167991638, + "logps/chosen": -43.174949645996094, + "logps/rejected": -117.89286804199219, + "loss": 0.5884, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2690844535827637, + "rewards/margins": 7.691732406616211, + "rewards/rejected": -4.4226484298706055, + "step": 18213 + }, + { + "epoch": 4.56, + "grad_norm": 7.053272724151611, + "learning_rate": 1.9251375512878367e-07, + "logits/chosen": -0.6202504634857178, + "logits/rejected": -0.6337198615074158, + "logps/chosen": -57.07293701171875, + "logps/rejected": -131.99404907226562, + "loss": 0.7005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0497281551361084, + "rewards/margins": 6.721734523773193, + "rewards/rejected": -3.6720058917999268, + "step": 18214 + }, + { + "epoch": 4.56, + "grad_norm": 3.9108660221099854, + "learning_rate": 1.9229781368049373e-07, + "logits/chosen": -0.5697464346885681, + "logits/rejected": -0.6444280743598938, + "logps/chosen": -58.532958984375, + "logps/rejected": -96.53805541992188, + "loss": 0.6985, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.258681297302246, + "rewards/margins": 6.526504039764404, + "rewards/rejected": -3.2678229808807373, + "step": 18215 + }, + { + "epoch": 4.56, + "grad_norm": 4.949291229248047, + "learning_rate": 1.9208199103563696e-07, + "logits/chosen": -0.6315693855285645, + "logits/rejected": -0.6847892999649048, + "logps/chosen": -42.271610260009766, + "logps/rejected": -108.59384155273438, + "loss": 0.6294, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.882831335067749, + "rewards/margins": 7.682897567749023, + "rewards/rejected": -4.800065994262695, + "step": 18216 + }, + { + "epoch": 4.56, + "grad_norm": 3.422771453857422, + "learning_rate": 1.918662871995486e-07, + "logits/chosen": -0.5607779622077942, + "logits/rejected": -0.6654483675956726, + "logps/chosen": -55.05278396606445, + "logps/rejected": -105.67464447021484, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3744399547576904, + "rewards/margins": 6.738802433013916, + "rewards/rejected": -3.3643624782562256, + "step": 18217 + }, + { + "epoch": 4.56, + "grad_norm": 6.795166492462158, + "learning_rate": 1.916507021775593e-07, + "logits/chosen": -0.5312406420707703, + "logits/rejected": -0.622041642665863, + "logps/chosen": -61.86924743652344, + "logps/rejected": -125.468017578125, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9413976669311523, + "rewards/margins": 8.03414535522461, + "rewards/rejected": -5.092748165130615, + "step": 18218 + }, + { + "epoch": 4.56, + "grad_norm": 7.415528774261475, + "learning_rate": 1.9143523597499437e-07, + "logits/chosen": -0.5384262800216675, + "logits/rejected": -0.6347751617431641, + "logps/chosen": -55.897010803222656, + "logps/rejected": -109.28331756591797, + "loss": 0.6151, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.256282329559326, + "rewards/margins": 8.806059837341309, + "rewards/rejected": -5.549777507781982, + "step": 18219 + }, + { + "epoch": 4.56, + "grad_norm": 5.440186977386475, + "learning_rate": 1.9121988859717944e-07, + "logits/chosen": -0.4862107038497925, + "logits/rejected": -0.5854886770248413, + "logps/chosen": -53.240379333496094, + "logps/rejected": -106.73253631591797, + "loss": 0.6025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3343451023101807, + "rewards/margins": 7.542181491851807, + "rewards/rejected": -4.207836151123047, + "step": 18220 + }, + { + "epoch": 4.56, + "grad_norm": 4.53938102722168, + "learning_rate": 1.910046600494364e-07, + "logits/chosen": -0.575015664100647, + "logits/rejected": -0.657832682132721, + "logps/chosen": -66.89894104003906, + "logps/rejected": -105.71025848388672, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2680890560150146, + "rewards/margins": 7.6056365966796875, + "rewards/rejected": -4.337547779083252, + "step": 18221 + }, + { + "epoch": 4.56, + "grad_norm": 7.915313243865967, + "learning_rate": 1.9078955033708323e-07, + "logits/chosen": -0.5478029251098633, + "logits/rejected": -0.6355884671211243, + "logps/chosen": -65.1479721069336, + "logps/rejected": -111.63945007324219, + "loss": 0.6156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1916680335998535, + "rewards/margins": 7.991844654083252, + "rewards/rejected": -4.800177097320557, + "step": 18222 + }, + { + "epoch": 4.56, + "grad_norm": 5.173860549926758, + "learning_rate": 1.9057455946543511e-07, + "logits/chosen": -0.5497389435768127, + "logits/rejected": -0.6659300923347473, + "logps/chosen": -56.923187255859375, + "logps/rejected": -105.0547866821289, + "loss": 0.6004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.560318946838379, + "rewards/margins": 8.747432708740234, + "rewards/rejected": -5.187114238739014, + "step": 18223 + }, + { + "epoch": 4.56, + "grad_norm": 7.101881504058838, + "learning_rate": 1.903596874398056e-07, + "logits/chosen": -0.4474538266658783, + "logits/rejected": -0.5878769159317017, + "logps/chosen": -73.556884765625, + "logps/rejected": -101.58283996582031, + "loss": 0.72, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9381940364837646, + "rewards/margins": 6.877093315124512, + "rewards/rejected": -3.938899278640747, + "step": 18224 + }, + { + "epoch": 4.56, + "grad_norm": 4.260620594024658, + "learning_rate": 1.9014493426550373e-07, + "logits/chosen": -0.517827033996582, + "logits/rejected": -0.6057778596878052, + "logps/chosen": -52.26544952392578, + "logps/rejected": -93.30489349365234, + "loss": 0.7061, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8262529373168945, + "rewards/margins": 6.6578826904296875, + "rewards/rejected": -3.8316290378570557, + "step": 18225 + }, + { + "epoch": 4.56, + "grad_norm": 4.806575775146484, + "learning_rate": 1.899302999478364e-07, + "logits/chosen": -0.5287652015686035, + "logits/rejected": -0.6220344305038452, + "logps/chosen": -47.07074737548828, + "logps/rejected": -98.24366760253906, + "loss": 0.5746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3156962394714355, + "rewards/margins": 6.497823715209961, + "rewards/rejected": -3.1821274757385254, + "step": 18226 + }, + { + "epoch": 4.56, + "grad_norm": 4.809284687042236, + "learning_rate": 1.8971578449210826e-07, + "logits/chosen": -0.5770895481109619, + "logits/rejected": -0.6372000575065613, + "logps/chosen": -55.31499099731445, + "logps/rejected": -111.47132873535156, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.082059144973755, + "rewards/margins": 7.30130672454834, + "rewards/rejected": -4.219247817993164, + "step": 18227 + }, + { + "epoch": 4.56, + "grad_norm": 7.43888521194458, + "learning_rate": 1.8950138790361837e-07, + "logits/chosen": -0.5509317517280579, + "logits/rejected": -0.6507717967033386, + "logps/chosen": -59.57575988769531, + "logps/rejected": -95.27963256835938, + "loss": 0.5682, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4504873752593994, + "rewards/margins": 7.722897052764893, + "rewards/rejected": -4.272409439086914, + "step": 18228 + }, + { + "epoch": 4.56, + "grad_norm": 8.48099136352539, + "learning_rate": 1.89287110187667e-07, + "logits/chosen": -0.5942126512527466, + "logits/rejected": -0.6973016858100891, + "logps/chosen": -52.92121505737305, + "logps/rejected": -111.3710708618164, + "loss": 0.5708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4321212768554688, + "rewards/margins": 7.165467739105225, + "rewards/rejected": -3.733346462249756, + "step": 18229 + }, + { + "epoch": 4.56, + "grad_norm": 5.474620819091797, + "learning_rate": 1.8907295134954818e-07, + "logits/chosen": -0.5287256240844727, + "logits/rejected": -0.6238645911216736, + "logps/chosen": -54.43867111206055, + "logps/rejected": -115.59728240966797, + "loss": 0.6162, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2920279502868652, + "rewards/margins": 7.851433277130127, + "rewards/rejected": -4.5594048500061035, + "step": 18230 + }, + { + "epoch": 4.56, + "grad_norm": 2.9787099361419678, + "learning_rate": 1.888589113945527e-07, + "logits/chosen": -0.5992598533630371, + "logits/rejected": -0.6786684989929199, + "logps/chosen": -42.20663833618164, + "logps/rejected": -87.52733612060547, + "loss": 0.5842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1627233028411865, + "rewards/margins": 6.178180694580078, + "rewards/rejected": -3.0154571533203125, + "step": 18231 + }, + { + "epoch": 4.56, + "grad_norm": 11.34990406036377, + "learning_rate": 1.886449903279719e-07, + "logits/chosen": -0.4959965944290161, + "logits/rejected": -0.593907356262207, + "logps/chosen": -67.16738891601562, + "logps/rejected": -105.38325500488281, + "loss": 0.707, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1151931285858154, + "rewards/margins": 6.577092170715332, + "rewards/rejected": -3.461899757385254, + "step": 18232 + }, + { + "epoch": 4.56, + "grad_norm": 6.135507583618164, + "learning_rate": 1.8843118815509043e-07, + "logits/chosen": -0.5086578130722046, + "logits/rejected": -0.606738805770874, + "logps/chosen": -55.298980712890625, + "logps/rejected": -106.73226928710938, + "loss": 0.6218, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9210548400878906, + "rewards/margins": 7.281594276428223, + "rewards/rejected": -4.360539436340332, + "step": 18233 + }, + { + "epoch": 4.56, + "grad_norm": 5.904098033905029, + "learning_rate": 1.8821750488119183e-07, + "logits/chosen": -0.6084275841712952, + "logits/rejected": -0.7144932746887207, + "logps/chosen": -56.50815200805664, + "logps/rejected": -95.9659652709961, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.084493637084961, + "rewards/margins": 7.9972004890441895, + "rewards/rejected": -4.912707328796387, + "step": 18234 + }, + { + "epoch": 4.56, + "grad_norm": 5.443840026855469, + "learning_rate": 1.880039405115569e-07, + "logits/chosen": -0.536997377872467, + "logits/rejected": -0.6032249927520752, + "logps/chosen": -56.38581848144531, + "logps/rejected": -117.02230072021484, + "loss": 0.6111, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9052181243896484, + "rewards/margins": 6.797741889953613, + "rewards/rejected": -3.892524242401123, + "step": 18235 + }, + { + "epoch": 4.56, + "grad_norm": 3.78804612159729, + "learning_rate": 1.8779049505146362e-07, + "logits/chosen": -0.5483108758926392, + "logits/rejected": -0.6557311415672302, + "logps/chosen": -59.96282196044922, + "logps/rejected": -100.56763458251953, + "loss": 0.6028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6719136238098145, + "rewards/margins": 7.147396087646484, + "rewards/rejected": -4.47548246383667, + "step": 18236 + }, + { + "epoch": 4.56, + "grad_norm": 9.808472633361816, + "learning_rate": 1.8757716850618502e-07, + "logits/chosen": -0.5074982643127441, + "logits/rejected": -0.5759168267250061, + "logps/chosen": -47.80022430419922, + "logps/rejected": -119.21757507324219, + "loss": 0.6178, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.222090005874634, + "rewards/margins": 7.755568981170654, + "rewards/rejected": -4.5334792137146, + "step": 18237 + }, + { + "epoch": 4.56, + "grad_norm": 2.6819076538085938, + "learning_rate": 1.8736396088099296e-07, + "logits/chosen": -0.5606818199157715, + "logits/rejected": -0.6941572427749634, + "logps/chosen": -71.39386749267578, + "logps/rejected": -119.416259765625, + "loss": 0.5855, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.072937488555908, + "rewards/margins": 8.624394416809082, + "rewards/rejected": -5.551457405090332, + "step": 18238 + }, + { + "epoch": 4.56, + "grad_norm": 4.574248313903809, + "learning_rate": 1.8715087218115713e-07, + "logits/chosen": -0.5777029991149902, + "logits/rejected": -0.625768780708313, + "logps/chosen": -51.42770004272461, + "logps/rejected": -106.43585968017578, + "loss": 0.6287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1570680141448975, + "rewards/margins": 7.160254955291748, + "rewards/rejected": -4.0031867027282715, + "step": 18239 + }, + { + "epoch": 4.56, + "grad_norm": 3.1381475925445557, + "learning_rate": 1.8693790241194277e-07, + "logits/chosen": -0.5507274866104126, + "logits/rejected": -0.6336425542831421, + "logps/chosen": -47.95977783203125, + "logps/rejected": -100.56642150878906, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2585060596466064, + "rewards/margins": 6.99566650390625, + "rewards/rejected": -3.7371599674224854, + "step": 18240 + }, + { + "epoch": 4.56, + "grad_norm": 4.239643573760986, + "learning_rate": 1.867250515786112e-07, + "logits/chosen": -0.601869523525238, + "logits/rejected": -0.6652663946151733, + "logps/chosen": -54.196434020996094, + "logps/rejected": -124.09724426269531, + "loss": 0.5556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.59204363822937, + "rewards/margins": 8.57988166809082, + "rewards/rejected": -4.987837314605713, + "step": 18241 + }, + { + "epoch": 4.56, + "grad_norm": 2.983945846557617, + "learning_rate": 1.8651231968642382e-07, + "logits/chosen": -0.5861782431602478, + "logits/rejected": -0.6719192266464233, + "logps/chosen": -54.23989486694336, + "logps/rejected": -100.90308380126953, + "loss": 0.5552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.154539108276367, + "rewards/margins": 6.670567035675049, + "rewards/rejected": -3.5160279273986816, + "step": 18242 + }, + { + "epoch": 4.56, + "grad_norm": 2.97243332862854, + "learning_rate": 1.8629970674063691e-07, + "logits/chosen": -0.5298736691474915, + "logits/rejected": -0.6677585244178772, + "logps/chosen": -54.73925018310547, + "logps/rejected": -113.0428466796875, + "loss": 0.5544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.219350814819336, + "rewards/margins": 8.689048767089844, + "rewards/rejected": -5.469698429107666, + "step": 18243 + }, + { + "epoch": 4.56, + "grad_norm": 5.367715835571289, + "learning_rate": 1.860872127465041e-07, + "logits/chosen": -0.563642144203186, + "logits/rejected": -0.689200758934021, + "logps/chosen": -54.094547271728516, + "logps/rejected": -95.2876968383789, + "loss": 0.5631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.062446117401123, + "rewards/margins": 7.568247318267822, + "rewards/rejected": -4.505801200866699, + "step": 18244 + }, + { + "epoch": 4.56, + "grad_norm": 5.294567584991455, + "learning_rate": 1.8587483770927673e-07, + "logits/chosen": -0.5333365201950073, + "logits/rejected": -0.5993310213088989, + "logps/chosen": -52.83091735839844, + "logps/rejected": -110.35313415527344, + "loss": 0.6434, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9419631958007812, + "rewards/margins": 7.148872375488281, + "rewards/rejected": -4.206909656524658, + "step": 18245 + }, + { + "epoch": 4.56, + "grad_norm": 9.955294609069824, + "learning_rate": 1.8566258163420226e-07, + "logits/chosen": -0.5067523717880249, + "logits/rejected": -0.5946601629257202, + "logps/chosen": -45.16436004638672, + "logps/rejected": -108.64712524414062, + "loss": 0.5662, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8882718086242676, + "rewards/margins": 7.210723400115967, + "rewards/rejected": -4.322451591491699, + "step": 18246 + }, + { + "epoch": 4.56, + "grad_norm": 2.135575532913208, + "learning_rate": 1.8545044452652649e-07, + "logits/chosen": -0.5477812886238098, + "logits/rejected": -0.5667043924331665, + "logps/chosen": -48.894901275634766, + "logps/rejected": -120.10081481933594, + "loss": 0.5146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.297863483428955, + "rewards/margins": 8.597949981689453, + "rewards/rejected": -5.3000874519348145, + "step": 18247 + }, + { + "epoch": 4.56, + "grad_norm": 6.229990482330322, + "learning_rate": 1.852384263914908e-07, + "logits/chosen": -0.6222416162490845, + "logits/rejected": -0.6611720323562622, + "logps/chosen": -56.43876266479492, + "logps/rejected": -97.49446868896484, + "loss": 0.6632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1819159984588623, + "rewards/margins": 6.531795024871826, + "rewards/rejected": -3.3498783111572266, + "step": 18248 + }, + { + "epoch": 4.57, + "grad_norm": 5.969093322753906, + "learning_rate": 1.8502652723433546e-07, + "logits/chosen": -0.5588986277580261, + "logits/rejected": -0.6265602111816406, + "logps/chosen": -65.88660430908203, + "logps/rejected": -129.50506591796875, + "loss": 0.6825, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.332180976867676, + "rewards/margins": 7.2187581062316895, + "rewards/rejected": -3.8865773677825928, + "step": 18249 + }, + { + "epoch": 4.57, + "grad_norm": 3.6629984378814697, + "learning_rate": 1.8481474706029568e-07, + "logits/chosen": -0.5235652923583984, + "logits/rejected": -0.611964762210846, + "logps/chosen": -57.98914337158203, + "logps/rejected": -118.27046203613281, + "loss": 0.5941, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1064465045928955, + "rewards/margins": 8.813233375549316, + "rewards/rejected": -5.706786632537842, + "step": 18250 + }, + { + "epoch": 4.57, + "grad_norm": 3.7621679306030273, + "learning_rate": 1.8460308587460508e-07, + "logits/chosen": -0.5974757075309753, + "logits/rejected": -0.6615467071533203, + "logps/chosen": -46.599327087402344, + "logps/rejected": -98.19161224365234, + "loss": 0.547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0965867042541504, + "rewards/margins": 7.622647762298584, + "rewards/rejected": -4.526060581207275, + "step": 18251 + }, + { + "epoch": 4.57, + "grad_norm": 4.812653541564941, + "learning_rate": 1.8439154368249445e-07, + "logits/chosen": -0.620180070400238, + "logits/rejected": -0.6830325126647949, + "logps/chosen": -53.77851867675781, + "logps/rejected": -110.3973159790039, + "loss": 0.6281, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.042100191116333, + "rewards/margins": 7.379062652587891, + "rewards/rejected": -4.3369622230529785, + "step": 18252 + }, + { + "epoch": 4.57, + "grad_norm": 3.711168050765991, + "learning_rate": 1.841801204891902e-07, + "logits/chosen": -0.5084037780761719, + "logits/rejected": -0.6696888208389282, + "logps/chosen": -54.91887664794922, + "logps/rejected": -113.05658721923828, + "loss": 0.6207, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.885021924972534, + "rewards/margins": 8.562448501586914, + "rewards/rejected": -5.677426338195801, + "step": 18253 + }, + { + "epoch": 4.57, + "grad_norm": 2.858435869216919, + "learning_rate": 1.839688162999187e-07, + "logits/chosen": -0.495650976896286, + "logits/rejected": -0.5557230710983276, + "logps/chosen": -53.15972900390625, + "logps/rejected": -113.11639404296875, + "loss": 0.5629, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0686051845550537, + "rewards/margins": 8.642857551574707, + "rewards/rejected": -5.574252605438232, + "step": 18254 + }, + { + "epoch": 4.57, + "grad_norm": 4.30697774887085, + "learning_rate": 1.8375763111989964e-07, + "logits/chosen": -0.5687390565872192, + "logits/rejected": -0.6496170163154602, + "logps/chosen": -62.69322204589844, + "logps/rejected": -124.51300048828125, + "loss": 0.6191, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2944531440734863, + "rewards/margins": 7.800271987915039, + "rewards/rejected": -4.505819320678711, + "step": 18255 + }, + { + "epoch": 4.57, + "grad_norm": 3.7441418170928955, + "learning_rate": 1.8354656495435218e-07, + "logits/chosen": -0.5950093269348145, + "logits/rejected": -0.6240991950035095, + "logps/chosen": -64.78667449951172, + "logps/rejected": -107.87000274658203, + "loss": 0.6726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2678277492523193, + "rewards/margins": 6.579221725463867, + "rewards/rejected": -3.3113934993743896, + "step": 18256 + }, + { + "epoch": 4.57, + "grad_norm": 5.104598045349121, + "learning_rate": 1.833356178084922e-07, + "logits/chosen": -0.5170038342475891, + "logits/rejected": -0.6156729459762573, + "logps/chosen": -55.82494354248047, + "logps/rejected": -84.96585845947266, + "loss": 0.7291, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.866792678833008, + "rewards/margins": 5.617096424102783, + "rewards/rejected": -2.7503037452697754, + "step": 18257 + }, + { + "epoch": 4.57, + "grad_norm": 3.819284200668335, + "learning_rate": 1.8312478968753267e-07, + "logits/chosen": -0.512644350528717, + "logits/rejected": -0.6146617531776428, + "logps/chosen": -63.50922775268555, + "logps/rejected": -105.79125213623047, + "loss": 0.6427, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.125692367553711, + "rewards/margins": 7.232982158660889, + "rewards/rejected": -4.107290267944336, + "step": 18258 + }, + { + "epoch": 4.57, + "grad_norm": 6.717508792877197, + "learning_rate": 1.8291408059668224e-07, + "logits/chosen": -0.5698930621147156, + "logits/rejected": -0.6492028832435608, + "logps/chosen": -51.7088623046875, + "logps/rejected": -96.35391235351562, + "loss": 0.6774, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0890398025512695, + "rewards/margins": 6.611814498901367, + "rewards/rejected": -3.5227746963500977, + "step": 18259 + }, + { + "epoch": 4.57, + "grad_norm": 3.138673782348633, + "learning_rate": 1.82703490541149e-07, + "logits/chosen": -0.5358898639678955, + "logits/rejected": -0.6448124051094055, + "logps/chosen": -56.950836181640625, + "logps/rejected": -105.62186431884766, + "loss": 0.6153, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.846957206726074, + "rewards/margins": 8.166031837463379, + "rewards/rejected": -5.319074630737305, + "step": 18260 + }, + { + "epoch": 4.57, + "grad_norm": 3.3709466457366943, + "learning_rate": 1.8249301952613706e-07, + "logits/chosen": -0.6174113750457764, + "logits/rejected": -0.6963908672332764, + "logps/chosen": -58.209144592285156, + "logps/rejected": -100.5278549194336, + "loss": 0.6517, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121208429336548, + "rewards/margins": 7.224540710449219, + "rewards/rejected": -4.10333251953125, + "step": 18261 + }, + { + "epoch": 4.57, + "grad_norm": 5.9358673095703125, + "learning_rate": 1.8228266755684565e-07, + "logits/chosen": -0.5616833567619324, + "logits/rejected": -0.6467297673225403, + "logps/chosen": -52.76722717285156, + "logps/rejected": -106.52009582519531, + "loss": 0.6332, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.09598445892334, + "rewards/margins": 6.481964588165283, + "rewards/rejected": -3.3859801292419434, + "step": 18262 + }, + { + "epoch": 4.57, + "grad_norm": 3.0492706298828125, + "learning_rate": 1.820724346384739e-07, + "logits/chosen": -0.5555115342140198, + "logits/rejected": -0.629227876663208, + "logps/chosen": -49.297508239746094, + "logps/rejected": -129.6163787841797, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1746249198913574, + "rewards/margins": 9.641134262084961, + "rewards/rejected": -6.466509819030762, + "step": 18263 + }, + { + "epoch": 4.57, + "grad_norm": 3.3173553943634033, + "learning_rate": 1.8186232077621767e-07, + "logits/chosen": -0.6088822484016418, + "logits/rejected": -0.6921413540840149, + "logps/chosen": -56.2524299621582, + "logps/rejected": -95.98860168457031, + "loss": 0.6443, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1535379886627197, + "rewards/margins": 6.733147621154785, + "rewards/rejected": -3.5796096324920654, + "step": 18264 + }, + { + "epoch": 4.57, + "grad_norm": 2.4310009479522705, + "learning_rate": 1.816523259752684e-07, + "logits/chosen": -0.581384539604187, + "logits/rejected": -0.6611809134483337, + "logps/chosen": -51.8959846496582, + "logps/rejected": -102.1678466796875, + "loss": 0.5651, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0723748207092285, + "rewards/margins": 8.448077201843262, + "rewards/rejected": -5.375701904296875, + "step": 18265 + }, + { + "epoch": 4.57, + "grad_norm": 6.4872212409973145, + "learning_rate": 1.814424502408141e-07, + "logits/chosen": -0.6210479140281677, + "logits/rejected": -0.7004233598709106, + "logps/chosen": -57.527008056640625, + "logps/rejected": -100.45355987548828, + "loss": 0.6529, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5955164432525635, + "rewards/margins": 6.825607776641846, + "rewards/rejected": -4.230091571807861, + "step": 18266 + }, + { + "epoch": 4.57, + "grad_norm": 2.5176122188568115, + "learning_rate": 1.812326935780434e-07, + "logits/chosen": -0.6418133974075317, + "logits/rejected": -0.6971865296363831, + "logps/chosen": -44.211524963378906, + "logps/rejected": -96.69612884521484, + "loss": 0.5574, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.163860559463501, + "rewards/margins": 6.945727348327637, + "rewards/rejected": -3.781867742538452, + "step": 18267 + }, + { + "epoch": 4.57, + "grad_norm": 3.517359733581543, + "learning_rate": 1.8102305599213777e-07, + "logits/chosen": -0.5627418756484985, + "logits/rejected": -0.6535981893539429, + "logps/chosen": -60.79307556152344, + "logps/rejected": -105.40930938720703, + "loss": 0.6194, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0279579162597656, + "rewards/margins": 7.383974075317383, + "rewards/rejected": -4.356015682220459, + "step": 18268 + }, + { + "epoch": 4.57, + "grad_norm": 5.6821417808532715, + "learning_rate": 1.8081353748827744e-07, + "logits/chosen": -0.6020156741142273, + "logits/rejected": -0.674787700176239, + "logps/chosen": -47.971378326416016, + "logps/rejected": -119.17170715332031, + "loss": 0.5922, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2916219234466553, + "rewards/margins": 8.199275016784668, + "rewards/rejected": -4.90765380859375, + "step": 18269 + }, + { + "epoch": 4.57, + "grad_norm": 4.2106547355651855, + "learning_rate": 1.806041380716411e-07, + "logits/chosen": -0.5229510068893433, + "logits/rejected": -0.6378929615020752, + "logps/chosen": -56.95619583129883, + "logps/rejected": -97.66046142578125, + "loss": 0.5599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7765636444091797, + "rewards/margins": 7.030982971191406, + "rewards/rejected": -4.254419326782227, + "step": 18270 + }, + { + "epoch": 4.57, + "grad_norm": 9.54987621307373, + "learning_rate": 1.803948577474024e-07, + "logits/chosen": -0.5178957581520081, + "logits/rejected": -0.609297513961792, + "logps/chosen": -62.53620910644531, + "logps/rejected": -106.60333251953125, + "loss": 0.6342, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9232213497161865, + "rewards/margins": 6.98963737487793, + "rewards/rejected": -4.066415786743164, + "step": 18271 + }, + { + "epoch": 4.57, + "grad_norm": 3.628880262374878, + "learning_rate": 1.801856965207338e-07, + "logits/chosen": -0.5420863032341003, + "logits/rejected": -0.6338105797767639, + "logps/chosen": -59.024410247802734, + "logps/rejected": -104.25399780273438, + "loss": 0.5778, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0977883338928223, + "rewards/margins": 7.150674819946289, + "rewards/rejected": -4.052886486053467, + "step": 18272 + }, + { + "epoch": 4.57, + "grad_norm": 1.8530347347259521, + "learning_rate": 1.799766543968029e-07, + "logits/chosen": -0.5137972235679626, + "logits/rejected": -0.6382073760032654, + "logps/chosen": -50.298458099365234, + "logps/rejected": -115.19917297363281, + "loss": 0.5629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0043606758117676, + "rewards/margins": 8.914571762084961, + "rewards/rejected": -5.910210609436035, + "step": 18273 + }, + { + "epoch": 4.57, + "grad_norm": 17.26846694946289, + "learning_rate": 1.7976773138077496e-07, + "logits/chosen": -0.550109326839447, + "logits/rejected": -0.614482581615448, + "logps/chosen": -57.699859619140625, + "logps/rejected": -107.67263793945312, + "loss": 0.7243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9746763706207275, + "rewards/margins": 6.809484481811523, + "rewards/rejected": -3.834807872772217, + "step": 18274 + }, + { + "epoch": 4.57, + "grad_norm": 5.551483631134033, + "learning_rate": 1.7955892747781422e-07, + "logits/chosen": -0.5681109428405762, + "logits/rejected": -0.6640033721923828, + "logps/chosen": -46.34413146972656, + "logps/rejected": -118.22946166992188, + "loss": 0.5254, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0944557189941406, + "rewards/margins": 7.276613712310791, + "rewards/rejected": -4.182157516479492, + "step": 18275 + }, + { + "epoch": 4.57, + "grad_norm": 3.926084041595459, + "learning_rate": 1.7935024269307987e-07, + "logits/chosen": -0.6158742904663086, + "logits/rejected": -0.6936678290367126, + "logps/chosen": -53.45280456542969, + "logps/rejected": -103.85041809082031, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3171098232269287, + "rewards/margins": 7.510490894317627, + "rewards/rejected": -4.193381309509277, + "step": 18276 + }, + { + "epoch": 4.57, + "grad_norm": 5.3039870262146, + "learning_rate": 1.791416770317278e-07, + "logits/chosen": -0.5844660401344299, + "logits/rejected": -0.6633360385894775, + "logps/chosen": -66.62535858154297, + "logps/rejected": -105.028076171875, + "loss": 0.6611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0781137943267822, + "rewards/margins": 6.288520812988281, + "rewards/rejected": -3.210406541824341, + "step": 18277 + }, + { + "epoch": 4.57, + "grad_norm": 8.041677474975586, + "learning_rate": 1.789332304989122e-07, + "logits/chosen": -0.563711404800415, + "logits/rejected": -0.6510930061340332, + "logps/chosen": -65.96574401855469, + "logps/rejected": -92.47767639160156, + "loss": 0.6944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8707900047302246, + "rewards/margins": 6.189401149749756, + "rewards/rejected": -3.318610906600952, + "step": 18278 + }, + { + "epoch": 4.57, + "grad_norm": 4.5839996337890625, + "learning_rate": 1.7872490309978509e-07, + "logits/chosen": -0.6397374272346497, + "logits/rejected": -0.6867181062698364, + "logps/chosen": -51.2646598815918, + "logps/rejected": -114.18357849121094, + "loss": 0.5921, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3324356079101562, + "rewards/margins": 7.701424598693848, + "rewards/rejected": -4.368988990783691, + "step": 18279 + }, + { + "epoch": 4.57, + "grad_norm": 4.282309055328369, + "learning_rate": 1.7851669483949397e-07, + "logits/chosen": -0.48573219776153564, + "logits/rejected": -0.5473029613494873, + "logps/chosen": -64.69146728515625, + "logps/rejected": -99.74577331542969, + "loss": 0.6355, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.542344331741333, + "rewards/margins": 5.713862419128418, + "rewards/rejected": -2.171518087387085, + "step": 18280 + }, + { + "epoch": 4.57, + "grad_norm": 3.72151780128479, + "learning_rate": 1.7830860572318308e-07, + "logits/chosen": -0.5815939903259277, + "logits/rejected": -0.6738535165786743, + "logps/chosen": -57.80753707885742, + "logps/rejected": -100.1130142211914, + "loss": 0.6178, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.981275796890259, + "rewards/margins": 7.21433162689209, + "rewards/rejected": -4.23305606842041, + "step": 18281 + }, + { + "epoch": 4.57, + "grad_norm": 3.179765224456787, + "learning_rate": 1.7810063575599555e-07, + "logits/chosen": -0.6615468263626099, + "logits/rejected": -0.7359054684638977, + "logps/chosen": -50.560638427734375, + "logps/rejected": -99.47196197509766, + "loss": 0.6222, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.297795534133911, + "rewards/margins": 7.666676998138428, + "rewards/rejected": -4.3688812255859375, + "step": 18282 + }, + { + "epoch": 4.57, + "grad_norm": 5.218745708465576, + "learning_rate": 1.778927849430706e-07, + "logits/chosen": -0.5073465704917908, + "logits/rejected": -0.5741459131240845, + "logps/chosen": -49.4757080078125, + "logps/rejected": -100.20750427246094, + "loss": 0.6427, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2501323223114014, + "rewards/margins": 6.782665729522705, + "rewards/rejected": -3.532533645629883, + "step": 18283 + }, + { + "epoch": 4.57, + "grad_norm": 6.791336536407471, + "learning_rate": 1.77685053289543e-07, + "logits/chosen": -0.5236823558807373, + "logits/rejected": -0.6186723709106445, + "logps/chosen": -94.73452758789062, + "logps/rejected": -125.21280670166016, + "loss": 0.7704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.926499843597412, + "rewards/margins": 8.610249519348145, + "rewards/rejected": -5.683750152587891, + "step": 18284 + }, + { + "epoch": 4.57, + "grad_norm": 4.205883502960205, + "learning_rate": 1.7747744080054752e-07, + "logits/chosen": -0.4862755537033081, + "logits/rejected": -0.5606053471565247, + "logps/chosen": -53.888668060302734, + "logps/rejected": -108.04812622070312, + "loss": 0.6114, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5056991577148438, + "rewards/margins": 7.672170639038086, + "rewards/rejected": -4.1664719581604, + "step": 18285 + }, + { + "epoch": 4.57, + "grad_norm": 5.524328231811523, + "learning_rate": 1.7726994748121397e-07, + "logits/chosen": -0.5591017603874207, + "logits/rejected": -0.6340837478637695, + "logps/chosen": -55.47285461425781, + "logps/rejected": -119.86105346679688, + "loss": 0.591, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.669795513153076, + "rewards/margins": 8.276310920715332, + "rewards/rejected": -5.606515407562256, + "step": 18286 + }, + { + "epoch": 4.57, + "grad_norm": 6.873175144195557, + "learning_rate": 1.7706257333666877e-07, + "logits/chosen": -0.5167210102081299, + "logits/rejected": -0.5966264009475708, + "logps/chosen": -53.45561599731445, + "logps/rejected": -111.1011962890625, + "loss": 0.5632, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.919793128967285, + "rewards/margins": 7.582858085632324, + "rewards/rejected": -4.6630659103393555, + "step": 18287 + }, + { + "epoch": 4.57, + "grad_norm": 5.888869762420654, + "learning_rate": 1.768553183720384e-07, + "logits/chosen": -0.5808404684066772, + "logits/rejected": -0.6606540679931641, + "logps/chosen": -51.584102630615234, + "logps/rejected": -102.30642700195312, + "loss": 0.6614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4265694618225098, + "rewards/margins": 7.881821155548096, + "rewards/rejected": -4.455252170562744, + "step": 18288 + }, + { + "epoch": 4.58, + "grad_norm": 4.655566215515137, + "learning_rate": 1.7664818259244264e-07, + "logits/chosen": -0.5141530632972717, + "logits/rejected": -0.5856257677078247, + "logps/chosen": -54.27638244628906, + "logps/rejected": -119.23406219482422, + "loss": 0.6381, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9522669315338135, + "rewards/margins": 7.6860809326171875, + "rewards/rejected": -4.733814239501953, + "step": 18289 + }, + { + "epoch": 4.58, + "grad_norm": 7.152895927429199, + "learning_rate": 1.7644116600300075e-07, + "logits/chosen": -0.6197853088378906, + "logits/rejected": -0.6994513869285583, + "logps/chosen": -72.6635971069336, + "logps/rejected": -97.75494384765625, + "loss": 0.7575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.849658489227295, + "rewards/margins": 5.689444541931152, + "rewards/rejected": -2.839785575866699, + "step": 18290 + }, + { + "epoch": 4.58, + "grad_norm": 15.295324325561523, + "learning_rate": 1.7623426860882807e-07, + "logits/chosen": -0.5234680771827698, + "logits/rejected": -0.6062479615211487, + "logps/chosen": -62.94636917114258, + "logps/rejected": -94.66333770751953, + "loss": 0.7627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0073537826538086, + "rewards/margins": 5.8733296394348145, + "rewards/rejected": -2.8659756183624268, + "step": 18291 + }, + { + "epoch": 4.58, + "grad_norm": 6.582785606384277, + "learning_rate": 1.7602749041503775e-07, + "logits/chosen": -0.5840510129928589, + "logits/rejected": -0.6521652340888977, + "logps/chosen": -66.73042297363281, + "logps/rejected": -89.28263854980469, + "loss": 0.71, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.172290325164795, + "rewards/margins": 5.862598896026611, + "rewards/rejected": -2.6903085708618164, + "step": 18292 + }, + { + "epoch": 4.58, + "grad_norm": 2.909494161605835, + "learning_rate": 1.75820831426739e-07, + "logits/chosen": -0.5851656198501587, + "logits/rejected": -0.6762390732765198, + "logps/chosen": -42.13054656982422, + "logps/rejected": -103.34837341308594, + "loss": 0.5421, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.61722731590271, + "rewards/margins": 8.795967102050781, + "rewards/rejected": -5.17874002456665, + "step": 18293 + }, + { + "epoch": 4.58, + "grad_norm": 3.5725386142730713, + "learning_rate": 1.756142916490383e-07, + "logits/chosen": -0.5551269054412842, + "logits/rejected": -0.6472506523132324, + "logps/chosen": -52.40080642700195, + "logps/rejected": -98.80078887939453, + "loss": 0.5886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.884021282196045, + "rewards/margins": 6.849803924560547, + "rewards/rejected": -3.965782403945923, + "step": 18294 + }, + { + "epoch": 4.58, + "grad_norm": 4.844062328338623, + "learning_rate": 1.7540787108704104e-07, + "logits/chosen": -0.5641948580741882, + "logits/rejected": -0.6046736836433411, + "logps/chosen": -53.89176559448242, + "logps/rejected": -116.05377197265625, + "loss": 0.6905, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.11960506439209, + "rewards/margins": 6.67327880859375, + "rewards/rejected": -3.553673505783081, + "step": 18295 + }, + { + "epoch": 4.58, + "grad_norm": 4.161301612854004, + "learning_rate": 1.752015697458459e-07, + "logits/chosen": -0.6258838772773743, + "logits/rejected": -0.6460707783699036, + "logps/chosen": -61.45621109008789, + "logps/rejected": -110.85091400146484, + "loss": 0.6724, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.999138593673706, + "rewards/margins": 6.532116889953613, + "rewards/rejected": -3.532977819442749, + "step": 18296 + }, + { + "epoch": 4.58, + "grad_norm": 18.677289962768555, + "learning_rate": 1.7499538763055212e-07, + "logits/chosen": -0.5454238057136536, + "logits/rejected": -0.6084532737731934, + "logps/chosen": -63.55055236816406, + "logps/rejected": -114.80946350097656, + "loss": 0.6958, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1259756088256836, + "rewards/margins": 7.1108808517456055, + "rewards/rejected": -3.984905242919922, + "step": 18297 + }, + { + "epoch": 4.58, + "grad_norm": 2.942640781402588, + "learning_rate": 1.7478932474625453e-07, + "logits/chosen": -0.5169322490692139, + "logits/rejected": -0.6033751368522644, + "logps/chosen": -51.10095977783203, + "logps/rejected": -121.34270477294922, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9398727416992188, + "rewards/margins": 8.367267608642578, + "rewards/rejected": -5.427395343780518, + "step": 18298 + }, + { + "epoch": 4.58, + "grad_norm": 4.630238056182861, + "learning_rate": 1.7458338109804462e-07, + "logits/chosen": -0.5558000206947327, + "logits/rejected": -0.609506368637085, + "logps/chosen": -47.23102569580078, + "logps/rejected": -111.940673828125, + "loss": 0.6598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3449177742004395, + "rewards/margins": 6.916900157928467, + "rewards/rejected": -3.571981906890869, + "step": 18299 + }, + { + "epoch": 4.58, + "grad_norm": 3.3718512058258057, + "learning_rate": 1.7437755669101274e-07, + "logits/chosen": -0.5677105784416199, + "logits/rejected": -0.6525896787643433, + "logps/chosen": -57.48230743408203, + "logps/rejected": -93.9894790649414, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1368489265441895, + "rewards/margins": 6.175713539123535, + "rewards/rejected": -3.038865089416504, + "step": 18300 + }, + { + "epoch": 4.58, + "grad_norm": 2.248722553253174, + "learning_rate": 1.741718515302443e-07, + "logits/chosen": -0.5879087448120117, + "logits/rejected": -0.6854479312896729, + "logps/chosen": -48.75877380371094, + "logps/rejected": -103.68894958496094, + "loss": 0.5498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.833277702331543, + "rewards/margins": 7.494534015655518, + "rewards/rejected": -4.661255836486816, + "step": 18301 + }, + { + "epoch": 4.58, + "grad_norm": 5.056125164031982, + "learning_rate": 1.739662656208213e-07, + "logits/chosen": -0.4981514513492584, + "logits/rejected": -0.5637444257736206, + "logps/chosen": -55.61366271972656, + "logps/rejected": -98.2184066772461, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0119783878326416, + "rewards/margins": 6.109232425689697, + "rewards/rejected": -3.0972535610198975, + "step": 18302 + }, + { + "epoch": 4.58, + "grad_norm": 7.265672206878662, + "learning_rate": 1.7376079896782527e-07, + "logits/chosen": -0.583118200302124, + "logits/rejected": -0.6423441171646118, + "logps/chosen": -53.74693298339844, + "logps/rejected": -115.04442596435547, + "loss": 0.6688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994718551635742, + "rewards/margins": 7.498231887817383, + "rewards/rejected": -4.503514289855957, + "step": 18303 + }, + { + "epoch": 4.58, + "grad_norm": 5.820254802703857, + "learning_rate": 1.7355545157633436e-07, + "logits/chosen": -0.6047826409339905, + "logits/rejected": -0.7222321629524231, + "logps/chosen": -61.86464309692383, + "logps/rejected": -94.3000717163086, + "loss": 0.6765, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0913496017456055, + "rewards/margins": 6.380003452301025, + "rewards/rejected": -3.28865385055542, + "step": 18304 + }, + { + "epoch": 4.58, + "grad_norm": 7.488094329833984, + "learning_rate": 1.733502234514206e-07, + "logits/chosen": -0.6021647453308105, + "logits/rejected": -0.7072374820709229, + "logps/chosen": -49.661075592041016, + "logps/rejected": -95.70516204833984, + "loss": 0.6426, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8563997745513916, + "rewards/margins": 7.397613048553467, + "rewards/rejected": -4.541212558746338, + "step": 18305 + }, + { + "epoch": 4.58, + "grad_norm": 5.353944778442383, + "learning_rate": 1.731451145981572e-07, + "logits/chosen": -0.5376564264297485, + "logits/rejected": -0.5938591361045837, + "logps/chosen": -47.560333251953125, + "logps/rejected": -123.44581604003906, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2451436519622803, + "rewards/margins": 8.763258934020996, + "rewards/rejected": -5.518115997314453, + "step": 18306 + }, + { + "epoch": 4.58, + "grad_norm": 3.014437198638916, + "learning_rate": 1.7294012502161283e-07, + "logits/chosen": -0.546268880367279, + "logits/rejected": -0.6048256158828735, + "logps/chosen": -42.70588684082031, + "logps/rejected": -117.52411651611328, + "loss": 0.5178, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0431816577911377, + "rewards/margins": 9.146430015563965, + "rewards/rejected": -6.103248596191406, + "step": 18307 + }, + { + "epoch": 4.58, + "grad_norm": 3.1712255477905273, + "learning_rate": 1.7273525472685127e-07, + "logits/chosen": -0.5117464661598206, + "logits/rejected": -0.618658721446991, + "logps/chosen": -62.356197357177734, + "logps/rejected": -96.40422058105469, + "loss": 0.5697, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.172680139541626, + "rewards/margins": 6.7590012550354, + "rewards/rejected": -3.5863208770751953, + "step": 18308 + }, + { + "epoch": 4.58, + "grad_norm": 2.255066394805908, + "learning_rate": 1.725305037189362e-07, + "logits/chosen": -0.6101360321044922, + "logits/rejected": -0.7216295003890991, + "logps/chosen": -61.583221435546875, + "logps/rejected": -104.82767486572266, + "loss": 0.6189, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7430648803710938, + "rewards/margins": 7.651266574859619, + "rewards/rejected": -4.908201694488525, + "step": 18309 + }, + { + "epoch": 4.58, + "grad_norm": 3.555305242538452, + "learning_rate": 1.723258720029275e-07, + "logits/chosen": -0.5204286575317383, + "logits/rejected": -0.6128858923912048, + "logps/chosen": -58.80555725097656, + "logps/rejected": -118.11377716064453, + "loss": 0.5843, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.993438482284546, + "rewards/margins": 7.200514316558838, + "rewards/rejected": -4.207076072692871, + "step": 18310 + }, + { + "epoch": 4.58, + "grad_norm": 5.28959321975708, + "learning_rate": 1.721213595838811e-07, + "logits/chosen": -0.5413283109664917, + "logits/rejected": -0.5756720900535583, + "logps/chosen": -59.19730758666992, + "logps/rejected": -122.16194152832031, + "loss": 0.6512, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85054874420166, + "rewards/margins": 7.667722702026367, + "rewards/rejected": -4.817173957824707, + "step": 18311 + }, + { + "epoch": 4.58, + "grad_norm": 3.4549193382263184, + "learning_rate": 1.7191696646685073e-07, + "logits/chosen": -0.6217923760414124, + "logits/rejected": -0.6914064884185791, + "logps/chosen": -51.80438232421875, + "logps/rejected": -111.75648498535156, + "loss": 0.6003, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.383115768432617, + "rewards/margins": 7.558597564697266, + "rewards/rejected": -4.175481796264648, + "step": 18312 + }, + { + "epoch": 4.58, + "grad_norm": 24.933673858642578, + "learning_rate": 1.717126926568885e-07, + "logits/chosen": -0.6029844880104065, + "logits/rejected": -0.6990803480148315, + "logps/chosen": -49.991249084472656, + "logps/rejected": -91.15151977539062, + "loss": 0.6505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9742519855499268, + "rewards/margins": 6.454846382141113, + "rewards/rejected": -3.4805939197540283, + "step": 18313 + }, + { + "epoch": 4.58, + "grad_norm": 2.5153932571411133, + "learning_rate": 1.715085381590409e-07, + "logits/chosen": -0.5115991234779358, + "logits/rejected": -0.5845469832420349, + "logps/chosen": -58.20185470581055, + "logps/rejected": -118.38868713378906, + "loss": 0.611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.193006992340088, + "rewards/margins": 7.826539516448975, + "rewards/rejected": -4.6335320472717285, + "step": 18314 + }, + { + "epoch": 4.58, + "grad_norm": 4.217438220977783, + "learning_rate": 1.7130450297835278e-07, + "logits/chosen": -0.554078221321106, + "logits/rejected": -0.6441551446914673, + "logps/chosen": -48.856197357177734, + "logps/rejected": -93.94102478027344, + "loss": 0.6024, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.265916347503662, + "rewards/margins": 6.065586566925049, + "rewards/rejected": -2.7996702194213867, + "step": 18315 + }, + { + "epoch": 4.58, + "grad_norm": 6.433277130126953, + "learning_rate": 1.7110058711986676e-07, + "logits/chosen": -0.5926988124847412, + "logits/rejected": -0.6699543595314026, + "logps/chosen": -58.47871780395508, + "logps/rejected": -141.0194549560547, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.752298593521118, + "rewards/margins": 8.458663940429688, + "rewards/rejected": -5.70636510848999, + "step": 18316 + }, + { + "epoch": 4.58, + "grad_norm": 16.901931762695312, + "learning_rate": 1.708967905886205e-07, + "logits/chosen": -0.5301031470298767, + "logits/rejected": -0.6123226284980774, + "logps/chosen": -60.96588134765625, + "logps/rejected": -118.05300903320312, + "loss": 0.7096, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9553427696228027, + "rewards/margins": 7.395410537719727, + "rewards/rejected": -4.440067291259766, + "step": 18317 + }, + { + "epoch": 4.58, + "grad_norm": 1.5508661270141602, + "learning_rate": 1.7069311338965222e-07, + "logits/chosen": -0.5006189942359924, + "logits/rejected": -0.5976751446723938, + "logps/chosen": -50.49353790283203, + "logps/rejected": -115.65101623535156, + "loss": 0.482, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249769926071167, + "rewards/margins": 8.865381240844727, + "rewards/rejected": -5.615612030029297, + "step": 18318 + }, + { + "epoch": 4.58, + "grad_norm": 2.933077812194824, + "learning_rate": 1.704895555279934e-07, + "logits/chosen": -0.5360811948776245, + "logits/rejected": -0.6213316917419434, + "logps/chosen": -66.29031372070312, + "logps/rejected": -113.22749328613281, + "loss": 0.6034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.903170585632324, + "rewards/margins": 8.00051498413086, + "rewards/rejected": -5.097344875335693, + "step": 18319 + }, + { + "epoch": 4.58, + "grad_norm": 10.659029006958008, + "learning_rate": 1.7028611700867449e-07, + "logits/chosen": -0.5484916567802429, + "logits/rejected": -0.6420708894729614, + "logps/chosen": -57.89664077758789, + "logps/rejected": -88.53546142578125, + "loss": 0.6642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.988165855407715, + "rewards/margins": 6.4584641456604, + "rewards/rejected": -3.4702985286712646, + "step": 18320 + }, + { + "epoch": 4.58, + "grad_norm": 4.946686267852783, + "learning_rate": 1.7008279783672255e-07, + "logits/chosen": -0.4940720200538635, + "logits/rejected": -0.563901424407959, + "logps/chosen": -49.519866943359375, + "logps/rejected": -104.03237915039062, + "loss": 0.5915, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0436694622039795, + "rewards/margins": 6.947238922119141, + "rewards/rejected": -3.903569221496582, + "step": 18321 + }, + { + "epoch": 4.58, + "grad_norm": 5.393827438354492, + "learning_rate": 1.6987959801716193e-07, + "logits/chosen": -0.48565974831581116, + "logits/rejected": -0.5791797637939453, + "logps/chosen": -60.378761291503906, + "logps/rejected": -115.3114013671875, + "loss": 0.6243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9103317260742188, + "rewards/margins": 7.11395263671875, + "rewards/rejected": -4.2036213874816895, + "step": 18322 + }, + { + "epoch": 4.58, + "grad_norm": 3.387565851211548, + "learning_rate": 1.696765175550147e-07, + "logits/chosen": -0.4830978214740753, + "logits/rejected": -0.5731570720672607, + "logps/chosen": -57.36589431762695, + "logps/rejected": -106.55839538574219, + "loss": 0.6287, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1257846355438232, + "rewards/margins": 6.318923473358154, + "rewards/rejected": -3.193138360977173, + "step": 18323 + }, + { + "epoch": 4.58, + "grad_norm": 4.808558940887451, + "learning_rate": 1.6947355645529795e-07, + "logits/chosen": -0.6414210796356201, + "logits/rejected": -0.7038050889968872, + "logps/chosen": -57.029327392578125, + "logps/rejected": -127.4134521484375, + "loss": 0.6417, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.082035779953003, + "rewards/margins": 7.8789381980896, + "rewards/rejected": -4.796901226043701, + "step": 18324 + }, + { + "epoch": 4.58, + "grad_norm": 11.001924514770508, + "learning_rate": 1.692707147230277e-07, + "logits/chosen": -0.5726629495620728, + "logits/rejected": -0.706519603729248, + "logps/chosen": -54.11670684814453, + "logps/rejected": -87.59083557128906, + "loss": 0.6641, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1186673641204834, + "rewards/margins": 6.67947244644165, + "rewards/rejected": -3.560804843902588, + "step": 18325 + }, + { + "epoch": 4.58, + "grad_norm": 5.728771686553955, + "learning_rate": 1.6906799236321658e-07, + "logits/chosen": -0.6019365787506104, + "logits/rejected": -0.7033464312553406, + "logps/chosen": -55.665130615234375, + "logps/rejected": -92.81494140625, + "loss": 0.6236, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.589383363723755, + "rewards/margins": 8.08149242401123, + "rewards/rejected": -4.4921088218688965, + "step": 18326 + }, + { + "epoch": 4.58, + "grad_norm": 4.156133651733398, + "learning_rate": 1.6886538938087337e-07, + "logits/chosen": -0.5134847164154053, + "logits/rejected": -0.6221596002578735, + "logps/chosen": -59.465415954589844, + "logps/rejected": -101.68141174316406, + "loss": 0.6276, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1048216819763184, + "rewards/margins": 7.255372524261475, + "rewards/rejected": -4.150550842285156, + "step": 18327 + }, + { + "epoch": 4.58, + "grad_norm": 3.922112226486206, + "learning_rate": 1.686629057810052e-07, + "logits/chosen": -0.5373233556747437, + "logits/rejected": -0.6334111094474792, + "logps/chosen": -63.956886291503906, + "logps/rejected": -140.04830932617188, + "loss": 0.6242, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225691318511963, + "rewards/margins": 9.063114166259766, + "rewards/rejected": -5.837423324584961, + "step": 18328 + }, + { + "epoch": 4.59, + "grad_norm": 20.79561424255371, + "learning_rate": 1.684605415686158e-07, + "logits/chosen": -0.5279922485351562, + "logits/rejected": -0.5879555344581604, + "logps/chosen": -56.92485809326172, + "logps/rejected": -117.86405944824219, + "loss": 0.749, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0772552490234375, + "rewards/margins": 6.524911403656006, + "rewards/rejected": -3.4476563930511475, + "step": 18329 + }, + { + "epoch": 4.59, + "grad_norm": 6.407917022705078, + "learning_rate": 1.6825829674870453e-07, + "logits/chosen": -0.5344778895378113, + "logits/rejected": -0.6096968054771423, + "logps/chosen": -53.64102554321289, + "logps/rejected": -102.5196533203125, + "loss": 0.5708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0768089294433594, + "rewards/margins": 7.435578346252441, + "rewards/rejected": -4.358768939971924, + "step": 18330 + }, + { + "epoch": 4.59, + "grad_norm": 4.778964996337891, + "learning_rate": 1.6805617132627073e-07, + "logits/chosen": -0.5219403505325317, + "logits/rejected": -0.5712993741035461, + "logps/chosen": -52.31512451171875, + "logps/rejected": -107.97286224365234, + "loss": 0.5502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.365750789642334, + "rewards/margins": 7.604583263397217, + "rewards/rejected": -4.238831996917725, + "step": 18331 + }, + { + "epoch": 4.59, + "grad_norm": 4.809692859649658, + "learning_rate": 1.6785416530630872e-07, + "logits/chosen": -0.5626418590545654, + "logits/rejected": -0.6175543069839478, + "logps/chosen": -55.236724853515625, + "logps/rejected": -120.73353576660156, + "loss": 0.6545, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9077534675598145, + "rewards/margins": 7.564390659332275, + "rewards/rejected": -4.656637191772461, + "step": 18332 + }, + { + "epoch": 4.59, + "grad_norm": 4.1153178215026855, + "learning_rate": 1.6765227869380895e-07, + "logits/chosen": -0.5169060230255127, + "logits/rejected": -0.6324401497840881, + "logps/chosen": -61.82244110107422, + "logps/rejected": -90.31788635253906, + "loss": 0.63, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.6927552223205566, + "rewards/margins": 6.865772724151611, + "rewards/rejected": -3.1730175018310547, + "step": 18333 + }, + { + "epoch": 4.59, + "grad_norm": 3.382673740386963, + "learning_rate": 1.6745051149376135e-07, + "logits/chosen": -0.5744786262512207, + "logits/rejected": -0.6314092874526978, + "logps/chosen": -47.45214080810547, + "logps/rejected": -111.362548828125, + "loss": 0.5585, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.042253017425537, + "rewards/margins": 7.017329216003418, + "rewards/rejected": -3.9750757217407227, + "step": 18334 + }, + { + "epoch": 4.59, + "grad_norm": 2.522658586502075, + "learning_rate": 1.6724886371115245e-07, + "logits/chosen": -0.5349971652030945, + "logits/rejected": -0.6002474427223206, + "logps/chosen": -57.77278137207031, + "logps/rejected": -104.66006469726562, + "loss": 0.6248, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.493368625640869, + "rewards/margins": 7.9579997062683105, + "rewards/rejected": -4.464630603790283, + "step": 18335 + }, + { + "epoch": 4.59, + "grad_norm": 2.3122761249542236, + "learning_rate": 1.670473353509644e-07, + "logits/chosen": -0.5948570966720581, + "logits/rejected": -0.6813368201255798, + "logps/chosen": -42.22395706176758, + "logps/rejected": -107.84088897705078, + "loss": 0.5265, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1497068405151367, + "rewards/margins": 7.954687118530273, + "rewards/rejected": -4.804980278015137, + "step": 18336 + }, + { + "epoch": 4.59, + "grad_norm": 4.749149322509766, + "learning_rate": 1.668459264181771e-07, + "logits/chosen": -0.5807468295097351, + "logits/rejected": -0.6739574670791626, + "logps/chosen": -45.618019104003906, + "logps/rejected": -89.47998046875, + "loss": 0.56, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7941203117370605, + "rewards/margins": 6.982846260070801, + "rewards/rejected": -4.18872594833374, + "step": 18337 + }, + { + "epoch": 4.59, + "grad_norm": 6.186037063598633, + "learning_rate": 1.6664463691776766e-07, + "logits/chosen": -0.5516589283943176, + "logits/rejected": -0.6060568690299988, + "logps/chosen": -62.400848388671875, + "logps/rejected": -108.09205627441406, + "loss": 0.7132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.052168369293213, + "rewards/margins": 7.096301078796387, + "rewards/rejected": -4.044133186340332, + "step": 18338 + }, + { + "epoch": 4.59, + "grad_norm": 9.470471382141113, + "learning_rate": 1.6644346685471048e-07, + "logits/chosen": -0.5140752196311951, + "logits/rejected": -0.5965782999992371, + "logps/chosen": -43.15473175048828, + "logps/rejected": -101.04373931884766, + "loss": 0.5867, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9147632122039795, + "rewards/margins": 7.653511047363281, + "rewards/rejected": -4.738747596740723, + "step": 18339 + }, + { + "epoch": 4.59, + "grad_norm": 13.974390029907227, + "learning_rate": 1.6624241623397598e-07, + "logits/chosen": -0.5374945402145386, + "logits/rejected": -0.5783838033676147, + "logps/chosen": -53.06818389892578, + "logps/rejected": -108.56120300292969, + "loss": 0.6294, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.165442943572998, + "rewards/margins": 7.049022674560547, + "rewards/rejected": -3.883580207824707, + "step": 18340 + }, + { + "epoch": 4.59, + "grad_norm": 3.3222408294677734, + "learning_rate": 1.6604148506053354e-07, + "logits/chosen": -0.518799901008606, + "logits/rejected": -0.5932404398918152, + "logps/chosen": -71.76150512695312, + "logps/rejected": -116.56061553955078, + "loss": 0.5973, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.485452651977539, + "rewards/margins": 7.304621696472168, + "rewards/rejected": -3.819169044494629, + "step": 18341 + }, + { + "epoch": 4.59, + "grad_norm": 6.7563557624816895, + "learning_rate": 1.6584067333934696e-07, + "logits/chosen": -0.6002722978591919, + "logits/rejected": -0.6787217259407043, + "logps/chosen": -64.3687744140625, + "logps/rejected": -101.43231201171875, + "loss": 0.7728, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.139835834503174, + "rewards/margins": 6.024750709533691, + "rewards/rejected": -2.884915351867676, + "step": 18342 + }, + { + "epoch": 4.59, + "grad_norm": 3.3511977195739746, + "learning_rate": 1.6563998107537948e-07, + "logits/chosen": -0.5632541179656982, + "logits/rejected": -0.5798937082290649, + "logps/chosen": -47.98906326293945, + "logps/rejected": -121.11898803710938, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3601789474487305, + "rewards/margins": 8.515572547912598, + "rewards/rejected": -5.155394077301025, + "step": 18343 + }, + { + "epoch": 4.59, + "grad_norm": 9.011810302734375, + "learning_rate": 1.6543940827358995e-07, + "logits/chosen": -0.6447281241416931, + "logits/rejected": -0.7077572345733643, + "logps/chosen": -61.61662292480469, + "logps/rejected": -119.2876205444336, + "loss": 0.6656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.352905035018921, + "rewards/margins": 8.15651798248291, + "rewards/rejected": -4.803613662719727, + "step": 18344 + }, + { + "epoch": 4.59, + "grad_norm": 34.41158676147461, + "learning_rate": 1.6523895493893494e-07, + "logits/chosen": -0.5684782266616821, + "logits/rejected": -0.6284120082855225, + "logps/chosen": -68.14286041259766, + "logps/rejected": -129.56195068359375, + "loss": 0.6885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.614508867263794, + "rewards/margins": 7.389465808868408, + "rewards/rejected": -4.774957180023193, + "step": 18345 + }, + { + "epoch": 4.59, + "grad_norm": 4.771520614624023, + "learning_rate": 1.650386210763677e-07, + "logits/chosen": -0.5288435816764832, + "logits/rejected": -0.591895341873169, + "logps/chosen": -58.23463439941406, + "logps/rejected": -138.5355987548828, + "loss": 0.5747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.146206855773926, + "rewards/margins": 8.335981369018555, + "rewards/rejected": -5.189774990081787, + "step": 18346 + }, + { + "epoch": 4.59, + "grad_norm": 12.777605056762695, + "learning_rate": 1.6483840669083928e-07, + "logits/chosen": -0.5568268895149231, + "logits/rejected": -0.64015132188797, + "logps/chosen": -48.30127716064453, + "logps/rejected": -98.1148452758789, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1460750102996826, + "rewards/margins": 7.639130592346191, + "rewards/rejected": -4.49305534362793, + "step": 18347 + }, + { + "epoch": 4.59, + "grad_norm": 2.6410534381866455, + "learning_rate": 1.6463831178729628e-07, + "logits/chosen": -0.6253442168235779, + "logits/rejected": -0.6772116422653198, + "logps/chosen": -47.2540397644043, + "logps/rejected": -107.19486236572266, + "loss": 0.5675, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.37129282951355, + "rewards/margins": 7.180805206298828, + "rewards/rejected": -3.8095126152038574, + "step": 18348 + }, + { + "epoch": 4.59, + "grad_norm": 3.2164206504821777, + "learning_rate": 1.6443833637068362e-07, + "logits/chosen": -0.5614952445030212, + "logits/rejected": -0.6399175524711609, + "logps/chosen": -50.77721405029297, + "logps/rejected": -113.92757415771484, + "loss": 0.5769, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3969886302948, + "rewards/margins": 7.851954936981201, + "rewards/rejected": -4.454966068267822, + "step": 18349 + }, + { + "epoch": 4.59, + "grad_norm": 2.859943389892578, + "learning_rate": 1.6423848044594404e-07, + "logits/chosen": -0.5659804344177246, + "logits/rejected": -0.6592676639556885, + "logps/chosen": -47.48808288574219, + "logps/rejected": -117.19969177246094, + "loss": 0.5699, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.267780065536499, + "rewards/margins": 8.65530776977539, + "rewards/rejected": -5.3875274658203125, + "step": 18350 + }, + { + "epoch": 4.59, + "grad_norm": 3.9292171001434326, + "learning_rate": 1.6403874401801412e-07, + "logits/chosen": -0.6463622450828552, + "logits/rejected": -0.7491077184677124, + "logps/chosen": -60.90899658203125, + "logps/rejected": -98.30863952636719, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4906673431396484, + "rewards/margins": 7.714834213256836, + "rewards/rejected": -4.2241668701171875, + "step": 18351 + }, + { + "epoch": 4.59, + "grad_norm": 3.471816301345825, + "learning_rate": 1.6383912709183047e-07, + "logits/chosen": -0.5680397152900696, + "logits/rejected": -0.6404259204864502, + "logps/chosen": -55.01652145385742, + "logps/rejected": -102.26359558105469, + "loss": 0.6045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.213083028793335, + "rewards/margins": 7.449036598205566, + "rewards/rejected": -4.235952854156494, + "step": 18352 + }, + { + "epoch": 4.59, + "grad_norm": 3.2116200923919678, + "learning_rate": 1.6363962967232637e-07, + "logits/chosen": -0.5899339914321899, + "logits/rejected": -0.6545653939247131, + "logps/chosen": -47.35832214355469, + "logps/rejected": -94.00856018066406, + "loss": 0.6036, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.296504497528076, + "rewards/margins": 6.509974002838135, + "rewards/rejected": -3.213469982147217, + "step": 18353 + }, + { + "epoch": 4.59, + "grad_norm": 3.947503089904785, + "learning_rate": 1.634402517644318e-07, + "logits/chosen": -0.5153996348381042, + "logits/rejected": -0.6280255317687988, + "logps/chosen": -65.7134017944336, + "logps/rejected": -106.18976593017578, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.707517147064209, + "rewards/margins": 7.1637444496154785, + "rewards/rejected": -4.456227779388428, + "step": 18354 + }, + { + "epoch": 4.59, + "grad_norm": 5.942803382873535, + "learning_rate": 1.632409933730722e-07, + "logits/chosen": -0.6106739640235901, + "logits/rejected": -0.677023708820343, + "logps/chosen": -52.80715560913086, + "logps/rejected": -104.72526550292969, + "loss": 0.6233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.232452154159546, + "rewards/margins": 6.514927387237549, + "rewards/rejected": -3.282475709915161, + "step": 18355 + }, + { + "epoch": 4.59, + "grad_norm": 6.488150596618652, + "learning_rate": 1.6304185450317255e-07, + "logits/chosen": -0.5019428133964539, + "logits/rejected": -0.6174518465995789, + "logps/chosen": -65.26455688476562, + "logps/rejected": -118.13117980957031, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1070194244384766, + "rewards/margins": 8.269020080566406, + "rewards/rejected": -5.162001609802246, + "step": 18356 + }, + { + "epoch": 4.59, + "grad_norm": 7.803522109985352, + "learning_rate": 1.6284283515965338e-07, + "logits/chosen": -0.5078033804893494, + "logits/rejected": -0.5443999767303467, + "logps/chosen": -59.009315490722656, + "logps/rejected": -133.49208068847656, + "loss": 0.6128, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2501888275146484, + "rewards/margins": 8.444287300109863, + "rewards/rejected": -5.194098472595215, + "step": 18357 + }, + { + "epoch": 4.59, + "grad_norm": 9.7190523147583, + "learning_rate": 1.626439353474324e-07, + "logits/chosen": -0.56667160987854, + "logits/rejected": -0.5970336198806763, + "logps/chosen": -56.565860748291016, + "logps/rejected": -107.39299011230469, + "loss": 0.6639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9852168560028076, + "rewards/margins": 6.107196807861328, + "rewards/rejected": -3.1219799518585205, + "step": 18358 + }, + { + "epoch": 4.59, + "grad_norm": 4.778152942657471, + "learning_rate": 1.6244515507142566e-07, + "logits/chosen": -0.6697033643722534, + "logits/rejected": -0.7308290600776672, + "logps/chosen": -47.95680236816406, + "logps/rejected": -110.02953338623047, + "loss": 0.5892, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182213306427002, + "rewards/margins": 7.25918436050415, + "rewards/rejected": -4.076971054077148, + "step": 18359 + }, + { + "epoch": 4.59, + "grad_norm": 4.305064678192139, + "learning_rate": 1.6224649433654427e-07, + "logits/chosen": -0.6049084663391113, + "logits/rejected": -0.6700533032417297, + "logps/chosen": -55.86728286743164, + "logps/rejected": -118.56241607666016, + "loss": 0.6427, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.131143569946289, + "rewards/margins": 7.607669353485107, + "rewards/rejected": -4.476526260375977, + "step": 18360 + }, + { + "epoch": 4.59, + "grad_norm": 7.514374256134033, + "learning_rate": 1.6204795314769762e-07, + "logits/chosen": -0.5619250535964966, + "logits/rejected": -0.6287571787834167, + "logps/chosen": -65.03083038330078, + "logps/rejected": -104.62391662597656, + "loss": 0.6386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0277342796325684, + "rewards/margins": 6.4727373123168945, + "rewards/rejected": -3.445002555847168, + "step": 18361 + }, + { + "epoch": 4.59, + "grad_norm": 4.973299026489258, + "learning_rate": 1.6184953150979231e-07, + "logits/chosen": -0.5156397819519043, + "logits/rejected": -0.6087278127670288, + "logps/chosen": -55.81153106689453, + "logps/rejected": -110.84330749511719, + "loss": 0.6176, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2715179920196533, + "rewards/margins": 6.626644611358643, + "rewards/rejected": -3.35512638092041, + "step": 18362 + }, + { + "epoch": 4.59, + "grad_norm": 9.347975730895996, + "learning_rate": 1.616512294277306e-07, + "logits/chosen": -0.589710533618927, + "logits/rejected": -0.6675199270248413, + "logps/chosen": -61.64271926879883, + "logps/rejected": -105.56317901611328, + "loss": 0.6979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.261701822280884, + "rewards/margins": 7.486729621887207, + "rewards/rejected": -4.225027084350586, + "step": 18363 + }, + { + "epoch": 4.59, + "grad_norm": 8.098893165588379, + "learning_rate": 1.6145304690641405e-07, + "logits/chosen": -0.4851233661174774, + "logits/rejected": -0.6043047904968262, + "logps/chosen": -62.02665328979492, + "logps/rejected": -88.63944244384766, + "loss": 0.6719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0719656944274902, + "rewards/margins": 6.297312259674072, + "rewards/rejected": -3.225346565246582, + "step": 18364 + }, + { + "epoch": 4.59, + "grad_norm": 4.177730560302734, + "learning_rate": 1.6125498395073825e-07, + "logits/chosen": -0.5739321708679199, + "logits/rejected": -0.6485791206359863, + "logps/chosen": -54.135292053222656, + "logps/rejected": -102.91897583007812, + "loss": 0.5945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2299249172210693, + "rewards/margins": 6.828505516052246, + "rewards/rejected": -3.5985805988311768, + "step": 18365 + }, + { + "epoch": 4.59, + "grad_norm": 5.345685005187988, + "learning_rate": 1.6105704056559923e-07, + "logits/chosen": -0.550670325756073, + "logits/rejected": -0.6360726356506348, + "logps/chosen": -50.27020263671875, + "logps/rejected": -101.31842041015625, + "loss": 0.6333, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8239948749542236, + "rewards/margins": 6.843524932861328, + "rewards/rejected": -4.019529819488525, + "step": 18366 + }, + { + "epoch": 4.59, + "grad_norm": 4.205094814300537, + "learning_rate": 1.6085921675588756e-07, + "logits/chosen": -0.5658957362174988, + "logits/rejected": -0.5997499227523804, + "logps/chosen": -51.034080505371094, + "logps/rejected": -129.80406188964844, + "loss": 0.5374, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.936046600341797, + "rewards/margins": 7.757678031921387, + "rewards/rejected": -4.82163143157959, + "step": 18367 + }, + { + "epoch": 4.59, + "grad_norm": 4.070720672607422, + "learning_rate": 1.6066151252649153e-07, + "logits/chosen": -0.4701536297798157, + "logits/rejected": -0.5523837804794312, + "logps/chosen": -47.32649612426758, + "logps/rejected": -125.25430297851562, + "loss": 0.5234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3209471702575684, + "rewards/margins": 7.788734436035156, + "rewards/rejected": -4.467787265777588, + "step": 18368 + }, + { + "epoch": 4.6, + "grad_norm": 4.379093170166016, + "learning_rate": 1.6046392788229726e-07, + "logits/chosen": -0.5602316856384277, + "logits/rejected": -0.6477196216583252, + "logps/chosen": -51.89409637451172, + "logps/rejected": -100.00352478027344, + "loss": 0.6388, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.403775215148926, + "rewards/margins": 7.323359489440918, + "rewards/rejected": -3.919583559036255, + "step": 18369 + }, + { + "epoch": 4.6, + "grad_norm": 2.709426164627075, + "learning_rate": 1.6026646282818635e-07, + "logits/chosen": -0.6179175972938538, + "logits/rejected": -0.6786060929298401, + "logps/chosen": -64.58946228027344, + "logps/rejected": -122.9915771484375, + "loss": 0.616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.014444589614868, + "rewards/margins": 7.315454483032227, + "rewards/rejected": -4.301010608673096, + "step": 18370 + }, + { + "epoch": 4.6, + "grad_norm": 9.219728469848633, + "learning_rate": 1.600691173690394e-07, + "logits/chosen": -0.5888760685920715, + "logits/rejected": -0.6526162624359131, + "logps/chosen": -59.39927291870117, + "logps/rejected": -105.93721008300781, + "loss": 0.6214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0364279747009277, + "rewards/margins": 6.534479141235352, + "rewards/rejected": -3.498051166534424, + "step": 18371 + }, + { + "epoch": 4.6, + "grad_norm": 2.2890212535858154, + "learning_rate": 1.5987189150973304e-07, + "logits/chosen": -0.5929045677185059, + "logits/rejected": -0.6159530878067017, + "logps/chosen": -50.366451263427734, + "logps/rejected": -118.82928466796875, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1431784629821777, + "rewards/margins": 7.2258758544921875, + "rewards/rejected": -4.082697868347168, + "step": 18372 + }, + { + "epoch": 4.6, + "grad_norm": 2.2030084133148193, + "learning_rate": 1.5967478525513947e-07, + "logits/chosen": -0.5737806558609009, + "logits/rejected": -0.659618616104126, + "logps/chosen": -45.245601654052734, + "logps/rejected": -99.31109619140625, + "loss": 0.5199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.263411521911621, + "rewards/margins": 7.2642998695373535, + "rewards/rejected": -4.000888347625732, + "step": 18373 + }, + { + "epoch": 4.6, + "grad_norm": 4.058398246765137, + "learning_rate": 1.594777986101309e-07, + "logits/chosen": -0.5485174655914307, + "logits/rejected": -0.6028578281402588, + "logps/chosen": -50.13145065307617, + "logps/rejected": -109.83108520507812, + "loss": 0.5726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.241715669631958, + "rewards/margins": 7.405372142791748, + "rewards/rejected": -4.163656711578369, + "step": 18374 + }, + { + "epoch": 4.6, + "grad_norm": 36.483741760253906, + "learning_rate": 1.5928093157957403e-07, + "logits/chosen": -0.49996548891067505, + "logits/rejected": -0.5687800645828247, + "logps/chosen": -52.991363525390625, + "logps/rejected": -121.4917984008789, + "loss": 0.637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0990467071533203, + "rewards/margins": 7.799417972564697, + "rewards/rejected": -4.700371742248535, + "step": 18375 + }, + { + "epoch": 4.6, + "grad_norm": 4.33172082901001, + "learning_rate": 1.5908418416833438e-07, + "logits/chosen": -0.6345140933990479, + "logits/rejected": -0.7174748182296753, + "logps/chosen": -47.57136154174805, + "logps/rejected": -120.1829833984375, + "loss": 0.5464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1276869773864746, + "rewards/margins": 8.663918495178223, + "rewards/rejected": -5.536231994628906, + "step": 18376 + }, + { + "epoch": 4.6, + "grad_norm": 3.9677155017852783, + "learning_rate": 1.5888755638127308e-07, + "logits/chosen": -0.5338119268417358, + "logits/rejected": -0.606801450252533, + "logps/chosen": -51.506710052490234, + "logps/rejected": -124.1514892578125, + "loss": 0.5429, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.258030652999878, + "rewards/margins": 8.641897201538086, + "rewards/rejected": -5.383867263793945, + "step": 18377 + }, + { + "epoch": 4.6, + "grad_norm": 3.471632242202759, + "learning_rate": 1.5869104822325066e-07, + "logits/chosen": -0.5306119918823242, + "logits/rejected": -0.6400014758110046, + "logps/chosen": -57.53569793701172, + "logps/rejected": -114.25084686279297, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1267826557159424, + "rewards/margins": 7.928731918334961, + "rewards/rejected": -4.801949501037598, + "step": 18378 + }, + { + "epoch": 4.6, + "grad_norm": 3.8075294494628906, + "learning_rate": 1.5849465969912103e-07, + "logits/chosen": -0.5162703394889832, + "logits/rejected": -0.6016703248023987, + "logps/chosen": -54.52605438232422, + "logps/rejected": -118.13278198242188, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.136415481567383, + "rewards/margins": 8.191659927368164, + "rewards/rejected": -5.055244445800781, + "step": 18379 + }, + { + "epoch": 4.6, + "grad_norm": 5.735969543457031, + "learning_rate": 1.582983908137381e-07, + "logits/chosen": -0.49519914388656616, + "logits/rejected": -0.6095986366271973, + "logps/chosen": -66.0684814453125, + "logps/rejected": -115.3623275756836, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.854538917541504, + "rewards/margins": 7.319914817810059, + "rewards/rejected": -4.465376377105713, + "step": 18380 + }, + { + "epoch": 4.6, + "grad_norm": 3.857158660888672, + "learning_rate": 1.5810224157195243e-07, + "logits/chosen": -0.4852011203765869, + "logits/rejected": -0.5997063517570496, + "logps/chosen": -52.23537826538086, + "logps/rejected": -102.51661682128906, + "loss": 0.5732, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.316013813018799, + "rewards/margins": 7.081930160522461, + "rewards/rejected": -3.765916585922241, + "step": 18381 + }, + { + "epoch": 4.6, + "grad_norm": 4.122426509857178, + "learning_rate": 1.579062119786101e-07, + "logits/chosen": -0.5515648126602173, + "logits/rejected": -0.6327154636383057, + "logps/chosen": -66.26061248779297, + "logps/rejected": -107.495849609375, + "loss": 0.6008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.08046555519104, + "rewards/margins": 7.5615339279174805, + "rewards/rejected": -4.481068134307861, + "step": 18382 + }, + { + "epoch": 4.6, + "grad_norm": 3.999945640563965, + "learning_rate": 1.577103020385551e-07, + "logits/chosen": -0.5251998901367188, + "logits/rejected": -0.5797613263130188, + "logps/chosen": -47.62398147583008, + "logps/rejected": -116.11433410644531, + "loss": 0.6231, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9617955684661865, + "rewards/margins": 7.635295867919922, + "rewards/rejected": -4.6735005378723145, + "step": 18383 + }, + { + "epoch": 4.6, + "grad_norm": 4.137177467346191, + "learning_rate": 1.5751451175662956e-07, + "logits/chosen": -0.5380563139915466, + "logits/rejected": -0.5849607586860657, + "logps/chosen": -57.44840621948242, + "logps/rejected": -130.14813232421875, + "loss": 0.6545, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.996105194091797, + "rewards/margins": 8.113236427307129, + "rewards/rejected": -5.117130279541016, + "step": 18384 + }, + { + "epoch": 4.6, + "grad_norm": 2.3101577758789062, + "learning_rate": 1.5731884113767026e-07, + "logits/chosen": -0.5857272744178772, + "logits/rejected": -0.6609732508659363, + "logps/chosen": -44.95929718017578, + "logps/rejected": -107.2564697265625, + "loss": 0.5214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.45864200592041, + "rewards/margins": 7.945036888122559, + "rewards/rejected": -4.486394882202148, + "step": 18385 + }, + { + "epoch": 4.6, + "grad_norm": 5.987118721008301, + "learning_rate": 1.571232901865144e-07, + "logits/chosen": -0.5177907347679138, + "logits/rejected": -0.5701338648796082, + "logps/chosen": -49.87086486816406, + "logps/rejected": -101.04708099365234, + "loss": 0.6327, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2783727645874023, + "rewards/margins": 6.359596252441406, + "rewards/rejected": -3.0812230110168457, + "step": 18386 + }, + { + "epoch": 4.6, + "grad_norm": 7.223101615905762, + "learning_rate": 1.5692785890799256e-07, + "logits/chosen": -0.5424647927284241, + "logits/rejected": -0.5981733798980713, + "logps/chosen": -61.549537658691406, + "logps/rejected": -108.02654266357422, + "loss": 0.603, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.952305793762207, + "rewards/margins": 5.545375347137451, + "rewards/rejected": -2.593069314956665, + "step": 18387 + }, + { + "epoch": 4.6, + "grad_norm": 3.3225271701812744, + "learning_rate": 1.5673254730693478e-07, + "logits/chosen": -0.5519207715988159, + "logits/rejected": -0.6131116151809692, + "logps/chosen": -57.50189971923828, + "logps/rejected": -118.27252960205078, + "loss": 0.5786, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2412219047546387, + "rewards/margins": 8.336861610412598, + "rewards/rejected": -5.095639228820801, + "step": 18388 + }, + { + "epoch": 4.6, + "grad_norm": 4.188390254974365, + "learning_rate": 1.5653735538816718e-07, + "logits/chosen": -0.5415894985198975, + "logits/rejected": -0.5911865830421448, + "logps/chosen": -53.330596923828125, + "logps/rejected": -117.96197509765625, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2689497470855713, + "rewards/margins": 7.944669246673584, + "rewards/rejected": -4.675719738006592, + "step": 18389 + }, + { + "epoch": 4.6, + "grad_norm": 9.301969528198242, + "learning_rate": 1.5634228315651367e-07, + "logits/chosen": -0.5204926133155823, + "logits/rejected": -0.5699896812438965, + "logps/chosen": -54.80324172973633, + "logps/rejected": -110.94760131835938, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2147014141082764, + "rewards/margins": 6.58970832824707, + "rewards/rejected": -3.375007390975952, + "step": 18390 + }, + { + "epoch": 4.6, + "grad_norm": 6.668707370758057, + "learning_rate": 1.5614733061679376e-07, + "logits/chosen": -0.5054219961166382, + "logits/rejected": -0.562753438949585, + "logps/chosen": -49.18232345581055, + "logps/rejected": -95.0791244506836, + "loss": 0.6249, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.139540195465088, + "rewards/margins": 6.78817892074585, + "rewards/rejected": -3.6486387252807617, + "step": 18391 + }, + { + "epoch": 4.6, + "grad_norm": 6.005425453186035, + "learning_rate": 1.5595249777382526e-07, + "logits/chosen": -0.5913103818893433, + "logits/rejected": -0.6402237415313721, + "logps/chosen": -63.30712890625, + "logps/rejected": -96.53741455078125, + "loss": 0.6969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2803292274475098, + "rewards/margins": 6.095625877380371, + "rewards/rejected": -2.8152966499328613, + "step": 18392 + }, + { + "epoch": 4.6, + "grad_norm": 17.72385025024414, + "learning_rate": 1.557577846324243e-07, + "logits/chosen": -0.5605296492576599, + "logits/rejected": -0.6483502984046936, + "logps/chosen": -70.7281265258789, + "logps/rejected": -102.60018920898438, + "loss": 0.7642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8706326484680176, + "rewards/margins": 7.840276718139648, + "rewards/rejected": -4.969644069671631, + "step": 18393 + }, + { + "epoch": 4.6, + "grad_norm": 5.0426201820373535, + "learning_rate": 1.5556319119739982e-07, + "logits/chosen": -0.5388545393943787, + "logits/rejected": -0.6112250685691833, + "logps/chosen": -58.35356903076172, + "logps/rejected": -113.90767669677734, + "loss": 0.6203, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.093763589859009, + "rewards/margins": 7.194901466369629, + "rewards/rejected": -4.101138114929199, + "step": 18394 + }, + { + "epoch": 4.6, + "grad_norm": 4.692385196685791, + "learning_rate": 1.553687174735613e-07, + "logits/chosen": -0.641924262046814, + "logits/rejected": -0.7306580543518066, + "logps/chosen": -50.151432037353516, + "logps/rejected": -108.88447570800781, + "loss": 0.5423, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0894033908843994, + "rewards/margins": 8.164999961853027, + "rewards/rejected": -5.075596332550049, + "step": 18395 + }, + { + "epoch": 4.6, + "grad_norm": 34.72515106201172, + "learning_rate": 1.55174363465716e-07, + "logits/chosen": -0.5776283740997314, + "logits/rejected": -0.6250219345092773, + "logps/chosen": -46.462364196777344, + "logps/rejected": -106.10363006591797, + "loss": 0.705, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.097047805786133, + "rewards/margins": 6.321316719055176, + "rewards/rejected": -3.224269390106201, + "step": 18396 + }, + { + "epoch": 4.6, + "grad_norm": 7.192630767822266, + "learning_rate": 1.5498012917866457e-07, + "logits/chosen": -0.5725401043891907, + "logits/rejected": -0.6338980197906494, + "logps/chosen": -55.649803161621094, + "logps/rejected": -111.61849975585938, + "loss": 0.6527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0448031425476074, + "rewards/margins": 6.846460342407227, + "rewards/rejected": -3.801657199859619, + "step": 18397 + }, + { + "epoch": 4.6, + "grad_norm": 7.284085750579834, + "learning_rate": 1.5478601461720754e-07, + "logits/chosen": -0.5072210431098938, + "logits/rejected": -0.5351575016975403, + "logps/chosen": -52.90044403076172, + "logps/rejected": -108.68736267089844, + "loss": 0.6849, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0167393684387207, + "rewards/margins": 5.82979679107666, + "rewards/rejected": -2.8130578994750977, + "step": 18398 + }, + { + "epoch": 4.6, + "grad_norm": 5.227624893188477, + "learning_rate": 1.5459201978614168e-07, + "logits/chosen": -0.5044181942939758, + "logits/rejected": -0.6236153244972229, + "logps/chosen": -53.16854476928711, + "logps/rejected": -112.65043640136719, + "loss": 0.5497, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.713564157485962, + "rewards/margins": 7.493810653686523, + "rewards/rejected": -4.780246734619141, + "step": 18399 + }, + { + "epoch": 4.6, + "grad_norm": 5.699762344360352, + "learning_rate": 1.543981446902615e-07, + "logits/chosen": -0.6297663450241089, + "logits/rejected": -0.703960657119751, + "logps/chosen": -55.62702178955078, + "logps/rejected": -119.50772857666016, + "loss": 0.6921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1754703521728516, + "rewards/margins": 7.659486770629883, + "rewards/rejected": -4.484016418457031, + "step": 18400 + }, + { + "epoch": 4.6, + "grad_norm": 3.6786534786224365, + "learning_rate": 1.5420438933435644e-07, + "logits/chosen": -0.5484986305236816, + "logits/rejected": -0.6170258522033691, + "logps/chosen": -48.54906463623047, + "logps/rejected": -100.47225952148438, + "loss": 0.5911, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1794962882995605, + "rewards/margins": 6.577152729034424, + "rewards/rejected": -3.3976569175720215, + "step": 18401 + }, + { + "epoch": 4.6, + "grad_norm": 5.484551906585693, + "learning_rate": 1.540107537232155e-07, + "logits/chosen": -0.5563691258430481, + "logits/rejected": -0.6460907459259033, + "logps/chosen": -67.64336395263672, + "logps/rejected": -113.50453186035156, + "loss": 0.7798, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1906793117523193, + "rewards/margins": 7.144786834716797, + "rewards/rejected": -3.9541072845458984, + "step": 18402 + }, + { + "epoch": 4.6, + "grad_norm": 4.478363513946533, + "learning_rate": 1.538172378616226e-07, + "logits/chosen": -0.5054280161857605, + "logits/rejected": -0.6007931232452393, + "logps/chosen": -56.242855072021484, + "logps/rejected": -107.88880157470703, + "loss": 0.5343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1427245140075684, + "rewards/margins": 7.683038711547852, + "rewards/rejected": -4.540313720703125, + "step": 18403 + }, + { + "epoch": 4.6, + "grad_norm": 3.5190412998199463, + "learning_rate": 1.5362384175436117e-07, + "logits/chosen": -0.6375768780708313, + "logits/rejected": -0.7340648174285889, + "logps/chosen": -39.36366653442383, + "logps/rejected": -110.33799743652344, + "loss": 0.5065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.055968999862671, + "rewards/margins": 7.53034782409668, + "rewards/rejected": -4.474379062652588, + "step": 18404 + }, + { + "epoch": 4.6, + "grad_norm": 3.087003231048584, + "learning_rate": 1.534305654062096e-07, + "logits/chosen": -0.49029740691185, + "logits/rejected": -0.5597741603851318, + "logps/chosen": -71.94316101074219, + "logps/rejected": -125.96578216552734, + "loss": 0.6288, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.005125045776367, + "rewards/margins": 8.727066993713379, + "rewards/rejected": -5.721940994262695, + "step": 18405 + }, + { + "epoch": 4.6, + "grad_norm": 4.843205451965332, + "learning_rate": 1.5323740882194294e-07, + "logits/chosen": -0.6624647378921509, + "logits/rejected": -0.7006187438964844, + "logps/chosen": -53.013057708740234, + "logps/rejected": -109.705322265625, + "loss": 0.6981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8528411388397217, + "rewards/margins": 6.39539909362793, + "rewards/rejected": -3.54255747795105, + "step": 18406 + }, + { + "epoch": 4.6, + "grad_norm": 4.038904190063477, + "learning_rate": 1.530443720063357e-07, + "logits/chosen": -0.555020809173584, + "logits/rejected": -0.6674713492393494, + "logps/chosen": -60.35786056518555, + "logps/rejected": -106.52255249023438, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1178321838378906, + "rewards/margins": 8.367708206176758, + "rewards/rejected": -5.249875545501709, + "step": 18407 + }, + { + "epoch": 4.6, + "grad_norm": 2.6859500408172607, + "learning_rate": 1.5285145496415744e-07, + "logits/chosen": -0.526852011680603, + "logits/rejected": -0.6349254250526428, + "logps/chosen": -58.94453430175781, + "logps/rejected": -104.97532653808594, + "loss": 0.5742, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0035808086395264, + "rewards/margins": 7.838865280151367, + "rewards/rejected": -4.835284233093262, + "step": 18408 + }, + { + "epoch": 4.61, + "grad_norm": 3.312157154083252, + "learning_rate": 1.526586577001754e-07, + "logits/chosen": -0.4826444983482361, + "logits/rejected": -0.6134175658226013, + "logps/chosen": -63.087379455566406, + "logps/rejected": -108.70360565185547, + "loss": 0.627, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2120704650878906, + "rewards/margins": 8.308820724487305, + "rewards/rejected": -5.0967512130737305, + "step": 18409 + }, + { + "epoch": 4.61, + "grad_norm": 3.7498939037323, + "learning_rate": 1.5246598021915304e-07, + "logits/chosen": -0.5226719379425049, + "logits/rejected": -0.6766163110733032, + "logps/chosen": -63.51594161987305, + "logps/rejected": -113.9137954711914, + "loss": 0.655, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.174678087234497, + "rewards/margins": 9.478534698486328, + "rewards/rejected": -6.30385684967041, + "step": 18410 + }, + { + "epoch": 4.61, + "grad_norm": 6.7048821449279785, + "learning_rate": 1.522734225258532e-07, + "logits/chosen": -0.5556500554084778, + "logits/rejected": -0.646266758441925, + "logps/chosen": -53.11495590209961, + "logps/rejected": -105.65853881835938, + "loss": 0.5949, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.308927059173584, + "rewards/margins": 8.364060401916504, + "rewards/rejected": -5.055132865905762, + "step": 18411 + }, + { + "epoch": 4.61, + "grad_norm": 7.251153469085693, + "learning_rate": 1.5208098462503373e-07, + "logits/chosen": -0.5802796483039856, + "logits/rejected": -0.6286846995353699, + "logps/chosen": -66.41795349121094, + "logps/rejected": -118.43672180175781, + "loss": 0.6739, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.931225061416626, + "rewards/margins": 6.739083766937256, + "rewards/rejected": -3.807858943939209, + "step": 18412 + }, + { + "epoch": 4.61, + "grad_norm": 8.334836959838867, + "learning_rate": 1.5188866652144862e-07, + "logits/chosen": -0.5579305291175842, + "logits/rejected": -0.6345192193984985, + "logps/chosen": -59.51506423950195, + "logps/rejected": -95.67658996582031, + "loss": 0.6849, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2171053886413574, + "rewards/margins": 6.532287120819092, + "rewards/rejected": -3.3151817321777344, + "step": 18413 + }, + { + "epoch": 4.61, + "grad_norm": 6.684495449066162, + "learning_rate": 1.5169646821985184e-07, + "logits/chosen": -0.6173967123031616, + "logits/rejected": -0.66121906042099, + "logps/chosen": -44.55934143066406, + "logps/rejected": -93.25188446044922, + "loss": 0.6247, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.012053966522217, + "rewards/margins": 6.475632667541504, + "rewards/rejected": -3.4635794162750244, + "step": 18414 + }, + { + "epoch": 4.61, + "grad_norm": 4.669053554534912, + "learning_rate": 1.5150438972499237e-07, + "logits/chosen": -0.573509156703949, + "logits/rejected": -0.680708110332489, + "logps/chosen": -65.12986755371094, + "logps/rejected": -88.66836547851562, + "loss": 0.6955, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0293850898742676, + "rewards/margins": 6.371539115905762, + "rewards/rejected": -3.342153549194336, + "step": 18415 + }, + { + "epoch": 4.61, + "grad_norm": 19.996660232543945, + "learning_rate": 1.5131243104161587e-07, + "logits/chosen": -0.5899180173873901, + "logits/rejected": -0.6338863372802734, + "logps/chosen": -52.61077880859375, + "logps/rejected": -96.50630950927734, + "loss": 0.6808, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1152400970458984, + "rewards/margins": 5.507362365722656, + "rewards/rejected": -2.3921217918395996, + "step": 18416 + }, + { + "epoch": 4.61, + "grad_norm": 29.93700408935547, + "learning_rate": 1.5112059217446684e-07, + "logits/chosen": -0.434488981962204, + "logits/rejected": -0.5428537130355835, + "logps/chosen": -75.44735717773438, + "logps/rejected": -128.66477966308594, + "loss": 0.6702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.696913719177246, + "rewards/margins": 6.105489730834961, + "rewards/rejected": -3.408576488494873, + "step": 18417 + }, + { + "epoch": 4.61, + "grad_norm": 16.221643447875977, + "learning_rate": 1.5092887312828598e-07, + "logits/chosen": -0.6053592562675476, + "logits/rejected": -0.6696712374687195, + "logps/chosen": -62.62508773803711, + "logps/rejected": -109.67755889892578, + "loss": 0.6722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115138530731201, + "rewards/margins": 6.495677471160889, + "rewards/rejected": -3.3805389404296875, + "step": 18418 + }, + { + "epoch": 4.61, + "grad_norm": 3.8139572143554688, + "learning_rate": 1.5073727390780946e-07, + "logits/chosen": -0.5905633568763733, + "logits/rejected": -0.7134943604469299, + "logps/chosen": -59.295745849609375, + "logps/rejected": -103.29266357421875, + "loss": 0.6096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948176145553589, + "rewards/margins": 7.768021106719971, + "rewards/rejected": -4.819844722747803, + "step": 18419 + }, + { + "epoch": 4.61, + "grad_norm": 4.917858123779297, + "learning_rate": 1.5054579451777295e-07, + "logits/chosen": -0.4935407340526581, + "logits/rejected": -0.5475460290908813, + "logps/chosen": -52.547306060791016, + "logps/rejected": -109.50242614746094, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.240950584411621, + "rewards/margins": 7.011523723602295, + "rewards/rejected": -3.770573139190674, + "step": 18420 + }, + { + "epoch": 4.61, + "grad_norm": 4.245321273803711, + "learning_rate": 1.503544349629088e-07, + "logits/chosen": -0.6234337091445923, + "logits/rejected": -0.6923102140426636, + "logps/chosen": -46.647552490234375, + "logps/rejected": -117.79569244384766, + "loss": 0.6362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2049055099487305, + "rewards/margins": 8.331564903259277, + "rewards/rejected": -5.126658916473389, + "step": 18421 + }, + { + "epoch": 4.61, + "grad_norm": 3.3943800926208496, + "learning_rate": 1.5016319524794432e-07, + "logits/chosen": -0.5260967016220093, + "logits/rejected": -0.5428200960159302, + "logps/chosen": -54.1109619140625, + "logps/rejected": -103.76773834228516, + "loss": 0.5652, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3480653762817383, + "rewards/margins": 6.8495707511901855, + "rewards/rejected": -3.501505136489868, + "step": 18422 + }, + { + "epoch": 4.61, + "grad_norm": 5.678082466125488, + "learning_rate": 1.4997207537760517e-07, + "logits/chosen": -0.5455195307731628, + "logits/rejected": -0.6227821111679077, + "logps/chosen": -66.42911529541016, + "logps/rejected": -95.55156707763672, + "loss": 0.6981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.930971145629883, + "rewards/margins": 5.905210018157959, + "rewards/rejected": -2.974238872528076, + "step": 18423 + }, + { + "epoch": 4.61, + "grad_norm": 3.384401798248291, + "learning_rate": 1.497810753566159e-07, + "logits/chosen": -0.6547173261642456, + "logits/rejected": -0.7472962141036987, + "logps/chosen": -62.6823616027832, + "logps/rejected": -90.81800079345703, + "loss": 0.712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3544065952301025, + "rewards/margins": 6.601502895355225, + "rewards/rejected": -3.247096061706543, + "step": 18424 + }, + { + "epoch": 4.61, + "grad_norm": 4.230917930603027, + "learning_rate": 1.4959019518969443e-07, + "logits/chosen": -0.6338738799095154, + "logits/rejected": -0.7203788757324219, + "logps/chosen": -45.529266357421875, + "logps/rejected": -97.0833740234375, + "loss": 0.5414, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.290203094482422, + "rewards/margins": 7.495454788208008, + "rewards/rejected": -4.205251693725586, + "step": 18425 + }, + { + "epoch": 4.61, + "grad_norm": 2.7982592582702637, + "learning_rate": 1.493994348815586e-07, + "logits/chosen": -0.5052458047866821, + "logits/rejected": -0.573235809803009, + "logps/chosen": -47.086517333984375, + "logps/rejected": -115.13928985595703, + "loss": 0.521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0271921157836914, + "rewards/margins": 7.4964799880981445, + "rewards/rejected": -4.469287872314453, + "step": 18426 + }, + { + "epoch": 4.61, + "grad_norm": 4.521262168884277, + "learning_rate": 1.4920879443692192e-07, + "logits/chosen": -0.4898727536201477, + "logits/rejected": -0.5499275326728821, + "logps/chosen": -69.06068420410156, + "logps/rejected": -107.8951187133789, + "loss": 0.6646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1066386699676514, + "rewards/margins": 6.294734954833984, + "rewards/rejected": -3.188096046447754, + "step": 18427 + }, + { + "epoch": 4.61, + "grad_norm": 9.132817268371582, + "learning_rate": 1.490182738604956e-07, + "logits/chosen": -0.5653152465820312, + "logits/rejected": -0.5950631499290466, + "logps/chosen": -61.1999626159668, + "logps/rejected": -120.07472229003906, + "loss": 0.7451, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.229322671890259, + "rewards/margins": 7.162500381469727, + "rewards/rejected": -3.9331777095794678, + "step": 18428 + }, + { + "epoch": 4.61, + "grad_norm": 2.8828392028808594, + "learning_rate": 1.4882787315698755e-07, + "logits/chosen": -0.5848536491394043, + "logits/rejected": -0.7063989043235779, + "logps/chosen": -49.939002990722656, + "logps/rejected": -102.39623260498047, + "loss": 0.5403, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2352044582366943, + "rewards/margins": 7.885614395141602, + "rewards/rejected": -4.650410175323486, + "step": 18429 + }, + { + "epoch": 4.61, + "grad_norm": 3.1315371990203857, + "learning_rate": 1.4863759233110286e-07, + "logits/chosen": -0.6348691582679749, + "logits/rejected": -0.7367739677429199, + "logps/chosen": -43.58523941040039, + "logps/rejected": -111.7599105834961, + "loss": 0.5919, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1171889305114746, + "rewards/margins": 8.406135559082031, + "rewards/rejected": -5.288948059082031, + "step": 18430 + }, + { + "epoch": 4.61, + "grad_norm": 8.396078109741211, + "learning_rate": 1.484474313875428e-07, + "logits/chosen": -0.5359154343605042, + "logits/rejected": -0.6327946782112122, + "logps/chosen": -55.267940521240234, + "logps/rejected": -115.86920166015625, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.111314535140991, + "rewards/margins": 7.818077087402344, + "rewards/rejected": -4.706762313842773, + "step": 18431 + }, + { + "epoch": 4.61, + "grad_norm": 4.0047783851623535, + "learning_rate": 1.482573903310075e-07, + "logits/chosen": -0.5835940837860107, + "logits/rejected": -0.7100105285644531, + "logps/chosen": -39.203636169433594, + "logps/rejected": -84.04862976074219, + "loss": 0.4828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.228703260421753, + "rewards/margins": 7.896188259124756, + "rewards/rejected": -4.667485237121582, + "step": 18432 + }, + { + "epoch": 4.61, + "grad_norm": 4.131725788116455, + "learning_rate": 1.4806746916619263e-07, + "logits/chosen": -0.5530660152435303, + "logits/rejected": -0.6933861970901489, + "logps/chosen": -55.345951080322266, + "logps/rejected": -100.54430389404297, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8141720294952393, + "rewards/margins": 7.5405168533325195, + "rewards/rejected": -4.726344585418701, + "step": 18433 + }, + { + "epoch": 4.61, + "grad_norm": 10.176673889160156, + "learning_rate": 1.478776678977911e-07, + "logits/chosen": -0.5746676921844482, + "logits/rejected": -0.6637440919876099, + "logps/chosen": -65.97135925292969, + "logps/rejected": -112.0789794921875, + "loss": 0.8112, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.316469669342041, + "rewards/margins": 7.486366271972656, + "rewards/rejected": -4.169896125793457, + "step": 18434 + }, + { + "epoch": 4.61, + "grad_norm": 4.307641506195068, + "learning_rate": 1.476879865304931e-07, + "logits/chosen": -0.5842913389205933, + "logits/rejected": -0.6866624355316162, + "logps/chosen": -55.82656478881836, + "logps/rejected": -99.99711608886719, + "loss": 0.7107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.106292724609375, + "rewards/margins": 7.378407001495361, + "rewards/rejected": -4.2721147537231445, + "step": 18435 + }, + { + "epoch": 4.61, + "grad_norm": 5.4275221824646, + "learning_rate": 1.474984250689876e-07, + "logits/chosen": -0.5788276791572571, + "logits/rejected": -0.6764869689941406, + "logps/chosen": -51.68145751953125, + "logps/rejected": -101.56230163574219, + "loss": 0.5108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1471681594848633, + "rewards/margins": 7.683880805969238, + "rewards/rejected": -4.536711692810059, + "step": 18436 + }, + { + "epoch": 4.61, + "grad_norm": 6.296716213226318, + "learning_rate": 1.4730898351795586e-07, + "logits/chosen": -0.6873803734779358, + "logits/rejected": -0.6907889246940613, + "logps/chosen": -43.336769104003906, + "logps/rejected": -117.77993774414062, + "loss": 0.5686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3881137371063232, + "rewards/margins": 7.06756591796875, + "rewards/rejected": -3.6794519424438477, + "step": 18437 + }, + { + "epoch": 4.61, + "grad_norm": 4.188784599304199, + "learning_rate": 1.471196618820808e-07, + "logits/chosen": -0.5084785223007202, + "logits/rejected": -0.5933922529220581, + "logps/chosen": -58.69803237915039, + "logps/rejected": -111.7873764038086, + "loss": 0.7355, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1537082195281982, + "rewards/margins": 7.92363977432251, + "rewards/rejected": -4.769932270050049, + "step": 18438 + }, + { + "epoch": 4.61, + "grad_norm": 7.133900165557861, + "learning_rate": 1.4693046016604152e-07, + "logits/chosen": -0.5533196330070496, + "logits/rejected": -0.6704493761062622, + "logps/chosen": -67.72608184814453, + "logps/rejected": -95.74198913574219, + "loss": 0.6343, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0263803005218506, + "rewards/margins": 7.676225185394287, + "rewards/rejected": -4.649844646453857, + "step": 18439 + }, + { + "epoch": 4.61, + "grad_norm": 3.0838232040405273, + "learning_rate": 1.467413783745114e-07, + "logits/chosen": -0.5451022982597351, + "logits/rejected": -0.6262849569320679, + "logps/chosen": -42.579002380371094, + "logps/rejected": -105.80769348144531, + "loss": 0.5464, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2552356719970703, + "rewards/margins": 7.853385925292969, + "rewards/rejected": -4.598150730133057, + "step": 18440 + }, + { + "epoch": 4.61, + "grad_norm": 10.829850196838379, + "learning_rate": 1.4655241651216456e-07, + "logits/chosen": -0.599736213684082, + "logits/rejected": -0.7135962247848511, + "logps/chosen": -54.35026550292969, + "logps/rejected": -107.34019470214844, + "loss": 0.6451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.735154628753662, + "rewards/margins": 7.557730674743652, + "rewards/rejected": -4.82257604598999, + "step": 18441 + }, + { + "epoch": 4.61, + "grad_norm": 3.8177003860473633, + "learning_rate": 1.4636357458367e-07, + "logits/chosen": -0.5777589678764343, + "logits/rejected": -0.6918892860412598, + "logps/chosen": -55.87877655029297, + "logps/rejected": -108.3144302368164, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.448547840118408, + "rewards/margins": 8.502554893493652, + "rewards/rejected": -5.054007530212402, + "step": 18442 + }, + { + "epoch": 4.61, + "grad_norm": 11.01729965209961, + "learning_rate": 1.46174852593694e-07, + "logits/chosen": -0.6045094728469849, + "logits/rejected": -0.7456791400909424, + "logps/chosen": -64.41484069824219, + "logps/rejected": -97.31902313232422, + "loss": 0.6761, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7245302200317383, + "rewards/margins": 7.870630741119385, + "rewards/rejected": -5.1461005210876465, + "step": 18443 + }, + { + "epoch": 4.61, + "grad_norm": 5.868189811706543, + "learning_rate": 1.4598625054689953e-07, + "logits/chosen": -0.6196625232696533, + "logits/rejected": -0.6999559998512268, + "logps/chosen": -57.35647201538086, + "logps/rejected": -107.16720581054688, + "loss": 0.6282, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3818840980529785, + "rewards/margins": 8.025303840637207, + "rewards/rejected": -4.6434197425842285, + "step": 18444 + }, + { + "epoch": 4.61, + "grad_norm": 3.9550747871398926, + "learning_rate": 1.4579776844794834e-07, + "logits/chosen": -0.5551726818084717, + "logits/rejected": -0.6240448951721191, + "logps/chosen": -48.012107849121094, + "logps/rejected": -100.67366027832031, + "loss": 0.6971, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.307072162628174, + "rewards/margins": 6.55837869644165, + "rewards/rejected": -3.2513065338134766, + "step": 18445 + }, + { + "epoch": 4.61, + "grad_norm": 2.7324814796447754, + "learning_rate": 1.456094063014979e-07, + "logits/chosen": -0.5653655529022217, + "logits/rejected": -0.6280409693717957, + "logps/chosen": -48.649688720703125, + "logps/rejected": -113.50370025634766, + "loss": 0.5876, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.985811948776245, + "rewards/margins": 7.481936454772949, + "rewards/rejected": -4.496125221252441, + "step": 18446 + }, + { + "epoch": 4.61, + "grad_norm": 4.479350566864014, + "learning_rate": 1.4542116411220164e-07, + "logits/chosen": -0.5274064540863037, + "logits/rejected": -0.6187217831611633, + "logps/chosen": -63.954551696777344, + "logps/rejected": -101.13531494140625, + "loss": 0.6192, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.750244140625, + "rewards/margins": 7.1350274085998535, + "rewards/rejected": -4.3847832679748535, + "step": 18447 + }, + { + "epoch": 4.61, + "grad_norm": 7.186918258666992, + "learning_rate": 1.4523304188471255e-07, + "logits/chosen": -0.5337638258934021, + "logits/rejected": -0.6391469836235046, + "logps/chosen": -57.43482208251953, + "logps/rejected": -114.40363311767578, + "loss": 0.6115, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2885260581970215, + "rewards/margins": 8.409378051757812, + "rewards/rejected": -5.120851516723633, + "step": 18448 + }, + { + "epoch": 4.62, + "grad_norm": 4.9154133796691895, + "learning_rate": 1.45045039623678e-07, + "logits/chosen": -0.6359223127365112, + "logits/rejected": -0.6945561766624451, + "logps/chosen": -55.804752349853516, + "logps/rejected": -107.49244689941406, + "loss": 0.7715, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.861494541168213, + "rewards/margins": 7.348684787750244, + "rewards/rejected": -4.487189769744873, + "step": 18449 + }, + { + "epoch": 4.62, + "grad_norm": 9.891653060913086, + "learning_rate": 1.4485715733374482e-07, + "logits/chosen": -0.487437903881073, + "logits/rejected": -0.6135947108268738, + "logps/chosen": -62.80312728881836, + "logps/rejected": -100.11296081542969, + "loss": 0.7389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.974637508392334, + "rewards/margins": 7.247100353240967, + "rewards/rejected": -4.272462368011475, + "step": 18450 + }, + { + "epoch": 4.62, + "grad_norm": 12.78709602355957, + "learning_rate": 1.4466939501955602e-07, + "logits/chosen": -0.5053683519363403, + "logits/rejected": -0.5906517505645752, + "logps/chosen": -55.454627990722656, + "logps/rejected": -105.20590209960938, + "loss": 0.6171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.134896755218506, + "rewards/margins": 7.824058532714844, + "rewards/rejected": -4.68916130065918, + "step": 18451 + }, + { + "epoch": 4.62, + "grad_norm": 4.97633695602417, + "learning_rate": 1.4448175268575005e-07, + "logits/chosen": -0.49538326263427734, + "logits/rejected": -0.6422094106674194, + "logps/chosen": -57.94849395751953, + "logps/rejected": -105.55970764160156, + "loss": 0.6107, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.07835054397583, + "rewards/margins": 8.60618782043457, + "rewards/rejected": -5.52783727645874, + "step": 18452 + }, + { + "epoch": 4.62, + "grad_norm": 5.773235321044922, + "learning_rate": 1.442942303369643e-07, + "logits/chosen": -0.6353707909584045, + "logits/rejected": -0.7230668067932129, + "logps/chosen": -54.25889587402344, + "logps/rejected": -110.14899444580078, + "loss": 0.5837, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.285616397857666, + "rewards/margins": 7.893896102905273, + "rewards/rejected": -4.608280181884766, + "step": 18453 + }, + { + "epoch": 4.62, + "grad_norm": 3.526353120803833, + "learning_rate": 1.4410682797783348e-07, + "logits/chosen": -0.44763046503067017, + "logits/rejected": -0.5572181940078735, + "logps/chosen": -46.7181396484375, + "logps/rejected": -100.04901885986328, + "loss": 0.5418, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.230928897857666, + "rewards/margins": 7.263278961181641, + "rewards/rejected": -4.032349586486816, + "step": 18454 + }, + { + "epoch": 4.62, + "grad_norm": 2.869570255279541, + "learning_rate": 1.4391954561298827e-07, + "logits/chosen": -0.6073725819587708, + "logits/rejected": -0.6770752668380737, + "logps/chosen": -51.66178894042969, + "logps/rejected": -113.08158111572266, + "loss": 0.5724, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.452193260192871, + "rewards/margins": 7.650046348571777, + "rewards/rejected": -4.1978535652160645, + "step": 18455 + }, + { + "epoch": 4.62, + "grad_norm": 5.192141056060791, + "learning_rate": 1.4373238324705552e-07, + "logits/chosen": -0.5288646221160889, + "logits/rejected": -0.6576855778694153, + "logps/chosen": -65.57377624511719, + "logps/rejected": -104.06304931640625, + "loss": 0.6031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7978782653808594, + "rewards/margins": 7.0570502281188965, + "rewards/rejected": -4.259171962738037, + "step": 18456 + }, + { + "epoch": 4.62, + "grad_norm": 6.067903518676758, + "learning_rate": 1.4354534088466098e-07, + "logits/chosen": -0.5576624274253845, + "logits/rejected": -0.6739488840103149, + "logps/chosen": -60.206729888916016, + "logps/rejected": -106.35735321044922, + "loss": 0.7452, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8893630504608154, + "rewards/margins": 7.613508224487305, + "rewards/rejected": -4.72414493560791, + "step": 18457 + }, + { + "epoch": 4.62, + "grad_norm": 2.646120071411133, + "learning_rate": 1.4335841853042708e-07, + "logits/chosen": -0.5167410373687744, + "logits/rejected": -0.5603365898132324, + "logps/chosen": -50.199302673339844, + "logps/rejected": -103.117919921875, + "loss": 0.5693, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1676292419433594, + "rewards/margins": 7.483181953430176, + "rewards/rejected": -4.315552711486816, + "step": 18458 + }, + { + "epoch": 4.62, + "grad_norm": 3.145704507827759, + "learning_rate": 1.431716161889718e-07, + "logits/chosen": -0.5777453184127808, + "logits/rejected": -0.6676897406578064, + "logps/chosen": -57.57623291015625, + "logps/rejected": -102.71290588378906, + "loss": 0.6016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1267824172973633, + "rewards/margins": 7.390064716339111, + "rewards/rejected": -4.263282299041748, + "step": 18459 + }, + { + "epoch": 4.62, + "grad_norm": 5.690033912658691, + "learning_rate": 1.4298493386491252e-07, + "logits/chosen": -0.570110559463501, + "logits/rejected": -0.649578332901001, + "logps/chosen": -68.50721740722656, + "logps/rejected": -101.8949203491211, + "loss": 0.6259, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8482556343078613, + "rewards/margins": 6.315277099609375, + "rewards/rejected": -3.4670207500457764, + "step": 18460 + }, + { + "epoch": 4.62, + "grad_norm": 8.451935768127441, + "learning_rate": 1.4279837156286168e-07, + "logits/chosen": -0.572319746017456, + "logits/rejected": -0.6630426645278931, + "logps/chosen": -56.09082794189453, + "logps/rejected": -98.68836212158203, + "loss": 0.614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0759153366088867, + "rewards/margins": 6.971757411956787, + "rewards/rejected": -3.8958418369293213, + "step": 18461 + }, + { + "epoch": 4.62, + "grad_norm": 29.647016525268555, + "learning_rate": 1.4261192928742894e-07, + "logits/chosen": -0.580726146697998, + "logits/rejected": -0.6566603183746338, + "logps/chosen": -57.6711311340332, + "logps/rejected": -118.42927551269531, + "loss": 0.6928, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8998806476593018, + "rewards/margins": 7.739961624145508, + "rewards/rejected": -4.840081214904785, + "step": 18462 + }, + { + "epoch": 4.62, + "grad_norm": 2.8949508666992188, + "learning_rate": 1.4242560704322228e-07, + "logits/chosen": -0.6091124415397644, + "logits/rejected": -0.6583424806594849, + "logps/chosen": -49.86172866821289, + "logps/rejected": -99.56572723388672, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2110633850097656, + "rewards/margins": 7.222107887268066, + "rewards/rejected": -4.011044502258301, + "step": 18463 + }, + { + "epoch": 4.62, + "grad_norm": 4.277819633483887, + "learning_rate": 1.4223940483484577e-07, + "logits/chosen": -0.5201418399810791, + "logits/rejected": -0.63433438539505, + "logps/chosen": -61.70531463623047, + "logps/rejected": -111.73157501220703, + "loss": 0.5801, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.877509355545044, + "rewards/margins": 7.435166358947754, + "rewards/rejected": -4.557656288146973, + "step": 18464 + }, + { + "epoch": 4.62, + "grad_norm": 2.8242602348327637, + "learning_rate": 1.4205332266689965e-07, + "logits/chosen": -0.5496016144752502, + "logits/rejected": -0.6424810290336609, + "logps/chosen": -45.79668426513672, + "logps/rejected": -102.37642669677734, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03745174407959, + "rewards/margins": 7.777660369873047, + "rewards/rejected": -4.740208148956299, + "step": 18465 + }, + { + "epoch": 4.62, + "grad_norm": 5.036864280700684, + "learning_rate": 1.4186736054398353e-07, + "logits/chosen": -0.5458247661590576, + "logits/rejected": -0.6215978264808655, + "logps/chosen": -58.739830017089844, + "logps/rejected": -107.97801971435547, + "loss": 0.6978, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8695168495178223, + "rewards/margins": 6.520503520965576, + "rewards/rejected": -3.650987148284912, + "step": 18466 + }, + { + "epoch": 4.62, + "grad_norm": 2.7290077209472656, + "learning_rate": 1.4168151847069267e-07, + "logits/chosen": -0.6318212747573853, + "logits/rejected": -0.7127053737640381, + "logps/chosen": -53.77703094482422, + "logps/rejected": -110.3581771850586, + "loss": 0.5465, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.448732852935791, + "rewards/margins": 8.192066192626953, + "rewards/rejected": -4.74333381652832, + "step": 18467 + }, + { + "epoch": 4.62, + "grad_norm": 6.078935146331787, + "learning_rate": 1.4149579645161892e-07, + "logits/chosen": -0.5680851340293884, + "logits/rejected": -0.6620502471923828, + "logps/chosen": -55.22056579589844, + "logps/rejected": -114.87574005126953, + "loss": 0.5649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.319258451461792, + "rewards/margins": 8.489439010620117, + "rewards/rejected": -5.170179843902588, + "step": 18468 + }, + { + "epoch": 4.62, + "grad_norm": 5.727390289306641, + "learning_rate": 1.4131019449135143e-07, + "logits/chosen": -0.5800096988677979, + "logits/rejected": -0.6675981879234314, + "logps/chosen": -50.1864013671875, + "logps/rejected": -113.42362976074219, + "loss": 0.6408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6450886726379395, + "rewards/margins": 7.591939449310303, + "rewards/rejected": -4.946850299835205, + "step": 18469 + }, + { + "epoch": 4.62, + "grad_norm": 3.5471394062042236, + "learning_rate": 1.4112471259447703e-07, + "logits/chosen": -0.6432307362556458, + "logits/rejected": -0.7445358633995056, + "logps/chosen": -51.481544494628906, + "logps/rejected": -116.98672485351562, + "loss": 0.6341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188093900680542, + "rewards/margins": 9.11119556427002, + "rewards/rejected": -5.923101902008057, + "step": 18470 + }, + { + "epoch": 4.62, + "grad_norm": 3.9111862182617188, + "learning_rate": 1.409393507655793e-07, + "logits/chosen": -0.539862334728241, + "logits/rejected": -0.6212419271469116, + "logps/chosen": -63.44162368774414, + "logps/rejected": -89.69913482666016, + "loss": 0.6263, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.198455572128296, + "rewards/margins": 6.395038604736328, + "rewards/rejected": -3.1965830326080322, + "step": 18471 + }, + { + "epoch": 4.62, + "grad_norm": 5.01838493347168, + "learning_rate": 1.4075410900923847e-07, + "logits/chosen": -0.5758573412895203, + "logits/rejected": -0.6873120665550232, + "logps/chosen": -51.45526123046875, + "logps/rejected": -103.77268981933594, + "loss": 0.5834, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.903959274291992, + "rewards/margins": 7.6391448974609375, + "rewards/rejected": -4.735185146331787, + "step": 18472 + }, + { + "epoch": 4.62, + "grad_norm": 3.933972120285034, + "learning_rate": 1.40568987330032e-07, + "logits/chosen": -0.6041074991226196, + "logits/rejected": -0.6704300045967102, + "logps/chosen": -72.4547119140625, + "logps/rejected": -117.94259643554688, + "loss": 0.7014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.127744674682617, + "rewards/margins": 7.639636516571045, + "rewards/rejected": -4.5118913650512695, + "step": 18473 + }, + { + "epoch": 4.62, + "grad_norm": 7.359065532684326, + "learning_rate": 1.4038398573253452e-07, + "logits/chosen": -0.4795292019844055, + "logits/rejected": -0.5708296298980713, + "logps/chosen": -63.546234130859375, + "logps/rejected": -131.86590576171875, + "loss": 0.5984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9447121620178223, + "rewards/margins": 7.369151592254639, + "rewards/rejected": -4.424439430236816, + "step": 18474 + }, + { + "epoch": 4.62, + "grad_norm": 3.0660245418548584, + "learning_rate": 1.4019910422131799e-07, + "logits/chosen": -0.5991713404655457, + "logits/rejected": -0.7109869718551636, + "logps/chosen": -53.439266204833984, + "logps/rejected": -92.64198303222656, + "loss": 0.5568, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1233201026916504, + "rewards/margins": 7.501143932342529, + "rewards/rejected": -4.377823829650879, + "step": 18475 + }, + { + "epoch": 4.62, + "grad_norm": 3.2979118824005127, + "learning_rate": 1.4001434280095094e-07, + "logits/chosen": -0.6098273396492004, + "logits/rejected": -0.6811153888702393, + "logps/chosen": -50.22636413574219, + "logps/rejected": -107.36400604248047, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.571239471435547, + "rewards/margins": 7.653952598571777, + "rewards/rejected": -4.0827131271362305, + "step": 18476 + }, + { + "epoch": 4.62, + "grad_norm": 9.29659366607666, + "learning_rate": 1.3982970147599806e-07, + "logits/chosen": -0.5604119896888733, + "logits/rejected": -0.6514191627502441, + "logps/chosen": -51.50884246826172, + "logps/rejected": -102.67242431640625, + "loss": 0.5956, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1542460918426514, + "rewards/margins": 7.62756872177124, + "rewards/rejected": -4.47332239151001, + "step": 18477 + }, + { + "epoch": 4.62, + "grad_norm": 6.3326191902160645, + "learning_rate": 1.3964518025102292e-07, + "logits/chosen": -0.5670545101165771, + "logits/rejected": -0.6256366968154907, + "logps/chosen": -45.524818420410156, + "logps/rejected": -110.17314147949219, + "loss": 0.5942, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0389227867126465, + "rewards/margins": 7.404476165771484, + "rewards/rejected": -4.365553379058838, + "step": 18478 + }, + { + "epoch": 4.62, + "grad_norm": 7.999419212341309, + "learning_rate": 1.3946077913058575e-07, + "logits/chosen": -0.5198894739151001, + "logits/rejected": -0.6180892586708069, + "logps/chosen": -62.243385314941406, + "logps/rejected": -88.5509262084961, + "loss": 0.6712, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9779586791992188, + "rewards/margins": 5.792728424072266, + "rewards/rejected": -2.814770460128784, + "step": 18479 + }, + { + "epoch": 4.62, + "grad_norm": 4.525938034057617, + "learning_rate": 1.3927649811924182e-07, + "logits/chosen": -0.5061566233634949, + "logits/rejected": -0.5610877275466919, + "logps/chosen": -55.404842376708984, + "logps/rejected": -96.28082275390625, + "loss": 0.6455, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0926353931427, + "rewards/margins": 6.380970001220703, + "rewards/rejected": -3.288334608078003, + "step": 18480 + }, + { + "epoch": 4.62, + "grad_norm": 4.989224433898926, + "learning_rate": 1.390923372215458e-07, + "logits/chosen": -0.5284961462020874, + "logits/rejected": -0.6391825675964355, + "logps/chosen": -56.228389739990234, + "logps/rejected": -126.03382110595703, + "loss": 0.6169, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.902042865753174, + "rewards/margins": 8.56276798248291, + "rewards/rejected": -5.660724639892578, + "step": 18481 + }, + { + "epoch": 4.62, + "grad_norm": 2.5141677856445312, + "learning_rate": 1.389082964420496e-07, + "logits/chosen": -0.6270264983177185, + "logits/rejected": -0.7050790786743164, + "logps/chosen": -59.72052764892578, + "logps/rejected": -116.30241394042969, + "loss": 0.6454, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5489089488983154, + "rewards/margins": 9.0826416015625, + "rewards/rejected": -5.533731460571289, + "step": 18482 + }, + { + "epoch": 4.62, + "grad_norm": 6.590972900390625, + "learning_rate": 1.3872437578529851e-07, + "logits/chosen": -0.5591471195220947, + "logits/rejected": -0.6525722146034241, + "logps/chosen": -46.631492614746094, + "logps/rejected": -108.55644989013672, + "loss": 0.4874, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3284475803375244, + "rewards/margins": 8.450296401977539, + "rewards/rejected": -5.121849060058594, + "step": 18483 + }, + { + "epoch": 4.62, + "grad_norm": 3.816296339035034, + "learning_rate": 1.3854057525583885e-07, + "logits/chosen": -0.5542503595352173, + "logits/rejected": -0.6292675137519836, + "logps/chosen": -48.27267074584961, + "logps/rejected": -102.6889419555664, + "loss": 0.6214, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3772802352905273, + "rewards/margins": 7.953294277191162, + "rewards/rejected": -4.576014041900635, + "step": 18484 + }, + { + "epoch": 4.62, + "grad_norm": 3.2118399143218994, + "learning_rate": 1.3835689485821313e-07, + "logits/chosen": -0.5187495350837708, + "logits/rejected": -0.6190199851989746, + "logps/chosen": -59.43812942504883, + "logps/rejected": -96.18729400634766, + "loss": 0.5209, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3346378803253174, + "rewards/margins": 6.3294830322265625, + "rewards/rejected": -2.9948458671569824, + "step": 18485 + }, + { + "epoch": 4.62, + "grad_norm": 5.65465784072876, + "learning_rate": 1.3817333459695936e-07, + "logits/chosen": -0.5230748057365417, + "logits/rejected": -0.5935672521591187, + "logps/chosen": -62.064666748046875, + "logps/rejected": -124.48229217529297, + "loss": 0.585, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176276683807373, + "rewards/margins": 8.325252532958984, + "rewards/rejected": -5.148974895477295, + "step": 18486 + }, + { + "epoch": 4.62, + "grad_norm": 3.393629550933838, + "learning_rate": 1.3798989447661336e-07, + "logits/chosen": -0.5253666639328003, + "logits/rejected": -0.6053491234779358, + "logps/chosen": -55.714569091796875, + "logps/rejected": -85.37933349609375, + "loss": 0.6191, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5051496028900146, + "rewards/margins": 6.186505317687988, + "rewards/rejected": -2.6813549995422363, + "step": 18487 + }, + { + "epoch": 4.62, + "grad_norm": 7.648283958435059, + "learning_rate": 1.3780657450170932e-07, + "logits/chosen": -0.5254917144775391, + "logits/rejected": -0.6212986707687378, + "logps/chosen": -56.297325134277344, + "logps/rejected": -102.20503234863281, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.00217604637146, + "rewards/margins": 7.610988616943359, + "rewards/rejected": -4.608811855316162, + "step": 18488 + }, + { + "epoch": 4.63, + "grad_norm": 7.908689975738525, + "learning_rate": 1.3762337467677633e-07, + "logits/chosen": -0.4775099754333496, + "logits/rejected": -0.6420085430145264, + "logps/chosen": -74.27322387695312, + "logps/rejected": -108.0630111694336, + "loss": 0.5929, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.616001844406128, + "rewards/margins": 7.002285003662109, + "rewards/rejected": -4.386282920837402, + "step": 18489 + }, + { + "epoch": 4.63, + "grad_norm": 17.075519561767578, + "learning_rate": 1.374402950063408e-07, + "logits/chosen": -0.5842788219451904, + "logits/rejected": -0.6254281997680664, + "logps/chosen": -49.7181396484375, + "logps/rejected": -112.73250579833984, + "loss": 0.6599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.719609260559082, + "rewards/margins": 6.8146257400512695, + "rewards/rejected": -4.095015525817871, + "step": 18490 + }, + { + "epoch": 4.63, + "grad_norm": 12.147019386291504, + "learning_rate": 1.372573354949286e-07, + "logits/chosen": -0.5897321701049805, + "logits/rejected": -0.6424738764762878, + "logps/chosen": -62.790809631347656, + "logps/rejected": -119.53595733642578, + "loss": 0.7557, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.6586482524871826, + "rewards/margins": 7.057720184326172, + "rewards/rejected": -4.39907169342041, + "step": 18491 + }, + { + "epoch": 4.63, + "grad_norm": 8.804954528808594, + "learning_rate": 1.3707449614705882e-07, + "logits/chosen": -0.5338008999824524, + "logits/rejected": -0.6458321809768677, + "logps/chosen": -50.51516342163086, + "logps/rejected": -99.13225555419922, + "loss": 0.6184, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8091583251953125, + "rewards/margins": 7.746926307678223, + "rewards/rejected": -4.93776798248291, + "step": 18492 + }, + { + "epoch": 4.63, + "grad_norm": 4.526075839996338, + "learning_rate": 1.3689177696725176e-07, + "logits/chosen": -0.5989097356796265, + "logits/rejected": -0.6224370002746582, + "logps/chosen": -49.6666374206543, + "logps/rejected": -105.08890533447266, + "loss": 0.6596, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0496342182159424, + "rewards/margins": 6.747501850128174, + "rewards/rejected": -3.6978678703308105, + "step": 18493 + }, + { + "epoch": 4.63, + "grad_norm": 8.366461753845215, + "learning_rate": 1.3670917796002104e-07, + "logits/chosen": -0.5555751919746399, + "logits/rejected": -0.5913290977478027, + "logps/chosen": -56.05345916748047, + "logps/rejected": -116.9950942993164, + "loss": 0.6289, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3773303031921387, + "rewards/margins": 7.809471607208252, + "rewards/rejected": -4.432140827178955, + "step": 18494 + }, + { + "epoch": 4.63, + "grad_norm": 6.0081706047058105, + "learning_rate": 1.3652669912987914e-07, + "logits/chosen": -0.5253843069076538, + "logits/rejected": -0.6022760272026062, + "logps/chosen": -69.0745849609375, + "logps/rejected": -106.82682800292969, + "loss": 0.6141, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.039486885070801, + "rewards/margins": 7.165899276733398, + "rewards/rejected": -4.126412868499756, + "step": 18495 + }, + { + "epoch": 4.63, + "grad_norm": 6.499022960662842, + "learning_rate": 1.3634434048133583e-07, + "logits/chosen": -0.619468629360199, + "logits/rejected": -0.7046397924423218, + "logps/chosen": -54.224788665771484, + "logps/rejected": -121.90645599365234, + "loss": 0.5704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5203428268432617, + "rewards/margins": 7.969610214233398, + "rewards/rejected": -4.449267387390137, + "step": 18496 + }, + { + "epoch": 4.63, + "grad_norm": 4.115801811218262, + "learning_rate": 1.361621020188969e-07, + "logits/chosen": -0.5745084285736084, + "logits/rejected": -0.6693778038024902, + "logps/chosen": -81.7928466796875, + "logps/rejected": -117.43630981445312, + "loss": 0.6906, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0677433013916016, + "rewards/margins": 7.80415153503418, + "rewards/rejected": -4.736408710479736, + "step": 18497 + }, + { + "epoch": 4.63, + "grad_norm": 8.071520805358887, + "learning_rate": 1.35979983747066e-07, + "logits/chosen": -0.5082718729972839, + "logits/rejected": -0.5998460054397583, + "logps/chosen": -50.461971282958984, + "logps/rejected": -117.8973388671875, + "loss": 0.5816, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194390296936035, + "rewards/margins": 8.991588592529297, + "rewards/rejected": -5.797198295593262, + "step": 18498 + }, + { + "epoch": 4.63, + "grad_norm": 2.7271881103515625, + "learning_rate": 1.3579798567034286e-07, + "logits/chosen": -0.6007087230682373, + "logits/rejected": -0.6826967597007751, + "logps/chosen": -51.251548767089844, + "logps/rejected": -107.2890853881836, + "loss": 0.5998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3198318481445312, + "rewards/margins": 9.079852104187012, + "rewards/rejected": -5.760021209716797, + "step": 18499 + }, + { + "epoch": 4.63, + "grad_norm": 2.3659989833831787, + "learning_rate": 1.3561610779322609e-07, + "logits/chosen": -0.4902109205722809, + "logits/rejected": -0.5990089178085327, + "logps/chosen": -50.718082427978516, + "logps/rejected": -98.62084197998047, + "loss": 0.5915, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3109874725341797, + "rewards/margins": 8.21189022064209, + "rewards/rejected": -4.90090274810791, + "step": 18500 + }, + { + "epoch": 4.63, + "grad_norm": 15.015575408935547, + "learning_rate": 1.354343501202088e-07, + "logits/chosen": -0.5198379158973694, + "logits/rejected": -0.622505247592926, + "logps/chosen": -52.98636245727539, + "logps/rejected": -101.49137115478516, + "loss": 0.6436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.869854211807251, + "rewards/margins": 7.10845422744751, + "rewards/rejected": -4.238600254058838, + "step": 18501 + }, + { + "epoch": 4.63, + "grad_norm": 3.557887315750122, + "learning_rate": 1.3525271265578287e-07, + "logits/chosen": -0.4947452247142792, + "logits/rejected": -0.5821173191070557, + "logps/chosen": -42.93651580810547, + "logps/rejected": -108.17146301269531, + "loss": 0.5196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.398186206817627, + "rewards/margins": 7.8318190574646, + "rewards/rejected": -4.433632850646973, + "step": 18502 + }, + { + "epoch": 4.63, + "grad_norm": 8.483743667602539, + "learning_rate": 1.3507119540443703e-07, + "logits/chosen": -0.6293566226959229, + "logits/rejected": -0.6796526908874512, + "logps/chosen": -48.017601013183594, + "logps/rejected": -128.8303985595703, + "loss": 0.5287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.995058059692383, + "rewards/margins": 8.757257461547852, + "rewards/rejected": -5.762200355529785, + "step": 18503 + }, + { + "epoch": 4.63, + "grad_norm": 1.8334499597549438, + "learning_rate": 1.3488979837065653e-07, + "logits/chosen": -0.5927553772926331, + "logits/rejected": -0.6725910902023315, + "logps/chosen": -56.94102478027344, + "logps/rejected": -104.88487243652344, + "loss": 0.5402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4172277450561523, + "rewards/margins": 7.787868499755859, + "rewards/rejected": -4.370640754699707, + "step": 18504 + }, + { + "epoch": 4.63, + "grad_norm": 5.078468322753906, + "learning_rate": 1.3470852155892332e-07, + "logits/chosen": -0.563402533531189, + "logits/rejected": -0.5917094349861145, + "logps/chosen": -54.18718719482422, + "logps/rejected": -120.35269165039062, + "loss": 0.6076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.053645133972168, + "rewards/margins": 7.309937477111816, + "rewards/rejected": -4.256292819976807, + "step": 18505 + }, + { + "epoch": 4.63, + "grad_norm": 4.930517673492432, + "learning_rate": 1.345273649737183e-07, + "logits/chosen": -0.5736672878265381, + "logits/rejected": -0.596784770488739, + "logps/chosen": -53.28043746948242, + "logps/rejected": -94.72154998779297, + "loss": 0.6699, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.009390354156494, + "rewards/margins": 5.42985200881958, + "rewards/rejected": -2.420461654663086, + "step": 18506 + }, + { + "epoch": 4.63, + "grad_norm": 4.86274528503418, + "learning_rate": 1.343463286195168e-07, + "logits/chosen": -0.6089975237846375, + "logits/rejected": -0.6611326932907104, + "logps/chosen": -47.93611526489258, + "logps/rejected": -118.94965362548828, + "loss": 0.5535, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.5433969497680664, + "rewards/margins": 6.8096208572387695, + "rewards/rejected": -4.266223907470703, + "step": 18507 + }, + { + "epoch": 4.63, + "grad_norm": 4.3643999099731445, + "learning_rate": 1.3416541250079295e-07, + "logits/chosen": -0.5576719045639038, + "logits/rejected": -0.670318067073822, + "logps/chosen": -54.87386703491211, + "logps/rejected": -116.6332015991211, + "loss": 0.5731, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.13130521774292, + "rewards/margins": 7.698168754577637, + "rewards/rejected": -4.566863536834717, + "step": 18508 + }, + { + "epoch": 4.63, + "grad_norm": 23.38018798828125, + "learning_rate": 1.3398461662201656e-07, + "logits/chosen": -0.5579917430877686, + "logits/rejected": -0.6368932127952576, + "logps/chosen": -55.76302719116211, + "logps/rejected": -101.0577163696289, + "loss": 0.6222, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9776196479797363, + "rewards/margins": 7.2831268310546875, + "rewards/rejected": -4.305507659912109, + "step": 18509 + }, + { + "epoch": 4.63, + "grad_norm": 4.681807518005371, + "learning_rate": 1.3380394098765738e-07, + "logits/chosen": -0.5479328036308289, + "logits/rejected": -0.6511052250862122, + "logps/chosen": -62.62449264526367, + "logps/rejected": -105.63975524902344, + "loss": 0.6304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7869937419891357, + "rewards/margins": 7.425045967102051, + "rewards/rejected": -3.6380515098571777, + "step": 18510 + }, + { + "epoch": 4.63, + "grad_norm": 8.021562576293945, + "learning_rate": 1.336233856021779e-07, + "logits/chosen": -0.5846529603004456, + "logits/rejected": -0.6075745820999146, + "logps/chosen": -40.96446228027344, + "logps/rejected": -119.89739990234375, + "loss": 0.5212, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.173513412475586, + "rewards/margins": 6.872376441955566, + "rewards/rejected": -3.698862075805664, + "step": 18511 + }, + { + "epoch": 4.63, + "grad_norm": 8.192214012145996, + "learning_rate": 1.3344295047004075e-07, + "logits/chosen": -0.5299268960952759, + "logits/rejected": -0.6214522123336792, + "logps/chosen": -67.94662475585938, + "logps/rejected": -103.9690170288086, + "loss": 0.7029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9278411865234375, + "rewards/margins": 6.954460144042969, + "rewards/rejected": -4.0266194343566895, + "step": 18512 + }, + { + "epoch": 4.63, + "grad_norm": 7.332427501678467, + "learning_rate": 1.3326263559570453e-07, + "logits/chosen": -0.5327216982841492, + "logits/rejected": -0.6236935257911682, + "logps/chosen": -57.22683334350586, + "logps/rejected": -98.71562957763672, + "loss": 0.6376, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.022162675857544, + "rewards/margins": 6.326697826385498, + "rewards/rejected": -3.304535388946533, + "step": 18513 + }, + { + "epoch": 4.63, + "grad_norm": 4.225401878356934, + "learning_rate": 1.3308244098362565e-07, + "logits/chosen": -0.5600703954696655, + "logits/rejected": -0.6246209740638733, + "logps/chosen": -56.633724212646484, + "logps/rejected": -109.54702758789062, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0961756706237793, + "rewards/margins": 7.3367791175842285, + "rewards/rejected": -4.240602970123291, + "step": 18514 + }, + { + "epoch": 4.63, + "grad_norm": 8.445503234863281, + "learning_rate": 1.3290236663825562e-07, + "logits/chosen": -0.6257051229476929, + "logits/rejected": -0.7052342295646667, + "logps/chosen": -40.4824104309082, + "logps/rejected": -88.11288452148438, + "loss": 0.6268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9732372760772705, + "rewards/margins": 7.028076648712158, + "rewards/rejected": -4.054840087890625, + "step": 18515 + }, + { + "epoch": 4.63, + "grad_norm": 3.4865005016326904, + "learning_rate": 1.3272241256404528e-07, + "logits/chosen": -0.6386445164680481, + "logits/rejected": -0.7258535623550415, + "logps/chosen": -69.66105651855469, + "logps/rejected": -86.32618713378906, + "loss": 0.726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9735758304595947, + "rewards/margins": 6.350162506103516, + "rewards/rejected": -3.376586675643921, + "step": 18516 + }, + { + "epoch": 4.63, + "grad_norm": 18.39946746826172, + "learning_rate": 1.3254257876544107e-07, + "logits/chosen": -0.5298271179199219, + "logits/rejected": -0.6156821846961975, + "logps/chosen": -53.7718505859375, + "logps/rejected": -110.65855407714844, + "loss": 0.61, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3004798889160156, + "rewards/margins": 8.0224027633667, + "rewards/rejected": -4.721924304962158, + "step": 18517 + }, + { + "epoch": 4.63, + "grad_norm": 5.2974653244018555, + "learning_rate": 1.3236286524688725e-07, + "logits/chosen": -0.5330613851547241, + "logits/rejected": -0.6060933470726013, + "logps/chosen": -52.492408752441406, + "logps/rejected": -114.29963684082031, + "loss": 0.5552, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.191176652908325, + "rewards/margins": 7.878779411315918, + "rewards/rejected": -4.687602519989014, + "step": 18518 + }, + { + "epoch": 4.63, + "grad_norm": 2.2983970642089844, + "learning_rate": 1.3218327201282466e-07, + "logits/chosen": -0.5622215867042542, + "logits/rejected": -0.6537983417510986, + "logps/chosen": -53.36280059814453, + "logps/rejected": -118.92817687988281, + "loss": 0.5713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2742342948913574, + "rewards/margins": 8.961469650268555, + "rewards/rejected": -5.687234878540039, + "step": 18519 + }, + { + "epoch": 4.63, + "grad_norm": 4.61130428314209, + "learning_rate": 1.3200379906769035e-07, + "logits/chosen": -0.48271456360816956, + "logits/rejected": -0.6082595586776733, + "logps/chosen": -47.95764923095703, + "logps/rejected": -102.82227325439453, + "loss": 0.5773, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2615861892700195, + "rewards/margins": 8.02047061920166, + "rewards/rejected": -4.758883953094482, + "step": 18520 + }, + { + "epoch": 4.63, + "grad_norm": 4.055043697357178, + "learning_rate": 1.3182444641592074e-07, + "logits/chosen": -0.49664032459259033, + "logits/rejected": -0.6251377463340759, + "logps/chosen": -62.59385299682617, + "logps/rejected": -100.45474243164062, + "loss": 0.6027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.191704273223877, + "rewards/margins": 7.930776596069336, + "rewards/rejected": -4.739072322845459, + "step": 18521 + }, + { + "epoch": 4.63, + "grad_norm": 3.9637646675109863, + "learning_rate": 1.3164521406194674e-07, + "logits/chosen": -0.5468101501464844, + "logits/rejected": -0.6448597311973572, + "logps/chosen": -66.1588363647461, + "logps/rejected": -99.23629760742188, + "loss": 0.6234, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8996024131774902, + "rewards/margins": 6.979111671447754, + "rewards/rejected": -4.079509735107422, + "step": 18522 + }, + { + "epoch": 4.63, + "grad_norm": 2.558795690536499, + "learning_rate": 1.314661020101976e-07, + "logits/chosen": -0.4890098571777344, + "logits/rejected": -0.6149269342422485, + "logps/chosen": -67.23321533203125, + "logps/rejected": -107.52030944824219, + "loss": 0.5749, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1877033710479736, + "rewards/margins": 8.262842178344727, + "rewards/rejected": -5.075139045715332, + "step": 18523 + }, + { + "epoch": 4.63, + "grad_norm": 6.351691722869873, + "learning_rate": 1.3128711026509922e-07, + "logits/chosen": -0.5755590796470642, + "logits/rejected": -0.682583212852478, + "logps/chosen": -56.84367370605469, + "logps/rejected": -110.98751068115234, + "loss": 0.5498, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0740127563476562, + "rewards/margins": 8.078691482543945, + "rewards/rejected": -5.004678726196289, + "step": 18524 + }, + { + "epoch": 4.63, + "grad_norm": 2.393341302871704, + "learning_rate": 1.3110823883107583e-07, + "logits/chosen": -0.5693516135215759, + "logits/rejected": -0.6593145728111267, + "logps/chosen": -59.00431442260742, + "logps/rejected": -99.60166931152344, + "loss": 0.5953, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.955396890640259, + "rewards/margins": 7.106773376464844, + "rewards/rejected": -4.151376247406006, + "step": 18525 + }, + { + "epoch": 4.63, + "grad_norm": 3.076957941055298, + "learning_rate": 1.309294877125461e-07, + "logits/chosen": -0.5251920223236084, + "logits/rejected": -0.6125324964523315, + "logps/chosen": -55.230079650878906, + "logps/rejected": -117.1681900024414, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1165354251861572, + "rewards/margins": 7.8837199211120605, + "rewards/rejected": -4.767183780670166, + "step": 18526 + }, + { + "epoch": 4.63, + "grad_norm": 17.33225440979004, + "learning_rate": 1.307508569139271e-07, + "logits/chosen": -0.6060900688171387, + "logits/rejected": -0.6410959362983704, + "logps/chosen": -47.78034973144531, + "logps/rejected": -91.96324920654297, + "loss": 0.7382, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7258031368255615, + "rewards/margins": 6.112363338470459, + "rewards/rejected": -3.3865604400634766, + "step": 18527 + }, + { + "epoch": 4.63, + "grad_norm": 13.940807342529297, + "learning_rate": 1.3057234643963524e-07, + "logits/chosen": -0.5526554584503174, + "logits/rejected": -0.6020056009292603, + "logps/chosen": -57.497581481933594, + "logps/rejected": -128.74081420898438, + "loss": 0.6038, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8981943130493164, + "rewards/margins": 7.360745429992676, + "rewards/rejected": -4.462551116943359, + "step": 18528 + }, + { + "epoch": 4.64, + "grad_norm": 10.023402214050293, + "learning_rate": 1.303939562940787e-07, + "logits/chosen": -0.5466974973678589, + "logits/rejected": -0.6425463557243347, + "logps/chosen": -67.37965393066406, + "logps/rejected": -99.49667358398438, + "loss": 0.6279, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5065619945526123, + "rewards/margins": 7.13184118270874, + "rewards/rejected": -3.625279426574707, + "step": 18529 + }, + { + "epoch": 4.64, + "grad_norm": 10.087655067443848, + "learning_rate": 1.3021568648166672e-07, + "logits/chosen": -0.5885685682296753, + "logits/rejected": -0.6788090467453003, + "logps/chosen": -50.782196044921875, + "logps/rejected": -91.17284393310547, + "loss": 0.6585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.382707118988037, + "rewards/margins": 6.906227111816406, + "rewards/rejected": -3.52351975440979, + "step": 18530 + }, + { + "epoch": 4.64, + "grad_norm": 6.058001518249512, + "learning_rate": 1.3003753700680578e-07, + "logits/chosen": -0.5347652435302734, + "logits/rejected": -0.6086876392364502, + "logps/chosen": -60.70948791503906, + "logps/rejected": -105.36698150634766, + "loss": 0.722, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.748965263366699, + "rewards/margins": 7.0807037353515625, + "rewards/rejected": -4.331737995147705, + "step": 18531 + }, + { + "epoch": 4.64, + "grad_norm": 6.2968878746032715, + "learning_rate": 1.2985950787389735e-07, + "logits/chosen": -0.49880361557006836, + "logits/rejected": -0.6009721755981445, + "logps/chosen": -59.219512939453125, + "logps/rejected": -98.71259307861328, + "loss": 0.6611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0063326358795166, + "rewards/margins": 6.518346786499023, + "rewards/rejected": -3.512014389038086, + "step": 18532 + }, + { + "epoch": 4.64, + "grad_norm": 8.525494575500488, + "learning_rate": 1.2968159908733956e-07, + "logits/chosen": -0.49794724583625793, + "logits/rejected": -0.5891969203948975, + "logps/chosen": -53.258541107177734, + "logps/rejected": -97.94223022460938, + "loss": 0.6253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.200385808944702, + "rewards/margins": 4.989817142486572, + "rewards/rejected": -1.7894313335418701, + "step": 18533 + }, + { + "epoch": 4.64, + "grad_norm": 2.745424747467041, + "learning_rate": 1.2950381065153061e-07, + "logits/chosen": -0.5186476111412048, + "logits/rejected": -0.5859281420707703, + "logps/chosen": -56.926673889160156, + "logps/rejected": -129.28367614746094, + "loss": 0.5893, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9414522647857666, + "rewards/margins": 8.756889343261719, + "rewards/rejected": -5.815437316894531, + "step": 18534 + }, + { + "epoch": 4.64, + "grad_norm": 9.34061050415039, + "learning_rate": 1.293261425708625e-07, + "logits/chosen": -0.4838566482067108, + "logits/rejected": -0.5900804996490479, + "logps/chosen": -52.33793640136719, + "logps/rejected": -92.1679458618164, + "loss": 0.6127, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85750675201416, + "rewards/margins": 7.0749335289001465, + "rewards/rejected": -4.217426300048828, + "step": 18535 + }, + { + "epoch": 4.64, + "grad_norm": 5.20020866394043, + "learning_rate": 1.2914859484972675e-07, + "logits/chosen": -0.5101594924926758, + "logits/rejected": -0.5744799375534058, + "logps/chosen": -50.46290969848633, + "logps/rejected": -93.09796142578125, + "loss": 0.5927, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3250534534454346, + "rewards/margins": 6.354732513427734, + "rewards/rejected": -3.029679775238037, + "step": 18536 + }, + { + "epoch": 4.64, + "grad_norm": 21.752456665039062, + "learning_rate": 1.2897116749250982e-07, + "logits/chosen": -0.563581645488739, + "logits/rejected": -0.6370182037353516, + "logps/chosen": -57.963565826416016, + "logps/rejected": -117.78511047363281, + "loss": 0.6595, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9304006099700928, + "rewards/margins": 7.725004196166992, + "rewards/rejected": -4.794604301452637, + "step": 18537 + }, + { + "epoch": 4.64, + "grad_norm": 4.599127292633057, + "learning_rate": 1.28793860503596e-07, + "logits/chosen": -0.5409196019172668, + "logits/rejected": -0.611058235168457, + "logps/chosen": -50.68236541748047, + "logps/rejected": -113.24612426757812, + "loss": 0.6198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.274709463119507, + "rewards/margins": 7.57021427154541, + "rewards/rejected": -4.295505523681641, + "step": 18538 + }, + { + "epoch": 4.64, + "grad_norm": 6.444062232971191, + "learning_rate": 1.2861667388736786e-07, + "logits/chosen": -0.5574122667312622, + "logits/rejected": -0.6393387317657471, + "logps/chosen": -54.5660514831543, + "logps/rejected": -131.30084228515625, + "loss": 0.6407, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0601041316986084, + "rewards/margins": 9.143465042114258, + "rewards/rejected": -6.083361625671387, + "step": 18539 + }, + { + "epoch": 4.64, + "grad_norm": 3.9708316326141357, + "learning_rate": 1.2843960764820195e-07, + "logits/chosen": -0.5705597996711731, + "logits/rejected": -0.6453698873519897, + "logps/chosen": -53.34368896484375, + "logps/rejected": -118.95349884033203, + "loss": 0.5693, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.314236640930176, + "rewards/margins": 8.490168571472168, + "rewards/rejected": -5.17593240737915, + "step": 18540 + }, + { + "epoch": 4.64, + "grad_norm": 9.070464134216309, + "learning_rate": 1.2826266179047643e-07, + "logits/chosen": -0.5222369432449341, + "logits/rejected": -0.5956657528877258, + "logps/chosen": -56.72822189331055, + "logps/rejected": -100.42772674560547, + "loss": 0.6107, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.85923433303833, + "rewards/margins": 7.1899542808532715, + "rewards/rejected": -4.3307204246521, + "step": 18541 + }, + { + "epoch": 4.64, + "grad_norm": 5.186763286590576, + "learning_rate": 1.2808583631856108e-07, + "logits/chosen": -0.48993349075317383, + "logits/rejected": -0.6184664964675903, + "logps/chosen": -58.18276596069336, + "logps/rejected": -83.56378936767578, + "loss": 0.5465, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0624194145202637, + "rewards/margins": 6.546227931976318, + "rewards/rejected": -3.4838080406188965, + "step": 18542 + }, + { + "epoch": 4.64, + "grad_norm": 5.017457008361816, + "learning_rate": 1.2790913123682746e-07, + "logits/chosen": -0.6050564050674438, + "logits/rejected": -0.6494682431221008, + "logps/chosen": -47.50685501098633, + "logps/rejected": -99.69453430175781, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9469010829925537, + "rewards/margins": 6.320781707763672, + "rewards/rejected": -3.37388014793396, + "step": 18543 + }, + { + "epoch": 4.64, + "grad_norm": 5.139683246612549, + "learning_rate": 1.2773254654964152e-07, + "logits/chosen": -0.4926210641860962, + "logits/rejected": -0.5948286652565002, + "logps/chosen": -58.35432434082031, + "logps/rejected": -107.20398712158203, + "loss": 0.6169, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.197756290435791, + "rewards/margins": 7.687228202819824, + "rewards/rejected": -4.489470958709717, + "step": 18544 + }, + { + "epoch": 4.64, + "grad_norm": 5.256248950958252, + "learning_rate": 1.2755608226136585e-07, + "logits/chosen": -0.5903943777084351, + "logits/rejected": -0.6469967365264893, + "logps/chosen": -57.67755126953125, + "logps/rejected": -118.71356964111328, + "loss": 0.7321, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9287712574005127, + "rewards/margins": 7.40767765045166, + "rewards/rejected": -4.478907108306885, + "step": 18545 + }, + { + "epoch": 4.64, + "grad_norm": 4.277639389038086, + "learning_rate": 1.2737973837636252e-07, + "logits/chosen": -0.5545838475227356, + "logits/rejected": -0.6604095101356506, + "logps/chosen": -56.33564376831055, + "logps/rejected": -106.097412109375, + "loss": 0.5776, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1910054683685303, + "rewards/margins": 7.561277866363525, + "rewards/rejected": -4.370271682739258, + "step": 18546 + }, + { + "epoch": 4.64, + "grad_norm": 3.2440054416656494, + "learning_rate": 1.272035148989892e-07, + "logits/chosen": -0.6242894530296326, + "logits/rejected": -0.673181414604187, + "logps/chosen": -52.78379440307617, + "logps/rejected": -136.0745086669922, + "loss": 0.6345, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.231020212173462, + "rewards/margins": 9.158171653747559, + "rewards/rejected": -5.927150726318359, + "step": 18547 + }, + { + "epoch": 4.64, + "grad_norm": 3.2867648601531982, + "learning_rate": 1.27027411833599e-07, + "logits/chosen": -0.6167525053024292, + "logits/rejected": -0.6605972647666931, + "logps/chosen": -61.79816818237305, + "logps/rejected": -111.42511749267578, + "loss": 0.6932, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8192646503448486, + "rewards/margins": 7.342791557312012, + "rewards/rejected": -4.523527145385742, + "step": 18548 + }, + { + "epoch": 4.64, + "grad_norm": 4.6724371910095215, + "learning_rate": 1.2685142918454518e-07, + "logits/chosen": -0.5935071110725403, + "logits/rejected": -0.6881523132324219, + "logps/chosen": -61.35408401489258, + "logps/rejected": -136.83839416503906, + "loss": 0.6642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.027698516845703, + "rewards/margins": 8.150264739990234, + "rewards/rejected": -5.122566223144531, + "step": 18549 + }, + { + "epoch": 4.64, + "grad_norm": 3.163822650909424, + "learning_rate": 1.2667556695617534e-07, + "logits/chosen": -0.537247896194458, + "logits/rejected": -0.5910030007362366, + "logps/chosen": -50.01631164550781, + "logps/rejected": -117.66815185546875, + "loss": 0.5523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.260575771331787, + "rewards/margins": 7.413634777069092, + "rewards/rejected": -4.1530585289001465, + "step": 18550 + }, + { + "epoch": 4.64, + "grad_norm": 2.1426899433135986, + "learning_rate": 1.2649982515283598e-07, + "logits/chosen": -0.5371338725090027, + "logits/rejected": -0.6324896812438965, + "logps/chosen": -60.91943359375, + "logps/rejected": -111.46822357177734, + "loss": 0.5481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.128861904144287, + "rewards/margins": 7.652793884277344, + "rewards/rejected": -4.523931503295898, + "step": 18551 + }, + { + "epoch": 4.64, + "grad_norm": 4.565175533294678, + "learning_rate": 1.2632420377886922e-07, + "logits/chosen": -0.5529280304908752, + "logits/rejected": -0.6539974808692932, + "logps/chosen": -75.49092102050781, + "logps/rejected": -100.62873840332031, + "loss": 0.6908, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3165602684020996, + "rewards/margins": 8.295175552368164, + "rewards/rejected": -4.978615760803223, + "step": 18552 + }, + { + "epoch": 4.64, + "grad_norm": 3.9069831371307373, + "learning_rate": 1.2614870283861657e-07, + "logits/chosen": -0.577033281326294, + "logits/rejected": -0.6474621295928955, + "logps/chosen": -56.44290542602539, + "logps/rejected": -110.21997833251953, + "loss": 0.6427, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1485085487365723, + "rewards/margins": 7.417391777038574, + "rewards/rejected": -4.268882751464844, + "step": 18553 + }, + { + "epoch": 4.64, + "grad_norm": 10.189823150634766, + "learning_rate": 1.2597332233641236e-07, + "logits/chosen": -0.49989649653434753, + "logits/rejected": -0.5807143449783325, + "logps/chosen": -61.62653350830078, + "logps/rejected": -130.93606567382812, + "loss": 0.5933, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1406469345092773, + "rewards/margins": 7.727573871612549, + "rewards/rejected": -4.5869269371032715, + "step": 18554 + }, + { + "epoch": 4.64, + "grad_norm": 4.1933722496032715, + "learning_rate": 1.2579806227659142e-07, + "logits/chosen": -0.4971306622028351, + "logits/rejected": -0.5651565790176392, + "logps/chosen": -61.754608154296875, + "logps/rejected": -119.4910888671875, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.441636800765991, + "rewards/margins": 7.09281063079834, + "rewards/rejected": -3.6511740684509277, + "step": 18555 + }, + { + "epoch": 4.64, + "grad_norm": 6.11046838760376, + "learning_rate": 1.256229226634853e-07, + "logits/chosen": -0.588001012802124, + "logits/rejected": -0.6296457648277283, + "logps/chosen": -56.95999526977539, + "logps/rejected": -134.26992797851562, + "loss": 0.6534, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.230386257171631, + "rewards/margins": 7.964489936828613, + "rewards/rejected": -4.734102725982666, + "step": 18556 + }, + { + "epoch": 4.64, + "grad_norm": 3.1324448585510254, + "learning_rate": 1.2544790350142166e-07, + "logits/chosen": -0.5819024443626404, + "logits/rejected": -0.6279310584068298, + "logps/chosen": -53.25382995605469, + "logps/rejected": -108.81123352050781, + "loss": 0.5423, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.047110080718994, + "rewards/margins": 7.073322296142578, + "rewards/rejected": -4.026212692260742, + "step": 18557 + }, + { + "epoch": 4.64, + "grad_norm": 6.667198657989502, + "learning_rate": 1.2527300479472483e-07, + "logits/chosen": -0.5488170385360718, + "logits/rejected": -0.5542048215866089, + "logps/chosen": -48.284584045410156, + "logps/rejected": -120.99791717529297, + "loss": 0.6163, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4317593574523926, + "rewards/margins": 6.544386863708496, + "rewards/rejected": -3.1126279830932617, + "step": 18558 + }, + { + "epoch": 4.64, + "grad_norm": 3.6857759952545166, + "learning_rate": 1.2509822654771686e-07, + "logits/chosen": -0.4927208423614502, + "logits/rejected": -0.6022889614105225, + "logps/chosen": -61.77808380126953, + "logps/rejected": -112.3050765991211, + "loss": 0.6346, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9359467029571533, + "rewards/margins": 7.722472190856934, + "rewards/rejected": -4.786526679992676, + "step": 18559 + }, + { + "epoch": 4.64, + "grad_norm": 21.88299560546875, + "learning_rate": 1.2492356876471713e-07, + "logits/chosen": -0.5279232859611511, + "logits/rejected": -0.6044092178344727, + "logps/chosen": -51.419166564941406, + "logps/rejected": -99.17169952392578, + "loss": 0.5741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.204195737838745, + "rewards/margins": 6.680583953857422, + "rewards/rejected": -3.4763879776000977, + "step": 18560 + }, + { + "epoch": 4.64, + "grad_norm": 24.04201316833496, + "learning_rate": 1.2474903145004103e-07, + "logits/chosen": -0.5813918113708496, + "logits/rejected": -0.6763955354690552, + "logps/chosen": -60.0233039855957, + "logps/rejected": -111.72865295410156, + "loss": 0.6474, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9499142169952393, + "rewards/margins": 6.9657392501831055, + "rewards/rejected": -4.015825271606445, + "step": 18561 + }, + { + "epoch": 4.64, + "grad_norm": 4.627476692199707, + "learning_rate": 1.2457461460800236e-07, + "logits/chosen": -0.4873804450035095, + "logits/rejected": -0.5743544101715088, + "logps/chosen": -50.250083923339844, + "logps/rejected": -98.66262817382812, + "loss": 0.6167, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.12752103805542, + "rewards/margins": 7.513423919677734, + "rewards/rejected": -4.385902404785156, + "step": 18562 + }, + { + "epoch": 4.64, + "grad_norm": 17.50528907775879, + "learning_rate": 1.2440031824291043e-07, + "logits/chosen": -0.6206358671188354, + "logits/rejected": -0.6822233200073242, + "logps/chosen": -50.595123291015625, + "logps/rejected": -117.64938354492188, + "loss": 0.6199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1021745204925537, + "rewards/margins": 6.894698619842529, + "rewards/rejected": -3.7925243377685547, + "step": 18563 + }, + { + "epoch": 4.64, + "grad_norm": 5.175752639770508, + "learning_rate": 1.2422614235907294e-07, + "logits/chosen": -0.5287269353866577, + "logits/rejected": -0.6057038903236389, + "logps/chosen": -61.60118103027344, + "logps/rejected": -88.14617156982422, + "loss": 0.639, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3055355548858643, + "rewards/margins": 6.812188148498535, + "rewards/rejected": -3.506652355194092, + "step": 18564 + }, + { + "epoch": 4.64, + "grad_norm": 4.078466415405273, + "learning_rate": 1.2405208696079363e-07, + "logits/chosen": -0.6308629512786865, + "logits/rejected": -0.6684110164642334, + "logps/chosen": -77.33821868896484, + "logps/rejected": -92.2064437866211, + "loss": 0.5873, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7574713230133057, + "rewards/margins": 6.767533302307129, + "rewards/rejected": -4.010061264038086, + "step": 18565 + }, + { + "epoch": 4.64, + "grad_norm": 15.52825927734375, + "learning_rate": 1.238781520523724e-07, + "logits/chosen": -0.5571064949035645, + "logits/rejected": -0.6499450206756592, + "logps/chosen": -54.215145111083984, + "logps/rejected": -112.2784423828125, + "loss": 0.6256, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0688889026641846, + "rewards/margins": 8.016379356384277, + "rewards/rejected": -4.947490692138672, + "step": 18566 + }, + { + "epoch": 4.64, + "grad_norm": 2.4217121601104736, + "learning_rate": 1.2370433763810919e-07, + "logits/chosen": -0.45267409086227417, + "logits/rejected": -0.5708336234092712, + "logps/chosen": -49.00113296508789, + "logps/rejected": -91.45559692382812, + "loss": 0.5148, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1809113025665283, + "rewards/margins": 7.330872535705566, + "rewards/rejected": -4.149960517883301, + "step": 18567 + }, + { + "epoch": 4.64, + "grad_norm": 5.965843677520752, + "learning_rate": 1.235306437222994e-07, + "logits/chosen": -0.5051369071006775, + "logits/rejected": -0.6139413118362427, + "logps/chosen": -54.729461669921875, + "logps/rejected": -102.36448669433594, + "loss": 0.5694, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3511149883270264, + "rewards/margins": 7.615891456604004, + "rewards/rejected": -4.264776229858398, + "step": 18568 + }, + { + "epoch": 4.65, + "grad_norm": 8.362785339355469, + "learning_rate": 1.2335707030923294e-07, + "logits/chosen": -0.547795295715332, + "logits/rejected": -0.5993317365646362, + "logps/chosen": -48.61135482788086, + "logps/rejected": -109.91507720947266, + "loss": 0.643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.238706588745117, + "rewards/margins": 6.598962306976318, + "rewards/rejected": -3.360255718231201, + "step": 18569 + }, + { + "epoch": 4.65, + "grad_norm": 4.6251912117004395, + "learning_rate": 1.231836174032003e-07, + "logits/chosen": -0.5271275043487549, + "logits/rejected": -0.6470111012458801, + "logps/chosen": -70.5254135131836, + "logps/rejected": -101.38916015625, + "loss": 0.7291, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8000714778900146, + "rewards/margins": 6.795321941375732, + "rewards/rejected": -3.995250701904297, + "step": 18570 + }, + { + "epoch": 4.65, + "grad_norm": 4.322770595550537, + "learning_rate": 1.230102850084891e-07, + "logits/chosen": -0.5737941861152649, + "logits/rejected": -0.6228306293487549, + "logps/chosen": -50.37204360961914, + "logps/rejected": -106.77071380615234, + "loss": 0.5881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.949852705001831, + "rewards/margins": 6.25019645690918, + "rewards/rejected": -3.3003439903259277, + "step": 18571 + }, + { + "epoch": 4.65, + "grad_norm": 6.917540550231934, + "learning_rate": 1.2283707312937987e-07, + "logits/chosen": -0.4883686304092407, + "logits/rejected": -0.5420866012573242, + "logps/chosen": -53.31407165527344, + "logps/rejected": -110.94392395019531, + "loss": 0.6429, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.190195322036743, + "rewards/margins": 6.338513374328613, + "rewards/rejected": -3.148318290710449, + "step": 18572 + }, + { + "epoch": 4.65, + "grad_norm": 9.399404525756836, + "learning_rate": 1.2266398177015415e-07, + "logits/chosen": -0.5078333020210266, + "logits/rejected": -0.5757129192352295, + "logps/chosen": -52.13783645629883, + "logps/rejected": -113.851318359375, + "loss": 0.6366, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.936002016067505, + "rewards/margins": 7.864968299865723, + "rewards/rejected": -4.928966045379639, + "step": 18573 + }, + { + "epoch": 4.65, + "grad_norm": 3.699406385421753, + "learning_rate": 1.2249101093508964e-07, + "logits/chosen": -0.5559483170509338, + "logits/rejected": -0.6081935167312622, + "logps/chosen": -47.515403747558594, + "logps/rejected": -107.3941650390625, + "loss": 0.5775, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120558261871338, + "rewards/margins": 6.703170299530029, + "rewards/rejected": -3.5826127529144287, + "step": 18574 + }, + { + "epoch": 4.65, + "grad_norm": 5.812413215637207, + "learning_rate": 1.223181606284607e-07, + "logits/chosen": -0.4953199326992035, + "logits/rejected": -0.5742245316505432, + "logps/chosen": -55.06211853027344, + "logps/rejected": -94.25617218017578, + "loss": 0.6277, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0821545124053955, + "rewards/margins": 6.484665393829346, + "rewards/rejected": -3.402510404586792, + "step": 18575 + }, + { + "epoch": 4.65, + "grad_norm": 5.0816521644592285, + "learning_rate": 1.2214543085453722e-07, + "logits/chosen": -0.5499109625816345, + "logits/rejected": -0.628061056137085, + "logps/chosen": -55.662906646728516, + "logps/rejected": -94.2939682006836, + "loss": 0.691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3122732639312744, + "rewards/margins": 6.755740165710449, + "rewards/rejected": -3.443467140197754, + "step": 18576 + }, + { + "epoch": 4.65, + "grad_norm": 2.5407676696777344, + "learning_rate": 1.219728216175886e-07, + "logits/chosen": -0.5579304695129395, + "logits/rejected": -0.6569085121154785, + "logps/chosen": -67.00973510742188, + "logps/rejected": -129.8373565673828, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.957707166671753, + "rewards/margins": 7.780771255493164, + "rewards/rejected": -4.823064804077148, + "step": 18577 + }, + { + "epoch": 4.65, + "grad_norm": 3.2109177112579346, + "learning_rate": 1.2180033292188086e-07, + "logits/chosen": -0.48034799098968506, + "logits/rejected": -0.5911216735839844, + "logps/chosen": -64.09420013427734, + "logps/rejected": -97.34883117675781, + "loss": 0.5892, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9066920280456543, + "rewards/margins": 7.067054271697998, + "rewards/rejected": -4.160362243652344, + "step": 18578 + }, + { + "epoch": 4.65, + "grad_norm": 4.358616828918457, + "learning_rate": 1.216279647716745e-07, + "logits/chosen": -0.5675506591796875, + "logits/rejected": -0.6451062560081482, + "logps/chosen": -50.14830780029297, + "logps/rejected": -104.50967407226562, + "loss": 0.5998, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.359938383102417, + "rewards/margins": 7.442312717437744, + "rewards/rejected": -4.0823750495910645, + "step": 18579 + }, + { + "epoch": 4.65, + "grad_norm": 3.994753360748291, + "learning_rate": 1.214557171712305e-07, + "logits/chosen": -0.5699069499969482, + "logits/rejected": -0.6337732076644897, + "logps/chosen": -47.463104248046875, + "logps/rejected": -115.47990417480469, + "loss": 0.5308, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.116922378540039, + "rewards/margins": 8.397926330566406, + "rewards/rejected": -5.281004428863525, + "step": 18580 + }, + { + "epoch": 4.65, + "grad_norm": 4.132551193237305, + "learning_rate": 1.2128359012480439e-07, + "logits/chosen": -0.6631007194519043, + "logits/rejected": -0.7360449433326721, + "logps/chosen": -42.18507766723633, + "logps/rejected": -106.3648910522461, + "loss": 0.5389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.264822483062744, + "rewards/margins": 8.009819030761719, + "rewards/rejected": -4.744996070861816, + "step": 18581 + }, + { + "epoch": 4.65, + "grad_norm": 6.649653434753418, + "learning_rate": 1.2111158363665053e-07, + "logits/chosen": -0.511060893535614, + "logits/rejected": -0.6076934337615967, + "logps/chosen": -57.23529815673828, + "logps/rejected": -105.00532531738281, + "loss": 0.6088, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9614830017089844, + "rewards/margins": 7.065759658813477, + "rewards/rejected": -4.104276657104492, + "step": 18582 + }, + { + "epoch": 4.65, + "grad_norm": 5.50379753112793, + "learning_rate": 1.2093969771101887e-07, + "logits/chosen": -0.6221883296966553, + "logits/rejected": -0.6881248950958252, + "logps/chosen": -46.0946159362793, + "logps/rejected": -96.46417999267578, + "loss": 0.6003, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0205845832824707, + "rewards/margins": 6.676499843597412, + "rewards/rejected": -3.6559154987335205, + "step": 18583 + }, + { + "epoch": 4.65, + "grad_norm": 9.645502090454102, + "learning_rate": 1.2076793235215655e-07, + "logits/chosen": -0.5491880774497986, + "logits/rejected": -0.5866982936859131, + "logps/chosen": -54.14219284057617, + "logps/rejected": -82.97508239746094, + "loss": 0.6015, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.910728693008423, + "rewards/margins": 4.692465782165527, + "rewards/rejected": -1.7817373275756836, + "step": 18584 + }, + { + "epoch": 4.65, + "grad_norm": 3.5689897537231445, + "learning_rate": 1.2059628756430797e-07, + "logits/chosen": -0.5717193484306335, + "logits/rejected": -0.670890212059021, + "logps/chosen": -49.578819274902344, + "logps/rejected": -91.03955841064453, + "loss": 0.585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.352341413497925, + "rewards/margins": 6.9069366455078125, + "rewards/rejected": -3.5545945167541504, + "step": 18585 + }, + { + "epoch": 4.65, + "grad_norm": 6.8049821853637695, + "learning_rate": 1.2042476335171638e-07, + "logits/chosen": -0.5303916931152344, + "logits/rejected": -0.5940319895744324, + "logps/chosen": -52.20954895019531, + "logps/rejected": -120.99212646484375, + "loss": 0.6516, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.06646990776062, + "rewards/margins": 7.009442329406738, + "rewards/rejected": -3.94297194480896, + "step": 18586 + }, + { + "epoch": 4.65, + "grad_norm": 8.033833503723145, + "learning_rate": 1.202533597186184e-07, + "logits/chosen": -0.582576334476471, + "logits/rejected": -0.7032949924468994, + "logps/chosen": -51.43590545654297, + "logps/rejected": -102.66758728027344, + "loss": 0.6415, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.115011692047119, + "rewards/margins": 7.380747318267822, + "rewards/rejected": -4.265735626220703, + "step": 18587 + }, + { + "epoch": 4.65, + "grad_norm": 10.712526321411133, + "learning_rate": 1.2008207666924953e-07, + "logits/chosen": -0.5808890461921692, + "logits/rejected": -0.692499577999115, + "logps/chosen": -51.647438049316406, + "logps/rejected": -103.17768096923828, + "loss": 0.6702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.071760654449463, + "rewards/margins": 7.423620223999023, + "rewards/rejected": -4.3518595695495605, + "step": 18588 + }, + { + "epoch": 4.65, + "grad_norm": 2.210440158843994, + "learning_rate": 1.1991091420784418e-07, + "logits/chosen": -0.5480204820632935, + "logits/rejected": -0.6756194829940796, + "logps/chosen": -59.30103302001953, + "logps/rejected": -106.4770736694336, + "loss": 0.6121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3453781604766846, + "rewards/margins": 8.351877212524414, + "rewards/rejected": -5.006497859954834, + "step": 18589 + }, + { + "epoch": 4.65, + "grad_norm": 9.09958267211914, + "learning_rate": 1.1973987233863061e-07, + "logits/chosen": -0.5705516338348389, + "logits/rejected": -0.6234253644943237, + "logps/chosen": -44.56523895263672, + "logps/rejected": -115.59471893310547, + "loss": 0.5647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0827765464782715, + "rewards/margins": 7.943661689758301, + "rewards/rejected": -4.8608856201171875, + "step": 18590 + }, + { + "epoch": 4.65, + "grad_norm": 4.281425476074219, + "learning_rate": 1.1956895106583488e-07, + "logits/chosen": -0.5237729549407959, + "logits/rejected": -0.5665479302406311, + "logps/chosen": -51.47117233276367, + "logps/rejected": -112.41276550292969, + "loss": 0.6137, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.290290594100952, + "rewards/margins": 6.662782192230225, + "rewards/rejected": -3.3724918365478516, + "step": 18591 + }, + { + "epoch": 4.65, + "grad_norm": 3.613246202468872, + "learning_rate": 1.1939815039368197e-07, + "logits/chosen": -0.6314186453819275, + "logits/rejected": -0.6526342630386353, + "logps/chosen": -49.31289291381836, + "logps/rejected": -99.08969116210938, + "loss": 0.6001, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.272660732269287, + "rewards/margins": 6.592198371887207, + "rewards/rejected": -3.3195371627807617, + "step": 18592 + }, + { + "epoch": 4.65, + "grad_norm": 3.8841476440429688, + "learning_rate": 1.1922747032639236e-07, + "logits/chosen": -0.5349804759025574, + "logits/rejected": -0.6415682435035706, + "logps/chosen": -59.10531234741211, + "logps/rejected": -101.32765197753906, + "loss": 0.5575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023653030395508, + "rewards/margins": 7.712297439575195, + "rewards/rejected": -4.6886444091796875, + "step": 18593 + }, + { + "epoch": 4.65, + "grad_norm": 26.224044799804688, + "learning_rate": 1.1905691086818272e-07, + "logits/chosen": -0.5062695741653442, + "logits/rejected": -0.5998371839523315, + "logps/chosen": -59.272857666015625, + "logps/rejected": -116.89665985107422, + "loss": 0.622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1011626720428467, + "rewards/margins": 7.357803821563721, + "rewards/rejected": -4.256641387939453, + "step": 18594 + }, + { + "epoch": 4.65, + "grad_norm": 6.06820821762085, + "learning_rate": 1.1888647202326853e-07, + "logits/chosen": -0.541179895401001, + "logits/rejected": -0.5723872780799866, + "logps/chosen": -49.25705337524414, + "logps/rejected": -109.07142639160156, + "loss": 0.6046, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.016441822052002, + "rewards/margins": 6.716720104217529, + "rewards/rejected": -3.700277805328369, + "step": 18595 + }, + { + "epoch": 4.65, + "grad_norm": 2.42607045173645, + "learning_rate": 1.18716153795862e-07, + "logits/chosen": -0.6267837285995483, + "logits/rejected": -0.7183206081390381, + "logps/chosen": -47.899024963378906, + "logps/rejected": -130.58984375, + "loss": 0.5224, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1115851402282715, + "rewards/margins": 9.87391471862793, + "rewards/rejected": -6.7623291015625, + "step": 18596 + }, + { + "epoch": 4.65, + "grad_norm": 5.0969390869140625, + "learning_rate": 1.1854595619017084e-07, + "logits/chosen": -0.5173963308334351, + "logits/rejected": -0.6535153985023499, + "logps/chosen": -57.22233963012695, + "logps/rejected": -90.11808013916016, + "loss": 0.5747, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.907621145248413, + "rewards/margins": 7.031442642211914, + "rewards/rejected": -4.123822212219238, + "step": 18597 + }, + { + "epoch": 4.65, + "grad_norm": 4.098942756652832, + "learning_rate": 1.1837587921040061e-07, + "logits/chosen": -0.5501221418380737, + "logits/rejected": -0.6531246900558472, + "logps/chosen": -65.9127197265625, + "logps/rejected": -96.44588470458984, + "loss": 0.6682, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.859771251678467, + "rewards/margins": 6.702784061431885, + "rewards/rejected": -3.843013048171997, + "step": 18598 + }, + { + "epoch": 4.65, + "grad_norm": 6.557561874389648, + "learning_rate": 1.1820592286075572e-07, + "logits/chosen": -0.5416213274002075, + "logits/rejected": -0.6694775819778442, + "logps/chosen": -56.14237976074219, + "logps/rejected": -107.67493438720703, + "loss": 0.7309, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.119940757751465, + "rewards/margins": 8.160835266113281, + "rewards/rejected": -5.040894985198975, + "step": 18599 + }, + { + "epoch": 4.65, + "grad_norm": 4.272003173828125, + "learning_rate": 1.1803608714543447e-07, + "logits/chosen": -0.5814194083213806, + "logits/rejected": -0.6583218574523926, + "logps/chosen": -50.9334602355957, + "logps/rejected": -105.70449829101562, + "loss": 0.6257, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.44242787361145, + "rewards/margins": 7.616312026977539, + "rewards/rejected": -4.173884391784668, + "step": 18600 + }, + { + "epoch": 4.65, + "grad_norm": 4.953179836273193, + "learning_rate": 1.1786637206863405e-07, + "logits/chosen": -0.5319582223892212, + "logits/rejected": -0.6057754755020142, + "logps/chosen": -52.139381408691406, + "logps/rejected": -107.3022689819336, + "loss": 0.6086, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4719371795654297, + "rewards/margins": 7.417599201202393, + "rewards/rejected": -3.945662021636963, + "step": 18601 + }, + { + "epoch": 4.65, + "grad_norm": 3.108839750289917, + "learning_rate": 1.1769677763454945e-07, + "logits/chosen": -0.5200170874595642, + "logits/rejected": -0.6734691858291626, + "logps/chosen": -61.861244201660156, + "logps/rejected": -92.62261199951172, + "loss": 0.5965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.325065851211548, + "rewards/margins": 7.102983474731445, + "rewards/rejected": -3.7779176235198975, + "step": 18602 + }, + { + "epoch": 4.65, + "grad_norm": 5.275631427764893, + "learning_rate": 1.1752730384736954e-07, + "logits/chosen": -0.5563463568687439, + "logits/rejected": -0.6463338136672974, + "logps/chosen": -63.47303771972656, + "logps/rejected": -112.86396789550781, + "loss": 0.6309, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0005741119384766, + "rewards/margins": 7.73452091217041, + "rewards/rejected": -4.733946323394775, + "step": 18603 + }, + { + "epoch": 4.65, + "grad_norm": 4.380381107330322, + "learning_rate": 1.173579507112832e-07, + "logits/chosen": -0.479282408952713, + "logits/rejected": -0.5588421821594238, + "logps/chosen": -58.502113342285156, + "logps/rejected": -111.60977172851562, + "loss": 0.5741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8579604625701904, + "rewards/margins": 6.708970069885254, + "rewards/rejected": -3.8510091304779053, + "step": 18604 + }, + { + "epoch": 4.65, + "grad_norm": 44.82362747192383, + "learning_rate": 1.1718871823047595e-07, + "logits/chosen": -0.4892975687980652, + "logits/rejected": -0.6060199737548828, + "logps/chosen": -53.19374465942383, + "logps/rejected": -94.87174987792969, + "loss": 0.59, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9451534748077393, + "rewards/margins": 7.1555562019348145, + "rewards/rejected": -4.210402965545654, + "step": 18605 + }, + { + "epoch": 4.65, + "grad_norm": 32.91649627685547, + "learning_rate": 1.1701960640912835e-07, + "logits/chosen": -0.5435529351234436, + "logits/rejected": -0.6234495639801025, + "logps/chosen": -45.35182571411133, + "logps/rejected": -101.0331802368164, + "loss": 0.5911, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0750203132629395, + "rewards/margins": 6.826357841491699, + "rewards/rejected": -3.7513372898101807, + "step": 18606 + }, + { + "epoch": 4.65, + "grad_norm": 4.312323093414307, + "learning_rate": 1.1685061525142038e-07, + "logits/chosen": -0.575222909450531, + "logits/rejected": -0.638835072517395, + "logps/chosen": -54.09491729736328, + "logps/rejected": -112.82548522949219, + "loss": 0.6213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.959259510040283, + "rewards/margins": 7.5479817390441895, + "rewards/rejected": -4.588722229003906, + "step": 18607 + }, + { + "epoch": 4.65, + "grad_norm": 2.3152172565460205, + "learning_rate": 1.1668174476152816e-07, + "logits/chosen": -0.6040833592414856, + "logits/rejected": -0.6578260064125061, + "logps/chosen": -58.93926239013672, + "logps/rejected": -125.61184692382812, + "loss": 0.6147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4622018337249756, + "rewards/margins": 7.40803861618042, + "rewards/rejected": -3.9458367824554443, + "step": 18608 + }, + { + "epoch": 4.66, + "grad_norm": 3.0991530418395996, + "learning_rate": 1.1651299494362334e-07, + "logits/chosen": -0.5649330615997314, + "logits/rejected": -0.6553503274917603, + "logps/chosen": -49.54437255859375, + "logps/rejected": -93.27207946777344, + "loss": 0.5519, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.418412923812866, + "rewards/margins": 7.159484386444092, + "rewards/rejected": -3.7410712242126465, + "step": 18609 + }, + { + "epoch": 4.66, + "grad_norm": 7.37844181060791, + "learning_rate": 1.1634436580187703e-07, + "logits/chosen": -0.5517359972000122, + "logits/rejected": -0.6280476450920105, + "logps/chosen": -57.82280731201172, + "logps/rejected": -105.63150024414062, + "loss": 0.6758, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5536117553710938, + "rewards/margins": 6.653363227844238, + "rewards/rejected": -4.099751949310303, + "step": 18610 + }, + { + "epoch": 4.66, + "grad_norm": 4.941125392913818, + "learning_rate": 1.1617585734045644e-07, + "logits/chosen": -0.6079755425453186, + "logits/rejected": -0.6850165128707886, + "logps/chosen": -47.66859436035156, + "logps/rejected": -119.42523193359375, + "loss": 0.6197, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.192399740219116, + "rewards/margins": 8.822493553161621, + "rewards/rejected": -5.630094051361084, + "step": 18611 + }, + { + "epoch": 4.66, + "grad_norm": 8.99788761138916, + "learning_rate": 1.1600746956352383e-07, + "logits/chosen": -0.567994236946106, + "logits/rejected": -0.6589258909225464, + "logps/chosen": -43.60955810546875, + "logps/rejected": -110.8382568359375, + "loss": 0.5781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0496745109558105, + "rewards/margins": 7.319012641906738, + "rewards/rejected": -4.269338130950928, + "step": 18612 + }, + { + "epoch": 4.66, + "grad_norm": 2.133021354675293, + "learning_rate": 1.1583920247524194e-07, + "logits/chosen": -0.5711238980293274, + "logits/rejected": -0.6755603551864624, + "logps/chosen": -48.070045471191406, + "logps/rejected": -93.59358215332031, + "loss": 0.5498, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.387909412384033, + "rewards/margins": 7.256251335144043, + "rewards/rejected": -3.8683409690856934, + "step": 18613 + }, + { + "epoch": 4.66, + "grad_norm": 3.5077784061431885, + "learning_rate": 1.156710560797697e-07, + "logits/chosen": -0.5735924243927002, + "logits/rejected": -0.6226662993431091, + "logps/chosen": -48.30693435668945, + "logps/rejected": -107.12637329101562, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.939868450164795, + "rewards/margins": 6.9561662673950195, + "rewards/rejected": -4.016298294067383, + "step": 18614 + }, + { + "epoch": 4.66, + "grad_norm": 4.281184673309326, + "learning_rate": 1.1550303038125931e-07, + "logits/chosen": -0.4850589334964752, + "logits/rejected": -0.55511474609375, + "logps/chosen": -62.1204719543457, + "logps/rejected": -109.95226287841797, + "loss": 0.6022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9847559928894043, + "rewards/margins": 6.68874454498291, + "rewards/rejected": -3.703989028930664, + "step": 18615 + }, + { + "epoch": 4.66, + "grad_norm": 6.055567741394043, + "learning_rate": 1.1533512538386471e-07, + "logits/chosen": -0.5590516924858093, + "logits/rejected": -0.6345934867858887, + "logps/chosen": -58.366065979003906, + "logps/rejected": -119.61262512207031, + "loss": 0.5889, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1364030838012695, + "rewards/margins": 7.669544696807861, + "rewards/rejected": -4.533141613006592, + "step": 18616 + }, + { + "epoch": 4.66, + "grad_norm": 10.977899551391602, + "learning_rate": 1.1516734109173533e-07, + "logits/chosen": -0.4736740291118622, + "logits/rejected": -0.5675575733184814, + "logps/chosen": -57.989654541015625, + "logps/rejected": -96.20645141601562, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.944779872894287, + "rewards/margins": 7.390364646911621, + "rewards/rejected": -4.445584774017334, + "step": 18617 + }, + { + "epoch": 4.66, + "grad_norm": 2.607877731323242, + "learning_rate": 1.149996775090162e-07, + "logits/chosen": -0.6195105910301208, + "logits/rejected": -0.6976642608642578, + "logps/chosen": -42.76139831542969, + "logps/rejected": -116.7155532836914, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1047754287719727, + "rewards/margins": 8.714151382446289, + "rewards/rejected": -5.609375953674316, + "step": 18618 + }, + { + "epoch": 4.66, + "grad_norm": 23.72441291809082, + "learning_rate": 1.1483213463985122e-07, + "logits/chosen": -0.659656822681427, + "logits/rejected": -0.740066409111023, + "logps/chosen": -51.31065368652344, + "logps/rejected": -125.39480590820312, + "loss": 0.6541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9389853477478027, + "rewards/margins": 8.30543327331543, + "rewards/rejected": -5.366447925567627, + "step": 18619 + }, + { + "epoch": 4.66, + "grad_norm": 7.143115520477295, + "learning_rate": 1.1466471248837985e-07, + "logits/chosen": -0.5692542791366577, + "logits/rejected": -0.6499737501144409, + "logps/chosen": -58.73426055908203, + "logps/rejected": -117.68108367919922, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6805503368377686, + "rewards/margins": 7.532360553741455, + "rewards/rejected": -4.851809501647949, + "step": 18620 + }, + { + "epoch": 4.66, + "grad_norm": 26.685392379760742, + "learning_rate": 1.1449741105874046e-07, + "logits/chosen": -0.5229272842407227, + "logits/rejected": -0.614467978477478, + "logps/chosen": -56.167747497558594, + "logps/rejected": -107.42708587646484, + "loss": 0.5886, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4266412258148193, + "rewards/margins": 8.46348762512207, + "rewards/rejected": -5.036845684051514, + "step": 18621 + }, + { + "epoch": 4.66, + "grad_norm": 3.9883291721343994, + "learning_rate": 1.1433023035506585e-07, + "logits/chosen": -0.6087894439697266, + "logits/rejected": -0.6844582557678223, + "logps/chosen": -53.23879623413086, + "logps/rejected": -101.0744857788086, + "loss": 0.6345, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1290881633758545, + "rewards/margins": 7.599833965301514, + "rewards/rejected": -4.47074556350708, + "step": 18622 + }, + { + "epoch": 4.66, + "grad_norm": 4.575464248657227, + "learning_rate": 1.1416317038148827e-07, + "logits/chosen": -0.5501382946968079, + "logits/rejected": -0.626912534236908, + "logps/chosen": -56.293704986572266, + "logps/rejected": -114.93152618408203, + "loss": 0.6044, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4748401641845703, + "rewards/margins": 7.926108360290527, + "rewards/rejected": -4.451268196105957, + "step": 18623 + }, + { + "epoch": 4.66, + "grad_norm": 4.147599697113037, + "learning_rate": 1.1399623114213499e-07, + "logits/chosen": -0.5107382535934448, + "logits/rejected": -0.592150866985321, + "logps/chosen": -61.00578308105469, + "logps/rejected": -98.08924865722656, + "loss": 0.6463, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.183882713317871, + "rewards/margins": 6.749467372894287, + "rewards/rejected": -3.565584897994995, + "step": 18624 + }, + { + "epoch": 4.66, + "grad_norm": 4.661943435668945, + "learning_rate": 1.1382941264113212e-07, + "logits/chosen": -0.5463821887969971, + "logits/rejected": -0.6098426580429077, + "logps/chosen": -58.40139389038086, + "logps/rejected": -109.11927795410156, + "loss": 0.5544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2493672370910645, + "rewards/margins": 6.98748254776001, + "rewards/rejected": -3.7381157875061035, + "step": 18625 + }, + { + "epoch": 4.66, + "grad_norm": 4.052971839904785, + "learning_rate": 1.1366271488260194e-07, + "logits/chosen": -0.5927925109863281, + "logits/rejected": -0.615620493888855, + "logps/chosen": -56.40726852416992, + "logps/rejected": -97.57945251464844, + "loss": 0.6043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.218881130218506, + "rewards/margins": 6.065235137939453, + "rewards/rejected": -2.8463540077209473, + "step": 18626 + }, + { + "epoch": 4.66, + "grad_norm": 3.704411745071411, + "learning_rate": 1.1349613787066339e-07, + "logits/chosen": -0.5691655874252319, + "logits/rejected": -0.6525731086730957, + "logps/chosen": -57.78229904174805, + "logps/rejected": -116.68144226074219, + "loss": 0.63, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.049692153930664, + "rewards/margins": 7.901676177978516, + "rewards/rejected": -4.851984024047852, + "step": 18627 + }, + { + "epoch": 4.66, + "grad_norm": 6.589084148406982, + "learning_rate": 1.1332968160943259e-07, + "logits/chosen": -0.558668851852417, + "logits/rejected": -0.6458706259727478, + "logps/chosen": -58.83975601196289, + "logps/rejected": -88.26490020751953, + "loss": 0.7171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.89656400680542, + "rewards/margins": 6.961856842041016, + "rewards/rejected": -4.0652923583984375, + "step": 18628 + }, + { + "epoch": 4.66, + "grad_norm": 4.501461029052734, + "learning_rate": 1.1316334610302293e-07, + "logits/chosen": -0.520089328289032, + "logits/rejected": -0.6143503189086914, + "logps/chosen": -59.028892517089844, + "logps/rejected": -90.6403579711914, + "loss": 0.5754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1043875217437744, + "rewards/margins": 7.050191879272461, + "rewards/rejected": -3.9458048343658447, + "step": 18629 + }, + { + "epoch": 4.66, + "grad_norm": 4.30747127532959, + "learning_rate": 1.1299713135554502e-07, + "logits/chosen": -0.5311408042907715, + "logits/rejected": -0.641220211982727, + "logps/chosen": -65.53273010253906, + "logps/rejected": -111.85382080078125, + "loss": 0.7119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9320244789123535, + "rewards/margins": 7.892255783081055, + "rewards/rejected": -4.960230827331543, + "step": 18630 + }, + { + "epoch": 4.66, + "grad_norm": 3.0896098613739014, + "learning_rate": 1.1283103737110557e-07, + "logits/chosen": -0.5288647413253784, + "logits/rejected": -0.5783209204673767, + "logps/chosen": -54.024864196777344, + "logps/rejected": -107.29198455810547, + "loss": 0.5561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1277873516082764, + "rewards/margins": 6.712228775024414, + "rewards/rejected": -3.584441661834717, + "step": 18631 + }, + { + "epoch": 4.66, + "grad_norm": 10.94832706451416, + "learning_rate": 1.1266506415381018e-07, + "logits/chosen": -0.5249067544937134, + "logits/rejected": -0.6096151471138, + "logps/chosen": -56.09262466430664, + "logps/rejected": -98.91058349609375, + "loss": 0.6264, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.137392282485962, + "rewards/margins": 6.912726402282715, + "rewards/rejected": -3.775333881378174, + "step": 18632 + }, + { + "epoch": 4.66, + "grad_norm": 8.205098152160645, + "learning_rate": 1.124992117077589e-07, + "logits/chosen": -0.5776879787445068, + "logits/rejected": -0.6269914507865906, + "logps/chosen": -47.4478874206543, + "logps/rejected": -104.59709167480469, + "loss": 0.6245, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1891772747039795, + "rewards/margins": 6.4175519943237305, + "rewards/rejected": -3.228375196456909, + "step": 18633 + }, + { + "epoch": 4.66, + "grad_norm": 17.313331604003906, + "learning_rate": 1.1233348003705069e-07, + "logits/chosen": -0.5130928754806519, + "logits/rejected": -0.6026108860969543, + "logps/chosen": -61.72178268432617, + "logps/rejected": -109.59835052490234, + "loss": 0.624, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9688162803649902, + "rewards/margins": 6.551980018615723, + "rewards/rejected": -3.5831642150878906, + "step": 18634 + }, + { + "epoch": 4.66, + "grad_norm": 9.56228256225586, + "learning_rate": 1.121678691457817e-07, + "logits/chosen": -0.4905271828174591, + "logits/rejected": -0.5924317836761475, + "logps/chosen": -53.63565444946289, + "logps/rejected": -90.8480224609375, + "loss": 0.5807, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0357470512390137, + "rewards/margins": 7.071996688842773, + "rewards/rejected": -4.03624963760376, + "step": 18635 + }, + { + "epoch": 4.66, + "grad_norm": 3.4893925189971924, + "learning_rate": 1.1200237903804312e-07, + "logits/chosen": -0.5814018249511719, + "logits/rejected": -0.6687200665473938, + "logps/chosen": -47.776100158691406, + "logps/rejected": -103.7196044921875, + "loss": 0.5179, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0650625228881836, + "rewards/margins": 7.120241165161133, + "rewards/rejected": -4.055179119110107, + "step": 18636 + }, + { + "epoch": 4.66, + "grad_norm": 3.3614203929901123, + "learning_rate": 1.1183700971792444e-07, + "logits/chosen": -0.535729169845581, + "logits/rejected": -0.6109492182731628, + "logps/chosen": -52.56311798095703, + "logps/rejected": -112.96495056152344, + "loss": 0.5436, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1500773429870605, + "rewards/margins": 7.4869704246521, + "rewards/rejected": -4.336893081665039, + "step": 18637 + }, + { + "epoch": 4.66, + "grad_norm": 15.476737976074219, + "learning_rate": 1.1167176118951296e-07, + "logits/chosen": -0.49870699644088745, + "logits/rejected": -0.6065682172775269, + "logps/chosen": -49.83063507080078, + "logps/rejected": -110.43963623046875, + "loss": 0.5753, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8649280071258545, + "rewards/margins": 7.547045707702637, + "rewards/rejected": -4.6821184158325195, + "step": 18638 + }, + { + "epoch": 4.66, + "grad_norm": 4.237320899963379, + "learning_rate": 1.1150663345689206e-07, + "logits/chosen": -0.5161961913108826, + "logits/rejected": -0.607826292514801, + "logps/chosen": -61.110843658447266, + "logps/rejected": -99.0489730834961, + "loss": 0.612, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100857973098755, + "rewards/margins": 6.754647254943848, + "rewards/rejected": -3.653789520263672, + "step": 18639 + }, + { + "epoch": 4.66, + "grad_norm": 11.210604667663574, + "learning_rate": 1.1134162652414072e-07, + "logits/chosen": -0.5785719752311707, + "logits/rejected": -0.6503521800041199, + "logps/chosen": -64.81947326660156, + "logps/rejected": -115.00442504882812, + "loss": 0.6248, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3009321689605713, + "rewards/margins": 6.774768352508545, + "rewards/rejected": -3.4738357067108154, + "step": 18640 + }, + { + "epoch": 4.66, + "grad_norm": 6.747088432312012, + "learning_rate": 1.1117674039533844e-07, + "logits/chosen": -0.5906588435173035, + "logits/rejected": -0.6386674046516418, + "logps/chosen": -51.120330810546875, + "logps/rejected": -104.95037078857422, + "loss": 0.7171, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.142000198364258, + "rewards/margins": 6.528087615966797, + "rewards/rejected": -3.386087417602539, + "step": 18641 + }, + { + "epoch": 4.66, + "grad_norm": 5.4564738273620605, + "learning_rate": 1.1101197507455864e-07, + "logits/chosen": -0.5435203313827515, + "logits/rejected": -0.6320093274116516, + "logps/chosen": -55.3306999206543, + "logps/rejected": -92.78229522705078, + "loss": 0.6101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3971171379089355, + "rewards/margins": 7.1669464111328125, + "rewards/rejected": -3.769829273223877, + "step": 18642 + }, + { + "epoch": 4.66, + "grad_norm": 4.927505970001221, + "learning_rate": 1.108473305658736e-07, + "logits/chosen": -0.5120096206665039, + "logits/rejected": -0.5902069807052612, + "logps/chosen": -54.28716278076172, + "logps/rejected": -110.84815979003906, + "loss": 0.7451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.764822244644165, + "rewards/margins": 6.930191993713379, + "rewards/rejected": -4.165369987487793, + "step": 18643 + }, + { + "epoch": 4.66, + "grad_norm": 3.110689640045166, + "learning_rate": 1.1068280687335064e-07, + "logits/chosen": -0.5776640176773071, + "logits/rejected": -0.6826809048652649, + "logps/chosen": -49.286155700683594, + "logps/rejected": -116.8418197631836, + "loss": 0.569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.316826581954956, + "rewards/margins": 8.62548542022705, + "rewards/rejected": -5.308658599853516, + "step": 18644 + }, + { + "epoch": 4.66, + "grad_norm": 2.207092046737671, + "learning_rate": 1.1051840400105651e-07, + "logits/chosen": -0.5087015628814697, + "logits/rejected": -0.5324171781539917, + "logps/chosen": -50.904930114746094, + "logps/rejected": -111.88824462890625, + "loss": 0.5511, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2333366870880127, + "rewards/margins": 6.957111835479736, + "rewards/rejected": -3.7237749099731445, + "step": 18645 + }, + { + "epoch": 4.66, + "grad_norm": 4.842913627624512, + "learning_rate": 1.103541219530535e-07, + "logits/chosen": -0.6262569427490234, + "logits/rejected": -0.7087959051132202, + "logps/chosen": -56.396976470947266, + "logps/rejected": -108.41744995117188, + "loss": 0.6076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6307311058044434, + "rewards/margins": 6.963291168212891, + "rewards/rejected": -4.332560062408447, + "step": 18646 + }, + { + "epoch": 4.66, + "grad_norm": 5.022403240203857, + "learning_rate": 1.1018996073340005e-07, + "logits/chosen": -0.5831671357154846, + "logits/rejected": -0.6535014510154724, + "logps/chosen": -48.75611114501953, + "logps/rejected": -104.82728576660156, + "loss": 0.5504, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052072286605835, + "rewards/margins": 7.675615310668945, + "rewards/rejected": -4.623542785644531, + "step": 18647 + }, + { + "epoch": 4.66, + "grad_norm": 2.6142125129699707, + "learning_rate": 1.1002592034615456e-07, + "logits/chosen": -0.6185814738273621, + "logits/rejected": -0.7026887536048889, + "logps/chosen": -55.49759292602539, + "logps/rejected": -131.4346923828125, + "loss": 0.5669, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0998597145080566, + "rewards/margins": 9.506839752197266, + "rewards/rejected": -6.406979560852051, + "step": 18648 + }, + { + "epoch": 4.67, + "grad_norm": 7.042466163635254, + "learning_rate": 1.0986200079536935e-07, + "logits/chosen": -0.5456268787384033, + "logits/rejected": -0.5652305483818054, + "logps/chosen": -57.15105438232422, + "logps/rejected": -122.20259857177734, + "loss": 0.5755, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1880552768707275, + "rewards/margins": 6.759685516357422, + "rewards/rejected": -3.5716304779052734, + "step": 18649 + }, + { + "epoch": 4.67, + "grad_norm": 3.478424549102783, + "learning_rate": 1.0969820208509563e-07, + "logits/chosen": -0.5598503351211548, + "logits/rejected": -0.6075012683868408, + "logps/chosen": -55.752525329589844, + "logps/rejected": -120.90299987792969, + "loss": 0.5986, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3216121196746826, + "rewards/margins": 7.429050445556641, + "rewards/rejected": -4.107438087463379, + "step": 18650 + }, + { + "epoch": 4.67, + "grad_norm": 3.2738919258117676, + "learning_rate": 1.0953452421938127e-07, + "logits/chosen": -0.5212509632110596, + "logits/rejected": -0.5985066890716553, + "logps/chosen": -61.52263641357422, + "logps/rejected": -113.21560668945312, + "loss": 0.6514, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3695406913757324, + "rewards/margins": 8.230518341064453, + "rewards/rejected": -4.860976696014404, + "step": 18651 + }, + { + "epoch": 4.67, + "grad_norm": 4.64669132232666, + "learning_rate": 1.0937096720227026e-07, + "logits/chosen": -0.5625360012054443, + "logits/rejected": -0.6262603402137756, + "logps/chosen": -46.654754638671875, + "logps/rejected": -91.507080078125, + "loss": 0.5529, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.342179298400879, + "rewards/margins": 7.412870407104492, + "rewards/rejected": -4.070691108703613, + "step": 18652 + }, + { + "epoch": 4.67, + "grad_norm": 4.73378324508667, + "learning_rate": 1.0920753103780435e-07, + "logits/chosen": -0.526701033115387, + "logits/rejected": -0.5899633169174194, + "logps/chosen": -57.053443908691406, + "logps/rejected": -132.28948974609375, + "loss": 0.6173, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0202786922454834, + "rewards/margins": 7.122179985046387, + "rewards/rejected": -4.101901054382324, + "step": 18653 + }, + { + "epoch": 4.67, + "grad_norm": 2.6828160285949707, + "learning_rate": 1.0904421573002311e-07, + "logits/chosen": -0.5707089304924011, + "logits/rejected": -0.6477755904197693, + "logps/chosen": -51.584415435791016, + "logps/rejected": -127.58554077148438, + "loss": 0.5579, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.120147228240967, + "rewards/margins": 8.86495590209961, + "rewards/rejected": -5.744809150695801, + "step": 18654 + }, + { + "epoch": 4.67, + "grad_norm": 3.5263705253601074, + "learning_rate": 1.0888102128296052e-07, + "logits/chosen": -0.5391166806221008, + "logits/rejected": -0.5930089950561523, + "logps/chosen": -53.49806594848633, + "logps/rejected": -127.12408447265625, + "loss": 0.5694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.853135585784912, + "rewards/margins": 8.52292251586914, + "rewards/rejected": -5.669787406921387, + "step": 18655 + }, + { + "epoch": 4.67, + "grad_norm": 1.7673248052597046, + "learning_rate": 1.0871794770065058e-07, + "logits/chosen": -0.5049127340316772, + "logits/rejected": -0.5772320628166199, + "logps/chosen": -56.936832427978516, + "logps/rejected": -133.65176391601562, + "loss": 0.6011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2608964443206787, + "rewards/margins": 9.668927192687988, + "rewards/rejected": -6.408031463623047, + "step": 18656 + }, + { + "epoch": 4.67, + "grad_norm": 5.542830467224121, + "learning_rate": 1.0855499498712341e-07, + "logits/chosen": -0.5824481248855591, + "logits/rejected": -0.6508824229240417, + "logps/chosen": -47.94456481933594, + "logps/rejected": -96.69497680664062, + "loss": 0.6978, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.041088581085205, + "rewards/margins": 6.624716758728027, + "rewards/rejected": -3.5836284160614014, + "step": 18657 + }, + { + "epoch": 4.67, + "grad_norm": 7.040152072906494, + "learning_rate": 1.0839216314640466e-07, + "logits/chosen": -0.5273587703704834, + "logits/rejected": -0.5761067867279053, + "logps/chosen": -60.33180618286133, + "logps/rejected": -129.04043579101562, + "loss": 0.6176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0276870727539062, + "rewards/margins": 7.275432586669922, + "rewards/rejected": -4.247745513916016, + "step": 18658 + }, + { + "epoch": 4.67, + "grad_norm": 4.617903709411621, + "learning_rate": 1.0822945218251835e-07, + "logits/chosen": -0.5184358954429626, + "logits/rejected": -0.60802161693573, + "logps/chosen": -67.32142639160156, + "logps/rejected": -121.1099853515625, + "loss": 0.6529, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9266200065612793, + "rewards/margins": 8.73077392578125, + "rewards/rejected": -5.804154396057129, + "step": 18659 + }, + { + "epoch": 4.67, + "grad_norm": 4.266486167907715, + "learning_rate": 1.0806686209948569e-07, + "logits/chosen": -0.520291268825531, + "logits/rejected": -0.583915114402771, + "logps/chosen": -48.953067779541016, + "logps/rejected": -88.20526885986328, + "loss": 0.5948, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.481969118118286, + "rewards/margins": 6.405533313751221, + "rewards/rejected": -2.9235641956329346, + "step": 18660 + }, + { + "epoch": 4.67, + "grad_norm": 6.893463134765625, + "learning_rate": 1.0790439290132404e-07, + "logits/chosen": -0.6583618521690369, + "logits/rejected": -0.7246838212013245, + "logps/chosen": -54.84341049194336, + "logps/rejected": -107.6334457397461, + "loss": 0.6742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9372780323028564, + "rewards/margins": 7.114686012268066, + "rewards/rejected": -4.177407741546631, + "step": 18661 + }, + { + "epoch": 4.67, + "grad_norm": 7.186334133148193, + "learning_rate": 1.0774204459204796e-07, + "logits/chosen": -0.6095592975616455, + "logits/rejected": -0.6444171071052551, + "logps/chosen": -44.684844970703125, + "logps/rejected": -101.37020111083984, + "loss": 0.5557, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1041433811187744, + "rewards/margins": 7.232987880706787, + "rewards/rejected": -4.128843784332275, + "step": 18662 + }, + { + "epoch": 4.67, + "grad_norm": 5.6344709396362305, + "learning_rate": 1.0757981717567034e-07, + "logits/chosen": -0.5145250558853149, + "logits/rejected": -0.6107816696166992, + "logps/chosen": -52.89745330810547, + "logps/rejected": -109.05175018310547, + "loss": 0.5773, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2719359397888184, + "rewards/margins": 7.951550006866455, + "rewards/rejected": -4.679614067077637, + "step": 18663 + }, + { + "epoch": 4.67, + "grad_norm": 10.328869819641113, + "learning_rate": 1.0741771065619911e-07, + "logits/chosen": -0.5518024563789368, + "logits/rejected": -0.6418936848640442, + "logps/chosen": -53.064247131347656, + "logps/rejected": -94.77825927734375, + "loss": 0.6761, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.726621627807617, + "rewards/margins": 6.16429328918457, + "rewards/rejected": -3.437671184539795, + "step": 18664 + }, + { + "epoch": 4.67, + "grad_norm": 8.046875953674316, + "learning_rate": 1.0725572503763936e-07, + "logits/chosen": -0.5228251814842224, + "logits/rejected": -0.6055080890655518, + "logps/chosen": -50.96770095825195, + "logps/rejected": -101.60844421386719, + "loss": 0.572, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0004138946533203, + "rewards/margins": 7.367189407348633, + "rewards/rejected": -4.366775035858154, + "step": 18665 + }, + { + "epoch": 4.67, + "grad_norm": 5.24355411529541, + "learning_rate": 1.0709386032399571e-07, + "logits/chosen": -0.501483142375946, + "logits/rejected": -0.5798664093017578, + "logps/chosen": -48.538516998291016, + "logps/rejected": -100.45449829101562, + "loss": 0.5053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.04034161567688, + "rewards/margins": 7.205018043518066, + "rewards/rejected": -4.164676666259766, + "step": 18666 + }, + { + "epoch": 4.67, + "grad_norm": 8.500861167907715, + "learning_rate": 1.069321165192666e-07, + "logits/chosen": -0.5738275647163391, + "logits/rejected": -0.6335108876228333, + "logps/chosen": -55.02001190185547, + "logps/rejected": -105.17728424072266, + "loss": 0.6201, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306891441345215, + "rewards/margins": 6.952681541442871, + "rewards/rejected": -3.645789861679077, + "step": 18667 + }, + { + "epoch": 4.67, + "grad_norm": 12.283364295959473, + "learning_rate": 1.0677049362744995e-07, + "logits/chosen": -0.5804715156555176, + "logits/rejected": -0.6372393369674683, + "logps/chosen": -64.73646545410156, + "logps/rejected": -116.28895568847656, + "loss": 0.7961, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8552629947662354, + "rewards/margins": 6.573368072509766, + "rewards/rejected": -3.7181055545806885, + "step": 18668 + }, + { + "epoch": 4.67, + "grad_norm": 4.596733093261719, + "learning_rate": 1.0660899165253869e-07, + "logits/chosen": -0.6451811790466309, + "logits/rejected": -0.7095642685890198, + "logps/chosen": -56.98859405517578, + "logps/rejected": -100.53547668457031, + "loss": 0.6635, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.22694993019104, + "rewards/margins": 5.918671607971191, + "rewards/rejected": -2.691721200942993, + "step": 18669 + }, + { + "epoch": 4.67, + "grad_norm": 7.558394432067871, + "learning_rate": 1.0644761059852404e-07, + "logits/chosen": -0.5556322336196899, + "logits/rejected": -0.5680503845214844, + "logps/chosen": -47.726043701171875, + "logps/rejected": -102.40518188476562, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.777977228164673, + "rewards/margins": 5.361093044281006, + "rewards/rejected": -2.583115816116333, + "step": 18670 + }, + { + "epoch": 4.67, + "grad_norm": 6.083674430847168, + "learning_rate": 1.0628635046939395e-07, + "logits/chosen": -0.48754245042800903, + "logits/rejected": -0.5639118552207947, + "logps/chosen": -67.88972473144531, + "logps/rejected": -124.14068603515625, + "loss": 0.5903, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2915658950805664, + "rewards/margins": 8.230803489685059, + "rewards/rejected": -4.939237594604492, + "step": 18671 + }, + { + "epoch": 4.67, + "grad_norm": 5.2711501121521, + "learning_rate": 1.06125211269133e-07, + "logits/chosen": -0.4962303042411804, + "logits/rejected": -0.5840012431144714, + "logps/chosen": -53.01811218261719, + "logps/rejected": -104.89739990234375, + "loss": 0.5703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1040871143341064, + "rewards/margins": 6.419959545135498, + "rewards/rejected": -3.31587290763855, + "step": 18672 + }, + { + "epoch": 4.67, + "grad_norm": 4.305011749267578, + "learning_rate": 1.0596419300172412e-07, + "logits/chosen": -0.5869094133377075, + "logits/rejected": -0.620173454284668, + "logps/chosen": -46.71874237060547, + "logps/rejected": -101.96358489990234, + "loss": 0.5968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2633731365203857, + "rewards/margins": 6.388035297393799, + "rewards/rejected": -3.124661922454834, + "step": 18673 + }, + { + "epoch": 4.67, + "grad_norm": 2.751375436782837, + "learning_rate": 1.0580329567114523e-07, + "logits/chosen": -0.5692304372787476, + "logits/rejected": -0.6336361765861511, + "logps/chosen": -55.30543518066406, + "logps/rejected": -109.52793884277344, + "loss": 0.6094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150200843811035, + "rewards/margins": 8.00680160522461, + "rewards/rejected": -4.856601238250732, + "step": 18674 + }, + { + "epoch": 4.67, + "grad_norm": 3.1527645587921143, + "learning_rate": 1.056425192813726e-07, + "logits/chosen": -0.5398642420768738, + "logits/rejected": -0.6084264516830444, + "logps/chosen": -53.68214797973633, + "logps/rejected": -110.96617889404297, + "loss": 0.5769, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.151620388031006, + "rewards/margins": 7.771149158477783, + "rewards/rejected": -4.619528293609619, + "step": 18675 + }, + { + "epoch": 4.67, + "grad_norm": 4.191079616546631, + "learning_rate": 1.0548186383637915e-07, + "logits/chosen": -0.5410019755363464, + "logits/rejected": -0.5829156041145325, + "logps/chosen": -52.14530944824219, + "logps/rejected": -115.01081085205078, + "loss": 0.6414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3611509799957275, + "rewards/margins": 7.0818328857421875, + "rewards/rejected": -3.720681667327881, + "step": 18676 + }, + { + "epoch": 4.67, + "grad_norm": 3.761157751083374, + "learning_rate": 1.0532132934013506e-07, + "logits/chosen": -0.6040048599243164, + "logits/rejected": -0.6387619972229004, + "logps/chosen": -54.20952224731445, + "logps/rejected": -107.97901916503906, + "loss": 0.6402, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2677431106567383, + "rewards/margins": 6.438501834869385, + "rewards/rejected": -3.1707592010498047, + "step": 18677 + }, + { + "epoch": 4.67, + "grad_norm": 23.443933486938477, + "learning_rate": 1.0516091579660714e-07, + "logits/chosen": -0.570616602897644, + "logits/rejected": -0.6780221462249756, + "logps/chosen": -54.01026153564453, + "logps/rejected": -91.29837036132812, + "loss": 0.6041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0422167778015137, + "rewards/margins": 6.774410724639893, + "rewards/rejected": -3.732194185256958, + "step": 18678 + }, + { + "epoch": 4.67, + "grad_norm": 2.9315292835235596, + "learning_rate": 1.0500062320975946e-07, + "logits/chosen": -0.6261881589889526, + "logits/rejected": -0.6969384551048279, + "logps/chosen": -54.40428161621094, + "logps/rejected": -115.22274017333984, + "loss": 0.5706, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8173794746398926, + "rewards/margins": 7.902693271636963, + "rewards/rejected": -5.08531379699707, + "step": 18679 + }, + { + "epoch": 4.67, + "grad_norm": 8.194272994995117, + "learning_rate": 1.0484045158355272e-07, + "logits/chosen": -0.582118034362793, + "logits/rejected": -0.6033984422683716, + "logps/chosen": -55.046836853027344, + "logps/rejected": -119.12889099121094, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.185548782348633, + "rewards/margins": 7.638359069824219, + "rewards/rejected": -4.452810764312744, + "step": 18680 + }, + { + "epoch": 4.67, + "grad_norm": 2.728379726409912, + "learning_rate": 1.0468040092194488e-07, + "logits/chosen": -0.5508544445037842, + "logits/rejected": -0.6722991466522217, + "logps/chosen": -57.190635681152344, + "logps/rejected": -102.38846588134766, + "loss": 0.6058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1191585063934326, + "rewards/margins": 7.4641432762146, + "rewards/rejected": -4.344984531402588, + "step": 18681 + }, + { + "epoch": 4.67, + "grad_norm": 2.676459312438965, + "learning_rate": 1.0452047122889164e-07, + "logits/chosen": -0.5198011994361877, + "logits/rejected": -0.5786721706390381, + "logps/chosen": -52.344940185546875, + "logps/rejected": -109.44773864746094, + "loss": 0.5586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.438556432723999, + "rewards/margins": 7.737009525299072, + "rewards/rejected": -4.298452854156494, + "step": 18682 + }, + { + "epoch": 4.67, + "grad_norm": 6.106379508972168, + "learning_rate": 1.0436066250834432e-07, + "logits/chosen": -0.5349751114845276, + "logits/rejected": -0.6550544500350952, + "logps/chosen": -57.088260650634766, + "logps/rejected": -88.73851776123047, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.524358034133911, + "rewards/margins": 6.374819755554199, + "rewards/rejected": -3.850461483001709, + "step": 18683 + }, + { + "epoch": 4.67, + "grad_norm": 4.456026554107666, + "learning_rate": 1.042009747642525e-07, + "logits/chosen": -0.46607154607772827, + "logits/rejected": -0.4899647831916809, + "logps/chosen": -62.37385559082031, + "logps/rejected": -122.26701354980469, + "loss": 0.6486, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2637851238250732, + "rewards/margins": 7.360625743865967, + "rewards/rejected": -4.096840858459473, + "step": 18684 + }, + { + "epoch": 4.67, + "grad_norm": 5.479355335235596, + "learning_rate": 1.040414080005625e-07, + "logits/chosen": -0.5159338712692261, + "logits/rejected": -0.5513366460800171, + "logps/chosen": -55.140464782714844, + "logps/rejected": -114.463623046875, + "loss": 0.6016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.346945285797119, + "rewards/margins": 7.767704486846924, + "rewards/rejected": -4.420759677886963, + "step": 18685 + }, + { + "epoch": 4.67, + "grad_norm": 4.187074184417725, + "learning_rate": 1.0388196222121616e-07, + "logits/chosen": -0.5215462446212769, + "logits/rejected": -0.5666776299476624, + "logps/chosen": -60.40080261230469, + "logps/rejected": -116.75373840332031, + "loss": 0.6179, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0873494148254395, + "rewards/margins": 6.838447570800781, + "rewards/rejected": -3.7510979175567627, + "step": 18686 + }, + { + "epoch": 4.67, + "grad_norm": 2.8723137378692627, + "learning_rate": 1.0372263743015421e-07, + "logits/chosen": -0.5471403002738953, + "logits/rejected": -0.6472187042236328, + "logps/chosen": -46.30445861816406, + "logps/rejected": -122.75850677490234, + "loss": 0.5108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1481361389160156, + "rewards/margins": 9.750435829162598, + "rewards/rejected": -6.602299690246582, + "step": 18687 + }, + { + "epoch": 4.67, + "grad_norm": 6.973528861999512, + "learning_rate": 1.0356343363131405e-07, + "logits/chosen": -0.5642871260643005, + "logits/rejected": -0.6029727458953857, + "logps/chosen": -56.74159240722656, + "logps/rejected": -98.58097839355469, + "loss": 0.6109, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1758131980895996, + "rewards/margins": 5.51425838470459, + "rewards/rejected": -2.338444471359253, + "step": 18688 + }, + { + "epoch": 4.68, + "grad_norm": 6.03324031829834, + "learning_rate": 1.0340435082862976e-07, + "logits/chosen": -0.5623030662536621, + "logits/rejected": -0.638676106929779, + "logps/chosen": -52.94042205810547, + "logps/rejected": -91.18638610839844, + "loss": 0.6257, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.944349765777588, + "rewards/margins": 5.812954902648926, + "rewards/rejected": -2.868605136871338, + "step": 18689 + }, + { + "epoch": 4.68, + "grad_norm": 6.300746917724609, + "learning_rate": 1.0324538902603154e-07, + "logits/chosen": -0.5915611982345581, + "logits/rejected": -0.675618588924408, + "logps/chosen": -50.232173919677734, + "logps/rejected": -96.51522827148438, + "loss": 0.5554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164578437805176, + "rewards/margins": 7.430974960327148, + "rewards/rejected": -4.266396522521973, + "step": 18690 + }, + { + "epoch": 4.68, + "grad_norm": 11.641891479492188, + "learning_rate": 1.0308654822744901e-07, + "logits/chosen": -0.5474181771278381, + "logits/rejected": -0.6371519565582275, + "logps/chosen": -57.93383026123047, + "logps/rejected": -84.82666015625, + "loss": 0.5708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.220425844192505, + "rewards/margins": 6.787954807281494, + "rewards/rejected": -3.5675294399261475, + "step": 18691 + }, + { + "epoch": 4.68, + "grad_norm": 2.281425714492798, + "learning_rate": 1.0292782843680571e-07, + "logits/chosen": -0.5148024559020996, + "logits/rejected": -0.6008293032646179, + "logps/chosen": -50.285743713378906, + "logps/rejected": -102.6597900390625, + "loss": 0.5839, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.207120418548584, + "rewards/margins": 7.88593864440918, + "rewards/rejected": -4.6788177490234375, + "step": 18692 + }, + { + "epoch": 4.68, + "grad_norm": 5.338720321655273, + "learning_rate": 1.0276922965802461e-07, + "logits/chosen": -0.46528542041778564, + "logits/rejected": -0.6009902954101562, + "logps/chosen": -54.12118911743164, + "logps/rejected": -79.50370025634766, + "loss": 0.5634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874889612197876, + "rewards/margins": 6.120846748352051, + "rewards/rejected": -3.245957136154175, + "step": 18693 + }, + { + "epoch": 4.68, + "grad_norm": 2.889979839324951, + "learning_rate": 1.0261075189502535e-07, + "logits/chosen": -0.5846834778785706, + "logits/rejected": -0.6186637878417969, + "logps/chosen": -51.6973876953125, + "logps/rejected": -111.25318908691406, + "loss": 0.6053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.165147066116333, + "rewards/margins": 7.024435997009277, + "rewards/rejected": -3.8592896461486816, + "step": 18694 + }, + { + "epoch": 4.68, + "grad_norm": 12.975030899047852, + "learning_rate": 1.0245239515172312e-07, + "logits/chosen": -0.6699249744415283, + "logits/rejected": -0.742466151714325, + "logps/chosen": -45.15885925292969, + "logps/rejected": -83.82666015625, + "loss": 0.6302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.965660572052002, + "rewards/margins": 5.989352703094482, + "rewards/rejected": -3.0236921310424805, + "step": 18695 + }, + { + "epoch": 4.68, + "grad_norm": 4.675624370574951, + "learning_rate": 1.0229415943203147e-07, + "logits/chosen": -0.5308618545532227, + "logits/rejected": -0.5779085755348206, + "logps/chosen": -61.53156280517578, + "logps/rejected": -105.61764526367188, + "loss": 0.632, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0760438442230225, + "rewards/margins": 5.857095718383789, + "rewards/rejected": -2.7810516357421875, + "step": 18696 + }, + { + "epoch": 4.68, + "grad_norm": 4.65127420425415, + "learning_rate": 1.0213604473986116e-07, + "logits/chosen": -0.4799845218658447, + "logits/rejected": -0.602051854133606, + "logps/chosen": -62.803855895996094, + "logps/rejected": -91.29859924316406, + "loss": 0.7183, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7475452423095703, + "rewards/margins": 6.649839878082275, + "rewards/rejected": -3.902294397354126, + "step": 18697 + }, + { + "epoch": 4.68, + "grad_norm": 2.8046276569366455, + "learning_rate": 1.0197805107911796e-07, + "logits/chosen": -0.4756052792072296, + "logits/rejected": -0.5727875828742981, + "logps/chosen": -57.834495544433594, + "logps/rejected": -104.78126525878906, + "loss": 0.588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5943894386291504, + "rewards/margins": 7.788172245025635, + "rewards/rejected": -4.193782806396484, + "step": 18698 + }, + { + "epoch": 4.68, + "grad_norm": 8.790096282958984, + "learning_rate": 1.018201784537065e-07, + "logits/chosen": -0.540934145450592, + "logits/rejected": -0.6246496438980103, + "logps/chosen": -47.65581512451172, + "logps/rejected": -114.44464874267578, + "loss": 0.6442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9205594062805176, + "rewards/margins": 7.727633476257324, + "rewards/rejected": -4.807074546813965, + "step": 18699 + }, + { + "epoch": 4.68, + "grad_norm": 7.312126159667969, + "learning_rate": 1.016624268675298e-07, + "logits/chosen": -0.548981249332428, + "logits/rejected": -0.5731039047241211, + "logps/chosen": -51.24942398071289, + "logps/rejected": -96.54254150390625, + "loss": 0.714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.526890754699707, + "rewards/margins": 5.821258544921875, + "rewards/rejected": -3.294367551803589, + "step": 18700 + }, + { + "epoch": 4.68, + "grad_norm": 3.7106127738952637, + "learning_rate": 1.0150479632448363e-07, + "logits/chosen": -0.5805349946022034, + "logits/rejected": -0.6921608448028564, + "logps/chosen": -51.28754425048828, + "logps/rejected": -102.66139221191406, + "loss": 0.6, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.15928316116333, + "rewards/margins": 6.334028244018555, + "rewards/rejected": -3.1747450828552246, + "step": 18701 + }, + { + "epoch": 4.68, + "grad_norm": 3.861731767654419, + "learning_rate": 1.0134728682846428e-07, + "logits/chosen": -0.5028875470161438, + "logits/rejected": -0.5986201167106628, + "logps/chosen": -60.57463836669922, + "logps/rejected": -93.92168426513672, + "loss": 0.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9599595069885254, + "rewards/margins": 6.589584827423096, + "rewards/rejected": -3.629625082015991, + "step": 18702 + }, + { + "epoch": 4.68, + "grad_norm": 3.2889058589935303, + "learning_rate": 1.0118989838336424e-07, + "logits/chosen": -0.5205749869346619, + "logits/rejected": -0.622478723526001, + "logps/chosen": -54.21065902709961, + "logps/rejected": -117.00758361816406, + "loss": 0.5486, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7522153854370117, + "rewards/margins": 8.181756019592285, + "rewards/rejected": -5.429540157318115, + "step": 18703 + }, + { + "epoch": 4.68, + "grad_norm": 7.672381401062012, + "learning_rate": 1.010326309930726e-07, + "logits/chosen": -0.6424440145492554, + "logits/rejected": -0.7355083227157593, + "logps/chosen": -47.31562423706055, + "logps/rejected": -122.62924194335938, + "loss": 0.5061, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094783067703247, + "rewards/margins": 9.129485130310059, + "rewards/rejected": -6.034701824188232, + "step": 18704 + }, + { + "epoch": 4.68, + "grad_norm": 3.167769432067871, + "learning_rate": 1.0087548466147457e-07, + "logits/chosen": -0.49957919120788574, + "logits/rejected": -0.5863524675369263, + "logps/chosen": -54.08915328979492, + "logps/rejected": -122.35546112060547, + "loss": 0.5691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1263010501861572, + "rewards/margins": 8.071596145629883, + "rewards/rejected": -4.945294380187988, + "step": 18705 + }, + { + "epoch": 4.68, + "grad_norm": 4.995025634765625, + "learning_rate": 1.0071845939245539e-07, + "logits/chosen": -0.5076361894607544, + "logits/rejected": -0.563835084438324, + "logps/chosen": -60.392417907714844, + "logps/rejected": -113.5736312866211, + "loss": 0.6523, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8861258029937744, + "rewards/margins": 7.236138343811035, + "rewards/rejected": -4.35001277923584, + "step": 18706 + }, + { + "epoch": 4.68, + "grad_norm": 4.430313587188721, + "learning_rate": 1.0056155518989363e-07, + "logits/chosen": -0.5076499581336975, + "logits/rejected": -0.6478245258331299, + "logps/chosen": -52.769649505615234, + "logps/rejected": -95.31105041503906, + "loss": 0.6078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.150954008102417, + "rewards/margins": 6.793497085571289, + "rewards/rejected": -3.642542600631714, + "step": 18707 + }, + { + "epoch": 4.68, + "grad_norm": 9.829018592834473, + "learning_rate": 1.0040477205766674e-07, + "logits/chosen": -0.5727536678314209, + "logits/rejected": -0.6222777366638184, + "logps/chosen": -46.21840286254883, + "logps/rejected": -105.32615661621094, + "loss": 0.6743, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1324477195739746, + "rewards/margins": 6.759448051452637, + "rewards/rejected": -3.627000331878662, + "step": 18708 + }, + { + "epoch": 4.68, + "grad_norm": 6.242281436920166, + "learning_rate": 1.0024810999964995e-07, + "logits/chosen": -0.4938710927963257, + "logits/rejected": -0.580358624458313, + "logps/chosen": -74.85950469970703, + "logps/rejected": -139.76513671875, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.225144147872925, + "rewards/margins": 9.592890739440918, + "rewards/rejected": -6.367746353149414, + "step": 18709 + }, + { + "epoch": 4.68, + "grad_norm": 3.911332845687866, + "learning_rate": 1.0009156901971406e-07, + "logits/chosen": -0.5961554646492004, + "logits/rejected": -0.6510570049285889, + "logps/chosen": -54.850624084472656, + "logps/rejected": -103.06793975830078, + "loss": 0.6096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.55179500579834, + "rewards/margins": 6.504164218902588, + "rewards/rejected": -2.9523696899414062, + "step": 18710 + }, + { + "epoch": 4.68, + "grad_norm": 3.8604419231414795, + "learning_rate": 9.993514912172652e-08, + "logits/chosen": -0.5184093713760376, + "logits/rejected": -0.595786452293396, + "logps/chosen": -56.51359176635742, + "logps/rejected": -93.87299346923828, + "loss": 0.6061, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3556995391845703, + "rewards/margins": 6.99392032623291, + "rewards/rejected": -3.6382205486297607, + "step": 18711 + }, + { + "epoch": 4.68, + "grad_norm": 2.7307281494140625, + "learning_rate": 9.97788503095537e-08, + "logits/chosen": -0.5126936435699463, + "logits/rejected": -0.6070062518119812, + "logps/chosen": -62.89385223388672, + "logps/rejected": -122.65092468261719, + "loss": 0.5765, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7849650382995605, + "rewards/margins": 7.723220348358154, + "rewards/rejected": -4.938256740570068, + "step": 18712 + }, + { + "epoch": 4.68, + "grad_norm": 4.933358669281006, + "learning_rate": 9.962267258705749e-08, + "logits/chosen": -0.5621047616004944, + "logits/rejected": -0.6276156306266785, + "logps/chosen": -47.223045349121094, + "logps/rejected": -96.18656921386719, + "loss": 0.616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9462714195251465, + "rewards/margins": 6.010295867919922, + "rewards/rejected": -3.0640246868133545, + "step": 18713 + }, + { + "epoch": 4.68, + "grad_norm": 6.057154655456543, + "learning_rate": 9.946661595809815e-08, + "logits/chosen": -0.6167575120925903, + "logits/rejected": -0.7314923405647278, + "logps/chosen": -48.11587142944336, + "logps/rejected": -117.1346206665039, + "loss": 0.5754, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.177011013031006, + "rewards/margins": 8.799171447753906, + "rewards/rejected": -5.6221604347229, + "step": 18714 + }, + { + "epoch": 4.68, + "grad_norm": 2.7391958236694336, + "learning_rate": 9.93106804265298e-08, + "logits/chosen": -0.5956652164459229, + "logits/rejected": -0.6491057872772217, + "logps/chosen": -49.953704833984375, + "logps/rejected": -115.46504974365234, + "loss": 0.5446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4893569946289062, + "rewards/margins": 8.09504508972168, + "rewards/rejected": -4.605688095092773, + "step": 18715 + }, + { + "epoch": 4.68, + "grad_norm": 3.2933239936828613, + "learning_rate": 9.915486599620828e-08, + "logits/chosen": -0.5424160361289978, + "logits/rejected": -0.6302005052566528, + "logps/chosen": -54.25679397583008, + "logps/rejected": -124.20973205566406, + "loss": 0.5444, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2283787727355957, + "rewards/margins": 9.308338165283203, + "rewards/rejected": -6.079958915710449, + "step": 18716 + }, + { + "epoch": 4.68, + "grad_norm": 5.395649433135986, + "learning_rate": 9.89991726709827e-08, + "logits/chosen": -0.5132573843002319, + "logits/rejected": -0.5727401375770569, + "logps/chosen": -61.80601501464844, + "logps/rejected": -114.89539337158203, + "loss": 0.6813, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9180309772491455, + "rewards/margins": 7.2464280128479, + "rewards/rejected": -4.328396320343018, + "step": 18717 + }, + { + "epoch": 4.68, + "grad_norm": 9.858880043029785, + "learning_rate": 9.884360045469998e-08, + "logits/chosen": -0.6161714792251587, + "logits/rejected": -0.6332570910453796, + "logps/chosen": -46.18700408935547, + "logps/rejected": -106.58226776123047, + "loss": 0.6554, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0764896869659424, + "rewards/margins": 6.423887252807617, + "rewards/rejected": -3.347398281097412, + "step": 18718 + }, + { + "epoch": 4.68, + "grad_norm": 3.3195457458496094, + "learning_rate": 9.86881493512054e-08, + "logits/chosen": -0.5154977440834045, + "logits/rejected": -0.5755391716957092, + "logps/chosen": -58.79730987548828, + "logps/rejected": -126.92559814453125, + "loss": 0.5683, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.80823016166687, + "rewards/margins": 6.854185581207275, + "rewards/rejected": -4.045955657958984, + "step": 18719 + }, + { + "epoch": 4.68, + "grad_norm": 4.922382831573486, + "learning_rate": 9.853281936433922e-08, + "logits/chosen": -0.5576143264770508, + "logits/rejected": -0.6202411651611328, + "logps/chosen": -44.783203125, + "logps/rejected": -100.01063537597656, + "loss": 0.5315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.266700267791748, + "rewards/margins": 7.231695652008057, + "rewards/rejected": -3.964996337890625, + "step": 18720 + }, + { + "epoch": 4.68, + "grad_norm": 7.29292106628418, + "learning_rate": 9.837761049794114e-08, + "logits/chosen": -0.5350658297538757, + "logits/rejected": -0.5971430540084839, + "logps/chosen": -62.23931121826172, + "logps/rejected": -108.4482421875, + "loss": 0.6983, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.929785966873169, + "rewards/margins": 7.652127742767334, + "rewards/rejected": -4.722341060638428, + "step": 18721 + }, + { + "epoch": 4.68, + "grad_norm": 6.5502448081970215, + "learning_rate": 9.822252275584587e-08, + "logits/chosen": -0.598190188407898, + "logits/rejected": -0.6531995534896851, + "logps/chosen": -46.43354415893555, + "logps/rejected": -118.172119140625, + "loss": 0.575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.980678081512451, + "rewards/margins": 7.491167068481445, + "rewards/rejected": -4.510488510131836, + "step": 18722 + }, + { + "epoch": 4.68, + "grad_norm": 4.242838382720947, + "learning_rate": 9.80675561418859e-08, + "logits/chosen": -0.5257018208503723, + "logits/rejected": -0.6217078566551208, + "logps/chosen": -69.2379150390625, + "logps/rejected": -99.11795043945312, + "loss": 0.6398, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1417102813720703, + "rewards/margins": 6.598441123962402, + "rewards/rejected": -3.4567315578460693, + "step": 18723 + }, + { + "epoch": 4.68, + "grad_norm": 17.4306697845459, + "learning_rate": 9.791271065989039e-08, + "logits/chosen": -0.6291741728782654, + "logits/rejected": -0.7052183747291565, + "logps/chosen": -54.861961364746094, + "logps/rejected": -117.41624450683594, + "loss": 0.7452, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.767789602279663, + "rewards/margins": 7.511506080627441, + "rewards/rejected": -4.743716239929199, + "step": 18724 + }, + { + "epoch": 4.68, + "grad_norm": 6.948514938354492, + "learning_rate": 9.775798631368627e-08, + "logits/chosen": -0.5672352910041809, + "logits/rejected": -0.6622506976127625, + "logps/chosen": -50.0732307434082, + "logps/rejected": -107.90615844726562, + "loss": 0.5651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.133643865585327, + "rewards/margins": 7.598044395446777, + "rewards/rejected": -4.464400291442871, + "step": 18725 + }, + { + "epoch": 4.68, + "grad_norm": 7.989544868469238, + "learning_rate": 9.76033831070955e-08, + "logits/chosen": -0.5541084408760071, + "logits/rejected": -0.656476616859436, + "logps/chosen": -54.07977294921875, + "logps/rejected": -113.155517578125, + "loss": 0.6793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1042871475219727, + "rewards/margins": 7.68515682220459, + "rewards/rejected": -4.580869674682617, + "step": 18726 + }, + { + "epoch": 4.68, + "grad_norm": 3.4780125617980957, + "learning_rate": 9.744890104394e-08, + "logits/chosen": -0.5855311155319214, + "logits/rejected": -0.6773726344108582, + "logps/chosen": -55.92505645751953, + "logps/rejected": -107.57100677490234, + "loss": 0.6059, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9366488456726074, + "rewards/margins": 7.970081806182861, + "rewards/rejected": -5.033432960510254, + "step": 18727 + }, + { + "epoch": 4.68, + "grad_norm": 4.696505069732666, + "learning_rate": 9.729454012803786e-08, + "logits/chosen": -0.5425813794136047, + "logits/rejected": -0.6209179759025574, + "logps/chosen": -43.68357849121094, + "logps/rejected": -125.77019500732422, + "loss": 0.609, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1641476154327393, + "rewards/margins": 8.584585189819336, + "rewards/rejected": -5.420437812805176, + "step": 18728 + }, + { + "epoch": 4.69, + "grad_norm": 6.59438419342041, + "learning_rate": 9.714030036320099e-08, + "logits/chosen": -0.44463813304901123, + "logits/rejected": -0.5312206745147705, + "logps/chosen": -55.65077209472656, + "logps/rejected": -101.59919738769531, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2434191703796387, + "rewards/margins": 7.6093292236328125, + "rewards/rejected": -4.365910053253174, + "step": 18729 + }, + { + "epoch": 4.69, + "grad_norm": 5.193567752838135, + "learning_rate": 9.698618175324249e-08, + "logits/chosen": -0.49843600392341614, + "logits/rejected": -0.6050453782081604, + "logps/chosen": -65.19768524169922, + "logps/rejected": -89.03528594970703, + "loss": 0.5995, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.217681407928467, + "rewards/margins": 6.67364501953125, + "rewards/rejected": -3.455963611602783, + "step": 18730 + }, + { + "epoch": 4.69, + "grad_norm": 5.186136245727539, + "learning_rate": 9.683218430197095e-08, + "logits/chosen": -0.6317296028137207, + "logits/rejected": -0.6755308508872986, + "logps/chosen": -47.88092041015625, + "logps/rejected": -99.51714324951172, + "loss": 0.5538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1925058364868164, + "rewards/margins": 7.016517639160156, + "rewards/rejected": -3.82401180267334, + "step": 18731 + }, + { + "epoch": 4.69, + "grad_norm": 2.9286608695983887, + "learning_rate": 9.667830801319167e-08, + "logits/chosen": -0.5298135280609131, + "logits/rejected": -0.5895779728889465, + "logps/chosen": -53.83222961425781, + "logps/rejected": -86.83802032470703, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2663135528564453, + "rewards/margins": 6.165992736816406, + "rewards/rejected": -2.8996798992156982, + "step": 18732 + }, + { + "epoch": 4.69, + "grad_norm": 4.11356782913208, + "learning_rate": 9.652455289070606e-08, + "logits/chosen": -0.45792871713638306, + "logits/rejected": -0.5885148048400879, + "logps/chosen": -61.99717330932617, + "logps/rejected": -101.02374267578125, + "loss": 0.5598, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9258618354797363, + "rewards/margins": 7.014850616455078, + "rewards/rejected": -4.088988304138184, + "step": 18733 + }, + { + "epoch": 4.69, + "grad_norm": 3.2080423831939697, + "learning_rate": 9.637091893831441e-08, + "logits/chosen": -0.47035765647888184, + "logits/rejected": -0.564495325088501, + "logps/chosen": -78.15912628173828, + "logps/rejected": -125.8729248046875, + "loss": 0.6722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1580588817596436, + "rewards/margins": 8.45088005065918, + "rewards/rejected": -5.292821407318115, + "step": 18734 + }, + { + "epoch": 4.69, + "grad_norm": 9.902328491210938, + "learning_rate": 9.621740615981367e-08, + "logits/chosen": -0.5558260083198547, + "logits/rejected": -0.6416743397712708, + "logps/chosen": -52.99664306640625, + "logps/rejected": -121.97975158691406, + "loss": 0.6098, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8279619216918945, + "rewards/margins": 8.321394920349121, + "rewards/rejected": -5.493432998657227, + "step": 18735 + }, + { + "epoch": 4.69, + "grad_norm": 2.9364399909973145, + "learning_rate": 9.606401455899584e-08, + "logits/chosen": -0.5906911492347717, + "logits/rejected": -0.6590410470962524, + "logps/chosen": -45.171329498291016, + "logps/rejected": -101.28011322021484, + "loss": 0.5719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.297123432159424, + "rewards/margins": 7.134639263153076, + "rewards/rejected": -3.8375158309936523, + "step": 18736 + }, + { + "epoch": 4.69, + "grad_norm": 3.4804952144622803, + "learning_rate": 9.591074413965284e-08, + "logits/chosen": -0.6319706439971924, + "logits/rejected": -0.7443505525588989, + "logps/chosen": -56.222206115722656, + "logps/rejected": -111.65345764160156, + "loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2914974689483643, + "rewards/margins": 7.974182605743408, + "rewards/rejected": -4.682685852050781, + "step": 18737 + }, + { + "epoch": 4.69, + "grad_norm": 2.823169708251953, + "learning_rate": 9.575759490557113e-08, + "logits/chosen": -0.5367494821548462, + "logits/rejected": -0.6620299816131592, + "logps/chosen": -53.67040252685547, + "logps/rejected": -97.14749145507812, + "loss": 0.616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.071991443634033, + "rewards/margins": 6.723325252532959, + "rewards/rejected": -3.6513335704803467, + "step": 18738 + }, + { + "epoch": 4.69, + "grad_norm": 3.019083023071289, + "learning_rate": 9.560456686053655e-08, + "logits/chosen": -0.5317265391349792, + "logits/rejected": -0.6029212474822998, + "logps/chosen": -57.09558868408203, + "logps/rejected": -110.89054870605469, + "loss": 0.6119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3812639713287354, + "rewards/margins": 7.964360237121582, + "rewards/rejected": -4.583095550537109, + "step": 18739 + }, + { + "epoch": 4.69, + "grad_norm": 4.489247798919678, + "learning_rate": 9.545166000832883e-08, + "logits/chosen": -0.628462553024292, + "logits/rejected": -0.7133082151412964, + "logps/chosen": -37.5173225402832, + "logps/rejected": -98.18402099609375, + "loss": 0.4534, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3280270099639893, + "rewards/margins": 7.83424186706543, + "rewards/rejected": -4.5062150955200195, + "step": 18740 + }, + { + "epoch": 4.69, + "grad_norm": 5.589057922363281, + "learning_rate": 9.529887435272778e-08, + "logits/chosen": -0.550710916519165, + "logits/rejected": -0.6395300030708313, + "logps/chosen": -45.3558235168457, + "logps/rejected": -86.94908142089844, + "loss": 0.615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0057625770568848, + "rewards/margins": 6.759578704833984, + "rewards/rejected": -3.7538161277770996, + "step": 18741 + }, + { + "epoch": 4.69, + "grad_norm": 3.3281521797180176, + "learning_rate": 9.514620989750812e-08, + "logits/chosen": -0.6079446077346802, + "logits/rejected": -0.6513275504112244, + "logps/chosen": -57.00090026855469, + "logps/rejected": -122.10298156738281, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2461884021759033, + "rewards/margins": 8.408554077148438, + "rewards/rejected": -5.1623663902282715, + "step": 18742 + }, + { + "epoch": 4.69, + "grad_norm": 41.859832763671875, + "learning_rate": 9.499366664644294e-08, + "logits/chosen": -0.5855770111083984, + "logits/rejected": -0.6962935924530029, + "logps/chosen": -54.89239501953125, + "logps/rejected": -107.25028991699219, + "loss": 0.6082, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.111880302429199, + "rewards/margins": 8.311960220336914, + "rewards/rejected": -5.200080871582031, + "step": 18743 + }, + { + "epoch": 4.69, + "grad_norm": 6.922101974487305, + "learning_rate": 9.48412446033009e-08, + "logits/chosen": -0.4997791051864624, + "logits/rejected": -0.6068649291992188, + "logps/chosen": -50.88235855102539, + "logps/rejected": -85.95679473876953, + "loss": 0.6271, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.811202049255371, + "rewards/margins": 6.446803092956543, + "rewards/rejected": -3.6356005668640137, + "step": 18744 + }, + { + "epoch": 4.69, + "grad_norm": 3.893294095993042, + "learning_rate": 9.4688943771849e-08, + "logits/chosen": -0.5484890937805176, + "logits/rejected": -0.6170384883880615, + "logps/chosen": -57.320072174072266, + "logps/rejected": -113.8781509399414, + "loss": 0.6016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.216336965560913, + "rewards/margins": 7.517330169677734, + "rewards/rejected": -4.300992012023926, + "step": 18745 + }, + { + "epoch": 4.69, + "grad_norm": 6.273228168487549, + "learning_rate": 9.4536764155852e-08, + "logits/chosen": -0.605238676071167, + "logits/rejected": -0.681109607219696, + "logps/chosen": -53.99770736694336, + "logps/rejected": -107.1123046875, + "loss": 0.6172, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3021938800811768, + "rewards/margins": 8.133623123168945, + "rewards/rejected": -4.831429481506348, + "step": 18746 + }, + { + "epoch": 4.69, + "grad_norm": 10.324148178100586, + "learning_rate": 9.438470575906799e-08, + "logits/chosen": -0.5337108373641968, + "logits/rejected": -0.6203418970108032, + "logps/chosen": -59.216976165771484, + "logps/rejected": -108.11875915527344, + "loss": 0.68, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9366235733032227, + "rewards/margins": 7.3737030029296875, + "rewards/rejected": -4.437079429626465, + "step": 18747 + }, + { + "epoch": 4.69, + "grad_norm": 11.728575706481934, + "learning_rate": 9.423276858525621e-08, + "logits/chosen": -0.5566619038581848, + "logits/rejected": -0.5926252603530884, + "logps/chosen": -63.237831115722656, + "logps/rejected": -105.64464569091797, + "loss": 0.8288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.714364528656006, + "rewards/margins": 5.966720104217529, + "rewards/rejected": -3.2523558139801025, + "step": 18748 + }, + { + "epoch": 4.69, + "grad_norm": 3.504772901535034, + "learning_rate": 9.408095263817085e-08, + "logits/chosen": -0.48350897431373596, + "logits/rejected": -0.589509129524231, + "logps/chosen": -44.37215042114258, + "logps/rejected": -91.39248657226562, + "loss": 0.52, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9928691387176514, + "rewards/margins": 6.617993354797363, + "rewards/rejected": -3.625124454498291, + "step": 18749 + }, + { + "epoch": 4.69, + "grad_norm": 3.162360191345215, + "learning_rate": 9.392925792156338e-08, + "logits/chosen": -0.5730068683624268, + "logits/rejected": -0.6989068984985352, + "logps/chosen": -55.102596282958984, + "logps/rejected": -100.70802307128906, + "loss": 0.6226, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.103450298309326, + "rewards/margins": 7.478123664855957, + "rewards/rejected": -4.374673366546631, + "step": 18750 + }, + { + "epoch": 4.69, + "grad_norm": 6.146058082580566, + "learning_rate": 9.37776844391819e-08, + "logits/chosen": -0.549139142036438, + "logits/rejected": -0.5855037569999695, + "logps/chosen": -43.44344711303711, + "logps/rejected": -99.47750854492188, + "loss": 0.5814, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1447722911834717, + "rewards/margins": 6.815946578979492, + "rewards/rejected": -3.6711742877960205, + "step": 18751 + }, + { + "epoch": 4.69, + "grad_norm": 6.382009029388428, + "learning_rate": 9.362623219477285e-08, + "logits/chosen": -0.5965842008590698, + "logits/rejected": -0.6372148394584656, + "logps/chosen": -52.50749206542969, + "logps/rejected": -106.22112274169922, + "loss": 0.6366, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0798537731170654, + "rewards/margins": 6.750340461730957, + "rewards/rejected": -3.67048716545105, + "step": 18752 + }, + { + "epoch": 4.69, + "grad_norm": 5.3052215576171875, + "learning_rate": 9.34749011920777e-08, + "logits/chosen": -0.4935157299041748, + "logits/rejected": -0.6344287395477295, + "logps/chosen": -55.850067138671875, + "logps/rejected": -104.683837890625, + "loss": 0.5361, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.941357374191284, + "rewards/margins": 7.545989990234375, + "rewards/rejected": -4.60463285446167, + "step": 18753 + }, + { + "epoch": 4.69, + "grad_norm": 19.908933639526367, + "learning_rate": 9.332369143483678e-08, + "logits/chosen": -0.4941115379333496, + "logits/rejected": -0.602021336555481, + "logps/chosen": -65.80633544921875, + "logps/rejected": -93.48992919921875, + "loss": 0.6345, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3993842601776123, + "rewards/margins": 6.498372554779053, + "rewards/rejected": -3.0989882946014404, + "step": 18754 + }, + { + "epoch": 4.69, + "grad_norm": 4.038658618927002, + "learning_rate": 9.317260292678653e-08, + "logits/chosen": -0.5892744064331055, + "logits/rejected": -0.6689128279685974, + "logps/chosen": -44.238285064697266, + "logps/rejected": -95.79275512695312, + "loss": 0.5758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.170832872390747, + "rewards/margins": 7.49664831161499, + "rewards/rejected": -4.3258161544799805, + "step": 18755 + }, + { + "epoch": 4.69, + "grad_norm": 6.401719093322754, + "learning_rate": 9.302163567166012e-08, + "logits/chosen": -0.5480465888977051, + "logits/rejected": -0.6241503953933716, + "logps/chosen": -59.747779846191406, + "logps/rejected": -110.8204345703125, + "loss": 0.6296, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9766204357147217, + "rewards/margins": 7.821478843688965, + "rewards/rejected": -4.844858169555664, + "step": 18756 + }, + { + "epoch": 4.69, + "grad_norm": 20.922386169433594, + "learning_rate": 9.287078967318841e-08, + "logits/chosen": -0.6198712587356567, + "logits/rejected": -0.7059129476547241, + "logps/chosen": -54.65874481201172, + "logps/rejected": -114.79360961914062, + "loss": 0.6063, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.243222713470459, + "rewards/margins": 7.522392272949219, + "rewards/rejected": -4.279169082641602, + "step": 18757 + }, + { + "epoch": 4.69, + "grad_norm": 7.881599426269531, + "learning_rate": 9.2720064935099e-08, + "logits/chosen": -0.5347288250923157, + "logits/rejected": -0.5775191783905029, + "logps/chosen": -63.704200744628906, + "logps/rejected": -112.93944549560547, + "loss": 0.8186, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7833430767059326, + "rewards/margins": 6.91811466217041, + "rewards/rejected": -4.134771347045898, + "step": 18758 + }, + { + "epoch": 4.69, + "grad_norm": 13.18081283569336, + "learning_rate": 9.256946146111667e-08, + "logits/chosen": -0.5577945113182068, + "logits/rejected": -0.6205999851226807, + "logps/chosen": -49.24221420288086, + "logps/rejected": -96.631103515625, + "loss": 0.5996, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168166399002075, + "rewards/margins": 6.599092960357666, + "rewards/rejected": -3.4309258460998535, + "step": 18759 + }, + { + "epoch": 4.69, + "grad_norm": 6.1477274894714355, + "learning_rate": 9.24189792549629e-08, + "logits/chosen": -0.4954672157764435, + "logits/rejected": -0.6020843386650085, + "logps/chosen": -55.62623596191406, + "logps/rejected": -113.67911529541016, + "loss": 0.5639, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8849070072174072, + "rewards/margins": 7.696946144104004, + "rewards/rejected": -4.812038898468018, + "step": 18760 + }, + { + "epoch": 4.69, + "grad_norm": 5.232219696044922, + "learning_rate": 9.226861832035528e-08, + "logits/chosen": -0.5504654049873352, + "logits/rejected": -0.6373328566551208, + "logps/chosen": -44.62886047363281, + "logps/rejected": -107.99726867675781, + "loss": 0.5224, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.214266061782837, + "rewards/margins": 7.668560028076172, + "rewards/rejected": -4.454293727874756, + "step": 18761 + }, + { + "epoch": 4.69, + "grad_norm": 3.9144816398620605, + "learning_rate": 9.211837866101081e-08, + "logits/chosen": -0.6149007081985474, + "logits/rejected": -0.7180365920066833, + "logps/chosen": -46.6869010925293, + "logps/rejected": -103.76634216308594, + "loss": 0.5528, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.442929267883301, + "rewards/margins": 7.814472198486328, + "rewards/rejected": -4.3715434074401855, + "step": 18762 + }, + { + "epoch": 4.69, + "grad_norm": 3.1528961658477783, + "learning_rate": 9.196826028064099e-08, + "logits/chosen": -0.6249493360519409, + "logits/rejected": -0.7184223532676697, + "logps/chosen": -53.28425598144531, + "logps/rejected": -104.92874908447266, + "loss": 0.5688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2238287925720215, + "rewards/margins": 7.6376953125, + "rewards/rejected": -4.413866996765137, + "step": 18763 + }, + { + "epoch": 4.69, + "grad_norm": 7.672727108001709, + "learning_rate": 9.181826318295672e-08, + "logits/chosen": -0.5900763869285583, + "logits/rejected": -0.6258874535560608, + "logps/chosen": -48.07616424560547, + "logps/rejected": -108.47818756103516, + "loss": 0.6094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0359838008880615, + "rewards/margins": 6.654753684997559, + "rewards/rejected": -3.618769645690918, + "step": 18764 + }, + { + "epoch": 4.69, + "grad_norm": 4.443475246429443, + "learning_rate": 9.166838737166284e-08, + "logits/chosen": -0.5686531066894531, + "logits/rejected": -0.5975379943847656, + "logps/chosen": -55.42850112915039, + "logps/rejected": -125.17526245117188, + "loss": 0.6312, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.838798761367798, + "rewards/margins": 7.400362968444824, + "rewards/rejected": -4.561563968658447, + "step": 18765 + }, + { + "epoch": 4.69, + "grad_norm": 3.8917298316955566, + "learning_rate": 9.151863285046413e-08, + "logits/chosen": -0.5945732593536377, + "logits/rejected": -0.663489580154419, + "logps/chosen": -51.3277587890625, + "logps/rejected": -121.90259552001953, + "loss": 0.5759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.339815855026245, + "rewards/margins": 9.158443450927734, + "rewards/rejected": -5.818627834320068, + "step": 18766 + }, + { + "epoch": 4.69, + "grad_norm": 5.334495544433594, + "learning_rate": 9.136899962306101e-08, + "logits/chosen": -0.611173152923584, + "logits/rejected": -0.6849491596221924, + "logps/chosen": -50.10693359375, + "logps/rejected": -98.32376861572266, + "loss": 0.6281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2005562782287598, + "rewards/margins": 6.9135541915893555, + "rewards/rejected": -3.7129979133605957, + "step": 18767 + }, + { + "epoch": 4.7, + "grad_norm": 2.7765986919403076, + "learning_rate": 9.121948769315103e-08, + "logits/chosen": -0.5245509147644043, + "logits/rejected": -0.5315614938735962, + "logps/chosen": -54.462032318115234, + "logps/rejected": -153.09078979492188, + "loss": 0.5944, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0853044986724854, + "rewards/margins": 9.224130630493164, + "rewards/rejected": -6.138826847076416, + "step": 18768 + }, + { + "epoch": 4.7, + "grad_norm": 2.5473408699035645, + "learning_rate": 9.107009706442849e-08, + "logits/chosen": -0.5334625840187073, + "logits/rejected": -0.6228711009025574, + "logps/chosen": -50.9849853515625, + "logps/rejected": -105.6722412109375, + "loss": 0.5029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.526643753051758, + "rewards/margins": 7.397489547729492, + "rewards/rejected": -3.870845317840576, + "step": 18769 + }, + { + "epoch": 4.7, + "grad_norm": 4.8731369972229, + "learning_rate": 9.092082774058542e-08, + "logits/chosen": -0.5615723729133606, + "logits/rejected": -0.6263074278831482, + "logps/chosen": -47.64625930786133, + "logps/rejected": -113.84317779541016, + "loss": 0.5766, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1347908973693848, + "rewards/margins": 7.289506912231445, + "rewards/rejected": -4.154715538024902, + "step": 18770 + }, + { + "epoch": 4.7, + "grad_norm": 11.378143310546875, + "learning_rate": 9.077167972531054e-08, + "logits/chosen": -0.48424771428108215, + "logits/rejected": -0.5504130721092224, + "logps/chosen": -50.409873962402344, + "logps/rejected": -123.49880981445312, + "loss": 0.6021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6366798877716064, + "rewards/margins": 8.203178405761719, + "rewards/rejected": -5.566498756408691, + "step": 18771 + }, + { + "epoch": 4.7, + "grad_norm": 5.6552228927612305, + "learning_rate": 9.062265302228868e-08, + "logits/chosen": -0.5095757246017456, + "logits/rejected": -0.5941342711448669, + "logps/chosen": -65.51666259765625, + "logps/rejected": -114.66502380371094, + "loss": 0.6672, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3149771690368652, + "rewards/margins": 7.7269062995910645, + "rewards/rejected": -4.411928653717041, + "step": 18772 + }, + { + "epoch": 4.7, + "grad_norm": 4.894288063049316, + "learning_rate": 9.047374763520245e-08, + "logits/chosen": -0.5445798635482788, + "logits/rejected": -0.6571922898292542, + "logps/chosen": -54.87615203857422, + "logps/rejected": -107.6002197265625, + "loss": 0.6545, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9775500297546387, + "rewards/margins": 7.745471000671387, + "rewards/rejected": -4.767921447753906, + "step": 18773 + }, + { + "epoch": 4.7, + "grad_norm": 2.5798258781433105, + "learning_rate": 9.032496356773279e-08, + "logits/chosen": -0.5705332159996033, + "logits/rejected": -0.6413111686706543, + "logps/chosen": -46.42045593261719, + "logps/rejected": -116.15726470947266, + "loss": 0.5503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0292348861694336, + "rewards/margins": 8.091111183166504, + "rewards/rejected": -5.06187629699707, + "step": 18774 + }, + { + "epoch": 4.7, + "grad_norm": 4.5734686851501465, + "learning_rate": 9.01763008235551e-08, + "logits/chosen": -0.5442709922790527, + "logits/rejected": -0.5927560329437256, + "logps/chosen": -53.656551361083984, + "logps/rejected": -128.7969207763672, + "loss": 0.6858, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.130556106567383, + "rewards/margins": 7.5239033699035645, + "rewards/rejected": -4.393346786499023, + "step": 18775 + }, + { + "epoch": 4.7, + "grad_norm": 2.935004472732544, + "learning_rate": 9.002775940634312e-08, + "logits/chosen": -0.5093223452568054, + "logits/rejected": -0.5537114143371582, + "logps/chosen": -57.34294128417969, + "logps/rejected": -101.19176483154297, + "loss": 0.5613, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2985024452209473, + "rewards/margins": 5.919231414794922, + "rewards/rejected": -2.620729446411133, + "step": 18776 + }, + { + "epoch": 4.7, + "grad_norm": 5.963072299957275, + "learning_rate": 8.987933931976834e-08, + "logits/chosen": -0.5135269165039062, + "logits/rejected": -0.616825520992279, + "logps/chosen": -56.69740295410156, + "logps/rejected": -93.26953125, + "loss": 0.6637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.023648500442505, + "rewards/margins": 7.085470199584961, + "rewards/rejected": -4.061821937561035, + "step": 18777 + }, + { + "epoch": 4.7, + "grad_norm": 6.562585830688477, + "learning_rate": 8.973104056749726e-08, + "logits/chosen": -0.5489745140075684, + "logits/rejected": -0.6607360243797302, + "logps/chosen": -52.19654083251953, + "logps/rejected": -91.7376708984375, + "loss": 0.63, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.287113904953003, + "rewards/margins": 6.438414573669434, + "rewards/rejected": -3.1513006687164307, + "step": 18778 + }, + { + "epoch": 4.7, + "grad_norm": 4.626565456390381, + "learning_rate": 8.958286315319476e-08, + "logits/chosen": -0.4905931353569031, + "logits/rejected": -0.5799367427825928, + "logps/chosen": -56.229736328125, + "logps/rejected": -121.58812713623047, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1566412448883057, + "rewards/margins": 8.39955997467041, + "rewards/rejected": -5.242918491363525, + "step": 18779 + }, + { + "epoch": 4.7, + "grad_norm": 2.0485734939575195, + "learning_rate": 8.943480708052288e-08, + "logits/chosen": -0.57503741979599, + "logits/rejected": -0.6597131490707397, + "logps/chosen": -55.912052154541016, + "logps/rejected": -113.59245300292969, + "loss": 0.5856, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270782709121704, + "rewards/margins": 7.948273658752441, + "rewards/rejected": -4.677490234375, + "step": 18780 + }, + { + "epoch": 4.7, + "grad_norm": 4.794774532318115, + "learning_rate": 8.928687235313982e-08, + "logits/chosen": -0.473718523979187, + "logits/rejected": -0.5740834474563599, + "logps/chosen": -62.58806610107422, + "logps/rejected": -109.38240051269531, + "loss": 0.6214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2528676986694336, + "rewards/margins": 6.285844326019287, + "rewards/rejected": -3.0329768657684326, + "step": 18781 + }, + { + "epoch": 4.7, + "grad_norm": 6.7158660888671875, + "learning_rate": 8.91390589747021e-08, + "logits/chosen": -0.5550864934921265, + "logits/rejected": -0.6219508647918701, + "logps/chosen": -57.972938537597656, + "logps/rejected": -100.70790100097656, + "loss": 0.6574, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4709415435791016, + "rewards/margins": 7.4322943687438965, + "rewards/rejected": -3.9613535404205322, + "step": 18782 + }, + { + "epoch": 4.7, + "grad_norm": 2.297152519226074, + "learning_rate": 8.899136694886123e-08, + "logits/chosen": -0.4424399137496948, + "logits/rejected": -0.5601630806922913, + "logps/chosen": -56.540462493896484, + "logps/rejected": -125.94499206542969, + "loss": 0.5384, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1503238677978516, + "rewards/margins": 8.325215339660645, + "rewards/rejected": -5.174891471862793, + "step": 18783 + }, + { + "epoch": 4.7, + "grad_norm": 4.768500804901123, + "learning_rate": 8.884379627926709e-08, + "logits/chosen": -0.47144025564193726, + "logits/rejected": -0.5450149774551392, + "logps/chosen": -46.31512451171875, + "logps/rejected": -125.06558227539062, + "loss": 0.611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.221428871154785, + "rewards/margins": 7.335529327392578, + "rewards/rejected": -4.114100456237793, + "step": 18784 + }, + { + "epoch": 4.7, + "grad_norm": 5.950331211090088, + "learning_rate": 8.869634696956675e-08, + "logits/chosen": -0.6184154152870178, + "logits/rejected": -0.7033669352531433, + "logps/chosen": -51.485076904296875, + "logps/rejected": -113.13360595703125, + "loss": 0.6937, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8303520679473877, + "rewards/margins": 7.374462604522705, + "rewards/rejected": -4.544111251831055, + "step": 18785 + }, + { + "epoch": 4.7, + "grad_norm": 2.8920676708221436, + "learning_rate": 8.85490190234034e-08, + "logits/chosen": -0.5413271188735962, + "logits/rejected": -0.6449591517448425, + "logps/chosen": -56.645442962646484, + "logps/rejected": -85.44694519042969, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.151191234588623, + "rewards/margins": 6.8151535987854, + "rewards/rejected": -3.6639623641967773, + "step": 18786 + }, + { + "epoch": 4.7, + "grad_norm": 6.198523998260498, + "learning_rate": 8.840181244441804e-08, + "logits/chosen": -0.5595927238464355, + "logits/rejected": -0.6241705417633057, + "logps/chosen": -55.49368667602539, + "logps/rejected": -120.91705322265625, + "loss": 0.6537, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1070945262908936, + "rewards/margins": 7.776864051818848, + "rewards/rejected": -4.669769287109375, + "step": 18787 + }, + { + "epoch": 4.7, + "grad_norm": 2.4220974445343018, + "learning_rate": 8.825472723624773e-08, + "logits/chosen": -0.6081855297088623, + "logits/rejected": -0.663673460483551, + "logps/chosen": -49.24668884277344, + "logps/rejected": -106.83702087402344, + "loss": 0.5869, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5052545070648193, + "rewards/margins": 7.938544273376465, + "rewards/rejected": -4.433289051055908, + "step": 18788 + }, + { + "epoch": 4.7, + "grad_norm": 5.6369757652282715, + "learning_rate": 8.810776340252791e-08, + "logits/chosen": -0.5174912214279175, + "logits/rejected": -0.5774469375610352, + "logps/chosen": -52.674072265625, + "logps/rejected": -103.70370483398438, + "loss": 0.7081, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.298612356185913, + "rewards/margins": 6.654049396514893, + "rewards/rejected": -3.3554368019104004, + "step": 18789 + }, + { + "epoch": 4.7, + "grad_norm": 4.093749046325684, + "learning_rate": 8.796092094688957e-08, + "logits/chosen": -0.5376954078674316, + "logits/rejected": -0.5927346348762512, + "logps/chosen": -51.379417419433594, + "logps/rejected": -112.97744750976562, + "loss": 0.5031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.286773681640625, + "rewards/margins": 7.171006679534912, + "rewards/rejected": -3.884232759475708, + "step": 18790 + }, + { + "epoch": 4.7, + "grad_norm": 7.60629940032959, + "learning_rate": 8.781419987296092e-08, + "logits/chosen": -0.49802887439727783, + "logits/rejected": -0.5655615925788879, + "logps/chosen": -60.178314208984375, + "logps/rejected": -110.32913208007812, + "loss": 0.6407, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9884510040283203, + "rewards/margins": 5.721733093261719, + "rewards/rejected": -2.7332816123962402, + "step": 18791 + }, + { + "epoch": 4.7, + "grad_norm": 4.810091972351074, + "learning_rate": 8.766760018436904e-08, + "logits/chosen": -0.5475307106971741, + "logits/rejected": -0.579247772693634, + "logps/chosen": -60.210201263427734, + "logps/rejected": -102.88605499267578, + "loss": 0.6787, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9557883739471436, + "rewards/margins": 5.858510971069336, + "rewards/rejected": -2.9027223587036133, + "step": 18792 + }, + { + "epoch": 4.7, + "grad_norm": 10.00964069366455, + "learning_rate": 8.752112188473605e-08, + "logits/chosen": -0.5223243236541748, + "logits/rejected": -0.6407990455627441, + "logps/chosen": -59.22821807861328, + "logps/rejected": -122.29425048828125, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0555479526519775, + "rewards/margins": 8.114367485046387, + "rewards/rejected": -5.058818817138672, + "step": 18793 + }, + { + "epoch": 4.7, + "grad_norm": 7.2138991355896, + "learning_rate": 8.737476497768016e-08, + "logits/chosen": -0.5509856343269348, + "logits/rejected": -0.6360633373260498, + "logps/chosen": -55.97734069824219, + "logps/rejected": -115.15618133544922, + "loss": 0.62, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.955019235610962, + "rewards/margins": 7.876471996307373, + "rewards/rejected": -4.92145299911499, + "step": 18794 + }, + { + "epoch": 4.7, + "grad_norm": 4.450366020202637, + "learning_rate": 8.722852946682014e-08, + "logits/chosen": -0.5753543376922607, + "logits/rejected": -0.6712681651115417, + "logps/chosen": -55.170108795166016, + "logps/rejected": -106.49417877197266, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2971489429473877, + "rewards/margins": 6.990756034851074, + "rewards/rejected": -3.6936070919036865, + "step": 18795 + }, + { + "epoch": 4.7, + "grad_norm": 3.8546016216278076, + "learning_rate": 8.708241535576867e-08, + "logits/chosen": -0.6047458648681641, + "logits/rejected": -0.7071646451950073, + "logps/chosen": -40.52671813964844, + "logps/rejected": -101.25297546386719, + "loss": 0.5864, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.444119453430176, + "rewards/margins": 8.212285041809082, + "rewards/rejected": -4.768165588378906, + "step": 18796 + }, + { + "epoch": 4.7, + "grad_norm": 2.851280450820923, + "learning_rate": 8.693642264813562e-08, + "logits/chosen": -0.5668781399726868, + "logits/rejected": -0.6061527729034424, + "logps/chosen": -51.580726623535156, + "logps/rejected": -128.54624938964844, + "loss": 0.5231, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1385674476623535, + "rewards/margins": 8.216090202331543, + "rewards/rejected": -5.077523231506348, + "step": 18797 + }, + { + "epoch": 4.7, + "grad_norm": 2.0573740005493164, + "learning_rate": 8.679055134752978e-08, + "logits/chosen": -0.5177897214889526, + "logits/rejected": -0.613520085811615, + "logps/chosen": -51.8525276184082, + "logps/rejected": -99.94141387939453, + "loss": 0.5671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0949039459228516, + "rewards/margins": 8.14848804473877, + "rewards/rejected": -5.05358362197876, + "step": 18798 + }, + { + "epoch": 4.7, + "grad_norm": 5.996585369110107, + "learning_rate": 8.664480145755494e-08, + "logits/chosen": -0.5238360166549683, + "logits/rejected": -0.6656380891799927, + "logps/chosen": -54.26908874511719, + "logps/rejected": -100.84051513671875, + "loss": 0.5733, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0730934143066406, + "rewards/margins": 8.073163986206055, + "rewards/rejected": -5.000070571899414, + "step": 18799 + }, + { + "epoch": 4.7, + "grad_norm": 2.510711193084717, + "learning_rate": 8.64991729818132e-08, + "logits/chosen": -0.6530846953392029, + "logits/rejected": -0.7071077823638916, + "logps/chosen": -49.0226936340332, + "logps/rejected": -137.79498291015625, + "loss": 0.5356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1704039573669434, + "rewards/margins": 10.2861328125, + "rewards/rejected": -7.115728855133057, + "step": 18800 + }, + { + "epoch": 4.7, + "grad_norm": 3.3699662685394287, + "learning_rate": 8.635366592390338e-08, + "logits/chosen": -0.565960168838501, + "logits/rejected": -0.6438324451446533, + "logps/chosen": -50.101226806640625, + "logps/rejected": -106.27379608154297, + "loss": 0.5507, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3851125240325928, + "rewards/margins": 6.978147506713867, + "rewards/rejected": -3.5930349826812744, + "step": 18801 + }, + { + "epoch": 4.7, + "grad_norm": 5.542975902557373, + "learning_rate": 8.62082802874209e-08, + "logits/chosen": -0.5714527368545532, + "logits/rejected": -0.6608285903930664, + "logps/chosen": -56.68671798706055, + "logps/rejected": -109.22972106933594, + "loss": 0.602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.910771369934082, + "rewards/margins": 6.953718662261963, + "rewards/rejected": -4.042946815490723, + "step": 18802 + }, + { + "epoch": 4.7, + "grad_norm": 2.559195041656494, + "learning_rate": 8.60630160759579e-08, + "logits/chosen": -0.5349613428115845, + "logits/rejected": -0.6138164401054382, + "logps/chosen": -46.61425018310547, + "logps/rejected": -111.26827239990234, + "loss": 0.5029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4840662479400635, + "rewards/margins": 8.016973495483398, + "rewards/rejected": -4.532907962799072, + "step": 18803 + }, + { + "epoch": 4.7, + "grad_norm": 6.5650410652160645, + "learning_rate": 8.591787329310486e-08, + "logits/chosen": -0.5596973299980164, + "logits/rejected": -0.6224113702774048, + "logps/chosen": -39.699134826660156, + "logps/rejected": -112.61647033691406, + "loss": 0.6508, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.904752492904663, + "rewards/margins": 6.78836727142334, + "rewards/rejected": -3.8836143016815186, + "step": 18804 + }, + { + "epoch": 4.7, + "grad_norm": 5.17419958114624, + "learning_rate": 8.577285194244778e-08, + "logits/chosen": -0.4680280387401581, + "logits/rejected": -0.5349308252334595, + "logps/chosen": -62.22802734375, + "logps/rejected": -103.70962524414062, + "loss": 0.6683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.939472198486328, + "rewards/margins": 6.568258762359619, + "rewards/rejected": -3.628786563873291, + "step": 18805 + }, + { + "epoch": 4.7, + "grad_norm": 2.461669921875, + "learning_rate": 8.562795202757046e-08, + "logits/chosen": -0.5275634527206421, + "logits/rejected": -0.5759419202804565, + "logps/chosen": -43.13284683227539, + "logps/rejected": -109.76065826416016, + "loss": 0.5061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5936591625213623, + "rewards/margins": 7.291231155395508, + "rewards/rejected": -3.6975717544555664, + "step": 18806 + }, + { + "epoch": 4.7, + "grad_norm": 10.106860160827637, + "learning_rate": 8.548317355205448e-08, + "logits/chosen": -0.5929144024848938, + "logits/rejected": -0.6776216626167297, + "logps/chosen": -59.184410095214844, + "logps/rejected": -98.21556091308594, + "loss": 0.6482, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.895427703857422, + "rewards/margins": 7.064605236053467, + "rewards/rejected": -4.169177532196045, + "step": 18807 + }, + { + "epoch": 4.71, + "grad_norm": 5.960072994232178, + "learning_rate": 8.53385165194759e-08, + "logits/chosen": -0.5187132358551025, + "logits/rejected": -0.635021448135376, + "logps/chosen": -49.261566162109375, + "logps/rejected": -104.24380493164062, + "loss": 0.6177, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9417834281921387, + "rewards/margins": 8.539717674255371, + "rewards/rejected": -5.597933292388916, + "step": 18808 + }, + { + "epoch": 4.71, + "grad_norm": 7.887542247772217, + "learning_rate": 8.519398093341013e-08, + "logits/chosen": -0.6223607063293457, + "logits/rejected": -0.6632320880889893, + "logps/chosen": -56.012882232666016, + "logps/rejected": -118.34097290039062, + "loss": 0.6951, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1712288856506348, + "rewards/margins": 7.430688858032227, + "rewards/rejected": -4.25946044921875, + "step": 18809 + }, + { + "epoch": 4.71, + "grad_norm": 5.028944969177246, + "learning_rate": 8.504956679742882e-08, + "logits/chosen": -0.5631840229034424, + "logits/rejected": -0.6135818958282471, + "logps/chosen": -77.37492370605469, + "logps/rejected": -103.03700256347656, + "loss": 0.729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9048478603363037, + "rewards/margins": 6.36203670501709, + "rewards/rejected": -3.4571893215179443, + "step": 18810 + }, + { + "epoch": 4.71, + "grad_norm": 6.846827983856201, + "learning_rate": 8.490527411510019e-08, + "logits/chosen": -0.5426483154296875, + "logits/rejected": -0.615634560585022, + "logps/chosen": -59.06843948364258, + "logps/rejected": -119.70172882080078, + "loss": 0.6884, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3474714756011963, + "rewards/margins": 7.681214809417725, + "rewards/rejected": -4.333743572235107, + "step": 18811 + }, + { + "epoch": 4.71, + "grad_norm": 3.15364408493042, + "learning_rate": 8.476110288999029e-08, + "logits/chosen": -0.48351746797561646, + "logits/rejected": -0.5716223120689392, + "logps/chosen": -60.169586181640625, + "logps/rejected": -106.34019470214844, + "loss": 0.6205, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6818346977233887, + "rewards/margins": 7.304817199707031, + "rewards/rejected": -4.622982025146484, + "step": 18812 + }, + { + "epoch": 4.71, + "grad_norm": 3.4107487201690674, + "learning_rate": 8.461705312566126e-08, + "logits/chosen": -0.5596585273742676, + "logits/rejected": -0.6501238942146301, + "logps/chosen": -60.252174377441406, + "logps/rejected": -109.44269561767578, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8590986728668213, + "rewards/margins": 7.688612461090088, + "rewards/rejected": -4.829514503479004, + "step": 18813 + }, + { + "epoch": 4.71, + "grad_norm": 7.3984575271606445, + "learning_rate": 8.447312482567416e-08, + "logits/chosen": -0.5989034175872803, + "logits/rejected": -0.6557157635688782, + "logps/chosen": -62.43812561035156, + "logps/rejected": -105.2406997680664, + "loss": 0.6317, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2393546104431152, + "rewards/margins": 6.7599263191223145, + "rewards/rejected": -3.5205724239349365, + "step": 18814 + }, + { + "epoch": 4.71, + "grad_norm": 4.517462253570557, + "learning_rate": 8.432931799358334e-08, + "logits/chosen": -0.499315470457077, + "logits/rejected": -0.554349958896637, + "logps/chosen": -63.11876678466797, + "logps/rejected": -118.59767150878906, + "loss": 0.7009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.340394973754883, + "rewards/margins": 7.132352828979492, + "rewards/rejected": -3.7919576168060303, + "step": 18815 + }, + { + "epoch": 4.71, + "grad_norm": 9.360644340515137, + "learning_rate": 8.418563263294432e-08, + "logits/chosen": -0.48115241527557373, + "logits/rejected": -0.5782797336578369, + "logps/chosen": -55.18695068359375, + "logps/rejected": -93.96437072753906, + "loss": 0.7917, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.007861375808716, + "rewards/margins": 5.960735321044922, + "rewards/rejected": -2.952873706817627, + "step": 18816 + }, + { + "epoch": 4.71, + "grad_norm": 2.1960179805755615, + "learning_rate": 8.404206874730702e-08, + "logits/chosen": -0.5962327718734741, + "logits/rejected": -0.6597325801849365, + "logps/chosen": -40.01776123046875, + "logps/rejected": -113.34609985351562, + "loss": 0.5575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3203415870666504, + "rewards/margins": 8.4584379196167, + "rewards/rejected": -5.138096332550049, + "step": 18817 + }, + { + "epoch": 4.71, + "grad_norm": 2.953129291534424, + "learning_rate": 8.389862634021861e-08, + "logits/chosen": -0.576373815536499, + "logits/rejected": -0.677562415599823, + "logps/chosen": -56.8009147644043, + "logps/rejected": -103.9442138671875, + "loss": 0.5584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9528145790100098, + "rewards/margins": 6.32868766784668, + "rewards/rejected": -3.375873565673828, + "step": 18818 + }, + { + "epoch": 4.71, + "grad_norm": 4.123373031616211, + "learning_rate": 8.375530541522458e-08, + "logits/chosen": -0.5567681789398193, + "logits/rejected": -0.6442400217056274, + "logps/chosen": -54.9615592956543, + "logps/rejected": -106.85990142822266, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.314591884613037, + "rewards/margins": 7.890336513519287, + "rewards/rejected": -4.575744152069092, + "step": 18819 + }, + { + "epoch": 4.71, + "grad_norm": 4.467792987823486, + "learning_rate": 8.3612105975866e-08, + "logits/chosen": -0.5200186371803284, + "logits/rejected": -0.6210191249847412, + "logps/chosen": -54.52962875366211, + "logps/rejected": -96.33055877685547, + "loss": 0.5976, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.429804801940918, + "rewards/margins": 6.455733299255371, + "rewards/rejected": -3.025928497314453, + "step": 18820 + }, + { + "epoch": 4.71, + "grad_norm": 4.349241256713867, + "learning_rate": 8.346902802568168e-08, + "logits/chosen": -0.4980432093143463, + "logits/rejected": -0.5676711797714233, + "logps/chosen": -52.98095703125, + "logps/rejected": -110.97054290771484, + "loss": 0.6098, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087674856185913, + "rewards/margins": 6.479880332946777, + "rewards/rejected": -3.3922054767608643, + "step": 18821 + }, + { + "epoch": 4.71, + "grad_norm": 7.055597305297852, + "learning_rate": 8.33260715682066e-08, + "logits/chosen": -0.556277334690094, + "logits/rejected": -0.6405676007270813, + "logps/chosen": -49.085121154785156, + "logps/rejected": -113.60331726074219, + "loss": 0.6203, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9410524368286133, + "rewards/margins": 8.368274688720703, + "rewards/rejected": -5.42722225189209, + "step": 18822 + }, + { + "epoch": 4.71, + "grad_norm": 3.90505313873291, + "learning_rate": 8.318323660697402e-08, + "logits/chosen": -0.5815154910087585, + "logits/rejected": -0.6194931864738464, + "logps/chosen": -49.143184661865234, + "logps/rejected": -123.37258911132812, + "loss": 0.5451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0528388023376465, + "rewards/margins": 8.00092887878418, + "rewards/rejected": -4.948090553283691, + "step": 18823 + }, + { + "epoch": 4.71, + "grad_norm": 4.746835231781006, + "learning_rate": 8.304052314551392e-08, + "logits/chosen": -0.5551438331604004, + "logits/rejected": -0.6023037433624268, + "logps/chosen": -50.179908752441406, + "logps/rejected": -107.30937957763672, + "loss": 0.5626, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158689498901367, + "rewards/margins": 6.874164581298828, + "rewards/rejected": -3.7154743671417236, + "step": 18824 + }, + { + "epoch": 4.71, + "grad_norm": 6.618707656860352, + "learning_rate": 8.289793118735178e-08, + "logits/chosen": -0.5257172584533691, + "logits/rejected": -0.658929705619812, + "logps/chosen": -56.43553161621094, + "logps/rejected": -95.90180206298828, + "loss": 0.5589, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.222707748413086, + "rewards/margins": 7.568421840667725, + "rewards/rejected": -4.3457136154174805, + "step": 18825 + }, + { + "epoch": 4.71, + "grad_norm": 5.225081443786621, + "learning_rate": 8.27554607360126e-08, + "logits/chosen": -0.622943639755249, + "logits/rejected": -0.6798496246337891, + "logps/chosen": -52.376399993896484, + "logps/rejected": -98.4322509765625, + "loss": 0.5454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1496050357818604, + "rewards/margins": 6.274548053741455, + "rewards/rejected": -3.124943256378174, + "step": 18826 + }, + { + "epoch": 4.71, + "grad_norm": 3.4672775268554688, + "learning_rate": 8.261311179501575e-08, + "logits/chosen": -0.5371494293212891, + "logits/rejected": -0.5971878170967102, + "logps/chosen": -60.81739044189453, + "logps/rejected": -120.00747680664062, + "loss": 0.5916, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.526880979537964, + "rewards/margins": 7.91585111618042, + "rewards/rejected": -4.388969898223877, + "step": 18827 + }, + { + "epoch": 4.71, + "grad_norm": 5.9579572677612305, + "learning_rate": 8.247088436787898e-08, + "logits/chosen": -0.596362829208374, + "logits/rejected": -0.703774094581604, + "logps/chosen": -47.245933532714844, + "logps/rejected": -98.09345245361328, + "loss": 0.5917, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3157248497009277, + "rewards/margins": 7.260557651519775, + "rewards/rejected": -3.9448330402374268, + "step": 18828 + }, + { + "epoch": 4.71, + "grad_norm": 2.6882734298706055, + "learning_rate": 8.232877845811782e-08, + "logits/chosen": -0.4538274109363556, + "logits/rejected": -0.5900684595108032, + "logps/chosen": -60.35540008544922, + "logps/rejected": -129.4158477783203, + "loss": 0.5118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2715108394622803, + "rewards/margins": 9.332633972167969, + "rewards/rejected": -6.061122894287109, + "step": 18829 + }, + { + "epoch": 4.71, + "grad_norm": 4.08109712600708, + "learning_rate": 8.218679406924279e-08, + "logits/chosen": -0.5991572737693787, + "logits/rejected": -0.6525405049324036, + "logps/chosen": -43.30695343017578, + "logps/rejected": -118.00067138671875, + "loss": 0.6197, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4447288513183594, + "rewards/margins": 8.390192031860352, + "rewards/rejected": -4.945462703704834, + "step": 18830 + }, + { + "epoch": 4.71, + "grad_norm": 5.134457111358643, + "learning_rate": 8.204493120476276e-08, + "logits/chosen": -0.5498170852661133, + "logits/rejected": -0.6145240664482117, + "logps/chosen": -55.144866943359375, + "logps/rejected": -126.10446166992188, + "loss": 0.5668, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9175453186035156, + "rewards/margins": 7.942056179046631, + "rewards/rejected": -5.024511814117432, + "step": 18831 + }, + { + "epoch": 4.71, + "grad_norm": 37.84591293334961, + "learning_rate": 8.190318986818435e-08, + "logits/chosen": -0.5276885032653809, + "logits/rejected": -0.5870017409324646, + "logps/chosen": -48.68500900268555, + "logps/rejected": -123.28748321533203, + "loss": 0.6273, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0441603660583496, + "rewards/margins": 8.814615249633789, + "rewards/rejected": -5.7704548835754395, + "step": 18832 + }, + { + "epoch": 4.71, + "grad_norm": 6.024328231811523, + "learning_rate": 8.176157006300811e-08, + "logits/chosen": -0.5149308443069458, + "logits/rejected": -0.6364915370941162, + "logps/chosen": -58.91751480102539, + "logps/rejected": -89.47217559814453, + "loss": 0.6051, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.254344940185547, + "rewards/margins": 7.685904502868652, + "rewards/rejected": -4.4315595626831055, + "step": 18833 + }, + { + "epoch": 4.71, + "grad_norm": 4.8293585777282715, + "learning_rate": 8.162007179273512e-08, + "logits/chosen": -0.5643450617790222, + "logits/rejected": -0.586821436882019, + "logps/chosen": -53.66107940673828, + "logps/rejected": -109.59072875976562, + "loss": 0.6432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9415628910064697, + "rewards/margins": 6.417750835418701, + "rewards/rejected": -3.4761877059936523, + "step": 18834 + }, + { + "epoch": 4.71, + "grad_norm": 6.690158843994141, + "learning_rate": 8.147869506086148e-08, + "logits/chosen": -0.5678549408912659, + "logits/rejected": -0.598893404006958, + "logps/chosen": -52.03937530517578, + "logps/rejected": -113.32174682617188, + "loss": 0.6313, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.240196704864502, + "rewards/margins": 6.31862735748291, + "rewards/rejected": -3.078429698944092, + "step": 18835 + }, + { + "epoch": 4.71, + "grad_norm": 4.997262954711914, + "learning_rate": 8.133743987088161e-08, + "logits/chosen": -0.48070061206817627, + "logits/rejected": -0.5685430765151978, + "logps/chosen": -50.9497184753418, + "logps/rejected": -110.2478256225586, + "loss": 0.6198, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.054347515106201, + "rewards/margins": 7.701529026031494, + "rewards/rejected": -4.647181510925293, + "step": 18836 + }, + { + "epoch": 4.71, + "grad_norm": 10.631492614746094, + "learning_rate": 8.119630622628438e-08, + "logits/chosen": -0.5664792060852051, + "logits/rejected": -0.6693461537361145, + "logps/chosen": -53.298744201660156, + "logps/rejected": -108.39407348632812, + "loss": 0.6518, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.992978096008301, + "rewards/margins": 7.053606986999512, + "rewards/rejected": -4.060628890991211, + "step": 18837 + }, + { + "epoch": 4.71, + "grad_norm": 4.082616806030273, + "learning_rate": 8.105529413055868e-08, + "logits/chosen": -0.5944790244102478, + "logits/rejected": -0.6841237545013428, + "logps/chosen": -47.82856750488281, + "logps/rejected": -125.03128051757812, + "loss": 0.5589, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.219477653503418, + "rewards/margins": 7.781932830810547, + "rewards/rejected": -4.562455177307129, + "step": 18838 + }, + { + "epoch": 4.71, + "grad_norm": 6.913477420806885, + "learning_rate": 8.091440358718894e-08, + "logits/chosen": -0.4994698464870453, + "logits/rejected": -0.5657231211662292, + "logps/chosen": -61.807472229003906, + "logps/rejected": -131.6969451904297, + "loss": 0.6141, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3896942138671875, + "rewards/margins": 8.25014877319336, + "rewards/rejected": -4.860454559326172, + "step": 18839 + }, + { + "epoch": 4.71, + "grad_norm": 4.965065956115723, + "learning_rate": 8.07736345996557e-08, + "logits/chosen": -0.4901421368122101, + "logits/rejected": -0.5908379554748535, + "logps/chosen": -63.511322021484375, + "logps/rejected": -85.76286315917969, + "loss": 0.6428, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.744450569152832, + "rewards/margins": 5.6505961418151855, + "rewards/rejected": -2.9061455726623535, + "step": 18840 + }, + { + "epoch": 4.71, + "grad_norm": 27.565141677856445, + "learning_rate": 8.063298717143898e-08, + "logits/chosen": -0.5971640348434448, + "logits/rejected": -0.6557357907295227, + "logps/chosen": -49.63343048095703, + "logps/rejected": -112.8235855102539, + "loss": 0.5735, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1260342597961426, + "rewards/margins": 6.980981349945068, + "rewards/rejected": -3.854947090148926, + "step": 18841 + }, + { + "epoch": 4.71, + "grad_norm": 3.481602191925049, + "learning_rate": 8.049246130601374e-08, + "logits/chosen": -0.5343194007873535, + "logits/rejected": -0.6363027095794678, + "logps/chosen": -54.596405029296875, + "logps/rejected": -113.71757507324219, + "loss": 0.5625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0000128746032715, + "rewards/margins": 6.710710525512695, + "rewards/rejected": -3.710697650909424, + "step": 18842 + }, + { + "epoch": 4.71, + "grad_norm": 3.0846288204193115, + "learning_rate": 8.035205700685167e-08, + "logits/chosen": -0.5796704888343811, + "logits/rejected": -0.63871169090271, + "logps/chosen": -60.806190490722656, + "logps/rejected": -99.01345825195312, + "loss": 0.566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.109865427017212, + "rewards/margins": 7.168168544769287, + "rewards/rejected": -4.0583038330078125, + "step": 18843 + }, + { + "epoch": 4.71, + "grad_norm": 3.801701545715332, + "learning_rate": 8.021177427742388e-08, + "logits/chosen": -0.5034245848655701, + "logits/rejected": -0.5768340229988098, + "logps/chosen": -58.61225128173828, + "logps/rejected": -101.54646301269531, + "loss": 0.6195, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.070554733276367, + "rewards/margins": 6.230906009674072, + "rewards/rejected": -3.160351037979126, + "step": 18844 + }, + { + "epoch": 4.71, + "grad_norm": 3.1018640995025635, + "learning_rate": 8.007161312119538e-08, + "logits/chosen": -0.5524700284004211, + "logits/rejected": -0.6302984952926636, + "logps/chosen": -50.693763732910156, + "logps/rejected": -104.59937286376953, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1558802127838135, + "rewards/margins": 7.110060691833496, + "rewards/rejected": -3.9541802406311035, + "step": 18845 + }, + { + "epoch": 4.71, + "grad_norm": 6.127920150756836, + "learning_rate": 7.993157354163062e-08, + "logits/chosen": -0.5415856838226318, + "logits/rejected": -0.5636225342750549, + "logps/chosen": -50.743316650390625, + "logps/rejected": -103.66834259033203, + "loss": 0.6997, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2130041122436523, + "rewards/margins": 6.900081157684326, + "rewards/rejected": -3.687077283859253, + "step": 18846 + }, + { + "epoch": 4.71, + "grad_norm": 5.522468090057373, + "learning_rate": 7.97916555421896e-08, + "logits/chosen": -0.44937586784362793, + "logits/rejected": -0.5950303077697754, + "logps/chosen": -72.73448181152344, + "logps/rejected": -95.37102508544922, + "loss": 0.678, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.048243522644043, + "rewards/margins": 6.974693775177002, + "rewards/rejected": -3.926450252532959, + "step": 18847 + }, + { + "epoch": 4.72, + "grad_norm": 4.852103233337402, + "learning_rate": 7.965185912633067e-08, + "logits/chosen": -0.582743763923645, + "logits/rejected": -0.6791079044342041, + "logps/chosen": -61.053916931152344, + "logps/rejected": -104.41767883300781, + "loss": 0.6091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.173839569091797, + "rewards/margins": 6.733555793762207, + "rewards/rejected": -3.55971622467041, + "step": 18848 + }, + { + "epoch": 4.72, + "grad_norm": 7.5409722328186035, + "learning_rate": 7.951218429750773e-08, + "logits/chosen": -0.5766275525093079, + "logits/rejected": -0.6493731737136841, + "logps/chosen": -59.042789459228516, + "logps/rejected": -106.35000610351562, + "loss": 0.6902, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.296912908554077, + "rewards/margins": 6.727122783660889, + "rewards/rejected": -3.4302103519439697, + "step": 18849 + }, + { + "epoch": 4.72, + "grad_norm": 3.0901968479156494, + "learning_rate": 7.937263105917193e-08, + "logits/chosen": -0.5412522554397583, + "logits/rejected": -0.6200627088546753, + "logps/chosen": -64.09892272949219, + "logps/rejected": -109.26927185058594, + "loss": 0.5681, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1506845951080322, + "rewards/margins": 7.854982376098633, + "rewards/rejected": -4.704298496246338, + "step": 18850 + }, + { + "epoch": 4.72, + "grad_norm": 10.040175437927246, + "learning_rate": 7.92331994147727e-08, + "logits/chosen": -0.559367299079895, + "logits/rejected": -0.6436514854431152, + "logps/chosen": -56.12523651123047, + "logps/rejected": -113.97806549072266, + "loss": 0.6373, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9952099323272705, + "rewards/margins": 8.048009872436523, + "rewards/rejected": -5.052800178527832, + "step": 18851 + }, + { + "epoch": 4.72, + "grad_norm": 4.540896892547607, + "learning_rate": 7.909388936775508e-08, + "logits/chosen": -0.5134636163711548, + "logits/rejected": -0.5749701261520386, + "logps/chosen": -50.857017517089844, + "logps/rejected": -108.50473022460938, + "loss": 0.5569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.263315439224243, + "rewards/margins": 7.567992210388184, + "rewards/rejected": -4.304677486419678, + "step": 18852 + }, + { + "epoch": 4.72, + "grad_norm": 3.9215946197509766, + "learning_rate": 7.895470092156132e-08, + "logits/chosen": -0.5336278080940247, + "logits/rejected": -0.6232525110244751, + "logps/chosen": -63.67755126953125, + "logps/rejected": -97.88278198242188, + "loss": 0.7149, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9456748962402344, + "rewards/margins": 6.451478481292725, + "rewards/rejected": -3.505802869796753, + "step": 18853 + }, + { + "epoch": 4.72, + "grad_norm": 5.454723358154297, + "learning_rate": 7.881563407963144e-08, + "logits/chosen": -0.5588104128837585, + "logits/rejected": -0.6241466999053955, + "logps/chosen": -54.48444747924805, + "logps/rejected": -112.1569595336914, + "loss": 0.5794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.25773286819458, + "rewards/margins": 7.078127384185791, + "rewards/rejected": -3.820394992828369, + "step": 18854 + }, + { + "epoch": 4.72, + "grad_norm": 2.6898303031921387, + "learning_rate": 7.867668884540102e-08, + "logits/chosen": -0.4823741018772125, + "logits/rejected": -0.5896902680397034, + "logps/chosen": -48.99228286743164, + "logps/rejected": -104.77823638916016, + "loss": 0.5494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1567893028259277, + "rewards/margins": 7.589526176452637, + "rewards/rejected": -4.432736396789551, + "step": 18855 + }, + { + "epoch": 4.72, + "grad_norm": 3.2009286880493164, + "learning_rate": 7.85378652223051e-08, + "logits/chosen": -0.5643020868301392, + "logits/rejected": -0.6385208964347839, + "logps/chosen": -54.27797317504883, + "logps/rejected": -119.18582153320312, + "loss": 0.6032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8222579956054688, + "rewards/margins": 7.203811168670654, + "rewards/rejected": -4.381553649902344, + "step": 18856 + }, + { + "epoch": 4.72, + "grad_norm": 3.9083971977233887, + "learning_rate": 7.839916321377261e-08, + "logits/chosen": -0.5298061966896057, + "logits/rejected": -0.6405968070030212, + "logps/chosen": -60.07172393798828, + "logps/rejected": -103.92756652832031, + "loss": 0.6029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5037364959716797, + "rewards/margins": 8.050467491149902, + "rewards/rejected": -4.546730995178223, + "step": 18857 + }, + { + "epoch": 4.72, + "grad_norm": 3.888091564178467, + "learning_rate": 7.82605828232319e-08, + "logits/chosen": -0.6052106618881226, + "logits/rejected": -0.6609284281730652, + "logps/chosen": -49.66535949707031, + "logps/rejected": -118.8116683959961, + "loss": 0.5899, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3794164657592773, + "rewards/margins": 7.49302339553833, + "rewards/rejected": -4.113606929779053, + "step": 18858 + }, + { + "epoch": 4.72, + "grad_norm": 4.371639728546143, + "learning_rate": 7.812212405410635e-08, + "logits/chosen": -0.6354169249534607, + "logits/rejected": -0.6799225807189941, + "logps/chosen": -46.53335189819336, + "logps/rejected": -103.94308471679688, + "loss": 0.665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.019702434539795, + "rewards/margins": 7.150991916656494, + "rewards/rejected": -4.131289005279541, + "step": 18859 + }, + { + "epoch": 4.72, + "grad_norm": 4.889690399169922, + "learning_rate": 7.79837869098199e-08, + "logits/chosen": -0.5670521259307861, + "logits/rejected": -0.6716374754905701, + "logps/chosen": -58.44817352294922, + "logps/rejected": -92.44607543945312, + "loss": 0.6575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176806688308716, + "rewards/margins": 6.753386497497559, + "rewards/rejected": -3.576580047607422, + "step": 18860 + }, + { + "epoch": 4.72, + "grad_norm": 9.381608009338379, + "learning_rate": 7.784557139378812e-08, + "logits/chosen": -0.6610207557678223, + "logits/rejected": -0.7376574873924255, + "logps/chosen": -62.59307861328125, + "logps/rejected": -119.78573608398438, + "loss": 0.6145, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7542271614074707, + "rewards/margins": 8.595882415771484, + "rewards/rejected": -5.8416547775268555, + "step": 18861 + }, + { + "epoch": 4.72, + "grad_norm": 7.077112674713135, + "learning_rate": 7.770747750942775e-08, + "logits/chosen": -0.5931146144866943, + "logits/rejected": -0.6702358722686768, + "logps/chosen": -70.79508972167969, + "logps/rejected": -98.89056396484375, + "loss": 0.759, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8291471004486084, + "rewards/margins": 6.53183126449585, + "rewards/rejected": -3.702683687210083, + "step": 18862 + }, + { + "epoch": 4.72, + "grad_norm": 2.7300281524658203, + "learning_rate": 7.756950526015161e-08, + "logits/chosen": -0.547951340675354, + "logits/rejected": -0.6265450716018677, + "logps/chosen": -51.85427474975586, + "logps/rejected": -129.7675018310547, + "loss": 0.5961, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0739402770996094, + "rewards/margins": 8.844365119934082, + "rewards/rejected": -5.7704243659973145, + "step": 18863 + }, + { + "epoch": 4.72, + "grad_norm": 3.5764849185943604, + "learning_rate": 7.743165464936864e-08, + "logits/chosen": -0.5808169841766357, + "logits/rejected": -0.6841672658920288, + "logps/chosen": -50.2435302734375, + "logps/rejected": -103.85289001464844, + "loss": 0.6037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.246917247772217, + "rewards/margins": 8.3936767578125, + "rewards/rejected": -5.1467604637146, + "step": 18864 + }, + { + "epoch": 4.72, + "grad_norm": 4.643272399902344, + "learning_rate": 7.7293925680485e-08, + "logits/chosen": -0.5332283973693848, + "logits/rejected": -0.6315068602561951, + "logps/chosen": -48.437740325927734, + "logps/rejected": -94.27731323242188, + "loss": 0.5355, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2918155193328857, + "rewards/margins": 7.0179314613342285, + "rewards/rejected": -3.726116180419922, + "step": 18865 + }, + { + "epoch": 4.72, + "grad_norm": 8.427162170410156, + "learning_rate": 7.715631835690518e-08, + "logits/chosen": -0.5387434959411621, + "logits/rejected": -0.5992515087127686, + "logps/chosen": -48.675350189208984, + "logps/rejected": -120.51617431640625, + "loss": 0.6752, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0235514640808105, + "rewards/margins": 7.6730241775512695, + "rewards/rejected": -4.649472713470459, + "step": 18866 + }, + { + "epoch": 4.72, + "grad_norm": 4.831012725830078, + "learning_rate": 7.701883268202815e-08, + "logits/chosen": -0.4934442639350891, + "logits/rejected": -0.5875235199928284, + "logps/chosen": -50.48145294189453, + "logps/rejected": -104.30038452148438, + "loss": 0.6288, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.211531400680542, + "rewards/margins": 6.7990899085998535, + "rewards/rejected": -3.5875580310821533, + "step": 18867 + }, + { + "epoch": 4.72, + "grad_norm": 3.730496406555176, + "learning_rate": 7.688146865925228e-08, + "logits/chosen": -0.48768380284309387, + "logits/rejected": -0.5849539637565613, + "logps/chosen": -56.53900909423828, + "logps/rejected": -103.72196960449219, + "loss": 0.6253, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.429661989212036, + "rewards/margins": 8.371431350708008, + "rewards/rejected": -4.941768646240234, + "step": 18868 + }, + { + "epoch": 4.72, + "grad_norm": 2.8213114738464355, + "learning_rate": 7.674422629197209e-08, + "logits/chosen": -0.6442346572875977, + "logits/rejected": -0.7014086842536926, + "logps/chosen": -54.02256774902344, + "logps/rejected": -125.2975845336914, + "loss": 0.6588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.449817657470703, + "rewards/margins": 8.738786697387695, + "rewards/rejected": -5.288969993591309, + "step": 18869 + }, + { + "epoch": 4.72, + "grad_norm": 2.388274669647217, + "learning_rate": 7.66071055835782e-08, + "logits/chosen": -0.5836197733879089, + "logits/rejected": -0.6732246279716492, + "logps/chosen": -49.81797790527344, + "logps/rejected": -125.7162094116211, + "loss": 0.5701, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.184635877609253, + "rewards/margins": 8.778155326843262, + "rewards/rejected": -5.5935187339782715, + "step": 18870 + }, + { + "epoch": 4.72, + "grad_norm": 10.549094200134277, + "learning_rate": 7.647010653746012e-08, + "logits/chosen": -0.5316939353942871, + "logits/rejected": -0.6205580830574036, + "logps/chosen": -56.99868392944336, + "logps/rejected": -95.71339416503906, + "loss": 0.6965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6702041625976562, + "rewards/margins": 7.2362189292907715, + "rewards/rejected": -4.566015243530273, + "step": 18871 + }, + { + "epoch": 4.72, + "grad_norm": 5.3368330001831055, + "learning_rate": 7.633322915700237e-08, + "logits/chosen": -0.5609606504440308, + "logits/rejected": -0.648464560508728, + "logps/chosen": -60.797813415527344, + "logps/rejected": -98.57168579101562, + "loss": 0.6482, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.363837718963623, + "rewards/margins": 6.874032497406006, + "rewards/rejected": -3.510194778442383, + "step": 18872 + }, + { + "epoch": 4.72, + "grad_norm": 5.2988362312316895, + "learning_rate": 7.619647344558723e-08, + "logits/chosen": -0.6029722094535828, + "logits/rejected": -0.6504238843917847, + "logps/chosen": -45.2366828918457, + "logps/rejected": -109.69154357910156, + "loss": 0.5776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121537685394287, + "rewards/margins": 7.1250996589660645, + "rewards/rejected": -4.0035624504089355, + "step": 18873 + }, + { + "epoch": 4.72, + "grad_norm": 7.56318473815918, + "learning_rate": 7.605983940659479e-08, + "logits/chosen": -0.5391573309898376, + "logits/rejected": -0.655905544757843, + "logps/chosen": -60.20195007324219, + "logps/rejected": -101.17879486083984, + "loss": 0.6741, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.969973087310791, + "rewards/margins": 6.074113845825195, + "rewards/rejected": -3.104140281677246, + "step": 18874 + }, + { + "epoch": 4.72, + "grad_norm": 13.759254455566406, + "learning_rate": 7.592332704340122e-08, + "logits/chosen": -0.5899783372879028, + "logits/rejected": -0.6940799355506897, + "logps/chosen": -54.13481521606445, + "logps/rejected": -90.24958801269531, + "loss": 0.6101, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.383861541748047, + "rewards/margins": 7.0642170906066895, + "rewards/rejected": -3.6803557872772217, + "step": 18875 + }, + { + "epoch": 4.72, + "grad_norm": 3.305388927459717, + "learning_rate": 7.57869363593794e-08, + "logits/chosen": -0.5315412878990173, + "logits/rejected": -0.6046565771102905, + "logps/chosen": -54.42180252075195, + "logps/rejected": -112.6859359741211, + "loss": 0.5416, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.853865146636963, + "rewards/margins": 7.079610347747803, + "rewards/rejected": -4.225746154785156, + "step": 18876 + }, + { + "epoch": 4.72, + "grad_norm": 4.863080024719238, + "learning_rate": 7.565066735789994e-08, + "logits/chosen": -0.5988804697990417, + "logits/rejected": -0.6968282461166382, + "logps/chosen": -50.15422821044922, + "logps/rejected": -125.82926940917969, + "loss": 0.5769, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2734639644622803, + "rewards/margins": 8.987147331237793, + "rewards/rejected": -5.713682174682617, + "step": 18877 + }, + { + "epoch": 4.72, + "grad_norm": 2.4784457683563232, + "learning_rate": 7.551452004233017e-08, + "logits/chosen": -0.6344135999679565, + "logits/rejected": -0.6806911826133728, + "logps/chosen": -41.81425094604492, + "logps/rejected": -103.24125671386719, + "loss": 0.5502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.31211256980896, + "rewards/margins": 6.864757537841797, + "rewards/rejected": -3.5526442527770996, + "step": 18878 + }, + { + "epoch": 4.72, + "grad_norm": 6.5033392906188965, + "learning_rate": 7.537849441603518e-08, + "logits/chosen": -0.6056149005889893, + "logits/rejected": -0.685778021812439, + "logps/chosen": -59.287559509277344, + "logps/rejected": -104.89765930175781, + "loss": 0.752, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.789161205291748, + "rewards/margins": 7.162435531616211, + "rewards/rejected": -4.373273849487305, + "step": 18879 + }, + { + "epoch": 4.72, + "grad_norm": 5.274965286254883, + "learning_rate": 7.524259048237504e-08, + "logits/chosen": -0.5716845393180847, + "logits/rejected": -0.5965627431869507, + "logps/chosen": -50.94866943359375, + "logps/rejected": -110.16410064697266, + "loss": 0.7245, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0787978172302246, + "rewards/margins": 6.529448509216309, + "rewards/rejected": -3.450650215148926, + "step": 18880 + }, + { + "epoch": 4.72, + "grad_norm": 3.1446774005889893, + "learning_rate": 7.510680824470872e-08, + "logits/chosen": -0.5591990947723389, + "logits/rejected": -0.640933632850647, + "logps/chosen": -52.75808334350586, + "logps/rejected": -117.43038940429688, + "loss": 0.5726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135951280593872, + "rewards/margins": 7.732962131500244, + "rewards/rejected": -4.597011089324951, + "step": 18881 + }, + { + "epoch": 4.72, + "grad_norm": 2.8772952556610107, + "learning_rate": 7.497114770639191e-08, + "logits/chosen": -0.5969880223274231, + "logits/rejected": -0.6431295871734619, + "logps/chosen": -68.04243469238281, + "logps/rejected": -119.8873291015625, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.243443250656128, + "rewards/margins": 7.876847267150879, + "rewards/rejected": -4.633403778076172, + "step": 18882 + }, + { + "epoch": 4.72, + "grad_norm": 11.879663467407227, + "learning_rate": 7.483560887077579e-08, + "logits/chosen": -0.5016990900039673, + "logits/rejected": -0.586806058883667, + "logps/chosen": -56.251800537109375, + "logps/rejected": -101.60179138183594, + "loss": 0.6901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.764474630355835, + "rewards/margins": 6.111636161804199, + "rewards/rejected": -3.347161054611206, + "step": 18883 + }, + { + "epoch": 4.72, + "grad_norm": 3.6818573474884033, + "learning_rate": 7.470019174121101e-08, + "logits/chosen": -0.6203944683074951, + "logits/rejected": -0.6830018162727356, + "logps/chosen": -46.87550735473633, + "logps/rejected": -113.75115966796875, + "loss": 0.5985, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3949410915374756, + "rewards/margins": 7.564484119415283, + "rewards/rejected": -4.1695427894592285, + "step": 18884 + }, + { + "epoch": 4.72, + "grad_norm": 2.6982786655426025, + "learning_rate": 7.45648963210438e-08, + "logits/chosen": -0.5616963505744934, + "logits/rejected": -0.6620141267776489, + "logps/chosen": -53.84202194213867, + "logps/rejected": -94.94213104248047, + "loss": 0.6031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5782816410064697, + "rewards/margins": 8.403928756713867, + "rewards/rejected": -4.825647354125977, + "step": 18885 + }, + { + "epoch": 4.72, + "grad_norm": 2.934323787689209, + "learning_rate": 7.442972261361592e-08, + "logits/chosen": -0.48662957549095154, + "logits/rejected": -0.6164149641990662, + "logps/chosen": -53.04233932495117, + "logps/rejected": -97.42119598388672, + "loss": 0.5278, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9996166229248047, + "rewards/margins": 7.774601936340332, + "rewards/rejected": -4.7749857902526855, + "step": 18886 + }, + { + "epoch": 4.72, + "grad_norm": 7.568023204803467, + "learning_rate": 7.429467062226914e-08, + "logits/chosen": -0.5631417036056519, + "logits/rejected": -0.6386533975601196, + "logps/chosen": -50.6651725769043, + "logps/rejected": -106.40615844726562, + "loss": 0.58, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.262843608856201, + "rewards/margins": 8.439693450927734, + "rewards/rejected": -5.176850318908691, + "step": 18887 + }, + { + "epoch": 4.73, + "grad_norm": 3.1866514682769775, + "learning_rate": 7.415974035033968e-08, + "logits/chosen": -0.5349018573760986, + "logits/rejected": -0.6630586981773376, + "logps/chosen": -53.42108917236328, + "logps/rejected": -88.12193298339844, + "loss": 0.5816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5890164375305176, + "rewards/margins": 6.356090545654297, + "rewards/rejected": -3.7670741081237793, + "step": 18888 + }, + { + "epoch": 4.73, + "grad_norm": 4.100174903869629, + "learning_rate": 7.402493180116322e-08, + "logits/chosen": -0.540556788444519, + "logits/rejected": -0.5573858618736267, + "logps/chosen": -41.631534576416016, + "logps/rejected": -93.24260711669922, + "loss": 0.5422, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2901132106781006, + "rewards/margins": 6.141557693481445, + "rewards/rejected": -2.851444721221924, + "step": 18889 + }, + { + "epoch": 4.73, + "grad_norm": 23.39657974243164, + "learning_rate": 7.389024497806985e-08, + "logits/chosen": -0.5094351768493652, + "logits/rejected": -0.6237854957580566, + "logps/chosen": -58.73755645751953, + "logps/rejected": -96.43318176269531, + "loss": 0.7359, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9921183586120605, + "rewards/margins": 6.2136945724487305, + "rewards/rejected": -3.221576690673828, + "step": 18890 + }, + { + "epoch": 4.73, + "grad_norm": 10.958967208862305, + "learning_rate": 7.375567988438803e-08, + "logits/chosen": -0.5927778482437134, + "logits/rejected": -0.6942391395568848, + "logps/chosen": -49.44989776611328, + "logps/rejected": -106.45753479003906, + "loss": 0.5683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.899458408355713, + "rewards/margins": 8.087628364562988, + "rewards/rejected": -5.188170433044434, + "step": 18891 + }, + { + "epoch": 4.73, + "grad_norm": 3.4659616947174072, + "learning_rate": 7.362123652344344e-08, + "logits/chosen": -0.5845777988433838, + "logits/rejected": -0.6687759757041931, + "logps/chosen": -52.99745559692383, + "logps/rejected": -99.57876586914062, + "loss": 0.665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.241163969039917, + "rewards/margins": 7.266207695007324, + "rewards/rejected": -4.025043487548828, + "step": 18892 + }, + { + "epoch": 4.73, + "grad_norm": 3.4045042991638184, + "learning_rate": 7.348691489855786e-08, + "logits/chosen": -0.5317735075950623, + "logits/rejected": -0.5946449041366577, + "logps/chosen": -49.232521057128906, + "logps/rejected": -116.57006072998047, + "loss": 0.5531, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158534526824951, + "rewards/margins": 7.152533531188965, + "rewards/rejected": -3.993999481201172, + "step": 18893 + }, + { + "epoch": 4.73, + "grad_norm": 2.637712001800537, + "learning_rate": 7.335271501305085e-08, + "logits/chosen": -0.556142270565033, + "logits/rejected": -0.6250103712081909, + "logps/chosen": -43.97756576538086, + "logps/rejected": -110.47122955322266, + "loss": 0.5321, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3143863677978516, + "rewards/margins": 8.188209533691406, + "rewards/rejected": -4.8738226890563965, + "step": 18894 + }, + { + "epoch": 4.73, + "grad_norm": 4.795499801635742, + "learning_rate": 7.321863687023811e-08, + "logits/chosen": -0.49863797426223755, + "logits/rejected": -0.5801935195922852, + "logps/chosen": -56.56991958618164, + "logps/rejected": -99.88555908203125, + "loss": 0.5822, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4016335010528564, + "rewards/margins": 7.8421525955200195, + "rewards/rejected": -4.440519332885742, + "step": 18895 + }, + { + "epoch": 4.73, + "grad_norm": 5.285066604614258, + "learning_rate": 7.308468047343365e-08, + "logits/chosen": -0.587400496006012, + "logits/rejected": -0.6450843214988708, + "logps/chosen": -52.93921661376953, + "logps/rejected": -87.34413146972656, + "loss": 0.7309, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4705967903137207, + "rewards/margins": 5.777586936950684, + "rewards/rejected": -2.306990146636963, + "step": 18896 + }, + { + "epoch": 4.73, + "grad_norm": 4.257883071899414, + "learning_rate": 7.295084582594702e-08, + "logits/chosen": -0.535412073135376, + "logits/rejected": -0.5551704168319702, + "logps/chosen": -75.1044921875, + "logps/rejected": -105.67272186279297, + "loss": 0.6923, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.203413486480713, + "rewards/margins": 6.718617916107178, + "rewards/rejected": -3.515204429626465, + "step": 18897 + }, + { + "epoch": 4.73, + "grad_norm": 5.791879653930664, + "learning_rate": 7.281713293108506e-08, + "logits/chosen": -0.501999020576477, + "logits/rejected": -0.5897928476333618, + "logps/chosen": -48.62043762207031, + "logps/rejected": -112.06464385986328, + "loss": 0.5601, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.028557538986206, + "rewards/margins": 7.5801615715026855, + "rewards/rejected": -4.5516037940979, + "step": 18898 + }, + { + "epoch": 4.73, + "grad_norm": 4.90090274810791, + "learning_rate": 7.268354179215342e-08, + "logits/chosen": -0.5799224376678467, + "logits/rejected": -0.6726886630058289, + "logps/chosen": -59.8324089050293, + "logps/rejected": -102.72478485107422, + "loss": 0.6428, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8965554237365723, + "rewards/margins": 6.866110801696777, + "rewards/rejected": -3.969555377960205, + "step": 18899 + }, + { + "epoch": 4.73, + "grad_norm": 16.2045955657959, + "learning_rate": 7.255007241245227e-08, + "logits/chosen": -0.5683594346046448, + "logits/rejected": -0.636606752872467, + "logps/chosen": -51.263160705566406, + "logps/rejected": -103.48368072509766, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4225287437438965, + "rewards/margins": 6.904069900512695, + "rewards/rejected": -3.4815406799316406, + "step": 18900 + }, + { + "epoch": 4.73, + "grad_norm": 6.522777080535889, + "learning_rate": 7.24167247952795e-08, + "logits/chosen": -0.5476789474487305, + "logits/rejected": -0.6860197186470032, + "logps/chosen": -63.93017578125, + "logps/rejected": -91.73381805419922, + "loss": 0.5715, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0968070030212402, + "rewards/margins": 7.746399879455566, + "rewards/rejected": -4.649593353271484, + "step": 18901 + }, + { + "epoch": 4.73, + "grad_norm": 5.189125061035156, + "learning_rate": 7.228349894393084e-08, + "logits/chosen": -0.5294486284255981, + "logits/rejected": -0.6181168556213379, + "logps/chosen": -61.847084045410156, + "logps/rejected": -102.27207946777344, + "loss": 0.6554, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9862775802612305, + "rewards/margins": 6.5736165046691895, + "rewards/rejected": -3.58733868598938, + "step": 18902 + }, + { + "epoch": 4.73, + "grad_norm": 2.274915933609009, + "learning_rate": 7.215039486169862e-08, + "logits/chosen": -0.5732251405715942, + "logits/rejected": -0.6951351165771484, + "logps/chosen": -61.01271057128906, + "logps/rejected": -105.38845825195312, + "loss": 0.5622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.237231969833374, + "rewards/margins": 8.359573364257812, + "rewards/rejected": -5.122342109680176, + "step": 18903 + }, + { + "epoch": 4.73, + "grad_norm": 5.2741780281066895, + "learning_rate": 7.201741255187133e-08, + "logits/chosen": -0.5373541116714478, + "logits/rejected": -0.6286269426345825, + "logps/chosen": -54.3121452331543, + "logps/rejected": -95.28421783447266, + "loss": 0.6527, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.02937388420105, + "rewards/margins": 7.018840312957764, + "rewards/rejected": -3.9894661903381348, + "step": 18904 + }, + { + "epoch": 4.73, + "grad_norm": 9.945741653442383, + "learning_rate": 7.18845520177347e-08, + "logits/chosen": -0.5717208981513977, + "logits/rejected": -0.630535364151001, + "logps/chosen": -57.413509368896484, + "logps/rejected": -111.75944519042969, + "loss": 0.5521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1673495769500732, + "rewards/margins": 7.199974536895752, + "rewards/rejected": -4.032624244689941, + "step": 18905 + }, + { + "epoch": 4.73, + "grad_norm": 5.092144966125488, + "learning_rate": 7.175181326257385e-08, + "logits/chosen": -0.5319194793701172, + "logits/rejected": -0.5862091183662415, + "logps/chosen": -51.44580841064453, + "logps/rejected": -115.9942626953125, + "loss": 0.68, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.154193639755249, + "rewards/margins": 6.6915106773376465, + "rewards/rejected": -3.5373165607452393, + "step": 18906 + }, + { + "epoch": 4.73, + "grad_norm": 1.8322244882583618, + "learning_rate": 7.161919628966674e-08, + "logits/chosen": -0.6072460412979126, + "logits/rejected": -0.6706852316856384, + "logps/chosen": -42.5936164855957, + "logps/rejected": -103.75286102294922, + "loss": 0.4936, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.031561851501465, + "rewards/margins": 7.784119606018066, + "rewards/rejected": -4.75255823135376, + "step": 18907 + }, + { + "epoch": 4.73, + "grad_norm": 4.685384273529053, + "learning_rate": 7.148670110229184e-08, + "logits/chosen": -0.5588841438293457, + "logits/rejected": -0.606223464012146, + "logps/chosen": -58.71623992919922, + "logps/rejected": -125.60330200195312, + "loss": 0.6317, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.148139476776123, + "rewards/margins": 7.422754287719727, + "rewards/rejected": -4.274615287780762, + "step": 18908 + }, + { + "epoch": 4.73, + "grad_norm": 4.928892135620117, + "learning_rate": 7.135432770372264e-08, + "logits/chosen": -0.5995519161224365, + "logits/rejected": -0.7150893211364746, + "logps/chosen": -62.012001037597656, + "logps/rejected": -105.78621673583984, + "loss": 0.5715, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.435452461242676, + "rewards/margins": 8.185505867004395, + "rewards/rejected": -4.7500529289245605, + "step": 18909 + }, + { + "epoch": 4.73, + "grad_norm": 6.182310581207275, + "learning_rate": 7.122207609723042e-08, + "logits/chosen": -0.5968055129051208, + "logits/rejected": -0.6699259281158447, + "logps/chosen": -60.64112854003906, + "logps/rejected": -119.74456024169922, + "loss": 0.6778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.074983596801758, + "rewards/margins": 8.0416259765625, + "rewards/rejected": -4.966641426086426, + "step": 18910 + }, + { + "epoch": 4.73, + "grad_norm": 5.6118340492248535, + "learning_rate": 7.108994628608257e-08, + "logits/chosen": -0.5375170111656189, + "logits/rejected": -0.6332690119743347, + "logps/chosen": -53.98780822753906, + "logps/rejected": -114.01806640625, + "loss": 0.6742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9251902103424072, + "rewards/margins": 7.689801216125488, + "rewards/rejected": -4.764610290527344, + "step": 18911 + }, + { + "epoch": 4.73, + "grad_norm": 2.7082924842834473, + "learning_rate": 7.095793827354535e-08, + "logits/chosen": -0.4928933382034302, + "logits/rejected": -0.6297236084938049, + "logps/chosen": -65.72674560546875, + "logps/rejected": -128.911865234375, + "loss": 0.5672, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0970468521118164, + "rewards/margins": 10.224811553955078, + "rewards/rejected": -7.127764701843262, + "step": 18912 + }, + { + "epoch": 4.73, + "grad_norm": 4.8213582038879395, + "learning_rate": 7.082605206288006e-08, + "logits/chosen": -0.5788888931274414, + "logits/rejected": -0.6575546860694885, + "logps/chosen": -45.94532012939453, + "logps/rejected": -90.72110748291016, + "loss": 0.602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9164748191833496, + "rewards/margins": 7.022769451141357, + "rewards/rejected": -4.106294631958008, + "step": 18913 + }, + { + "epoch": 4.73, + "grad_norm": 8.072449684143066, + "learning_rate": 7.069428765734576e-08, + "logits/chosen": -0.5224811434745789, + "logits/rejected": -0.6006375551223755, + "logps/chosen": -54.00165557861328, + "logps/rejected": -104.7066879272461, + "loss": 0.6365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7517142295837402, + "rewards/margins": 7.043123245239258, + "rewards/rejected": -4.291409492492676, + "step": 18914 + }, + { + "epoch": 4.73, + "grad_norm": 25.59031867980957, + "learning_rate": 7.056264506019927e-08, + "logits/chosen": -0.6121281385421753, + "logits/rejected": -0.6368808150291443, + "logps/chosen": -48.24433517456055, + "logps/rejected": -105.08065032958984, + "loss": 0.7025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.985936403274536, + "rewards/margins": 6.85476016998291, + "rewards/rejected": -3.8688230514526367, + "step": 18915 + }, + { + "epoch": 4.73, + "grad_norm": 8.955101013183594, + "learning_rate": 7.043112427469246e-08, + "logits/chosen": -0.4865210950374603, + "logits/rejected": -0.6382200121879578, + "logps/chosen": -57.83331298828125, + "logps/rejected": -98.18812561035156, + "loss": 0.6943, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0613250732421875, + "rewards/margins": 8.333961486816406, + "rewards/rejected": -5.272636413574219, + "step": 18916 + }, + { + "epoch": 4.73, + "grad_norm": 2.7861320972442627, + "learning_rate": 7.029972530407602e-08, + "logits/chosen": -0.5561771988868713, + "logits/rejected": -0.6394468545913696, + "logps/chosen": -58.06541442871094, + "logps/rejected": -111.5481948852539, + "loss": 0.6116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3359971046447754, + "rewards/margins": 8.07792854309082, + "rewards/rejected": -4.7419304847717285, + "step": 18917 + }, + { + "epoch": 4.73, + "grad_norm": 2.9482858180999756, + "learning_rate": 7.016844815159684e-08, + "logits/chosen": -0.5717223286628723, + "logits/rejected": -0.6498944163322449, + "logps/chosen": -62.08986282348633, + "logps/rejected": -100.37617492675781, + "loss": 0.5646, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2283222675323486, + "rewards/margins": 7.537955284118652, + "rewards/rejected": -4.309632778167725, + "step": 18918 + }, + { + "epoch": 4.73, + "grad_norm": 3.4145524501800537, + "learning_rate": 7.003729282049899e-08, + "logits/chosen": -0.5170596837997437, + "logits/rejected": -0.6104246377944946, + "logps/chosen": -48.31047058105469, + "logps/rejected": -92.98328399658203, + "loss": 0.5332, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.260321617126465, + "rewards/margins": 6.672430992126465, + "rewards/rejected": -3.412109136581421, + "step": 18919 + }, + { + "epoch": 4.73, + "grad_norm": 2.7022364139556885, + "learning_rate": 6.990625931402318e-08, + "logits/chosen": -0.5240045785903931, + "logits/rejected": -0.6460376977920532, + "logps/chosen": -66.09966278076172, + "logps/rejected": -111.23018646240234, + "loss": 0.5888, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.649081230163574, + "rewards/margins": 8.14358139038086, + "rewards/rejected": -4.494500160217285, + "step": 18920 + }, + { + "epoch": 4.73, + "grad_norm": 4.30825662612915, + "learning_rate": 6.977534763540794e-08, + "logits/chosen": -0.5955549478530884, + "logits/rejected": -0.6595104932785034, + "logps/chosen": -58.345916748046875, + "logps/rejected": -120.45569610595703, + "loss": 0.5672, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.280304193496704, + "rewards/margins": 8.725801467895508, + "rewards/rejected": -5.445496082305908, + "step": 18921 + }, + { + "epoch": 4.73, + "grad_norm": 4.3810882568359375, + "learning_rate": 6.964455778788737e-08, + "logits/chosen": -0.6390478014945984, + "logits/rejected": -0.715412437915802, + "logps/chosen": -49.0944938659668, + "logps/rejected": -93.85365295410156, + "loss": 0.5461, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3512094020843506, + "rewards/margins": 7.048133850097656, + "rewards/rejected": -3.6969246864318848, + "step": 18922 + }, + { + "epoch": 4.73, + "grad_norm": 11.448480606079102, + "learning_rate": 6.951388977469386e-08, + "logits/chosen": -0.5374346971511841, + "logits/rejected": -0.6016907095909119, + "logps/chosen": -50.021263122558594, + "logps/rejected": -95.46393585205078, + "loss": 0.6966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.972754716873169, + "rewards/margins": 6.287319660186768, + "rewards/rejected": -3.3145649433135986, + "step": 18923 + }, + { + "epoch": 4.73, + "grad_norm": 4.020995140075684, + "learning_rate": 6.938334359905707e-08, + "logits/chosen": -0.5958666801452637, + "logits/rejected": -0.6366487741470337, + "logps/chosen": -32.6726188659668, + "logps/rejected": -114.84603118896484, + "loss": 0.4963, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0662970542907715, + "rewards/margins": 8.642809867858887, + "rewards/rejected": -5.576512813568115, + "step": 18924 + }, + { + "epoch": 4.73, + "grad_norm": 6.327815055847168, + "learning_rate": 6.925291926420219e-08, + "logits/chosen": -0.5432977676391602, + "logits/rejected": -0.5931969881057739, + "logps/chosen": -57.725826263427734, + "logps/rejected": -119.66224670410156, + "loss": 0.641, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1788268089294434, + "rewards/margins": 6.665868759155273, + "rewards/rejected": -3.48704195022583, + "step": 18925 + }, + { + "epoch": 4.73, + "grad_norm": 7.5360331535339355, + "learning_rate": 6.912261677335164e-08, + "logits/chosen": -0.5744911432266235, + "logits/rejected": -0.6706411838531494, + "logps/chosen": -47.014617919921875, + "logps/rejected": -102.25541687011719, + "loss": 0.6042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2996511459350586, + "rewards/margins": 8.797945976257324, + "rewards/rejected": -5.498294830322266, + "step": 18926 + }, + { + "epoch": 4.73, + "grad_norm": 6.293739318847656, + "learning_rate": 6.899243612972673e-08, + "logits/chosen": -0.5210602283477783, + "logits/rejected": -0.6198082566261292, + "logps/chosen": -55.780906677246094, + "logps/rejected": -101.72167205810547, + "loss": 0.6605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.971452236175537, + "rewards/margins": 7.588057041168213, + "rewards/rejected": -4.616604804992676, + "step": 18927 + }, + { + "epoch": 4.74, + "grad_norm": 5.621584892272949, + "learning_rate": 6.886237733654321e-08, + "logits/chosen": -0.5618272423744202, + "logits/rejected": -0.5928024053573608, + "logps/chosen": -54.272727966308594, + "logps/rejected": -121.9594955444336, + "loss": 0.6424, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.087000608444214, + "rewards/margins": 6.306214809417725, + "rewards/rejected": -3.2192142009735107, + "step": 18928 + }, + { + "epoch": 4.74, + "grad_norm": 6.451975345611572, + "learning_rate": 6.873244039701521e-08, + "logits/chosen": -0.582514226436615, + "logits/rejected": -0.6711879372596741, + "logps/chosen": -53.6563835144043, + "logps/rejected": -125.42787170410156, + "loss": 0.6421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8385627269744873, + "rewards/margins": 7.274923324584961, + "rewards/rejected": -4.436360836029053, + "step": 18929 + }, + { + "epoch": 4.74, + "grad_norm": 4.6929802894592285, + "learning_rate": 6.860262531435402e-08, + "logits/chosen": -0.5155699253082275, + "logits/rejected": -0.5745534300804138, + "logps/chosen": -62.79197311401367, + "logps/rejected": -111.50479125976562, + "loss": 0.661, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0173418521881104, + "rewards/margins": 6.941573619842529, + "rewards/rejected": -3.9242312908172607, + "step": 18930 + }, + { + "epoch": 4.74, + "grad_norm": 5.494100570678711, + "learning_rate": 6.847293209176708e-08, + "logits/chosen": -0.5586692094802856, + "logits/rejected": -0.6182934045791626, + "logps/chosen": -56.25691604614258, + "logps/rejected": -119.90804290771484, + "loss": 0.6129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0291666984558105, + "rewards/margins": 7.666666030883789, + "rewards/rejected": -4.637500286102295, + "step": 18931 + }, + { + "epoch": 4.74, + "grad_norm": 7.512936115264893, + "learning_rate": 6.834336073245962e-08, + "logits/chosen": -0.5139436721801758, + "logits/rejected": -0.6047489643096924, + "logps/chosen": -64.33265686035156, + "logps/rejected": -107.37127685546875, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118421792984009, + "rewards/margins": 7.379859924316406, + "rewards/rejected": -4.261438846588135, + "step": 18932 + }, + { + "epoch": 4.74, + "grad_norm": 4.549485206604004, + "learning_rate": 6.82139112396324e-08, + "logits/chosen": -0.6063593626022339, + "logits/rejected": -0.7309994101524353, + "logps/chosen": -56.6787109375, + "logps/rejected": -111.86177825927734, + "loss": 0.6637, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7839016914367676, + "rewards/margins": 8.089776039123535, + "rewards/rejected": -5.305874824523926, + "step": 18933 + }, + { + "epoch": 4.74, + "grad_norm": 3.352696180343628, + "learning_rate": 6.808458361648618e-08, + "logits/chosen": -0.5967311263084412, + "logits/rejected": -0.6018039584159851, + "logps/chosen": -43.33897018432617, + "logps/rejected": -126.00724792480469, + "loss": 0.5824, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2666561603546143, + "rewards/margins": 7.618999004364014, + "rewards/rejected": -4.35234260559082, + "step": 18934 + }, + { + "epoch": 4.74, + "grad_norm": 4.59998893737793, + "learning_rate": 6.795537786621564e-08, + "logits/chosen": -0.5087295770645142, + "logits/rejected": -0.5632392764091492, + "logps/chosen": -57.122108459472656, + "logps/rejected": -108.65242004394531, + "loss": 0.665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.066640853881836, + "rewards/margins": 6.3850274085998535, + "rewards/rejected": -3.318387269973755, + "step": 18935 + }, + { + "epoch": 4.74, + "grad_norm": 3.9446141719818115, + "learning_rate": 6.782629399201269e-08, + "logits/chosen": -0.6307346224784851, + "logits/rejected": -0.705426037311554, + "logps/chosen": -48.07025146484375, + "logps/rejected": -101.81538391113281, + "loss": 0.5996, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9839324951171875, + "rewards/margins": 7.556149005889893, + "rewards/rejected": -4.572216987609863, + "step": 18936 + }, + { + "epoch": 4.74, + "grad_norm": 4.811196804046631, + "learning_rate": 6.769733199706919e-08, + "logits/chosen": -0.6187969446182251, + "logits/rejected": -0.6793836355209351, + "logps/chosen": -49.59442901611328, + "logps/rejected": -101.18794250488281, + "loss": 0.6237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2130472660064697, + "rewards/margins": 7.579506874084473, + "rewards/rejected": -4.366459846496582, + "step": 18937 + }, + { + "epoch": 4.74, + "grad_norm": 7.090671539306641, + "learning_rate": 6.756849188456982e-08, + "logits/chosen": -0.46539306640625, + "logits/rejected": -0.5583403706550598, + "logps/chosen": -56.243675231933594, + "logps/rejected": -82.38908386230469, + "loss": 0.6354, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0951831340789795, + "rewards/margins": 5.906499862670898, + "rewards/rejected": -2.811316728591919, + "step": 18938 + }, + { + "epoch": 4.74, + "grad_norm": 8.167125701904297, + "learning_rate": 6.743977365770038e-08, + "logits/chosen": -0.4769519567489624, + "logits/rejected": -0.54205721616745, + "logps/chosen": -58.765159606933594, + "logps/rejected": -113.4385986328125, + "loss": 0.763, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.884727716445923, + "rewards/margins": 6.827147006988525, + "rewards/rejected": -3.94242000579834, + "step": 18939 + }, + { + "epoch": 4.74, + "grad_norm": 5.4236884117126465, + "learning_rate": 6.731117731964054e-08, + "logits/chosen": -0.6180794835090637, + "logits/rejected": -0.6769117712974548, + "logps/chosen": -53.65721893310547, + "logps/rejected": -102.2641830444336, + "loss": 0.6083, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1644434928894043, + "rewards/margins": 6.558295249938965, + "rewards/rejected": -3.3938517570495605, + "step": 18940 + }, + { + "epoch": 4.74, + "grad_norm": 3.871227741241455, + "learning_rate": 6.718270287356721e-08, + "logits/chosen": -0.5032939314842224, + "logits/rejected": -0.5358977317810059, + "logps/chosen": -59.70463562011719, + "logps/rejected": -110.16149139404297, + "loss": 0.5601, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1294615268707275, + "rewards/margins": 6.022396087646484, + "rewards/rejected": -2.892935037612915, + "step": 18941 + }, + { + "epoch": 4.74, + "grad_norm": 6.813233852386475, + "learning_rate": 6.705435032265673e-08, + "logits/chosen": -0.5665282607078552, + "logits/rejected": -0.6319311857223511, + "logps/chosen": -55.0006103515625, + "logps/rejected": -102.54501342773438, + "loss": 0.6972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.833442211151123, + "rewards/margins": 6.149602890014648, + "rewards/rejected": -3.3161604404449463, + "step": 18942 + }, + { + "epoch": 4.74, + "grad_norm": 5.651668548583984, + "learning_rate": 6.692611967008045e-08, + "logits/chosen": -0.5531704425811768, + "logits/rejected": -0.6717263460159302, + "logps/chosen": -52.44215774536133, + "logps/rejected": -97.66297912597656, + "loss": 0.6655, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.254661798477173, + "rewards/margins": 8.398995399475098, + "rewards/rejected": -5.144333839416504, + "step": 18943 + }, + { + "epoch": 4.74, + "grad_norm": 4.04347562789917, + "learning_rate": 6.679801091900639e-08, + "logits/chosen": -0.5621545910835266, + "logits/rejected": -0.6256690621376038, + "logps/chosen": -57.81861877441406, + "logps/rejected": -99.57098388671875, + "loss": 0.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.125728130340576, + "rewards/margins": 5.528841972351074, + "rewards/rejected": -2.40311336517334, + "step": 18944 + }, + { + "epoch": 4.74, + "grad_norm": 4.789196491241455, + "learning_rate": 6.667002407260037e-08, + "logits/chosen": -0.4920264184474945, + "logits/rejected": -0.5440220832824707, + "logps/chosen": -51.83372497558594, + "logps/rejected": -120.51368713378906, + "loss": 0.6369, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9989848136901855, + "rewards/margins": 7.472661972045898, + "rewards/rejected": -4.473677158355713, + "step": 18945 + }, + { + "epoch": 4.74, + "grad_norm": 6.133077621459961, + "learning_rate": 6.654215913402651e-08, + "logits/chosen": -0.6045199036598206, + "logits/rejected": -0.7008938193321228, + "logps/chosen": -56.076236724853516, + "logps/rejected": -88.79883575439453, + "loss": 0.6362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.699991464614868, + "rewards/margins": 5.937623023986816, + "rewards/rejected": -3.2376317977905273, + "step": 18946 + }, + { + "epoch": 4.74, + "grad_norm": 2.796861171722412, + "learning_rate": 6.641441610644228e-08, + "logits/chosen": -0.5525984168052673, + "logits/rejected": -0.6376933455467224, + "logps/chosen": -53.93377685546875, + "logps/rejected": -108.87381744384766, + "loss": 0.6007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98004412651062, + "rewards/margins": 8.146403312683105, + "rewards/rejected": -5.166358470916748, + "step": 18947 + }, + { + "epoch": 4.74, + "grad_norm": 2.9083359241485596, + "learning_rate": 6.628679499300572e-08, + "logits/chosen": -0.5739449858665466, + "logits/rejected": -0.6214154362678528, + "logps/chosen": -53.310325622558594, + "logps/rejected": -105.03308868408203, + "loss": 0.6129, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.330960750579834, + "rewards/margins": 7.151600360870361, + "rewards/rejected": -3.8206393718719482, + "step": 18948 + }, + { + "epoch": 4.74, + "grad_norm": 8.844657897949219, + "learning_rate": 6.615929579687041e-08, + "logits/chosen": -0.5083677172660828, + "logits/rejected": -0.6033132672309875, + "logps/chosen": -63.79191970825195, + "logps/rejected": -118.79871368408203, + "loss": 0.6419, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2136449813842773, + "rewards/margins": 6.928332805633545, + "rewards/rejected": -3.714688539505005, + "step": 18949 + }, + { + "epoch": 4.74, + "grad_norm": 7.39702033996582, + "learning_rate": 6.603191852118662e-08, + "logits/chosen": -0.5241072177886963, + "logits/rejected": -0.6240572333335876, + "logps/chosen": -62.71405792236328, + "logps/rejected": -128.06997680664062, + "loss": 0.6736, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.90427827835083, + "rewards/margins": 8.31814956665039, + "rewards/rejected": -5.413871765136719, + "step": 18950 + }, + { + "epoch": 4.74, + "grad_norm": 6.551135063171387, + "learning_rate": 6.590466316910182e-08, + "logits/chosen": -0.6350436210632324, + "logits/rejected": -0.6981544494628906, + "logps/chosen": -58.049652099609375, + "logps/rejected": -99.501708984375, + "loss": 0.6764, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8063995838165283, + "rewards/margins": 7.126468181610107, + "rewards/rejected": -4.320067882537842, + "step": 18951 + }, + { + "epoch": 4.74, + "grad_norm": 6.929734230041504, + "learning_rate": 6.577752974376128e-08, + "logits/chosen": -0.5644663572311401, + "logits/rejected": -0.618908166885376, + "logps/chosen": -45.396453857421875, + "logps/rejected": -109.57627868652344, + "loss": 0.6176, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9408481121063232, + "rewards/margins": 7.627597808837891, + "rewards/rejected": -4.686749458312988, + "step": 18952 + }, + { + "epoch": 4.74, + "grad_norm": 3.7338993549346924, + "learning_rate": 6.565051824830637e-08, + "logits/chosen": -0.5931290984153748, + "logits/rejected": -0.6117182970046997, + "logps/chosen": -41.565147399902344, + "logps/rejected": -123.98260498046875, + "loss": 0.5401, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.174558639526367, + "rewards/margins": 7.366619110107422, + "rewards/rejected": -4.192060470581055, + "step": 18953 + }, + { + "epoch": 4.74, + "grad_norm": 8.193984985351562, + "learning_rate": 6.552362868587513e-08, + "logits/chosen": -0.550879955291748, + "logits/rejected": -0.6672472953796387, + "logps/chosen": -65.10904693603516, + "logps/rejected": -108.56053924560547, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1741740703582764, + "rewards/margins": 7.442987442016602, + "rewards/rejected": -4.268813133239746, + "step": 18954 + }, + { + "epoch": 4.74, + "grad_norm": 8.090364456176758, + "learning_rate": 6.539686105960396e-08, + "logits/chosen": -0.5342807769775391, + "logits/rejected": -0.5713762044906616, + "logps/chosen": -54.20140838623047, + "logps/rejected": -127.04307556152344, + "loss": 0.641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0519444942474365, + "rewards/margins": 7.314030647277832, + "rewards/rejected": -4.262085914611816, + "step": 18955 + }, + { + "epoch": 4.74, + "grad_norm": 6.317440032958984, + "learning_rate": 6.527021537262479e-08, + "logits/chosen": -0.5145188570022583, + "logits/rejected": -0.6111188530921936, + "logps/chosen": -62.896087646484375, + "logps/rejected": -97.97211456298828, + "loss": 0.6237, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3367421627044678, + "rewards/margins": 6.6189680099487305, + "rewards/rejected": -3.282226324081421, + "step": 18956 + }, + { + "epoch": 4.74, + "grad_norm": 7.275920391082764, + "learning_rate": 6.514369162806733e-08, + "logits/chosen": -0.656279444694519, + "logits/rejected": -0.7127063274383545, + "logps/chosen": -61.11933898925781, + "logps/rejected": -121.15935516357422, + "loss": 0.6536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.012266159057617, + "rewards/margins": 7.412263870239258, + "rewards/rejected": -4.399997234344482, + "step": 18957 + }, + { + "epoch": 4.74, + "grad_norm": 5.620480060577393, + "learning_rate": 6.501728982905853e-08, + "logits/chosen": -0.558245837688446, + "logits/rejected": -0.6672638654708862, + "logps/chosen": -63.960384368896484, + "logps/rejected": -111.47654724121094, + "loss": 0.6616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6819443702697754, + "rewards/margins": 7.201103210449219, + "rewards/rejected": -4.519158363342285, + "step": 18958 + }, + { + "epoch": 4.74, + "grad_norm": 4.269885063171387, + "learning_rate": 6.489100997872144e-08, + "logits/chosen": -0.6100547909736633, + "logits/rejected": -0.6627214550971985, + "logps/chosen": -51.961891174316406, + "logps/rejected": -135.12619018554688, + "loss": 0.643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.153301954269409, + "rewards/margins": 9.171993255615234, + "rewards/rejected": -6.018691539764404, + "step": 18959 + }, + { + "epoch": 4.74, + "grad_norm": 5.341911315917969, + "learning_rate": 6.476485208017636e-08, + "logits/chosen": -0.5228801369667053, + "logits/rejected": -0.5638264417648315, + "logps/chosen": -61.32343292236328, + "logps/rejected": -106.34737396240234, + "loss": 0.7479, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0565521717071533, + "rewards/margins": 6.437891960144043, + "rewards/rejected": -3.3813395500183105, + "step": 18960 + }, + { + "epoch": 4.74, + "grad_norm": 4.303574085235596, + "learning_rate": 6.463881613654188e-08, + "logits/chosen": -0.5340129137039185, + "logits/rejected": -0.6262893676757812, + "logps/chosen": -56.91159439086914, + "logps/rejected": -116.2879638671875, + "loss": 0.5175, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.220231533050537, + "rewards/margins": 8.483684539794922, + "rewards/rejected": -5.263452529907227, + "step": 18961 + }, + { + "epoch": 4.74, + "grad_norm": 3.2255964279174805, + "learning_rate": 6.451290215093109e-08, + "logits/chosen": -0.4619993567466736, + "logits/rejected": -0.49388813972473145, + "logps/chosen": -58.70720291137695, + "logps/rejected": -115.20057678222656, + "loss": 0.6806, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2585418224334717, + "rewards/margins": 7.516056060791016, + "rewards/rejected": -4.257514476776123, + "step": 18962 + }, + { + "epoch": 4.74, + "grad_norm": 18.580297470092773, + "learning_rate": 6.438711012645593e-08, + "logits/chosen": -0.5519429445266724, + "logits/rejected": -0.6507338285446167, + "logps/chosen": -61.695186614990234, + "logps/rejected": -102.17117309570312, + "loss": 0.9678, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9963717460632324, + "rewards/margins": 6.743673324584961, + "rewards/rejected": -3.7473020553588867, + "step": 18963 + }, + { + "epoch": 4.74, + "grad_norm": 9.094446182250977, + "learning_rate": 6.426144006622615e-08, + "logits/chosen": -0.44536179304122925, + "logits/rejected": -0.5163662433624268, + "logps/chosen": -59.19969177246094, + "logps/rejected": -121.78555297851562, + "loss": 0.6548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306511640548706, + "rewards/margins": 7.405130386352539, + "rewards/rejected": -4.09861946105957, + "step": 18964 + }, + { + "epoch": 4.74, + "grad_norm": 33.07560729980469, + "learning_rate": 6.41358919733448e-08, + "logits/chosen": -0.5219665765762329, + "logits/rejected": -0.6407516002655029, + "logps/chosen": -85.71172332763672, + "logps/rejected": -117.67990112304688, + "loss": 0.6886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2093663215637207, + "rewards/margins": 7.860746383666992, + "rewards/rejected": -4.65138053894043, + "step": 18965 + }, + { + "epoch": 4.74, + "grad_norm": 7.838076591491699, + "learning_rate": 6.401046585091609e-08, + "logits/chosen": -0.5431615114212036, + "logits/rejected": -0.6190927028656006, + "logps/chosen": -55.01154327392578, + "logps/rejected": -112.75325012207031, + "loss": 0.6413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.324828624725342, + "rewards/margins": 6.724837779998779, + "rewards/rejected": -3.4000091552734375, + "step": 18966 + }, + { + "epoch": 4.74, + "grad_norm": 9.600655555725098, + "learning_rate": 6.388516170203918e-08, + "logits/chosen": -0.5392299294471741, + "logits/rejected": -0.6364929676055908, + "logps/chosen": -66.14256286621094, + "logps/rejected": -114.11624145507812, + "loss": 0.6851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1132025718688965, + "rewards/margins": 7.651062488555908, + "rewards/rejected": -4.537859916687012, + "step": 18967 + }, + { + "epoch": 4.75, + "grad_norm": 4.771237373352051, + "learning_rate": 6.375997952980995e-08, + "logits/chosen": -0.5634995698928833, + "logits/rejected": -0.6177529692649841, + "logps/chosen": -53.22663497924805, + "logps/rejected": -101.75685119628906, + "loss": 0.6573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.14141845703125, + "rewards/margins": 6.918428897857666, + "rewards/rejected": -3.777010679244995, + "step": 18968 + }, + { + "epoch": 4.75, + "grad_norm": 10.85031509399414, + "learning_rate": 6.363491933732201e-08, + "logits/chosen": -0.5603744983673096, + "logits/rejected": -0.6507219076156616, + "logps/chosen": -47.3564453125, + "logps/rejected": -98.319580078125, + "loss": 0.5686, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2118871212005615, + "rewards/margins": 7.795828342437744, + "rewards/rejected": -4.583941459655762, + "step": 18969 + }, + { + "epoch": 4.75, + "grad_norm": 3.3920745849609375, + "learning_rate": 6.350998112766626e-08, + "logits/chosen": -0.5741519927978516, + "logits/rejected": -0.6110444664955139, + "logps/chosen": -51.3541145324707, + "logps/rejected": -123.59925842285156, + "loss": 0.5604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.001613140106201, + "rewards/margins": 7.671564102172852, + "rewards/rejected": -4.669951438903809, + "step": 18970 + }, + { + "epoch": 4.75, + "grad_norm": 4.198264122009277, + "learning_rate": 6.33851649039291e-08, + "logits/chosen": -0.5518993139266968, + "logits/rejected": -0.6209318041801453, + "logps/chosen": -56.44063949584961, + "logps/rejected": -107.64857482910156, + "loss": 0.6247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1931264400482178, + "rewards/margins": 6.193378925323486, + "rewards/rejected": -3.0002520084381104, + "step": 18971 + }, + { + "epoch": 4.75, + "grad_norm": 5.743870258331299, + "learning_rate": 6.326047066919528e-08, + "logits/chosen": -0.5153408050537109, + "logits/rejected": -0.5869243741035461, + "logps/chosen": -52.79447937011719, + "logps/rejected": -98.10616302490234, + "loss": 0.5373, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9397671222686768, + "rewards/margins": 6.226860046386719, + "rewards/rejected": -3.2870936393737793, + "step": 18972 + }, + { + "epoch": 4.75, + "grad_norm": 4.018359661102295, + "learning_rate": 6.313589842654622e-08, + "logits/chosen": -0.5310467481613159, + "logits/rejected": -0.5979170799255371, + "logps/chosen": -55.637062072753906, + "logps/rejected": -124.19825744628906, + "loss": 0.6031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2750375270843506, + "rewards/margins": 8.049105644226074, + "rewards/rejected": -4.7740678787231445, + "step": 18973 + }, + { + "epoch": 4.75, + "grad_norm": 4.243714332580566, + "learning_rate": 6.301144817906057e-08, + "logits/chosen": -0.5636654496192932, + "logits/rejected": -0.6719774007797241, + "logps/chosen": -52.304222106933594, + "logps/rejected": -118.14691925048828, + "loss": 0.5218, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.360666513442993, + "rewards/margins": 8.364660263061523, + "rewards/rejected": -5.003993511199951, + "step": 18974 + }, + { + "epoch": 4.75, + "grad_norm": 16.10201072692871, + "learning_rate": 6.288711992981256e-08, + "logits/chosen": -0.651850700378418, + "logits/rejected": -0.7198337912559509, + "logps/chosen": -54.754615783691406, + "logps/rejected": -130.45950317382812, + "loss": 0.6929, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1597681045532227, + "rewards/margins": 7.674036502838135, + "rewards/rejected": -4.51426887512207, + "step": 18975 + }, + { + "epoch": 4.75, + "grad_norm": 3.268089771270752, + "learning_rate": 6.276291368187581e-08, + "logits/chosen": -0.5878520607948303, + "logits/rejected": -0.6371542811393738, + "logps/chosen": -45.50178146362305, + "logps/rejected": -121.02239227294922, + "loss": 0.5653, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.940135955810547, + "rewards/margins": 7.025685787200928, + "rewards/rejected": -4.085549831390381, + "step": 18976 + }, + { + "epoch": 4.75, + "grad_norm": 5.212127685546875, + "learning_rate": 6.263882943831845e-08, + "logits/chosen": -0.47133636474609375, + "logits/rejected": -0.5623363256454468, + "logps/chosen": -54.08081817626953, + "logps/rejected": -112.12498474121094, + "loss": 0.6006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.056363582611084, + "rewards/margins": 8.231355667114258, + "rewards/rejected": -5.174992084503174, + "step": 18977 + }, + { + "epoch": 4.75, + "grad_norm": 3.203078508377075, + "learning_rate": 6.251486720220745e-08, + "logits/chosen": -0.5877965092658997, + "logits/rejected": -0.6569782495498657, + "logps/chosen": -47.99669647216797, + "logps/rejected": -98.76882934570312, + "loss": 0.5188, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0761215686798096, + "rewards/margins": 6.4832305908203125, + "rewards/rejected": -3.407109022140503, + "step": 18978 + }, + { + "epoch": 4.75, + "grad_norm": 3.5803980827331543, + "learning_rate": 6.239102697660537e-08, + "logits/chosen": -0.4987993836402893, + "logits/rejected": -0.5914661884307861, + "logps/chosen": -54.40780258178711, + "logps/rejected": -118.94802856445312, + "loss": 0.5797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4631431102752686, + "rewards/margins": 9.1900053024292, + "rewards/rejected": -5.726861953735352, + "step": 18979 + }, + { + "epoch": 4.75, + "grad_norm": 4.827396392822266, + "learning_rate": 6.226730876457366e-08, + "logits/chosen": -0.5949545502662659, + "logits/rejected": -0.6857615113258362, + "logps/chosen": -49.63100814819336, + "logps/rejected": -135.23757934570312, + "loss": 0.5722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2432327270507812, + "rewards/margins": 9.373991012573242, + "rewards/rejected": -6.1307573318481445, + "step": 18980 + }, + { + "epoch": 4.75, + "grad_norm": 7.309849262237549, + "learning_rate": 6.214371256916873e-08, + "logits/chosen": -0.5546420216560364, + "logits/rejected": -0.6215554475784302, + "logps/chosen": -64.99824523925781, + "logps/rejected": -94.79800415039062, + "loss": 0.7365, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.441023349761963, + "rewards/margins": 6.8034820556640625, + "rewards/rejected": -3.3624589443206787, + "step": 18981 + }, + { + "epoch": 4.75, + "grad_norm": 4.035742282867432, + "learning_rate": 6.202023839344429e-08, + "logits/chosen": -0.5689383745193481, + "logits/rejected": -0.6822097301483154, + "logps/chosen": -61.173919677734375, + "logps/rejected": -105.00076293945312, + "loss": 0.6621, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.107588291168213, + "rewards/margins": 7.292172431945801, + "rewards/rejected": -4.18458366394043, + "step": 18982 + }, + { + "epoch": 4.75, + "grad_norm": 4.633169174194336, + "learning_rate": 6.189688624045231e-08, + "logits/chosen": -0.4816860556602478, + "logits/rejected": -0.6012353897094727, + "logps/chosen": -58.21278381347656, + "logps/rejected": -93.41248321533203, + "loss": 0.6798, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.105694532394409, + "rewards/margins": 6.7767415046691895, + "rewards/rejected": -3.6710469722747803, + "step": 18983 + }, + { + "epoch": 4.75, + "grad_norm": 4.607388496398926, + "learning_rate": 6.177365611324038e-08, + "logits/chosen": -0.4726530909538269, + "logits/rejected": -0.5743445158004761, + "logps/chosen": -65.32095336914062, + "logps/rejected": -115.24638366699219, + "loss": 0.6495, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.062458038330078, + "rewards/margins": 7.7600226402282715, + "rewards/rejected": -4.697565078735352, + "step": 18984 + }, + { + "epoch": 4.75, + "grad_norm": 3.9392752647399902, + "learning_rate": 6.165054801485437e-08, + "logits/chosen": -0.6304175853729248, + "logits/rejected": -0.7122601866722107, + "logps/chosen": -50.36503601074219, + "logps/rejected": -103.410888671875, + "loss": 0.5703, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9708497524261475, + "rewards/margins": 7.533742904663086, + "rewards/rejected": -4.562893390655518, + "step": 18985 + }, + { + "epoch": 4.75, + "grad_norm": 5.723968505859375, + "learning_rate": 6.152756194833576e-08, + "logits/chosen": -0.5608083605766296, + "logits/rejected": -0.6276189088821411, + "logps/chosen": -60.977378845214844, + "logps/rejected": -114.7178955078125, + "loss": 0.6013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2364203929901123, + "rewards/margins": 7.371522903442383, + "rewards/rejected": -4.135102272033691, + "step": 18986 + }, + { + "epoch": 4.75, + "grad_norm": 3.8818519115448, + "learning_rate": 6.140469791672376e-08, + "logits/chosen": -0.5385138392448425, + "logits/rejected": -0.6519413590431213, + "logps/chosen": -60.990596771240234, + "logps/rejected": -97.2470474243164, + "loss": 0.5601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6477341651916504, + "rewards/margins": 6.834287166595459, + "rewards/rejected": -4.18655252456665, + "step": 18987 + }, + { + "epoch": 4.75, + "grad_norm": 8.87642765045166, + "learning_rate": 6.12819559230543e-08, + "logits/chosen": -0.5199902057647705, + "logits/rejected": -0.6321690082550049, + "logps/chosen": -56.221229553222656, + "logps/rejected": -119.25210571289062, + "loss": 0.6408, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.897214651107788, + "rewards/margins": 8.489941596984863, + "rewards/rejected": -5.592726707458496, + "step": 18988 + }, + { + "epoch": 4.75, + "grad_norm": 9.644851684570312, + "learning_rate": 6.115933597036162e-08, + "logits/chosen": -0.5831735134124756, + "logits/rejected": -0.670551598072052, + "logps/chosen": -63.64921569824219, + "logps/rejected": -106.84603881835938, + "loss": 0.8135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0236942768096924, + "rewards/margins": 6.964810371398926, + "rewards/rejected": -3.9411160945892334, + "step": 18989 + }, + { + "epoch": 4.75, + "grad_norm": 6.877015590667725, + "learning_rate": 6.103683806167438e-08, + "logits/chosen": -0.4595297574996948, + "logits/rejected": -0.5909775495529175, + "logps/chosen": -66.43161010742188, + "logps/rejected": -102.9673843383789, + "loss": 0.6261, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9639129638671875, + "rewards/margins": 7.360232830047607, + "rewards/rejected": -4.39631986618042, + "step": 18990 + }, + { + "epoch": 4.75, + "grad_norm": 12.242820739746094, + "learning_rate": 6.091446220002018e-08, + "logits/chosen": -0.5703929662704468, + "logits/rejected": -0.6405411958694458, + "logps/chosen": -54.55061340332031, + "logps/rejected": -106.92510986328125, + "loss": 0.5602, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2042346000671387, + "rewards/margins": 7.284772872924805, + "rewards/rejected": -4.080538272857666, + "step": 18991 + }, + { + "epoch": 4.75, + "grad_norm": 2.4801270961761475, + "learning_rate": 6.079220838842381e-08, + "logits/chosen": -0.5181124210357666, + "logits/rejected": -0.606507420539856, + "logps/chosen": -54.34300994873047, + "logps/rejected": -100.88214111328125, + "loss": 0.526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0784008502960205, + "rewards/margins": 7.157991409301758, + "rewards/rejected": -4.079590797424316, + "step": 18992 + }, + { + "epoch": 4.75, + "grad_norm": 6.353078365325928, + "learning_rate": 6.067007662990455e-08, + "logits/chosen": -0.6012593507766724, + "logits/rejected": -0.6552920341491699, + "logps/chosen": -61.33580017089844, + "logps/rejected": -109.02429962158203, + "loss": 0.6085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.163257360458374, + "rewards/margins": 7.243013381958008, + "rewards/rejected": -4.079755783081055, + "step": 18993 + }, + { + "epoch": 4.75, + "grad_norm": 5.263962268829346, + "learning_rate": 6.05480669274816e-08, + "logits/chosen": -0.4612913727760315, + "logits/rejected": -0.5082655549049377, + "logps/chosen": -76.3830337524414, + "logps/rejected": -113.81166076660156, + "loss": 0.7004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8204352855682373, + "rewards/margins": 6.905345916748047, + "rewards/rejected": -4.084911823272705, + "step": 18994 + }, + { + "epoch": 4.75, + "grad_norm": 4.036675930023193, + "learning_rate": 6.042617928417038e-08, + "logits/chosen": -0.5765846967697144, + "logits/rejected": -0.6327681541442871, + "logps/chosen": -45.02417755126953, + "logps/rejected": -126.35828399658203, + "loss": 0.5547, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.289414882659912, + "rewards/margins": 8.54832649230957, + "rewards/rejected": -5.2589111328125, + "step": 18995 + }, + { + "epoch": 4.75, + "grad_norm": 3.4740021228790283, + "learning_rate": 6.030441370298178e-08, + "logits/chosen": -0.5103055238723755, + "logits/rejected": -0.6244674324989319, + "logps/chosen": -50.39349365234375, + "logps/rejected": -106.8824234008789, + "loss": 0.538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0433225631713867, + "rewards/margins": 7.776839256286621, + "rewards/rejected": -4.733517169952393, + "step": 18996 + }, + { + "epoch": 4.75, + "grad_norm": 1.6554917097091675, + "learning_rate": 6.018277018692509e-08, + "logits/chosen": -0.5007920861244202, + "logits/rejected": -0.6062588095664978, + "logps/chosen": -61.99257278442383, + "logps/rejected": -99.34866333007812, + "loss": 0.5789, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1843008995056152, + "rewards/margins": 7.361382961273193, + "rewards/rejected": -4.17708158493042, + "step": 18997 + }, + { + "epoch": 4.75, + "grad_norm": 3.9524989128112793, + "learning_rate": 6.006124873900676e-08, + "logits/chosen": -0.6246510148048401, + "logits/rejected": -0.68202805519104, + "logps/chosen": -53.373722076416016, + "logps/rejected": -126.48886108398438, + "loss": 0.629, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3104143142700195, + "rewards/margins": 9.356436729431152, + "rewards/rejected": -6.046022415161133, + "step": 18998 + }, + { + "epoch": 4.75, + "grad_norm": 6.6111836433410645, + "learning_rate": 5.993984936222885e-08, + "logits/chosen": -0.5230532884597778, + "logits/rejected": -0.6005125045776367, + "logps/chosen": -58.03788375854492, + "logps/rejected": -105.27654266357422, + "loss": 0.6782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4217336177825928, + "rewards/margins": 6.714873313903809, + "rewards/rejected": -3.293140411376953, + "step": 18999 + }, + { + "epoch": 4.75, + "grad_norm": 23.625957489013672, + "learning_rate": 5.981857205959174e-08, + "logits/chosen": -0.5372657179832458, + "logits/rejected": -0.5971941351890564, + "logps/chosen": -56.659767150878906, + "logps/rejected": -111.40464782714844, + "loss": 0.7851, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.772230386734009, + "rewards/margins": 5.881959915161133, + "rewards/rejected": -3.109729528427124, + "step": 19000 + }, + { + "epoch": 4.75, + "grad_norm": 7.589507579803467, + "learning_rate": 5.969741683409302e-08, + "logits/chosen": -0.4744301736354828, + "logits/rejected": -0.5858141779899597, + "logps/chosen": -59.96535110473633, + "logps/rejected": -111.98995971679688, + "loss": 0.642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0629141330718994, + "rewards/margins": 8.005229949951172, + "rewards/rejected": -4.942315578460693, + "step": 19001 + }, + { + "epoch": 4.75, + "grad_norm": 15.101367950439453, + "learning_rate": 5.957638368872531e-08, + "logits/chosen": -0.5303087830543518, + "logits/rejected": -0.6165425777435303, + "logps/chosen": -53.78295135498047, + "logps/rejected": -100.29039764404297, + "loss": 0.5729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3091204166412354, + "rewards/margins": 6.301495552062988, + "rewards/rejected": -2.992375373840332, + "step": 19002 + }, + { + "epoch": 4.75, + "grad_norm": 8.977278709411621, + "learning_rate": 5.94554726264801e-08, + "logits/chosen": -0.47699373960494995, + "logits/rejected": -0.5446643829345703, + "logps/chosen": -49.87952423095703, + "logps/rejected": -138.968994140625, + "loss": 0.6706, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.105316638946533, + "rewards/margins": 8.119029998779297, + "rewards/rejected": -5.013713836669922, + "step": 19003 + }, + { + "epoch": 4.75, + "grad_norm": 4.4427103996276855, + "learning_rate": 5.9334683650345005e-08, + "logits/chosen": -0.5581857562065125, + "logits/rejected": -0.6508335471153259, + "logps/chosen": -54.02153396606445, + "logps/rejected": -98.29917907714844, + "loss": 0.6169, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.938387632369995, + "rewards/margins": 6.841863632202148, + "rewards/rejected": -3.9034759998321533, + "step": 19004 + }, + { + "epoch": 4.75, + "grad_norm": 5.854193210601807, + "learning_rate": 5.92140167633054e-08, + "logits/chosen": -0.5457269549369812, + "logits/rejected": -0.6509503126144409, + "logps/chosen": -50.84831237792969, + "logps/rejected": -109.77982330322266, + "loss": 0.5509, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.074652671813965, + "rewards/margins": 7.97623348236084, + "rewards/rejected": -4.901580810546875, + "step": 19005 + }, + { + "epoch": 4.75, + "grad_norm": 5.890868663787842, + "learning_rate": 5.909347196834225e-08, + "logits/chosen": -0.5131417512893677, + "logits/rejected": -0.6002695560455322, + "logps/chosen": -59.266937255859375, + "logps/rejected": -102.1087417602539, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8370754718780518, + "rewards/margins": 6.9622650146484375, + "rewards/rejected": -4.125189781188965, + "step": 19006 + }, + { + "epoch": 4.75, + "grad_norm": 2.072277307510376, + "learning_rate": 5.897304926843539e-08, + "logits/chosen": -0.5663926601409912, + "logits/rejected": -0.6493797302246094, + "logps/chosen": -51.4778938293457, + "logps/rejected": -109.27372741699219, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3634560108184814, + "rewards/margins": 7.910333156585693, + "rewards/rejected": -4.546876907348633, + "step": 19007 + }, + { + "epoch": 4.76, + "grad_norm": 4.367226600646973, + "learning_rate": 5.885274866655966e-08, + "logits/chosen": -0.5658689737319946, + "logits/rejected": -0.6060863137245178, + "logps/chosen": -47.80529022216797, + "logps/rejected": -95.21969604492188, + "loss": 0.6093, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9805192947387695, + "rewards/margins": 6.636401653289795, + "rewards/rejected": -3.6558828353881836, + "step": 19008 + }, + { + "epoch": 4.76, + "grad_norm": 1.967654824256897, + "learning_rate": 5.8732570165687674e-08, + "logits/chosen": -0.5668106079101562, + "logits/rejected": -0.6261273622512817, + "logps/chosen": -45.6444091796875, + "logps/rejected": -100.2691650390625, + "loss": 0.5419, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2104806900024414, + "rewards/margins": 7.571983814239502, + "rewards/rejected": -4.361502647399902, + "step": 19009 + }, + { + "epoch": 4.76, + "grad_norm": 5.131523132324219, + "learning_rate": 5.86125137687904e-08, + "logits/chosen": -0.6241145730018616, + "logits/rejected": -0.6690677404403687, + "logps/chosen": -54.90489196777344, + "logps/rejected": -108.38036346435547, + "loss": 0.6103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0840063095092773, + "rewards/margins": 6.818850040435791, + "rewards/rejected": -3.7348437309265137, + "step": 19010 + }, + { + "epoch": 4.76, + "grad_norm": 5.052120208740234, + "learning_rate": 5.8492579478833245e-08, + "logits/chosen": -0.5561147332191467, + "logits/rejected": -0.6904206275939941, + "logps/chosen": -59.42736053466797, + "logps/rejected": -95.53474426269531, + "loss": 0.6362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.082956075668335, + "rewards/margins": 7.4541425704956055, + "rewards/rejected": -4.371186256408691, + "step": 19011 + }, + { + "epoch": 4.76, + "grad_norm": 6.674008846282959, + "learning_rate": 5.83727672987805e-08, + "logits/chosen": -0.45380356907844543, + "logits/rejected": -0.5467742085456848, + "logps/chosen": -55.73383712768555, + "logps/rejected": -103.78421020507812, + "loss": 0.557, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.187812566757202, + "rewards/margins": 7.098438262939453, + "rewards/rejected": -3.91062593460083, + "step": 19012 + }, + { + "epoch": 4.76, + "grad_norm": 5.125278472900391, + "learning_rate": 5.825307723159312e-08, + "logits/chosen": -0.5158817768096924, + "logits/rejected": -0.6421509385108948, + "logps/chosen": -53.02174377441406, + "logps/rejected": -125.22280883789062, + "loss": 0.5131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.248154401779175, + "rewards/margins": 9.773077011108398, + "rewards/rejected": -6.524921417236328, + "step": 19013 + }, + { + "epoch": 4.76, + "grad_norm": 2.7729506492614746, + "learning_rate": 5.813350928022876e-08, + "logits/chosen": -0.4539567530155182, + "logits/rejected": -0.553234338760376, + "logps/chosen": -61.84199523925781, + "logps/rejected": -119.7674560546875, + "loss": 0.676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0822601318359375, + "rewards/margins": 8.667941093444824, + "rewards/rejected": -5.5856804847717285, + "step": 19014 + }, + { + "epoch": 4.76, + "grad_norm": 4.376247882843018, + "learning_rate": 5.801406344764115e-08, + "logits/chosen": -0.6014282703399658, + "logits/rejected": -0.6733261942863464, + "logps/chosen": -47.908687591552734, + "logps/rejected": -99.93965148925781, + "loss": 0.572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3802764415740967, + "rewards/margins": 7.164015769958496, + "rewards/rejected": -3.7837395668029785, + "step": 19015 + }, + { + "epoch": 4.76, + "grad_norm": 3.1575539112091064, + "learning_rate": 5.789473973678294e-08, + "logits/chosen": -0.5945717096328735, + "logits/rejected": -0.6430848836898804, + "logps/chosen": -65.64859008789062, + "logps/rejected": -121.05791473388672, + "loss": 0.6633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2539308071136475, + "rewards/margins": 7.816197395324707, + "rewards/rejected": -4.562266826629639, + "step": 19016 + }, + { + "epoch": 4.76, + "grad_norm": 7.594664096832275, + "learning_rate": 5.777553815060233e-08, + "logits/chosen": -0.5177240967750549, + "logits/rejected": -0.5488353371620178, + "logps/chosen": -64.18229675292969, + "logps/rejected": -114.44058227539062, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.425600051879883, + "rewards/margins": 6.889108180999756, + "rewards/rejected": -3.463508367538452, + "step": 19017 + }, + { + "epoch": 4.76, + "grad_norm": 7.523104190826416, + "learning_rate": 5.765645869204473e-08, + "logits/chosen": -0.6721220016479492, + "logits/rejected": -0.7503616809844971, + "logps/chosen": -42.66982650756836, + "logps/rejected": -107.47179412841797, + "loss": 0.6626, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2153093814849854, + "rewards/margins": 7.566073417663574, + "rewards/rejected": -4.35076379776001, + "step": 19018 + }, + { + "epoch": 4.76, + "grad_norm": 3.2039742469787598, + "learning_rate": 5.7537501364053337e-08, + "logits/chosen": -0.5810757875442505, + "logits/rejected": -0.6526244282722473, + "logps/chosen": -65.0097885131836, + "logps/rejected": -118.6861572265625, + "loss": 0.6585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.976524829864502, + "rewards/margins": 8.136601448059082, + "rewards/rejected": -5.160076141357422, + "step": 19019 + }, + { + "epoch": 4.76, + "grad_norm": 2.9639945030212402, + "learning_rate": 5.741866616956693e-08, + "logits/chosen": -0.5531691312789917, + "logits/rejected": -0.6138260364532471, + "logps/chosen": -58.76725769042969, + "logps/rejected": -119.265625, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.019906997680664, + "rewards/margins": 7.345970153808594, + "rewards/rejected": -4.326063632965088, + "step": 19020 + }, + { + "epoch": 4.76, + "grad_norm": 4.662352085113525, + "learning_rate": 5.7299953111522586e-08, + "logits/chosen": -0.5088253617286682, + "logits/rejected": -0.5624227523803711, + "logps/chosen": -58.53001403808594, + "logps/rejected": -100.33776092529297, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.333134651184082, + "rewards/margins": 7.345741271972656, + "rewards/rejected": -4.012606620788574, + "step": 19021 + }, + { + "epoch": 4.76, + "grad_norm": 9.369949340820312, + "learning_rate": 5.718136219285353e-08, + "logits/chosen": -0.44796571135520935, + "logits/rejected": -0.530039370059967, + "logps/chosen": -56.818206787109375, + "logps/rejected": -106.7217025756836, + "loss": 0.6679, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9093902111053467, + "rewards/margins": 6.588564395904541, + "rewards/rejected": -3.6791739463806152, + "step": 19022 + }, + { + "epoch": 4.76, + "grad_norm": 2.6576929092407227, + "learning_rate": 5.706289341649074e-08, + "logits/chosen": -0.6569530367851257, + "logits/rejected": -0.7492992877960205, + "logps/chosen": -40.89772415161133, + "logps/rejected": -95.5861587524414, + "loss": 0.5499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4301533699035645, + "rewards/margins": 7.961266994476318, + "rewards/rejected": -4.531114101409912, + "step": 19023 + }, + { + "epoch": 4.76, + "grad_norm": 5.873066425323486, + "learning_rate": 5.694454678536132e-08, + "logits/chosen": -0.5341627597808838, + "logits/rejected": -0.6003291606903076, + "logps/chosen": -58.83064270019531, + "logps/rejected": -100.30355834960938, + "loss": 0.6645, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9921469688415527, + "rewards/margins": 5.809933662414551, + "rewards/rejected": -2.817786693572998, + "step": 19024 + }, + { + "epoch": 4.76, + "grad_norm": 4.69780158996582, + "learning_rate": 5.68263223023896e-08, + "logits/chosen": -0.6047621369361877, + "logits/rejected": -0.6422092914581299, + "logps/chosen": -57.55058670043945, + "logps/rejected": -118.16677856445312, + "loss": 0.6184, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.338860034942627, + "rewards/margins": 7.603503704071045, + "rewards/rejected": -4.264643669128418, + "step": 19025 + }, + { + "epoch": 4.76, + "grad_norm": 2.376688241958618, + "learning_rate": 5.670821997049769e-08, + "logits/chosen": -0.5773293375968933, + "logits/rejected": -0.664928138256073, + "logps/chosen": -58.24054718017578, + "logps/rejected": -118.77193450927734, + "loss": 0.639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.406162738800049, + "rewards/margins": 8.347726821899414, + "rewards/rejected": -4.941564083099365, + "step": 19026 + }, + { + "epoch": 4.76, + "grad_norm": 3.362811803817749, + "learning_rate": 5.659023979260325e-08, + "logits/chosen": -0.5798174738883972, + "logits/rejected": -0.6225153207778931, + "logps/chosen": -52.63592529296875, + "logps/rejected": -115.42190551757812, + "loss": 0.6083, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.92100191116333, + "rewards/margins": 7.46075963973999, + "rewards/rejected": -4.539757251739502, + "step": 19027 + }, + { + "epoch": 4.76, + "grad_norm": 4.503373622894287, + "learning_rate": 5.647238177162229e-08, + "logits/chosen": -0.5270073413848877, + "logits/rejected": -0.604715883731842, + "logps/chosen": -55.93867492675781, + "logps/rejected": -109.51228332519531, + "loss": 0.5872, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5570597648620605, + "rewards/margins": 6.446902751922607, + "rewards/rejected": -3.8898425102233887, + "step": 19028 + }, + { + "epoch": 4.76, + "grad_norm": 3.188176393508911, + "learning_rate": 5.635464591046691e-08, + "logits/chosen": -0.5352659225463867, + "logits/rejected": -0.6223036646842957, + "logps/chosen": -57.30186080932617, + "logps/rejected": -96.6769027709961, + "loss": 0.6154, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.164496898651123, + "rewards/margins": 7.036043167114258, + "rewards/rejected": -3.8715460300445557, + "step": 19029 + }, + { + "epoch": 4.76, + "grad_norm": 22.473215103149414, + "learning_rate": 5.62370322120459e-08, + "logits/chosen": -0.5342099070549011, + "logits/rejected": -0.5926172733306885, + "logps/chosen": -82.88043212890625, + "logps/rejected": -111.71006774902344, + "loss": 0.7636, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1654605865478516, + "rewards/margins": 6.979036331176758, + "rewards/rejected": -3.813575267791748, + "step": 19030 + }, + { + "epoch": 4.76, + "grad_norm": 3.5000622272491455, + "learning_rate": 5.611954067926695e-08, + "logits/chosen": -0.546354353427887, + "logits/rejected": -0.6541389226913452, + "logps/chosen": -44.772804260253906, + "logps/rejected": -94.98077392578125, + "loss": 0.6154, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2535908222198486, + "rewards/margins": 7.936189651489258, + "rewards/rejected": -4.68259859085083, + "step": 19031 + }, + { + "epoch": 4.76, + "grad_norm": 9.792022705078125, + "learning_rate": 5.6002171315032715e-08, + "logits/chosen": -0.5956375598907471, + "logits/rejected": -0.6862494945526123, + "logps/chosen": -52.72511672973633, + "logps/rejected": -119.95801544189453, + "loss": 0.576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.232295513153076, + "rewards/margins": 8.011383056640625, + "rewards/rejected": -4.779087543487549, + "step": 19032 + }, + { + "epoch": 4.76, + "grad_norm": 3.422013282775879, + "learning_rate": 5.588492412224311e-08, + "logits/chosen": -0.6527803540229797, + "logits/rejected": -0.7586004734039307, + "logps/chosen": -47.79264831542969, + "logps/rejected": -107.54290771484375, + "loss": 0.5163, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2925336360931396, + "rewards/margins": 8.755192756652832, + "rewards/rejected": -5.462660312652588, + "step": 19033 + }, + { + "epoch": 4.76, + "grad_norm": 3.6010491847991943, + "learning_rate": 5.5767799103795813e-08, + "logits/chosen": -0.5034757256507874, + "logits/rejected": -0.5854309797286987, + "logps/chosen": -42.44401168823242, + "logps/rejected": -116.65404510498047, + "loss": 0.4953, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9903998374938965, + "rewards/margins": 8.409912109375, + "rewards/rejected": -5.4195122718811035, + "step": 19034 + }, + { + "epoch": 4.76, + "grad_norm": 4.6096649169921875, + "learning_rate": 5.565079626258574e-08, + "logits/chosen": -0.5901647806167603, + "logits/rejected": -0.6583223938941956, + "logps/chosen": -51.17485046386719, + "logps/rejected": -103.67835998535156, + "loss": 0.6201, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1206860542297363, + "rewards/margins": 6.535983562469482, + "rewards/rejected": -3.415297508239746, + "step": 19035 + }, + { + "epoch": 4.76, + "grad_norm": 4.273758888244629, + "learning_rate": 5.553391560150279e-08, + "logits/chosen": -0.5789753794670105, + "logits/rejected": -0.6295670866966248, + "logps/chosen": -39.35872268676758, + "logps/rejected": -122.8929443359375, + "loss": 0.5324, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176276683807373, + "rewards/margins": 7.8888115882873535, + "rewards/rejected": -4.7125349044799805, + "step": 19036 + }, + { + "epoch": 4.76, + "grad_norm": 9.761504173278809, + "learning_rate": 5.541715712343576e-08, + "logits/chosen": -0.5033981800079346, + "logits/rejected": -0.5888037085533142, + "logps/chosen": -48.24480438232422, + "logps/rejected": -115.76948547363281, + "loss": 0.5566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0364484786987305, + "rewards/margins": 7.0276312828063965, + "rewards/rejected": -3.991183280944824, + "step": 19037 + }, + { + "epoch": 4.76, + "grad_norm": 6.59414005279541, + "learning_rate": 5.530052083127069e-08, + "logits/chosen": -0.5506336092948914, + "logits/rejected": -0.6401228904724121, + "logps/chosen": -50.76363754272461, + "logps/rejected": -101.76903533935547, + "loss": 0.6475, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.907642364501953, + "rewards/margins": 7.331307411193848, + "rewards/rejected": -4.4236650466918945, + "step": 19038 + }, + { + "epoch": 4.76, + "grad_norm": 4.422689914703369, + "learning_rate": 5.518400672788915e-08, + "logits/chosen": -0.5637596249580383, + "logits/rejected": -0.6258758902549744, + "logps/chosen": -56.977603912353516, + "logps/rejected": -100.759033203125, + "loss": 0.7129, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1161789894104004, + "rewards/margins": 5.68947696685791, + "rewards/rejected": -2.573298215866089, + "step": 19039 + }, + { + "epoch": 4.76, + "grad_norm": 4.072452545166016, + "learning_rate": 5.5067614816169955e-08, + "logits/chosen": -0.5692676901817322, + "logits/rejected": -0.6367031335830688, + "logps/chosen": -55.337955474853516, + "logps/rejected": -106.56414794921875, + "loss": 0.6558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1733880043029785, + "rewards/margins": 6.817436218261719, + "rewards/rejected": -3.644047498703003, + "step": 19040 + }, + { + "epoch": 4.76, + "grad_norm": 2.139822244644165, + "learning_rate": 5.495134509899025e-08, + "logits/chosen": -0.5254783630371094, + "logits/rejected": -0.5824596881866455, + "logps/chosen": -48.08351135253906, + "logps/rejected": -132.12646484375, + "loss": 0.5161, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3208556175231934, + "rewards/margins": 8.89638900756836, + "rewards/rejected": -5.575533866882324, + "step": 19041 + }, + { + "epoch": 4.76, + "grad_norm": 6.21522855758667, + "learning_rate": 5.4835197579222174e-08, + "logits/chosen": -0.5215775370597839, + "logits/rejected": -0.5900949239730835, + "logps/chosen": -59.85661315917969, + "logps/rejected": -120.67964172363281, + "loss": 0.6858, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.041646957397461, + "rewards/margins": 7.563612937927246, + "rewards/rejected": -4.521965980529785, + "step": 19042 + }, + { + "epoch": 4.76, + "grad_norm": 5.129444122314453, + "learning_rate": 5.471917225973622e-08, + "logits/chosen": -0.5799879431724548, + "logits/rejected": -0.7019665241241455, + "logps/chosen": -77.0403823852539, + "logps/rejected": -109.04225158691406, + "loss": 0.7131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2253456115722656, + "rewards/margins": 7.800284385681152, + "rewards/rejected": -4.574939727783203, + "step": 19043 + }, + { + "epoch": 4.76, + "grad_norm": 11.0267333984375, + "learning_rate": 5.460326914340008e-08, + "logits/chosen": -0.5807672142982483, + "logits/rejected": -0.6556885242462158, + "logps/chosen": -45.795989990234375, + "logps/rejected": -94.76953887939453, + "loss": 0.5846, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.099003314971924, + "rewards/margins": 6.232564926147461, + "rewards/rejected": -3.133561849594116, + "step": 19044 + }, + { + "epoch": 4.76, + "grad_norm": 5.2882232666015625, + "learning_rate": 5.448748823307648e-08, + "logits/chosen": -0.5005028247833252, + "logits/rejected": -0.6282498836517334, + "logps/chosen": -54.67510223388672, + "logps/rejected": -109.748046875, + "loss": 0.6882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9798548221588135, + "rewards/margins": 7.847057342529297, + "rewards/rejected": -4.867203235626221, + "step": 19045 + }, + { + "epoch": 4.76, + "grad_norm": 2.4385244846343994, + "learning_rate": 5.437182953162812e-08, + "logits/chosen": -0.4879204034805298, + "logits/rejected": -0.5903840661048889, + "logps/chosen": -61.29525375366211, + "logps/rejected": -121.77515411376953, + "loss": 0.646, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8080544471740723, + "rewards/margins": 8.203694343566895, + "rewards/rejected": -5.395639419555664, + "step": 19046 + }, + { + "epoch": 4.76, + "grad_norm": 6.301530361175537, + "learning_rate": 5.4256293041912153e-08, + "logits/chosen": -0.5455222129821777, + "logits/rejected": -0.5937354564666748, + "logps/chosen": -51.5510139465332, + "logps/rejected": -103.26023864746094, + "loss": 0.6771, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.094011068344116, + "rewards/margins": 7.527133464813232, + "rewards/rejected": -4.433122634887695, + "step": 19047 + }, + { + "epoch": 4.77, + "grad_norm": 44.29450225830078, + "learning_rate": 5.414087876678298e-08, + "logits/chosen": -0.5694127082824707, + "logits/rejected": -0.5962158441543579, + "logps/chosen": -53.334068298339844, + "logps/rejected": -91.73754119873047, + "loss": 0.7057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3353686332702637, + "rewards/margins": 5.65435266494751, + "rewards/rejected": -2.318984031677246, + "step": 19048 + }, + { + "epoch": 4.77, + "grad_norm": 5.067171573638916, + "learning_rate": 5.402558670909386e-08, + "logits/chosen": -0.5379883050918579, + "logits/rejected": -0.6300643682479858, + "logps/chosen": -57.044010162353516, + "logps/rejected": -101.33042907714844, + "loss": 0.644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.476729154586792, + "rewards/margins": 6.772463798522949, + "rewards/rejected": -3.2957348823547363, + "step": 19049 + }, + { + "epoch": 4.77, + "grad_norm": 16.71772575378418, + "learning_rate": 5.391041687169363e-08, + "logits/chosen": -0.5177886486053467, + "logits/rejected": -0.5689871311187744, + "logps/chosen": -52.411075592041016, + "logps/rejected": -104.56153869628906, + "loss": 0.7482, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8527727127075195, + "rewards/margins": 5.635763645172119, + "rewards/rejected": -2.7829906940460205, + "step": 19050 + }, + { + "epoch": 4.77, + "grad_norm": 3.6386187076568604, + "learning_rate": 5.379536925742723e-08, + "logits/chosen": -0.5547122359275818, + "logits/rejected": -0.5819540619850159, + "logps/chosen": -58.606746673583984, + "logps/rejected": -116.12374877929688, + "loss": 0.6667, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.07914400100708, + "rewards/margins": 7.391478061676025, + "rewards/rejected": -4.3123345375061035, + "step": 19051 + }, + { + "epoch": 4.77, + "grad_norm": 15.116386413574219, + "learning_rate": 5.368044386913851e-08, + "logits/chosen": -0.5478752851486206, + "logits/rejected": -0.6379035711288452, + "logps/chosen": -54.98417282104492, + "logps/rejected": -96.74209594726562, + "loss": 0.6013, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.450653553009033, + "rewards/margins": 6.2425642013549805, + "rewards/rejected": -2.7919111251831055, + "step": 19052 + }, + { + "epoch": 4.77, + "grad_norm": 7.791775703430176, + "learning_rate": 5.356564070966741e-08, + "logits/chosen": -0.548330545425415, + "logits/rejected": -0.6306321024894714, + "logps/chosen": -53.45087432861328, + "logps/rejected": -115.69160461425781, + "loss": 0.6856, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2255756855010986, + "rewards/margins": 7.761859893798828, + "rewards/rejected": -4.536284446716309, + "step": 19053 + }, + { + "epoch": 4.77, + "grad_norm": 9.561216354370117, + "learning_rate": 5.345095978185e-08, + "logits/chosen": -0.6421679854393005, + "logits/rejected": -0.7141742706298828, + "logps/chosen": -58.07963562011719, + "logps/rejected": -110.61811828613281, + "loss": 0.6757, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8997538089752197, + "rewards/margins": 8.145730972290039, + "rewards/rejected": -5.245976448059082, + "step": 19054 + }, + { + "epoch": 4.77, + "grad_norm": 7.15956974029541, + "learning_rate": 5.333640108852123e-08, + "logits/chosen": -0.5982746481895447, + "logits/rejected": -0.6604156494140625, + "logps/chosen": -53.078582763671875, + "logps/rejected": -116.83173370361328, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.916186571121216, + "rewards/margins": 8.110891342163086, + "rewards/rejected": -5.194705963134766, + "step": 19055 + }, + { + "epoch": 4.77, + "grad_norm": 2.1943347454071045, + "learning_rate": 5.3221964632511616e-08, + "logits/chosen": -0.5814042091369629, + "logits/rejected": -0.6581552028656006, + "logps/chosen": -49.94475173950195, + "logps/rejected": -119.53977966308594, + "loss": 0.5366, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1741650104522705, + "rewards/margins": 8.585942268371582, + "rewards/rejected": -5.411777496337891, + "step": 19056 + }, + { + "epoch": 4.77, + "grad_norm": 3.622748613357544, + "learning_rate": 5.310765041664889e-08, + "logits/chosen": -0.5661899447441101, + "logits/rejected": -0.6164290308952332, + "logps/chosen": -58.039283752441406, + "logps/rejected": -104.81413269042969, + "loss": 0.6409, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8392436504364014, + "rewards/margins": 6.311058044433594, + "rewards/rejected": -3.4718141555786133, + "step": 19057 + }, + { + "epoch": 4.77, + "grad_norm": 3.683444023132324, + "learning_rate": 5.2993458443757475e-08, + "logits/chosen": -0.574121356010437, + "logits/rejected": -0.6604002714157104, + "logps/chosen": -54.99747085571289, + "logps/rejected": -104.06752014160156, + "loss": 0.5905, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0128297805786133, + "rewards/margins": 6.7878851890563965, + "rewards/rejected": -3.775055170059204, + "step": 19058 + }, + { + "epoch": 4.77, + "grad_norm": 6.430765628814697, + "learning_rate": 5.287938871665954e-08, + "logits/chosen": -0.6000418663024902, + "logits/rejected": -0.6534484624862671, + "logps/chosen": -52.86077880859375, + "logps/rejected": -109.30000305175781, + "loss": 0.6642, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.093362331390381, + "rewards/margins": 6.085283279418945, + "rewards/rejected": -2.9919209480285645, + "step": 19059 + }, + { + "epoch": 4.77, + "grad_norm": 18.676984786987305, + "learning_rate": 5.276544123817451e-08, + "logits/chosen": -0.49928101897239685, + "logits/rejected": -0.5593807101249695, + "logps/chosen": -57.36140441894531, + "logps/rejected": -118.64730072021484, + "loss": 0.7046, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0460243225097656, + "rewards/margins": 7.346686840057373, + "rewards/rejected": -4.300662994384766, + "step": 19060 + }, + { + "epoch": 4.77, + "grad_norm": 6.1366753578186035, + "learning_rate": 5.265161601111679e-08, + "logits/chosen": -0.5802870988845825, + "logits/rejected": -0.6608091592788696, + "logps/chosen": -59.30647277832031, + "logps/rejected": -118.8136978149414, + "loss": 0.623, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.868502140045166, + "rewards/margins": 8.340705871582031, + "rewards/rejected": -5.472203731536865, + "step": 19061 + }, + { + "epoch": 4.77, + "grad_norm": 2.3852522373199463, + "learning_rate": 5.253791303830025e-08, + "logits/chosen": -0.6236934065818787, + "logits/rejected": -0.7463778257369995, + "logps/chosen": -39.74475860595703, + "logps/rejected": -83.28921508789062, + "loss": 0.515, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2480547428131104, + "rewards/margins": 6.695761203765869, + "rewards/rejected": -3.4477062225341797, + "step": 19062 + }, + { + "epoch": 4.77, + "grad_norm": 4.11201286315918, + "learning_rate": 5.24243323225343e-08, + "logits/chosen": -0.5110353827476501, + "logits/rejected": -0.5888305902481079, + "logps/chosen": -52.2080078125, + "logps/rejected": -101.23422241210938, + "loss": 0.6171, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9655802249908447, + "rewards/margins": 7.3699469566345215, + "rewards/rejected": -4.404366493225098, + "step": 19063 + }, + { + "epoch": 4.77, + "grad_norm": 3.9704504013061523, + "learning_rate": 5.231087386662559e-08, + "logits/chosen": -0.6755678057670593, + "logits/rejected": -0.7622777223587036, + "logps/chosen": -52.01109313964844, + "logps/rejected": -117.35934448242188, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1758811473846436, + "rewards/margins": 8.148968696594238, + "rewards/rejected": -4.973087787628174, + "step": 19064 + }, + { + "epoch": 4.77, + "grad_norm": 4.177042007446289, + "learning_rate": 5.219753767337743e-08, + "logits/chosen": -0.5233385562896729, + "logits/rejected": -0.5834129452705383, + "logps/chosen": -67.86383056640625, + "logps/rejected": -109.35865783691406, + "loss": 0.7546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8801827430725098, + "rewards/margins": 6.569090843200684, + "rewards/rejected": -3.688908338546753, + "step": 19065 + }, + { + "epoch": 4.77, + "grad_norm": 4.156994342803955, + "learning_rate": 5.208432374559092e-08, + "logits/chosen": -0.5314536094665527, + "logits/rejected": -0.6072970628738403, + "logps/chosen": -61.8900146484375, + "logps/rejected": -115.43929290771484, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13285493850708, + "rewards/margins": 7.146455764770508, + "rewards/rejected": -4.013600826263428, + "step": 19066 + }, + { + "epoch": 4.77, + "grad_norm": 6.121500492095947, + "learning_rate": 5.197123208606436e-08, + "logits/chosen": -0.5235983729362488, + "logits/rejected": -0.5889357924461365, + "logps/chosen": -62.55875778198242, + "logps/rejected": -100.08357238769531, + "loss": 0.6669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3019652366638184, + "rewards/margins": 6.39571475982666, + "rewards/rejected": -3.093749523162842, + "step": 19067 + }, + { + "epoch": 4.77, + "grad_norm": 6.499330997467041, + "learning_rate": 5.185826269759053e-08, + "logits/chosen": -0.4873441457748413, + "logits/rejected": -0.5712208151817322, + "logps/chosen": -58.62459182739258, + "logps/rejected": -96.93607330322266, + "loss": 0.6894, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.024223804473877, + "rewards/margins": 6.088545322418213, + "rewards/rejected": -3.064321517944336, + "step": 19068 + }, + { + "epoch": 4.77, + "grad_norm": 5.713554859161377, + "learning_rate": 5.174541558296276e-08, + "logits/chosen": -0.5529106259346008, + "logits/rejected": -0.6202458143234253, + "logps/chosen": -51.55238342285156, + "logps/rejected": -97.8397216796875, + "loss": 0.6631, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1234846115112305, + "rewards/margins": 6.259829521179199, + "rewards/rejected": -3.1363439559936523, + "step": 19069 + }, + { + "epoch": 4.77, + "grad_norm": 2.202446460723877, + "learning_rate": 5.1632690744968796e-08, + "logits/chosen": -0.5718218088150024, + "logits/rejected": -0.6696575284004211, + "logps/chosen": -55.27549743652344, + "logps/rejected": -126.96459197998047, + "loss": 0.5535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1853089332580566, + "rewards/margins": 8.95293140411377, + "rewards/rejected": -5.767622947692871, + "step": 19070 + }, + { + "epoch": 4.77, + "grad_norm": 12.98408031463623, + "learning_rate": 5.15200881863942e-08, + "logits/chosen": -0.5167949199676514, + "logits/rejected": -0.6034921407699585, + "logps/chosen": -62.61946487426758, + "logps/rejected": -119.93814849853516, + "loss": 0.7656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7762069702148438, + "rewards/margins": 6.61856746673584, + "rewards/rejected": -3.8423609733581543, + "step": 19071 + }, + { + "epoch": 4.77, + "grad_norm": 4.440497875213623, + "learning_rate": 5.1407607910022304e-08, + "logits/chosen": -0.5335093140602112, + "logits/rejected": -0.583921492099762, + "logps/chosen": -53.483856201171875, + "logps/rejected": -113.92828369140625, + "loss": 0.6296, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.194047689437866, + "rewards/margins": 7.498252868652344, + "rewards/rejected": -4.304205417633057, + "step": 19072 + }, + { + "epoch": 4.77, + "grad_norm": 3.435683012008667, + "learning_rate": 5.129524991863088e-08, + "logits/chosen": -0.56357741355896, + "logits/rejected": -0.6690369844436646, + "logps/chosen": -51.81056213378906, + "logps/rejected": -98.88197326660156, + "loss": 0.5852, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0095582008361816, + "rewards/margins": 6.615647315979004, + "rewards/rejected": -3.6060891151428223, + "step": 19073 + }, + { + "epoch": 4.77, + "grad_norm": 2.5972158908843994, + "learning_rate": 5.118301421499827e-08, + "logits/chosen": -0.5816614627838135, + "logits/rejected": -0.705123782157898, + "logps/chosen": -60.91446304321289, + "logps/rejected": -107.7503662109375, + "loss": 0.5311, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6573309898376465, + "rewards/margins": 8.250950813293457, + "rewards/rejected": -4.593618869781494, + "step": 19074 + }, + { + "epoch": 4.77, + "grad_norm": 7.474531650543213, + "learning_rate": 5.107090080189725e-08, + "logits/chosen": -0.5347368717193604, + "logits/rejected": -0.619360089302063, + "logps/chosen": -57.404605865478516, + "logps/rejected": -106.42088317871094, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.304605722427368, + "rewards/margins": 6.833329200744629, + "rewards/rejected": -3.5287234783172607, + "step": 19075 + }, + { + "epoch": 4.77, + "grad_norm": 5.031313419342041, + "learning_rate": 5.0958909682097826e-08, + "logits/chosen": -0.5234896540641785, + "logits/rejected": -0.5992723703384399, + "logps/chosen": -54.66478729248047, + "logps/rejected": -125.45683288574219, + "loss": 0.6214, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9710590839385986, + "rewards/margins": 8.640937805175781, + "rewards/rejected": -5.66987943649292, + "step": 19076 + }, + { + "epoch": 4.77, + "grad_norm": 2.795152425765991, + "learning_rate": 5.084704085836778e-08, + "logits/chosen": -0.5753288865089417, + "logits/rejected": -0.6286061406135559, + "logps/chosen": -57.7180290222168, + "logps/rejected": -110.74839782714844, + "loss": 0.6906, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.022535800933838, + "rewards/margins": 6.900659561157227, + "rewards/rejected": -3.8781230449676514, + "step": 19077 + }, + { + "epoch": 4.77, + "grad_norm": 3.96681809425354, + "learning_rate": 5.073529433347213e-08, + "logits/chosen": -0.6550971269607544, + "logits/rejected": -0.7034100890159607, + "logps/chosen": -57.70278549194336, + "logps/rejected": -129.73548889160156, + "loss": 0.6708, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1385412216186523, + "rewards/margins": 7.7101826667785645, + "rewards/rejected": -4.571641445159912, + "step": 19078 + }, + { + "epoch": 4.77, + "grad_norm": 5.024994373321533, + "learning_rate": 5.0623670110170887e-08, + "logits/chosen": -0.4953189492225647, + "logits/rejected": -0.6133774518966675, + "logps/chosen": -50.61518859863281, + "logps/rejected": -96.47787475585938, + "loss": 0.6238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.999938488006592, + "rewards/margins": 6.726729393005371, + "rewards/rejected": -3.7267913818359375, + "step": 19079 + }, + { + "epoch": 4.77, + "grad_norm": 6.589987754821777, + "learning_rate": 5.051216819122351e-08, + "logits/chosen": -0.6484581828117371, + "logits/rejected": -0.6972098350524902, + "logps/chosen": -51.8985481262207, + "logps/rejected": -111.63504791259766, + "loss": 0.6906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2703588008880615, + "rewards/margins": 7.739162921905518, + "rewards/rejected": -4.468803405761719, + "step": 19080 + }, + { + "epoch": 4.77, + "grad_norm": 5.628835678100586, + "learning_rate": 5.0400788579385016e-08, + "logits/chosen": -0.5766317844390869, + "logits/rejected": -0.6299624443054199, + "logps/chosen": -35.79205322265625, + "logps/rejected": -119.61939239501953, + "loss": 0.4845, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1992814540863037, + "rewards/margins": 7.774897575378418, + "rewards/rejected": -4.575616836547852, + "step": 19081 + }, + { + "epoch": 4.77, + "grad_norm": 3.2819581031799316, + "learning_rate": 5.028953127740765e-08, + "logits/chosen": -0.5965638160705566, + "logits/rejected": -0.68987637758255, + "logps/chosen": -63.95647430419922, + "logps/rejected": -105.13788604736328, + "loss": 0.6455, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3863446712493896, + "rewards/margins": 7.896940231323242, + "rewards/rejected": -4.51059627532959, + "step": 19082 + }, + { + "epoch": 4.77, + "grad_norm": 2.8341116905212402, + "learning_rate": 5.017839628804033e-08, + "logits/chosen": -0.5591207146644592, + "logits/rejected": -0.6347472667694092, + "logps/chosen": -54.36479568481445, + "logps/rejected": -105.34367370605469, + "loss": 0.5941, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249436140060425, + "rewards/margins": 7.609943389892578, + "rewards/rejected": -4.360507011413574, + "step": 19083 + }, + { + "epoch": 4.77, + "grad_norm": 5.564971446990967, + "learning_rate": 5.006738361402974e-08, + "logits/chosen": -0.5886875987052917, + "logits/rejected": -0.6684809327125549, + "logps/chosen": -60.26985549926758, + "logps/rejected": -103.14553833007812, + "loss": 0.6894, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.765040636062622, + "rewards/margins": 6.860790252685547, + "rewards/rejected": -4.095749378204346, + "step": 19084 + }, + { + "epoch": 4.77, + "grad_norm": 13.010041236877441, + "learning_rate": 4.99564932581198e-08, + "logits/chosen": -0.5579936504364014, + "logits/rejected": -0.6313288807868958, + "logps/chosen": -64.7969741821289, + "logps/rejected": -95.51309967041016, + "loss": 0.6283, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3184189796447754, + "rewards/margins": 7.002007961273193, + "rewards/rejected": -3.683589220046997, + "step": 19085 + }, + { + "epoch": 4.77, + "grad_norm": 3.313112497329712, + "learning_rate": 4.984572522304942e-08, + "logits/chosen": -0.5039669275283813, + "logits/rejected": -0.6038098931312561, + "logps/chosen": -46.219459533691406, + "logps/rejected": -97.96007537841797, + "loss": 0.5549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13462495803833, + "rewards/margins": 7.13751220703125, + "rewards/rejected": -4.002886772155762, + "step": 19086 + }, + { + "epoch": 4.77, + "grad_norm": 7.82889986038208, + "learning_rate": 4.973507951155643e-08, + "logits/chosen": -0.5247812867164612, + "logits/rejected": -0.6237994432449341, + "logps/chosen": -58.12278747558594, + "logps/rejected": -86.25972747802734, + "loss": 0.7938, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3916015625, + "rewards/margins": 6.6770405769348145, + "rewards/rejected": -3.2854390144348145, + "step": 19087 + }, + { + "epoch": 4.78, + "grad_norm": 4.196537971496582, + "learning_rate": 4.962455612637529e-08, + "logits/chosen": -0.5035586953163147, + "logits/rejected": -0.5902121067047119, + "logps/chosen": -53.04120635986328, + "logps/rejected": -111.73863983154297, + "loss": 0.6043, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182291030883789, + "rewards/margins": 6.697991847991943, + "rewards/rejected": -3.5157008171081543, + "step": 19088 + }, + { + "epoch": 4.78, + "grad_norm": 2.8296234607696533, + "learning_rate": 4.951415507023605e-08, + "logits/chosen": -0.552724301815033, + "logits/rejected": -0.6198906898498535, + "logps/chosen": -57.716983795166016, + "logps/rejected": -109.28384399414062, + "loss": 0.5405, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1873397827148438, + "rewards/margins": 7.558202266693115, + "rewards/rejected": -4.3708624839782715, + "step": 19089 + }, + { + "epoch": 4.78, + "grad_norm": 4.378602981567383, + "learning_rate": 4.9403876345868186e-08, + "logits/chosen": -0.4602851867675781, + "logits/rejected": -0.5505275726318359, + "logps/chosen": -58.30139923095703, + "logps/rejected": -96.27738952636719, + "loss": 0.5433, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.897937536239624, + "rewards/margins": 6.3801069259643555, + "rewards/rejected": -3.4821696281433105, + "step": 19090 + }, + { + "epoch": 4.78, + "grad_norm": 17.659626007080078, + "learning_rate": 4.929371995599619e-08, + "logits/chosen": -0.5943482518196106, + "logits/rejected": -0.6620658040046692, + "logps/chosen": -56.71167755126953, + "logps/rejected": -92.84190368652344, + "loss": 0.63, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.140726089477539, + "rewards/margins": 6.157036781311035, + "rewards/rejected": -3.016310214996338, + "step": 19091 + }, + { + "epoch": 4.78, + "grad_norm": 2.9558684825897217, + "learning_rate": 4.918368590334233e-08, + "logits/chosen": -0.5311591625213623, + "logits/rejected": -0.6130534410476685, + "logps/chosen": -54.8361701965332, + "logps/rejected": -112.06421661376953, + "loss": 0.5955, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.138230800628662, + "rewards/margins": 8.407526969909668, + "rewards/rejected": -5.269295692443848, + "step": 19092 + }, + { + "epoch": 4.78, + "grad_norm": 3.6505610942840576, + "learning_rate": 4.907377419062553e-08, + "logits/chosen": -0.514743447303772, + "logits/rejected": -0.6286964416503906, + "logps/chosen": -55.34200668334961, + "logps/rejected": -110.93934631347656, + "loss": 0.5412, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9753360748291016, + "rewards/margins": 8.21847915649414, + "rewards/rejected": -5.243142127990723, + "step": 19093 + }, + { + "epoch": 4.78, + "grad_norm": 7.963485240936279, + "learning_rate": 4.8963984820561416e-08, + "logits/chosen": -0.565372109413147, + "logits/rejected": -0.6685193777084351, + "logps/chosen": -55.989341735839844, + "logps/rejected": -106.71626281738281, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0370092391967773, + "rewards/margins": 8.136212348937988, + "rewards/rejected": -5.099203109741211, + "step": 19094 + }, + { + "epoch": 4.78, + "grad_norm": 6.453227996826172, + "learning_rate": 4.8854317795863356e-08, + "logits/chosen": -0.6409925222396851, + "logits/rejected": -0.7085819840431213, + "logps/chosen": -49.194637298583984, + "logps/rejected": -106.5525131225586, + "loss": 0.6741, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.067369222640991, + "rewards/margins": 7.44553804397583, + "rewards/rejected": -4.378168106079102, + "step": 19095 + }, + { + "epoch": 4.78, + "grad_norm": 4.421211242675781, + "learning_rate": 4.8744773119241415e-08, + "logits/chosen": -0.5360143184661865, + "logits/rejected": -0.6442726254463196, + "logps/chosen": -45.32235336303711, + "logps/rejected": -109.75779724121094, + "loss": 0.5525, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2760512828826904, + "rewards/margins": 7.904120445251465, + "rewards/rejected": -4.6280694007873535, + "step": 19096 + }, + { + "epoch": 4.78, + "grad_norm": 4.423033237457275, + "learning_rate": 4.863535079340287e-08, + "logits/chosen": -0.5488772988319397, + "logits/rejected": -0.620690643787384, + "logps/chosen": -60.483219146728516, + "logps/rejected": -110.32914733886719, + "loss": 0.6084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8682775497436523, + "rewards/margins": 7.490746021270752, + "rewards/rejected": -4.6224684715271, + "step": 19097 + }, + { + "epoch": 4.78, + "grad_norm": 4.581755638122559, + "learning_rate": 4.852605082105111e-08, + "logits/chosen": -0.532276451587677, + "logits/rejected": -0.6627254486083984, + "logps/chosen": -55.68315887451172, + "logps/rejected": -92.28821563720703, + "loss": 0.5492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057237148284912, + "rewards/margins": 7.391888618469238, + "rewards/rejected": -4.334651470184326, + "step": 19098 + }, + { + "epoch": 4.78, + "grad_norm": 3.0012831687927246, + "learning_rate": 4.84168732048873e-08, + "logits/chosen": -0.484626442193985, + "logits/rejected": -0.5617526769638062, + "logps/chosen": -45.16712188720703, + "logps/rejected": -114.82281494140625, + "loss": 0.5727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.284296751022339, + "rewards/margins": 7.552574634552002, + "rewards/rejected": -4.268277645111084, + "step": 19099 + }, + { + "epoch": 4.78, + "grad_norm": 5.836544513702393, + "learning_rate": 4.8307817947609305e-08, + "logits/chosen": -0.4975900948047638, + "logits/rejected": -0.5700188875198364, + "logps/chosen": -50.85649108886719, + "logps/rejected": -101.16746520996094, + "loss": 0.5234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.345487117767334, + "rewards/margins": 6.897794723510742, + "rewards/rejected": -3.552307367324829, + "step": 19100 + }, + { + "epoch": 4.78, + "grad_norm": 9.284114837646484, + "learning_rate": 4.819888505191217e-08, + "logits/chosen": -0.6066023707389832, + "logits/rejected": -0.6847420930862427, + "logps/chosen": -55.55989074707031, + "logps/rejected": -109.62068176269531, + "loss": 0.6506, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.860775947570801, + "rewards/margins": 7.256243705749512, + "rewards/rejected": -4.395467758178711, + "step": 19101 + }, + { + "epoch": 4.78, + "grad_norm": 2.77195143699646, + "learning_rate": 4.809007452048708e-08, + "logits/chosen": -0.48837438225746155, + "logits/rejected": -0.5248621702194214, + "logps/chosen": -63.932098388671875, + "logps/rejected": -133.4799346923828, + "loss": 0.5616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1164309978485107, + "rewards/margins": 7.080538272857666, + "rewards/rejected": -3.9641072750091553, + "step": 19102 + }, + { + "epoch": 4.78, + "grad_norm": 3.575307607650757, + "learning_rate": 4.7981386356024116e-08, + "logits/chosen": -0.5940181016921997, + "logits/rejected": -0.6534746289253235, + "logps/chosen": -45.85926055908203, + "logps/rejected": -116.85591888427734, + "loss": 0.6667, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.27017879486084, + "rewards/margins": 7.571542263031006, + "rewards/rejected": -4.301363468170166, + "step": 19103 + }, + { + "epoch": 4.78, + "grad_norm": 2.6249051094055176, + "learning_rate": 4.7872820561207235e-08, + "logits/chosen": -0.6181392669677734, + "logits/rejected": -0.6958238482475281, + "logps/chosen": -57.64613723754883, + "logps/rejected": -125.255615234375, + "loss": 0.5939, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1012394428253174, + "rewards/margins": 9.063173294067383, + "rewards/rejected": -5.961933612823486, + "step": 19104 + }, + { + "epoch": 4.78, + "grad_norm": 4.001430511474609, + "learning_rate": 4.776437713872095e-08, + "logits/chosen": -0.5539861917495728, + "logits/rejected": -0.6621300578117371, + "logps/chosen": -59.999324798583984, + "logps/rejected": -91.94780731201172, + "loss": 0.5865, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.136387825012207, + "rewards/margins": 7.3139729499816895, + "rewards/rejected": -4.177585124969482, + "step": 19105 + }, + { + "epoch": 4.78, + "grad_norm": 4.3626885414123535, + "learning_rate": 4.765605609124424e-08, + "logits/chosen": -0.6253343820571899, + "logits/rejected": -0.7086058855056763, + "logps/chosen": -52.077796936035156, + "logps/rejected": -105.94596862792969, + "loss": 0.5622, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.176978826522827, + "rewards/margins": 7.407281875610352, + "rewards/rejected": -4.2303032875061035, + "step": 19106 + }, + { + "epoch": 4.78, + "grad_norm": 5.126381874084473, + "learning_rate": 4.754785742145385e-08, + "logits/chosen": -0.5304173231124878, + "logits/rejected": -0.6305104494094849, + "logps/chosen": -52.13151931762695, + "logps/rejected": -105.77063751220703, + "loss": 0.6015, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9600250720977783, + "rewards/margins": 7.6473917961120605, + "rewards/rejected": -4.687366962432861, + "step": 19107 + }, + { + "epoch": 4.78, + "grad_norm": 5.176352024078369, + "learning_rate": 4.743978113202374e-08, + "logits/chosen": -0.49108651280403137, + "logits/rejected": -0.6072461009025574, + "logps/chosen": -54.61552429199219, + "logps/rejected": -103.12882232666016, + "loss": 0.655, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.251425266265869, + "rewards/margins": 7.212071895599365, + "rewards/rejected": -3.960646629333496, + "step": 19108 + }, + { + "epoch": 4.78, + "grad_norm": 5.446868896484375, + "learning_rate": 4.733182722562402e-08, + "logits/chosen": -0.5444915890693665, + "logits/rejected": -0.624977171421051, + "logps/chosen": -55.89537811279297, + "logps/rejected": -112.38681030273438, + "loss": 0.6758, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0024945735931396, + "rewards/margins": 7.431665420532227, + "rewards/rejected": -4.429171085357666, + "step": 19109 + }, + { + "epoch": 4.78, + "grad_norm": 4.832489013671875, + "learning_rate": 4.722399570492309e-08, + "logits/chosen": -0.5513529777526855, + "logits/rejected": -0.6580326557159424, + "logps/chosen": -56.93538284301758, + "logps/rejected": -105.42367553710938, + "loss": 0.6679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0635628700256348, + "rewards/margins": 6.901525497436523, + "rewards/rejected": -3.8379628658294678, + "step": 19110 + }, + { + "epoch": 4.78, + "grad_norm": 6.166783809661865, + "learning_rate": 4.711628657258494e-08, + "logits/chosen": -0.5444973111152649, + "logits/rejected": -0.6209808588027954, + "logps/chosen": -60.82375717163086, + "logps/rejected": -119.44269561767578, + "loss": 0.6055, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.110562324523926, + "rewards/margins": 6.565330982208252, + "rewards/rejected": -3.454768657684326, + "step": 19111 + }, + { + "epoch": 4.78, + "grad_norm": 6.898260116577148, + "learning_rate": 4.7008699831271875e-08, + "logits/chosen": -0.548393726348877, + "logits/rejected": -0.6674219369888306, + "logps/chosen": -63.62452697753906, + "logps/rejected": -86.58152770996094, + "loss": 0.6425, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9267024993896484, + "rewards/margins": 5.6571574211120605, + "rewards/rejected": -2.730455160140991, + "step": 19112 + }, + { + "epoch": 4.78, + "grad_norm": 6.602125644683838, + "learning_rate": 4.6901235483642336e-08, + "logits/chosen": -0.6216964721679688, + "logits/rejected": -0.6617335081100464, + "logps/chosen": -44.88179397583008, + "logps/rejected": -127.46650695800781, + "loss": 0.5575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1721930503845215, + "rewards/margins": 8.22425651550293, + "rewards/rejected": -5.052063941955566, + "step": 19113 + }, + { + "epoch": 4.78, + "grad_norm": 17.131669998168945, + "learning_rate": 4.679389353235087e-08, + "logits/chosen": -0.5071542859077454, + "logits/rejected": -0.5973380208015442, + "logps/chosen": -63.551307678222656, + "logps/rejected": -106.79515838623047, + "loss": 0.7172, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3837006092071533, + "rewards/margins": 7.673375129699707, + "rewards/rejected": -4.289674758911133, + "step": 19114 + }, + { + "epoch": 4.78, + "grad_norm": 2.9933693408966064, + "learning_rate": 4.668667398005089e-08, + "logits/chosen": -0.5318419933319092, + "logits/rejected": -0.5825853943824768, + "logps/chosen": -55.171226501464844, + "logps/rejected": -106.87276458740234, + "loss": 0.6024, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0224289894104004, + "rewards/margins": 6.060722827911377, + "rewards/rejected": -3.0382938385009766, + "step": 19115 + }, + { + "epoch": 4.78, + "grad_norm": 5.222579002380371, + "learning_rate": 4.657957682939196e-08, + "logits/chosen": -0.5340884923934937, + "logits/rejected": -0.6090685129165649, + "logps/chosen": -54.24565887451172, + "logps/rejected": -98.08683776855469, + "loss": 0.6253, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4316234588623047, + "rewards/margins": 6.8184943199157715, + "rewards/rejected": -3.386870861053467, + "step": 19116 + }, + { + "epoch": 4.78, + "grad_norm": 22.53059959411621, + "learning_rate": 4.647260208302029e-08, + "logits/chosen": -0.5181100368499756, + "logits/rejected": -0.5863238573074341, + "logps/chosen": -54.026092529296875, + "logps/rejected": -90.91551208496094, + "loss": 0.7011, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.8763585090637207, + "rewards/margins": 5.525938034057617, + "rewards/rejected": -2.6495797634124756, + "step": 19117 + }, + { + "epoch": 4.78, + "grad_norm": 27.85548210144043, + "learning_rate": 4.636574974357988e-08, + "logits/chosen": -0.5869669914245605, + "logits/rejected": -0.6379201412200928, + "logps/chosen": -53.37472915649414, + "logps/rejected": -122.97746276855469, + "loss": 0.6407, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0942623615264893, + "rewards/margins": 7.851280212402344, + "rewards/rejected": -4.757017135620117, + "step": 19118 + }, + { + "epoch": 4.78, + "grad_norm": 2.6331357955932617, + "learning_rate": 4.625901981371028e-08, + "logits/chosen": -0.49643877148628235, + "logits/rejected": -0.5805593729019165, + "logps/chosen": -56.93092727661133, + "logps/rejected": -90.17913055419922, + "loss": 0.6132, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3903095722198486, + "rewards/margins": 6.7565693855285645, + "rewards/rejected": -3.366259813308716, + "step": 19119 + }, + { + "epoch": 4.78, + "grad_norm": 6.766411304473877, + "learning_rate": 4.6152412296049385e-08, + "logits/chosen": -0.5697525143623352, + "logits/rejected": -0.636067807674408, + "logps/chosen": -49.26826858520508, + "logps/rejected": -122.23263549804688, + "loss": 0.662, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0036368370056152, + "rewards/margins": 6.771705627441406, + "rewards/rejected": -3.768068790435791, + "step": 19120 + }, + { + "epoch": 4.78, + "grad_norm": 3.8236753940582275, + "learning_rate": 4.6045927193231754e-08, + "logits/chosen": -0.6134366989135742, + "logits/rejected": -0.6823409795761108, + "logps/chosen": -46.70486068725586, + "logps/rejected": -112.49826049804688, + "loss": 0.5945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161504030227661, + "rewards/margins": 8.079595565795898, + "rewards/rejected": -4.918091297149658, + "step": 19121 + }, + { + "epoch": 4.78, + "grad_norm": 4.2843828201293945, + "learning_rate": 4.5939564507888055e-08, + "logits/chosen": -0.5383768081665039, + "logits/rejected": -0.6164963245391846, + "logps/chosen": -47.95439529418945, + "logps/rejected": -111.08799743652344, + "loss": 0.5883, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2646639347076416, + "rewards/margins": 8.808696746826172, + "rewards/rejected": -5.544032573699951, + "step": 19122 + }, + { + "epoch": 4.78, + "grad_norm": 2.3872172832489014, + "learning_rate": 4.5833324242647307e-08, + "logits/chosen": -0.6822502613067627, + "logits/rejected": -0.752838671207428, + "logps/chosen": -44.44131088256836, + "logps/rejected": -104.24903106689453, + "loss": 0.4982, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1772143840789795, + "rewards/margins": 8.671037673950195, + "rewards/rejected": -5.493823051452637, + "step": 19123 + }, + { + "epoch": 4.78, + "grad_norm": 10.36244010925293, + "learning_rate": 4.572720640013517e-08, + "logits/chosen": -0.6036407351493835, + "logits/rejected": -0.7004908323287964, + "logps/chosen": -54.02054977416992, + "logps/rejected": -120.59378051757812, + "loss": 0.6447, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.230433702468872, + "rewards/margins": 7.117093563079834, + "rewards/rejected": -3.886659860610962, + "step": 19124 + }, + { + "epoch": 4.78, + "grad_norm": 5.996269702911377, + "learning_rate": 4.562121098297234e-08, + "logits/chosen": -0.5478956699371338, + "logits/rejected": -0.6300539374351501, + "logps/chosen": -45.59176254272461, + "logps/rejected": -109.2379379272461, + "loss": 0.5174, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5924174785614014, + "rewards/margins": 7.346648693084717, + "rewards/rejected": -3.7542312145233154, + "step": 19125 + }, + { + "epoch": 4.78, + "grad_norm": 3.2563352584838867, + "learning_rate": 4.55153379937795e-08, + "logits/chosen": -0.5312045216560364, + "logits/rejected": -0.5963096618652344, + "logps/chosen": -59.66073989868164, + "logps/rejected": -125.77320861816406, + "loss": 0.633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.508373737335205, + "rewards/margins": 8.764212608337402, + "rewards/rejected": -5.255838871002197, + "step": 19126 + }, + { + "epoch": 4.78, + "grad_norm": 4.039919376373291, + "learning_rate": 4.5409587435172876e-08, + "logits/chosen": -0.535618782043457, + "logits/rejected": -0.645687460899353, + "logps/chosen": -55.696754455566406, + "logps/rejected": -119.34461975097656, + "loss": 0.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9412834644317627, + "rewards/margins": 8.72587776184082, + "rewards/rejected": -5.784595489501953, + "step": 19127 + }, + { + "epoch": 4.79, + "grad_norm": 8.914742469787598, + "learning_rate": 4.5303959309764836e-08, + "logits/chosen": -0.5865664482116699, + "logits/rejected": -0.674270749092102, + "logps/chosen": -60.63201141357422, + "logps/rejected": -99.8691177368164, + "loss": 0.708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.753458261489868, + "rewards/margins": 6.871110439300537, + "rewards/rejected": -4.117652416229248, + "step": 19128 + }, + { + "epoch": 4.79, + "grad_norm": 6.9231367111206055, + "learning_rate": 4.519845362016606e-08, + "logits/chosen": -0.6012530326843262, + "logits/rejected": -0.7003528475761414, + "logps/chosen": -40.62232971191406, + "logps/rejected": -93.38681030273438, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1811723709106445, + "rewards/margins": 7.351006984710693, + "rewards/rejected": -4.169834613800049, + "step": 19129 + }, + { + "epoch": 4.79, + "grad_norm": 5.909581184387207, + "learning_rate": 4.509307036898392e-08, + "logits/chosen": -0.4709656238555908, + "logits/rejected": -0.5822187066078186, + "logps/chosen": -63.07863235473633, + "logps/rejected": -117.18769836425781, + "loss": 0.6746, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.01320219039917, + "rewards/margins": 7.04704475402832, + "rewards/rejected": -4.033842086791992, + "step": 19130 + }, + { + "epoch": 4.79, + "grad_norm": 13.802290916442871, + "learning_rate": 4.4987809558822424e-08, + "logits/chosen": -0.5839127898216248, + "logits/rejected": -0.650625467300415, + "logps/chosen": -57.119686126708984, + "logps/rejected": -101.03544616699219, + "loss": 0.7662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0588231086730957, + "rewards/margins": 6.36479377746582, + "rewards/rejected": -3.3059701919555664, + "step": 19131 + }, + { + "epoch": 4.79, + "grad_norm": 3.5152759552001953, + "learning_rate": 4.4882671192281737e-08, + "logits/chosen": -0.5415974855422974, + "logits/rejected": -0.6102292537689209, + "logps/chosen": -49.17109298706055, + "logps/rejected": -123.87803649902344, + "loss": 0.5517, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273930788040161, + "rewards/margins": 7.2784833908081055, + "rewards/rejected": -4.004552364349365, + "step": 19132 + }, + { + "epoch": 4.79, + "grad_norm": 11.248125076293945, + "learning_rate": 4.4777655271961426e-08, + "logits/chosen": -0.5672774910926819, + "logits/rejected": -0.6627752184867859, + "logps/chosen": -55.66326141357422, + "logps/rejected": -139.48257446289062, + "loss": 0.576, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.664646625518799, + "rewards/margins": 8.078770637512207, + "rewards/rejected": -5.414124011993408, + "step": 19133 + }, + { + "epoch": 4.79, + "grad_norm": 14.532912254333496, + "learning_rate": 4.4672761800455544e-08, + "logits/chosen": -0.6580832004547119, + "logits/rejected": -0.7438195943832397, + "logps/chosen": -60.024627685546875, + "logps/rejected": -102.4715576171875, + "loss": 0.7569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8103578090667725, + "rewards/margins": 6.567264080047607, + "rewards/rejected": -3.7569057941436768, + "step": 19134 + }, + { + "epoch": 4.79, + "grad_norm": 4.168537139892578, + "learning_rate": 4.456799078035645e-08, + "logits/chosen": -0.5961422920227051, + "logits/rejected": -0.6968523859977722, + "logps/chosen": -50.095035552978516, + "logps/rejected": -92.61038970947266, + "loss": 0.5651, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0934906005859375, + "rewards/margins": 7.851943016052246, + "rewards/rejected": -4.758452892303467, + "step": 19135 + }, + { + "epoch": 4.79, + "grad_norm": 4.828789234161377, + "learning_rate": 4.4463342214253744e-08, + "logits/chosen": -0.5480590462684631, + "logits/rejected": -0.6350727081298828, + "logps/chosen": -52.72205352783203, + "logps/rejected": -94.01116943359375, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9237258434295654, + "rewards/margins": 6.629595756530762, + "rewards/rejected": -3.7058701515197754, + "step": 19136 + }, + { + "epoch": 4.79, + "grad_norm": 3.3338098526000977, + "learning_rate": 4.4358816104732025e-08, + "logits/chosen": -0.45588821172714233, + "logits/rejected": -0.5597341656684875, + "logps/chosen": -51.382564544677734, + "logps/rejected": -111.82511138916016, + "loss": 0.5341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5220961570739746, + "rewards/margins": 7.532701015472412, + "rewards/rejected": -4.010605335235596, + "step": 19137 + }, + { + "epoch": 4.79, + "grad_norm": 5.841622352600098, + "learning_rate": 4.425441245437534e-08, + "logits/chosen": -0.6252061128616333, + "logits/rejected": -0.6653707027435303, + "logps/chosen": -53.14731216430664, + "logps/rejected": -113.28971862792969, + "loss": 0.5981, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.10349178314209, + "rewards/margins": 6.799245834350586, + "rewards/rejected": -3.695754051208496, + "step": 19138 + }, + { + "epoch": 4.79, + "grad_norm": 3.2239601612091064, + "learning_rate": 4.4150131265763284e-08, + "logits/chosen": -0.6185144186019897, + "logits/rejected": -0.6694256663322449, + "logps/chosen": -49.152687072753906, + "logps/rejected": -120.39472961425781, + "loss": 0.5989, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3437018394470215, + "rewards/margins": 8.532538414001465, + "rewards/rejected": -5.188836097717285, + "step": 19139 + }, + { + "epoch": 4.79, + "grad_norm": 4.513081073760986, + "learning_rate": 4.4045972541472694e-08, + "logits/chosen": -0.5880831480026245, + "logits/rejected": -0.6209912896156311, + "logps/chosen": -51.69709396362305, + "logps/rejected": -120.7705078125, + "loss": 0.6239, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.985368251800537, + "rewards/margins": 7.333669662475586, + "rewards/rejected": -4.348301410675049, + "step": 19140 + }, + { + "epoch": 4.79, + "grad_norm": 10.45055866241455, + "learning_rate": 4.394193628407706e-08, + "logits/chosen": -0.5931581258773804, + "logits/rejected": -0.6316007971763611, + "logps/chosen": -50.04595947265625, + "logps/rejected": -118.93391418457031, + "loss": 0.5839, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.239370584487915, + "rewards/margins": 8.241804122924805, + "rewards/rejected": -5.002432823181152, + "step": 19141 + }, + { + "epoch": 4.79, + "grad_norm": 3.607499361038208, + "learning_rate": 4.383802249614877e-08, + "logits/chosen": -0.5985884666442871, + "logits/rejected": -0.6377249956130981, + "logps/chosen": -49.048667907714844, + "logps/rejected": -129.02780151367188, + "loss": 0.5624, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.942436456680298, + "rewards/margins": 7.575825214385986, + "rewards/rejected": -4.633388519287109, + "step": 19142 + }, + { + "epoch": 4.79, + "grad_norm": 2.174417495727539, + "learning_rate": 4.373423118025411e-08, + "logits/chosen": -0.494791179895401, + "logits/rejected": -0.5961347818374634, + "logps/chosen": -59.82387161254883, + "logps/rejected": -120.00111389160156, + "loss": 0.5595, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3456194400787354, + "rewards/margins": 8.874730110168457, + "rewards/rejected": -5.529110908508301, + "step": 19143 + }, + { + "epoch": 4.79, + "grad_norm": 9.152732849121094, + "learning_rate": 4.3630562338957685e-08, + "logits/chosen": -0.6081810593605042, + "logits/rejected": -0.6895149350166321, + "logps/chosen": -58.5207405090332, + "logps/rejected": -100.4860610961914, + "loss": 0.7426, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.6727166175842285, + "rewards/margins": 6.429347038269043, + "rewards/rejected": -3.7566301822662354, + "step": 19144 + }, + { + "epoch": 4.79, + "grad_norm": 3.926274061203003, + "learning_rate": 4.352701597482245e-08, + "logits/chosen": -0.5636774897575378, + "logits/rejected": -0.6103651523590088, + "logps/chosen": -59.55259323120117, + "logps/rejected": -103.708984375, + "loss": 0.6219, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0558037757873535, + "rewards/margins": 6.492343425750732, + "rewards/rejected": -3.4365391731262207, + "step": 19145 + }, + { + "epoch": 4.79, + "grad_norm": 5.957935810089111, + "learning_rate": 4.342359209040692e-08, + "logits/chosen": -0.529254674911499, + "logits/rejected": -0.5594179630279541, + "logps/chosen": -51.43559265136719, + "logps/rejected": -102.0480728149414, + "loss": 0.6683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0067408084869385, + "rewards/margins": 6.393763542175293, + "rewards/rejected": -3.3870224952697754, + "step": 19146 + }, + { + "epoch": 4.79, + "grad_norm": 15.910867691040039, + "learning_rate": 4.332029068826571e-08, + "logits/chosen": -0.5765582919120789, + "logits/rejected": -0.6627020835876465, + "logps/chosen": -49.605323791503906, + "logps/rejected": -98.01213073730469, + "loss": 0.6637, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7942445278167725, + "rewards/margins": 7.221132278442383, + "rewards/rejected": -4.426888465881348, + "step": 19147 + }, + { + "epoch": 4.79, + "grad_norm": 15.274012565612793, + "learning_rate": 4.321711177095289e-08, + "logits/chosen": -0.5063098669052124, + "logits/rejected": -0.5847141742706299, + "logps/chosen": -54.08039855957031, + "logps/rejected": -108.26350402832031, + "loss": 0.6759, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.984677791595459, + "rewards/margins": 6.972336292266846, + "rewards/rejected": -3.9876585006713867, + "step": 19148 + }, + { + "epoch": 4.79, + "grad_norm": 4.7342658042907715, + "learning_rate": 4.311405534101698e-08, + "logits/chosen": -0.5510047674179077, + "logits/rejected": -0.6297365427017212, + "logps/chosen": -51.729957580566406, + "logps/rejected": -113.01053619384766, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0725624561309814, + "rewards/margins": 7.093669891357422, + "rewards/rejected": -4.0211076736450195, + "step": 19149 + }, + { + "epoch": 4.79, + "grad_norm": 6.1069769859313965, + "learning_rate": 4.3011121401005385e-08, + "logits/chosen": -0.655087947845459, + "logits/rejected": -0.7126166820526123, + "logps/chosen": -52.06071472167969, + "logps/rejected": -101.84370422363281, + "loss": 0.6344, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0102968215942383, + "rewards/margins": 6.647914886474609, + "rewards/rejected": -3.63761830329895, + "step": 19150 + }, + { + "epoch": 4.79, + "grad_norm": 5.817953109741211, + "learning_rate": 4.290830995346107e-08, + "logits/chosen": -0.6327978372573853, + "logits/rejected": -0.6850709915161133, + "logps/chosen": -52.11402893066406, + "logps/rejected": -95.27745819091797, + "loss": 0.6939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2326560020446777, + "rewards/margins": 5.803364276885986, + "rewards/rejected": -2.5707077980041504, + "step": 19151 + }, + { + "epoch": 4.79, + "grad_norm": 6.448538303375244, + "learning_rate": 4.280562100092589e-08, + "logits/chosen": -0.4711393117904663, + "logits/rejected": -0.5020013451576233, + "logps/chosen": -59.9881591796875, + "logps/rejected": -137.6170196533203, + "loss": 0.658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.793592929840088, + "rewards/margins": 7.101921081542969, + "rewards/rejected": -4.308328628540039, + "step": 19152 + }, + { + "epoch": 4.79, + "grad_norm": 5.354457378387451, + "learning_rate": 4.270305454593615e-08, + "logits/chosen": -0.4969159960746765, + "logits/rejected": -0.6187048554420471, + "logps/chosen": -52.81608581542969, + "logps/rejected": -98.57418823242188, + "loss": 0.5637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121256113052368, + "rewards/margins": 6.940948009490967, + "rewards/rejected": -3.8196918964385986, + "step": 19153 + }, + { + "epoch": 4.79, + "grad_norm": 4.953636646270752, + "learning_rate": 4.260061059102649e-08, + "logits/chosen": -0.519777238368988, + "logits/rejected": -0.5787501931190491, + "logps/chosen": -52.9471435546875, + "logps/rejected": -96.9793930053711, + "loss": 0.6893, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.164501190185547, + "rewards/margins": 6.988106727600098, + "rewards/rejected": -3.823606014251709, + "step": 19154 + }, + { + "epoch": 4.79, + "grad_norm": 2.388056993484497, + "learning_rate": 4.2498289138728774e-08, + "logits/chosen": -0.5326424241065979, + "logits/rejected": -0.5961463451385498, + "logps/chosen": -55.48857498168945, + "logps/rejected": -98.6006088256836, + "loss": 0.5371, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.308814525604248, + "rewards/margins": 6.771890640258789, + "rewards/rejected": -3.463076591491699, + "step": 19155 + }, + { + "epoch": 4.79, + "grad_norm": 8.097506523132324, + "learning_rate": 4.2396090191571536e-08, + "logits/chosen": -0.5788975358009338, + "logits/rejected": -0.678390383720398, + "logps/chosen": -57.69715118408203, + "logps/rejected": -119.99349975585938, + "loss": 0.6216, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.091022491455078, + "rewards/margins": 8.086443901062012, + "rewards/rejected": -4.99542236328125, + "step": 19156 + }, + { + "epoch": 4.79, + "grad_norm": 6.623852252960205, + "learning_rate": 4.2294013752079974e-08, + "logits/chosen": -0.4978109300136566, + "logits/rejected": -0.6186701059341431, + "logps/chosen": -51.18419647216797, + "logps/rejected": -96.28289794921875, + "loss": 0.5789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0340652465820312, + "rewards/margins": 7.67400598526001, + "rewards/rejected": -4.6399407386779785, + "step": 19157 + }, + { + "epoch": 4.79, + "grad_norm": 4.080297946929932, + "learning_rate": 4.219205982277652e-08, + "logits/chosen": -0.5782371163368225, + "logits/rejected": -0.6665634512901306, + "logps/chosen": -54.31966781616211, + "logps/rejected": -98.76768493652344, + "loss": 0.6121, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8626742362976074, + "rewards/margins": 6.201724529266357, + "rewards/rejected": -3.33905029296875, + "step": 19158 + }, + { + "epoch": 4.79, + "grad_norm": 1.9744555950164795, + "learning_rate": 4.209022840618082e-08, + "logits/chosen": -0.5271425247192383, + "logits/rejected": -0.6533602476119995, + "logps/chosen": -51.60700225830078, + "logps/rejected": -105.11198425292969, + "loss": 0.4828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249943733215332, + "rewards/margins": 8.463264465332031, + "rewards/rejected": -5.213321208953857, + "step": 19159 + }, + { + "epoch": 4.79, + "grad_norm": 6.330981254577637, + "learning_rate": 4.198851950480864e-08, + "logits/chosen": -0.5248834490776062, + "logits/rejected": -0.5762035846710205, + "logps/chosen": -49.379573822021484, + "logps/rejected": -106.82849884033203, + "loss": 0.5916, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0075483322143555, + "rewards/margins": 5.873452663421631, + "rewards/rejected": -2.8659048080444336, + "step": 19160 + }, + { + "epoch": 4.79, + "grad_norm": 2.6287856101989746, + "learning_rate": 4.188693312117409e-08, + "logits/chosen": -0.5575478672981262, + "logits/rejected": -0.6332818865776062, + "logps/chosen": -48.63539505004883, + "logps/rejected": -119.38721466064453, + "loss": 0.5215, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.252143383026123, + "rewards/margins": 9.040882110595703, + "rewards/rejected": -5.788737773895264, + "step": 19161 + }, + { + "epoch": 4.79, + "grad_norm": 8.488175392150879, + "learning_rate": 4.1785469257786815e-08, + "logits/chosen": -0.5634533166885376, + "logits/rejected": -0.6178165674209595, + "logps/chosen": -67.96847534179688, + "logps/rejected": -119.38418579101562, + "loss": 0.6896, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.088632822036743, + "rewards/margins": 6.8646321296691895, + "rewards/rejected": -3.775999069213867, + "step": 19162 + }, + { + "epoch": 4.79, + "grad_norm": 4.508930683135986, + "learning_rate": 4.1684127917154817e-08, + "logits/chosen": -0.4923313856124878, + "logits/rejected": -0.5723101496696472, + "logps/chosen": -53.40409851074219, + "logps/rejected": -115.85472869873047, + "loss": 0.6383, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.804030656814575, + "rewards/margins": 7.104673385620117, + "rewards/rejected": -4.300642013549805, + "step": 19163 + }, + { + "epoch": 4.79, + "grad_norm": 2.620133399963379, + "learning_rate": 4.1582909101781646e-08, + "logits/chosen": -0.5258708000183105, + "logits/rejected": -0.599135160446167, + "logps/chosen": -55.66055679321289, + "logps/rejected": -135.1173095703125, + "loss": 0.5116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.170919418334961, + "rewards/margins": 9.566679000854492, + "rewards/rejected": -6.395760536193848, + "step": 19164 + }, + { + "epoch": 4.79, + "grad_norm": 3.9704225063323975, + "learning_rate": 4.148181281416863e-08, + "logits/chosen": -0.587498128414154, + "logits/rejected": -0.7094863653182983, + "logps/chosen": -66.31962585449219, + "logps/rejected": -95.76103210449219, + "loss": 0.6756, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.839332103729248, + "rewards/margins": 7.473222732543945, + "rewards/rejected": -4.633890151977539, + "step": 19165 + }, + { + "epoch": 4.79, + "grad_norm": 6.309000492095947, + "learning_rate": 4.1380839056814335e-08, + "logits/chosen": -0.5363671779632568, + "logits/rejected": -0.6673173904418945, + "logps/chosen": -72.30606842041016, + "logps/rejected": -98.83056640625, + "loss": 0.6236, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1428492069244385, + "rewards/margins": 6.16505765914917, + "rewards/rejected": -3.0222089290618896, + "step": 19166 + }, + { + "epoch": 4.79, + "grad_norm": 4.090770244598389, + "learning_rate": 4.127998783221399e-08, + "logits/chosen": -0.5181148648262024, + "logits/rejected": -0.6227928400039673, + "logps/chosen": -52.081031799316406, + "logps/rejected": -91.17327117919922, + "loss": 0.5897, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.286393880844116, + "rewards/margins": 6.758114337921143, + "rewards/rejected": -3.471719980239868, + "step": 19167 + }, + { + "epoch": 4.8, + "grad_norm": 1.4009785652160645, + "learning_rate": 4.117925914285892e-08, + "logits/chosen": -0.5186410546302795, + "logits/rejected": -0.604854166507721, + "logps/chosen": -48.13087463378906, + "logps/rejected": -128.68411254882812, + "loss": 0.5118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.534118175506592, + "rewards/margins": 9.784479141235352, + "rewards/rejected": -6.250360488891602, + "step": 19168 + }, + { + "epoch": 4.8, + "grad_norm": 6.184190273284912, + "learning_rate": 4.107865299123881e-08, + "logits/chosen": -0.5866263508796692, + "logits/rejected": -0.672468900680542, + "logps/chosen": -51.744834899902344, + "logps/rejected": -108.28907775878906, + "loss": 0.6018, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2308692932128906, + "rewards/margins": 7.677975177764893, + "rewards/rejected": -4.44710636138916, + "step": 19169 + }, + { + "epoch": 4.8, + "grad_norm": 3.9733636379241943, + "learning_rate": 4.097816937984056e-08, + "logits/chosen": -0.5380284190177917, + "logits/rejected": -0.5872107148170471, + "logps/chosen": -55.70806121826172, + "logps/rejected": -126.26039123535156, + "loss": 0.5844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.253390073776245, + "rewards/margins": 8.45637321472168, + "rewards/rejected": -5.2029829025268555, + "step": 19170 + }, + { + "epoch": 4.8, + "grad_norm": 5.629543781280518, + "learning_rate": 4.087780831114552e-08, + "logits/chosen": -0.5081708431243896, + "logits/rejected": -0.49679476022720337, + "logps/chosen": -47.806705474853516, + "logps/rejected": -108.634521484375, + "loss": 0.685, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9122891426086426, + "rewards/margins": 5.685899257659912, + "rewards/rejected": -2.7736105918884277, + "step": 19171 + }, + { + "epoch": 4.8, + "grad_norm": 4.861247539520264, + "learning_rate": 4.077756978763503e-08, + "logits/chosen": -0.593667209148407, + "logits/rejected": -0.6402414441108704, + "logps/chosen": -49.28678894042969, + "logps/rejected": -104.35966491699219, + "loss": 0.6934, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.260219097137451, + "rewards/margins": 6.24939489364624, + "rewards/rejected": -2.9891767501831055, + "step": 19172 + }, + { + "epoch": 4.8, + "grad_norm": 17.11324691772461, + "learning_rate": 4.067745381178545e-08, + "logits/chosen": -0.6462629437446594, + "logits/rejected": -0.6868699789047241, + "logps/chosen": -61.36004638671875, + "logps/rejected": -114.86019897460938, + "loss": 0.6855, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.863182306289673, + "rewards/margins": 6.985878944396973, + "rewards/rejected": -4.1226959228515625, + "step": 19173 + }, + { + "epoch": 4.8, + "grad_norm": 4.119563579559326, + "learning_rate": 4.057746038607091e-08, + "logits/chosen": -0.46666771173477173, + "logits/rejected": -0.5674931406974792, + "logps/chosen": -42.652687072753906, + "logps/rejected": -101.86009216308594, + "loss": 0.5515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.82320499420166, + "rewards/margins": 7.571983337402344, + "rewards/rejected": -4.748777866363525, + "step": 19174 + }, + { + "epoch": 4.8, + "grad_norm": 7.0068254470825195, + "learning_rate": 4.047758951296277e-08, + "logits/chosen": -0.61479252576828, + "logits/rejected": -0.6877509355545044, + "logps/chosen": -50.590484619140625, + "logps/rejected": -108.13003540039062, + "loss": 0.5652, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.14038348197937, + "rewards/margins": 7.072385787963867, + "rewards/rejected": -3.932002067565918, + "step": 19175 + }, + { + "epoch": 4.8, + "grad_norm": 3.80574107170105, + "learning_rate": 4.037784119492849e-08, + "logits/chosen": -0.5035555362701416, + "logits/rejected": -0.6000747680664062, + "logps/chosen": -65.26548767089844, + "logps/rejected": -109.35420989990234, + "loss": 0.6035, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2959327697753906, + "rewards/margins": 7.239551544189453, + "rewards/rejected": -3.9436187744140625, + "step": 19176 + }, + { + "epoch": 4.8, + "grad_norm": 9.190011978149414, + "learning_rate": 4.0278215434432774e-08, + "logits/chosen": -0.5859624743461609, + "logits/rejected": -0.6160293221473694, + "logps/chosen": -61.32093048095703, + "logps/rejected": -102.83808898925781, + "loss": 0.7684, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.100332498550415, + "rewards/margins": 5.669992446899414, + "rewards/rejected": -2.56965970993042, + "step": 19177 + }, + { + "epoch": 4.8, + "grad_norm": 7.181733131408691, + "learning_rate": 4.017871223393754e-08, + "logits/chosen": -0.5469383597373962, + "logits/rejected": -0.6073702573776245, + "logps/chosen": -55.50189971923828, + "logps/rejected": -95.23558807373047, + "loss": 0.6497, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.818025588989258, + "rewards/margins": 5.675222396850586, + "rewards/rejected": -2.857196569442749, + "step": 19178 + }, + { + "epoch": 4.8, + "grad_norm": 5.109707832336426, + "learning_rate": 4.0079331595902496e-08, + "logits/chosen": -0.5396422147750854, + "logits/rejected": -0.6204966902732849, + "logps/chosen": -59.150794982910156, + "logps/rejected": -121.14617156982422, + "loss": 0.6594, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.271759033203125, + "rewards/margins": 6.875070571899414, + "rewards/rejected": -3.603311538696289, + "step": 19179 + }, + { + "epoch": 4.8, + "grad_norm": 4.480387210845947, + "learning_rate": 3.998007352278233e-08, + "logits/chosen": -0.6038427948951721, + "logits/rejected": -0.7027332186698914, + "logps/chosen": -42.36751937866211, + "logps/rejected": -102.39483642578125, + "loss": 0.5561, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2690134048461914, + "rewards/margins": 7.6410932540893555, + "rewards/rejected": -4.372079849243164, + "step": 19180 + }, + { + "epoch": 4.8, + "grad_norm": 3.498333692550659, + "learning_rate": 3.988093801703008e-08, + "logits/chosen": -0.49296829104423523, + "logits/rejected": -0.6175665855407715, + "logps/chosen": -64.81256866455078, + "logps/rejected": -89.87596130371094, + "loss": 0.6124, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0162863731384277, + "rewards/margins": 7.004882335662842, + "rewards/rejected": -3.988595962524414, + "step": 19181 + }, + { + "epoch": 4.8, + "grad_norm": 7.354307174682617, + "learning_rate": 3.978192508109602e-08, + "logits/chosen": -0.5038144588470459, + "logits/rejected": -0.5989118814468384, + "logps/chosen": -59.71568298339844, + "logps/rejected": -100.61428833007812, + "loss": 0.6312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.964353084564209, + "rewards/margins": 6.778263092041016, + "rewards/rejected": -3.8139102458953857, + "step": 19182 + }, + { + "epoch": 4.8, + "grad_norm": 4.935235023498535, + "learning_rate": 3.968303471742651e-08, + "logits/chosen": -0.5136430263519287, + "logits/rejected": -0.6231898069381714, + "logps/chosen": -52.679771423339844, + "logps/rejected": -107.9428939819336, + "loss": 0.5746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0975561141967773, + "rewards/margins": 7.837023735046387, + "rewards/rejected": -4.739467620849609, + "step": 19183 + }, + { + "epoch": 4.8, + "grad_norm": 16.817285537719727, + "learning_rate": 3.958426692846518e-08, + "logits/chosen": -0.47728586196899414, + "logits/rejected": -0.5712231993675232, + "logps/chosen": -61.19974899291992, + "logps/rejected": -109.40950775146484, + "loss": 0.7624, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9096035957336426, + "rewards/margins": 7.11224365234375, + "rewards/rejected": -4.202639579772949, + "step": 19184 + }, + { + "epoch": 4.8, + "grad_norm": 7.6177544593811035, + "learning_rate": 3.948562171665282e-08, + "logits/chosen": -0.6053498387336731, + "logits/rejected": -0.6673676371574402, + "logps/chosen": -47.55290985107422, + "logps/rejected": -120.6454086303711, + "loss": 0.5591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0917556285858154, + "rewards/margins": 8.940991401672363, + "rewards/rejected": -5.849236488342285, + "step": 19185 + }, + { + "epoch": 4.8, + "grad_norm": 15.839664459228516, + "learning_rate": 3.938709908442695e-08, + "logits/chosen": -0.6224139332771301, + "logits/rejected": -0.6236411929130554, + "logps/chosen": -62.887081146240234, + "logps/rejected": -133.3018035888672, + "loss": 0.7157, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2216830253601074, + "rewards/margins": 6.704529762268066, + "rewards/rejected": -3.482846260070801, + "step": 19186 + }, + { + "epoch": 4.8, + "grad_norm": 4.492165565490723, + "learning_rate": 3.928869903422172e-08, + "logits/chosen": -0.6513859629631042, + "logits/rejected": -0.7306476831436157, + "logps/chosen": -54.6359977722168, + "logps/rejected": -107.17635345458984, + "loss": 0.691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.387049913406372, + "rewards/margins": 7.676054954528809, + "rewards/rejected": -4.289005279541016, + "step": 19187 + }, + { + "epoch": 4.8, + "grad_norm": 5.237128734588623, + "learning_rate": 3.919042156846964e-08, + "logits/chosen": -0.5362439155578613, + "logits/rejected": -0.5800837874412537, + "logps/chosen": -47.693397521972656, + "logps/rejected": -113.16329193115234, + "loss": 0.5979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.375788688659668, + "rewards/margins": 7.031394958496094, + "rewards/rejected": -3.655606269836426, + "step": 19188 + }, + { + "epoch": 4.8, + "grad_norm": 3.6710445880889893, + "learning_rate": 3.909226668959931e-08, + "logits/chosen": -0.5513460040092468, + "logits/rejected": -0.6028224229812622, + "logps/chosen": -65.7192611694336, + "logps/rejected": -112.25059509277344, + "loss": 0.6391, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1974570751190186, + "rewards/margins": 6.513566493988037, + "rewards/rejected": -3.3161096572875977, + "step": 19189 + }, + { + "epoch": 4.8, + "grad_norm": 12.578414916992188, + "learning_rate": 3.8994234400034914e-08, + "logits/chosen": -0.5955173969268799, + "logits/rejected": -0.6451603174209595, + "logps/chosen": -47.25978088378906, + "logps/rejected": -119.62559509277344, + "loss": 0.5881, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.198472023010254, + "rewards/margins": 7.304997444152832, + "rewards/rejected": -4.10652494430542, + "step": 19190 + }, + { + "epoch": 4.8, + "grad_norm": 3.1584486961364746, + "learning_rate": 3.8896324702200064e-08, + "logits/chosen": -0.5483026504516602, + "logits/rejected": -0.6076723337173462, + "logps/chosen": -63.17009735107422, + "logps/rejected": -112.55244445800781, + "loss": 0.565, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.991682767868042, + "rewards/margins": 6.507991790771484, + "rewards/rejected": -3.5163092613220215, + "step": 19191 + }, + { + "epoch": 4.8, + "grad_norm": 4.058505058288574, + "learning_rate": 3.879853759851393e-08, + "logits/chosen": -0.5391800403594971, + "logits/rejected": -0.6189801692962646, + "logps/chosen": -55.1583137512207, + "logps/rejected": -115.94404602050781, + "loss": 0.5891, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2214057445526123, + "rewards/margins": 7.272747993469238, + "rewards/rejected": -4.051342964172363, + "step": 19192 + }, + { + "epoch": 4.8, + "grad_norm": 2.5063092708587646, + "learning_rate": 3.870087309139292e-08, + "logits/chosen": -0.5239501595497131, + "logits/rejected": -0.6593071222305298, + "logps/chosen": -58.814537048339844, + "logps/rejected": -112.84053802490234, + "loss": 0.5294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1780757904052734, + "rewards/margins": 8.685713768005371, + "rewards/rejected": -5.507638454437256, + "step": 19193 + }, + { + "epoch": 4.8, + "grad_norm": 5.379093170166016, + "learning_rate": 3.86033311832501e-08, + "logits/chosen": -0.4871811866760254, + "logits/rejected": -0.6066482067108154, + "logps/chosen": -62.453556060791016, + "logps/rejected": -102.56649780273438, + "loss": 0.609, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9534924030303955, + "rewards/margins": 8.346299171447754, + "rewards/rejected": -5.392807483673096, + "step": 19194 + }, + { + "epoch": 4.8, + "grad_norm": 14.123385429382324, + "learning_rate": 3.850591187649633e-08, + "logits/chosen": -0.5104243755340576, + "logits/rejected": -0.6326988935470581, + "logps/chosen": -56.69027328491211, + "logps/rejected": -136.1815948486328, + "loss": 0.6269, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7581818103790283, + "rewards/margins": 9.138762474060059, + "rewards/rejected": -6.380580902099609, + "step": 19195 + }, + { + "epoch": 4.8, + "grad_norm": 11.117209434509277, + "learning_rate": 3.840861517353911e-08, + "logits/chosen": -0.5922637581825256, + "logits/rejected": -0.6480302214622498, + "logps/chosen": -59.121639251708984, + "logps/rejected": -123.55227661132812, + "loss": 0.7238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0331408977508545, + "rewards/margins": 8.011100769042969, + "rewards/rejected": -4.977960109710693, + "step": 19196 + }, + { + "epoch": 4.8, + "grad_norm": 14.128910064697266, + "learning_rate": 3.831144107678153e-08, + "logits/chosen": -0.5563262104988098, + "logits/rejected": -0.626207709312439, + "logps/chosen": -54.792747497558594, + "logps/rejected": -123.10448455810547, + "loss": 0.6353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.45109224319458, + "rewards/margins": 8.793073654174805, + "rewards/rejected": -5.341982364654541, + "step": 19197 + }, + { + "epoch": 4.8, + "grad_norm": 4.340371131896973, + "learning_rate": 3.821438958862666e-08, + "logits/chosen": -0.6094529032707214, + "logits/rejected": -0.7083019018173218, + "logps/chosen": -52.93809509277344, + "logps/rejected": -91.99639892578125, + "loss": 0.6989, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.903179407119751, + "rewards/margins": 6.837527751922607, + "rewards/rejected": -3.9343485832214355, + "step": 19198 + }, + { + "epoch": 4.8, + "grad_norm": 5.772721767425537, + "learning_rate": 3.811746071147149e-08, + "logits/chosen": -0.5519109964370728, + "logits/rejected": -0.5943790078163147, + "logps/chosen": -64.29166412353516, + "logps/rejected": -121.40760803222656, + "loss": 0.678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2288918495178223, + "rewards/margins": 7.270815849304199, + "rewards/rejected": -4.041923999786377, + "step": 19199 + }, + { + "epoch": 4.8, + "grad_norm": 9.578397750854492, + "learning_rate": 3.8020654447711304e-08, + "logits/chosen": -0.5747202038764954, + "logits/rejected": -0.6657217144966125, + "logps/chosen": -65.9537582397461, + "logps/rejected": -104.5354232788086, + "loss": 0.7085, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2339725494384766, + "rewards/margins": 6.5185465812683105, + "rewards/rejected": -3.284574031829834, + "step": 19200 + }, + { + "epoch": 4.8, + "grad_norm": 6.855461120605469, + "learning_rate": 3.7923970799738644e-08, + "logits/chosen": -0.5185661315917969, + "logits/rejected": -0.5779408812522888, + "logps/chosen": -54.442543029785156, + "logps/rejected": -110.76359558105469, + "loss": 0.5963, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3672425746917725, + "rewards/margins": 7.225679397583008, + "rewards/rejected": -3.8584365844726562, + "step": 19201 + }, + { + "epoch": 4.8, + "grad_norm": 4.781578063964844, + "learning_rate": 3.7827409769942705e-08, + "logits/chosen": -0.6161280274391174, + "logits/rejected": -0.6933707594871521, + "logps/chosen": -49.885475158691406, + "logps/rejected": -127.93902587890625, + "loss": 0.5676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.238705635070801, + "rewards/margins": 8.478689193725586, + "rewards/rejected": -5.239983558654785, + "step": 19202 + }, + { + "epoch": 4.8, + "grad_norm": 4.2109599113464355, + "learning_rate": 3.7730971360709354e-08, + "logits/chosen": -0.5397894382476807, + "logits/rejected": -0.6169857978820801, + "logps/chosen": -51.514488220214844, + "logps/rejected": -113.86080932617188, + "loss": 0.6788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.419668674468994, + "rewards/margins": 8.477259635925293, + "rewards/rejected": -5.057590961456299, + "step": 19203 + }, + { + "epoch": 4.8, + "grad_norm": 7.216142654418945, + "learning_rate": 3.7634655574422254e-08, + "logits/chosen": -0.5090442299842834, + "logits/rejected": -0.5830532312393188, + "logps/chosen": -44.99507522583008, + "logps/rejected": -87.25672912597656, + "loss": 0.7621, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.085824489593506, + "rewards/margins": 5.7778801918029785, + "rewards/rejected": -2.6920554637908936, + "step": 19204 + }, + { + "epoch": 4.8, + "grad_norm": 3.565488576889038, + "learning_rate": 3.753846241346004e-08, + "logits/chosen": -0.5092135667800903, + "logits/rejected": -0.5795055627822876, + "logps/chosen": -54.34407043457031, + "logps/rejected": -117.95823669433594, + "loss": 0.6927, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.325608730316162, + "rewards/margins": 7.829447269439697, + "rewards/rejected": -4.503839492797852, + "step": 19205 + }, + { + "epoch": 4.8, + "grad_norm": 6.19572639465332, + "learning_rate": 3.7442391880201375e-08, + "logits/chosen": -0.6006985306739807, + "logits/rejected": -0.5602244734764099, + "logps/chosen": -46.54875946044922, + "logps/rejected": -120.38763427734375, + "loss": 0.7085, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0857300758361816, + "rewards/margins": 5.434259414672852, + "rewards/rejected": -2.348529100418091, + "step": 19206 + }, + { + "epoch": 4.8, + "grad_norm": 4.855872631072998, + "learning_rate": 3.734644397701992e-08, + "logits/chosen": -0.4941198229789734, + "logits/rejected": -0.5849367380142212, + "logps/chosen": -65.87379455566406, + "logps/rejected": -115.56631469726562, + "loss": 0.63, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.212848424911499, + "rewards/margins": 7.634576320648193, + "rewards/rejected": -4.421728134155273, + "step": 19207 + }, + { + "epoch": 4.81, + "grad_norm": 5.3518877029418945, + "learning_rate": 3.7250618706285993e-08, + "logits/chosen": -0.7096934914588928, + "logits/rejected": -0.7508468627929688, + "logps/chosen": -44.9852294921875, + "logps/rejected": -105.61090087890625, + "loss": 0.5588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.217916250228882, + "rewards/margins": 7.384672164916992, + "rewards/rejected": -4.166755676269531, + "step": 19208 + }, + { + "epoch": 4.81, + "grad_norm": 15.066787719726562, + "learning_rate": 3.7154916070367696e-08, + "logits/chosen": -0.5851413011550903, + "logits/rejected": -0.6472985148429871, + "logps/chosen": -40.808990478515625, + "logps/rejected": -97.2890853881836, + "loss": 0.5676, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7190933227539062, + "rewards/margins": 6.978666305541992, + "rewards/rejected": -4.259572505950928, + "step": 19209 + }, + { + "epoch": 4.81, + "grad_norm": 21.269611358642578, + "learning_rate": 3.705933607163037e-08, + "logits/chosen": -0.5395423769950867, + "logits/rejected": -0.6164798736572266, + "logps/chosen": -61.55276870727539, + "logps/rejected": -121.27015686035156, + "loss": 0.6182, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2418017387390137, + "rewards/margins": 7.8004560470581055, + "rewards/rejected": -4.55865478515625, + "step": 19210 + }, + { + "epoch": 4.81, + "grad_norm": 3.4501585960388184, + "learning_rate": 3.6963878712435455e-08, + "logits/chosen": -0.5752730965614319, + "logits/rejected": -0.6688383221626282, + "logps/chosen": -60.32406997680664, + "logps/rejected": -109.21855163574219, + "loss": 0.625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119478225708008, + "rewards/margins": 7.898011207580566, + "rewards/rejected": -4.778532981872559, + "step": 19211 + }, + { + "epoch": 4.81, + "grad_norm": 2.571136474609375, + "learning_rate": 3.686854399514217e-08, + "logits/chosen": -0.6189278960227966, + "logits/rejected": -0.6404169797897339, + "logps/chosen": -40.47132110595703, + "logps/rejected": -121.45445251464844, + "loss": 0.503, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8785431385040283, + "rewards/margins": 7.941511631011963, + "rewards/rejected": -5.0629682540893555, + "step": 19212 + }, + { + "epoch": 4.81, + "grad_norm": 6.5260443687438965, + "learning_rate": 3.677333192210697e-08, + "logits/chosen": -0.5779711008071899, + "logits/rejected": -0.6095177531242371, + "logps/chosen": -58.19308853149414, + "logps/rejected": -110.65444946289062, + "loss": 0.7249, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.700793743133545, + "rewards/margins": 5.917177677154541, + "rewards/rejected": -3.216384172439575, + "step": 19213 + }, + { + "epoch": 4.81, + "grad_norm": 5.772431373596191, + "learning_rate": 3.6678242495680746e-08, + "logits/chosen": -0.6059862375259399, + "logits/rejected": -0.6641000509262085, + "logps/chosen": -39.26028060913086, + "logps/rejected": -106.17477416992188, + "loss": 0.5485, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4072189331054688, + "rewards/margins": 8.652942657470703, + "rewards/rejected": -5.245723724365234, + "step": 19214 + }, + { + "epoch": 4.81, + "grad_norm": 5.418318271636963, + "learning_rate": 3.6583275718214406e-08, + "logits/chosen": -0.5630821585655212, + "logits/rejected": -0.6360917687416077, + "logps/chosen": -57.27327346801758, + "logps/rejected": -114.41740417480469, + "loss": 0.6356, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.030048131942749, + "rewards/margins": 7.139437675476074, + "rewards/rejected": -4.109389781951904, + "step": 19215 + }, + { + "epoch": 4.81, + "grad_norm": 5.26956844329834, + "learning_rate": 3.648843159205495e-08, + "logits/chosen": -0.5687652230262756, + "logits/rejected": -0.6519564390182495, + "logps/chosen": -52.51982498168945, + "logps/rejected": -98.71923065185547, + "loss": 0.6545, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.04738187789917, + "rewards/margins": 5.882128715515137, + "rewards/rejected": -2.8347465991973877, + "step": 19216 + }, + { + "epoch": 4.81, + "grad_norm": 6.125055313110352, + "learning_rate": 3.639371011954607e-08, + "logits/chosen": -0.546800971031189, + "logits/rejected": -0.63019198179245, + "logps/chosen": -57.371437072753906, + "logps/rejected": -107.0578384399414, + "loss": 0.6003, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7283918857574463, + "rewards/margins": 7.46564245223999, + "rewards/rejected": -4.737250804901123, + "step": 19217 + }, + { + "epoch": 4.81, + "grad_norm": 2.6644318103790283, + "learning_rate": 3.6299111303027544e-08, + "logits/chosen": -0.689181923866272, + "logits/rejected": -0.7635339498519897, + "logps/chosen": -57.40950012207031, + "logps/rejected": -118.15116882324219, + "loss": 0.539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.296849250793457, + "rewards/margins": 8.577720642089844, + "rewards/rejected": -5.280872344970703, + "step": 19218 + }, + { + "epoch": 4.81, + "grad_norm": 9.207569122314453, + "learning_rate": 3.620463514483807e-08, + "logits/chosen": -0.5924072265625, + "logits/rejected": -0.6286238431930542, + "logps/chosen": -59.49314498901367, + "logps/rejected": -112.28144836425781, + "loss": 0.6302, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.08233642578125, + "rewards/margins": 7.333468914031982, + "rewards/rejected": -4.251131534576416, + "step": 19219 + }, + { + "epoch": 4.81, + "grad_norm": 15.784781455993652, + "learning_rate": 3.611028164731189e-08, + "logits/chosen": -0.5234116315841675, + "logits/rejected": -0.6125840544700623, + "logps/chosen": -49.735687255859375, + "logps/rejected": -90.69302368164062, + "loss": 0.6395, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.622343063354492, + "rewards/margins": 7.572214126586914, + "rewards/rejected": -3.949871063232422, + "step": 19220 + }, + { + "epoch": 4.81, + "grad_norm": 4.17779016494751, + "learning_rate": 3.6016050812780476e-08, + "logits/chosen": -0.5635558366775513, + "logits/rejected": -0.6678552627563477, + "logps/chosen": -68.92655944824219, + "logps/rejected": -107.67401885986328, + "loss": 0.6634, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2526731491088867, + "rewards/margins": 7.942450523376465, + "rewards/rejected": -4.689777851104736, + "step": 19221 + }, + { + "epoch": 4.81, + "grad_norm": 3.2642197608947754, + "learning_rate": 3.592194264357196e-08, + "logits/chosen": -0.4973769783973694, + "logits/rejected": -0.6129101514816284, + "logps/chosen": -51.02474594116211, + "logps/rejected": -92.62232971191406, + "loss": 0.5416, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.006542444229126, + "rewards/margins": 6.944076061248779, + "rewards/rejected": -3.9375336170196533, + "step": 19222 + }, + { + "epoch": 4.81, + "grad_norm": 5.323354721069336, + "learning_rate": 3.582795714201226e-08, + "logits/chosen": -0.5224348902702332, + "logits/rejected": -0.6398563981056213, + "logps/chosen": -68.98699188232422, + "logps/rejected": -105.4194107055664, + "loss": 0.6678, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.405796766281128, + "rewards/margins": 6.710208415985107, + "rewards/rejected": -4.304410934448242, + "step": 19223 + }, + { + "epoch": 4.81, + "grad_norm": 2.242047071456909, + "learning_rate": 3.573409431042396e-08, + "logits/chosen": -0.5192628502845764, + "logits/rejected": -0.6496841907501221, + "logps/chosen": -57.53750991821289, + "logps/rejected": -110.21382904052734, + "loss": 0.5633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.324869394302368, + "rewards/margins": 7.790828704833984, + "rewards/rejected": -4.465959548950195, + "step": 19224 + }, + { + "epoch": 4.81, + "grad_norm": 4.0139970779418945, + "learning_rate": 3.564035415112688e-08, + "logits/chosen": -0.5456641316413879, + "logits/rejected": -0.6257429122924805, + "logps/chosen": -66.60035705566406, + "logps/rejected": -95.83027648925781, + "loss": 0.7466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.16264009475708, + "rewards/margins": 7.09100866317749, + "rewards/rejected": -3.928368091583252, + "step": 19225 + }, + { + "epoch": 4.81, + "grad_norm": 5.80905818939209, + "learning_rate": 3.5546736666436373e-08, + "logits/chosen": -0.5994529128074646, + "logits/rejected": -0.6595797538757324, + "logps/chosen": -59.61827087402344, + "logps/rejected": -113.36073303222656, + "loss": 0.6868, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.984487295150757, + "rewards/margins": 7.4515485763549805, + "rewards/rejected": -4.467061519622803, + "step": 19226 + }, + { + "epoch": 4.81, + "grad_norm": 7.468678951263428, + "learning_rate": 3.545324185866672e-08, + "logits/chosen": -0.6024065017700195, + "logits/rejected": -0.6853582859039307, + "logps/chosen": -45.156402587890625, + "logps/rejected": -108.62740325927734, + "loss": 0.5907, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3351259231567383, + "rewards/margins": 8.974119186401367, + "rewards/rejected": -5.638993263244629, + "step": 19227 + }, + { + "epoch": 4.81, + "grad_norm": 4.270566940307617, + "learning_rate": 3.5359869730127725e-08, + "logits/chosen": -0.5578112602233887, + "logits/rejected": -0.6483268141746521, + "logps/chosen": -53.36840057373047, + "logps/rejected": -111.76984405517578, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.263479232788086, + "rewards/margins": 7.559009075164795, + "rewards/rejected": -4.295530319213867, + "step": 19228 + }, + { + "epoch": 4.81, + "grad_norm": 3.8760581016540527, + "learning_rate": 3.526662028312755e-08, + "logits/chosen": -0.6426479816436768, + "logits/rejected": -0.7302449345588684, + "logps/chosen": -50.30461883544922, + "logps/rejected": -109.68448638916016, + "loss": 0.6713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3752989768981934, + "rewards/margins": 8.47242546081543, + "rewards/rejected": -5.097126007080078, + "step": 19229 + }, + { + "epoch": 4.81, + "grad_norm": 3.2216315269470215, + "learning_rate": 3.517349351996935e-08, + "logits/chosen": -0.5058311223983765, + "logits/rejected": -0.5868738293647766, + "logps/chosen": -51.51030349731445, + "logps/rejected": -100.23680114746094, + "loss": 0.5686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.149531364440918, + "rewards/margins": 6.460080623626709, + "rewards/rejected": -3.310549736022949, + "step": 19230 + }, + { + "epoch": 4.81, + "grad_norm": 6.701732635498047, + "learning_rate": 3.5080489442955166e-08, + "logits/chosen": -0.5871189832687378, + "logits/rejected": -0.6575762033462524, + "logps/chosen": -56.113948822021484, + "logps/rejected": -99.046142578125, + "loss": 0.7004, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.060238838195801, + "rewards/margins": 5.712034702301025, + "rewards/rejected": -2.6517956256866455, + "step": 19231 + }, + { + "epoch": 4.81, + "grad_norm": 4.791623592376709, + "learning_rate": 3.498760805438317e-08, + "logits/chosen": -0.5577805042266846, + "logits/rejected": -0.6921550035476685, + "logps/chosen": -53.5849494934082, + "logps/rejected": -107.29313659667969, + "loss": 0.5879, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8357369899749756, + "rewards/margins": 9.781561851501465, + "rewards/rejected": -6.94582462310791, + "step": 19232 + }, + { + "epoch": 4.81, + "grad_norm": 4.340616703033447, + "learning_rate": 3.4894849356548187e-08, + "logits/chosen": -0.5947808623313904, + "logits/rejected": -0.6882070899009705, + "logps/chosen": -51.9195442199707, + "logps/rejected": -99.65601348876953, + "loss": 0.6377, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.182659864425659, + "rewards/margins": 7.226548671722412, + "rewards/rejected": -4.043889045715332, + "step": 19233 + }, + { + "epoch": 4.81, + "grad_norm": 11.286754608154297, + "learning_rate": 3.4802213351742834e-08, + "logits/chosen": -0.527701199054718, + "logits/rejected": -0.6176406145095825, + "logps/chosen": -50.998165130615234, + "logps/rejected": -110.3099594116211, + "loss": 0.6173, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.224917411804199, + "rewards/margins": 7.69627571105957, + "rewards/rejected": -4.471358776092529, + "step": 19234 + }, + { + "epoch": 4.81, + "grad_norm": 5.115945816040039, + "learning_rate": 3.470970004225582e-08, + "logits/chosen": -0.5436049103736877, + "logits/rejected": -0.5913944244384766, + "logps/chosen": -62.39076232910156, + "logps/rejected": -115.59696197509766, + "loss": 0.6306, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.131476879119873, + "rewards/margins": 8.510074615478516, + "rewards/rejected": -5.378597736358643, + "step": 19235 + }, + { + "epoch": 4.81, + "grad_norm": 7.0255656242370605, + "learning_rate": 3.4617309430373113e-08, + "logits/chosen": -0.57574063539505, + "logits/rejected": -0.6857581734657288, + "logps/chosen": -60.61599349975586, + "logps/rejected": -111.94721984863281, + "loss": 0.627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9666900634765625, + "rewards/margins": 7.500638484954834, + "rewards/rejected": -4.5339484214782715, + "step": 19236 + }, + { + "epoch": 4.81, + "grad_norm": 12.405085563659668, + "learning_rate": 3.4525041518378436e-08, + "logits/chosen": -0.5570935010910034, + "logits/rejected": -0.6828392744064331, + "logps/chosen": -51.543006896972656, + "logps/rejected": -90.01237487792969, + "loss": 0.5545, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.463710308074951, + "rewards/margins": 7.390061378479004, + "rewards/rejected": -3.9263510704040527, + "step": 19237 + }, + { + "epoch": 4.81, + "grad_norm": 2.7786669731140137, + "learning_rate": 3.4432896308551625e-08, + "logits/chosen": -0.6183674931526184, + "logits/rejected": -0.7152986526489258, + "logps/chosen": -51.95741271972656, + "logps/rejected": -94.4425048828125, + "loss": 0.564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1921682357788086, + "rewards/margins": 7.621755599975586, + "rewards/rejected": -4.429587364196777, + "step": 19238 + }, + { + "epoch": 4.81, + "grad_norm": 4.104068756103516, + "learning_rate": 3.434087380316919e-08, + "logits/chosen": -0.5403532981872559, + "logits/rejected": -0.6072862148284912, + "logps/chosen": -53.20425796508789, + "logps/rejected": -120.28535461425781, + "loss": 0.589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8980374336242676, + "rewards/margins": 7.115993499755859, + "rewards/rejected": -4.217956066131592, + "step": 19239 + }, + { + "epoch": 4.81, + "grad_norm": 5.47700309753418, + "learning_rate": 3.4248974004506e-08, + "logits/chosen": -0.5180441737174988, + "logits/rejected": -0.5925902128219604, + "logps/chosen": -57.48551940917969, + "logps/rejected": -106.97041320800781, + "loss": 0.5895, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0097999572753906, + "rewards/margins": 6.565789222717285, + "rewards/rejected": -3.5559892654418945, + "step": 19240 + }, + { + "epoch": 4.81, + "grad_norm": 6.0596923828125, + "learning_rate": 3.415719691483188e-08, + "logits/chosen": -0.5503911972045898, + "logits/rejected": -0.6218182444572449, + "logps/chosen": -50.958580017089844, + "logps/rejected": -95.9815444946289, + "loss": 0.7185, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.085606813430786, + "rewards/margins": 5.476438045501709, + "rewards/rejected": -2.3908307552337646, + "step": 19241 + }, + { + "epoch": 4.81, + "grad_norm": 7.387625694274902, + "learning_rate": 3.406554253641614e-08, + "logits/chosen": -0.4725954532623291, + "logits/rejected": -0.551974892616272, + "logps/chosen": -68.18577575683594, + "logps/rejected": -122.20520782470703, + "loss": 0.724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.005136013031006, + "rewards/margins": 7.636072158813477, + "rewards/rejected": -4.6309356689453125, + "step": 19242 + }, + { + "epoch": 4.81, + "grad_norm": 3.8043885231018066, + "learning_rate": 3.397401087152197e-08, + "logits/chosen": -0.5911705493927002, + "logits/rejected": -0.6088912487030029, + "logps/chosen": -50.39883804321289, + "logps/rejected": -120.39813995361328, + "loss": 0.6196, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.31083345413208, + "rewards/margins": 8.015291213989258, + "rewards/rejected": -4.704456329345703, + "step": 19243 + }, + { + "epoch": 4.81, + "grad_norm": 10.19082260131836, + "learning_rate": 3.388260192241255e-08, + "logits/chosen": -0.5161599516868591, + "logits/rejected": -0.6195447444915771, + "logps/chosen": -60.78114700317383, + "logps/rejected": -98.78163146972656, + "loss": 0.6927, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9728403091430664, + "rewards/margins": 6.800736904144287, + "rewards/rejected": -3.8278965950012207, + "step": 19244 + }, + { + "epoch": 4.81, + "grad_norm": 7.3738017082214355, + "learning_rate": 3.379131569134608e-08, + "logits/chosen": -0.5419957637786865, + "logits/rejected": -0.6221202611923218, + "logps/chosen": -58.661006927490234, + "logps/rejected": -101.07252502441406, + "loss": 0.7115, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.956660032272339, + "rewards/margins": 6.389647483825684, + "rewards/rejected": -3.432987689971924, + "step": 19245 + }, + { + "epoch": 4.81, + "grad_norm": 6.527525424957275, + "learning_rate": 3.3700152180578537e-08, + "logits/chosen": -0.5088536143302917, + "logits/rejected": -0.642166018486023, + "logps/chosen": -62.349124908447266, + "logps/rejected": -93.95043182373047, + "loss": 0.6672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7444205284118652, + "rewards/margins": 7.108242034912109, + "rewards/rejected": -4.363822937011719, + "step": 19246 + }, + { + "epoch": 4.81, + "grad_norm": 5.5372700691223145, + "learning_rate": 3.360911139236256e-08, + "logits/chosen": -0.46845173835754395, + "logits/rejected": -0.5384149551391602, + "logps/chosen": -55.00055694580078, + "logps/rejected": -96.21987915039062, + "loss": 0.6379, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.946155548095703, + "rewards/margins": 5.63045597076416, + "rewards/rejected": -2.6843008995056152, + "step": 19247 + }, + { + "epoch": 4.82, + "grad_norm": 6.030964374542236, + "learning_rate": 3.351819332894801e-08, + "logits/chosen": -0.5675754547119141, + "logits/rejected": -0.6519609689712524, + "logps/chosen": -52.75992965698242, + "logps/rejected": -121.31349182128906, + "loss": 0.6174, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1490821838378906, + "rewards/margins": 8.661473274230957, + "rewards/rejected": -5.512390613555908, + "step": 19248 + }, + { + "epoch": 4.82, + "grad_norm": 7.98410177230835, + "learning_rate": 3.3427397992581434e-08, + "logits/chosen": -0.5962058901786804, + "logits/rejected": -0.6913406252861023, + "logps/chosen": -62.432029724121094, + "logps/rejected": -112.08729553222656, + "loss": 0.603, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2599399089813232, + "rewards/margins": 7.942061424255371, + "rewards/rejected": -4.682121753692627, + "step": 19249 + }, + { + "epoch": 4.82, + "grad_norm": 3.49717116355896, + "learning_rate": 3.333672538550714e-08, + "logits/chosen": -0.5290052890777588, + "logits/rejected": -0.6312814354896545, + "logps/chosen": -51.77931213378906, + "logps/rejected": -103.92904663085938, + "loss": 0.5805, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1377644538879395, + "rewards/margins": 7.839743137359619, + "rewards/rejected": -4.7019782066345215, + "step": 19250 + }, + { + "epoch": 4.82, + "grad_norm": 7.837948322296143, + "learning_rate": 3.324617550996445e-08, + "logits/chosen": -0.5701074004173279, + "logits/rejected": -0.6361738443374634, + "logps/chosen": -48.74100112915039, + "logps/rejected": -112.70228576660156, + "loss": 0.5925, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2766714096069336, + "rewards/margins": 7.131726264953613, + "rewards/rejected": -3.8550543785095215, + "step": 19251 + }, + { + "epoch": 4.82, + "grad_norm": 4.680822372436523, + "learning_rate": 3.315574836819158e-08, + "logits/chosen": -0.535494327545166, + "logits/rejected": -0.6239824295043945, + "logps/chosen": -51.646034240722656, + "logps/rejected": -103.82020568847656, + "loss": 0.6309, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1179842948913574, + "rewards/margins": 6.681108474731445, + "rewards/rejected": -3.563124895095825, + "step": 19252 + }, + { + "epoch": 4.82, + "grad_norm": 2.2804136276245117, + "learning_rate": 3.3065443962423396e-08, + "logits/chosen": -0.4759049415588379, + "logits/rejected": -0.5018761157989502, + "logps/chosen": -47.09376525878906, + "logps/rejected": -101.99000549316406, + "loss": 0.5686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2781803607940674, + "rewards/margins": 7.102090358734131, + "rewards/rejected": -3.8239104747772217, + "step": 19253 + }, + { + "epoch": 4.82, + "grad_norm": 4.025874137878418, + "learning_rate": 3.2975262294890894e-08, + "logits/chosen": -0.5969226360321045, + "logits/rejected": -0.6031870245933533, + "logps/chosen": -43.99314880371094, + "logps/rejected": -94.04273223876953, + "loss": 0.5983, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1667098999023438, + "rewards/margins": 5.472102642059326, + "rewards/rejected": -2.3053932189941406, + "step": 19254 + }, + { + "epoch": 4.82, + "grad_norm": 4.210598468780518, + "learning_rate": 3.288520336782286e-08, + "logits/chosen": -0.5106402039527893, + "logits/rejected": -0.6187563538551331, + "logps/chosen": -57.20655059814453, + "logps/rejected": -112.32391357421875, + "loss": 0.6256, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0584189891815186, + "rewards/margins": 7.882013320922852, + "rewards/rejected": -4.823594093322754, + "step": 19255 + }, + { + "epoch": 4.82, + "grad_norm": 3.4528727531433105, + "learning_rate": 3.279526718344528e-08, + "logits/chosen": -0.5924083590507507, + "logits/rejected": -0.655391275882721, + "logps/chosen": -43.415672302246094, + "logps/rejected": -111.14008331298828, + "loss": 0.4949, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188258647918701, + "rewards/margins": 7.977497577667236, + "rewards/rejected": -4.789239406585693, + "step": 19256 + }, + { + "epoch": 4.82, + "grad_norm": 8.561603546142578, + "learning_rate": 3.270545374397971e-08, + "logits/chosen": -0.5316221117973328, + "logits/rejected": -0.6184566020965576, + "logps/chosen": -45.29231643676758, + "logps/rejected": -94.10459899902344, + "loss": 0.6473, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.146228551864624, + "rewards/margins": 7.369610786437988, + "rewards/rejected": -4.223381996154785, + "step": 19257 + }, + { + "epoch": 4.82, + "grad_norm": 4.1643805503845215, + "learning_rate": 3.261576305164549e-08, + "logits/chosen": -0.5131608843803406, + "logits/rejected": -0.5988472700119019, + "logps/chosen": -54.51774215698242, + "logps/rejected": -110.01612854003906, + "loss": 0.5643, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2823266983032227, + "rewards/margins": 6.808192253112793, + "rewards/rejected": -3.5258662700653076, + "step": 19258 + }, + { + "epoch": 4.82, + "grad_norm": 7.858584880828857, + "learning_rate": 3.252619510865973e-08, + "logits/chosen": -0.4858604669570923, + "logits/rejected": -0.5575956106185913, + "logps/chosen": -51.55278015136719, + "logps/rejected": -108.76341247558594, + "loss": 0.6248, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.208892822265625, + "rewards/margins": 7.428527355194092, + "rewards/rejected": -4.219634532928467, + "step": 19259 + }, + { + "epoch": 4.82, + "grad_norm": 6.947170257568359, + "learning_rate": 3.243674991723511e-08, + "logits/chosen": -0.6317845582962036, + "logits/rejected": -0.5952671766281128, + "logps/chosen": -74.78803253173828, + "logps/rejected": -104.44705200195312, + "loss": 0.7452, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.149639368057251, + "rewards/margins": 6.375683784484863, + "rewards/rejected": -3.2260444164276123, + "step": 19260 + }, + { + "epoch": 4.82, + "grad_norm": 5.972301483154297, + "learning_rate": 3.2347427479582086e-08, + "logits/chosen": -0.5943583250045776, + "logits/rejected": -0.6552209258079529, + "logps/chosen": -51.086448669433594, + "logps/rejected": -119.15614318847656, + "loss": 0.6285, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2469561100006104, + "rewards/margins": 8.415205955505371, + "rewards/rejected": -5.16825008392334, + "step": 19261 + }, + { + "epoch": 4.82, + "grad_norm": 2.2770650386810303, + "learning_rate": 3.2258227797908325e-08, + "logits/chosen": -0.5658543705940247, + "logits/rejected": -0.6288625597953796, + "logps/chosen": -43.79670333862305, + "logps/rejected": -113.89765167236328, + "loss": 0.5651, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.343573570251465, + "rewards/margins": 8.027374267578125, + "rewards/rejected": -4.683800220489502, + "step": 19262 + }, + { + "epoch": 4.82, + "grad_norm": 2.7581634521484375, + "learning_rate": 3.2169150874417076e-08, + "logits/chosen": -0.5671291947364807, + "logits/rejected": -0.681363046169281, + "logps/chosen": -57.498741149902344, + "logps/rejected": -89.80471801757812, + "loss": 0.5362, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.171501636505127, + "rewards/margins": 7.757596492767334, + "rewards/rejected": -4.586094856262207, + "step": 19263 + }, + { + "epoch": 4.82, + "grad_norm": 3.8172247409820557, + "learning_rate": 3.2080196711310466e-08, + "logits/chosen": -0.5448952317237854, + "logits/rejected": -0.6179012656211853, + "logps/chosen": -46.43375015258789, + "logps/rejected": -124.63700103759766, + "loss": 0.5505, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.170111656188965, + "rewards/margins": 8.300318717956543, + "rewards/rejected": -5.130207538604736, + "step": 19264 + }, + { + "epoch": 4.82, + "grad_norm": 7.2598958015441895, + "learning_rate": 3.1991365310786724e-08, + "logits/chosen": -0.5510267615318298, + "logits/rejected": -0.5576663613319397, + "logps/chosen": -54.90969467163086, + "logps/rejected": -112.51123046875, + "loss": 0.66, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5419466495513916, + "rewards/margins": 5.690901756286621, + "rewards/rejected": -2.1489546298980713, + "step": 19265 + }, + { + "epoch": 4.82, + "grad_norm": 2.8185667991638184, + "learning_rate": 3.1902656675039665e-08, + "logits/chosen": -0.5358710289001465, + "logits/rejected": -0.5849912166595459, + "logps/chosen": -47.28504943847656, + "logps/rejected": -133.22010803222656, + "loss": 0.5094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3526501655578613, + "rewards/margins": 7.617875576019287, + "rewards/rejected": -4.265225410461426, + "step": 19266 + }, + { + "epoch": 4.82, + "grad_norm": 4.330445289611816, + "learning_rate": 3.1814070806263086e-08, + "logits/chosen": -0.612919807434082, + "logits/rejected": -0.6999515295028687, + "logps/chosen": -50.009559631347656, + "logps/rejected": -96.5682144165039, + "loss": 0.6482, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0134835243225098, + "rewards/margins": 5.386603355407715, + "rewards/rejected": -2.373119831085205, + "step": 19267 + }, + { + "epoch": 4.82, + "grad_norm": 5.801294326782227, + "learning_rate": 3.172560770664468e-08, + "logits/chosen": -0.4967948794364929, + "logits/rejected": -0.5851936936378479, + "logps/chosen": -58.70945739746094, + "logps/rejected": -103.90666198730469, + "loss": 0.662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.663036584854126, + "rewards/margins": 6.753701210021973, + "rewards/rejected": -4.090664863586426, + "step": 19268 + }, + { + "epoch": 4.82, + "grad_norm": 4.667567253112793, + "learning_rate": 3.163726737837103e-08, + "logits/chosen": -0.602394163608551, + "logits/rejected": -0.6536910533905029, + "logps/chosen": -52.346317291259766, + "logps/rejected": -126.79312133789062, + "loss": 0.6091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.366103172302246, + "rewards/margins": 8.804634094238281, + "rewards/rejected": -5.438530921936035, + "step": 19269 + }, + { + "epoch": 4.82, + "grad_norm": 2.003180742263794, + "learning_rate": 3.154904982362539e-08, + "logits/chosen": -0.5244584679603577, + "logits/rejected": -0.5989113450050354, + "logps/chosen": -61.90807342529297, + "logps/rejected": -138.5146026611328, + "loss": 0.539, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25582218170166, + "rewards/margins": 8.627843856811523, + "rewards/rejected": -5.372020721435547, + "step": 19270 + }, + { + "epoch": 4.82, + "grad_norm": 4.115970134735107, + "learning_rate": 3.146095504458713e-08, + "logits/chosen": -0.5121364593505859, + "logits/rejected": -0.5772150754928589, + "logps/chosen": -53.96648406982422, + "logps/rejected": -99.34197998046875, + "loss": 0.5973, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2473509311676025, + "rewards/margins": 6.278499603271484, + "rewards/rejected": -3.031149387359619, + "step": 19271 + }, + { + "epoch": 4.82, + "grad_norm": 2.4767658710479736, + "learning_rate": 3.137298304343339e-08, + "logits/chosen": -0.6089447140693665, + "logits/rejected": -0.6872692108154297, + "logps/chosen": -51.70466613769531, + "logps/rejected": -113.34765625, + "loss": 0.5665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2446022033691406, + "rewards/margins": 8.42089557647705, + "rewards/rejected": -5.17629337310791, + "step": 19272 + }, + { + "epoch": 4.82, + "grad_norm": 7.213054656982422, + "learning_rate": 3.128513382233855e-08, + "logits/chosen": -0.5587140917778015, + "logits/rejected": -0.6663985252380371, + "logps/chosen": -61.724361419677734, + "logps/rejected": -107.93753814697266, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.820526123046875, + "rewards/margins": 6.977534294128418, + "rewards/rejected": -4.157009124755859, + "step": 19273 + }, + { + "epoch": 4.82, + "grad_norm": 3.343385934829712, + "learning_rate": 3.1197407383472546e-08, + "logits/chosen": -0.5970097780227661, + "logits/rejected": -0.6538053750991821, + "logps/chosen": -50.58445358276367, + "logps/rejected": -113.10258483886719, + "loss": 0.6708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4173972606658936, + "rewards/margins": 8.042166709899902, + "rewards/rejected": -4.62476921081543, + "step": 19274 + }, + { + "epoch": 4.82, + "grad_norm": 3.900357961654663, + "learning_rate": 3.1109803729004185e-08, + "logits/chosen": -0.5809822082519531, + "logits/rejected": -0.6453520059585571, + "logps/chosen": -38.42525100708008, + "logps/rejected": -106.68778228759766, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.294048309326172, + "rewards/margins": 6.849159240722656, + "rewards/rejected": -3.555110216140747, + "step": 19275 + }, + { + "epoch": 4.82, + "grad_norm": 10.45095157623291, + "learning_rate": 3.1022322861097296e-08, + "logits/chosen": -0.5342780947685242, + "logits/rejected": -0.5784887671470642, + "logps/chosen": -57.155372619628906, + "logps/rejected": -90.47096252441406, + "loss": 0.7414, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.217581033706665, + "rewards/margins": 6.878837585449219, + "rewards/rejected": -3.6612560749053955, + "step": 19276 + }, + { + "epoch": 4.82, + "grad_norm": 4.508054733276367, + "learning_rate": 3.093496478191405e-08, + "logits/chosen": -0.5865659117698669, + "logits/rejected": -0.6296367645263672, + "logps/chosen": -51.042850494384766, + "logps/rejected": -88.524658203125, + "loss": 0.6644, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2167067527770996, + "rewards/margins": 4.725254535675049, + "rewards/rejected": -1.5085477828979492, + "step": 19277 + }, + { + "epoch": 4.82, + "grad_norm": 6.930846691131592, + "learning_rate": 3.0847729493613257e-08, + "logits/chosen": -0.5014072060585022, + "logits/rejected": -0.6008689403533936, + "logps/chosen": -52.27903366088867, + "logps/rejected": -116.29643249511719, + "loss": 0.6233, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0763609409332275, + "rewards/margins": 8.927223205566406, + "rewards/rejected": -5.8508620262146, + "step": 19278 + }, + { + "epoch": 4.82, + "grad_norm": 7.546250820159912, + "learning_rate": 3.0760616998350423e-08, + "logits/chosen": -0.5925711393356323, + "logits/rejected": -0.6381909847259521, + "logps/chosen": -57.259185791015625, + "logps/rejected": -111.13941955566406, + "loss": 0.6395, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7653050422668457, + "rewards/margins": 7.050631523132324, + "rewards/rejected": -4.28532600402832, + "step": 19279 + }, + { + "epoch": 4.82, + "grad_norm": 4.822821140289307, + "learning_rate": 3.0673627298278276e-08, + "logits/chosen": -0.5899254679679871, + "logits/rejected": -0.6940856575965881, + "logps/chosen": -48.717872619628906, + "logps/rejected": -102.2330322265625, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1696202754974365, + "rewards/margins": 8.274059295654297, + "rewards/rejected": -5.104438304901123, + "step": 19280 + }, + { + "epoch": 4.82, + "grad_norm": 7.722794055938721, + "learning_rate": 3.0586760395546755e-08, + "logits/chosen": -0.5816810131072998, + "logits/rejected": -0.686211347579956, + "logps/chosen": -51.64754104614258, + "logps/rejected": -97.46359252929688, + "loss": 0.606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6892337799072266, + "rewards/margins": 7.791800498962402, + "rewards/rejected": -5.102566719055176, + "step": 19281 + }, + { + "epoch": 4.82, + "grad_norm": 3.940823554992676, + "learning_rate": 3.0500016292301925e-08, + "logits/chosen": -0.6230050921440125, + "logits/rejected": -0.6922817230224609, + "logps/chosen": -46.739952087402344, + "logps/rejected": -95.09992980957031, + "loss": 0.667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.362260341644287, + "rewards/margins": 7.760921001434326, + "rewards/rejected": -4.398660182952881, + "step": 19282 + }, + { + "epoch": 4.82, + "grad_norm": 30.071500778198242, + "learning_rate": 3.041339499068707e-08, + "logits/chosen": -0.5929174423217773, + "logits/rejected": -0.6583249568939209, + "logps/chosen": -51.29967498779297, + "logps/rejected": -111.6038818359375, + "loss": 0.5882, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.076531410217285, + "rewards/margins": 6.491637229919434, + "rewards/rejected": -3.4151058197021484, + "step": 19283 + }, + { + "epoch": 4.82, + "grad_norm": 3.1636734008789062, + "learning_rate": 3.032689649284326e-08, + "logits/chosen": -0.46306905150413513, + "logits/rejected": -0.5632727742195129, + "logps/chosen": -49.649356842041016, + "logps/rejected": -101.13871765136719, + "loss": 0.5115, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.320296049118042, + "rewards/margins": 7.323784351348877, + "rewards/rejected": -4.003488540649414, + "step": 19284 + }, + { + "epoch": 4.82, + "grad_norm": 4.589640140533447, + "learning_rate": 3.024052080090822e-08, + "logits/chosen": -0.5601208806037903, + "logits/rejected": -0.621574878692627, + "logps/chosen": -56.81062698364258, + "logps/rejected": -109.28958129882812, + "loss": 0.6669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.937899112701416, + "rewards/margins": 7.369905948638916, + "rewards/rejected": -4.432006359100342, + "step": 19285 + }, + { + "epoch": 4.82, + "grad_norm": 3.6317555904388428, + "learning_rate": 3.015426791701581e-08, + "logits/chosen": -0.539171040058136, + "logits/rejected": -0.5852420330047607, + "logps/chosen": -47.823570251464844, + "logps/rejected": -97.34333801269531, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0938661098480225, + "rewards/margins": 6.586734294891357, + "rewards/rejected": -3.492868423461914, + "step": 19286 + }, + { + "epoch": 4.82, + "grad_norm": 5.458555698394775, + "learning_rate": 3.0068137843297653e-08, + "logits/chosen": -0.5146620869636536, + "logits/rejected": -0.6031679511070251, + "logps/chosen": -66.49744415283203, + "logps/rejected": -117.07569122314453, + "loss": 0.6448, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.233870267868042, + "rewards/margins": 7.662444591522217, + "rewards/rejected": -4.428574085235596, + "step": 19287 + }, + { + "epoch": 4.83, + "grad_norm": 8.210427284240723, + "learning_rate": 2.998213058188204e-08, + "logits/chosen": -0.5313363671302795, + "logits/rejected": -0.6176903247833252, + "logps/chosen": -68.26680755615234, + "logps/rejected": -109.33577728271484, + "loss": 0.6494, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3767383098602295, + "rewards/margins": 6.805505752563477, + "rewards/rejected": -3.4287667274475098, + "step": 19288 + }, + { + "epoch": 4.83, + "grad_norm": 4.516999244689941, + "learning_rate": 2.9896246134893945e-08, + "logits/chosen": -0.5413643717765808, + "logits/rejected": -0.639670729637146, + "logps/chosen": -60.728759765625, + "logps/rejected": -93.5973129272461, + "loss": 0.5977, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.904510974884033, + "rewards/margins": 6.372654914855957, + "rewards/rejected": -3.468143939971924, + "step": 19289 + }, + { + "epoch": 4.83, + "grad_norm": 3.317473888397217, + "learning_rate": 2.981048450445612e-08, + "logits/chosen": -0.5347436666488647, + "logits/rejected": -0.6010022163391113, + "logps/chosen": -55.854515075683594, + "logps/rejected": -109.62982177734375, + "loss": 0.5861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3373091220855713, + "rewards/margins": 8.161510467529297, + "rewards/rejected": -4.824200630187988, + "step": 19290 + }, + { + "epoch": 4.83, + "grad_norm": 4.711631774902344, + "learning_rate": 2.9724845692687964e-08, + "logits/chosen": -0.5514681339263916, + "logits/rejected": -0.6125754714012146, + "logps/chosen": -50.58167266845703, + "logps/rejected": -117.51626586914062, + "loss": 0.6268, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4624760150909424, + "rewards/margins": 8.004443168640137, + "rewards/rejected": -4.541967391967773, + "step": 19291 + }, + { + "epoch": 4.83, + "grad_norm": 3.0154974460601807, + "learning_rate": 2.9639329701705576e-08, + "logits/chosen": -0.5417757034301758, + "logits/rejected": -0.5575889945030212, + "logps/chosen": -47.91556167602539, + "logps/rejected": -122.40814208984375, + "loss": 0.5893, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3709452152252197, + "rewards/margins": 7.64321756362915, + "rewards/rejected": -4.272273063659668, + "step": 19292 + }, + { + "epoch": 4.83, + "grad_norm": 3.825543165206909, + "learning_rate": 2.9553936533622264e-08, + "logits/chosen": -0.5596369504928589, + "logits/rejected": -0.6332640647888184, + "logps/chosen": -56.787479400634766, + "logps/rejected": -107.30551147460938, + "loss": 0.6418, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7077741622924805, + "rewards/margins": 6.603452682495117, + "rewards/rejected": -3.8956780433654785, + "step": 19293 + }, + { + "epoch": 4.83, + "grad_norm": 4.1370625495910645, + "learning_rate": 2.9468666190547447e-08, + "logits/chosen": -0.5845191478729248, + "logits/rejected": -0.660534679889679, + "logps/chosen": -50.201297760009766, + "logps/rejected": -123.76361083984375, + "loss": 0.5711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1425023078918457, + "rewards/margins": 7.924322605133057, + "rewards/rejected": -4.781820297241211, + "step": 19294 + }, + { + "epoch": 4.83, + "grad_norm": 5.069394588470459, + "learning_rate": 2.9383518674588883e-08, + "logits/chosen": -0.539046049118042, + "logits/rejected": -0.5978283882141113, + "logps/chosen": -46.272342681884766, + "logps/rejected": -101.5167465209961, + "loss": 0.6561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4871826171875, + "rewards/margins": 7.703762531280518, + "rewards/rejected": -4.216579914093018, + "step": 19295 + }, + { + "epoch": 4.83, + "grad_norm": 10.56796932220459, + "learning_rate": 2.9298493987851007e-08, + "logits/chosen": -0.5483324527740479, + "logits/rejected": -0.6138482093811035, + "logps/chosen": -48.5052490234375, + "logps/rejected": -109.76532745361328, + "loss": 0.695, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9240846633911133, + "rewards/margins": 7.841533660888672, + "rewards/rejected": -4.9174485206604, + "step": 19296 + }, + { + "epoch": 4.83, + "grad_norm": 8.31969928741455, + "learning_rate": 2.9213592132433798e-08, + "logits/chosen": -0.5438437461853027, + "logits/rejected": -0.6036003828048706, + "logps/chosen": -61.26638412475586, + "logps/rejected": -133.4241943359375, + "loss": 0.6835, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.70552921295166, + "rewards/margins": 8.092049598693848, + "rewards/rejected": -5.3865203857421875, + "step": 19297 + }, + { + "epoch": 4.83, + "grad_norm": 5.953340530395508, + "learning_rate": 2.9128813110436138e-08, + "logits/chosen": -0.5484619736671448, + "logits/rejected": -0.6166585683822632, + "logps/chosen": -64.05677032470703, + "logps/rejected": -108.62742614746094, + "loss": 0.6936, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8700592517852783, + "rewards/margins": 8.121213912963867, + "rewards/rejected": -5.251154899597168, + "step": 19298 + }, + { + "epoch": 4.83, + "grad_norm": 6.437920093536377, + "learning_rate": 2.9044156923953014e-08, + "logits/chosen": -0.549709141254425, + "logits/rejected": -0.6654800176620483, + "logps/chosen": -54.7595329284668, + "logps/rejected": -99.91426849365234, + "loss": 0.5451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.011960744857788, + "rewards/margins": 7.836122989654541, + "rewards/rejected": -4.82416296005249, + "step": 19299 + }, + { + "epoch": 4.83, + "grad_norm": 3.57717227935791, + "learning_rate": 2.895962357507609e-08, + "logits/chosen": -0.6052910685539246, + "logits/rejected": -0.6431120038032532, + "logps/chosen": -47.92039108276367, + "logps/rejected": -106.18048095703125, + "loss": 0.6131, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0074243545532227, + "rewards/margins": 7.115084648132324, + "rewards/rejected": -4.107659816741943, + "step": 19300 + }, + { + "epoch": 4.83, + "grad_norm": 2.643101692199707, + "learning_rate": 2.887521306589369e-08, + "logits/chosen": -0.5489438772201538, + "logits/rejected": -0.647076427936554, + "logps/chosen": -58.41325759887695, + "logps/rejected": -94.84232330322266, + "loss": 0.607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.147444009780884, + "rewards/margins": 7.167534828186035, + "rewards/rejected": -4.020090579986572, + "step": 19301 + }, + { + "epoch": 4.83, + "grad_norm": 7.427267074584961, + "learning_rate": 2.879092539849304e-08, + "logits/chosen": -0.5398357510566711, + "logits/rejected": -0.5751000046730042, + "logps/chosen": -58.74605941772461, + "logps/rejected": -131.8367919921875, + "loss": 0.749, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8597469329833984, + "rewards/margins": 6.445267677307129, + "rewards/rejected": -3.5855209827423096, + "step": 19302 + }, + { + "epoch": 4.83, + "grad_norm": 6.1105451583862305, + "learning_rate": 2.8706760574955807e-08, + "logits/chosen": -0.5959784984588623, + "logits/rejected": -0.6513043642044067, + "logps/chosen": -42.96955108642578, + "logps/rejected": -85.96627044677734, + "loss": 0.573, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.852032423019409, + "rewards/margins": 5.845834255218506, + "rewards/rejected": -2.9938015937805176, + "step": 19303 + }, + { + "epoch": 4.83, + "grad_norm": 4.551652431488037, + "learning_rate": 2.8622718597362542e-08, + "logits/chosen": -0.5135432481765747, + "logits/rejected": -0.5950302481651306, + "logps/chosen": -57.654266357421875, + "logps/rejected": -116.53411865234375, + "loss": 0.6134, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8733882904052734, + "rewards/margins": 7.434904098510742, + "rewards/rejected": -4.561515808105469, + "step": 19304 + }, + { + "epoch": 4.83, + "grad_norm": 6.834774971008301, + "learning_rate": 2.8538799467789368e-08, + "logits/chosen": -0.571729302406311, + "logits/rejected": -0.6290220022201538, + "logps/chosen": -47.903892517089844, + "logps/rejected": -94.47248077392578, + "loss": 0.6261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1013529300689697, + "rewards/margins": 6.267190456390381, + "rewards/rejected": -3.1658377647399902, + "step": 19305 + }, + { + "epoch": 4.83, + "grad_norm": 4.326534271240234, + "learning_rate": 2.8455003188310737e-08, + "logits/chosen": -0.616579532623291, + "logits/rejected": -0.6855065822601318, + "logps/chosen": -54.17356872558594, + "logps/rejected": -113.37911224365234, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1702136993408203, + "rewards/margins": 7.396144866943359, + "rewards/rejected": -4.225930690765381, + "step": 19306 + }, + { + "epoch": 4.83, + "grad_norm": 3.6905245780944824, + "learning_rate": 2.8371329760996656e-08, + "logits/chosen": -0.6179723143577576, + "logits/rejected": -0.7088184952735901, + "logps/chosen": -42.94247817993164, + "logps/rejected": -95.8275375366211, + "loss": 0.5558, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9367899894714355, + "rewards/margins": 7.4196672439575195, + "rewards/rejected": -4.482877254486084, + "step": 19307 + }, + { + "epoch": 4.83, + "grad_norm": 3.6294403076171875, + "learning_rate": 2.8287779187914922e-08, + "logits/chosen": -0.5865733027458191, + "logits/rejected": -0.6586594581604004, + "logps/chosen": -63.84354782104492, + "logps/rejected": -125.66685485839844, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2200658321380615, + "rewards/margins": 7.966669082641602, + "rewards/rejected": -4.746603012084961, + "step": 19308 + }, + { + "epoch": 4.83, + "grad_norm": 3.905879020690918, + "learning_rate": 2.8204351471130543e-08, + "logits/chosen": -0.6117110848426819, + "logits/rejected": -0.6788738965988159, + "logps/chosen": -44.54551315307617, + "logps/rejected": -98.1704330444336, + "loss": 0.5592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.293771743774414, + "rewards/margins": 8.156417846679688, + "rewards/rejected": -4.862645149230957, + "step": 19309 + }, + { + "epoch": 4.83, + "grad_norm": 2.471997022628784, + "learning_rate": 2.812104661270465e-08, + "logits/chosen": -0.5218147039413452, + "logits/rejected": -0.637489914894104, + "logps/chosen": -49.77435302734375, + "logps/rejected": -85.31744384765625, + "loss": 0.5637, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1406090259552, + "rewards/margins": 7.176662445068359, + "rewards/rejected": -4.036053657531738, + "step": 19310 + }, + { + "epoch": 4.83, + "grad_norm": 2.411924123764038, + "learning_rate": 2.803786461469671e-08, + "logits/chosen": -0.6077967286109924, + "logits/rejected": -0.6790120601654053, + "logps/chosen": -59.50468444824219, + "logps/rejected": -102.69865417480469, + "loss": 0.6199, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.505786418914795, + "rewards/margins": 7.484270095825195, + "rewards/rejected": -3.9784836769104004, + "step": 19311 + }, + { + "epoch": 4.83, + "grad_norm": 4.169193744659424, + "learning_rate": 2.795480547916063e-08, + "logits/chosen": -0.5711866617202759, + "logits/rejected": -0.6340941190719604, + "logps/chosen": -54.69316482543945, + "logps/rejected": -96.6482925415039, + "loss": 0.6951, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.201385498046875, + "rewards/margins": 5.7480316162109375, + "rewards/rejected": -2.5466458797454834, + "step": 19312 + }, + { + "epoch": 4.83, + "grad_norm": 6.410926342010498, + "learning_rate": 2.7871869208150327e-08, + "logits/chosen": -0.5659540295600891, + "logits/rejected": -0.6144297122955322, + "logps/chosen": -51.444820404052734, + "logps/rejected": -151.02154541015625, + "loss": 0.5777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.211404800415039, + "rewards/margins": 9.79239273071289, + "rewards/rejected": -6.580987930297852, + "step": 19313 + }, + { + "epoch": 4.83, + "grad_norm": 2.6689963340759277, + "learning_rate": 2.7789055803714715e-08, + "logits/chosen": -0.5222215056419373, + "logits/rejected": -0.6158609390258789, + "logps/chosen": -60.46893310546875, + "logps/rejected": -92.73991394042969, + "loss": 0.6596, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5071263313293457, + "rewards/margins": 6.890193939208984, + "rewards/rejected": -3.3830676078796387, + "step": 19314 + }, + { + "epoch": 4.83, + "grad_norm": 4.4833903312683105, + "learning_rate": 2.770636526789994e-08, + "logits/chosen": -0.5607037544250488, + "logits/rejected": -0.6481218338012695, + "logps/chosen": -45.17709732055664, + "logps/rejected": -112.31466674804688, + "loss": 0.556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0787439346313477, + "rewards/margins": 7.892359733581543, + "rewards/rejected": -4.813615322113037, + "step": 19315 + }, + { + "epoch": 4.83, + "grad_norm": 8.468287467956543, + "learning_rate": 2.7623797602749914e-08, + "logits/chosen": -0.5694738626480103, + "logits/rejected": -0.6213623285293579, + "logps/chosen": -69.2066421508789, + "logps/rejected": -100.96537780761719, + "loss": 0.7105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2482762336730957, + "rewards/margins": 6.123846054077148, + "rewards/rejected": -2.8755698204040527, + "step": 19316 + }, + { + "epoch": 4.83, + "grad_norm": 4.29248571395874, + "learning_rate": 2.7541352810304677e-08, + "logits/chosen": -0.5853496789932251, + "logits/rejected": -0.7207226157188416, + "logps/chosen": -60.45442581176758, + "logps/rejected": -102.64812469482422, + "loss": 0.605, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.994175910949707, + "rewards/margins": 8.355484008789062, + "rewards/rejected": -5.361309051513672, + "step": 19317 + }, + { + "epoch": 4.83, + "grad_norm": 5.411609649658203, + "learning_rate": 2.745903089260149e-08, + "logits/chosen": -0.5231056809425354, + "logits/rejected": -0.6359122395515442, + "logps/chosen": -50.33005142211914, + "logps/rejected": -113.48787689208984, + "loss": 0.5309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3986129760742188, + "rewards/margins": 9.268035888671875, + "rewards/rejected": -5.86942195892334, + "step": 19318 + }, + { + "epoch": 4.83, + "grad_norm": 4.221745491027832, + "learning_rate": 2.737683185167428e-08, + "logits/chosen": -0.5954992771148682, + "logits/rejected": -0.6847110986709595, + "logps/chosen": -44.05763626098633, + "logps/rejected": -111.25956726074219, + "loss": 0.6165, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1522529125213623, + "rewards/margins": 7.242911338806152, + "rewards/rejected": -4.090658664703369, + "step": 19319 + }, + { + "epoch": 4.83, + "grad_norm": 5.303101062774658, + "learning_rate": 2.7294755689555307e-08, + "logits/chosen": -0.6016324162483215, + "logits/rejected": -0.661144495010376, + "logps/chosen": -70.83151245117188, + "logps/rejected": -115.02366638183594, + "loss": 0.6385, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.998558282852173, + "rewards/margins": 7.661751747131348, + "rewards/rejected": -4.663193225860596, + "step": 19320 + }, + { + "epoch": 4.83, + "grad_norm": 22.33365249633789, + "learning_rate": 2.7212802408271843e-08, + "logits/chosen": -0.6001412272453308, + "logits/rejected": -0.6743181943893433, + "logps/chosen": -51.54768753051758, + "logps/rejected": -105.34172058105469, + "loss": 0.5891, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.238030433654785, + "rewards/margins": 8.139081001281738, + "rewards/rejected": -4.901049613952637, + "step": 19321 + }, + { + "epoch": 4.83, + "grad_norm": 5.565688610076904, + "learning_rate": 2.713097200984949e-08, + "logits/chosen": -0.5556872487068176, + "logits/rejected": -0.6671844124794006, + "logps/chosen": -53.451622009277344, + "logps/rejected": -118.46704864501953, + "loss": 0.5789, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.130186080932617, + "rewards/margins": 8.491612434387207, + "rewards/rejected": -5.36142635345459, + "step": 19322 + }, + { + "epoch": 4.83, + "grad_norm": 4.342289447784424, + "learning_rate": 2.7049264496309957e-08, + "logits/chosen": -0.5698851943016052, + "logits/rejected": -0.6439999938011169, + "logps/chosen": -53.767147064208984, + "logps/rejected": -108.80610656738281, + "loss": 0.5794, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2527527809143066, + "rewards/margins": 7.379539489746094, + "rewards/rejected": -4.126787185668945, + "step": 19323 + }, + { + "epoch": 4.83, + "grad_norm": 5.74887228012085, + "learning_rate": 2.69676798696733e-08, + "logits/chosen": -0.522373616695404, + "logits/rejected": -0.655249834060669, + "logps/chosen": -64.37287902832031, + "logps/rejected": -94.89116668701172, + "loss": 0.749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9610230922698975, + "rewards/margins": 7.205582618713379, + "rewards/rejected": -4.244559288024902, + "step": 19324 + }, + { + "epoch": 4.83, + "grad_norm": 12.219944953918457, + "learning_rate": 2.6886218131954022e-08, + "logits/chosen": -0.47759875655174255, + "logits/rejected": -0.5571074485778809, + "logps/chosen": -48.023704528808594, + "logps/rejected": -104.0138931274414, + "loss": 0.5603, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1748297214508057, + "rewards/margins": 7.088900566101074, + "rewards/rejected": -3.9140713214874268, + "step": 19325 + }, + { + "epoch": 4.83, + "grad_norm": 3.9066107273101807, + "learning_rate": 2.6804879285166065e-08, + "logits/chosen": -0.5063630938529968, + "logits/rejected": -0.5934601426124573, + "logps/chosen": -53.51322937011719, + "logps/rejected": -145.70327758789062, + "loss": 0.5386, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1394314765930176, + "rewards/margins": 8.94250202178955, + "rewards/rejected": -5.803070545196533, + "step": 19326 + }, + { + "epoch": 4.83, + "grad_norm": 6.707961082458496, + "learning_rate": 2.6723663331319482e-08, + "logits/chosen": -0.5041858553886414, + "logits/rejected": -0.5991703271865845, + "logps/chosen": -56.538238525390625, + "logps/rejected": -114.0329818725586, + "loss": 0.573, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.697665214538574, + "rewards/margins": 7.8146796226501465, + "rewards/rejected": -5.117013931274414, + "step": 19327 + }, + { + "epoch": 4.84, + "grad_norm": 5.894791603088379, + "learning_rate": 2.6642570272420454e-08, + "logits/chosen": -0.5161387324333191, + "logits/rejected": -0.6238937377929688, + "logps/chosen": -59.34191131591797, + "logps/rejected": -94.63440704345703, + "loss": 0.6283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8659470081329346, + "rewards/margins": 6.368936061859131, + "rewards/rejected": -3.5029892921447754, + "step": 19328 + }, + { + "epoch": 4.84, + "grad_norm": 5.77161169052124, + "learning_rate": 2.656160011047404e-08, + "logits/chosen": -0.578354001045227, + "logits/rejected": -0.6508876085281372, + "logps/chosen": -47.07807540893555, + "logps/rejected": -104.19012451171875, + "loss": 0.6273, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3452863693237305, + "rewards/margins": 7.115143775939941, + "rewards/rejected": -3.769857406616211, + "step": 19329 + }, + { + "epoch": 4.84, + "grad_norm": 7.6288018226623535, + "learning_rate": 2.6480752847480307e-08, + "logits/chosen": -0.5488759279251099, + "logits/rejected": -0.6484583020210266, + "logps/chosen": -57.9109001159668, + "logps/rejected": -100.18187713623047, + "loss": 0.7406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0681610107421875, + "rewards/margins": 6.745400428771973, + "rewards/rejected": -3.677239418029785, + "step": 19330 + }, + { + "epoch": 4.84, + "grad_norm": 3.147808074951172, + "learning_rate": 2.64000284854371e-08, + "logits/chosen": -0.5460752844810486, + "logits/rejected": -0.6098572611808777, + "logps/chosen": -45.296966552734375, + "logps/rejected": -121.53607940673828, + "loss": 0.6113, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.287421464920044, + "rewards/margins": 8.067034721374512, + "rewards/rejected": -4.779613494873047, + "step": 19331 + }, + { + "epoch": 4.84, + "grad_norm": 4.620631694793701, + "learning_rate": 2.6319427026339495e-08, + "logits/chosen": -0.5863876938819885, + "logits/rejected": -0.6566997766494751, + "logps/chosen": -51.73058319091797, + "logps/rejected": -91.20491790771484, + "loss": 0.694, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0017685890197754, + "rewards/margins": 6.1161932945251465, + "rewards/rejected": -3.11442494392395, + "step": 19332 + }, + { + "epoch": 4.84, + "grad_norm": 6.900975704193115, + "learning_rate": 2.6238948472178673e-08, + "logits/chosen": -0.5096518397331238, + "logits/rejected": -0.6356174945831299, + "logps/chosen": -69.96805572509766, + "logps/rejected": -102.05419921875, + "loss": 0.6712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.08563232421875, + "rewards/margins": 7.5431108474731445, + "rewards/rejected": -4.4574785232543945, + "step": 19333 + }, + { + "epoch": 4.84, + "grad_norm": 6.24714994430542, + "learning_rate": 2.615859282494415e-08, + "logits/chosen": -0.5732133388519287, + "logits/rejected": -0.7039423584938049, + "logps/chosen": -54.36239242553711, + "logps/rejected": -89.50738525390625, + "loss": 0.6278, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.332611083984375, + "rewards/margins": 6.827599048614502, + "rewards/rejected": -3.494988441467285, + "step": 19334 + }, + { + "epoch": 4.84, + "grad_norm": 3.896611213684082, + "learning_rate": 2.6078360086621013e-08, + "logits/chosen": -0.5906087160110474, + "logits/rejected": -0.6436192989349365, + "logps/chosen": -47.52872848510742, + "logps/rejected": -91.0532455444336, + "loss": 0.6671, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0692672729492188, + "rewards/margins": 6.488298416137695, + "rewards/rejected": -3.4190311431884766, + "step": 19335 + }, + { + "epoch": 4.84, + "grad_norm": 8.230509757995605, + "learning_rate": 2.599825025919267e-08, + "logits/chosen": -0.5961064696311951, + "logits/rejected": -0.6612827777862549, + "logps/chosen": -50.32416534423828, + "logps/rejected": -105.28594970703125, + "loss": 0.7473, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.212277889251709, + "rewards/margins": 6.692086219787598, + "rewards/rejected": -3.479808807373047, + "step": 19336 + }, + { + "epoch": 4.84, + "grad_norm": 7.579811096191406, + "learning_rate": 2.5918263344637538e-08, + "logits/chosen": -0.558140218257904, + "logits/rejected": -0.6427767872810364, + "logps/chosen": -52.3719482421875, + "logps/rejected": -98.40443420410156, + "loss": 0.606, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.896069288253784, + "rewards/margins": 6.9001569747924805, + "rewards/rejected": -4.004087924957275, + "step": 19337 + }, + { + "epoch": 4.84, + "grad_norm": 9.029943466186523, + "learning_rate": 2.583839934493293e-08, + "logits/chosen": -0.4922304153442383, + "logits/rejected": -0.5657685399055481, + "logps/chosen": -56.283172607421875, + "logps/rejected": -118.20001220703125, + "loss": 0.6284, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.917642593383789, + "rewards/margins": 6.30903434753418, + "rewards/rejected": -3.3913917541503906, + "step": 19338 + }, + { + "epoch": 4.84, + "grad_norm": 3.4215199947357178, + "learning_rate": 2.575865826205226e-08, + "logits/chosen": -0.5437349081039429, + "logits/rejected": -0.6071165800094604, + "logps/chosen": -64.89554595947266, + "logps/rejected": -107.07331085205078, + "loss": 0.688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0828287601470947, + "rewards/margins": 6.515355110168457, + "rewards/rejected": -3.4325263500213623, + "step": 19339 + }, + { + "epoch": 4.84, + "grad_norm": 3.207695960998535, + "learning_rate": 2.567904009796618e-08, + "logits/chosen": -0.5604392290115356, + "logits/rejected": -0.637255847454071, + "logps/chosen": -52.32621765136719, + "logps/rejected": -130.13209533691406, + "loss": 0.5286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1566379070281982, + "rewards/margins": 8.813959121704102, + "rewards/rejected": -5.657321453094482, + "step": 19340 + }, + { + "epoch": 4.84, + "grad_norm": 4.441166400909424, + "learning_rate": 2.5599544854641445e-08, + "logits/chosen": -0.5741428136825562, + "logits/rejected": -0.6483355760574341, + "logps/chosen": -54.648094177246094, + "logps/rejected": -101.6622314453125, + "loss": 0.6511, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1674880981445312, + "rewards/margins": 7.077813625335693, + "rewards/rejected": -3.910325288772583, + "step": 19341 + }, + { + "epoch": 4.84, + "grad_norm": 16.751550674438477, + "learning_rate": 2.5520172534043707e-08, + "logits/chosen": -0.5051309466362, + "logits/rejected": -0.5818445682525635, + "logps/chosen": -64.88327026367188, + "logps/rejected": -83.4080581665039, + "loss": 0.7434, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.325840950012207, + "rewards/margins": 5.287680625915527, + "rewards/rejected": -1.961839199066162, + "step": 19342 + }, + { + "epoch": 4.84, + "grad_norm": 3.7467403411865234, + "learning_rate": 2.5440923138133066e-08, + "logits/chosen": -0.6209647059440613, + "logits/rejected": -0.7419419884681702, + "logps/chosen": -50.49331283569336, + "logps/rejected": -115.32877349853516, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.266704559326172, + "rewards/margins": 6.880029678344727, + "rewards/rejected": -3.6133251190185547, + "step": 19343 + }, + { + "epoch": 4.84, + "grad_norm": 1.96723210811615, + "learning_rate": 2.536179666886851e-08, + "logits/chosen": -0.5682081580162048, + "logits/rejected": -0.6575633883476257, + "logps/chosen": -45.82513427734375, + "logps/rejected": -117.13800048828125, + "loss": 0.5285, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249619245529175, + "rewards/margins": 8.7261381149292, + "rewards/rejected": -5.4765191078186035, + "step": 19344 + }, + { + "epoch": 4.84, + "grad_norm": 2.559483528137207, + "learning_rate": 2.5282793128205142e-08, + "logits/chosen": -0.524326503276825, + "logits/rejected": -0.5982385873794556, + "logps/chosen": -47.00079345703125, + "logps/rejected": -132.12022399902344, + "loss": 0.5518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1245076656341553, + "rewards/margins": 9.072397232055664, + "rewards/rejected": -5.947890281677246, + "step": 19345 + }, + { + "epoch": 4.84, + "grad_norm": 17.620254516601562, + "learning_rate": 2.5203912518095286e-08, + "logits/chosen": -0.5249480605125427, + "logits/rejected": -0.6091601252555847, + "logps/chosen": -54.372962951660156, + "logps/rejected": -106.77977752685547, + "loss": 0.7127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0360522270202637, + "rewards/margins": 6.95621395111084, + "rewards/rejected": -3.920161485671997, + "step": 19346 + }, + { + "epoch": 4.84, + "grad_norm": 6.099475383758545, + "learning_rate": 2.5125154840488497e-08, + "logits/chosen": -0.6210784912109375, + "logits/rejected": -0.6946151256561279, + "logps/chosen": -61.609336853027344, + "logps/rejected": -109.2489242553711, + "loss": 0.6832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.264665365219116, + "rewards/margins": 7.807742118835449, + "rewards/rejected": -4.543076992034912, + "step": 19347 + }, + { + "epoch": 4.84, + "grad_norm": 7.9511799812316895, + "learning_rate": 2.504652009733044e-08, + "logits/chosen": -0.5675219893455505, + "logits/rejected": -0.6816178560256958, + "logps/chosen": -63.52751922607422, + "logps/rejected": -97.2918701171875, + "loss": 0.687, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.964921712875366, + "rewards/margins": 6.544274806976318, + "rewards/rejected": -3.579353094100952, + "step": 19348 + }, + { + "epoch": 4.84, + "grad_norm": 2.956165313720703, + "learning_rate": 2.496800829056456e-08, + "logits/chosen": -0.5233655571937561, + "logits/rejected": -0.5623183250427246, + "logps/chosen": -48.02650451660156, + "logps/rejected": -107.96435546875, + "loss": 0.5599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2917051315307617, + "rewards/margins": 7.124240875244141, + "rewards/rejected": -3.8325350284576416, + "step": 19349 + }, + { + "epoch": 4.84, + "grad_norm": 4.101939678192139, + "learning_rate": 2.4889619422130418e-08, + "logits/chosen": -0.4717060327529907, + "logits/rejected": -0.6095440983772278, + "logps/chosen": -63.67689514160156, + "logps/rejected": -93.57084655761719, + "loss": 0.587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1446428298950195, + "rewards/margins": 6.3062357902526855, + "rewards/rejected": -3.161592960357666, + "step": 19350 + }, + { + "epoch": 4.84, + "grad_norm": 3.918653726577759, + "learning_rate": 2.4811353493965907e-08, + "logits/chosen": -0.5307987332344055, + "logits/rejected": -0.5806906819343567, + "logps/chosen": -52.7983512878418, + "logps/rejected": -118.1290283203125, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.047189235687256, + "rewards/margins": 7.502566337585449, + "rewards/rejected": -4.455377101898193, + "step": 19351 + }, + { + "epoch": 4.84, + "grad_norm": 16.91274070739746, + "learning_rate": 2.4733210508005034e-08, + "logits/chosen": -0.48430079221725464, + "logits/rejected": -0.5713786482810974, + "logps/chosen": -59.188838958740234, + "logps/rejected": -101.58113098144531, + "loss": 0.6559, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0227606296539307, + "rewards/margins": 5.95994758605957, + "rewards/rejected": -2.9371864795684814, + "step": 19352 + }, + { + "epoch": 4.84, + "grad_norm": 20.638919830322266, + "learning_rate": 2.4655190466177926e-08, + "logits/chosen": -0.5728901624679565, + "logits/rejected": -0.6600276231765747, + "logps/chosen": -58.03438949584961, + "logps/rejected": -121.46554565429688, + "loss": 0.6707, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1567158699035645, + "rewards/margins": 8.812149047851562, + "rewards/rejected": -5.6554341316223145, + "step": 19353 + }, + { + "epoch": 4.84, + "grad_norm": 6.09699821472168, + "learning_rate": 2.4577293370413036e-08, + "logits/chosen": -0.5396308898925781, + "logits/rejected": -0.6509075164794922, + "logps/chosen": -51.67039489746094, + "logps/rejected": -110.77670288085938, + "loss": 0.6315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089015483856201, + "rewards/margins": 7.149284839630127, + "rewards/rejected": -4.060269832611084, + "step": 19354 + }, + { + "epoch": 4.84, + "grad_norm": 4.4308600425720215, + "learning_rate": 2.4499519222635493e-08, + "logits/chosen": -0.48687875270843506, + "logits/rejected": -0.587310254573822, + "logps/chosen": -64.8538818359375, + "logps/rejected": -114.02127838134766, + "loss": 0.639, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0599634647369385, + "rewards/margins": 7.442265033721924, + "rewards/rejected": -4.382301330566406, + "step": 19355 + }, + { + "epoch": 4.84, + "grad_norm": 3.1356124877929688, + "learning_rate": 2.4421868024767094e-08, + "logits/chosen": -0.46633219718933105, + "logits/rejected": -0.5946475863456726, + "logps/chosen": -60.11817932128906, + "logps/rejected": -98.39474487304688, + "loss": 0.5933, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0541062355041504, + "rewards/margins": 6.5223894119262695, + "rewards/rejected": -3.4682836532592773, + "step": 19356 + }, + { + "epoch": 4.84, + "grad_norm": 8.855680465698242, + "learning_rate": 2.4344339778726855e-08, + "logits/chosen": -0.5589700937271118, + "logits/rejected": -0.6591415405273438, + "logps/chosen": -51.695465087890625, + "logps/rejected": -94.34625244140625, + "loss": 0.6714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.296684503555298, + "rewards/margins": 6.529577732086182, + "rewards/rejected": -3.232893705368042, + "step": 19357 + }, + { + "epoch": 4.84, + "grad_norm": 5.643652439117432, + "learning_rate": 2.4266934486429916e-08, + "logits/chosen": -0.5325338840484619, + "logits/rejected": -0.6257995963096619, + "logps/chosen": -64.49067687988281, + "logps/rejected": -108.92108154296875, + "loss": 0.6295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9865059852600098, + "rewards/margins": 7.07164192199707, + "rewards/rejected": -4.0851359367370605, + "step": 19358 + }, + { + "epoch": 4.84, + "grad_norm": 23.953495025634766, + "learning_rate": 2.418965214978919e-08, + "logits/chosen": -0.48499274253845215, + "logits/rejected": -0.5350403785705566, + "logps/chosen": -64.79010009765625, + "logps/rejected": -112.73867797851562, + "loss": 0.8067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.426633596420288, + "rewards/margins": 5.573624610900879, + "rewards/rejected": -3.1469907760620117, + "step": 19359 + }, + { + "epoch": 4.84, + "grad_norm": 4.2064385414123535, + "learning_rate": 2.411249277071481e-08, + "logits/chosen": -0.5408678650856018, + "logits/rejected": -0.6443342566490173, + "logps/chosen": -57.37904357910156, + "logps/rejected": -106.09494018554688, + "loss": 0.6583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2359659671783447, + "rewards/margins": 7.220455646514893, + "rewards/rejected": -3.984489917755127, + "step": 19360 + }, + { + "epoch": 4.84, + "grad_norm": 3.542992115020752, + "learning_rate": 2.403545635111415e-08, + "logits/chosen": -0.5274147987365723, + "logits/rejected": -0.6028323769569397, + "logps/chosen": -60.30751037597656, + "logps/rejected": -125.03146362304688, + "loss": 0.6008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.842320680618286, + "rewards/margins": 8.029317855834961, + "rewards/rejected": -5.1869964599609375, + "step": 19361 + }, + { + "epoch": 4.84, + "grad_norm": 5.808130264282227, + "learning_rate": 2.3958542892889014e-08, + "logits/chosen": -0.5297883749008179, + "logits/rejected": -0.6059159636497498, + "logps/chosen": -55.11189270019531, + "logps/rejected": -120.12982940673828, + "loss": 0.5776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.113424777984619, + "rewards/margins": 7.470575332641602, + "rewards/rejected": -4.357150554656982, + "step": 19362 + }, + { + "epoch": 4.84, + "grad_norm": 5.3264875411987305, + "learning_rate": 2.3881752397941772e-08, + "logits/chosen": -0.5274823307991028, + "logits/rejected": -0.634745180606842, + "logps/chosen": -58.57533264160156, + "logps/rejected": -92.08229064941406, + "loss": 0.6625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9780030250549316, + "rewards/margins": 6.357983589172363, + "rewards/rejected": -3.3799803256988525, + "step": 19363 + }, + { + "epoch": 4.84, + "grad_norm": 3.4987716674804688, + "learning_rate": 2.3805084868168683e-08, + "logits/chosen": -0.538224458694458, + "logits/rejected": -0.6207233667373657, + "logps/chosen": -62.939884185791016, + "logps/rejected": -121.42900085449219, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3348588943481445, + "rewards/margins": 7.850212574005127, + "rewards/rejected": -4.515353679656982, + "step": 19364 + }, + { + "epoch": 4.84, + "grad_norm": 2.601470708847046, + "learning_rate": 2.37285403054649e-08, + "logits/chosen": -0.5638625025749207, + "logits/rejected": -0.6730112433433533, + "logps/chosen": -57.13374328613281, + "logps/rejected": -112.86737060546875, + "loss": 0.5368, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2050788402557373, + "rewards/margins": 7.936683654785156, + "rewards/rejected": -4.731605052947998, + "step": 19365 + }, + { + "epoch": 4.84, + "grad_norm": 3.442186117172241, + "learning_rate": 2.365211871172224e-08, + "logits/chosen": -0.5445261597633362, + "logits/rejected": -0.6423354148864746, + "logps/chosen": -44.358795166015625, + "logps/rejected": -103.06272888183594, + "loss": 0.4676, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2264134883880615, + "rewards/margins": 7.507930278778076, + "rewards/rejected": -4.281516075134277, + "step": 19366 + }, + { + "epoch": 4.84, + "grad_norm": 6.449059009552002, + "learning_rate": 2.3575820088829193e-08, + "logits/chosen": -0.5152401328086853, + "logits/rejected": -0.6063114404678345, + "logps/chosen": -61.583099365234375, + "logps/rejected": -89.97380828857422, + "loss": 0.693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1960737705230713, + "rewards/margins": 6.796103477478027, + "rewards/rejected": -3.600029945373535, + "step": 19367 + }, + { + "epoch": 4.85, + "grad_norm": 4.766316890716553, + "learning_rate": 2.349964443866981e-08, + "logits/chosen": -0.517346978187561, + "logits/rejected": -0.5989792943000793, + "logps/chosen": -58.400997161865234, + "logps/rejected": -119.5453109741211, + "loss": 0.6315, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.84306001663208, + "rewards/margins": 7.5896315574646, + "rewards/rejected": -4.746571063995361, + "step": 19368 + }, + { + "epoch": 4.85, + "grad_norm": 1.7462307214736938, + "learning_rate": 2.3423591763127585e-08, + "logits/chosen": -0.513146162033081, + "logits/rejected": -0.6006157994270325, + "logps/chosen": -69.17135620117188, + "logps/rejected": -118.18675231933594, + "loss": 0.5982, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4119017124176025, + "rewards/margins": 8.085835456848145, + "rewards/rejected": -4.673933506011963, + "step": 19369 + }, + { + "epoch": 4.85, + "grad_norm": 4.7495951652526855, + "learning_rate": 2.3347662064082678e-08, + "logits/chosen": -0.5406786203384399, + "logits/rejected": -0.6533834934234619, + "logps/chosen": -50.342857360839844, + "logps/rejected": -113.55741882324219, + "loss": 0.5548, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9571566581726074, + "rewards/margins": 8.675743103027344, + "rewards/rejected": -5.718585968017578, + "step": 19370 + }, + { + "epoch": 4.85, + "grad_norm": 4.259592056274414, + "learning_rate": 2.32718553434097e-08, + "logits/chosen": -0.5844167470932007, + "logits/rejected": -0.6308395862579346, + "logps/chosen": -53.147483825683594, + "logps/rejected": -134.10098266601562, + "loss": 0.6245, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9082486629486084, + "rewards/margins": 7.149242401123047, + "rewards/rejected": -4.240993499755859, + "step": 19371 + }, + { + "epoch": 4.85, + "grad_norm": 3.484374523162842, + "learning_rate": 2.3196171602982707e-08, + "logits/chosen": -0.5556491613388062, + "logits/rejected": -0.6281572580337524, + "logps/chosen": -52.55753707885742, + "logps/rejected": -111.72212219238281, + "loss": 0.6858, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2657713890075684, + "rewards/margins": 7.3119049072265625, + "rewards/rejected": -4.046133518218994, + "step": 19372 + }, + { + "epoch": 4.85, + "grad_norm": 4.131279468536377, + "learning_rate": 2.3120610844671876e-08, + "logits/chosen": -0.5820183753967285, + "logits/rejected": -0.657696008682251, + "logps/chosen": -53.994625091552734, + "logps/rejected": -102.79499053955078, + "loss": 0.6518, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.239328622817993, + "rewards/margins": 7.132723331451416, + "rewards/rejected": -3.893395185470581, + "step": 19373 + }, + { + "epoch": 4.85, + "grad_norm": 2.3383431434631348, + "learning_rate": 2.3045173070344595e-08, + "logits/chosen": -0.526608407497406, + "logits/rejected": -0.5565190315246582, + "logps/chosen": -45.413658142089844, + "logps/rejected": -107.21138000488281, + "loss": 0.5261, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2975549697875977, + "rewards/margins": 7.172267436981201, + "rewards/rejected": -3.8747124671936035, + "step": 19374 + }, + { + "epoch": 4.85, + "grad_norm": 6.755800247192383, + "learning_rate": 2.2969858281864375e-08, + "logits/chosen": -0.5518840551376343, + "logits/rejected": -0.7029626369476318, + "logps/chosen": -68.80511474609375, + "logps/rejected": -90.45748901367188, + "loss": 0.7209, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.043674945831299, + "rewards/margins": 7.143115043640137, + "rewards/rejected": -4.099440097808838, + "step": 19375 + }, + { + "epoch": 4.85, + "grad_norm": 2.0926241874694824, + "learning_rate": 2.2894666481093618e-08, + "logits/chosen": -0.5749772787094116, + "logits/rejected": -0.6227267384529114, + "logps/chosen": -45.552635192871094, + "logps/rejected": -118.02031707763672, + "loss": 0.5136, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3846330642700195, + "rewards/margins": 8.405731201171875, + "rewards/rejected": -5.021097660064697, + "step": 19376 + }, + { + "epoch": 4.85, + "grad_norm": 4.360612392425537, + "learning_rate": 2.2819597669888615e-08, + "logits/chosen": -0.5384340882301331, + "logits/rejected": -0.6185611486434937, + "logps/chosen": -47.332313537597656, + "logps/rejected": -112.0574951171875, + "loss": 0.5143, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2975056171417236, + "rewards/margins": 8.269508361816406, + "rewards/rejected": -4.9720025062561035, + "step": 19377 + }, + { + "epoch": 4.85, + "grad_norm": 9.687363624572754, + "learning_rate": 2.274465185010566e-08, + "logits/chosen": -0.7035040855407715, + "logits/rejected": -0.7239136099815369, + "logps/chosen": -45.88648986816406, + "logps/rejected": -101.43257904052734, + "loss": 0.7802, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.752176284790039, + "rewards/margins": 6.3262786865234375, + "rewards/rejected": -3.5741028785705566, + "step": 19378 + }, + { + "epoch": 4.85, + "grad_norm": 12.102392196655273, + "learning_rate": 2.26698290235966e-08, + "logits/chosen": -0.49059662222862244, + "logits/rejected": -0.5482103228569031, + "logps/chosen": -53.96479034423828, + "logps/rejected": -115.91668701171875, + "loss": 0.6328, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1590518951416016, + "rewards/margins": 6.63072395324707, + "rewards/rejected": -3.4716715812683105, + "step": 19379 + }, + { + "epoch": 4.85, + "grad_norm": 4.0158891677856445, + "learning_rate": 2.2595129192209963e-08, + "logits/chosen": -0.4899256229400635, + "logits/rejected": -0.6017295122146606, + "logps/chosen": -61.285335540771484, + "logps/rejected": -84.36578369140625, + "loss": 0.6458, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.915163516998291, + "rewards/margins": 6.61956262588501, + "rewards/rejected": -3.7043991088867188, + "step": 19380 + }, + { + "epoch": 4.85, + "grad_norm": 6.2235212326049805, + "learning_rate": 2.2520552357792047e-08, + "logits/chosen": -0.6711777448654175, + "logits/rejected": -0.7139214277267456, + "logps/chosen": -47.142799377441406, + "logps/rejected": -129.05459594726562, + "loss": 0.6402, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2766361236572266, + "rewards/margins": 8.44969367980957, + "rewards/rejected": -5.1730570793151855, + "step": 19381 + }, + { + "epoch": 4.85, + "grad_norm": 4.337902069091797, + "learning_rate": 2.2446098522185822e-08, + "logits/chosen": -0.522957980632782, + "logits/rejected": -0.5716640949249268, + "logps/chosen": -63.666717529296875, + "logps/rejected": -128.01913452148438, + "loss": 0.6528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2779643535614014, + "rewards/margins": 9.341635704040527, + "rewards/rejected": -6.063671588897705, + "step": 19382 + }, + { + "epoch": 4.85, + "grad_norm": 5.255043983459473, + "learning_rate": 2.2371767687230373e-08, + "logits/chosen": -0.5678164958953857, + "logits/rejected": -0.6352229118347168, + "logps/chosen": -58.202064514160156, + "logps/rejected": -122.57455444335938, + "loss": 0.5808, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4233226776123047, + "rewards/margins": 7.567070007324219, + "rewards/rejected": -4.143746852874756, + "step": 19383 + }, + { + "epoch": 4.85, + "grad_norm": 31.109302520751953, + "learning_rate": 2.229755985476312e-08, + "logits/chosen": -0.5490341782569885, + "logits/rejected": -0.623296320438385, + "logps/chosen": -57.15394592285156, + "logps/rejected": -111.28829956054688, + "loss": 0.7308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.912161350250244, + "rewards/margins": 6.666645526885986, + "rewards/rejected": -3.754484176635742, + "step": 19384 + }, + { + "epoch": 4.85, + "grad_norm": 3.0932390689849854, + "learning_rate": 2.2223475026617593e-08, + "logits/chosen": -0.6036691665649414, + "logits/rejected": -0.6933450698852539, + "logps/chosen": -52.56275939941406, + "logps/rejected": -92.82825469970703, + "loss": 0.5899, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0965750217437744, + "rewards/margins": 7.091770172119141, + "rewards/rejected": -3.995194911956787, + "step": 19385 + }, + { + "epoch": 4.85, + "grad_norm": 3.8635497093200684, + "learning_rate": 2.2149513204624552e-08, + "logits/chosen": -0.5043988227844238, + "logits/rejected": -0.6033285856246948, + "logps/chosen": -52.20249938964844, + "logps/rejected": -107.11215209960938, + "loss": 0.5913, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5032570362091064, + "rewards/margins": 8.694666862487793, + "rewards/rejected": -5.191411018371582, + "step": 19386 + }, + { + "epoch": 4.85, + "grad_norm": 3.07315731048584, + "learning_rate": 2.2075674390611977e-08, + "logits/chosen": -0.5742961168289185, + "logits/rejected": -0.6942843794822693, + "logps/chosen": -62.98924255371094, + "logps/rejected": -90.36263275146484, + "loss": 0.6144, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0113117694854736, + "rewards/margins": 6.8826751708984375, + "rewards/rejected": -3.871363878250122, + "step": 19387 + }, + { + "epoch": 4.85, + "grad_norm": 6.699427127838135, + "learning_rate": 2.2001958586404526e-08, + "logits/chosen": -0.5406838655471802, + "logits/rejected": -0.6325879096984863, + "logps/chosen": -63.70587158203125, + "logps/rejected": -97.18463134765625, + "loss": 0.7262, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1162445545196533, + "rewards/margins": 5.847036838531494, + "rewards/rejected": -2.730792284011841, + "step": 19388 + }, + { + "epoch": 4.85, + "grad_norm": 3.2379212379455566, + "learning_rate": 2.192836579382296e-08, + "logits/chosen": -0.5527635812759399, + "logits/rejected": -0.663707435131073, + "logps/chosen": -58.13713455200195, + "logps/rejected": -132.07899475097656, + "loss": 0.5395, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.248769760131836, + "rewards/margins": 9.214768409729004, + "rewards/rejected": -5.965998649597168, + "step": 19389 + }, + { + "epoch": 4.85, + "grad_norm": 10.273214340209961, + "learning_rate": 2.1854896014686376e-08, + "logits/chosen": -0.47109270095825195, + "logits/rejected": -0.5940871238708496, + "logps/chosen": -59.852195739746094, + "logps/rejected": -118.46456146240234, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1396803855895996, + "rewards/margins": 8.225550651550293, + "rewards/rejected": -5.085869789123535, + "step": 19390 + }, + { + "epoch": 4.85, + "grad_norm": 8.432580947875977, + "learning_rate": 2.1781549250810553e-08, + "logits/chosen": -0.5060621500015259, + "logits/rejected": -0.571147620677948, + "logps/chosen": -49.78767013549805, + "logps/rejected": -103.1865463256836, + "loss": 0.6403, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1767001152038574, + "rewards/margins": 6.677490711212158, + "rewards/rejected": -3.500790596008301, + "step": 19391 + }, + { + "epoch": 4.85, + "grad_norm": 4.117440223693848, + "learning_rate": 2.1708325504007367e-08, + "logits/chosen": -0.490693062543869, + "logits/rejected": -0.5458006858825684, + "logps/chosen": -57.336151123046875, + "logps/rejected": -115.85404968261719, + "loss": 0.6189, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.916949987411499, + "rewards/margins": 6.968859672546387, + "rewards/rejected": -4.051909923553467, + "step": 19392 + }, + { + "epoch": 4.85, + "grad_norm": 2.855778455734253, + "learning_rate": 2.163522477608704e-08, + "logits/chosen": -0.617272675037384, + "logits/rejected": -0.669431746006012, + "logps/chosen": -48.41958999633789, + "logps/rejected": -125.69268798828125, + "loss": 0.5666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.293489456176758, + "rewards/margins": 8.787572860717773, + "rewards/rejected": -5.494083404541016, + "step": 19393 + }, + { + "epoch": 4.85, + "grad_norm": 5.142130374908447, + "learning_rate": 2.1562247068855345e-08, + "logits/chosen": -0.5358588099479675, + "logits/rejected": -0.611024796962738, + "logps/chosen": -63.60639572143555, + "logps/rejected": -112.24683380126953, + "loss": 0.7132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.206367254257202, + "rewards/margins": 7.410314559936523, + "rewards/rejected": -4.203948020935059, + "step": 19394 + }, + { + "epoch": 4.85, + "grad_norm": 2.6609764099121094, + "learning_rate": 2.1489392384115847e-08, + "logits/chosen": -0.5246161818504333, + "logits/rejected": -0.5983847379684448, + "logps/chosen": -43.63673782348633, + "logps/rejected": -121.8900375366211, + "loss": 0.5379, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3240675926208496, + "rewards/margins": 9.395669937133789, + "rewards/rejected": -6.071601867675781, + "step": 19395 + }, + { + "epoch": 4.85, + "grad_norm": 5.321959018707275, + "learning_rate": 2.141666072366877e-08, + "logits/chosen": -0.4553281366825104, + "logits/rejected": -0.5143249034881592, + "logps/chosen": -59.482749938964844, + "logps/rejected": -102.40938568115234, + "loss": 0.6438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.363252878189087, + "rewards/margins": 6.63975191116333, + "rewards/rejected": -3.2764999866485596, + "step": 19396 + }, + { + "epoch": 4.85, + "grad_norm": 4.667019367218018, + "learning_rate": 2.1344052089311563e-08, + "logits/chosen": -0.6102269291877747, + "logits/rejected": -0.6804342865943909, + "logps/chosen": -51.62055206298828, + "logps/rejected": -98.08221435546875, + "loss": 0.6436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.974910259246826, + "rewards/margins": 7.319830417633057, + "rewards/rejected": -4.344919681549072, + "step": 19397 + }, + { + "epoch": 4.85, + "grad_norm": 6.851909637451172, + "learning_rate": 2.1271566482838347e-08, + "logits/chosen": -0.5228157639503479, + "logits/rejected": -0.5838437080383301, + "logps/chosen": -49.6843147277832, + "logps/rejected": -89.38845825195312, + "loss": 0.6091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.992739200592041, + "rewards/margins": 6.209065914154053, + "rewards/rejected": -3.21632719039917, + "step": 19398 + }, + { + "epoch": 4.85, + "grad_norm": 3.8876116275787354, + "learning_rate": 2.1199203906040467e-08, + "logits/chosen": -0.49921542406082153, + "logits/rejected": -0.5385715365409851, + "logps/chosen": -56.15557098388672, + "logps/rejected": -102.52618408203125, + "loss": 0.6999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.836658000946045, + "rewards/margins": 6.736997604370117, + "rewards/rejected": -3.9003396034240723, + "step": 19399 + }, + { + "epoch": 4.85, + "grad_norm": 6.3524980545043945, + "learning_rate": 2.1126964360705936e-08, + "logits/chosen": -0.5686761736869812, + "logits/rejected": -0.6264147758483887, + "logps/chosen": -50.41919708251953, + "logps/rejected": -102.12117767333984, + "loss": 0.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9276039600372314, + "rewards/margins": 6.40721321105957, + "rewards/rejected": -3.4796090126037598, + "step": 19400 + }, + { + "epoch": 4.85, + "grad_norm": 17.727930068969727, + "learning_rate": 2.1054847848619443e-08, + "logits/chosen": -0.5284236073493958, + "logits/rejected": -0.5609352588653564, + "logps/chosen": -51.298458099365234, + "logps/rejected": -119.37653350830078, + "loss": 0.648, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1322011947631836, + "rewards/margins": 6.825894355773926, + "rewards/rejected": -3.693693161010742, + "step": 19401 + }, + { + "epoch": 4.85, + "grad_norm": 6.519336223602295, + "learning_rate": 2.0982854371564e-08, + "logits/chosen": -0.6019977331161499, + "logits/rejected": -0.6804615259170532, + "logps/chosen": -58.69340515136719, + "logps/rejected": -87.78168487548828, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0794482231140137, + "rewards/margins": 6.202869892120361, + "rewards/rejected": -3.1234211921691895, + "step": 19402 + }, + { + "epoch": 4.85, + "grad_norm": 5.913519382476807, + "learning_rate": 2.091098393131763e-08, + "logits/chosen": -0.6160502433776855, + "logits/rejected": -0.6596137881278992, + "logps/chosen": -52.96262741088867, + "logps/rejected": -95.09298706054688, + "loss": 0.6577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9870481491088867, + "rewards/margins": 6.183422088623047, + "rewards/rejected": -3.1963741779327393, + "step": 19403 + }, + { + "epoch": 4.85, + "grad_norm": 7.288574695587158, + "learning_rate": 2.0839236529657247e-08, + "logits/chosen": -0.6043915748596191, + "logits/rejected": -0.7119977474212646, + "logps/chosen": -49.882022857666016, + "logps/rejected": -95.99710083007812, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1545777320861816, + "rewards/margins": 7.251990795135498, + "rewards/rejected": -4.097413063049316, + "step": 19404 + }, + { + "epoch": 4.85, + "grad_norm": 4.163478374481201, + "learning_rate": 2.0767612168355322e-08, + "logits/chosen": -0.500510573387146, + "logits/rejected": -0.5854341983795166, + "logps/chosen": -47.18357467651367, + "logps/rejected": -101.04151916503906, + "loss": 0.5869, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9203245639801025, + "rewards/margins": 7.004488945007324, + "rewards/rejected": -4.084164619445801, + "step": 19405 + }, + { + "epoch": 4.85, + "grad_norm": 6.963129043579102, + "learning_rate": 2.0696110849182106e-08, + "logits/chosen": -0.5546616911888123, + "logits/rejected": -0.6201730370521545, + "logps/chosen": -60.82080841064453, + "logps/rejected": -129.01235961914062, + "loss": 0.7259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2912349700927734, + "rewards/margins": 7.8328094482421875, + "rewards/rejected": -4.541574478149414, + "step": 19406 + }, + { + "epoch": 4.85, + "grad_norm": 5.531175136566162, + "learning_rate": 2.062473257390396e-08, + "logits/chosen": -0.5211032629013062, + "logits/rejected": -0.582998514175415, + "logps/chosen": -50.85355758666992, + "logps/rejected": -94.88256072998047, + "loss": 0.631, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.417922019958496, + "rewards/margins": 6.9044389724731445, + "rewards/rejected": -3.4865174293518066, + "step": 19407 + }, + { + "epoch": 4.86, + "grad_norm": 3.034341812133789, + "learning_rate": 2.0553477344285034e-08, + "logits/chosen": -0.5745410919189453, + "logits/rejected": -0.6738278865814209, + "logps/chosen": -59.83808898925781, + "logps/rejected": -111.89595794677734, + "loss": 0.5429, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.165614366531372, + "rewards/margins": 7.494952201843262, + "rewards/rejected": -4.329338073730469, + "step": 19408 + }, + { + "epoch": 4.86, + "grad_norm": 12.80421257019043, + "learning_rate": 2.0482345162086135e-08, + "logits/chosen": -0.5567424297332764, + "logits/rejected": -0.6492558121681213, + "logps/chosen": -61.10120391845703, + "logps/rejected": -103.0648193359375, + "loss": 0.5976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.623957872390747, + "rewards/margins": 7.232387542724609, + "rewards/rejected": -4.608429431915283, + "step": 19409 + }, + { + "epoch": 4.86, + "grad_norm": 9.74681282043457, + "learning_rate": 2.0411336029064756e-08, + "logits/chosen": -0.5397964119911194, + "logits/rejected": -0.6330057382583618, + "logps/chosen": -64.60797882080078, + "logps/rejected": -100.53089904785156, + "loss": 0.6637, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2444958686828613, + "rewards/margins": 6.688737392425537, + "rewards/rejected": -3.444241762161255, + "step": 19410 + }, + { + "epoch": 4.86, + "grad_norm": 6.426095962524414, + "learning_rate": 2.0340449946975592e-08, + "logits/chosen": -0.5847670435905457, + "logits/rejected": -0.6233234405517578, + "logps/chosen": -58.94538497924805, + "logps/rejected": -122.71326446533203, + "loss": 0.6581, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0356059074401855, + "rewards/margins": 6.764925956726074, + "rewards/rejected": -3.7293195724487305, + "step": 19411 + }, + { + "epoch": 4.86, + "grad_norm": 4.594870567321777, + "learning_rate": 2.026968691757114e-08, + "logits/chosen": -0.6171791553497314, + "logits/rejected": -0.6644561290740967, + "logps/chosen": -48.21665954589844, + "logps/rejected": -106.10843658447266, + "loss": 0.5972, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.229396343231201, + "rewards/margins": 6.816275119781494, + "rewards/rejected": -3.586879253387451, + "step": 19412 + }, + { + "epoch": 4.86, + "grad_norm": 4.05106782913208, + "learning_rate": 2.0199046942598886e-08, + "logits/chosen": -0.5731890201568604, + "logits/rejected": -0.6373298764228821, + "logps/chosen": -51.04835891723633, + "logps/rejected": -104.45751190185547, + "loss": 0.6404, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.960355281829834, + "rewards/margins": 6.983041286468506, + "rewards/rejected": -4.022686004638672, + "step": 19413 + }, + { + "epoch": 4.86, + "grad_norm": 3.361964464187622, + "learning_rate": 2.012853002380466e-08, + "logits/chosen": -0.49088141322135925, + "logits/rejected": -0.5838210582733154, + "logps/chosen": -54.7629280090332, + "logps/rejected": -111.89314270019531, + "loss": 0.5621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.272512197494507, + "rewards/margins": 7.15898323059082, + "rewards/rejected": -3.8864712715148926, + "step": 19414 + }, + { + "epoch": 4.86, + "grad_norm": 12.302684783935547, + "learning_rate": 2.0058136162932062e-08, + "logits/chosen": -0.5642281770706177, + "logits/rejected": -0.6000387668609619, + "logps/chosen": -59.00621032714844, + "logps/rejected": -108.13665008544922, + "loss": 0.7297, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8885955810546875, + "rewards/margins": 5.749150276184082, + "rewards/rejected": -2.8605551719665527, + "step": 19415 + }, + { + "epoch": 4.86, + "grad_norm": 10.800060272216797, + "learning_rate": 1.998786536171915e-08, + "logits/chosen": -0.539216935634613, + "logits/rejected": -0.6318483948707581, + "logps/chosen": -56.5317497253418, + "logps/rejected": -94.16366577148438, + "loss": 0.6438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6660284996032715, + "rewards/margins": 6.314518451690674, + "rewards/rejected": -3.6484899520874023, + "step": 19416 + }, + { + "epoch": 4.86, + "grad_norm": 11.599950790405273, + "learning_rate": 1.991771762190342e-08, + "logits/chosen": -0.5743715167045593, + "logits/rejected": -0.6366958618164062, + "logps/chosen": -67.94937133789062, + "logps/rejected": -105.0854721069336, + "loss": 0.766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.875263214111328, + "rewards/margins": 6.3915886878967285, + "rewards/rejected": -3.5163259506225586, + "step": 19417 + }, + { + "epoch": 4.86, + "grad_norm": 4.454156875610352, + "learning_rate": 1.9847692945217934e-08, + "logits/chosen": -0.519964873790741, + "logits/rejected": -0.6005333065986633, + "logps/chosen": -83.25129699707031, + "logps/rejected": -119.07710266113281, + "loss": 0.701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9165077209472656, + "rewards/margins": 7.520633220672607, + "rewards/rejected": -4.604125022888184, + "step": 19418 + }, + { + "epoch": 4.86, + "grad_norm": 3.805567741394043, + "learning_rate": 1.9777791333392972e-08, + "logits/chosen": -0.46573925018310547, + "logits/rejected": -0.5270190238952637, + "logps/chosen": -57.356597900390625, + "logps/rejected": -120.01356506347656, + "loss": 0.556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.132354259490967, + "rewards/margins": 7.128767490386963, + "rewards/rejected": -3.9964137077331543, + "step": 19419 + }, + { + "epoch": 4.86, + "grad_norm": 4.663386344909668, + "learning_rate": 1.970801278815604e-08, + "logits/chosen": -0.5875098705291748, + "logits/rejected": -0.6496409773826599, + "logps/chosen": -49.23370361328125, + "logps/rejected": -112.80921936035156, + "loss": 0.5859, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.237989664077759, + "rewards/margins": 7.7407612800598145, + "rewards/rejected": -4.502771854400635, + "step": 19420 + }, + { + "epoch": 4.86, + "grad_norm": 4.058071613311768, + "learning_rate": 1.9638357311231316e-08, + "logits/chosen": -0.5780470371246338, + "logits/rejected": -0.6879063248634338, + "logps/chosen": -43.33492660522461, + "logps/rejected": -113.2503890991211, + "loss": 0.5749, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.107177257537842, + "rewards/margins": 8.421730041503906, + "rewards/rejected": -5.314552307128906, + "step": 19421 + }, + { + "epoch": 4.86, + "grad_norm": 3.48600697517395, + "learning_rate": 1.95688249043402e-08, + "logits/chosen": -0.4487902522087097, + "logits/rejected": -0.5394001007080078, + "logps/chosen": -53.49918746948242, + "logps/rejected": -112.26093292236328, + "loss": 0.5721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.093752384185791, + "rewards/margins": 7.169015407562256, + "rewards/rejected": -4.075263023376465, + "step": 19422 + }, + { + "epoch": 4.86, + "grad_norm": 3.0668084621429443, + "learning_rate": 1.9499415569200765e-08, + "logits/chosen": -0.5983343720436096, + "logits/rejected": -0.6696638464927673, + "logps/chosen": -52.15431213378906, + "logps/rejected": -126.54056549072266, + "loss": 0.4912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1778016090393066, + "rewards/margins": 9.22675895690918, + "rewards/rejected": -6.048957824707031, + "step": 19423 + }, + { + "epoch": 4.86, + "grad_norm": 2.3906123638153076, + "learning_rate": 1.9430129307528855e-08, + "logits/chosen": -0.6256821155548096, + "logits/rejected": -0.7118603587150574, + "logps/chosen": -50.07232666015625, + "logps/rejected": -110.53965759277344, + "loss": 0.5537, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4093213081359863, + "rewards/margins": 9.193340301513672, + "rewards/rejected": -5.784019947052002, + "step": 19424 + }, + { + "epoch": 4.86, + "grad_norm": 2.973418951034546, + "learning_rate": 1.936096612103533e-08, + "logits/chosen": -0.5539261102676392, + "logits/rejected": -0.6017540693283081, + "logps/chosen": -51.65049743652344, + "logps/rejected": -107.39257049560547, + "loss": 0.5886, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3753068447113037, + "rewards/margins": 5.974174499511719, + "rewards/rejected": -2.598867177963257, + "step": 19425 + }, + { + "epoch": 4.86, + "grad_norm": 7.362039089202881, + "learning_rate": 1.929192601142993e-08, + "logits/chosen": -0.620233952999115, + "logits/rejected": -0.6973345875740051, + "logps/chosen": -48.004554748535156, + "logps/rejected": -103.17678833007812, + "loss": 0.5364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.282015562057495, + "rewards/margins": 7.288787364959717, + "rewards/rejected": -4.006772041320801, + "step": 19426 + }, + { + "epoch": 4.86, + "grad_norm": 3.32179594039917, + "learning_rate": 1.9223008980419066e-08, + "logits/chosen": -0.5922896862030029, + "logits/rejected": -0.6673747301101685, + "logps/chosen": -54.15245819091797, + "logps/rejected": -102.49275970458984, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161986827850342, + "rewards/margins": 5.797013282775879, + "rewards/rejected": -2.635026693344116, + "step": 19427 + }, + { + "epoch": 4.86, + "grad_norm": 5.877150058746338, + "learning_rate": 1.9154215029705826e-08, + "logits/chosen": -0.5307912826538086, + "logits/rejected": -0.5865831971168518, + "logps/chosen": -59.61955261230469, + "logps/rejected": -97.02374267578125, + "loss": 0.6958, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2200000286102295, + "rewards/margins": 5.737341403961182, + "rewards/rejected": -2.5173418521881104, + "step": 19428 + }, + { + "epoch": 4.86, + "grad_norm": 5.431682109832764, + "learning_rate": 1.9085544160988845e-08, + "logits/chosen": -0.5360652208328247, + "logits/rejected": -0.6006723642349243, + "logps/chosen": -47.89653778076172, + "logps/rejected": -103.65450286865234, + "loss": 0.5826, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5377697944641113, + "rewards/margins": 7.0217671394348145, + "rewards/rejected": -3.483997344970703, + "step": 19429 + }, + { + "epoch": 4.86, + "grad_norm": 5.124682903289795, + "learning_rate": 1.9016996375966767e-08, + "logits/chosen": -0.5691428780555725, + "logits/rejected": -0.6419755220413208, + "logps/chosen": -54.78490447998047, + "logps/rejected": -112.1361083984375, + "loss": 0.5389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.173205614089966, + "rewards/margins": 8.424495697021484, + "rewards/rejected": -5.251289367675781, + "step": 19430 + }, + { + "epoch": 4.86, + "grad_norm": 4.764991283416748, + "learning_rate": 1.8948571676332682e-08, + "logits/chosen": -0.529574990272522, + "logits/rejected": -0.5911519527435303, + "logps/chosen": -48.261131286621094, + "logps/rejected": -101.87127685546875, + "loss": 0.6159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0610191822052, + "rewards/margins": 7.450042724609375, + "rewards/rejected": -4.389023303985596, + "step": 19431 + }, + { + "epoch": 4.86, + "grad_norm": 6.2577924728393555, + "learning_rate": 1.8880270063776907e-08, + "logits/chosen": -0.5019997358322144, + "logits/rejected": -0.5422586798667908, + "logps/chosen": -58.97672653198242, + "logps/rejected": -131.6651611328125, + "loss": 0.5767, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9505326747894287, + "rewards/margins": 6.961629867553711, + "rewards/rejected": -4.011097431182861, + "step": 19432 + }, + { + "epoch": 4.86, + "grad_norm": 4.782809257507324, + "learning_rate": 1.8812091539988086e-08, + "logits/chosen": -0.49329546093940735, + "logits/rejected": -0.5660801529884338, + "logps/chosen": -54.201072692871094, + "logps/rejected": -143.8572235107422, + "loss": 0.5822, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.197575092315674, + "rewards/margins": 8.778098106384277, + "rewards/rejected": -5.580522060394287, + "step": 19433 + }, + { + "epoch": 4.86, + "grad_norm": 4.3066325187683105, + "learning_rate": 1.8744036106650988e-08, + "logits/chosen": -0.5531157851219177, + "logits/rejected": -0.5988747477531433, + "logps/chosen": -55.071136474609375, + "logps/rejected": -112.7437515258789, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.200148344039917, + "rewards/margins": 7.735453128814697, + "rewards/rejected": -4.535305500030518, + "step": 19434 + }, + { + "epoch": 4.86, + "grad_norm": 2.932718515396118, + "learning_rate": 1.8676103765446483e-08, + "logits/chosen": -0.5717031955718994, + "logits/rejected": -0.6517828106880188, + "logps/chosen": -50.7669677734375, + "logps/rejected": -96.74737548828125, + "loss": 0.609, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1035079956054688, + "rewards/margins": 6.955634117126465, + "rewards/rejected": -3.8521265983581543, + "step": 19435 + }, + { + "epoch": 4.86, + "grad_norm": 5.8571553230285645, + "learning_rate": 1.8608294518054348e-08, + "logits/chosen": -0.6043853759765625, + "logits/rejected": -0.693358302116394, + "logps/chosen": -54.13337707519531, + "logps/rejected": -93.92544555664062, + "loss": 0.6665, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9160163402557373, + "rewards/margins": 6.467214107513428, + "rewards/rejected": -3.5511980056762695, + "step": 19436 + }, + { + "epoch": 4.86, + "grad_norm": 3.3159193992614746, + "learning_rate": 1.854060836614935e-08, + "logits/chosen": -0.6604561805725098, + "logits/rejected": -0.7546412944793701, + "logps/chosen": -45.66783905029297, + "logps/rejected": -101.81058502197266, + "loss": 0.5516, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.227668285369873, + "rewards/margins": 7.173556327819824, + "rewards/rejected": -3.945888042449951, + "step": 19437 + }, + { + "epoch": 4.86, + "grad_norm": 2.899639368057251, + "learning_rate": 1.8473045311405145e-08, + "logits/chosen": -0.5056455135345459, + "logits/rejected": -0.5618674755096436, + "logps/chosen": -57.65007019042969, + "logps/rejected": -116.45114135742188, + "loss": 0.5749, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3927993774414062, + "rewards/margins": 7.592906951904297, + "rewards/rejected": -4.200107097625732, + "step": 19438 + }, + { + "epoch": 4.86, + "grad_norm": 3.260406017303467, + "learning_rate": 1.8405605355489854e-08, + "logits/chosen": -0.510924220085144, + "logits/rejected": -0.6164771914482117, + "logps/chosen": -55.60292434692383, + "logps/rejected": -107.34720611572266, + "loss": 0.6048, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4409255981445312, + "rewards/margins": 8.069293975830078, + "rewards/rejected": -4.6283674240112305, + "step": 19439 + }, + { + "epoch": 4.86, + "grad_norm": 16.948246002197266, + "learning_rate": 1.833828850007102e-08, + "logits/chosen": -0.5102086067199707, + "logits/rejected": -0.5776251554489136, + "logps/chosen": -53.77556228637695, + "logps/rejected": -113.46866607666016, + "loss": 0.7042, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8297433853149414, + "rewards/margins": 7.787128925323486, + "rewards/rejected": -4.957386016845703, + "step": 19440 + }, + { + "epoch": 4.86, + "grad_norm": 2.692978620529175, + "learning_rate": 1.8271094746811214e-08, + "logits/chosen": -0.5877792835235596, + "logits/rejected": -0.6597999930381775, + "logps/chosen": -53.40473175048828, + "logps/rejected": -106.55224609375, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148106098175049, + "rewards/margins": 7.999277591705322, + "rewards/rejected": -4.851171493530273, + "step": 19441 + }, + { + "epoch": 4.86, + "grad_norm": 4.492123603820801, + "learning_rate": 1.820402409737243e-08, + "logits/chosen": -0.5040578246116638, + "logits/rejected": -0.5816355347633362, + "logps/chosen": -61.01434326171875, + "logps/rejected": -95.77510833740234, + "loss": 0.6221, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.020737409591675, + "rewards/margins": 6.185051441192627, + "rewards/rejected": -3.164314031600952, + "step": 19442 + }, + { + "epoch": 4.86, + "grad_norm": 5.040745258331299, + "learning_rate": 1.813707655341057e-08, + "logits/chosen": -0.512657880783081, + "logits/rejected": -0.6059275269508362, + "logps/chosen": -64.78508758544922, + "logps/rejected": -94.90277099609375, + "loss": 0.6558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.076442003250122, + "rewards/margins": 6.024865627288818, + "rewards/rejected": -2.9484236240386963, + "step": 19443 + }, + { + "epoch": 4.86, + "grad_norm": 3.8986928462982178, + "learning_rate": 1.8070252116580423e-08, + "logits/chosen": -0.5713480114936829, + "logits/rejected": -0.65346759557724, + "logps/chosen": -57.0737419128418, + "logps/rejected": -124.3244857788086, + "loss": 0.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8205790519714355, + "rewards/margins": 7.155369281768799, + "rewards/rejected": -4.334790229797363, + "step": 19444 + }, + { + "epoch": 4.86, + "grad_norm": 3.3837826251983643, + "learning_rate": 1.8003550788533443e-08, + "logits/chosen": -0.5564014315605164, + "logits/rejected": -0.6564885973930359, + "logps/chosen": -63.055625915527344, + "logps/rejected": -110.24261474609375, + "loss": 0.5991, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9002885818481445, + "rewards/margins": 6.574899673461914, + "rewards/rejected": -3.6746113300323486, + "step": 19445 + }, + { + "epoch": 4.86, + "grad_norm": 4.056035041809082, + "learning_rate": 1.7936972570917754e-08, + "logits/chosen": -0.5387421250343323, + "logits/rejected": -0.5844629406929016, + "logps/chosen": -74.33277893066406, + "logps/rejected": -113.3678207397461, + "loss": 0.7242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.733163595199585, + "rewards/margins": 7.0815534591674805, + "rewards/rejected": -4.348390102386475, + "step": 19446 + }, + { + "epoch": 4.86, + "grad_norm": 6.314079284667969, + "learning_rate": 1.787051746537871e-08, + "logits/chosen": -0.5691156983375549, + "logits/rejected": -0.61156165599823, + "logps/chosen": -49.292213439941406, + "logps/rejected": -126.49700164794922, + "loss": 0.5975, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.807666063308716, + "rewards/margins": 8.674656867980957, + "rewards/rejected": -5.86699104309082, + "step": 19447 + }, + { + "epoch": 4.87, + "grad_norm": 3.6395390033721924, + "learning_rate": 1.7804185473558334e-08, + "logits/chosen": -0.5933093428611755, + "logits/rejected": -0.690021812915802, + "logps/chosen": -44.547142028808594, + "logps/rejected": -95.88597106933594, + "loss": 0.5472, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.02543306350708, + "rewards/margins": 7.054970741271973, + "rewards/rejected": -4.029538154602051, + "step": 19448 + }, + { + "epoch": 4.87, + "grad_norm": 3.8016958236694336, + "learning_rate": 1.7737976597095864e-08, + "logits/chosen": -0.5509384870529175, + "logits/rejected": -0.6427185535430908, + "logps/chosen": -66.31269836425781, + "logps/rejected": -108.44733428955078, + "loss": 0.6, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2451915740966797, + "rewards/margins": 7.12663459777832, + "rewards/rejected": -3.8814425468444824, + "step": 19449 + }, + { + "epoch": 4.87, + "grad_norm": 5.77664852142334, + "learning_rate": 1.767189083762777e-08, + "logits/chosen": -0.5774219036102295, + "logits/rejected": -0.6713197827339172, + "logps/chosen": -53.13247299194336, + "logps/rejected": -90.80487060546875, + "loss": 0.5921, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.055011749267578, + "rewards/margins": 6.231261730194092, + "rewards/rejected": -3.1762499809265137, + "step": 19450 + }, + { + "epoch": 4.87, + "grad_norm": 4.137205600738525, + "learning_rate": 1.760592819678608e-08, + "logits/chosen": -0.545600414276123, + "logits/rejected": -0.5998204946517944, + "logps/chosen": -44.53243637084961, + "logps/rejected": -98.52215576171875, + "loss": 0.5842, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.976489782333374, + "rewards/margins": 7.007695198059082, + "rewards/rejected": -4.031204700469971, + "step": 19451 + }, + { + "epoch": 4.87, + "grad_norm": 10.006377220153809, + "learning_rate": 1.7540088676201716e-08, + "logits/chosen": -0.6175991892814636, + "logits/rejected": -0.6632614135742188, + "logps/chosen": -40.89936065673828, + "logps/rejected": -106.84300994873047, + "loss": 0.5569, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.995903491973877, + "rewards/margins": 7.486870288848877, + "rewards/rejected": -4.490966320037842, + "step": 19452 + }, + { + "epoch": 4.87, + "grad_norm": 5.034226417541504, + "learning_rate": 1.747437227750115e-08, + "logits/chosen": -0.49858057498931885, + "logits/rejected": -0.5857285261154175, + "logps/chosen": -54.493492126464844, + "logps/rejected": -120.47857666015625, + "loss": 0.6023, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7694544792175293, + "rewards/margins": 8.29550552368164, + "rewards/rejected": -5.526050567626953, + "step": 19453 + }, + { + "epoch": 4.87, + "grad_norm": 4.931323051452637, + "learning_rate": 1.740877900230864e-08, + "logits/chosen": -0.5822884440422058, + "logits/rejected": -0.6371639370918274, + "logps/chosen": -51.606300354003906, + "logps/rejected": -103.64945983886719, + "loss": 0.6704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0618395805358887, + "rewards/margins": 6.517534255981445, + "rewards/rejected": -3.4556949138641357, + "step": 19454 + }, + { + "epoch": 4.87, + "grad_norm": 2.219510793685913, + "learning_rate": 1.7343308852245112e-08, + "logits/chosen": -0.5805896520614624, + "logits/rejected": -0.6570548415184021, + "logps/chosen": -48.626220703125, + "logps/rejected": -96.42247772216797, + "loss": 0.538, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148419141769409, + "rewards/margins": 7.186563491821289, + "rewards/rejected": -4.038144111633301, + "step": 19455 + }, + { + "epoch": 4.87, + "grad_norm": 4.729532241821289, + "learning_rate": 1.727796182892816e-08, + "logits/chosen": -0.586361825466156, + "logits/rejected": -0.654182493686676, + "logps/chosen": -61.11695098876953, + "logps/rejected": -107.93023681640625, + "loss": 0.6398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2545650005340576, + "rewards/margins": 8.092497825622559, + "rewards/rejected": -4.837932586669922, + "step": 19456 + }, + { + "epoch": 4.87, + "grad_norm": 3.9351203441619873, + "learning_rate": 1.7212737933972047e-08, + "logits/chosen": -0.525276780128479, + "logits/rejected": -0.6329987645149231, + "logps/chosen": -48.85924530029297, + "logps/rejected": -105.69851684570312, + "loss": 0.5451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.793055534362793, + "rewards/margins": 8.05931568145752, + "rewards/rejected": -5.266260147094727, + "step": 19457 + }, + { + "epoch": 4.87, + "grad_norm": 4.026162147521973, + "learning_rate": 1.7147637168989372e-08, + "logits/chosen": -0.536930501461029, + "logits/rejected": -0.6052885055541992, + "logps/chosen": -56.65009307861328, + "logps/rejected": -115.62762451171875, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.043227434158325, + "rewards/margins": 7.677050590515137, + "rewards/rejected": -4.633822441101074, + "step": 19458 + }, + { + "epoch": 4.87, + "grad_norm": 4.1880035400390625, + "learning_rate": 1.7082659535589407e-08, + "logits/chosen": -0.6209472417831421, + "logits/rejected": -0.7281191349029541, + "logps/chosen": -53.26243591308594, + "logps/rejected": -94.26124572753906, + "loss": 0.6537, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4199109077453613, + "rewards/margins": 8.157061576843262, + "rewards/rejected": -4.737149715423584, + "step": 19459 + }, + { + "epoch": 4.87, + "grad_norm": 10.157846450805664, + "learning_rate": 1.7017805035375866e-08, + "logits/chosen": -0.5687263607978821, + "logits/rejected": -0.639487624168396, + "logps/chosen": -58.770668029785156, + "logps/rejected": -116.40108489990234, + "loss": 0.6754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.204998254776001, + "rewards/margins": 7.609486103057861, + "rewards/rejected": -4.404488563537598, + "step": 19460 + }, + { + "epoch": 4.87, + "grad_norm": 1.9156591892242432, + "learning_rate": 1.6953073669953023e-08, + "logits/chosen": -0.5313485264778137, + "logits/rejected": -0.6236690282821655, + "logps/chosen": -48.45368957519531, + "logps/rejected": -102.32556915283203, + "loss": 0.5422, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.89416241645813, + "rewards/margins": 7.79656982421875, + "rewards/rejected": -4.902408123016357, + "step": 19461 + }, + { + "epoch": 4.87, + "grad_norm": 7.608304977416992, + "learning_rate": 1.6888465440920153e-08, + "logits/chosen": -0.5640537738800049, + "logits/rejected": -0.6463167071342468, + "logps/chosen": -45.96308517456055, + "logps/rejected": -103.60417938232422, + "loss": 0.614, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.4375267028808594, + "rewards/margins": 7.2121429443359375, + "rewards/rejected": -3.774616003036499, + "step": 19462 + }, + { + "epoch": 4.87, + "grad_norm": 5.67771577835083, + "learning_rate": 1.6823980349873205e-08, + "logits/chosen": -0.5483340620994568, + "logits/rejected": -0.6118074059486389, + "logps/chosen": -46.90362548828125, + "logps/rejected": -100.55880737304688, + "loss": 0.5462, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0953378677368164, + "rewards/margins": 6.942903518676758, + "rewards/rejected": -3.8475656509399414, + "step": 19463 + }, + { + "epoch": 4.87, + "grad_norm": 4.956965923309326, + "learning_rate": 1.6759618398405897e-08, + "logits/chosen": -0.5494381189346313, + "logits/rejected": -0.6425657272338867, + "logps/chosen": -63.673885345458984, + "logps/rejected": -84.89073944091797, + "loss": 0.6787, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2622227668762207, + "rewards/margins": 6.688665390014648, + "rewards/rejected": -3.426442861557007, + "step": 19464 + }, + { + "epoch": 4.87, + "grad_norm": 15.632063865661621, + "learning_rate": 1.6695379588109183e-08, + "logits/chosen": -0.5858349204063416, + "logits/rejected": -0.6450187563896179, + "logps/chosen": -59.633705139160156, + "logps/rejected": -111.17046356201172, + "loss": 0.6972, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0518014430999756, + "rewards/margins": 6.719029426574707, + "rewards/rejected": -3.6672277450561523, + "step": 19465 + }, + { + "epoch": 4.87, + "grad_norm": 3.7808969020843506, + "learning_rate": 1.6631263920570127e-08, + "logits/chosen": -0.662396252155304, + "logits/rejected": -0.6412800550460815, + "logps/chosen": -72.9890365600586, + "logps/rejected": -106.51370239257812, + "loss": 0.5863, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5072743892669678, + "rewards/margins": 7.792131423950195, + "rewards/rejected": -4.284857273101807, + "step": 19466 + }, + { + "epoch": 4.87, + "grad_norm": 1.9007136821746826, + "learning_rate": 1.656727139737302e-08, + "logits/chosen": -0.5860755443572998, + "logits/rejected": -0.6610147953033447, + "logps/chosen": -50.94325256347656, + "logps/rejected": -128.47805786132812, + "loss": 0.5234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.392515182495117, + "rewards/margins": 8.582700729370117, + "rewards/rejected": -5.190185546875, + "step": 19467 + }, + { + "epoch": 4.87, + "grad_norm": 2.54233455657959, + "learning_rate": 1.650340202009937e-08, + "logits/chosen": -0.5732370018959045, + "logits/rejected": -0.6494004130363464, + "logps/chosen": -47.21233367919922, + "logps/rejected": -125.18525695800781, + "loss": 0.5559, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.737118721008301, + "rewards/margins": 7.751521110534668, + "rewards/rejected": -5.014403343200684, + "step": 19468 + }, + { + "epoch": 4.87, + "grad_norm": 10.353155136108398, + "learning_rate": 1.6439655790326804e-08, + "logits/chosen": -0.6016954779624939, + "logits/rejected": -0.6757136583328247, + "logps/chosen": -65.98358154296875, + "logps/rejected": -126.67767333984375, + "loss": 0.672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4241409301757812, + "rewards/margins": 6.925196647644043, + "rewards/rejected": -4.501055717468262, + "step": 19469 + }, + { + "epoch": 4.87, + "grad_norm": 3.7086734771728516, + "learning_rate": 1.637603270963184e-08, + "logits/chosen": -0.5469873547554016, + "logits/rejected": -0.6392427682876587, + "logps/chosen": -44.4056510925293, + "logps/rejected": -81.95431518554688, + "loss": 0.6299, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2062253952026367, + "rewards/margins": 6.908402442932129, + "rewards/rejected": -3.702176809310913, + "step": 19470 + }, + { + "epoch": 4.87, + "grad_norm": 7.443782329559326, + "learning_rate": 1.6312532779585442e-08, + "logits/chosen": -0.5055573582649231, + "logits/rejected": -0.6416590809822083, + "logps/chosen": -50.385902404785156, + "logps/rejected": -107.62963104248047, + "loss": 0.6026, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9885427951812744, + "rewards/margins": 8.135391235351562, + "rewards/rejected": -5.146847724914551, + "step": 19471 + }, + { + "epoch": 4.87, + "grad_norm": 6.481259346008301, + "learning_rate": 1.624915600175747e-08, + "logits/chosen": -0.5721510648727417, + "logits/rejected": -0.6199268698692322, + "logps/chosen": -52.72747802734375, + "logps/rejected": -112.6933364868164, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2124557495117188, + "rewards/margins": 6.6645379066467285, + "rewards/rejected": -3.4520816802978516, + "step": 19472 + }, + { + "epoch": 4.87, + "grad_norm": 3.832878351211548, + "learning_rate": 1.6185902377713338e-08, + "logits/chosen": -0.566472053527832, + "logits/rejected": -0.6208983063697815, + "logps/chosen": -47.50029754638672, + "logps/rejected": -94.845703125, + "loss": 0.5713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.424696445465088, + "rewards/margins": 6.628489971160889, + "rewards/rejected": -3.2037930488586426, + "step": 19473 + }, + { + "epoch": 4.87, + "grad_norm": 3.7595787048339844, + "learning_rate": 1.6122771909017344e-08, + "logits/chosen": -0.4687448740005493, + "logits/rejected": -0.5666427612304688, + "logps/chosen": -55.36844253540039, + "logps/rejected": -92.25303649902344, + "loss": 0.6007, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.288088083267212, + "rewards/margins": 6.195712089538574, + "rewards/rejected": -2.9076240062713623, + "step": 19474 + }, + { + "epoch": 4.87, + "grad_norm": 7.932284355163574, + "learning_rate": 1.6059764597228244e-08, + "logits/chosen": -0.5983397960662842, + "logits/rejected": -0.7046207189559937, + "logps/chosen": -53.0173454284668, + "logps/rejected": -93.45658874511719, + "loss": 0.6558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.986396551132202, + "rewards/margins": 6.81804895401001, + "rewards/rejected": -3.8316524028778076, + "step": 19475 + }, + { + "epoch": 4.87, + "grad_norm": 18.081193923950195, + "learning_rate": 1.599688044390313e-08, + "logits/chosen": -0.5509495139122009, + "logits/rejected": -0.638430655002594, + "logps/chosen": -58.18115997314453, + "logps/rejected": -114.07778930664062, + "loss": 0.7051, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.103059768676758, + "rewards/margins": 7.270589351654053, + "rewards/rejected": -4.167530059814453, + "step": 19476 + }, + { + "epoch": 4.87, + "grad_norm": 4.115047931671143, + "learning_rate": 1.5934119450597418e-08, + "logits/chosen": -0.5617784261703491, + "logits/rejected": -0.6066735982894897, + "logps/chosen": -54.43415451049805, + "logps/rejected": -114.65522766113281, + "loss": 0.5908, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2550575733184814, + "rewards/margins": 7.767852306365967, + "rewards/rejected": -4.512794494628906, + "step": 19477 + }, + { + "epoch": 4.87, + "grad_norm": 3.1365299224853516, + "learning_rate": 1.5871481618859875e-08, + "logits/chosen": -0.5322518348693848, + "logits/rejected": -0.6276875734329224, + "logps/chosen": -51.973636627197266, + "logps/rejected": -102.10003662109375, + "loss": 0.5606, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8216466903686523, + "rewards/margins": 7.3797607421875, + "rewards/rejected": -4.558114051818848, + "step": 19478 + }, + { + "epoch": 4.87, + "grad_norm": 6.402618885040283, + "learning_rate": 1.580896695023981e-08, + "logits/chosen": -0.6551114320755005, + "logits/rejected": -0.6276800632476807, + "logps/chosen": -84.25521087646484, + "logps/rejected": -107.83961486816406, + "loss": 0.738, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9003279209136963, + "rewards/margins": 6.5430779457092285, + "rewards/rejected": -3.6427507400512695, + "step": 19479 + }, + { + "epoch": 4.87, + "grad_norm": 3.8167521953582764, + "learning_rate": 1.5746575446281555e-08, + "logits/chosen": -0.5209068655967712, + "logits/rejected": -0.5586961507797241, + "logps/chosen": -60.413700103759766, + "logps/rejected": -125.39717102050781, + "loss": 0.6708, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5059304237365723, + "rewards/margins": 9.037140846252441, + "rewards/rejected": -5.531210899353027, + "step": 19480 + }, + { + "epoch": 4.87, + "grad_norm": 10.79307746887207, + "learning_rate": 1.5684307108527197e-08, + "logits/chosen": -0.6234326958656311, + "logits/rejected": -0.6029531359672546, + "logps/chosen": -47.89141082763672, + "logps/rejected": -122.75059509277344, + "loss": 0.5858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.379476308822632, + "rewards/margins": 7.127582550048828, + "rewards/rejected": -3.7481062412261963, + "step": 19481 + }, + { + "epoch": 4.87, + "grad_norm": 3.116792678833008, + "learning_rate": 1.5622161938514958e-08, + "logits/chosen": -0.5230275392532349, + "logits/rejected": -0.5835172533988953, + "logps/chosen": -58.24760437011719, + "logps/rejected": -104.16739654541016, + "loss": 0.564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.412321090698242, + "rewards/margins": 7.34075403213501, + "rewards/rejected": -3.9284331798553467, + "step": 19482 + }, + { + "epoch": 4.87, + "grad_norm": 5.24901819229126, + "learning_rate": 1.5560139937780272e-08, + "logits/chosen": -0.5574379563331604, + "logits/rejected": -0.6493514180183411, + "logps/chosen": -56.37175750732422, + "logps/rejected": -104.2125244140625, + "loss": 0.6063, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3698174953460693, + "rewards/margins": 7.353992462158203, + "rewards/rejected": -3.984175205230713, + "step": 19483 + }, + { + "epoch": 4.87, + "grad_norm": 10.728907585144043, + "learning_rate": 1.5498241107856915e-08, + "logits/chosen": -0.5577091574668884, + "logits/rejected": -0.6563059091567993, + "logps/chosen": -54.476898193359375, + "logps/rejected": -105.51791381835938, + "loss": 0.7271, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.3610432147979736, + "rewards/margins": 7.283708572387695, + "rewards/rejected": -3.922665596008301, + "step": 19484 + }, + { + "epoch": 4.87, + "grad_norm": 1.5026262998580933, + "learning_rate": 1.543646545027311e-08, + "logits/chosen": -0.5542031526565552, + "logits/rejected": -0.6319878101348877, + "logps/chosen": -52.276859283447266, + "logps/rejected": -102.34615325927734, + "loss": 0.546, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.51064133644104, + "rewards/margins": 8.03775405883789, + "rewards/rejected": -4.5271124839782715, + "step": 19485 + }, + { + "epoch": 4.87, + "grad_norm": 6.327244281768799, + "learning_rate": 1.537481296655652e-08, + "logits/chosen": -0.5967782139778137, + "logits/rejected": -0.6647577285766602, + "logps/chosen": -69.74703216552734, + "logps/rejected": -114.18342590332031, + "loss": 0.7149, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.199382781982422, + "rewards/margins": 6.3369059562683105, + "rewards/rejected": -3.137523651123047, + "step": 19486 + }, + { + "epoch": 4.87, + "grad_norm": 3.2366888523101807, + "learning_rate": 1.5313283658230372e-08, + "logits/chosen": -0.5449845790863037, + "logits/rejected": -0.6377550363540649, + "logps/chosen": -48.25518035888672, + "logps/rejected": -93.0638198852539, + "loss": 0.562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0269775390625, + "rewards/margins": 6.520615577697754, + "rewards/rejected": -3.493638753890991, + "step": 19487 + }, + { + "epoch": 4.88, + "grad_norm": 4.922025680541992, + "learning_rate": 1.5251877526814564e-08, + "logits/chosen": -0.5792540907859802, + "logits/rejected": -0.648594856262207, + "logps/chosen": -47.6854248046875, + "logps/rejected": -121.95726776123047, + "loss": 0.5731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3774466514587402, + "rewards/margins": 7.564333915710449, + "rewards/rejected": -4.186886787414551, + "step": 19488 + }, + { + "epoch": 4.88, + "grad_norm": 4.383566379547119, + "learning_rate": 1.5190594573826768e-08, + "logits/chosen": -0.5378831624984741, + "logits/rejected": -0.6237574815750122, + "logps/chosen": -61.38275146484375, + "logps/rejected": -100.1444091796875, + "loss": 0.6725, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9797472953796387, + "rewards/margins": 5.777698516845703, + "rewards/rejected": -2.7979516983032227, + "step": 19489 + }, + { + "epoch": 4.88, + "grad_norm": 6.277273654937744, + "learning_rate": 1.5129434800781883e-08, + "logits/chosen": -0.5216447114944458, + "logits/rejected": -0.6652795672416687, + "logps/chosen": -50.279518127441406, + "logps/rejected": -101.43311309814453, + "loss": 0.5041, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.253230094909668, + "rewards/margins": 8.587124824523926, + "rewards/rejected": -5.3338942527771, + "step": 19490 + }, + { + "epoch": 4.88, + "grad_norm": 7.194838523864746, + "learning_rate": 1.506839820919037e-08, + "logits/chosen": -0.6057823896408081, + "logits/rejected": -0.6694098711013794, + "logps/chosen": -55.239463806152344, + "logps/rejected": -112.31761169433594, + "loss": 0.6054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1254749298095703, + "rewards/margins": 7.574417591094971, + "rewards/rejected": -4.4489426612854, + "step": 19491 + }, + { + "epoch": 4.88, + "grad_norm": 2.541489601135254, + "learning_rate": 1.5007484800561023e-08, + "logits/chosen": -0.516120970249176, + "logits/rejected": -0.622600793838501, + "logps/chosen": -47.53716278076172, + "logps/rejected": -110.62307739257812, + "loss": 0.5453, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.425116539001465, + "rewards/margins": 8.617504119873047, + "rewards/rejected": -5.19238805770874, + "step": 19492 + }, + { + "epoch": 4.88, + "grad_norm": 12.049156188964844, + "learning_rate": 1.49466945763993e-08, + "logits/chosen": -0.5065092444419861, + "logits/rejected": -0.5579004287719727, + "logps/chosen": -56.96466827392578, + "logps/rejected": -117.20745849609375, + "loss": 0.6442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9452903270721436, + "rewards/margins": 6.641113758087158, + "rewards/rejected": -3.6958236694335938, + "step": 19493 + }, + { + "epoch": 4.88, + "grad_norm": 7.231432914733887, + "learning_rate": 1.4886027538206782e-08, + "logits/chosen": -0.5104561448097229, + "logits/rejected": -0.530157208442688, + "logps/chosen": -49.8428840637207, + "logps/rejected": -112.98355102539062, + "loss": 0.7012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.907269239425659, + "rewards/margins": 5.580630302429199, + "rewards/rejected": -2.6733615398406982, + "step": 19494 + }, + { + "epoch": 4.88, + "grad_norm": 2.912196397781372, + "learning_rate": 1.4825483687483377e-08, + "logits/chosen": -0.5659152865409851, + "logits/rejected": -0.6373040676116943, + "logps/chosen": -59.12329864501953, + "logps/rejected": -110.26243591308594, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1219630241394043, + "rewards/margins": 7.318061828613281, + "rewards/rejected": -4.196098327636719, + "step": 19495 + }, + { + "epoch": 4.88, + "grad_norm": 9.51105785369873, + "learning_rate": 1.4765063025724002e-08, + "logits/chosen": -0.4702869653701782, + "logits/rejected": -0.579806387424469, + "logps/chosen": -66.9423828125, + "logps/rejected": -101.35404205322266, + "loss": 0.6652, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.838043212890625, + "rewards/margins": 6.108609199523926, + "rewards/rejected": -3.27056622505188, + "step": 19496 + }, + { + "epoch": 4.88, + "grad_norm": 4.264155387878418, + "learning_rate": 1.4704765554423018e-08, + "logits/chosen": -0.5592146515846252, + "logits/rejected": -0.6831111907958984, + "logps/chosen": -64.31426239013672, + "logps/rejected": -100.58251953125, + "loss": 0.6347, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4618897438049316, + "rewards/margins": 7.891826629638672, + "rewards/rejected": -4.42993688583374, + "step": 19497 + }, + { + "epoch": 4.88, + "grad_norm": 6.372743606567383, + "learning_rate": 1.4644591275069786e-08, + "logits/chosen": -0.630064845085144, + "logits/rejected": -0.6831797957420349, + "logps/chosen": -41.455562591552734, + "logps/rejected": -116.09454345703125, + "loss": 0.6161, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5110368728637695, + "rewards/margins": 8.101744651794434, + "rewards/rejected": -4.590707778930664, + "step": 19498 + }, + { + "epoch": 4.88, + "grad_norm": 3.048553705215454, + "learning_rate": 1.4584540189151452e-08, + "logits/chosen": -0.5826256275177002, + "logits/rejected": -0.6637139916419983, + "logps/chosen": -52.74800109863281, + "logps/rejected": -112.60164642333984, + "loss": 0.601, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9975507259368896, + "rewards/margins": 7.8445634841918945, + "rewards/rejected": -4.847013473510742, + "step": 19499 + }, + { + "epoch": 4.88, + "grad_norm": 2.8141608238220215, + "learning_rate": 1.4524612298151274e-08, + "logits/chosen": -0.45518937706947327, + "logits/rejected": -0.5344269275665283, + "logps/chosen": -53.57568359375, + "logps/rejected": -101.65760803222656, + "loss": 0.5476, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.24166202545166, + "rewards/margins": 7.301914215087891, + "rewards/rejected": -4.060251712799072, + "step": 19500 + }, + { + "epoch": 4.88, + "grad_norm": 5.732568740844727, + "learning_rate": 1.4464807603551401e-08, + "logits/chosen": -0.5560266375541687, + "logits/rejected": -0.6742316484451294, + "logps/chosen": -61.077308654785156, + "logps/rejected": -96.15249633789062, + "loss": 0.7057, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.956432819366455, + "rewards/margins": 6.175497531890869, + "rewards/rejected": -3.219064712524414, + "step": 19501 + }, + { + "epoch": 4.88, + "grad_norm": 8.765913009643555, + "learning_rate": 1.4405126106828981e-08, + "logits/chosen": -0.6216996908187866, + "logits/rejected": -0.7209427356719971, + "logps/chosen": -54.01922607421875, + "logps/rejected": -127.92425537109375, + "loss": 0.5871, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6255085468292236, + "rewards/margins": 10.670209884643555, + "rewards/rejected": -7.044702529907227, + "step": 19502 + }, + { + "epoch": 4.88, + "grad_norm": 3.326525926589966, + "learning_rate": 1.4345567809458394e-08, + "logits/chosen": -0.5980547070503235, + "logits/rejected": -0.6984180808067322, + "logps/chosen": -45.766788482666016, + "logps/rejected": -106.41545104980469, + "loss": 0.5576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.090296745300293, + "rewards/margins": 8.551700592041016, + "rewards/rejected": -5.461402893066406, + "step": 19503 + }, + { + "epoch": 4.88, + "grad_norm": 5.40292501449585, + "learning_rate": 1.4286132712911794e-08, + "logits/chosen": -0.5557951927185059, + "logits/rejected": -0.6671620607376099, + "logps/chosen": -59.13746643066406, + "logps/rejected": -108.67166137695312, + "loss": 0.6399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2386484146118164, + "rewards/margins": 7.87432861328125, + "rewards/rejected": -4.635679721832275, + "step": 19504 + }, + { + "epoch": 4.88, + "grad_norm": 3.94529390335083, + "learning_rate": 1.4226820818658005e-08, + "logits/chosen": -0.6361734867095947, + "logits/rejected": -0.682102620601654, + "logps/chosen": -51.51863098144531, + "logps/rejected": -130.7809600830078, + "loss": 0.5694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2174553871154785, + "rewards/margins": 8.951238632202148, + "rewards/rejected": -5.733783721923828, + "step": 19505 + }, + { + "epoch": 4.88, + "grad_norm": 3.5215096473693848, + "learning_rate": 1.4167632128163078e-08, + "logits/chosen": -0.5710600018501282, + "logits/rejected": -0.6388494372367859, + "logps/chosen": -46.142120361328125, + "logps/rejected": -102.47586059570312, + "loss": 0.578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5496435165405273, + "rewards/margins": 7.502408027648926, + "rewards/rejected": -3.9527642726898193, + "step": 19506 + }, + { + "epoch": 4.88, + "grad_norm": 8.558389663696289, + "learning_rate": 1.4108566642888066e-08, + "logits/chosen": -0.6014871001243591, + "logits/rejected": -0.7041953802108765, + "logps/chosen": -61.816009521484375, + "logps/rejected": -104.08161163330078, + "loss": 0.7574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.997039794921875, + "rewards/margins": 6.729546070098877, + "rewards/rejected": -3.732506275177002, + "step": 19507 + }, + { + "epoch": 4.88, + "grad_norm": 5.421467304229736, + "learning_rate": 1.4049624364294578e-08, + "logits/chosen": -0.4183090627193451, + "logits/rejected": -0.53342205286026, + "logps/chosen": -68.64171600341797, + "logps/rejected": -98.56986999511719, + "loss": 0.6576, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.938218116760254, + "rewards/margins": 7.048252582550049, + "rewards/rejected": -4.110034465789795, + "step": 19508 + }, + { + "epoch": 4.88, + "grad_norm": 2.67857027053833, + "learning_rate": 1.3990805293838117e-08, + "logits/chosen": -0.578857421875, + "logits/rejected": -0.6499143242835999, + "logps/chosen": -52.415977478027344, + "logps/rejected": -115.99534606933594, + "loss": 0.608, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.945241928100586, + "rewards/margins": 7.613890647888184, + "rewards/rejected": -4.668648719787598, + "step": 19509 + }, + { + "epoch": 4.88, + "grad_norm": 5.136854648590088, + "learning_rate": 1.3932109432971408e-08, + "logits/chosen": -0.5224744081497192, + "logits/rejected": -0.6304432153701782, + "logps/chosen": -47.72867202758789, + "logps/rejected": -112.29438781738281, + "loss": 0.5297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.193721294403076, + "rewards/margins": 8.313764572143555, + "rewards/rejected": -5.1200432777404785, + "step": 19510 + }, + { + "epoch": 4.88, + "grad_norm": 2.704815149307251, + "learning_rate": 1.3873536783146624e-08, + "logits/chosen": -0.5455849170684814, + "logits/rejected": -0.6147582530975342, + "logps/chosen": -46.89645004272461, + "logps/rejected": -97.56895446777344, + "loss": 0.5404, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2142891883850098, + "rewards/margins": 7.056454658508301, + "rewards/rejected": -3.842165946960449, + "step": 19511 + }, + { + "epoch": 4.88, + "grad_norm": 3.035397529602051, + "learning_rate": 1.3815087345809275e-08, + "logits/chosen": -0.569851815700531, + "logits/rejected": -0.6238487958908081, + "logps/chosen": -55.50642395019531, + "logps/rejected": -121.5406723022461, + "loss": 0.6277, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.51503849029541, + "rewards/margins": 7.675326824188232, + "rewards/rejected": -4.160288333892822, + "step": 19512 + }, + { + "epoch": 4.88, + "grad_norm": 12.146845817565918, + "learning_rate": 1.3756761122405426e-08, + "logits/chosen": -0.6276310086250305, + "logits/rejected": -0.6890739798545837, + "logps/chosen": -41.68193054199219, + "logps/rejected": -113.17189025878906, + "loss": 0.6066, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.317166328430176, + "rewards/margins": 8.002492904663086, + "rewards/rejected": -4.685327529907227, + "step": 19513 + }, + { + "epoch": 4.88, + "grad_norm": 6.148618221282959, + "learning_rate": 1.3698558114375592e-08, + "logits/chosen": -0.5626282095909119, + "logits/rejected": -0.6272058486938477, + "logps/chosen": -58.841644287109375, + "logps/rejected": -114.1085433959961, + "loss": 0.6076, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.126027822494507, + "rewards/margins": 5.986518859863281, + "rewards/rejected": -2.8604912757873535, + "step": 19514 + }, + { + "epoch": 4.88, + "grad_norm": 2.6949424743652344, + "learning_rate": 1.3640478323157514e-08, + "logits/chosen": -0.5753865242004395, + "logits/rejected": -0.6479681730270386, + "logps/chosen": -55.57051086425781, + "logps/rejected": -117.48644256591797, + "loss": 0.5774, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2656641006469727, + "rewards/margins": 8.310721397399902, + "rewards/rejected": -5.0450568199157715, + "step": 19515 + }, + { + "epoch": 4.88, + "grad_norm": 6.633111000061035, + "learning_rate": 1.3582521750187261e-08, + "logits/chosen": -0.5948458909988403, + "logits/rejected": -0.6558910608291626, + "logps/chosen": -57.21940612792969, + "logps/rejected": -125.31918334960938, + "loss": 0.6873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8962342739105225, + "rewards/margins": 6.896503925323486, + "rewards/rejected": -4.000269412994385, + "step": 19516 + }, + { + "epoch": 4.88, + "grad_norm": 9.280705451965332, + "learning_rate": 1.3524688396897024e-08, + "logits/chosen": -0.6084904670715332, + "logits/rejected": -0.6901527047157288, + "logps/chosen": -61.837684631347656, + "logps/rejected": -104.60587310791016, + "loss": 0.6046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.889960289001465, + "rewards/margins": 7.1564226150512695, + "rewards/rejected": -4.266462326049805, + "step": 19517 + }, + { + "epoch": 4.88, + "grad_norm": 5.05384635925293, + "learning_rate": 1.3466978264715103e-08, + "logits/chosen": -0.5005257725715637, + "logits/rejected": -0.5927716493606567, + "logps/chosen": -61.258262634277344, + "logps/rejected": -122.40809631347656, + "loss": 0.5657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.280897617340088, + "rewards/margins": 8.391443252563477, + "rewards/rejected": -5.110545635223389, + "step": 19518 + }, + { + "epoch": 4.88, + "grad_norm": 4.847504138946533, + "learning_rate": 1.3409391355068135e-08, + "logits/chosen": -0.5947054028511047, + "logits/rejected": -0.6491132378578186, + "logps/chosen": -55.37188720703125, + "logps/rejected": -124.03668212890625, + "loss": 0.618, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.88553786277771, + "rewards/margins": 7.574540138244629, + "rewards/rejected": -4.68900203704834, + "step": 19519 + }, + { + "epoch": 4.88, + "grad_norm": 9.357011795043945, + "learning_rate": 1.3351927669379428e-08, + "logits/chosen": -0.5812020301818848, + "logits/rejected": -0.6213626861572266, + "logps/chosen": -61.69786834716797, + "logps/rejected": -103.24818420410156, + "loss": 0.6954, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.724797248840332, + "rewards/margins": 5.544995307922363, + "rewards/rejected": -2.820197582244873, + "step": 19520 + }, + { + "epoch": 4.88, + "grad_norm": 3.2707762718200684, + "learning_rate": 1.32945872090684e-08, + "logits/chosen": -0.6048439741134644, + "logits/rejected": -0.6331098079681396, + "logps/chosen": -43.54783248901367, + "logps/rejected": -108.83975219726562, + "loss": 0.6098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.131610155105591, + "rewards/margins": 6.972168922424316, + "rewards/rejected": -3.8405590057373047, + "step": 19521 + }, + { + "epoch": 4.88, + "grad_norm": 8.806008338928223, + "learning_rate": 1.3237369975551694e-08, + "logits/chosen": -0.5654512643814087, + "logits/rejected": -0.612553060054779, + "logps/chosen": -58.578155517578125, + "logps/rejected": -134.2875213623047, + "loss": 0.6147, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1445071697235107, + "rewards/margins": 6.118070602416992, + "rewards/rejected": -2.9735631942749023, + "step": 19522 + }, + { + "epoch": 4.88, + "grad_norm": 4.240259170532227, + "learning_rate": 1.3180275970244293e-08, + "logits/chosen": -0.6260842084884644, + "logits/rejected": -0.6787999272346497, + "logps/chosen": -62.02622604370117, + "logps/rejected": -137.0560302734375, + "loss": 0.5826, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.099397659301758, + "rewards/margins": 8.275874137878418, + "rewards/rejected": -5.1764750480651855, + "step": 19523 + }, + { + "epoch": 4.88, + "grad_norm": 3.963548421859741, + "learning_rate": 1.3123305194556735e-08, + "logits/chosen": -0.46123799681663513, + "logits/rejected": -0.5266107320785522, + "logps/chosen": -53.005191802978516, + "logps/rejected": -108.581298828125, + "loss": 0.5378, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.195857048034668, + "rewards/margins": 7.285310745239258, + "rewards/rejected": -4.089454650878906, + "step": 19524 + }, + { + "epoch": 4.88, + "grad_norm": 6.653464317321777, + "learning_rate": 1.3066457649896224e-08, + "logits/chosen": -0.4940806031227112, + "logits/rejected": -0.5636464953422546, + "logps/chosen": -65.64795684814453, + "logps/rejected": -119.16053009033203, + "loss": 0.6018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.694579839706421, + "rewards/margins": 7.410282135009766, + "rewards/rejected": -3.7157020568847656, + "step": 19525 + }, + { + "epoch": 4.88, + "grad_norm": 3.7891459465026855, + "learning_rate": 1.3009733337667752e-08, + "logits/chosen": -0.6012315154075623, + "logits/rejected": -0.6162375807762146, + "logps/chosen": -35.775146484375, + "logps/rejected": -96.43156433105469, + "loss": 0.5309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.874037742614746, + "rewards/margins": 6.239719390869141, + "rewards/rejected": -3.3656816482543945, + "step": 19526 + }, + { + "epoch": 4.88, + "grad_norm": 6.285194396972656, + "learning_rate": 1.2953132259273527e-08, + "logits/chosen": -0.6255251169204712, + "logits/rejected": -0.7055836915969849, + "logps/chosen": -48.05865478515625, + "logps/rejected": -120.14213562011719, + "loss": 0.6344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1139988899230957, + "rewards/margins": 7.890989780426025, + "rewards/rejected": -4.77699089050293, + "step": 19527 + }, + { + "epoch": 4.89, + "grad_norm": 4.2813286781311035, + "learning_rate": 1.2896654416111321e-08, + "logits/chosen": -0.5590341091156006, + "logits/rejected": -0.653931736946106, + "logps/chosen": -54.58672332763672, + "logps/rejected": -105.40284729003906, + "loss": 0.6535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0735678672790527, + "rewards/margins": 8.043662071228027, + "rewards/rejected": -4.970094680786133, + "step": 19528 + }, + { + "epoch": 4.89, + "grad_norm": 5.365720272064209, + "learning_rate": 1.2840299809577794e-08, + "logits/chosen": -0.6320928931236267, + "logits/rejected": -0.7202082276344299, + "logps/chosen": -55.48832702636719, + "logps/rejected": -99.22924041748047, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0567398071289062, + "rewards/margins": 7.670810699462891, + "rewards/rejected": -4.614070892333984, + "step": 19529 + }, + { + "epoch": 4.89, + "grad_norm": 6.789629936218262, + "learning_rate": 1.2784068441064611e-08, + "logits/chosen": -0.5108627080917358, + "logits/rejected": -0.639948308467865, + "logps/chosen": -54.78522872924805, + "logps/rejected": -91.22740173339844, + "loss": 0.6338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.198192834854126, + "rewards/margins": 5.945946216583252, + "rewards/rejected": -2.747753143310547, + "step": 19530 + }, + { + "epoch": 4.89, + "grad_norm": 2.479945182800293, + "learning_rate": 1.2727960311962329e-08, + "logits/chosen": -0.5602197647094727, + "logits/rejected": -0.6754970550537109, + "logps/chosen": -59.579872131347656, + "logps/rejected": -114.41517639160156, + "loss": 0.6213, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.23628568649292, + "rewards/margins": 8.372098922729492, + "rewards/rejected": -5.135813236236572, + "step": 19531 + }, + { + "epoch": 4.89, + "grad_norm": 3.4031341075897217, + "learning_rate": 1.2671975423656502e-08, + "logits/chosen": -0.6116222739219666, + "logits/rejected": -0.6622962951660156, + "logps/chosen": -60.9843635559082, + "logps/rejected": -122.38792419433594, + "loss": 0.6585, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0622992515563965, + "rewards/margins": 6.757741928100586, + "rewards/rejected": -3.695443630218506, + "step": 19532 + }, + { + "epoch": 4.89, + "grad_norm": 3.5856006145477295, + "learning_rate": 1.2616113777531025e-08, + "logits/chosen": -0.6082087159156799, + "logits/rejected": -0.6553205847740173, + "logps/chosen": -57.51563262939453, + "logps/rejected": -117.82699584960938, + "loss": 0.5827, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1176228523254395, + "rewards/margins": 7.340602397918701, + "rewards/rejected": -4.222979545593262, + "step": 19533 + }, + { + "epoch": 4.89, + "grad_norm": 2.5929174423217773, + "learning_rate": 1.2560375374965905e-08, + "logits/chosen": -0.5329940915107727, + "logits/rejected": -0.6030667424201965, + "logps/chosen": -51.710697174072266, + "logps/rejected": -115.07862091064453, + "loss": 0.5592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1186013221740723, + "rewards/margins": 7.789770126342773, + "rewards/rejected": -4.671167850494385, + "step": 19534 + }, + { + "epoch": 4.89, + "grad_norm": 2.8832929134368896, + "learning_rate": 1.250476021733893e-08, + "logits/chosen": -0.5410630702972412, + "logits/rejected": -0.6727597713470459, + "logps/chosen": -56.49760437011719, + "logps/rejected": -94.55915069580078, + "loss": 0.575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.418243408203125, + "rewards/margins": 7.86681604385376, + "rewards/rejected": -4.448573112487793, + "step": 19535 + }, + { + "epoch": 4.89, + "grad_norm": 3.993459701538086, + "learning_rate": 1.2449268306024554e-08, + "logits/chosen": -0.594721794128418, + "logits/rejected": -0.7060840129852295, + "logps/chosen": -46.6965446472168, + "logps/rejected": -98.37628936767578, + "loss": 0.5545, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.199845314025879, + "rewards/margins": 8.077702522277832, + "rewards/rejected": -4.877856731414795, + "step": 19536 + }, + { + "epoch": 4.89, + "grad_norm": 15.127083778381348, + "learning_rate": 1.2393899642393348e-08, + "logits/chosen": -0.5172268748283386, + "logits/rejected": -0.5883578658103943, + "logps/chosen": -51.35066223144531, + "logps/rejected": -110.94451904296875, + "loss": 0.5689, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.040128469467163, + "rewards/margins": 7.546226978302002, + "rewards/rejected": -4.506098747253418, + "step": 19537 + }, + { + "epoch": 4.89, + "grad_norm": 4.274748802185059, + "learning_rate": 1.2338654227814217e-08, + "logits/chosen": -0.6194635033607483, + "logits/rejected": -0.6856339573860168, + "logps/chosen": -52.440155029296875, + "logps/rejected": -117.55364227294922, + "loss": 0.6479, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6562376022338867, + "rewards/margins": 7.98875093460083, + "rewards/rejected": -5.332513332366943, + "step": 19538 + }, + { + "epoch": 4.89, + "grad_norm": 1.5979063510894775, + "learning_rate": 1.228353206365218e-08, + "logits/chosen": -0.5756473541259766, + "logits/rejected": -0.6522154211997986, + "logps/chosen": -38.44846725463867, + "logps/rejected": -113.49436950683594, + "loss": 0.5313, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.285534381866455, + "rewards/margins": 8.63613510131836, + "rewards/rejected": -5.350600242614746, + "step": 19539 + }, + { + "epoch": 4.89, + "grad_norm": 2.7055253982543945, + "learning_rate": 1.2228533151268929e-08, + "logits/chosen": -0.5608423352241516, + "logits/rejected": -0.6427863836288452, + "logps/chosen": -55.439945220947266, + "logps/rejected": -104.17431640625, + "loss": 0.5467, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3351683616638184, + "rewards/margins": 8.052637100219727, + "rewards/rejected": -4.717467784881592, + "step": 19540 + }, + { + "epoch": 4.89, + "grad_norm": 2.8483920097351074, + "learning_rate": 1.2173657492023926e-08, + "logits/chosen": -0.5340375304222107, + "logits/rejected": -0.6364511251449585, + "logps/chosen": -56.519222259521484, + "logps/rejected": -98.18077087402344, + "loss": 0.5784, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.197436809539795, + "rewards/margins": 7.307443618774414, + "rewards/rejected": -4.110006809234619, + "step": 19541 + }, + { + "epoch": 4.89, + "grad_norm": 4.181763172149658, + "learning_rate": 1.2118905087273314e-08, + "logits/chosen": -0.5925284624099731, + "logits/rejected": -0.6949094533920288, + "logps/chosen": -48.545135498046875, + "logps/rejected": -91.3035659790039, + "loss": 0.6315, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.533928394317627, + "rewards/margins": 7.702561378479004, + "rewards/rejected": -4.168633460998535, + "step": 19542 + }, + { + "epoch": 4.89, + "grad_norm": 4.615158557891846, + "learning_rate": 1.2064275938369896e-08, + "logits/chosen": -0.5571214556694031, + "logits/rejected": -0.6353625655174255, + "logps/chosen": -64.34086608886719, + "logps/rejected": -109.47513580322266, + "loss": 0.6858, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0904312133789062, + "rewards/margins": 8.091809272766113, + "rewards/rejected": -5.001377582550049, + "step": 19543 + }, + { + "epoch": 4.89, + "grad_norm": 4.074894428253174, + "learning_rate": 1.200977004666315e-08, + "logits/chosen": -0.5649237036705017, + "logits/rejected": -0.6192859411239624, + "logps/chosen": -54.287994384765625, + "logps/rejected": -106.64107513427734, + "loss": 0.5951, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1240530014038086, + "rewards/margins": 7.105799198150635, + "rewards/rejected": -3.981745958328247, + "step": 19544 + }, + { + "epoch": 4.89, + "grad_norm": 6.291986465454102, + "learning_rate": 1.1955387413500886e-08, + "logits/chosen": -0.5531217455863953, + "logits/rejected": -0.6704670190811157, + "logps/chosen": -69.54603576660156, + "logps/rejected": -103.3141098022461, + "loss": 0.6615, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3301682472229004, + "rewards/margins": 7.308281898498535, + "rewards/rejected": -3.978114128112793, + "step": 19545 + }, + { + "epoch": 4.89, + "grad_norm": 4.989694595336914, + "learning_rate": 1.1901128040226473e-08, + "logits/chosen": -0.560356080532074, + "logits/rejected": -0.6166514754295349, + "logps/chosen": -44.22285461425781, + "logps/rejected": -112.5716552734375, + "loss": 0.5335, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2151546478271484, + "rewards/margins": 7.412957191467285, + "rewards/rejected": -4.197802543640137, + "step": 19546 + }, + { + "epoch": 4.89, + "grad_norm": 3.088024616241455, + "learning_rate": 1.1846991928180507e-08, + "logits/chosen": -0.503375232219696, + "logits/rejected": -0.6097491979598999, + "logps/chosen": -62.61789321899414, + "logps/rejected": -118.03087615966797, + "loss": 0.556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2422940731048584, + "rewards/margins": 7.235332489013672, + "rewards/rejected": -3.9930379390716553, + "step": 19547 + }, + { + "epoch": 4.89, + "grad_norm": 4.710472106933594, + "learning_rate": 1.1792979078701362e-08, + "logits/chosen": -0.5212073922157288, + "logits/rejected": -0.6079935431480408, + "logps/chosen": -56.43799591064453, + "logps/rejected": -112.89244079589844, + "loss": 0.566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0911941528320312, + "rewards/margins": 7.987297058105469, + "rewards/rejected": -4.896103858947754, + "step": 19548 + }, + { + "epoch": 4.89, + "grad_norm": 12.746928215026855, + "learning_rate": 1.1739089493123524e-08, + "logits/chosen": -0.5477551817893982, + "logits/rejected": -0.6270723938941956, + "logps/chosen": -60.4893913269043, + "logps/rejected": -104.68660736083984, + "loss": 0.6987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.045375347137451, + "rewards/margins": 6.274945259094238, + "rewards/rejected": -3.2295706272125244, + "step": 19549 + }, + { + "epoch": 4.89, + "grad_norm": 3.8276896476745605, + "learning_rate": 1.1685323172778152e-08, + "logits/chosen": -0.4667145609855652, + "logits/rejected": -0.5515143871307373, + "logps/chosen": -54.4193115234375, + "logps/rejected": -123.7981185913086, + "loss": 0.5921, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.143068790435791, + "rewards/margins": 7.419986724853516, + "rewards/rejected": -4.276917934417725, + "step": 19550 + }, + { + "epoch": 4.89, + "grad_norm": 8.654684066772461, + "learning_rate": 1.1631680118994181e-08, + "logits/chosen": -0.6421445608139038, + "logits/rejected": -0.738715410232544, + "logps/chosen": -56.91709518432617, + "logps/rejected": -104.27018737792969, + "loss": 0.5585, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8085741996765137, + "rewards/margins": 6.829342365264893, + "rewards/rejected": -4.020768165588379, + "step": 19551 + }, + { + "epoch": 4.89, + "grad_norm": 18.40242576599121, + "learning_rate": 1.1578160333097777e-08, + "logits/chosen": -0.5182876586914062, + "logits/rejected": -0.5886868238449097, + "logps/chosen": -54.078086853027344, + "logps/rejected": -98.29981231689453, + "loss": 0.6822, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.869861125946045, + "rewards/margins": 6.763028144836426, + "rewards/rejected": -3.8931679725646973, + "step": 19552 + }, + { + "epoch": 4.89, + "grad_norm": 4.225151538848877, + "learning_rate": 1.15247638164101e-08, + "logits/chosen": -0.5484899282455444, + "logits/rejected": -0.6094405651092529, + "logps/chosen": -54.3975944519043, + "logps/rejected": -114.71041107177734, + "loss": 0.6492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3221628665924072, + "rewards/margins": 7.9353790283203125, + "rewards/rejected": -4.613215446472168, + "step": 19553 + }, + { + "epoch": 4.89, + "grad_norm": 5.9589314460754395, + "learning_rate": 1.147149057025232e-08, + "logits/chosen": -0.5166723132133484, + "logits/rejected": -0.6423978805541992, + "logps/chosen": -53.32720947265625, + "logps/rejected": -92.48782348632812, + "loss": 0.6027, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.108158826828003, + "rewards/margins": 7.218256950378418, + "rewards/rejected": -4.110097885131836, + "step": 19554 + }, + { + "epoch": 4.89, + "grad_norm": 4.023306369781494, + "learning_rate": 1.1418340595939492e-08, + "logits/chosen": -0.5481976270675659, + "logits/rejected": -0.6710925698280334, + "logps/chosen": -55.2426643371582, + "logps/rejected": -114.20824432373047, + "loss": 0.5046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.811084747314453, + "rewards/margins": 8.090614318847656, + "rewards/rejected": -5.279529094696045, + "step": 19555 + }, + { + "epoch": 4.89, + "grad_norm": 6.960570812225342, + "learning_rate": 1.1365313894786124e-08, + "logits/chosen": -0.5187883377075195, + "logits/rejected": -0.5931972861289978, + "logps/chosen": -52.66473388671875, + "logps/rejected": -118.20853424072266, + "loss": 0.609, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0358710289001465, + "rewards/margins": 6.975370407104492, + "rewards/rejected": -3.9394989013671875, + "step": 19556 + }, + { + "epoch": 4.89, + "grad_norm": 9.391766548156738, + "learning_rate": 1.131241046810172e-08, + "logits/chosen": -0.5880177021026611, + "logits/rejected": -0.7218140959739685, + "logps/chosen": -51.625648498535156, + "logps/rejected": -97.30245208740234, + "loss": 0.6653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2976279258728027, + "rewards/margins": 6.426494121551514, + "rewards/rejected": -3.1288657188415527, + "step": 19557 + }, + { + "epoch": 4.89, + "grad_norm": 6.1239471435546875, + "learning_rate": 1.125963031719357e-08, + "logits/chosen": -0.585225522518158, + "logits/rejected": -0.6535736322402954, + "logps/chosen": -61.40229034423828, + "logps/rejected": -112.0799789428711, + "loss": 0.6363, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.275693655014038, + "rewards/margins": 6.519526481628418, + "rewards/rejected": -4.243833065032959, + "step": 19558 + }, + { + "epoch": 4.89, + "grad_norm": 4.694910526275635, + "learning_rate": 1.1206973443366187e-08, + "logits/chosen": -0.537472665309906, + "logits/rejected": -0.5898751020431519, + "logps/chosen": -57.186031341552734, + "logps/rejected": -121.18975830078125, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8492064476013184, + "rewards/margins": 7.170866012573242, + "rewards/rejected": -4.321659088134766, + "step": 19559 + }, + { + "epoch": 4.89, + "grad_norm": 5.766326427459717, + "learning_rate": 1.1154439847921306e-08, + "logits/chosen": -0.5298481583595276, + "logits/rejected": -0.605224072933197, + "logps/chosen": -45.62875747680664, + "logps/rejected": -101.6287841796875, + "loss": 0.5755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0475759506225586, + "rewards/margins": 6.770469665527344, + "rewards/rejected": -3.722893714904785, + "step": 19560 + }, + { + "epoch": 4.89, + "grad_norm": 4.8081464767456055, + "learning_rate": 1.1102029532156221e-08, + "logits/chosen": -0.5241521000862122, + "logits/rejected": -0.6077623963356018, + "logps/chosen": -49.25019836425781, + "logps/rejected": -117.98126983642578, + "loss": 0.5575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2497951984405518, + "rewards/margins": 7.912012100219727, + "rewards/rejected": -4.662216663360596, + "step": 19561 + }, + { + "epoch": 4.89, + "grad_norm": 3.6839232444763184, + "learning_rate": 1.1049742497366567e-08, + "logits/chosen": -0.4419556260108948, + "logits/rejected": -0.4946666955947876, + "logps/chosen": -62.15299606323242, + "logps/rejected": -133.08143615722656, + "loss": 0.6109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.85565447807312, + "rewards/margins": 6.396164894104004, + "rewards/rejected": -3.5405101776123047, + "step": 19562 + }, + { + "epoch": 4.89, + "grad_norm": 7.389737129211426, + "learning_rate": 1.0997578744844639e-08, + "logits/chosen": -0.5795243978500366, + "logits/rejected": -0.7112912535667419, + "logps/chosen": -53.74879837036133, + "logps/rejected": -93.86792755126953, + "loss": 0.6671, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.2868053913116455, + "rewards/margins": 7.95344352722168, + "rewards/rejected": -4.666637897491455, + "step": 19563 + }, + { + "epoch": 4.89, + "grad_norm": 6.406928062438965, + "learning_rate": 1.0945538275878298e-08, + "logits/chosen": -0.5412184000015259, + "logits/rejected": -0.6018753051757812, + "logps/chosen": -56.493072509765625, + "logps/rejected": -118.3195571899414, + "loss": 0.6726, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.4466757774353027, + "rewards/margins": 7.938065052032471, + "rewards/rejected": -5.491389274597168, + "step": 19564 + }, + { + "epoch": 4.89, + "grad_norm": 6.763631343841553, + "learning_rate": 1.0893621091754847e-08, + "logits/chosen": -0.5075163245201111, + "logits/rejected": -0.5860303640365601, + "logps/chosen": -54.087867736816406, + "logps/rejected": -107.85726165771484, + "loss": 0.5563, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1342005729675293, + "rewards/margins": 7.496057510375977, + "rewards/rejected": -4.361856937408447, + "step": 19565 + }, + { + "epoch": 4.89, + "grad_norm": 3.8601715564727783, + "learning_rate": 1.0841827193756593e-08, + "logits/chosen": -0.6181128025054932, + "logits/rejected": -0.725322961807251, + "logps/chosen": -51.12656784057617, + "logps/rejected": -114.59691619873047, + "loss": 0.6146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9631965160369873, + "rewards/margins": 8.72863483428955, + "rewards/rejected": -5.765438079833984, + "step": 19566 + }, + { + "epoch": 4.89, + "grad_norm": 11.626565933227539, + "learning_rate": 1.0790156583163069e-08, + "logits/chosen": -0.5107694864273071, + "logits/rejected": -0.6162893772125244, + "logps/chosen": -45.93760681152344, + "logps/rejected": -88.59724426269531, + "loss": 0.5886, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3524844646453857, + "rewards/margins": 6.298004150390625, + "rewards/rejected": -2.945518970489502, + "step": 19567 + }, + { + "epoch": 4.9, + "grad_norm": 10.212410926818848, + "learning_rate": 1.0738609261252143e-08, + "logits/chosen": -0.6067860722541809, + "logits/rejected": -0.6610298156738281, + "logps/chosen": -60.928131103515625, + "logps/rejected": -110.0405044555664, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.896916389465332, + "rewards/margins": 6.578392505645752, + "rewards/rejected": -3.68147611618042, + "step": 19568 + }, + { + "epoch": 4.9, + "grad_norm": 4.362735271453857, + "learning_rate": 1.0687185229296682e-08, + "logits/chosen": -0.549751877784729, + "logits/rejected": -0.590903639793396, + "logps/chosen": -58.177066802978516, + "logps/rejected": -121.55003356933594, + "loss": 0.6604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9187796115875244, + "rewards/margins": 7.72977352142334, + "rewards/rejected": -4.810993671417236, + "step": 19569 + }, + { + "epoch": 4.9, + "grad_norm": 3.9353325366973877, + "learning_rate": 1.0635884488567894e-08, + "logits/chosen": -0.5819101929664612, + "logits/rejected": -0.6582014560699463, + "logps/chosen": -51.077720642089844, + "logps/rejected": -102.9344253540039, + "loss": 0.6444, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.512136459350586, + "rewards/margins": 7.403011322021484, + "rewards/rejected": -3.8908746242523193, + "step": 19570 + }, + { + "epoch": 4.9, + "grad_norm": 6.8003387451171875, + "learning_rate": 1.0584707040333098e-08, + "logits/chosen": -0.5450965166091919, + "logits/rejected": -0.6380962133407593, + "logps/chosen": -67.81999969482422, + "logps/rejected": -115.44673919677734, + "loss": 0.7225, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.6696243286132812, + "rewards/margins": 7.538637161254883, + "rewards/rejected": -4.869012832641602, + "step": 19571 + }, + { + "epoch": 4.9, + "grad_norm": 3.160935163497925, + "learning_rate": 1.0533652885857392e-08, + "logits/chosen": -0.46909528970718384, + "logits/rejected": -0.5764250755310059, + "logps/chosen": -54.755950927734375, + "logps/rejected": -113.54061889648438, + "loss": 0.6036, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.152747869491577, + "rewards/margins": 8.29342269897461, + "rewards/rejected": -5.140674591064453, + "step": 19572 + }, + { + "epoch": 4.9, + "grad_norm": 5.423616886138916, + "learning_rate": 1.0482722026401993e-08, + "logits/chosen": -0.5699509978294373, + "logits/rejected": -0.6468556523323059, + "logps/chosen": -50.224571228027344, + "logps/rejected": -104.05830383300781, + "loss": 0.5669, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.241763114929199, + "rewards/margins": 7.646879196166992, + "rewards/rejected": -4.405116081237793, + "step": 19573 + }, + { + "epoch": 4.9, + "grad_norm": 2.4811456203460693, + "learning_rate": 1.0431914463225335e-08, + "logits/chosen": -0.5738725066184998, + "logits/rejected": -0.6716533303260803, + "logps/chosen": -58.09647750854492, + "logps/rejected": -104.07718658447266, + "loss": 0.5469, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1461050510406494, + "rewards/margins": 7.299520492553711, + "rewards/rejected": -4.153414726257324, + "step": 19574 + }, + { + "epoch": 4.9, + "grad_norm": 7.886982440948486, + "learning_rate": 1.0381230197583636e-08, + "logits/chosen": -0.5961870551109314, + "logits/rejected": -0.6857472658157349, + "logps/chosen": -59.86441421508789, + "logps/rejected": -93.69924926757812, + "loss": 0.6168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.050180196762085, + "rewards/margins": 7.54476261138916, + "rewards/rejected": -4.494582653045654, + "step": 19575 + }, + { + "epoch": 4.9, + "grad_norm": 3.036860942840576, + "learning_rate": 1.0330669230728673e-08, + "logits/chosen": -0.5834681391716003, + "logits/rejected": -0.6895430088043213, + "logps/chosen": -55.148494720458984, + "logps/rejected": -104.95904541015625, + "loss": 0.5436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.963437795639038, + "rewards/margins": 8.130098342895508, + "rewards/rejected": -5.166660785675049, + "step": 19576 + }, + { + "epoch": 4.9, + "grad_norm": 6.894712924957275, + "learning_rate": 1.028023156391056e-08, + "logits/chosen": -0.6020182371139526, + "logits/rejected": -0.6655776500701904, + "logps/chosen": -43.870914459228516, + "logps/rejected": -101.14397430419922, + "loss": 0.5679, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2371907234191895, + "rewards/margins": 6.895278453826904, + "rewards/rejected": -3.6580872535705566, + "step": 19577 + }, + { + "epoch": 4.9, + "grad_norm": 6.175450801849365, + "learning_rate": 1.0229917198374962e-08, + "logits/chosen": -0.5591500401496887, + "logits/rejected": -0.6658867001533508, + "logps/chosen": -64.63584899902344, + "logps/rejected": -111.47334289550781, + "loss": 0.6247, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0702524185180664, + "rewards/margins": 7.938739776611328, + "rewards/rejected": -4.86848783493042, + "step": 19578 + }, + { + "epoch": 4.9, + "grad_norm": 2.684628486633301, + "learning_rate": 1.017972613536533e-08, + "logits/chosen": -0.5804861783981323, + "logits/rejected": -0.6432271003723145, + "logps/chosen": -35.54981994628906, + "logps/rejected": -105.40802764892578, + "loss": 0.5036, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34665846824646, + "rewards/margins": 7.963850021362305, + "rewards/rejected": -4.617191791534424, + "step": 19579 + }, + { + "epoch": 4.9, + "grad_norm": 11.666854858398438, + "learning_rate": 1.0129658376122342e-08, + "logits/chosen": -0.5111242532730103, + "logits/rejected": -0.6257437467575073, + "logps/chosen": -53.7098274230957, + "logps/rejected": -109.33898162841797, + "loss": 0.6242, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3123292922973633, + "rewards/margins": 7.750578880310059, + "rewards/rejected": -4.438248634338379, + "step": 19580 + }, + { + "epoch": 4.9, + "grad_norm": 20.356374740600586, + "learning_rate": 1.0079713921882783e-08, + "logits/chosen": -0.5133903622627258, + "logits/rejected": -0.5751165747642517, + "logps/chosen": -45.584503173828125, + "logps/rejected": -119.03886413574219, + "loss": 0.5988, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.03583025932312, + "rewards/margins": 7.647989273071289, + "rewards/rejected": -4.61215877532959, + "step": 19581 + }, + { + "epoch": 4.9, + "grad_norm": 2.468118906021118, + "learning_rate": 1.0029892773881223e-08, + "logits/chosen": -0.5120109915733337, + "logits/rejected": -0.6073794960975647, + "logps/chosen": -51.070465087890625, + "logps/rejected": -121.71224975585938, + "loss": 0.538, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2609453201293945, + "rewards/margins": 8.141026496887207, + "rewards/rejected": -4.8800811767578125, + "step": 19582 + }, + { + "epoch": 4.9, + "grad_norm": 3.907569169998169, + "learning_rate": 9.980194933348342e-09, + "logits/chosen": -0.5187703967094421, + "logits/rejected": -0.5917123556137085, + "logps/chosen": -57.78422164916992, + "logps/rejected": -110.06851196289062, + "loss": 0.5655, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.013293981552124, + "rewards/margins": 6.594841003417969, + "rewards/rejected": -3.5815465450286865, + "step": 19583 + }, + { + "epoch": 4.9, + "grad_norm": 3.492441177368164, + "learning_rate": 9.930620401512603e-09, + "logits/chosen": -0.5279302597045898, + "logits/rejected": -0.6377994418144226, + "logps/chosen": -51.49981689453125, + "logps/rejected": -128.19140625, + "loss": 0.5467, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0000948905944824, + "rewards/margins": 8.982172966003418, + "rewards/rejected": -5.982079029083252, + "step": 19584 + }, + { + "epoch": 4.9, + "grad_norm": 1.9081488847732544, + "learning_rate": 9.881169179598581e-09, + "logits/chosen": -0.5150134563446045, + "logits/rejected": -0.6084620356559753, + "logps/chosen": -58.493873596191406, + "logps/rejected": -110.78213500976562, + "loss": 0.5609, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.810350179672241, + "rewards/margins": 7.8948893547058105, + "rewards/rejected": -5.084539413452148, + "step": 19585 + }, + { + "epoch": 4.9, + "grad_norm": 2.861684799194336, + "learning_rate": 9.83184126882919e-09, + "logits/chosen": -0.5167969465255737, + "logits/rejected": -0.5997906923294067, + "logps/chosen": -61.51740646362305, + "logps/rejected": -114.06419372558594, + "loss": 0.5737, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1260485649108887, + "rewards/margins": 7.286863803863525, + "rewards/rejected": -4.160815715789795, + "step": 19586 + }, + { + "epoch": 4.9, + "grad_norm": 5.582585334777832, + "learning_rate": 9.782636670422341e-09, + "logits/chosen": -0.5447466969490051, + "logits/rejected": -0.6901180744171143, + "logps/chosen": -56.0961799621582, + "logps/rejected": -105.53597259521484, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.255520820617676, + "rewards/margins": 6.615389347076416, + "rewards/rejected": -3.359868049621582, + "step": 19587 + }, + { + "epoch": 4.9, + "grad_norm": 6.947571277618408, + "learning_rate": 9.733555385594284e-09, + "logits/chosen": -0.48175373673439026, + "logits/rejected": -0.5569207668304443, + "logps/chosen": -59.5895881652832, + "logps/rejected": -96.6303482055664, + "loss": 0.6549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.07751727104187, + "rewards/margins": 6.466588020324707, + "rewards/rejected": -3.389070510864258, + "step": 19588 + }, + { + "epoch": 4.9, + "grad_norm": 6.308341979980469, + "learning_rate": 9.684597415558494e-09, + "logits/chosen": -0.45213642716407776, + "logits/rejected": -0.49916738271713257, + "logps/chosen": -57.729888916015625, + "logps/rejected": -109.30271911621094, + "loss": 0.6061, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.37784481048584, + "rewards/margins": 6.02647066116333, + "rewards/rejected": -2.6486258506774902, + "step": 19589 + }, + { + "epoch": 4.9, + "grad_norm": 2.7825663089752197, + "learning_rate": 9.635762761523449e-09, + "logits/chosen": -0.5742591619491577, + "logits/rejected": -0.6766532063484192, + "logps/chosen": -51.59973907470703, + "logps/rejected": -94.0707778930664, + "loss": 0.5497, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2773869037628174, + "rewards/margins": 7.650383949279785, + "rewards/rejected": -4.3729963302612305, + "step": 19590 + }, + { + "epoch": 4.9, + "grad_norm": 6.667834758758545, + "learning_rate": 9.587051424697624e-09, + "logits/chosen": -0.5567362308502197, + "logits/rejected": -0.617936372756958, + "logps/chosen": -54.61474609375, + "logps/rejected": -110.039794921875, + "loss": 0.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.151411533355713, + "rewards/margins": 7.552294731140137, + "rewards/rejected": -4.40088415145874, + "step": 19591 + }, + { + "epoch": 4.9, + "grad_norm": 5.153639793395996, + "learning_rate": 9.538463406283394e-09, + "logits/chosen": -0.543744683265686, + "logits/rejected": -0.6491626501083374, + "logps/chosen": -53.03551483154297, + "logps/rejected": -83.39970397949219, + "loss": 0.5571, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2991981506347656, + "rewards/margins": 6.727517127990723, + "rewards/rejected": -3.428318500518799, + "step": 19592 + }, + { + "epoch": 4.9, + "grad_norm": 2.9671847820281982, + "learning_rate": 9.489998707481462e-09, + "logits/chosen": -0.6304891109466553, + "logits/rejected": -0.6580438613891602, + "logps/chosen": -45.04934310913086, + "logps/rejected": -121.22784423828125, + "loss": 0.4964, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0311317443847656, + "rewards/margins": 7.762900352478027, + "rewards/rejected": -4.7317681312561035, + "step": 19593 + }, + { + "epoch": 4.9, + "grad_norm": 15.477314949035645, + "learning_rate": 9.441657329490317e-09, + "logits/chosen": -0.5866473913192749, + "logits/rejected": -0.6719603538513184, + "logps/chosen": -64.15060424804688, + "logps/rejected": -106.05694580078125, + "loss": 0.6814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.057121753692627, + "rewards/margins": 6.797937870025635, + "rewards/rejected": -3.7408158779144287, + "step": 19594 + }, + { + "epoch": 4.9, + "grad_norm": 5.833115100860596, + "learning_rate": 9.393439273503446e-09, + "logits/chosen": -0.565808892250061, + "logits/rejected": -0.6542879343032837, + "logps/chosen": -58.0306396484375, + "logps/rejected": -96.5293197631836, + "loss": 0.609, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023277759552002, + "rewards/margins": 7.239311218261719, + "rewards/rejected": -4.216033458709717, + "step": 19595 + }, + { + "epoch": 4.9, + "grad_norm": 7.3789591789245605, + "learning_rate": 9.34534454071323e-09, + "logits/chosen": -0.5804432034492493, + "logits/rejected": -0.705626904964447, + "logps/chosen": -59.83366775512695, + "logps/rejected": -104.3000259399414, + "loss": 0.5589, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2098753452301025, + "rewards/margins": 7.965338706970215, + "rewards/rejected": -4.755463600158691, + "step": 19596 + }, + { + "epoch": 4.9, + "grad_norm": 3.450723171234131, + "learning_rate": 9.297373132308163e-09, + "logits/chosen": -0.5859697461128235, + "logits/rejected": -0.6481361389160156, + "logps/chosen": -67.06746673583984, + "logps/rejected": -118.19403839111328, + "loss": 0.6638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140392541885376, + "rewards/margins": 8.033090591430664, + "rewards/rejected": -4.892697811126709, + "step": 19597 + }, + { + "epoch": 4.9, + "grad_norm": 2.8486175537109375, + "learning_rate": 9.249525049472851e-09, + "logits/chosen": -0.5185086131095886, + "logits/rejected": -0.6278619170188904, + "logps/chosen": -58.54134750366211, + "logps/rejected": -133.1687469482422, + "loss": 0.5805, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8795199394226074, + "rewards/margins": 9.725152969360352, + "rewards/rejected": -6.845632553100586, + "step": 19598 + }, + { + "epoch": 4.9, + "grad_norm": 8.841936111450195, + "learning_rate": 9.201800293390795e-09, + "logits/chosen": -0.4640137255191803, + "logits/rejected": -0.5704891681671143, + "logps/chosen": -60.5413703918457, + "logps/rejected": -101.69734954833984, + "loss": 0.558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.207836866378784, + "rewards/margins": 7.42877197265625, + "rewards/rejected": -4.2209343910217285, + "step": 19599 + }, + { + "epoch": 4.9, + "grad_norm": 4.032752990722656, + "learning_rate": 9.154198865239938e-09, + "logits/chosen": -0.5047831535339355, + "logits/rejected": -0.5861281752586365, + "logps/chosen": -56.12830352783203, + "logps/rejected": -103.14274597167969, + "loss": 0.5888, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0839390754699707, + "rewards/margins": 7.273643493652344, + "rewards/rejected": -4.189704418182373, + "step": 19600 + }, + { + "epoch": 4.9, + "grad_norm": 4.363253593444824, + "learning_rate": 9.106720766197675e-09, + "logits/chosen": -0.5691378116607666, + "logits/rejected": -0.6519255638122559, + "logps/chosen": -49.11643981933594, + "logps/rejected": -108.3635025024414, + "loss": 0.6103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.158158302307129, + "rewards/margins": 7.6772356033325195, + "rewards/rejected": -4.519076824188232, + "step": 19601 + }, + { + "epoch": 4.9, + "grad_norm": 32.25926208496094, + "learning_rate": 9.059365997436952e-09, + "logits/chosen": -0.5196825861930847, + "logits/rejected": -0.5750522017478943, + "logps/chosen": -47.050689697265625, + "logps/rejected": -104.43578338623047, + "loss": 0.5424, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1155152320861816, + "rewards/margins": 7.223230361938477, + "rewards/rejected": -4.107715606689453, + "step": 19602 + }, + { + "epoch": 4.9, + "grad_norm": 5.8830437660217285, + "learning_rate": 9.01213456012795e-09, + "logits/chosen": -0.5135970115661621, + "logits/rejected": -0.5980327725410461, + "logps/chosen": -57.002777099609375, + "logps/rejected": -109.66905212402344, + "loss": 0.5661, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.917968273162842, + "rewards/margins": 7.898545265197754, + "rewards/rejected": -4.98057746887207, + "step": 19603 + }, + { + "epoch": 4.9, + "grad_norm": 4.3313093185424805, + "learning_rate": 8.965026455437509e-09, + "logits/chosen": -0.5422847867012024, + "logits/rejected": -0.6754631996154785, + "logps/chosen": -58.1230583190918, + "logps/rejected": -82.35771179199219, + "loss": 0.5787, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9455854892730713, + "rewards/margins": 6.967615127563477, + "rewards/rejected": -4.022029876708984, + "step": 19604 + }, + { + "epoch": 4.9, + "grad_norm": 4.270318031311035, + "learning_rate": 8.918041684530254e-09, + "logits/chosen": -0.5526494383811951, + "logits/rejected": -0.6330775022506714, + "logps/chosen": -58.1832275390625, + "logps/rejected": -129.05221557617188, + "loss": 0.6442, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.332444429397583, + "rewards/margins": 8.789521217346191, + "rewards/rejected": -5.457077503204346, + "step": 19605 + }, + { + "epoch": 4.9, + "grad_norm": 4.6750569343566895, + "learning_rate": 8.871180248566924e-09, + "logits/chosen": -0.6296460628509521, + "logits/rejected": -0.7076715230941772, + "logps/chosen": -63.51972579956055, + "logps/rejected": -105.07383728027344, + "loss": 0.7019, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0363967418670654, + "rewards/margins": 7.119646072387695, + "rewards/rejected": -4.083249092102051, + "step": 19606 + }, + { + "epoch": 4.9, + "grad_norm": 4.28045654296875, + "learning_rate": 8.82444214870548e-09, + "logits/chosen": -0.5213190913200378, + "logits/rejected": -0.5747971534729004, + "logps/chosen": -53.17302703857422, + "logps/rejected": -107.38827514648438, + "loss": 0.6208, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3451955318450928, + "rewards/margins": 6.045197486877441, + "rewards/rejected": -2.700002431869507, + "step": 19607 + }, + { + "epoch": 4.91, + "grad_norm": 4.9136528968811035, + "learning_rate": 8.777827386100557e-09, + "logits/chosen": -0.5766513347625732, + "logits/rejected": -0.6617374420166016, + "logps/chosen": -56.80118942260742, + "logps/rejected": -106.93273162841797, + "loss": 0.6324, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.241914987564087, + "rewards/margins": 8.084426879882812, + "rewards/rejected": -4.842512130737305, + "step": 19608 + }, + { + "epoch": 4.91, + "grad_norm": 5.184648513793945, + "learning_rate": 8.731335961905119e-09, + "logits/chosen": -0.5231931209564209, + "logits/rejected": -0.6201004981994629, + "logps/chosen": -58.81576156616211, + "logps/rejected": -118.70951843261719, + "loss": 0.6596, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1913185119628906, + "rewards/margins": 8.718429565429688, + "rewards/rejected": -5.527111053466797, + "step": 19609 + }, + { + "epoch": 4.91, + "grad_norm": 3.648858070373535, + "learning_rate": 8.684967877267137e-09, + "logits/chosen": -0.6208107471466064, + "logits/rejected": -0.6633743643760681, + "logps/chosen": -43.572044372558594, + "logps/rejected": -107.69073486328125, + "loss": 0.566, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3915042877197266, + "rewards/margins": 7.035323143005371, + "rewards/rejected": -3.6438188552856445, + "step": 19610 + }, + { + "epoch": 4.91, + "grad_norm": 6.293558120727539, + "learning_rate": 8.638723133331805e-09, + "logits/chosen": -0.6360980272293091, + "logits/rejected": -0.6960905194282532, + "logps/chosen": -43.95000457763672, + "logps/rejected": -107.17147827148438, + "loss": 0.6215, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2291617393493652, + "rewards/margins": 6.858351230621338, + "rewards/rejected": -3.6291892528533936, + "step": 19611 + }, + { + "epoch": 4.91, + "grad_norm": 14.58996868133545, + "learning_rate": 8.592601731243766e-09, + "logits/chosen": -0.5746086835861206, + "logits/rejected": -0.6274587512016296, + "logps/chosen": -49.38410568237305, + "logps/rejected": -128.46267700195312, + "loss": 0.6438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.899045467376709, + "rewards/margins": 7.7162766456604, + "rewards/rejected": -4.817231178283691, + "step": 19612 + }, + { + "epoch": 4.91, + "grad_norm": 3.558511734008789, + "learning_rate": 8.546603672140997e-09, + "logits/chosen": -0.5307379961013794, + "logits/rejected": -0.6102144122123718, + "logps/chosen": -49.912113189697266, + "logps/rejected": -107.7692642211914, + "loss": 0.5852, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0269386768341064, + "rewards/margins": 7.097142219543457, + "rewards/rejected": -4.0702033042907715, + "step": 19613 + }, + { + "epoch": 4.91, + "grad_norm": 4.600890159606934, + "learning_rate": 8.500728957160919e-09, + "logits/chosen": -0.5452814698219299, + "logits/rejected": -0.6312674880027771, + "logps/chosen": -59.815589904785156, + "logps/rejected": -107.47975158691406, + "loss": 0.6613, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.06103777885437, + "rewards/margins": 6.919877052307129, + "rewards/rejected": -3.8588387966156006, + "step": 19614 + }, + { + "epoch": 4.91, + "grad_norm": 37.147098541259766, + "learning_rate": 8.454977587437075e-09, + "logits/chosen": -0.6069300174713135, + "logits/rejected": -0.6519169807434082, + "logps/chosen": -53.55333709716797, + "logps/rejected": -109.41593933105469, + "loss": 0.7076, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.794252634048462, + "rewards/margins": 6.398055076599121, + "rewards/rejected": -3.603801965713501, + "step": 19615 + }, + { + "epoch": 4.91, + "grad_norm": 3.8698627948760986, + "learning_rate": 8.409349564099667e-09, + "logits/chosen": -0.541118860244751, + "logits/rejected": -0.664518415927887, + "logps/chosen": -51.6133918762207, + "logps/rejected": -93.10587310791016, + "loss": 0.6181, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7738852500915527, + "rewards/margins": 7.416254043579102, + "rewards/rejected": -4.642368316650391, + "step": 19616 + }, + { + "epoch": 4.91, + "grad_norm": 5.2424397468566895, + "learning_rate": 8.363844888276685e-09, + "logits/chosen": -0.48716673254966736, + "logits/rejected": -0.5784468054771423, + "logps/chosen": -61.207069396972656, + "logps/rejected": -101.49018859863281, + "loss": 0.6425, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0065760612487793, + "rewards/margins": 6.2705979347229, + "rewards/rejected": -3.264021635055542, + "step": 19617 + }, + { + "epoch": 4.91, + "grad_norm": 3.1816952228546143, + "learning_rate": 8.318463561092782e-09, + "logits/chosen": -0.5752683877944946, + "logits/rejected": -0.6629706621170044, + "logps/chosen": -66.00944519042969, + "logps/rejected": -106.78630828857422, + "loss": 0.6492, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1531546115875244, + "rewards/margins": 8.033773422241211, + "rewards/rejected": -4.880619049072266, + "step": 19618 + }, + { + "epoch": 4.91, + "grad_norm": 5.720361709594727, + "learning_rate": 8.27320558366873e-09, + "logits/chosen": -0.5054047703742981, + "logits/rejected": -0.5467722415924072, + "logps/chosen": -42.999088287353516, + "logps/rejected": -131.4849853515625, + "loss": 0.4922, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9952282905578613, + "rewards/margins": 8.961726188659668, + "rewards/rejected": -5.966497898101807, + "step": 19619 + }, + { + "epoch": 4.91, + "grad_norm": 18.415138244628906, + "learning_rate": 8.228070957123634e-09, + "logits/chosen": -0.5467787384986877, + "logits/rejected": -0.6051808595657349, + "logps/chosen": -63.12647247314453, + "logps/rejected": -112.08892059326172, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1538186073303223, + "rewards/margins": 6.352094650268555, + "rewards/rejected": -3.1982765197753906, + "step": 19620 + }, + { + "epoch": 4.91, + "grad_norm": 2.484511375427246, + "learning_rate": 8.183059682571604e-09, + "logits/chosen": -0.6127849221229553, + "logits/rejected": -0.6999520063400269, + "logps/chosen": -45.66115188598633, + "logps/rejected": -121.26570892333984, + "loss": 0.5128, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.26572322845459, + "rewards/margins": 8.820115089416504, + "rewards/rejected": -5.554391384124756, + "step": 19621 + }, + { + "epoch": 4.91, + "grad_norm": 2.5972177982330322, + "learning_rate": 8.13817176112619e-09, + "logits/chosen": -0.5343446135520935, + "logits/rejected": -0.5997374653816223, + "logps/chosen": -54.29744338989258, + "logps/rejected": -105.82598876953125, + "loss": 0.6072, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9599533081054688, + "rewards/margins": 6.9809980392456055, + "rewards/rejected": -4.021044731140137, + "step": 19622 + }, + { + "epoch": 4.91, + "grad_norm": 6.811220169067383, + "learning_rate": 8.093407193895952e-09, + "logits/chosen": -0.5597851872444153, + "logits/rejected": -0.6127391457557678, + "logps/chosen": -81.71128845214844, + "logps/rejected": -104.4284439086914, + "loss": 0.7807, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.7137644290924072, + "rewards/margins": 5.978457450866699, + "rewards/rejected": -3.264692783355713, + "step": 19623 + }, + { + "epoch": 4.91, + "grad_norm": 3.6477324962615967, + "learning_rate": 8.048765981987783e-09, + "logits/chosen": -0.5946298241615295, + "logits/rejected": -0.6650568246841431, + "logps/chosen": -46.434112548828125, + "logps/rejected": -105.67192077636719, + "loss": 0.5454, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.372574806213379, + "rewards/margins": 7.150493621826172, + "rewards/rejected": -3.7779183387756348, + "step": 19624 + }, + { + "epoch": 4.91, + "grad_norm": 15.85655689239502, + "learning_rate": 8.004248126503578e-09, + "logits/chosen": -0.6475638747215271, + "logits/rejected": -0.742038369178772, + "logps/chosen": -47.03190612792969, + "logps/rejected": -113.05256652832031, + "loss": 0.5415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.935192108154297, + "rewards/margins": 8.146323204040527, + "rewards/rejected": -5.211130619049072, + "step": 19625 + }, + { + "epoch": 4.91, + "grad_norm": 4.983323574066162, + "learning_rate": 7.959853628544679e-09, + "logits/chosen": -0.5776558518409729, + "logits/rejected": -0.6469420194625854, + "logps/chosen": -69.20645904541016, + "logps/rejected": -116.63848876953125, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9511678218841553, + "rewards/margins": 7.456502437591553, + "rewards/rejected": -4.50533390045166, + "step": 19626 + }, + { + "epoch": 4.91, + "grad_norm": 3.0706372261047363, + "learning_rate": 7.915582489206875e-09, + "logits/chosen": -0.5099798440933228, + "logits/rejected": -0.619968056678772, + "logps/chosen": -54.90460205078125, + "logps/rejected": -94.45510864257812, + "loss": 0.6168, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.910287380218506, + "rewards/margins": 6.736643314361572, + "rewards/rejected": -3.8263559341430664, + "step": 19627 + }, + { + "epoch": 4.91, + "grad_norm": 8.005891799926758, + "learning_rate": 7.871434709585401e-09, + "logits/chosen": -0.5209652781486511, + "logits/rejected": -0.6470969915390015, + "logps/chosen": -59.38130569458008, + "logps/rejected": -100.29035186767578, + "loss": 0.6318, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9615602493286133, + "rewards/margins": 7.498471260070801, + "rewards/rejected": -4.536910533905029, + "step": 19628 + }, + { + "epoch": 4.91, + "grad_norm": 4.112597465515137, + "learning_rate": 7.827410290770499e-09, + "logits/chosen": -0.5706424713134766, + "logits/rejected": -0.6496486663818359, + "logps/chosen": -51.00770950317383, + "logps/rejected": -115.81278991699219, + "loss": 0.5575, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.914412021636963, + "rewards/margins": 7.855386734008789, + "rewards/rejected": -4.940974235534668, + "step": 19629 + }, + { + "epoch": 4.91, + "grad_norm": 3.773902416229248, + "learning_rate": 7.783509233850184e-09, + "logits/chosen": -0.5488347411155701, + "logits/rejected": -0.6587442755699158, + "logps/chosen": -56.76375961303711, + "logps/rejected": -118.46674346923828, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9976272583007812, + "rewards/margins": 8.097393035888672, + "rewards/rejected": -5.099766731262207, + "step": 19630 + }, + { + "epoch": 4.91, + "grad_norm": 4.698318958282471, + "learning_rate": 7.7397315399097e-09, + "logits/chosen": -0.4828244745731354, + "logits/rejected": -0.5568464994430542, + "logps/chosen": -49.121707916259766, + "logps/rejected": -99.55707550048828, + "loss": 0.5939, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0898261070251465, + "rewards/margins": 6.757544994354248, + "rewards/rejected": -3.6677191257476807, + "step": 19631 + }, + { + "epoch": 4.91, + "grad_norm": 6.553982734680176, + "learning_rate": 7.69607721002985e-09, + "logits/chosen": -0.5663673281669617, + "logits/rejected": -0.6758407950401306, + "logps/chosen": -48.05876541137695, + "logps/rejected": -96.47254943847656, + "loss": 0.5967, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1732046604156494, + "rewards/margins": 7.500577926635742, + "rewards/rejected": -4.3273725509643555, + "step": 19632 + }, + { + "epoch": 4.91, + "grad_norm": 12.971745491027832, + "learning_rate": 7.652546245290327e-09, + "logits/chosen": -0.5120957493782043, + "logits/rejected": -0.5684236884117126, + "logps/chosen": -72.52082824707031, + "logps/rejected": -119.25878143310547, + "loss": 0.6777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.139194965362549, + "rewards/margins": 6.456406116485596, + "rewards/rejected": -3.3172109127044678, + "step": 19633 + }, + { + "epoch": 4.91, + "grad_norm": 36.04370880126953, + "learning_rate": 7.609138646766378e-09, + "logits/chosen": -0.5722464323043823, + "logits/rejected": -0.6761003136634827, + "logps/chosen": -65.955078125, + "logps/rejected": -100.59871673583984, + "loss": 0.6281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1594879627227783, + "rewards/margins": 7.7981719970703125, + "rewards/rejected": -4.638684272766113, + "step": 19634 + }, + { + "epoch": 4.91, + "grad_norm": 5.958404064178467, + "learning_rate": 7.565854415531037e-09, + "logits/chosen": -0.5625269412994385, + "logits/rejected": -0.6551169753074646, + "logps/chosen": -56.23522186279297, + "logps/rejected": -119.32505798339844, + "loss": 0.5251, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.087688446044922, + "rewards/margins": 7.878283500671387, + "rewards/rejected": -4.790594577789307, + "step": 19635 + }, + { + "epoch": 4.91, + "grad_norm": 4.937069416046143, + "learning_rate": 7.522693552652894e-09, + "logits/chosen": -0.5622312426567078, + "logits/rejected": -0.6807804107666016, + "logps/chosen": -46.55276107788086, + "logps/rejected": -84.003173828125, + "loss": 0.5852, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.207699775695801, + "rewards/margins": 6.868174076080322, + "rewards/rejected": -3.6604745388031006, + "step": 19636 + }, + { + "epoch": 4.91, + "grad_norm": 6.184274673461914, + "learning_rate": 7.47965605919998e-09, + "logits/chosen": -0.5595077276229858, + "logits/rejected": -0.6602972745895386, + "logps/chosen": -55.82647705078125, + "logps/rejected": -116.35057830810547, + "loss": 0.6721, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.155441999435425, + "rewards/margins": 7.858023643493652, + "rewards/rejected": -4.702580451965332, + "step": 19637 + }, + { + "epoch": 4.91, + "grad_norm": 4.605833530426025, + "learning_rate": 7.436741936234782e-09, + "logits/chosen": -0.5131718516349792, + "logits/rejected": -0.6093792915344238, + "logps/chosen": -51.91658020019531, + "logps/rejected": -100.31121826171875, + "loss": 0.5661, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1189894676208496, + "rewards/margins": 6.632536888122559, + "rewards/rejected": -3.51354718208313, + "step": 19638 + }, + { + "epoch": 4.91, + "grad_norm": 7.006387233734131, + "learning_rate": 7.393951184817561e-09, + "logits/chosen": -0.47858601808547974, + "logits/rejected": -0.5755262970924377, + "logps/chosen": -55.8671875, + "logps/rejected": -113.15380096435547, + "loss": 0.5698, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9775795936584473, + "rewards/margins": 7.333829402923584, + "rewards/rejected": -4.356250286102295, + "step": 19639 + }, + { + "epoch": 4.91, + "grad_norm": 3.7952535152435303, + "learning_rate": 7.351283806006915e-09, + "logits/chosen": -0.5569069981575012, + "logits/rejected": -0.594551682472229, + "logps/chosen": -50.62356948852539, + "logps/rejected": -114.55502319335938, + "loss": 0.6291, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5029377937316895, + "rewards/margins": 7.431220054626465, + "rewards/rejected": -3.9282822608947754, + "step": 19640 + }, + { + "epoch": 4.91, + "grad_norm": 4.353528022766113, + "learning_rate": 7.3087398008564455e-09, + "logits/chosen": -0.5610135793685913, + "logits/rejected": -0.62517249584198, + "logps/chosen": -49.125431060791016, + "logps/rejected": -117.41618347167969, + "loss": 0.6199, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.466937780380249, + "rewards/margins": 8.526458740234375, + "rewards/rejected": -5.059520244598389, + "step": 19641 + }, + { + "epoch": 4.91, + "grad_norm": 5.055850505828857, + "learning_rate": 7.26631917041698e-09, + "logits/chosen": -0.5218246579170227, + "logits/rejected": -0.5786400437355042, + "logps/chosen": -56.704124450683594, + "logps/rejected": -106.0382308959961, + "loss": 0.6849, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1731631755828857, + "rewards/margins": 7.267092704772949, + "rewards/rejected": -4.093929767608643, + "step": 19642 + }, + { + "epoch": 4.91, + "grad_norm": 8.647644996643066, + "learning_rate": 7.224021915737678e-09, + "logits/chosen": -0.5469570159912109, + "logits/rejected": -0.6405819654464722, + "logps/chosen": -54.620872497558594, + "logps/rejected": -118.21611022949219, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.824723958969116, + "rewards/margins": 8.27989387512207, + "rewards/rejected": -5.455169677734375, + "step": 19643 + }, + { + "epoch": 4.91, + "grad_norm": 3.8954999446868896, + "learning_rate": 7.181848037862704e-09, + "logits/chosen": -0.5602114796638489, + "logits/rejected": -0.641968846321106, + "logps/chosen": -51.329429626464844, + "logps/rejected": -110.90492248535156, + "loss": 0.5901, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.147874355316162, + "rewards/margins": 7.540657997131348, + "rewards/rejected": -4.392783164978027, + "step": 19644 + }, + { + "epoch": 4.91, + "grad_norm": 4.7972307205200195, + "learning_rate": 7.1397975378356686e-09, + "logits/chosen": -0.5090748071670532, + "logits/rejected": -0.5932956337928772, + "logps/chosen": -66.0979232788086, + "logps/rejected": -104.34442138671875, + "loss": 0.6575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.25740385055542, + "rewards/margins": 7.631781578063965, + "rewards/rejected": -4.374378681182861, + "step": 19645 + }, + { + "epoch": 4.91, + "grad_norm": 2.279971122741699, + "learning_rate": 7.09787041669463e-09, + "logits/chosen": -0.491044282913208, + "logits/rejected": -0.5754399299621582, + "logps/chosen": -54.42174530029297, + "logps/rejected": -106.36161041259766, + "loss": 0.5439, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.006944417953491, + "rewards/margins": 7.461134910583496, + "rewards/rejected": -4.454190731048584, + "step": 19646 + }, + { + "epoch": 4.91, + "grad_norm": 11.607263565063477, + "learning_rate": 7.056066675475426e-09, + "logits/chosen": -0.5021153092384338, + "logits/rejected": -0.5804027915000916, + "logps/chosen": -56.137451171875, + "logps/rejected": -104.85771179199219, + "loss": 0.5419, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.146311044692993, + "rewards/margins": 7.652774810791016, + "rewards/rejected": -4.506463050842285, + "step": 19647 + }, + { + "epoch": 4.92, + "grad_norm": 6.586021423339844, + "learning_rate": 7.014386315212229e-09, + "logits/chosen": -0.5587035417556763, + "logits/rejected": -0.6491225957870483, + "logps/chosen": -44.63764572143555, + "logps/rejected": -93.99063873291016, + "loss": 0.5833, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.839354991912842, + "rewards/margins": 6.698659896850586, + "rewards/rejected": -3.859304666519165, + "step": 19648 + }, + { + "epoch": 4.92, + "grad_norm": 5.329760551452637, + "learning_rate": 6.972829336933662e-09, + "logits/chosen": -0.5830012559890747, + "logits/rejected": -0.6632869243621826, + "logps/chosen": -57.00898742675781, + "logps/rejected": -122.56134033203125, + "loss": 0.724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1334781646728516, + "rewards/margins": 8.59737491607666, + "rewards/rejected": -5.463897705078125, + "step": 19649 + }, + { + "epoch": 4.92, + "grad_norm": 2.896148443222046, + "learning_rate": 6.9313957416677904e-09, + "logits/chosen": -0.5750166177749634, + "logits/rejected": -0.6186057329177856, + "logps/chosen": -50.029972076416016, + "logps/rejected": -98.76642608642578, + "loss": 0.5255, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4824891090393066, + "rewards/margins": 6.815540790557861, + "rewards/rejected": -3.3330516815185547, + "step": 19650 + }, + { + "epoch": 4.92, + "grad_norm": 20.030029296875, + "learning_rate": 6.890085530437685e-09, + "logits/chosen": -0.6004984378814697, + "logits/rejected": -0.6579149961471558, + "logps/chosen": -46.4021110534668, + "logps/rejected": -106.80109405517578, + "loss": 0.5908, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0916695594787598, + "rewards/margins": 7.36973237991333, + "rewards/rejected": -4.278062343597412, + "step": 19651 + }, + { + "epoch": 4.92, + "grad_norm": 5.30272102355957, + "learning_rate": 6.84889870426475e-09, + "logits/chosen": -0.4861622750759125, + "logits/rejected": -0.5858051180839539, + "logps/chosen": -67.36188507080078, + "logps/rejected": -102.33836364746094, + "loss": 0.5987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.136544704437256, + "rewards/margins": 6.451876640319824, + "rewards/rejected": -3.315331220626831, + "step": 19652 + }, + { + "epoch": 4.92, + "grad_norm": 2.7959611415863037, + "learning_rate": 6.807835264166507e-09, + "logits/chosen": -0.5710445642471313, + "logits/rejected": -0.6924228072166443, + "logps/chosen": -61.27264404296875, + "logps/rejected": -102.73627471923828, + "loss": 0.5997, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2145328521728516, + "rewards/margins": 7.946737289428711, + "rewards/rejected": -4.732204437255859, + "step": 19653 + }, + { + "epoch": 4.92, + "grad_norm": 9.738204002380371, + "learning_rate": 6.766895211157143e-09, + "logits/chosen": -0.533559262752533, + "logits/rejected": -0.545644998550415, + "logps/chosen": -49.78019714355469, + "logps/rejected": -124.55815124511719, + "loss": 0.6709, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.833231210708618, + "rewards/margins": 6.781472206115723, + "rewards/rejected": -3.9482412338256836, + "step": 19654 + }, + { + "epoch": 4.92, + "grad_norm": 6.608747959136963, + "learning_rate": 6.726078546249737e-09, + "logits/chosen": -0.5351329445838928, + "logits/rejected": -0.6126810908317566, + "logps/chosen": -46.28847885131836, + "logps/rejected": -90.8150634765625, + "loss": 0.6325, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0819337368011475, + "rewards/margins": 5.474918842315674, + "rewards/rejected": -2.3929853439331055, + "step": 19655 + }, + { + "epoch": 4.92, + "grad_norm": 5.053402900695801, + "learning_rate": 6.685385270451261e-09, + "logits/chosen": -0.5794382095336914, + "logits/rejected": -0.6908150911331177, + "logps/chosen": -59.46539306640625, + "logps/rejected": -115.28538513183594, + "loss": 0.6119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2025880813598633, + "rewards/margins": 7.410341262817383, + "rewards/rejected": -4.207753658294678, + "step": 19656 + }, + { + "epoch": 4.92, + "grad_norm": 7.884948253631592, + "learning_rate": 6.644815384768133e-09, + "logits/chosen": -0.5082374811172485, + "logits/rejected": -0.6033357977867126, + "logps/chosen": -55.16423797607422, + "logps/rejected": -107.09197998046875, + "loss": 0.5448, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.864809036254883, + "rewards/margins": 7.74800968170166, + "rewards/rejected": -4.883200168609619, + "step": 19657 + }, + { + "epoch": 4.92, + "grad_norm": 4.964375972747803, + "learning_rate": 6.604368890202884e-09, + "logits/chosen": -0.5170516967773438, + "logits/rejected": -0.6363673806190491, + "logps/chosen": -56.00178527832031, + "logps/rejected": -113.8023681640625, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2428321838378906, + "rewards/margins": 7.604375839233398, + "rewards/rejected": -4.361543655395508, + "step": 19658 + }, + { + "epoch": 4.92, + "grad_norm": 11.72846508026123, + "learning_rate": 6.564045787755269e-09, + "logits/chosen": -0.531330943107605, + "logits/rejected": -0.65102219581604, + "logps/chosen": -72.07068634033203, + "logps/rejected": -82.63782501220703, + "loss": 0.6958, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2368874549865723, + "rewards/margins": 6.183656215667725, + "rewards/rejected": -2.9467689990997314, + "step": 19659 + }, + { + "epoch": 4.92, + "grad_norm": 3.6155688762664795, + "learning_rate": 6.523846078420604e-09, + "logits/chosen": -0.4747677743434906, + "logits/rejected": -0.5542911887168884, + "logps/chosen": -47.15605545043945, + "logps/rejected": -98.8667984008789, + "loss": 0.5938, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3652305603027344, + "rewards/margins": 6.699557304382324, + "rewards/rejected": -3.3343265056610107, + "step": 19660 + }, + { + "epoch": 4.92, + "grad_norm": 9.575922966003418, + "learning_rate": 6.483769763193648e-09, + "logits/chosen": -0.5056071877479553, + "logits/rejected": -0.5824534296989441, + "logps/chosen": -54.92601776123047, + "logps/rejected": -123.89927673339844, + "loss": 0.6757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0994510650634766, + "rewards/margins": 7.843395233154297, + "rewards/rejected": -4.74394416809082, + "step": 19661 + }, + { + "epoch": 4.92, + "grad_norm": 7.538596153259277, + "learning_rate": 6.443816843063611e-09, + "logits/chosen": -0.5183352828025818, + "logits/rejected": -0.5786519646644592, + "logps/chosen": -56.448646545410156, + "logps/rejected": -107.16349792480469, + "loss": 0.6775, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7317745685577393, + "rewards/margins": 6.6971869468688965, + "rewards/rejected": -3.9654126167297363, + "step": 19662 + }, + { + "epoch": 4.92, + "grad_norm": 4.8301262855529785, + "learning_rate": 6.4039873190185895e-09, + "logits/chosen": -0.6011179685592651, + "logits/rejected": -0.6847885251045227, + "logps/chosen": -52.30531311035156, + "logps/rejected": -92.50350952148438, + "loss": 0.7031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8876442909240723, + "rewards/margins": 6.886597633361816, + "rewards/rejected": -3.9989538192749023, + "step": 19663 + }, + { + "epoch": 4.92, + "grad_norm": 7.304162502288818, + "learning_rate": 6.364281192042243e-09, + "logits/chosen": -0.519177258014679, + "logits/rejected": -0.5984573364257812, + "logps/chosen": -54.73836898803711, + "logps/rejected": -117.00116729736328, + "loss": 0.6164, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2656593322753906, + "rewards/margins": 8.645674705505371, + "rewards/rejected": -5.380014419555664, + "step": 19664 + }, + { + "epoch": 4.92, + "grad_norm": 2.680079936981201, + "learning_rate": 6.324698463116008e-09, + "logits/chosen": -0.5313929319381714, + "logits/rejected": -0.6250284910202026, + "logps/chosen": -57.27608871459961, + "logps/rejected": -110.69843292236328, + "loss": 0.6126, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3302090167999268, + "rewards/margins": 7.499123573303223, + "rewards/rejected": -4.168915271759033, + "step": 19665 + }, + { + "epoch": 4.92, + "grad_norm": 4.807779788970947, + "learning_rate": 6.285239133218546e-09, + "logits/chosen": -0.5618313550949097, + "logits/rejected": -0.6063086986541748, + "logps/chosen": -59.63079071044922, + "logps/rejected": -111.54669189453125, + "loss": 0.6112, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8883488178253174, + "rewards/margins": 6.154130935668945, + "rewards/rejected": -3.265782356262207, + "step": 19666 + }, + { + "epoch": 4.92, + "grad_norm": 3.7957675457000732, + "learning_rate": 6.245903203323522e-09, + "logits/chosen": -0.5255860090255737, + "logits/rejected": -0.6262353658676147, + "logps/chosen": -57.10375213623047, + "logps/rejected": -128.5950164794922, + "loss": 0.6507, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3118538856506348, + "rewards/margins": 8.240586280822754, + "rewards/rejected": -4.928732872009277, + "step": 19667 + }, + { + "epoch": 4.92, + "grad_norm": 12.710972785949707, + "learning_rate": 6.206690674404048e-09, + "logits/chosen": -0.6012190580368042, + "logits/rejected": -0.6676555871963501, + "logps/chosen": -52.8470344543457, + "logps/rejected": -124.82377624511719, + "loss": 0.5873, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9347827434539795, + "rewards/margins": 8.292854309082031, + "rewards/rejected": -5.358071804046631, + "step": 19668 + }, + { + "epoch": 4.92, + "grad_norm": 2.528687000274658, + "learning_rate": 6.167601547429347e-09, + "logits/chosen": -0.643599271774292, + "logits/rejected": -0.7271884679794312, + "logps/chosen": -52.25829315185547, + "logps/rejected": -96.55765533447266, + "loss": 0.5856, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2887215614318848, + "rewards/margins": 8.073995590209961, + "rewards/rejected": -4.785274505615234, + "step": 19669 + }, + { + "epoch": 4.92, + "grad_norm": 6.249475002288818, + "learning_rate": 6.128635823364204e-09, + "logits/chosen": -0.587612509727478, + "logits/rejected": -0.6332625150680542, + "logps/chosen": -55.851051330566406, + "logps/rejected": -111.18655395507812, + "loss": 0.6809, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2979047298431396, + "rewards/margins": 7.844470500946045, + "rewards/rejected": -4.546566486358643, + "step": 19670 + }, + { + "epoch": 4.92, + "grad_norm": 5.001277446746826, + "learning_rate": 6.089793503172847e-09, + "logits/chosen": -0.5620468854904175, + "logits/rejected": -0.6184298396110535, + "logps/chosen": -49.6595573425293, + "logps/rejected": -98.20109558105469, + "loss": 0.6168, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3254904747009277, + "rewards/margins": 6.678439617156982, + "rewards/rejected": -3.3529491424560547, + "step": 19671 + }, + { + "epoch": 4.92, + "grad_norm": 7.494732856750488, + "learning_rate": 6.051074587813954e-09, + "logits/chosen": -0.6043408513069153, + "logits/rejected": -0.6884725689888, + "logps/chosen": -44.496856689453125, + "logps/rejected": -108.88544464111328, + "loss": 0.6641, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1561174392700195, + "rewards/margins": 8.342202186584473, + "rewards/rejected": -5.186085224151611, + "step": 19672 + }, + { + "epoch": 4.92, + "grad_norm": 3.1021430492401123, + "learning_rate": 6.012479078245093e-09, + "logits/chosen": -0.5735145211219788, + "logits/rejected": -0.6717727184295654, + "logps/chosen": -49.10157012939453, + "logps/rejected": -118.28982543945312, + "loss": 0.5765, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2710256576538086, + "rewards/margins": 8.543670654296875, + "rewards/rejected": -5.272644996643066, + "step": 19673 + }, + { + "epoch": 4.92, + "grad_norm": 2.199639320373535, + "learning_rate": 5.9740069754199435e-09, + "logits/chosen": -0.5480499863624573, + "logits/rejected": -0.6253645420074463, + "logps/chosen": -48.892757415771484, + "logps/rejected": -111.89405822753906, + "loss": 0.5116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9215691089630127, + "rewards/margins": 7.330368995666504, + "rewards/rejected": -4.40880012512207, + "step": 19674 + }, + { + "epoch": 4.92, + "grad_norm": 7.21584939956665, + "learning_rate": 5.935658280288303e-09, + "logits/chosen": -0.45348721742630005, + "logits/rejected": -0.586353600025177, + "logps/chosen": -58.7704963684082, + "logps/rejected": -81.51287841796875, + "loss": 0.6842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.388956069946289, + "rewards/margins": 6.079663276672363, + "rewards/rejected": -2.6907074451446533, + "step": 19675 + }, + { + "epoch": 4.92, + "grad_norm": 5.509186267852783, + "learning_rate": 5.8974329937988574e-09, + "logits/chosen": -0.5571819543838501, + "logits/rejected": -0.6715492606163025, + "logps/chosen": -56.57472229003906, + "logps/rejected": -81.87967681884766, + "loss": 0.6657, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8164942264556885, + "rewards/margins": 6.761975288391113, + "rewards/rejected": -3.9454803466796875, + "step": 19676 + }, + { + "epoch": 4.92, + "grad_norm": 13.505621910095215, + "learning_rate": 5.85933111689585e-09, + "logits/chosen": -0.5433725118637085, + "logits/rejected": -0.6465988159179688, + "logps/chosen": -51.151214599609375, + "logps/rejected": -108.75440979003906, + "loss": 0.7076, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.30889630317688, + "rewards/margins": 7.806920528411865, + "rewards/rejected": -4.498023986816406, + "step": 19677 + }, + { + "epoch": 4.92, + "grad_norm": 2.831141233444214, + "learning_rate": 5.821352650520751e-09, + "logits/chosen": -0.626658022403717, + "logits/rejected": -0.6671190857887268, + "logps/chosen": -50.720008850097656, + "logps/rejected": -114.37222290039062, + "loss": 0.6132, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3021812438964844, + "rewards/margins": 8.296656608581543, + "rewards/rejected": -4.994474411010742, + "step": 19678 + }, + { + "epoch": 4.92, + "grad_norm": 3.387083053588867, + "learning_rate": 5.783497595611698e-09, + "logits/chosen": -0.5238406658172607, + "logits/rejected": -0.5859683752059937, + "logps/chosen": -71.3708267211914, + "logps/rejected": -101.5954818725586, + "loss": 0.7152, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.674964427947998, + "rewards/margins": 6.518228530883789, + "rewards/rejected": -3.8432633876800537, + "step": 19679 + }, + { + "epoch": 4.92, + "grad_norm": 12.271618843078613, + "learning_rate": 5.7457659531051645e-09, + "logits/chosen": -0.5348271131515503, + "logits/rejected": -0.6395944356918335, + "logps/chosen": -55.678707122802734, + "logps/rejected": -102.1220703125, + "loss": 0.696, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.069462537765503, + "rewards/margins": 7.526381492614746, + "rewards/rejected": -4.456918716430664, + "step": 19680 + }, + { + "epoch": 4.92, + "grad_norm": 3.8014111518859863, + "learning_rate": 5.708157723932628e-09, + "logits/chosen": -0.5925593376159668, + "logits/rejected": -0.6228028535842896, + "logps/chosen": -54.74537658691406, + "logps/rejected": -116.06568908691406, + "loss": 0.7012, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.093952178955078, + "rewards/margins": 6.987674236297607, + "rewards/rejected": -3.8937222957611084, + "step": 19681 + }, + { + "epoch": 4.92, + "grad_norm": 4.815577507019043, + "learning_rate": 5.670672909023345e-09, + "logits/chosen": -0.5133297443389893, + "logits/rejected": -0.5968696475028992, + "logps/chosen": -53.25700378417969, + "logps/rejected": -91.63623046875, + "loss": 0.6575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.195060968399048, + "rewards/margins": 7.256677627563477, + "rewards/rejected": -4.061616897583008, + "step": 19682 + }, + { + "epoch": 4.92, + "grad_norm": 2.9171900749206543, + "learning_rate": 5.633311509304351e-09, + "logits/chosen": -0.6071770191192627, + "logits/rejected": -0.6935598850250244, + "logps/chosen": -49.58186721801758, + "logps/rejected": -109.21697998046875, + "loss": 0.5926, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.283679962158203, + "rewards/margins": 7.513803482055664, + "rewards/rejected": -4.230123996734619, + "step": 19683 + }, + { + "epoch": 4.92, + "grad_norm": 5.237293720245361, + "learning_rate": 5.596073525698242e-09, + "logits/chosen": -0.5387436747550964, + "logits/rejected": -0.6344295144081116, + "logps/chosen": -59.65556716918945, + "logps/rejected": -110.25384521484375, + "loss": 0.6518, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7980151176452637, + "rewards/margins": 7.953802585601807, + "rewards/rejected": -5.155787944793701, + "step": 19684 + }, + { + "epoch": 4.92, + "grad_norm": 2.1857786178588867, + "learning_rate": 5.558958959125393e-09, + "logits/chosen": -0.5048389434814453, + "logits/rejected": -0.6068804860115051, + "logps/chosen": -60.79168701171875, + "logps/rejected": -128.08468627929688, + "loss": 0.5199, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989790439605713, + "rewards/margins": 8.760323524475098, + "rewards/rejected": -5.770533561706543, + "step": 19685 + }, + { + "epoch": 4.92, + "grad_norm": 5.450181484222412, + "learning_rate": 5.521967810502848e-09, + "logits/chosen": -0.5394347310066223, + "logits/rejected": -0.636028528213501, + "logps/chosen": -61.47998046875, + "logps/rejected": -107.80363464355469, + "loss": 0.6442, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.140347957611084, + "rewards/margins": 6.89909553527832, + "rewards/rejected": -3.7587473392486572, + "step": 19686 + }, + { + "epoch": 4.92, + "grad_norm": 4.2035441398620605, + "learning_rate": 5.485100080745431e-09, + "logits/chosen": -0.5566946268081665, + "logits/rejected": -0.626715362071991, + "logps/chosen": -47.62281799316406, + "logps/rejected": -103.62062072753906, + "loss": 0.6065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3561885356903076, + "rewards/margins": 7.119314193725586, + "rewards/rejected": -3.7631263732910156, + "step": 19687 + }, + { + "epoch": 4.93, + "grad_norm": 4.970319747924805, + "learning_rate": 5.44835577076297e-09, + "logits/chosen": -0.5306875705718994, + "logits/rejected": -0.6061429977416992, + "logps/chosen": -55.90033721923828, + "logps/rejected": -109.92877960205078, + "loss": 0.6026, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.557349920272827, + "rewards/margins": 6.727991580963135, + "rewards/rejected": -4.170641899108887, + "step": 19688 + }, + { + "epoch": 4.93, + "grad_norm": 6.241549968719482, + "learning_rate": 5.411734881464736e-09, + "logits/chosen": -0.6011619567871094, + "logits/rejected": -0.6989249587059021, + "logps/chosen": -51.470489501953125, + "logps/rejected": -104.80653381347656, + "loss": 0.6309, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.255978584289551, + "rewards/margins": 7.156492233276367, + "rewards/rejected": -3.9005134105682373, + "step": 19689 + }, + { + "epoch": 4.93, + "grad_norm": 5.6256232261657715, + "learning_rate": 5.3752374137544524e-09, + "logits/chosen": -0.5349841117858887, + "logits/rejected": -0.5967521071434021, + "logps/chosen": -55.80133819580078, + "logps/rejected": -114.83978271484375, + "loss": 0.6305, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3414366245269775, + "rewards/margins": 7.1909708976745605, + "rewards/rejected": -3.849534034729004, + "step": 19690 + }, + { + "epoch": 4.93, + "grad_norm": 5.4845290184021, + "learning_rate": 5.3388633685347304e-09, + "logits/chosen": -0.5209259390830994, + "logits/rejected": -0.6462821364402771, + "logps/chosen": -47.36576843261719, + "logps/rejected": -95.38267517089844, + "loss": 0.5599, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.956900119781494, + "rewards/margins": 7.520810604095459, + "rewards/rejected": -4.563910007476807, + "step": 19691 + }, + { + "epoch": 4.93, + "grad_norm": 17.745500564575195, + "learning_rate": 5.302612746704294e-09, + "logits/chosen": -0.5281850695610046, + "logits/rejected": -0.5992164015769958, + "logps/chosen": -59.388633728027344, + "logps/rejected": -99.64594268798828, + "loss": 0.6338, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6035876274108887, + "rewards/margins": 6.846571445465088, + "rewards/rejected": -4.242983818054199, + "step": 19692 + }, + { + "epoch": 4.93, + "grad_norm": 7.111947536468506, + "learning_rate": 5.266485549159095e-09, + "logits/chosen": -0.621502697467804, + "logits/rejected": -0.7233563661575317, + "logps/chosen": -47.322139739990234, + "logps/rejected": -99.4853515625, + "loss": 0.4963, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3682045936584473, + "rewards/margins": 6.89409065246582, + "rewards/rejected": -3.5258867740631104, + "step": 19693 + }, + { + "epoch": 4.93, + "grad_norm": 3.0602264404296875, + "learning_rate": 5.230481776791196e-09, + "logits/chosen": -0.5491205453872681, + "logits/rejected": -0.6542165279388428, + "logps/chosen": -51.5541877746582, + "logps/rejected": -116.20832824707031, + "loss": 0.5433, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0018372535705566, + "rewards/margins": 8.423775672912598, + "rewards/rejected": -5.421937942504883, + "step": 19694 + }, + { + "epoch": 4.93, + "grad_norm": 6.106473445892334, + "learning_rate": 5.194601430490997e-09, + "logits/chosen": -0.5859546661376953, + "logits/rejected": -0.667022705078125, + "logps/chosen": -50.09598922729492, + "logps/rejected": -109.6331558227539, + "loss": 0.6137, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1886417865753174, + "rewards/margins": 7.699746131896973, + "rewards/rejected": -4.511105537414551, + "step": 19695 + }, + { + "epoch": 4.93, + "grad_norm": 8.28647518157959, + "learning_rate": 5.1588445111450115e-09, + "logits/chosen": -0.5417765974998474, + "logits/rejected": -0.6392096281051636, + "logps/chosen": -45.098350524902344, + "logps/rejected": -104.98778533935547, + "loss": 0.5719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.902804374694824, + "rewards/margins": 8.867216110229492, + "rewards/rejected": -5.964410781860352, + "step": 19696 + }, + { + "epoch": 4.93, + "grad_norm": 25.26068687438965, + "learning_rate": 5.123211019636421e-09, + "logits/chosen": -0.5866009593009949, + "logits/rejected": -0.6551081538200378, + "logps/chosen": -52.733551025390625, + "logps/rejected": -103.45260620117188, + "loss": 0.7484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9448556900024414, + "rewards/margins": 6.532079219818115, + "rewards/rejected": -3.587223529815674, + "step": 19697 + }, + { + "epoch": 4.93, + "grad_norm": 11.397214889526367, + "learning_rate": 5.0877009568467414e-09, + "logits/chosen": -0.5819478034973145, + "logits/rejected": -0.6588136553764343, + "logps/chosen": -57.55268478393555, + "logps/rejected": -117.2481689453125, + "loss": 0.6951, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.9013118743896484, + "rewards/margins": 7.1416802406311035, + "rewards/rejected": -4.240368366241455, + "step": 19698 + }, + { + "epoch": 4.93, + "grad_norm": 3.2927777767181396, + "learning_rate": 5.052314323652496e-09, + "logits/chosen": -0.5588889718055725, + "logits/rejected": -0.6301877498626709, + "logps/chosen": -49.45201110839844, + "logps/rejected": -101.52471160888672, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1937615871429443, + "rewards/margins": 7.354394912719727, + "rewards/rejected": -4.160633563995361, + "step": 19699 + }, + { + "epoch": 4.93, + "grad_norm": 9.54094123840332, + "learning_rate": 5.017051120927985e-09, + "logits/chosen": -0.5663049221038818, + "logits/rejected": -0.6705796718597412, + "logps/chosen": -57.52176284790039, + "logps/rejected": -113.99371337890625, + "loss": 0.6375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0015010833740234, + "rewards/margins": 7.7111382484436035, + "rewards/rejected": -4.709637641906738, + "step": 19700 + }, + { + "epoch": 4.93, + "grad_norm": 6.4383134841918945, + "learning_rate": 4.9819113495452875e-09, + "logits/chosen": -0.543681263923645, + "logits/rejected": -0.6388725638389587, + "logps/chosen": -63.6823844909668, + "logps/rejected": -116.51727294921875, + "loss": 0.6761, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.835524320602417, + "rewards/margins": 7.377656936645508, + "rewards/rejected": -4.54213285446167, + "step": 19701 + }, + { + "epoch": 4.93, + "grad_norm": 3.8889055252075195, + "learning_rate": 4.946895010372598e-09, + "logits/chosen": -0.6076054573059082, + "logits/rejected": -0.6791371703147888, + "logps/chosen": -48.44622802734375, + "logps/rejected": -109.51213836669922, + "loss": 0.5621, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3458876609802246, + "rewards/margins": 7.567104816436768, + "rewards/rejected": -4.221217155456543, + "step": 19702 + }, + { + "epoch": 4.93, + "grad_norm": 3.206791639328003, + "learning_rate": 4.912002104275337e-09, + "logits/chosen": -0.5726618766784668, + "logits/rejected": -0.6659735441207886, + "logps/chosen": -53.73076629638672, + "logps/rejected": -102.5522689819336, + "loss": 0.6091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.909287929534912, + "rewards/margins": 7.669003486633301, + "rewards/rejected": -4.759716510772705, + "step": 19703 + }, + { + "epoch": 4.93, + "grad_norm": 6.374608993530273, + "learning_rate": 4.877232632115037e-09, + "logits/chosen": -0.5298940539360046, + "logits/rejected": -0.6080502867698669, + "logps/chosen": -51.017906188964844, + "logps/rejected": -116.08378601074219, + "loss": 0.6155, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9847490787506104, + "rewards/margins": 8.204803466796875, + "rewards/rejected": -5.220053672790527, + "step": 19704 + }, + { + "epoch": 4.93, + "grad_norm": 5.998661518096924, + "learning_rate": 4.8425865947515635e-09, + "logits/chosen": -0.4603419303894043, + "logits/rejected": -0.5535596609115601, + "logps/chosen": -57.32864761352539, + "logps/rejected": -114.24171447753906, + "loss": 0.6546, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3484737873077393, + "rewards/margins": 7.400059700012207, + "rewards/rejected": -4.051586151123047, + "step": 19705 + }, + { + "epoch": 4.93, + "grad_norm": 4.680988788604736, + "learning_rate": 4.808063993040901e-09, + "logits/chosen": -0.5694469213485718, + "logits/rejected": -0.6625665426254272, + "logps/chosen": -44.69474411010742, + "logps/rejected": -111.50741577148438, + "loss": 0.4827, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.124152898788452, + "rewards/margins": 8.054366111755371, + "rewards/rejected": -4.930213451385498, + "step": 19706 + }, + { + "epoch": 4.93, + "grad_norm": 6.333637714385986, + "learning_rate": 4.773664827836255e-09, + "logits/chosen": -0.6012566089630127, + "logits/rejected": -0.7093694806098938, + "logps/chosen": -51.05231857299805, + "logps/rejected": -114.77690887451172, + "loss": 0.5418, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.238424777984619, + "rewards/margins": 8.473865509033203, + "rewards/rejected": -5.235440254211426, + "step": 19707 + }, + { + "epoch": 4.93, + "grad_norm": 9.436286926269531, + "learning_rate": 4.7393890999869465e-09, + "logits/chosen": -0.5848978757858276, + "logits/rejected": -0.6041009426116943, + "logps/chosen": -47.884944915771484, + "logps/rejected": -126.9968490600586, + "loss": 0.6011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8205695152282715, + "rewards/margins": 6.784767150878906, + "rewards/rejected": -3.9641973972320557, + "step": 19708 + }, + { + "epoch": 4.93, + "grad_norm": 4.283734321594238, + "learning_rate": 4.705236810341185e-09, + "logits/chosen": -0.5496454834938049, + "logits/rejected": -0.6276713609695435, + "logps/chosen": -66.42837524414062, + "logps/rejected": -94.33340454101562, + "loss": 0.7577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.936354637145996, + "rewards/margins": 6.395962715148926, + "rewards/rejected": -3.4596080780029297, + "step": 19709 + }, + { + "epoch": 4.93, + "grad_norm": 6.08294677734375, + "learning_rate": 4.671207959742185e-09, + "logits/chosen": -0.5560522675514221, + "logits/rejected": -0.5871806740760803, + "logps/chosen": -51.94240951538086, + "logps/rejected": -107.83661651611328, + "loss": 0.6056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.186286211013794, + "rewards/margins": 6.6288933753967285, + "rewards/rejected": -3.4426071643829346, + "step": 19710 + }, + { + "epoch": 4.93, + "grad_norm": 4.055422306060791, + "learning_rate": 4.637302549030387e-09, + "logits/chosen": -0.5573906302452087, + "logits/rejected": -0.6513391733169556, + "logps/chosen": -60.722591400146484, + "logps/rejected": -111.32318115234375, + "loss": 0.6166, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1224119663238525, + "rewards/margins": 6.7960357666015625, + "rewards/rejected": -3.67362380027771, + "step": 19711 + }, + { + "epoch": 4.93, + "grad_norm": 4.373538970947266, + "learning_rate": 4.6035205790445625e-09, + "logits/chosen": -0.5116264820098877, + "logits/rejected": -0.6203048229217529, + "logps/chosen": -63.787452697753906, + "logps/rejected": -105.15079498291016, + "loss": 0.6045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1408302783966064, + "rewards/margins": 7.822109699249268, + "rewards/rejected": -4.681278705596924, + "step": 19712 + }, + { + "epoch": 4.93, + "grad_norm": 1.6667486429214478, + "learning_rate": 4.569862050619045e-09, + "logits/chosen": -0.5927541255950928, + "logits/rejected": -0.6961851119995117, + "logps/chosen": -51.63533020019531, + "logps/rejected": -101.2256851196289, + "loss": 0.5655, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4549360275268555, + "rewards/margins": 8.314682006835938, + "rewards/rejected": -4.859745979309082, + "step": 19713 + }, + { + "epoch": 4.93, + "grad_norm": 7.539857387542725, + "learning_rate": 4.536326964585391e-09, + "logits/chosen": -0.5099135041236877, + "logits/rejected": -0.5596786737442017, + "logps/chosen": -50.940826416015625, + "logps/rejected": -125.10413360595703, + "loss": 0.5494, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1471328735351562, + "rewards/margins": 7.10032844543457, + "rewards/rejected": -3.953195571899414, + "step": 19714 + }, + { + "epoch": 4.93, + "grad_norm": 12.10119342803955, + "learning_rate": 4.502915321772938e-09, + "logits/chosen": -0.583888053894043, + "logits/rejected": -0.6338887810707092, + "logps/chosen": -58.962364196777344, + "logps/rejected": -112.67472076416016, + "loss": 0.6893, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.779209613800049, + "rewards/margins": 6.748979568481445, + "rewards/rejected": -3.9697697162628174, + "step": 19715 + }, + { + "epoch": 4.93, + "grad_norm": 2.2165744304656982, + "learning_rate": 4.469627123006581e-09, + "logits/chosen": -0.5206469297409058, + "logits/rejected": -0.6108161211013794, + "logps/chosen": -53.14649963378906, + "logps/rejected": -110.11279296875, + "loss": 0.5683, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.044736385345459, + "rewards/margins": 8.160248756408691, + "rewards/rejected": -5.115512847900391, + "step": 19716 + }, + { + "epoch": 4.93, + "grad_norm": 3.02864933013916, + "learning_rate": 4.436462369109551e-09, + "logits/chosen": -0.5736364126205444, + "logits/rejected": -0.710244357585907, + "logps/chosen": -58.00477600097656, + "logps/rejected": -107.05935668945312, + "loss": 0.55, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.413921356201172, + "rewards/margins": 7.52413272857666, + "rewards/rejected": -4.1102118492126465, + "step": 19717 + }, + { + "epoch": 4.93, + "grad_norm": 3.9070487022399902, + "learning_rate": 4.4034210609006366e-09, + "logits/chosen": -0.5615820288658142, + "logits/rejected": -0.6938034296035767, + "logps/chosen": -56.616607666015625, + "logps/rejected": -105.53694152832031, + "loss": 0.6298, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.875035285949707, + "rewards/margins": 7.689469337463379, + "rewards/rejected": -4.814433574676514, + "step": 19718 + }, + { + "epoch": 4.93, + "grad_norm": 8.214703559875488, + "learning_rate": 4.370503199196408e-09, + "logits/chosen": -0.5600476861000061, + "logits/rejected": -0.599952757358551, + "logps/chosen": -66.27546691894531, + "logps/rejected": -119.55229187011719, + "loss": 0.6025, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9773335456848145, + "rewards/margins": 7.380343914031982, + "rewards/rejected": -4.403010368347168, + "step": 19719 + }, + { + "epoch": 4.93, + "grad_norm": 6.294101238250732, + "learning_rate": 4.337708784811767e-09, + "logits/chosen": -0.6074535250663757, + "logits/rejected": -0.6303119659423828, + "logps/chosen": -51.2265625, + "logps/rejected": -88.50836181640625, + "loss": 0.8031, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.926617383956909, + "rewards/margins": 5.297130107879639, + "rewards/rejected": -2.3705127239227295, + "step": 19720 + }, + { + "epoch": 4.93, + "grad_norm": 5.300500392913818, + "learning_rate": 4.305037818554958e-09, + "logits/chosen": -0.5498111844062805, + "logits/rejected": -0.6452760696411133, + "logps/chosen": -58.800262451171875, + "logps/rejected": -94.52509307861328, + "loss": 0.7019, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.987442970275879, + "rewards/margins": 6.687019348144531, + "rewards/rejected": -3.6995768547058105, + "step": 19721 + }, + { + "epoch": 4.93, + "grad_norm": 7.001140594482422, + "learning_rate": 4.272490301234777e-09, + "logits/chosen": -0.5312115550041199, + "logits/rejected": -0.6221237778663635, + "logps/chosen": -59.079917907714844, + "logps/rejected": -113.12025451660156, + "loss": 0.6382, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.475555896759033, + "rewards/margins": 7.472603797912598, + "rewards/rejected": -3.9970483779907227, + "step": 19722 + }, + { + "epoch": 4.93, + "grad_norm": 8.735271453857422, + "learning_rate": 4.240066233655027e-09, + "logits/chosen": -0.574756383895874, + "logits/rejected": -0.5997140407562256, + "logps/chosen": -51.85506820678711, + "logps/rejected": -130.3221893310547, + "loss": 0.6228, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8262720108032227, + "rewards/margins": 6.465255260467529, + "rewards/rejected": -3.6389825344085693, + "step": 19723 + }, + { + "epoch": 4.93, + "grad_norm": 4.010034561157227, + "learning_rate": 4.207765616616733e-09, + "logits/chosen": -0.5645768642425537, + "logits/rejected": -0.6327122449874878, + "logps/chosen": -51.95609664916992, + "logps/rejected": -138.62115478515625, + "loss": 0.5658, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0307960510253906, + "rewards/margins": 9.510149002075195, + "rewards/rejected": -6.479352951049805, + "step": 19724 + }, + { + "epoch": 4.93, + "grad_norm": 5.052080154418945, + "learning_rate": 4.1755884509187e-09, + "logits/chosen": -0.44406530261039734, + "logits/rejected": -0.5104523301124573, + "logps/chosen": -55.62862014770508, + "logps/rejected": -104.68795013427734, + "loss": 0.6268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.776975631713867, + "rewards/margins": 7.0684709548950195, + "rewards/rejected": -4.291494846343994, + "step": 19725 + }, + { + "epoch": 4.93, + "grad_norm": 6.523044586181641, + "learning_rate": 4.143534737355292e-09, + "logits/chosen": -0.5073813796043396, + "logits/rejected": -0.611188530921936, + "logps/chosen": -67.99320220947266, + "logps/rejected": -122.46189880371094, + "loss": 0.6373, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0509538650512695, + "rewards/margins": 7.940242767333984, + "rewards/rejected": -4.889289379119873, + "step": 19726 + }, + { + "epoch": 4.93, + "grad_norm": 3.5335848331451416, + "learning_rate": 4.111604476719211e-09, + "logits/chosen": -0.5558342933654785, + "logits/rejected": -0.627210259437561, + "logps/chosen": -50.05750274658203, + "logps/rejected": -98.4236068725586, + "loss": 0.6315, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.043696165084839, + "rewards/margins": 7.396173000335693, + "rewards/rejected": -4.352476596832275, + "step": 19727 + }, + { + "epoch": 4.94, + "grad_norm": 2.982175588607788, + "learning_rate": 4.079797669798713e-09, + "logits/chosen": -0.5657868385314941, + "logits/rejected": -0.6461457014083862, + "logps/chosen": -43.54390335083008, + "logps/rejected": -122.94175720214844, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5289692878723145, + "rewards/margins": 9.37831974029541, + "rewards/rejected": -5.849351406097412, + "step": 19728 + }, + { + "epoch": 4.94, + "grad_norm": 9.731745719909668, + "learning_rate": 4.048114317380947e-09, + "logits/chosen": -0.5071147680282593, + "logits/rejected": -0.6443246603012085, + "logps/chosen": -51.73082733154297, + "logps/rejected": -91.37057495117188, + "loss": 0.5956, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.505721092224121, + "rewards/margins": 7.364066123962402, + "rewards/rejected": -3.8583457469940186, + "step": 19729 + }, + { + "epoch": 4.94, + "grad_norm": 2.4478392601013184, + "learning_rate": 4.0165544202480644e-09, + "logits/chosen": -0.5918245911598206, + "logits/rejected": -0.6539756655693054, + "logps/chosen": -49.9784049987793, + "logps/rejected": -109.53861999511719, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.335139513015747, + "rewards/margins": 7.647416114807129, + "rewards/rejected": -4.312276840209961, + "step": 19730 + }, + { + "epoch": 4.94, + "grad_norm": 4.976691246032715, + "learning_rate": 3.985117979179997e-09, + "logits/chosen": -0.5117855668067932, + "logits/rejected": -0.6150033473968506, + "logps/chosen": -59.3973388671875, + "logps/rejected": -113.58447265625, + "loss": 0.6233, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.876279830932617, + "rewards/margins": 7.625732421875, + "rewards/rejected": -4.749452590942383, + "step": 19731 + }, + { + "epoch": 4.94, + "grad_norm": 4.733217239379883, + "learning_rate": 3.953804994953902e-09, + "logits/chosen": -0.5339164733886719, + "logits/rejected": -0.6322701573371887, + "logps/chosen": -54.58332061767578, + "logps/rejected": -101.73978424072266, + "loss": 0.6039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8688011169433594, + "rewards/margins": 7.449980735778809, + "rewards/rejected": -4.581179618835449, + "step": 19732 + }, + { + "epoch": 4.94, + "grad_norm": 3.044279098510742, + "learning_rate": 3.92261546834305e-09, + "logits/chosen": -0.5765081644058228, + "logits/rejected": -0.6843623518943787, + "logps/chosen": -47.549110412597656, + "logps/rejected": -109.493408203125, + "loss": 0.5584, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4307990074157715, + "rewards/margins": 8.619937896728516, + "rewards/rejected": -5.189139366149902, + "step": 19733 + }, + { + "epoch": 4.94, + "grad_norm": 4.024528980255127, + "learning_rate": 3.891549400118488e-09, + "logits/chosen": -0.48228925466537476, + "logits/rejected": -0.5516059398651123, + "logps/chosen": -57.76261901855469, + "logps/rejected": -104.5770034790039, + "loss": 0.567, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1581034660339355, + "rewards/margins": 6.664821624755859, + "rewards/rejected": -3.506718397140503, + "step": 19734 + }, + { + "epoch": 4.94, + "grad_norm": 4.354954719543457, + "learning_rate": 3.860606791047938e-09, + "logits/chosen": -0.5641641020774841, + "logits/rejected": -0.6744884848594666, + "logps/chosen": -50.93223571777344, + "logps/rejected": -104.70789337158203, + "loss": 0.583, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.758868932723999, + "rewards/margins": 7.393408298492432, + "rewards/rejected": -4.6345391273498535, + "step": 19735 + }, + { + "epoch": 4.94, + "grad_norm": 7.756535053253174, + "learning_rate": 3.829787641895788e-09, + "logits/chosen": -0.5329800248146057, + "logits/rejected": -0.564824104309082, + "logps/chosen": -54.266578674316406, + "logps/rejected": -126.41179656982422, + "loss": 0.661, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.909402370452881, + "rewards/margins": 6.965018272399902, + "rewards/rejected": -4.055615425109863, + "step": 19736 + }, + { + "epoch": 4.94, + "grad_norm": 5.308755397796631, + "learning_rate": 3.79909195342365e-09, + "logits/chosen": -0.519148051738739, + "logits/rejected": -0.5799884796142578, + "logps/chosen": -70.35340881347656, + "logps/rejected": -120.98812103271484, + "loss": 0.6703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.8332176208496094, + "rewards/margins": 7.953646183013916, + "rewards/rejected": -5.120429515838623, + "step": 19737 + }, + { + "epoch": 4.94, + "grad_norm": 3.178947687149048, + "learning_rate": 3.7685197263903624e-09, + "logits/chosen": -0.6010026931762695, + "logits/rejected": -0.6637486219406128, + "logps/chosen": -54.115966796875, + "logps/rejected": -129.77857971191406, + "loss": 0.6077, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.217632293701172, + "rewards/margins": 7.735255241394043, + "rewards/rejected": -4.517622947692871, + "step": 19738 + }, + { + "epoch": 4.94, + "grad_norm": 3.455014705657959, + "learning_rate": 3.738070961551432e-09, + "logits/chosen": -0.623569667339325, + "logits/rejected": -0.6735894680023193, + "logps/chosen": -46.83171081542969, + "logps/rejected": -111.24702453613281, + "loss": 0.5317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.263052463531494, + "rewards/margins": 7.073014736175537, + "rewards/rejected": -3.809962272644043, + "step": 19739 + }, + { + "epoch": 4.94, + "grad_norm": 3.748145818710327, + "learning_rate": 3.7077456596584793e-09, + "logits/chosen": -0.5383102297782898, + "logits/rejected": -0.6251029372215271, + "logps/chosen": -47.21725845336914, + "logps/rejected": -100.63710021972656, + "loss": 0.5457, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2996084690093994, + "rewards/margins": 7.119216442108154, + "rewards/rejected": -3.819607734680176, + "step": 19740 + }, + { + "epoch": 4.94, + "grad_norm": 3.4025485515594482, + "learning_rate": 3.67754382146146e-09, + "logits/chosen": -0.49136388301849365, + "logits/rejected": -0.622010350227356, + "logps/chosen": -61.036800384521484, + "logps/rejected": -98.53321075439453, + "loss": 0.5837, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.287087917327881, + "rewards/margins": 7.027512073516846, + "rewards/rejected": -3.740424156188965, + "step": 19741 + }, + { + "epoch": 4.94, + "grad_norm": 5.9544782638549805, + "learning_rate": 3.6474654477069993e-09, + "logits/chosen": -0.4913976192474365, + "logits/rejected": -0.6000240445137024, + "logps/chosen": -58.20596694946289, + "logps/rejected": -88.58922576904297, + "loss": 0.6529, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6750590801239014, + "rewards/margins": 6.190537452697754, + "rewards/rejected": -3.5154786109924316, + "step": 19742 + }, + { + "epoch": 4.94, + "grad_norm": 8.050588607788086, + "learning_rate": 3.617510539137836e-09, + "logits/chosen": -0.6187884211540222, + "logits/rejected": -0.6939672827720642, + "logps/chosen": -48.25532913208008, + "logps/rejected": -96.4026107788086, + "loss": 0.5885, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.70198917388916, + "rewards/margins": 7.4352617263793945, + "rewards/rejected": -3.7332727909088135, + "step": 19743 + }, + { + "epoch": 4.94, + "grad_norm": 4.242859363555908, + "learning_rate": 3.5876790964944895e-09, + "logits/chosen": -0.5233005285263062, + "logits/rejected": -0.6577814817428589, + "logps/chosen": -59.93689727783203, + "logps/rejected": -112.96308898925781, + "loss": 0.544, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4509265422821045, + "rewards/margins": 9.453707695007324, + "rewards/rejected": -6.002781391143799, + "step": 19744 + }, + { + "epoch": 4.94, + "grad_norm": 8.56908893585205, + "learning_rate": 3.5579711205141476e-09, + "logits/chosen": -0.6113542318344116, + "logits/rejected": -0.6637454032897949, + "logps/chosen": -59.412620544433594, + "logps/rejected": -124.020263671875, + "loss": 0.6666, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0810859203338623, + "rewards/margins": 7.660757064819336, + "rewards/rejected": -4.579671859741211, + "step": 19745 + }, + { + "epoch": 4.94, + "grad_norm": 3.1026339530944824, + "learning_rate": 3.5283866119306677e-09, + "logits/chosen": -0.6017469167709351, + "logits/rejected": -0.7217323184013367, + "logps/chosen": -55.66338348388672, + "logps/rejected": -106.89498901367188, + "loss": 0.5782, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9884331226348877, + "rewards/margins": 8.709202766418457, + "rewards/rejected": -5.720769882202148, + "step": 19746 + }, + { + "epoch": 4.94, + "grad_norm": 4.647524833679199, + "learning_rate": 3.4989255714751314e-09, + "logits/chosen": -0.6089257001876831, + "logits/rejected": -0.6866502165794373, + "logps/chosen": -43.639564514160156, + "logps/rejected": -113.07971954345703, + "loss": 0.5301, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1093482971191406, + "rewards/margins": 8.071045875549316, + "rewards/rejected": -4.961697101593018, + "step": 19747 + }, + { + "epoch": 4.94, + "grad_norm": 3.3612446784973145, + "learning_rate": 3.46958799987529e-09, + "logits/chosen": -0.5507742166519165, + "logits/rejected": -0.6015006899833679, + "logps/chosen": -47.096214294433594, + "logps/rejected": -114.869873046875, + "loss": 0.5138, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.205152988433838, + "rewards/margins": 8.042360305786133, + "rewards/rejected": -4.837206840515137, + "step": 19748 + }, + { + "epoch": 4.94, + "grad_norm": 6.797603607177734, + "learning_rate": 3.4403738978572298e-09, + "logits/chosen": -0.5722712278366089, + "logits/rejected": -0.6417339444160461, + "logps/chosen": -51.961273193359375, + "logps/rejected": -89.5971450805664, + "loss": 0.7714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.805253028869629, + "rewards/margins": 5.501149654388428, + "rewards/rejected": -2.6958959102630615, + "step": 19749 + }, + { + "epoch": 4.94, + "grad_norm": 4.566804885864258, + "learning_rate": 3.411283266141485e-09, + "logits/chosen": -0.5737535357475281, + "logits/rejected": -0.6141031980514526, + "logps/chosen": -48.79902648925781, + "logps/rejected": -110.74366760253906, + "loss": 0.5746, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2708590030670166, + "rewards/margins": 7.195119857788086, + "rewards/rejected": -3.9242608547210693, + "step": 19750 + }, + { + "epoch": 4.94, + "grad_norm": 5.049920558929443, + "learning_rate": 3.3823161054480356e-09, + "logits/chosen": -0.6262994408607483, + "logits/rejected": -0.6777946352958679, + "logps/chosen": -49.860050201416016, + "logps/rejected": -124.85308837890625, + "loss": 0.5742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9442238807678223, + "rewards/margins": 8.107199668884277, + "rewards/rejected": -5.162976264953613, + "step": 19751 + }, + { + "epoch": 4.94, + "grad_norm": 5.442144870758057, + "learning_rate": 3.3534724164918654e-09, + "logits/chosen": -0.5296196341514587, + "logits/rejected": -0.5630120038986206, + "logps/chosen": -52.84633255004883, + "logps/rejected": -111.89434814453125, + "loss": 0.5968, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9832468032836914, + "rewards/margins": 6.531299591064453, + "rewards/rejected": -3.5480527877807617, + "step": 19752 + }, + { + "epoch": 4.94, + "grad_norm": 5.396136283874512, + "learning_rate": 3.324752199986292e-09, + "logits/chosen": -0.6168414354324341, + "logits/rejected": -0.7178047895431519, + "logps/chosen": -61.718780517578125, + "logps/rejected": -150.16122436523438, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.156917095184326, + "rewards/margins": 9.338884353637695, + "rewards/rejected": -6.181967258453369, + "step": 19753 + }, + { + "epoch": 4.94, + "grad_norm": 9.111433029174805, + "learning_rate": 3.2961554566407484e-09, + "logits/chosen": -0.540921151638031, + "logits/rejected": -0.601158082485199, + "logps/chosen": -50.52757263183594, + "logps/rejected": -100.47512817382812, + "loss": 0.6133, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2613720893859863, + "rewards/margins": 7.105639457702637, + "rewards/rejected": -3.8442673683166504, + "step": 19754 + }, + { + "epoch": 4.94, + "grad_norm": 3.6781139373779297, + "learning_rate": 3.2676821871618913e-09, + "logits/chosen": -0.5734367966651917, + "logits/rejected": -0.6553166508674622, + "logps/chosen": -45.949798583984375, + "logps/rejected": -98.80845642089844, + "loss": 0.5325, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0586307048797607, + "rewards/margins": 7.017703056335449, + "rewards/rejected": -3.9590721130371094, + "step": 19755 + }, + { + "epoch": 4.94, + "grad_norm": 3.30587100982666, + "learning_rate": 3.239332392253047e-09, + "logits/chosen": -0.5620707273483276, + "logits/rejected": -0.625882625579834, + "logps/chosen": -64.21641540527344, + "logps/rejected": -125.12802124023438, + "loss": 0.6271, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9122517108917236, + "rewards/margins": 7.873640537261963, + "rewards/rejected": -4.96138858795166, + "step": 19756 + }, + { + "epoch": 4.94, + "grad_norm": 1.5035351514816284, + "learning_rate": 3.211106072615322e-09, + "logits/chosen": -0.5229998826980591, + "logits/rejected": -0.6206160187721252, + "logps/chosen": -50.491973876953125, + "logps/rejected": -126.19683837890625, + "loss": 0.5284, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182558059692383, + "rewards/margins": 9.278995513916016, + "rewards/rejected": -6.096438407897949, + "step": 19757 + }, + { + "epoch": 4.94, + "grad_norm": 1.9155592918395996, + "learning_rate": 3.1830032289459357e-09, + "logits/chosen": -0.5845454931259155, + "logits/rejected": -0.6568490266799927, + "logps/chosen": -52.21002960205078, + "logps/rejected": -129.46104431152344, + "loss": 0.536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3275890350341797, + "rewards/margins": 8.781332015991211, + "rewards/rejected": -5.453742027282715, + "step": 19758 + }, + { + "epoch": 4.94, + "grad_norm": 8.181304931640625, + "learning_rate": 3.155023861939888e-09, + "logits/chosen": -0.5509688854217529, + "logits/rejected": -0.5981020331382751, + "logps/chosen": -58.65015411376953, + "logps/rejected": -107.29103088378906, + "loss": 0.7465, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.228914499282837, + "rewards/margins": 6.882828235626221, + "rewards/rejected": -3.653913736343384, + "step": 19759 + }, + { + "epoch": 4.94, + "grad_norm": 3.0894064903259277, + "learning_rate": 3.1271679722877366e-09, + "logits/chosen": -0.4427524507045746, + "logits/rejected": -0.5535087585449219, + "logps/chosen": -60.73497772216797, + "logps/rejected": -107.20455932617188, + "loss": 0.5449, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.091604232788086, + "rewards/margins": 7.971402645111084, + "rewards/rejected": -4.87979793548584, + "step": 19760 + }, + { + "epoch": 4.94, + "grad_norm": 2.0870025157928467, + "learning_rate": 3.0994355606778214e-09, + "logits/chosen": -0.5692992806434631, + "logits/rejected": -0.6651220321655273, + "logps/chosen": -53.98411178588867, + "logps/rejected": -111.50228881835938, + "loss": 0.5592, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4625415802001953, + "rewards/margins": 8.204742431640625, + "rewards/rejected": -4.742201328277588, + "step": 19761 + }, + { + "epoch": 4.94, + "grad_norm": 2.748901605606079, + "learning_rate": 3.0718266277962595e-09, + "logits/chosen": -0.5091150999069214, + "logits/rejected": -0.6320297718048096, + "logps/chosen": -63.69569778442383, + "logps/rejected": -117.19544982910156, + "loss": 0.597, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.947338581085205, + "rewards/margins": 8.127159118652344, + "rewards/rejected": -5.179821014404297, + "step": 19762 + }, + { + "epoch": 4.94, + "grad_norm": 2.715583562850952, + "learning_rate": 3.0443411743247276e-09, + "logits/chosen": -0.5390045642852783, + "logits/rejected": -0.5939559936523438, + "logps/chosen": -54.93888473510742, + "logps/rejected": -138.3455047607422, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2814390659332275, + "rewards/margins": 8.750890731811523, + "rewards/rejected": -5.469450950622559, + "step": 19763 + }, + { + "epoch": 4.94, + "grad_norm": 3.0862886905670166, + "learning_rate": 3.016979200942127e-09, + "logits/chosen": -0.5732497572898865, + "logits/rejected": -0.651401937007904, + "logps/chosen": -47.87294006347656, + "logps/rejected": -118.91691589355469, + "loss": 0.5262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13517427444458, + "rewards/margins": 7.777949333190918, + "rewards/rejected": -4.642776012420654, + "step": 19764 + }, + { + "epoch": 4.94, + "grad_norm": 4.865940570831299, + "learning_rate": 2.9897407083256947e-09, + "logits/chosen": -0.4813960790634155, + "logits/rejected": -0.5694289207458496, + "logps/chosen": -54.79876708984375, + "logps/rejected": -85.30481719970703, + "loss": 0.6539, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.4090218544006348, + "rewards/margins": 5.8715643882751465, + "rewards/rejected": -2.46254301071167, + "step": 19765 + }, + { + "epoch": 4.94, + "grad_norm": 8.483515739440918, + "learning_rate": 2.9626256971471145e-09, + "logits/chosen": -0.5677700638771057, + "logits/rejected": -0.6118403077125549, + "logps/chosen": -52.07115936279297, + "logps/rejected": -113.44214630126953, + "loss": 0.6128, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.05881929397583, + "rewards/margins": 7.035004615783691, + "rewards/rejected": -3.9761850833892822, + "step": 19766 + }, + { + "epoch": 4.94, + "grad_norm": 5.730050086975098, + "learning_rate": 2.9356341680775168e-09, + "logits/chosen": -0.5710983276367188, + "logits/rejected": -0.625030517578125, + "logps/chosen": -42.99575424194336, + "logps/rejected": -104.44507598876953, + "loss": 0.6107, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.988478660583496, + "rewards/margins": 6.095488548278809, + "rewards/rejected": -3.1070101261138916, + "step": 19767 + }, + { + "epoch": 4.95, + "grad_norm": 3.633636713027954, + "learning_rate": 2.9087661217835906e-09, + "logits/chosen": -0.4600202441215515, + "logits/rejected": -0.5775957703590393, + "logps/chosen": -45.58271026611328, + "logps/rejected": -100.54769134521484, + "loss": 0.5257, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0138957500457764, + "rewards/margins": 7.939356803894043, + "rewards/rejected": -4.925461292266846, + "step": 19768 + }, + { + "epoch": 4.95, + "grad_norm": 3.966442584991455, + "learning_rate": 2.882021558929249e-09, + "logits/chosen": -0.6470688581466675, + "logits/rejected": -0.6939924955368042, + "logps/chosen": -53.64167785644531, + "logps/rejected": -125.51536560058594, + "loss": 0.7119, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0717077255249023, + "rewards/margins": 6.931969165802002, + "rewards/rejected": -3.8602614402770996, + "step": 19769 + }, + { + "epoch": 4.95, + "grad_norm": 6.098153114318848, + "learning_rate": 2.8554004801750744e-09, + "logits/chosen": -0.5745823383331299, + "logits/rejected": -0.612384021282196, + "logps/chosen": -63.831974029541016, + "logps/rejected": -150.1766815185547, + "loss": 0.6244, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.957796573638916, + "rewards/margins": 9.173866271972656, + "rewards/rejected": -6.216069221496582, + "step": 19770 + }, + { + "epoch": 4.95, + "grad_norm": 3.3131792545318604, + "learning_rate": 2.82890288617943e-09, + "logits/chosen": -0.5224269032478333, + "logits/rejected": -0.602448582649231, + "logps/chosen": -53.229129791259766, + "logps/rejected": -118.64950561523438, + "loss": 0.5602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150214672088623, + "rewards/margins": 8.414175987243652, + "rewards/rejected": -5.263960838317871, + "step": 19771 + }, + { + "epoch": 4.95, + "grad_norm": 2.9911415576934814, + "learning_rate": 2.802528777596791e-09, + "logits/chosen": -0.5301048159599304, + "logits/rejected": -0.6103286743164062, + "logps/chosen": -38.68128204345703, + "logps/rejected": -100.35868835449219, + "loss": 0.5176, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3431410789489746, + "rewards/margins": 7.216304302215576, + "rewards/rejected": -3.873162269592285, + "step": 19772 + }, + { + "epoch": 4.95, + "grad_norm": 4.4630584716796875, + "learning_rate": 2.7762781550783046e-09, + "logits/chosen": -0.578910768032074, + "logits/rejected": -0.6598808765411377, + "logps/chosen": -50.52422332763672, + "logps/rejected": -102.02184295654297, + "loss": 0.6053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0366714000701904, + "rewards/margins": 6.942141532897949, + "rewards/rejected": -3.905470132827759, + "step": 19773 + }, + { + "epoch": 4.95, + "grad_norm": 2.9793829917907715, + "learning_rate": 2.7501510192740056e-09, + "logits/chosen": -0.5023202896118164, + "logits/rejected": -0.6352838277816772, + "logps/chosen": -63.71174621582031, + "logps/rejected": -98.40071868896484, + "loss": 0.5155, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9150118827819824, + "rewards/margins": 7.665824890136719, + "rewards/rejected": -4.7508134841918945, + "step": 19774 + }, + { + "epoch": 4.95, + "grad_norm": 4.559727191925049, + "learning_rate": 2.7241473708283784e-09, + "logits/chosen": -0.6321161985397339, + "logits/rejected": -0.7198730707168579, + "logps/chosen": -60.40312576293945, + "logps/rejected": -104.17376708984375, + "loss": 0.6439, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0704078674316406, + "rewards/margins": 7.136535167694092, + "rewards/rejected": -4.066127777099609, + "step": 19775 + }, + { + "epoch": 4.95, + "grad_norm": 5.142292022705078, + "learning_rate": 2.698267210384797e-09, + "logits/chosen": -0.586281955242157, + "logits/rejected": -0.6671906113624573, + "logps/chosen": -63.508453369140625, + "logps/rejected": -108.3264389038086, + "loss": 0.6821, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.247398853302002, + "rewards/margins": 7.180966854095459, + "rewards/rejected": -3.933568239212036, + "step": 19776 + }, + { + "epoch": 4.95, + "grad_norm": 2.3002116680145264, + "learning_rate": 2.6725105385821957e-09, + "logits/chosen": -0.5070140957832336, + "logits/rejected": -0.6257181167602539, + "logps/chosen": -43.41763687133789, + "logps/rejected": -98.16213989257812, + "loss": 0.5624, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3324503898620605, + "rewards/margins": 8.432013511657715, + "rewards/rejected": -5.099562644958496, + "step": 19777 + }, + { + "epoch": 4.95, + "grad_norm": 3.336284637451172, + "learning_rate": 2.6468773560572867e-09, + "logits/chosen": -0.5782332420349121, + "logits/rejected": -0.6577033400535583, + "logps/chosen": -52.937870025634766, + "logps/rejected": -105.9305419921875, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.345308542251587, + "rewards/margins": 7.511721134185791, + "rewards/rejected": -4.166412830352783, + "step": 19778 + }, + { + "epoch": 4.95, + "grad_norm": 3.7473177909851074, + "learning_rate": 2.6213676634428977e-09, + "logits/chosen": -0.6130810379981995, + "logits/rejected": -0.7099451422691345, + "logps/chosen": -51.52691650390625, + "logps/rejected": -98.61817932128906, + "loss": 0.5912, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0902740955352783, + "rewards/margins": 6.909976959228516, + "rewards/rejected": -3.819702625274658, + "step": 19779 + }, + { + "epoch": 4.95, + "grad_norm": 4.380313396453857, + "learning_rate": 2.5959814613707444e-09, + "logits/chosen": -0.5284278392791748, + "logits/rejected": -0.5860366225242615, + "logps/chosen": -57.71956253051758, + "logps/rejected": -95.87346649169922, + "loss": 0.7188, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4741339683532715, + "rewards/margins": 5.832314491271973, + "rewards/rejected": -2.3581812381744385, + "step": 19780 + }, + { + "epoch": 4.95, + "grad_norm": 4.025535583496094, + "learning_rate": 2.570718750466994e-09, + "logits/chosen": -0.53602135181427, + "logits/rejected": -0.6158319711685181, + "logps/chosen": -54.625221252441406, + "logps/rejected": -101.82305908203125, + "loss": 0.6103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4271459579467773, + "rewards/margins": 7.537687301635742, + "rewards/rejected": -4.110542297363281, + "step": 19781 + }, + { + "epoch": 4.95, + "grad_norm": 5.19453763961792, + "learning_rate": 2.545579531356701e-09, + "logits/chosen": -0.6651892066001892, + "logits/rejected": -0.7144010663032532, + "logps/chosen": -39.64010238647461, + "logps/rejected": -120.00637817382812, + "loss": 0.5904, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.285426378250122, + "rewards/margins": 8.240025520324707, + "rewards/rejected": -4.954598903656006, + "step": 19782 + }, + { + "epoch": 4.95, + "grad_norm": 4.259158134460449, + "learning_rate": 2.5205638046599255e-09, + "logits/chosen": -0.5413355827331543, + "logits/rejected": -0.651107132434845, + "logps/chosen": -59.450225830078125, + "logps/rejected": -97.18669891357422, + "loss": 0.653, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1564273834228516, + "rewards/margins": 7.605535507202148, + "rewards/rejected": -4.449108123779297, + "step": 19783 + }, + { + "epoch": 4.95, + "grad_norm": 8.045438766479492, + "learning_rate": 2.495671570996172e-09, + "logits/chosen": -0.6232689023017883, + "logits/rejected": -0.7320796251296997, + "logps/chosen": -57.67005920410156, + "logps/rejected": -92.67967224121094, + "loss": 0.7765, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.399629592895508, + "rewards/margins": 6.820219039916992, + "rewards/rejected": -3.4205894470214844, + "step": 19784 + }, + { + "epoch": 4.95, + "grad_norm": 3.856588363647461, + "learning_rate": 2.4709028309799487e-09, + "logits/chosen": -0.49920958280563354, + "logits/rejected": -0.5878111720085144, + "logps/chosen": -64.15074920654297, + "logps/rejected": -90.42253112792969, + "loss": 0.6665, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8577322959899902, + "rewards/margins": 6.034261226654053, + "rewards/rejected": -3.1765291690826416, + "step": 19785 + }, + { + "epoch": 4.95, + "grad_norm": 4.9891533851623535, + "learning_rate": 2.4462575852229885e-09, + "logits/chosen": -0.5434112548828125, + "logits/rejected": -0.6117444634437561, + "logps/chosen": -52.43482971191406, + "logps/rejected": -111.48309326171875, + "loss": 0.6646, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.428870916366577, + "rewards/margins": 7.696670055389404, + "rewards/rejected": -4.267799377441406, + "step": 19786 + }, + { + "epoch": 4.95, + "grad_norm": 4.555429458618164, + "learning_rate": 2.4217358343348042e-09, + "logits/chosen": -0.5827223062515259, + "logits/rejected": -0.6621212363243103, + "logps/chosen": -68.58517456054688, + "logps/rejected": -105.82493591308594, + "loss": 0.6526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0451500415802, + "rewards/margins": 7.632167816162109, + "rewards/rejected": -4.587018013000488, + "step": 19787 + }, + { + "epoch": 4.95, + "grad_norm": 36.49153137207031, + "learning_rate": 2.3973375789210218e-09, + "logits/chosen": -0.5698102116584778, + "logits/rejected": -0.6280523538589478, + "logps/chosen": -48.04582214355469, + "logps/rejected": -81.67185974121094, + "loss": 0.6503, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0844063758850098, + "rewards/margins": 5.667762756347656, + "rewards/rejected": -2.5833561420440674, + "step": 19788 + }, + { + "epoch": 4.95, + "grad_norm": 3.7626569271087646, + "learning_rate": 2.373062819585048e-09, + "logits/chosen": -0.6530700922012329, + "logits/rejected": -0.6997726559638977, + "logps/chosen": -53.57879638671875, + "logps/rejected": -118.66460418701172, + "loss": 0.6588, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.408142328262329, + "rewards/margins": 7.38840389251709, + "rewards/rejected": -3.98026180267334, + "step": 19789 + }, + { + "epoch": 4.95, + "grad_norm": 8.469268798828125, + "learning_rate": 2.348911556926403e-09, + "logits/chosen": -0.5501104593276978, + "logits/rejected": -0.5958054065704346, + "logps/chosen": -53.315643310546875, + "logps/rejected": -98.11445617675781, + "loss": 0.7094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9288077354431152, + "rewards/margins": 6.452068328857422, + "rewards/rejected": -3.523261070251465, + "step": 19790 + }, + { + "epoch": 4.95, + "grad_norm": 11.189159393310547, + "learning_rate": 2.324883791541277e-09, + "logits/chosen": -0.45602595806121826, + "logits/rejected": -0.5710378289222717, + "logps/chosen": -58.885658264160156, + "logps/rejected": -104.63079071044922, + "loss": 0.6663, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.972219944000244, + "rewards/margins": 7.417823314666748, + "rewards/rejected": -4.445602893829346, + "step": 19791 + }, + { + "epoch": 4.95, + "grad_norm": 11.799838066101074, + "learning_rate": 2.300979524024749e-09, + "logits/chosen": -0.5516947507858276, + "logits/rejected": -0.6254361867904663, + "logps/chosen": -46.79487228393555, + "logps/rejected": -120.15170288085938, + "loss": 0.7257, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7419610023498535, + "rewards/margins": 7.906929969787598, + "rewards/rejected": -5.164969444274902, + "step": 19792 + }, + { + "epoch": 4.95, + "grad_norm": 5.876578330993652, + "learning_rate": 2.2771987549663477e-09, + "logits/chosen": -0.5303981304168701, + "logits/rejected": -0.5691846609115601, + "logps/chosen": -54.09226989746094, + "logps/rejected": -126.86891174316406, + "loss": 0.5808, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.931688070297241, + "rewards/margins": 7.172624111175537, + "rewards/rejected": -4.240935802459717, + "step": 19793 + }, + { + "epoch": 4.95, + "grad_norm": 1.9108258485794067, + "learning_rate": 2.2535414849544914e-09, + "logits/chosen": -0.5999258160591125, + "logits/rejected": -0.6900945901870728, + "logps/chosen": -59.84681701660156, + "logps/rejected": -100.37379455566406, + "loss": 0.5569, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3800532817840576, + "rewards/margins": 8.376611709594727, + "rewards/rejected": -4.996558666229248, + "step": 19794 + }, + { + "epoch": 4.95, + "grad_norm": 2.518756866455078, + "learning_rate": 2.2300077145731567e-09, + "logits/chosen": -0.5619471073150635, + "logits/rejected": -0.6601746082305908, + "logps/chosen": -53.59235763549805, + "logps/rejected": -94.26902770996094, + "loss": 0.5979, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.057526111602783, + "rewards/margins": 7.525301456451416, + "rewards/rejected": -4.467775344848633, + "step": 19795 + }, + { + "epoch": 4.95, + "grad_norm": 2.8724300861358643, + "learning_rate": 2.206597444404657e-09, + "logits/chosen": -0.5210999846458435, + "logits/rejected": -0.6070886254310608, + "logps/chosen": -44.39420700073242, + "logps/rejected": -91.77786254882812, + "loss": 0.5292, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.97580885887146, + "rewards/margins": 7.28489875793457, + "rewards/rejected": -4.3090901374816895, + "step": 19796 + }, + { + "epoch": 4.95, + "grad_norm": 3.9017419815063477, + "learning_rate": 2.1833106750268616e-09, + "logits/chosen": -0.6348146200180054, + "logits/rejected": -0.6987454295158386, + "logps/chosen": -57.17287063598633, + "logps/rejected": -126.68987274169922, + "loss": 0.6081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150320529937744, + "rewards/margins": 7.861429691314697, + "rewards/rejected": -4.711108684539795, + "step": 19797 + }, + { + "epoch": 4.95, + "grad_norm": 10.713807106018066, + "learning_rate": 2.1601474070148678e-09, + "logits/chosen": -0.5219260454177856, + "logits/rejected": -0.6011492609977722, + "logps/chosen": -55.697200775146484, + "logps/rejected": -106.57508087158203, + "loss": 0.7198, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.862445831298828, + "rewards/margins": 6.533734321594238, + "rewards/rejected": -3.6712887287139893, + "step": 19798 + }, + { + "epoch": 4.95, + "grad_norm": 3.1768743991851807, + "learning_rate": 2.1371076409421042e-09, + "logits/chosen": -0.5546143054962158, + "logits/rejected": -0.6658884286880493, + "logps/chosen": -60.836334228515625, + "logps/rejected": -92.1947021484375, + "loss": 0.5397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.348240852355957, + "rewards/margins": 6.98561429977417, + "rewards/rejected": -3.637373685836792, + "step": 19799 + }, + { + "epoch": 4.95, + "grad_norm": 3.769994020462036, + "learning_rate": 2.114191377377006e-09, + "logits/chosen": -0.5239558219909668, + "logits/rejected": -0.5791966915130615, + "logps/chosen": -58.49212646484375, + "logps/rejected": -109.42374420166016, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6508798599243164, + "rewards/margins": 6.994487762451172, + "rewards/rejected": -3.3436074256896973, + "step": 19800 + }, + { + "epoch": 4.95, + "grad_norm": 5.70306396484375, + "learning_rate": 2.091398616886897e-09, + "logits/chosen": -0.5886918306350708, + "logits/rejected": -0.6936978101730347, + "logps/chosen": -49.25993728637695, + "logps/rejected": -111.01072692871094, + "loss": 0.614, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.224202871322632, + "rewards/margins": 8.260902404785156, + "rewards/rejected": -5.0366997718811035, + "step": 19801 + }, + { + "epoch": 4.95, + "grad_norm": 2.5446746349334717, + "learning_rate": 2.0687293600335502e-09, + "logits/chosen": -0.534635603427887, + "logits/rejected": -0.6457034945487976, + "logps/chosen": -46.692813873291016, + "logps/rejected": -107.55661010742188, + "loss": 0.5573, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4684386253356934, + "rewards/margins": 8.25346565246582, + "rewards/rejected": -4.785026550292969, + "step": 19802 + }, + { + "epoch": 4.95, + "grad_norm": 6.441636562347412, + "learning_rate": 2.0461836073787377e-09, + "logits/chosen": -0.6070055961608887, + "logits/rejected": -0.6799432039260864, + "logps/chosen": -49.72425079345703, + "logps/rejected": -118.6282730102539, + "loss": 0.6174, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.060004711151123, + "rewards/margins": 8.785082817077637, + "rewards/rejected": -5.725078105926514, + "step": 19803 + }, + { + "epoch": 4.95, + "grad_norm": 3.0561485290527344, + "learning_rate": 2.023761359478127e-09, + "logits/chosen": -0.5448007583618164, + "logits/rejected": -0.5830382108688354, + "logps/chosen": -47.44181823730469, + "logps/rejected": -134.49041748046875, + "loss": 0.5734, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.208050489425659, + "rewards/margins": 8.811384201049805, + "rewards/rejected": -5.603334426879883, + "step": 19804 + }, + { + "epoch": 4.95, + "grad_norm": 5.589060306549072, + "learning_rate": 2.001462616886829e-09, + "logits/chosen": -0.564264178276062, + "logits/rejected": -0.6665352582931519, + "logps/chosen": -64.55439758300781, + "logps/rejected": -96.06861877441406, + "loss": 0.7289, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.095301389694214, + "rewards/margins": 6.698673725128174, + "rewards/rejected": -3.60337233543396, + "step": 19805 + }, + { + "epoch": 4.95, + "grad_norm": 3.3555850982666016, + "learning_rate": 1.9792873801549595e-09, + "logits/chosen": -0.6475730538368225, + "logits/rejected": -0.7217293977737427, + "logps/chosen": -48.35047149658203, + "logps/rejected": -102.72946166992188, + "loss": 0.662, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1798806190490723, + "rewards/margins": 7.262549877166748, + "rewards/rejected": -4.082669258117676, + "step": 19806 + }, + { + "epoch": 4.95, + "grad_norm": 5.7923970222473145, + "learning_rate": 1.9572356498315236e-09, + "logits/chosen": -0.5251909494400024, + "logits/rejected": -0.5972862839698792, + "logps/chosen": -58.05103302001953, + "logps/rejected": -107.84236907958984, + "loss": 0.6429, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4618425369262695, + "rewards/margins": 7.6930389404296875, + "rewards/rejected": -4.231196403503418, + "step": 19807 + }, + { + "epoch": 4.96, + "grad_norm": 5.9893798828125, + "learning_rate": 1.935307426461086e-09, + "logits/chosen": -0.541962742805481, + "logits/rejected": -0.6739461421966553, + "logps/chosen": -70.67938232421875, + "logps/rejected": -85.20800018310547, + "loss": 0.6894, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.303049087524414, + "rewards/margins": 6.5011491775512695, + "rewards/rejected": -3.1981000900268555, + "step": 19808 + }, + { + "epoch": 4.96, + "grad_norm": 4.2601542472839355, + "learning_rate": 1.9135027105848803e-09, + "logits/chosen": -0.4686717092990875, + "logits/rejected": -0.5456106662750244, + "logps/chosen": -69.36396789550781, + "logps/rejected": -122.05819702148438, + "loss": 0.6251, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.997366189956665, + "rewards/margins": 7.189024925231934, + "rewards/rejected": -4.1916584968566895, + "step": 19809 + }, + { + "epoch": 4.96, + "grad_norm": 5.568464279174805, + "learning_rate": 1.8918215027424746e-09, + "logits/chosen": -0.5627782940864563, + "logits/rejected": -0.6600056290626526, + "logps/chosen": -62.90335464477539, + "logps/rejected": -112.73582458496094, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9011969566345215, + "rewards/margins": 7.023924827575684, + "rewards/rejected": -4.122727394104004, + "step": 19810 + }, + { + "epoch": 4.96, + "grad_norm": 3.2365081310272217, + "learning_rate": 1.870263803469552e-09, + "logits/chosen": -0.5195237994194031, + "logits/rejected": -0.5846471786499023, + "logps/chosen": -58.61308670043945, + "logps/rejected": -117.86293029785156, + "loss": 0.6221, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.556762456893921, + "rewards/margins": 7.494791507720947, + "rewards/rejected": -3.9380288124084473, + "step": 19811 + }, + { + "epoch": 4.96, + "grad_norm": 3.4407880306243896, + "learning_rate": 1.848829613298464e-09, + "logits/chosen": -0.5409756302833557, + "logits/rejected": -0.6283331513404846, + "logps/chosen": -54.29286193847656, + "logps/rejected": -109.38678741455078, + "loss": 0.5868, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.231304168701172, + "rewards/margins": 7.518004894256592, + "rewards/rejected": -4.286700248718262, + "step": 19812 + }, + { + "epoch": 4.96, + "grad_norm": 3.656346559524536, + "learning_rate": 1.8275189327587872e-09, + "logits/chosen": -0.5888037085533142, + "logits/rejected": -0.6661322116851807, + "logps/chosen": -46.827056884765625, + "logps/rejected": -110.43424987792969, + "loss": 0.5811, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1694893836975098, + "rewards/margins": 8.093758583068848, + "rewards/rejected": -4.924269676208496, + "step": 19813 + }, + { + "epoch": 4.96, + "grad_norm": 9.389724731445312, + "learning_rate": 1.8063317623778776e-09, + "logits/chosen": -0.47334444522857666, + "logits/rejected": -0.5503845810890198, + "logps/chosen": -58.983375549316406, + "logps/rejected": -117.3244400024414, + "loss": 0.611, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.842989921569824, + "rewards/margins": 7.116178035736084, + "rewards/rejected": -4.273188591003418, + "step": 19814 + }, + { + "epoch": 4.96, + "grad_norm": 4.168586254119873, + "learning_rate": 1.78526810267865e-09, + "logits/chosen": -0.552742063999176, + "logits/rejected": -0.6948091983795166, + "logps/chosen": -58.573265075683594, + "logps/rejected": -112.548095703125, + "loss": 0.6317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2203686237335205, + "rewards/margins": 9.090465545654297, + "rewards/rejected": -5.87009859085083, + "step": 19815 + }, + { + "epoch": 4.96, + "grad_norm": 2.442728281021118, + "learning_rate": 1.7643279541817992e-09, + "logits/chosen": -0.5508660078048706, + "logits/rejected": -0.6493101119995117, + "logps/chosen": -56.71300506591797, + "logps/rejected": -96.97804260253906, + "loss": 0.6096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.247065544128418, + "rewards/margins": 7.530496597290039, + "rewards/rejected": -4.283431053161621, + "step": 19816 + }, + { + "epoch": 4.96, + "grad_norm": 9.9655122756958, + "learning_rate": 1.7435113174046892e-09, + "logits/chosen": -0.5713045597076416, + "logits/rejected": -0.6331756711006165, + "logps/chosen": -59.93883514404297, + "logps/rejected": -131.60536193847656, + "loss": 0.6526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2256531715393066, + "rewards/margins": 8.625359535217285, + "rewards/rejected": -5.399706840515137, + "step": 19817 + }, + { + "epoch": 4.96, + "grad_norm": 6.883240222930908, + "learning_rate": 1.722818192861908e-09, + "logits/chosen": -0.6927748918533325, + "logits/rejected": -0.7207052707672119, + "logps/chosen": -40.91912841796875, + "logps/rejected": -115.07952880859375, + "loss": 0.6146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0851409435272217, + "rewards/margins": 7.737427234649658, + "rewards/rejected": -4.652285575866699, + "step": 19818 + }, + { + "epoch": 4.96, + "grad_norm": 5.701238632202148, + "learning_rate": 1.7022485810647139e-09, + "logits/chosen": -0.5576188564300537, + "logits/rejected": -0.6289898753166199, + "logps/chosen": -57.72484588623047, + "logps/rejected": -111.94993591308594, + "loss": 0.6629, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.122363567352295, + "rewards/margins": 6.111090660095215, + "rewards/rejected": -2.98872709274292, + "step": 19819 + }, + { + "epoch": 4.96, + "grad_norm": 8.297994613647461, + "learning_rate": 1.6818024825215883e-09, + "logits/chosen": -0.6075812578201294, + "logits/rejected": -0.6467288732528687, + "logps/chosen": -47.261558532714844, + "logps/rejected": -102.04570007324219, + "loss": 0.6693, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.946990966796875, + "rewards/margins": 6.166355133056641, + "rewards/rejected": -3.219364643096924, + "step": 19820 + }, + { + "epoch": 4.96, + "grad_norm": 2.674145460128784, + "learning_rate": 1.6614798977371282e-09, + "logits/chosen": -0.6060521602630615, + "logits/rejected": -0.6677846312522888, + "logps/chosen": -48.74251174926758, + "logps/rejected": -119.22942352294922, + "loss": 0.5541, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.16763973236084, + "rewards/margins": 8.727743148803711, + "rewards/rejected": -5.560103416442871, + "step": 19821 + }, + { + "epoch": 4.96, + "grad_norm": 3.1004045009613037, + "learning_rate": 1.6412808272142645e-09, + "logits/chosen": -0.5505660176277161, + "logits/rejected": -0.5970171093940735, + "logps/chosen": -58.41398620605469, + "logps/rejected": -110.8175277709961, + "loss": 0.5909, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.314056158065796, + "rewards/margins": 7.356452941894531, + "rewards/rejected": -4.0423970222473145, + "step": 19822 + }, + { + "epoch": 4.96, + "grad_norm": 3.0800681114196777, + "learning_rate": 1.621205271451487e-09, + "logits/chosen": -0.5413355827331543, + "logits/rejected": -0.6532415151596069, + "logps/chosen": -54.38032531738281, + "logps/rejected": -105.14450073242188, + "loss": 0.5356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.311750888824463, + "rewards/margins": 8.140510559082031, + "rewards/rejected": -4.82875919342041, + "step": 19823 + }, + { + "epoch": 4.96, + "grad_norm": 11.601239204406738, + "learning_rate": 1.6012532309456208e-09, + "logits/chosen": -0.4731273055076599, + "logits/rejected": -0.58387690782547, + "logps/chosen": -49.83275604248047, + "logps/rejected": -102.36326599121094, + "loss": 0.6047, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1279335021972656, + "rewards/margins": 7.5271430015563965, + "rewards/rejected": -4.399209499359131, + "step": 19824 + }, + { + "epoch": 4.96, + "grad_norm": 3.4507009983062744, + "learning_rate": 1.5814247061890497e-09, + "logits/chosen": -0.6547719240188599, + "logits/rejected": -0.7289599180221558, + "logps/chosen": -49.51873779296875, + "logps/rejected": -112.90052795410156, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8762049674987793, + "rewards/margins": 7.426809310913086, + "rewards/rejected": -4.550604343414307, + "step": 19825 + }, + { + "epoch": 4.96, + "grad_norm": 3.8174991607666016, + "learning_rate": 1.5617196976724924e-09, + "logits/chosen": -0.5796053409576416, + "logits/rejected": -0.6928518414497375, + "logps/chosen": -66.86719512939453, + "logps/rejected": -95.5582275390625, + "loss": 0.7054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.910733699798584, + "rewards/margins": 6.419365882873535, + "rewards/rejected": -3.508631944656372, + "step": 19826 + }, + { + "epoch": 4.96, + "grad_norm": 5.0673627853393555, + "learning_rate": 1.5421382058816713e-09, + "logits/chosen": -0.5534445643424988, + "logits/rejected": -0.5575730800628662, + "logps/chosen": -54.517452239990234, + "logps/rejected": -95.52750396728516, + "loss": 0.7013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3787853717803955, + "rewards/margins": 5.937736988067627, + "rewards/rejected": -2.5589513778686523, + "step": 19827 + }, + { + "epoch": 4.96, + "grad_norm": 5.997339248657227, + "learning_rate": 1.5226802313017541e-09, + "logits/chosen": -0.6653209924697876, + "logits/rejected": -0.7490743398666382, + "logps/chosen": -53.85760498046875, + "logps/rejected": -108.80045318603516, + "loss": 0.6071, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1824049949645996, + "rewards/margins": 7.676621913909912, + "rewards/rejected": -4.4942169189453125, + "step": 19828 + }, + { + "epoch": 4.96, + "grad_norm": 5.449631214141846, + "learning_rate": 1.5033457744129122e-09, + "logits/chosen": -0.5871320366859436, + "logits/rejected": -0.6486937403678894, + "logps/chosen": -53.062469482421875, + "logps/rejected": -143.16891479492188, + "loss": 0.6523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9956207275390625, + "rewards/margins": 8.564024925231934, + "rewards/rejected": -5.568404674530029, + "step": 19829 + }, + { + "epoch": 4.96, + "grad_norm": 6.070224285125732, + "learning_rate": 1.4841348356930963e-09, + "logits/chosen": -0.6855615973472595, + "logits/rejected": -0.7405474185943604, + "logps/chosen": -52.98001480102539, + "logps/rejected": -93.7535629272461, + "loss": 0.6678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.194037914276123, + "rewards/margins": 6.755278587341309, + "rewards/rejected": -3.5612404346466064, + "step": 19830 + }, + { + "epoch": 4.96, + "grad_norm": 4.093838214874268, + "learning_rate": 1.465047415616927e-09, + "logits/chosen": -0.5196999311447144, + "logits/rejected": -0.6090481281280518, + "logps/chosen": -51.47747039794922, + "logps/rejected": -103.6357192993164, + "loss": 0.6211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1575498580932617, + "rewards/margins": 6.845931529998779, + "rewards/rejected": -3.6883816719055176, + "step": 19831 + }, + { + "epoch": 4.96, + "grad_norm": 5.0454230308532715, + "learning_rate": 1.446083514656249e-09, + "logits/chosen": -0.6242619752883911, + "logits/rejected": -0.6888669729232788, + "logps/chosen": -54.625247955322266, + "logps/rejected": -95.39860534667969, + "loss": 0.6232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.534376382827759, + "rewards/margins": 6.964751243591309, + "rewards/rejected": -3.430375099182129, + "step": 19832 + }, + { + "epoch": 4.96, + "grad_norm": 13.273096084594727, + "learning_rate": 1.4272431332790215e-09, + "logits/chosen": -0.622597336769104, + "logits/rejected": -0.7078061103820801, + "logps/chosen": -53.55506134033203, + "logps/rejected": -109.52558898925781, + "loss": 0.6881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9921653270721436, + "rewards/margins": 6.403517246246338, + "rewards/rejected": -3.4113521575927734, + "step": 19833 + }, + { + "epoch": 4.96, + "grad_norm": 5.429737567901611, + "learning_rate": 1.4085262719520931e-09, + "logits/chosen": -0.5917525291442871, + "logits/rejected": -0.6968413591384888, + "logps/chosen": -65.85441589355469, + "logps/rejected": -100.58719635009766, + "loss": 0.6922, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8479201793670654, + "rewards/margins": 7.287614822387695, + "rewards/rejected": -4.439695358276367, + "step": 19834 + }, + { + "epoch": 4.96, + "grad_norm": 5.2715067863464355, + "learning_rate": 1.3899329311367614e-09, + "logits/chosen": -0.5412739515304565, + "logits/rejected": -0.5953178405761719, + "logps/chosen": -57.97743606567383, + "logps/rejected": -109.57194519042969, + "loss": 0.6493, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.103166341781616, + "rewards/margins": 6.694154262542725, + "rewards/rejected": -3.5909879207611084, + "step": 19835 + }, + { + "epoch": 4.96, + "grad_norm": 3.103013515472412, + "learning_rate": 1.371463111292659e-09, + "logits/chosen": -0.5542821884155273, + "logits/rejected": -0.6343944072723389, + "logps/chosen": -55.1812744140625, + "logps/rejected": -123.7945556640625, + "loss": 0.5711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.128749132156372, + "rewards/margins": 8.686809539794922, + "rewards/rejected": -5.5580596923828125, + "step": 19836 + }, + { + "epoch": 4.96, + "grad_norm": 2.795379161834717, + "learning_rate": 1.3531168128771977e-09, + "logits/chosen": -0.6143383979797363, + "logits/rejected": -0.6491603255271912, + "logps/chosen": -50.989524841308594, + "logps/rejected": -118.95917510986328, + "loss": 0.5551, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1590912342071533, + "rewards/margins": 7.063241004943848, + "rewards/rejected": -3.904149293899536, + "step": 19837 + }, + { + "epoch": 4.96, + "grad_norm": 8.488746643066406, + "learning_rate": 1.3348940363427931e-09, + "logits/chosen": -0.49308913946151733, + "logits/rejected": -0.5931885242462158, + "logps/chosen": -66.8525619506836, + "logps/rejected": -117.25347900390625, + "loss": 0.6553, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.809537887573242, + "rewards/margins": 8.086703300476074, + "rewards/rejected": -5.27716588973999, + "step": 19838 + }, + { + "epoch": 4.96, + "grad_norm": 8.00450611114502, + "learning_rate": 1.316794782139641e-09, + "logits/chosen": -0.577162504196167, + "logits/rejected": -0.625481903553009, + "logps/chosen": -62.93809127807617, + "logps/rejected": -124.52306365966797, + "loss": 0.6642, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0038843154907227, + "rewards/margins": 6.884810924530029, + "rewards/rejected": -3.8809266090393066, + "step": 19839 + }, + { + "epoch": 4.96, + "grad_norm": 5.797819137573242, + "learning_rate": 1.298819050715161e-09, + "logits/chosen": -0.5363590121269226, + "logits/rejected": -0.5640401840209961, + "logps/chosen": -52.466102600097656, + "logps/rejected": -108.16131591796875, + "loss": 0.6515, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9599578380584717, + "rewards/margins": 5.9938459396362305, + "rewards/rejected": -3.033888101577759, + "step": 19840 + }, + { + "epoch": 4.96, + "grad_norm": 7.526710033416748, + "learning_rate": 1.2809668425145527e-09, + "logits/chosen": -0.5489780902862549, + "logits/rejected": -0.5707771182060242, + "logps/chosen": -46.20078659057617, + "logps/rejected": -107.90558624267578, + "loss": 0.6003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3573319911956787, + "rewards/margins": 6.863254070281982, + "rewards/rejected": -3.5059220790863037, + "step": 19841 + }, + { + "epoch": 4.96, + "grad_norm": 5.1079020500183105, + "learning_rate": 1.2632381579774644e-09, + "logits/chosen": -0.5635629296302795, + "logits/rejected": -0.6347926259040833, + "logps/chosen": -60.280250549316406, + "logps/rejected": -118.00114440917969, + "loss": 0.6378, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.324474811553955, + "rewards/margins": 8.337145805358887, + "rewards/rejected": -5.012670516967773, + "step": 19842 + }, + { + "epoch": 4.96, + "grad_norm": 7.018940448760986, + "learning_rate": 1.2456329975429893e-09, + "logits/chosen": -0.5579298138618469, + "logits/rejected": -0.6386547088623047, + "logps/chosen": -45.547454833984375, + "logps/rejected": -112.5875244140625, + "loss": 0.4959, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.267772674560547, + "rewards/margins": 7.632861137390137, + "rewards/rejected": -4.365088939666748, + "step": 19843 + }, + { + "epoch": 4.96, + "grad_norm": 4.25345516204834, + "learning_rate": 1.2281513616452246e-09, + "logits/chosen": -0.4979967176914215, + "logits/rejected": -0.5839084386825562, + "logps/chosen": -62.17555618286133, + "logps/rejected": -132.6396942138672, + "loss": 0.635, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0464847087860107, + "rewards/margins": 8.532025337219238, + "rewards/rejected": -5.485540390014648, + "step": 19844 + }, + { + "epoch": 4.96, + "grad_norm": 4.966007232666016, + "learning_rate": 1.2107932507177123e-09, + "logits/chosen": -0.5649453997612, + "logits/rejected": -0.6043322682380676, + "logps/chosen": -55.012550354003906, + "logps/rejected": -110.12279510498047, + "loss": 0.6169, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.227566719055176, + "rewards/margins": 6.893484115600586, + "rewards/rejected": -3.66591739654541, + "step": 19845 + }, + { + "epoch": 4.96, + "grad_norm": 3.1733078956604004, + "learning_rate": 1.1935586651878884e-09, + "logits/chosen": -0.5545727610588074, + "logits/rejected": -0.6620821952819824, + "logps/chosen": -57.95759582519531, + "logps/rejected": -90.54226684570312, + "loss": 0.6523, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8299074172973633, + "rewards/margins": 7.00537109375, + "rewards/rejected": -4.1754631996154785, + "step": 19846 + }, + { + "epoch": 4.96, + "grad_norm": 9.78915786743164, + "learning_rate": 1.1764476054820784e-09, + "logits/chosen": -0.5464868545532227, + "logits/rejected": -0.5843673348426819, + "logps/chosen": -65.83232116699219, + "logps/rejected": -90.63313293457031, + "loss": 0.7056, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9148590564727783, + "rewards/margins": 5.627830982208252, + "rewards/rejected": -2.7129716873168945, + "step": 19847 + }, + { + "epoch": 4.97, + "grad_norm": 10.264082908630371, + "learning_rate": 1.1594600720232773e-09, + "logits/chosen": -0.5183187127113342, + "logits/rejected": -0.6100345849990845, + "logps/chosen": -55.77839279174805, + "logps/rejected": -111.49089813232422, + "loss": 0.6146, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1629395484924316, + "rewards/margins": 7.653012752532959, + "rewards/rejected": -4.490072727203369, + "step": 19848 + }, + { + "epoch": 4.97, + "grad_norm": 5.76395845413208, + "learning_rate": 1.1425960652317048e-09, + "logits/chosen": -0.5869452953338623, + "logits/rejected": -0.6608116626739502, + "logps/chosen": -45.11589813232422, + "logps/rejected": -95.93050384521484, + "loss": 0.6186, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1645264625549316, + "rewards/margins": 6.368684768676758, + "rewards/rejected": -3.2041590213775635, + "step": 19849 + }, + { + "epoch": 4.97, + "grad_norm": 2.8602166175842285, + "learning_rate": 1.125855585523139e-09, + "logits/chosen": -0.6038975119590759, + "logits/rejected": -0.6835674047470093, + "logps/chosen": -48.542381286621094, + "logps/rejected": -118.04682922363281, + "loss": 0.5607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.187783718109131, + "rewards/margins": 8.924221992492676, + "rewards/rejected": -5.736438274383545, + "step": 19850 + }, + { + "epoch": 4.97, + "grad_norm": 6.603260517120361, + "learning_rate": 1.1092386333116934e-09, + "logits/chosen": -0.5173349976539612, + "logits/rejected": -0.5947127342224121, + "logps/chosen": -59.031925201416016, + "logps/rejected": -104.06731414794922, + "loss": 0.6562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0080246925354004, + "rewards/margins": 6.559000015258789, + "rewards/rejected": -3.550975799560547, + "step": 19851 + }, + { + "epoch": 4.97, + "grad_norm": 5.428524017333984, + "learning_rate": 1.0927452090081502e-09, + "logits/chosen": -0.5154133439064026, + "logits/rejected": -0.5826588869094849, + "logps/chosen": -63.75000762939453, + "logps/rejected": -117.41293334960938, + "loss": 0.7113, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8327078819274902, + "rewards/margins": 6.927458763122559, + "rewards/rejected": -4.09475040435791, + "step": 19852 + }, + { + "epoch": 4.97, + "grad_norm": 4.046956539154053, + "learning_rate": 1.0763753130199617e-09, + "logits/chosen": -0.5514431595802307, + "logits/rejected": -0.589275062084198, + "logps/chosen": -56.96815872192383, + "logps/rejected": -127.75117492675781, + "loss": 0.7054, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0106401443481445, + "rewards/margins": 7.836484432220459, + "rewards/rejected": -4.825845241546631, + "step": 19853 + }, + { + "epoch": 4.97, + "grad_norm": 5.416579723358154, + "learning_rate": 1.0601289457518039e-09, + "logits/chosen": -0.5605531334877014, + "logits/rejected": -0.611355721950531, + "logps/chosen": -64.8061752319336, + "logps/rejected": -114.75513458251953, + "loss": 0.6796, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.951615333557129, + "rewards/margins": 7.489300727844238, + "rewards/rejected": -4.537685871124268, + "step": 19854 + }, + { + "epoch": 4.97, + "grad_norm": 4.966912269592285, + "learning_rate": 1.0440061076050222e-09, + "logits/chosen": -0.5070372819900513, + "logits/rejected": -0.619226336479187, + "logps/chosen": -60.03369140625, + "logps/rejected": -112.03596496582031, + "loss": 0.6606, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.238997459411621, + "rewards/margins": 7.980087757110596, + "rewards/rejected": -4.741090774536133, + "step": 19855 + }, + { + "epoch": 4.97, + "grad_norm": 4.303445339202881, + "learning_rate": 1.0280067989776321e-09, + "logits/chosen": -0.5384406447410583, + "logits/rejected": -0.6179929375648499, + "logps/chosen": -54.05225372314453, + "logps/rejected": -100.51165771484375, + "loss": 0.5994, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9775550365448, + "rewards/margins": 6.335801601409912, + "rewards/rejected": -3.358245849609375, + "step": 19856 + }, + { + "epoch": 4.97, + "grad_norm": 3.6458346843719482, + "learning_rate": 1.0121310202654277e-09, + "logits/chosen": -0.571911096572876, + "logits/rejected": -0.6418116092681885, + "logps/chosen": -48.89972686767578, + "logps/rejected": -104.3711166381836, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.967461109161377, + "rewards/margins": 7.977512836456299, + "rewards/rejected": -5.01005220413208, + "step": 19857 + }, + { + "epoch": 4.97, + "grad_norm": 6.028194427490234, + "learning_rate": 9.96378771860873e-10, + "logits/chosen": -0.5080538988113403, + "logits/rejected": -0.565390944480896, + "logps/chosen": -49.5313720703125, + "logps/rejected": -101.52597045898438, + "loss": 0.6286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1779143810272217, + "rewards/margins": 6.264036178588867, + "rewards/rejected": -3.0861217975616455, + "step": 19858 + }, + { + "epoch": 4.97, + "grad_norm": 4.8242621421813965, + "learning_rate": 9.807500541525461e-10, + "logits/chosen": -0.5574628114700317, + "logits/rejected": -0.656308114528656, + "logps/chosen": -49.12934494018555, + "logps/rejected": -91.29463958740234, + "loss": 0.6447, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0799365043640137, + "rewards/margins": 6.753033638000488, + "rewards/rejected": -3.673096179962158, + "step": 19859 + }, + { + "epoch": 4.97, + "grad_norm": 4.674002170562744, + "learning_rate": 9.652448675273595e-10, + "logits/chosen": -0.4979713559150696, + "logits/rejected": -0.5905092358589172, + "logps/chosen": -60.10577392578125, + "logps/rejected": -85.87647247314453, + "loss": 0.6633, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1831412315368652, + "rewards/margins": 5.855477333068848, + "rewards/rejected": -2.6723361015319824, + "step": 19860 + }, + { + "epoch": 4.97, + "grad_norm": 14.28793716430664, + "learning_rate": 9.498632123683404e-10, + "logits/chosen": -0.5087887644767761, + "logits/rejected": -0.5771489143371582, + "logps/chosen": -46.83386993408203, + "logps/rejected": -100.48189544677734, + "loss": 0.5188, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3095028400421143, + "rewards/margins": 7.092670917510986, + "rewards/rejected": -3.783168315887451, + "step": 19861 + }, + { + "epoch": 4.97, + "grad_norm": 30.817535400390625, + "learning_rate": 9.34605089055185e-10, + "logits/chosen": -0.512291431427002, + "logits/rejected": -0.5374229550361633, + "logps/chosen": -52.334999084472656, + "logps/rejected": -123.96214294433594, + "loss": 0.6577, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7855124473571777, + "rewards/margins": 8.09304428100586, + "rewards/rejected": -5.30753231048584, + "step": 19862 + }, + { + "epoch": 4.97, + "grad_norm": 3.6710715293884277, + "learning_rate": 9.194704979648139e-10, + "logits/chosen": -0.6018936634063721, + "logits/rejected": -0.6394881010055542, + "logps/chosen": -47.75745391845703, + "logps/rejected": -119.92389678955078, + "loss": 0.6505, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2541565895080566, + "rewards/margins": 8.43358039855957, + "rewards/rejected": -5.179423809051514, + "step": 19863 + }, + { + "epoch": 4.97, + "grad_norm": 14.116366386413574, + "learning_rate": 9.044594394724826e-10, + "logits/chosen": -0.6530435681343079, + "logits/rejected": -0.7212640047073364, + "logps/chosen": -57.61417007446289, + "logps/rejected": -108.90237426757812, + "loss": 0.675, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0009098052978516, + "rewards/margins": 6.9773101806640625, + "rewards/rejected": -3.976400136947632, + "step": 19864 + }, + { + "epoch": 4.97, + "grad_norm": 4.78614616394043, + "learning_rate": 8.895719139473402e-10, + "logits/chosen": -0.6770433187484741, + "logits/rejected": -0.7713785767555237, + "logps/chosen": -47.71039581298828, + "logps/rejected": -86.7966537475586, + "loss": 0.616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.488719940185547, + "rewards/margins": 6.493314266204834, + "rewards/rejected": -3.004594087600708, + "step": 19865 + }, + { + "epoch": 4.97, + "grad_norm": 3.2665908336639404, + "learning_rate": 8.748079217585359e-10, + "logits/chosen": -0.575088620185852, + "logits/rejected": -0.6560670733451843, + "logps/chosen": -52.41350555419922, + "logps/rejected": -102.71837615966797, + "loss": 0.5604, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.306485176086426, + "rewards/margins": 7.183191299438477, + "rewards/rejected": -3.87670636177063, + "step": 19866 + }, + { + "epoch": 4.97, + "grad_norm": 6.239301681518555, + "learning_rate": 8.601674632707779e-10, + "logits/chosen": -0.6207693815231323, + "logits/rejected": -0.6866181492805481, + "logps/chosen": -55.15681457519531, + "logps/rejected": -107.92237854003906, + "loss": 0.8127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.169804096221924, + "rewards/margins": 7.526472568511963, + "rewards/rejected": -4.356668472290039, + "step": 19867 + }, + { + "epoch": 4.97, + "grad_norm": 14.774458885192871, + "learning_rate": 8.456505388454439e-10, + "logits/chosen": -0.4999987483024597, + "logits/rejected": -0.5463083982467651, + "logps/chosen": -60.52007293701172, + "logps/rejected": -121.01605224609375, + "loss": 0.6755, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9555118083953857, + "rewards/margins": 7.721877574920654, + "rewards/rejected": -4.766366481781006, + "step": 19868 + }, + { + "epoch": 4.97, + "grad_norm": 5.401932716369629, + "learning_rate": 8.312571488411359e-10, + "logits/chosen": -0.5202698111534119, + "logits/rejected": -0.6139037013053894, + "logps/chosen": -56.126956939697266, + "logps/rejected": -102.08522033691406, + "loss": 0.6007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.373678207397461, + "rewards/margins": 6.631307125091553, + "rewards/rejected": -3.257628917694092, + "step": 19869 + }, + { + "epoch": 4.97, + "grad_norm": 4.031489372253418, + "learning_rate": 8.169872936142353e-10, + "logits/chosen": -0.5514240264892578, + "logits/rejected": -0.6471644043922424, + "logps/chosen": -52.079612731933594, + "logps/rejected": -93.76229858398438, + "loss": 0.6321, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5651161670684814, + "rewards/margins": 7.119429588317871, + "rewards/rejected": -3.5543131828308105, + "step": 19870 + }, + { + "epoch": 4.97, + "grad_norm": 5.817584991455078, + "learning_rate": 8.028409735166831e-10, + "logits/chosen": -0.6180600523948669, + "logits/rejected": -0.6992729306221008, + "logps/chosen": -47.245277404785156, + "logps/rejected": -106.62698364257812, + "loss": 0.5887, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1306066513061523, + "rewards/margins": 7.118572235107422, + "rewards/rejected": -3.9879655838012695, + "step": 19871 + }, + { + "epoch": 4.97, + "grad_norm": 4.188995361328125, + "learning_rate": 7.888181888981994e-10, + "logits/chosen": -0.5069684982299805, + "logits/rejected": -0.6315828561782837, + "logps/chosen": -51.101741790771484, + "logps/rejected": -116.1944351196289, + "loss": 0.6268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0802838802337646, + "rewards/margins": 8.542405128479004, + "rewards/rejected": -5.46212100982666, + "step": 19872 + }, + { + "epoch": 4.97, + "grad_norm": 4.434815406799316, + "learning_rate": 7.749189401057289e-10, + "logits/chosen": -0.5509288907051086, + "logits/rejected": -0.6349955201148987, + "logps/chosen": -54.27996063232422, + "logps/rejected": -111.23628997802734, + "loss": 0.6572, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.067730188369751, + "rewards/margins": 7.297362804412842, + "rewards/rejected": -4.22963285446167, + "step": 19873 + }, + { + "epoch": 4.97, + "grad_norm": 2.2937867641448975, + "learning_rate": 7.611432274823305e-10, + "logits/chosen": -0.4526463747024536, + "logits/rejected": -0.5418518781661987, + "logps/chosen": -62.503414154052734, + "logps/rejected": -109.04116821289062, + "loss": 0.5497, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.057356357574463, + "rewards/margins": 7.888943195343018, + "rewards/rejected": -4.831587314605713, + "step": 19874 + }, + { + "epoch": 4.97, + "grad_norm": 4.024933815002441, + "learning_rate": 7.474910513688427e-10, + "logits/chosen": -0.600834846496582, + "logits/rejected": -0.6697917580604553, + "logps/chosen": -42.22846603393555, + "logps/rejected": -91.1606674194336, + "loss": 0.5794, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.881681442260742, + "rewards/margins": 7.2603864669799805, + "rewards/rejected": -4.3787055015563965, + "step": 19875 + }, + { + "epoch": 4.97, + "grad_norm": 5.406824111938477, + "learning_rate": 7.339624121016631e-10, + "logits/chosen": -0.6088191866874695, + "logits/rejected": -0.6969766020774841, + "logps/chosen": -42.071929931640625, + "logps/rejected": -108.52862548828125, + "loss": 0.5635, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.135533094406128, + "rewards/margins": 8.299946784973145, + "rewards/rejected": -5.164413928985596, + "step": 19876 + }, + { + "epoch": 4.97, + "grad_norm": 4.704145431518555, + "learning_rate": 7.20557310016079e-10, + "logits/chosen": -0.4866039752960205, + "logits/rejected": -0.6030546426773071, + "logps/chosen": -54.006622314453125, + "logps/rejected": -105.09233856201172, + "loss": 0.5785, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.156820774078369, + "rewards/margins": 7.757817268371582, + "rewards/rejected": -4.600996971130371, + "step": 19877 + }, + { + "epoch": 4.97, + "grad_norm": 4.9907050132751465, + "learning_rate": 7.072757454429368e-10, + "logits/chosen": -0.6068625450134277, + "logits/rejected": -0.7002316117286682, + "logps/chosen": -74.33052825927734, + "logps/rejected": -113.55935668945312, + "loss": 0.6451, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.8657424449920654, + "rewards/margins": 7.5591630935668945, + "rewards/rejected": -4.69342041015625, + "step": 19878 + }, + { + "epoch": 4.97, + "grad_norm": 7.473524570465088, + "learning_rate": 6.941177187103076e-10, + "logits/chosen": -0.5223965644836426, + "logits/rejected": -0.589815080165863, + "logps/chosen": -61.87982940673828, + "logps/rejected": -137.7632293701172, + "loss": 0.525, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8585314750671387, + "rewards/margins": 8.046418190002441, + "rewards/rejected": -5.187886714935303, + "step": 19879 + }, + { + "epoch": 4.97, + "grad_norm": 2.2210195064544678, + "learning_rate": 6.810832301440417e-10, + "logits/chosen": -0.45598000288009644, + "logits/rejected": -0.5075173377990723, + "logps/chosen": -51.47322463989258, + "logps/rejected": -103.03914642333984, + "loss": 0.5095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6936352252960205, + "rewards/margins": 7.150485515594482, + "rewards/rejected": -3.4568495750427246, + "step": 19880 + }, + { + "epoch": 4.97, + "grad_norm": 9.333656311035156, + "learning_rate": 6.681722800649937e-10, + "logits/chosen": -0.5900542140007019, + "logits/rejected": -0.6666340231895447, + "logps/chosen": -55.4291877746582, + "logps/rejected": -107.99923706054688, + "loss": 0.6932, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9651975631713867, + "rewards/margins": 6.960020542144775, + "rewards/rejected": -3.9948225021362305, + "step": 19881 + }, + { + "epoch": 4.97, + "grad_norm": 5.641752243041992, + "learning_rate": 6.553848687934627e-10, + "logits/chosen": -0.6039592027664185, + "logits/rejected": -0.6593528985977173, + "logps/chosen": -56.0911865234375, + "logps/rejected": -122.6322021484375, + "loss": 0.6344, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.768198013305664, + "rewards/margins": 8.405705451965332, + "rewards/rejected": -5.637507438659668, + "step": 19882 + }, + { + "epoch": 4.97, + "grad_norm": 4.098472595214844, + "learning_rate": 6.427209966447523e-10, + "logits/chosen": -0.5073369145393372, + "logits/rejected": -0.5366435050964355, + "logps/chosen": -48.74409103393555, + "logps/rejected": -125.73831939697266, + "loss": 0.5694, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.187617540359497, + "rewards/margins": 7.441644191741943, + "rewards/rejected": -4.254026412963867, + "step": 19883 + }, + { + "epoch": 4.97, + "grad_norm": 1.7071870565414429, + "learning_rate": 6.301806639319452e-10, + "logits/chosen": -0.550710141658783, + "logits/rejected": -0.6989730596542358, + "logps/chosen": -52.59979248046875, + "logps/rejected": -96.56146240234375, + "loss": 0.575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.075810670852661, + "rewards/margins": 7.751330375671387, + "rewards/rejected": -4.6755194664001465, + "step": 19884 + }, + { + "epoch": 4.97, + "grad_norm": 4.75673246383667, + "learning_rate": 6.177638709647937e-10, + "logits/chosen": -0.6027907133102417, + "logits/rejected": -0.6945378184318542, + "logps/chosen": -42.88047409057617, + "logps/rejected": -111.23139953613281, + "loss": 0.4922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.246537208557129, + "rewards/margins": 8.400753021240234, + "rewards/rejected": -5.154216289520264, + "step": 19885 + }, + { + "epoch": 4.97, + "grad_norm": 4.403570175170898, + "learning_rate": 6.054706180508296e-10, + "logits/chosen": -0.5810889005661011, + "logits/rejected": -0.7066733241081238, + "logps/chosen": -55.489803314208984, + "logps/rejected": -101.98291015625, + "loss": 0.6907, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.296475648880005, + "rewards/margins": 7.862130165100098, + "rewards/rejected": -4.565654754638672, + "step": 19886 + }, + { + "epoch": 4.97, + "grad_norm": 5.415849685668945, + "learning_rate": 5.933009054925886e-10, + "logits/chosen": -0.4991351068019867, + "logits/rejected": -0.5771622061729431, + "logps/chosen": -52.45003890991211, + "logps/rejected": -120.48400115966797, + "loss": 0.5971, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.217273712158203, + "rewards/margins": 8.321205139160156, + "rewards/rejected": -5.103930950164795, + "step": 19887 + }, + { + "epoch": 4.98, + "grad_norm": 3.6292967796325684, + "learning_rate": 5.812547335920516e-10, + "logits/chosen": -0.6054645776748657, + "logits/rejected": -0.7228720784187317, + "logps/chosen": -54.34970474243164, + "logps/rejected": -104.6443862915039, + "loss": 0.5268, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4475486278533936, + "rewards/margins": 7.947369575500488, + "rewards/rejected": -4.499821186065674, + "step": 19888 + }, + { + "epoch": 4.98, + "grad_norm": 3.340463876724243, + "learning_rate": 5.693321026462029e-10, + "logits/chosen": -0.5831320285797119, + "logits/rejected": -0.6324764490127563, + "logps/chosen": -45.95215606689453, + "logps/rejected": -109.45816040039062, + "loss": 0.5782, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3835458755493164, + "rewards/margins": 7.339420795440674, + "rewards/rejected": -3.9558753967285156, + "step": 19889 + }, + { + "epoch": 4.98, + "grad_norm": 3.0780205726623535, + "learning_rate": 5.575330129492518e-10, + "logits/chosen": -0.6635051965713501, + "logits/rejected": -0.7118870615959167, + "logps/chosen": -45.01559829711914, + "logps/rejected": -104.55386352539062, + "loss": 0.53, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9607479572296143, + "rewards/margins": 7.312909126281738, + "rewards/rejected": -4.352161407470703, + "step": 19890 + }, + { + "epoch": 4.98, + "grad_norm": 3.6261563301086426, + "learning_rate": 5.45857464793742e-10, + "logits/chosen": -0.5130528211593628, + "logits/rejected": -0.6207090616226196, + "logps/chosen": -52.032142639160156, + "logps/rejected": -117.0754623413086, + "loss": 0.5738, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.993594169616699, + "rewards/margins": 8.116628646850586, + "rewards/rejected": -5.1230340003967285, + "step": 19891 + }, + { + "epoch": 4.98, + "grad_norm": 5.577615737915039, + "learning_rate": 5.343054584677765e-10, + "logits/chosen": -0.6088643670082092, + "logits/rejected": -0.6354390382766724, + "logps/chosen": -47.68007278442383, + "logps/rejected": -122.74824523925781, + "loss": 0.5612, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948190689086914, + "rewards/margins": 7.739977836608887, + "rewards/rejected": -4.791787147521973, + "step": 19892 + }, + { + "epoch": 4.98, + "grad_norm": 2.893437623977661, + "learning_rate": 5.228769942566825e-10, + "logits/chosen": -0.488446980714798, + "logits/rejected": -0.6042317748069763, + "logps/chosen": -63.9565315246582, + "logps/rejected": -125.82128143310547, + "loss": 0.5731, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.815239191055298, + "rewards/margins": 8.300158500671387, + "rewards/rejected": -5.484919548034668, + "step": 19893 + }, + { + "epoch": 4.98, + "grad_norm": 3.2267520427703857, + "learning_rate": 5.115720724430118e-10, + "logits/chosen": -0.5413248538970947, + "logits/rejected": -0.6167610883712769, + "logps/chosen": -56.88432312011719, + "logps/rejected": -101.31927490234375, + "loss": 0.6414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0966358184814453, + "rewards/margins": 6.8512396812438965, + "rewards/rejected": -3.754603862762451, + "step": 19894 + }, + { + "epoch": 4.98, + "grad_norm": 8.378090858459473, + "learning_rate": 5.003906933065406e-10, + "logits/chosen": -0.5660449266433716, + "logits/rejected": -0.6669359803199768, + "logps/chosen": -56.243995666503906, + "logps/rejected": -122.07102966308594, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.35779070854187, + "rewards/margins": 8.697517395019531, + "rewards/rejected": -5.339726448059082, + "step": 19895 + }, + { + "epoch": 4.98, + "grad_norm": 6.314207553863525, + "learning_rate": 4.893328571226042e-10, + "logits/chosen": -0.5112165808677673, + "logits/rejected": -0.5892149209976196, + "logps/chosen": -53.53179168701172, + "logps/rejected": -143.92562866210938, + "loss": 0.5754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.039016008377075, + "rewards/margins": 9.097082138061523, + "rewards/rejected": -6.0580668449401855, + "step": 19896 + }, + { + "epoch": 4.98, + "grad_norm": 4.214936256408691, + "learning_rate": 4.783985641648725e-10, + "logits/chosen": -0.6117825508117676, + "logits/rejected": -0.6943532228469849, + "logps/chosen": -49.47953414916992, + "logps/rejected": -91.50041961669922, + "loss": 0.5515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930755853652954, + "rewards/margins": 7.603485107421875, + "rewards/rejected": -4.6727294921875, + "step": 19897 + }, + { + "epoch": 4.98, + "grad_norm": 14.497074127197266, + "learning_rate": 4.67587814703685e-10, + "logits/chosen": -0.5291433930397034, + "logits/rejected": -0.6119257807731628, + "logps/chosen": -50.184913635253906, + "logps/rejected": -115.14957427978516, + "loss": 0.6223, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1345291137695312, + "rewards/margins": 7.024590492248535, + "rewards/rejected": -3.890061378479004, + "step": 19898 + }, + { + "epoch": 4.98, + "grad_norm": 4.207740306854248, + "learning_rate": 4.5690060900660527e-10, + "logits/chosen": -0.6028007864952087, + "logits/rejected": -0.6537134051322937, + "logps/chosen": -50.76903533935547, + "logps/rejected": -136.038818359375, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2599666118621826, + "rewards/margins": 8.576578140258789, + "rewards/rejected": -5.3166117668151855, + "step": 19899 + }, + { + "epoch": 4.98, + "grad_norm": 4.527360916137695, + "learning_rate": 4.4633694733675625e-10, + "logits/chosen": -0.4963105618953705, + "logits/rejected": -0.5751968026161194, + "logps/chosen": -60.607704162597656, + "logps/rejected": -106.3408432006836, + "loss": 0.6683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.233206033706665, + "rewards/margins": 6.559929370880127, + "rewards/rejected": -3.3267228603363037, + "step": 19900 + }, + { + "epoch": 4.98, + "grad_norm": 69.21513366699219, + "learning_rate": 4.3589682995559545e-10, + "logits/chosen": -0.426755428314209, + "logits/rejected": -0.5630480051040649, + "logps/chosen": -74.01469421386719, + "logps/rejected": -103.13983917236328, + "loss": 0.8771, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.0135364532470703, + "rewards/margins": 6.412464141845703, + "rewards/rejected": -3.3989272117614746, + "step": 19901 + }, + { + "epoch": 4.98, + "grad_norm": 4.527782440185547, + "learning_rate": 4.2558025712180486e-10, + "logits/chosen": -0.5538458824157715, + "logits/rejected": -0.5971140265464783, + "logps/chosen": -51.46776580810547, + "logps/rejected": -123.0173110961914, + "loss": 0.6527, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.59330153465271, + "rewards/margins": 7.356274127960205, + "rewards/rejected": -4.762972831726074, + "step": 19902 + }, + { + "epoch": 4.98, + "grad_norm": 3.1706607341766357, + "learning_rate": 4.1538722908907037e-10, + "logits/chosen": -0.5546718239784241, + "logits/rejected": -0.6561688780784607, + "logps/chosen": -47.97529220581055, + "logps/rejected": -89.18870544433594, + "loss": 0.5426, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2769994735717773, + "rewards/margins": 6.659759044647217, + "rewards/rejected": -3.3827590942382812, + "step": 19903 + }, + { + "epoch": 4.98, + "grad_norm": 7.973115921020508, + "learning_rate": 4.053177461105229e-10, + "logits/chosen": -0.5185143947601318, + "logits/rejected": -0.5842137932777405, + "logps/chosen": -57.32162094116211, + "logps/rejected": -125.1339111328125, + "loss": 0.6777, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3459770679473877, + "rewards/margins": 8.46143913269043, + "rewards/rejected": -5.115461826324463, + "step": 19904 + }, + { + "epoch": 4.98, + "grad_norm": 5.881188869476318, + "learning_rate": 3.9537180843374214e-10, + "logits/chosen": -0.5009257793426514, + "logits/rejected": -0.5619878768920898, + "logps/chosen": -55.26806640625, + "logps/rejected": -97.8924789428711, + "loss": 0.6532, + "rewards/accuracies": 0.90625, + "rewards/chosen": 3.1477293968200684, + "rewards/margins": 6.669698238372803, + "rewards/rejected": -3.521967887878418, + "step": 19905 + }, + { + "epoch": 4.98, + "grad_norm": 5.834029197692871, + "learning_rate": 3.8554941630575274e-10, + "logits/chosen": -0.5740347504615784, + "logits/rejected": -0.6547947525978088, + "logps/chosen": -46.33343505859375, + "logps/rejected": -81.98699951171875, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9869866371154785, + "rewards/margins": 6.341402530670166, + "rewards/rejected": -3.3544156551361084, + "step": 19906 + }, + { + "epoch": 4.98, + "grad_norm": 5.87897253036499, + "learning_rate": 3.7585056996858327e-10, + "logits/chosen": -0.5350843667984009, + "logits/rejected": -0.6429880261421204, + "logps/chosen": -55.84193420410156, + "logps/rejected": -104.51630401611328, + "loss": 0.5428, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.013235330581665, + "rewards/margins": 7.744492530822754, + "rewards/rejected": -4.731257438659668, + "step": 19907 + }, + { + "epoch": 4.98, + "grad_norm": 8.783491134643555, + "learning_rate": 3.6627526966148685e-10, + "logits/chosen": -0.5268926620483398, + "logits/rejected": -0.5871848464012146, + "logps/chosen": -57.977203369140625, + "logps/rejected": -105.0418930053711, + "loss": 0.7677, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.116300106048584, + "rewards/margins": 6.74476957321167, + "rewards/rejected": -3.6284689903259277, + "step": 19908 + }, + { + "epoch": 4.98, + "grad_norm": 7.069415092468262, + "learning_rate": 3.568235156220512e-10, + "logits/chosen": -0.5437347888946533, + "logits/rejected": -0.6125056743621826, + "logps/chosen": -54.82936096191406, + "logps/rejected": -108.63248443603516, + "loss": 0.6114, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1267569065093994, + "rewards/margins": 6.692408561706543, + "rewards/rejected": -3.5656514167785645, + "step": 19909 + }, + { + "epoch": 4.98, + "grad_norm": 8.628918647766113, + "learning_rate": 3.4749530808286803e-10, + "logits/chosen": -0.5631526708602905, + "logits/rejected": -0.6625360250473022, + "logps/chosen": -52.94995880126953, + "logps/rejected": -94.26290893554688, + "loss": 0.6004, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.982008457183838, + "rewards/margins": 6.613872528076172, + "rewards/rejected": -3.631864070892334, + "step": 19910 + }, + { + "epoch": 4.98, + "grad_norm": 5.660593509674072, + "learning_rate": 3.3829064727541883e-10, + "logits/chosen": -0.5324147939682007, + "logits/rejected": -0.5853630304336548, + "logps/chosen": -56.163673400878906, + "logps/rejected": -108.37385559082031, + "loss": 0.6753, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.188786745071411, + "rewards/margins": 7.1645636558532715, + "rewards/rejected": -3.9757776260375977, + "step": 19911 + }, + { + "epoch": 4.98, + "grad_norm": 3.6755776405334473, + "learning_rate": 3.2920953342674424e-10, + "logits/chosen": -0.6507862210273743, + "logits/rejected": -0.7425075769424438, + "logps/chosen": -49.659034729003906, + "logps/rejected": -99.12633514404297, + "loss": 0.6116, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0239527225494385, + "rewards/margins": 7.18895149230957, + "rewards/rejected": -4.164999008178711, + "step": 19912 + }, + { + "epoch": 4.98, + "grad_norm": 4.9277119636535645, + "learning_rate": 3.202519667605541e-10, + "logits/chosen": -0.591405987739563, + "logits/rejected": -0.6855990886688232, + "logps/chosen": -51.370670318603516, + "logps/rejected": -97.55734252929688, + "loss": 0.6798, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.138204574584961, + "rewards/margins": 6.627676010131836, + "rewards/rejected": -3.489471912384033, + "step": 19913 + }, + { + "epoch": 4.98, + "grad_norm": 3.430643320083618, + "learning_rate": 3.114179474994483e-10, + "logits/chosen": -0.5204707980155945, + "logits/rejected": -0.5953518152236938, + "logps/chosen": -55.70882797241211, + "logps/rejected": -104.83663940429688, + "loss": 0.6219, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3484561443328857, + "rewards/margins": 7.265660762786865, + "rewards/rejected": -3.9172046184539795, + "step": 19914 + }, + { + "epoch": 4.98, + "grad_norm": 17.61754035949707, + "learning_rate": 3.0270747586103045e-10, + "logits/chosen": -0.548998236656189, + "logits/rejected": -0.6355295777320862, + "logps/chosen": -53.590118408203125, + "logps/rejected": -101.08595275878906, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.300381898880005, + "rewards/margins": 7.697623252868652, + "rewards/rejected": -4.39724063873291, + "step": 19915 + }, + { + "epoch": 4.98, + "grad_norm": 3.730830192565918, + "learning_rate": 2.941205520606838e-10, + "logits/chosen": -0.5515607595443726, + "logits/rejected": -0.5851646065711975, + "logps/chosen": -49.85392761230469, + "logps/rejected": -112.576171875, + "loss": 0.6013, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2484872341156006, + "rewards/margins": 7.736983299255371, + "rewards/rejected": -4.48849630355835, + "step": 19916 + }, + { + "epoch": 4.98, + "grad_norm": 3.06361985206604, + "learning_rate": 2.8565717630990585e-10, + "logits/chosen": -0.48616546392440796, + "logits/rejected": -0.5782164335250854, + "logps/chosen": -60.887569427490234, + "logps/rejected": -106.56349182128906, + "loss": 0.6106, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.232614517211914, + "rewards/margins": 8.909194946289062, + "rewards/rejected": -5.676580905914307, + "step": 19917 + }, + { + "epoch": 4.98, + "grad_norm": 5.415826797485352, + "learning_rate": 2.773173488190839e-10, + "logits/chosen": -0.6198338270187378, + "logits/rejected": -0.6616862416267395, + "logps/chosen": -60.195350646972656, + "logps/rejected": -103.77355194091797, + "loss": 0.7033, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.949659585952759, + "rewards/margins": 6.1110382080078125, + "rewards/rejected": -3.1613786220550537, + "step": 19918 + }, + { + "epoch": 4.98, + "grad_norm": 3.834278106689453, + "learning_rate": 2.6910106979360917e-10, + "logits/chosen": -0.546360969543457, + "logits/rejected": -0.6232045292854309, + "logps/chosen": -55.97857666015625, + "logps/rejected": -97.9140625, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.042970895767212, + "rewards/margins": 6.355175971984863, + "rewards/rejected": -3.3122057914733887, + "step": 19919 + }, + { + "epoch": 4.98, + "grad_norm": 3.283844232559204, + "learning_rate": 2.610083394366525e-10, + "logits/chosen": -0.5566988587379456, + "logits/rejected": -0.635345995426178, + "logps/chosen": -50.3885612487793, + "logps/rejected": -116.30044555664062, + "loss": 0.5292, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2950186729431152, + "rewards/margins": 7.3625946044921875, + "rewards/rejected": -4.067575931549072, + "step": 19920 + }, + { + "epoch": 4.98, + "grad_norm": 2.7452080249786377, + "learning_rate": 2.53039157948054e-10, + "logits/chosen": -0.501459538936615, + "logits/rejected": -0.6043206453323364, + "logps/chosen": -58.03902053833008, + "logps/rejected": -117.50865173339844, + "loss": 0.575, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.171693801879883, + "rewards/margins": 8.31835651397705, + "rewards/rejected": -5.146662712097168, + "step": 19921 + }, + { + "epoch": 4.98, + "grad_norm": 6.119920253753662, + "learning_rate": 2.451935255248783e-10, + "logits/chosen": -0.5784107446670532, + "logits/rejected": -0.6844589710235596, + "logps/chosen": -47.83505630493164, + "logps/rejected": -100.11189270019531, + "loss": 0.6642, + "rewards/accuracies": 0.90625, + "rewards/chosen": 2.998708724975586, + "rewards/margins": 7.524878978729248, + "rewards/rejected": -4.52617073059082, + "step": 19922 + }, + { + "epoch": 4.98, + "grad_norm": 3.2533457279205322, + "learning_rate": 2.374714423608593e-10, + "logits/chosen": -0.5753580927848816, + "logits/rejected": -0.6387476325035095, + "logps/chosen": -47.653221130371094, + "logps/rejected": -115.55452728271484, + "loss": 0.627, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.768270492553711, + "rewards/margins": 7.849325180053711, + "rewards/rejected": -5.0810546875, + "step": 19923 + }, + { + "epoch": 4.98, + "grad_norm": 13.35284423828125, + "learning_rate": 2.2987290864695533e-10, + "logits/chosen": -0.4872966408729553, + "logits/rejected": -0.6163138151168823, + "logps/chosen": -62.706295013427734, + "logps/rejected": -104.23359680175781, + "loss": 0.617, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9280080795288086, + "rewards/margins": 7.383190631866455, + "rewards/rejected": -4.455182075500488, + "step": 19924 + }, + { + "epoch": 4.98, + "grad_norm": 5.08507776260376, + "learning_rate": 2.2239792457079411e-10, + "logits/chosen": -0.5494012832641602, + "logits/rejected": -0.551344633102417, + "logps/chosen": -48.30821228027344, + "logps/rejected": -109.60356140136719, + "loss": 0.6106, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.458965301513672, + "rewards/margins": 6.323002338409424, + "rewards/rejected": -2.864036798477173, + "step": 19925 + }, + { + "epoch": 4.98, + "grad_norm": 9.261750221252441, + "learning_rate": 2.150464903172278e-10, + "logits/chosen": -0.5457699298858643, + "logits/rejected": -0.6545997858047485, + "logps/chosen": -49.084407806396484, + "logps/rejected": -121.15084838867188, + "loss": 0.6746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9218482971191406, + "rewards/margins": 7.979453086853027, + "rewards/rejected": -5.057605266571045, + "step": 19926 + }, + { + "epoch": 4.98, + "grad_norm": 18.35226058959961, + "learning_rate": 2.078186060677778e-10, + "logits/chosen": -0.5164803266525269, + "logits/rejected": -0.574586033821106, + "logps/chosen": -57.75668716430664, + "logps/rejected": -102.86348724365234, + "loss": 0.6705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7541723251342773, + "rewards/margins": 6.122744083404541, + "rewards/rejected": -3.3685712814331055, + "step": 19927 + }, + { + "epoch": 4.99, + "grad_norm": 5.206204414367676, + "learning_rate": 2.007142720011901e-10, + "logits/chosen": -0.5283898711204529, + "logits/rejected": -0.6369479298591614, + "logps/chosen": -53.42512130737305, + "logps/rejected": -95.6153335571289, + "loss": 0.5785, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.6193161010742188, + "rewards/margins": 6.754828453063965, + "rewards/rejected": -4.135512351989746, + "step": 19928 + }, + { + "epoch": 4.99, + "grad_norm": 5.690203666687012, + "learning_rate": 1.9373348829287985e-10, + "logits/chosen": -0.5175544023513794, + "logits/rejected": -0.5622121095657349, + "logps/chosen": -56.487152099609375, + "logps/rejected": -120.23908996582031, + "loss": 0.6277, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1296474933624268, + "rewards/margins": 7.7631025314331055, + "rewards/rejected": -4.633454322814941, + "step": 19929 + }, + { + "epoch": 4.99, + "grad_norm": 5.1232500076293945, + "learning_rate": 1.868762551154868e-10, + "logits/chosen": -0.6325250267982483, + "logits/rejected": -0.7023271322250366, + "logps/chosen": -49.710670471191406, + "logps/rejected": -111.29702758789062, + "loss": 0.5691, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4828414916992188, + "rewards/margins": 8.61526870727539, + "rewards/rejected": -5.132427215576172, + "step": 19930 + }, + { + "epoch": 4.99, + "grad_norm": 3.649068593978882, + "learning_rate": 1.8014257263887504e-10, + "logits/chosen": -0.6391525268554688, + "logits/rejected": -0.6835653781890869, + "logps/chosen": -45.205142974853516, + "logps/rejected": -130.72950744628906, + "loss": 0.6015, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1808552742004395, + "rewards/margins": 8.771178245544434, + "rewards/rejected": -5.590323448181152, + "step": 19931 + }, + { + "epoch": 4.99, + "grad_norm": 7.289256572723389, + "learning_rate": 1.7353244102846778e-10, + "logits/chosen": -0.5350697040557861, + "logits/rejected": -0.6314639449119568, + "logps/chosen": -59.68193435668945, + "logps/rejected": -122.63399505615234, + "loss": 0.7207, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.019951820373535, + "rewards/margins": 7.979176998138428, + "rewards/rejected": -4.959225177764893, + "step": 19932 + }, + { + "epoch": 4.99, + "grad_norm": 6.317999362945557, + "learning_rate": 1.6704586044802296e-10, + "logits/chosen": -0.5196214318275452, + "logits/rejected": -0.5728968381881714, + "logps/chosen": -45.08457946777344, + "logps/rejected": -100.70518493652344, + "loss": 0.6476, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0572776794433594, + "rewards/margins": 6.908079147338867, + "rewards/rejected": -3.850801467895508, + "step": 19933 + }, + { + "epoch": 4.99, + "grad_norm": 6.4167022705078125, + "learning_rate": 1.606828310585229e-10, + "logits/chosen": -0.5479671955108643, + "logits/rejected": -0.6598122119903564, + "logps/chosen": -59.277767181396484, + "logps/rejected": -109.56349182128906, + "loss": 0.5634, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9871442317962646, + "rewards/margins": 8.434279441833496, + "rewards/rejected": -5.447135925292969, + "step": 19934 + }, + { + "epoch": 4.99, + "grad_norm": 2.9086992740631104, + "learning_rate": 1.544433530159539e-10, + "logits/chosen": -0.5587212443351746, + "logits/rejected": -0.6539686322212219, + "logps/chosen": -51.97838592529297, + "logps/rejected": -119.889404296875, + "loss": 0.5335, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9731662273406982, + "rewards/margins": 7.833934307098389, + "rewards/rejected": -4.8607683181762695, + "step": 19935 + }, + { + "epoch": 4.99, + "grad_norm": 3.1935412883758545, + "learning_rate": 1.4832742647574726e-10, + "logits/chosen": -0.5522094368934631, + "logits/rejected": -0.5825382471084595, + "logps/chosen": -46.188941955566406, + "logps/rejected": -101.41075134277344, + "loss": 0.5182, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.430945873260498, + "rewards/margins": 7.267009735107422, + "rewards/rejected": -3.836063861846924, + "step": 19936 + }, + { + "epoch": 4.99, + "grad_norm": 4.362786293029785, + "learning_rate": 1.4233505158833815e-10, + "logits/chosen": -0.5333415865898132, + "logits/rejected": -0.5671699047088623, + "logps/chosen": -50.23031234741211, + "logps/rejected": -107.44931030273438, + "loss": 0.5981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.421844959259033, + "rewards/margins": 6.036564350128174, + "rewards/rejected": -2.6147193908691406, + "step": 19937 + }, + { + "epoch": 4.99, + "grad_norm": 5.921295166015625, + "learning_rate": 1.3646622850138624e-10, + "logits/chosen": -0.5761955380439758, + "logits/rejected": -0.6600791811943054, + "logps/chosen": -55.26308059692383, + "logps/rejected": -116.31739807128906, + "loss": 0.5978, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.870659828186035, + "rewards/margins": 7.912463188171387, + "rewards/rejected": -5.04180383682251, + "step": 19938 + }, + { + "epoch": 4.99, + "grad_norm": 3.407784938812256, + "learning_rate": 1.3072095736088587e-10, + "logits/chosen": -0.5641659498214722, + "logits/rejected": -0.6075596809387207, + "logps/chosen": -47.222347259521484, + "logps/rejected": -113.4701919555664, + "loss": 0.5331, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9285995960235596, + "rewards/margins": 7.639158248901367, + "rewards/rejected": -4.71055793762207, + "step": 19939 + }, + { + "epoch": 4.99, + "grad_norm": 5.834123611450195, + "learning_rate": 1.2509923830839043e-10, + "logits/chosen": -0.6008341908454895, + "logits/rejected": -0.6810265183448792, + "logps/chosen": -54.46052169799805, + "logps/rejected": -107.8487548828125, + "loss": 0.6051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.096050977706909, + "rewards/margins": 6.055220127105713, + "rewards/rejected": -2.959169387817383, + "step": 19940 + }, + { + "epoch": 4.99, + "grad_norm": 4.194459438323975, + "learning_rate": 1.1960107148267786e-10, + "logits/chosen": -0.5587393641471863, + "logits/rejected": -0.6223090887069702, + "logps/chosen": -50.41322326660156, + "logps/rejected": -94.29656219482422, + "loss": 0.6058, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1163039207458496, + "rewards/margins": 6.914891242980957, + "rewards/rejected": -3.7985875606536865, + "step": 19941 + }, + { + "epoch": 4.99, + "grad_norm": 17.633224487304688, + "learning_rate": 1.1422645701975044e-10, + "logits/chosen": -0.558638334274292, + "logits/rejected": -0.6374049186706543, + "logps/chosen": -57.68154525756836, + "logps/rejected": -94.99626922607422, + "loss": 0.6156, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0941662788391113, + "rewards/margins": 7.2167558670043945, + "rewards/rejected": -4.122589588165283, + "step": 19942 + }, + { + "epoch": 4.99, + "grad_norm": 2.827446699142456, + "learning_rate": 1.0897539505283494e-10, + "logits/chosen": -0.5994746685028076, + "logits/rejected": -0.6672654747962952, + "logps/chosen": -51.32708740234375, + "logps/rejected": -120.57559204101562, + "loss": 0.5372, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3946518898010254, + "rewards/margins": 8.659653663635254, + "rewards/rejected": -5.265002250671387, + "step": 19943 + }, + { + "epoch": 4.99, + "grad_norm": 1.7132337093353271, + "learning_rate": 1.0384788571071725e-10, + "logits/chosen": -0.642980694770813, + "logits/rejected": -0.7421373724937439, + "logps/chosen": -50.3609504699707, + "logps/rejected": -107.1091537475586, + "loss": 0.6162, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.167454719543457, + "rewards/margins": 8.272270202636719, + "rewards/rejected": -5.104815483093262, + "step": 19944 + }, + { + "epoch": 4.99, + "grad_norm": 5.217803001403809, + "learning_rate": 9.884392912051788e-11, + "logits/chosen": -0.6539952158927917, + "logits/rejected": -0.7010529637336731, + "logps/chosen": -43.40424346923828, + "logps/rejected": -105.97994995117188, + "loss": 0.6303, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7539868354797363, + "rewards/margins": 7.270111560821533, + "rewards/rejected": -4.516125202178955, + "step": 19945 + }, + { + "epoch": 4.99, + "grad_norm": 5.3835930824279785, + "learning_rate": 9.396352540658183e-11, + "logits/chosen": -0.5513420701026917, + "logits/rejected": -0.6330214142799377, + "logps/chosen": -55.13705825805664, + "logps/rejected": -97.89586639404297, + "loss": 0.6476, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.066704273223877, + "rewards/margins": 7.045895576477051, + "rewards/rejected": -3.9791910648345947, + "step": 19946 + }, + { + "epoch": 4.99, + "grad_norm": 3.453498601913452, + "learning_rate": 8.920667468881316e-11, + "logits/chosen": -0.575199544429779, + "logits/rejected": -0.6507699489593506, + "logps/chosen": -60.71206283569336, + "logps/rejected": -128.12991333007812, + "loss": 0.6286, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2839598655700684, + "rewards/margins": 8.449958801269531, + "rewards/rejected": -5.165998458862305, + "step": 19947 + }, + { + "epoch": 4.99, + "grad_norm": 5.272215843200684, + "learning_rate": 8.457337708434044e-11, + "logits/chosen": -0.5612742900848389, + "logits/rejected": -0.6031251549720764, + "logps/chosen": -53.89789581298828, + "logps/rejected": -104.02627563476562, + "loss": 0.5244, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.72392201423645, + "rewards/margins": 6.734279155731201, + "rewards/rejected": -4.010356903076172, + "step": 19948 + }, + { + "epoch": 4.99, + "grad_norm": 4.615547180175781, + "learning_rate": 8.006363270862682e-11, + "logits/chosen": -0.5531132817268372, + "logits/rejected": -0.6247526407241821, + "logps/chosen": -51.64604949951172, + "logps/rejected": -93.45792388916016, + "loss": 0.6005, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.262561321258545, + "rewards/margins": 7.1732282638549805, + "rewards/rejected": -3.9106669425964355, + "step": 19949 + }, + { + "epoch": 4.99, + "grad_norm": 8.648017883300781, + "learning_rate": 7.567744167269464e-11, + "logits/chosen": -0.5638740062713623, + "logits/rejected": -0.5859392285346985, + "logps/chosen": -61.22319793701172, + "logps/rejected": -112.85517120361328, + "loss": 0.6324, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1712610721588135, + "rewards/margins": 6.848736763000488, + "rewards/rejected": -3.6774754524230957, + "step": 19950 + }, + { + "epoch": 4.99, + "grad_norm": 3.9842965602874756, + "learning_rate": 7.141480408479062e-11, + "logits/chosen": -0.594082236289978, + "logits/rejected": -0.6571227312088013, + "logps/chosen": -38.130409240722656, + "logps/rejected": -104.99964904785156, + "loss": 0.6059, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3079774379730225, + "rewards/margins": 7.749141216278076, + "rewards/rejected": -4.441164016723633, + "step": 19951 + }, + { + "epoch": 4.99, + "grad_norm": 38.18598937988281, + "learning_rate": 6.727572005038596e-11, + "logits/chosen": -0.55586838722229, + "logits/rejected": -0.6431952118873596, + "logps/chosen": -55.174659729003906, + "logps/rejected": -94.27672576904297, + "loss": 0.6181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1565582752227783, + "rewards/margins": 6.305595874786377, + "rewards/rejected": -3.1490378379821777, + "step": 19952 + }, + { + "epoch": 4.99, + "grad_norm": 4.252803802490234, + "learning_rate": 6.326018967217628e-11, + "logits/chosen": -0.5558606386184692, + "logits/rejected": -0.6262428760528564, + "logps/chosen": -49.405433654785156, + "logps/rejected": -102.2113037109375, + "loss": 0.6343, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4954020977020264, + "rewards/margins": 7.17351770401001, + "rewards/rejected": -3.678115129470825, + "step": 19953 + }, + { + "epoch": 4.99, + "grad_norm": 6.730756759643555, + "learning_rate": 5.936821304841634e-11, + "logits/chosen": -0.6538694500923157, + "logits/rejected": -0.7053716778755188, + "logps/chosen": -64.137939453125, + "logps/rejected": -110.71051025390625, + "loss": 0.6532, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2290310859680176, + "rewards/margins": 6.812272071838379, + "rewards/rejected": -3.5832412242889404, + "step": 19954 + }, + { + "epoch": 4.99, + "grad_norm": 7.520650863647461, + "learning_rate": 5.559979027625062e-11, + "logits/chosen": -0.5506186485290527, + "logits/rejected": -0.6890170574188232, + "logps/chosen": -77.26251983642578, + "logps/rejected": -106.9632568359375, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1445131301879883, + "rewards/margins": 7.882742404937744, + "rewards/rejected": -4.738229274749756, + "step": 19955 + }, + { + "epoch": 4.99, + "grad_norm": 5.0108842849731445, + "learning_rate": 5.1954921448382766e-11, + "logits/chosen": -0.5841202735900879, + "logits/rejected": -0.6437183618545532, + "logps/chosen": -53.38587188720703, + "logps/rejected": -114.73428344726562, + "loss": 0.6353, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.2711429595947266, + "rewards/margins": 6.2009687423706055, + "rewards/rejected": -2.929826259613037, + "step": 19956 + }, + { + "epoch": 4.99, + "grad_norm": 3.5537235736846924, + "learning_rate": 4.843360665474084e-11, + "logits/chosen": -0.5247070789337158, + "logits/rejected": -0.6039893627166748, + "logps/chosen": -55.1529426574707, + "logps/rejected": -101.90010070800781, + "loss": 0.5984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1245462894439697, + "rewards/margins": 7.6632585525512695, + "rewards/rejected": -4.538712501525879, + "step": 19957 + }, + { + "epoch": 4.99, + "grad_norm": 2.803370714187622, + "learning_rate": 4.503584598247734e-11, + "logits/chosen": -0.5764304995536804, + "logits/rejected": -0.6678997278213501, + "logps/chosen": -42.939208984375, + "logps/rejected": -113.6094970703125, + "loss": 0.5192, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.075911045074463, + "rewards/margins": 8.06375789642334, + "rewards/rejected": -4.987847328186035, + "step": 19958 + }, + { + "epoch": 4.99, + "grad_norm": 6.871224403381348, + "learning_rate": 4.176163951541412e-11, + "logits/chosen": -0.5445753335952759, + "logits/rejected": -0.6150425672531128, + "logps/chosen": -55.856319427490234, + "logps/rejected": -109.41472625732422, + "loss": 0.5922, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.968545913696289, + "rewards/margins": 7.079909801483154, + "rewards/rejected": -4.111363887786865, + "step": 19959 + }, + { + "epoch": 4.99, + "grad_norm": 4.63238000869751, + "learning_rate": 3.861098733515256e-11, + "logits/chosen": -0.5233445763587952, + "logits/rejected": -0.5829619765281677, + "logps/chosen": -43.48958969116211, + "logps/rejected": -111.11888885498047, + "loss": 0.566, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.134861707687378, + "rewards/margins": 7.724074363708496, + "rewards/rejected": -4.589212417602539, + "step": 19960 + }, + { + "epoch": 4.99, + "grad_norm": 7.444454193115234, + "learning_rate": 3.558388951829806e-11, + "logits/chosen": -0.5817376971244812, + "logits/rejected": -0.6899542808532715, + "logps/chosen": -60.944793701171875, + "logps/rejected": -113.72148895263672, + "loss": 0.6632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.189051389694214, + "rewards/margins": 7.749168872833252, + "rewards/rejected": -4.560117721557617, + "step": 19961 + }, + { + "epoch": 4.99, + "grad_norm": 2.174858570098877, + "learning_rate": 3.2680346140900876e-11, + "logits/chosen": -0.6272008419036865, + "logits/rejected": -0.6965867280960083, + "logps/chosen": -58.265769958496094, + "logps/rejected": -104.23461151123047, + "loss": 0.6393, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3733930587768555, + "rewards/margins": 7.828144073486328, + "rewards/rejected": -4.454751491546631, + "step": 19962 + }, + { + "epoch": 4.99, + "grad_norm": 10.280913352966309, + "learning_rate": 2.99003572740153e-11, + "logits/chosen": -0.5789274573326111, + "logits/rejected": -0.6143688559532166, + "logps/chosen": -58.80036163330078, + "logps/rejected": -117.52059173583984, + "loss": 0.7376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5402727127075195, + "rewards/margins": 6.352766990661621, + "rewards/rejected": -3.8124942779541016, + "step": 19963 + }, + { + "epoch": 4.99, + "grad_norm": 3.3084678649902344, + "learning_rate": 2.7243922986475158e-11, + "logits/chosen": -0.5701842904090881, + "logits/rejected": -0.6044154167175293, + "logps/chosen": -52.46232604980469, + "logps/rejected": -114.9591064453125, + "loss": 0.5499, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.174152135848999, + "rewards/margins": 7.105155944824219, + "rewards/rejected": -3.9310035705566406, + "step": 19964 + }, + { + "epoch": 4.99, + "grad_norm": 2.3209457397460938, + "learning_rate": 2.4711043343783603e-11, + "logits/chosen": -0.5700312852859497, + "logits/rejected": -0.6698400378227234, + "logps/chosen": -49.15545654296875, + "logps/rejected": -108.52409362792969, + "loss": 0.5664, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.9136104583740234, + "rewards/margins": 7.328821659088135, + "rewards/rejected": -4.415210723876953, + "step": 19965 + }, + { + "epoch": 4.99, + "grad_norm": 3.794421434402466, + "learning_rate": 2.230171840866824e-11, + "logits/chosen": -0.5351060032844543, + "logits/rejected": -0.617020845413208, + "logps/chosen": -48.82106399536133, + "logps/rejected": -94.21400451660156, + "loss": 0.6009, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7560417652130127, + "rewards/margins": 6.502444744110107, + "rewards/rejected": -3.746403217315674, + "step": 19966 + }, + { + "epoch": 4.99, + "grad_norm": 2.7832789421081543, + "learning_rate": 2.0015948241081107e-11, + "logits/chosen": -0.5223357081413269, + "logits/rejected": -0.616122841835022, + "logps/chosen": -51.3277702331543, + "logps/rejected": -133.78411865234375, + "loss": 0.5503, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.921602725982666, + "rewards/margins": 8.5369873046875, + "rewards/rejected": -5.615384578704834, + "step": 19967 + }, + { + "epoch": 5.0, + "grad_norm": 4.928350448608398, + "learning_rate": 1.785373289708847e-11, + "logits/chosen": -0.5989034175872803, + "logits/rejected": -0.703328549861908, + "logps/chosen": -52.209320068359375, + "logps/rejected": -115.33343505859375, + "loss": 0.5436, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.159008264541626, + "rewards/margins": 9.229427337646484, + "rewards/rejected": -6.070417404174805, + "step": 19968 + }, + { + "epoch": 5.0, + "grad_norm": 2.0737075805664062, + "learning_rate": 1.5815072429425925e-11, + "logits/chosen": -0.5618734955787659, + "logits/rejected": -0.6739012002944946, + "logps/chosen": -57.868595123291016, + "logps/rejected": -112.29146575927734, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2297091484069824, + "rewards/margins": 8.154295921325684, + "rewards/rejected": -4.924586772918701, + "step": 19969 + }, + { + "epoch": 5.0, + "grad_norm": 4.5135817527771, + "learning_rate": 1.3899966889718842e-11, + "logits/chosen": -0.5236574411392212, + "logits/rejected": -0.6384211778640747, + "logps/chosen": -61.9736213684082, + "logps/rejected": -104.72665405273438, + "loss": 0.6343, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0300168991088867, + "rewards/margins": 7.282138824462891, + "rewards/rejected": -4.252122402191162, + "step": 19970 + }, + { + "epoch": 5.0, + "grad_norm": 3.3266897201538086, + "learning_rate": 1.2108416325151696e-11, + "logits/chosen": -0.49358999729156494, + "logits/rejected": -0.5529128313064575, + "logps/chosen": -47.28582000732422, + "logps/rejected": -119.5124740600586, + "loss": 0.5691, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.3778738975524902, + "rewards/margins": 8.1581449508667, + "rewards/rejected": -4.780272483825684, + "step": 19971 + }, + { + "epoch": 5.0, + "grad_norm": 4.053167343139648, + "learning_rate": 1.0440420779023186e-11, + "logits/chosen": -0.5366595387458801, + "logits/rejected": -0.6199632883071899, + "logps/chosen": -54.10099792480469, + "logps/rejected": -92.60979461669922, + "loss": 0.6685, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1084089279174805, + "rewards/margins": 6.2753005027771, + "rewards/rejected": -3.1668920516967773, + "step": 19972 + }, + { + "epoch": 5.0, + "grad_norm": 3.4927029609680176, + "learning_rate": 8.895980293521789e-12, + "logits/chosen": -0.5262490510940552, + "logits/rejected": -0.589047908782959, + "logps/chosen": -49.34580612182617, + "logps/rejected": -144.59046936035156, + "loss": 0.5678, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.899376392364502, + "rewards/margins": 9.774209022521973, + "rewards/rejected": -6.874831199645996, + "step": 19973 + }, + { + "epoch": 5.0, + "grad_norm": 3.491929054260254, + "learning_rate": 7.475094906395085e-12, + "logits/chosen": -0.523969829082489, + "logits/rejected": -0.5905348658561707, + "logps/chosen": -60.50190353393555, + "logps/rejected": -105.47718811035156, + "loss": 0.6994, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.04008150100708, + "rewards/margins": 7.092190742492676, + "rewards/rejected": -4.0521087646484375, + "step": 19974 + }, + { + "epoch": 5.0, + "grad_norm": 3.745541572570801, + "learning_rate": 6.177764652615104e-12, + "logits/chosen": -0.5148653984069824, + "logits/rejected": -0.5586745142936707, + "logps/chosen": -56.869781494140625, + "logps/rejected": -108.18035125732422, + "loss": 0.6192, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0234122276306152, + "rewards/margins": 6.164769172668457, + "rewards/rejected": -3.141356945037842, + "step": 19975 + }, + { + "epoch": 5.0, + "grad_norm": 4.0273756980896, + "learning_rate": 5.003989564378308e-12, + "logits/chosen": -0.5575665235519409, + "logits/rejected": -0.635546863079071, + "logps/chosen": -59.337730407714844, + "logps/rejected": -118.36453247070312, + "loss": 0.6611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.06728196144104, + "rewards/margins": 7.326814651489258, + "rewards/rejected": -4.2595319747924805, + "step": 19976 + }, + { + "epoch": 5.0, + "grad_norm": 4.6352858543396, + "learning_rate": 3.9537696705505e-12, + "logits/chosen": -0.5914896726608276, + "logits/rejected": -0.6851179003715515, + "logps/chosen": -54.44463348388672, + "logps/rejected": -108.1371078491211, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.123724937438965, + "rewards/margins": 8.078947067260742, + "rewards/rejected": -4.955222129821777, + "step": 19977 + }, + { + "epoch": 5.0, + "grad_norm": 3.561828374862671, + "learning_rate": 3.0271049977770304e-12, + "logits/chosen": -0.551643967628479, + "logits/rejected": -0.7298151254653931, + "logps/chosen": -63.33091735839844, + "logps/rejected": -103.7103500366211, + "loss": 0.6033, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.1562135219573975, + "rewards/margins": 8.131993293762207, + "rewards/rejected": -4.975780010223389, + "step": 19978 + }, + { + "epoch": 5.0, + "grad_norm": 4.342432022094727, + "learning_rate": 2.223995567707249e-12, + "logits/chosen": -0.594832718372345, + "logits/rejected": -0.653033971786499, + "logps/chosen": -56.004554748535156, + "logps/rejected": -124.26331329345703, + "loss": 0.6652, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1681809425354004, + "rewards/margins": 8.00991153717041, + "rewards/rejected": -4.841730117797852, + "step": 19979 + }, + { + "epoch": 5.0, + "grad_norm": 4.262208938598633, + "learning_rate": 1.5444414014353927e-12, + "logits/chosen": -0.5129783749580383, + "logits/rejected": -0.560819685459137, + "logps/chosen": -63.37061309814453, + "logps/rejected": -101.69221496582031, + "loss": 0.6811, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.0642483234405518, + "rewards/margins": 7.0838422775268555, + "rewards/rejected": -4.019594192504883, + "step": 19980 + }, + { + "epoch": 5.0, + "grad_norm": 3.7272603511810303, + "learning_rate": 9.88442515614807e-13, + "logits/chosen": -0.5044276714324951, + "logits/rejected": -0.5982937216758728, + "logps/chosen": -61.695526123046875, + "logps/rejected": -119.89546966552734, + "loss": 0.6268, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.243509531021118, + "rewards/margins": 8.132213592529297, + "rewards/rejected": -4.888704299926758, + "step": 19981 + }, + { + "epoch": 5.0, + "grad_norm": 3.448791265487671, + "learning_rate": 5.559989230130569e-13, + "logits/chosen": -0.5542073249816895, + "logits/rejected": -0.6324760317802429, + "logps/chosen": -50.976524353027344, + "logps/rejected": -109.13800048828125, + "loss": 0.5536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.084773540496826, + "rewards/margins": 7.74680233001709, + "rewards/rejected": -4.66202974319458, + "step": 19982 + }, + { + "epoch": 5.0, + "grad_norm": 19.567182540893555, + "learning_rate": 2.4711063473237263e-13, + "logits/chosen": -0.5928545594215393, + "logits/rejected": -0.6448850631713867, + "logps/chosen": -46.359676361083984, + "logps/rejected": -98.71041870117188, + "loss": 0.6755, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1165409088134766, + "rewards/margins": 6.34104585647583, + "rewards/rejected": -3.2245054244995117, + "step": 19983 + }, + { + "epoch": 5.0, + "grad_norm": 4.025620460510254, + "learning_rate": 6.177765909942679e-14, + "logits/chosen": -0.4828452169895172, + "logits/rejected": -0.6034368276596069, + "logps/chosen": -59.457801818847656, + "logps/rejected": -119.56365966796875, + "loss": 0.6007, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1316957473754883, + "rewards/margins": 8.819979667663574, + "rewards/rejected": -5.688284873962402, + "step": 19984 + }, + { + "epoch": 5.0, + "grad_norm": 7.38394832611084, + "learning_rate": 0.0, + "logits/chosen": -0.45936474204063416, + "logits/rejected": -0.5894534587860107, + "logps/chosen": -63.536598205566406, + "logps/rejected": -92.20258331298828, + "loss": 0.7642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1692137718200684, + "rewards/margins": 7.138182640075684, + "rewards/rejected": -3.9689695835113525, + "step": 19985 + }, + { + "epoch": 5.0, + "step": 19985, + "total_flos": 2.930396871571892e+19, + "train_loss": 0.4625585907752853, + "train_runtime": 681152.8923, + "train_samples_per_second": 0.939, + "train_steps_per_second": 0.029 + } + ], + "logging_steps": 1.0, + "max_steps": 19985, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "total_flos": 2.930396871571892e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}